# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Models for loading in chrome.
(Redirect the following to the general model module once we have one)
A model is an object with the following methods.
CostMs(): return the cost of the model in milliseconds.
Set(): set model-specific parameters.
ResourceGraph
This creates a DAG of resource dependencies from loading.log_requests to model
loading time. The model may be parameterized by changing the loading time of
a particular or all resources.
"""
import logging
import os
import sys
import urlparse
import dag
import loading_trace
import request_dependencies_lens
class ResourceGraph(object):
"""A model of loading by a DAG (tree?) of resource dependancies.
Set parameters:
cache_all: if true, assume zero loading time for all resources.
"""
def __init__(self, trace, content_lens=None):
"""Create from a LoadingTrace (or json of a trace).
Args:
trace: (LoadingTrace/JSON) Loading trace or JSON of a trace.
content_lens: (ContentClassificationLens) Lens used to annotate the
nodes, or None.
"""
    if isinstance(trace, dict):
trace = loading_trace.LoadingTrace.FromJsonDict(trace)
self._content_lens = content_lens
self._BuildDag(trace)
self._global_start = min([n.StartTime() for n in self._node_info])
    # Sort before splitting children so that we can correctly detect if a
# reparented child is actually a dependency for a child of its new parent.
try:
for n in dag.TopologicalSort(self._nodes):
self._SplitChildrenByTime(self._node_info[n.Index()])
except AssertionError as exc:
sys.stderr.write('Bad topological sort: %s\n'
'Skipping child split\n' % str(exc))
self._cache_all = False
self._node_filter = lambda _: True
@classmethod
def CheckImageLoadConsistency(cls, g1, g2):
"""Check that images have the same dependancies between ResourceGraphs.
Image resources are identified by their short names.
Args:
g1: a ResourceGraph instance
g2: a ResourceGraph instance
Returns:
A list of discrepancy tuples. If this list is empty, g1 and g2 are
consistent with respect to image load dependencies. Otherwise, each tuple
is of the form:
( g1 resource short name or str(list of short names),
g2 resource short name or str(list of short names),
human-readable discrepancy reason )
      Either or both of the g1 and g2 image resource short names may be None
      if not applicable to the discrepancy reason.
"""
discrepancies = []
g1_image_to_info = g1._ExtractImages()
g2_image_to_info = g2._ExtractImages()
for image in set(g1_image_to_info.keys()) - set(g2_image_to_info.keys()):
discrepancies.append((image, None, 'Missing in g2'))
for image in set(g2_image_to_info.keys()) - set(g1_image_to_info.keys()):
discrepancies.append((None, image, 'Missing in g1'))
for image in set(g1_image_to_info.keys()) & set(g2_image_to_info.keys()):
def PredecessorInfo(g, n):
info = [g._ShortName(p) for p in n.Node().Predecessors()]
info.sort()
return str(info)
g1_pred = PredecessorInfo(g1, g1_image_to_info[image])
g2_pred = PredecessorInfo(g2, g2_image_to_info[image])
if g1_pred != g2_pred:
discrepancies.append((g1_pred, g2_pred,
'Predecessor mismatch for ' + image))
return discrepancies
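  # Sketch of how CheckImageLoadConsistency might be used (assumption: two
  # traces of the same page, `trace_a` and `trace_b`, loaded elsewhere):
  #
  #   discrepancies = ResourceGraph.CheckImageLoadConsistency(
  #       ResourceGraph(trace_a), ResourceGraph(trace_b))
  #   for g1_name, g2_name, reason in discrepancies:
  #     print g1_name, g2_name, reason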
def Set(self, cache_all=None, node_filter=None):
"""Set model parameters.
    TODO(mattcary): add parameters for caching certain types of resources (just
    scripts, just cacheable, etc).
    Args:
      cache_all: boolean; if True, ignore empirical resource load times for
        all resources.
node_filter: a Node->boolean used to restrict the graph for most
operations.
"""
    if cache_all is not None:
self._cache_all = cache_all
if node_filter is not None:
self._node_filter = node_filter
def Nodes(self):
"""Return iterable of all nodes via their _NodeInfos.
Returns:
Iterable of node infos in arbitrary order.
"""
for n in self._node_info:
if self._node_filter(n.Node()):
yield n
def EdgeCosts(self, node_filter=None):
"""Edge costs.
Args:
      node_filter: if not None, a Node->boolean filter to use instead of the
current one from Set.
Returns:
The total edge costs of our graph.
"""
node_filter = self._node_filter if node_filter is None else node_filter
total = 0
for n in self._node_info:
if not node_filter(n.Node()):
continue
for s in n.Node().Successors():
if node_filter(s):
total += self._EdgeCost(n.Node(), s)
return total
def Intersect(self, other_nodes):
"""Return iterable of nodes that intersect with another graph.
Args:
      other_nodes: iterable of the nodes of another graph, e.g. from Nodes().
Returns:
an iterable of (mine, other) pairs for all nodes for which the URL is
identical.
"""
other_map = {n.Url(): n for n in other_nodes}
for n in self._node_info:
if self._node_filter(n.Node()) and n.Url() in other_map:
        yield (n, other_map[n.Url()])
def Cost(self, path_list=None):
"""Compute cost of current model.
Args:
      path_list: if not None, will be filled with the list of _NodeInfos along
        the longest path.
Returns:
Cost of the longest path.
"""
costs = [0] * len(self._nodes)
for n in dag.TopologicalSort(self._nodes, self._node_filter):
cost = 0
if n.Predecessors():
cost = max([costs[p.Index()] + self._EdgeCost(p, n)
for p in n.Predecessors()])
if not self._cache_all:
cost += self._NodeCost(n)
costs[n.Index()] = cost
max_cost = max(costs)
assert max_cost > 0 # Otherwise probably the filter went awry.
if path_list is not None:
      del path_list[:]  # Clear path_list in place before filling it.
n = (i for i in self._nodes if costs[i.Index()] == max_cost).next()
path_list.append(self._node_info[n.Index()])
while n.Predecessors():
n = reduce(lambda costliest, next:
next if (self._node_filter(next) and
costs[next.Index()] > costs[costliest.Index()])
else costliest,
n.Predecessors())
path_list.insert(0, self._node_info[n.Index()])
return max_cost
def FilterAds(self, node):
"""A filter for use in eg, Cost, to remove advertising nodes.
Args:
node: A dag.Node.
Returns:
      True if the node is neither ad-related nor tracking-related.
"""
node_info = self._node_info[node.Index()]
return not (node_info.IsAd() or node_info.IsTracking())
def MakeGraphviz(self, output, highlight=None):
"""Output a graphviz representation of our DAG.
Args:
      output: a file-like output stream which receives a graphviz dot.
highlight: a list of node items to emphasize. Any resource url which
contains any highlight text will be distinguished in the output.
"""
output.write("""digraph dependencies {
rankdir = LR;
""")
orphans = set()
try:
sorted_nodes = dag.TopologicalSort(self._nodes,
node_filter=self._node_filter)
except AssertionError as exc:
sys.stderr.write('Bad topological sort: %s\n'
'Writing children in order\n' % str(exc))
sorted_nodes = self._nodes
for n in sorted_nodes:
if not n.Successors() and not n.Predecessors():
orphans.add(n)
if orphans:
output.write("""subgraph cluster_orphans {
color=black;
label="Orphans";
""")
for n in orphans:
output.write(self._GraphvizNode(n.Index(), highlight))
output.write('}\n')
output.write("""subgraph cluster_nodes {
color=invis;
""")
for n in sorted_nodes:
if not n.Successors() and not n.Predecessors():
continue
output.write(self._GraphvizNode(n.Index(), highlight))
for n in sorted_nodes:
for s in n.Successors():
style = 'color = orange'
annotations = self._EdgeAnnotation(n, s)
if 'redirect' in annotations:
style = 'color = black'
elif 'parser' in annotations:
style = 'color = red'
elif 'stack' in annotations:
style = 'color = blue'
elif 'script_inferred' in annotations:
style = 'color = purple'
if 'timing' in annotations:
style += '; style=dashed'
arrow = '[%s; label="%s"]' % (style, self._EdgeCost(n, s))
output.write('%d -> %d %s;\n' % (n.Index(), s.Index(), arrow))
output.write('}\n}\n')
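  # Sketch of rendering MakeGraphviz output with graphviz (assumes the `dot`
  # binary is installed; file names are illustrative):
  #
  #   with open('graph.dot', 'w') as out:
  #     graph.MakeGraphviz(out, highlight=['.js'])
  #   # Then, from a shell: dot -Tpng graph.dot -o graph.png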
def ResourceInfo(self):
"""Get resource info.
Returns:
A list of _NodeInfo objects that describe the resources fetched.
"""
return self._node_info
def DebugString(self):
"""Graph structure for debugging.
TODO(mattcary): this fails for graphs with more than one component or where
self._nodes[0] is not a root.
Returns:
A human-readable string of the graph.
"""
output = []
queue = [self._nodes[0]]
visited = set()
while queue:
n = queue.pop(0)
assert n not in visited
visited.add(n)
children = n.SortedSuccessors()
output.append('%d -> [%s]' %
(n.Index(), ' '.join([str(c.Index()) for c in children])))
for c in children:
assert n in c.Predecessors() # Integrity checking
queue.append(c)
assert len(visited) == len(self._nodes)
return '\n'.join(output)
##
## Internal items
##
_CONTENT_KIND_TO_COLOR = {
'application': 'blue', # Scripts.
'font': 'grey70',
'image': 'orange', # This probably catches gifs?
'video': 'hotpink1',
}
_CONTENT_TYPE_TO_COLOR = {
'html': 'red',
'css': 'green',
'script': 'blue',
'javascript': 'blue',
'json': 'purple',
'gif': 'grey',
'image': 'orange',
'jpeg': 'orange',
'png': 'orange',
'plain': 'brown3',
'octet-stream': 'brown3',
'other': 'white',
}
# This resource type may induce a timing dependency. See _SplitChildrenByTime
# for details.
# TODO(mattcary): are these right?
_CAN_BE_TIMING_PARENT = set(['script', 'magic-debug-content'])
_CAN_MAKE_TIMING_DEPENDENCE = set(['json', 'other', 'magic-debug-content'])
class _NodeInfo(object):
"""Our internal class that adds cost and other information to nodes.
    Costs are stored on nodes as well as edges. Edge information is stored only
    on successor edges and not on predecessor edges; that is, you get it from
    the parent and not the child.
We also store the request on the node, and expose request-derived
information like content type.
"""
def __init__(self, node, request):
"""Create a new node info.
Args:
node: The node to augment.
request: The request associated with this node.
"""
self._request = request
self._is_ad = False
self._is_tracking = False
self._node = node
self._edge_costs = {}
self._edge_annotations = {}
# All fields in timing are millis relative to request_time, which is epoch
# seconds.
self._node_cost = max(
[0] + [t for f, t in request.timing._asdict().iteritems()
if f != 'request_time'])
def __str__(self):
return self.ShortName()
def Node(self):
return self._node
def Index(self):
return self._node.Index()
def SetRequestContent(self, is_ad, is_tracking):
"""Sets the kind of content the request relates to.
Args:
is_ad: (bool) Whether the request is an Ad.
is_tracking: (bool) Whether the request is related to tracking.
"""
(self._is_ad, self._is_tracking) = (is_ad, is_tracking)
def IsAd(self):
return self._is_ad
def IsTracking(self):
return self._is_tracking
def Request(self):
return self._request
def NodeCost(self):
return self._node_cost
def EdgeCost(self, s):
return self._edge_costs[s]
def StartTime(self):
return self._request.timing.request_time * 1000
def EndTime(self):
return self._request.timing.request_time * 1000 + self._node_cost
def EdgeAnnotation(self, s):
assert s.Node() in self.Node().Successors()
return self._edge_annotations.get(s, [])
def ContentType(self):
return self._request.GetContentType()
def ShortName(self):
"""Returns either the hostname of the resource, or the filename,
or the end of the path. Tries to include the domain as much as possible.
"""
parsed = urlparse.urlparse(self._request.url)
path = parsed.path
hostname = parsed.hostname if parsed.hostname else '?.?.?'
if path != '' and path != '/':
last_path = parsed.path.split('/')[-1]
if len(last_path) < 10:
if len(path) < 10:
return hostname + '/' + path
else:
return hostname + '/..' + parsed.path[-10:]
elif len(last_path) > 10:
return hostname + '/..' + last_path[:5]
else:
return hostname + '/..' + last_path
else:
return hostname
def Url(self):
return self._request.url
def SetEdgeCost(self, child, cost):
assert child.Node() in self._node.Successors()
self._edge_costs[child] = cost
def AddEdgeAnnotation(self, s, annotation):
assert s.Node() in self._node.Successors()
self._edge_annotations.setdefault(s, []).append(annotation)
def ReparentTo(self, old_parent, new_parent):
"""Move costs and annotatations from old_parent to new_parent.
Also updates the underlying node connections, ie, do not call
old_parent.RemoveSuccessor(), etc.
Args:
old_parent: the _NodeInfo of a current parent of self. We assert this
is actually a parent.
new_parent: the _NodeInfo of the new parent. We assert it is not already
a parent.
"""
assert old_parent.Node() in self.Node().Predecessors()
assert new_parent.Node() not in self.Node().Predecessors()
edge_annotations = old_parent._edge_annotations.pop(self, [])
edge_cost = old_parent._edge_costs.pop(self)
old_parent.Node().RemoveSuccessor(self.Node())
new_parent.Node().AddSuccessor(self.Node())
new_parent.SetEdgeCost(self, edge_cost)
for a in edge_annotations:
new_parent.AddEdgeAnnotation(self, a)
def __eq__(self, o):
return self.Node().Index() == o.Node().Index()
def __hash__(self):
return hash(self.Node().Index())
def _ShortName(self, node):
"""Convenience function for redirecting Nodes to _NodeInfo."""
return self._node_info[node.Index()].ShortName()
def _Url(self, node):
"""Convenience function for redirecting Nodes to _NodeInfo."""
return self._node_info[node.Index()].Url()
def _NodeCost(self, node):
"""Convenience function for redirecting Nodes to _NodeInfo."""
return self._node_info[node.Index()].NodeCost()
def _EdgeCost(self, parent, child):
"""Convenience function for redirecting Nodes to _NodeInfo."""
return self._node_info[parent.Index()].EdgeCost(
self._node_info[child.Index()])
def _EdgeAnnotation(self, parent, child):
"""Convenience function for redirecting Nodes to _NodeInfo."""
return self._node_info[parent.Index()].EdgeAnnotation(
self._node_info[child.Index()])
def _BuildDag(self, trace):
"""Build DAG of resources.
Build a DAG from our requests and augment with _NodeInfo (see above) in a
parallel array indexed by Node.Index().
Creates self._nodes and self._node_info.
Args:
trace: A LoadingTrace.
"""
self._nodes = []
self._node_info = []
index_by_request = {}
for request in trace.request_track.GetEvents():
next_index = len(self._nodes)
assert request not in index_by_request
index_by_request[request] = next_index
node = dag.Node(next_index)
node_info = self._NodeInfo(node, request)
if self._content_lens:
node_info.SetRequestContent(
self._content_lens.IsAdRequest(request),
self._content_lens.IsTrackingRequest(request))
self._nodes.append(node)
self._node_info.append(node_info)
dependencies = request_dependencies_lens.RequestDependencyLens(
trace).GetRequestDependencies()
for parent_rq, child_rq, reason in dependencies:
parent = self._node_info[index_by_request[parent_rq]]
child = self._node_info[index_by_request[child_rq]]
edge_cost = child.StartTime() - parent.EndTime()
if edge_cost < 0:
edge_cost = 0
if child.StartTime() < parent.StartTime():
logging.error('Inverted dependency: %s->%s',
parent.ShortName(), child.ShortName())
# Note that child.StartTime() < parent.EndTime() appears to happen a
# fair amount in practice.
parent.Node().AddSuccessor(child.Node())
parent.SetEdgeCost(child, edge_cost)
parent.AddEdgeAnnotation(child, reason)
def _SplitChildrenByTime(self, parent):
"""Split children of a node by request times.
    The initiator of a request may not be its true dependency. For example, a
    script may appear to load several resources independently, but in fact one
    of them may be a JSON data file and the remaining resources may be assets
    described in that JSON. The assets should then depend upon the JSON data
    file, and not the original script.
    This function approximates that by rearranging the children of a node
    according to their request times. The predecessor of each child is made to
    be the sibling with the greatest end time that is still before the start
    time of the child.
We do this by sorting the nodes twice, once by start time and once by end
time. We mark the earliest end time, and then we walk the start time list,
advancing the end time mark when it is less than our current start time.
This is refined by only considering assets which we believe actually create
a dependency. We only split if the original parent is a script, and the new
parent a data file. We confirm these relationships heuristically by loading
    pages multiple times and ensuring that dependencies do not change; see
CheckImageLoadConsistency() for details.
We incorporate this heuristic by skipping over any non-script/json resources
when moving the end mark.
    TODO(mattcary): More heuristics, like incorporating cacheability somehow, and
not just picking arbitrarily if there are two nodes with the same end time
(does that ever really happen?)
Args:
parent: the NodeInfo whose children we are going to rearrange.
"""
if parent.ContentType() not in self._CAN_BE_TIMING_PARENT:
return # No dependency changes.
children_by_start_time = [self._node_info[s.Index()]
for s in parent.Node().Successors()]
children_by_start_time.sort(key=lambda c: c.StartTime())
children_by_end_time = [self._node_info[s.Index()]
for s in parent.Node().Successors()]
children_by_end_time.sort(key=lambda c: c.EndTime())
end_mark = 0
for current in children_by_start_time:
if current.StartTime() < parent.EndTime() - 1e-5:
logging.warning('Child loaded before parent finished: %s -> %s',
parent.ShortName(), current.ShortName())
go_to_next_child = False
while end_mark < len(children_by_end_time):
if children_by_end_time[end_mark] == current:
go_to_next_child = True
break
elif (children_by_end_time[end_mark].ContentType() not in
self._CAN_MAKE_TIMING_DEPENDENCE):
end_mark += 1
elif (end_mark < len(children_by_end_time) - 1 and
children_by_end_time[end_mark + 1].EndTime() <
current.StartTime()):
end_mark += 1
else:
break
if end_mark >= len(children_by_end_time):
break # It's not possible to rearrange any more children.
if go_to_next_child:
continue # We can't rearrange this child, but the next child may be
# eligible.
if children_by_end_time[end_mark].EndTime() <= current.StartTime():
current.ReparentTo(parent, children_by_end_time[end_mark])
children_by_end_time[end_mark].AddEdgeAnnotation(current, 'timing')
def _ContentTypeToColor(self, content_type):
if not content_type:
type_str = 'other'
elif '/' in content_type:
kind, type_str = content_type.split('/', 1)
if kind in self._CONTENT_KIND_TO_COLOR:
return self._CONTENT_KIND_TO_COLOR[kind]
else:
type_str = content_type
return self._CONTENT_TYPE_TO_COLOR[type_str]
def _GraphvizNode(self, index, highlight):
"""Returns a graphviz node description for a given node.
Args:
index: index of the node.
highlight: a list of node items to emphasize. Any resource url which
contains any highlight text will be distinguished in the output.
Returns:
A string describing the resource in graphviz format.
The resource is color-coded according to its content type, and its shape
      is oval if its max-age is at most 300s (or if it is not cacheable).
"""
node_info = self._node_info[index]
color = self._ContentTypeToColor(node_info.ContentType())
max_age = node_info.Request().MaxAge()
shape = 'polygon' if max_age > 300 else 'oval'
styles = ['filled']
if highlight:
for fragment in highlight:
if fragment in node_info.Url():
styles.append('dotted')
break
if node_info.IsAd() or node_info.IsTracking():
styles += ['bold', 'diagonals']
return ('%d [label = "%s\\n%.2f->%.2f (%.2f)"; style = "%s"; '
'fillcolor = %s; shape = %s];\n'
% (index, node_info.ShortName(),
node_info.StartTime() - self._global_start,
node_info.EndTime() - self._global_start,
node_info.EndTime() - node_info.StartTime(),
','.join(styles), color, shape))
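  # A node line emitted by _GraphvizNode looks roughly like (hypothetical
  # values):
  #   3 [label = "example.com/..veryl\n12.00->45.00 (33.00)"; style = "filled";
  #   fillcolor = blue; shape = polygon];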
def _ExtractImages(self):
"""Return interesting image resources.
Uninteresting image resources are things like ads that we don't expect to be
constant across fetches.
Returns:
Dict of image url + short name to NodeInfo.
"""
image_to_info = {}
for n in self._node_info:
if (n.ContentType().startswith('image') and
not self._IsAdUrl(n.Url())):
key = str((n.Url(), n.ShortName(), n.StartTime()))
assert key not in image_to_info, n.Url()
image_to_info[key] = n
return image_to_info