# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Compute the core set for a page.

This script is a collection of utilities for working with core sets.
"""
import argparse
import glob
import json
import logging
import multiprocessing
import os
import sys

import dependency_graph
import loading_trace
import request_dependencies_lens
import resource_sack


def _Progress(x):
  """Write a progress message to stderr."""
  sys.stderr.write(x + '\n')


def _PageCore(prefix, graph_set_names, output):
"""Compute the page core over sets defined by graph_set_names."""
assert graph_set_names
graph_sets = []
sack = resource_sack.GraphSack()
for name in graph_set_names:
name_graphs = []
_Progress('Processing %s' % name)
for filename in glob.iglob('-'.join([prefix, name, '*.trace'])):
_Progress('Reading %s' % filename)
trace = loading_trace.LoadingTrace.FromJsonFile(filename)
graph = dependency_graph.RequestDependencyGraph(
trace.request_track.GetEvents(),
request_dependencies_lens.RequestDependencyLens(trace))
sack.ConsumeGraph(graph)
name_graphs.append(graph)
graph_sets.append(name_graphs)
core = sack.CoreSet(*graph_sets)
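  # Shape of the emitted JSON (keys match the dump below; angle-bracketed
  # values are placeholders, not real data):
  #   {"non_core": [{"count": <n>, "label": <label>, "name": <name>}, ...],
  #    "page_core": [{"count": <n>, "label": <label>, "name": <name>}, ...],
  #    "threshold": <GraphSack.CORE_THRESHOLD>}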
  json.dump({'page_core': [{'label': b.label,
                            'name': b.name,
                            'count': b.num_nodes}
                           for b in core],
             'non_core': [{'label': b.label,
                           'name': b.name,
                           'count': b.num_nodes}
                          for b in sack.bags if b not in core],
             'threshold': sack.CORE_THRESHOLD},
            output, sort_keys=True, indent=2)
  output.write('\n')


def _DoSite(site, graph_sets, input_dir, output_dir):
  """Compute the appropriate page core for a site.

  Used by _Spawn.
  """
  _Progress('Doing %s on %s' % (site, '/'.join(graph_sets)))
  prefix = os.path.join(input_dir, site)
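  # Writes one file per site, named
  # {output_dir}/{site}-{sets joined by '.'}.json.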
  with open(os.path.join(output_dir,
                         '%s-%s.json' % (site, '.'.join(graph_sets))),
            'w') as output:
    _PageCore(prefix, graph_sets, output)


def _DoSiteRedirect(t):
  """Unpack arguments for a map call.

  Note that multiprocessing.Pool.map cannot use a lambda (as it needs to be
  serialized into the executing process).
  """
  _DoSite(*t)


def _Spawn(site_list_file, graph_sets, input_dir, output_dir, workers):
"""Spool site computation out to a multiprocessing pool."""
with file(site_list_file) as site_file:
sites = [l.strip() for l in site_file.readlines()]
_Progress('Using sites:\n %s' % '\n '.join(sites))
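  # maxtasksperchild=1 below makes the pool replace each worker process after
  # it has handled a single site, so memory used for one site's traces is
  # released before the next site is processed (see the --workers help below).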
  pool = multiprocessing.Pool(workers, maxtasksperchild=1)
  pool.map(_DoSiteRedirect, [(s, graph_sets, input_dir, output_dir)
                             for s in sites])


def _ReadCoreSet(filename):
  """Read a core set JSON file and return the set of names in its page core."""
  with open(filename) as core_file:
    data = json.load(core_file)
  return set(page['name'] for page in data['page_core'])


def _Compare(a_name, b_name, csv):
  """Compare two core sets."""
  a = _ReadCoreSet(a_name)
  b = _ReadCoreSet(b_name)
  result = (resource_sack.GraphSack.CoreSimilarity(a, b),
            ' Equal' if a == b else 'UnEqual',
            'a<=b' if a <= b else 'a!<b',
            'a>=b' if b <= a else 'a!>b')
  if csv:
    print '%s,%s,%s,%s' % result
  else:
    print '%.2f %s %s %s' % result


if __name__ == '__main__':
  logging.basicConfig(level=logging.ERROR)
  parser = argparse.ArgumentParser()
  subparsers = parser.add_subparsers()
  spawn = subparsers.add_parser(
      'spawn',
      help=('spawn page core set computation from a list of sites.\n'
            'A core set will be computed for each site by combining all '
            'run indices from site traces for each --set, then computing '
            'the page core over the sets. Assumes trace file names of the '
            'form {input-dir}/{site}-{set}-{run index}.trace'))
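  # For example (illustrative values), --input_dir=traces, site example.com,
  # set 'cold' and run index 0 would be read from
  # traces/example.com-cold-0.trace.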
  spawn.add_argument('--sets', required=True,
                     help='sets to combine, comma-separated')
  spawn.add_argument('--sites', required=True, help='file containing sites')
  spawn.add_argument('--workers', default=8, type=int,
                     help=('number of parallel workers. Each worker seems to '
                           'use about 0.5-1 GB per trace when processing. '
                           'Total memory usage should be kept below physical '
                           'memory for the job to run in a reasonable time'))
  spawn.add_argument('--input_dir', required=True,
                     help='trace input directory')
  spawn.add_argument('--output_dir', required=True,
                     help=('core set output directory. Each site will have '
                           'one JSON file generated, listing the core set '
                           'as well as metadata such as the threshold used'))
  spawn.set_defaults(executor=lambda args:
                     _Spawn(site_list_file=args.sites,
                            graph_sets=args.sets.split(','),
                            input_dir=args.input_dir,
                            output_dir=args.output_dir,
                            workers=args.workers))
  page_core = subparsers.add_parser(
      'page_core',
      help=('compute the page core set over groups of trace files of the '
            'form {--prefix}-{set}-*.trace, one group per set in --sets'))
  page_core.add_argument('--sets', required=True,
                         help='sets to combine, comma-separated')
  page_core.add_argument('--prefix', required=True,
                         help='trace file prefix')
  page_core.add_argument('--output', required=True,
                         help='JSON output file name')
  page_core.set_defaults(executor=lambda args:
                         _PageCore(args.prefix, args.sets.split(','),
                                   open(args.output, 'w')))
  compare = subparsers.add_parser(
      'compare',
      help=('compare two core sets (as output by spawn, page_core or '
            'all_cores) using the Jaccard index. Output goes to stdout'))
  compare.add_argument('--a', required=True, help='the first core set JSON')
  compare.add_argument('--b', required=True, help='the second core set JSON')
  compare.add_argument('--csv', action='store_true', help='output as CSV')
  compare.set_defaults(
      executor=lambda args:
          _Compare(args.a, args.b, args.csv))
  args = parser.parse_args()
  args.executor(args)