# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
""" This module implements the Stale-While-Revalidate performance improvement
experiment on third parties' resources.
The top level operations of the experiment are:
1. Record WPR archive;
2. Create a patched WPR archive so that all resource are getting cached;
3. Record original cache using the patched WPR archive;
4. Setup the benchmark producing the list of URL to enable SWR in a JSON file;
5. Create the benchmark cache by:
- Remove No-Store resources;
- Adding the SWR header on resources that are experimentally required to
have it;
- Patch SWR header on resources that already had it to make sure the
the SWR freshness is not out of date;
- And restore all other headers so that response headers such as
Set-Cookie are still in the cache to avoid entropy caused by
different cookie values.
6. Run the benchmark;
7. Extract metrics into CSV files.
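For illustration, a cached resource forced to use SWR in step 5 ends up with
response headers such as (values taken from _BuildBenchmarkCache below):
  Cache-Control: max-age=0,stale-while-revalidate=315360000
  Last-Modified: Thu, 23 Jun 2016 11:30:00 GMT
i.e. the entry is immediately stale (max-age=0) but may still be reused while
an asynchronous revalidation is allowed for up to 315360000 seconds (~10
years).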
"""
import csv
import json
import logging
import os
import shutil
from urlparse import urlparse
import chrome_cache
import common_util
import loading_trace
import request_track
import sandwich_metrics
import sandwich_runner
import sandwich_utils
import task_manager
import wpr_backend
def _ExtractRegexMatchingUrls(urls, domain_regexes):
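  """Returns the subset of |urls| whose domain matches one of |domain_regexes|.
  Illustrative example (hypothetical URLs):
    _ExtractRegexMatchingUrls(
        ['https://cdn.example.com/lib.js', 'https://example.org/'],
        [re.compile(r'cdn\.example\.com')])
    would return set(['https://cdn.example.com/lib.js']).
  Args:
    urls: Iterable of URL strings.
    domain_regexes: Compiled regexes matched against each URL's netloc.
  Returns:
    The set of URLs matching at least one regex.
  """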
urls_to_enable = set()
for url in urls:
if url in urls_to_enable:
continue
parsed_url = urlparse(url)
for domain_regex in domain_regexes:
if domain_regex.search(parsed_url.netloc):
urls_to_enable.add(url)
break
return urls_to_enable
def _BuildBenchmarkCache(
original_wpr_trace_path, urls_to_enable_swr,
original_cache_trace_path, original_cache_archive_path,
cache_archive_dest_path):
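  """Builds the benchmark cache archive by patching cached response headers.
  Args:
    original_wpr_trace_path: Path of the trace recorded during the original
      WPR recording, used to compute each resource's original caching policy.
    urls_to_enable_swr: Set of URLs that must be patched to use SWR.
    original_cache_trace_path: Path of the trace recorded while the original
      cache was built.
    original_cache_archive_path: Path of the original cache archive (zip).
    cache_archive_dest_path: Destination path of the patched cache archive.
  """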
  # Load trace that was generated during the original WPR recording.
logging.info('loading %s', original_wpr_trace_path)
trace = loading_trace.LoadingTrace.FromJsonFile(original_wpr_trace_path)
  # Collect URLs that should not be cached and URLs that already have SWR
  # headers.
urls_should_not_be_cached = set()
urls_already_with_swr = set()
for request in trace.request_track.GetEvents():
caching_policy = request_track.CachingPolicy(request)
if not caching_policy.IsCacheable():
urls_should_not_be_cached.add(request.url)
elif caching_policy.GetFreshnessLifetimes()[1] > 0:
urls_already_with_swr.add(request.url)
  # Traces are large; delete this one to free memory for the next one loaded
  # in this scope.
del trace
# Load trace that was generated at original cache creation.
logging.info('loading %s', original_cache_trace_path)
trace = loading_trace.LoadingTrace.FromJsonFile(original_cache_trace_path)
# Create cache contents.
delete_count = 0
swr_patch_count = 0
originaly_swr_patch_count = 0
noswr_patch_count = 0
with common_util.TemporaryDirectory(prefix='sandwich_tmp') as tmp_path:
cache_path = os.path.join(tmp_path, 'cache')
chrome_cache.UnzipDirectoryContent(original_cache_archive_path, cache_path)
cache_backend = chrome_cache.CacheBackend(cache_path, 'simple')
cache_keys = set(cache_backend.ListKeys())
for request in trace.request_track.GetEvents():
if request.url not in cache_keys:
continue
if request.url in urls_should_not_be_cached:
cache_backend.DeleteKey(request.url)
delete_count += 1
continue
if not request.HasReceivedResponse():
continue
if request.url in urls_to_enable_swr:
request.SetHTTPResponseHeader(
'cache-control', 'max-age=0,stale-while-revalidate=315360000')
request.SetHTTPResponseHeader(
'last-modified', 'Thu, 23 Jun 2016 11:30:00 GMT')
swr_patch_count += 1
elif request.url in urls_already_with_swr:
        # Force SWR on resources that originally attempted to use it.
request.SetHTTPResponseHeader(
'cache-control', 'max-age=0,stale-while-revalidate=315360000')
        # The resource originally had SWR enabled, therefore we don't set
        # Last-Modified, to reproduce exactly the performance impact in case
        # these headers were not set properly, causing an invalidation instead
        # of a revalidation.
originaly_swr_patch_count += 1
else:
# Force synchronous revalidation.
request.SetHTTPResponseHeader('cache-control', 'max-age=0')
noswr_patch_count += 1
raw_headers = request.GetRawResponseHeaders()
cache_backend.UpdateRawResponseHeaders(request.url, raw_headers)
chrome_cache.ZipDirectoryContent(cache_path, cache_archive_dest_path)
logging.info('patched %d cached resources with forced SWR', swr_patch_count)
logging.info('patched %d cached resources with original SWR',
originaly_swr_patch_count)
logging.info('patched %d cached resources without SWR', noswr_patch_count)
logging.info('deleted %d cached resources', delete_count)
def _ProcessRunOutputDir(benchmark_setup, runner_output_dir):
"""Process benchmark's run output directory.
Args:
cache_validation_result: Same as for _RunOutputVerifier
benchmark_setup: Same as for _RunOutputVerifier
runner_output_dir: Same as for SandwichRunner.output_dir
Returns:
List of dictionary.
"""
run_metrics_list = []
for repeat_id, repeat_dir in sandwich_runner.WalkRepeatedRuns(
runner_output_dir):
trace_path = os.path.join(repeat_dir, sandwich_runner.TRACE_FILENAME)
logging.info('processing trace: %s', trace_path)
trace = loading_trace.LoadingTrace.FromJsonFile(trace_path)
served_from_cache_urls = sandwich_utils.ListUrlRequests(
trace, sandwich_utils.RequestOutcome.ServedFromCache)
matching_subresource_count_used_from_cache = (
served_from_cache_urls.intersection(
set(benchmark_setup['urls_to_enable_swr'])))
run_metrics = {
'url': trace.url,
'repeat_id': repeat_id,
'benchmark_name': benchmark_setup['benchmark_name'],
'cache_recording.subresource_count':
len(benchmark_setup['effective_subresource_urls']),
'cache_recording.matching_subresource_count':
len(benchmark_setup['urls_to_enable_swr']),
'benchmark.matching_subresource_count_used_from_cache':
len(matching_subresource_count_used_from_cache)
}
run_metrics.update(
sandwich_metrics.ExtractCommonMetricsFromRepeatDirectory(
repeat_dir, trace))
run_metrics_list.append(run_metrics)
return run_metrics_list
class StaleWhileRevalidateBenchmarkBuilder(task_manager.Builder):
"""A builder for a graph of tasks for Stale-While-Revalidate study benchmarks.
"""
def __init__(self, common_builder):
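    """Constructor.
    Args:
      common_builder: The shared builder that provides the output directories,
        the original WPR recording task and the SandwichRunner factory.
    """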
task_manager.Builder.__init__(self,
common_builder.output_directory,
common_builder.output_subdirectory)
self._common_builder = common_builder
self._patched_wpr_path = None
self._original_cache_task = None
self._original_cache_trace_path = None
self._PopulateCommonPipelines()
def _PopulateCommonPipelines(self):
"""Creates necessary tasks to produce initial cache archives.
    Here is the full dependency tree of the tasks this method creates:
depends on: common/original-cache.zip
depends on: common/webpages-patched.wpr
depends on: common/webpages.wpr
"""
@self.RegisterTask('common/webpages-patched.wpr',
dependencies=[self._common_builder.original_wpr_task])
def BuildPatchedWpr():
shutil.copyfile(
self._common_builder.original_wpr_task.path, BuildPatchedWpr.path)
wpr_archive = wpr_backend.WprArchiveBackend(BuildPatchedWpr.path)
wpr_url_entries = wpr_archive.ListUrlEntries()
for wpr_url_entry in wpr_url_entries:
sandwich_utils.PatchWprEntryToBeCached(wpr_url_entry)
logging.info('number of patched entries: %d', len(wpr_url_entries))
wpr_archive.Persist()
@self.RegisterTask('common/original-cache.zip',
dependencies=[BuildPatchedWpr])
def BuildOriginalCache():
runner = self._common_builder.CreateSandwichRunner()
runner.wpr_archive_path = BuildPatchedWpr.path
runner.cache_archive_path = BuildOriginalCache.path
runner.cache_operation = sandwich_runner.CacheOperation.SAVE
runner.output_dir = BuildOriginalCache.run_path
runner.Run()
BuildOriginalCache.run_path = BuildOriginalCache.path[:-4] + '-run'
self._original_cache_trace_path = os.path.join(
BuildOriginalCache.run_path, '0', sandwich_runner.TRACE_FILENAME)
self._patched_wpr_path = BuildPatchedWpr.path
self._original_cache_task = BuildOriginalCache
def PopulateBenchmark(self, benchmark_name, domain_regexes,
transformer_list_name, transformer_list):
"""Populate benchmarking tasks.
Args:
benchmark_name: Name of the benchmark.
domain_regexes: Compiled regexes of domains to enable SWR.
      transformer_list_name: A string describing the transformers; it is used
        in task names (prefer names without spaces or special characters).
      transformer_list: An ordered list of functions that take a SandwichRunner
        instance as parameter; they are applied in the given order immediately
        before SandwichRunner.Run().
    Here is the full dependency tree added for the returned task:
<transformer_list_name>/<benchmark_name>-metrics.csv
depends on: <transformer_list_name>/<benchmark_name>-run/
depends on: common/<benchmark_name>-cache.zip
depends on: common/<benchmark_name>-setup.json
depends on: common/patched-cache.zip
"""
additional_column_names = [
'url',
'repeat_id',
'benchmark_name',
# Number of resources of the page.
'cache_recording.subresource_count',
        # Number of resources matching at least one domain regex, to give an
        # idea in the CSV of how much the threshold influences additional SWR
        # use.
'cache_recording.matching_subresource_count',
        # Number of resources fetched from cache that match at least one domain
        # regex, to give an idea whether a performance improvement of the page
        # load is possible at all (or not, e.g. because the matching resources
        # are only XHRs), and whether the page load time should improve
        # compared with different thresholds.
'benchmark.matching_subresource_count_used_from_cache']
shared_task_prefix = os.path.join('common', benchmark_name)
task_prefix = os.path.join(transformer_list_name, benchmark_name)
@self.RegisterTask(shared_task_prefix + '-setup.json', merge=True,
dependencies=[self._original_cache_task])
def SetupBenchmark():
logging.info('loading %s', self._original_cache_trace_path)
trace = loading_trace.LoadingTrace.FromJsonFile(
self._original_cache_trace_path)
logging.info('generating %s', SetupBenchmark.path)
effective_subresource_urls = sandwich_utils.ListUrlRequests(
trace, sandwich_utils.RequestOutcome.All)
urls_to_enable_swr = _ExtractRegexMatchingUrls(
effective_subresource_urls, domain_regexes)
logging.info(
'count of urls to enable SWR: %s', len(urls_to_enable_swr))
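      # The generated setup JSON has the following shape (URLs illustrative):
      #   {"benchmark_name": "...",
      #    "urls_to_enable_swr": ["https://cdn.example.com/lib.js", ...],
      #    "effective_subresource_urls": ["https://...", ...]}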
with open(SetupBenchmark.path, 'w') as output:
json.dump({
'benchmark_name': benchmark_name,
          'urls_to_enable_swr': list(urls_to_enable_swr),
          'effective_subresource_urls': list(effective_subresource_urls)
}, output)
@self.RegisterTask(shared_task_prefix + '-cache.zip', merge=True,
dependencies=[SetupBenchmark])
def BuildBenchmarkCacheArchive():
      with open(SetupBenchmark.path) as setup_file:
        benchmark_setup = json.load(setup_file)
_BuildBenchmarkCache(
original_wpr_trace_path=(
self._common_builder.original_wpr_recording_trace_path),
urls_to_enable_swr=set(benchmark_setup['urls_to_enable_swr']),
original_cache_trace_path=self._original_cache_trace_path,
original_cache_archive_path=self._original_cache_task.path,
cache_archive_dest_path=BuildBenchmarkCacheArchive.path)
@self.RegisterTask(task_prefix + '-run/', [BuildBenchmarkCacheArchive])
def RunBenchmark():
runner = self._common_builder.CreateSandwichRunner()
for transformer in transformer_list:
transformer(runner)
runner.wpr_archive_path = self._patched_wpr_path
runner.wpr_out_log_path = os.path.join(
RunBenchmark.path, sandwich_runner.WPR_LOG_FILENAME)
runner.cache_archive_path = BuildBenchmarkCacheArchive.path
runner.cache_operation = sandwich_runner.CacheOperation.PUSH
runner.output_dir = RunBenchmark.path
runner.chrome_args.append('--enable-features=StaleWhileRevalidate2')
runner.Run()
@self.RegisterTask(task_prefix + '-metrics.csv', [RunBenchmark])
def ExtractMetrics():
      with open(SetupBenchmark.path) as setup_file:
        benchmark_setup = json.load(setup_file)
run_metrics_list = _ProcessRunOutputDir(
benchmark_setup, RunBenchmark.path)
run_metrics_list.sort(key=lambda e: e['repeat_id'])
with open(ExtractMetrics.path, 'w') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=(additional_column_names +
sandwich_metrics.COMMON_CSV_COLUMN_NAMES))
writer.writeheader()
for run_metrics in run_metrics_list:
writer.writerow(run_metrics)
self._common_builder.default_final_tasks.append(ExtractMetrics)
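# Minimal usage sketch (hypothetical caller; the regex and names below are
# placeholders, not part of this module):
#
#   import re
#   builder = StaleWhileRevalidateBenchmarkBuilder(common_builder)
#   builder.PopulateBenchmark(
#       benchmark_name='third-party',
#       domain_regexes=[re.compile(r'\.example-cdn\.com$')],
#       transformer_list_name='default',
#       transformer_list=[])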