# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Opens and modifies WPR archives.
"""

import collections
import os
import re
import sys
from urlparse import urlparse


_SRC_DIR = os.path.abspath(os.path.join(
    os.path.dirname(__file__), '..', '..', '..'))

_WEBPAGEREPLAY_DIR = os.path.join(_SRC_DIR, 'third_party', 'webpagereplay')
_WEBPAGEREPLAY_HTTPARCHIVE = os.path.join(_WEBPAGEREPLAY_DIR, 'httparchive.py')

sys.path.append(_WEBPAGEREPLAY_DIR)
import httparchive

# Regex used to parse httparchive.py's stdout when listing all urls.
_PARSE_WPR_REQUEST_REGEX = re.compile(r'^\S+\s+(?P<url>\S+)')
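# For instance, a line shaped like (illustrative; the exact httparchive.py
# output format may differ):
#   GET https://www.example.com/style.css
# yields url='https://www.example.com/style.css'.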

# Regex used to extract the WPR domain from a WPR log.
_PARSE_WPR_DOMAIN_REGEX = re.compile(r'^\(WARNING\)\s.*\sHTTP server started on'
                                     r' (?P<netloc>\S+)\s*$')
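# For instance, a WPR log line shaped like (illustrative):
#   (WARNING) 2016-01-01 00:00:00,000 HTTP server started on 127.0.0.1:4080
# yields netloc='127.0.0.1:4080'.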

# Regex used to extract requested URLs from a WPR log.
_PARSE_WPR_URL_REGEX = re.compile(
    r'^\((?P<level>\S+)\)\s.*\shttpproxy\..*\s(?P<method>[A-Z]+)\s+'
    r'(?P<url>https?://[a-zA-Z0-9\-_:.]+/?\S*)\s.*$')
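# For instance, a WPR log line shaped like (illustrative):
#   (DEBUG) 2016-01-01 00:00:00,000 httpproxy.handler GET https://a.com/ done
# yields level='DEBUG', method='GET' and url='https://a.com/'.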


class WprUrlEntry(object):
  """WPR URL entry holding request and response info."""

  def __init__(self, wpr_request, wpr_response):
    self._wpr_response = wpr_response
    self.url = self._ExtractUrl(str(wpr_request))

  def GetResponseHeadersDict(self):
    """Gets a copy of the response headers as a dictionary.

    Returns:
      dict(name -> value). Header names are lower-cased, and the values of
      headers that appear several times are joined with ','.
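
    Example (illustrative): two 'Set-Cookie' response headers with values
    'a=1' and 'b=2' come back as {'set-cookie': 'a=1,b=2'}.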
|  | """ | 
|  | headers = collections.defaultdict(list) | 
|  | for (key, value) in self._wpr_response.original_headers: | 
|  | headers[key.lower()].append(value) | 
|  | return {k: ','.join(v) for (k, v) in headers.items()} | 
|  |  | 
|  | def SetResponseHeader(self, name, value): | 
|  | """Set a header value. | 
|  |  | 
|  | In the case where the <name> response header is present more than once | 
|  | in the response header list, then the given value is set only to the first | 
|  | occurrence of that given headers, and the next ones are removed. | 
|  |  | 
|  | Args: | 
|  | name: The name of the response header to set. | 
|  | value: The value of the response header to set. | 
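
    Example (illustrative, where entry is a WprUrlEntry):
      entry.SetResponseHeader('cache-control', 'max-age=0')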
|  | """ | 
|  | assert name.islower() | 
|  | new_headers = [] | 
|  | new_header_set = False | 
|  | for header in self._wpr_response.original_headers: | 
|  | if header[0].lower() != name: | 
|  | new_headers.append(header) | 
|  | elif not new_header_set: | 
|  | new_header_set = True | 
|  | new_headers.append((header[0], value)) | 
|  | if new_header_set: | 
|  | self._wpr_response.original_headers = new_headers | 
|  | else: | 
|  | self._wpr_response.original_headers.append((name, value)) | 
|  |  | 
|  | def DeleteResponseHeader(self, name): | 
|  | """Delete a header. | 
|  |  | 
|  | In the case where the <name> response header is present more than once | 
|  | in the response header list, this method takes care of removing absolutely | 
|  | all them. | 
|  |  | 
|  | Args: | 
|  | name: The name of the response header field to delete. | 
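
    Example (illustrative, where entry is a WprUrlEntry):
      entry.DeleteResponseHeader('pragma')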
|  | """ | 
|  | assert name.islower() | 
|  | self._wpr_response.original_headers = \ | 
|  | [x for x in self._wpr_response.original_headers if x[0].lower() != name] | 
|  |  | 
|  | def RemoveResponseHeaderDirectives(self, name, directives_blacklist): | 
|  | """Removed a set of directives from response headers. | 
|  |  | 
|  | Also removes the cache header in case no more directives are left. | 
|  | It is useful, for example, to remove 'no-cache' from 'pragma: no-cache'. | 
|  |  | 
|  | Args: | 
|  | name: The name of the response header field to modify. | 
|  | directives_blacklist: Set of lowered directives to remove from list. | 
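
    Example (illustrative, where entry is a WprUrlEntry):
      entry.RemoveResponseHeaderDirectives('pragma', {'no-cache'})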
|  | """ | 
|  | response_headers = self.GetResponseHeadersDict() | 
|  | if name not in response_headers: | 
|  | return | 
|  | new_value = [] | 
|  | for header_name in response_headers[name].split(','): | 
|  | if header_name.strip().lower() not in directives_blacklist: | 
|  | new_value.append(header_name) | 
|  | if new_value: | 
|  | self.SetResponseHeader(name, ','.join(new_value)) | 
|  | else: | 
|  | self.DeleteResponseHeader(name) | 
|  |  | 
|  | @classmethod | 
|  | def _ExtractUrl(cls, request_string): | 
|  | match = _PARSE_WPR_REQUEST_REGEX.match(request_string) | 
|  | assert match, 'Looks like there is an issue with: {}'.format(request_string) | 
|  | return match.group('url') | 
|  |  | 
|  |  | 
|  | class WprArchiveBackend(object): | 
|  | """WPR archive back-end able to read and modify. """ | 
|  |  | 
|  | def __init__(self, wpr_archive_path): | 
|  | """Constructor: | 
|  |  | 
|  | Args: | 
|  | wpr_archive_path: The path of the WPR archive to read/modify. | 
|  | """ | 
|  | self._wpr_archive_path = wpr_archive_path | 
|  | self._http_archive = httparchive.HttpArchive.Load(wpr_archive_path) | 
|  |  | 
|  | def ListUrlEntries(self): | 
|  | """Iterates over all url entries | 
|  |  | 
|  | Returns: | 
|  | A list of WprUrlEntry. | 
|  | """ | 
|  | return [WprUrlEntry(request, self._http_archive[request]) | 
|  | for request in self._http_archive.get_requests()] | 
|  |  | 
|  | def Persist(self): | 
|  | """Persists the archive to disk. """ | 
|  | for request in self._http_archive.get_requests(): | 
|  | response = self._http_archive[request] | 
|  | response.headers = response._TrimHeaders(response.original_headers) | 
|  | self._http_archive.Persist(self._wpr_archive_path) | 
|  |  | 
|  |  | 
|  | # WPR request seen by the WPR's HTTP proxy. | 
|  | #   is_served: Boolean whether WPR has found a matching resource in the archive. | 
|  | #   method: HTTP method of the request ['GET', 'POST' and so on...]. | 
|  | #   url: The requested URL. | 
|  | #   is_wpr_host: Whether the requested url have WPR has an host such as: | 
|  | #     http://127.0.0.1:<WPR's HTTP listening port>/web-page-replay-command-exit | 
|  | WprRequest = collections.namedtuple('WprRequest', | 
|  | ['is_served', 'method', 'url', 'is_wpr_host']) | 
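# For instance (illustrative):
#   WprRequest(is_served=True, method='GET',
#              url='https://www.example.com/', is_wpr_host=False)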


def ExtractRequestsFromLog(log_path):
  """Extracts the list of requests handled by WPR's HTTP proxy from a WPR log.

  Args:
    log_path: The path of the WPR log to parse.

  Returns:
    List of WprRequest.
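
  Example (illustrative; the log path is hypothetical):
    requests = ExtractRequestsFromLog('/tmp/wpr.log')
    served_urls = set(r.url for r in requests if r.is_served)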
|  | """ | 
|  | requests = [] | 
|  | wpr_http_netloc = None | 
|  | with open(log_path) as log_file: | 
|  | for line in log_file.readlines(): | 
|  | # Extract WPR's HTTP proxy's listening network location. | 
|  | match = _PARSE_WPR_DOMAIN_REGEX.match(line) | 
|  | if match: | 
|  | wpr_http_netloc = match.group('netloc') | 
|  | assert wpr_http_netloc.startswith('127.0.0.1:') | 
|  | continue | 
|  | # Extract the WPR requested URLs. | 
|  | match = _PARSE_WPR_URL_REGEX.match(line) | 
|  | if match: | 
|  | parsed_url = urlparse(match.group('url')) | 
|  | # Ignore strange URL requests such as http://ousvtzkizg/ | 
|  | # TODO(gabadie): Find and terminate the location where they are queried. | 
|  | if '.' not in parsed_url.netloc and ':' not in parsed_url.netloc: | 
|  | continue | 
|  | assert wpr_http_netloc | 
|  | request = WprRequest(is_served=(match.group('level') == 'DEBUG'), | 
|  | method=match.group('method'), url=match.group('url'), | 
|  | is_wpr_host=parsed_url.netloc == wpr_http_netloc) | 
|  | requests.append(request) | 
|  | return requests | 
|  |  | 
|  |  | 
|  | if __name__ == '__main__': | 
|  | import argparse | 
|  | parser = argparse.ArgumentParser(description='Tests cache back-end.') | 
|  | parser.add_argument('wpr_archive', type=str) | 
|  | command_line_args = parser.parse_args() | 
|  |  | 
|  | wpr_backend = WprArchiveBackend(command_line_args.wpr_archive) | 
|  | url_entries = wpr_backend.ListUrlEntries() | 
|  | print url_entries[0].url | 
|  | wpr_backend.Persist() |