blob: 0e658d40af77a5184696e61f52c1c58b72135953 [file] [log] [blame]
# Copyright (c) 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
from __future__ import print_function
import codecs
import os
import logging
import json
import re
import shutil
import sys
from io import BytesIO
import six.moves.urllib.request # pylint: disable=import-error
from telemetry.core import util
from telemetry.internal.browser import browser_finder
from telemetry.internal.browser import browser_options
# Suffix that every snapshot path is required to end with.
HTML_SUFFIX = '.html'
# Matches a query string: the first '?' and everything that follows it.
STRIP_QUERY_PARAM_REGEX = re.compile(r'\?.*$')
# Timeout, in seconds, given to the expensive JavaScript calls (4 minutes).
EXPENSIVE_JS_TIMEOUT_SECONDS = 4 * 60
def _TransmitLargeJSONToTab(tab, json_obj, js_holder_name):
tab.ExecuteJavaScript(
'var {{ @js_holder_name }} = "";', js_holder_name=js_holder_name)
# To avoid crashing devtool connection (details in crbug.com/763119#c16),
# we break down the json string to chunks which each chunk has a maximum
# size of 100000 characters (100000 seems to not break the connection and
# makes sending data reasonably fast).
k = 0
step_size = 100000
json_obj_string = json.dumps(json_obj)
while k < len(json_obj_string):
sub_string_chunk = json_obj_string[k: k + step_size]
k += step_size
tab.ExecuteJavaScript(
'{{ @js_holder_name }} += {{ sub_string_chunk }};',
js_holder_name=js_holder_name, sub_string_chunk=sub_string_chunk)
tab.ExecuteJavaScript(
'{{ @js_holder_name }} = JSON.parse({{ @js_holder_name }});',
js_holder_name=js_holder_name)
def _CreateBrowser(finder_options, enable_browser_log):
  """Finds a browser for |finder_options| and returns its browser session.

  When |enable_browser_log| is true, the browser options are first switched
  to NON_VERBOSE_LOGGING and the cloud bucket for logs is cleared.
  """
  if enable_browser_log:
    options = finder_options.browser_options
    # NON_VERBOSE_LOGGING also contains devtool's console logs.
    options.logging_verbosity = (
        browser_options.BrowserOptions.NON_VERBOSE_LOGGING)
    # Do not upload the log to cloud storage.
    options.logs_cloud_bucket = None

  browser_to_launch = browser_finder.FindBrowser(finder_options)
  session = browser_to_launch.BrowserSession(finder_options.browser_options)
  return session
def _ReadSnapItSource(path):
  """Returns the contents of a snap-it source file.

  |path| is relative to the 'snap-it' directory inside the catapult
  third-party directory.
  """
  snap_it_root = os.path.join(util.GetCatapultThirdPartyDir(), 'snap-it')
  source_path = os.path.join(snap_it_root, path)
  with open(source_path) as source_file:
    contents = source_file.read()
  return contents
def _FetchImages(image_dir, frame_number, external_images):
if len(external_images) == 0:
return
image_count = len(external_images)
print('Fetching external images [local_dir=%s, frame_number=%d, '
'image_count=%d].' % (image_dir, frame_number, image_count))
for i in range(image_count):
[element_id, image_url] = external_images[i]
_, image_file_extension = os.path.splitext(image_url)
# Strip any query param and all subsequent characters. Note that
# we also do this JavaScript-side (see HTMLSerializer.fileSuffix),
# but the stripped file name isn't currently passed back in the
# interest of shipping less data around.
image_file_extension = STRIP_QUERY_PARAM_REGEX.sub('', image_file_extension)
image_file = os.path.join(image_dir, '%d-%s%s' % (
frame_number, element_id, image_file_extension))
sys.stdout.write('Fetching image #%i / %i\r' % (i, image_count))
sys.stdout.flush()
logging.info('Fetching image [frame_number=%d, %d/%d, local_file=%s, '
'url=%s].' % (frame_number, i, image_count, image_file,
image_url))
try:
image_request = six.moves.urllib.request.urlopen(image_url)
except IOError as e:
print('Error fetching image [local_file=%s, url=%s, message=%s].' % (
image_file, image_url, e))
continue
try:
with open(image_file, 'wb') as image_file_handle:
shutil.copyfileobj(BytesIO(image_request.read()), image_file_handle)
except IOError as e:
print('Error copying image [local_file=%s, url=%s, message=%s].' % (
image_file, image_url, e))
def _GetLocalImageDirectory(snapshot_path):
return os.path.splitext(snapshot_path)[0]
def _SnapPageToFile(finder_options, url, interactive, snapshot_path,
                    snapshot_file, enable_browser_log):
  """ Save the HTML snapshot of the page whose address is |url| to
  |snapshot_file|.

  Args:
    finder_options: Options handed to _CreateBrowser to find and start the
        browser.
    url: Address of the page to snapshot.
    interactive: If true, wait for the user to press enter before taking the
        snapshot so that they can interact with the page first.
    snapshot_path: Path of the snapshot file; the directory that receives the
        downloaded images is derived from it.
    snapshot_file: Writable file object that receives the combined HTML.
    enable_browser_log: Forwarded to _CreateBrowser to turn on browser logs.
  """
  with _CreateBrowser(finder_options, enable_browser_log) as browser:
    tab = browser.tabs[0]
    tab.Navigate(url)
    if interactive:
      # raw_input() does not exist on Python 3; six.moves.input maps to
      # raw_input() on Python 2 and input() on Python 3.
      six.moves.input(
          'Activating interactive mode. Press enter after you finish '
          "interacting with the page to snapshot the page's DOM content.")

    print('Snapshotting content of %s. This could take a while...' % url)
    tab.WaitForDocumentReadyStateToBeComplete()
    tab.action_runner.WaitForNetworkQuiescence(
        timeout_in_seconds=EXPENSIVE_JS_TIMEOUT_SECONDS)

    snapit_script = _ReadSnapItSource('HTMLSerializer.js')
    dom_combining_script = _ReadSnapItSource('popup.js')

    image_dir = _GetLocalImageDirectory(snapshot_path)
    if not os.path.exists(image_dir):
      os.mkdir(image_dir)

    serialized_doms = []
    # |external_images| holds, for each frame, a list of tuples as
    # (element id), (image src url) with the url as it was in the
    # original unmodified page html. We use the element id to construct
    # a page-unique local image filename. We use the url to fetch the
    # image from the external server.
    external_images = []

    # Serialize the dom in each frame.
    for frame_number, context_id in enumerate(tab.EnableAllContexts()):
      # Build a distinct local image path for each frame by including
      # the frame number as the prefix string for the eventual file.
      local_image_path = os.path.join(os.path.basename(image_dir),
                                      '%d-' % frame_number)
      tab.ExecuteJavaScript(snapit_script, context_id=context_id)
      # The path is passed as a template value rather than spliced into the
      # script text, so quotes or backslashes in it cannot break the script.
      tab.ExecuteJavaScript(
          '''
          var serializedDom;
          var htmlSerializer = new HTMLSerializer();
          htmlSerializer.setLocalImagePath({{ local_image_path }});
          htmlSerializer.processDocument(document);
          htmlSerializer.fillHolesAsync(document, function(s) {
            serializedDom = s.asDict();
          });
          ''', local_image_path=local_image_path, context_id=context_id,
          timeout=EXPENSIVE_JS_TIMEOUT_SECONDS)
      # fillHolesAsync reports its result through a callback, so wait until
      # the callback has stored it.
      tab.WaitForJavaScriptCondition(
          'serializedDom !== undefined', context_id=context_id)
      serialized_doms.append(tab.EvaluateJavaScript(
          'serializedDom', context_id=context_id))
      external_images.append(tab.EvaluateJavaScript(
          'htmlSerializer.externalImages', context_id=context_id))

    # Execute doms combining code in blank page to minimize the chance of V8
    # OOM.
    tab.Navigate('about:blank')
    tab.WaitForDocumentReadyStateToBeComplete()

    # Sending all the serialized doms back to tab execution context.
    tab.ExecuteJavaScript('var serializedDoms = [];')
    for i, dom in enumerate(serialized_doms):
      sys.stdout.write('Processing dom of frame #%i / %i\r' %
                       (i, len(serialized_doms)))
      sys.stdout.flush()
      _TransmitLargeJSONToTab(tab, dom, 'sub_dom')
      tab.ExecuteJavaScript('serializedDoms.push(sub_dom);')

    # Combine all the doms to one HTML string.
    tab.EvaluateJavaScript(dom_combining_script,
                           timeout=EXPENSIVE_JS_TIMEOUT_SECONDS)
    page_snapshot = tab.EvaluateJavaScript('outputHTMLString(serializedDoms);',
                                           timeout=EXPENSIVE_JS_TIMEOUT_SECONDS)

    print('Writing page snapshot [path=%s].' % snapshot_path)
    snapshot_file.write(page_snapshot)

    # The position in |external_images| is the frame number used when the
    # local image paths were generated above.
    for frame_index, frame_images in enumerate(external_images):
      _FetchImages(image_dir, frame_index, frame_images)
def SnapPage(finder_options, url, interactive, snapshot_path,
             enable_browser_log):
  """ Snapshot the page whose address is |url| into the HTML file at
  |snapshot_path|.

  |snapshot_path| may be relative; it is made absolute before use. It must
  end with HTML_SUFFIX, otherwise ValueError is raised. The file is written
  as UTF-8.
  """
  has_html_suffix = snapshot_path.endswith(HTML_SUFFIX)
  if not has_html_suffix:
    message = 'Snapshot path should end with \'%s\' [value=\'%s\'].' % (
        HTML_SUFFIX, snapshot_path)
    raise ValueError(message)

  snapshot_path = os.path.abspath(snapshot_path)
  with codecs.open(snapshot_path, 'w', 'utf-8') as snapshot_file:
    _SnapPageToFile(finder_options, url, interactive, snapshot_path,
                    snapshot_file, enable_browser_log)
  print('Successfully saved snapshot to file://%s' % snapshot_path)