telemetry/telemetry/internal/snap_page_util.py - catapult - Git at Google

 # Copyright (c) 2017 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 from __future__ import print_function
 import codecs
 import os
 import logging
 import json
 import re
 import shutil
 import sys
 from io import BytesIO

 import six.moves.urllib.request # pylint: disable=import-error
 from six.moves import input # pylint: disable=redefined-builtin

 from telemetry.core import util
 from telemetry.internal.browser import browser_finder
 from telemetry.internal.browser import browser_options


 HTML_SUFFIX = '.html'
 STRIP_QUERY_PARAM_REGEX = re.compile(r'\?.*$')
 EXPENSIVE_JS_TIMEOUT_SECONDS = 240

 def _TransmitLargeJSONToTab(tab, json_obj, js_holder_name):
   tab.ExecuteJavaScript(
       'var {{ @js_holder_name }} = "";', js_holder_name=js_holder_name)

   # To avoid crashing devtool connection (details in crbug.com/763119#c16),
   # we break down the json string to chunks which each chunk has a maximum
   # size of 100000 characters (100000 seems to not break the connection and
   # makes sending data reasonably fast).
   k = 0
   step_size = 100000
   json_obj_string = json.dumps(json_obj)
   while k < len(json_obj_string):
     sub_string_chunk = json_obj_string[k: k + step_size]
     k += step_size
     tab.ExecuteJavaScript(
         '{{ @js_holder_name }} += {{ sub_string_chunk }};',
         js_holder_name=js_holder_name, sub_string_chunk=sub_string_chunk)

   tab.ExecuteJavaScript(
       '{{ @js_holder_name }} = JSON.parse({{ @js_holder_name }});',
       js_holder_name=js_holder_name)


 def _CreateBrowser(finder_options, enable_browser_log):
   """ Finds a browser for |finder_options| and returns the result of its
   BrowserSession() call.

   When |enable_browser_log| is true, the browser options held by
   |finder_options| are first switched to NON_VERBOSE_LOGGING and the log
   cloud bucket is cleared.
   """
   if enable_browser_log:
     log_settings = finder_options.browser_options
     # NON_VERBOSE_LOGGING also contains devtool's console logs.
     log_settings.logging_verbosity = (
         browser_options.BrowserOptions.NON_VERBOSE_LOGGING)
     # Do not upload the log to cloud storage.
     log_settings.logs_cloud_bucket = None

   return browser_finder.FindBrowser(finder_options).BrowserSession(
       finder_options.browser_options)


 def _ReadSnapItSource(path):
   """ Reads a snap-it source file and returns its contents.

   |path| is resolved against the 'snap-it' directory located under the
   directory returned by util.GetCatapultThirdPartyDir().
   """
   snap_it_root = os.path.join(util.GetCatapultThirdPartyDir(), 'snap-it')
   with open(os.path.join(snap_it_root, path)) as source_file:
     contents = source_file.read()
   return contents


 def _FetchImages(image_dir, frame_number, external_images):
   if len(external_images) == 0:
     return

   image_count = len(external_images)
   print('Fetching external images [local_dir=%s, frame_number=%d, '
         'image_count=%d].' % (image_dir, frame_number, image_count))

   for i in range(image_count):
     [element_id, image_url] = external_images[i]
     _, image_file_extension = os.path.splitext(image_url)
     # Strip any query param and all subsequent characters. Note that
     # we also do this JavaScript-side (see HTMLSerializer.fileSuffix),
     # but the stripped file name isn't currently passed back in the
     # interest of shipping less data around.
     image_file_extension = STRIP_QUERY_PARAM_REGEX.sub('', image_file_extension)
     image_file = os.path.join(image_dir, '%d-%s%s' % (
         frame_number, element_id, image_file_extension))
     sys.stdout.write('Fetching image #%i / %i\r' % (i, image_count))
     sys.stdout.flush()
     logging.info('Fetching image [frame_number=%d, %d/%d, local_file=%s, '
                  'url=%s].' % (frame_number, i, image_count, image_file,
                                image_url))
     try:
       image_request = six.moves.urllib.request.urlopen(image_url)
     except IOError as e:
       print('Error fetching image [local_file=%s, url=%s, message=%s].' % (
           image_file, image_url, e))
       continue

     try:
       with open(image_file, 'wb') as image_file_handle:
         shutil.copyfileobj(BytesIO(image_request.read()), image_file_handle)
     except IOError as e:
       print('Error copying image [local_file=%s, url=%s, message=%s].' % (
           image_file, image_url, e))

 def _GetLocalImageDirectory(snapshot_path):
   return os.path.splitext(snapshot_path)[0]

 def _SerializeFrames(tab, snapit_script, image_dir):
   """ Serializes the DOM of every execution context (frame) of |tab|.

   Args:
     tab: The tab showing the page to snapshot.
     snapit_script: JS source that is executed in each context before the
         HTMLSerializer class is instantiated there.
     image_dir: Local image directory; only its base name is handed to the
         serializer.

   Returns:
     A (serialized_doms, external_images) pair of lists with one entry per
     frame. |external_images| holds, for each frame, a list of
     (element id), (image src url) entries with the url as it was in the
     original unmodified page html. The element id is used to construct a
     page-unique local image filename; the url is used to fetch the image
     from the external server.
   """
   serialized_doms = []
   external_images = []
   for frame_number, context_id in enumerate(tab.EnableAllContexts()):
     # Build a distinct local image path for each frame by including
     # the frame number as the prefix string for the eventual file.
     local_image_path = os.path.join(os.path.basename(image_dir),
                                     '%d-' % frame_number)
     tab.ExecuteJavaScript(snapit_script, context_id=context_id)
     # The path is passed through the {{ }} template interpolation instead of
     # being spliced into the source text, so that backslashes or quotes in
     # it cannot corrupt the JS string literal.
     tab.ExecuteJavaScript(
         '''
         var serializedDom;
         var htmlSerializer = new HTMLSerializer();
         htmlSerializer.setLocalImagePath({{ local_image_path }});
         htmlSerializer.processDocument(document);
         htmlSerializer.fillHolesAsync(document, function(s) {
           serializedDom = s.asDict();
         });
         ''', local_image_path=local_image_path, context_id=context_id,
         timeout=EXPENSIVE_JS_TIMEOUT_SECONDS)
     # fillHolesAsync reports through a callback, so wait until it has stored
     # its result.
     tab.WaitForJavaScriptCondition(
         'serializedDom !== undefined', context_id=context_id)
     serialized_doms.append(tab.EvaluateJavaScript(
         'serializedDom', context_id=context_id))
     external_images.append(tab.EvaluateJavaScript(
         'htmlSerializer.externalImages', context_id=context_id))
   return serialized_doms, external_images


 def _CombineSerializedDoms(tab, dom_combining_script, serialized_doms):
   """ Merges the per-frame |serialized_doms| into one HTML string.

   The tab is navigated to about:blank first, so the page that was open in
   |tab| is gone once this function returns.

   Returns:
     The value of the JS expression outputHTMLString(serializedDoms), which
     is evaluated after |dom_combining_script| (presumably the script that
     defines outputHTMLString).
   """
   # Execute doms combining code in blank page to minimize the chance of V8
   # OOM.
   tab.Navigate('about:blank')
   tab.WaitForDocumentReadyStateToBeComplete()

   # Sending all the serialized doms back to tab execution context.
   tab.ExecuteJavaScript('var serializedDoms = [];')
   dom_count = len(serialized_doms)
   for i, dom in enumerate(serialized_doms):
     sys.stdout.write('Processing dom of frame #%i / %i\r' % (i, dom_count))
     sys.stdout.flush()
     _TransmitLargeJSONToTab(tab, dom, 'sub_dom')
     tab.ExecuteJavaScript('serializedDoms.push(sub_dom);')

   # Combine all the doms to one HTML string.
   tab.EvaluateJavaScript(dom_combining_script,
                          timeout=EXPENSIVE_JS_TIMEOUT_SECONDS)
   return tab.EvaluateJavaScript('outputHTMLString(serializedDoms);',
                                 timeout=EXPENSIVE_JS_TIMEOUT_SECONDS)


 def _SnapPageToFile(finder_options, url, interactive, snapshot_path,
                     snapshot_file, enable_browser_log):
   """ Save the HTML snapshot of the page whose address is |url| to
   |snapshot_file|.

   Args:
     finder_options: Options used to find and start the browser.
     url: Address of the page to snapshot.
     interactive: If true, wait for the user to press enter before the
         snapshot is taken.
     snapshot_path: Path of the snapshot file. Stripped of its extension it
         names the directory that receives the page's external images.
     snapshot_file: Writable file object the snapshot HTML is written to.
     enable_browser_log: Passed to _CreateBrowser to turn on browser logging.
   """
   with _CreateBrowser(finder_options, enable_browser_log) as browser:
     tab = browser.tabs[0]
     tab.Navigate(url)
     if interactive:
       input(
           'Activating interactive mode. Press enter after you finish '
           "interacting with the page to snapshot the page's DOM content.")

     print('Snapshotting content of %s. This could take a while...' % url)
     tab.WaitForDocumentReadyStateToBeComplete()
     tab.action_runner.WaitForNetworkQuiescence(
         timeout_in_seconds=EXPENSIVE_JS_TIMEOUT_SECONDS)

     snapit_script = _ReadSnapItSource('HTMLSerializer.js')
     dom_combining_script = _ReadSnapItSource('popup.js')
     image_dir = _GetLocalImageDirectory(snapshot_path)
     if not os.path.exists(image_dir):
       os.mkdir(image_dir)

     serialized_doms, external_images = _SerializeFrames(
         tab, snapit_script, image_dir)
     page_snapshot = _CombineSerializedDoms(
         tab, dom_combining_script, serialized_doms)

     print('Writing page snapshot [path=%s].' % snapshot_path)
     snapshot_file.write(page_snapshot)
     for frame_number, frame_images in enumerate(external_images):
       _FetchImages(image_dir, frame_number, frame_images)


 def SnapPage(finder_options, url, interactive, snapshot_path,
              enable_browser_log):
   """ Snapshots the page at |url| into the HTML file |snapshot_path|.

   |snapshot_path| may be relative; it is made absolute before use. A
   ValueError is raised when it does not end with HTML_SUFFIX. The remaining
   arguments are forwarded unchanged to _SnapPageToFile.
   """
   has_html_suffix = snapshot_path.endswith(HTML_SUFFIX)
   if not has_html_suffix:
     message = 'Snapshot path should end with \'%s\' [value=\'%s\'].' % (
         HTML_SUFFIX, snapshot_path)
     raise ValueError(message)

   snapshot_path = os.path.abspath(snapshot_path)
   with codecs.open(snapshot_path, 'w', 'utf-8') as snapshot_file:
     _SnapPageToFile(finder_options, url, interactive, snapshot_path,
                     snapshot_file, enable_browser_log)
   print('Successfully saved snapshot to file://%s' % snapshot_path)
	# Copyright (c) 2017 The Chromium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	from __future__ import print_function
	import codecs
	import os
	import logging
	import json
	import re
	import shutil
	import sys
	from io import BytesIO

	import six.moves.urllib.request # pylint: disable=import-error
	from six.moves import input # pylint: disable=redefined-builtin

	from telemetry.core import util
	from telemetry.internal.browser import browser_finder
	from telemetry.internal.browser import browser_options


	HTML_SUFFIX = '.html'
	STRIP_QUERY_PARAM_REGEX = re.compile(r'\?.*$')
	EXPENSIVE_JS_TIMEOUT_SECONDS = 240

	def _TransmitLargeJSONToTab(tab, json_obj, js_holder_name):
	tab.ExecuteJavaScript(
	'var {{ @js_holder_name }} = "";', js_holder_name=js_holder_name)

	# To avoid crashing devtool connection (details in crbug.com/763119#c16),
	# we break down the json string to chunks which each chunk has a maximum
	# size of 100000 characters (100000 seems to not break the connection and
	# makes sending data reasonably fast).
	k = 0
	step_size = 100000
	json_obj_string = json.dumps(json_obj)
	while k < len(json_obj_string):
	sub_string_chunk = json_obj_string[k: k + step_size]
	k += step_size
	tab.ExecuteJavaScript(
	'{{ @js_holder_name }} += {{ sub_string_chunk }};',
	js_holder_name=js_holder_name, sub_string_chunk=sub_string_chunk)

	tab.ExecuteJavaScript(
	'{{ @js_holder_name }} = JSON.parse({{ @js_holder_name }});',
	js_holder_name=js_holder_name)


	def _CreateBrowser(finder_options, enable_browser_log):
	  """ Finds a browser for |finder_options| and returns the result of its
	  BrowserSession() call.

	  When |enable_browser_log| is true, the browser options held by
	  |finder_options| are first switched to NON_VERBOSE_LOGGING and the log
	  cloud bucket is cleared.
	  """
	  if enable_browser_log:
	    # Enable NON_VERBOSE_LOGGING which also contains devtool's console logs.
	    finder_options.browser_options.logging_verbosity = (
	        browser_options.BrowserOptions.NON_VERBOSE_LOGGING)
	    # Do not upload the log to cloud storage.
	    finder_options.browser_options.logs_cloud_bucket = None

	  possible_browser = browser_finder.FindBrowser(finder_options)
	  return possible_browser.BrowserSession(finder_options.browser_options)


	def _ReadSnapItSource(path):
	  """ Returns the contents of the snap-it source file at the given path
	  relative to the snap-it repository.

	  The repository is looked up as the 'snap-it' directory under the
	  directory returned by util.GetCatapultThirdPartyDir().
	  """
	  full_path = os.path.join(util.GetCatapultThirdPartyDir(), 'snap-it', path)
	  with open(full_path) as f:
	    return f.read()


	def _FetchImages(image_dir, frame_number, external_images):
	if len(external_images) == 0:
	return

	image_count = len(external_images)
	print('Fetching external images [local_dir=%s, frame_number=%d, '
	'image_count=%d].' % (image_dir, frame_number, image_count))

	for i in range(image_count):
	[element_id, image_url] = external_images[i]
	_, image_file_extension = os.path.splitext(image_url)
	# Strip any query param and all subsequent characters. Note that
	# we also do this JavaScript-side (see HTMLSerializer.fileSuffix),
	# but the stripped file name isn't currently passed back in the
	# interest of shipping less data around.
	image_file_extension = STRIP_QUERY_PARAM_REGEX.sub('', image_file_extension)
	image_file = os.path.join(image_dir, '%d-%s%s' % (
	frame_number, element_id, image_file_extension))
	sys.stdout.write('Fetching image #%i / %i\r' % (i, image_count))
	sys.stdout.flush()
	logging.info('Fetching image [frame_number=%d, %d/%d, local_file=%s, '
	'url=%s].' % (frame_number, i, image_count, image_file,
	image_url))
	try:
	image_request = six.moves.urllib.request.urlopen(image_url)
	except IOError as e:
	print('Error fetching image [local_file=%s, url=%s, message=%s].' % (
	image_file, image_url, e))
	continue

	try:
	with open(image_file, 'wb') as image_file_handle:
	shutil.copyfileobj(BytesIO(image_request.read()), image_file_handle)
	except IOError as e:
	print('Error copying image [local_file=%s, url=%s, message=%s].' % (
	image_file, image_url, e))

	def _GetLocalImageDirectory(snapshot_path):
	return os.path.splitext(snapshot_path)[0]

	def _SnapPageToFile(finder_options, url, interactive, snapshot_path,
	                    snapshot_file, enable_browser_log):
	  """ Save the HTML snapshot of the page whose address is |url| to
	  |snapshot_file|.

	  Args:
	    finder_options: Options used to find and start the browser.
	    url: Address of the page to snapshot.
	    interactive: If true, wait for the user to press enter before the
	        snapshot is taken.
	    snapshot_path: Path of the snapshot file. Stripped of its extension it
	        names the directory that receives the page's external images.
	    snapshot_file: Writable file object the snapshot HTML is written to.
	    enable_browser_log: Passed to _CreateBrowser to turn on browser logging.
	  """
	  with _CreateBrowser(finder_options, enable_browser_log) as browser:
	    tab = browser.tabs[0]
	    tab.Navigate(url)
	    if interactive:
	      input(
	          'Activating interactive mode. Press enter after you finish '
	          "interacting with the page to snapshot the page's DOM content.")

	    print('Snapshotting content of %s. This could take a while...' % url)
	    tab.WaitForDocumentReadyStateToBeComplete()
	    tab.action_runner.WaitForNetworkQuiescence(
	        timeout_in_seconds=EXPENSIVE_JS_TIMEOUT_SECONDS)

	    snapit_script = _ReadSnapItSource('HTMLSerializer.js')
	    dom_combining_script = _ReadSnapItSource('popup.js')
	    image_dir = _GetLocalImageDirectory(snapshot_path)
	    if not os.path.exists(image_dir):
	      os.mkdir(image_dir)
	    serialized_doms = []
	    # |external_images| holds, for each frame, a list of tuples as
	    # (element id), (image src url) with the url as it was in the
	    # original unmodified page html. We use the element id to construct
	    # a page-unique local image filename. We use the url to fetch the
	    # image from the external server.
	    external_images = []

	    # Serialize the dom in each frame.
	    for frame_number, context_id in enumerate(tab.EnableAllContexts()):
	      # Build a distinct local image path for each frame by including
	      # the frame number as the prefix string for the eventual file.
	      local_image_path = os.path.join(os.path.basename(image_dir),
	                                      '%d-' % frame_number)
	      tab.ExecuteJavaScript(snapit_script, context_id=context_id)
	      # The path is passed through the {{ }} template interpolation instead
	      # of being spliced into the source text, so that backslashes or quotes
	      # in it cannot corrupt the JS string literal.
	      tab.ExecuteJavaScript(
	          '''
	          var serializedDom;
	          var htmlSerializer = new HTMLSerializer();
	          htmlSerializer.setLocalImagePath({{ local_image_path }});
	          htmlSerializer.processDocument(document);
	          htmlSerializer.fillHolesAsync(document, function(s) {
	            serializedDom = s.asDict();
	          });
	          ''', local_image_path=local_image_path, context_id=context_id,
	          timeout=EXPENSIVE_JS_TIMEOUT_SECONDS)
	      # fillHolesAsync reports through a callback, so wait until it has
	      # stored its result.
	      tab.WaitForJavaScriptCondition(
	          'serializedDom !== undefined', context_id=context_id)
	      serialized_doms.append(tab.EvaluateJavaScript(
	          'serializedDom', context_id=context_id))
	      external_images.append(tab.EvaluateJavaScript(
	          'htmlSerializer.externalImages', context_id=context_id))

	    # Execute doms combining code in blank page to minimize the chance of V8
	    # OOM.
	    tab.Navigate('about:blank')
	    tab.WaitForDocumentReadyStateToBeComplete()

	    # Sending all the serialized doms back to tab execution context.
	    tab.ExecuteJavaScript('var serializedDoms = [];')
	    for i, dom in enumerate(serialized_doms):
	      sys.stdout.write('Processing dom of frame #%i / %i\r' %
	                       (i, len(serialized_doms)))
	      sys.stdout.flush()
	      _TransmitLargeJSONToTab(tab, dom, 'sub_dom')
	      tab.ExecuteJavaScript('serializedDoms.push(sub_dom);')

	    # Combine all the doms to one HTML string.
	    tab.EvaluateJavaScript(dom_combining_script,
	                           timeout=EXPENSIVE_JS_TIMEOUT_SECONDS)
	    page_snapshot = tab.EvaluateJavaScript('outputHTMLString(serializedDoms);',
	                                           timeout=EXPENSIVE_JS_TIMEOUT_SECONDS)

	    print('Writing page snapshot [path=%s].' % snapshot_path)
	    snapshot_file.write(page_snapshot)
	    for i, image in enumerate(external_images):
	      _FetchImages(image_dir, i, image)


	def SnapPage(finder_options, url, interactive, snapshot_path,
	             enable_browser_log):
	  """ Save the HTML snapshot of the page whose address is |url| to
	  the file located at the relative path |snapshot_path|.

	  |snapshot_path| is made absolute before use. A ValueError is raised when
	  it does not end with HTML_SUFFIX. The remaining arguments are forwarded
	  unchanged to _SnapPageToFile.
	  """
	  if not snapshot_path.endswith(HTML_SUFFIX):
	    raise ValueError('Snapshot path should end with \'%s\' [value=\'%s\'].' % (
	        HTML_SUFFIX, snapshot_path))

	  snapshot_path = os.path.abspath(snapshot_path)
	  with codecs.open(snapshot_path, 'w', 'utf-8') as f:
	    _SnapPageToFile(finder_options, url, interactive, snapshot_path, f,
	                    enable_browser_log)
	  print('Successfully saved snapshot to file://%s' % snapshot_path)