blob: 0844c3406b4303b5a16286d7d9dcd69e422ad5e4 [file] [log] [blame]
#!/usr/bin/python
# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""This module contains methods for interacting with online resources."""
import contextlib
import fnmatch
import formatter
import logging
import os
import re
import urllib
from cb_archive_hashing_lib import CheckMd5
from cb_constants import BundlingError, IMAGE_GSD_BUCKET, IMAGE_GSD_PREFIX, \
WORKDIR
from cb_util import RunCommand
from htmllib import HTMLParser
class UrlLister(HTMLParser):
  """Collect the href target of every anchor tag found on an html page.

  It contains the following fields:
  - urls: list of urls found, in document order

  Targets are recorded exactly as written in the page, so a page with
  relative links yields relative entries:
  <a href="http://google.com/">Google</a> -> "http://google.com"
  <a href="my_filename_here.zip">My file!</a> -> "my_filename_here.zip"

  Modified from http://diveintopython.org/html_processing/extracting_data.html
  """

  def __init__(self, given_formatter):
    HTMLParser.__init__(self, given_formatter)
    self.urls = []

  def reset(self):
    """Return the parser to a clean state, dropping any urls collected."""
    HTMLParser.reset(self)
    self.urls = []

  def start_a(self, attrs):
    """Record the href attribute, if present, of an opening anchor tag.

    Args:
      attrs: attributes of the anchor tag
    """
    for name, value in attrs:
      if name == 'href':
        self.urls.append(value)
class NameResolutionError(Exception):
  """Raised when an exact URL cannot be resolved from a given page."""

  def __init__(self, reason):
    super(NameResolutionError, self).__init__(reason)
    # Lazy %-formatting; emits the same record text as concatenation would.
    logging.debug('Name resolution failed on:\n%s\n', reason)
def _ConvertHttpToGsUrl(url):
  """Rewrite an http:// GSD URL into the matching gs:// prefix pattern.

  Files on GSD are reachable two ways:
  1. via HTTP(S) in a browser, e.g.
     https://sandbox.google.com/storage/<BUCKET>/<CHANNEL>/<BOARD>/<RELEASE>/\
<FILE>
  2. via the GsUtil CLI, e.g.
     gs://<BUCKET>/<CHANNEL>/<BOARD>/<RELEASE>/<FILE>
     (See http://code.google.com/apis/storage/docs/gsutil.html for details)

  GsUtil supports path prefix matching ending with a wildcard '*', which is
  why '/*' is appended to the converted prefix.

  Args:
    url: a string, HTTP(S) URL.
  Returns:
    a string, URL accessible through GsUtil tool e.g. gs://<blah>.
  """
  return url.replace(IMAGE_GSD_PREFIX, IMAGE_GSD_BUCKET) + '/*'
def DetermineUrl(url, token_list):
  """Find the one URL linked from a page whose name matches token_list.

  Links are assumed to be relative to the given page. When several links
  match, only the first is returned; the rest are logged as warnings (see
  MatchUrl). token_list MUST contain tokens for the beginning and end of
  the string to search.

  Example: to match filename 'ChromeOS-factory-R17-1235.3.0-a1-b2-stumpy.zip',
  good_token_list = ['chromeos', 'factory', 'stumpy', '.zip']

  Args:
    url: html page with a relative file links.
    token_list: a list of strings, in the order they are expected in the url.
  Returns:
    a string, an exact URL, or None if URL not present or link not found.
  """
  logging.debug('DetermineUrl(): HTTP url = %r', url)
  try:
    if url.startswith(IMAGE_GSD_PREFIX):
      # Google Storage index: list it with gsutil instead of scraping html.
      http_url = url
      url = _ConvertHttpToGsUrl(http_url)
      logging.debug('DetermineUrl(): gs URL = %r', url)
      listing = RunCommand(['gsutil', 'ls', url],
                           redirect_stdout=True, redirect_stderr=True)
      if not listing.returncode:
        return MatchUrl(listing.output.split('\n'), token_list)
      logging.error('Error fetching index page for %s: stdout = %r, '
                    'stderr = %r' % (http_url, listing.output, listing.error))
    else:
      # Ordinary web page: pull it down and match against its anchor hrefs.
      page = urllib.urlopen(url)
      lister = UrlLister(formatter.NullFormatter())
      lister.feed(page.read())
      page.close()
      lister.close()
      found = MatchUrl(lister.urls, token_list)
      if found:
        # Links are relative; anchor the match to the page it came from.
        return os.path.join(url, found)
  except IOError:
    logging.warning('Could not open %s.', url)
  return None
def MatchUrl(url_list, token_list):
  """Return a URL from a list given a token_list to match.

  Sample match:
    url = 'gs://chromeos-releases/dev-channel/stumpy/1235.3.0/\
ChromeOS-factory-R17-1235.3.0-a1-b2-stumpy.zip'
    token_list = ['chromeos-factory', '1235.3.0', 'stumpy', '.zip']

  Using fnmatch over re so that we don't need to constantly update regex
  patterns whenever build team changes its file naming convention (and breaks
  our script).

  If more than one URL is found to match, only the first match is returned
  and a warning is logged.

  Args:
    url_list: a list of strings (full URLs).
    token_list: a list of strings, see DetermineUrl() docstring.
  Returns:
    a string, a matching URL, or None if no match found.
  """
  if not url_list or not token_list:
    return None
  # Joining tokens with '*' requires them to appear, in order, in the
  # filename; fnmatch matches the whole string, so the token list must
  # cover the beginning and end of the name. Built once, outside the loop.
  pattern = '*'.join(token_list)
  # Matching is done on the lowercased basename; fnmatch.fnmatch also
  # normalizes case on some platforms, so tokens should be lowercase.
  match_list = [url for url in url_list
                if fnmatch.fnmatch(os.path.basename(url).lower(), pattern)]
  if not match_list:
    return None
  if len(match_list) > 1:
    logging.warning('MatchUrl(): token_list %r matches multiple urls (%r)',
                    token_list, match_list)
  return match_list[0]
def Download(url):
  """Copy the contents of a file from a given URL to a local file.

  Local file stored in a tmp dir specified in "cb_constants.WORKDIR" variable.
  If local file exists, it will be overwritten by default.

  A gs:// URL is fetched with the gsutil tool; anything else goes through
  urllib.

  Modified from code.activestate.com/recipes/496685-downloading-a-file-from-
  the-web/

  Args:
    url: online location of file to download
  Returns:
    a boolean, True only when file is fully downloaded
  """
  local_file_name = os.path.join(WORKDIR, os.path.basename(url))
  try:
    if url.startswith(IMAGE_GSD_BUCKET):
      result = RunCommand(['gsutil', 'cp', url, local_file_name],
                          redirect_stdout=True, redirect_stderr=True)
      if not result.returncode:
        return True
      msg = ('Error fetching image %s: stdout = %r, stderr = %r' %
             (url, result.output, result.error))
      logging.error(msg)
    else:
      with contextlib.closing(urllib.urlopen(url)) as web_file:
        # Write in binary mode: payloads are zips/images, and text mode
        # would corrupt them on platforms that translate line endings.
        with open(local_file_name, 'wb') as local_file:
          local_file.write(web_file.read())
      return True
  except IOError:
    logging.warning('Could not open %s or writing local file failed.', url)
  return False
def DetermineThenDownloadCheckMd5(url, token_list, path, desc):
  """Determine exact url then download the resource and check MD5.

  Args:
    url: html page with a relative file links.
    token_list: a list of strings, see DetermineUrl() docstring.
    path: absolute path of directory to put resource.
    desc: a short string description of the resource to fetch.
  Returns:
    a string, the absolute path to the resource, None on failure.
  Raises:
    BundlingError when resources cannot be fetched or download integrity fails.
  """
  det_url = DetermineUrl(url, token_list)
  if det_url:
    return DownloadCheckMd5(det_url, path, desc)
  # No exact URL resolved; include the input and tokens in the error text.
  err = ' '.join([desc, 'exact URL could not be determined given input:', url]
                 + token_list)
  raise NameResolutionError(err)
def CheckResourceExistsWithMd5(filename, md5filename):
  """Check if a resource exists in the local file system with a good MD5.

  Args:
    filename: name of file to check for
    md5filename: name of file containing golden MD5 checksum
  Returns:
    a boolean, True when the file exists with good MD5
  """
  # Guard clauses: only run the checksum when both files are present.
  if not os.path.exists(filename):
    return False
  if not os.path.exists(md5filename):
    return False
  return CheckMd5(filename, md5filename)
def DownloadCheckMd5(url, path, desc):
  """Download a resource and check the MD5 checksum.

  Assuming a golden md5 is available from <resource_url>.md5
  Also checks if the resource is already locally present with an MD5 to check.

  Args:
    url: url at which resource can be downloaded
    path: absolute path of directory to put resource
    desc: a short string description of the resource to fetch
  Returns:
    a string, the absolute path to the resource, None on failure
  Raises:
    BundlingError when resources cannot be fetched or download integrity fails.
  """
  name = os.path.join(path, os.path.basename(url))
  # Skip the network entirely when a verified local copy already exists.
  if CheckResourceExistsWithMd5(name, name + '.md5'):
    logging.info('Resource %s already exists with good MD5, skipping fetch.',
                 name)
    return name
  logging.info('Downloading ' + url)
  if not Download(url):
    raise BundlingError(desc + ' could not be fetched.')
  if not Download(url + '.md5'):
    raise BundlingError(desc + ' MD5 could not be fetched.')
  if not CheckMd5(name, name + '.md5'):
    raise BundlingError(desc + ' MD5 checksum does not match.')
  logging.debug('MD5 checksum match succeeded for %s', name)
  return name