# blob: f7239b1ebdc9629fdfc8e0ed198bd4cc71cb4f44 [file] [log] [blame]
# Copyright 2017 The LUCI Authors. All rights reserved.
# Use of this source code is governed under the Apache License, Version 2.0
# that can be found in the LICENSE file.
"""Methods for interacting with HTTP(s) URLs."""
from builtins import int
from urllib.parse import urlparse, urlencode, quote, unquote
import collections
from recipe_engine import recipe_api
# Recipe API class (subclasses recipe_api.RecipeApi) bundling URL helpers and
# HTTP(S) GET steps.
class UrlApi(recipe_api.RecipeApi):
# Expose the urllib.parse helpers as static attributes of the class.
quote = staticmethod(quote)
unquote = staticmethod(unquote)
urlencode = staticmethod(urlencode)
# JSON prefix used with Gerrit and Gitiles; usable as a `strip_prefix` value
# for the GET helpers.
GERRIT_JSON_PREFIX = ")]}'\n"
class HTTPError(recipe_api.StepFailure):
    """Step failure raised when an HTTP request was not successful.

    The failed response is kept on the exception as `response`.
    """

    def __init__(self, msg, response):
        """Builds the failure.

        Args:
          * msg (str): Failure message handed to the base class.
          * response: The response object describing the failed request.
        """
        super().__init__(msg)
        self.response = response
class InfraHTTPError(recipe_api.InfraFailure):
    """Infra failure raised when an HTTP request was not successful.

    The failed response is kept on the exception as `response`.
    """

    def __init__(self, msg, response):
        """Builds the failure.

        Args:
          * msg (str): Failure message handed to the base class.
          * response: The response object describing the failed request.
        """
        super().__init__(msg)
        self.response = response
# Status JSON output from "pycurl.py" resource.
#
# Fields:
#   status_code: the HTTP status code of the response.
#   success: whether the request is considered successful.
#   size: the number of bytes in the HTTP response.
#   error_body: the HTTP body when an error was encountered.
_PyCurlStatus = collections.namedtuple(
    '_PyCurlStatus',
    ['status_code', 'success', 'size', 'error_body'],
)
class Response:
    """Read-only view of a finished HTTP request.

    Bundles the request method, the fetched output and the status record
    reported for the request.
    """

    def __init__(self, method, output, status, infra_step):
        """Stores the pieces of a finished request.

        Args:
          * method (str): The HTTP method that was used.
          * output: The fetched content (or output path); see `output`.
          * status: Status record exposing `status_code`, `success`, `size`
            and `error_body` attributes.
          * infra_step (bool): Selects which exception type `raise_on_error`
            raises on failure.
        """
        self._infra_step = infra_step
        self._output = output
        self._status = status
        self._method = method

    @property
    def method(self):
        """Returns (str): The HTTP method, currently always GET."""
        return self._method

    @property
    def status_code(self):
        """Returns (int): The HTTP status code taken from the status record."""
        return self._status.status_code

    @property
    def output(self):
        """The fetched content.

        Returns:
          If JSON, the unmarshalled JSON response object.
          If text, the result as a text string.
          If raw, the result as a bytes object.
          If file, the output Path.
          On error, will be None.
        """
        return self._output

    @property
    def error_body(self):
        """The HTTP body captured when an error was encountered.

        Returns (str or None): The error body, or None if not an error.
        """
        return self._status.error_body

    @property
    def size(self):
        """Returns (int): The number of bytes in the HTTP response."""
        return self._status.size

    def raise_on_error(self):
        """Raises an exception if the HTTP operation was not successful.

        Does nothing when the status record reports success.

        Raises:
          UrlApi.HTTPError on HTTP failure, if not an infra step.
          UrlApi.InfraHTTPError on HTTP failure, if an infra step.
        """
        if self._status.success:
            return
        if self._infra_step:
            error_cls = UrlApi.InfraHTTPError
        else:
            error_cls = UrlApi.HTTPError
        raise error_cls('HTTP status (%d)' % (self.status_code,), self)
def join(self, *parts):
    """Constructs a URL path from composite parts.

    Args:
      * parts (str...): Strings to concatenate. When more than one part is
        given, leading and trailing slashes are stripped from every part
        except the last, and only leading slashes are stripped from the last
        part, so the pieces join with exactly one slash between them. A
        single part is returned unchanged.

    Returns (str): The parts joined with '/', or '' when no parts are given.
    """
    if len(parts) < 2:
        # Zero or one part: nothing to normalize.
        return '/'.join(parts)
    pieces = [piece.strip('/') for piece in parts[:-1]]
    # Keep any trailing slash on the final part.
    pieces.append(parts[-1].lstrip('/'))
    return '/'.join(pieces)
def validate_url(self, v):
    """Validates that "v" is a valid URL.

    A valid URL has a scheme and netloc, and its scheme must be HTTP or HTTPS
    (compared case-insensitively).

    Args:
      * v (str): The URL to validate.

    Returns (bool): True if the URL is considered secure (https), False if not.

    Raises:
      ValueError: if "v" is not valid.
    """
    parsed = urlparse(v)
    scheme = parsed.scheme.lower()
    if scheme != 'http' and scheme != 'https':
        raise ValueError(
            'URL scheme must be either http:// or https:// ({!r})'.format(v))
    if not parsed.netloc:
        raise ValueError(
            'URL must specify a network location ({!r})'.format(v))
    return scheme == 'https'
def get_file(self, url, path, step_name=None, headers=None,
             transient_retry=True, strip_prefix=None):
    """GETs the data at the given URL and writes it to a file.

    Args:
      * url: URL to request.
      * path (Path): the Path where the content will be written.
      * step_name: optional step name, 'GET <url>' by default.
      * headers: a {header_name: value} dictionary for HTTP headers.
      * transient_retry (bool or int): Determines how transient HTTP errors
        (>500) will be retried. If True (default), errors will be retried up
        to 10 times. If False, no transient retries will occur. If an integer
        is supplied, this is the number of transient retries to perform. All
        retries have exponential backoff applied.
      * strip_prefix (str or None): If not None, this prefix must be present
        at the beginning of the response, and will be stripped from the
        resulting content (e.g., GERRIT_JSON_PREFIX).

    Returns (UrlApi.Response):
      Response with "path" as its "output" value.

    Raises:
      * HTTPError, InfraHTTPError: if the request failed.
      * ValueError: If the request was invalid.
    """
    request = dict(
        url=url,
        path=path,
        step_name=step_name,
        headers=headers,
        transient_retry=transient_retry,
        strip_prefix=strip_prefix,
        # get_file always supplies an empty string as the default test data.
        default_test_data='',
    )
    return self._get_step(**request)
def get_text(self, url, step_name=None, headers=None, transient_retry=True,
             default_test_data=None):
    """GETs the data at the given URL and returns it as text.

    Args:
      * url: URL to request.
      * step_name: optional step name, 'GET <url>' by default.
      * headers: a {header_name: value} dictionary for HTTP headers.
      * transient_retry (bool or int): Determines how transient HTTP errors
        (>500) will be retried. If True (default), errors will be retried up
        to 10 times. If False, no transient retries will occur. If an integer
        is supplied, this is the number of transient retries to perform. All
        retries have exponential backoff applied.
      * default_test_data (str): If provided, use this as the text output when
        testing if no overriding data is available.

    Returns (UrlApi.Response): Response with the content as its output value.

    Raises:
      * HTTPError, InfraHTTPError: if the request failed.
      * ValueError: If the request was invalid.
    """
    # Test data for a text fetch must be text (or absent).
    assert default_test_data is None or isinstance(default_test_data, str)
    request = dict(
        url=url,
        step_name=step_name,
        headers=headers,
        transient_retry=transient_retry,
        default_test_data=default_test_data,
    )
    return self._get_step(**request)
def get_raw(self, url, step_name=None, headers=None, transient_retry=True,
            default_test_data=None):
    """GETs the data at the given URL and returns it as raw bytes.

    Args:
      * url: URL to request.
      * step_name: optional step name, 'GET <url>' by default.
      * headers: a {header_name: value} dictionary for HTTP headers.
      * transient_retry (bool or int): Determines how transient HTTP errors
        (>500) will be retried. If True (default), errors will be retried up
        to 10 times. If False, no transient retries will occur. If an integer
        is supplied, this is the number of transient retries to perform. All
        retries have exponential backoff applied.
      * default_test_data (bytes): If provided, use this as the raw output
        when testing if no overriding data is available.

    Returns (UrlApi.Response): Response with the content as its output value.

    Raises:
      * HTTPError, InfraHTTPError: if the request failed.
      * ValueError: If the request was invalid.
    """
    # Test data for a raw fetch must be bytes (or absent).
    assert default_test_data is None or isinstance(default_test_data, bytes)
    request = dict(
        url=url,
        step_name=step_name,
        headers=headers,
        transient_retry=transient_retry,
        default_test_data=default_test_data,
        as_bytes=True,
    )
    return self._get_step(**request)
def get_json(self, url, step_name=None, headers=None, transient_retry=True,
             strip_prefix=None, log=False, default_test_data=None):
    """GETs the data at the given URL and returns it as unmarshalled JSON.

    Args:
      * url: URL to request.
      * step_name: optional step name, 'GET <url>' by default.
      * headers: a {header_name: value} dictionary for HTTP headers.
      * transient_retry (bool or int): Determines how transient HTTP errors
        (>500) will be retried. If True (default), errors will be retried up
        to 10 times. If False, no transient retries will occur. If an integer
        is supplied, this is the number of transient retries to perform. All
        retries have exponential backoff applied.
      * strip_prefix (str or None): If not None, this prefix must be present
        at the beginning of the response, and will be stripped from the
        resulting content (e.g., GERRIT_JSON_PREFIX).
      * log (bool): If True, emit the JSON content as a log.
      * default_test_data (jsonish): If provided, use this as the unmarshalled
        JSON result when testing if no overriding data is available.

    Returns (UrlApi.Response): Response with the JSON as its "output" value.

    Raises:
      * HTTPError, InfraHTTPError: if the request failed.
      * ValueError: If the request was invalid.
    """
    # The special value 'log' requests JSON output plus a JSON log.
    if log:
        json_mode = 'log'
    else:
        json_mode = True
    request = dict(
        url=url,
        step_name=step_name,
        headers=headers,
        transient_retry=transient_retry,
        strip_prefix=strip_prefix,
        as_json=json_mode,
        default_test_data=default_test_data,
    )
    return self._get_step(**request)
def _get_step(self, url, step_name, headers, transient_retry, path=None,
              strip_prefix=None, as_json=False, as_bytes=False,
              default_test_data=None):
    """Runs the "pycurl.py" resource as a step to GET `url`.

    Shared implementation behind the public get_* helpers.

    Args:
      * url: URL to request; it is checked with validate_url first.
      * step_name: step name; 'GET <url>' is used when this is falsy.
      * headers: a {header_name: value} dictionary passed via
        --headers-json, or a falsy value to send none.
      * transient_retry (bool or int): True adds no retry flag, False passes
        '0', and any other integer is passed through as the retry count.
      * path: when truthy, the response's output is this path; it is also
        the --outfile target unless as_json is set.
      * strip_prefix (str or None): when truthy, passed JSON-encoded as
        --strip-prefix.
      * as_json (bool or 'log'): when truthy, the output is captured through
        a JSON placeholder; the value 'log' also enables its JSON log.
      * as_bytes (bool): when neither as_json nor path applies, capture the
        output as raw bytes instead of text.
      * default_test_data: forwarded to the test API as the step's test data.

    Returns (UrlApi.Response): the response for the finished request.

    Raises:
      * ValueError: if the URL is invalid, or an authorization header would
        be sent to a URL that is not https.
      * HTTPError, InfraHTTPError: if the status reports a failed request.
    """
    if not step_name:
        step_name = 'GET %s' % url
    secure = self.validate_url(url)

    # The status placeholder is created before the outfile placeholder.
    status_file = self.m.json.output(add_json_log=False, name='status_json')
    if as_json:
        outfile = self.m.json.output(add_json_log=(as_json == 'log'),
                                     name='output')
    elif path:
        # path is only passed in by the get_file variant
        # in this case, we should not use a placeholder
        outfile = path
    elif as_bytes:
        outfile = self.m.raw_io.output(leak_to=path, name='output')
    else:
        outfile = self.m.raw_io.output_text(leak_to=path, name='output')

    cmd_args = [
        '--url', url,
        '--status-json', status_file,
        '--outfile', outfile,
    ]

    if headers:
        sends_auth = 'authorization' in (name.lower() for name in headers)
        if sends_auth and not secure:
            raise ValueError(
                'Refusing to send authorization header to insecure URL: %s' % (
                    url,))
        cmd_args.extend(['--headers-json', self.m.json.input(headers)])

    if strip_prefix:
        cmd_args.extend(['--strip-prefix', self.m.json.dumps(strip_prefix)])

    assert isinstance(transient_retry, (bool, int))
    if transient_retry is not True:
        # True leaves the script's own retry default in place.
        if transient_retry is False:
            retries = '0'
        else:
            retries = str(transient_retry)
        cmd_args.extend(['--transient-retry', retries])

    result = self.m.step(
        step_name,
        ['vpython3', '-u', self.resource('pycurl.py')] + cmd_args,
        step_test_data=self.test_api._get_step_test_data(
            status_cls=self._PyCurlStatus, is_json=as_json,
            is_bytes=as_bytes, test_data=default_test_data))

    if path:
        output = path
    else:
        # No file target: read the content back from the matching placeholder.
        if as_json:
            outputs = result.json.outputs
        elif as_bytes:
            outputs = result.raw_io.outputs
        else:
            outputs = result.raw_io.output_texts
        output = outputs['output']

    status = self._PyCurlStatus(**result.json.outputs['status_json'])
    response = self.Response('GET', output, status,
                             self.m.context.infra_step)
    response.raise_on_error()
    return response