blob: f1a61444f7d448acf8e1bf6b341e216a256fb7b5 [file] [log] [blame]
#!/usr/bin/env python
#
# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""URL downloading API.
Methods defined in this module:
Fetch(): fetches a given URL with an HTTP request using one of the methods
GET, POST, HEAD, PUT, DELETE or PATCH
"""
import httplib
import os
import StringIO
import threading
import UserDict
import urllib2
import urlparse
from google.appengine.api import apiproxy_stub_map
from google.appengine.api import urlfetch_service_pb
from google.appengine.api.urlfetch_errors import *
from google.appengine.runtime import apiproxy_errors
# Redirect limit. NOTE(review): no code visible in this module reads this
# constant; presumably it is kept for callers -- confirm before removing.
MAX_REDIRECTS = 5

# Integer identifiers for the supported HTTP methods.
GET, POST, HEAD, PUT, DELETE, PATCH = 1, 2, 3, 4, 5, 6

# Maps upper-case HTTP method names to the integer identifiers above.
_URL_STRING_MAP = dict(
    GET=GET,
    POST=POST,
    HEAD=HEAD,
    PUT=PUT,
    DELETE=DELETE,
    PATCH=PATCH,
)

# Every integer identifier accepted as a ``method`` argument.
_VALID_METHODS = frozenset(_URL_STRING_MAP.values())

# Per-thread storage; holds the default fetch deadline set through
# set_default_fetch_deadline().
_thread_local_settings = threading.local()
class _CaselessDict(UserDict.IterableUserDict):
    """Case insensitive dictionary.

    This class was lifted from os.py and slightly modified.

    Values live in ``self.data`` under the spelling of the key that was most
    recently used to store them. ``self.caseless_keys`` maps the lower-cased
    form of every key to that stored spelling, which is what makes lookups
    case insensitive. Every mutating method has to keep the two dictionaries
    in step.
    """

    def __init__(self, dict=None, **kwargs):
        """Creates the dictionary, optionally seeded with initial items.

        Args:
          dict: Optional mapping, or iterable of (key, value) pairs, to copy.
          kwargs: Additional items given as keyword arguments.
        """
        # Must exist before the base constructor runs: the base constructor
        # fills the dictionary through update(), which stores via __setitem__.
        self.caseless_keys = {}
        UserDict.IterableUserDict.__init__(self, dict, **kwargs)

    def __setitem__(self, key, item):
        """Set dictionary item.

        Args:
          key: Key of new item. Key is case insensitive, so "d['Key'] = value"
            will replace previous values set by "d['key'] = old_value".
          item: Item to store.
        """
        caseless_key = key.lower()
        # Drop the entry stored under a differently-cased spelling, if any,
        # so that only the newest spelling remains in self.data.
        if caseless_key in self.caseless_keys:
            del self.data[self.caseless_keys[caseless_key]]
        self.caseless_keys[caseless_key] = key
        self.data[key] = item

    def __getitem__(self, key):
        """Get dictionary item.

        Args:
          key: Key of item to get. Key is case insensitive, so "d['Key']" is
            the same as "d['key']".

        Returns:
          Item associated with key.

        Raises:
          KeyError: if no key matches, ignoring case.
        """
        return self.data[self.caseless_keys[key.lower()]]

    def __delitem__(self, key):
        """Remove item from dictionary.

        Args:
          key: Key of item to remove. Key is case insensitive, so
            "del d['Key']" is the same as "del d['key']".

        Raises:
          KeyError: if no key matches, ignoring case.
        """
        caseless_key = key.lower()
        del self.data[self.caseless_keys[caseless_key]]
        del self.caseless_keys[caseless_key]

    def has_key(self, key):
        """Determine if dictionary has item with specific key.

        Args:
          key: Key to check for presence. Key is case insensitive, so
            "d.has_key('Key')" evaluates to the same value as
            "d.has_key('key')".

        Returns:
          True if dictionary contains key, else False.
        """
        return key.lower() in self.caseless_keys

    def __contains__(self, key):
        """Same as 'has_key', but used for the 'in' operator."""
        return self.has_key(key)

    def get(self, key, failobj=None):
        """Get dictionary item, defaulting to another value if it is missing.

        Args:
          key: Key of item to get. Key is case insensitive, so "d['Key']" is
            the same as "d['key']".
          failobj: Value to return if key not in dictionary.

        Returns:
          The stored item, or failobj when no key matches.
        """
        try:
            cased_key = self.caseless_keys[key.lower()]
        except KeyError:
            return failobj
        return self.data[cased_key]

    def pop(self, key, *args):
        """Remove the item stored under key and return it.

        Overridden so that the lookup ignores case and so that caseless_keys
        is updated together with self.data.

        Args:
          key: Key of item to remove; case insensitive.
          *args: Optional single default returned when the key is missing.

        Returns:
          The removed item, or the default when the key is missing.

        Raises:
          KeyError: if the key is missing and no default was given.
          TypeError: if more than one default is supplied.
        """
        if len(args) > 1:
            raise TypeError('pop expected at most 2 arguments, got %d'
                            % (len(args) + 1))
        try:
            cased_key = self.caseless_keys.pop(key.lower())
        except KeyError:
            if args:
                return args[0]
            raise KeyError(key)
        return self.data.pop(cased_key)

    def popitem(self):
        """Remove and return an arbitrary (key, item) pair.

        Overridden so that caseless_keys does not keep an entry for the
        removed key.

        Raises:
          KeyError: if the dictionary is empty.
        """
        key, item = self.data.popitem()
        del self.caseless_keys[key.lower()]
        return key, item

    def clear(self):
        """Remove all items, including the case-insensitive key index."""
        self.data.clear()
        self.caseless_keys.clear()

    def update(self, dict=None, **kwargs):
        """Update dictionary using values from another dictionary and keywords.

        Args:
          dict: Dictionary to update from. An iterable of (key, value) pairs
            is accepted as well.
          kwargs: Keyword arguments to update from.
        """
        if dict:
            try:
                keys = dict.keys()
            except AttributeError:
                # Not a mapping: treat it as an iterable of (key, value) pairs.
                for k, v in dict:
                    self[k] = v
            else:
                for k in keys:
                    self[k] = dict[k]
        if kwargs:
            self.update(kwargs)

    def copy(self):
        """Make a shallow, case sensitive copy of self (a plain dict)."""
        return dict(self)
def _is_fetching_self(url, method):
    """Reports whether a fetch seems to target the request being served.

    Only GET fetches are considered, and only when both the HTTP_HOST and
    PATH_INFO environment variables are present.

    Args:
      url: str, The URL being fetched.
      method: value from _VALID_METHODS.

    Returns:
      True when the URL's host:port equals HTTP_HOST and its unquoted path
      equals the unquoted PATH_INFO (an empty path and '/' are treated as the
      same path); False otherwise.
    """
    if method != GET:
        return False
    environ = os.environ
    if "HTTP_HOST" not in environ or "PATH_INFO" not in environ:
        return False

    # urlsplit() yields (scheme, netloc, path, query, fragment).
    url_parts = urlparse.urlsplit(url)
    host_port, path = url_parts[1], url_parts[2]
    if host_port != environ['HTTP_HOST']:
        return False

    current_path = urllib2.unquote(environ['PATH_INFO'])
    desired_path = urllib2.unquote(path)
    root_paths = ('', '/')
    return (current_path == desired_path or
            (current_path in root_paths and desired_path in root_paths))
def create_rpc(deadline=None, callback=None):
    """Builds an RPC object bound to the urlfetch service.

    Args:
      deadline: Optional deadline in seconds for the operation. When None,
        the value returned by get_default_fetch_deadline() is used; if that
        is None too, the default is a system-specific deadline (typically
        5 seconds).
      callback: Optional callable to invoke on completion.

    Returns:
      An apiproxy_stub_map.UserRPC object specialized for this service.
    """
    effective_deadline = deadline
    if effective_deadline is None:
        effective_deadline = get_default_fetch_deadline()
    return apiproxy_stub_map.UserRPC('urlfetch', effective_deadline, callback)
def fetch(url, payload=None, method=GET, headers={},
          allow_truncated=False, follow_redirects=True,
          deadline=None, validate_certificate=None):
    """Fetches the given HTTP URL, blocking until the result is returned.

    We use a HTTP/1.1 compliant proxy to fetch the result.

    Args:
      url: The URL to fetch.
      payload: POST, PUT, or PATCH payload (implies method is not GET, HEAD,
        or DELETE). This is ignored if the method is not POST, PUT, or PATCH.
      method: The constants GET, POST, HEAD, PUT, DELETE, or PATCH or the
        same HTTP methods as strings.
      headers: dictionary of HTTP headers to send with the request.
      allow_truncated: if true, truncate large responses and return them
        without error. Otherwise, ResponseTooLargeError is raised when a
        response is truncated.
      follow_redirects: if true (the default), redirects are transparently
        followed and the response (if less than 5 redirects) contains the
        final destination's payload and the response status is 200. You
        lose, however, the redirect chain information. If false, you see
        the HTTP response yourself, including the 'Location' header, and
        redirects are not followed.
      deadline: deadline in seconds for the operation.
      validate_certificate: if true, do not send request to server unless
        the certificate is valid, signed by a trusted CA and the hostname
        matches the certificate. A value of None indicates that the
        behaviour will be chosen by the underlying urlfetch implementation.

    Returns:
      A data structure with the following fields:
        content: string containing the response from the server
        status_code: HTTP status code returned by the server
        headers: dictionary of headers returned by the server

    Raises:
      urlfetch.InvalidURLError: if the URL is an empty string or obviously
        invalid.
      urlfetch.DownloadError: if the server cannot be contacted.

    Note that HTTP errors are returned as a part of the returned structure,
    so HTTP errors like 404 do not result in an exception.
    """
    fetch_rpc = create_rpc(deadline=deadline)
    make_fetch_call(fetch_rpc, url, payload, method, headers,
                    allow_truncated, follow_redirects, validate_certificate)
    # get_result() blocks until the call started above has completed.
    return fetch_rpc.get_result()
def make_fetch_call(rpc, url, payload=None, method=GET, headers={},
                    allow_truncated=False, follow_redirects=True,
                    validate_certificate=None):
    """Starts the RPC call that fetches a given HTTP URL.

    The first argument is a UserRPC instance. See urlfetch.fetch for a
    thorough description of remaining arguments. The headers dictionary is
    only read here, never modified.

    _get_fetch_result is handed to rpc.make_call together with
    allow_truncated; the errors it documents (ResponseTooLargeError among
    them) come from that function rather than from this one.

    Raises:
      InvalidMethodError: if requested method is not in _VALID_METHODS
      InvalidURLError: if the app appears to be fetching the very URL it is
        currently serving (see _is_fetching_self)

    Returns:
      The rpc object passed into the function.
    """
    assert rpc.service == 'urlfetch', repr(rpc.service)

    # Accept method names in any case and map them to the integer constants;
    # values that are not known names pass through and are validated below.
    if isinstance(method, basestring):
        method = method.upper()
    method = _URL_STRING_MAP.get(method, method)
    if method not in _VALID_METHODS:
        raise InvalidMethodError('Invalid method %s.' % str(method))

    if _is_fetching_self(url, method):
        raise InvalidURLError(
            "App cannot fetch the same URL as the one used for "
            "the request.")

    request = urlfetch_service_pb.URLFetchRequest()
    response = urlfetch_service_pb.URLFetchResponse()

    if isinstance(url, unicode):
        url = url.encode('UTF-8')
    request.set_url(url)

    # Name of the URLFetchRequest enum attribute for each method constant.
    # The attribute is looked up by name so only the selected one is read.
    proto_method_names = {
        GET: 'GET',
        POST: 'POST',
        HEAD: 'HEAD',
        PUT: 'PUT',
        DELETE: 'DELETE',
        PATCH: 'PATCH',
    }
    request.set_method(getattr(urlfetch_service_pb.URLFetchRequest,
                               proto_method_names[method]))

    # A payload is only sent for methods that carry a request body.
    if payload and method in (POST, PUT, PATCH):
        request.set_payload(payload)

    for header_name, header_value in headers.iteritems():
        header_proto = request.add_header()
        header_proto.set_key(header_name)
        header_proto.set_value(str(header_value))

    request.set_followredirects(follow_redirects)

    # None leaves certificate validation unset on the request.
    if validate_certificate is not None:
        request.set_mustvalidateservercertificate(validate_certificate)

    if rpc.deadline is not None:
        request.set_deadline(rpc.deadline)

    rpc.make_call('Fetch', request, response, _get_fetch_result,
                  allow_truncated)
    return rpc
def _get_fetch_result(rpc):
    """Check success, handle exceptions, and return converted RPC result.

    This method waits for the RPC if it has not yet finished, and calls the
    post-call hooks on the first invocation.

    Args:
      rpc: A UserRPC object. Its user_data is read as the allow_truncated
        flag that was given to make_fetch_call().

    Raises:
      InvalidURLError: if the url was invalid, or the request body was too
        large.
      ConnectionClosedError: if the server closed the connection.
      TooManyRedirectsError: if too many redirects were encountered.
      MalformedReplyError: if the server's HTTP reply was malformed.
      InternalTransientError: on a temporary error; the fetch may be retried.
      DNSLookupFailedError: if the DNS lookup failed.
      DownloadError: if there was a problem fetching the url.
      ResponseTooLargeError: if the response was either truncated (and
        allow_truncated=False was passed to make_fetch_call()), or if it
        was too big for us to download.
      DeadlineExceededError: if the deadline passed before a response.
      SSLCertificateError: if the SSL certificate is invalid or missing.
      apiproxy_errors.ApplicationError: re-raised unchanged when its
        application_error code is none of the codes handled here.

    Returns:
      A _URLFetchResult object.
    """
    assert rpc.service == 'urlfetch', repr(rpc.service)
    assert rpc.method == 'Fetch', repr(rpc.method)

    url = rpc.request.url()

    try:
        rpc.check_success()
    except apiproxy_errors.RequestTooLargeError:
        raise InvalidURLError(
            'Request body too large fetching URL: ' + url)
    except apiproxy_errors.ApplicationError as err:
        error_detail = ''
        if err.error_detail:
            error_detail = ' Error: ' + err.error_detail

        code = err.application_error
        service_errors = urlfetch_service_pb.URLFetchServiceError

        if code == service_errors.INVALID_URL:
            raise InvalidURLError(
                'Invalid request URL: ' + url + error_detail)
        if code == service_errors.CLOSED:
            raise ConnectionClosedError(
                'Connection closed unexpectedly by server at URL: ' + url)
        if code == service_errors.TOO_MANY_REDIRECTS:
            raise TooManyRedirectsError(
                'Too many redirects at URL: ' + url + ' with redirect=true')
        if code == service_errors.MALFORMED_REPLY:
            raise MalformedReplyError(
                'Malformed HTTP reply received from server at URL: '
                + url + error_detail)
        if code == service_errors.INTERNAL_TRANSIENT_ERROR:
            raise InternalTransientError(
                'Temporary error in fetching URL: ' + url + ', please re-try')
        if code == service_errors.DNS_ERROR:
            raise DNSLookupFailedError('DNS lookup failed for URL: ' + url)
        if code == service_errors.UNSPECIFIED_ERROR:
            raise DownloadError('Unspecified error in fetching URL: '
                                + url + error_detail)
        if code == service_errors.FETCH_ERROR:
            raise DownloadError("Unable to fetch URL: " + url + error_detail)
        if code == service_errors.RESPONSE_TOO_LARGE:
            raise ResponseTooLargeError(
                'HTTP response too large from URL: ' + url)
        if code == service_errors.DEADLINE_EXCEEDED:
            raise DeadlineExceededError(
                'Deadline exceeded while waiting for HTTP response from URL: '
                + url)
        if code == service_errors.SSL_CERTIFICATE_ERROR:
            raise SSLCertificateError(
                'Invalid and/or missing SSL certificate for URL: ' + url)

        # Unrecognised error code: propagate the original exception. A bare
        # raise keeps the traceback of the failure instead of restarting it
        # at this line.
        raise

    response = rpc.response
    allow_truncated = rpc.user_data
    result = _URLFetchResult(response)
    # The wrapped result is attached to the error so the truncated content
    # stays reachable by whoever catches it.
    if response.contentwastruncated() and not allow_truncated:
        raise ResponseTooLargeError(result)
    return result
# Capitalized alias of fetch(); the module docstring refers to the API by
# this name.
Fetch = fetch
class _URLFetchResult(object):
    """A Pythonic representation of our fetch response protocol buffer.

    Attributes set by the constructor:
      content: value of the proto's content().
      status_code: value of the proto's statuscode().
      content_was_truncated: value of the proto's contentwastruncated().
      final_url: value of the proto's finalurl(), or None when that is falsy.
      header_msg: httplib.HTTPMessage parsed from the proto's headers.
      headers: case insensitive dictionary built from header_msg.items().
    """

    def __init__(self, response_proto):
        """Constructor.

        Args:
          response_proto: the URLFetchResponse proto buffer to wrap.
        """
        self.__pb = response_proto
        self.content = response_proto.content()
        self.status_code = response_proto.statuscode()
        self.content_was_truncated = response_proto.contentwastruncated()
        self.final_url = response_proto.finalurl() or None

        # Render the headers as "key: value" lines followed by a blank line,
        # which is the text form httplib.HTTPMessage parses.
        header_lines = ['%s: %s\n' % (header.key(), header.value())
                        for header in response_proto.header_list()]
        header_lines.append('\n')
        raw_headers = StringIO.StringIO(''.join(header_lines))

        self.header_msg = httplib.HTTPMessage(raw_headers)
        self.headers = _CaselessDict(self.header_msg.items())
def get_default_fetch_deadline():
    """Returns the current thread's default deadline for create_rpc().

    Returns:
      The value last stored by set_default_fetch_deadline() in this thread,
      or None when nothing has been stored.
    """
    try:
        return _thread_local_settings.default_fetch_deadline
    except AttributeError:
        # Nothing has been stored for this thread yet.
        return None
def set_default_fetch_deadline(value):
    """Stores the default deadline used by create_rpc() for this thread.

    The value is kept in a thread local, so it only affects calls made from
    the thread that set it. No range or type checking is applied. Until a
    value has been stored, the default is None.

    Args:
      value: The new default deadline; create_rpc() uses it whenever its own
        deadline argument is None.

    See also: create_rpc(), fetch()
    """
    _thread_local_settings.default_fetch_deadline = value