blob: 8379365f41125b41672551f8a4ec8c270090a084 [file] [log] [blame]
#!/usr/bin/env python
#
# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Helpers shared by cloudstorage_stub and cloudstorage_api."""
__all__ = ['CS_XML_NS',
'CSFileStat',
'dt_str_to_posix',
'local_api_url',
'LOCAL_GCS_ENDPOINT',
'local_run',
'get_access_token',
'get_stored_content_length',
'get_metadata',
'GCSFileStat',
'http_time_to_posix',
'memory_usage',
'posix_time_to_http',
'posix_to_dt_str',
'set_access_token',
'validate_options',
'validate_bucket_name',
'validate_bucket_path',
'validate_file_path',
]
import calendar
import datetime
from email import utils as email_utils
import logging
import os
import re
try:
from google.appengine.api import runtime
except ImportError:
from google.appengine.api import runtime
_GCS_BUCKET_REGEX_BASE = r'[a-z0-9\.\-_]{3,63}'
_GCS_BUCKET_REGEX = re.compile(_GCS_BUCKET_REGEX_BASE + r'$')
_GCS_BUCKET_PATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'$')
_GCS_PATH_PREFIX_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'.*')
_GCS_FULLPATH_REGEX = re.compile(r'/' + _GCS_BUCKET_REGEX_BASE + r'/.*')
_GCS_METADATA = ['x-goog-meta-',
'content-disposition',
'cache-control',
'content-encoding']
_GCS_OPTIONS = _GCS_METADATA + ['x-goog-acl']
CS_XML_NS = 'http://doc.s3.amazonaws.com/2006-03-01'
LOCAL_GCS_ENDPOINT = '/_ah/gcs'
_access_token = ''
_MAX_GET_BUCKET_RESULT = 1000
def set_access_token(access_token):
"""Set the shared access token to authenticate with Google Cloud Storage.
When set, the library will always attempt to communicate with the
real Google Cloud Storage with this token even when running on dev appserver.
Note the token could expire so it's up to you to renew it.
When absent, the library will automatically request and refresh a token
on appserver, or when on dev appserver, talk to a Google Cloud Storage
stub.
Args:
access_token: you can get one by run 'gsutil -d ls' and copy the
str after 'Bearer'.
"""
global _access_token
_access_token = access_token
def get_access_token():
"""Returns the shared access token."""
return _access_token
class GCSFileStat(object):
"""Container for GCS file stat."""
def __init__(self,
filename,
st_size,
etag,
st_ctime,
content_type=None,
metadata=None,
is_dir=False):
"""Initialize.
For files, the non optional arguments are always set.
For directories, only filename and is_dir is set.
Args:
filename: a Google Cloud Storage filename of form '/bucket/filename'.
st_size: file size in bytes. long compatible.
etag: hex digest of the md5 hash of the file's content. str.
st_ctime: posix file creation time. float compatible.
content_type: content type. str.
metadata: a str->str dict of user specified options when creating
the file. Possible keys are x-goog-meta-, content-disposition,
content-encoding, and cache-control.
is_dir: True if this represents a directory. False if this is a real file.
"""
self.filename = filename
self.is_dir = is_dir
self.st_size = None
self.st_ctime = None
self.etag = None
self.content_type = content_type
self.metadata = metadata
if not is_dir:
self.st_size = long(st_size)
self.st_ctime = float(st_ctime)
if etag[0] == '"' and etag[-1] == '"':
etag = etag[1:-1]
self.etag = etag
def __repr__(self):
if self.is_dir:
return '(directory: %s)' % self.filename
return (
'(filename: %(filename)s, st_size: %(st_size)s, '
'st_ctime: %(st_ctime)s, etag: %(etag)s, '
'content_type: %(content_type)s, '
'metadata: %(metadata)s)' %
dict(filename=self.filename,
st_size=self.st_size,
st_ctime=self.st_ctime,
etag=self.etag,
content_type=self.content_type,
metadata=self.metadata))
def __cmp__(self, other):
if not isinstance(other, self.__class__):
raise ValueError('Argument to cmp must have the same type. '
'Expect %s, got %s', self.__class__.__name__,
other.__class__.__name__)
if self.filename > other.filename:
return 1
elif self.filename < other.filename:
return -1
return 0
def __hash__(self):
if self.etag:
return hash(self.etag)
return hash(self.filename)
CSFileStat = GCSFileStat
def get_stored_content_length(headers):
"""Return the content length (in bytes) of the object as stored in GCS.
x-goog-stored-content-length should always be present except when called via
the local dev_appserver. Therefore if it is not present we default to the
standard content-length header.
Args:
headers: a dict of headers from the http response.
Returns:
the stored content length.
"""
length = headers.get('x-goog-stored-content-length')
if length is None:
length = headers.get('content-length')
return length
def get_metadata(headers):
"""Get user defined options from HTTP response headers."""
return dict((k, v) for k, v in headers.iteritems()
if any(k.lower().startswith(valid) for valid in _GCS_METADATA))
def validate_bucket_name(name):
"""Validate a Google Storage bucket name.
Args:
name: a Google Storage bucket name with no prefix or suffix.
Raises:
ValueError: if name is invalid.
"""
_validate_path(name)
if not _GCS_BUCKET_REGEX.match(name):
raise ValueError('Bucket should be 3-63 characters long using only a-z,'
'0-9, underscore, dash or dot but got %s' % name)
def validate_bucket_path(path):
"""Validate a Google Cloud Storage bucket path.
Args:
path: a Google Storage bucket path. It should have form '/bucket'.
Raises:
ValueError: if path is invalid.
"""
_validate_path(path)
if not _GCS_BUCKET_PATH_REGEX.match(path):
raise ValueError('Bucket should have format /bucket '
'but got %s' % path)
def validate_file_path(path):
"""Validate a Google Cloud Storage file path.
Args:
path: a Google Storage file path. It should have form '/bucket/filename'.
Raises:
ValueError: if path is invalid.
"""
_validate_path(path)
if not _GCS_FULLPATH_REGEX.match(path):
raise ValueError('Path should have format /bucket/filename '
'but got %s' % path)
def _process_path_prefix(path_prefix):
"""Validate and process a Google Cloud Stoarge path prefix.
Args:
path_prefix: a Google Cloud Storage path prefix of format '/bucket/prefix'
or '/bucket/' or '/bucket'.
Raises:
ValueError: if path is invalid.
Returns:
a tuple of /bucket and prefix. prefix can be None.
"""
_validate_path(path_prefix)
if not _GCS_PATH_PREFIX_REGEX.match(path_prefix):
raise ValueError('Path prefix should have format /bucket, /bucket/, '
'or /bucket/prefix but got %s.' % path_prefix)
bucket_name_end = path_prefix.find('/', 1)
bucket = path_prefix
prefix = None
if bucket_name_end != -1:
bucket = path_prefix[:bucket_name_end]
prefix = path_prefix[bucket_name_end + 1:] or None
return bucket, prefix
def _validate_path(path):
"""Basic validation of Google Storage paths.
Args:
path: a Google Storage path. It should have form '/bucket/filename'
or '/bucket'.
Raises:
ValueError: if path is invalid.
TypeError: if path is not of type basestring.
"""
if not path:
raise ValueError('Path is empty')
if not isinstance(path, basestring):
raise TypeError('Path should be a string but is %s (%s).' %
(path.__class__, path))
def validate_options(options):
"""Validate Google Cloud Storage options.
Args:
options: a str->basestring dict of options to pass to Google Cloud Storage.
Raises:
ValueError: if option is not supported.
TypeError: if option is not of type str or value of an option
is not of type basestring.
"""
if not options:
return
for k, v in options.iteritems():
if not isinstance(k, str):
raise TypeError('option %r should be a str.' % k)
if not any(k.lower().startswith(valid) for valid in _GCS_OPTIONS):
raise ValueError('option %s is not supported.' % k)
if not isinstance(v, basestring):
raise TypeError('value %r for option %s should be of type basestring.' %
(v, k))
def http_time_to_posix(http_time):
"""Convert HTTP time format to posix time.
See http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1
for http time format.
Args:
http_time: time in RFC 2616 format. e.g.
"Mon, 20 Nov 1995 19:12:08 GMT".
Returns:
A float of secs from unix epoch.
"""
if http_time is not None:
return email_utils.mktime_tz(email_utils.parsedate_tz(http_time))
def posix_time_to_http(posix_time):
"""Convert posix time to HTML header time format.
Args:
posix_time: unix time.
Returns:
A datatime str in RFC 2616 format.
"""
if posix_time:
return email_utils.formatdate(posix_time, usegmt=True)
_DT_FORMAT = '%Y-%m-%dT%H:%M:%S'
def dt_str_to_posix(dt_str):
"""format str to posix.
datetime str is of format %Y-%m-%dT%H:%M:%S.%fZ,
e.g. 2013-04-12T00:22:27.978Z. According to ISO 8601, T is a separator
between date and time when they are on the same line.
Z indicates UTC (zero meridian).
A pointer: http://www.cl.cam.ac.uk/~mgk25/iso-time.html
This is used to parse LastModified node from GCS's GET bucket XML response.
Args:
dt_str: A datetime str.
Returns:
A float of secs from unix epoch. By posix definition, epoch is midnight
1970/1/1 UTC.
"""
parsable, _ = dt_str.split('.')
dt = datetime.datetime.strptime(parsable, _DT_FORMAT)
return calendar.timegm(dt.utctimetuple())
def posix_to_dt_str(posix):
"""Reverse of str_to_datetime.
This is used by GCS stub to generate GET bucket XML response.
Args:
posix: A float of secs from unix epoch.
Returns:
A datetime str.
"""
dt = datetime.datetime.utcfromtimestamp(posix)
dt_str = dt.strftime(_DT_FORMAT)
return dt_str + '.000Z'
def local_run():
"""Whether we should hit GCS dev appserver stub."""
server_software = os.environ.get('SERVER_SOFTWARE')
if server_software is None:
return True
if 'remote_api' in server_software:
return False
if server_software.startswith(('Development', 'testutil')):
return True
return False
def local_api_url():
"""Return URL for GCS emulation on dev appserver."""
return 'http://%s%s' % (os.environ.get('HTTP_HOST'), LOCAL_GCS_ENDPOINT)
def memory_usage(method):
"""Log memory usage before and after a method."""
def wrapper(*args, **kwargs):
logging.info('Memory before method %s is %s.',
method.__name__, runtime.memory_usage().current())
result = method(*args, **kwargs)
logging.info('Memory after method %s is %s',
method.__name__, runtime.memory_usage().current())
return result
return wrapper
def _add_ns(tagname):
return '{%(ns)s}%(tag)s' % {'ns': CS_XML_NS,
'tag': tagname}
_T_CONTENTS = _add_ns('Contents')
_T_LAST_MODIFIED = _add_ns('LastModified')
_T_ETAG = _add_ns('ETag')
_T_KEY = _add_ns('Key')
_T_SIZE = _add_ns('Size')
_T_PREFIX = _add_ns('Prefix')
_T_COMMON_PREFIXES = _add_ns('CommonPrefixes')
_T_NEXT_MARKER = _add_ns('NextMarker')
_T_IS_TRUNCATED = _add_ns('IsTruncated')