blob: 40572fa20dfb531a90d1449af0126ccf4d60485d [file] [log] [blame]
# Copyright 2014 The LUCI Authors. All rights reserved.
# Use of this source code is governed under the Apache License, Version 2.0
# that can be found in the LICENSE file.
"""Understands .isolated files and can do local operations on them."""
import hashlib
import json
import logging
import os
import re
import stat
import sys
from utils import file_path
from utils import fs
from utils import tools
# Version stored and expected in .isolated files.
ISOLATED_FILE_VERSION = '1.6'

# Chunk size to use when doing disk I/O.
DISK_FILE_CHUNK = 1024 * 1024

# Sadly, hashlib uses 'shaX' instead of the standard 'sha-X' so explicitly
# specify the names here.
SUPPORTED_ALGOS = {
    'sha-1': hashlib.sha1,
    'sha-256': hashlib.sha256,
    'sha-512': hashlib.sha512,
}

# Reverse lookup table, used when serializing the algorithm name.
SUPPORTED_ALGOS_REVERSE = {v: k for k, v in SUPPORTED_ALGOS.items()}

SUPPORTED_FILE_TYPES = ['basic', 'tar']
class IsolatedError(ValueError):
  """Generic failure to load a .isolated file."""
class MappingError(OSError):
  """Failed to recreate the tree."""
def is_valid_hash(value, algo):
  """Returns if the value is a valid hash for the corresponding algorithm.

  A valid hash is a hex string of exactly 2 * digest_size characters.

  Arguments:
    value: string to validate.
    algo: hashlib algorithm class (e.g. hashlib.sha1).
  """
  size = 2 * algo().digest_size
  # Use \Z rather than $ to anchor the end of the string: '$' also matches
  # just before a trailing '\n', which would wrongly accept 'deadbeef...\n'.
  return bool(re.match(r'^[a-fA-F0-9]{%d}\Z' % size, value))
# Guard so the namespace/algorithm choice is only logged once per process.
get_hash_algo_has_logged = False


def get_hash_algo(namespace):
  """Return hash algorithm class to use when uploading to given |namespace|.

  The namespace is expected to start with '<algo-name>-' (e.g.
  'sha-256-deflate'). Falls back to SHA-1 when no known algorithm prefix is
  found, matching the server's historical default.
  """
  global get_hash_algo_has_logged
  chosen = None
  for name, algo in SUPPORTED_ALGOS.items():
    if namespace.startswith(name + '-'):
      chosen = algo
      break
  if not get_hash_algo_has_logged:
    get_hash_algo_has_logged = True
    if chosen:
      logging.info('Using hash algo %s for namespace %s', chosen, namespace)
    else:
      # logging.warn is a deprecated alias; use logging.warning.
      logging.warning(
          'No hash algo found in \'%s\', assuming sha-1', namespace)
  # Default to SHA-1 when the namespace doesn't name a supported algorithm.
  return chosen or hashlib.sha1
def is_namespace_with_compression(namespace):
  """Returns True if given |namespace| stores compressed objects."""
  return any(namespace.endswith(suffix) for suffix in ('-gzip', '-deflate'))
def hash_file(filepath, algo):
  """Calculates the hash of a file without reading it all in memory at once.

  |algo| should be one of hashlib hashing algorithm.
  """
  hasher = algo()
  with fs.open(filepath, 'rb') as f:
    # Stream DISK_FILE_CHUNK-sized chunks; iter() stops at the b'' sentinel
    # returned at EOF.
    for chunk in iter(lambda: f.read(DISK_FILE_CHUNK), b''):
      hasher.update(chunk)
  return hasher.hexdigest()
class IsolatedFile(object):
  """A single parsed .isolated file, optionally linked to its includes."""

  def __init__(self, obj_hash, algo):
    """|obj_hash| is really the hash of the file."""
    self.obj_hash = obj_hash
    self.algo = algo
    # Raw parsed json data; stays empty until load() is called.
    self.data = {}
    # One IsolatedFile instance per entry in self.data['includes'].
    self.children = []
    # Flips to True once load() has run; load() may only run once.
    self._is_loaded = False

  def __repr__(self):
    return 'IsolatedFile(%s, loaded: %s)' % (self.obj_hash, self._is_loaded)

  def load(self, content):
    """Verifies the .isolated file is valid and loads this object with the json
    data.
    """
    logging.debug('IsolatedFile.load(%s)', self.obj_hash)
    assert not self._is_loaded
    self.data = load_isolated(content, self.algo)
    self.children = []
    for included_hash in self.data.get('includes', []):
      self.children.append(IsolatedFile(included_hash, self.algo))
    self._is_loaded = True

  @property
  def is_loaded(self):
    """Returns True if 'load' was already called."""
    return self._is_loaded
def walk_includes(isolated):
  """Walks IsolatedFile include graph and yields IsolatedFile objects.

  Visits root node first, then recursively all children, left to right.
  Not yet loaded nodes are considered childless.
  """
  # Iterative pre-order DFS; children are pushed reversed so the leftmost
  # child is popped (and yielded) first. children is read only after the
  # node is yielded, so a consumer may load a node mid-iteration, exactly
  # like the recursive formulation.
  pending = [isolated]
  while pending:
    node = pending.pop()
    yield node
    pending.extend(reversed(node.children))
@tools.profile
def expand_symlinks(indir, relfile):
  """Follows symlinks in |relfile|, but treating symlinks that point outside the
  build tree as if they were ordinary directories/files. Returns the final
  symlink-free target and a list of paths to symlinks encountered in the
  process.

  The rule about symlinks outside the build tree is for the benefit of the
  Chromium OS ebuild, which symlinks the output directory to an unrelated path
  in the chroot.

  Fails when a directory loop is detected, although in theory we could support
  that case.

  Arguments:
    indir: root directory of the tree; symlinks resolving outside it are left
           as-is.
    relfile: path relative to |indir| to resolve; a trailing os.path.sep marks
             it as a directory.

  Returns:
    Tuple (relfile, symlinks) where |relfile| is the symlink-free path relative
    to |indir| and |symlinks| is the list of indir-relative symlink paths that
    were traversed.

  Raises:
    MappingError if a symlink target is missing or self-referential.
  """
  is_directory = relfile.endswith(os.path.sep)
  # |done| is the prefix already known to be symlink-free; |todo| is the
  # remainder still to scan.
  done = indir
  todo = relfile.strip(os.path.sep)
  symlinks = []

  while todo:
    pre_symlink, symlink, post_symlink = file_path.split_at_symlink(done, todo)
    if not symlink:
      # No symlink left in |todo|: fix the path case and finish.
      todo = file_path.fix_native_path_case(done, todo)
      done = os.path.join(done, todo)
      break
    symlink_path = os.path.join(done, pre_symlink, symlink)
    post_symlink = post_symlink.lstrip(os.path.sep)
    # readlink doesn't exist on Windows.
    # pylint: disable=E1101
    target = os.path.normpath(os.path.join(done, pre_symlink))
    symlink_target = fs.readlink(symlink_path)
    if os.path.isabs(symlink_target):
      # Absolute path are considered a normal directories. The use case is
      # generally someone who puts the output directory on a separate drive.
      target = symlink_target
    else:
      # The symlink itself could be using the wrong path case.
      target = file_path.fix_native_path_case(target, symlink_target)

    if not fs.exists(target):
      raise MappingError(
          'Symlink target doesn\'t exist: %s -> %s' % (symlink_path, target))
    target = file_path.get_native_path_case(target)
    if not file_path.path_starts_with(indir, target):
      # Target is outside |indir|: treat the symlink as an ordinary
      # directory/file and keep scanning past it.
      done = symlink_path
      todo = post_symlink
      continue

    if file_path.path_starts_with(target, symlink_path):
      # The symlink resolves to a path under itself; following it would loop.
      raise MappingError(
          'Can\'t map recursive symlink reference %s -> %s' %
          (symlink_path, target))
    logging.info('Found symlink: %s -> %s', symlink_path, target)
    symlinks.append(os.path.relpath(symlink_path, indir))
    # Treat the common prefix of the old and new paths as done, and start
    # scanning again.
    target = target.split(os.path.sep)
    symlink_path = symlink_path.split(os.path.sep)
    prefix_length = 0
    for target_piece, symlink_path_piece in zip(target, symlink_path):
      if target_piece == symlink_path_piece:
        prefix_length += 1
      else:
        break
    done = os.path.sep.join(target[:prefix_length])
    todo = os.path.join(
        os.path.sep.join(target[prefix_length:]), post_symlink)

  relfile = os.path.relpath(done, indir)
  # Re-append the trailing separator when the input denoted a directory
  # (True * sep == sep, False * sep == '').
  relfile = relfile.rstrip(os.path.sep) + is_directory * os.path.sep
  return relfile, symlinks
@tools.profile
def expand_directory_and_symlink(indir, relfile, blacklist, follow_symlinks):
  """Expands a single input. It can result in multiple outputs.

  This function is recursive when relfile is a directory.

  Note: this code doesn't properly handle recursive symlink like one created
  with:
    ln -s .. foo

  Arguments:
    indir: root directory; |relfile| must stay inside it.
    relfile: path relative to |indir|; a trailing os.path.sep marks a
             directory, which is expanded recursively.
    blacklist: optional callable taking a relative path; truthy return skips
               that entry during directory expansion (explicitly listed files
               are never skipped).
    follow_symlinks: if True, symlinks in |relfile| are resolved via
                     expand_symlinks().

  Returns:
    List of relative paths: any traversed symlinks followed by the expanded
    file(s).

  Raises:
    MappingError on absolute/escaping paths, case mismatches, missing files or
    unreadable directories.
  """
  if os.path.isabs(relfile):
    raise MappingError(u'Can\'t map absolute path %s' % relfile)

  infile = file_path.normpath(os.path.join(indir, relfile))
  if not infile.startswith(indir):
    raise MappingError(u'Can\'t map file %s outside %s' % (infile, indir))

  filepath = os.path.join(indir, relfile)
  native_filepath = file_path.get_native_path_case(filepath)
  if filepath != native_filepath:
    # Special case './'.
    if filepath != native_filepath + u'.' + os.path.sep:
      # While it'd be nice to enforce path casing on Windows, it's impractical.
      # Also give up enforcing strict path case on OSX. Really, it's that sad.
      # The case where it happens is very specific and hard to reproduce:
      # get_native_path_case(
      #    u'Foo.framework/Versions/A/Resources/Something.nib') will return
      # u'Foo.framework/Versions/A/resources/Something.nib', e.g. lowercase 'r'.
      #
      # Note that this is really something deep in OSX because running
      #  ls Foo.framework/Versions/A
      # will print out 'Resources', while file_path.get_native_path_case()
      # returns a lower case 'r'.
      #
      # So *something* is happening under the hood resulting in the command 'ls'
      # and Carbon.File.FSPathMakeRef('path').FSRefMakePath() to disagree. We
      # have no idea why.
      if sys.platform not in ('darwin', 'win32'):
        raise MappingError(
            u'File path doesn\'t equal native file path\n%s != %s' %
            (filepath, native_filepath))

  symlinks = []
  if follow_symlinks:
    try:
      relfile, symlinks = expand_symlinks(indir, relfile)
    except OSError:
      # The file doesn't exist, it will throw below.
      pass

  if relfile.endswith(os.path.sep):
    # Directory case: recursively expand every non-blacklisted entry.
    if not fs.isdir(infile):
      raise MappingError(
          u'%s is not a directory but ends with "%s"' % (infile, os.path.sep))

    # Special case './'.
    if relfile.startswith(u'.' + os.path.sep):
      relfile = relfile[2:]
    outfiles = symlinks
    try:
      for filename in fs.listdir(infile):
        inner_relfile = os.path.join(relfile, filename)
        if blacklist and blacklist(inner_relfile):
          continue
        if fs.isdir(os.path.join(indir, inner_relfile)):
          # Mark subdirectories with a trailing separator so the recursive
          # call takes the directory branch.
          inner_relfile += os.path.sep
        outfiles.extend(
            expand_directory_and_symlink(indir, inner_relfile, blacklist,
                                         follow_symlinks))
      return outfiles
    except OSError as e:
      raise MappingError(
          u'Unable to iterate over directory %s.\n%s' % (infile, e))
  else:
    # Always add individual files even if they were blacklisted.
    if fs.isdir(infile):
      raise MappingError(
          u'Input directory %s must have a trailing slash' % infile)

    if not fs.isfile(infile):
      raise MappingError(u'Input file %s doesn\'t exist' % infile)

    return symlinks + [relfile]
def expand_directories_and_symlinks(
    indir, infiles, blacklist, follow_symlinks, ignore_broken_items):
  """Expands the directories and the symlinks, applies the blacklist and
  verifies files exist.

  Files are specified in os native path separator.
  """
  results = []
  for relfile in infiles:
    try:
      expanded = expand_directory_and_symlink(
          indir, relfile, blacklist, follow_symlinks)
    except MappingError as e:
      # Broken entries either abort the whole expansion or are demoted to an
      # informational log line, depending on |ignore_broken_items|.
      if not ignore_broken_items:
        raise
      logging.info('warning: %s', e)
      continue
    results.extend(expanded)
  return results
@tools.profile
def file_to_metadata(filepath, prevdict, read_only, algo, collapse_symlinks):
  """Processes an input file, a dependency, and return meta data about it.

  Behaviors:
  - Retrieves the file mode, file size, file timestamp, file link
    destination if it is a file link and calculates the SHA-1 of the file's
    content if the path points to a file and not a symlink.

  Arguments:
    filepath: File to act on.
    prevdict: the previous dictionary. It is used to retrieve the cached hash
              to skip recalculating the hash. Optional.
    read_only: If 1 or 2, the file mode is manipulated. In practice, only save
               one of 4 modes: 0755 (rwx), 0644 (rw), 0555 (rx), 0444 (r). On
               windows, mode is not set since all files are 'executable' by
               default.
    algo: Hashing algorithm used.
    collapse_symlinks: True if symlinked files should be treated like they were
                       the normal underlying file.

  Returns:
    The necessary dict to create an entry in the 'files' section of an
    .isolated file: keys 'm' (mode), 't' (mtime), 's' (size), 'h' (hash) for
    regular files, or 'l' (link destination) for symlinks.

  Raises:
    MappingError if |filepath| is missing.
  """
  # TODO(maruel): None is not a valid value.
  assert read_only in (None, 0, 1, 2), read_only
  out = {}
  # Always check the file stat and check if it is a link. The timestamp is used
  # to know if the file's content/symlink destination should be looked into.
  # E.g. only reuse from prevdict if the timestamp hasn't changed.
  # There is the risk of the file's timestamp being reset to its last value
  # manually while its content changed. We don't protect against that use case.
  try:
    if collapse_symlinks:
      # os.stat follows symbolic links
      filestats = fs.stat(filepath)
    else:
      # os.lstat does not follow symbolic links, and thus preserves them.
      filestats = fs.lstat(filepath)
  except OSError:
    # The file is not present.
    raise MappingError('%s is missing' % filepath)
  is_link = stat.S_ISLNK(filestats.st_mode)

  if sys.platform != 'win32':
    # Ignore file mode on Windows since it's not really useful there.
    filemode = stat.S_IMODE(filestats.st_mode)
    # Remove write access for group and all access to 'others'.
    filemode &= ~(stat.S_IWGRP | stat.S_IRWXO)
    if read_only:
      # Drop the owner's write bit too when the tree is meant to be read-only.
      filemode &= ~stat.S_IWUSR
    if filemode & (stat.S_IXUSR|stat.S_IRGRP) == (stat.S_IXUSR|stat.S_IRGRP):
      # Only keep x group bit if both x user bit and group read bit are set.
      filemode |= stat.S_IXGRP
    else:
      filemode &= ~stat.S_IXGRP
    if not is_link:
      # 'm' (mode) is omitted for symlinks; the link itself carries no
      # meaningful permission bits.
      out['m'] = filemode

  # Used to skip recalculating the hash or link destination. Use the most recent
  # update time.
  out['t'] = int(round(filestats.st_mtime))

  if not is_link:
    out['s'] = filestats.st_size
    # If the timestamp wasn't updated and the file size is still the same, carry
    # on the hash.
    if (prevdict.get('t') == out['t'] and
        prevdict.get('s') == out['s']):
      # Reuse the previous hash if available.
      out['h'] = prevdict.get('h')
    if not out.get('h'):
      # Cache miss (or no usable previous entry): hash the file content.
      out['h'] = hash_file(filepath, algo)
  else:
    # If the timestamp wasn't updated, carry on the link destination.
    if prevdict.get('t') == out['t']:
      # Reuse the previous link destination if available.
      out['l'] = prevdict.get('l')
    if out.get('l') is None:
      # The link could be in an incorrect path case. In practice, this only
      # happen on OSX on case insensitive HFS.
      # TODO(maruel): It'd be better if it was only done once, in
      # expand_directory_and_symlink(), so it would not be necessary to do again
      # here.
      symlink_value = fs.readlink(filepath)  # pylint: disable=E1101
      filedir = file_path.get_native_path_case(os.path.dirname(filepath))
      native_dest = file_path.fix_native_path_case(filedir, symlink_value)
      out['l'] = os.path.relpath(native_dest, filedir)
  return out
def save_isolated(isolated, data):
  """Writes one or multiple .isolated files.

  Note: this reference implementation does not create child .isolated file so it
  always returns an empty list.

  Returns the list of child isolated files that are included by |isolated|.
  """
  # Sanity-check |data| by running it back through the parser before anything
  # touches the disk.
  hash_algo = SUPPORTED_ALGOS[data['algo']]
  load_isolated(json.dumps(data), hash_algo)
  tools.write_json(isolated, data, True)
  return []
def split_path(path):
  """Splits a path and return a list with each element.

  Components are returned leaf-first, e.g. 'a/b/c' -> ['c', 'b', 'a'].
  """
  out = []
  while path:
    head, rest = os.path.split(path)
    if rest:
      out.append(rest)
    if head == path:
      # os.path.split() made no progress, which happens for a root or
      # drive-only path (os.path.split('/') == ('/', '')). Without this guard
      # the loop would never terminate.
      break
    path = head
  return out
def load_isolated(content, algo):
  """Verifies the .isolated file is valid and loads this object with the json
  data.

  Arguments:
  - content: raw serialized content to load.
  - algo: hashlib algorithm class. Used to confirm the algorithm matches the
          algorithm used on the Isolate Server.

  Returns:
    The parsed dict, with 'files' keys and symlink destinations converted to
    the native os.path.sep.

  Raises:
    IsolatedError when |content| is not valid .isolated data.
  """
  if not algo:
    raise IsolatedError('\'algo\' is required')
  try:
    data = json.loads(content)
  except ValueError as v:
    logging.error('Failed to parse .isolated file:\n%s', content)
    raise IsolatedError('Failed to parse (%s): %s...' % (v, content[:100]))
  if not isinstance(data, dict):
    raise IsolatedError('Expected dict, got %r' % data)

  # Check 'version' first, since it could modify the parsing after.
  value = data.get('version', '1.0')
  if not isinstance(value, basestring):
    raise IsolatedError('Expected string, got %r' % value)
  try:
    version = tuple(map(int, value.split('.')))
  except ValueError:
    raise IsolatedError('Expected valid version, got %r' % value)

  expected_version = tuple(map(int, ISOLATED_FILE_VERSION.split('.')))
  # Major version must match.
  if version[0] != expected_version[0]:
    raise IsolatedError(
        'Expected compatible \'%s\' version, got %r' %
        (ISOLATED_FILE_VERSION, value))

  algo_name = SUPPORTED_ALGOS_REVERSE[algo]

  for key, value in data.iteritems():
    if key == 'algo':
      # Must name a supported algorithm and match the one the caller expects.
      if not isinstance(value, basestring):
        raise IsolatedError('Expected string, got %r' % value)
      if value not in SUPPORTED_ALGOS:
        raise IsolatedError(
            'Expected one of \'%s\', got %r' %
            (', '.join(sorted(SUPPORTED_ALGOS)), value))
      if value != SUPPORTED_ALGOS_REVERSE[algo]:
        raise IsolatedError(
            'Expected \'%s\', got %r' % (SUPPORTED_ALGOS_REVERSE[algo], value))

    elif key == 'command':
      # Non-empty list of strings.
      if not isinstance(value, list):
        raise IsolatedError('Expected list, got %r' % value)
      if not value:
        raise IsolatedError('Expected non-empty command')
      for subvalue in value:
        if not isinstance(subvalue, basestring):
          raise IsolatedError('Expected string, got %r' % subvalue)

    elif key == 'files':
      # Map of relative path -> per-file metadata dict with keys:
      # 'l' (link dest), 'm' (mode), 'h' (hash), 's' (size), 't' (file type).
      if not isinstance(value, dict):
        raise IsolatedError('Expected dict, got %r' % value)
      for subkey, subvalue in value.iteritems():
        if not isinstance(subkey, basestring):
          raise IsolatedError('Expected string, got %r' % subkey)
        if os.path.isabs(subkey) or subkey.startswith('\\\\'):
          # Disallow '\\\\', it could UNC on Windows but disallow this
          # everywhere.
          raise IsolatedError('File path can\'t be absolute: %r' % subkey)
        if subkey.endswith(('/', '\\')):
          raise IsolatedError(
              'File path can\'t end with \'%s\': %r' % (subkey[-1], subkey))
        if '..' in split_path(subkey):
          raise IsolatedError('File path can\'t reference parent: %r' % subkey)
        if not isinstance(subvalue, dict):
          raise IsolatedError('Expected dict, got %r' % subvalue)
        for subsubkey, subsubvalue in subvalue.iteritems():
          if subsubkey == 'l':
            if not isinstance(subsubvalue, basestring):
              raise IsolatedError('Expected string, got %r' % subsubvalue)
          elif subsubkey == 'm':
            if not isinstance(subsubvalue, int):
              raise IsolatedError('Expected int, got %r' % subsubvalue)
          elif subsubkey == 'h':
            if not is_valid_hash(subsubvalue, algo):
              raise IsolatedError('Expected %s, got %r' %
                                  (algo_name, subsubvalue))
          elif subsubkey == 's':
            if not isinstance(subsubvalue, (int, long)):
              raise IsolatedError('Expected int or long, got %r' % subsubvalue)
          elif subsubkey == 't':
            if subsubvalue not in SUPPORTED_FILE_TYPES:
              raise IsolatedError('Expected one of \'%s\', got %r' % (
                  ', '.join(sorted(SUPPORTED_FILE_TYPES)), subsubvalue))
          else:
            raise IsolatedError('Unknown subsubkey %s' % subsubkey)
        # Cross-key invariants: an entry is either a symlink ('l' only) or a
        # regular file ('h' + 's', optionally 'm').
        if bool('h' in subvalue) == bool('l' in subvalue):
          raise IsolatedError(
              'Need only one of \'h\' (%s) or \'l\' (link), got: %r' %
              (algo_name, subvalue))
        if bool('h' in subvalue) != bool('s' in subvalue):
          raise IsolatedError(
              'Both \'h\' (%s) and \'s\' (size) should be set, got: %r' %
              (algo_name, subvalue))
        if bool('s' in subvalue) == bool('l' in subvalue):
          raise IsolatedError(
              'Need only one of \'s\' (size) or \'l\' (link), got: %r' %
              subvalue)
        if bool('l' in subvalue) and bool('m' in subvalue):
          raise IsolatedError(
              'Cannot use \'m\' (mode) and \'l\' (link), got: %r' %
              subvalue)

    elif key == 'includes':
      # Non-empty list of valid hashes of child .isolated files.
      if not isinstance(value, list):
        raise IsolatedError('Expected list, got %r' % value)
      if not value:
        raise IsolatedError('Expected non-empty includes list')
      for subvalue in value:
        if not is_valid_hash(subvalue, algo):
          raise IsolatedError('Expected %s, got %r' % (algo_name, subvalue))

    elif key == 'os':
      # 'os' was removed from the format in version 1.4.
      if version >= (1, 4):
        raise IsolatedError('Key \'os\' is not allowed starting version 1.4')

    elif key == 'read_only':
      if not value in (0, 1, 2):
        raise IsolatedError('Expected 0, 1 or 2, got %r' % value)

    elif key == 'relative_cwd':
      if not isinstance(value, basestring):
        raise IsolatedError('Expected string, got %r' % value)

    elif key == 'version':
      # Already checked above.
      pass

    else:
      raise IsolatedError('Unknown key %r' % key)

  # Automatically fix os.path.sep if necessary. While .isolated files are always
  # in the the native path format, someone could want to download an .isolated
  # tree from another OS.
  wrong_path_sep = '/' if os.path.sep == '\\' else '\\'
  if 'files' in data:
    data['files'] = dict(
        (k.replace(wrong_path_sep, os.path.sep), v)
        for k, v in data['files'].iteritems())
    for v in data['files'].itervalues():
      if 'l' in v:
        v['l'] = v['l'].replace(wrong_path_sep, os.path.sep)
  if 'relative_cwd' in data:
    data['relative_cwd'] = data['relative_cwd'].replace(
        wrong_path_sep, os.path.sep)
  return data