trace_inputs.py - infra/luci/client-py - Git at Google

 #!/usr/bin/env python
 # coding=utf-8
 # Copyright 2012 The LUCI Authors. All rights reserved.
 # Use of this source code is governed under the Apache License, Version 2.0
 # that can be found in the LICENSE file.

 """Traces an executable and its child processes and extract the files accessed
 by them.

 The implementation uses OS-specific API. The native Kernel logger and the ETL
 interface is used on Windows. Dtrace is used on OSX. Strace is used otherwise.
 The OS-specific implementation is hidden in an 'API' interface.

 The results are embedded in a Results instance. The tracing is done in two
 phases, the first is to do the actual trace and generate an
 implementation-specific log file. Then the log file is parsed to extract the
 information, including the individual child processes and the files accessed
 from the log.
 """

 import codecs
 import csv
 import errno
 import getpass
 import glob
 import logging
 import os
 import re
 import stat
 import subprocess
 import sys
 import tempfile
 import threading
 import time
 import weakref

 from utils import tools
 tools.force_local_third_party()

 # third_party/
 import colorama
 from depot_tools import fix_encoding
 from depot_tools import subcommand

 # pylint: disable=ungrouped-imports
 from utils import file_path
 from utils import fs
 from utils import logging_utils
 from utils import subprocess42

 ## OS-specific imports

 if sys.platform == 'win32':
   from ctypes.wintypes import byref, c_int, c_wchar_p
   from ctypes.wintypes import windll # pylint: disable=E0611


 __version__ = '0.2'


 BASE_DIR = os.path.dirname(os.path.abspath(
     __file__.decode(sys.getfilesystemencoding())))
 ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))


 class TracingFailure(Exception):
   """An exception occurred during tracing."""
   def __init__(self, description, pid, line_number, line, *args):
     super(TracingFailure, self).__init__(
         description, pid, line_number, line, *args)
     self.description = description
     self.pid = pid
     self.line_number = line_number
     self.line = line
     self.extra = args

   def __str__(self):
     out = self.description
     if self.pid:
       out += '\npid: %d' % self.pid
     if self.line_number:
       out += '\nline: %d' % self.line_number
     if self.line:
       out += '\n%s' % self.line
     if self.extra:
       out += '\n' + ', '.join(map(str, filter(None, self.extra)))
     return out


 ## OS-specific functions

 if sys.platform == 'win32':
   def get_current_encoding():
     """Returns the 'ANSI' code page associated to the process."""
     return 'cp%d' % int(windll.kernel32.GetACP())


   def CommandLineToArgvW(command_line):
     """Splits a commandline into argv using CommandLineToArgvW()."""
     # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
     size = c_int()
     assert isinstance(command_line, unicode)
     ptr = windll.shell32.CommandLineToArgvW(command_line, byref(size))
     try:
       return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
     finally:
       windll.kernel32.LocalFree(ptr)


 def can_trace():
   """Returns True if the user is an administrator on Windows.

   It is required for tracing to work.
   """
   if sys.platform != 'win32':
     return True
   return bool(windll.shell32.IsUserAnAdmin())


 def create_subprocess_thunk():
   """Creates a small temporary script to start the child process.

   This thunk doesn't block, its unique name is used to identify it as the
   parent.
   """
   handle, name = tempfile.mkstemp(prefix=u'trace_inputs_thunk', suffix='.py')
   try:
     os.write(
         handle,
         (
           'import subprocess, sys\n'
           'sys.exit(subprocess.call(sys.argv[2:]))\n'
         ))
   finally:
     os.close(handle)
   return name


 def create_exec_thunk():
   """Creates a small temporary script to start the child executable.

   Reads from the file handle provided as the first argument to block, then
   execv() the command to be traced.
   """
   handle, name = tempfile.mkstemp(prefix=u'trace_inputs_thunk', suffix='.py')
   try:
     os.write(
         handle,
         (
           'import os, sys\n'
           'fd = int(sys.argv[1])\n'
           # This will block until the controlling process writes a byte on the
           # pipe. It will do so once the tracing tool, e.g. strace, is ready to
           # trace.
           'os.read(fd, 1)\n'
           'os.close(fd)\n'
           'os.execv(sys.argv[2], sys.argv[2:])\n'
         ))
   finally:
     os.close(handle)
   return name


 def strace_process_quoted_arguments(text):
   """Extracts quoted arguments on a string and return the arguments as a list.

   Implemented as an automaton. Supports incomplete strings in the form
   '"foo"...'.

   Example:
     With text = '"foo", "bar"', the function will return ['foo', 'bar']

   TODO(maruel): Implement escaping.
   """
   # All the possible states of the DFA.
   (NEED_QUOTE,         # Beginning of a new arguments.
    INSIDE_STRING,      # Inside an argument.
    ESCAPED,            # Found a '\' inside a quote. Treat the next char as-is.
    NEED_COMMA_OR_DOT,  # Right after the closing quote of an argument. Could be
                        # a series of 3 dots or a comma.
    NEED_SPACE,         # Right after a comma
    NEED_DOT_2,         # Found a dot, need a second one.
    NEED_DOT_3,         # Found second dot, need a third one.
    NEED_COMMA,         # Found third dot, need a comma.
    ) = range(8)

   state = NEED_QUOTE
   out = []
   for index, char in enumerate(text):
     if char == '"':
       if state == NEED_QUOTE:
         state = INSIDE_STRING
         # A new argument was found.
         out.append('')
       elif state == INSIDE_STRING:
         # The argument is now closed.
         state = NEED_COMMA_OR_DOT
       elif state == ESCAPED:
         out[-1] += char
         state = INSIDE_STRING
       else:
         raise ValueError(
             'Can\'t process char \'%s\' at column %d for: %r' % (
               char, index, text),
             index,
             text)
     elif char == ',':
       if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
         state = NEED_SPACE
       elif state == INSIDE_STRING:
         out[-1] += char
       elif state == ESCAPED:
         out[-1] += char
         state = INSIDE_STRING
       else:
         raise ValueError(
             'Can\'t process char \'%s\' at column %d for: %r' % (
               char, index, text),
             index,
             text)
     elif char == ' ':
       if state == NEED_SPACE:
         state = NEED_QUOTE
       elif state == INSIDE_STRING:
         out[-1] += char
       elif state == ESCAPED:
         out[-1] += char
         state = INSIDE_STRING
       else:
         raise ValueError(
             'Can\'t process char \'%s\' at column %d for: %r' % (
               char, index, text),
             index,
             text)
     elif char == '.':
       if state in (NEED_QUOTE, NEED_COMMA_OR_DOT):
         # The string is incomplete, this mean the strace -s flag should be
         # increased.
         # For NEED_QUOTE, the input string would look like '"foo", ...'.
         # For NEED_COMMA_OR_DOT, the input string would look like '"foo"...'
         state = NEED_DOT_2
       elif state == NEED_DOT_2:
         state = NEED_DOT_3
       elif state == NEED_DOT_3:
         state = NEED_COMMA
       elif state == INSIDE_STRING:
         out[-1] += char
       elif state == ESCAPED:
         out[-1] += char
         state = INSIDE_STRING
       else:
         raise ValueError(
             'Can\'t process char \'%s\' at column %d for: %r' % (
               char, index, text),
             index,
             text)
     elif char == '\\':
       if state == ESCAPED:
         out[-1] += char
         state = INSIDE_STRING
       elif state == INSIDE_STRING:
         state = ESCAPED
       else:
         raise ValueError(
             'Can\'t process char \'%s\' at column %d for: %r' % (
               char, index, text),
             index,
             text)
     else:
       if state == INSIDE_STRING:
         out[-1] += char
       else:
         raise ValueError(
             'Can\'t process char \'%s\' at column %d for: %r' % (
               char, index, text),
             index,
             text)
   if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
     raise ValueError(
         'String is incorrectly terminated: %r' % text,
         text)
   return out


 def assert_is_renderable(pseudo_string):
   """Asserts the input is a valid object to be processed by render()."""
   assert (
       pseudo_string is None or
       isinstance(pseudo_string, unicode) or
       hasattr(pseudo_string, 'render')), repr(pseudo_string)


 def render(pseudo_string):
   """Converts the pseudo-string to an unicode string."""
   if pseudo_string is None or isinstance(pseudo_string, unicode):
     return pseudo_string
   return pseudo_string.render()


 class Results(object):
   """Results of a trace session."""

   class _TouchedObject(object):
     """Something, a file or a directory, that was accessed."""
     def __init__(self, root, path, tainted, size, nb_files):
       logging.debug(
           '%s(%s, %s, %s, %s, %s)' %
           (self.__class__.__name__, root, path, tainted, size, nb_files))
       assert_is_renderable(root)
       assert_is_renderable(path)
       self.root = root
       self.path = path
       self.tainted = tainted
       self.nb_files = nb_files
       # Can be used as a cache or a default value, depending on context. In
       # particular, once self.tainted is True, because the path was replaced
       # with a variable, it is not possible to look up the file size.
       self._size = size
       # These are cache only.
       self._real_path = None

       # Check internal consistency.
       assert path, path
       assert tainted or bool(root) != bool(file_path.isabs(path)), (root, path)
       assert tainted or (
           not fs.exists(self.full_path) or
           (self.full_path == file_path.get_native_path_case(self.full_path))), (
               tainted,
               self.full_path,
               file_path.get_native_path_case(self.full_path))

     @property
     def existent(self):
       return self.size != -1

     @property
     def full_path(self):
       if self.root:
         return os.path.join(self.root, self.path)
       return self.path

     @property
     def real_path(self):
       """Returns the path with symlinks resolved."""
       if not self._real_path:
         self._real_path = os.path.realpath(self.full_path)
       return self._real_path

     @property
     def size(self):
       """File's size. -1 is not existent.

       Once tainted, it is not possible the retrieve the file size anymore since
       the path is composed of variables.
       """
       if self._size is None and not self.tainted:
         try:
           self._size = fs.stat(self.full_path).st_size
         except OSError:
           self._size = -1
       return self._size

     def flatten(self):
       """Returns a dict representing this object.

       A 'size' of 0 means the file was only touched and not read.
       """
       return {
         'path': self.path,
         'size': self.size,
       }

     def replace_variables(self, variables):
       """Replaces the root of this File with one of the variables if it matches.

       If a variable replacement occurs, the cloned object becomes tainted.
       """
       for variable, root_path in variables.iteritems():
         if self.path.startswith(root_path):
           return self._clone(
               self.root, variable + self.path[len(root_path):], True)
       # No need to clone, returns ourself.
       return self

     def strip_root(self, root):
       """Returns a clone of itself with 'root' stripped off.

       Note that the file is kept if it is either accessible from a symlinked
       path that was used to access the file or through the real path.
       """
       # Check internal consistency.
       assert (
           self.tainted or
           (file_path.isabs(root) and root.endswith(os.path.sep))), root
       if not self.full_path.startswith(root):
         # Now try to resolve the symlinks to see if it can be reached this way.
         # Only try *after* trying without resolving symlink.
         if not self.real_path.startswith(root):
           return None
         path = self.real_path
       else:
         path = self.full_path
       return self._clone(root, path[len(root):], self.tainted)

     def _clone(self, new_root, new_path, tainted):
       raise NotImplementedError(self.__class__.__name__)

   class File(_TouchedObject):
     """A file that was accessed. May not be present anymore.

     If tainted is true, it means it is not a real path anymore as a variable
     replacement occurred.

     |mode| can be one of None, TOUCHED, READ or WRITE.
     """
     # Was probed for existence, and it is existent, but was never _opened_.
     TOUCHED = 't'
     # Opened for read only and guaranteed to not have been written to.
     READ = 'r'
     # Opened for write.
     WRITE = 'w'

     # They are listed in order of priority. E.g. if a file is traced as TOUCHED
     # then as WRITE, only keep WRITE. None means no idea, which is a problem on
     # Windows.
     ACCEPTABLE_MODES = (None, TOUCHED, READ, WRITE)

     def __init__(self, root, path, tainted, size, mode):
       assert mode in self.ACCEPTABLE_MODES
       super(Results.File, self).__init__(root, path, tainted, size, 1)
       self.mode = mode

     def _clone(self, new_root, new_path, tainted):
       """Clones itself keeping meta-data."""
       # Keep the self.size and self._real_path caches for performance reason. It
       # is also important when the file becomes tainted (with a variable instead
       # of the real path) since self.path is not an on-disk path anymore so
       # out._size cannot be updated.
       out = self.__class__(new_root, new_path, tainted, self.size, self.mode)
       out._real_path = self._real_path
       return out

     def flatten(self):
       out = super(Results.File, self).flatten()
       out['mode'] = self.mode
       return out

   class Directory(_TouchedObject):
     """A directory of files. Must exist.

     For a Directory instance, self.size is not a cache, it's an actual value
     that is never modified and represents the total size of the files contained
     in this directory. It is possible that the directory is empty so that
     size==0; this happens if there's only an invalid symlink in it.
     """
     def __init__(self, root, path, tainted, size, nb_files):
       """path='.' is a valid value and must be handled appropriately."""
       assert not path.endswith(os.path.sep), path
       super(Results.Directory, self).__init__(
           root, path + os.path.sep, tainted, size, nb_files)

     def flatten(self):
       out = super(Results.Directory, self).flatten()
       out['nb_files'] = self.nb_files
       return out

     def _clone(self, new_root, new_path, tainted):
       """Clones itself keeping meta-data."""
       out = self.__class__(
           new_root,
           new_path.rstrip(os.path.sep),
           tainted,
           self.size,
           self.nb_files)
       out._real_path = self._real_path
       return out

   class Process(object):
     """A process that was traced.

     Contains references to the files accessed by this process and its children.
     """
     def __init__(self, pid, files, executable, command, initial_cwd, children):
       logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
       self.pid = pid
       self.files = sorted(files, key=lambda x: x.path)
       self.children = children
       self.executable = executable
       self.command = command
       self.initial_cwd = initial_cwd

       # Check internal consistency.
       assert len(set(f.path for f in self.files)) == len(self.files), sorted(
           f.path for f in self.files)
       assert isinstance(self.children, list)
       assert isinstance(self.files, list)

     @property
     def all(self):
       for child in self.children:
         for i in child.all:
           yield i
       yield self

     def flatten(self):
       return {
         'children': [c.flatten() for c in self.children],
         'command': self.command,
         'executable': self.executable,
         'files': [f.flatten() for f in self.files],
         'initial_cwd': self.initial_cwd,
         'pid': self.pid,
       }

     def strip_root(self, root):
       assert file_path.isabs(root) and root.endswith(os.path.sep), root
       # Loads the files after since they are constructed as objects.
       out = self.__class__(
           self.pid,
           filter(None, (f.strip_root(root) for f in self.files)),
           self.executable,
           self.command,
           self.initial_cwd,
           [c.strip_root(root) for c in self.children])
       logging.debug(
           'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
       return out

   def __init__(self, process):
     self.process = process
     # Cache.
     self._files = None

   def flatten(self):
     return {
       'root': self.process.flatten(),
     }

   @property
   def files(self):
     if self._files is None:
       self._files = sorted(
           sum((p.files for p in self.process.all), []),
           key=lambda x: x.path)
     return self._files

   @property
   def existent(self):
     return [f for f in self.files if f.existent]

   @property
   def non_existent(self):
     return [f for f in self.files if not f.existent]

   def strip_root(self, root):
     """Returns a clone with all the files outside the directory |root| removed
     and converts all the path to be relative paths.

     It keeps files accessible through the |root| directory or that have been
     accessed through any symlink which points to the same directory.
     """
     # Resolve any symlink
     root = os.path.realpath(root)
     root = (
         file_path.get_native_path_case(root).rstrip(os.path.sep) + os.path.sep)
     logging.debug('strip_root(%s)' % root)
     return Results(self.process.strip_root(root))


 class ApiBase(object):
   """OS-agnostic API to trace a process and its children."""
   class Context(object):
     """Processes one log line at a time and keeps the list of traced processes.

     The parsing is complicated by the fact that logs are traced out of order for
     strace but in-order for dtrace and logman. In addition, on Windows it is
     very frequent that processids are reused so a flat list cannot be used. But
     at the same time, it is impossible to faithfully construct a graph when the
     logs are processed out of order. So both a tree and a flat mapping are used,
     the tree is the real process tree, while the flat mapping stores the last
     valid process for the corresponding processid. For the strace case, the
     tree's head is guessed at the last moment.
     """
     class Process(object):
       """Keeps context for one traced child process.

       Logs all the files this process touched. Ignores directories.
       """
       def __init__(self, blacklist, pid, initial_cwd):
         # Check internal consistency.
         assert isinstance(pid, int), repr(pid)
         assert_is_renderable(initial_cwd)
         self.pid = pid
         # children are Process instances.
         self.children = []
         self.initial_cwd = initial_cwd
         self.cwd = None
         self.files = {}
         self.executable = None
         self.command = None
         self._blacklist = blacklist

       def to_results_process(self):
         """Resolves file case sensitivity and or late-bound strings."""
         # When resolving files, it's normal to get dupe because a file could be
         # opened multiple times with different case. Resolve the deduplication
         # here.
         def fix_path(x):
           """Returns the native file path case.

           Converts late-bound strings.
           """
           if not x:
             # Do not convert None instance to 'None'.
             return x
           x = render(x)
           if os.path.isabs(x):
             # If the path is not absolute, which tends to happen occasionally on
             # Windows, it is not possible to get the native path case so ignore
             # that trace. It mostly happens for 'executable' value.
             x = file_path.get_native_path_case(x)
           return x

         def fix_and_blacklist_path(x, m):
           """Receives a tuple (filepath, mode) and processes filepath."""
           x = fix_path(x)
           if not x:
             return None, None
           # The blacklist needs to be reapplied, since path casing could
           # influence blacklisting.
           if self._blacklist(x):
             return None, None
           # Filters out directories. Some may have passed through.
           if fs.isdir(x):
             return None, None
           return x, m

         # Renders all the files as strings, as some could be RelativePath
         # instances. It is important to do it first since there could still be
         # multiple entries with the same path but different modes.
         rendered = (
             fix_and_blacklist_path(f, m) for f, m in self.files.iteritems())
         files = sorted(
           (f for f in rendered if f[0]),
           key=lambda x: (x[0], Results.File.ACCEPTABLE_MODES.index(x[1])))
         # Then converting into a dict will automatically clean up lesser
         # important values.
         files = [
           Results.File(None, f, False, None, m)
           for f, m in dict(files).iteritems()
         ]
         return Results.Process(
             self.pid,
             files,
             fix_path(self.executable),
             self.command,
             fix_path(self.initial_cwd),
             [c.to_results_process() for c in self.children])

       def add_file(self, filepath, mode):
         """Adds a file if it passes the blacklist."""
         if self._blacklist(render(filepath)):
           return
         logging.debug('add_file(%d, %s, %s)', self.pid, filepath, mode)
         # Note that filepath and not render(filepath) is added. It is because
         # filepath could be something else than a string, like a RelativePath
         # instance for dtrace logs.
         modes = Results.File.ACCEPTABLE_MODES
         old_mode = self.files.setdefault(filepath, mode)
         if old_mode != mode and modes.index(old_mode) < modes.index(mode):
           # Take the highest value.
           self.files[filepath] = mode

     def __init__(self, blacklist):
       self.blacklist = blacklist
       # Initial process.
       self.root_process = None
       # dict to accelerate process lookup, to not have to lookup the whole graph
       # each time.
       self._process_lookup = {}

   class Tracer(object):
     """During it's lifetime, the tracing subsystem is enabled."""
     def __init__(self, logname):
       self._logname = logname
       self._lock = threading.RLock()
       self._traces = []
       self._initialized = True
       self._scripts_to_cleanup = []

     def trace(self, cmd, cwd, tracename, output):
       """Runs the OS-specific trace program on an executable.

       Arguments:
       - cmd: The command (a list) to run.
       - cwd: Current directory to start the child process in.
       - tracename: Name of the trace in the logname file.
       - output: If False, redirects output to PIPEs.

       Returns a tuple (resultcode, output) and updates the internal trace
       entries.
       """
       # The implementation adds an item to self._traces.
       raise NotImplementedError(self.__class__.__name__)

     def close(self, _timeout=None):
       """Saves the meta-data in the logname file.

       For kernel-based tracing, stops the tracing subsystem.

       Must not be used manually when using 'with' construct.
       """
       with self._lock:
         if not self._initialized:
           raise TracingFailure(
               'Called %s.close() on an unitialized object' %
                   self.__class__.__name__,
               None, None, None)
         try:
           while self._scripts_to_cleanup:
             try:
               fs.remove(self._scripts_to_cleanup.pop())
             except OSError as e:
               logging.error('Failed to delete a temporary script: %s', e)
           tools.write_json(self._logname, self._gen_logdata(), False)
         finally:
           self._initialized = False

     def post_process_log(self):
       """Post-processes the log so it becomes faster to load afterward.

       Must not be used manually when using 'with' construct.
       """
       assert not self._initialized, 'Must stop tracing first.'

     def _gen_logdata(self):
       """Returns the data to be saved in the trace file."""
       return  {
         'traces': self._traces,
       }

     def __enter__(self):
       """Enables 'with' statement."""
       return self

     def __exit__(self, exc_type, exc_value, traceback):
       """Enables 'with' statement."""
       self.close()
       # If an exception was thrown, do not process logs.
       if not exc_type:
         self.post_process_log()

   def get_tracer(self, logname):
     """Returns an ApiBase.Tracer instance.

     Initializes the tracing subsystem, which is a requirement for kernel-based
     tracers. Only one tracer instance should be live at a time!

     logname is the filepath to the json file that will contain the meta-data
     about the logs.
     """
     return self.Tracer(logname)

   @staticmethod
   def clean_trace(logname):
     """Deletes an old log."""
     raise NotImplementedError()

   @classmethod
   def parse_log(cls, logname, blacklist, trace_name):
     """Processes trace logs and returns the files opened and the files that do
     not exist.

     It does not track directories.

     Arguments:
       - logname: must be an absolute path.
       - blacklist: must be a lambda.
       - trace_name: optional trace to read, defaults to reading all traces.

     Most of the time, files that do not exist are temporary test files that
     should be put in /tmp instead. See http://crbug.com/116251.

     Returns a list of dict with keys:
       - results: A Results instance.
       - trace: The corresponding tracename parameter provided to
                get_tracer().trace().
       - output: Output gathered during execution, if get_tracer().trace(...,
                 output=False) was used.
     """
     raise NotImplementedError(cls.__class__.__name__)


 class Strace(ApiBase):
   """strace implies linux."""
   @staticmethod
   def load_filename(filename):
     """Parses a filename in a log."""
     # TODO(maruel): Be compatible with strace -x.
     assert isinstance(filename, str)
     out = ''
     i = 0
     while i < len(filename):
       c = filename[i]
       if c == '\\':
         out += chr(int(filename[i+1:i+4], 8))
         i += 4
       else:
         out += c
         i += 1
     # TODO(maruel): That's not necessarily true that the current code page is
     # utf-8.
     return out.decode('utf-8')

   class Context(ApiBase.Context):
     """Processes a strace log line and keeps the list of existent and non
     existent files accessed.

     Ignores directories.

     Uses late-binding to processes the cwd of each process. The problem is that
     strace generates one log file per process it traced but doesn't give any
     information about which process was started when and by who. So process the
     logs out of order and use late binding with RelativePath to be able to
     deduce the initial directory of each process once all the logs are parsed.

     TODO(maruel): Use the script even in the non-sudo case, so log parsing can
     be done in two phase: first find the root process, then process the child
     processes in order. With that, it should be possible to not use RelativePath
     anymore. This would significantly simplify the code!
     """
     class Process(ApiBase.Context.Process):
       """Represents the state of a process.

       Contains all the information retrieved from the pid-specific log.
       """
       # Function names are using ([a-z_0-9]+)
       # This is the most common format. function(args) = result
       RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.*?)\)\s+= (.+)$')
       # An interrupted function call, only grab the minimal header.
       RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
       # A resumed function call.
       RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
       # A process received a signal.
       RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
       # A process didn't handle a signal. Ignore any junk appearing before,
       # because the process was forcibly killed so it won't open any new file.
       RE_KILLED = re.compile(
           r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$')
       # The process has exited.
       RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
       # A call was canceled. Ignore any prefix.
       RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$')
       # The process has exited due to the ptrace sandbox. RE_PROCESS_EXITED will
       # follow on next line.
       RE_PTRACE = re.compile(r'^.*<ptrace\(SYSCALL\):No such process>$')
       # Happens when strace fails to even get the function name.
       UNNAMED_FUNCTION = '????'

       # Corner-case in python, a class member function decorator must not be
       # @staticmethod.
       def parse_args(regexp, expect_zero):  # pylint: disable=E0213
         """Automatically convert the str 'args' into a list of processed
         arguments.

         Arguments:
         - regexp is used to parse args.
         - expect_zero: one of True, False or None.
           - True: will check for result.startswith('0') first and will ignore
             the trace line completely otherwise. This is important because for
             many functions, the regexp will not process if the call failed.
           - False: will check for not result.startswith(('?', '-1')) for the
             same reason than with True.
           - None: ignore result.
         """
         def meta_hook(function):
           assert function.__name__.startswith('handle_')
           def hook(self, args, result):
             if expect_zero is True and not result.startswith('0'):
               return None
             if expect_zero is False and result.startswith(('?', '-1')):
               return None
             match = re.match(regexp, args)
             if not match:
               raise TracingFailure(
                   'Failed to parse %s(%s) = %s' %
                   (function.__name__[len('handle_'):], args, result),
                   None, None, None)
             return function(self, match.groups(), result)
           return hook
         return meta_hook

       class RelativePath(object):
         """A late-bound relative path."""
         def __init__(self, parent, value):
           assert_is_renderable(parent)
           self.parent = parent
           assert (
               value is None or
               (isinstance(value, unicode) and not os.path.isabs(value)))
           self.value = value
           if self.value:
             # TODO(maruel): On POSIX, '\\' is a valid character so remove this
             # assert.
             assert '\\' not in self.value, value
             assert '\\' not in self.value, (repr(value), repr(self.value))

         def render(self):
           """Returns the current directory this instance is representing.

           This function is used to return the late-bound value.
           """
           assert self.parent is not None
           parent = render(self.parent)
           if self.value:
             return os.path.normpath(os.path.join(parent, self.value))
           return parent

       def __init__(self, root, pid):
         """Keeps enough information to be able to guess the original process
         root.

         strace doesn't store which process was the initial process. So more
         information needs to be kept so the graph can be reconstructed from the
         flat map.
         """
         logging.info('%s(%d)' % (self.__class__.__name__, pid))
         super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
         assert isinstance(root, ApiBase.Context)
         self._root = weakref.ref(root)
         # The dict key is the function name of the pending call, like 'open'
         # or 'execve'.
         self._pending_calls = {}
         self._line_number = 0
         # Current directory when the process started.
         if isinstance(self._root(), unicode):
           self.initial_cwd = self._root()
         else:
           self.initial_cwd = self.RelativePath(self._root(), None)
         self.parentid = None
         self._done = False

       def get_cwd(self):
         """Returns the best known value of cwd."""
         return self.cwd or self.initial_cwd

       def render(self):
         """Returns the string value of the RelativePath() object.

         Used by RelativePath. Returns the initial directory and not the
         current one since the current directory 'cwd' validity is time-limited.

         The validity is only guaranteed once all the logs are processed.
         """
         return self.initial_cwd.render()

       def on_line(self, line):
         assert isinstance(line, str)
         self._line_number += 1
         try:
           if self._done:
             raise TracingFailure(
                 'Found a trace for a terminated process or corrupted log',
                 None, None, None)

           if self.RE_SIGNAL.match(line):
             # Ignore signals.
             return None

           match = self.RE_KILLED.match(line)
           if match:
             # Converts a '+++ killed by Foo +++' trace into an exit_group().
             self.handle_exit_group(match.group(1), None)
             return None

           match = self.RE_PROCESS_EXITED.match(line)
           if match:
             # Converts a '+++ exited with 1 +++' trace into an exit_group()
             self.handle_exit_group(match.group(1), None)
             return None

           match = self.RE_UNFINISHED.match(line)
           if match:
             if match.group(1) in self._pending_calls:
               raise TracingFailure(
                   'Found two unfinished calls for the same function',
                   None, None, None,
                   self._pending_calls)
             self._pending_calls[match.group(1)] = (
                 match.group(1) + match.group(2))
             return None

           match = self.RE_UNAVAILABLE.match(line)
           if match:
             # This usually means a process was killed and a pending call was
             # canceled.
             # TODO(maruel): Look up the last exit_group() trace just above and
             # make sure any self._pending_calls[anything] is properly flushed.
             return None

           match = self.RE_PTRACE.match(line)
           if match:
             # Not sure what this means. Anyhow, the process died.
             # TODO(maruel): Add note that only RE_PROCESS_EXITED is valid
             # afterward.
             return None

           match = self.RE_RESUMED.match(line)
           if match:
             if match.group(1) not in self._pending_calls:
               raise TracingFailure(
                   'Found a resumed call that was not logged as unfinished',
                   None, None, None,
                   self._pending_calls)
             pending = self._pending_calls.pop(match.group(1))
             # Reconstruct the line.
             line = pending + match.group(2)

           match = self.RE_HEADER.match(line)
           if not match:
             # The line is corrupted. It happens occasionally when a process is
             # killed forcibly with activity going on. Assume the process died.
             # No other line can be processed afterward.
             logging.debug('%d is done: %s', self.pid, line)
             self._done = True
             return None

           if match.group(1) == self.UNNAMED_FUNCTION:
             return None

           # It's a valid line, handle it.
           handler = getattr(self, 'handle_%s' % match.group(1), None)
           if not handler:
             self._handle_unknown(match.group(1), match.group(2), match.group(3))
           return handler(match.group(2), match.group(3))
         except TracingFailure, e:
           # Hack in the values since the handler could be a static function.
           e.pid = self.pid
           e.line = line
           e.line_number = self._line_number
           # Re-raise the modified exception.
           raise
         except (KeyError, NotImplementedError, ValueError), e:
           raise TracingFailure(
               'Trace generated a %s exception: %s' % (
                   e.__class__.__name__, str(e)),
               self.pid,
               self._line_number,
               line,
               e)

       @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True)
       def handle_access(self, args, _result):
         self._handle_file(args[0], Results.File.TOUCHED)

       @parse_args(r'^\"(.+?)\"$', True)
       def handle_chdir(self, args, _result):
         """Updates cwd."""
         self.cwd = self._mangle(args[0])
         logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))

       @parse_args(r'^\"(.+?)\", (\d+), (\d+)$', False)
       def handle_chown(self, args, _result):
         # TODO(maruel): Look at result?
         self._handle_file(args[0], Results.File.WRITE)

       def handle_clone(self, _args, result):
         self._handling_forking('clone', result)

       def handle_close(self, _args, _result):
         pass

       @parse_args(r'^\"(.+?)\", (\d+)$', False)
       def handle_chmod(self, args, _result):
         self._handle_file(args[0], Results.File.WRITE)

       @parse_args(r'^\"(.+?)\"$', False)
       def handle_chroot(self, _args, _result):
         # This is used by Chromium's sandbox. See
         # https://chromium.googlesource.com/chromium/src/+/master/sandbox/linux/suid/sandbox.c
         pass

       @parse_args(r'^\"(.+?)\", (\d+)$', False)
       def handle_creat(self, args, _result):
         self._handle_file(args[0], Results.File.WRITE)

       @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)
       def handle_execve(self, args, _result):
         # Even if in practice execve() doesn't returns when it succeeds, strace
         # still prints '0' as the result.
         filepath = args[0]
         self._handle_file(filepath, Results.File.READ)
         self.executable = self._mangle(filepath)
         try:
           self.command = strace_process_quoted_arguments(args[1])
         except ValueError as e:
           raise TracingFailure(
               'Failed to process command line argument:\n%s' % e.args[0],
               None, None, None)

       def handle_exit_group(self, _args, _result):
         """Removes cwd."""
         self.cwd = None

       @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', True)
       def handle_faccessat(self, args, _results):
         if args[0] == 'AT_FDCWD':
           self._handle_file(args[1], Results.File.TOUCHED)
         else:
           raise Exception('Relative faccess not implemented.')

       def handle_fallocate(self, _args, result):
         pass

       def handle_fork(self, args, result):
         self._handle_unknown('fork', args, result)

       @parse_args(r'^(\d+), \"(.*?)\", ({.+}), (\d+)$', True)
       def handle_fstatat64(self, _args, _result):
         # TODO(maruel): Handle.
         pass

       def handle_futex(self, _args, _result):
         pass

       @parse_args(r'^(\".+?\"|0x[a-f0-9]+), (\d+)$', False)
       def handle_getcwd(self, args, _result):
         # TODO(maruel): Resolve file handle.
         if not args[0].startswith('0x'):
           filepath = args[0][1:-1]
           if os.path.isabs(filepath):
             logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
             if not isinstance(self.cwd, unicode):
               # Take the occasion to reset the path.
               self.cwd = self._mangle(filepath)
             else:
               # It should always match.
               assert self.cwd == Strace.load_filename(filepath), (
                   self.cwd, filepath)

       @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
       def handle_link(self, args, _result):
         self._handle_file(args[0], Results.File.READ)
         self._handle_file(args[1], Results.File.WRITE)

       @parse_args(r'^(\".+?\"|0x[a-f0-9]+), \{(?:.+?, )?\.\.\.\}$', True)
       def handle_lstat(self, args, _result):
         # TODO(maruel): Resolve file handle.
         if not args[0].startswith('0x'):
           self._handle_file(args[0][1:-1], Results.File.TOUCHED)

       def handle_mkdir(self, _args, _result):
         # We track content, not directories.
         pass

       @parse_args(r'^(\d+|AT_FDCWD), \".*?\", ({.+}), (\d+)$', True)
       def handle_newfstatat(self, _args, _result):
         # TODO(maruel): Handle
         pass

       @parse_args(r'^(\".*?\"|0x[a-f0-9]+), ([A-Z\_\|]+)(|, \d+)$', False)
       def handle_open(self, args, _result):
         if 'O_DIRECTORY' in args[1]:
           return
         # TODO(maruel): Resolve file handle.
         if not args[0].startswith('0x'):
           t = Results.File.READ if 'O_RDONLY' in args[1] else Results.File.WRITE
           self._handle_file(args[0][1:-1], t)

       @parse_args(
           r'^(\d+|AT_FDCWD), (\".*?\"|0x[a-f0-9]+), ([A-Z\_\|]+)(|, \d+)$',
           False)
       def handle_openat(self, args, _result):
         if 'O_DIRECTORY' in args[2]:
           return
         if args[0] == 'AT_FDCWD':
           # TODO(maruel): Resolve file handle.
           if not args[1].startswith('0x'):
             t = (
                 Results.File.READ if 'O_RDONLY' in args[2]
                 else Results.File.WRITE)
           self._handle_file(args[1][1:-1], t)
         else:
           # TODO(maruel): Implement relative open if necessary instead of the
           # AT_FDCWD flag, let's hope not since this means tracking all active
           # directory handles.
           raise NotImplementedError('Relative open via openat not implemented.')

       @parse_args(
           r'^(\".+?\"|0x[a-f0-9]+), (?:\".+?\"(?:\.\.\.)?|0x[a-f0-9]+), '
             r'\d+$',
           False)
       def handle_readlink(self, args, _result):
         # TODO(maruel): Resolve file handle.
         if not args[0].startswith('0x'):
           self._handle_file(args[0][1:-1], Results.File.READ)

       @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
       def handle_rename(self, args, _result):
         self._handle_file(args[0], Results.File.READ)
         self._handle_file(args[1], Results.File.WRITE)

       def handle_rmdir(self, _args, _result):
         # We track content, not directories.
         pass

       def handle_setxattr(self, _args, _result):
         # TODO(maruel): self._handle_file(args[0], Results.File.WRITE)
         pass

       @parse_args(r'^(\".+?\"|0x[a-f0-9]+), \{.+?\}$', True)
       def handle_stat(self, args, _result):
         # TODO(maruel): Resolve file handle.
         if not args[0].startswith('0x'):
           self._handle_file(args[0][1:-1], Results.File.TOUCHED)

       @parse_args(r'^(\".+?\"|0x[a-f0-9]+), \{.+?\}$', True)
       def handle_stat64(self, args, _result):
         if not args[0].startswith('0x'):
           self._handle_file(args[0][1:-1], Results.File.TOUCHED)

       @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
       def handle_symlink(self, args, _result):
         self._handle_file(args[0], Results.File.TOUCHED)
         self._handle_file(args[1], Results.File.WRITE)

       @parse_args(
           r'^([0-9xa-f]+), ([0-9xa-f]+), ([0-9xa-f]+), ([0-9xa-f]+), '
           r'([0-9xa-f]+), ([0-9xa-f]+)$',
           False)
       def handle_syscall_317(self, _args, _result):
         # move_pages()
         pass

       @parse_args(r'^\"(.+?)\", \d+', True)
       def handle_truncate(self, args, _result):
         self._handle_file(args[0], Results.File.WRITE)

       def handle_unlink(self, _args, _result):
         # In theory, the file had to be created anyway.
         pass

       def handle_unlinkat(self, _args, _result):
         # In theory, the file had to be created anyway.
         pass

       def handle_statfs(self, _args, _result):
         pass

       def handle_utimensat(self, _args, _result):
         pass

       def handle_vfork(self, _args, result):
         self._handling_forking('vfork', result)

       @staticmethod
       def _handle_unknown(function, args, result):
         raise TracingFailure(
             'Unexpected/unimplemented trace %s(%s)= %s' %
             (function, args, result),
             None, None, None)

       def _handling_forking(self, name, result):
         """Transfers cwd."""
         if result.startswith(('?', '-1')):
           # The call failed.
           return
         # Update the other process right away.
         childpid = int(result)
         if childpid < 20:
           # Ignore, it's probably the Chromium sandbox.
           return
         child = self._root().get_or_set_proc(childpid)
         if child.parentid is not None or childpid in self.children:
           raise TracingFailure(
               'Found internal inconsitency in process lifetime detection '
               'during a %s()=%s call:\n%s' % (
                 name,
                 result,
                 sorted(self._root()._process_lookup)),
               None, None, None)
         # Copy the cwd object.
         child.initial_cwd = self.get_cwd()
         child.parentid = self.pid
         # It is necessary because the logs are processed out of order.
         self.children.append(child)

       def _handle_file(self, filepath, mode):
         filepath = self._mangle(filepath)
         self.add_file(filepath, mode)

       def _mangle(self, filepath):
         """Decodes a filepath found in the log and convert it to a late-bound
         path if necessary.

         |filepath| is an strace 'encoded' string and the returned value is
         either an unicode string if the path was absolute or a late bound path
         otherwise.
         """
         filepath = Strace.load_filename(filepath)
         if os.path.isabs(filepath):
           return filepath
         else:
           if isinstance(self.get_cwd(), unicode):
             return os.path.normpath(os.path.join(self.get_cwd(), filepath))
           return self.RelativePath(self.get_cwd(), filepath)

     def __init__(self, blacklist, root_pid, initial_cwd):
       """|root_pid| may be None when the root process is not known.

       In that case, a search is done after reading all the logs to figure out
       the root process.
       """
       super(Strace.Context, self).__init__(blacklist)
       assert_is_renderable(initial_cwd)
       self.root_pid = root_pid
       self.initial_cwd = initial_cwd

     def render(self):
       """Returns the string value of the initial cwd of the root process.

       Used by RelativePath.
       """
       return self.initial_cwd

     def on_line(self, pid, line):
       """Transfers control into the Process.on_line() function."""
       self.get_or_set_proc(pid).on_line(line.strip())

     def to_results(self):
       """If necessary, finds back the root process and verify consistency."""
       if not self.root_pid:
         # The non-sudo case. The traced process was started by strace itself,
         # so the pid of the traced process is not known.
         root = [p for p in self._process_lookup.itervalues() if not p.parentid]
         if len(root) == 1:
           self.root_process = root[0]
           # Save it for later.
           self.root_pid = self.root_process.pid
         else:
           raise TracingFailure(
               'Found internal inconsitency in process lifetime detection '
               'while finding the root process',
               None,
               None,
               None,
               self.root_pid,
               sorted(self._process_lookup))
       else:
         # The sudo case. The traced process was started manually so its pid is
         # known.
         self.root_process = self._process_lookup.get(self.root_pid)
       if not self.root_process:
         raise TracingFailure(
             'Found internal inconsitency in process lifetime detection '
             'while finding the root process',
             None,
             None,
             None,
             self.root_pid,
             sorted(self._process_lookup))
       process = self.root_process.to_results_process()
       if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
         raise TracingFailure(
             'Found internal inconsitency in process lifetime detection '
             'while looking for len(tree) == len(list)',
             None,
             None,
             None,
             sorted(self._process_lookup),
             sorted(p.pid for p in process.all))
       return Results(process)

     def get_or_set_proc(self, pid):
       """Returns the Context.Process instance for this pid or creates a new one.
       """
       if not pid or not isinstance(pid, int):
         raise TracingFailure(
             'Unpexpected value for pid: %r' % pid,
             pid,
             None,
             None,
             pid)
       if pid not in self._process_lookup:
         self._process_lookup[pid] = self.Process(self, pid)
       return self._process_lookup[pid]

     @classmethod
     def traces(cls):
       """Returns the list of all handled traces to pass this as an argument to
       strace.
       """
       prefix = 'handle_'
       return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]

   class Tracer(ApiBase.Tracer):
     MAX_LEN = 256

     def __init__(self, logname, use_sudo):
       super(Strace.Tracer, self).__init__(logname)
       self.use_sudo = use_sudo
       if use_sudo:
         # TODO(maruel): Use the jump script systematically to make it easy to
         # figure out the root process, so RelativePath is not necessary anymore.
         self._child_script = create_exec_thunk()
         self._scripts_to_cleanup.append(self._child_script)

     def trace(self, cmd, cwd, tracename, output):
       """Runs strace on an executable.

       When use_sudo=True, it is a 3-phases process: start the thunk, start
       sudo strace with the pid of the thunk and then have the thunk os.execve()
       the process to trace.
       """
       logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
       assert os.path.isabs(cmd[0]), cmd[0]
       assert os.path.isabs(cwd), cwd
       assert os.path.normpath(cwd) == cwd, cwd
       with self._lock:
         if not self._initialized:
           raise TracingFailure(
               'Called Tracer.trace() on an unitialized object',
               None, None, None, tracename)
         assert tracename not in (i['trace'] for i in self._traces)
       stdout = stderr = None
       if output:
         stdout = subprocess.PIPE
         stderr = subprocess.STDOUT

       # Ensure all file related APIs are hooked.
       traces = ','.join(
           [i for i in Strace.Context.traces()
             if not i.startswith('syscall_')] +
           ['file'])
       flags = [
         # Each child process has its own trace file. It is necessary because
         # strace may generate corrupted log file if multiple processes are
         # heavily doing syscalls simultaneously.
         '-ff',
         # Reduce whitespace usage.
         '-a1',
         # hex encode non-ascii strings.
         # TODO(maruel): '-x',
         # TODO(maruel): '-ttt',
         # Signals are unnecessary noise here. Note the parser can cope with them
         # but reduce the unnecessary output.
         '-esignal=none',
         # Print as much data as wanted.
         '-s', '%d' % self.MAX_LEN,
         '-e', 'trace=%s' % traces,
         '-o', self._logname + '.' + tracename,
       ]
       logging.info('%s', flags)

       if self.use_sudo:
         pipe_r, pipe_w = os.pipe()
         # Start the child process paused.
         target_cmd = [sys.executable, self._child_script, str(pipe_r)] + cmd
         logging.debug(' '.join(target_cmd))
         child_proc = subprocess.Popen(
             target_cmd,
             stdin=subprocess.PIPE,
             stdout=stdout,
             stderr=stderr,
             cwd=cwd)

         # TODO(maruel): both processes must use the same UID for it to work
         # without sudo. Look why -p is failing at the moment without sudo.
         trace_cmd = [
           'sudo',
           'strace',
           '-p', str(child_proc.pid),
         ] + flags
         logging.debug(' '.join(trace_cmd))
         strace_proc = subprocess.Popen(
             trace_cmd,
             cwd=cwd,
             stdin=subprocess.PIPE,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE)

         line = strace_proc.stderr.readline()
         if not re.match(r'^Process \d+ attached \- interrupt to quit$', line):
           # TODO(maruel): Raise an exception.
           assert False, line

         # Now fire the child process.
         os.write(pipe_w, 'x')

         out = child_proc.communicate()[0]
         strace_out = strace_proc.communicate()[0]

         # TODO(maruel): if strace_proc.returncode: Add an exception.
         saved_out = strace_out if strace_proc.returncode else out
         root_pid = child_proc.pid
       else:
         # Non-sudo case.
         trace_cmd = [
           'strace',
         ] + flags + cmd
         logging.debug(' '.join(trace_cmd))
         child_proc = subprocess.Popen(
             trace_cmd,
             cwd=cwd,
             stdin=subprocess.PIPE,
             stdout=stdout,
             stderr=stderr)
         out = child_proc.communicate()[0]
         # TODO(maruel): Walk the logs and figure out the root process would
         # simplify parsing the logs a *lot*.
         saved_out = out
         # The trace reader will have to figure out.
         root_pid = None

       with self._lock:
         assert tracename not in (i['trace'] for i in self._traces)
         self._traces.append(
           {
             'cmd': cmd,
             'cwd': cwd,
             'output': saved_out,
             'pid': root_pid,
             'trace': tracename,
           })
       return child_proc.returncode, out

   def __init__(self, use_sudo=None):
     super(Strace, self).__init__()
     self.use_sudo = use_sudo

   def get_tracer(self, logname):
     return self.Tracer(logname, self.use_sudo)

   @staticmethod
   def clean_trace(logname):
     if fs.isfile(logname):
       fs.remove(logname)
     # Also delete any pid specific file from previous traces.
     for i in glob.iglob(logname + '.*'):
       if i.rsplit('.', 1)[1].isdigit():
         fs.remove(i)

   @classmethod
   def parse_log(cls, logname, blacklist, trace_name):
     logging.info('parse_log(%s, ..., %s)', logname, trace_name)
     assert os.path.isabs(logname)
     data = tools.read_json(logname)
     out = []
     for item in data['traces']:
       if trace_name and item['trace'] != trace_name:
         continue
       result = {
         'output': item['output'],
         'trace': item['trace'],
       }
       try:
         context = cls.Context(blacklist, item['pid'], item['cwd'])
         for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):
           logging.debug('Reading %s', pidfile)
           pid = pidfile.rsplit('.', 1)[1]
           if pid.isdigit():
             pid = int(pid)
             found_line = False
             for line in open(pidfile, 'rb'):
               context.on_line(pid, line)
               found_line = True
             if not found_line:
               # Ensures that a completely empty trace still creates the
               # corresponding Process instance by logging a dummy line.
               context.on_line(pid, '')
           else:
             logging.warning('Found unexpected file %s', pidfile)
         result['results'] = context.to_results()
       except TracingFailure:
         result['exception'] = sys.exc_info()
       out.append(result)
     return out


 class Dtrace(ApiBase):
   """Uses DTrace framework through dtrace. Requires root access.

   Implies Mac OSX.

   dtruss can't be used because it has compatibility issues with python.

   Also, the pid->cwd handling needs to be done manually since OSX has no way to
   get the absolute path of the 'cwd' dtrace variable from the probe.

   Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
   https://discussions.apple.com/thread/1980539. So resort to handling execve()
   manually.

   errno is not printed in the log since this implementation currently only cares
   about files that were successfully opened.
   """
   class Context(ApiBase.Context):
     # Format: index pid function(args)
     RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$')

     # Arguments parsing.
     RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')
     RE_CHDIR = re.compile(r'^\"(.+?)\"$')
     RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')
     RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')
     RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')
     RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')

     # O_DIRECTORY is not defined on Windows and dtrace doesn't exist on Windows.
     O_DIRECTORY = os.O_DIRECTORY if hasattr(os, 'O_DIRECTORY') else None
     O_RDWR = os.O_RDWR
     O_WRONLY = os.O_WRONLY

     class Process(ApiBase.Context.Process):
       def __init__(self, *args):
         super(Dtrace.Context.Process, self).__init__(*args)
         self.cwd = self.initial_cwd

     def __init__(self, blacklist, thunk_pid, initial_cwd):
       logging.info(
           '%s(%d, %s)' % (self.__class__.__name__, thunk_pid, initial_cwd))
       super(Dtrace.Context, self).__init__(blacklist)
       assert isinstance(initial_cwd, unicode), initial_cwd
       # Process ID of the temporary script created by create_subprocess_thunk().
       self._thunk_pid = thunk_pid
       self._initial_cwd = initial_cwd
       self._line_number = 0

     def on_line(self, line):
       assert isinstance(line, unicode), line
       self._line_number += 1
       match = self.RE_HEADER.match(line)
       if not match:
         raise TracingFailure(
             'Found malformed line: %s' % line,
             None,
             self._line_number,
             line)
       fn = getattr(
           self,
           'handle_%s' % match.group(2).replace('-', '_'),
           self._handle_ignored)
       # It is guaranteed to succeed because of the regexp. Or at least I thought
       # it would.
       pid = int(match.group(1))
       try:
         return fn(pid, match.group(3))
       except TracingFailure, e:
         # Hack in the values since the handler could be a static function.
         e.pid = pid
         e.line = line
         e.line_number = self._line_number
         # Re-raise the modified exception.
         raise
       except (KeyError, NotImplementedError, ValueError), e:
         raise TracingFailure(
             'Trace generated a %s exception: %s' % (
                 e.__class__.__name__, str(e)),
             pid,
             self._line_number,
             line,
             e)

     def to_results(self):
       process = self.root_process.to_results_process()
       # Internal concistency check.
       if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
         raise TracingFailure(
             'Found internal inconsitency in process lifetime detection '
             'while looking for len(tree) == len(list)',
             None,
             None,
             None,
             sorted(self._process_lookup),
             sorted(p.pid for p in process.all))
       return Results(process)

     def handle_dtrace_BEGIN(self, _pid, args):
       if not self.RE_DTRACE_BEGIN.match(args):
         raise TracingFailure(
             'Found internal inconsitency in dtrace_BEGIN log line',
             None, None, None)

     def handle_proc_start(self, pid, args):
       """Transfers cwd.

       The dtrace script already takes care of only tracing the processes that
       are child of the traced processes so there is no need to verify the
       process hierarchy.
       """
       if pid in self._process_lookup:
         raise TracingFailure(
             'Found internal inconsitency in proc_start: %d started two times' %
                 pid,
             None, None, None)
       match = self.RE_PROC_START.match(args)
       if not match:
         raise TracingFailure(
             'Failed to parse arguments: %s' % args,
             None, None, None)
       ppid = int(match.group(1))
       if ppid == self._thunk_pid and not self.root_process:
         proc = self.root_process = self.Process(
             self.blacklist, pid, self._initial_cwd)
       elif ppid in self._process_lookup:
         proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)
         self._process_lookup[ppid].children.append(proc)
       else:
         # Another process tree, ignore.
         return
       self._process_lookup[pid] = proc
       logging.debug(
           'New child: %s -> %d  cwd:%s' %
           (ppid, pid, render(proc.initial_cwd)))

     def handle_proc_exit(self, pid, _args):
       """Removes cwd."""
       if pid in self._process_lookup:
         # self._thunk_pid is not traced itself and other traces run neither.
         self._process_lookup[pid].cwd = None

     def handle_execve(self, pid, args):
       """Sets the process' executable.

       TODO(maruel): Read command line arguments.  See
       https://discussions.apple.com/thread/1980539 for an example.
       https://gist.github.com/1242279

       Will have to put the answer at http://stackoverflow.com/questions/7556249.
       :)
       """
       if not pid in self._process_lookup:
         # Another process tree, ignore.
         return
       match = self.RE_EXECVE.match(args)
       if not match:
         raise TracingFailure(
             'Failed to parse arguments: %r' % args,
             None, None, None)
       proc = self._process_lookup[pid]
       proc.executable = match.group(1)
       self._handle_file(pid, proc.executable, Results.File.READ)
       proc.command = self.process_escaped_arguments(match.group(3))
       if int(match.group(2)) != len(proc.command):
         raise TracingFailure(
             'Failed to parse execve() arguments: %s' % args,
             None, None, None)

     def handle_chdir(self, pid, args):
       """Updates cwd."""
       if pid not in self._process_lookup:
         # Another process tree, ignore.
         return
       cwd = self.RE_CHDIR.match(args).group(1)
       if not cwd.startswith('/'):
         cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)
         logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
       else:
         logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
         cwd2 = cwd
       self._process_lookup[pid].cwd = cwd2

     def handle_open_nocancel(self, pid, args):
       """Redirects to handle_open()."""
       return self.handle_open(pid, args)

     def handle_open(self, pid, args):
       if pid not in self._process_lookup:
         # Another process tree, ignore.
         return
       match = self.RE_OPEN.match(args)
       if not match:
         raise TracingFailure(
             'Failed to parse arguments: %s' % args,
             None, None, None)
       flag = int(match.group(2), 16)
       if self.O_DIRECTORY & flag == self.O_DIRECTORY:
         # Ignore directories.
         return
       self._handle_file(
           pid,
           match.group(1),
           Results.File.READ if not ((self.O_RDWR | self.O_WRONLY) & flag)
             else Results.File.WRITE)

     def handle_rename(self, pid, args):
       if pid not in self._process_lookup:
         # Another process tree, ignore.
         return
       match = self.RE_RENAME.match(args)
       if not match:
         raise TracingFailure(
             'Failed to parse arguments: %s' % args,
             None, None, None)
       self._handle_file(pid, match.group(1), Results.File.READ)
       self._handle_file(pid, match.group(2), Results.File.WRITE)

     def _handle_file(self, pid, filepath, mode):
       if not filepath.startswith('/'):
         filepath = os.path.join(self._process_lookup[pid].cwd, filepath)
       # We can get '..' in the path.
       filepath = os.path.normpath(filepath)
       # Sadly, still need to filter out directories here;
       # saw open_nocancel(".", 0, 0) = 0 lines.
       if fs.isdir(filepath):
         return
       self._process_lookup[pid].add_file(filepath, mode)

     def handle_ftruncate(self, pid, args):
       """Just used as a signal to kill dtrace, ignoring."""
       pass

     @staticmethod
     def _handle_ignored(pid, args):
       """Is called for all the event traces that are not handled."""
       raise NotImplementedError('Please implement me')

     @staticmethod
     def process_escaped_arguments(text):
       """Extracts escaped arguments on a string and return the arguments as a
       list.

       Implemented as an automaton.

       Example:
         With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the
         function will return ['python2.7', '-c', 'print("hi")]
       """
       if not text.endswith('\\0'):
         raise ValueError('String is not null terminated: %r' % text, text)
       text = text[:-2]

       def unescape(x):
         """Replaces '\\\\' with '\\' and '\\?' (where ? is anything) with ?."""
         out = []
         escaped = False
         for i in x:
           if i == '\\' and not escaped:
             escaped = True
             continue
           escaped = False
           out.append(i)
         return ''.join(out)

       return [unescape(i) for i in text.split('\\001')]

   class Tracer(ApiBase.Tracer):
     # pylint: disable=C0301
     #
     # To understand the following code, you'll want to take a look at:
     # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
     # https://wikis.oracle.com/display/DTrace/Variables
     # http://docs.oracle.com/cd/E19205-01/820-4221/
     #
     # 0. Dump all the valid probes into a text file. It is important, you
     #    want to redirect into a file and you don't want to constantly 'sudo'.
     # $ sudo dtrace -l > probes.txt
     #
     # 1. Count the number of probes:
     # $ wc -l probes.txt
     # 81823  # On OSX 10.7, including 1 header line.
     #
     # 2. List providers, intentionally skipping all the 'syspolicy10925' and the
     #    likes and skipping the header with NR>1:
     # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]'
     # dtrace
     # fbt
     # io
     # ip
     # lockstat
     # mach_trap
     # proc
     # profile
     # sched
     # syscall
     # tcp
     # vminfo
     #
     # 3. List of valid probes:
     # $ grep syscall probes.txt | less
     #    or use dtrace directly:
     # $ sudo dtrace -l -P syscall | less
     #
     # trackedpid is an associative array where its value can be 0, 1 or 2.
     # 0 is for untracked processes and is the default value for items not
     #   in the associative array.
     # 1 is for tracked processes.
     # 2 is for the script created by create_subprocess_thunk() only. It is not
     #   tracked itself but all its descendants are.
     #
     # The script will kill itself only once waiting_to_die == 1 and
     # current_processes == 0, so that both getlogin() was called and that
     # all traced processes exited.
     #
     # TODO(maruel): Use cacheable predicates. See
     # https://wikis.oracle.com/display/DTrace/Performance+Considerations
     D_CODE = """
       dtrace:::BEGIN {
         waiting_to_die = 0;
         current_processes = 0;
         logindex = 0;
         printf("%d %d %s_%s(\\"%s\\")\\n",
                logindex, PID, probeprov, probename, SCRIPT);
         logindex++;
       }

       proc:::start /trackedpid[ppid]/ {
         trackedpid[pid] = 1;
         current_processes += 1;
         printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
                logindex, pid, probeprov, probename,
                ppid,
                execname,
                current_processes);
         logindex++;
       }
       /* Should use SCRIPT but there is no access to this variable at that
        * point. */
       proc:::start /ppid == PID && execname == "Python"/ {
         trackedpid[pid] = 2;
         current_processes += 1;
         printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
                logindex, pid, probeprov, probename,
                ppid,
                execname,
                current_processes);
         logindex++;
       }
       proc:::exit /trackedpid[pid] &&
                   current_processes == 1 &&
                   waiting_to_die == 1/ {
         trackedpid[pid] = 0;
         current_processes -= 1;
         printf("%d %d %s_%s(%d)\\n",
                logindex, pid, probeprov, probename,
                current_processes);
         logindex++;
         exit(0);
       }
       proc:::exit /trackedpid[pid]/ {
         trackedpid[pid] = 0;
         current_processes -= 1;
         printf("%d %d %s_%s(%d)\\n",
                logindex, pid, probeprov, probename,
                current_processes);
         logindex++;
       }

       /* Use an arcane function to detect when we need to die */
       syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {
         waiting_to_die = 1;
         printf("%d %d %s()\\n", logindex, pid, probefunc);
         logindex++;
       }
       syscall::ftruncate:entry /
           pid == PID && arg0 == FILE_ID && current_processes == 0/ {
         exit(0);
       }

       syscall::open*:entry /trackedpid[pid] == 1/ {
         self->open_arg0 = arg0;
         self->open_arg1 = arg1;
         self->open_arg2 = arg2;
       }
       syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {
         this->open_arg0 = copyinstr(self->open_arg0);
         printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",
                logindex, pid, probefunc,
                this->open_arg0,
                self->open_arg1,
                self->open_arg2);
         logindex++;
         this->open_arg0 = 0;
       }
       syscall::open*:return /trackedpid[pid] == 1/ {
         self->open_arg0 = 0;
         self->open_arg1 = 0;
         self->open_arg2 = 0;
       }

       syscall::rename:entry /trackedpid[pid] == 1/ {
         self->rename_arg0 = arg0;
         self->rename_arg1 = arg1;
       }
       syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {
         this->rename_arg0 = copyinstr(self->rename_arg0);
         this->rename_arg1 = copyinstr(self->rename_arg1);
         printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",
                logindex, pid, probefunc,
                this->rename_arg0,
                this->rename_arg1);
         logindex++;
         this->rename_arg0 = 0;
         this->rename_arg1 = 0;
       }
       syscall::rename:return /trackedpid[pid] == 1/ {
         self->rename_arg0 = 0;
         self->rename_arg1 = 0;
       }

       /* Track chdir, it's painful because it is only receiving relative path.
        */
       syscall::chdir:entry /trackedpid[pid] == 1/ {
         self->chdir_arg0 = arg0;
       }
       syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {
         this->chdir_arg0 = copyinstr(self->chdir_arg0);
         printf("%d %d %s(\\"%s\\")\\n",
                logindex, pid, probefunc,
                this->chdir_arg0);
         logindex++;
         this->chdir_arg0 = 0;
       }
       syscall::chdir:return /trackedpid[pid] == 1/ {
         self->chdir_arg0 = 0;
       }
       """

     # execve-specific code, tends to throw a lot of exceptions.
     D_CODE_EXECVE = """
       /* Finally what we care about! */
       syscall::exec*:entry /trackedpid[pid]/ {
         self->exec_arg0 = copyinstr(arg0);
         /* Incrementally probe for a NULL in the argv parameter of execve() to
          * figure out argc. */
         /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was
          * found. */
         self->exec_argc = 0;
         /* Probe for argc==1 */
         this->exec_argv = (user_addr_t*)copyin(
              arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
         self->exec_argc = this->exec_argv[self->exec_argc] ?
             (self->exec_argc + 1) : self->exec_argc;

         /* Probe for argc==2 */
         this->exec_argv = (user_addr_t*)copyin(
              arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
         self->exec_argc = this->exec_argv[self->exec_argc] ?
             (self->exec_argc + 1) : self->exec_argc;

         /* Probe for argc==3 */
         this->exec_argv = (user_addr_t*)copyin(
              arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
         self->exec_argc = this->exec_argv[self->exec_argc] ?
             (self->exec_argc + 1) : self->exec_argc;

         /* Probe for argc==4 */
         this->exec_argv = (user_addr_t*)copyin(
              arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
         self->exec_argc = this->exec_argv[self->exec_argc] ?
             (self->exec_argc + 1) : self->exec_argc;

         /* Copy the inputs strings since there is no guarantee they'll be
          * present after the call completed. */
         self->exec_argv0 = (self->exec_argc > 0) ?
             copyinstr(this->exec_argv[0]) : "";
         self->exec_argv1 = (self->exec_argc > 1) ?
             copyinstr(this->exec_argv[1]) : "";
         self->exec_argv2 = (self->exec_argc > 2) ?
             copyinstr(this->exec_argv[2]) : "";
         self->exec_argv3 = (self->exec_argc > 3) ?
             copyinstr(this->exec_argv[3]) : "";
         this->exec_argv = 0;
       }
       syscall::exec*:return /trackedpid[pid] && errno == 0/ {
         /* We need to join strings here, as using multiple printf() would
          * cause tearing when multiple threads/processes are traced.
          * Since it is impossible to escape a string and join it to another one,
          * like sprintf("%s%S", previous, more), use hackery.
          * Each of the elements are split with a \\1. \\0 cannot be used because
          * it is simply ignored. This will conflict with any program putting a
          * \\1 in their execve() string but this should be "rare enough" */
         this->args = "";
         /* Process exec_argv[0] */
         this->args = strjoin(
             this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");

         /* Process exec_argv[1] */
         this->args = strjoin(
             this->args, (self->exec_argc > 1) ? "\\1" : "");
         this->args = strjoin(
             this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");

         /* Process exec_argv[2] */
         this->args = strjoin(
             this->args, (self->exec_argc > 2) ? "\\1" : "");
         this->args = strjoin(
             this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");

         /* Process exec_argv[3] */
         this->args = strjoin(
             this->args, (self->exec_argc > 3) ? "\\1" : "");
         this->args = strjoin(
             this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");

         /* Prints self->exec_argc to permits verifying the internal
          * consistency since this code is quite fishy. */
         printf("%d %d %s(\\"%s\\", [%d, %S])\\n",
                logindex, pid, probefunc,
                self->exec_arg0,
                self->exec_argc,
                this->args);
         logindex++;
         this->args = 0;
       }
       syscall::exec*:return /trackedpid[pid]/ {
         self->exec_arg0 = 0;
         self->exec_argc = 0;
         self->exec_argv0 = 0;
         self->exec_argv1 = 0;
         self->exec_argv2 = 0;
         self->exec_argv3 = 0;
       }
       """

     # Code currently not used.
     D_EXTRANEOUS = """
       /* This is a good learning experience, since it traces a lot of things
        * related to the process and child processes.
        * Warning: it generates a gigantic log. For example, tracing
        * "data/trace_inputs/child1.py --child" generates a 2mb log and takes
        * several minutes to execute.
        */
       /*
       mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
         printf("%d %d %s_%s() = %d\\n",
                logindex, pid, probeprov, probefunc, errno);
         logindex++;
       }
       proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
         printf("%d %d %s_%s() = %d\\n",
                logindex, pid, probeprov, probefunc, errno);
         logindex++;
       }
       sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
         printf("%d %d %s_%s() = %d\\n",
                logindex, pid, probeprov, probefunc, errno);
         logindex++;
       }
       syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
         printf("%d %d %s_%s() = %d\\n",
                logindex, pid, probeprov, probefunc, errno);
         logindex++;
       }
       vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
         printf("%d %d %s_%s() = %d\\n",
                logindex, pid, probeprov, probefunc, errno);
         logindex++;
       }
       */
       /* TODO(maruel): *stat* functions and friends
         syscall::access:return,
         syscall::chdir:return,
         syscall::chflags:return,
         syscall::chown:return,
         syscall::chroot:return,
         syscall::getattrlist:return,
         syscall::getxattr:return,
         syscall::lchown:return,
         syscall::lstat64:return,
         syscall::lstat:return,
         syscall::mkdir:return,
         syscall::pathconf:return,
         syscall::readlink:return,
         syscall::removexattr:return,
         syscall::setxattr:return,
         syscall::stat64:return,
         syscall::stat:return,
         syscall::truncate:return,
         syscall::unlink:return,
         syscall::utimes:return,
       */
       """

     def __init__(self, logname, use_sudo):
       """Starts the log collection with dtrace.

       Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so
       this needs to wait for dtrace to be "warmed up".
       """
       super(Dtrace.Tracer, self).__init__(logname)
       # This script is used as a signal to figure out the root process.
       self._signal_script = create_subprocess_thunk()
       self._scripts_to_cleanup.append(self._signal_script)
       # This unique dummy temp file is used to signal the dtrace script that it
       # should stop as soon as all the child processes are done. A bit hackish
       # but works fine enough.
       self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(
           prefix=u'trace_signal_file')

       dtrace_path = '/usr/sbin/dtrace'
       if not fs.isfile(dtrace_path):
         dtrace_path = 'dtrace'
       elif use_sudo is None and (fs.stat(dtrace_path).st_mode & stat.S_ISUID):
         # No need to sudo. For those following at home, don't do that.
         use_sudo = False

       # Note: do not use the -p flag. It's useless if the initial process quits
       # too fast, resulting in missing traces from the grand-children. The D
       # code manages the dtrace lifetime itself.
       trace_cmd = [
         dtrace_path,
         # Use a larger buffer if getting 'out of scratch space' errors.
         # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables
         '-b', '10m',
         '-x', 'dynvarsize=10m',
         #'-x', 'dtrace_global_maxsize=1m',
         '-x', 'evaltime=exec',
         '-o', '/dev/stderr',
         '-q',
         '-n', self._get_dtrace_code(),
       ]
       if use_sudo is not False:
         trace_cmd.insert(0, 'sudo')

       with fs.open(self._logname + '.log', 'wb') as logfile:
         self._dtrace = subprocess.Popen(
             trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
       logging.debug('Started dtrace pid: %d' % self._dtrace.pid)

       # Reads until one line is printed, which signifies dtrace is up and ready.
       with fs.open(self._logname + '.log', 'rb') as logfile:
         while 'dtrace_BEGIN' not in logfile.readline():
           if self._dtrace.poll() is not None:
             # Do a busy wait. :/
             break
       logging.debug('dtrace started')

     def _get_dtrace_code(self):
       """Setups the D code to implement child process tracking.

       Injects the cookie in the script so it knows when to stop.

       The script will detect any instance of the script created with
       create_subprocess_thunk() and will start tracing it.
       """
       out = (
           'inline int PID = %d;\n'
           'inline string SCRIPT = "%s";\n'
           'inline int FILE_ID = %d;\n'
           '\n'
           '%s') % (
               os.getpid(),
               self._signal_script,
               self._dummy_file_id,
               self.D_CODE)
       if os.environ.get('TRACE_INPUTS_DTRACE_ENABLE_EXECVE') == '1':
         # Do not enable by default since it tends to spew dtrace: error lines
         # because the execve() parameters are not in valid memory at the time of
         # logging.
         # TODO(maruel): Find a way to make this reliable since it's useful but
         # only works in limited/trivial uses cases for now.
         out += self.D_CODE_EXECVE
       return out

     def trace(self, cmd, cwd, tracename, output):
       """Runs dtrace on an executable.

       This dtruss is broken when it starts the process itself or when tracing
       child processes, this code starts a wrapper process
       generated with create_subprocess_thunk() which starts the executable to
       trace.
       """
       logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
       assert os.path.isabs(cmd[0]), cmd[0]
       assert os.path.isabs(cwd), cwd
       assert os.path.normpath(cwd) == cwd, cwd
       with self._lock:
         if not self._initialized:
           raise TracingFailure(
               'Called Tracer.trace() on an unitialized object',
               None, None, None, tracename)
         assert tracename not in (i['trace'] for i in self._traces)

       # Starts the script wrapper to start the child process. This signals the
       # dtrace script that this process is to be traced.
       stdout = stderr = None
       if output:
         stdout = subprocess.PIPE
         stderr = subprocess.STDOUT
       child_cmd = [
         sys.executable,
         self._signal_script,
         tracename,
       ]
       # Call a dummy function so that dtrace knows I'm about to launch a process
       # that needs to be traced.
       # Yummy.
       child = subprocess.Popen(
           child_cmd + tools.fix_python_cmd(cmd),
           stdin=subprocess.PIPE,
           stdout=stdout,
           stderr=stderr,
           cwd=cwd)
       logging.debug('Started child pid: %d' % child.pid)

       out = child.communicate()[0]
       # This doesn't mean tracing is done, one of the grand-child process may
       # still be alive. It will be tracked with the dtrace script.

       with self._lock:
         assert tracename not in (i['trace'] for i in self._traces)
         self._traces.append(
           {
             'cmd': cmd,
             'cwd': cwd,
             'pid': child.pid,
             'output': out,
             'trace': tracename,
           })
       return child.returncode, out

     def close(self, timeout=None):
       """Terminates dtrace."""
       logging.debug('close(%s)' % timeout)
       try:
         try:
           super(Dtrace.Tracer, self).close(timeout)
           # Signal dtrace that it should stop now.
           # ftruncate doesn't exist on Windows.
           os.ftruncate(self._dummy_file_id, 0)  # pylint: disable=E1101
           if timeout:
             start = time.time()
             # Use polling. :/
             while (self._dtrace.poll() is None and
                    (time.time() - start) < timeout):
               time.sleep(0.1)
             self._dtrace.kill()
           self._dtrace.wait()
         finally:
           # Make sure to kill it in any case.
           if self._dtrace.poll() is None:
             try:
               self._dtrace.kill()
               self._dtrace.wait()
             except OSError:
               pass

         if self._dtrace.returncode != 0:
           # Warn about any dtrace failure but basically ignore it.
           print 'dtrace failure: %s' % self._dtrace.returncode
       finally:
         os.close(self._dummy_file_id)
         fs.remove(self._dummy_file_name)

     def post_process_log(self):
       """Sorts the log back in order when each call occurred.

       dtrace doesn't save the buffer in strict order since it keeps one buffer
       per CPU.
       """
       super(Dtrace.Tracer, self).post_process_log()
       logname = self._logname + '.log'
       with fs.open(logname, 'rb') as logfile:
         lines = [l for l in logfile if l.strip()]
       errors = [l for l in lines if l.startswith('dtrace:')]
       if errors:
         raise TracingFailure(
             'Found errors in the trace: %s' % '\n'.join(errors),
             None, None, None, logname)
       try:
         lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
       except ValueError:
         raise TracingFailure(
             'Found errors in the trace: %s' % '\n'.join(
                 l for l in lines if l.split(' ', 1)[0].isdigit()),
             None, None, None, logname)
       with fs.open(logname, 'wb') as logfile:
         logfile.write(''.join(lines))

   def __init__(self, use_sudo=None):
     super(Dtrace, self).__init__()
     self.use_sudo = use_sudo

   def get_tracer(self, logname):
     return self.Tracer(logname, self.use_sudo)

   @staticmethod
   def clean_trace(logname):
     for ext in ('', '.log'):
       if fs.isfile(logname + ext):
         fs.remove(logname + ext)

   @classmethod
   def parse_log(cls, logname, blacklist, trace_name):
     logging.info('parse_log(%s, ..., %s)', logname, trace_name)
     assert os.path.isabs(logname)

     def blacklist_more(filepath):
       # All the HFS metadata is in the form /.vol/...
       return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)

     data = tools.read_json(logname)
     out = []
     for item in data['traces']:
       if trace_name and item['trace'] != trace_name:
         continue
       result = {
         'output': item['output'],
         'trace': item['trace'],
       }
       try:
         context = cls.Context(blacklist_more, item['pid'], item['cwd'])
         # It's fine to assume the file as UTF-8: OSX enforces the file names to
         # be valid UTF-8 and we control the log output.
         for line in codecs.open(logname + '.log', 'rb', encoding='utf-8'):
           context.on_line(line)
         result['results'] = context.to_results()
       except TracingFailure:
         result['exception'] = sys.exc_info()
       out.append(result)
     return out


 class LogmanTrace(ApiBase):
   """Uses the native Windows ETW based tracing functionality to trace a child
   process.

   Caveat: this implementations doesn't track cwd or initial_cwd. It is because
   the Windows Kernel doesn't have a concept of 'current working directory' at
   all. A Win32 process has a map of current directories, one per drive letter
   and it is managed by the user mode kernel32.dll. In kernel, a file is always
   opened relative to another file_object or as an absolute path. All the current
   working directory logic is done in user mode.
   """
   class Context(ApiBase.Context):
     """Processes a ETW log line and keeps the list of existent and non
     existent files accessed.

     Ignores directories.
     """
     # These indexes are for the stripped version in json.
     EVENT_NAME = 0
     TYPE = 1
     PID = 2
     TID = 3
     PROCESSOR_ID = 4
     TIMESTAMP = 5
     USER_DATA = 6

     class Process(ApiBase.Context.Process):
       def __init__(self, *args):
         super(LogmanTrace.Context.Process, self).__init__(*args)
         # Handle file objects that succeeded.
         self.file_objects = {}

     def __init__(self, blacklist, thunk_pid, trace_name, thunk_cmd):
       logging.info(
           '%s(%d, %s, %s)', self.__class__.__name__, thunk_pid, trace_name,
           thunk_cmd)
       super(LogmanTrace.Context, self).__init__(blacklist)
       self._drive_map = file_path.DosDriveMap()
       # Threads mapping to the corresponding process id.
       self._threads_active = {}
       # Process ID of the tracer, e.g. the temporary script created by
       # create_subprocess_thunk(). This is tricky because the process id may
       # have been reused.
       self._thunk_pid = thunk_pid
       self._thunk_cmd = thunk_cmd
       self._trace_name = trace_name
       self._line_number = 0
       self._thunk_process = None

     def on_line(self, line):
       """Processes a json Event line."""
       self._line_number += 1
       try:
         # By Opcode
         handler = getattr(
             self,
             'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
             None)
         if not handler:
           raise TracingFailure(
               'Unexpected event %s_%s' % (
                   line[self.EVENT_NAME], line[self.TYPE]),
               None, None, None)
         handler(line)
       except TracingFailure, e:
         # Hack in the values since the handler could be a static function.
         e.pid = line[self.PID]
         e.line = line
         e.line_number = self._line_number
         # Re-raise the modified exception.
         raise
       except (KeyError, NotImplementedError, ValueError), e:
         raise TracingFailure(
             'Trace generated a %s exception: %s' % (
                 e.__class__.__name__, str(e)),
             line[self.PID],
             self._line_number,
             line,
             e)

     def to_results(self):
       if not self.root_process:
         raise TracingFailure(
             'Failed to detect the initial process %d' % self._thunk_pid,
             None, None, None)
       process = self.root_process.to_results_process()
       return Results(process)

     def _thread_to_process(self, tid):
       """Finds the process from the thread id."""
       tid = int(tid, 16)
       pid = self._threads_active.get(tid)
       if not pid or not self._process_lookup.get(pid):
         return None
       return self._process_lookup[pid]

     @classmethod
     def handle_EventTrace_Header(cls, line):
       """Verifies no event was dropped, e.g. no buffer overrun occurred."""
       BUFFER_SIZE = cls.USER_DATA
       #VERSION = cls.USER_DATA + 1
       #PROVIDER_VERSION = cls.USER_DATA + 2
       #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3
       #END_TIME = cls.USER_DATA + 4
       #TIMER_RESOLUTION = cls.USER_DATA + 5
       #MAX_FILE_SIZE = cls.USER_DATA + 6
       #LOG_FILE_MODE = cls.USER_DATA + 7
       #BUFFERS_WRITTEN = cls.USER_DATA + 8
       #START_BUFFERS = cls.USER_DATA + 9
       #POINTER_SIZE = cls.USER_DATA + 10
       EVENTS_LOST = cls.USER_DATA + 11
       #CPU_SPEED = cls.USER_DATA + 12
       #LOGGER_NAME = cls.USER_DATA + 13
       #LOG_FILE_NAME = cls.USER_DATA + 14
       #BOOT_TIME = cls.USER_DATA + 15
       #PERF_FREQ = cls.USER_DATA + 16
       #START_TIME = cls.USER_DATA + 17
       #RESERVED_FLAGS = cls.USER_DATA + 18
       #BUFFERS_LOST = cls.USER_DATA + 19
       #SESSION_NAME_STRING = cls.USER_DATA + 20
       #LOG_FILE_NAME_STRING = cls.USER_DATA + 21
       if line[EVENTS_LOST] != '0':
         raise TracingFailure(
             ('%s events were lost during trace, please increase the buffer '
              'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
             None, None, None)

     def handle_FileIo_Cleanup(self, line):
       """General wisdom: if a file is closed, it's because it was opened.

       Note that FileIo_Close is not used since if a file was opened properly but
       not closed before the process exits, only Cleanup will be logged.
       """
       #IRP = self.USER_DATA
       TTID = self.USER_DATA + 1  # Thread ID, that's what we want.
       FILE_OBJECT = self.USER_DATA + 2
       #FILE_KEY = self.USER_DATA + 3
       proc = self._thread_to_process(line[TTID])
       if not proc:
         # Not a process we care about.
         return
       file_object = line[FILE_OBJECT]
       if file_object in proc.file_objects:
         filepath, access_type = proc.file_objects.pop(file_object)
         proc.add_file(filepath, access_type)

     def handle_FileIo_Create(self, line):
       """Handles a file open.

       All FileIo events are described at
       http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx
       for some value of 'description'.

       " (..) process and thread id values of the IO events (..) are not valid "
       http://msdn.microsoft.com/magazine/ee358703.aspx

       The FileIo.Create event doesn't return if the CreateFile() call
       succeeded, so keep track of the file_object and check that it is
       eventually closed with FileIo_Cleanup.
       """
       #IRP = self.USER_DATA
       TTID = self.USER_DATA + 1  # Thread ID, that's what we want.
       FILE_OBJECT = self.USER_DATA + 2
       #CREATE_OPTIONS = self.USER_DATA + 3
       #FILE_ATTRIBUTES = self.USER_DATA + 4
       #SHARE_ACCESS = self.USER_DATA + 5
       OPEN_PATH = self.USER_DATA + 6

       proc = self._thread_to_process(line[TTID])
       if not proc:
         # Not a process we care about.
         return

       raw_path = line[OPEN_PATH]
       # Ignore directories and bare drive right away.
       if raw_path.endswith(os.path.sep):
         return
       filepath = self._drive_map.to_win32(raw_path)
       # Ignore bare drive right away. Some may still fall through with format
       # like '\\?\X:'
       if len(filepath) == 2:
         return
       file_object = line[FILE_OBJECT]
       if fs.isdir(filepath):
         # There is no O_DIRECTORY equivalent on Windows. The closed is
         # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
         # simply discard directories are they are found.
         return
       # Override any stale file object.
       # TODO(maruel): Figure out a way to detect if the file was opened for
       # reading or writing. Sadly CREATE_OPTIONS doesn't seem to be of any help
       # here. For now mark as None to make it clear we have no idea what it is
       # about.
       proc.file_objects[file_object] = (filepath, None)

     def handle_FileIo_Rename(self, line):
       # TODO(maruel): Handle?
       pass

     def handle_Process_End(self, line):
       pid = line[self.PID]
       if self._process_lookup.get(pid):
         logging.info('Terminated: %d' % pid)
         self._process_lookup[pid] = None
       else:
         logging.debug('Terminated: %d' % pid)
       if self._thunk_process and self._thunk_process.pid == pid:
         self._thunk_process = None

     def handle_Process_Start(self, line):
       """Handles a new child process started by PID."""
       #UNIQUE_PROCESS_KEY = self.USER_DATA
       PROCESS_ID = self.USER_DATA + 1
       #PARENT_PID = self.USER_DATA + 2
       #SESSION_ID = self.USER_DATA + 3
       #EXIT_STATUS = self.USER_DATA + 4
       #DIRECTORY_TABLE_BASE = self.USER_DATA + 5
       #USER_SID = self.USER_DATA + 6
       IMAGE_FILE_NAME = self.USER_DATA + 7
       COMMAND_LINE = self.USER_DATA + 8

       ppid = line[self.PID]
       pid = int(line[PROCESS_ID], 16)
       command_line = CommandLineToArgvW(line[COMMAND_LINE])
       logging.debug(
           'New process %d->%d (%s) %s' %
             (ppid, pid, line[IMAGE_FILE_NAME], command_line))

       if pid == self._thunk_pid:
         # Need to ignore processes we don't know about because the log is
         # system-wide. self._thunk_pid shall start only one process.
         # This is tricky though because Windows *loves* to reuse process id and
         # it happens often that the process ID of the thunk script created by
         # create_subprocess_thunk() is reused. So just detecting the pid here is
         # not sufficient, we must confirm the command line.
         if command_line[:len(self._thunk_cmd)] != self._thunk_cmd:
           logging.info(
               'Ignoring duplicate pid %d for %s: %s while searching for %s',
               pid, self._trace_name, command_line, self._thunk_cmd)
           return

         # TODO(maruel): The check is quite weak. Add the thunk path.
         if self._thunk_process:
           raise TracingFailure(
               ('Parent process is _thunk_pid(%d) but thunk_process(%d) is '
                'already set') % (self._thunk_pid, self._thunk_process.pid),
               None, None, None)
         proc = self.Process(self.blacklist, pid, None)
         self._thunk_process = proc
         return
       elif ppid == self._thunk_pid and self._thunk_process:
         proc = self.Process(self.blacklist, pid, None)
         self.root_process = proc
         ppid = None
       elif self._process_lookup.get(ppid):
         proc = self.Process(self.blacklist, pid, None)
         self._process_lookup[ppid].children.append(proc)
       else:
         # Ignore
         return
       self._process_lookup[pid] = proc

       proc.command = command_line
       proc.executable = line[IMAGE_FILE_NAME]
       # proc.command[0] may be the absolute path of 'executable' but it may be
       # anything else too. If it happens that command[0] ends with executable,
       # use it, otherwise defaults to the base name.
       cmd0 = proc.command[0].lower()
       if not cmd0.endswith('.exe'):
         # TODO(maruel): That's not strictly true either.
         cmd0 += '.exe'
       if cmd0.endswith(proc.executable) and fs.isfile(cmd0):
         # Fix the path.
         cmd0 = cmd0.replace('/', os.path.sep)
         cmd0 = os.path.normpath(cmd0)
         proc.executable = file_path.get_native_path_case(cmd0)
       logging.info(
           'New child: %s -> %d %s' % (ppid, pid, proc.executable))

     def handle_Thread_End(self, line):
       """Has the same parameters as Thread_Start."""
       tid = int(line[self.TID], 16)
       self._threads_active.pop(tid, None)

     def handle_Thread_Start(self, line):
       """Handles a new thread created.

       Do not use self.PID here since a process' initial thread is created by
       the parent process.
       """
       PROCESS_ID = self.USER_DATA
       TTHREAD_ID = self.USER_DATA + 1
       #STACK_BASE = self.USER_DATA + 2
       #STACK_LIMIT = self.USER_DATA + 3
       #USER_STACK_BASE = self.USER_DATA + 4
       #USER_STACK_LIMIT = self.USER_DATA + 5
       #AFFINITY = self.USER_DATA + 6
       #WIN32_START_ADDR = self.USER_DATA + 7
       #TEB_BASE = self.USER_DATA + 8
       #SUB_PROCESS_TAG = self.USER_DATA + 9
       #BASE_PRIORITY = self.USER_DATA + 10
       #PAGE_PRIORITY = self.USER_DATA + 11
       #IO_PRIORITY = self.USER_DATA + 12
       #THREAD_FLAGS = self.USER_DATA + 13
       # Do not use self.PID here since a process' initial thread is created by
       # the parent process.
       pid = int(line[PROCESS_ID], 16)
       tid = int(line[TTHREAD_ID], 16)
       logging.debug('New thread pid:%d, tid:%d' % (pid, tid))
       self._threads_active[tid] = pid

     @classmethod
     def supported_events(cls):
       """Returns all the processed events."""
       out = []
       for member in dir(cls):
         match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
         if match:
           out.append(match.groups())
       return out

   class Tracer(ApiBase.Tracer):
     # The basic headers.
     EXPECTED_HEADER = [
       u'Event Name',
       u'Type',
       u'Event ID',
       u'Version',
       u'Channel',
       u'Level',  # 5
       u'Opcode',
       u'Task',
       u'Keyword',
       u'PID',
       u'TID',  # 10
       u'Processor Number',
       u'Instance ID',
       u'Parent Instance ID',
       u'Activity ID',
       u'Related Activity ID',  # 15
       u'Clock-Time',
       u'Kernel(ms)',  # Both have a resolution of ~15ms which makes them
       u'User(ms)',    # pretty much useless.
       u'User Data',   # Extra arguments that are event-specific.
     ]
     # Only the useful headers common to all entries are listed there. Any column
     # at 19 or higher is dependent on the specific event.
     EVENT_NAME = 0
     TYPE = 1
     PID = 9
     TID = 10
     PROCESSOR_ID = 11
     TIMESTAMP = 16
     NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
     USER_DATA = 19

     class CsvReader(object):
       """CSV reader that reads files generated by tracerpt.exe.

       csv.reader() fails to read them properly, it mangles file names quoted
       with "" with a comma in it.
       """
         # 0. Had a ',' or one of the following ' ' after a comma, next should
         # be ' ', '"' or string or ',' for an empty field.
       (HAD_DELIMITER,
        # 1. Processing an unquoted field up to ','.
        IN_STR,
        # 2. Processing a new field starting with '"'.
        STARTING_STR_QUOTED,
        # 3. Second quote in a row at the start of a field. It could be either
        # '""foo""' or '""'. Who the hell thought it was a great idea to use
        # the same character for delimiting and escaping?
        STARTING_SECOND_QUOTE,
        # 4. A quote inside a quoted string where the previous character was
        # not a quote, so the string is not empty. Can be either: end of a
        # quoted string (a delimiter) or a quote escape. The next char must be
        # either '"' or ','.
        HAD_QUOTE_IN_QUOTED,
        # 5. Second quote inside a quoted string.
        HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED,
        # 6. Processing a field that started with '"'.
        IN_STR_QUOTED) = range(7)

       def __init__(self, f):
         self.f = f

       def __iter__(self):
         return self

       def next(self):
         """Splits the line in fields."""
         line = self.f.readline()
         if not line:
           raise StopIteration()
         line = line.strip()
         fields = []
         state = self.HAD_DELIMITER
         for i, c in enumerate(line):
           if state == self.HAD_DELIMITER:
             if c == ',':
               # Empty field.
               fields.append('')
             elif c == ' ':
               # Ignore initial whitespaces
               pass
             elif c == '"':
               state = self.STARTING_STR_QUOTED
               fields.append('')
             else:
               # Start of a new field.
               state = self.IN_STR
               fields.append(c)

           elif state == self.IN_STR:
             # Do not accept quote inside unquoted field.
             assert c != '"', (i, c, line, fields)
             if c == ',':
               fields[-1] = fields[-1].strip()
               state = self.HAD_DELIMITER
             else:
               fields[-1] = fields[-1] + c

           elif state == self.STARTING_STR_QUOTED:
             if c == '"':
               # Do not store the character yet.
               state = self.STARTING_SECOND_QUOTE
             else:
               state = self.IN_STR_QUOTED
               fields[-1] = fields[-1] + c

           elif state == self.STARTING_SECOND_QUOTE:
             if c == ',':
               # It was an empty field. '""' == ''.
               state = self.HAD_DELIMITER
             else:
               fields[-1] = fields[-1] + '"' + c
               state = self.IN_STR_QUOTED

           elif state == self.HAD_QUOTE_IN_QUOTED:
             if c == ',':
               # End of the string.
               state = self.HAD_DELIMITER
             elif c == '"':
               state = self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED
             else:
               # The previous double-quote was just an unescaped quote.
               fields[-1] = fields[-1] + '"' + c
               state = self.IN_STR_QUOTED

           elif state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED:
             if c == ',':
               # End of the string.
               state = self.HAD_DELIMITER
               fields[-1] = fields[-1] + '"'
             else:
               # That's just how the logger rolls. Revert back to appending the
               # char and "guess" it was a quote in a double-quoted string.
               state = self.IN_STR_QUOTED
               fields[-1] = fields[-1] + '"' + c

           elif state == self.IN_STR_QUOTED:
             if c == '"':
               # Could be a delimiter or an escape.
               state = self.HAD_QUOTE_IN_QUOTED
             else:
               fields[-1] = fields[-1] + c

         if state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED:
           fields[-1] = fields[-1] + '"'
         else:
           assert state in (
               # Terminated with a normal field.
               self.IN_STR,
               # Terminated with an empty field.
               self.STARTING_SECOND_QUOTE,
               # Terminated with a normal quoted field.
               self.HAD_QUOTE_IN_QUOTED), (
               line, state, fields)
         return fields

     def __init__(self, logname):
       """Starts the log collection.

       Requires administrative access. logman.exe is synchronous so no need for a
       "warmup" call.  'Windows Kernel Trace' is *localized* so use its GUID
       instead.  The GUID constant name is SystemTraceControlGuid. Lovely.

       One can get the list of potentially interesting providers with:
       "logman query providers | findstr /i file"
       """
       super(LogmanTrace.Tracer, self).__init__(logname)
       self._signal_script = create_subprocess_thunk()
       self._scripts_to_cleanup.append(self._signal_script)
       cmd_start = [
         'logman.exe',
         'start',
         'NT Kernel Logger',
         '-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
         # splitio,fileiocompletion,syscall,file,cswitch,img
         '(process,fileio,thread)',
         '-o', self._logname + '.etl',
         '-ets',  # Send directly to kernel
         # Values extracted out of thin air.
         # Event Trace Session buffer size in kb.
         '-bs', '10240',
         # Number of Event Trace Session buffers.
         '-nb', '16', '256',
       ]
       logging.debug('Running: %s' % cmd_start)
       try:
         subprocess.check_call(
             cmd_start,
             stdin=subprocess.PIPE,
             stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT)
       except subprocess.CalledProcessError, e:
         if e.returncode == -2147024891:
           print >> sys.stderr, 'Please restart with an elevated admin prompt'
         elif e.returncode == -2144337737:
           print >> sys.stderr, (
               'A kernel trace was already running, stop it and try again')
         raise

     def trace(self, cmd, cwd, tracename, output):
       logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
       assert os.path.isabs(cmd[0]), cmd[0]
       assert os.path.isabs(cwd), cwd
       assert os.path.normpath(cwd) == cwd, cwd
       with self._lock:
         if not self._initialized:
           raise TracingFailure(
               'Called Tracer.trace() on an unitialized object',
               None, None, None, tracename)
         assert tracename not in (i['trace'] for i in self._traces)

       # Use "logman -?" for help.

       stdout = stderr = None
       if output:
         stdout = subprocess.PIPE
         stderr = subprocess.STDOUT

       # Run the child process.
       logging.debug('Running: %s' % cmd)
       # Use the temporary script generated with create_subprocess_thunk() so we
       # have a clear pid owner. Since trace_inputs.py can be used as a library
       # and could trace multiple processes simultaneously, it makes it more
       # complex if the executable to be traced is executed directly here. It
       # also solves issues related to logman.exe that needs to be executed to
       # control the kernel trace.
       child_cmd = [
         sys.executable,
         self._signal_script,
         tracename,
       ]
       child = subprocess.Popen(
           child_cmd + tools.fix_python_cmd(cmd),
           cwd=cwd,
           stdin=subprocess.PIPE,
           stdout=stdout,
           stderr=stderr)
       logging.debug('Started child pid: %d' % child.pid)
       out = child.communicate()[0]
       # This doesn't mean all the grand-children are done. Sadly, we don't have
       # a good way to determine that.

       with self._lock:
         assert tracename not in (i['trace'] for i in self._traces)
         self._traces.append({
           'cmd': cmd,
           'cwd': cwd,
           'output': out,
           'pid': child.pid,
           # Used to figure out the real process when process ids are reused.
           'thunk_cmd': child_cmd,
           'trace': tracename,
         })

       return child.returncode, out

     def close(self, _timeout=None):
       """Stops the kernel log collection and converts the traces to text
       representation.
       """
       with self._lock:
         try:
           super(LogmanTrace.Tracer, self).close()
         finally:
           cmd_stop = [
             'logman.exe',
             'stop',
             'NT Kernel Logger',
             '-ets',  # Sends the command directly to the kernel.
           ]
           logging.debug('Running: %s' % cmd_stop)
           subprocess.check_call(
               cmd_stop,
               stdin=subprocess.PIPE,
               stdout=subprocess.PIPE,
               stderr=subprocess.STDOUT)

     def post_process_log(self):
       """Converts the .etl file into .csv then into .json."""
       super(LogmanTrace.Tracer, self).post_process_log()
       logformat = 'csv'
       self._convert_log(logformat)
       self._trim_log(logformat)

     def _gen_logdata(self):
       return  {
         'format': 'csv',
         'traces': self._traces,
       }

     def _trim_log(self, logformat):
       """Reduces the amount of data in original log by generating a 'reduced'
       log.
       """
       if logformat == 'csv_utf16':
         file_handle = codecs.open(
             self._logname + '.' + logformat, 'r', encoding='utf-16')

       elif logformat == 'csv':
         assert sys.getfilesystemencoding() == 'mbcs'
         file_handle = codecs.open(
               self._logname + '.' + logformat, 'r',
               encoding=get_current_encoding())

       supported_events = LogmanTrace.Context.supported_events()

       def trim(generator):
         """Loads items from the generator and returns the interesting data.

         It filters out any uninteresting line and reduce the amount of data in
         the trace.
         """
         for index, line in enumerate(generator):
           if not index:
             if line != self.EXPECTED_HEADER:
               raise TracingFailure(
                   'Found malformed header: %s' % line,
                   None, None, None)
             continue
           # As you can see, the CSV is full of useful non-redundant information:
           # pylint: disable=too-many-boolean-expressions
           if (line[2] != '0' or  # Event ID
               line[3] not in ('2', '3') or  # Version
               line[4] != '0' or  # Channel
               line[5] != '0' or  # Level
               line[7] != '0' or  # Task
               line[8] != '0x0000000000000000' or  # Keyword
               line[12] != '' or  # Instance ID
               line[13] != '' or  # Parent Instance ID
               line[14] != self.NULL_GUID or  # Activity ID
               line[15] != ''):  # Related Activity ID
             raise TracingFailure(
                 'Found unexpected values in line: %s' % ' '.join(line),
                   None, None, None)

           if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
             continue

           yield [
               line[self.EVENT_NAME],
               line[self.TYPE],
               line[self.PID],
               line[self.TID],
               line[self.PROCESSOR_ID],
               line[self.TIMESTAMP],
           ] + line[self.USER_DATA:]

       # must not convert the trim() call into a list, since it will use too much
       # memory for large trace. use a csv file as a workaround since the json
       # parser requires a complete in-memory file.
       path = os.path.join(
           unicode(os.getcwd()), u'%s.preprocessed' % self._logname)
       with fs.open(path, 'wb') as f:
         # $ and * can't be used in file name on windows, reducing the likelihood
         # of having to escape a string.
         out = csv.writer(
             f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL)
         for line in trim(self.CsvReader(file_handle)):
           out.writerow([s.encode('utf-8') for s in line])

     def _convert_log(self, logformat):
       """Converts the ETL trace to text representation.

       Normally, 'csv' is sufficient. If complex scripts are used (like eastern
       languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.

       Arguments:
         - logformat: Text format to be generated, csv, csv_utf16 or xml.

       Use "tracerpt -?" for help.
       """
       LOCALE_INVARIANT = 0x7F
       windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
       cmd_convert = [
         'tracerpt.exe',
         '-l', self._logname + '.etl',
         '-o', self._logname + '.' + logformat,
         '-gmt',  # Use UTC
         '-y',  # No prompt
         # Use -of XML to get the header of each items after column 19, e.g. all
         # the actual headers of 'User Data'.
       ]

       if logformat == 'csv':
         # tracerpt localizes the 'Type' column, for major brainfuck
         # entertainment. I can't imagine any sane reason to do that.
         cmd_convert.extend(['-of', 'CSV'])
       elif logformat == 'csv_utf16':
         # This causes it to use UTF-16, which doubles the log size but ensures
         # the log is readable for non-ASCII characters.
         cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
       elif logformat == 'xml':
         cmd_convert.extend(['-of', 'XML'])
       else:
         raise ValueError('Unexpected log format \'%s\'' % logformat)
       logging.debug('Running: %s' % cmd_convert)
       # This can takes tens of minutes for large logs.
       # Redirects all output to stderr.
       subprocess.check_call(
           cmd_convert,
           stdin=subprocess.PIPE,
           stdout=sys.stderr,
           stderr=sys.stderr)

   def __init__(self, use_sudo=False):  # pylint: disable=W0613
     super(LogmanTrace, self).__init__()
     # Ignore use_sudo. It's irrelevant on Windows but kept to simplify the API.

   @staticmethod
   def clean_trace(logname):
     for ext in ('', '.csv', '.etl', '.json', '.xml', '.preprocessed'):
       if fs.isfile(logname + ext):
         fs.remove(logname + ext)

   @classmethod
   def parse_log(cls, logname, blacklist, trace_name):
     logging.info('parse_log(%s, ..., %s)', logname, trace_name)
     assert os.path.isabs(logname)

     def blacklist_more(filepath):
       # All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
       return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)

     # Create a list of (Context, result_dict) tuples. This is necessary because
     # the csv file may be larger than the amount of available memory.
     contexes = [
       (
         cls.Context(
             blacklist_more, item['pid'], item['trace'], item['thunk_cmd']),
         {
           'output': item['output'],
           'trace': item['trace'],
         },
       )
       for item in tools.read_json(logname)['traces']
       if not trace_name or item['trace'] == trace_name
     ]

     # The log may be too large to fit in memory and it is not efficient to read
     # it multiple times, so multiplex the contexes instead, which is slightly
     # more awkward.
     path = os.path.join(unicode(os.getcwd()), u'%s.preprocessed' % logname)
     with fs.open(path, 'rb') as f:
       lines = csv.reader(
           f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL)
       for encoded in lines:
         line = [s.decode('utf-8') for s in encoded]
         # Convert the PID in-place from hex.
         line[cls.Context.PID] = int(line[cls.Context.PID], 16)
         for context in contexes:
           if 'exception' in context[1]:
             continue
           try:
             context[0].on_line(line)
           except TracingFailure:
             context[1]['exception'] = sys.exc_info()

     for context in contexes:
       if 'exception' in context[1]:
         continue
       context[1]['results'] = context[0].to_results()

     return [context[1] for context in contexes]


 def get_api(**kwargs):
   """Returns the correct implementation for the current OS."""
   if sys.platform == 'cygwin':
     raise NotImplementedError(
         'Not implemented for cygwin, start the script from Win32 python')
   flavors = {
     'win32': LogmanTrace,
     'darwin': Dtrace,
     'sunos5': Dtrace,
     'freebsd7': Dtrace,
     'freebsd8': Dtrace,
   }
   # Defaults to strace.
   return flavors.get(sys.platform, Strace)(**kwargs)


 def extract_directories(root_dir, files, blacklist):
   """Detects if all the files in a directory are in |files| and if so, replace
   the individual files by a Results.Directory instance.

   Takes a list of Results.File instances and returns a shorter list of
   Results.File and Results.Directory instances.

   Arguments:
     - root_dir: Optional base directory that shouldn't be search further.
     - files: list of Results.File instances.
     - blacklist: lambda to reject unneeded files, for example '.+\\.pyc'.
   """
   logging.info(
       'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))
   assert not (root_dir or '').endswith(os.path.sep), root_dir
   # It is important for root_dir to not be a symlinked path, make sure to call
   # os.path.realpath() as needed.
   assert not root_dir or (
       os.path.realpath(file_path.get_native_path_case(root_dir)) == root_dir)
   assert not any(isinstance(f, Results.Directory) for f in files)
   # Remove non existent files.
   files = [f for f in files if f.existent]
   if not files:
     return files
   # All files must share the same root, which can be None.
   assert len(set(f.root for f in files)) == 1, set(f.root for f in files)

   # Creates a {directory: {filename: File}} mapping, up to root.
   buckets = {}
   if root_dir:
     buckets[root_dir] = {}
   for fileobj in files:
     path = fileobj.full_path
     directory = os.path.dirname(path)
     assert directory
     # Do not use os.path.basename() so trailing os.path.sep is kept.
     basename = path[len(directory)+1:]
     files_in_directory = buckets.setdefault(directory, {})
     files_in_directory[basename] = fileobj
     # Add all the directories recursively up to root.
     while True:
       old_d = directory
       directory = os.path.dirname(directory)
       if directory + os.path.sep == root_dir or directory == old_d:
         break
       buckets.setdefault(directory, {})

   root_prefix = len(root_dir) + 1 if root_dir else 0
   for directory in sorted(buckets, reverse=True):
     if not fs.isdir(directory):
       logging.debug(
           '%s was a directory but doesn\'t exist anymore; ignoring', directory)
       continue
     actual = set(f for f in fs.listdir(directory) if not blacklist(f))
     expected = set(buckets[directory])
     if not (actual - expected):
       parent = os.path.dirname(directory)
       buckets[parent][os.path.basename(directory)] = Results.Directory(
         root_dir,
         directory[root_prefix:],
         False,
         sum(f.size for f in buckets[directory].itervalues()),
         sum(f.nb_files for f in buckets[directory].itervalues()))
       # Remove the whole bucket.
       del buckets[directory]

   # Reverse the mapping with what remains. The original instances are returned,
   # so the cached meta data is kept.
   files = sum((x.values() for x in buckets.itervalues()), [])
   return sorted(files, key=lambda x: x.path)


 def trace(logfile, cmd, cwd, api, output):
   """Traces an executable. Returns (returncode, output) from api.

   Arguments:
   - logfile: file to write to.
   - cmd: command to run.
   - cwd: current directory to start the process in.
   - api: a tracing api instance.
   - output: if True, returns output, otherwise prints it at the console.
   """
   cmd = tools.fix_python_cmd(cmd)
   api.clean_trace(logfile)
   with api.get_tracer(logfile) as tracer:
     return tracer.trace(cmd, cwd, 'default', output)


 def CMDclean(parser, args):
   """Cleans up traces."""
   options, args = parser.parse_args(args)
   api = get_api()
   api.clean_trace(options.log)
   return 0


 def CMDtrace(parser, args):
   """Traces an executable."""
   parser.allow_interspersed_args = False
   parser.add_option(
       '-q', '--quiet', action='store_true',
       help='Redirects traced executable output to /dev/null')
   parser.add_option(
       '-s', '--sudo', action='store_true',
       help='Use sudo when shelling out the tracer tool (ignored on Windows)')
   parser.add_option(
       '-n', '--no-sudo', action='store_false',
       help='Don\'t use sudo')
   options, args = parser.parse_args(args)

   if not args:
     parser.error('Please provide a command to run')

   if not can_trace():
     parser.error('Please rerun this program with admin privileges')

   if not os.path.isabs(args[0]) and fs.access(args[0], os.X_OK):
     args[0] = os.path.abspath(args[0])

   # options.sudo default value is None, which is to do whatever tracer defaults
   # do.
   api = get_api(use_sudo=options.sudo)
   return trace(options.log, args, os.getcwd(), api, options.quiet)[0]


 def CMDread(parser, args):
   """Reads the logs and prints the result."""
   parser.add_option(
       '-V', '--variable',
       nargs=2,
       action='append',
       dest='variables',
       metavar='VAR_NAME directory',
       default=[],
       help=('Variables to replace relative directories against. Example: '
             '"-v \'$HOME\' \'/home/%s\'" will replace all occurence of your '
             'home dir with $HOME') % getpass.getuser())
   parser.add_option(
       '--root-dir',
       help='Root directory to base everything off it. Anything outside of this '
            'this directory will not be reported')
   parser.add_option(
       '--trace-name',
       help='Only reads one of the trace. Defaults to reading all traces')
   parser.add_option(
       '-j', '--json', action='store_true',
       help='Outputs raw result data as json')
   parser.add_option(
       '--trace-blacklist', action='append', default=[],
       help='List of regexp to use as blacklist filter')
   options, args = parser.parse_args(args)

   if options.root_dir:
     options.root_dir = file_path.get_native_path_case(
         unicode(os.path.abspath(options.root_dir)))

   variables = dict(options.variables)
   api = get_api()
   blacklist = tools.gen_blacklist(options.trace_blacklist)
   data = api.parse_log(options.log, blacklist, options.trace_name)
   # Process each trace.
   output_as_json = []
   try:
     for item in data:
       if 'exception' in item:
         # Do not abort the other traces.
         print >> sys.stderr, (
             'Trace %s: Got an exception: %s' % (
               item['trace'], item['exception'][1]))
         continue
       results = item['results']
       if options.root_dir:
         results = results.strip_root(options.root_dir)

       if options.json:
         output_as_json.append(results.flatten())
       else:
         simplified = extract_directories(
             options.root_dir, results.files, blacklist)
         simplified = [f.replace_variables(variables) for f in simplified]
         if len(data) > 1:
           print('Trace: %s' % item['trace'])
         print('Total: %d' % len(results.files))
         print('Non existent: %d' % len(results.non_existent))
         for f in results.non_existent:
           print('  %s' % f.path)
         print(
             'Interesting: %d reduced to %d' % (
                 len(results.existent), len(simplified)))
         for f in simplified:
           print('  %s' % f.path)

     if options.json:
       tools.write_json(sys.stdout, output_as_json, False)
   except KeyboardInterrupt:
     return 1
   except IOError as e:
     if e.errno == errno.EPIPE:
       # Do not print a stack trace when the output is piped to less and the user
       # quits before the whole output was written.
       return 1
     raise
   return 0


 class OptionParserTraceInputs(logging_utils.OptionParserWithLogging):
   """Adds automatic --log handling."""

   # Disable --log-file options since both --log and --log-file options are
   # confusing.
   # TODO(vadimsh): Rename --log-file or --log to something else.
   enable_log_file = False

   def __init__(self, **kwargs):
     logging_utils.OptionParserWithLogging.__init__(self, **kwargs)
     self.add_option(
         '-l', '--log', help='Log file to generate or read, required')

   def parse_args(self, *args, **kwargs):
     """Makes sure the paths make sense.

     On Windows, / and \\ are often mixed together in a path.
     """
     options, args = logging_utils.OptionParserWithLogging.parse_args(
         self, *args, **kwargs)
     if not options.log:
       self.error('Must supply a log file with -l')
     options.log = unicode(os.path.abspath(options.log))
     return options, args


 def main(argv):
   dispatcher = subcommand.CommandDispatcher(__name__)
   try:
     return dispatcher.execute(
         OptionParserTraceInputs(version=__version__), argv)
   except TracingFailure, e:
     sys.stderr.write('\nError: ')
     sys.stderr.write(str(e))
     sys.stderr.write('\n')
     return 1


 if __name__ == '__main__':
   subprocess42.inhibit_os_error_reporting()
   fix_encoding.fix_encoding()
   tools.disable_buffering()
   colorama.init()
   sys.exit(main(sys.argv[1:]))