blob: ef07d27ca94ed2c19b58a3e099b6abf3e0f56bde [file] [log] [blame]
# coding=utf8
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Sends patches to the Try server and reads back results.
- TryJobs contains TryJob, one per try job on a builder.
- TryRunnerBase contains the common logic to send try jobs and responds to the
try job results.
- TryRunnerSvn uses svn plus /json polling on the try server for status updates.
"""
import logging
import os
import re
import time
import urllib2
import find_depot_tools # pylint: disable=W0611
import presubmit_support
import trychange
import buildbot_json
import model
from verification import base
# We don't want to have trychange use gcl so block it.
trychange.gcl = None
# Hack out trychange logging.info(): route trychange's logging through a
# dedicated logger capped at WARNING so its chatty info() calls are silenced.
trychange.logging = logging.getLogger('trychange')
trychange.logging.setLevel(logging.WARNING)
def or_3_way(a, b):
  """Returns highest value, where True > False > None.

  Used to merge per-step results across builds: True means the step passed at
  least once, False means it only ever failed, None means it never ran.

  Note: the previous implementation returned `a or b`, which yielded None for
  (False, None) but False for (None, False); this version is symmetric and
  matches the documented ordering.
  """
  if a is None:
    return b
  if b is None:
    return a
  # Both values are booleans here; `or` picks True over False.
  return a or b
def parse_gclient_rev(rev):
  """Returns the bare revision of a gclient-style revision string.

  A gclient revision may be qualified as 'solution@rev'; only the part after
  the last '@' is kept. None passes through unchanged.
  """
  if rev is None:
    return None
  return str(rev).rpartition('@')[2] or str(rev)
def unambiguous_revision(checkout, revision):
  """Returns True if |revision| pins a single commit for this checkout.

  Symbolic references like HEAD, a date or a branch name are ambiguous and
  rejected.
  """
  rev = parse_gclient_rev(revision)
  if not rev:
    return False
  checkout_kind = checkout.__class__.__name__
  # Plain integers are svn revision numbers.
  if 'Svn' in checkout_kind and rev.isdigit():
    # GitSvn should accept revision numbers?
    return True
  # Hex strings of plausible length look like (abbreviated) git hashes.
  if 'Git' in checkout_kind and re.match(r'^[a-f0-9]{5,20}$', rev):
    return True
  return False
class TryJob(model.PersistentMixIn):
  """Represents a try job for a pending commit.

  This data can be regenerated by parsing all the try job names but it is a
  bit hard on the try server.

  TODO(maruel): Should use __getstate__(), __setstate__() and __reduce__().
  """
  builder = unicode
  build = (None, int)
  revision = (None, int)
  result = (None, int)
  sent = float
  failed_steps = list
  clobber = bool
  name = (None, unicode)
  # Number of retries for this configuration.
  tries = int
  tests = list

  def __init__(self, **kwargs):
    # Default the sent timestamp to 'now' unless explicitly provided.
    kwargs.setdefault('sent', time.time())
    super(TryJob, self).__init__(**kwargs)

  def get_state(self):
    """Maps the buildbot result code onto a base.* verification state."""
    completed_ok = (
        buildbot_json.SUCCESS, buildbot_json.WARNINGS, buildbot_json.SKIPPED)
    completed_bad = (
        buildbot_json.FAILURE, buildbot_json.EXCEPTION, buildbot_json.RETRY)
    if self.result in completed_ok:
      return base.SUCCEEDED
    if self.result in completed_bad:
      return base.FAILED
    # No result recorded yet: the job is still pending or running.
    assert self.result == None
    return base.PROCESSING
class TryJobs(base.IVerifierStatus):
  """A set of try jobs that were sent for a specific patch."""
  # An array of TryJob objects.
  try_jobs = list
  # When NOTRY=true is specified.
  skipped = bool

  def get_state(self):
    """Aggregates all jobs' states; the most severe state wins."""
    if self.skipped:
      # NOTRY=true: the verification trivially succeeds.
      return base.SUCCEEDED
    if not self.try_jobs:
      # Nothing was sent yet.
      return base.PROCESSING
    states = set(job.get_state() for job in self.try_jobs)
    assert states.issubset(base.VALID_STATES)
    # States are ordered so max() yields the dominating one.
    return max(states)

  def why_not(self):
    pass
def steps_quality(steps):
  """Folds a list of per-step results into a single quality tri-state.

  Returns None when there is no data at all, True when no step ever
  hard-failed (every entry is True or None), False otherwise.
  """
  if not steps:
    return None
  return not any(result not in (True, None) for result in steps)
class StepDb(object):
  """Calculate statistics about all steps for each revisions.

  Aggregates cached buildbot build results to estimate, per builder and per
  revision, which steps are known to be able to pass.
  """
  # Upper bound intent for cached builds per builder.
  max_cache = 200

  def __init__(self, builders, buildbot, checkout):
    # True until need_full() is called once; triggers an initial full fetch.
    self._need_full = True
    self.builders = builders
    self.buildbot = buildbot
    self.checkout = checkout

  def need_full(self):
    """Returns True exactly once to request an initial full cache fill."""
    result = self._need_full
    self._need_full = False
    return result

  def revision_quality_builder_steps(self, builder, revision):
    """Calculates the occurence of a successful step execution, for a specific
    builder, for builds on a specific revision.

    The return value is a tuple of two elements:
    1. array of True/False/None, one value per step in a build. True means the
       step passed at least once. False means the step always failed. None
       means the step never ran for this revision on this builder.
    2. Number of builds that ran on this builder at this revision.
    """
    revision = str(revision)
    steps = None
    nb_builds = 0
    # Only looks at builds already present in the local cache; no new fetch.
    for build in self.buildbot.builders[builder].builds.cached_children:
      if parse_gclient_rev(build.revision) != revision:
        continue
      nb_builds += 1
      # Skip builds whose step count disagrees with what was accumulated.
      if not (not steps or len(steps) == len(build.steps)):
        logging.warn('Invalid build %s' % build)
        continue
      if not steps or len(steps) != len(build.steps):
        # If the number of steps changed after a master restart, we need to
        # ditch the previous steps.
        # One workaround is to key by name but it's not worth the effort here,
        # the worst case is that previous builds that could be considered good
        # will be ignored, making it impossible to determine a lkgr.
        steps = [None] * len(build.steps)
      for step in build.steps:
        # Merge this build's step result with previous builds' results.
        steps[step.number] = or_3_way(
            steps[step.number], step.simplified_result)
    return steps, nb_builds

  def last_good_revision_builder(self, builder):
    """Returns LKGR for this builder or None if no revision was found.

    For a single revision, for each step, make sure step either passed at
    least once or never ran.

    For example, if build 1 has [True, True, False, None] and build 2 has
    [True, False, True, None], the revision is known to be good since each
    step run either succeeded one time or never ran.
    """
    # Maps revision string -> merged per-step results for that revision.
    state = {}
    for build in self.buildbot.builders[builder].builds.cached_children:
      if not unambiguous_revision(self.checkout, build.revision):
        # Ignore all builds that doesn't use revision numbers. It could be
        # instead svn date format {2011-01-30}, 'HEAD', 'BASE', etc.
        continue
      build_rev = parse_gclient_rev(build.revision)
      state.setdefault(build_rev, [None] * len(build.steps))
      for step in build.steps:
        if len(state[build_rev]) <= step.number:
          # The step list grew since this revision was first seen (e.g. a
          # master restart); ignore out-of-range steps.
          continue
        state[build_rev][step.number] = or_3_way(
            state[build_rev][step.number],
            step.simplified_result)
    # Keep only revisions where every step passed at least once or never ran.
    # NOTE(review): int() assumes numeric (svn) revisions; git hashes passing
    # unambiguous_revision would raise here — presumably only used with svn.
    revisions = sorted(
        int(revision) for revision in state
        if (all(v in (True, None) for v in state[revision])))
    if not revisions:
      return None
    return revisions[-1]
class TryRunnerBase(base.VerifierCheckout):
  """Stateless communication with a try server.

  Sends try jobs and reads try job status.

  Analysis goes as following:
  - compile step is not flaky. compile.py already takes care of most flakiness
    and clobber build is done by default. If compile step fails, try again
    with clobber=True
  - test steps are flaky and can be retried as necessary.
  """
  name = 'try server'

  # A try job sent this long ago and that hasn't started yet is deemed to be
  # lost.
  lost_try_job_delay = 15*60

  # Only updates a job status once every 60 seconds.
  update_latency = 60

  def __init__(
      self, context_obj, try_server_url, commit_user,
      builders_and_tests, ignored_steps, solution):
    super(TryRunnerBase, self).__init__(context_obj)
    self.commit_user = commit_user
    self.try_server_url = try_server_url
    # dict('builder': ['test1', 'test2']) of jobs to trigger per patch.
    self.builders_and_tests = builders_and_tests
    # Step names whose failure does not fail the job (e.g. housekeeping).
    self.ignored_steps = set(ignored_steps)
    # Backdated so the first update_status() call is not throttled.
    self.last_update = time.time() - self.update_latency
    self.solution = solution

  def verify(self, pending):
    """Sends a try job to the try server and returns a TryJob list."""
    jobs = pending.verifications.setdefault(self.name, TryJobs())
    if jobs.try_jobs:
      # Jobs were already sent for this patch; don't double-send.
      logging.warning(
          'Already tried jobs. Let it go. At worst, it\'ll time out soon.')
      return
    jobs.try_jobs = jobs.try_jobs or []
    if self._is_skip_try_job(pending):
      # Do not run try job for it.
      jobs.skipped = True
      return
    # One TryJob per configured builder, at the patch's revision.
    new_jobs = [
        TryJob(
            builder=builder,
            tests=self.builders_and_tests[builder],
            revision=pending.revision,
            clobber=False)
        for builder in sorted(self.builders_and_tests)
    ]
    jobs.try_jobs.extend(new_jobs)
    self._send_jobs(
        pending,
        new_jobs,
        False,
        self.builders_and_tests,
        unicode(pending.pending_name()))
    # Slightly postpone next check.
    self.last_update = min(
        time.time(), self.last_update + (self.update_latency / 4))

  def update_status(self, queue):
    """Grabs the current status of all try jobs and update self.queue.

    Note: it would be more efficient to be event based.
    """
    if not queue:
      logging.debug('The list is empty, nothing to do')
      return
    if time.time() - self.last_update < self.update_latency:
      # Polled less than update_latency seconds ago; skip this round.
      logging.debug('TS: Throttling updates')
      return
    self.last_update = time.time()
    self._update_statuses(queue)

  def _send_jobs(
      self, pending, jobs, need_prepare, builders_and_tests, job_name):
    """Prepares the TryJobs instance |jobs| to send try jobs to the try server.

    Sending try jobs is deferred to self._send_job().

    Arguments:
    - pending: pending_manager.Pending instance.
    - jobs: List of TryJob instances to be executed.
    - need_prepare: The checkout needs to have the patch applied, e.g. this
      function is called from within update_status().
    - builders_and_tests: dict('builder': ['test1', 'test2']) for try jobs to
      run. Can be self.builders_and_tests or a smaller subset when retrying
      jobs.
    - job_name: Job name to use, may have suffix like "retry".

    Raises base.DiscardPending when a job was retried too many times.
    """
    for job in jobs:
      job.tries = job.tries or 0
      job.tries += 1
      # Hard cap on retries; beyond it, give up on the whole patch.
      if job.tries > 4:
        raise base.DiscardPending(
            pending,
            ('The commit queue went berserk retrying too often for a\n'
             'seemingly flaky test. Builder is %s, revision is %s, job name\n'
             'was %s.') % (job.builder, job.revision, job_name))
    builders = sorted(job.builder for job in jobs)
    assert len(set(builders)) == len(builders)
    # All jobs in one batch must share the same revision and clobber flag.
    revision = set(job.revision for job in jobs)
    assert len(revision) == 1
    revision = revision.pop()
    clobber = set(job.clobber for job in jobs)
    assert len(clobber) == 1
    clobber = clobber.pop()
    for job in jobs:
      # Reset per-attempt state before (re)sending.
      job.result = None
      job.build = None
      job.name = job_name
      job.tests = builders_and_tests[job.builder]
    if need_prepare:
      self._prepare(pending, revision)
    self._send_job(pending, revision, clobber, builders_and_tests, job_name)
    for builder in builders:
      # Signal a new try job was sent.
      # NOTE(review): 'job' below is the variable leaked from the loop above,
      # so job.clobber is the last job's flag; it equals 'clobber' thanks to
      # the uniformity assert above, but using 'clobber' would be clearer.
      info = {
        'builder': builder,
        'clobber': job.clobber,
        'job_name': job_name,
        'revision': revision,
      }
      self.send_status(pending, info)
    for job in jobs:
      job.sent = time.time()

  def _build_status_url(self, job):
    """Html url for this try job."""
    assert job.build is not None, str(job)
    return '%s/buildstatus?builder=%s&number=%s' % (
        self.try_server_url.rstrip('/'), job.builder, job.build)

  def _error_msg(self, name, job, failed_steps):
    """Constructs the error message."""
    def steps_to_str(steps):
      # Human-readable rendering of a list of step names.
      if len(steps) > 1:
        return 'steps "%s"' % ', '.join(steps)
      elif steps:
        return 'step "%s"' % steps[0]
      else:
        return ''
    msg = u'Try job failure for %s on %s for %s' % (
        name, job.builder, steps_to_str(failed_steps))
    if job.clobber:
      msg += ' (clobber build)'
    msg += '.'
    if job.failed_steps:
      # job.failed_steps still holds the previous attempt's failures here.
      msg += u'\nIt\'s a second try, previously, %s failed.' % (
          steps_to_str(job.failed_steps))
    msg += '\n%s' % self._build_status_url(job)
    logging.info(msg)
    return msg

  def _handle_try_job(self, pending, jobs, job, build):
    """Determines if the try job is a good signal to commit the patch.

    On failure, decides between retrying (possibly with clobber or at LKGR)
    and failing the patch, via the retry()/fail() closures below.
    """
    if build.simplified_result is None:
      # The build hasn't completed yet.
      return
    assert job.result is None
    assert job.build is not None
    job.result = build.result
    # Warning: This code assumes that steps do not abort build on failure.
    failed_steps = list(set(
        step.name for step in build.steps if step.simplified_result is False
        ) - self.ignored_steps)
    # If the failed steps are only ignored steps like update_scripts or
    # cleanup_temp, still consider the job as a success. As such, do not use
    # build.result.
    if (not failed_steps and
        all(build.steps[s].simplified_result for s in job.tests
            if s in build.steps.keys)):
      job.result = buildbot_json.SUCCESS
    # Signal to the dashboard a try job completed.
    info = {
      'build': build.number,
      'builder': job.builder,
      'duration': build.duration,
      'job_name': job.name,
      'result': job.result,
      'revision': job.revision,
      'url': self._build_status_url(job),
    }
    self.send_status(pending, info)
    if job.get_state() != base.FAILED:
      assert not failed_steps
      logging.info(u'Try job status for %s on %s: %s\n%s' % (
          job.name,
          job.builder,
          job.result,
          self._build_status_url(job)))
      return
    msg = self._error_msg(job.name, job, failed_steps)
    quality = self._get_quality(job.builder, int(job.revision))
    def retry(msg2, tests=None):
      """Retry a try job. Will use LKGR if quality is bad."""
      if not quality:
        lkgr = self.get_lkgr(job.builder)
        if lkgr is None:
          logging.error('lkgr should never be None.')
          fail('Couldn\'t find a good revision, aborting.')
          return
        job.revision = lkgr
      logging.info(
          'Retrying %s on %s, %s; rev=%s; %s' %
          (job.name, job.builder, str(tests), job.revision, msg2))
      # Record this attempt's failures before resending.
      job.failed_steps = failed_steps
      tests = tests or job.tests
      self._send_jobs(
          pending, [job], True, {job.builder: tests}, u'%s (retry)' % job.name)
    def fail(msg2):
      """Marks the whole verification as failed with an explanation."""
      jobs.error_message = '%s\n%s' % (msg, msg2)
      logging.info(jobs.error_message)
      job.failed_steps = failed_steps
    if 'update' in failed_steps:
      # Look at update quality specifically since it's a special step.
      return fail(
          '\nStep "update" is always a major failure.\n'
          'Look at the try server FAQ for more details.')
    if 'compile' in failed_steps:
      if not job.clobber:
        # Note: this resets previous test failure if there has been on the
        # second previous try. This is fine since a slave could be broken.
        job.clobber = True
        return retry('retry compile with clobber')
      # Compile failed even with a clobber build: the patch is at fault.
      return fail('')
    if quality:
      if job.failed_steps:
        # The job had already failed.
        return fail('')
      return retry('Quality but first try', failed_steps)
    # TODO(maruel): It would make sense to do a clobber build to see if the
    # revision is indeed broken, since this algorithm assumes that the try
    # server is continuously used for recent revisions!
    # The revision looks like it's broken, retry with lkgr instead.
    return retry('No quality, no idea', failed_steps)

  @staticmethod
  def _is_skip_try_job(pending):
    """Returns True if a description contains NOTRY=true."""
    match = re.search(r'^NOTRY=(.*)$', pending.description, re.MULTILINE)
    return match and match.group(1).lower() == 'true'

  def _prepare(self, pending, revision):
    """Prepares the checkout by applying the patch."""
    raise NotImplementedError()

  def _get_quality(self, builder, revision):
    """Gets quality about a revision job."""
    raise NotImplementedError()

  def get_lkgr(self, builder):
    """Gets the last known good revision."""
    raise NotImplementedError()

  def _send_job(self, pending, revision, clobber, builders_and_tests, job_name):
    """Sends a try job."""
    raise NotImplementedError()

  def _update_statuses(self, queue):
    """Updates TryJob status for all the Pending instances in the queue.

    Calls to this function are throttled.
    """
    raise NotImplementedError()
class TryRunnerSvn(TryRunnerBase):
  """Uses SVN to send the try job.

  Keeps a database of steps for each revision for each builder that ever
  passed, to know if it is possible for a step to pass. When unsure, it sends
  an empty build for the said revsion to determine if the revision is simply
  broken.

  TODO(maruel): Ask the main server for details? Still doesn't cover well
  flaky tests.
  """
  def __init__(
      self, context_obj, try_server_url, commit_user,
      builders_and_tests, ignored_steps, solution,
      extra_flags, lkgr):
    super(TryRunnerSvn, self).__init__(
        context_obj, try_server_url, commit_user,
        builders_and_tests, ignored_steps, solution)
    # Read-only view of the try server's buildbot JSON status.
    self.status = buildbot_json.Buildbot(self.try_server_url)
    self.step_db = StepDb(
        self.builders_and_tests.keys(), self.status, self.context.checkout)
    # Extra command line flags forwarded to trychange.TryChange().
    self.extra_flags = extra_flags or []
    # Callable returning an externally provided last known good revision.
    self.lkgr = lkgr

  def _prepare(self, pending, revision):
    """Running from inside update_status(), the patch wasn't applied. Do it now.
    """
    pending.revision = revision
    pending.apply_patch(self.context, True)

  def _get_quality(self, builder, revision):
    """Returns True/False/None step quality for (builder, revision)."""
    steps, _ = self.step_db.revision_quality_builder_steps(builder, revision)
    return steps_quality(steps)

  def get_lkgr(self, builder):
    """Returns the highest of the computed and externally supplied LKGR."""
    # NOTE(review): max(None, x) works on python 2 (None sorts lowest) but
    # would raise on python 3; assumes self.lkgr() at least is usable.
    return max(self.step_db.last_good_revision_builder(builder), self.lkgr())

  def _send_job(self, pending, revision, clobber, builders_and_tests, job_name):
    """Sends a try job."""
    assert revision
    # Build the trychange command line.
    cmd = [
        '--no_search',
        '--revision', '%s@%s' % (self.solution, revision),
        '--name', job_name,
        '--user', self.commit_user.split('@', 1)[0],
        '--email', self.commit_user,
        '--rietveld_url', self._patch_url(pending),
        '--issue', str(pending.issue),
        '--patchset', str(pending.patchset)
    ]
    cmd.extend(self.extra_flags)
    # One --bot flag per builder, with an optional test filter.
    for builder in sorted(builders_and_tests):
      cmd.append('--bot')
      tests = builders_and_tests[builder]
      if tests:
        cmd.append('%s:%s' % (builder, ','.join(tests)))
      else:
        cmd.append(builder)
    if clobber:
      cmd.append('--clobber')
    # TODO(maruel): use GitChange when relevant.
    change = presubmit_support.SvnChange(
        job_name,
        pending.description,
        self.context.checkout.project_path,
        [('M', f) for f in pending.files],
        pending.issue,
        pending.patchset,
        pending.owner)
    prev_dir = os.getcwd()
    try:
      # trychange expects to run from inside the checkout.
      os.chdir(self.context.checkout.project_path)
      trychange.TryChange(
          cmd,
          change,
          swallow_exception=True)
    except SystemExit as e:
      # trychange calls sys.exit() on failure; convert to DiscardPending.
      logging.error(
          '_send_job(%s, %s, %s, %s, %s) failed!' % (
              pending.pending_name(), revision, clobber, builders_and_tests,
              job_name))
      raise base.DiscardPending(
          pending,
          'Failed to send try job %s: %s' % (job_name, e))
    finally:
      os.chdir(prev_dir)

  def _reset_cache(self, queue):
    """Resets the cache of self.status and self.step_db so the next requests
    are more efficient.
    """
    self.status.discard()
    # Jobs still waiting for a result; these drive what must be fetched.
    jobs_to_update = []
    for _, jobs in self.loop(queue, TryJobs, True):
      jobs_to_update.extend(
          job for job in jobs.try_jobs if job.get_state() == base.PROCESSING)
    # First determine what data is needed.
    builds_to_cache = {}
    if self.step_db.need_full():
      logging.info('Fetching all try jobs status to fetch good revisions')
      builders_to_cache = self.builders_and_tests.keys()
    else:
      builders_to_cache = set()
      for job in jobs_to_update:
        if job.build is None:
          # Build not found yet: the whole builder must be scanned.
          builders_to_cache.add(job.builder)
        else:
          if job.get_state() == base.PROCESSING:
            builds_to_cache.setdefault(job.builder, []).append(job.build)
    # Simplify testing.
    builders_to_cache = sorted(builders_to_cache)
    # Reduce the number of requests by caching all the needed builders in one
    # shot when some jobs weren't started yet.
    if builders_to_cache:
      self.status.builders.cache_partial(builders_to_cache)
      for builder in builders_to_cache:
        self.status.builders[builder].builds.cache()
        # Filter out jobs that were retrieved.
        if builder in builds_to_cache:
          del builds_to_cache[builder]
    # Cache remaining builds. Sort to make testing simpler.
    for builder, builds in sorted(
        builds_to_cache.iteritems(), key=lambda x: x[0]):
      self.status.builders[builder].builds.cache_partial(builds)

  def _update_statuses(self, queue):
    """Refreshes caches then updates every still-processing job in |queue|."""
    self._reset_cache(queue)
    for pending, jobs in self.loop(queue, TryJobs, True):
      for job in jobs.try_jobs:
        if job.get_state() != base.PROCESSING:
          continue
        self._update_status(pending, jobs, job)

  def _update_status(self, pending, jobs, job):
    """There's one TryJob per builder."""
    # TODO(maruel): There should be differentiation when there's multiple
    # jobs for a single builder.
    build = None
    try:
      if job.build is None:
        # The job has no known build yet; search the builder for it.
        build = self._find_job(job)
        if build:
          # Signal a try job was found.
          info = {
            'build': build.number,
            'builder': job.builder,
            'job_name': job.name,
            'revision': job.revision,
            'url': self._build_status_url(job),
          }
          self.send_status(pending, info)
      else:
        try:
          build = self.status.builders[job.builder].builds[job.build]
        except KeyError:
          # May happen when there is a huge backlog and the build is not
          # cached anymore.
          build = None
    except urllib2.HTTPError as e:
      # Transient try server error; try again on the next poll.
      logging.error(str(e))
      return
    if build is not None:
      self._handle_try_job(pending, jobs, job, build)
    else:
      # A job needs to be sent again if it has been sent more than
      # self.lost_try_job_delay ago.
      builder = self.status.builders[job.builder]
      pending_builds = builder.data.get('pendingBuilds', 0)
      if (time.time() - job.sent) > self.lost_try_job_delay:
        if pending_builds:
          job_names = [
              data.get('reason', '') for data in builder.pending_builds.data
          ]
          if job.name in job_names:
            # It's pending, move on.
            return
        # The job went to /dev/null. For example, the master may have
        # restarted, the svn server may have a fluke, network may have had a
        # short downtime, etc. Delete the previous job.
        # Resend exactly the same job.
        tests = job.tests
        if not tests:
          if not job.builder in self.builders_and_tests:
            # This means the builder was removed. Skip it.
            logging.warn(
                ('Wanted to retry %s but it\'s not a requirement anymore. '
                 'Ignoring it!') % job.builder)
            job.result = buildbot_json.SKIPPED
            return
          tests = self.builders_and_tests[job.builder]
        self._send_jobs(
            pending,
            [job],
            True,
            {job.builder:tests},
            u'%s (previous was lost)' % job.name)

  def _find_job(self, job):
    """Searches on the try server if the try job for |job| has started."""
    revision = '%s@%s' % (self.solution, job.revision)
    # TODO(maruel): Strip this off.
    job_name = job.name.split(':', 1)[-1]
    logging.debug('Searching for job.reason = %s @ %s' % (job_name, revision))
    for build in self.status.builders[job.builder].builds:
      blame = build.data.get('blame', [])
      logging.debug(
          'Build.reason = %s @ %s; blame: %s' % (
              build.reason, build.revision, ','.join(blame)))
      # Match on reason, revision and a single-author blame by commit_user.
      if (build.reason == job_name and
          str(build.revision) == revision and
          len(blame) == 1 and
          blame[0] == self.commit_user):
        # Note the build number to remember it started.
        logging.info('Found build %d for job %s' % (build.number, job_name))
        job.build = build.number
        return build
    return None

  def _patch_url(self, pending):
    """Rietveld URL from which the raw diff for |pending| can be downloaded."""
    return ('%s/download/issue%d_%d.diff' %
            (self.context.rietveld.url, pending.issue, pending.patchset))