| # coding=utf8 |
| # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| """Sends patches to the Try server and reads back results. |
| |
| - TryJobs contains TryJob, one per try job on a builder. |
| - TryRunnerBase contains the common logic to send try jobs and responds to the |
| try job results. |
| - TryRunnerSvn uses svn plus /json polling on the try server for status updates. |
| """ |
| |
| import logging |
| import os |
| import re |
| import time |
| import urllib2 |
| |
| import find_depot_tools # pylint: disable=W0611 |
| import presubmit_support |
| import trychange |
| |
| import buildbot_json |
| import model |
| from verification import base |
| |
| |
# We don't want to have trychange use gcl so block it.
trychange.gcl = None
# Hack out trychange logging.info(): reroute trychange to a dedicated logger
# capped at WARNING so its routine chatter doesn't pollute our logs.
trychange.logging = logging.getLogger('trychange')
trychange.logging.setLevel(logging.WARNING)
| |
| |
def or_3_way(a, b):
  """Returns highest value, where True > False > None.

  Args:
    a, b: True, False or None.

  Returns:
    The maximum of |a| and |b| under the ordering True > False > None.
  """
  # Treat None as "no data" on either side. The previous implementation
  # (None if a == b == None else (a or b)) was asymmetric: or_3_way(False,
  # None) returned None while or_3_way(None, False) returned False, losing a
  # recorded failure and contradicting the documented ordering.
  if a is None:
    return b
  if b is None:
    return a
  return a or b
| |
| |
def parse_gclient_rev(rev):
  """Returns the absolute number of a gclient revision.

  Strips the leading 'solution@' part, if any; None passes through unchanged.
  """
  if rev is None:
    return None
  # rpartition yields everything after the last '@', or the whole string when
  # there is no '@' at all.
  return str(rev).rpartition('@')[2]
| |
| |
def unambiguous_revision(checkout, revision):
  """Returns if a revision is unambiguous for the checkout.

  HEAD, date or branch name are ambiguous.
  """
  rev = parse_gclient_rev(revision)
  if not rev:
    return False
  checkout_kind = checkout.__class__.__name__
  if 'Svn' in checkout_kind and rev.isdigit():
    # GitSvn should accept revision numbers?
    return True
  # A git hash: 5-20 lowercase hex characters.
  return bool('Git' in checkout_kind and re.match(r'^[a-f0-9]{5,20}$', rev))
| |
| |
class TryJob(model.PersistentMixIn):
  """A single try job request for a pending commit, one per builder.

  This data can be regenerated by parsing all the try job names but it is a bit
  hard on the try server.

  TODO(maruel): Should use __getstate__(), __setstate__() and __reduce__().
  """
  builder = unicode
  build = (None, int)
  revision = (None, int)
  result = (None, int)
  sent = float
  failed_steps = list
  clobber = bool
  name = (None, unicode)
  # Number of retries for this configuration.
  tries = int
  tests = list

  def __init__(self, **kwargs):
    # Default the send timestamp to "now" unless the caller provided one.
    if 'sent' not in kwargs:
      kwargs['sent'] = time.time()
    super(TryJob, self).__init__(**kwargs)

  def get_state(self):
    """Maps the buildbot result code onto a base.* verification state."""
    completed_good = (
        buildbot_json.SUCCESS, buildbot_json.WARNINGS, buildbot_json.SKIPPED)
    completed_bad = (
        buildbot_json.FAILURE, buildbot_json.EXCEPTION, buildbot_json.RETRY)
    if self.result in completed_good:
      return base.SUCCEEDED
    if self.result in completed_bad:
      return base.FAILED
    # No result yet means the job hasn't completed.
    assert self.result is None
    return base.PROCESSING
| |
| |
class TryJobs(base.IVerifierStatus):
  """A set of try jobs that were sent for a specific patch."""
  # An array of TryJob objects.
  try_jobs = list
  # When NOTRY=true is specified.
  skipped = bool

  def get_state(self):
    """Combines the states of all jobs; max() picks per base.* ordering."""
    if self.skipped:
      return base.SUCCEEDED
    if not self.try_jobs:
      return base.PROCESSING
    states = set()
    for job in self.try_jobs:
      states.add(job.get_state())
    assert states.issubset(base.VALID_STATES)
    return max(states)

  def why_not(self):
    pass
| |
| |
def steps_quality(steps):
  """Returns True when no step in |steps| is a confirmed failure.

  Returns None when there is no step data at all.
  """
  if not steps:
    return None
  for result in steps:
    if result not in (True, None):
      return False
  return True
| |
| |
class StepDb(object):
  """Calculate statistics about all steps for each revisions.

  Aggregates per-step results across builds so callers can tell whether a
  failing step is broken at a given revision or merely flaky.
  """
  # NOTE(review): not referenced in this file; presumably a bound on how many
  # builds are cached per builder — confirm against callers.
  max_cache = 200

  def __init__(self, builders, buildbot, checkout):
    """Args:
      builders: Builder names to consider.
      buildbot: buildbot_json.Buildbot instance used as the data source.
      checkout: Checkout instance; only its class name is used, to tell svn
          from git revisions.
    """
    # Makes the first need_full() call return True.
    self._need_full = True
    self.builders = builders
    self.buildbot = buildbot
    self.checkout = checkout

  def need_full(self):
    """Returns True on the first call only.

    Signals the caller to do one full status fetch before incremental updates.
    """
    result = self._need_full
    self._need_full = False
    return result

  def revision_quality_builder_steps(self, builder, revision):
    """Calculates the occurrence of a successful step execution, for a specific
    builder, for builds on a specific revision.

    The return value is a tuple of two elements:
    1. array of True/False/None, one value per step in a build. True means the
       step passed at least once. False means the step always failed. None
       means the step never ran for this revision on this builder.
    2. Number of builds that ran on this builder at this revision.
    """
    revision = str(revision)
    steps = None
    nb_builds = 0
    for build in self.buildbot.builders[builder].builds.cached_children:
      if parse_gclient_rev(build.revision) != revision:
        continue
      nb_builds += 1
      # A build whose step count differs from what was accumulated so far
      # can't be merged index-by-index; warn and skip it. NOTE(review): such a
      # build was still counted in nb_builds above.
      if not (not steps or len(steps) == len(build.steps)):
        logging.warn('Invalid build %s' % build)
        continue
      if not steps or len(steps) != len(build.steps):
        # If the number of steps changed after a master restart, we need to
        # ditch the previous steps.
        # One workaround is to key by name but it's not worth the effort here,
        # the worst case is that previous builds that could be considered good
        # will be ignored, making it impossible to determine a lkgr.
        steps = [None] * len(build.steps)
      for step in build.steps:
        # Merge this build's step result into the aggregate (True wins over
        # False, False over None).
        steps[step.number] = or_3_way(
            steps[step.number], step.simplified_result)
    return steps, nb_builds

  def last_good_revision_builder(self, builder):
    """Returns LKGR for this builder or None if no revision was found.

    For a single revision, for each step, make sure step either passed at least
    once or never ran.

    For example, if build 1 has [True, True, False, None] and build 2 has [True,
    False, True, None], the revision is known to be good since each step run
    either succeeded one time or never ran.
    """
    # Maps revision string -> per-step aggregated result (True/False/None).
    state = {}
    for build in self.buildbot.builders[builder].builds.cached_children:
      if not unambiguous_revision(self.checkout, build.revision):
        # Ignore all builds that doesn't use revision numbers. It could be
        # instead svn date format {2011-01-30}, 'HEAD', 'BASE', etc.
        continue
      build_rev = parse_gclient_rev(build.revision)
      state.setdefault(build_rev, [None] * len(build.steps))
      for step in build.steps:
        # Guard against step lists that grew (e.g. after a master restart)
        # relative to the first build seen for this revision.
        if len(state[build_rev]) <= step.number:
          continue
        state[build_rev][step.number] = or_3_way(
            state[build_rev][step.number],
            step.simplified_result)

    # Keep only revisions where every recorded step either passed at least
    # once or never ran; the newest such revision is the LKGR. Returns None
    # below when no revision qualifies.
    revisions = sorted(
        int(revision) for revision in state
        if (all(v in (True, None) for v in state[revision])))
    if not revisions:
      return None
    return revisions[-1]
| |
| |
class TryRunnerBase(base.VerifierCheckout):
  """Stateless communication with a try server.

  Sends try jobs and reads try job status.

  Analysis goes as following:
  - compile step is not flaky. compile.py already takes care of most flakiness
    and clobber build is done by default. If compile step fails, try again with
    clobber=True
  - test steps are flaky and can be retried as necessary.
  """
  name = 'try server'

  # A try job sent this long ago and that hasn't started yet is deemed to be
  # lost.
  lost_try_job_delay = 15*60

  # Only updates a job status once every 60 seconds.
  update_latency = 60

  def __init__(
      self, context_obj, try_server_url, commit_user,
      builders_and_tests, ignored_steps, solution):
    """Args:
      context_obj: Context object exposing at least .checkout and .rietveld.
      try_server_url: Base url of the try server.
      commit_user: Email used to send jobs; also used to recognize our builds.
      builders_and_tests: dict {builder: [test, ...]} of jobs to trigger.
      ignored_steps: Step names whose failure never fails a job.
      solution: gclient solution name, used to build 'solution@rev' strings.
    """
    super(TryRunnerBase, self).__init__(context_obj)
    self.commit_user = commit_user
    self.try_server_url = try_server_url
    self.builders_and_tests = builders_and_tests
    self.ignored_steps = set(ignored_steps)
    # Start in the past so the first update_status() is not throttled.
    self.last_update = time.time() - self.update_latency
    self.solution = solution

  def verify(self, pending):
    """Sends a try job to the try server and returns a TryJob list."""
    jobs = pending.verifications.setdefault(self.name, TryJobs())
    if jobs.try_jobs:
      # Jobs were already triggered for this patch; rely on the normal
      # polling/lost-job logic instead of sending duplicates.
      logging.warning(
          'Already tried jobs. Let it go. At worst, it\'ll time out soon.')
      return

    jobs.try_jobs = jobs.try_jobs or []
    if self._is_skip_try_job(pending):
      # Do not run try job for it.
      jobs.skipped = True
      return

    # One TryJob per configured builder, all at the pending revision, no
    # clobber on the first attempt.
    new_jobs = [
      TryJob(
          builder=builder,
          tests=self.builders_and_tests[builder],
          revision=pending.revision,
          clobber=False)
      for builder in sorted(self.builders_and_tests)
    ]
    jobs.try_jobs.extend(new_jobs)
    self._send_jobs(
        pending,
        new_jobs,
        False,
        self.builders_and_tests,
        unicode(pending.pending_name()))
    # Slightly postpone next check.
    self.last_update = min(
        time.time(), self.last_update + (self.update_latency / 4))

  def update_status(self, queue):
    """Grabs the current status of all try jobs and update self.queue.

    Note: it would be more efficient to be event based.
    """
    if not queue:
      logging.debug('The list is empty, nothing to do')
      return

    # Throttle: at most one poll per update_latency seconds.
    if time.time() - self.last_update < self.update_latency:
      logging.debug('TS: Throttling updates')
      return
    self.last_update = time.time()

    self._update_statuses(queue)

  def _send_jobs(
      self, pending, jobs, need_prepare, builders_and_tests, job_name):
    """Prepares the TryJobs instance |jobs| to send try jobs to the try server.

    Sending try jobs is deferred to self._send_job().

    Arguments:
    - pending: pending_manager.Pending instance.
    - jobs: List of TryJob instances to be executed.
    - need_prepare: The checkout needs to have the patch applied, e.g. this
      function is called from within update_status().
    - builders_and_tests: dict('builder': ['test1', 'test2']) for try jobs to
      run. Can be self.builders_and_tests or a smaller subset when retrying
      jobs.
    - job_name: Job name to use, may have suffix like "retry".

    Raises:
      base.DiscardPending when a job exceeded its retry budget.
    """
    for job in jobs:
      job.tries = job.tries or 0
      job.tries += 1
      # Hard cap on retries per configuration.
      if job.tries > 4:
        raise base.DiscardPending(
            pending,
            ('The commit queue went berserk retrying too often for a\n'
             'seemingly flaky test. Builder is %s, revision is %s, job name\n'
             'was %s.') % (job.builder, job.revision, job_name))

    # All jobs in a batch must target distinct builders, a single revision and
    # share the clobber flag; _send_job() sends them as one request.
    builders = sorted(job.builder for job in jobs)
    assert len(set(builders)) == len(builders)

    revision = set(job.revision for job in jobs)
    assert len(revision) == 1
    revision = revision.pop()

    clobber = set(job.clobber for job in jobs)
    assert len(clobber) == 1
    clobber = clobber.pop()

    # Reset per-attempt state before (re)sending.
    for job in jobs:
      job.result = None
      job.build = None
      job.name = job_name
      job.tests = builders_and_tests[job.builder]

    if need_prepare:
      self._prepare(pending, revision)
    self._send_job(pending, revision, clobber, builders_and_tests, job_name)
    for builder in builders:
      # Signal a new try job was sent.
      # NOTE(review): |job| below is the leftover loop variable from the loop
      # above, not a per-builder lookup; harmless since all jobs share the
      # same clobber value (asserted above), but plain |clobber| would be
      # clearer.
      info = {
        'builder': builder,
        'clobber': job.clobber,
        'job_name': job_name,
        'revision': revision,
      }
      self.send_status(pending, info)
    for job in jobs:
      job.sent = time.time()

  def _build_status_url(self, job):
    """Html url for this try job."""
    assert job.build is not None, str(job)
    return '%s/buildstatus?builder=%s&number=%s' % (
        self.try_server_url.rstrip('/'), job.builder, job.build)

  def _error_msg(self, name, job, failed_steps):
    """Constructs the error message."""
    def steps_to_str(steps):
      # Renders a step list as 'step "x"' / 'steps "x, y"' / ''.
      if len(steps) > 1:
        return 'steps "%s"' % ', '.join(steps)
      elif steps:
        return 'step "%s"' % steps[0]
      else:
        return ''

    msg = u'Try job failure for %s on %s for %s' % (
        name, job.builder, steps_to_str(failed_steps))
    if job.clobber:
      msg += ' (clobber build)'
    msg += '.'
    if job.failed_steps:
      msg += u'\nIt\'s a second try, previously, %s failed.' % (
          steps_to_str(job.failed_steps))
    msg += '\n%s' % self._build_status_url(job)
    logging.info(msg)
    return msg

  def _handle_try_job(self, pending, jobs, job, build):
    """Determines if the try job is a good signal to commit the patch."""
    if build.simplified_result is None:
      # The build hasn't completed yet.
      return
    assert job.result is None
    assert job.build is not None
    job.result = build.result
    # Warning: This code assumes that steps do not abort build on failure.
    failed_steps = list(set(
        step.name for step in build.steps if step.simplified_result is False
        ) - self.ignored_steps)
    # If the failed steps are only ignored steps like update_scripts or
    # cleanup_temp, still consider the job as a success. As such, do not use
    # build.result.
    # NOTE(review): build.steps.keys (no call) is presumably a property on the
    # buildbot_json container, not dict.keys() — confirm.
    if (not failed_steps and
        all(build.steps[s].simplified_result for s in job.tests
          if s in build.steps.keys)):
      job.result = buildbot_json.SUCCESS

    # Signal to the dashboard a try job completed.
    info = {
      'build': build.number,
      'builder': job.builder,
      'duration': build.duration,
      'job_name': job.name,
      'result': job.result,
      'revision': job.revision,
      'url': self._build_status_url(job),
    }
    self.send_status(pending, info)

    if job.get_state() != base.FAILED:
      assert not failed_steps
      logging.info(u'Try job status for %s on %s: %s\n%s' % (
          job.name,
          job.builder,
          job.result,
          self._build_status_url(job)))
      return

    msg = self._error_msg(job.name, job, failed_steps)
    # quality is True/False/None: whether the failing steps are known to be
    # able to pass at this revision on this builder.
    quality = self._get_quality(job.builder, int(job.revision))

    def retry(msg2, tests=None):
      """Retry a try job. Will use LKGR if quality is bad."""
      if not quality:
        lkgr = self.get_lkgr(job.builder)
        if lkgr is None:
          logging.error('lkgr should never be None.')
          fail('Couldn\'t find a good revision, aborting.')
          return
        job.revision = lkgr
      logging.info(
          'Retrying %s on %s, %s; rev=%s; %s' %
          (job.name, job.builder, str(tests), job.revision, msg2))
      job.failed_steps = failed_steps
      # Default to retrying the job's own test list.
      tests = tests or job.tests
      self._send_jobs(
          pending, [job], True, {job.builder: tests}, u'%s (retry)' % job.name)

    def fail(msg2):
      """Gives up on this patch; records the error message on |jobs|."""
      jobs.error_message = '%s\n%s' % (msg, msg2)
      logging.info(jobs.error_message)
      job.failed_steps = failed_steps

    if 'update' in failed_steps:
      # Look at update quality specifically since it's a special step.
      return fail(
          '\nStep "update" is always a major failure.\n'
          'Look at the try server FAQ for more details.')

    if 'compile' in failed_steps:
      if not job.clobber:
        # Note: this resets previous test failure if there has been on the
        # second previous try. This is fine since a slave could be broken.
        job.clobber = True
        return retry('retry compile with clobber')

      return fail('')

    if quality:
      if job.failed_steps:
        # The job had already failed.
        return fail('')

      return retry('Quality but first try', failed_steps)

    # TODO(maruel): It would make sense to do a clobber build to see if the
    # revision is indeed broken, since this algorithm assumes that the try
    # server is continuously used for recent revisions!
    # The revision looks like it's broken, retry with lkgr instead.
    return retry('No quality, no idea', failed_steps)

  @staticmethod
  def _is_skip_try_job(pending):
    """Returns True if a description contains NOTRY=true.

    Returns a falsy value (None or False) otherwise; callers only test
    truthiness.
    """
    match = re.search(r'^NOTRY=(.*)$', pending.description, re.MULTILINE)
    return match and match.group(1).lower() == 'true'

  def _prepare(self, pending, revision):
    """Prepares the checkout by applying the patch."""
    raise NotImplementedError()

  def _get_quality(self, builder, revision):
    """Gets quality about a revision job."""
    raise NotImplementedError()

  def get_lkgr(self, builder):
    """Gets the last known good revision."""
    raise NotImplementedError()

  def _send_job(self, pending, revision, clobber, builders_and_tests, job_name):
    """Sends a try job."""
    raise NotImplementedError()

  def _update_statuses(self, queue):
    """Updates TryJob status for all the Pending instances in the queue.

    Calls to this function are throttled.
    """
    raise NotImplementedError()
| |
| |
class TryRunnerSvn(TryRunnerBase):
  """Uses SVN to send the try job.

  Keeps a database of steps for each revision for each builder that ever passed,
  to know if it is possible for a step to pass. When unsure, it sends an empty
  build for the said revision to determine if the revision is simply broken.

  TODO(maruel): Ask the main server for details? Still doesn't cover well flaky
  tests.
  """
  def __init__(
      self, context_obj, try_server_url, commit_user,
      builders_and_tests, ignored_steps, solution,
      extra_flags, lkgr):
    """Args (beyond TryRunnerBase's):
      extra_flags: Extra command line flags passed to trychange, or None.
      lkgr: Callable returning an externally-supplied last known good revision.
    """
    super(TryRunnerSvn, self).__init__(
        context_obj, try_server_url, commit_user,
        builders_and_tests, ignored_steps, solution)
    self.status = buildbot_json.Buildbot(self.try_server_url)
    self.step_db = StepDb(
        self.builders_and_tests.keys(), self.status, self.context.checkout)
    self.extra_flags = extra_flags or []
    self.lkgr = lkgr

  def _prepare(self, pending, revision):
    """Running from inside update_status(), the patch wasn't applied. Do it now.
    """
    pending.revision = revision
    pending.apply_patch(self.context, True)

  def _get_quality(self, builder, revision):
    # See steps_quality(): True/False for known data, None when unknown.
    steps, _ = self.step_db.revision_quality_builder_steps(builder, revision)
    return steps_quality(steps)

  def get_lkgr(self, builder):
    # Best of the builder's own LKGR and the externally-supplied one.
    # NOTE(review): relies on Python 2 ordering where max(None, x) == x for
    # any int x; returns None only when both sources yield None.
    return max(self.step_db.last_good_revision_builder(builder), self.lkgr())

  def _send_job(self, pending, revision, clobber, builders_and_tests, job_name):
    """Sends a try job."""
    assert revision
    cmd = [
        '--no_search',
        '--revision', '%s@%s' % (self.solution, revision),
        '--name', job_name,
        '--user', self.commit_user.split('@', 1)[0],
        '--email', self.commit_user,
        '--rietveld_url', self._patch_url(pending),
        '--issue', str(pending.issue),
        '--patchset', str(pending.patchset)
    ]
    cmd.extend(self.extra_flags)
    # One --bot flag per builder, optionally with its filtered test list.
    for builder in sorted(builders_and_tests):
      cmd.append('--bot')
      tests = builders_and_tests[builder]
      if tests:
        cmd.append('%s:%s' % (builder, ','.join(tests)))
      else:
        cmd.append(builder)
    if clobber:
      cmd.append('--clobber')
    # TODO(maruel): use GitChange when relevant.
    change = presubmit_support.SvnChange(
        job_name,
        pending.description,
        self.context.checkout.project_path,
        [('M', f) for f in pending.files],
        pending.issue,
        pending.patchset,
        pending.owner)
    # trychange needs to run from inside the checkout; restore cwd afterwards.
    prev_dir = os.getcwd()
    try:
      os.chdir(self.context.checkout.project_path)
      trychange.TryChange(
          cmd,
          change,
          swallow_exception=True)
    except SystemExit as e:
      # trychange.TryChange can still sys.exit() on failure despite
      # swallow_exception; translate that into a DiscardPending.
      logging.error(
          '_send_job(%s, %s, %s, %s, %s) failed!' % (
            pending.pending_name(), revision, clobber, builders_and_tests,
            job_name))
      raise base.DiscardPending(
          pending,
          'Failed to send try job %s: %s' % (job_name, e))
    finally:
      os.chdir(prev_dir)

  def _reset_cache(self, queue):
    """Resets the cache of self.status and self.step_db so the next requests
    are more efficient.
    """
    self.status.discard()

    # All jobs still in flight across every pending patch.
    jobs_to_update = []
    for _, jobs in self.loop(queue, TryJobs, True):
      jobs_to_update.extend(
          job for job in jobs.try_jobs if job.get_state() == base.PROCESSING)

    # First determine what data is needed.
    builds_to_cache = {}
    if self.step_db.need_full():
      logging.info('Fetching all try jobs status to fetch good revisions')
      builders_to_cache = self.builders_and_tests.keys()
    else:
      builders_to_cache = set()
      for job in jobs_to_update:
        if job.build is None:
          # Build number unknown: must scan the whole builder to find it.
          builders_to_cache.add(job.builder)
        else:
          if job.get_state() == base.PROCESSING:
            # Build number known: only that build needs to be refetched.
            builds_to_cache.setdefault(job.builder, []).append(job.build)

    # Simplify testing.
    builders_to_cache = sorted(builders_to_cache)

    # Reduce the number of requests by caching all the needed builders in one
    # shot when some jobs weren't started yet.
    if builders_to_cache:
      self.status.builders.cache_partial(builders_to_cache)

    for builder in builders_to_cache:
      self.status.builders[builder].builds.cache()
      # Filter out jobs that were retrieved.
      if builder in builds_to_cache:
        del builds_to_cache[builder]

    # Cache remaining builds. Sort to make testing simpler.
    for builder, builds in sorted(
        builds_to_cache.iteritems(), key=lambda x: x[0]):
      self.status.builders[builder].builds.cache_partial(builds)

  def _update_statuses(self, queue):
    # Refresh the buildbot caches once, then update each in-flight job.
    self._reset_cache(queue)
    for pending, jobs in self.loop(queue, TryJobs, True):
      for job in jobs.try_jobs:
        if job.get_state() != base.PROCESSING:
          continue
        self._update_status(pending, jobs, job)

  def _update_status(self, pending, jobs, job):
    """There's one TryJob per builder."""
    # TODO(maruel): There should be differentiation when there's multiple
    # jobs for a single builder.
    build = None
    try:
      if job.build is None:
        # The job was sent but no build was associated to it yet; search the
        # builder's builds for it.
        build = self._find_job(job)
        if build:
          # Signal a try job was found.
          info = {
            'build': build.number,
            'builder': job.builder,
            'job_name': job.name,
            'revision': job.revision,
            'url': self._build_status_url(job),
          }
          self.send_status(pending, info)
      else:
        try:
          build = self.status.builders[job.builder].builds[job.build]
        except KeyError:
          # May happen when there is a huge backlog and the build is not
          # cached anymore.
          build = None
    except urllib2.HTTPError as e:
      # Transient try server trouble; leave the job as-is and retry on the
      # next poll.
      logging.error(str(e))
      return

    if build is not None:
      self._handle_try_job(pending, jobs, job, build)
    else:
      # A job needs to be sent again if it has been sent more than
      # self.lost_try_job_delay ago.
      builder = self.status.builders[job.builder]
      pending_builds = builder.data.get('pendingBuilds', 0)
      if (time.time() - job.sent) > self.lost_try_job_delay:
        if pending_builds:
          # If our job is still sitting in the builder's pending queue, it
          # isn't lost; keep waiting.
          job_names = [
            data.get('reason', '') for data in builder.pending_builds.data
          ]
          if job.name in job_names:
            # It's pending, move on.
            return

        # The job went to /dev/null. For example, the master may have
        # restarted, the svn server may have a fluke, network may have had a
        # short downtime, etc. Delete the previous job.
        # Resend exactly the same job.
        tests = job.tests
        if not tests:
          if not job.builder in self.builders_and_tests:
            # This means the builder was removed. Skip it.
            logging.warn(
                ( 'Wanted to retry %s but it\'s not a requirement anymore. '
                  'Ignoring it!') % job.builder)
            job.result = buildbot_json.SKIPPED
            return

          tests = self.builders_and_tests[job.builder]
        self._send_jobs(
            pending,
            [job],
            True,
            {job.builder:tests},
            u'%s (previous was lost)' % job.name)

  def _find_job(self, job):
    """Searches on the try server if the try job for |job| has started.

    Matches on the build's reason, revision and single-entry blame list;
    records the build number on |job| when found.
    """
    revision = '%s@%s' % (self.solution, job.revision)
    # TODO(maruel): Strip this off.
    job_name = job.name.split(':', 1)[-1]
    logging.debug('Searching for job.reason = %s @ %s' % (job_name, revision))
    for build in self.status.builders[job.builder].builds:
      blame = build.data.get('blame', [])
      logging.debug(
          'Build.reason = %s @ %s; blame: %s' % (
            build.reason, build.revision, ','.join(blame)))
      # NOTE(review): build.revision is compared against the 'solution@rev'
      # form, so it is presumably stored that way on the try server — confirm.
      if (build.reason == job_name and
          str(build.revision) == revision and
          len(blame) == 1 and
          blame[0] == self.commit_user):
        # Note the build number to remember it started.
        logging.info('Found build %d for job %s' % (build.number, job_name))
        job.build = build.number
        return build
    return None

  def _patch_url(self, pending):
    """Rietveld url of the raw diff for this issue/patchset."""
    return ('%s/download/issue%d_%d.diff' %
            (self.context.rietveld.url, pending.issue, pending.patchset))