blob: ef07d27ca94ed2c19b58a3e099b6abf3e0f56bde [file] [log] [blame]
# coding=utf8
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Sends patches to the Try server and reads back results.
- TryJobs contains TryJob, one per try job on a builder.
- TryRunnerBase contains the common logic to send try jobs and responds to the
try job results.
- TryRunnerSvn uses svn plus /json polling on the try server for status updates.
"""
import logging
import os
import re
import time
import urllib2
import find_depot_tools # pylint: disable=W0611
import presubmit_support
import trychange
import buildbot_json
import model
from verification import base
# We don't want to have trychange use gcl so block it.
trychange.gcl = None
# Hack out trychange logging.info(): route trychange's logging through a
# dedicated logger capped at WARNING so its chatty info() calls are silenced.
trychange.logging = logging.getLogger('trychange')
trychange.logging.setLevel(logging.WARNING)
def or_3_way(a, b):
  """Returns highest value, where True > False > None.

  Used to merge per-step results across builds: True means the step passed at
  least once, False means it only ever failed, None means it never ran.

  Note: the previous implementation returned `a or b`, which yielded None for
  (False, None) but False for (None, False); this version is symmetric and
  matches the documented ordering.
  """
  if a is None:
    return b
  if b is None:
    return a
  # Both values are booleans here; `or` picks True over False.
  return a or b
def parse_gclient_rev(rev):
  """Returns the bare revision of a gclient-style revision string.

  A gclient revision may be qualified as 'solution@rev'; only the part after
  the last '@' is kept. None passes through unchanged.
  """
  if rev is None:
    return None
  return str(rev).rpartition('@')[2] or str(rev)
def unambiguous_revision(checkout, revision):
  """Returns True if |revision| pins a single commit for this checkout.

  Symbolic references like HEAD, a date or a branch name are ambiguous and
  rejected.
  """
  rev = parse_gclient_rev(revision)
  if not rev:
    return False
  checkout_kind = checkout.__class__.__name__
  # Plain integers are svn revision numbers.
  if 'Svn' in checkout_kind and rev.isdigit():
    # GitSvn should accept revision numbers?
    return True
  # Hex strings of plausible length look like (abbreviated) git hashes.
  if 'Git' in checkout_kind and re.match(r'^[a-f0-9]{5,20}$', rev):
    return True
  return False
class TryJob(model.PersistentMixIn):
  """Represents a try job for a pending commit.

  This data can be regenerated by parsing all the try job names but it is a
  bit hard on the try server.

  TODO(maruel): Should use __getstate__(), __setstate__() and __reduce__().
  """
  builder = unicode
  build = (None, int)
  revision = (None, int)
  result = (None, int)
  sent = float
  failed_steps = list
  clobber = bool
  name = (None, unicode)
  # Number of retries for this configuration.
  tries = int
  tests = list

  def __init__(self, **kwargs):
    # Default the sent timestamp to 'now' unless explicitly provided.
    kwargs.setdefault('sent', time.time())
    super(TryJob, self).__init__(**kwargs)

  def get_state(self):
    """Maps the buildbot result code onto a base.* verification state."""
    completed_ok = (
        buildbot_json.SUCCESS, buildbot_json.WARNINGS, buildbot_json.SKIPPED)
    completed_bad = (
        buildbot_json.FAILURE, buildbot_json.EXCEPTION, buildbot_json.RETRY)
    if self.result in completed_ok:
      return base.SUCCEEDED
    if self.result in completed_bad:
      return base.FAILED
    # No result recorded yet: the job is still pending or running.
    assert self.result == None
    return base.PROCESSING
class TryJobs(base.IVerifierStatus):
  """A set of try jobs that were sent for a specific patch."""
  # An array of TryJob objects.
  try_jobs = list
  # When NOTRY=true is specified.
  skipped = bool

  def get_state(self):
    """Aggregates all jobs' states; the most severe state wins."""
    if self.skipped:
      # NOTRY=true: the verification trivially succeeds.
      return base.SUCCEEDED
    if not self.try_jobs:
      # Nothing was sent yet.
      return base.PROCESSING
    states = set(job.get_state() for job in self.try_jobs)
    assert states.issubset(base.VALID_STATES)
    # States are ordered so max() yields the dominating one.
    return max(states)

  def why_not(self):
    pass
def steps_quality(steps):
  """Folds a list of per-step results into a single quality tri-state.

  Returns None when there is no data at all, True when no step ever
  hard-failed (every entry is True or None), False otherwise.
  """
  if not steps:
    return None
  return not any(result not in (True, None) for result in steps)
class StepDb(object):
  """Calculate statistics about all steps for each revisions.

  Aggregates cached buildbot build results to estimate, per builder and per
  revision, which steps are known to be able to pass.
  """
  # Upper bound intent for cached builds per builder.
  max_cache = 200

  def __init__(self, builders, buildbot, checkout):
    # True until need_full() is called once; triggers an initial full fetch.
    self._need_full = True
    self.builders = builders
    self.buildbot = buildbot
    self.checkout = checkout

  def need_full(self):
    """Returns True exactly once to request an initial full cache fill."""
    result = self._need_full
    self._need_full = False
    return result

  def revision_quality_builder_steps(self, builder, revision):
    """Calculates the occurence of a successful step execution, for a specific
    builder, for builds on a specific revision.

    The return value is a tuple of two elements:
    1. array of True/False/None, one value per step in a build. True means the
       step passed at least once. False means the step always failed. None
       means the step never ran for this revision on this builder.
    2. Number of builds that ran on this builder at this revision.
    """
    revision = str(revision)
    steps = None
    nb_builds = 0
    # Only looks at builds already present in the local cache; no new fetch.
    for build in self.buildbot.builders[builder].builds.cached_children:
      if parse_gclient_rev(build.revision) != revision:
        continue
      nb_builds += 1
      # Skip builds whose step count disagrees with what was accumulated.
      if not (not steps or len(steps) == len(build.steps)):
        logging.warn('Invalid build %s' % build)
        continue
      if not steps or len(steps) != len(build.steps):
        # If the number of steps changed after a master restart, we need to
        # ditch the previous steps.
        # One workaround is to key by name but it's not worth the effort here,
        # the worst case is that previous builds that could be considered good
        # will be ignored, making it impossible to determine a lkgr.
        steps = [None] * len(build.steps)
      for step in build.steps:
        # Merge this build's step result with previous builds' results.
        steps[step.number] = or_3_way(
            steps[step.number], step.simplified_result)
    return steps, nb_builds

  def last_good_revision_builder(self, builder):
    """Returns LKGR for this builder or None if no revision was found.

    For a single revision, for each step, make sure step either passed at
    least once or never ran.

    For example, if build 1 has [True, True, False, None] and build 2 has
    [True, False, True, None], the revision is known to be good since each
    step run either succeeded one time or never ran.
    """
    # Maps revision string -> merged per-step results for that revision.
    state = {}
    for build in self.buildbot.builders[builder].builds.cached_children:
      if not unambiguous_revision(self.checkout, build.revision):
        # Ignore all builds that doesn't use revision numbers. It could be
        # instead svn date format {2011-01-30}, 'HEAD', 'BASE', etc.
        continue
      build_rev = parse_gclient_rev(build.revision)
      state.setdefault(build_rev, [None] * len(build.steps))
      for step in build.steps:
        if len(state[build_rev]) <= step.number:
          # The step list grew since this revision was first seen (e.g. a
          # master restart); ignore out-of-range steps.
          continue
        state[build_rev][step.number] = or_3_way(
            state[build_rev][step.number],
            step.simplified_result)
    # Keep only revisions where every step passed at least once or never ran.
    # NOTE(review): int() assumes numeric (svn) revisions; git hashes passing
    # unambiguous_revision would raise here — presumably only used with svn.
    revisions = sorted(
        int(revision) for revision in state
        if (all(v in (True, None) for v in state[revision])))
    if not revisions:
      return None
    return revisions[-1]
class TryRunnerBase(base.VerifierCheckout):
  """Stateless communication with a try server.

  Sends try jobs and reads try job status.

  Analysis goes as following:
  - compile step is not flaky. compile.py already takes care of most flakiness
    and clobber build is done by default. If compile step fails, try again
    with clobber=True
  - test steps are flaky and can be retried as necessary.
  """
  name = 'try server'

  # A try job sent this long ago and that hasn't started yet is deemed to be
  # lost.
  lost_try_job_delay = 15*60

  # Only updates a job status once every 60 seconds.
  update_latency = 60

  def __init__(
      self, context_obj, try_server_url, commit_user,
      builders_and_tests, ignored_steps, solution):
    super(TryRunnerBase, self).__init__(context_obj)
    self.commit_user = commit_user
    self.try_server_url = try_server_url
    # dict('builder': ['test1', 'test2']) of jobs to trigger per patch.
    self.builders_and_tests = builders_and_tests
    # Step names whose failure does not fail the job (e.g. housekeeping).
    self.ignored_steps = set(ignored_steps)
    # Backdated so the first update_status() call is not throttled.
    self.last_update = time.time() - self.update_latency
    self.solution = solution

  def verify(self, pending):
    """Sends a try job to the try server and returns a TryJob list."""
    jobs = pending.verifications.setdefault(self.name, TryJobs())
    if jobs.try_jobs:
      # Jobs were already sent for this patch; don't double-send.
      logging.warning(
          'Already tried jobs. Let it go. At worst, it\'ll time out soon.')
      return
    jobs.try_jobs = jobs.try_jobs or []
    if self._is_skip_try_job(pending):
      # Do not run try job for it.
      jobs.skipped = True
      return
    # One TryJob per configured builder, at the patch's revision.
    new_jobs = [
        TryJob(
            builder=builder,
            tests=self.builders_and_tests[builder],
            revision=pending.revision,
            clobber=False)
        for builder in sorted(self.builders_and_tests)
    ]
    jobs.try_jobs.extend(new_jobs)
    self._send_jobs(
        pending,
        new_jobs,
        False,
        self.builders_and_tests,
        unicode(pending.pending_name()))
    # Slightly postpone next check.
    self.last_update = min(
        time.time(), self.last_update + (self.update_latency / 4))

  def update_status(self, queue):
    """Grabs the current status of all try jobs and update self.queue.

    Note: it would be more efficient to be event based.
    """
    if not queue:
      logging.debug('The list is empty, nothing to do')
      return
    if time.time() - self.last_update < self.update_latency:
      # Polled less than update_latency seconds ago; skip this round.
      logging.debug('TS: Throttling updates')
      return
    self.last_update = time.time()
    self._update_statuses(queue)

  def _send_jobs(
      self, pending, jobs, need_prepare, builders_and_tests, job_name):
    """Prepares the TryJobs instance |jobs| to send try jobs to the try server.

    Sending try jobs is deferred to self._send_job().

    Arguments:
    - pending: pending_manager.Pending instance.
    - jobs: List of TryJob instances to be executed.
    - need_prepare: The checkout needs to have the patch applied, e.g. this
      function is called from within update_status().
    - builders_and_tests: dict('builder': ['test1', 'test2']) for try jobs to
      run. Can be self.builders_and_tests or a smaller subset when retrying
      jobs.
    - job_name: Job name to use, may have suffix like "retry".

    Raises base.DiscardPending when a job was retried too many times.
    """
    for job in jobs:
      job.tries = job.tries or 0
      job.tries += 1
      # Hard cap on retries; beyond it, give up on the whole patch.
      if job.tries > 4:
        raise base.DiscardPending(
            pending,
            ('The commit queue went berserk retrying too often for a\n'
             'seemingly flaky test. Builder is %s, revision is %s, job name\n'
             'was %s.') % (job.builder, job.revision, job_name))
    builders = sorted(job.builder for job in jobs)
    assert len(set(builders)) == len(builders)
    # All jobs in one batch must share the same revision and clobber flag.
    revision = set(job.revision for job in jobs)
    assert len(revision) == 1
    revision = revision.pop()
    clobber = set(job.clobber for job in jobs)
    assert len(clobber) == 1
    clobber = clobber.pop()
    for job in jobs:
      # Reset per-attempt state before (re)sending.
      job.result = None
      job.build = None
      job.name = job_name
      job.tests = builders_and_tests[job.builder]
    if need_prepare:
      self._prepare(pending, revision)
    self._send_job(pending, revision, clobber, builders_and_tests, job_name)
    for builder in builders:
      # Signal a new try job was sent.
      # NOTE(review): 'job' below is the variable leaked from the loop above,
      # so job.clobber is the last job's flag; it equals 'clobber' thanks to
      # the uniformity assert above, but using 'clobber' would be clearer.
      info = {
        'builder': builder,
        'clobber': job.clobber,
        'job_name': job_name,
        'revision': revision,
      }
      self.send_status(pending, info)
    for job in jobs:
      job.sent = time.time()

  def _build_status_url(self, job):
    """Html url for this try job."""
    assert job.build is not None, str(job)
    return '%s/buildstatus?builder=%s&number=%s' % (
        self.try_server_url.rstrip('/'), job.builder, job.build)

  def _error_msg(self, name, job, failed_steps):
    """Constructs the error message."""
    def steps_to_str(steps):
      # Human-readable rendering of a list of step names.
      if len(steps) > 1:
        return 'steps "%s"' % ', '.join(steps)
      elif steps:
        return 'step "%s"' % steps[0]
      else:
        return ''
    msg = u'Try job failure for %s on %s for %s' % (
        name, job.builder, steps_to_str(failed_steps))
    if job.clobber:
      msg += ' (clobber build)'
    msg += '.'
    if job.failed_steps:
      # job.failed_steps still holds the previous attempt's failures here.
      msg += u'\nIt\'s a second try, previously, %s failed.' % (
          steps_to_str(job.failed_steps))
    msg += '\n%s' % self._build_status_url(job)
    logging.info(msg)
    return msg

  def _handle_try_job(self, pending, jobs, job, build):
    """Determines if the try job is a good signal to commit the patch.

    On failure, decides between retrying (possibly with clobber or at LKGR)
    and failing the patch, via the retry()/fail() closures below.
    """
    if build.simplified_result is None:
      # The build hasn't completed yet.
      return
    assert job.result is None
    assert job.build is not None
    job.result = build.result
    # Warning: This code assumes that steps do not abort build on failure.
    failed_steps = list(set(
        step.name for step in build.steps if step.simplified_result is False
        ) - self.ignored_steps)
    # If the failed steps are only ignored steps like update_scripts or
    # cleanup_temp, still consider the job as a success. As such, do not use
    # build.result.
    if (not failed_steps and
        all(build.steps[s].simplified_result for s in job.tests
            if s in build.steps.keys)):
      job.result = buildbot_json.SUCCESS
    # Signal to the dashboard a try job completed.
    info = {
      'build': build.number,
      'builder': job.builder,
      'duration': build.duration,
      'job_name': job.name,
      'result': job.result,
      'revision': job.revision,
      'url': self._build_status_url(job),
    }
    self.send_status(pending, info)
    if job.get_state() != base.FAILED:
      assert not failed_steps
      logging.info(u'Try job status for %s on %s: %s\n%s' % (
          job.name,
          job.builder,
          job.result,
          self._build_status_url(job)))
      return
    msg = self._error_msg(job.name, job, failed_steps)
    quality = self._get_quality(job.builder, int(job.revision))
    def retry(msg2, tests=None):
      """Retry a try job. Will use LKGR if quality is bad."""
      if not quality:
        lkgr = self.get_lkgr(job.builder)
        if lkgr is None:
          logging.error('lkgr should never be None.')
          fail('Couldn\'t find a good revision, aborting.')
          return
        job.revision = lkgr
      logging.info(
          'Retrying %s on %s, %s; rev=%s; %s' %
          (job.name, job.builder, str(tests), job.revision, msg2))
      # Record this attempt's failures before resending.
      job.failed_steps = failed_steps
      tests = tests or job.tests
      self._send_jobs(
          pending, [job], True, {job.builder: tests}, u'%s (retry)' % job.name)
    def fail(msg2):
      """Marks the whole verification as failed with an explanation."""
      jobs.error_message = '%s\n%s' % (msg, msg2)
      logging.info(jobs.error_message)
      job.failed_steps = failed_steps
    if 'update' in failed_steps:
      # Look at update quality specifically since it's a special step.
      return fail(
          '\nStep "update" is always a major failure.\n'
          'Look at the try server FAQ for more details.')
    if 'compile' in failed_steps:
      if not job.clobber:
        # Note: this resets previous test failure if there has been on the
        # second previous try. This is fine since a slave could be broken.
        job.clobber = True
        return retry('retry compile with clobber')
      # Compile failed even with a clobber build: the patch is at fault.
      return fail('')
    if quality:
      if job.failed_steps:
        # The job had already failed.
        return fail('')
      return retry('Quality but first try', failed_steps)
    # TODO(maruel): It would make sense to do a clobber build to see if the
    # revision is indeed broken, since this algorithm assumes that the try
    # server is continuously used for recent revisions!
    # The revision looks like it's broken, retry with lkgr instead.
    return retry('No quality, no idea', failed_steps)

  @staticmethod
  def _is_skip_try_job(pending):
    """Returns True if a description contains NOTRY=true."""
    match = re.search(r'^NOTRY=(.*)$', pending.description, re.MULTILINE)
    return match and match.group(1).lower() == 'true'

  def _prepare(self, pending, revision):
    """Prepares the checkout by applying the patch."""
    raise NotImplementedError()

  def _get_quality(self, builder, revision):
    """Gets quality about a revision job."""
    raise NotImplementedError()

  def get_lkgr(self, builder):
    """Gets the last known good revision."""
    raise NotImplementedError()

  def _send_job(self, pending, revision, clobber, builders_and_tests, job_name):
    """Sends a try job."""
    raise NotImplementedError()

  def _update_statuses(self, queue):
    """Updates TryJob status for all the Pending instances in the queue.

    Calls to this function are throttled.
    """
    raise NotImplementedError()
class TryRunnerSvn(TryRunnerBase):
  """Uses SVN to send the try job.

  Keeps a database of steps for each revision for each builder that ever
  passed, to know if it is possible for a step to pass. When unsure, it sends
  an empty build for the said revsion to determine if the revision is simply
  broken.

  TODO(maruel): Ask the main server for details? Still doesn't cover well
  flaky tests.
  """
  def __init__(
      self, context_obj, try_server_url, commit_user,
      builders_and_tests, ignored_steps, solution,
      extra_flags, lkgr):
    super(TryRunnerSvn, self).__init__(
        context_obj, try_server_url, commit_user,
        builders_and_tests, ignored_steps, solution)
    # Read-only view of the try server's buildbot JSON status.
    self.status = buildbot_json.Buildbot(self.try_server_url)
    self.step_db = StepDb(
        self.builders_and_tests.keys(), self.status, self.context.checkout)
    # Extra command line flags forwarded to trychange.TryChange().
    self.extra_flags = extra_flags or []
    # Callable returning an externally provided last known good revision.
    self.lkgr = lkgr

  def _prepare(self, pending, revision):
    """Running from inside update_status(), the patch wasn't applied. Do it now.
    """
    pending.revision = revision
    pending.apply_patch(self.context, True)

  def _get_quality(self, builder, revision):
    """Returns True/False/None step quality for (builder, revision)."""
    steps, _ = self.step_db.revision_quality_builder_steps(builder, revision)
    return steps_quality(steps)

  def get_lkgr(self, builder):
    """Returns the highest of the computed and externally supplied LKGR."""
    # NOTE(review): max(None, x) works on python 2 (None sorts lowest) but
    # would raise on python 3; assumes self.lkgr() at least is usable.
    return max(self.step_db.last_good_revision_builder(builder), self.lkgr())

  def _send_job(self, pending, revision, clobber, builders_and_tests, job_name):
    """Sends a try job."""
    assert revision
    # Build the trychange command line.
    cmd = [
        '--no_search',
        '--revision', '%s@%s' % (self.solution, revision),
        '--name', job_name,
        '--user', self.commit_user.split('@', 1)[0],
        '--email', self.commit_user,
        '--rietveld_url', self._patch_url(pending),
        '--issue', str(pending.issue),
        '--patchset', str(pending.patchset)
    ]
    cmd.extend(self.extra_flags)
    # One --bot flag per builder, with an optional test filter.
    for builder in sorted(builders_and_tests):
      cmd.append('--bot')
      tests = builders_and_tests[builder]
      if tests:
        cmd.append('%s:%s' % (builder, ','.join(tests)))
      else:
        cmd.append(builder)
    if clobber:
      cmd.append('--clobber')
    # TODO(maruel): use GitChange when relevant.
    change = presubmit_support.SvnChange(
        job_name,
        pending.description,
        self.context.checkout.project_path,
        [('M', f) for f in pending.files],
        pending.issue,
        pending.patchset,
        pending.owner)
    prev_dir = os.getcwd()
    try:
      # trychange expects to run from inside the checkout.
      os.chdir(self.context.checkout.project_path)
      trychange.TryChange(
          cmd,
          change,
          swallow_exception=True)
    except SystemExit as e:
      # trychange calls sys.exit() on failure; convert to DiscardPending.
      logging.error(
          '_send_job(%s, %s, %s, %s, %s) failed!' % (
              pending.pending_name(), revision, clobber, builders_and_tests,
              job_name))
      raise base.DiscardPending(
          pending,
          'Failed to send try job %s: %s' % (job_name, e))
    finally:
      os.chdir(prev_dir)

  def _reset_cache(self, queue):
    """Resets the cache of self.status and self.step_db so the next requests
    are more efficient.
    """
    self.status.discard()
    # Jobs still waiting for a result; these drive what must be fetched.
    jobs_to_update = []
    for _, jobs in self.loop(queue, TryJobs, True):
      jobs_to_update.extend(
          job for job in jobs.try_jobs if job.get_state() == base.PROCESSING)
    # First determine what data is needed.
    builds_to_cache = {}
    if self.step_db.need_full():
      logging.info('Fetching all try jobs status to fetch good revisions')
      builders_to_cache = self.builders_and_tests.keys()
    else:
      builders_to_cache = set()
      for job in jobs_to_update:
        if job.build is None:
          # Build not found yet: the whole builder must be scanned.
          builders_to_cache.add(job.builder)
        else:
          if job.get_state() == base.PROCESSING:
            builds_to_cache.setdefault(job.builder, []).append(job.build)
    # Simplify testing.
    builders_to_cache = sorted(builders_to_cache)
    # Reduce the number of requests by caching all the needed builders in one
    # shot when some jobs weren't started yet.
    if builders_to_cache:
      self.status.builders.cache_partial(builders_to_cache)
      for builder in builders_to_cache:
        self.status.builders[builder].builds.cache()
        # Filter out jobs that were retrieved.
        if builder in builds_to_cache:
          del builds_to_cache[builder]
    # Cache remaining builds. Sort to make testing simpler.
    for builder, builds in sorted(
        builds_to_cache.iteritems(), key=lambda x: x[0]):
      self.status.builders[builder].builds.cache_partial(builds)

  def _update_statuses(self, queue):
    """Refreshes caches then updates every still-processing job in |queue|."""
    self._reset_cache(queue)
    for pending, jobs in self.loop(queue, TryJobs, True):
      for job in jobs.try_jobs:
        if job.get_state() != base.PROCESSING:
          continue
        self._update_status(pending, jobs, job)

  def _update_status(self, pending, jobs, job):
    """There's one TryJob per builder."""
    # TODO(maruel): There should be differentiation when there's multiple
    # jobs for a single builder.
    build = None
    try:
      if job.build is None:
        # The job has no known build yet; search the builder for it.
        build = self._find_job(job)
        if build:
          # Signal a try job was found.
          info = {
            'build': build.number,
            'builder': job.builder,
            'job_name': job.name,
            'revision': job.revision,
            'url': self._build_status_url(job),
          }
          self.send_status(pending, info)
      else:
        try:
          build = self.status.builders[job.builder].builds[job.build]
        except KeyError:
          # May happen when there is a huge backlog and the build is not
          # cached anymore.
          build = None
    except urllib2.HTTPError as e:
      # Transient try server error; try again on the next poll.
      logging.error(str(e))
      return
    if build is not None:
      self._handle_try_job(pending, jobs, job, build)
    else:
      # A job needs to be sent again if it has been sent more than
      # self.lost_try_job_delay ago.
      builder = self.status.builders[job.builder]
      pending_builds = builder.data.get('pendingBuilds', 0)
      if (time.time() - job.sent) > self.lost_try_job_delay:
        if pending_builds:
          job_names = [
              data.get('reason', '') for data in builder.pending_builds.data
          ]
          if job.name in job_names:
            # It's pending, move on.
            return
        # The job went to /dev/null. For example, the master may have
        # restarted, the svn server may have a fluke, network may have had a
        # short downtime, etc. Delete the previous job.
        # Resend exactly the same job.
        tests = job.tests
        if not tests:
          if not job.builder in self.builders_and_tests:
            # This means the builder was removed. Skip it.
            logging.warn(
                ('Wanted to retry %s but it\'s not a requirement anymore. '
                 'Ignoring it!') % job.builder)
            job.result = buildbot_json.SKIPPED
            return
          tests = self.builders_and_tests[job.builder]
        self._send_jobs(
            pending,
            [job],
            True,
            {job.builder:tests},
            u'%s (previous was lost)' % job.name)

  def _find_job(self, job):
    """Searches on the try server if the try job for |job| has started."""
    revision = '%s@%s' % (self.solution, job.revision)
    # TODO(maruel): Strip this off.
    job_name = job.name.split(':', 1)[-1]
    logging.debug('Searching for job.reason = %s @ %s' % (job_name, revision))
    for build in self.status.builders[job.builder].builds:
      blame = build.data.get('blame', [])
      logging.debug(
          'Build.reason = %s @ %s; blame: %s' % (
              build.reason, build.revision, ','.join(blame)))
      # Match on reason, revision and a single-author blame by commit_user.
      if (build.reason == job_name and
          str(build.revision) == revision and
          len(blame) == 1 and
          blame[0] == self.commit_user):
        # Note the build number to remember it started.
        logging.info('Found build %d for job %s' % (build.number, job_name))
        job.build = build.number
        return build
    return None

  def _patch_url(self, pending):
    """Rietveld URL from which the raw diff for |pending| can be downloaded."""
    return ('%s/download/issue%d_%d.diff' %
            (self.context.rietveld.url, pending.issue, pending.patchset))