blob: 359acb6ae8bee76aff244310929d025c3c5d9576 [file] [log] [blame]
# coding=utf8
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Sends patches to the Try server and reads back results.
- RietveldTryJobs contains RietveldTryJob, one per try job on a builder.
- TryRunnerRietveld uses Rietveld to signal and poll job results.
"""
import collections
import errno
import logging
import re
import socket
import time
import urllib2
import buildbot_json
import model
from verification import base
from verification import try_job_steps
# A build that has been running for longer than this many seconds (12 hours)
# is considered to be timed out. It is compared against
# `now - job.started`, both of which are epoch timestamps.
TIMED_OUT = 12 * 60 * 60
def is_job_expired(now, revision, timestamp, checkout,
                   max_age=4 * 24 * 60 * 60, max_commits=200):
  """Returns False if the job result is still somewhat valid.

  A job that occurred more than |max_age| seconds ago (4 days by default) or
  that is |max_commits| or more commits behind (200 by default) is 'expired'.

  Args:
    now: epoch time of what should be considered to be 'now'.
    revision: revision the job was run against.
    timestamp: epoch time at which the job ran.
    checkout: object exposing revisions(revision, None), which is expected to
        return how many commits |revision| is behind.
    max_age: maximum age in seconds before a job is expired.
    max_commits: number of commits behind at which a job is expired.

  Returns:
    True if the job is expired, False otherwise.
  """
  if timestamp < now - max_age:
    return True
  commits_behind = checkout.revisions(revision, None)
  # Be explicit about an undetermined count (None) instead of relying on
  # Python 2 ordering None below every integer: it is treated as not expired.
  return commits_behind is not None and commits_behind >= max_commits
# Details about one try job as reported by Rietveld and the Try Server:
# - key: the try job key (rewritten for triggered builds, see filter_jobs()).
# - parent_key: key of the job that triggered this one, None/empty otherwise.
# - builder, buildnumber: where the build ran.
# - build: the build object looked up on the Try Server status.
# - properties: the build properties as a dict.
TryJobProperties = collections.namedtuple(
    'TryJobProperties',
    ['key', 'parent_key', 'builder', 'build', 'buildnumber', 'properties'])


def filter_jobs(try_job_results, watched_builders, current_irrelevant_keys,
                status):
  """For each try job result, queries the Try Server for updated status and
  returns details about each job in a TryJobProperties.

  It adds the build to the ignored list if the build doesn't exist on the Try
  Server anymore (usually it's too old), if its build number is negative, if
  no step verifier watches its builder, or if the try job was not triggered by
  the Commit Queue itself.

  Args:
    try_job_results: iterable of dicts with at least the 'key', 'builder' and
        'buildnumber' entries.
    watched_builders: container of builder names that are being examined.
    current_irrelevant_keys: keys that were already marked as irrelevant;
        results using one of these keys are skipped.
    status: object such that status.builders[builder].builds[buildnumber]
        returns a build exposing properties_as_dict.

  Returns:
    A tuple (jobs, irrelevant): a list of TryJobProperties, with the
    non-triggered jobs first, and the new list of irrelevant keys (in no
    particular order).
  """
  # Snapshot the incoming keys in a frozenset so each membership test below is
  # O(1) instead of scanning the caller's list for every result.
  known_irrelevant = frozenset(current_irrelevant_keys)
  irrelevant = set(known_irrelevant)
  try_jobs_with_props = []
  for result in try_job_results:
    key = result['key']
    assert key
    if key in known_irrelevant:
      continue
    builder = result['builder']
    try:
      buildnumber = int(result['buildnumber'])
    except (TypeError, ValueError):
      # No usable build number; skip it without marking it as irrelevant.
      continue
    if buildnumber < 0:
      logging.debug('Ignoring %s/%d; invalid', builder, buildnumber)
      irrelevant.add(key)
      continue
    if builder not in watched_builders:
      logging.debug('Ignoring %s/%d; no step verifier is examining it', builder,
                    buildnumber)
      irrelevant.add(key)
      continue

    # Constructing the object itself doesn't throw an exception, it's reading
    # its properties that throws.
    build = status.builders[builder].builds[buildnumber]
    try:
      props = build.properties_as_dict
    except IOError:
      logging.info(
          'Build %s/%s is not on the try server anymore',
          builder, buildnumber)
      irrelevant.add(key)
      continue

    parent_key = props.get('parent_try_job_key')
    if parent_key:
      # Triggered build: give it a key of its own, derived from its parent.
      key = '%s/%d_triggered_%s' % (builder, buildnumber, parent_key)
    elif props.get('try_job_key') != key:
      # Not triggered and the key doesn't match: not valid.
      logging.debug(
          'Ignoring %s/%d; not from rietveld', builder, buildnumber)
      irrelevant.add(key)
      continue
    try_jobs_with_props.append(
        TryJobProperties(key, parent_key, builder, build, buildnumber, props))

  # Sort the non-triggered builds first so triggered jobs can expect their
  # parent to have been processed before them. The tuple key yields the same
  # order as sorting on parent_key directly (None first, then by string)
  # without ever comparing None against a string.
  try_jobs_with_props.sort(
      key=lambda job: (job.parent_key is not None, job.parent_key or ''))
  return try_jobs_with_props, list(irrelevant)
def _is_skip_try_job(pending):
"""Returns True if a description contains NOTRY=true."""
match = re.search(r'^NOTRY=(.*)$', pending.description, re.MULTILINE)
return match and match.group(1).lower() == 'true'
class RietveldTryJobPending(model.PersistentMixIn):
  """A try job requested for a pending commit that has no build yet.

  Every persistent member must be given as a keyword argument at construction
  time. The instance is marked read-only right after.
  """
  # Name of the builder the job was requested on.
  builder = unicode
  revision = (None, unicode, int)
  # Steps that were asked to be run.
  requested_steps = list
  clobber = bool
  # Number of retries for this configuration. Initial try is 1.
  tries = int
  init_time = float

  def __init__(self, **kwargs):
    """Accepts exactly one keyword argument per persistent member."""
    expected = set(self._persistent_members())
    given = set(kwargs)
    assert expected == given, (expected - given, expected, given)
    super(RietveldTryJobPending, self).__init__(**kwargs)
    # Everything is set; freeze the instance.
    self._read_only = True
class RietveldTryJob(model.PersistentMixIn):
  """A try job, with a build on a builder, for a pending commit we care about.

  This data can be regenerated by parsing all the try job names but it is a bit
  hard on the try server.

  Every persistent member must be given as a keyword argument at construction
  time. The instance is marked read-only right after.
  """
  builder = unicode
  # Build number on |builder|.
  build = int
  revision = (None, unicode, int)
  requested_steps = list
  # The timestamp when the build started. buildbot_json returns int.
  started = int
  # Names of the steps that passed and failed, respectively.
  steps_passed = list
  steps_failed = list
  clobber = bool
  completed = bool
  # Number of retries for this configuration. Initial try is 1.
  tries = int
  # Key of the job that triggered this one, if any.
  parent_key = (None, unicode)
  init_time = float

  def __init__(self, **kwargs):
    """Accepts exactly one keyword argument per persistent member."""
    expected = set(self._persistent_members())
    given = set(kwargs)
    assert expected == given, (expected - given, expected, given)
    super(RietveldTryJob, self).__init__(**kwargs)
    # Everything is set; freeze the instance.
    self._read_only = True

  @property
  @model.immutable
  def result(self):
    """buildbot_json.FAILURE if any step failed, buildbot_json.SUCCESS if the
    job completed without failure, None while it is still undecided.
    """
    if self.steps_failed:
      return buildbot_json.FAILURE
    return buildbot_json.SUCCESS if self.completed else None
class RietveldTryJobs(base.IVerifierStatus):
  """A set of try jobs that were sent for a specific patch.

  Multiple concurrent try jobs can be sent on a single builder. For example, a
  previous valid try job could have been triggered by the user but was not
  completed so another was sent with the missing tests.

  Also, a try job is sent as soon as a test failure is detected.
  """
  # A dict of RietveldTryJob objects per key.
  try_jobs = dict
  # The try job keys we ignore because they can't be used to give a good
  # signal: either they are too old (old revision) or they were not triggered
  # by Rietveld, so we don't know if the diff is 100% good.
  irrelevant = list
  # When NOTRY=true is specified.
  skipped = bool
  # List of test verifiers. All the logic to decide when they are needed
  # and what bots they trigger is hidden inside.
  step_verifiers = list
  # Jobs that have been sent but are not found yet. Likely a builder is fully
  # utilized or the try server hasn't polled Rietveld yet. list of
  # RietveldTryJobPending() instances.
  pendings = list

  @model.immutable
  def get_state(self):
    """Returns the state of this verifier.

    Returns base.FAILED as soon as self.error_message is set, base.SUCCEEDED
    when no test result is being waited for and base.PROCESSING otherwise.

    In particular, there is no need to wait for every Try Job to complete.
    """
    if self.error_message:
      return base.FAILED
    if not self.tests_waiting_for_result():
      return base.SUCCEEDED
    return base.PROCESSING

  @model.immutable
  def tests_need_to_be_run(self, now):
    """Returns which tests need to be run.

    These are the tests that are not pending on any try job, either running or
    in the pending list.

    Returns:
      A dict {builder_name: sorted list of test names}; builders with nothing
      left to run are omitted. Empty when skipped or already failed.
    """
    # Skipped or failed, nothing to do.
    if self.skipped or self.error_message:
      return {}

    # What originally needed to be run.
    # all_tests is {builder_name: set(test_name*)}
    all_tests = {}
    for verifier in self.step_verifiers:
      (builder, tests) = verifier.need_to_trigger(self.try_jobs, now)
      if tests:
        all_tests.setdefault(builder, set()).update(tests)

    # Removes what is queued to be run but hasn't started yet.
    for try_job in self.pendings:
      if try_job.builder in all_tests:
        all_tests[try_job.builder] -= set(try_job.requested_steps)

    return dict(
        (builder, sorted(tests)) for builder, tests in all_tests.iteritems()
        if tests)

  @model.immutable
  def tests_waiting_for_result(self):
    """Returns the tests that we are waiting for results on pending or running
    builds.

    Returns:
      A dict {builder_name: sorted list of test names}; builders with nothing
      being waited for are omitted. Empty when skipped or already failed.
    """
    # Skipped or failed, nothing to do.
    if self.skipped or self.error_message:
      return {}

    # What originally needed to be run.
    all_tests = {}
    for verification in self.step_verifiers:
      (builder, tests) = verification.waiting_for(self.try_jobs)
      if tests:
        all_tests.setdefault(builder, set()).update(tests)

    # Removes what was run.
    for try_job in self.try_jobs.itervalues():
      if try_job.builder in all_tests:
        all_tests[try_job.builder] -= set(try_job.steps_passed)

    # Sorted, like tests_need_to_be_run(), so that the message built by
    # why_not() doesn't depend on set iteration order.
    return dict(
        (builder, sorted(tests)) for builder, tests in all_tests.iteritems()
        if tests)

  @model.immutable
  def watched_builders(self):
    """Returns the set of builder names the step_verifiers examine.

    For TryJobTriggeredSteps verifiers, this includes both the builder and its
    trigger builder. Jobs on any other builder are to be considered
    irrelevant.
    """
    watched_builders = set()
    for step_verifier in self.step_verifiers:
      watched_builders.add(step_verifier.builder_name)
      if isinstance(step_verifier, try_job_steps.TryJobTriggeredSteps):
        watched_builders.add(step_verifier.trigger_name)
    return watched_builders

  def update_jobs_from_rietveld(
      self, data, status, checkout, now):
    """Retrieves the jobs statuses from rietveld and updates its state.

    Args:
      data: Patchset properties as returned from Rietveld.
      status: A buildbot_json.Buildbot instance.
      checkout: A depot_tools' Checkout instance.
      now: epoch time of what should be considered to be 'now'.

    Returns:
      Keys which were updated.
    """
    try_job_results = data.get('try_job_results', [])
    logging.debug('Found %d entries', len(try_job_results))

    try_jobs_with_props, self.irrelevant = filter_jobs(
        try_job_results, self.watched_builders(), self.irrelevant, status)

    # Ensure that all irrelevant jobs have been removed from the set of valid
    # try jobs, including the '_old' entry kept for a restarted build.
    for irrelevant_key in self.irrelevant:
      self.try_jobs.pop(irrelevant_key, None)
      self.try_jobs.pop(irrelevant_key + '_old', None)

    updated = []
    for props in try_jobs_with_props:
      if self._update_try_job_status(checkout, props, now):
        updated.append(props.key)
    return updated

  def _update_try_job_status(self, checkout, try_job_properties, now):
    """Updates status of a specific RietveldTryJob.

    try_job_properties is an instance of TryJobProperties.

    May append to self.irrelevant, remove an item from self.pendings and add,
    replace or delete entries of self.try_jobs.

    Returns True if it was updated, False otherwise.
    """
    key = try_job_properties.key
    builder = try_job_properties.builder
    buildnumber = try_job_properties.buildnumber
    if key in self.irrelevant:
      logging.debug('Ignoring %s/%d; irrelevant', builder, buildnumber)
      return False

    if (try_job_properties.parent_key and
        try_job_properties.parent_key not in self.try_jobs):
      logging.debug('Ignoring %s, parent unknown', key)
      return False

    requested_steps = []
    # Set it to 0 as the default value since when the job is new and previous
    # try jobs are found, we don't want to count them as tries.
    tries = 0
    job = self.try_jobs.get(key)
    build = try_job_properties.build
    if job:
      if job.completed:
        logging.debug('Ignoring %s/%d; completed', builder, buildnumber)
        return False
      if now - job.started > TIMED_OUT:
        # Flush it and start over.
        self.irrelevant.append(key)
        del self.try_jobs[key]
        return False
      requested_steps = job.requested_steps
      tries = job.tries
      init_time = job.init_time
    else:
      # This try job is new. See if we triggered it previously by
      # looking in self.pendings.
      for index, pending_job in enumerate(self.pendings):
        if pending_job.builder == builder:
          # Reuse its item. Popping while enumerating is safe here because
          # the loop is left right away.
          requested_steps = pending_job.requested_steps
          tries = pending_job.tries
          self.pendings.pop(index)
          break
      else:
        # Is this a good build? It must not be too old and triggered by
        # rietveld.
        if is_job_expired(now, build.revision, build.start_time, checkout):
          logging.debug('Ignoring %s/%d; expired', builder, buildnumber)
          self.irrelevant.append(key)
          return False
      init_time = now

    passed = [s.name for s in build.steps if s.simplified_result]
    failed = [s.name for s in build.steps if s.simplified_result is False]
    # The steps in neither passed or failed were skipped.
    new_job = RietveldTryJob(
        init_time=init_time,
        builder=builder,
        build=buildnumber,
        revision=build.revision,
        requested_steps=requested_steps,
        started=build.start_time,
        steps_passed=passed,
        steps_failed=failed,
        clobber=bool(try_job_properties.properties.get('clobber')),
        completed=build.completed,
        tries=tries,
        parent_key=try_job_properties.parent_key)

    if job and job.build and new_job.build and job.build != new_job.build:
      # It's tricky because 'key' is the same for both. The trick is to create
      # a fake key for the old build and mark it as completed. Note that
      # Rietveld is confused by it too.
      logging.warning(
          'Try Server was restarted and restarted builds with the same keys. '
          'I\'m confused. %s: %d != %d', job.builder, job.build, new_job.build)
      # Resave the old try job and mark it as completed.
      # NOTE(review): 'started' is taken from the new build rather than from
      # job.started -- confirm this is intended.
      self.try_jobs[key + '_old'] = RietveldTryJob(
          init_time=job.init_time,
          builder=job.builder,
          build=job.build,
          revision=job.revision,
          requested_steps=job.requested_steps,
          started=build.start_time,
          steps_passed=job.steps_passed,
          steps_failed=job.steps_failed,
          clobber=job.clobber,
          completed=True,
          tries=job.tries,
          parent_key=job.parent_key)

    if not job or not model.is_equivalent(new_job, job):
      logging.info(
          'Job update: %s: %s/%d',
          try_job_properties.properties.get('issue'),
          builder,
          buildnumber)
      self.try_jobs[key] = new_job
      return True
    return False

  def signal_as_failed_if_needed(self, job, url, now):
    """Detects if the RietveldTryJob instance is in a state where it is
    impossible to make progress.

    If so, mark ourself as failed by setting self.error_message and return True.

    Args:
      job: the RietveldTryJob to examine.
      url: url of the build, appended to the error message.
      now: epoch time of what should be considered to be 'now'.

    Returns:
      True if self.error_message was just set, False otherwise.
    """
    if self.skipped or self.error_message:
      return False
    # Figure out steps that should be retried for this builder.
    missing_tests = self.tests_need_to_be_run(now).get(job.builder, [])
    if not missing_tests:
      return False
    if job.tries > 2:
      self.error_message = (
          'Retried try job too often on %s for step(s) %s\n%s' %
          (job.builder, ', '.join(missing_tests), url))
      logging.info(self.error_message)
      return True
    return False

  @model.immutable
  def why_not(self):
    """Returns a message listing the tests still being waited for, one line
    per builder, or None when skipped, failed or not waiting on anything.
    """
    # Skipped or failed, nothing to do.
    if self.skipped or self.error_message:
      return None
    waiting = self.tests_waiting_for_result()
    if not waiting:
      return None
    lines = ['Waiting for the following jobs:\n']
    lines.extend(
        ' %s: %s\n' % (builder, ','.join(waiting[builder]))
        for builder in sorted(waiting))
    return ''.join(lines)
class TryRunnerRietveld(base.VerifierCheckout):
  """Stateless communication with a try server.

  Uses Rietveld to trigger the try job and reads try job status with the json
  API.

  Analysis goes as following:
  - compile step itself is not flaky. compile.py already takes care of most
    flakiness and clobber build is done by default. If compile step fails, try
    again with clobber=True
  - test steps are flaky and can be retried as necessary.

  1. For each existing try jobs from rietveld.
    1. Fetch result from try server.
    2. If try job was generated from rietveld;
      1. If not is_job_expired();
        1. Skip any scheduled test that succeeded on this builder.
  2. For each builder with tests scheduled;
    1. If no step waiting to be triggered, skip this builder completely.
    2. For each non succeeded job;
      1. Send try jobs to rietveld.

  Note: It needs rietveld, hence it uses VerifierCheckout, but it doesn't need a
  checkout.
  """
  name = 'try job rietveld'

  # Only updates a job status once every 60 seconds.
  update_latency = 60

  def __init__(
      self, context_obj, try_server_url, commit_user, step_verifiers,
      ignored_steps, solution):
    """Stores the configuration; no request is made at construction time.

    Args:
      context_obj: passed through to base.VerifierCheckout.
      try_server_url: base url of the try server; trailing '/' is dropped.
      commit_user: stored as is.
      step_verifiers: list of step verifiers handed to each RietveldTryJobs.
      ignored_steps: iterable of step names, stored as a set.
      solution: stored as is.
    """
    super(TryRunnerRietveld, self).__init__(context_obj)
    self.try_server_url = try_server_url.rstrip('/')
    self.commit_user = commit_user
    # TODO(maruel): Have it be overridden by presubmit_support.DoGetTrySlaves.
    self.step_verifiers = step_verifiers
    self.ignored_steps = set(ignored_steps)
    # Time to poll the Try Server, and not Rietveld. Backdated so that the
    # first update_status() call is not throttled.
    self.last_update = time.time() - self.update_latency
    self.solution = solution

  def verify(self, pending):
    """Registers a RietveldTryJobs for |pending| and sends the try jobs that
    are needed.

    Nothing is returned; the state is stored in
    pending.verifications[self.name].

    This function is called synchronously.
    """
    jobs = pending.verifications.setdefault(self.name, RietveldTryJobs())
    if _is_skip_try_job(pending):
      # Do not run try job for it.
      jobs.skipped = True
      return

    # Override any previous list from the last restart.
    jobs.step_verifiers = []
    for step in self.step_verifiers:
      if isinstance(step, try_job_steps.TryJobTriggeredOrNormalSteps):
        # Since the steps are immutable, create a new step so that swarm
        # can be enabled.
        jobs.step_verifiers.append(try_job_steps.TryJobTriggeredOrNormalSteps(
            builder_name=step.builder_name,
            trigger_name=step.trigger_name,
            steps=step.steps,
            trigger_bot_steps=step.trigger_bot_steps,
            use_triggered_bot=True))
      else:
        jobs.step_verifiers.append(step)

    # First, update the status of the current try jobs on Rietveld.
    now = time.time()
    self._update_jobs_from_rietveld(pending, jobs, False, now)

    # Add anything that is missing.
    self._send_jobs(pending, jobs, now)

    # Slightly postpone next check.
    self.last_update = min(now, self.last_update + (self.update_latency / 4))

  def update_status(self, queue):
    """Grabs the current status of all try jobs and update self.queue.

    Does nothing when |queue| is empty or when the last update happened less
    than self.update_latency seconds ago.

    Note: it would be more efficient to be event based.
    """
    if not queue:
      logging.debug('The list is empty, nothing to do')
      return

    # Hard code 'now' to the value before querying and sending them. This will
    # cause some issues when querying state or sending the jobs takes a
    # non-trivial amount of time but in general it will be fine.
    now = time.time()
    if now - self.last_update < self.update_latency:
      logging.debug('TS: Throttling updates')
      return
    self.last_update = now

    # Update the status of the current pending CLs on Rietveld.
    for pending, jobs in self.loop(queue, RietveldTryJobs, True):
      # Update 'now' since querying the try jobs may take a significant amount
      # of time.
      now = time.time()
      if self._update_jobs_from_rietveld(pending, jobs, True, now):
        # Send any necessary job. Noop if not needed.
        self._send_jobs(pending, jobs, now)

  def _add_pending_job_and_send_if_needed(self, builder, steps, jobs,
                                          send_job, pending, now):
    """Records a pending try job on |builder| and, when |send_job| is set,
    triggers it through Rietveld first.

    Args:
      builder: name of the builder to run |steps| on.
      steps: list of steps to request.
      jobs: the RietveldTryJobs instance to update.
      send_job: False for jobs that get triggered by another builder and so
          don't need a request of their own.
      pending: the pending commit the job is for.
      now: epoch time stored as the pending job's init_time.

    Returns:
      True if the pending job was recorded. False if it was not, in which case
      jobs.error_message has been set.
    """
    # Find if there was a previous try.
    previous_jobs = [
        job for job in jobs.try_jobs.itervalues() if job.builder == builder
    ]
    if previous_jobs:
      tries = max(job.tries for job in previous_jobs)
      # Clobber if any previous job did, or if any failed at compile.
      clobber = any(
          (job.clobber or 'compile' in job.steps_failed)
          for job in previous_jobs)
    else:
      tries = 0
      clobber = False
    if tries > 4:
      # Fail safe.
      jobs.error_message = (
          ( 'The commit queue went berserk retrying too often for a\n'
            'seemingly flaky test on builder %s:\n%s') %
          ( builder,
            '\n'.join(self._build_status_url(j) for j in previous_jobs)))
      return False

    # Don't always send the job (triggered bots don't need to send their own
    # request).
    if send_job:
      logging.debug(
          'Sending job %s for %s: %s', pending.issue, builder, ','.join(steps))
      try:
        self.context.rietveld.trigger_try_jobs(
            pending.issue, pending.patchset, 'CQ', clobber, 'HEAD',
            {builder: steps})
      except urllib2.HTTPError as e:
        if e.code == 400:
          # This probably mean a new patchset was uploaded since the last poll,
          # so it's better to drop the CL.
          jobs.error_message = 'Failed to trigger a try job on %s\n%s' % (
              builder, e)
          return False
        else:
          raise

    # Set the status of this pending job here and on the CQ page.
    jobs.pendings.append(
        RietveldTryJobPending(
            init_time=now,
            builder=builder,
            revision=None,
            requested_steps=steps,
            clobber=clobber,
            tries=tries + 1))
    # Update the status on the AppEngine status to signal a new try job was
    # sent.
    info = {
      'builder': builder,
      'clobber': clobber,
      'job_name': 'CQ',
      'revision': None, #revision,
    }
    self.send_status(pending, info)
    return True

  def _get_triggered_bots(self, builder, steps):
    """Returns a dict of all the (builder, steps) pairs of bots that will get
    triggered by the given builder steps combination."""
    triggered_bots = {}
    for verifier in self.step_verifiers:
      # Use separate names for the result: every verifier must be queried
      # with the original |builder| and |steps|, not with what the previous
      # verifier returned.
      triggered_builder, triggered_steps = verifier.get_triggered_steps(
          builder, steps)
      if triggered_steps:
        triggered_bots[triggered_builder] = triggered_steps
    return triggered_bots

  def _send_jobs(self, pending, jobs, now):
    """Prepares the RietveldTryJobs instance |jobs| to send try jobs to the try
    server.

    Does nothing when |jobs| already failed or when no test remains to be run.
    """
    if jobs.error_message:
      # Too late.
      return
    remaining = jobs.tests_need_to_be_run(now)
    if not remaining:
      return
    # Send them in order to simplify testing.
    for builder in sorted(remaining):
      tests = remaining[builder]
      if not self._add_pending_job_and_send_if_needed(builder, tests, jobs,
                                                      True, pending, now):
        # If the main job wasn't sent, don't register the triggered jobs as
        # pending since they won't get triggered.
        continue

      # Add any pending bots that will be triggered from this build.
      triggered_bots = self._get_triggered_bots(builder, tests)
      for triggered_builder, steps in triggered_bots.iteritems():
        self._add_pending_job_and_send_if_needed(
            triggered_builder, steps, jobs, False, pending, now)

  @model.immutable
  def _build_status_url(self, job):
    """Html url for this try job."""
    assert job.build is not None, str(job)
    return '%s/buildstatus?builder=%s&number=%s' % (
        self.try_server_url, job.builder, job.build)

  @model.immutable
  def _update_dashboard(self, pending, job):
    """Updates the CQ dashboard with the current Try Job state as known to the
    CQ.
    """
    logging.debug('_update_dashboard(%s/%s)', job.builder, job.build)
    info = {
      'build': job.build,
      'builder': job.builder,
      'job_name': 'CQ',
      'result': job.result,
      'revision': job.revision,
      'url': self._build_status_url(job),
    }
    self.send_status(pending, info)

  def _update_jobs_from_rietveld(self, pending, jobs, handle, now):
    """Grabs data from Rietveld and pass it to
    RietveldTryJobs.update_jobs_from_rietveld().

    Args:
      pending: the pending commit to update.
      jobs: the RietveldTryJobs instance of |pending|.
      handle: if True, each updated job is pushed to the dashboard and checked
          for having been retried too often.
      now: epoch time of what should be considered to be 'now'.

    Returns True on success. Returns False when the patchset was not found
    (jobs.error_message is set) or on an error considered temporary (it is
    only logged). Any other error is raised.
    """
    status = buildbot_json.Buildbot(self.try_server_url)
    try:
      try:
        data = self.context.rietveld.get_patchset_properties(
            pending.issue, pending.patchset)
      except urllib2.HTTPError as e:
        if e.code == 404:
          # TODO(phajdan.jr): Maybe generate a random id to correlate the user's
          # error message and exception in the logs.
          # Don't put exception traceback in the user-visible message to avoid
          # leaking sensitive CQ data (passwords etc).
          jobs.error_message = ('Failed to get patchset properties (patchset '
                                'not found?)')
          logging.error(str(e))
          return False
        else:
          raise

      # Update the RietveldTryJobs object.
      keys = jobs.update_jobs_from_rietveld(
          data,
          status,
          self.context.checkout,
          now)
    except urllib2.HTTPError as e:
      if e.code in (500, 502, 503):
        # Temporary AppEngine hiccup. Just log it and return failure.
        logging.warning('%s while accessing %s. Ignoring error.', e, e.url)
        return False
      else:
        raise
    except urllib2.URLError as e:
      # e.reason is not necessarily a string, it can be an exception instance
      # (e.g. socket.timeout), so convert it before searching it.
      if 'timed out' in str(e.reason):
        # Handle timeouts gracefully.
        logging.warning('%s while updating tryserver status for '
                        'rietveld issue %s', e, pending.issue)
        return False
      else:
        raise
    except socket.error as e:
      # Temporary AppEngine hiccup. Just log it and return failure.
      if e.errno == errno.ECONNRESET:
        logging.warning(
            '%s while updating tryserver status for rietveld issue %s.',
            e, pending.issue)
        return False
      else:
        raise
    except IOError as e:
      # Temporary AppEngine hiccup. Just log it and return failure.
      if e.errno == 'socket error':
        logging.warning(
            '%s while updating tryserver status for rietveld issue %s.',
            e, pending.issue)
        return False
      raise

    if handle:
      for updated_key in keys:
        job = jobs.try_jobs[updated_key]
        self._update_dashboard(pending, job)
        jobs.signal_as_failed_if_needed(job, self._build_status_url(job), now)

    return True