exonerator/innocent_cls_precq.py - chromiumos/infra/cl_exonerator - Git at Google

 # Copyright 2018 The Chromium OS Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """Returns a sequence of new innocent CLs which failed pre-cq.

 Queries CIDB for sanity Pre-CQ build failures.
 """

 from __future__ import print_function
 from __future__ import absolute_import
 from __future__ import division

 import datetime
 import itertools

 from chromite.lib import constants
 from chromite.lib import cidb
 from chromite.lib import clactions
 from chromite.lib import parallel
 from chromite.lib import cros_logging as logging
 from google.cloud import datastore  # pylint: disable=E0611,import-error
 from infra_libs import ts_mon

 from exonerator import checkpointlib
 from exonerator import innocent_cls_cq
 from exonerator import innocent_cls_precq_flake


 # First argument given to ds.key() when reading or writing the checkpoint
 # entity of a processed Pre-CQ build.
 PRECQ_PROCESSED_KEY = 'PreCQProcessed'
 # Build-request reason used to look up the latest sanity Pre-CQ builds.
 _SANITY_BUILD_REASON = 'sanity-pre-cq'

 # Size, in days, of the window around a sanity failure inside which failed
 # builds are forgiven: 3 days before the failure and 1 day after it.
 _DAYS_LOWER_LIMIT = 3
 _DAYS_UPPER_LIMIT = 1

 # Per-tick limit, keyed by "is it peak hours?". Only one build is kicked off
 # per tick during peak hours; off-peak, pre-CQ builds are kicked off faster.
 _PRE_CQ_CL_LIMITS = {True: 1, False: 5}


 # Gauge reporting how many failed sanity runs are currently being examined.
 _SANITY_FAILURES_EXAMINED = ts_mon.GaugeMetric(
     'chromeos/exonerator/sanity/failures_examined',
     description=(
         'The current number of failed sanity runs being examined '
         'by CL Exonerator.'),
     field_spec=None)


 def NewInnocentCLs(conn, on_peak, checkpoint=True):
   """Collects the innocent Pre-CQ CLs that should be exonerated now.

   Candidates are produced by _InnocentCLsFromSanityBuildFailuresCheckpointed
   and are consumed eagerly inside this function: by the time it returns, the
   whole (limited) candidate sequence has been iterated.

   Args:
     conn: The CIDBConnection to use.
     on_peak: Whether it's business hours (peak load) or off-peak. Used to look
       up the per-tick limit in _PRE_CQ_CL_LIMITS.
     checkpoint: Whether progress should be saved while candidates are being
       consumed.

   Returns:
     An empty list when no candidate passes _ShouldExonerate. Otherwise a list
     containing one batch (a list) of ChangeWithBuild objects, representing
     possibly innocent CLs whose Pre-CQ failures should be forgiven.
   """
   candidates = _InnocentCLsFromSanityBuildFailuresCheckpointed(
       conn, on_peak, _PRE_CQ_CL_LIMITS[on_peak], checkpoint)

   # TODO(crbug.com/820379) add some logic here to find all the CLs which
   # participated in the build. Put them together in a batch to yield.
   # Also change docstring of limit to be "number of BUILDS to process."

   # Keep only candidates whose build is still the latest run for the CL; a
   # stale pre-cq run must not be exonerated when a newer run exists.
   batch = []
   for change, build_id in candidates:
     if _ShouldExonerate(conn, change, build_id):
       batch.append(innocent_cls_cq.ChangeWithBuild(change, build_id))

   if not batch:
     return []
   return [batch]


 def _InnocentCLsFromSanityBuildFailuresCheckpointed(conn, on_peak, limit,
                                                     checkpoint):
   """Builds the (optionally checkpointed) sequence of Pre-CQ failures to forgive.

   Candidates from sanity build failures come first, followed by candidates
   from flaky pre-CQ configs. When checkpointing is enabled the sequence is
   wrapped in checkpointlib.CheckpointSequence, which is given a predicate that
   consults the PRECQ_PROCESSED_KEY entity of a build and a save callback that
   records the CL on that entity. Per the notes carried over from the original
   documentation (behavior lives in checkpointlib -- confirm there):

     x = next(fetch)
     # x is not checkpointed yet
     y = next(fetch)  # This checkpoints that x is done

   Args:
     conn: The CIDBConnection to use.
     on_peak: Whether it's business hours (peak load) or off-peak.
     limit: The maximum number of innocent CLs to produce.
     checkpoint: Whether to save progress as items are produced.

   Returns:
     An iterable of (change, pre-cq build id) tuples to forgive.
   """
   # Sanity-failure candidates take priority; flaky-config candidates are only
   # reached once the former are exhausted.
   sanity_candidates = _InnocentCLsFromSanityBuildFailures(conn)
   flake_candidates = innocent_cls_precq_flake.InnocentPreCQsFromFlake(
       conn, on_peak)
   candidates = itertools.chain(sanity_candidates, flake_candidates)

   if checkpoint:
     ds = datastore.Client()

     def _IsDone(item):
       cl, precq_build_id = item
       return _AlreadyForgiven(ds, cl, precq_build_id)

     def _Record(item):
       cl, precq_build_id = item
       row = ds.get(key=ds.key(PRECQ_PROCESSED_KEY, precq_build_id))
       if row:
         logging.info('Updating Pre-CQ checkpoint row %s.', precq_build_id)
       else:
         logging.info('Inserting Pre-CQ checkpoint row %s.', precq_build_id)
         row = datastore.Entity(
             key=ds.key(PRECQ_PROCESSED_KEY, precq_build_id))
       # One build can carry several CLs, so each processed CL gets its own
       # property on the build's entity.
       processed_marker = {str(cl.gerrit_number): True}
       row.update(processed_marker)
       ds.put(row)

     return checkpointlib.CheckpointSequence(
         _IsDone, _Record, candidates, limit=limit)

   return itertools.islice(candidates, limit)


 def _InnocentCLsFromSanityBuildFailures(conn):
   """Generates Pre-CQ failures attributable to failed sanity builds.

   Looks up the latest build requests made for _SANITY_BUILD_REASON, narrows
   them to the failed ones, and emits every forgivable Pre-CQ failure found for
   each of them.

   Args:
     conn: The CIDBConnection to use.

   Yields:
     Tuples of (change, pre-cq build id) to forgive.
   """
   latest_requests = conn.GetLatestBuildRequestsForReason(_SANITY_BUILD_REASON)
   failed_runs = _FailedSanityRuns(conn, latest_requests)
   for failed_run in failed_runs:
     forgivable = _InnocentPreCQsFromSanity(conn, failed_run)
     for cl, precq_build_id in forgivable:
       yield cl, precq_build_id


 def _FailedSanityRuns(conn, sanity_runs):
   """Filters the latest sanity runs to only the failed sanity runs.

   This is a generator: the status queries and the metric update only happen
   once the result is first iterated.

   Args:
     conn: The CIDBConnection to use.
     sanity_runs: An iterable of sanity builds (has a .build_id).

   Yields:
     The sanity runs whose build status is BUILDER_STATUS_FAILED, in their
     original order.
   """
   # Materialize the input: it is measured with len() and walked twice below,
   # which a one-shot iterator would not survive.
   sanity_runs = list(sanity_runs)
   if not sanity_runs:
     # Nothing to query. Report zero failures and avoid requesting a process
     # pool with zero processes.
     _SANITY_FAILURES_EXAMINED.set(0)
     return

   # TODO(phobbs) add a JOIN buildTable in GetLatestBuildRequestsForReason
   # to avoid creating / destroying a bunch of cidb connections.
   statuses = parallel.RunTasksInProcessPool(
       conn.GetBuildStatus,
       [[run.build_id] for run in sanity_runs],
       processes=len(sanity_runs))

   faileds = [status['status'] == constants.BUILDER_STATUS_FAILED
              for status in statuses]

   _SANITY_FAILURES_EXAMINED.set(sum(map(int, faileds)))

   # Built-in zip() works on both Python 2 and 3; itertools.izip() exists only
   # on Python 2.
   for sanity_run, failed in zip(sanity_runs, faileds):
     if failed:
       # TODO(crbug.com/820379) yield build ids from here.
       yield sanity_run


 def _InnocentPreCQsFromSanity(conn, sanity_failure):
   """Generates forgivable Pre-CQ failures around one sanity failure.

   Fetches the build history of the sanity failure's build config for the
   dates spanning _DAYS_LOWER_LIMIT days before to _DAYS_UPPER_LIMIT days after
   the failure's timestamp. For every build that has at least one action with
   a failed status, the change described by the build's first action is
   emitted.

   Args:
     conn: A CIDBConnection
     sanity_failure: A BuildRequest corresponding to a sanity build failure.

   Yields:
     (GerritPatchTuple, build_id) corresponding to pre-cq failures to forgive
   """
   failure_time = sanity_failure.timestamp
   window_start = failure_time - datetime.timedelta(days=_DAYS_LOWER_LIMIT)
   window_end = failure_time + datetime.timedelta(days=_DAYS_UPPER_LIMIT)

   history = conn.GetBuildHistory(
       sanity_failure.request_build_config,
       cidb.CIDBConnection.NUM_RESULTS_NO_LIMIT,
       start_date=window_start.date(),
       end_date=window_end.date())

   for build in history:
     build_actions = conn.GetActionsForBuild(build['id'])
     # Builds without any recorded action, or without a failed one, are not
     # candidates.
     if build_actions and any(
         entry.status == constants.BUILDER_STATUS_FAILED
         for entry in build_actions):
       first = build_actions[0]
       is_internal = first.change_source == 'internal'
       patch = clactions.GerritPatchTuple(
           first.change_number, first.patch_number, is_internal)
       # TODO(crbug.com/820379) yield build ids from here.
       yield patch, first.build_id


 def _AlreadyForgiven(ds, change, build_id):
   """Tells whether this change was already processed for the Pre-CQ build.

   Args:
     ds: The cloud Datastore client.
     change: A GerritPatchTuple instance.
     build_id: The CIDB build id for the pre-cq run.

   Returns:
     False when the build has no checkpoint entity; otherwise the value stored
     on the entity under the change's gerrit number (False if absent).
   """
   record = ds.get(key=ds.key(PRECQ_PROCESSED_KEY, build_id))
   if record is None:
     return False
   # The entity holds one property per CL processed for the build, keyed by
   # gerrit number. Treating gerrit_number:change as one-to-one is technically
   # inaccurate, but extremely unlikely to affect the behavior.
   return record.get(str(change.gerrit_number), False)


 def _ShouldExonerate(conn, change, build_id):
   """Decides whether the patch should be exonerated.

   A patch is exonerated only if no newer Pre-CQ run picked it up and it has
   not been exonerated before.

   Args:
     conn: The cidb.CIDBConnection
     change: A GerritPatchTuple to possibly exonerate
     build_id: The threshold build_id.

   Returns:
     True if the patch should be exonerated, False otherwise.
   """
   history = conn.GetActionsForChanges([change])
   if _ExistsNewerPreCQForCL(history, build_id):
     return False
   return not _PatchAlreadyExonerated(history, change)


 def _ExistsNewerPreCQForCL(actions, build_id):
   """Tells whether a newer build of the same config picked up the CL.

   The build config is taken from the first action belonging to |build_id|.

   Args:
     actions: The CLActions for the change
     build_id: The threshold build_id.

   Returns:
     True if some picked-up action with that build config has a build id
     greater than |build_id|. False otherwise, including when none of the
     actions belongs to |build_id|.
   """
   same_build = [act for act in actions if act.build_id == build_id]
   if not same_build:
     return False
   config = same_build[0].build_config

   found_newer = False
   for act in actions:
     if (act.build_config == config
         and act.action == constants.CL_ACTION_PICKED_UP):
       if act.build_id > build_id:
         found_newer = True
   return found_newer


 def _PatchAlreadyExonerated(actions, change):
   """Tells whether this patch has been exonerated before.

   Exonerating a patch at most once keeps pre-cq from being retried forever on
   a single patch.
   TODO: consider retrying more times (2 or 3?) if there is low pre-cq load.

   Args:
     actions: The CLActions for the change
     change: A GerritPatchTuple to find actions for.

   Returns:
     True if any action is an exoneration of the change's patch number.
   """
   return any(
       entry.action == constants.CL_ACTION_EXONERATED
       and entry.patch_number == change.patch_number
       for entry in actions)
	# Copyright 2018 The Chromium OS Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""Returns a sequence of new innocent CLs which failed pre-cq.

	Queries CIDB for sanity Pre-CQ build failures.
	"""

	from __future__ import print_function
	from __future__ import absolute_import
	from __future__ import division

	import datetime
	import itertools

	from chromite.lib import constants
	from chromite.lib import cidb
	from chromite.lib import clactions
	from chromite.lib import parallel
	from chromite.lib import cros_logging as logging
	from google.cloud import datastore # pylint: disable=E0611,import-error
	from infra_libs import ts_mon

	from exonerator import checkpointlib
	from exonerator import innocent_cls_cq
	from exonerator import innocent_cls_precq_flake


	# First argument given to ds.key() when reading or writing the checkpoint
	# entity of a processed Pre-CQ build.
	PRECQ_PROCESSED_KEY = 'PreCQProcessed'
	# Build-request reason used to look up the latest sanity Pre-CQ builds.
	_SANITY_BUILD_REASON = 'sanity-pre-cq'

	# Size, in days, of the window around a sanity failure inside which failed
	# builds are forgiven: 3 days before the failure and 1 day after it.
	_DAYS_LOWER_LIMIT = 3
	_DAYS_UPPER_LIMIT = 1

	# Per-tick limit, keyed by "is it peak hours?". Only one build is kicked off
	# per tick during peak hours; off-peak, pre-CQ builds are kicked off faster.
	_PRE_CQ_CL_LIMITS = {True: 1, False: 5}


	# Gauge reporting how many failed sanity runs are currently being examined.
	_SANITY_FAILURES_EXAMINED = ts_mon.GaugeMetric(
	    'chromeos/exonerator/sanity/failures_examined',
	    description=(
	        'The current number of failed sanity runs being examined '
	        'by CL Exonerator.'),
	    field_spec=None)


	def NewInnocentCLs(conn, on_peak, checkpoint=True):
	  """Collects the innocent Pre-CQ CLs that should be exonerated now.

	  Candidates are produced by _InnocentCLsFromSanityBuildFailuresCheckpointed
	  and are consumed eagerly inside this function: by the time it returns, the
	  whole (limited) candidate sequence has been iterated.

	  Args:
	    conn: The CIDBConnection to use.
	    on_peak: Whether it's business hours (peak load) or off-peak. Used to look
	      up the per-tick limit in _PRE_CQ_CL_LIMITS.
	    checkpoint: Whether progress should be saved while candidates are being
	      consumed.

	  Returns:
	    An empty list when no candidate passes _ShouldExonerate. Otherwise a list
	    containing one batch (a list) of ChangeWithBuild objects, representing
	    possibly innocent CLs whose Pre-CQ failures should be forgiven.
	  """
	  candidates = _InnocentCLsFromSanityBuildFailuresCheckpointed(
	      conn, on_peak, _PRE_CQ_CL_LIMITS[on_peak], checkpoint)

	  # TODO(crbug.com/820379) add some logic here to find all the CLs which
	  # participated in the build. Put them together in a batch to yield.
	  # Also change docstring of limit to be "number of BUILDS to process."

	  # Keep only candidates whose build is still the latest run for the CL; a
	  # stale pre-cq run must not be exonerated when a newer run exists.
	  batch = []
	  for change, build_id in candidates:
	    if _ShouldExonerate(conn, change, build_id):
	      batch.append(innocent_cls_cq.ChangeWithBuild(change, build_id))

	  if not batch:
	    return []
	  return [batch]


	def _InnocentCLsFromSanityBuildFailuresCheckpointed(conn, on_peak, limit,
	                                                    checkpoint):
	  """Builds the (optionally checkpointed) sequence of Pre-CQ failures to forgive.

	  Candidates from sanity build failures come first, followed by candidates
	  from flaky pre-CQ configs. When checkpointing is enabled the sequence is
	  wrapped in checkpointlib.CheckpointSequence, which is given a predicate that
	  consults the PRECQ_PROCESSED_KEY entity of a build and a save callback that
	  records the CL on that entity. Per the notes carried over from the original
	  documentation (behavior lives in checkpointlib -- confirm there):

	    x = next(fetch)
	    # x is not checkpointed yet
	    y = next(fetch)  # This checkpoints that x is done

	  Args:
	    conn: The CIDBConnection to use.
	    on_peak: Whether it's business hours (peak load) or off-peak.
	    limit: The maximum number of innocent CLs to produce.
	    checkpoint: Whether to save progress as items are produced.

	  Returns:
	    An iterable of (change, pre-cq build id) tuples to forgive.
	  """
	  # Sanity-failure candidates take priority; flaky-config candidates are only
	  # reached once the former are exhausted.
	  sanity_candidates = _InnocentCLsFromSanityBuildFailures(conn)
	  flake_candidates = innocent_cls_precq_flake.InnocentPreCQsFromFlake(
	      conn, on_peak)
	  candidates = itertools.chain(sanity_candidates, flake_candidates)

	  if checkpoint:
	    ds = datastore.Client()

	    def _IsDone(item):
	      cl, precq_build_id = item
	      return _AlreadyForgiven(ds, cl, precq_build_id)

	    def _Record(item):
	      cl, precq_build_id = item
	      row = ds.get(key=ds.key(PRECQ_PROCESSED_KEY, precq_build_id))
	      if row:
	        logging.info('Updating Pre-CQ checkpoint row %s.', precq_build_id)
	      else:
	        logging.info('Inserting Pre-CQ checkpoint row %s.', precq_build_id)
	        row = datastore.Entity(
	            key=ds.key(PRECQ_PROCESSED_KEY, precq_build_id))
	      # One build can carry several CLs, so each processed CL gets its own
	      # property on the build's entity.
	      processed_marker = {str(cl.gerrit_number): True}
	      row.update(processed_marker)
	      ds.put(row)

	    return checkpointlib.CheckpointSequence(
	        _IsDone, _Record, candidates, limit=limit)

	  return itertools.islice(candidates, limit)


	def _InnocentCLsFromSanityBuildFailures(conn):
	  """Generates Pre-CQ failures attributable to failed sanity builds.

	  Looks up the latest build requests made for _SANITY_BUILD_REASON, narrows
	  them to the failed ones, and emits every forgivable Pre-CQ failure found for
	  each of them.

	  Args:
	    conn: The CIDBConnection to use.

	  Yields:
	    Tuples of (change, pre-cq build id) to forgive.
	  """
	  latest_requests = conn.GetLatestBuildRequestsForReason(_SANITY_BUILD_REASON)
	  failed_runs = _FailedSanityRuns(conn, latest_requests)
	  for failed_run in failed_runs:
	    forgivable = _InnocentPreCQsFromSanity(conn, failed_run)
	    for cl, precq_build_id in forgivable:
	      yield cl, precq_build_id


	def _FailedSanityRuns(conn, sanity_runs):
	  """Filters the latest sanity runs to only the failed sanity runs.

	  This is a generator: the status queries and the metric update only happen
	  once the result is first iterated.

	  Args:
	    conn: The CIDBConnection to use.
	    sanity_runs: An iterable of sanity builds (has a .build_id).

	  Yields:
	    The sanity runs whose build status is BUILDER_STATUS_FAILED, in their
	    original order.
	  """
	  # Materialize the input: it is measured with len() and walked twice below,
	  # which a one-shot iterator would not survive.
	  sanity_runs = list(sanity_runs)
	  if not sanity_runs:
	    # Nothing to query. Report zero failures and avoid requesting a process
	    # pool with zero processes.
	    _SANITY_FAILURES_EXAMINED.set(0)
	    return

	  # TODO(phobbs) add a JOIN buildTable in GetLatestBuildRequestsForReason
	  # to avoid creating / destroying a bunch of cidb connections.
	  statuses = parallel.RunTasksInProcessPool(
	      conn.GetBuildStatus,
	      [[run.build_id] for run in sanity_runs],
	      processes=len(sanity_runs))

	  faileds = [status['status'] == constants.BUILDER_STATUS_FAILED
	             for status in statuses]

	  _SANITY_FAILURES_EXAMINED.set(sum(map(int, faileds)))

	  # Built-in zip() works on both Python 2 and 3; itertools.izip() exists only
	  # on Python 2.
	  for sanity_run, failed in zip(sanity_runs, faileds):
	    if failed:
	      # TODO(crbug.com/820379) yield build ids from here.
	      yield sanity_run


	def _InnocentPreCQsFromSanity(conn, sanity_failure):
	  """Generates forgivable Pre-CQ failures around one sanity failure.

	  Fetches the build history of the sanity failure's build config for the
	  dates spanning _DAYS_LOWER_LIMIT days before to _DAYS_UPPER_LIMIT days after
	  the failure's timestamp. For every build that has at least one action with
	  a failed status, the change described by the build's first action is
	  emitted.

	  Args:
	    conn: A CIDBConnection
	    sanity_failure: A BuildRequest corresponding to a sanity build failure.

	  Yields:
	    (GerritPatchTuple, build_id) corresponding to pre-cq failures to forgive
	  """
	  failure_time = sanity_failure.timestamp
	  window_start = failure_time - datetime.timedelta(days=_DAYS_LOWER_LIMIT)
	  window_end = failure_time + datetime.timedelta(days=_DAYS_UPPER_LIMIT)

	  history = conn.GetBuildHistory(
	      sanity_failure.request_build_config,
	      cidb.CIDBConnection.NUM_RESULTS_NO_LIMIT,
	      start_date=window_start.date(),
	      end_date=window_end.date())

	  for build in history:
	    build_actions = conn.GetActionsForBuild(build['id'])
	    # Builds without any recorded action, or without a failed one, are not
	    # candidates.
	    if build_actions and any(
	        entry.status == constants.BUILDER_STATUS_FAILED
	        for entry in build_actions):
	      first = build_actions[0]
	      is_internal = first.change_source == 'internal'
	      patch = clactions.GerritPatchTuple(
	          first.change_number, first.patch_number, is_internal)
	      # TODO(crbug.com/820379) yield build ids from here.
	      yield patch, first.build_id


	def _AlreadyForgiven(ds, change, build_id):
	  """Tells whether this change was already processed for the Pre-CQ build.

	  Args:
	    ds: The cloud Datastore client.
	    change: A GerritPatchTuple instance.
	    build_id: The CIDB build id for the pre-cq run.

	  Returns:
	    False when the build has no checkpoint entity; otherwise the value stored
	    on the entity under the change's gerrit number (False if absent).
	  """
	  record = ds.get(key=ds.key(PRECQ_PROCESSED_KEY, build_id))
	  if record is None:
	    return False
	  # The entity holds one property per CL processed for the build, keyed by
	  # gerrit number. Treating gerrit_number:change as one-to-one is technically
	  # inaccurate, but extremely unlikely to affect the behavior.
	  return record.get(str(change.gerrit_number), False)


	def _ShouldExonerate(conn, change, build_id):
	  """Decides whether the patch should be exonerated.

	  A patch is exonerated only if no newer Pre-CQ run picked it up and it has
	  not been exonerated before.

	  Args:
	    conn: The cidb.CIDBConnection
	    change: A GerritPatchTuple to possibly exonerate
	    build_id: The threshold build_id.

	  Returns:
	    True if the patch should be exonerated, False otherwise.
	  """
	  history = conn.GetActionsForChanges([change])
	  if _ExistsNewerPreCQForCL(history, build_id):
	    return False
	  return not _PatchAlreadyExonerated(history, change)


	def _ExistsNewerPreCQForCL(actions, build_id):
	  """Tells whether a newer build of the same config picked up the CL.

	  The build config is taken from the first action belonging to |build_id|.

	  Args:
	    actions: The CLActions for the change
	    build_id: The threshold build_id.

	  Returns:
	    True if some picked-up action with that build config has a build id
	    greater than |build_id|. False otherwise, including when none of the
	    actions belongs to |build_id|.
	  """
	  same_build = [act for act in actions if act.build_id == build_id]
	  if not same_build:
	    return False
	  config = same_build[0].build_config

	  found_newer = False
	  for act in actions:
	    if (act.build_config == config
	        and act.action == constants.CL_ACTION_PICKED_UP):
	      if act.build_id > build_id:
	        found_newer = True
	  return found_newer


	def _PatchAlreadyExonerated(actions, change):
	  """Tells whether this patch has been exonerated before.

	  Exonerating a patch at most once keeps pre-cq from being retried forever on
	  a single patch.
	  TODO: consider retrying more times (2 or 3?) if there is low pre-cq load.

	  Args:
	    actions: The CLActions for the change
	    change: A GerritPatchTuple to find actions for.

	  Returns:
	    True if any action is an exoneration of the change's patch number.
	  """
	  return any(
	      entry.action == constants.CL_ACTION_EXONERATED
	      and entry.patch_number == change.patch_number
	      for entry in actions)