recipe_engine/autoroll_impl/candidate_algorithm.py - external/github.com/luci/recipes-py - Git at Google

 # Copyright 2017 The LUCI Authors. All rights reserved.
 # Use of this source code is governed under the Apache License, Version 2.0
 # that can be found in the LICENSE file.

 import sys
 import time

 from recipe_engine import package

 from .commit_list import CommitList
 from .roll_candidate import RollCandidate


 def get_commitlists(deps):
   """Returns {'project_id': CommitList} for every repo this recipe repo depends
   on.

   Args:
     context (PackageContext) - The local context for where the repos live, etc.
     deps (dict(project_id, RepoSpec)) - The dependencies to grab.

   Returns dict(project_id, CommitList) where each project_id is one of the repos
     mentioned in the deps.
   """
   return {
     project_id: CommitList.from_repo_spec(repo_spec)
     for project_id, repo_spec in deps.iteritems()
   }


 def find_best_rev(repos):
   """Returns the project_id of the best repo to roll.

   "Best" is determined by "is an interesting commit and moves the least amount
   of commits, globally".

   "Interesting" means "the commit modifies one or more recipe related files",
   and is defined by CommitMetadata.roll_candidate.

   There are two ways that rolling a repo can move commits:

     1) As dependencies. Rolling repo A that depends on (B, C) will take
        a penalty for each commit that B and C need to move in order to be
        compatible with the new A revision.

     2) As dependees. Rolling repo A which is depended on by (B, C) will take
        a penalty for each commit that B and C need to move in order to be
        compatible with the new A revision.

   Each repo is analyzed with these two rules and a score is computed. The score
   may be negative if one of the other repos (e.g. B or C) needs A to catch up
   (say that B already rolled forward, and now it requires that its dependency
   A needs to roll forward too).

   As a tiebreaker, the commit with the lowest commit timestamp will be picked.
   So if two repos cause the same amount of global commit movement, the "older"
   of the two commits will roll first. Clearly this is best-effort as there's no
   meaningful enforcement of timestamps between different repos, but it's usually
   close enough to reality to be helpful. This is done to reflect a realistic
   commit ordering; it doesn't make sense for two independent dependencies to
   move such that a very large time gap develops between them (assuming the
   timestamps are sensible). All that said, the precise timestamp value is not
   necessary for the correctness of the algorithm, since it's only involvement is
   a tiebreaker between otherwise valid choices.

   The repo with the lowest score wins and is returned. Note that rolling this
   repo does NOT guarantee a consistent dependency graph. This is OK, as it means
   that the outer loop will just call this function multiple times to roll each
   best repo until there's a consistent graph.

   Args:
     repos (dict(project_id, CommitList)) - The repos to analyze. This function
       will only read from the dict and CommitLists (it will not modify them).

   Returns (str) - The project_id of the repo to advance next.
   """
   # The project_ids of all the repos that can move
   repo_set = set(repos)

   best_project_id = None
   best_score = ()  # (# commits moved, timestamp)

   for project_id, clist in repos.iteritems():
     assert isinstance(clist, CommitList)
     candidate, movement_score = clist.next_roll_candidate
     if not candidate:
       continue

     unaccounted_repos = set(repo_set)

     # first, determine if rolling this repo will force other repos to move.
     for d_pid, dep in candidate.spec.deps.iteritems():
       unaccounted_repos.discard(d_pid)
       movement_score += repos[d_pid].dist_to(dep.revision)

     # Next, see if any unaccounted_repos depend on this repo.
     for pid in unaccounted_repos:
       movement_score += repos[pid].dist_compatible_with(pid, candidate.revision)

     score = (movement_score, candidate.commit_timestamp)
     if not best_score or score < best_score:
       best_score = score
       best_project_id = project_id

   return best_project_id


 def is_consistent(spec_pb, repos):
   for pid in spec_pb.deps:
     for d_pid, dep in repos[pid].current.spec.deps.iteritems():
       if dep.revision != spec_pb.deps[d_pid].revision:
         return False
   return True


 def _get_roll_candidates_impl(context, package_spec, repos):
   current_pb = package_spec.spec_pb

   ret_good = []
   ret_bad = []

   while True:
     best_project_id = find_best_rev(repos)
     if best_project_id is None:
       # end when there's no best rev to roll
       return ret_good, ret_bad

     rev = repos[best_project_id].advance()
     if not rev:
       return ret_good, ret_bad

     for d_pid, dep in sorted(rev.spec.deps.items()):
       if not repos[d_pid].advance_to(dep.revision):
         return ret_good, ret_bad

     # TODO(iannucci): rationalize what happens if there's a conflict in e.g.
     # branch/url.

     # First, copy all revisions from clists to current_pb. Note that this will
     # accumulate new repos! Because we don't have a quick way to evaluate
     # implicit vs. explicit repo dependencies, we can't algorithmically remove
     # them currently.
     #
     # However, we expect this to be a pretty uncommon case, and removing them by
     # hand should be sufficient for now.
     #
     # If someone really wants to make it automatic, they'll need to make a quick
     # way to scan all DEPS within the current repo to see which packages are
     # actually used. If you have this information, then removing a repo here
     # would be easy (if we don't directly depend on it and none of our
     # immediate dependencies list it, then we can remove it).
     for pid, clist in repos.iteritems():
       current_pb.deps[pid].revision = clist.current.revision

     # Next, copy ONE instance of any new repos
     for pid in current_pb.deps:
       for d_pid, dep in repos[pid].current.spec.deps.iteritems():
         if d_pid not in current_pb.deps:
           current_pb.deps[d_pid].CopyFrom(dep)

     if not is_consistent(current_pb, repos):
       ret_bad.append(RollCandidate(current_pb))
     else:
       ret_good.append(RollCandidate(current_pb))

       # See if this roll introduced any new dependencies we need to worry about
       # going forward.
       new_repos = set(current_pb.deps) - set(repos)
       if new_repos:
         # Going forward we have to account for the new repos. current_pb has all
         # the old and new repos already.
         package_spec = package.PackageSpec.from_package_pb(context, current_pb)
         package_spec.ensure_up_to_date(context)

         # Add any newly discovered repos to our repos set. We don't replace
         # repos because we want to keep the metadata for all already-rolled
         # revisions.
         repos.update(get_commitlists({
           pid: repo_spec
           for pid, repo_spec in package_spec.deps.iteritems()
           if pid in new_repos
         }))


 def get_roll_candidates(context, package_spec):
   """Returns a list of RollCandidate objects.

   Args:
     context (PackageContext)
     package_spec (PackageSpec)

   Returns:
     good_candidates (list(RollCandidate)) - The list of valid roll candidates in
       least-changed to most-changed order.
     bad_candidates (list(RollCandidate)) - The list of invalid (but considered)
       roll candidates.
     repos (dict(project_id, CommitList)) - A repos dictionary suitable for
       invoking RollCandidate.changelist().
   """
   start = time.time()

   # not on py3 so we can't use print(..., flush=True) :(
   sys.stdout.write('finding roll candidates... ')
   sys.stdout.flush()

   repos = get_commitlists(package_spec.deps)
   ret_good, ret_bad = _get_roll_candidates_impl(context, package_spec, repos)

   print('found %d/%d good/bad candidates in %0.2f seconds' % (
     len(ret_good), len(ret_bad), time.time()-start))
   sys.stdout.flush()
   return ret_good, ret_bad, repos
	# Copyright 2017 The LUCI Authors. All rights reserved.
	# Use of this source code is governed under the Apache License, Version 2.0
	# that can be found in the LICENSE file.

	import sys
	import time

	from recipe_engine import package

	from .commit_list import CommitList
	from .roll_candidate import RollCandidate


	def get_commitlists(deps):
	"""Returns {'project_id': CommitList} for every repo this recipe repo depends
	on.

	Args:
	context (PackageContext) - The local context for where the repos live, etc.
	deps (dict(project_id, RepoSpec)) - The dependencies to grab.

	Returns dict(project_id, CommitList) where each project_id is one of the repos
	mentioned in the deps.
	"""
	return {
	project_id: CommitList.from_repo_spec(repo_spec)
	for project_id, repo_spec in deps.iteritems()
	}


	def find_best_rev(repos):
	"""Returns the project_id of the best repo to roll.

	"Best" is determined by "is an interesting commit and moves the least amount
	of commits, globally".

	"Interesting" means "the commit modifies one or more recipe related files",
	and is defined by CommitMetadata.roll_candidate.

	There are two ways that rolling a repo can move commits:

	1) As dependencies. Rolling repo A that depends on (B, C) will take
	a penalty for each commit that B and C need to move in order to be
	compatible with the new A revision.

	2) As dependees. Rolling repo A which is depended on by (B, C) will take
	a penalty for each commit that B and C need to move in order to be
	compatible with the new A revision.

	Each repo is analyzed with these two rules and a score is computed. The score
	may be negative if one of the other repos (e.g. B or C) needs A to catch up
	(say that B already rolled forward, and now it requires that its dependency
	A needs to roll forward too).

	As a tiebreaker, the commit with the lowest commit timestamp will be picked.
	So if two repos cause the same amount of global commit movement, the "older"
	of the two commits will roll first. Clearly this is best-effort as there's no
	meaningful enforcement of timestamps between different repos, but it's usually
	close enough to reality to be helpful. This is done to reflect a realistic
	commit ordering; it doesn't make sense for two independent dependencies to
	move such that a very large time gap develops between them (assuming the
	timestamps are sensible). All that said, the precise timestamp value is not
	necessary for the correctness of the algorithm, since it's only involvement is
	a tiebreaker between otherwise valid choices.

	The repo with the lowest score wins and is returned. Note that rolling this
	repo does NOT guarantee a consistent dependency graph. This is OK, as it means
	that the outer loop will just call this function multiple times to roll each
	best repo until there's a consistent graph.

	Args:
	repos (dict(project_id, CommitList)) - The repos to analyze. This function
	will only read from the dict and CommitLists (it will not modify them).

	Returns (str) - The project_id of the repo to advance next.
	"""
	# The project_ids of all the repos that can move
	repo_set = set(repos)

	best_project_id = None
	best_score = () # (# commits moved, timestamp)

	for project_id, clist in repos.iteritems():
	assert isinstance(clist, CommitList)
	candidate, movement_score = clist.next_roll_candidate
	if not candidate:
	continue

	unaccounted_repos = set(repo_set)

	# first, determine if rolling this repo will force other repos to move.
	for d_pid, dep in candidate.spec.deps.iteritems():
	unaccounted_repos.discard(d_pid)
	movement_score += repos[d_pid].dist_to(dep.revision)

	# Next, see if any unaccounted_repos depend on this repo.
	for pid in unaccounted_repos:
	movement_score += repos[pid].dist_compatible_with(pid, candidate.revision)

	score = (movement_score, candidate.commit_timestamp)
	if not best_score or score < best_score:
	best_score = score
	best_project_id = project_id

	return best_project_id


	def is_consistent(spec_pb, repos):
	for pid in spec_pb.deps:
	for d_pid, dep in repos[pid].current.spec.deps.iteritems():
	if dep.revision != spec_pb.deps[d_pid].revision:
	return False
	return True


	def _get_roll_candidates_impl(context, package_spec, repos):
	current_pb = package_spec.spec_pb

	ret_good = []
	ret_bad = []

	while True:
	best_project_id = find_best_rev(repos)
	if best_project_id is None:
	# end when there's no best rev to roll
	return ret_good, ret_bad

	rev = repos[best_project_id].advance()
	if not rev:
	return ret_good, ret_bad

	for d_pid, dep in sorted(rev.spec.deps.items()):
	if not repos[d_pid].advance_to(dep.revision):
	return ret_good, ret_bad

	# TODO(iannucci): rationalize what happens if there's a conflict in e.g.
	# branch/url.

	# First, copy all revisions from clists to current_pb. Note that this will
	# accumulate new repos! Because we don't have a quick way to evaluate
	# implicit vs. explicit repo dependencies, we can't algorithmically remove
	# them currently.
	#
	# However, we expect this to be a pretty uncommon case, and removing them by
	# hand should be sufficient for now.
	#
	# If someone really wants to make it automatic, they'll need to make a quick
	# way to scan all DEPS within the current repo to see which packages are
	# actually used. If you have this information, then removing a repo here
	# would be easy (if we don't directly depend on it and none of our
	# immediate dependencies list it, then we can remove it).
	for pid, clist in repos.iteritems():
	current_pb.deps[pid].revision = clist.current.revision

	# Next, copy ONE instance of any new repos
	for pid in current_pb.deps:
	for d_pid, dep in repos[pid].current.spec.deps.iteritems():
	if d_pid not in current_pb.deps:
	current_pb.deps[d_pid].CopyFrom(dep)

	if not is_consistent(current_pb, repos):
	ret_bad.append(RollCandidate(current_pb))
	else:
	ret_good.append(RollCandidate(current_pb))

	# See if this roll introduced any new dependencies we need to worry about
	# going forward.
	new_repos = set(current_pb.deps) - set(repos)
	if new_repos:
	# Going forward we have to account for the new repos. current_pb has all
	# the old and new repos already.
	package_spec = package.PackageSpec.from_package_pb(context, current_pb)
	package_spec.ensure_up_to_date(context)

	# Add any newly discovered repos to our repos set. We don't replace
	# repos because we want to keep the metadata for all already-rolled
	# revisions.
	repos.update(get_commitlists({
	pid: repo_spec
	for pid, repo_spec in package_spec.deps.iteritems()
	if pid in new_repos
	}))


	def get_roll_candidates(context, package_spec):
	"""Returns a list of RollCandidate objects.

	Args:
	context (PackageContext)
	package_spec (PackageSpec)

	Returns:
	good_candidates (list(RollCandidate)) - The list of valid roll candidates in
	least-changed to most-changed order.
	bad_candidates (list(RollCandidate)) - The list of invalid (but considered)
	roll candidates.
	repos (dict(project_id, CommitList)) - A repos dictionary suitable for
	invoking RollCandidate.changelist().
	"""
	start = time.time()

	# not on py3 so we can't use print(..., flush=True) :(
	sys.stdout.write('finding roll candidates... ')
	sys.stdout.flush()

	repos = get_commitlists(package_spec.deps)
	ret_good, ret_bad = _get_roll_candidates_impl(context, package_spec, repos)

	print('found %d/%d good/bad candidates in %0.2f seconds' % (
	len(ret_good), len(ret_bad), time.time()-start))
	sys.stdout.flush()
	return ret_good, ret_bad, repos