# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Updates the Git Cache zip files."""
import re
from recipe_engine import post_process
from PB.recipe_engine import result as result_pb
from PB.go.chromium.org.luci.buildbucket.proto import common as bb_common_pb
from PB.recipes.infra import git_cache_updater as git_cache_updater_pb
PYTHON_VERSION_COMPATIBILITY = "PY2+3"
DEPS = [
'recipe_engine/buildbucket',
'recipe_engine/context',
'recipe_engine/file',
'recipe_engine/raw_io',
'recipe_engine/futures',
'recipe_engine/path',
'recipe_engine/properties',
'recipe_engine/runtime',
'recipe_engine/step',
'recipe_engine/url',
'depot_tools/depot_tools',
'depot_tools/git',
]

PROPERTIES = git_cache_updater_pb.Inputs
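
# Per-repo result codes returned by _do_update_bootstrap.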
OK, EMPTY = range(2)
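
# How many repos are processed at once, enforced by the bounded semaphore in
# RunSteps.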
CONCURRENT_STEPS = 1


def _list_host_repos(api, host_url):
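  """Lists the repos hosted at `host_url`.

  Fetches the host's `/?format=TEXT` index and returns the repo names it
  contains, one entry per repo (see TEST_REPOS for the expected shape).
  """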
host_url = host_url.rstrip('/')
with api.depot_tools.on_path():
output = api.url.get_text('%s/?format=TEXT' % host_url,
default_test_data=TEST_REPOS).output
return output.strip().splitlines()


def _repos_to_urls(host_url, repos):
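  """Maps bare repo names onto full URLs under `host_url`."""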
host_url = host_url.rstrip('/')
  return ['%s/%s' % (host_url, repo) for repo in repos]


class _InvalidInput(Exception):
  """Raised when the recipe's input properties are invalid."""


def _get_repo_urls(api, inputs):
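  """Returns the list of repo URLs whose caches should be updated.

  Uses `git_host` when set: lists every repo on the host and drops the ones
  matching an `exclude_repos` regexp. Otherwise falls back to the explicit
  `repo_urls`. Raises _InvalidInput when neither is provided or an exclude
  regexp does not compile.
  """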
if inputs.git_host.host:
assert not inputs.repo_urls, 'only 1 of (git_host, repo_urls) allowed'
repos = _list_host_repos(api, 'https://' + inputs.git_host.host)
if inputs.git_host.exclude_repos:
exclude_regexps = []
for i, r in enumerate(inputs.git_host.exclude_repos):
try:
exclude_regexps.append(re.compile('^' + r + '$', re.IGNORECASE))
except Exception as e:
raise _InvalidInput(
'invalid regular expression[%d] %r: %s' % (i, r, e))
repos = [repo for repo in repos
if all(not r.match(repo) for r in exclude_regexps)]
return _repos_to_urls('https://' + inputs.git_host.host, repos)
if inputs.repo_urls:
return list(inputs.repo_urls)
raise _InvalidInput('repo_urls or git_host.host must be provided')


def _do_update_bootstrap(api, url, work_dir, gc_aggressive):
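  """Refreshes the cached bootstrap archive for a single repo URL.

  Populates the local git cache for `url` under `work_dir`, measures the
  repo's size, and then runs `git_cache.py update-bootstrap` with a memory
  cost proportional to that size. Returns OK on success, or EMPTY for repos
  that contain no objects (those are skipped).
  """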
opts = [
'--cache-dir', work_dir,
'--verbose',
url,
]
with api.step.nest(url) as summary:
api.step(
name='populate',
cmd=[
'git_cache.py',
'populate',
'--reset-fetch-config',
# By default, "refs/heads/*" are checked out by
# git_cache. However, for heavy branching repos,
# 'refs/branch-heads/*' is also very useful (crbug/942169).
# This is a noop for repos without refs/branch-heads.
'--ref',
'refs/branch-heads/*',
# By default, any tags that point to objects we fetch
# from remote are also fetched. This ensures ALL tags are
# downloaded from remote.
# This is needed by chromeos builders.
'--ref',
'refs/tags/*',
'--break-locks',
] + opts,
cost=api.step.ResourceCost(disk=20))
repo_path = api.path.abs_to_path(
api.step(
name='lookup repo_path',
cmd=['git_cache.py', 'exists'] + opts,
stdout=api.raw_io.output(),
        step_test_data=lambda: api.raw_io.test_api.stream_output(
            str(api.path.join(work_dir, url[len('https://'):])) + '\n'),
).stdout.decode('utf-8').strip())
with api.context(cwd=repo_path):
stats = api.git.count_objects(
raise_on_failure=True,
# TODO(iannucci): ugh, the test mock for this is horrendous.
# 1) it should default to something automatically
# 2) test_api.count_objects_output should return a TestData, not
# a string.
step_test_data=lambda: api.raw_io.test_api.stream_output(
api.git.test_api.count_objects_output(10)))
      # Scale the memory cost of this update by the square of the repo's size
      # (loose objects plus packfiles). This is an arbitrary scaling factor,
      # but it lets multiple small repos run in parallel while large repos
      # (e.g. chromium) effectively get exclusive use of the machine's memory.
mem_cost = 4 * int((stats['size'] + stats['size-pack']) ** 2)
if mem_cost == 0:
# some repos can be empty (e.g. they're an "ACL-only" repo), and
# update-bootstrap doesn't like that, so skip them.
api.step('repo is empty; skipping update', cmd=None)
summary.step_text = "[empty]"
summary.status = api.step.FAILURE # TODO(iannucci): warning
return EMPTY
gc_aggressive_opt = []
if gc_aggressive:
gc_aggressive_opt = ['--gc-aggressive']
api.step(
name='update bootstrap',
cmd=[
'git_cache.py', 'update-bootstrap',
'--skip-populate', '--prune',
] + opts + gc_aggressive_opt,
cost=api.step.ResourceCost(
cpu=api.step.CPU_CORE*2,
memory=mem_cost,
net=10,
))
summary.step_text = "[ok]"
return OK


def RunSteps(api, inputs):
try:
repo_urls = _get_repo_urls(api, inputs)
except _InvalidInput as e:
return result_pb.RawResult(
status=bb_common_pb.FAILURE, summary_markdown=str(e))
  work_dir = api.path.cache_dir / 'builder' / 'w'
api.file.ensure_directory('ensure work_dir', work_dir)
env = {
# Turn off the low speed limit, since checkout will be long.
'GIT_HTTP_LOW_SPEED_LIMIT': '0',
'GIT_HTTP_LOW_SPEED_TIME': '0',
# Ensure git-number tool can be used.
'CHROME_HEADLESS': '1',
}
if api.runtime.is_experimental:
assert inputs.override_bucket, 'override_bucket required for experiments'
if inputs.override_bucket:
env['OVERRIDE_BOOTSTRAP_BUCKET'] = inputs.override_bucket
work = []
sem = api.futures.make_bounded_semaphore(CONCURRENT_STEPS)
def fn(sem, api, url, work_dir, gc_aggressive):
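    # Hold the bounded semaphore so at most CONCURRENT_STEPS repos are
    # updated at any one time.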
with sem:
return _do_update_bootstrap(api, url, work_dir, gc_aggressive)
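
  # Spawn one future per repo URL; each future updates a single cache entry
  # via _do_update_bootstrap.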
with api.context(env=env), api.depot_tools.on_path():
for url in sorted(repo_urls):
work.append(
api.futures.spawn_immediate(
fn, sem, api, url, work_dir, inputs.gc_aggressive, __name=url))
total = len(work)
success = warning = 0
failed_repos = []
empties = 0
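  # Tally per-repo results as the futures complete. A failed repo marks the
  # build as FAILURE but does not stop the remaining updates.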
for future in api.futures.iwait(work):
try:
status = future.result()
except Exception: # pylint: disable=broad-except
failed_repos.append(future.name)
continue
if status == OK:
success += 1
elif status == EMPTY:
empties += 1
warning += 1
else:
assert False, 'unknown status %r' % (status,) # pragma: no cover
status = bb_common_pb.FAILURE if failed_repos else bb_common_pb.SUCCESS
summary = 'Updated cache for %d/%d repos.' % (success, total)
if warning:
summary += '\n\nEncountered warnings for %d repos:\n' % (warning,)
if empties:
summary += '\n * empty (repo has no objects): %d' % (empties,)
if failed_repos:
summary += '\n\nEncountered failures for %d repos:\n' % (len(failed_repos),)
for repo_name in failed_repos:
summary += '\n * ' + repo_name
return result_pb.RawResult(status=status, summary_markdown=summary)
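

# Default test data for the `/?format=TEXT` host listing used by
# _list_host_repos.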
TEST_REPOS = """
All-Projects
All-Users
apps
chromium/src
foo/bar
"""
def GenTests(api):
yield (
api.test('needs input')
+ api.expect_status('FAILURE')
+ api.post_process(post_process.StatusFailure)
+ api.post_process(post_process.DropExpectation)
)
yield (api.test('one-repo-experiment-aggressive') +
api.runtime(is_experimental=True) + api.properties(
git_cache_updater_pb.Inputs(
override_bucket='experimental-gs-bucket',
repo_urls=['https://chromium.googlesource.com/v8/v8'],
gc_aggressive=True,
)))
yield (api.test('one-repo-empty') + api.runtime(is_experimental=True) +
api.properties(
git_cache_updater_pb.Inputs(
override_bucket='experimental-gs-bucket',
repo_urls=['https://chromium.googlesource.com/empty'],
gc_aggressive=True,
)) + api.override_step_data(
'https://chromium.googlesource.com/empty.git count-objects',
api.raw_io.stream_output(api.git.count_objects_output(0)),
))
  yield (api.test('one-repo-fail') + api.runtime(is_experimental=True) +
         api.properties(
             git_cache_updater_pb.Inputs(
                 override_bucket='experimental-gs-bucket',
                 repo_urls=['https://chromium.googlesource.com/fail'],
                 gc_aggressive=True,
             )) + api.override_step_data(
                 'https://chromium.googlesource.com/fail.populate',
                 retcode=1,
             ) + api.expect_status('FAILURE'))
yield (
api.test('host-with-exclusions')
+ api.properties(git_cache_updater_pb.Inputs(
git_host=git_cache_updater_pb.Inputs.GitHost(
host='chromium.googlesource.com',
exclude_repos=[
'foo/.+',
'all-projects',
'all-users',
],
),
))
)
yield (
api.test('host-with-incorrect-regexp-exclude')
+ api.properties(git_cache_updater_pb.Inputs(
git_host=git_cache_updater_pb.Inputs.GitHost(
host='chromium.googlesource.com',
exclude_repos=[
'?.\\',
],
),
))
+ api.expect_status('FAILURE')
+ api.post_process(post_process.StatusFailure)
+ api.post_process(post_process.DropExpectation)
)