Reland "Generate and upload separate coverage data for each target in a build."

Changed the name of the upload directory for per-target metadata so that
it no longer conflicts with the directory the merged metadata is
uploaded to.
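
As the diff below shows, the two kinds of metadata now land in distinct
locations (the surrounding path components are elided here):

  merged metadata:     .../metadata/
  per-target metadata: .../target_metadata/<target>/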

Previous commit description:

Generate and upload separate coverage data for each target in a build.

This requires us to run llvm-cov on the profdata for each test target.
The code coverage service will aggregate the per-target data to make
reports.
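
Conceptually, the new step boils down to one "llvm-cov export" run per
test binary. A minimal sketch, using hypothetical target names and paths
rather than the recipe's actual values:

  # Illustrative only; the real logic lives in
  # _generate_and_upload_individual_metadata below.
  import subprocess

  for target in ('base_unittests', 'url_unittests'):
    profdata = 'out/Release/profdata/%s/default.profdata' % target
    # "llvm-cov export" emits JSON coverage for one binary/profile pair;
    # each per-target result is uploaded under target_metadata/<target>.
    coverage_json = subprocess.check_output(
        ['llvm-cov', 'export', 'out/Release/' + target,
         '-instr-profile=' + profdata])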

Temporarily skipping this for fuzzers so that the builds don't hit the
20-hour time limit on Swarming.

Bug: 948885
Change-Id: I17794b6a9aac537b54a2c0c723545d73c37b5f68
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/tools/build/+/1610557
Reviewed-by: Yuke Liao <liaoyuke@chromium.org>
Commit-Queue: Sajjad Mirza <sajjadm@chromium.org>
diff --git a/scripts/slave/recipe_modules/clang_coverage/api.py b/scripts/slave/recipe_modules/clang_coverage/api.py
index d4efc23..e7d13fe 100644
--- a/scripts/slave/recipe_modules/clang_coverage/api.py
+++ b/scripts/slave/recipe_modules/clang_coverage/api.py
@@ -50,6 +50,8 @@
     # The location of the scripts used to merge code coverage data (as opposed
     # to test results).
     self._merge_scripts_location = None
+    # Path to a file used to cache the git revision of each source file.
+    self._revision_cache = None
 
   @staticmethod
   def _dir_name_for_step(step_name):
@@ -115,6 +117,13 @@
       self._metadata_dir = self.m.path.mkdtemp()
     return self._metadata_dir
 
+  @property
+  def generate_individual_metadata(self):
+    """True if we should generate per-target metadata for this build."""
+    return (not self._is_per_cl_coverage and
+            self.m.buildbucket.build.builder.builder in (
+                'linux-code-coverage', 'linux-code-coverage-dev'))
+
   def _get_source_exclusion_pattern(self):
     if 'exclude_sources' in self.m.properties:
       return _EXCLUDE_SOURCES.get(self.m.properties['exclude_sources'])
@@ -269,6 +278,12 @@
       return
 
     try:
+      if self.generate_individual_metadata:
+        with self.m.step.nest('Generate per-target metadata'):
+          if not self._revision_cache:
+            self._revision_cache = self.m.path.mkstemp()
+          self._generate_and_upload_individual_metadata(tests)
+
       merged_profdata = self._merge_profdata()
       self._surface_merging_errors()
       binaries = self._get_binaries(tests)
@@ -279,14 +294,46 @@
         self.m.python.succeeding_step(
             'skip processing coverage data because no data is found', '')
         return
-
-      self._generate_and_upload_metadata(binaries, merged_profdata)
+      suffix = ' for %d targets' % len(self._profdata_dirs)
+      self._generate_and_upload_metadata(
+          binaries, merged_profdata, step_name_suffix=suffix)
       self._generate_and_upload_html_report_on_trybot(binaries, merged_profdata)
     except self.m.step.StepFailure:
       self.m.step.active_result.presentation.properties[
           'process_coverage_data_failure'] = True
       raise
 
+  def _generate_and_upload_individual_metadata(self, tests):
+    step = self.m.step.active_result
+    for test in tests:
+      # Use the name of the test to look up the profdata file for that step.
+      # This only works for full-codebase coverage. In per-CL coverage the test
+      # name isn't the same as the step name, so the directory won't match.
+      name = test.name
+      # Skip fuzzer targets because they make the build take too long.
+      # TODO(crbug.com/948885) Remove this check when we speed up the build.
+      if name.endswith('fuzzer'):
+        continue
+      if name not in self._profdata_dirs:  # pragma: no cover
+        step.presentation.step_text += (
+            'No profdata dir for test name: %s' % name)
+        continue
+      profdata = self._profdata_dirs[name].join('default.profdata')
+      binaries = self._get_binaries([test])
+      if not binaries:
+        step.presentation.step_text += 'No binaries for test name: %s' % name
+        continue
+      gs_path = self._compose_gs_path_for_coverage_data(
+          'target_metadata/%s' % name)
+      out_dir = self.m.path.mkdtemp()
+      self.m.file.ensure_directory('ensure metadata dir for %s' % name, out_dir)
+      self._generate_and_upload_metadata(
+          binaries,
+          profdata,
+          out_dir,
+          gs_path,
+          step_name_suffix=' for %s' % name)
+
   def _merge_profdata(self):
     """Merges the profdata generated by each step to a single profdata."""
     merged_profdata = self.profdata_dir().join('merged.profdata')
@@ -422,8 +469,9 @@
           additional_merge['script'],
       ])
       new_merge['args'].extend([
-        '--additional-merge-script-args', self.m.json.dumps(
-          additional_merge['args'])])
+          '--additional-merge-script-args',
+          self.m.json.dumps(additional_merge['args'])
+      ])
 
     return new_merge
 
@@ -467,13 +515,20 @@
         stdout=self.m.raw_io.output_text(add_output_log=True))
     return component_mapping
 
-  def _generate_and_upload_metadata(self, binaries, profdata_path):
+  def _generate_and_upload_metadata(self,
+                                    binaries,
+                                    profdata_path,
+                                    output_dir=None,
+                                    gs_path=None,
+                                    step_name_suffix=''):
     """Generates the coverage info in metadata format."""
+    output_dir = output_dir or self.metadata_dir
+    gs_path = gs_path or self._compose_gs_path_for_coverage_data('metadata')
     args = [
         '--src-path',
         self.m.path['checkout'],
         '--output-dir',
-        self.metadata_dir,
+        output_dir,
         '--profdata-path',
         profdata_path,
         '--llvm-cov',
@@ -481,6 +536,8 @@
         '--binaries',
     ]
     args.extend(binaries)
+    if self._revision_cache:
+      args.extend(['--revision-cache-path', self._revision_cache])
     if self._is_per_cl_coverage:
       args.append('--sources')
       args.extend([
@@ -495,7 +552,7 @@
           self._affected_source_files)
       args.extend([
           '--diff-mapping-path',
-          self.metadata_dir.join(_BOT_TO_GERRIT_LINE_NUM_MAPPING_FILE_NAME)
+          output_dir.join(_BOT_TO_GERRIT_LINE_NUM_MAPPING_FILE_NAME)
       ])
     else:
       pattern = self._get_source_exclusion_pattern()
@@ -508,20 +565,19 @@
 
     try:
       self.m.python(
-          'generate metadata for %d targets' % len(self._profdata_dirs),
+          'generate metadata' + step_name_suffix,
           self.resource('generate_coverage_metadata.py'),
           args=args,
           venv=True)
     finally:
-      gs_path = self._compose_gs_path_for_coverage_data('metadata')
       upload_step = self.m.gsutil.upload(
-          self.metadata_dir,
+          output_dir,
           _BUCKET_NAME,
           gs_path,
           link_name=None,
           args=['-r'],
           multithreaded=True,
-          name='upload metadata')
+          name='upload metadata' + step_name_suffix)
       upload_step.presentation.links['metadata report'] = (
           'https://storage.cloud.google.com/%s/%s/index.html' % (_BUCKET_NAME,
                                                                  gs_path))
diff --git a/scripts/slave/recipe_modules/clang_coverage/resources/generate_coverage_metadata.py b/scripts/slave/recipe_modules/clang_coverage/resources/generate_coverage_metadata.py
index 796e011..53a7468 100644
--- a/scripts/slave/recipe_modules/clang_coverage/resources/generate_coverage_metadata.py
+++ b/scripts/slave/recipe_modules/clang_coverage/resources/generate_coverage_metadata.py
@@ -632,9 +632,62 @@
   return compressed_data
 
 
-def _generate_metadata(src_path, output_dir, profdata_path, llvm_cov_path,
-                       binaries, component_mapping, sources, diff_mapping,
-                       exclusions):
+class RevisionCache(dict):
+  """Dict-like class for storing git metadata for a file.
+
+  Can be used in a with statement to automatically load and save contents to
+  the backing file.
+
+  Since this script is invoked once for each target in a coverage build,
+  recomputing the git revision of a file that appears in multiple targets adds
+  significant overhead. This class persists that information across
+  invocations of the script, which shortens the overall build.
+
+  Args:
+    cache_file: Path to a file in which to save revision data. If cache_file
+                is falsy, RevisionCache won't try to load or save data.
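+
+  Example (illustrative usage; the path and values are hypothetical):
+    with RevisionCache('/tmp/revision_cache.json') as cache:
+      if '//base/foo.cc' not in cache:
+        # Values are (revision, timestamp) tuples.
+        cache['//base/foo.cc'] = ('deadbeef', 1234567890)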
+  """
+
+  def __init__(self, cache_file):
+    super(RevisionCache, self).__init__()
+    self._cache_file = cache_file
+
+  def __enter__(self):
+    self.Load()
+    return self
+
+  def __exit__(self, exc_type, exc_value, traceback):
+    self.Save()
+
+  def Load(self):
+    if self._cache_file:
+      try:
+        with open(self._cache_file) as f:
+          self.update(json.load(f))
+      except (IOError, ValueError):
+        logging.warning('Revision cache invalid, starting with empty cache')
+
+  def Save(self):
+    if self._cache_file:
+      try:
+        with open(self._cache_file, 'w') as f:
+          json.dump(self, f)
+      except IOError:
+        logging.warning('Unable to write to revision cache')
+        raise
+
+
+def _generate_metadata(src_path,
+                       output_dir,
+                       profdata_path,
+                       llvm_cov_path,
+                       binaries,
+                       component_mapping,
+                       sources,
+                       diff_mapping,
+                       exclusions,
+                       file_git_metadata=None):
   """Generates code coverage metadata.
 
   Args:
@@ -651,6 +704,9 @@
     diff_mapping: A json object that stores the diff mapping. Only meaningful to
                   per-cl coverage.
     exclusions: A regex string to exclude matches from aggregation.
+    file_git_metadata: A dict-like object that maps file path to a tuple of
+                       (revision, timestamp). Only meaningful to full-repo
+                       coverage.
 
   Returns:
     None. This method doesn't return anything, instead, it writes the produced
@@ -665,23 +721,33 @@
       'Generating & loading coverage metadata with "llvm-cov export" '
       'took %.0f minutes', minutes)
 
-  file_git_metadata = {}
   if not diff_mapping:
     logging.info('Retrieving file git metadata...')
     start_time = time.time()
-    all_files = []
+    cache_hit_count = 0
+    if file_git_metadata is None:
+      file_git_metadata = {}
+    else:
+      logging.info('Starting with %d entries in cache', len(file_git_metadata))
+    files_to_check = []
     for datum in data['data']:
       for file_data in datum['files']:
         filename = file_data['filename']
         src_file = os.path.relpath(filename, src_path)
         if not src_file.startswith('//'):
           src_file = '//' + src_file  # Prefix the file path with '//'.
-        all_files.append(src_file)
-    file_git_metadata = repository_util.GetFileRevisions(
-        src_path, 'DEPS', all_files)
+        if src_file in file_git_metadata:
+          cache_hit_count += 1
+        else:
+          files_to_check.append(src_file)
+    new_git_metadata = repository_util.GetFileRevisions(src_path, 'DEPS',
+                                                        files_to_check)
+    file_git_metadata.update(new_git_metadata)
     minutes = (time.time() - start_time) / 60
     logging.info('Retrieving git metadata for %d files took %.0f minutes',
-                 len(all_files), minutes)
+                 len(files_to_check), minutes)
+    if cache_hit_count:
+      logging.info('Found %d cached file revisions', cache_hit_count)
 
   logging.info('Processing coverage data ...')
   start_time = time.time()
@@ -700,7 +766,9 @@
         file_path = '//' + file_path  # Prefix the file path with '//'.
         record['path'] = file_path
 
-      git_metadata = file_git_metadata.get(file_path)
+      git_metadata = None
+      if file_git_metadata:
+        git_metadata = file_git_metadata.get(file_path)
       if git_metadata:
         record['revision'] = git_metadata[0]
         record['timestamp'] = git_metadata[1]
@@ -787,6 +855,10 @@
       '--exclusion-pattern',
       type=str,
       help='regex pattern for sources to exclude from aggregation')
+  parser.add_argument(
+      '--revision-cache-path',
+      type=str,
+      help='file containing git revisions for each source file')
   return parser.parse_args(args=args)
 
 
@@ -832,10 +904,12 @@
       'Either component_mapping (for full-repo coverage) or diff_mapping '
       '(for per-cl coverage) must be specified.')
 
-  compressed_data = _generate_metadata(
-      params.src_path, params.output_dir, params.profdata_path, params.llvm_cov,
-      params.binaries, component_mapping, abs_sources, diff_mapping,
-      params.exclusion_pattern)
+  # The revision cache is only used in full-codebase coverage.
+  with RevisionCache(params.revision_cache_path) as rev_cache:
+    compressed_data = _generate_metadata(
+        params.src_path, params.output_dir, params.profdata_path,
+        params.llvm_cov, params.binaries, component_mapping, abs_sources,
+        diff_mapping, params.exclusion_pattern, rev_cache)
 
   with open(os.path.join(params.output_dir, 'all.json.gz'), 'w') as f:
     f.write(zlib.compress(json.dumps(compressed_data)))
diff --git a/scripts/slave/recipe_modules/clang_coverage/tests/full.py b/scripts/slave/recipe_modules/clang_coverage/tests/full.py
index 5399e09..50d8b96 100644
--- a/scripts/slave/recipe_modules/clang_coverage/tests/full.py
+++ b/scripts/slave/recipe_modules/clang_coverage/tests/full.py
@@ -4,7 +4,6 @@
 
 from recipe_engine import post_process
 
-
 DEPS = [
     'chromium_tests',
     'clang_coverage',
@@ -16,7 +15,6 @@
     'recipe_engine/step',
 ]
 
-
 # Number of test targets. Needed by the test expectations below.
 _NUM_TARGETS = 7
 
@@ -41,12 +39,14 @@
       api.chromium_tests.steps.SwarmingGTestTest('gl_unittests_ozone'),
       api.chromium_tests.steps.SwarmingIsolatedScriptTest('abc_fuzzer'),
       api.chromium_tests.steps.SwarmingIsolatedScriptTest(
-          'blink_web_tests', merge={
-              'script': api.path['start_dir'].join(
-                  'coverage', 'tests', 'merge_blink_web_tests.py'),
+          'blink_web_tests',
+          merge={
+              'script':
+                  api.path['start_dir'].join('coverage', 'tests',
+                                             'merge_blink_web_tests.py'),
               'args': ['random', 'args'],
           })
-    ]
+  ]
   assert _NUM_TARGETS == len(tests)
 
   for test in tests:
@@ -54,8 +54,8 @@
     api.clang_coverage.profdata_dir(step)
     # Protected access ok here, as this is normally done by the test object
     # itself.
-    api.clang_coverage.shard_merge(step, additional_merge=getattr(
-        test, '_merge', None))
+    api.clang_coverage.shard_merge(
+        step, additional_merge=getattr(test, '_merge', None))
 
   api.clang_coverage.process_coverage_data(tests)
 
@@ -64,6 +64,7 @@
   _ = api.clang_coverage.raw_profile_merge_script
 
 
+# yapf: disable
 def GenTests(api):
   yield (
       api.test('basic')
@@ -89,7 +90,8 @@
           post_process.MustRun,
           'generate metadata for %s targets' % _NUM_TARGETS)
       + api.post_process(
-          post_process.MustRun, 'gsutil upload metadata')
+          post_process.MustRun,
+          'gsutil upload metadata for %s targets' % _NUM_TARGETS)
       + api.post_process(
           post_process.DoesNotRun,
           'generate html report for %s targets' % _NUM_TARGETS)
@@ -130,7 +132,8 @@
           post_process.MustRun,
           'generate metadata for %s targets' % _NUM_TARGETS)
       + api.post_process(
-          post_process.MustRun, 'gsutil upload metadata')
+          post_process.MustRun,
+          'gsutil upload metadata for %s targets' % _NUM_TARGETS)
       + api.post_process(
           post_process.DoesNotRun,
           'generate html report for %s targets' % _NUM_TARGETS)
@@ -187,7 +190,8 @@
           post_process.MustRun,
           'generate metadata for %s targets' % _NUM_TARGETS)
       + api.post_process(
-          post_process.MustRun, 'gsutil upload metadata')
+          post_process.MustRun,
+          'gsutil upload metadata for %s targets' % _NUM_TARGETS)
       + api.post_process(post_process.StatusSuccess)
       + api.post_process(post_process.DropExpectation)
   )
@@ -270,7 +274,8 @@
           post_process.MustRun,
           'generate metadata for %s targets' % _NUM_TARGETS)
       + api.post_process(
-          post_process.MustRun, 'gsutil upload metadata')
+          post_process.MustRun,
+          'gsutil upload metadata for %s targets' % _NUM_TARGETS)
       + api.post_process(
           post_process.DoesNotRun,
           'generate html report for %s targets' % _NUM_TARGETS)
@@ -283,7 +288,7 @@
           'generate metadata for %s targets' % _NUM_TARGETS, retcode=1)
       + api.post_check(
           lambda check, steps:
-          check(steps['gsutil upload metadata']
+          check(steps['gsutil upload metadata for %s targets' % _NUM_TARGETS]
                 .output_properties['process_coverage_data_failure'] == 'true'))
       + api.post_process(post_process.StatusFailure)
       + api.post_process(post_process.DropExpectation)