[code coverage] Moving merge scripts src-side.

This is the first part of moving the code coverage merge scripts to src.
A recipe-side change will follow once this has landed.

R=martiniss,liaoyuke,sajjadm
BUG=928577

Change-Id: I213f13651a4a6440ef3fbd9e813ff7371de32ec9
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1521465
Commit-Queue: Roberto Carrillo <robertocn@chromium.org>
Reviewed-by: Stephen Martinis <martiniss@chromium.org>
Reviewed-by: Yuke Liao <liaoyuke@chromium.org>
Cr-Commit-Position: refs/heads/master@{#640613}
diff --git a/testing/merge_scripts/code_coverage/OWNERS b/testing/merge_scripts/code_coverage/OWNERS
new file mode 100644
index 0000000..c3d0dbb
--- /dev/null
+++ b/testing/merge_scripts/code_coverage/OWNERS
@@ -0,0 +1,3 @@
+liaoyuke@chromium.org
+robertocn@chromium.org
+sajjadm@chromium.org
diff --git a/testing/merge_scripts/code_coverage/merge_lib.py b/testing/merge_scripts/code_coverage/merge_lib.py
new file mode 100644
index 0000000..8e40f44
--- /dev/null
+++ b/testing/merge_scripts/code_coverage/merge_lib.py
@@ -0,0 +1,199 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+"""Functions for interacting with llvm-profdata"""
+
+import logging
+import multiprocessing
+import os
+import subprocess
+
+logging.basicConfig(
+    format='[%(asctime)s %(levelname)s] %(message)s', level=logging.DEBUG)
+
+
+def _call_profdata_tool(profile_input_file_paths,
+                        profile_output_file_path,
+                        profdata_tool_path,
+                        retries=3):
+  """Calls the llvm-profdata tool.
+
+  Args:
+    profile_input_file_paths: A list of relative paths to the files that
+        are to be merged.
+    profile_output_file_path: The path to the merged file to write.
+    profdata_tool_path: The path to the llvm-profdata executable.
+    retries: Number of times to retry the merge after excluding profiles
+        that appear to be invalid.
+
+  Returns:
+    A list of paths to profiles that had to be excluded for the merge to
+    succeed; these are suspected of being corrupted or malformed.
+
+  Raises:
+    CalledProcessError: An error occurred merging profiles.
+  """
+  logging.info('Merging profiles.')
+
+  try:
+    subprocess_cmd = [
+        profdata_tool_path, 'merge', '-o', profile_output_file_path,
+        '-sparse=true'
+    ]
+    subprocess_cmd.extend(profile_input_file_paths)
+
+    # Redirecting stderr is required because, when an error happens,
+    # llvm-profdata writes the error output to stderr and our error-handling
+    # logic relies on that output.
+    output = subprocess.check_output(subprocess_cmd, stderr=subprocess.STDOUT)
+    logging.info('Merge succeeded with output: %r', output)
+  except subprocess.CalledProcessError as error:
+    if len(profile_input_file_paths) > 1 and retries >= 0:
+      logging.warning('Merge failed with error output: %r', error.output)
+
+      # The output of the llvm-profdata command will include the path of
+      # malformed files, such as
+      # `error: /.../default.profraw: Malformed instrumentation profile data`
+      invalid_profiles = [
+          f for f in profile_input_file_paths if f in error.output
+      ]
+
+      if not invalid_profiles:
+        logging.info(
+            'Merge failed, but the invalid profiles could not be determined '
+            'from the output, so skipping retry and bailing out.')
+        raise error
+
+      valid_profiles = list(
+          set(profile_input_file_paths) - set(invalid_profiles))
+      if valid_profiles:
+        logging.warning(
+            'The following invalid profiles were removed because they were '
+            'mentioned in the merge error output: %r', invalid_profiles)
+        logging.info('Retry merging with the remaining profiles: %r',
+                     valid_profiles)
+        return invalid_profiles + _call_profdata_tool(
+            valid_profiles, profile_output_file_path, profdata_tool_path,
+            retries - 1)
+
+    logging.error('Failed to merge profiles, return code (%d), output: %r',
+                  error.returncode, error.output)
+    raise error
+
+  logging.info('Profile data was created as %r.', profile_output_file_path)
+  return []
+
+
+def _get_profile_paths(input_dir, input_extension):
+  """Finds all the profiles in the given directory (recursively)."""
+  paths = []
+  for dir_path, _sub_dirs, file_names in os.walk(input_dir):
+    paths.extend([
+        os.path.join(dir_path, fn)
+        for fn in file_names
+        if fn.endswith(input_extension)
+    ])
+  return paths
+
+
+def _validate_and_convert_profraws(profraw_files, profdata_tool_path):
+  """Validates and converts profraws to profdatas.
+
+  For each .profraw file in the input, this method validates it by trying to
+  convert it to an indexed .profdata file. If the validation and conversion
+  succeed, the generated .profdata file is included in the output; otherwise
+  it is not.
+
+  This method is mainly used to filter out invalid profraw files.
+
+  Args:
+    profraw_files: A list of .profraw paths.
+    profdata_tool_path: The path to the llvm-profdata executable.
+
+  Returns:
+    A tuple of two lists:
+      A list of converted .profdata files from the *valid* .profraw files.
+      A list of the *invalid* .profraw files.
+  """
+  logging.info('Validating and converting .profraw files.')
+
+  for profraw_file in profraw_files:
+    if not profraw_file.endswith('.profraw'):
+      raise RuntimeError('%r is expected to be a .profraw file.' % profraw_file)
+
+  cpu_count = multiprocessing.cpu_count()
+  # Use at least 10 processes, but leave 5 CPU cores free when possible.
+  counts = max(10, cpu_count - 5)
+  pool = multiprocessing.Pool(counts)
+  output_profdata_files = multiprocessing.Manager().list()
+  invalid_profraw_files = multiprocessing.Manager().list()
+
+  for profraw_file in profraw_files:
+    pool.apply_async(_validate_and_convert_profraw,
+                     (profraw_file, output_profdata_files,
+                      invalid_profraw_files, profdata_tool_path))
+
+  pool.close()
+  pool.join()
+
+  # Remove inputs, as they won't be needed and they can be pretty large.
+  for input_file in profraw_files:
+    os.remove(input_file)
+
+  return list(output_profdata_files), list(invalid_profraw_files)
+
+
+def _validate_and_convert_profraw(profraw_file, output_profdata_files,
+                                  invalid_profraw_files, profdata_tool_path):
+  output_profdata_file = profraw_file.replace('.profraw', '.profdata')
+  subprocess_cmd = [
+      profdata_tool_path, 'merge', '-o', output_profdata_file, '-sparse=true',
+      profraw_file
+  ]
+
+  try:
+    # Redirecting stderr is required because, when an error happens,
+    # llvm-profdata writes the error output to stderr and our error-handling
+    # logic relies on that output.
+    output = subprocess.check_output(subprocess_cmd, stderr=subprocess.STDOUT)
+    logging.info('Validating and converting %r to %r succeeded with output: %r',
+                 profraw_file, output_profdata_file, output)
+    output_profdata_files.append(output_profdata_file)
+  except subprocess.CalledProcessError as error:
+    logging.warning('Validating and converting %r to %r failed with output: %r',
+                    profraw_file, output_profdata_file, error.output)
+    invalid_profraw_files.append(profraw_file)
+
+
+def merge_profiles(input_dir, output_file, input_extension, profdata_tool_path):
+  """Merges the profiles produced by the shards using llvm-profdata.
+
+  Args:
+    input_dir (str): The path to traverse to find input profiles.
+    output_file (str): Where to write the merged profile.
+    input_extension (str): File extension to look for in the input_dir,
+        e.g. '.profdata' or '.profraw'.
+    profdata_tool_path (str): The path to the llvm-profdata executable.
+
+  Returns:
+    The list of profiles that had to be excluded to get the merge to
+    succeed.
+  """
+  profile_input_file_paths = _get_profile_paths(input_dir, input_extension)
+  invalid_profraw_files = []
+  if input_extension == '.profraw':
+    profile_input_file_paths, invalid_profraw_files = (
+        _validate_and_convert_profraws(profile_input_file_paths,
+                                       profdata_tool_path))
+    logging.info('List of converted .profdata files: %r',
+                 profile_input_file_paths)
+    logging.info('List of invalid .profraw files that failed to validate and '
+                 'convert: %r', invalid_profraw_files)
+
+  invalid_profdata_files = _call_profdata_tool(
+      profile_input_file_paths=profile_input_file_paths,
+      profile_output_file_path=output_file,
+      profdata_tool_path=profdata_tool_path)
+
+  # Remove inputs, as they won't be needed and they can be pretty large.
+  for input_file in profile_input_file_paths:
+    os.remove(input_file)
+
+  return invalid_profraw_files + invalid_profdata_files
diff --git a/testing/merge_scripts/code_coverage/merge_profiles.py b/testing/merge_scripts/code_coverage/merge_profiles.py
new file mode 100755
index 0000000..354080a2
--- /dev/null
+++ b/testing/merge_scripts/code_coverage/merge_profiles.py
@@ -0,0 +1,67 @@
+#!/usr/bin/python
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+"""This script merges code coverage profiles from multiple shards.
+
+It is functionally identical to merge_steps.py, but it accepts the parameters
+passed by the swarming API.
+"""
+
+import argparse
+import json
+import logging
+import os
+import sys
+
+import merge_lib as merger
+
+
+def _MergeAPIArgumentParser(*args, **kwargs):
+  """Parameters passed to this merge script, as per:
+  https://chromium.googlesource.com/chromium/tools/build/+/master/scripts/slave/recipe_modules/swarming/resources/merge_api.py
+  """
+  parser = argparse.ArgumentParser(*args, **kwargs)
+  parser.add_argument('--build-properties', help=argparse.SUPPRESS)
+  parser.add_argument('--summary-json', help=argparse.SUPPRESS)
+  parser.add_argument('--task-output-dir', help=argparse.SUPPRESS)
+  parser.add_argument(
+      '-o', '--output-json', required=True, help=argparse.SUPPRESS)
+  parser.add_argument(
+      '--profdata-dir', required=True, help='where to store the merged data')
+  parser.add_argument(
+      '--llvm-profdata', required=True, help='path to llvm-profdata executable')
+  parser.add_argument('jsons_to_merge', nargs='*', help=argparse.SUPPRESS)
+  return parser
+
+
+def main():
+  desc = "Merge profraw files in <--task-output-dir> into a single profdata."
+  parser = _MergeAPIArgumentParser(description=desc)
+  params = parser.parse_args()
+  invalid_profiles = merger.merge_profiles(
+      params.task_output_dir,
+      os.path.join(params.profdata_dir, 'default.profdata'), '.profraw',
+      params.llvm_profdata)
+  if invalid_profiles:
+    with open(os.path.join(params.profdata_dir, 'invalid_profiles.json'),
+              'w') as f:
+      json.dump(invalid_profiles, f)
+
+  # TODO(crbug.com/921300) This script doesn't know how to merge test results;
+  # the correct solution is to accept another merge script as input and
+  # delegate the test-results merge to it.
+  # However, to work around fuzzer test steps showing up as red, the logic
+  # below directly copies the output json when there is only one shard. This
+  # strategy works for test targets that have only one shard, such as fuzzer
+  # targets and simple gtest targets.
+  if len(params.jsons_to_merge) == 1:
+    with open(params.jsons_to_merge[0]) as f_read:
+      with open(params.output_json, 'w') as f_write:
+        f_write.write(f_read.read())
+
+
+if __name__ == '__main__':
+  logging.basicConfig(
+      format='[%(asctime)s %(levelname)s] %(message)s', level=logging.INFO)
+  sys.exit(main())
diff --git a/testing/merge_scripts/code_coverage/merge_profiles_test.py b/testing/merge_scripts/code_coverage/merge_profiles_test.py
new file mode 100755
index 0000000..5cbe5c2
--- /dev/null
+++ b/testing/merge_scripts/code_coverage/merge_profiles_test.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env vpython
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import json
+import os
+import subprocess
+import sys
+import unittest
+
+THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(
+    0, os.path.abspath(os.path.join(THIS_DIR, os.pardir, os.pardir, os.pardir,
+                                    'third_party', 'pymock')))
+
+import mock
+
+import merge_profiles
+import merge_steps
+import merge_lib as merger
+
+
+class MergeProfilesTest(unittest.TestCase):
+
+  def __init__(self, *args, **kwargs):
+    super(MergeProfilesTest, self).__init__(*args, **kwargs)
+    self.maxDiff = None
+
+  def test_merge_script_api_parameters(self):
+    """Test the step-level merge front-end."""
+    build_properties = json.dumps({
+        'some': {
+            'complicated': ['nested', {
+                'json': None,
+                'object': 'thing',
+            }]
+        }
+    })
+    task_output_dir = 'some/task/output/dir'
+    profdata_dir = '/some/different/path/to/profdata'
+    profdata_file = os.path.join(profdata_dir, 'default.profdata')
+    args = [
+        'script_name', '--output-json', 'output.json', '--build-properties',
+        build_properties, '--summary-json', 'summary.json', '--task-output-dir',
+        task_output_dir, '--profdata-dir', profdata_dir, '--llvm-profdata',
+        'llvm-profdata', 'a.json', 'b.json', 'c.json'
+    ]
+    with mock.patch.object(merger, 'merge_profiles') as mock_merge:
+      mock_merge.return_value = None
+      with mock.patch.object(sys, 'argv', args):
+        merge_profiles.main()
+        self.assertEqual(
+            mock_merge.call_args,
+            mock.call(task_output_dir, profdata_file, '.profraw',
+                      'llvm-profdata'))
+
+  def test_merge_steps_parameters(self):
+    """Test the build-level merge front-end."""
+    input_dir = 'some/task/output/dir'
+    output_file = '/some/different/path/to/profdata/merged.profdata'
+    args = [
+        'script_name',
+        '--input-dir',
+        input_dir,
+        '--output-file',
+        output_file,
+        '--llvm-profdata',
+        'llvm-profdata',
+    ]
+    with mock.patch.object(merger, 'merge_profiles') as mock_merge:
+      mock_merge.return_value = None
+      with mock.patch.object(sys, 'argv', args):
+        merge_steps.main()
+        self.assertEqual(
+            mock_merge.call_args,
+            mock.call(input_dir, output_file, '.profdata', 'llvm-profdata'))
+
+  @mock.patch.object(merger, '_validate_and_convert_profraws')
+  def test_merge_profraw(self, mock_validate_and_convert_profraws):
+    mock_input_dir_walk = [
+        ('/b/some/path', ['0', '1', '2', '3'], ['summary.json']),
+        ('/b/some/path/0', [],
+         ['output.json', 'default-1.profraw', 'default-2.profraw']),
+        ('/b/some/path/1', [],
+         ['output.json', 'default-1.profraw', 'default-2.profraw']),
+    ]
+
+    mock_validate_and_convert_profraws.return_value = [
+        '/b/some/path/0/default-1.profdata',
+        '/b/some/path/1/default-2.profdata',
+    ], [
+        '/b/some/path/0/default-2.profraw',
+        '/b/some/path/1/default-1.profraw',
+    ]
+
+    with mock.patch.object(os, 'walk') as mock_walk:
+      with mock.patch.object(os, 'remove'):
+        mock_walk.return_value = mock_input_dir_walk
+        with mock.patch.object(subprocess, 'check_output') as mock_exec_cmd:
+          merger.merge_profiles('/b/some/path', 'output/dir/default.profdata',
+                                '.profraw', 'llvm-profdata')
+          self.assertEqual(
+              mock.call(
+                  [
+                      'llvm-profdata',
+                      'merge',
+                      '-o',
+                      'output/dir/default.profdata',
+                      '-sparse=true',
+                      '/b/some/path/0/default-1.profdata',
+                      '/b/some/path/1/default-2.profdata',
+                  ],
+                  stderr=-2,
+              ), mock_exec_cmd.call_args)
+
+    self.assertTrue(mock_validate_and_convert_profraws.called)
+
+  @mock.patch.object(merger, '_validate_and_convert_profraws')
+  def test_merge_profdata(self, mock_validate_and_convert_profraws):
+    mock_input_dir_walk = [
+        ('/b/some/path', ['base_unittests', 'url_unittests'], ['summary.json']),
+        ('/b/some/path/base_unittests', [], ['output.json',
+                                             'default.profdata']),
+        ('/b/some/path/url_unittests', [], ['output.json', 'default.profdata']),
+    ]
+    with mock.patch.object(os, 'walk') as mock_walk:
+      with mock.patch.object(os, 'remove'):
+        mock_walk.return_value = mock_input_dir_walk
+        with mock.patch.object(subprocess, 'check_output') as mock_exec_cmd:
+          merger.merge_profiles('/b/some/path', 'output/dir/default.profdata',
+                                '.profdata', 'llvm-profdata')
+          self.assertEqual(
+              mock.call(
+                  [
+                      'llvm-profdata',
+                      'merge',
+                      '-o',
+                      'output/dir/default.profdata',
+                      '-sparse=true',
+                      '/b/some/path/base_unittests/default.profdata',
+                      '/b/some/path/url_unittests/default.profdata',
+                  ],
+                  stderr=-2,
+              ), mock_exec_cmd.call_args)
+
+    # The mock method should only apply when merging .profraw files.
+    self.assertFalse(mock_validate_and_convert_profraws.called)
+
+  def test_retry_profdata_merge_failures(self):
+    mock_input_dir_walk = [
+        ('/b/some/path', ['0', '1'], ['summary.json']),
+        ('/b/some/path/0', [],
+         ['output.json', 'default-1.profdata', 'default-2.profdata']),
+        ('/b/some/path/1', [],
+         ['output.json', 'default-1.profdata', 'default-2.profdata']),
+    ]
+    with mock.patch.object(os, 'walk') as mock_walk:
+      with mock.patch.object(os, 'remove'):
+        mock_walk.return_value = mock_input_dir_walk
+        with mock.patch.object(subprocess, 'check_output') as mock_exec_cmd:
+          invalid_profiles_msg = (
+              'error: /b/some/path/0/default-1.profdata: Malformed '
+              'instrumentation profile data.')
+
+          # Fail on the first merge attempt, but succeed on the second.
+          mock_exec_cmd.side_effect = [
+              subprocess.CalledProcessError(
+                  returncode=1, cmd='dummy cmd', output=invalid_profiles_msg),
+              None
+          ]
+
+          merger.merge_profiles('/b/some/path', 'output/dir/default.profdata',
+                                '.profdata', 'llvm-profdata')
+
+          self.assertEqual(2, mock_exec_cmd.call_count)
+
+          # Note that in the second call, /b/some/path/0/default-1.profdata is
+          # excluded!
+          self.assertEqual(
+              mock.call(
+                  [
+                      'llvm-profdata',
+                      'merge',
+                      '-o',
+                      'output/dir/default.profdata',
+                      '-sparse=true',
+                      '/b/some/path/0/default-2.profdata',
+                      '/b/some/path/1/default-1.profdata',
+                      '/b/some/path/1/default-2.profdata',
+                  ],
+                  stderr=-2,
+              ), mock_exec_cmd.call_args)
+
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/testing/merge_scripts/code_coverage/merge_steps.py b/testing/merge_scripts/code_coverage/merge_steps.py
new file mode 100755
index 0000000..c0d5d4e
--- /dev/null
+++ b/testing/merge_scripts/code_coverage/merge_steps.py
@@ -0,0 +1,33 @@
+#!/usr/bin/python
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+"""This script merges code coverage profiles from multiple steps."""
+
+import argparse
+import os
+import sys
+
+import merge_lib as merger
+
+
+def _merge_steps_argument_parser(*args, **kwargs):
+  parser = argparse.ArgumentParser(*args, **kwargs)
+  parser.add_argument('--input-dir', required=True, help=argparse.SUPPRESS)
+  parser.add_argument(
+      '--output-file', required=True, help='where to store the merged data')
+  parser.add_argument(
+      '--llvm-profdata', required=True, help='path to llvm-profdata executable')
+  return parser
+
+
+def main():
+  desc = "Merge profdata files in <--input-dir> into a single profdata."
+  parser = _merge_steps_argument_parser(description=desc)
+  params = parser.parse_args()
+  merger.merge_profiles(params.input_dir, params.output_file, '.profdata',
+                        params.llvm_profdata)
+
+
+if __name__ == '__main__':
+  sys.exit(main())