| #!/usr/bin/env python3 |
| # Copyright 2017 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| """Splits a branch into smaller branches and uploads CLs.""" |
| |
| import collections |
| import dataclasses |
| import hashlib |
| import math |
| import os |
| import re |
| import tempfile |
from typing import List, Set, Tuple, Dict, Any, Optional
| |
| import gclient_utils |
| import git_footers |
| import scm |
| |
| import git_common as git |
| |
| # If a call to `git cl split` will generate more than this number of CLs, the |
| # command will prompt the user to make sure they know what they're doing. Large |
| # numbers of CLs generated by `git cl split` have caused infrastructure issues |
| # in the past. |
| CL_SPLIT_FORCE_LIMIT = 10 |
| |
| # The maximum number of top reviewers to list. `git cl split` may send many CLs |
| # to a single reviewer, so the top reviewers with the most CLs sent to them |
| # will be listed. |
| CL_SPLIT_TOP_REVIEWERS = 5 |
| |
| |
| def Emit(*msg: str): |
| """Wrapper for easier mocking during tests""" |
| print(*msg) |
| |
| |
def EmitWarning(*msg: str):
    """Like Emit, but prefixes the message with 'Warning: '."""
    print("Warning: ", *msg)
| |
| |
def HashList(lst: List[Tuple[str, str]]) -> str:
    """
    Hash a list of (action, file) string pairs, returning the first 10 hex
    digits of the SHA-1 digest. Lists with identical elements hash to the
    same value, regardless of order.
    """
| # We need a bytes-like object for hashlib algorithms |
| byts = bytes().join( |
| (action + file).encode() for action, file in sorted(lst)) |
| # No security implication: we just need a deterministic output |
| hashed = hashlib.sha1(byts) |
| return hashed.hexdigest()[:10] |
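
# Illustrative property (hypothetical inputs): because HashList sorts its
# input, reordering the pairs does not change the digest:
#   HashList([('M', 'a/b.cc'), ('D', 'c.h')]) == HashList([('D', 'c.h'), ('M', 'a/b.cc')])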
| |
| FilesAndOwnersDirectory = collections.namedtuple("FilesAndOwnersDirectory", |
| "files owners_directories") |
| |
| |
| @dataclasses.dataclass |
| class CLInfo: |
| """ |
| Data structure representing a single CL. The script will split the large CL |
| into a list of these. |
| |
| Fields: |
| - reviewers: the reviewers the CL will be sent to. |
| - files: a list of <action>, <file> pairs in the CL. |
| Has the same format as `git status`. |
| - description: a string describing the CL. Typically the list of affected |
| directories. Only used for replacing $description in |
| the user-provided CL description. |
| """ |
| # Have to use default_factory because lists are mutable |
| reviewers: Set[str] = dataclasses.field(default_factory=set) |
| files: List[Tuple[str, str]] = dataclasses.field(default_factory=list) |
| |
| # This is only used for formatting in the CL description, so it just |
| # has to be convertible to string. |
| description: Any = "" |
| |
| def FormatForPrinting(self) -> str: |
| """ |
| Format the CLInfo for printing to a file in a human-readable format. |
| """ |
| # Don't quote the reviewer emails in the output |
| reviewers_str = ", ".join(self.reviewers) |
| lines = [ |
| f"Reviewers: [{reviewers_str}]", f"Description: {self.description}" |
| ] + [f"{action}, {file}" for (action, file) in self.files] |
| return "\n".join(lines) |
| |
| |
| def CLInfoFromFilesAndOwnersDirectoriesDict( |
        d: Dict[Tuple[str, ...], FilesAndOwnersDirectory]) -> List[CLInfo]:
    """
    Transform a dictionary mapping reviewer tuples to FilesAndOwnersDirectories
    into a list of CLInfos.
| """ |
| cl_infos = [] |
| for (reviewers, fod) in d.items(): |
| cl_infos.append( |
| CLInfo(set(reviewers), fod.files, |
| FormatDirectoriesForPrinting(fod.owners_directories))) |
| return cl_infos |
| |
| |
| def EnsureInGitRepository(): |
| """Throws an exception if the current directory is not a git repository.""" |
| git.run('rev-parse') |
| |
| |
| def GetGitInfo(repository_root, cl) -> Tuple[List[Tuple[str, str]], str, str]: |
| """ |
| Get various information by running git commands. |
| |
| Specifically, determine which branch we're on, which upstream we're |
| targeting, and the list of changed files (and the associated git actions) |
| that make up the CL we're splitting. |
| """ |
| upstream = cl.GetCommonAncestorWithUpstream() |
| files = [(action.strip(), f) |
| for action, f in scm.GIT.CaptureStatus(repository_root, upstream)] |
| |
| refactor_branch = git.current_branch() |
    assert refactor_branch, "Can't run from a detached HEAD."
| refactor_branch_upstream = git.upstream(refactor_branch) |
| assert refactor_branch_upstream, \ |
| "Branch %s must have an upstream." % refactor_branch |
| |
| return files, refactor_branch, refactor_branch_upstream |
| |
| |
| def CreateBranchName(prefix: str, files: List[Tuple[str, str]]) -> str: |
| """ |
| Given a sub-CL as a list of (action, file) pairs, create a unique and |
| deterministic branch name for it. |
    The name has the format <prefix>_<hash(files)>_<common path>_split.
| """ |
| file_names = [file for _, file in files] |
| if len(file_names) == 1: |
| # Only one file, just use its directory as the common path |
| common_path = os.path.dirname(file_names[0]) |
| else: |
| common_path = os.path.commonpath(file_names) |
| if not common_path: |
| # Files have nothing in common at all. Unlikely but possible. |
| common_path = "None" |
| # Replace path delimiter with underscore in common_path. |
| common_path = common_path.replace(os.path.sep, '_') |
| return f"{prefix}_{HashList(files)}_{common_path}_split" |
| |
| |
| def CreateBranchForOneCL(prefix: str, files: List[Tuple[str, str]], |
| upstream: str) -> bool: |
| """Creates a branch named |prefix| + "_" + |hash(files)| + "_split". |
| |
| Return false if the branch already exists. |upstream| is used as upstream |
| for the created branch. |
| """ |
| branches_on_disk = set(git.branches(use_limit=False)) |
| branch_name = CreateBranchName(prefix, files) |
| if branch_name in branches_on_disk: |
| return False |
| git.run('checkout', '-t', upstream, '-b', branch_name) |
| return True |
| |
| |
| def ValidateExistingBranches(prefix: str, cl_infos: List[CLInfo]) -> bool: |
| """ |
| Check if there are splitting branches left over from a previous run. |
| We only allow branches to exist if we're resuming a previous upload, |
| in which case we require that the existing branches are a subset of |
| the branches we're going to generate. |
| """ |
| branches_on_disk = set( |
| branch for branch in git.branches(use_limit=False) |
| if branch.startswith(prefix + "_") and branch.endswith("_split")) |
| |
| branches_to_be_made = set( |
| CreateBranchName(prefix, info.files) for info in cl_infos) |
| |
| if not branches_on_disk.issubset(branches_to_be_made): |
| Emit("It seems like you've already run `git cl split` on this branch.\n" |
| "If you're resuming a previous upload, you must pass in the " |
| "same splitting as before, using the --from-file option.\n" |
| "If you're starting a new upload, please clean up existing split " |
| f"branches (starting with '{prefix}_' and ending with '_split'), " |
| "and re-run the tool.") |
| Emit("The following branches need to be cleaned up:\n") |
| for branch in branches_on_disk - branches_to_be_made: |
| Emit(branch) |
| return False |
| return True |
| |
| |
def FormatDirectoriesForPrinting(directories: List[str],
                                 prefix: Optional[str] = None) -> str:
    """Formats a directory list for printing.

    Uses a dedicated format for single-item lists."""
| |
| prefixed = directories |
| if prefix: |
| prefixed = [(prefix + d) for d in directories] |
| |
| return str(prefixed[0]) if len(prefixed) == 1 else str(prefixed) |
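
# Illustrative behavior (hypothetical inputs):
#   FormatDirectoriesForPrinting(['foo'])               -> 'foo'
#   FormatDirectoriesForPrinting(['foo', 'bar'])        -> "['foo', 'bar']"
#   FormatDirectoriesForPrinting(['foo'], prefix='x/')  -> 'x/foo'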
| |
| |
| def FormatDescriptionOrComment(txt, desc): |
| """Replaces $description with |desc| in |txt|.""" |
| # TODO(389069356): Remove support for $directory entirely once it's been |
| # deprecated for a while. |
| replaced_txt = txt.replace('$directory', desc) |
| if txt != replaced_txt: |
| EmitWarning('Usage of $directory is deprecated and will be removed ' |
| 'in a future update. Please use $description instead, ' |
| 'which has the same behavior by default.\n\n') |
| replaced_txt = replaced_txt.replace('$description', desc) |
| return replaced_txt |
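
# Illustrative substitution (hypothetical template):
#   FormatDescriptionOrComment('Refactor $description\n\nBug: 123', 'foo/bar')
# returns 'Refactor foo/bar\n\nBug: 123'.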
| |
| |
| def AddUploadedByGitClSplitToDescription(description, is_experimental=False): |
| """Adds a 'This CL was uploaded by git cl split.' line to |description|. |
| |
| The line is added before footers, or at the end of |description| if it has |
| no footers. |
| """ |
| if is_experimental: |
| new_lines = [ |
| 'This CL was uploaded by an experimental version of git cl split', |
| '(https://crbug.com/389069356).' |
| ] |
| else: |
| new_lines = ['This CL was uploaded by git cl split.'] |
| split_footers = git_footers.split_footers(description) |
| lines = split_footers[0] |
| if lines[-1] and not lines[-1].isspace(): |
| lines = lines + [''] |
| lines = lines + new_lines |
| if split_footers[1]: |
| lines += [''] + split_footers[1] |
| return '\n'.join(lines) |
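
# Illustrative transformation (hypothetical description, assuming git_footers
# treats 'Bug: 123' as a footer): 'Fix things.\n\nBug: 123' becomes
#   'Fix things.\n\nThis CL was uploaded by git cl split.\n\nBug: 123'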
| |
| |
| def UploadCl(refactor_branch, refactor_branch_upstream, cl_description, files, |
| user_description, saved_splitting_file, comment, reviewers, |
| changelist, cmd_upload, cq_dry_run, enable_auto_submit, topic, |
| repository_root): |
| """Uploads a CL with all changes to |files| in |refactor_branch|. |
| |
| Args: |
| refactor_branch: Name of the branch that contains the changes to upload. |
| refactor_branch_upstream: Name of the upstream of |refactor_branch|. |
| cl_description: Description of this specific CL, e.g. the list of |
| affected directories. |
| files: List of AffectedFile instances to include in the uploaded CL. |
| user_description: Description provided by user. |
| comment: Comment to post on the uploaded CL. |
| reviewers: A set of reviewers for the CL. |
| changelist: The Changelist class. |
| cmd_upload: The function associated with the git cl upload command. |
| cq_dry_run: If CL uploads should also do a cq dry run. |
| enable_auto_submit: If CL uploads should also enable auto submit. |
| topic: Topic to associate with uploaded CLs. |
| """ |
| # Create a branch. |
| if not CreateBranchForOneCL(refactor_branch, files, |
| refactor_branch_upstream): |
| Emit( |
| f'Skipping existing branch for CL with description: {cl_description}' |
| ) |
| return |
| |
| # Checkout all changes to files in |files|. |
| deleted_files = [] |
| modified_files = [] |
| for action, f in files: |
| abspath = os.path.abspath(os.path.join(repository_root, f)) |
| if action == 'D': |
| deleted_files.append(abspath) |
| else: |
| modified_files.append(abspath) |
| |
| if deleted_files: |
| git.run(*['rm'] + deleted_files) |
| if modified_files: |
| git.run(*['checkout', refactor_branch, '--'] + modified_files) |
| |
    # Commit changes. The commit message is written to a temporary file and
    # passed to git via `commit -F`; gclient_utils.temporary_file() cleans the
    # file up once the context exits.
| with gclient_utils.temporary_file() as tmp_file: |
| gclient_utils.FileWrite( |
| tmp_file, |
| FormatDescriptionOrComment(user_description, cl_description)) |
| git.run('commit', '-F', tmp_file) |
| |
| # Upload a CL. |
| upload_args = ['-f'] |
| if reviewers: |
| upload_args.extend(['-r', ','.join(sorted(reviewers))]) |
| if cq_dry_run: |
| upload_args.append('--cq-dry-run') |
| if not comment: |
| upload_args.append('--send-mail') |
| if enable_auto_submit: |
| upload_args.append('--enable-auto-submit') |
| if topic: |
| upload_args.append('--topic={}'.format(topic)) |
| Emit(f'Uploading CL with description: {cl_description} ...') |
| |
| ret = cmd_upload(upload_args) |
| if ret != 0: |
| Emit('Uploading failed.') |
| Emit('Note: git cl split has built-in resume capabilities.') |
| Emit(f'Delete {git.current_branch()} then run\n' |
| f'git cl split --from-file={saved_splitting_file}\n' |
| 'to resume uploading.') |
| |
| if comment: |
| changelist().AddComment(FormatDescriptionOrComment( |
| comment, cl_description), |
| publish=True) |
| |
| |
| def GetFilesSplitByOwners(files, max_depth, repository_root): |
| """Returns a map of files split by OWNERS file. |
| |
| Args: |
| files: List of the file paths to be grouped by the OWNERS. |
            Note that each path is relative to the repository root.
| max_depth: Max depth to traverse from the repository path. |
| repository_root: Absolute path to the repository root. |
| |
| Returns: |
| A map where keys are paths to directories containing an OWNERS file and |
| values are lists of files sharing an OWNERS file. |
| """ |
| files_split_by_owners = {} |
| for action, path in files: |
        # normpath() is important to normalize separators here, in
        # preparation for str.split() below. It would be nicer to use
        # something like pathlib here, but alas...
| dir_with_owners = os.path.normpath(os.path.dirname(path)) |
| if max_depth >= 1: |
| dir_with_owners = os.path.join( |
| *dir_with_owners.split(os.path.sep)[:max_depth]) |
| |
| # Find the closest parent directory with an OWNERS file. |
| dir_with_owners = os.path.join(repository_root, dir_with_owners) |
| while dir_with_owners != repository_root: |
            if os.path.relpath(dir_with_owners,
                               start=repository_root) in files_split_by_owners:
| break |
| owners_path = os.path.join(dir_with_owners, 'OWNERS') |
| if os.path.isfile(owners_path): |
| break |
| if os.path.lexists(owners_path): |
| raise ClSplitParseError( |
| f'{owners_path} exists, but is not a file') |
| |
| dir_with_owners = os.path.dirname(dir_with_owners) |
| |
| files_split_by_owners.setdefault( |
| os.path.relpath(dir_with_owners, start=repository_root), []).append( |
| (action, path)) |
| return files_split_by_owners |
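
# Illustrative result (hypothetical tree where foo/OWNERS is the only OWNERS
# file on the files' paths):
#   GetFilesSplitByOwners([('M', 'foo/bar/a.cc'), ('A', 'foo/b.h')], 0, root)
# returns {'foo': [('M', 'foo/bar/a.cc'), ('A', 'foo/b.h')]}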
| |
| |
| def PrintClInfo(cl_index, num_cls, cl_description, file_paths, user_description, |
| reviewers, cq_dry_run, enable_auto_submit, topic): |
| """Prints info about a CL. |
| |
| Args: |
| cl_index: The index of this CL in the list of CLs to upload. |
| num_cls: The total number of CLs that will be uploaded. |
| cl_description: Description of this specific CL, e.g. the list of |
| affected directories. |
| file_paths: A list of files in this CL. |
| user_description: Description provided by user. |
| reviewers: A set of reviewers for this CL. |
| cq_dry_run: If the CL should also be sent to CQ dry run. |
| enable_auto_submit: If the CL should also have auto submit enabled. |
| topic: Topic to set for this CL. |
| """ |
| description_lines = FormatDescriptionOrComment(user_description, |
| cl_description).splitlines() |
| indented_description = '\n'.join([' ' + l for l in description_lines]) |
| |
| Emit('CL {}/{}'.format(cl_index, num_cls)) |
| Emit('Paths: {}'.format(cl_description)) |
| Emit('Reviewers: {}'.format(', '.join(reviewers))) |
| Emit('Auto-Submit: {}'.format(enable_auto_submit)) |
| Emit('CQ Dry Run: {}'.format(cq_dry_run)) |
| Emit('Topic: {}'.format(topic)) |
| Emit('\n' + indented_description + '\n') |
| Emit('\n'.join(file_paths)) |
| |
| |
| def LoadDescription(description_file, dry_run): |
| if not description_file: |
| if not dry_run: |
            # The parser checks this too, so this should be impossible
| raise ValueError( |
| "Must provide a description file except during dry runs") |
| return ('Dummy description for dry run.\n' |
| 'description = $description') |
| |
| return gclient_utils.FileRead(description_file) |
| |
| |
def ProcessDescription(description_file: str, dry_run: bool,
                       target_range: Optional[Tuple[int, int]]) -> Optional[str]:
    """
    Load the provided description, append the note about git cl split, and,
    on a real run, validate that it contains a bug link.
| |
| Returns the loaded description, or None if the user aborted due to a |
| missing bug link. |
| """ |
| description = LoadDescription(description_file, dry_run) |
| |
| description = AddUploadedByGitClSplitToDescription( |
| description, is_experimental=target_range) |
| |
| if not dry_run and not CheckDescriptionBugLink(description): |
| return None |
| |
| return description |
| |
| def PrintSummary(cl_infos, refactor_branch): |
| """Print a brief summary of the splitting so the user |
| can review it before uploading. |
| |
    Args:
      cl_infos: A list of CLInfo objects, one per sub-CL.
      refactor_branch: Name of the branch being split.
    """
| for info in cl_infos: |
| Emit(f'Reviewers: {info.reviewers}, files: {len(info.files)}, ' |
| f'description: {info.description}') |
| |
| num_cls = len(cl_infos) |
| Emit(f'\nWill split branch {refactor_branch} into {num_cls} CLs. ' |
| 'Please quickly review them before proceeding.\n') |
| |
    if num_cls > CL_SPLIT_FORCE_LIMIT:
        EmitWarning(
            'Uploading this many CLs may exceed the limit on concurrent '
            'runs imposed by the build infrastructure, and your runs may '
            'be throttled as a result.\n\n'
            'Please email infra-dev@chromium.org if you '
| 'have any questions. ' |
| 'The infra team reserves the right to cancel ' |
| 'your jobs if they are overloading the CQ.\n\n' |
| '(Alternatively, you can reduce the number of CLs created by ' |
| 'using the --max-depth option, or altering the arguments to ' |
| '--target-range, as appropriate. Pass --dry-run to examine the ' |
| 'CLs which will be created until you are happy with the ' |
| 'results.)') |
| |
| |
| def SummarizeAndValidate(dry_run: bool, summarize: bool, |
| files: List[Tuple[str, str]], refactor_branch: str, |
| cl_infos: List[CLInfo]) -> Tuple[List[CLInfo], str]: |
| """ |
| Print a summary of the generated splitting for the user. If we're doing a |
| real run, prompt the user to confirm the splitting is acceptable, and |
| allow them to edit it if they wish. |
| |
| If we're doing a real run, also save the splitting to a file so the user |
| can safely resume an aborted upload with the same splitting. |
| |
    Arguments:
    dry_run: Whether or not we're doing a dry run
    summarize: If we're doing a dry run, should we print a concise summary first
    files: The list of (action, file) pairs that make up the CL we're splitting
    refactor_branch: Name of the branch we're splitting
    cl_infos: The list of CLInfo objects describing the proposed sub-CLs
| |
| Returns: |
| A pair of the edited cl_infos and the name of the file to which we saved |
| the splitting. If the user aborts, the edited cl_infos will be falsy. |
| """ |
| if not dry_run or summarize: |
| PrintSummary(cl_infos, refactor_branch) |
| |
| if dry_run: |
| return cl_infos, "" |
| |
| answer = gclient_utils.AskForData( |
| 'Proceed? (y/N, or i to edit interactively): ') |
| |
| if answer.lower() == 'i': |
| cl_infos, saved_splitting_file = EditSplittingInteractively( |
| cl_infos, files_on_disk=files) |
| else: |
| # Save so the user can use the splitting later if they want to |
| saved_splitting_file = SaveSplittingToTempFile(cl_infos) |
| if answer.lower() != 'y': |
| return None, saved_splitting_file |
| |
| # Make sure there isn't any clutter left over from a previous run |
| if not ValidateExistingBranches(refactor_branch, cl_infos): |
| return None, saved_splitting_file |
| |
| return cl_infos, saved_splitting_file |
| |
| |
| def ComputeSplitting( |
| from_file: str, |
| files: List[Tuple[str, str]], |
| target_range: Tuple[int, int], |
| max_depth: int, |
| reviewers_override: List[str], |
| expect_owners_override: bool, |
| cl, |
| repository_root: str, |
| ) -> List[CLInfo]: |
| """ |
| Split the current CL into sub-CLs by partitioning the files and assigning |
| reviewers. The method used depends on the command-line arguments. |
| |
    Arguments are the same as SplitCl, except for the following:
| cl: Changelist class instance, for calling owners methods |
| """ |
| author = git.run('config', 'user.email').strip() or None |
| |
| if from_file: |
| # Load a precomputed splitting |
| cl_infos = LoadSplittingFromFile(from_file, files_on_disk=files) |
| elif target_range: |
| # Use the directory-based clustering algorithm |
| min_files, max_files = target_range |
| cl_infos = GroupFilesByDirectory(cl, author, expect_owners_override, |
| files, min_files, max_files) |
| else: |
| # Use the default algorithm |
| files_split_by_reviewers = SelectReviewersForFiles( |
| cl, author, files, max_depth, repository_root) |
| |
| cl_infos = CLInfoFromFilesAndOwnersDirectoriesDict( |
| files_split_by_reviewers) |
| |
| # Note that we do this override even if the list is empty (indicating that |
| # the user requested CLs not be assigned to any reviewers). |
    if reviewers_override is not None:
| for info in cl_infos: |
| info.reviewers = set(reviewers_override) |
| |
| return cl_infos |
| |
| |
| def SplitCl(description_file, comment_file, changelist, cmd_upload, dry_run, |
| summarize, reviewers_override, cq_dry_run, enable_auto_submit, |
| max_depth, topic, target_range, expect_owners_override, from_file, |
| repository_root): |
| """"Splits a branch into smaller branches and uploads CLs. |
| |
| Args: |
| description_file: File containing the description of uploaded CLs. |
| comment_file: File containing the comment of uploaded CLs. |
| changelist: The Changelist class. |
| cmd_upload: The function associated with the git cl upload command. |
        dry_run: Whether this is a dry run (no branches or CLs created).
        summarize: If this is a dry run, whether to print a concise summary
            instead of per-CL details.
| reviewers_override: Either None or a (possibly empty) list of reviewers |
| all CLs should be sent to. |
| cq_dry_run: If CL uploads should also do a cq dry run. |
| enable_auto_submit: If CL uploads should also enable auto submit. |
| max_depth: The maximum directory depth to search for OWNERS files. A |
| value less than 1 means no limit. |
        topic: Topic to associate with split CLs.
        target_range: Either None or a (min_files, max_files) pair, which
            enables the directory-based clustering algorithm.
        expect_owners_override: If clustering, ignore OWNERS files and
            consider only directory structure.
        from_file: Either None or the path to a previously saved splitting,
            which will be reused instead of computing a new one.
        repository_root: Absolute path of the repository root.
| |
| Returns: |
| 0 in case of success. 1 in case of error. |
| """ |
| |
| EnsureInGitRepository() |
| cl = changelist() |
| # Get the list of changed files, as well as the branch we're on and its |
| # upstream. |
| files, refactor_branch, refactor_branch_upstream = GetGitInfo( |
| repository_root, cl) |
| |
| if not files: |
| Emit('Cannot split an empty CL.') |
| return 1 |
| |
| # Load and validate the description and comment files now, so we can error |
| # early if there's a problem with them. |
| comment = gclient_utils.FileRead(comment_file) if comment_file else None |
| description = ProcessDescription(description_file, dry_run, target_range) |
| if not description: |
| return 0 |
| |
| cl_infos = ComputeSplitting(from_file, files, target_range, max_depth, |
| reviewers_override, expect_owners_override, cl, |
| repository_root) |
| |
| cl_infos, saved_splitting_file = SummarizeAndValidate( |
| dry_run, summarize, files, refactor_branch, cl_infos) |
| # If the user aborted, we're done |
| if not cl_infos: |
| return 0 |
| |
| cls_per_reviewer = collections.defaultdict(int) |
| for cl_index, cl_info in enumerate(cl_infos, 1): |
| if dry_run and summarize: |
| pass |
| elif dry_run: |
| file_paths = [f for _, f in cl_info.files] |
| PrintClInfo(cl_index, len(cl_infos), cl_info.description, |
| file_paths, description, cl_info.reviewers, cq_dry_run, |
| enable_auto_submit, topic) |
| else: |
| UploadCl(refactor_branch, refactor_branch_upstream, |
| cl_info.description, cl_info.files, description, |
| saved_splitting_file, comment, cl_info.reviewers, |
| changelist, cmd_upload, cq_dry_run, enable_auto_submit, |
| topic, repository_root) |
| |
| for reviewer in cl_info.reviewers: |
| cls_per_reviewer[reviewer] += 1 |
| |
| # List the top reviewers that will be sent the most CLs as a result of |
| # the split. |
| reviewer_rankings = sorted(cls_per_reviewer.items(), |
| key=lambda item: item[1], |
| reverse=True) |
| Emit('The top reviewers are:') |
| for reviewer, count in reviewer_rankings[:CL_SPLIT_TOP_REVIEWERS]: |
| Emit(f' {reviewer}: {count} CLs') |
| |
| if dry_run: |
| # Wait until now to save the splitting so the file name doesn't get |
| # washed away by the flood of dry-run printing. |
| SaveSplittingToTempFile(cl_infos) |
| |
| # Go back to the original branch. |
| git.run('checkout', refactor_branch) |
| return 0 |
| |
| |
| def CheckDescriptionBugLink(description): |
| """Verifies that the description contains a bug link. |
| |
| Examples: |
| Bug: 123 |
| Bug: chromium:456 |
| |
| Prompts user if the description does not contain a bug link. |
| """ |
| bug_pattern = re.compile(r"^Bug:\s*(?:[a-zA-Z]+:)?[0-9]+", re.MULTILINE) |
| matches = re.findall(bug_pattern, description) |
| answer = 'y' |
| if not matches: |
| answer = gclient_utils.AskForData( |
| 'Description does not include a bug link. Proceed? (y/N):') |
| return answer.lower() == 'y' |
| |
| |
| def SelectReviewersForFiles(cl, author, files, max_depth, repository_root): |
| """Selects reviewers for passed-in files |
| |
| Args: |
| cl: Changelist class instance |
| author: Email of person running 'git cl split' |
| files: List of files |
| max_depth: The maximum directory depth to search for OWNERS files. |
| A value less than 1 means no limit. |
| repository_root: Absolute path of the repository root |
| """ |
| info_split_by_owners = GetFilesSplitByOwners(files, max_depth, |
| repository_root) |
| |
| info_split_by_reviewers = {} |
| |
| for (directory, split_files) in info_split_by_owners.items(): |
| # Use '/' as a path separator in the branch name and the CL description |
| # and comment. |
| directory = directory.replace(os.path.sep, '/') |
| file_paths = [f for _, f in split_files] |
| # Convert reviewers list to tuple in order to use reviewers as key to |
| # dictionary. |
| reviewers = tuple( |
| cl.owners_client.SuggestOwners( |
| file_paths, exclude=[author, cl.owners_client.EVERYONE])) |
| |
        if reviewers not in info_split_by_reviewers:
| info_split_by_reviewers[reviewers] = FilesAndOwnersDirectory([], []) |
| info_split_by_reviewers[reviewers].files.extend(split_files) |
| info_split_by_reviewers[reviewers].owners_directories.append(directory) |
| |
| return info_split_by_reviewers |
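
# Illustrative return value of SelectReviewersForFiles (hypothetical reviewer
# suggested by the owners client):
#   {('a@example.com',): FilesAndOwnersDirectory(
#        files=[('M', 'foo/x.cc')], owners_directories=['foo'])}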
| |
| |
| ################################################################################ |
| # Code for saving, editing, and loading splittings. |
| ################################################################################ |
| |
| def SaveSplittingToFile(cl_infos: List[CLInfo], filename: str, silent=False): |
| """ |
| Writes the listed CLs to the designated file, in a human-readable and |
| editable format. Include an explanation of the file format at the top, |
| as well as instructions for how to use it. |
| """ |
| preamble = ( |
| "# CLs in this file must have the following format:\n" |
| "# A 'Reviewers: [...]' line, where '...' is a (possibly empty) list " |
| "of reviewer emails.\n" |
| "# A 'Description: ...' line, where '...' is any string (by default, " |
| "the list of directories the files have been pulled from).\n" |
| "# One or more file lines, consisting of an <action>, <file> pair, in " |
| "the format output by `git status`.\n\n" |
| "# Each 'Reviewers' line begins a new CL.\n" |
| "# To use the splitting in this file, use the --from-file option.\n\n") |
| |
| cl_string = "\n\n".join([info.FormatForPrinting() for info in cl_infos]) |
| gclient_utils.FileWrite(filename, preamble + cl_string) |
| if not silent: |
| Emit(f"Saved splitting to {filename}") |
| |
| |
| def SaveSplittingToTempFile(cl_infos: List[CLInfo], silent=False): |
| """ |
| Create a file in the user's temp directory, and save the splitting there. |
| """ |
    # We can't use gclient_utils.temporary_file because it deletes the file
    # when its context exits, and we need the file to persist.
| temp_file, temp_name = tempfile.mkstemp(prefix="split_cl_") |
| os.close(temp_file) # Necessary for windows |
| SaveSplittingToFile(cl_infos, temp_name, silent) |
| return temp_name |
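
# Illustrative result: creates a persistent file such as /tmp/split_cl_abc123
# (exact name chosen by tempfile.mkstemp) that a later run can consume via
# --from-file.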
| |
| |
| class ClSplitParseError(Exception): |
| pass |
| |
| |
| # Matches 'Reviewers: [...]', extracts the ... |
| reviewers_re = re.compile(r'Reviewers:\s*\[([^\]]*)\]') |
| # Matches 'Description: ...', extracts the ... |
| description_re = re.compile(r'Description:\s*(.+)') |
| # Matches '<action>, <file>', and extracts both |
| # <action> must be a valid code (either 1 or 2 letters) |
| file_re = re.compile(r'([MTADRC]{1,2}),\s*(.+)') |
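
# Illustrative lines each regex fully matches (hypothetical values):
#   reviewers_re:   'Reviewers: [a@example.com, b@example.com]'
#   description_re: 'Description: foo/bar'
#   file_re:        'M, foo/bar/baz.cc'  -> groups ('M', 'foo/bar/baz.cc')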
| |
| # We use regex parsing instead of e.g. json because it lets us use a much more |
| # human-readable format, similar to the summary printed in dry runs |
| def ParseSplittings(lines: List[str]) -> List[CLInfo]: |
| """ |
| Parse a splitting file. We expect to get a series of lines in the format |
| of CLInfo.FormatForPrinting. In the following order, we expect to see |
| - A 'Reviewers: ' line containing a list, |
| - A 'Description: ' line containing anything, and |
| - A list of <action>, <path> pairs, each on its own line |
| |
| Note that this function only transforms the file into a list of CLInfo |
| (if possible). It does not validate the information; for that, see |
| ValidateSplitting. |
| """ |
| |
| cl_infos = [] |
| current_cl_info = None |
| for line in lines: |
| line = line.strip() |
| |
| # Skip empty or commented lines |
| if not line or line.startswith('#'): |
| continue |
| |
| # Start a new CL whenever we see a new Reviewers: line |
| m = re.fullmatch(reviewers_re, line) |
| if m: |
| reviewers_str = m.group(1) |
| reviewers = [r.strip() for r in reviewers_str.split(",")] |
| # Account for empty list or trailing comma |
| if not reviewers[-1]: |
| reviewers = reviewers[:-1] |
| |
| if current_cl_info: |
| cl_infos.append(current_cl_info) |
| |
| current_cl_info = CLInfo(reviewers=reviewers) |
| continue |
| |
| if not current_cl_info: |
| # Make sure no nonempty lines appear before the first CL |
| raise ClSplitParseError( |
| f"Error: Line appears before the first 'Reviewers: ' line:\n{line}" |
| ) |
| |
| # Description is just used as a description, so any string is fine |
| m = re.fullmatch(description_re, line) |
| if m: |
| if current_cl_info.description: |
| raise ClSplitParseError( |
| f"Error parsing line: CL already has a description entry\n{line}" |
| ) |
| current_cl_info.description = m.group(1).strip() |
| continue |
| |
| # Any other line is presumed to be an '<action>, <file>' pair |
| m = re.fullmatch(file_re, line) |
| if m: |
| action, path = m.groups() |
| current_cl_info.files.append((action, path)) |
| continue |
| |
| raise ClSplitParseError("Error parsing line: Does not look like\n" |
| "'Reviewers: [...]',\n" |
| "'Description: ...', or\n" |
| f"a pair of '<action>, <file>':\n{line}") |
| |
    if current_cl_info:
| cl_infos.append(current_cl_info) |
| |
| return cl_infos |
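
# Illustrative input (hypothetical): ParseSplittings turns these four lines
# into a single CLInfo with one reviewer and two files:
#   Reviewers: [a@example.com]
#   Description: foo
#   M, foo/a.cc
#   D, foo/b.cc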
| |
| |
| def ValidateSplitting(cl_infos: List[CLInfo], filename: str, |
| files_on_disk: List[Tuple[str, str]]): |
| """ |
| Ensure that the provided list of CLs is a valid splitting. |
| |
| Specifically, check that: |
| - Each file is in at most one CL |
| - Each file and action appear in the list of changed files reported by git |
| - Warn if some files don't appear in any CL |
| - Warn if a reviewer string looks wrong, or if a CL is empty |
| """ |
| # Validate the parsed information |
| if not cl_infos: |
| EmitWarning("No CLs listed in file. No action will be taken.") |
| return [] |
| |
| files_in_loaded_cls = set() |
| # Collect all files, ensuring no duplicates |
| # Warn on empty CLs or invalid reviewer strings |
| for info in cl_infos: |
| if not info.files: |
| EmitWarning("CL has no files, and will be skipped:\n", |
| info.FormatForPrinting()) |
| for file_info in info.files: |
| if file_info in files_in_loaded_cls: |
| raise ClSplitParseError( |
| f"File appears in multiple CLs in {filename}:\n{file_info}") |
| |
| files_in_loaded_cls.add(file_info) |
| for reviewer in info.reviewers: |
            if not re.fullmatch(r"[^@]+@[^.]+\..+", reviewer):
| EmitWarning("reviewer does not look like an email address: ", |
| reviewer) |
| |
| # Strip empty CLs |
| cl_infos = [info for info in cl_infos if info.files] |
| |
| # Ensure the files in the user-provided CL splitting match the files |
| # that git reports. |
| # Warn if not all the files git reports appear. |
| # Fail if the user mentions a file that isn't reported by git |
| files_on_disk = set(files_on_disk) |
| if not files_in_loaded_cls.issubset(files_on_disk): |
| extra_files = files_in_loaded_cls.difference(files_on_disk) |
| extra_files_str = "\n".join(f"{action}, {file}" |
| for (action, file) in extra_files) |
| raise ClSplitParseError( |
| f"Some files are listed in {filename} but do not match any files " |
| f"listed by git:\n{extra_files_str}") |
| |
| unmentioned_files = files_on_disk.difference(files_in_loaded_cls) |
    if unmentioned_files:
        EmitWarning(
            f"the following files are not included in any CL in {filename}. "
            "They will not be uploaded:")
| for file in unmentioned_files: |
| Emit(file) |
| |
| |
| def LoadSplittingFromFile(filename: str, |
| files_on_disk: List[Tuple[str, str]]) -> List[CLInfo]: |
| """ |
| Given a file and the list of <action>, <file> pairs reported by git, |
| read the file and return the list of CLInfos it contains. |
| """ |
| lines = gclient_utils.FileRead(filename).splitlines() |
| |
| cl_infos = ParseSplittings(lines) |
| ValidateSplitting(cl_infos, filename, files_on_disk) |
| |
| return cl_infos |
| |
| |
| def EditSplittingInteractively( |
| cl_infos: List[CLInfo], |
| files_on_disk: List[Tuple[str, str]]) -> Tuple[List[CLInfo], str]: |
| """ |
| Allow the user to edit the generated splitting using their default editor. |
| Make sure the edited splitting is saved so they can retrieve it if needed. |
| """ |
| |
| tmp_file = SaveSplittingToTempFile(cl_infos, silent=True) |
| splitting = gclient_utils.RunEditor(gclient_utils.FileRead(tmp_file), False) |
| cl_infos = ParseSplittings(splitting.splitlines()) |
| |
| # Save the edited splitting before validation, so the user can go back |
| # and edit it if there are any typos |
| SaveSplittingToFile(cl_infos, tmp_file) |
| ValidateSplitting(cl_infos, "the provided splitting", files_on_disk) |
| return cl_infos, tmp_file |
| |
| |
| ################################################################################ |
| # Code for the clustering-based splitting algorithm. |
| ################################################################################ |
| |
| |
| def GroupFilesByDirectory(cl, author: str, expect_owners_override: bool, |
                          all_files: List[Tuple[str, str]], min_files: int,
| max_files: int) -> List[CLInfo]: |
| """ |
| Group the contents of |all_files| into clusters of size between |min_files| |
| and |max_files|, inclusive, based on their directory structure. Assign one |
| reviewer to each group to create a CL. If |expect_owners_override| is true, |
| consider only the directory structure of the files, ignoring ownership. |
| |
| May rarely create groups with fewer than |min_files| files, or assign |
| multiple reviewers to a single CL. |
| |
| Args: |
| cl: Changelist class instance, for calling owners methods |
| author: Email of person running the script; never assigned as a reviewer |
| """ |
| |
| # Record the actions associated with each file because the clustering |
| # algorithm just takes filenames |
| actions_by_file = {} |
| file_paths = [] |
| for (action, file) in all_files: |
| actions_by_file[file] = action |
| file_paths.append(file) |
| |
| reviewers_so_far = [] |
| cls = [] |
    # Go through the clusters in sorted order, so that parent directories come
    # before their children and we're likely to choose top-level owners first
| for (directories, files) in sorted( |
| ClusterFiles(expect_owners_override, file_paths, min_files, |
| max_files)): |
| # Use '/' as a path separator in the branch name and the CL description |
| # and comment. |
| directories = [ |
| directory.replace(os.path.sep, '/') for directory in directories |
| ] |
| files_with_actions = [(actions_by_file[file], file) for file in files] |
| |
| # Try to find a reviewer. If some of the files have noparent set, |
| # we'll likely get multiple reviewers. Don't consider reviewers we've |
| # already assigned something to. |
| # FIXME: Rather than excluding existing reviewers, it would be better |
| # to just penalize them, but still choose them over reviewers who have |
| # a worse score. At the moment, owners_client doesn't support anything |
| # to do with the score. |
| reviewers = cl.owners_client.SuggestMinimalOwners( |
| files, |
| exclude=[author, cl.owners_client.EVERYONE] + reviewers_so_far) |
| |
| # Retry without excluding existing reviewers if we couldn't find any. |
| # This is very unlikely since there are many fallback owners. |
| if not reviewers: |
| reviewers = cl.owners_client.SuggestMinimalOwners( |
| directories, exclude=[author, cl.owners_client.EVERYONE]) |
| |
| reviewers_so_far.extend(reviewers) |
| cls.append( |
| CLInfo(set(reviewers), files_with_actions, |
| FormatDirectoriesForPrinting(directories))) |
| |
| return cls |
| |
| |
| ### Trie Code |
| |
| |
| def FolderHasParent(path: str) -> bool: |
| """ |
| Check if a folder inherits owners from a higher-level directory: |
| i.e. it's not at top level, and doesn't have an OWNERS file that contains |
| `set noparent` |
| """ |
    # Treat each top-level directory, as well as the root directory, as
    # having no parent.
| if len(path.split(os.path.sep)) <= 1: |
| # Top level |
| return False |
| |
| owners_file = os.path.join(path, 'OWNERS') |
    if os.path.isfile(owners_file):
        with open(owners_file) as f:
| for line in f.readlines(): |
| |
| # Strip whitespace and comments |
| line = line.split('#')[0].strip() |
| |
                if line == 'set noparent':
| return False |
| |
| return True |
| |
| |
| class DirectoryTrie(): |
| """ |
| Trie structure: Nested dictionaries representing file paths. |
| Each level represents one folder, and contains: |
| - The path to that folder (its prefix) |
| - A list of files that reside in that folder |
| - A boolean for whether that folder inherits owners from a parent folder |
| - One Trie representing each of that folder's subdirectories |
| |
| Files are stored with their entire path, so we don't need to reconstruct |
| it every time we read them. |
| """ |
| |
    def __init__(self, expect_owners_override: bool, prefix: str = ""):
| """ Create an empty DirectoryTrie with the specified prefix """ |
| has_parent = expect_owners_override or FolderHasParent(prefix) |
| # yapf: disable |
| self.subdirectories : Dict[str, DirectoryTrie] = {} |
| self.files : List[str] = [] |
| self.prefix : str = prefix |
| self.has_parent : bool = has_parent |
| self.expect_owners_override : bool = expect_owners_override |
| # yapf: enable |
| |
| def AddFile(self, path: List[str]): |
| """ |
| Add a file to the Trie, adding new subdirectories if necessary. |
| The file should be represented as a list of directories, with the final |
| entry being the filename. |
| """ |
| if len(path) == 1: |
| self.files.append(os.path.join(self.prefix, path[0])) |
| else: |
| directory = path[0] |
| if directory not in self.subdirectories: |
| prefix = os.path.join(self.prefix, directory) |
| self.subdirectories[directory] = DirectoryTrie( |
| self.expect_owners_override, prefix) |
| self.subdirectories[directory].AddFile(path[1:]) |
| |
| def AddFiles(self, paths: List[List[str]]): |
| """ Convenience function to add many files at once. """ |
| for path in paths: |
| self.AddFile(path) |
| |
| def ToList(self) -> List[str]: |
| """ Return a list of all files in the trie. """ |
| files = [] |
| files += self.files |
| for subdir in self.subdirectories.values(): |
| files += subdir.ToList() |
| return files |
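
# Illustrative usage (hypothetical paths; '/' separators assume POSIX):
#   trie = DirectoryTrie(expect_owners_override=True)
#   trie.AddFiles([['foo', 'a.cc'], ['foo', 'bar', 'b.cc']])
#   trie.ToList()  ->  ['foo/a.cc', 'foo/bar/b.cc']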
| |
| |
| ### Clustering code |
| |
| # Convenience type: a "bin" represents a collection of files: |
| # it tracks their prefix(es) and the list of files themselves. |
| # Both elements are string lists. |
| Bin = collections.namedtuple("Bin", "prefixes files") |
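
# Illustrative value (hypothetical): a bin holding two files under one prefix:
#   Bin(prefixes=['foo'], files=['foo/a.cc', 'foo/b.cc'])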
| |
| |
| def PackFiles(max_size: int, files_to_pack: List[Bin]) -> List[Bin]: |
| """ |
| Simple bin packing algorithm: given a list of small bins, consolidate them |
| into as few larger bins as possible, where each bin can hold at most |
| |max_size| files. |
| """ |
| bins = [] |
| # Guess how many bins we'll need ahead of time so we can spread things |
| # between them. We'll add more bins later if necessary |
| expected_bins_needed = math.ceil( |
| sum(len(bin.files) for bin in files_to_pack) / max_size) |
| expected_avg_bin_size = math.ceil( |
| sum(len(bin.files) for bin in files_to_pack) / expected_bins_needed) |
| for _ in range(expected_bins_needed): |
| bins.append(Bin([], [])) |
| |
| # Sort by number of files, decreasing |
| sorted_by_num_files = sorted(files_to_pack, key=lambda bin: -len(bin.files)) |
| |
| # Invariant: the least-filled bin is always the first element of |bins| |
| # This ensures we spread things between bins as much as possible. |
| for (prefixes, files) in sorted_by_num_files: |
| b = bins[0] |
| if len(b.files) + len(files) <= max_size: |
            b.prefixes.extend(prefixes)
            b.files.extend(files)
| else: |
| # Since the first bin is the emptiest, if we failed to fit in |
| # that we don't need to try any others. |
| |
| # If these files alone are too large, split them up into |
| # groups of size |expected_avg_bin_size| |
| if len(files) > max_size: |
| bins.extend([ |
| Bin(prefixes, files[i:i + expected_avg_bin_size]) |
| for i in range(0, len(files), expected_avg_bin_size) |
| ]) |
| else: |
| bins.append(Bin(prefixes, files)) |
| |
| # Maintain invariant |
| bins.sort(key=lambda bin: len(bin.files)) |
| return [bin for bin in bins if len(bin.files) > 0] |
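
# Illustrative packing (hypothetical bins, max_size=3): the three
# single-file bins
#   [Bin(['a'], ['a/x']), Bin(['b'], ['b/y']), Bin(['c'], ['c/z'])]
# consolidate into one Bin(['a', 'b', 'c'], ['a/x', 'b/y', 'c/z']).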
| |
| |
| def ClusterFiles(expect_owners_override: bool, files: List[str], min_files: int, |
| max_files: int) -> List[Bin]: |
| """ |
| Group the entries of |files| into clusters of size between |min_files| and |
| |max_files|, inclusive. Guarantees that the size does not exceed |
| |max_files|, but the size may rarely be less than |min_files|. If |
| |expect_owners_override| is true, don't consider ownership when clustering, |
| only directory structure. |
| |
| Clustering strategy for a given directory: |
| 1. Try to group each subdirectory independently |
| 2. Group any remaining files as follows: |
    2a. If there are fewer than |min_files| files and the folder has a
        parent, give up and let the parent folder handle it.
    2b. Otherwise, if there are at most |max_files| files, create one
        cluster.
    2c. Finally, if there are more than |max_files| files, create several
        clusters, each of size at most |max_files|.
| """ |
| trie = DirectoryTrie(expect_owners_override) |
| trie.AddFiles([file.split(os.path.sep) for file in files]) |
| clusters: List[Bin] = [] |
| |
| def ClusterDirectory(current_dir: DirectoryTrie) -> List[str]: |
| """ |
| Attempt to cluster the files for a directory, by grouping them into |
| Bins and appending the bins to |clusters|. |
| Returns a list of files that weren't able to be clustered (because |
| there weren't at least |min_files| files). |
| """ |
| # Track all the files we need to handle in this directory |
| unclustered_files: List[Bin] = [] |
| |
| # Record any files that live in this directory directly |
| if len(current_dir.files) > 0: |
| unclustered_files.append( |
| Bin([current_dir.prefix], current_dir.files)) |
| |
| # Step 1: Try to cluster each subdirectory independently |
| for subdir in current_dir.subdirectories.values(): |
| unclustered_files_in_subdir = ClusterDirectory(subdir) |
            # If not all files were clustered, record them
| if len(unclustered_files_in_subdir) > 0: |
| unclustered_files.append( |
| Bin([subdir.prefix], unclustered_files_in_subdir)) |
| |
| # A flattened list containing just the names of all unclustered files |
| unclustered_files_names_only = [ |
| file for bin in unclustered_files for file in bin.files |
| ] |
| |
| if len(unclustered_files_names_only) == 0: |
| return [] |
| |
| # Step 2a: If we don't have enough files for a cluster and it's possible |
| # to recurse upward, do so |
| if (len(unclustered_files_names_only) < min_files |
| and current_dir.has_parent): |
| return unclustered_files_names_only |
| |
| # Step 2b, 2c: Create one or more clusters from the unclustered files |
| # by appending to the |clusters| variable in the outer scope |
| nonlocal clusters |
| if len(unclustered_files_names_only) <= max_files: |
| clusters.append( |
| Bin([current_dir.prefix], unclustered_files_names_only)) |
| else: |
| clusters += PackFiles(max_files, unclustered_files) |
| |
| return [] |
| |
| unclustered_paths = ClusterDirectory(trie) |
    if unclustered_paths:
| EmitWarning( |
| 'Not all files were assigned to a CL!\n' |
| 'This should be impossible, file a bug.\n' |
| f'{len(unclustered_paths)} Unassigned files: {unclustered_paths}') |
| |
| return clusters |
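
# Illustrative call (hypothetical files; ownership ignored via the override):
#   ClusterFiles(True, ['foo/a.cc', 'foo/b.cc'], 1, 10)
# groups both files under their common directory, returning
#   [Bin(prefixes=['foo'], files=['foo/a.cc', 'foo/b.cc'])]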