#!/usr/bin/env python3
# Copyright 2018 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
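# Suggests potential new OWNERS for directories based on recent commit
# activity.
#
# Example invocation (path and values are illustrative):
#   ./suggest_owners.py ~/chromium/src --subdirectory base --days-ago 180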
from __future__ import print_function
import argparse
import subprocess
import pickle
import re
import os
from pathlib import PurePath
from os import path
from datetime import date, timedelta
from collections import namedtuple, defaultdict
Commit = namedtuple('Commit', ['hash', 'author', 'commit_date', 'dirs'])
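# A populated Commit might look like (hypothetical values):
#   Commit(hash='a1b2c3d', author='dev@chromium.org',
#          commit_date='2023-06-01T12:00:00+00:00',
#          dirs={'base': (10, 2), 'base/android': (3, 1)})
# where dirs maps each touched directory to its (additions, deletions).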
# Maps each directory to a dict from author to a tuple of
# (commit count, lines added, lines deleted) in that directory.
DIRECTORY_AUTHORS = defaultdict(dict)
# Memoization cache for directory owners.
OWNERS_CACHE = {}
# filename for pickle cache
CACHE_FILENAME = 'suggest_owners.cache'
def _RunGitCommand(options, cmd_args, pipe_output=False):
repo_path = path.join(options.repo_path, '.git')
cmd = ['git', '--git-dir', repo_path] + cmd_args
print('>', ' '.join(cmd))
if not pipe_output:
return subprocess.check_output(cmd, encoding='utf-8')
else:
return subprocess.Popen(cmd, encoding='utf-8',
stdout=subprocess.PIPE).stdout
def _ValidAuthor(author):
return author.endswith(
('@chromium.org', '@google.com')) and 'roller' not in author
# Returns additions/deletions by a commit to a directory (and its descendants).
def getEditsForDirectory(commit, directory):
additions = deletions = 0
for commit_directory, (directory_additions, directory_deletions) \
in commit.dirs.items():
# check if commit_directory is same as or a descendant of directory
if isSubDirectory(directory, commit_directory):
additions += directory_additions
deletions += directory_deletions
return additions, deletions
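# For example, with dirs={'base': (10, 2), 'base/android': (3, 1)},
# getEditsForDirectory(commit, 'base') returns (13, 3) since 'base/android'
# is a descendant of 'base' (hypothetical values).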
# Propagates a commit that touches a directory so that it also counts toward
# all ancestor directories.
def _PropagateCommit(options, commit):
touched_dirs = set()
# first get all the touched dirs and their ancestors
for directory in commit.dirs.keys():
    # For relative paths, repeatedly taking PurePath.parent eventually
    # yields '.', which terminates the walk.
while str(directory) != '.':
touched_dirs.add(str(directory))
# get the parent directory
directory = PurePath(directory).parent
# loop over them and calculate the edits per directory
for directory in touched_dirs:
author_commits, author_additions, author_deletions = \
DIRECTORY_AUTHORS[directory].get(commit.author, (0,0,0))
directory_additions, directory_deletions = \
getEditsForDirectory(commit, directory)
DIRECTORY_AUTHORS[directory][commit.author] = \
(author_commits + 1, author_additions + directory_additions,
author_deletions + directory_deletions)
# Checks if child_directory is the same as or below parent_directory. The
# os.path module offers no equivalent of this check.
def isSubDirectory(parent_directory, child_directory):
parent_directory = PurePath(parent_directory)
child_directory = PurePath(child_directory)
return child_directory.is_relative_to(parent_directory)
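# e.g. isSubDirectory('base', 'base/android') -> True,
#      isSubDirectory('base', 'chrome')       -> False.
# Note: PurePath.is_relative_to() requires Python 3.9+.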
def _GetGitLogCmd(options):
  # TODO(mheikal): git log with --numstat takes ~10x as long as with
  # --name-only. It takes >15 mins for git log --numstat to return one year
  # of history for the full repo. Should probably add a script flag to turn
  # off tracking the number of modifications per commit.
  date_limit = date.today() - timedelta(days=options.days_ago)
  format_string = '%h,%ae,%cI'
  cmd_args = [
      'log',
      '--since', date_limit.isoformat(),
      '--numstat',
      '--pretty=format:%s' % format_string,
  ]
# has to be last arg
if options.subdirectory:
cmd_args += ['--', options.subdirectory]
return cmd_args
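# The command above produces output of the form (hypothetical commit):
#   a1b2c3d,dev@chromium.org,2023-06-01T12:00:00+00:00
#   10      2       base/logging.cc
#   -       -       chrome/app/theme/icon.png
# i.e. an %h,%ae,%cI description line, then one tab-separated numstat line
# per file ('-' for binary files), then a blank line before the next commit.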
def _ParseCommitLine(line):
commit_hash, author, commit_date = line.split(",")
return Commit(hash=commit_hash, author=author, commit_date=commit_date,
dirs={})
def _ParseFileStatsLine(current_commit, line):
try:
additions, deletions, filepath = line.split('\t')
except ValueError:
return False
if additions == '-':
additions = 0
else:
additions = int(additions)
if deletions == '-':
deletions = 0
else:
deletions = int(deletions)
if additions == 0 and deletions == 0:
return True
dir_path = path.dirname(filepath)
# For git renames, we count the destination directory
if '=>' in dir_path:
dir_path = re.sub(r'\{[^=]* => ([^\}]*)\}', r'\1', dir_path)
  # The rename substitution can leave an empty path component; collapse it.
  dir_path = dir_path.replace('//', '/')
commit_additions, commit_deletions = \
current_commit.dirs.get(dir_path, (0,0))
current_commit.dirs[dir_path] = (
additions + commit_additions, deletions + commit_deletions)
return True
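# e.g. a rename line '1\t0\tbase/{android => fuchsia}/util.cc' is credited to
# 'base/fuchsia', while a binary-file line '-\t-\tchrome/icon.png' contributes
# no additions or deletions (hypothetical paths).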
def processAllCommits(options):
  if not options.subdirectory and options.days_ago > 100:
    print('git log for this query may take more than 5 minutes. Limit the '
          'search to a subdirectory or reduce the number of days of history '
          'to low double digits to make this faster. There is no progress '
          'indicator; everything waits on a single git log to finish.')
output_pipe = _RunGitCommand(options,
_GetGitLogCmd(options),
pipe_output=True)
current_commit = None
for line in iter(output_pipe.readline, ''):
line = line.rstrip('\n')
if current_commit is None:
current_commit = _ParseCommitLine(line)
else:
if line == '': # all commit details read
if _ValidAuthor(current_commit.author):
_PropagateCommit(options, current_commit)
current_commit = None
else:
        # Merge commits produce no file stats in git log output. If the line
        # fails to parse as file stats, the previous commit was a merge and
        # this line is actually the next commit's description line.
if not _ParseFileStatsLine(current_commit, line):
current_commit = _ParseCommitLine(line)
  # Process the final commit, if the log was not empty.
  if current_commit and _ValidAuthor(current_commit.author):
    _PropagateCommit(options, current_commit)
print('Done parsing commit log.')
def _CountCommits(directory):
return sum(
[count for (count, _a, _d) in DIRECTORY_AUTHORS[directory].values()])
def _GetOwnerLevel(options, author, directory):
  # Sort by level so the closest OWNERS file determines the result.
  sorted_owners = sorted(_GetOwners(options, directory), key=lambda e: e[1])
  for owner, level in sorted_owners:
    if author == owner:
      return level
  return -1
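# e.g. assuming both base/android/OWNERS and base/OWNERS exist, an author
# listed in base/android/OWNERS has level 0 for 'base/android', while one
# listed only in base/OWNERS has level 1.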
# Returns the owners for a repo subdirectory. This does not understand per-file
# directives.
# TODO(mheikal): use depot_tools owners.py for parsing owners files.
def _GetOwners(options, directory_path):
if directory_path in OWNERS_CACHE:
return OWNERS_CACHE[directory_path]
owners_path = path.join(options.repo_path, directory_path, 'OWNERS')
owners = set()
parent_dir = directory_path
owner_level = 0
while parent_dir != '':
if path.isfile(owners_path):
parsed_owners, noparent = _ParseOwnersFile(options, owners_path)
owners.update([(owner, owner_level) for owner in parsed_owners])
owner_level += 1
if noparent:
break
parent_dir = path.dirname(parent_dir)
    owners_path = path.join(options.repo_path, parent_dir, 'OWNERS')
OWNERS_CACHE[directory_path] = set(owners)
return owners
# Parses an OWNERS file; returns the set of owners and whether the file sets
# noparent.
def _ParseOwnersFile(options, filepath):
owners = set()
noparent = False
with open(filepath) as f:
for line in f.readlines():
line = line.strip()
# The script deals with directories so per-files are ignored.
if line == '' or line[0] == '#' or line.startswith('per-file'):
continue
if line.startswith('file://'):
relpath = line[7:]
abspath = path.join(options.repo_path, relpath)
parsed_owners, _ = _ParseOwnersFile(options, abspath)
owners.update(parsed_owners)
if line == 'set noparent':
noparent = True
index = line.find('@chromium.org')
if index > -1:
owners.add(line[:index + len('@chromium.org')])
return owners, noparent
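# A hypothetical OWNERS file exercising the cases above:
#   set noparent
#   per-file *.gni=someone@chromium.org    (ignored)
#   file://base/android/OWNERS             (inlined recursively)
#   dev@chromium.org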
# Trivial directories are ones that contain only a single entry (typically a
# lone child subdirectory) and nothing else.
def _IsTrivialDirectory(options, repo_subdir):
try:
return len(os.listdir(path.join(options.repo_path, repo_subdir))) == 1
except OSError:
# directory no longer exists
return False
def computeSuggestions(options):
directory_suggestions = []
for directory, authors in sorted(DIRECTORY_AUTHORS.items()):
if _IsTrivialDirectory(options, directory):
continue
if _CountCommits(directory) < options.dir_commit_limit:
continue
    # Skip suggestions for directories outside the passed-in subdirectory.
if (options.subdirectory
and not isSubDirectory(options.subdirectory, directory)):
continue
# sort authors by descending number of commits
sorted_authors = sorted(authors.items(), key=lambda entry: -entry[1][0])
# keep only authors above the limit
    suggestions = [(a, c) for a, c in sorted_authors
                   if a not in options.ignore_authors
                   and c[0] >= options.author_cl_limit]
directory_suggestions.append((directory, suggestions))
return directory_suggestions
def _PrintSettings(options):
  print('Showing directories with at least ({}) commits in the last ({}) '
        'days.'.format(options.dir_commit_limit, options.days_ago))
  print('Showing top ({}) committers who have committed at least ({}) '
        'commits to the directory in the last ({}) days.'.format(
            options.max_suggestions, options.author_cl_limit,
            options.days_ago))
  print('(owner+N) means the author appears in an OWNERS file N levels up '
        'from the directory.\n')
def printSuggestions(options, directory_suggestions):
print('\nCommit stats:')
_PrintSettings(options)
for directory, suggestions in directory_suggestions:
print('{}: {} commits in the last {} days'.format(
directory, _CountCommits(directory), options.days_ago))
non_owner_suggestions = 0
for author, (commit_count, additions, deletions) in suggestions:
owner_level = _GetOwnerLevel(options, author, directory)
if owner_level > -1:
owner_string = ' (owner+{})'.format(owner_level)
      else:
        non_owner_suggestions += 1
        owner_string = ''
      print('{}{}, commits: {}, additions: {}, deletions: {}'.format(
          author, owner_string, commit_count, additions, deletions))
if non_owner_suggestions >= options.max_suggestions:
break
print()
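# Sample output for one directory (hypothetical names and numbers):
#   base/android: 152 commits in the last 365 days
#   dev@chromium.org (owner+0), commits: 24, additions: 1200, deletions: 300
#   other@google.com, commits: 15, additions: 800, deletions: 40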
def _GetHeadCommitHash(options):
return _RunGitCommand(options, ['rev-parse', 'HEAD']).strip()
def _GetCacheMetadata(options):
return _GetHeadCommitHash(options), options.days_ago, options.subdirectory
def _IsCacheValid(options, metadata):
head_hash, days_ago, cached_subdirectory = metadata
if head_hash != _GetHeadCommitHash(options):
return False
if days_ago != options.days_ago:
return False
  # options.subdirectory defaults to '', which isSubDirectory treats as the
  # repo root, so the cache is valid whenever it covers the queried
  # subdirectory.
  if not isSubDirectory(cached_subdirectory, options.subdirectory):
    return False
return True
def cacheProcessedCommits(options):
metadata = _GetCacheMetadata(options)
with open(CACHE_FILENAME, 'wb') as f:
pickle.dump((metadata, DIRECTORY_AUTHORS), f)
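# The cache file thus holds ((head_hash, days_ago, subdirectory),
# DIRECTORY_AUTHORS); _IsCacheValid() above checks that this metadata still
# covers the current query.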
def maybeRestoreProcessedCommits(options):
global DIRECTORY_AUTHORS
if not path.exists(CACHE_FILENAME):
return False
with open(CACHE_FILENAME, 'rb') as f:
stored_metadata, cached_directory_authors = pickle.load(f)
if _IsCacheValid(options, stored_metadata):
print('Loading from cache')
DIRECTORY_AUTHORS = cached_directory_authors
return True
else:
print('Cache is stale or invalid, must rerun `git log`')
return False
def do(options):
if options.skip_cache or not maybeRestoreProcessedCommits(options):
processAllCommits(options)
cacheProcessedCommits(options)
directory_suggestions = computeSuggestions(options)
printSuggestions(options, directory_suggestions)
def main():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('repo_path')
parser.add_argument('--days-ago', type=int,
help='Number of days of history to search through.',
default=365, metavar='DAYS_AGO')
parser.add_argument('--subdirectory',
help='Limit suggestions to this subdirectory', default='')
parser.add_argument('--ignore-authors',
help='Ignore this comma separated list of authors')
parser.add_argument('--max-suggestions', type=int, help='Maximum number of '
'suggested authors per directory.', default=5)
  parser.add_argument('--author-cl-limit', type=int, help='Do not suggest '
                      'authors who have committed fewer than this many '
                      'commits to the directory in the last DAYS_AGO days.',
                      default=10)
  parser.add_argument('--dir-commit-limit', type=int, help='Skip directories '
                      'with fewer than this many commits in the last '
                      'DAYS_AGO days.', default=100)
parser.add_argument('--skip-cache', action='store_true',
help='Do not read from cache.', default=False)
options = parser.parse_args()
if options.ignore_authors:
options.ignore_authors = set(
map(str.strip, options.ignore_authors.split(',')))
else:
options.ignore_authors = set()
do(options)
if __name__ == '__main__':
main()