blob: 89771ad142d05307f6b4d8152d25e1d81691bdd5 [file] [log] [blame]
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Utilities for scanning source files to determine code authorship.
"""
import itertools
def ForwardSlashesToOsPathSeps(input_api, path):
"""Converts forward slashes ('/') in the input path to OS-specific
path separators. Used when the paths come from outside and are using
UNIX path separators. Only works for relative paths!
Args:
input_api: InputAPI, as in presubmit scripts.
path: The path to convert.
Returns:
Converted path.
"""
return input_api.os_path.join(*path.split('/'))
def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list):
"""Similar to UNIX utility find(1), searches for files in the directories.
Automatically leaves out only source code files and excludes third_party
directories.
Args:
input_api: InputAPI, as in presubmit scripts.
root_dir: The root directory, to which all other paths are relative.
start_paths_list: The list of paths to start search from. Each path can
be a file or a directory.
excluded_dirs_list: The list of directories to skip.
Returns:
The list of source code files found, relative to |root_dir|.
"""
excluded_dirs_list = [d for d in excluded_dirs_list if not 'third_party' in d]
# Using a common pattern for third-partyies makes the ignore regexp shorter
excluded_dirs_list.append('third_party')
path_join = input_api.os_path.join
EXTRA_EXCLUDED_DIRS = [
# VCS dirs
path_join('.git'),
path_join('.svn'),
# Build output
path_join('out', 'Debug'),
path_join('out', 'Release'),
# 'Copyright' appears in license agreements
path_join('chrome', 'app', 'resources'),
# Quickoffice js files from internal src used on buildbots.
# crbug.com/350472.
path_join('chrome', 'browser', 'resources', 'chromeos', 'quickoffice'),
# blink style copy right headers.
path_join('content', 'shell', 'renderer', 'test_runner'),
# blink style copy right headers.
path_join('content', 'shell', 'tools', 'plugin'),
# This is tests directory, doesn't exist in the snapshot
path_join('content', 'test', 'data'),
# This is a tests directory that doesn't exist in the shipped product.
path_join('gin', 'test'),
# This is a test output directory
path_join('data', 'dom_perf'),
# This is a tests directory that doesn't exist in the shipped product.
path_join('tools', 'perf', 'page_sets'),
path_join('tools', 'perf', 'page_sets', 'tough_animation_cases'),
# Histogram tools, doesn't exist in the snapshot
path_join('tools', 'histograms'),
# Swarming tools, doesn't exist in the snapshot
path_join('tools', 'swarming_client'),
# Don't check downloaded goma client binaries.
path_join('build', 'goma', 'client'),
# Ignore sysroots.
path_join('build', 'linux', 'debian_jessie_arm64-sysroot'),
path_join('build', 'linux', 'debian_wheezy_amd64-sysroot'),
path_join('build', 'linux', 'debian_wheezy_arm-sysroot'),
path_join('build', 'linux', 'debian_wheezy_mips-sysroot'),
path_join('build', 'linux', 'debian_wheezy_i386-sysroot'),
# Old location (TODO(sbc): Remove this once it no longer exists on any bots)
path_join('chrome', 'installer', 'linux', 'debian_wheezy_arm-sysroot'),
# Data is not part of open source chromium, but are included on some bots.
path_join('data'),
# This is not part of open source chromium, but are included on some bots.
path_join('skia', 'tools', 'clusterfuzz-data'),
# Not shipped, only relates to Chrome for Android, but not to WebView
path_join('clank'),
# Internal-only repository.
path_join('remoting', 'android', 'internal'),
]
excluded_dirs_list.extend(EXTRA_EXCLUDED_DIRS)
# Surround the directory names with OS path separators.
dirs_blacklist = [path_join('.', d, '')[1:] for d in excluded_dirs_list if d]
def IsBlacklistedDir(d):
for item in dirs_blacklist:
if item in d:
return True
return False
files_whitelist_re = input_api.re.compile(
r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
'|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
'|tex|mli?)$')
files = []
base_path_len = len(root_dir)
for path in start_paths_list:
full_path = path_join(root_dir, path)
if input_api.os_path.isfile(full_path):
if files_whitelist_re.search(path) and \
not IsBlacklistedDir(full_path[base_path_len:]): # Keep '/' prefix.
files.append(path)
else:
for dirpath, dirnames, filenames in input_api.os_walk(full_path):
# Remove excluded subdirs for faster scanning.
for item in dirnames[:]:
if IsBlacklistedDir(
path_join(dirpath, item)[base_path_len + 1:]):
dirnames.remove(item)
for filename in filenames:
filepath = \
path_join(dirpath, filename)[base_path_len + 1:]
if files_whitelist_re.search(filepath) and \
not IsBlacklistedDir(filepath):
files.append(filepath)
return files
class _GeneratedFilesDetector(object):
GENERATED_FILE = 'GENERATED FILE'
NO_COPYRIGHT = '*No copyright*'
def __init__(self, input_api):
self.python_multiline_string_double_re = \
input_api.re.compile(r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE)
self.python_multiline_string_single_re = \
input_api.re.compile(r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE)
self.automatically_generated_re = input_api.re.compile(
r'(All changes made in this file will be lost'
'|DO NOT (EDIT|delete this file)'
'|Generated (at|automatically|data)'
'|Automatically generated'
'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)
def IsGeneratedFile(self, header):
header = header.upper()
if '"""' in header:
header = self.python_multiline_string_double_re.sub('', header)
if "'''" in header:
header = self.python_multiline_string_single_re.sub('', header)
# First do simple strings lookup to save time.
if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
return True
if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
'GENERATED' in header:
return self.automatically_generated_re.search(header)
return False
class _CopyrightsScanner(object):
@staticmethod
def StaticInit(input_api):
_CopyrightsScanner._c_comment_re = \
input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
_CopyrightsScanner._copyright_indicator = \
r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
_CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(
r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \
r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE)
_CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(
r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE)
def __init__(self, input_api):
self.max_line_numbers_proximity = 3
self.last_a_item_line_number = -200
self.last_b_item_line_number = -100
self.re = input_api.re
def _CloseLineNumbers(self, a, b):
return 0 <= a - b <= self.max_line_numbers_proximity
def MatchLine(self, line_number, line):
if '"' in line:
line = _CopyrightsScanner._c_comment_re.sub('', line)
upcase_line = line.upper()
# Record '(a)' and '(b)' last occurences in C++ comments.
# This is to filter out '(c)' used as a list item inside C++ comments.
# E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
cpp_comment_idx = upcase_line.find('//')
if cpp_comment_idx != -1:
if upcase_line.find('(A)') > cpp_comment_idx:
self.last_a_item_line_number = line_number
if upcase_line.find('(B)') > cpp_comment_idx:
self.last_b_item_line_number = line_number
# Fast bailout, uses the same patterns as _copyright_indicator regexp.
if not 'COPYRIGHT' in upcase_line and not 'COPR.' in upcase_line \
and not '\xc2\xa9' in upcase_line:
c_item_index = upcase_line.find('(C)')
if c_item_index == -1:
return None
if c_item_index > cpp_comment_idx and \
self._CloseLineNumbers(line_number,
self.last_b_item_line_number) and \
self._CloseLineNumbers(self.last_b_item_line_number,
self.last_a_item_line_number):
return None
copyr = None
m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
if m and \
not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
copyr = m.group(0)
# Prettify the authorship string.
copyr = self.re.sub(r'([,.])?\s*$/', '', copyr)
copyr = self.re.sub(
_CopyrightsScanner._copyright_indicator, '', copyr, \
flags=self.re.IGNORECASE)
copyr = self.re.sub(r'^\s+', '', copyr)
copyr = self.re.sub(r'\s{2,}', ' ', copyr)
copyr = self.re.sub(r'\\@', '@', copyr)
return copyr
def FindCopyrights(input_api, root_dir, files_to_scan):
"""Determines code autorship, and finds generated files.
Args:
input_api: InputAPI, as in presubmit scripts.
root_dir: The root directory, to which all other paths are relative.
files_to_scan: The list of file names to scan.
Returns:
The list of copyrights associated with each of the files given.
If the certain file is generated, the corresponding list consists a single
entry -- 'GENERATED_FILE' string. If the file has no copyright info,
the corresponding list contains 'NO_COPYRIGHT' string.
"""
generated_files_detector = _GeneratedFilesDetector(input_api)
_CopyrightsScanner.StaticInit(input_api)
copyrights = []
for file_name in files_to_scan:
linenum = 0
header = []
file_copyrights = []
scanner = _CopyrightsScanner(input_api)
contents = input_api.ReadFile(
input_api.os_path.join(root_dir, file_name), 'r')
for l in contents.split('\n'):
linenum += 1
if linenum <= 25:
header.append(l)
c = scanner.MatchLine(linenum, l)
if c:
file_copyrights.append(c)
if generated_files_detector.IsGeneratedFile('\n'.join(header)):
copyrights.append([_GeneratedFilesDetector.GENERATED_FILE])
elif file_copyrights:
copyrights.append(file_copyrights)
else:
copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT])
return copyrights
def FindCopyrightViolations(input_api, root_dir, files_to_scan):
"""Looks for files that are not belong exlusively to the Chromium Authors.
Args:
input_api: InputAPI, as in presubmit scripts.
root_dir: The root directory, to which all other paths are relative.
files_to_scan: The list of file names to scan.
Returns:
The list of file names that contain non-Chromium copyrights.
"""
copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
offending_files = []
allowed_copyrights_re = input_api.re.compile(
r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
'All rights reserved.*)$')
for f, cs in itertools.izip(files_to_scan, copyrights):
if cs[0] == _GeneratedFilesDetector.GENERATED_FILE or \
cs[0] == _GeneratedFilesDetector.NO_COPYRIGHT:
continue
for c in cs:
if not allowed_copyrights_re.match(c):
offending_files.append(input_api.os_path.normpath(f))
break
return offending_files
def _GetWhitelistFileName(input_api):
return input_api.os_path.join(
'tools', 'copyright_scanner', 'third_party_files_whitelist.txt')
def _ProcessWhitelistedFilesList(input_api, lines):
whitelisted_files = []
for line in lines:
match = input_api.re.match(r'([^#\s]+)', line)
if match:
whitelisted_files.append(
ForwardSlashesToOsPathSeps(input_api, match.group(1)))
return whitelisted_files
def LoadWhitelistedFilesList(input_api):
"""Loads and parses the 3rd party code whitelist file.
input_api: InputAPI of presubmit scripts.
Returns:
The list of files.
"""
full_file_name = input_api.os_path.join(
input_api.change.RepositoryRoot(), _GetWhitelistFileName(input_api))
file_data = input_api.ReadFile(full_file_name, 'rb')
return _ProcessWhitelistedFilesList(input_api, file_data.splitlines())
def AnalyzeScanResults(input_api, whitelisted_files, offending_files):
"""Compares whitelist contents with the results of file scanning.
input_api: InputAPI of presubmit scripts.
whitelisted_files: Whitelisted files list.
offending_files: Files that contain 3rd party code.
Returns:
A triplet of "unknown", "missing", and "stale" file lists.
"Unknown" are files that contain 3rd party code but not whitelisted.
"Missing" are files that are whitelisted but doesn't really exist.
"Stale" are files that are whitelisted unnecessarily.
"""
unknown = set(offending_files) - set(whitelisted_files)
missing = [f for f in whitelisted_files if not input_api.os_path.isfile(
input_api.os_path.join(input_api.change.RepositoryRoot(), f))]
stale = set(whitelisted_files) - set(offending_files) - set(missing)
return (list(unknown), missing, list(stale))
def _GetDeletedContents(affected_file):
"""Returns a list of all deleted lines.
AffectedFile class from presubmit_support is lacking this functionality.
"""
deleted_lines = []
for line in affected_file.GenerateScmDiff().splitlines():
if line.startswith('-') and not line.startswith('--'):
deleted_lines.append(line[1:])
return deleted_lines
def _DoScanAtPresubmit(input_api, whitelisted_files, files_to_check):
# We pass empty 'known third-party' dirs list here. Since this is a patch
# for the Chromium's src tree, it must contain properly licensed Chromium
# code. Any third-party code must be put into a directory named 'third_party',
# and such dirs are automatically excluded by FindFiles.
files_to_scan = FindFiles(
input_api, input_api.change.RepositoryRoot(), files_to_check, [])
offending_files = FindCopyrightViolations(
input_api, input_api.change.RepositoryRoot(), files_to_scan)
return AnalyzeScanResults(
input_api, whitelisted_files, offending_files)
def ScanAtPresubmit(input_api, output_api):
"""Invoked at change presubmit time. Verifies that updated non third-party
code doesn't contain external copyrighted code.
input_api: InputAPI of presubmit scripts.
output_api: OutputAPI of presubmit scripts.
"""
files_to_check = set([])
deleted_files = set([])
whitelist_contents_changed = False
for f in input_api.AffectedFiles():
if f.LocalPath() == _GetWhitelistFileName(input_api):
whitelist_contents_changed = True
deleted_files |= set(_ProcessWhitelistedFilesList(
input_api, _GetDeletedContents(f)))
continue
if f.Action() != 'D':
files_to_check.add(f.LocalPath())
else:
deleted_files.add(f.LocalPath())
whitelisted_files = set(LoadWhitelistedFilesList(input_api))
if not whitelist_contents_changed:
whitelisted_files &= files_to_check | deleted_files
else:
# Need to re-check the entire contents of the whitelist file.
# Also add files removed from the whitelist. If the file has indeed been
# deleted, the scanner will not complain.
files_to_check |= whitelisted_files | deleted_files
(unknown_files, missing_files, stale_files) = _DoScanAtPresubmit(
input_api, list(whitelisted_files), list(files_to_check))
results = []
if unknown_files:
results.append(output_api.PresubmitError(
'The following files contain a third-party license but are not in ' \
'a listed third-party directory and are not whitelisted. You must ' \
'add the following files to the whitelist file %s\n' \
'(Note that if the code you are adding does not actually contain ' \
'any third-party code, it may contain the word "copyright", which ' \
'should be masked out, e.g. by writing it as "copy-right"):' \
'' % _GetWhitelistFileName(input_api),
sorted(unknown_files)))
if missing_files:
results.append(output_api.PresubmitPromptWarning(
'The following files are whitelisted in %s, ' \
'but do not exist or not files:' % _GetWhitelistFileName(input_api),
sorted(missing_files)))
if stale_files:
results.append(output_api.PresubmitPromptWarning(
'The following files are whitelisted unnecessarily. You must ' \
'remove the following files from the whitelist file ' \
'%s:' % _GetWhitelistFileName(input_api),
sorted(stale_files)))
return results