| # Copyright 2014 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Utilities for scanning source files to determine code authorship. |
| """ |
| |
| import itertools |
| |
| def ForwardSlashesToOsPathSeps(input_api, path): |
| """Converts forward slashes ('/') in the input path to OS-specific |
| path separators. Used when the paths come from outside and are using |
| UNIX path separators. Only works for relative paths! |
| Args: |
| input_api: InputAPI, as in presubmit scripts. |
| path: The path to convert. |
| Returns: |
| Converted path. |
| """ |
| return input_api.os_path.join(*path.split('/')) |
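
# Usage sketch (illustrative, not part of the original module): with an
# input_api whose os_path is the standard os.path module,
#   ForwardSlashesToOsPathSeps(input_api, 'tools/perf/page_sets')
# returns 'tools\\perf\\page_sets' on Windows and the path unchanged on
# POSIX systems.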
| |
| def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list): |
| """Similar to UNIX utility find(1), searches for files in the directories. |
| Automatically leaves out only source code files and excludes third_party |
| directories. |
| Args: |
| input_api: InputAPI, as in presubmit scripts. |
| root_dir: The root directory, to which all other paths are relative. |
| start_paths_list: The list of paths to start search from. Each path can |
| be a file or a directory. |
| excluded_dirs_list: The list of directories to skip. |
| Returns: |
| The list of source code files found, relative to |root_dir|. |
| """ |
  excluded_dirs_list = [d for d in excluded_dirs_list if 'third_party' not in d]
  # Using a common pattern for all third-party directories makes the ignore
  # regexp shorter.
  excluded_dirs_list.append('third_party')
| |
| path_join = input_api.os_path.join |
| EXTRA_EXCLUDED_DIRS = [ |
| # VCS dirs |
| path_join('.git'), |
| path_join('.svn'), |
| # Build output |
| path_join('out', 'Debug'), |
| path_join('out', 'Release'), |
| # 'Copyright' appears in license agreements |
| path_join('chrome', 'app', 'resources'), |
| # Quickoffice js files from internal src used on buildbots. |
| # crbug.com/350472. |
| path_join('chrome', 'browser', 'resources', 'chromeos', 'quickoffice'), |
| # This is a test output directory |
| path_join('chrome', 'tools', 'test', 'reference_build'), |
    # Blink-style copyright headers.
    path_join('content', 'shell', 'renderer', 'test_runner'),
    # Blink-style copyright headers.
    path_join('content', 'shell', 'tools', 'plugin'),
    # This is a tests directory that doesn't exist in the snapshot.
| path_join('content', 'test', 'data'), |
| # This is a tests directory that doesn't exist in the shipped product. |
| path_join('gin', 'test'), |
| # This is a test output directory |
| path_join('data', 'dom_perf'), |
| # This is a tests directory that doesn't exist in the shipped product. |
| path_join('tools', 'perf', 'page_sets'), |
| path_join('tools', 'perf', 'page_sets', 'tough_animation_cases'), |
    # Histogram tools, which don't exist in the snapshot.
    path_join('tools', 'histograms'),
    # Swarming tools, which don't exist in the snapshot.
    path_join('tools', 'swarming_client'),
    # ARM sysroot, which doesn't exist in the snapshot.
| path_join('chrome', 'installer', 'linux', 'debian_wheezy_arm-sysroot'), |
| # Old location (TODO(sbc): Remove this once it no longer exists on any bots) |
| path_join('arm-sysroot'), |
    # Data that is not part of open-source Chromium but is included on some
    # bots.
    path_join('data'),
    # This is not part of open-source Chromium but is included on some bots.
    path_join('skia', 'tools', 'clusterfuzz-data'),
    # Not shipped; relates only to Chrome for Android, not to WebView.
| path_join('clank'), |
| ] |
| excluded_dirs_list.extend(EXTRA_EXCLUDED_DIRS) |
| |
  # Surround the directory names with OS path separators: joining '.', the
  # name, and an empty component yields '.<sep><name><sep>', and slicing off
  # the leading '.' leaves '<sep><name><sep>'.
  dirs_blacklist = [path_join('.', d, '')[1:] for d in excluded_dirs_list if d]
| def IsBlacklistedDir(d): |
| for item in dirs_blacklist: |
| if item in d: |
| return True |
| return False |
| |
| files_whitelist_re = input_api.re.compile( |
| r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)' |
| '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?' |
| '|tex|mli?)$') |
| files = [] |
| |
| base_path_len = len(root_dir) |
| for path in start_paths_list: |
| full_path = path_join(root_dir, path) |
| if input_api.os_path.isfile(full_path): |
| if files_whitelist_re.search(path) and \ |
| not IsBlacklistedDir(full_path[base_path_len:]): # Keep '/' prefix. |
| files.append(path) |
| else: |
| for dirpath, dirnames, filenames in input_api.os_walk(full_path): |
| # Remove excluded subdirs for faster scanning. |
| for item in dirnames[:]: |
| if IsBlacklistedDir( |
| path_join(dirpath, item)[base_path_len + 1:]): |
| dirnames.remove(item) |
| for filename in filenames: |
| filepath = \ |
| path_join(dirpath, filename)[base_path_len + 1:] |
| if files_whitelist_re.search(filepath) and \ |
| not IsBlacklistedDir(filepath): |
| files.append(filepath) |
| return files |
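
# Usage sketch (illustrative; the paths are hypothetical): scan two source
# trees under |root_dir|, additionally skipping a generated-code directory:
#   files = FindFiles(input_api, root_dir, ['chrome', 'content'],
#                     [input_api.os_path.join('chrome', 'generated')])
# The result lists paths such as 'chrome/browser/foo.cc' (with OS-specific
# separators) relative to |root_dir|, with third_party and the
# EXTRA_EXCLUDED_DIRS already filtered out.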
| |
| |
| class _GeneratedFilesDetector(object): |
| GENERATED_FILE = 'GENERATED FILE' |
| NO_COPYRIGHT = '*No copyright*' |
| |
| def __init__(self, input_api): |
| self.python_multiline_string_double_re = \ |
| input_api.re.compile(r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE) |
| self.python_multiline_string_single_re = \ |
| input_api.re.compile(r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE) |
| self.automatically_generated_re = input_api.re.compile( |
| r'(All changes made in this file will be lost' |
| '|DO NOT (EDIT|delete this file)' |
| '|Generated (at|automatically|data)' |
| '|Automatically generated' |
        r'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)
| |
| def IsGeneratedFile(self, header): |
| header = header.upper() |
| if '"""' in header: |
| header = self.python_multiline_string_double_re.sub('', header) |
| if "'''" in header: |
| header = self.python_multiline_string_single_re.sub('', header) |
    # First do a cheap substring lookup to save time.
| if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header: |
| return True |
    if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
        'GENERATED' in header:
      return bool(self.automatically_generated_re.search(header))
| return False |
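
# Illustrative behavior (a sketch, not from the original module): a header
# such as '/* Automatically generated by protoc. DO NOT EDIT! */' makes
# IsGeneratedFile() return True, while an ordinary license header without
# a generation marker does not.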
| |
| |
| class _CopyrightsScanner(object): |
| @staticmethod |
| def StaticInit(input_api): |
| _CopyrightsScanner._c_comment_re = \ |
| input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''') |
| _CopyrightsScanner._copyright_indicator = \ |
| r'(?:copyright|copr\.|\xc2\xa9|\(c\))' |
| _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile( |
| r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \ |
| r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE) |
| _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile( |
| r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE) |
| |
| def __init__(self, input_api): |
| self.max_line_numbers_proximity = 3 |
| self.last_a_item_line_number = -200 |
| self.last_b_item_line_number = -100 |
| self.re = input_api.re |
| |
| def _CloseLineNumbers(self, a, b): |
| return 0 <= a - b <= self.max_line_numbers_proximity |
| |
| def MatchLine(self, line_number, line): |
| if '"' in line: |
| line = _CopyrightsScanner._c_comment_re.sub('', line) |
| upcase_line = line.upper() |
    # Record the last occurrences of '(a)' and '(b)' in C++ comments.
| # This is to filter out '(c)' used as a list item inside C++ comments. |
| # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah" |
| cpp_comment_idx = upcase_line.find('//') |
| if cpp_comment_idx != -1: |
| if upcase_line.find('(A)') > cpp_comment_idx: |
| self.last_a_item_line_number = line_number |
| if upcase_line.find('(B)') > cpp_comment_idx: |
| self.last_b_item_line_number = line_number |
| # Fast bailout, uses the same patterns as _copyright_indicator regexp. |
    if 'COPYRIGHT' not in upcase_line and 'COPR.' not in upcase_line \
        and '\xc2\xa9' not in upcase_line:
| c_item_index = upcase_line.find('(C)') |
| if c_item_index == -1: |
| return None |
| if c_item_index > cpp_comment_idx and \ |
| self._CloseLineNumbers(line_number, |
| self.last_b_item_line_number) and \ |
| self._CloseLineNumbers(self.last_b_item_line_number, |
| self.last_a_item_line_number): |
| return None |
| copyr = None |
| m = _CopyrightsScanner._full_copyright_indicator_re.search(line) |
| if m and \ |
| not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)): |
| copyr = m.group(0) |
| # Prettify the authorship string. |
      copyr = self.re.sub(r'([,.])?\s*$', '', copyr)
| copyr = self.re.sub( |
| _CopyrightsScanner._copyright_indicator, '', copyr, \ |
| flags=self.re.IGNORECASE) |
| copyr = self.re.sub(r'^\s+', '', copyr) |
| copyr = self.re.sub(r'\s{2,}', ' ', copyr) |
| copyr = self.re.sub(r'\\@', '@', copyr) |
| return copyr |
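
# Illustrative behavior (a sketch, not from the original module):
#   scanner = _CopyrightsScanner(input_api)
#   scanner.MatchLine(1, '// Copyright 2014 The Chromium Authors.')
# returns '2014 The Chromium Authors', while a '(c)' list item that closely
# follows '(a)' and '(b)' lines in a C++ comment yields None.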
| |
| |
| def FindCopyrights(input_api, root_dir, files_to_scan): |
| """Determines code autorship, and finds generated files. |
| Args: |
| input_api: InputAPI, as in presubmit scripts. |
| root_dir: The root directory, to which all other paths are relative. |
| files_to_scan: The list of file names to scan. |
| Returns: |
| The list of copyrights associated with each of the files given. |
    If a file is generated, the corresponding list consists of a single
    entry -- the GENERATED_FILE marker string. If the file has no copyright
    info, the corresponding list contains the NO_COPYRIGHT marker string.
| """ |
| generated_files_detector = _GeneratedFilesDetector(input_api) |
| _CopyrightsScanner.StaticInit(input_api) |
| copyrights = [] |
| for file_name in files_to_scan: |
| linenum = 0 |
| header = [] |
| file_copyrights = [] |
| scanner = _CopyrightsScanner(input_api) |
| contents = input_api.ReadFile( |
| input_api.os_path.join(root_dir, file_name), 'r') |
| for l in contents.split('\n'): |
| linenum += 1 |
      # Only the first 25 lines are treated as the file header.
      if linenum <= 25:
| header.append(l) |
| c = scanner.MatchLine(linenum, l) |
| if c: |
| file_copyrights.append(c) |
| if generated_files_detector.IsGeneratedFile('\n'.join(header)): |
| copyrights.append([_GeneratedFilesDetector.GENERATED_FILE]) |
| elif file_copyrights: |
| copyrights.append(file_copyrights) |
| else: |
| copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT]) |
| return copyrights |
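
# Result shape sketch (illustrative): for three scanned files the return
# value could be
#   [['2014 The Chromium Authors. All rights reserved'],
#    [_GeneratedFilesDetector.GENERATED_FILE],
#    [_GeneratedFilesDetector.NO_COPYRIGHT]]
# i.e. one list of copyright strings (or a marker entry) per file.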
| |
| |
| def FindCopyrightViolations(input_api, root_dir, files_to_scan): |
| """Looks for files that are not belong exlusively to the Chromium Authors. |
| Args: |
| input_api: InputAPI, as in presubmit scripts. |
| root_dir: The root directory, to which all other paths are relative. |
| files_to_scan: The list of file names to scan. |
| Returns: |
| The list of file names that contain non-Chromium copyrights. |
| """ |
| copyrights = FindCopyrights(input_api, root_dir, files_to_scan) |
| offending_files = [] |
| allowed_copyrights_re = input_api.re.compile( |
| r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. ' |
| 'All rights reserved.*)$') |
| for f, cs in itertools.izip(files_to_scan, copyrights): |
| if cs[0] == _GeneratedFilesDetector.GENERATED_FILE or \ |
| cs[0] == _GeneratedFilesDetector.NO_COPYRIGHT: |
| continue |
| for c in cs: |
| if not allowed_copyrights_re.match(c): |
| offending_files.append(input_api.os_path.normpath(f)) |
| break |
| return offending_files |
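
# Note on the allowed pattern (illustrative): extracted copyrights such as
# '2014 The Chromium Authors. All rights reserved' or the
# '2013-2014 The Chromium Authors. All rights reserved' variant pass
# allowed_copyrights_re; any other author marks the file as offending.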
| |
| |
| def _GetWhitelistFileName(input_api): |
| return input_api.os_path.join( |
| 'android_webview', 'tools', 'third_party_files_whitelist.txt') |
| |
| def _ProcessWhitelistedFilesList(input_api, lines): |
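  """Extracts file paths from the lines of the whitelist file.
  A line contributes a path only if it starts with a character other than
  '#' or whitespace; the line's first token is taken as the path, in UNIX
  separators.
  """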
| whitelisted_files = [] |
| for line in lines: |
| match = input_api.re.match(r'([^#\s]+)', line) |
| if match: |
| whitelisted_files.append( |
| ForwardSlashesToOsPathSeps(input_api, match.group(1))) |
| return whitelisted_files |
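
# Whitelist format sketch (the entry below is hypothetical): a line like
#   base/some_file.cc  # Reason for whitelisting.
# contributes 'base/some_file.cc' (converted to OS-specific separators),
# while blank, indented and '#'-prefixed lines contribute nothing.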
| |
| |
| def LoadWhitelistedFilesList(input_api): |
| """Loads and parses the 3rd party code whitelist file. |
| input_api: InputAPI of presubmit scripts. |
| Returns: |
| The list of files. |
| """ |
| full_file_name = input_api.os_path.join( |
| input_api.change.RepositoryRoot(), _GetWhitelistFileName(input_api)) |
| file_data = input_api.ReadFile(full_file_name, 'rb') |
| return _ProcessWhitelistedFilesList(input_api, file_data.splitlines()) |
| |
| |
| def AnalyzeScanResults(input_api, whitelisted_files, offending_files): |
| """Compares whitelist contents with the results of file scanning. |
| input_api: InputAPI of presubmit scripts. |
| whitelisted_files: Whitelisted files list. |
| offending_files: Files that contain 3rd party code. |
| Returns: |
| A triplet of "unknown", "missing", and "stale" file lists. |
| "Unknown" are files that contain 3rd party code but not whitelisted. |
| "Missing" are files that are whitelisted but doesn't really exist. |
| "Stale" are files that are whitelisted unnecessarily. |
| """ |
| unknown = set(offending_files) - set(whitelisted_files) |
| missing = [f for f in whitelisted_files if not input_api.os_path.isfile( |
| input_api.os_path.join(input_api.change.RepositoryRoot(), f))] |
| stale = set(whitelisted_files) - set(offending_files) - set(missing) |
| return (list(unknown), missing, list(stale)) |
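
# Outcome sketch (illustrative): with whitelisted_files = ['a.cc', 'b.cc'],
# offending_files = ['a.cc', 'c.cc'], and both whitelisted files present on
# disk, the result is (['c.cc'], [], ['b.cc']): 'c.cc' is unknown, nothing
# is missing, and 'b.cc' is stale.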
| |
| |
| def _GetDeletedContents(affected_file): |
| """Returns a list of all deleted lines. |
  The AffectedFile class from presubmit_support lacks this functionality.
| """ |
| deleted_lines = [] |
| for line in affected_file.GenerateScmDiff().splitlines(): |
| if line.startswith('-') and not line.startswith('--'): |
| deleted_lines.append(line[1:]) |
| return deleted_lines |
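
# Diff parsing sketch (illustrative): given an SCM diff fragment such as
#   --- a/foo.cc
#   -old line
#   +new line
# only 'old line' is returned; '-'-prefixed lines are deletions, and the
# '---' file header is skipped by the startswith('--') check.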
| |
| def _DoScanAtPresubmit(input_api, whitelisted_files, files_to_check): |
  # We pass an empty 'known third-party' dirs list here. Since this is a patch
  # for Chromium's src tree, it must contain properly licensed Chromium code.
  # Any third-party code must be put into a directory named 'third_party',
  # and such dirs are automatically excluded by FindFiles.
| files_to_scan = FindFiles( |
| input_api, input_api.change.RepositoryRoot(), files_to_check, []) |
| offending_files = FindCopyrightViolations( |
| input_api, input_api.change.RepositoryRoot(), files_to_scan) |
| return AnalyzeScanResults( |
| input_api, whitelisted_files, offending_files) |
| |
| def ScanAtPresubmit(input_api, output_api): |
| """Invoked at change presubmit time. Verifies that updated non third-party |
| code doesn't contain external copyrighted code. |
| input_api: InputAPI of presubmit scripts. |
| output_api: OutputAPI of presubmit scripts. |
| """ |
  files_to_check = set()
  deleted_files = set()
| whitelist_contents_changed = False |
| for f in input_api.AffectedFiles(): |
| if f.LocalPath() == _GetWhitelistFileName(input_api): |
| whitelist_contents_changed = True |
| deleted_files |= set(_ProcessWhitelistedFilesList( |
| input_api, _GetDeletedContents(f))) |
| continue |
| if f.Action() != 'D': |
| files_to_check.add(f.LocalPath()) |
| else: |
| deleted_files.add(f.LocalPath()) |
| whitelisted_files = set(LoadWhitelistedFilesList(input_api)) |
| if not whitelist_contents_changed: |
| whitelisted_files &= files_to_check | deleted_files |
| else: |
| # Need to re-check the entire contents of the whitelist file. |
| # Also add files removed from the whitelist. If the file has indeed been |
| # deleted, the scanner will not complain. |
| files_to_check |= whitelisted_files | deleted_files |
| |
| (unknown_files, missing_files, stale_files) = _DoScanAtPresubmit( |
| input_api, list(whitelisted_files), list(files_to_check)) |
| results = [] |
| if unknown_files: |
| results.append(output_api.PresubmitError( |
| 'The following files contain a third-party license but are not in ' \ |
| 'a listed third-party directory and are not whitelisted. You must ' \ |
| 'add the following files to the whitelist file ' \ |
| '%s:' % _GetWhitelistFileName(input_api), |
| sorted(unknown_files))) |
| if missing_files: |
| results.append(output_api.PresubmitPromptWarning( |
        'The following files are whitelisted in %s, but do not exist or ' \
        'are not files:' % _GetWhitelistFileName(input_api),
| sorted(missing_files))) |
| if stale_files: |
| results.append(output_api.PresubmitPromptWarning( |
| 'The following files are whitelisted unnecessarily. You must ' \ |
| 'remove the following files from the whitelist file ' \ |
| '%s:' % _GetWhitelistFileName(input_api), |
| sorted(stale_files))) |
| return results |
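

if __name__ == '__main__':
  # Standalone smoke-test sketch (an assumption, not part of the original
  # presubmit flow): fake the few InputAPI members the scanners use, then
  # print the copyright notices found under the current directory.
  import os
  import re

  class _FakeInputApi(object):
    """Stands in for the presubmit InputAPI; only what the scanners need."""
    os_path = os.path
    os_walk = staticmethod(os.walk)
    re = re

    @staticmethod
    def ReadFile(path, mode='r'):
      with open(path, mode) as f:
        return f.read()

  api = _FakeInputApi()
  root = os.getcwd()
  scanned = FindFiles(api, root, ['.'], [])
  all_copyrights = FindCopyrights(api, root, scanned)
  for name, notices in itertools.izip(scanned, all_copyrights):
    print '%s: %s' % (name, '; '.join(notices))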