blob: f747cb8a36f8559aac39b46695e28bb48c9f20e8 [file] [log] [blame]
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Checks third-party licenses for the purposes of the Android WebView build.

The Android tree includes a snapshot of Chromium in order to power the system
WebView. This tool checks that all code uses open-source licenses compatible
with Android, and that we meet the requirements of those licenses. It can also
be used to generate an Android NOTICE file for the third-party code.

It makes use of src/tools/licenses.py and the README.chromium files on which
it depends. It also makes use of a data file, third_party_files_whitelist.txt,
which whitelists individual files which contain third-party code but which
aren't in a third-party directory with a README.chromium file.
"""
import imp
import json
import multiprocessing
import optparse
import os
import re
import sys
import textwrap
# Absolute path of the directory two levels above the one containing this
# script.
REPOSITORY_ROOT = os.path.abspath(os.path.join(
    os.path.dirname(__file__), '..', '..'))

# Import third_party/PRESUBMIT.py via imp to avoid importing a random
# PRESUBMIT.py from $PATH, also make sure we don't generate a .pyc file.
sys.dont_write_bytecode = True
third_party = imp.load_source(
    'PRESUBMIT',
    os.path.join(REPOSITORY_ROOT, 'third_party', 'PRESUBMIT.py'))

# Make the in-tree third_party directory importable before importing jinja2.
sys.path.append(os.path.join(REPOSITORY_ROOT, 'third_party'))
import jinja2
# Make the in-tree tools directory importable before importing the scanner
# and the licenses module.
sys.path.append(os.path.join(REPOSITORY_ROOT, 'tools'))
from copyright_scanner import copyright_scanner
import licenses
class InputApi(object):
  """Minimal stand-in for the presubmit InputApi object.

  Instances are handed to the copyright_scanner functions, which receive the
  modules and helpers they need through these attributes.
  """

  def __init__(self):
    self.os_path = os.path
    # os_walk and re are two separate attributes. They must not be chained
    # into one assignment: that would rebind os.walk itself to the re module.
    self.os_walk = os.walk
    self.re = re
    self.ReadFile = _ReadFile
    self.change = InputApiChange()
class InputApiChange(object):
  """Stand-in for the `change` attribute of InputApi."""

  def __init__(self):
    # Callable attribute that returns the module-level REPOSITORY_ROOT.
    self.RepositoryRoot = lambda: REPOSITORY_ROOT
class ScanResult(object):
  """Outcome codes of a scan, in increasing order of severity.

  These are the values returned by main().
  """
  Ok = 0
  Warnings = 1
  Errors = 2
# Needs to be a top-level function for multiprocessing
def _FindCopyrightViolations(files_to_scan_as_string):
  """Runs the copyright scanner over one shard of files.

  Args:
    files_to_scan_as_string: The shard of files to scan; passed through
      unchanged to copyright_scanner.FindCopyrightViolations.
  Returns:
    Whatever copyright_scanner.FindCopyrightViolations returns for the shard.
  """
  return copyright_scanner.FindCopyrightViolations(
      InputApi(), REPOSITORY_ROOT, files_to_scan_as_string)
def _ShardList(l, shard_len):
  """Splits a list into consecutive slices of at most shard_len items.

  Args:
    l: The list to split.
    shard_len: Maximum number of items per shard. Expected to be a positive
      integer (range() rejects a step of 0).
  Returns:
    A list of slices of l, in the original order. Only the last slice may be
    shorter than shard_len. Empty if l is empty.
  """
  return [l[i:i + shard_len] for i in range(0, len(l), shard_len)]
def _CheckLicenseHeaders(excluded_dirs_list, whitelisted_files):
  """Checks that all files which are not in a listed third-party directory,
  and which do not use the standard Chromium license, are whitelisted.

  Args:
    excluded_dirs_list: The list of directories to exclude from scanning.
    whitelisted_files: The whitelist of files.
  Returns:
    A (code, problem_paths) tuple. code is:
      ScanResult.Ok if all files with non-standard license headers are
        whitelisted and the whitelist contains no stale entries;
      ScanResult.Warnings if there are stale or missing entries;
      ScanResult.Errors if new non-whitelisted entries found.
    problem_paths is the sorted, de-duplicated list of all unknown, missing
    and stale paths.
  """
  input_api = InputApi()
  files_to_scan = copyright_scanner.FindFiles(
      input_api, REPOSITORY_ROOT, ['.'], excluded_dirs_list)
  sharded_files_to_scan = _ShardList(files_to_scan, 2000)
  pool = multiprocessing.Pool()
  try:
    # The huge timeout presumably keeps the wait interruptible (Ctrl-C) on
    # Python 2 -- NOTE(review): confirm before changing it to a plain map().
    offending_files_chunks = pool.map_async(
        _FindCopyrightViolations, sharded_files_to_scan).get(999999)
  except BaseException:
    # Do not leave worker processes behind when the scan fails or is aborted.
    pool.terminate()
    raise
  else:
    pool.close()
  finally:
    pool.join()
  # Flatten out the result
  offending_files = [
      item for sublist in offending_files_chunks for item in sublist]

  (unknown, missing, stale) = copyright_scanner.AnalyzeScanResults(
      input_api, whitelisted_files, offending_files)

  # The three results are assumed to be lists of path strings (they are
  # concatenated and sorted below).
  if unknown:
    print('The following files contain a third-party license but are not in '
          'a listed third-party directory and are not whitelisted. You must '
          'add the following files to the whitelist.\n'
          '(Note that if the code you are adding does not actually contain '
          'any third-party code, it may contain the word "copyright", which '
          'should be masked out, e.g. by writing it as "copy-right")\n%s' %
          '\n'.join(sorted(unknown)))
  if missing:
    print('The following files are whitelisted, but do not exist.\n%s' %
          '\n'.join(sorted(missing)))
  if stale:
    print('The following files are whitelisted unnecessarily. You must '
          'remove the following files from the whitelist.\n%s' %
          '\n'.join(sorted(stale)))

  if unknown:
    code = ScanResult.Errors
  elif stale or missing:
    code = ScanResult.Warnings
  else:
    code = ScanResult.Ok

  problem_paths = sorted(set(unknown + missing + stale))
  return (code, problem_paths)
def _ReadFile(full_path, mode='rU'):
  """Reads a file from disk. This emulates presubmit InputApi.ReadFile func.

  Args:
    full_path: The path of the file to read.
    mode: The mode string passed to open(). Defaults to 'rU'.
  Returns:
    The contents of the file as a string.
  """
  with open(full_path, mode) as f:
    return f.read()
def _Scan():
  """Checks that license meta-data is present for all third-party code and
  that all non third-party code doesn't contain external copyrighted code.

  Returns:
    A (code, problem_paths) tuple. code is:
      ScanResult.Ok if everything is in order;
      ScanResult.Warnings if there are non-fatal problems (e.g. stale
        whitelist entries);
      ScanResult.Errors otherwise.
    problem_paths lists the third-party directories whose license meta-data
    failed to parse, followed by the paths reported by _CheckLicenseHeaders.
  """
  third_party_dirs = licenses.FindThirdPartyDirsWithFiles(REPOSITORY_ROOT)

  problem_paths = []

  # First, check designated third-party directories using
  # src/tools/licenses.py.
  all_licenses_valid = True
  for path in sorted(third_party_dirs):
    try:
      licenses.ParseDir(path, REPOSITORY_ROOT)
    except licenses.LicenseError as e:
      print('Got LicenseError "%s" while scanning %s' % (e, path))
      problem_paths.append(path)
      all_licenses_valid = False

  # Second, check for non-standard license text.
  whitelisted_files = copyright_scanner.LoadWhitelistedFilesList(InputApi())
  licenses_check, more_problem_paths = _CheckLicenseHeaders(
      third_party_dirs, whitelisted_files)
  problem_paths.extend(more_problem_paths)

  # A license meta-data failure is always an error, whatever the header check
  # concluded.
  return (licenses_check if all_licenses_valid else ScanResult.Errors,
          problem_paths)
class TemplateEntryGenerator(object):
  """Turns license meta-data dicts into entries for the NOTICE template.

  Keeps a running counter so that every entry gets a distinct 'toc_href'.
  """

  def __init__(self):
    self._generate_licenses_file_list_only = False
    self._toc_index = 0

  def SetGenerateLicensesFileListOnly(self, generate_licenses_file_list_only):
    """Selects whether license files are actually read.

    Args:
      generate_licenses_file_list_only: If True, license file contents are
        not read and every entry gets an empty 'license' text.
    """
    self._generate_licenses_file_list_only = generate_licenses_file_list_only

  def _ReadFileGuessEncoding(self, name):
    """Reads a license file and decodes it as UTF-8, falling back to CP-1252.

    Args:
      name: The path of the file to read.
    Returns:
      The decoded contents, or '' without touching the file when only the
      list of license files is being generated.
    Raises:
      UnicodeDecodeError: If the contents are neither UTF-8 nor CP-1252.
    """
    if self._generate_licenses_file_list_only:
      return ''
    with open(name, 'rb') as input_file:
      contents = input_file.read()
    try:
      return contents.decode('utf8')
    except UnicodeDecodeError:
      # If it's not UTF-8, it must be CP-1252. Fail otherwise.
      return contents.decode('cp1252')

  def MetadataToTemplateEntry(self, metadata):
    """Builds the template entry for one piece of license meta-data.

    Args:
      metadata: A dict with the keys 'Name', 'URL' and 'License File'.
    Returns:
      A dict with the keys 'name', 'url', 'license_file', 'license' (the
      decoded license text) and 'toc_href' ('entry' followed by a counter
      that starts at 1 and grows with every call).
    """
    self._toc_index += 1
    return {
        'name': metadata['Name'],
        'url': metadata['URL'],
        'license_file': metadata['License File'],
        'license': self._ReadFileGuessEncoding(metadata['License File']),
        'toc_href': 'entry' + str(self._toc_index),
    }
def GenerateNoticeFile(generate_licenses_file_list_only=False):
  """Generates the contents of an Android NOTICE file for the third-party code.

  This is used by the snapshot tool.

  Args:
    generate_licenses_file_list_only: If True, license files are neither read
      nor rendered; only their paths are collected.
  Returns:
    The contents of the NOTICE file, UTF-8 encoded. If
    generate_licenses_file_list_only is True, the list of license file paths
    instead (it may contain duplicates).
  """
  generator = TemplateEntryGenerator()
  # Tell the generator about the mode so that it skips reading license files
  # when only their paths are needed.
  generator.SetGenerateLicensesFileListOnly(generate_licenses_file_list_only)
  # Start from Chromium's LICENSE file
  entries = [generator.MetadataToTemplateEntry({
      'Name': 'The Chromium Project',
      'URL': '',
      'License File': os.path.join(REPOSITORY_ROOT, 'LICENSE'),
  })]

  third_party_dirs = licenses.FindThirdPartyDirsWithFiles(REPOSITORY_ROOT)
  # We provide attribution for all third-party directories.
  # TODO(mnaganov): Limit this to only code used by the WebView binary.
  for directory in sorted(third_party_dirs):
    try:
      # NOTE(review): this call had further argument(s) after REPOSITORY_ROOT
      # that could not be recovered; check against licenses.ParseDir.
      metadata = licenses.ParseDir(directory, REPOSITORY_ROOT)
    except licenses.LicenseError:
      # Since this code is called during project files generation,
      # we don't want to break it. But we assume that release
      # WebView apks are built using checkouts that pass the 'scan'
      # check of this tool, thus they don't contain
      # projects with non-compatible licenses.
      continue
    license_file = metadata['License File']
    if license_file and license_file != licenses.NOT_SHIPPED:
      entries.append(generator.MetadataToTemplateEntry(metadata))

  if generate_licenses_file_list_only:
    return [entry['license_file'] for entry in entries]

  # NOTE(review): the Environment arguments could not be recovered. A loader
  # is required for get_template(); this assumes licenses_notice.tmpl lives
  # next to this script. Confirm, and restore any other options (e.g.
  # autoescaping) the template relies on.
  env = jinja2.Environment(
      loader=jinja2.FileSystemLoader(os.path.dirname(__file__)))
  template = env.get_template('licenses_notice.tmpl')
  return template.render({ 'entries': entries }).encode('utf8')
def main():
  """Command-line entry point.

  Dispatches on the first positional argument: 'scan', 'notice_deps',
  'gn_notice_deps', 'notice' or 'display_copyrights'.

  Returns:
    The scan's ScanResult for 'scan'; ScanResult.Ok for the other commands;
    ScanResult.Errors if no command or an unknown command was given.
  """
  class FormatterWithNewLines(optparse.IndentedHelpFormatter):
    """Help formatter that wraps every line of the description on its own,
    so that the line breaks in the description survive."""
    def format_description(self, description):
      paras = description.split('\n')
      formatted_paras = [textwrap.fill(para, self.width) for para in paras]
      return '\n'.join(formatted_paras) + '\n'

  parser = optparse.OptionParser(formatter=FormatterWithNewLines(),
                                 usage='%prog [options]')
  parser.add_option('--json', help='Path to JSON output file')
  # __doc__ is None when the module has no docstring; fall back to ''.
  parser.description = ((__doc__ or '') +
                        ' scan Check licenses.\n'
                        ' notice_deps Generate the list of dependencies for '
                        'Android NOTICE file.\n'
                        ' gn_notice_deps Generate the list of dependencies '
                        'for Android NOTICE file as a GN list.\n'
                        ' notice [file] Generate Android NOTICE file on '
                        'stdout or into |file|.\n'
                        ' display_copyrights Display autorship on the files'
                        ' using names provided via stdin.\n')
  (options, args) = parser.parse_args()
  if len(args) < 1:
    parser.print_help()
    return ScanResult.Errors

  if args[0] == 'scan':
    scan_result, problem_paths = _Scan()
    if scan_result == ScanResult.Ok:
      print('OK!')
    if options.json:
      with open(options.json, 'w') as f:
        json.dump(problem_paths, f)
    return scan_result
  elif args[0] == 'notice_deps':
    # 'set' is used to eliminate duplicate references to the same license file.
    # Sorting keeps the output stable from run to run.
    print(' '.join(
        sorted(set(GenerateNoticeFile(generate_licenses_file_list_only=True)))))
    return ScanResult.Ok
  elif args[0] == 'gn_notice_deps':
    # generate list for gn.
    # 'set' is used to eliminate duplicate references to the same license file.
    gn_file_list = ['"' + f + '"' for f in
        sorted(set(GenerateNoticeFile(generate_licenses_file_list_only=True)))]
    print('[%s] ' % ','.join(gn_file_list))
    return ScanResult.Ok
  elif args[0] == 'notice':
    notice_file_contents = GenerateNoticeFile()
    if len(args) == 1:
      print(notice_file_contents)
    else:
      with open(args[1], 'w') as output_file:
        output_file.write(notice_file_contents)
    return ScanResult.Ok
  elif args[0] == 'display_copyrights':
    # One file name per line on stdin.
    files = sys.stdin.read().splitlines()
    for f, c in \
        zip(files, copyright_scanner.FindCopyrights(InputApi(), '.', files)):
      # Same text as the Python 2 statement `print f, '\t', ...`, which
      # writes no separator after an item ending in a tab.
      print('%s \t%s' % (f, ' / '.join(sorted(c))))
    return ScanResult.Ok
  parser.print_help()
  return ScanResult.Errors
if __name__ == '__main__':
  # main() returns a ScanResult value; use it as the process exit status.
  sys.exit(main())