#!/usr/bin/env python
#
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""This script creates a "jumbo" file which merges all incoming files
for compiling.
"""
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import hashlib
import io
import os


def cut_ranges(boundaries):
  # Given an increasing sequence of boundary indices, generate a sequence of
  # non-overlapping ranges. The total range is inclusive of the first index
  # and exclusive of the last index from the given sequence.
  for start, stop in zip(boundaries, boundaries[1:]):
    yield range(start, stop)
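
# For example, cut_ranges([0, 3, 5]) generates two ranges covering the
# half-open intervals [0, 3) and [3, 5).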


def generate_chunk_stops(inputs, output_count, smart_merge=True):
  # Note: In the comments below, unique numeric labels are assigned to files.
  # Consider them as the sorted rank of the hash of each file path.

  # Simple jumbo chunking generates uniformly sized chunks with the ceiling of:
  #   (output_index + 1) * input_count / output_count
  input_count = len(inputs)
  stops = [((i + 1) * input_count + output_count - 1) // output_count
           for i in range(output_count)]
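  # For example, 12 inputs split into 4 outputs give stops of [3, 6, 9, 12]:
  # four uniform chunks of three files each.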

  # This is disruptive at times because file insertions and removals can
  # invalidate many chunks as all files are offset by one.
  # For example, say we have 12 files in 4 uniformly sized chunks:
  #   9, 4, 0; 7, 1, 11; 5, 10, 2; 6, 3, 8
  # If we delete the first file we get:
  #   4, 0, 7; 1, 11, 5; 10, 2, 6; 3, 8
  # All of the chunks have new sets of inputs.
  # With path-aware chunking, we start with the uniformly sized chunks:
  #   9, 4, 0; 7, 1, 11; 5, 10, 2; 6, 3, 8
  # First we find the smallest rank in each of the chunks. Their indices are
  # stored in the |centers| list and in this example the ranks would be:
  #   0, 1, 2, 3
  # Then we find the largest rank between the centers. Their indices are
  # stored in the |stops| list and in this example the ranks would be:
  #   7, 11, 6
  # These files mark the boundaries between chunks and these boundary files
  # are often maintained even as files are added or deleted.
  # In this example, 7, 11, and 6 each become the first file of a chunk:
  #   9, 4, 0; 7, 1; 11, 5, 10, 2; 6, 3, 8
  # If we delete the first file and repeat the process we get:
  #   4, 0; 7, 1; 11, 5, 10, 2; 6, 3, 8
  # Only the first chunk has a new set of inputs.
  if smart_merge:
    # Starting with the simple chunks, every file is assigned a rank.
    # This requires a hash function that is stable across runs.
    hasher = lambda n: hashlib.md5(inputs[n].encode()).hexdigest()
    # In each chunk there is a key file with the lowest rank; mark them.
    # Note that they will not easily change.
    centers = [min(indices, key=hasher) for indices in cut_ranges([0] + stops)]
    # Between each pair of key files there is a file with the highest rank.
    # Mark these to be used as border files. They also will not easily change.
    # Forget the initial chunks and create new chunks by splitting the list at
    # every border file.
    stops = [max(indices, key=hasher) for indices in cut_ranges(centers)]
    stops.append(input_count)

  return stops
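
# The returned |stops| act as half-open slice boundaries: output chunk i
# receives inputs[stops[i - 1]:stops[i]] (with an implicit leading 0), and
# the final stop is always len(inputs), so every input lands in exactly one
# chunk.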


def write_jumbo_files(inputs, outputs, written_input_set, written_output_set):
  chunk_stops = generate_chunk_stops(inputs, len(outputs))

  written_inputs = 0
  for output_index, output_file in enumerate(outputs):
    written_output_set.add(output_file)
    if os.path.isfile(output_file):
      with open(output_file, "r") as current:
        current_jumbo_file = current.read()
    else:
      current_jumbo_file = None

    out = io.StringIO()
    out.write("/* This is a Jumbo file. Don't edit. */\n\n")
    out.write("/* Generated with merge_for_jumbo.py. */\n\n")
    input_limit = chunk_stops[output_index]
    while written_inputs < input_limit:
      filename = inputs[written_inputs]
      written_inputs += 1
      out.write("#include \"%s\"\n" % filename)
      written_input_set.add(filename)
    new_jumbo_file = out.getvalue()
    out.close()

    # Only rewrite the file when its contents change, so that up-to-date
    # jumbo chunks keep their timestamps and are not needlessly rebuilt.
    if new_jumbo_file != current_jumbo_file:
      with open(output_file, "w") as out:
        out.write(new_jumbo_file)
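
# A generated jumbo file therefore looks like this (paths are illustrative):
#   /* This is a Jumbo file. Don't edit. */
#
#   /* Generated with merge_for_jumbo.py. */
#
#   #include "base/foo.cc"
#   #include "base/bar.cc"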


def main():
  parser = argparse.ArgumentParser()
  parser.add_argument("--outputs", nargs="+", required=True,
                      help="List of output files to split input into")
  parser.add_argument("--file-list", required=True)
  parser.add_argument("--verbose", action="store_true")
  args = parser.parse_args()

  # If written with gn |write_file| each file is on its own line.
  with open(args.file_list) as file_list_file:
    lines = [line.strip() for line in file_list_file if line.strip()]

  # If written with gn |response_file_contents| the files are space separated.
  all_inputs = []
  for line in lines:
    all_inputs.extend(line.split())
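  # Both layouts parse the same way: "a.cc\nb.cc\n" and "a.cc b.cc" each
  # produce ["a.cc", "b.cc"] (file names here are illustrative).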

  written_output_set = set()  # Just for double checking
  written_input_set = set()  # Just for double checking
  for language_ext in (".cc", ".c", ".mm"):
    if language_ext == ".cc":
      ext_pattern = (".cc", ".cpp")
    else:
      ext_pattern = tuple([language_ext])

    outputs = [x for x in args.outputs if x.endswith(ext_pattern)]
    inputs = [x for x in all_inputs if x.endswith(ext_pattern)]
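    # C++ sources are grouped so that both "foo.cc" and "bar.cpp" land in the
    # same jumbo pool, while C and Objective-C++ files are merged separately
    # (file names here are illustrative).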

    if not outputs:
      assert not inputs
      continue

    write_jumbo_files(inputs, outputs, written_input_set, written_output_set)

  assert set(args.outputs) == written_output_set, "Did not fill all outputs"
  assert set(all_inputs) == written_input_set, "Did not use all inputs"

  if args.verbose:
    print("Generated %s (%d files) based on %s" % (
        str(args.outputs), len(written_input_set), args.file_list))
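
# Example invocation (paths are illustrative; in practice the build system
# supplies the arguments):
#   merge_for_jumbo.py --file-list gen/sources.list \
#       --outputs gen/jumbo_1.cc gen/jumbo_2.cc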


if __name__ == "__main__":
  main()