#!/usr/bin/env python
#
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""This script creates a "jumbo" file which merges all incoming files
for compiling.
"""
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import hashlib
import io
import os


def cut_ranges(boundaries):
  # Given an increasing sequence of boundary indices, generate a sequence of
  # non-overlapping ranges. The total range is inclusive of the first index
  # and exclusive of the last index from the given sequence.
  for start, stop in zip(boundaries, boundaries[1:]):
    yield range(start, stop)
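
# For example, cut_ranges([0, 3, 5]) generates two ranges covering the
# half-open intervals [0, 3) and [3, 5).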


def generate_chunk_stops(inputs, output_count, smart_merge=True):
  # Note: In the comments below, unique numeric labels are assigned to files.
  # Consider them as the sorted rank of the hash of each file path.

  # Simple jumbo chunking generates uniformly sized chunks with the ceiling of:
  #   (output_index + 1) * input_count / output_count
  input_count = len(inputs)
  stops = [((i + 1) * input_count + output_count - 1) // output_count
           for i in range(output_count)]
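  # For example, 12 inputs split into 4 outputs give stops of [3, 6, 9, 12]:
  # four uniform chunks of three files each.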

  # This is disruptive at times because file insertions and removals can
  # invalidate many chunks as all files are offset by one.
  # For example, say we have 12 files in 4 uniformly sized chunks:
  #   9, 4, 0; 7, 1, 11; 5, 10, 2; 6, 3, 8
  # If we delete the first file we get:
  #   4, 0, 7; 1, 11, 5; 10, 2, 6; 3, 8
  # All of the chunks have new sets of inputs.
  # With path-aware chunking, we start with the uniformly sized chunks:
  #   9, 4, 0; 7, 1, 11; 5, 10, 2; 6, 3, 8
  # First we find the smallest rank in each of the chunks. Their indices are
  # stored in the |centers| list and in this example the ranks would be:
  #   0, 1, 2, 3
  # Then we find the largest rank between the centers. Their indices are
  # stored in the |stops| list and in this example the ranks would be:
  #   7, 11, 6
  # These files mark the boundaries between chunks and these boundary files
  # are often maintained even as files are added or deleted.
  # In this example, 7, 11, and 6 each become the first file of a chunk:
  #   9, 4, 0; 7, 1; 11, 5, 10, 2; 6, 3, 8
  # If we delete the first file and repeat the process we get:
  #   4, 0; 7, 1; 11, 5, 10, 2; 6, 3, 8
  # Only the first chunk has a new set of inputs.
  if smart_merge:
    # Starting with the simple chunks, every file is assigned a rank.
    # This requires a hash function that is stable across runs.
    hasher = lambda n: hashlib.md5(inputs[n].encode()).hexdigest()
    # In each chunk there is a key file with the lowest rank; mark them.
    # Note that they will not easily change.
    centers = [min(indices, key=hasher) for indices in cut_ranges([0] + stops)]
    # Between each pair of key files there is a file with the highest rank.
    # Mark these to be used as border files. They also will not easily change.
    # Forget the initial chunks and create new chunks by splitting the list at
    # every border file.
    stops = [max(indices, key=hasher) for indices in cut_ranges(centers)]
    stops.append(input_count)

  return stops
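
# The returned |stops| act as half-open slice boundaries: output chunk i
# receives inputs[stops[i - 1]:stops[i]] (with an implicit leading 0), and
# the final stop is always len(inputs), so every input lands in exactly one
# chunk.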


def write_jumbo_files(inputs, outputs, written_input_set, written_output_set):
  chunk_stops = generate_chunk_stops(inputs, len(outputs))

  written_inputs = 0
  for output_index, output_file in enumerate(outputs):
    written_output_set.add(output_file)
    if os.path.isfile(output_file):
      with open(output_file, "r") as current:
        current_jumbo_file = current.read()
    else:
      current_jumbo_file = None

    out = io.StringIO()
    out.write("/* This is a Jumbo file. Don't edit. */\n\n")
    out.write("/* Generated with merge_for_jumbo.py. */\n\n")
    input_limit = chunk_stops[output_index]
    while written_inputs < input_limit:
      filename = inputs[written_inputs]
      written_inputs += 1
      out.write("#include \"%s\"\n" % filename)
      written_input_set.add(filename)
    new_jumbo_file = out.getvalue()
    out.close()

    # Only rewrite the file when its contents change, so that up-to-date
    # jumbo chunks keep their timestamps and are not needlessly rebuilt.
    if new_jumbo_file != current_jumbo_file:
      with open(output_file, "w") as out:
        out.write(new_jumbo_file)
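
# A generated jumbo file therefore looks like this (paths are illustrative):
#   /* This is a Jumbo file. Don't edit. */
#
#   /* Generated with merge_for_jumbo.py. */
#
#   #include "base/foo.cc"
#   #include "base/bar.cc"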


def main():
  parser = argparse.ArgumentParser()
  parser.add_argument("--outputs", nargs="+", required=True,
                      help="List of output files to split input into")
  parser.add_argument("--file-list", required=True)
  parser.add_argument("--verbose", action="store_true")
  args = parser.parse_args()

  # If written with gn |write_file| each file is on its own line.
  with open(args.file_list) as file_list_file:
    lines = [line.strip() for line in file_list_file if line.strip()]

  # If written with gn |response_file_contents| the files are space separated.
  all_inputs = []
  for line in lines:
    all_inputs.extend(line.split())
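  # Both layouts parse the same way: "a.cc\nb.cc\n" and "a.cc b.cc" each
  # produce ["a.cc", "b.cc"] (file names here are illustrative).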

  written_output_set = set()  # Just for double checking
  written_input_set = set()  # Just for double checking
  for language_ext in (".cc", ".c", ".mm"):
    if language_ext == ".cc":
      ext_pattern = (".cc", ".cpp")
    else:
      ext_pattern = tuple([language_ext])

    outputs = [x for x in args.outputs if x.endswith(ext_pattern)]
    inputs = [x for x in all_inputs if x.endswith(ext_pattern)]
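    # C++ sources are grouped so that both "foo.cc" and "bar.cpp" land in the
    # same jumbo pool, while C and Objective-C++ files are merged separately
    # (file names here are illustrative).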

    if not outputs:
      assert not inputs
      continue

    write_jumbo_files(inputs, outputs, written_input_set, written_output_set)

  assert set(args.outputs) == written_output_set, "Did not fill all outputs"
  assert set(all_inputs) == written_input_set, "Did not use all inputs"

  if args.verbose:
    print("Generated %s (%d files) based on %s" % (
        str(args.outputs), len(written_input_set), args.file_list))
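
# Example invocation (paths are illustrative; in practice the build system
# supplies the arguments):
#   merge_for_jumbo.py --file-list gen/sources.list \
#       --outputs gen/jumbo_1.cc gen/jumbo_2.cc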


if __name__ == "__main__":
  main()