base/i18n/build_utf8_validator_tables.cc - chromium/src - Git at Google

 // Copyright 2014 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 // Create a state machine for validating UTF-8. The algorithm in brief:
 // 1. Convert the complete unicode range of code points, except for the
 //    surrogate code points, to an ordered array of sequences of bytes in
 //    UTF-8.
 // 2. Convert individual bytes to ranges, starting from the right of each byte
 //    sequence. For each range, ensure the bytes on the left and the ranges
 //    on the right are the identical.
 // 3. Convert the resulting list of ranges into a state machine, collapsing
 //    identical states.
 // 4. Convert the state machine to an array of bytes.
 // 5. Output as a C++ file.
 //
 // To use:
 //  $ ninja -C out/Release build_utf8_validator_tables
 //  $ out/Release/build_utf8_validator_tables
 //                                   --output=base/i18n/utf8_validator_tables.cc
 //  $ git add base/i18n/utf8_validator_tables.cc
 //
 // Because the table is not expected to ever change, it is checked into the
 // repository rather than being regenerated at build time.
 //
 // This code uses type uint8_t throughout to represent bytes, to avoid
 // signed/unsigned char confusion.

 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 #include <algorithm>
 #include <map>
 #include <string>
 #include <vector>

 #include "base/command_line.h"
 #include "base/files/file_path.h"
 #include "base/files/file_util.h"
 #include "base/logging.h"
 #include "base/numerics/safe_conversions.h"
 #include "base/stl_util.h"
 #include "base/strings/stringprintf.h"
 #include "third_party/icu/source/common/unicode/utf8.h"

 namespace {

 const char kHelpText[] =
     "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n";

 const char kProlog[] =
     "// Copyright 2013 The Chromium Authors. All rights reserved.\n"
     "// Use of this source code is governed by a BSD-style license that can "
     "be\n"
     "// found in the LICENSE file.\n"
     "\n"
     "// This file is auto-generated by build_utf8_validator_tables.\n"
     "// DO NOT EDIT.\n"
     "\n"
     "#include \"base/i18n/utf8_validator_tables.h\"\n"
     "\n"
     "namespace base {\n"
     "namespace internal {\n"
     "\n"
     "const uint8_t kUtf8ValidatorTables[] = {\n";

 const char kEpilog[] =
     "};\n"
     "\n"
     "const size_t kUtf8ValidatorTablesSize = "
     "base::size(kUtf8ValidatorTables);\n"
     "\n"
     "}  // namespace internal\n"
     "}  // namespace base\n";

 // Ranges are inclusive at both ends--they represent [from, to]
 class Range {
  public:
   // Ranges always start with just one byte.
   explicit Range(uint8_t value) : from_(value), to_(value) {}

   // Range objects are copyable and assignable to be used in STL
   // containers. Since they only contain non-pointer POD types, the default copy
   // constructor, assignment operator and destructor will work.

   // Add a byte to the range. We intentionally only support adding a byte at the
   // end, since that is the only operation the code needs.
   void AddByte(uint8_t to) {
     CHECK(to == to_ + 1);
     to_ = to;
   }

   uint8_t from() const { return from_; }
   uint8_t to() const { return to_; }

   bool operator<(const Range& rhs) const {
     return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to()));
   }

   bool operator==(const Range& rhs) const {
     return from() == rhs.from() && to() == rhs.to();
   }

  private:
   uint8_t from_;
   uint8_t to_;
 };

 // A vector of Ranges is like a simple regular expression--it corresponds to
 // a set of strings of the same length that have bytes in each position in
 // the appropriate range.
 typedef std::vector<Range> StringSet;

 // A UTF-8 "character" is represented by a sequence of bytes.
 typedef std::vector<uint8_t> Character;

 // In the second stage of the algorithm, we want to convert a large list of
 // Characters into a small list of StringSets.
 struct Pair {
   Character character;
   StringSet set;
 };

 typedef std::vector<Pair> PairVector;

 // A class to print a table of numbers in the same style as clang-format.
 class TablePrinter {
  public:
   explicit TablePrinter(FILE* stream)
       : stream_(stream), values_on_this_line_(0), current_offset_(0) {}

   void PrintValue(uint8_t value) {
     if (values_on_this_line_ == 0) {
       fputs("   ", stream_);
     } else if (values_on_this_line_ == kMaxValuesPerLine) {
       fprintf(stream_, "  // 0x%02x\n   ", current_offset_);
       values_on_this_line_ = 0;
     }
     fprintf(stream_, " 0x%02x,", static_cast<int>(value));
     ++values_on_this_line_;
     ++current_offset_;
   }

   void NewLine() {
     while (values_on_this_line_ < kMaxValuesPerLine) {
       fputs("      ", stream_);
       ++values_on_this_line_;
     }
     fprintf(stream_, "  // 0x%02x\n", current_offset_);
     values_on_this_line_ = 0;
   }

  private:
   // stdio stream. Not owned.
   FILE* stream_;

   // Number of values so far printed on this line.
   int values_on_this_line_;

   // Total values printed so far.
   int current_offset_;

   static const int kMaxValuesPerLine = 8;

   DISALLOW_COPY_AND_ASSIGN(TablePrinter);
 };

 // Start by filling a PairVector with characters. The resulting vector goes from
 // "\x00" to "\xf4\x8f\xbf\xbf".
 PairVector InitializeCharacters() {
   PairVector vector;
   for (int i = 0; i <= 0x10FFFF; ++i) {
     if (i >= 0xD800 && i < 0xE000) {
       // Surrogate codepoints are not permitted. Non-character code points are
       // explicitly permitted.
       continue;
     }
     uint8_t bytes[4];
     unsigned int offset = 0;
     UBool is_error = false;
     U8_APPEND(bytes, offset, base::size(bytes), i, is_error);
     DCHECK(!is_error);
     DCHECK_GT(offset, 0u);
     DCHECK_LE(offset, base::size(bytes));
     Pair pair = {Character(bytes, bytes + offset), StringSet()};
     vector.push_back(pair);
   }
   return vector;
 }

 // Construct a new Pair from |character| and the concatenation of |new_range|
 // and |existing_set|, and append it to |pairs|.
 void ConstructPairAndAppend(const Character& character,
                             const Range& new_range,
                             const StringSet& existing_set,
                             PairVector* pairs) {
   Pair new_pair = {character, StringSet(1, new_range)};
   new_pair.set.insert(
       new_pair.set.end(), existing_set.begin(), existing_set.end());
   pairs->push_back(new_pair);
 }

 // Each pass over the PairVector strips one byte off the right-hand-side of the
 // characters and adds a range to the set on the right. For example, the first
 // pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0",
 // [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0",
 // [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0",
 // [\xa0-\xbf][\x80-\xbf]).
 void MoveRightMostCharToSet(PairVector* pairs) {
   PairVector new_pairs;
   PairVector::const_iterator it = pairs->begin();
   while (it != pairs->end() && it->character.empty()) {
     new_pairs.push_back(*it);
     ++it;
   }
   CHECK(it != pairs->end());
   Character unconverted_bytes(it->character.begin(), it->character.end() - 1);
   Range new_range(it->character.back());
   StringSet converted = it->set;
   ++it;
   while (it != pairs->end()) {
     const Pair& current_pair = *it++;
     if (current_pair.character.size() == unconverted_bytes.size() + 1 &&
         std::equal(unconverted_bytes.begin(),
                    unconverted_bytes.end(),
                    current_pair.character.begin()) &&
         converted == current_pair.set) {
       // The particular set of UTF-8 codepoints we are validating guarantees
       // that each byte range will be contiguous. This would not necessarily be
       // true for an arbitrary set of UTF-8 codepoints.
       DCHECK_EQ(new_range.to() + 1, current_pair.character.back());
       new_range.AddByte(current_pair.character.back());
       continue;
     }
     ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
     unconverted_bytes = Character(current_pair.character.begin(),
                                   current_pair.character.end() - 1);
     new_range = Range(current_pair.character.back());
     converted = current_pair.set;
   }
   ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
   new_pairs.swap(*pairs);
 }

 void MoveAllCharsToSets(PairVector* pairs) {
   // Since each pass of the function moves one character, and UTF-8 sequences
   // are at most 4 characters long, this simply runs the algorithm four times.
   for (int i = 0; i < 4; ++i) {
     MoveRightMostCharToSet(pairs);
   }
 #if DCHECK_IS_ON()
   for (PairVector::const_iterator it = pairs->begin(); it != pairs->end();
        ++it) {
     DCHECK(it->character.empty());
   }
 #endif
 }

 // Logs the generated string sets in regular-expression style, ie. [\x00-\x7f],
 // [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the
 // algorithm is working. Use the command-line option
 // --vmodule=build_utf8_validator_tables=1 to see this output.
 void LogStringSets(const PairVector& pairs) {
   for (const auto& pair_it : pairs) {
     std::string set_as_string;
     for (auto set_it = pair_it.set.begin(); set_it != pair_it.set.end();
          ++set_it) {
       set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]",
                                           static_cast<int>(set_it->from()),
                                           static_cast<int>(set_it->to()));
     }
     VLOG(1) << set_as_string;
   }
 }

 // A single state in the state machine is represented by a sorted vector of
 // start bytes and target states. All input bytes in the range between the start
 // byte and the next entry in the vector (or 0xFF) result in a transition to the
 // target state.
 struct StateRange {
   uint8_t from;
   uint8_t target_state;
 };

 typedef std::vector<StateRange> State;

 // Generates a state where all bytes go to state 1 (invalid). This is also used
 // as an initialiser for other states (since bytes from outside the desired
 // range are invalid).
 State GenerateInvalidState() {
   const StateRange range = {0, 1};
   return State(1, range);
 }

 // A map from a state (ie. a set of strings which will match from this state) to
 // a number (which is an index into the array of states).
 typedef std::map<StringSet, uint8_t> StateMap;

 // Create a new state corresponding to |set|, add it |states| and |state_map|
 // and return the index it was given in |states|.
 uint8_t MakeState(const StringSet& set,
                   std::vector<State>* states,
                   StateMap* state_map) {
   DCHECK(!set.empty());
   const Range& range = set.front();
   const StringSet rest(set.begin() + 1, set.end());
   const StateMap::const_iterator where = state_map->find(rest);
   const uint8_t target_state = where == state_map->end()
                                    ? MakeState(rest, states, state_map)
                                    : where->second;
   DCHECK_LT(0, range.from());
   DCHECK_LT(range.to(), 0xFF);
   const StateRange new_state_initializer[] = {
       {0, 1},
       {range.from(), target_state},
       {static_cast<uint8_t>(range.to() + 1), 1}};
   states->push_back(
       State(new_state_initializer,
             new_state_initializer + base::size(new_state_initializer)));
   const uint8_t new_state_number =
       base::checked_cast<uint8_t>(states->size() - 1);
   CHECK(state_map->insert(std::make_pair(set, new_state_number)).second);
   return new_state_number;
 }

 std::vector<State> GenerateStates(const PairVector& pairs) {
   // States 0 and 1 are the initial/valid state and invalid state, respectively.
   std::vector<State> states(2, GenerateInvalidState());
   StateMap state_map;
   state_map.insert(std::make_pair(StringSet(), 0));
   for (auto it = pairs.begin(); it != pairs.end(); ++it) {
     DCHECK(it->character.empty());
     DCHECK(!it->set.empty());
     const Range& range = it->set.front();
     const StringSet rest(it->set.begin() + 1, it->set.end());
     const StateMap::const_iterator where = state_map.find(rest);
     const uint8_t target_state = where == state_map.end()
                                      ? MakeState(rest, &states, &state_map)
                                      : where->second;
     if (states[0].back().from == range.from()) {
       DCHECK_EQ(1, states[0].back().target_state);
       states[0].back().target_state = target_state;
       DCHECK_LT(range.to(), 0xFF);
       const StateRange new_range = {static_cast<uint8_t>(range.to() + 1), 1};
       states[0].push_back(new_range);
     } else {
       DCHECK_LT(range.to(), 0xFF);
       const StateRange new_range_initializer[] = {
           {range.from(), target_state},
           {static_cast<uint8_t>(range.to() + 1), 1}};
       states[0].insert(
           states[0].end(), new_range_initializer,
           new_range_initializer + base::size(new_range_initializer));
     }
   }
   return states;
 }

 // Output the generated states as a C++ table. Two tricks are used to compact
 // the table: each state in the table starts with a shift value which indicates
 // how many bits we can discard from the right-hand-side of the byte before
 // doing the table lookup. Secondly, only the state-transitions for bytes
 // with the top-bit set are included in the table; bytes without the top-bit set
 // are just ASCII and are handled directly by the code.
 void PrintStates(const std::vector<State>& states, FILE* stream) {
   // First calculate the start-offset of each state. This allows the state
   // machine to jump directly to the correct offset, avoiding an extra
   // indirection. State 0 starts at offset 0.
   std::vector<uint8_t> state_offset(1, 0);
   std::vector<uint8_t> shifts;
   uint8_t pos = 0;

   for (const auto& state_it : states) {
     // We want to set |shift| to the (0-based) index of the least-significant
     // set bit in any of the ranges for this state, since this tells us how many
     // bits we can discard and still determine what range a byte lies in. Sadly
     // it appears that ffs() is not portable, so we do it clumsily.
     uint8_t shift = 7;
     for (auto range_it = state_it.begin(); range_it != state_it.end();
          ++range_it) {
       while (shift > 0 && range_it->from % (1 << shift) != 0) {
         --shift;
       }
     }
     shifts.push_back(shift);
     pos += 1 + (1 << (7 - shift));
     state_offset.push_back(pos);
   }

   DCHECK_EQ(129, state_offset[1]);

   fputs(kProlog, stream);
   TablePrinter table_printer(stream);

   for (uint8_t state_index = 0; state_index < states.size(); ++state_index) {
     const uint8_t shift = shifts[state_index];
     uint8_t next_range = 0;
     uint8_t target_state = 1;
     fprintf(stream,
             "    // State %d, offset 0x%02x\n",
             static_cast<int>(state_index),
             static_cast<int>(state_offset[state_index]));
     table_printer.PrintValue(shift);
     for (int i = 0; i < 0x100; i += (1 << shift)) {
       if (next_range < states[state_index].size() &&
           states[state_index][next_range].from == i) {
         target_state = states[state_index][next_range].target_state;
         ++next_range;
       }
       if (i >= 0x80) {
         table_printer.PrintValue(state_offset[target_state]);
       }
     }
     table_printer.NewLine();
   }

   fputs(kEpilog, stream);
 }

 }  // namespace

 int main(int argc, char* argv[]) {
   base::CommandLine::Init(argc, argv);
   logging::LoggingSettings settings;
   settings.logging_dest =
       logging::LOG_TO_SYSTEM_DEBUG_LOG | logging::LOG_TO_STDERR;
   logging::InitLogging(settings);
   if (base::CommandLine::ForCurrentProcess()->HasSwitch("help")) {
     fwrite(kHelpText, 1, base::size(kHelpText), stdout);
     exit(EXIT_SUCCESS);
   }
   base::FilePath filename =
       base::CommandLine::ForCurrentProcess()->GetSwitchValuePath("output");

   FILE* output = stdout;
   if (!filename.empty()) {
     output = base::OpenFile(filename, "wb");
     if (!output)
       PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe()
                   << "' for writing";
   }

   // Step 1: Enumerate the characters
   PairVector pairs = InitializeCharacters();
   // Step 2: Convert to sets.
   MoveAllCharsToSets(&pairs);
   if (VLOG_IS_ON(1)) {
     LogStringSets(pairs);
   }
   // Step 3: Generate states.
   std::vector<State> states = GenerateStates(pairs);
   // Step 4/5: Print output
   PrintStates(states, output);

   if (!filename.empty()) {
     if (!base::CloseFile(output))
       PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe()
                   << "'";
   }

   return EXIT_SUCCESS;
 }
	// Copyright 2014 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	// Create a state machine for validating UTF-8. The algorithm in brief:
	// 1. Convert the complete unicode range of code points, except for the
	// surrogate code points, to an ordered array of sequences of bytes in
	// UTF-8.
	// 2. Convert individual bytes to ranges, starting from the right of each byte
	// sequence. For each range, ensure the bytes on the left and the ranges
	// on the right are the identical.
	// 3. Convert the resulting list of ranges into a state machine, collapsing
	// identical states.
	// 4. Convert the state machine to an array of bytes.
	// 5. Output as a C++ file.
	//
	// To use:
	// $ ninja -C out/Release build_utf8_validator_tables
	// $ out/Release/build_utf8_validator_tables
	// --output=base/i18n/utf8_validator_tables.cc
	// $ git add base/i18n/utf8_validator_tables.cc
	//
	// Because the table is not expected to ever change, it is checked into the
	// repository rather than being regenerated at build time.
	//
	// This code uses type uint8_t throughout to represent bytes, to avoid
	// signed/unsigned char confusion.

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#include <algorithm>
	#include <map>
	#include <string>
	#include <vector>

	#include "base/command_line.h"
	#include "base/files/file_path.h"
	#include "base/files/file_util.h"
	#include "base/logging.h"
	#include "base/numerics/safe_conversions.h"
	#include "base/stl_util.h"
	#include "base/strings/stringprintf.h"
	#include "third_party/icu/source/common/unicode/utf8.h"

	namespace {

	const char kHelpText[] =
	"Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n";

	const char kProlog[] =
	"// Copyright 2013 The Chromium Authors. All rights reserved.\n"
	"// Use of this source code is governed by a BSD-style license that can "
	"be\n"
	"// found in the LICENSE file.\n"
	"\n"
	"// This file is auto-generated by build_utf8_validator_tables.\n"
	"// DO NOT EDIT.\n"
	"\n"
	"#include \"base/i18n/utf8_validator_tables.h\"\n"
	"\n"
	"namespace base {\n"
	"namespace internal {\n"
	"\n"
	"const uint8_t kUtf8ValidatorTables[] = {\n";

	const char kEpilog[] =
	"};\n"
	"\n"
	"const size_t kUtf8ValidatorTablesSize = "
	"base::size(kUtf8ValidatorTables);\n"
	"\n"
	"} // namespace internal\n"
	"} // namespace base\n";

	// Ranges are inclusive at both ends--they represent [from, to]
	class Range {
	public:
	// Ranges always start with just one byte.
	explicit Range(uint8_t value) : from_(value), to_(value) {}

	// Range objects are copyable and assignable to be used in STL
	// containers. Since they only contain non-pointer POD types, the default copy
	// constructor, assignment operator and destructor will work.

	// Add a byte to the range. We intentionally only support adding a byte at the
	// end, since that is the only operation the code needs.
	void AddByte(uint8_t to) {
	CHECK(to == to_ + 1);
	to_ = to;
	}

	uint8_t from() const { return from_; }
	uint8_t to() const { return to_; }

	bool operator<(const Range& rhs) const {
	return (from() < rhs.from() \|\| (from() == rhs.from() && to() < rhs.to()));
	}

	bool operator==(const Range& rhs) const {
	return from() == rhs.from() && to() == rhs.to();
	}

	private:
	uint8_t from_;
	uint8_t to_;
	};

	// A vector of Ranges is like a simple regular expression--it corresponds to
	// a set of strings of the same length that have bytes in each position in
	// the appropriate range.
	typedef std::vector<Range> StringSet;

	// A UTF-8 "character" is represented by a sequence of bytes.
	typedef std::vector<uint8_t> Character;

	// In the second stage of the algorithm, we want to convert a large list of
	// Characters into a small list of StringSets.
	struct Pair {
	Character character;
	StringSet set;
	};

	typedef std::vector<Pair> PairVector;

	// A class to print a table of numbers in the same style as clang-format.
	class TablePrinter {
	public:
	explicit TablePrinter(FILE* stream)
	: stream_(stream), values_on_this_line_(0), current_offset_(0) {}

	void PrintValue(uint8_t value) {
	if (values_on_this_line_ == 0) {
	fputs(" ", stream_);
	} else if (values_on_this_line_ == kMaxValuesPerLine) {
	fprintf(stream_, " // 0x%02x\n ", current_offset_);
	values_on_this_line_ = 0;
	}
	fprintf(stream_, " 0x%02x,", static_cast<int>(value));
	++values_on_this_line_;
	++current_offset_;
	}

	void NewLine() {
	while (values_on_this_line_ < kMaxValuesPerLine) {
	fputs(" ", stream_);
	++values_on_this_line_;
	}
	fprintf(stream_, " // 0x%02x\n", current_offset_);
	values_on_this_line_ = 0;
	}

	private:
	// stdio stream. Not owned.
	FILE* stream_;

	// Number of values so far printed on this line.
	int values_on_this_line_;

	// Total values printed so far.
	int current_offset_;

	static const int kMaxValuesPerLine = 8;

	DISALLOW_COPY_AND_ASSIGN(TablePrinter);
	};

	// Start by filling a PairVector with characters. The resulting vector goes from
	// "\x00" to "\xf4\x8f\xbf\xbf".
	PairVector InitializeCharacters() {
	PairVector vector;
	for (int i = 0; i <= 0x10FFFF; ++i) {
	if (i >= 0xD800 && i < 0xE000) {
	// Surrogate codepoints are not permitted. Non-character code points are
	// explicitly permitted.
	continue;
	}
	uint8_t bytes[4];
	unsigned int offset = 0;
	UBool is_error = false;
	U8_APPEND(bytes, offset, base::size(bytes), i, is_error);
	DCHECK(!is_error);
	DCHECK_GT(offset, 0u);
	DCHECK_LE(offset, base::size(bytes));
	Pair pair = {Character(bytes, bytes + offset), StringSet()};
	vector.push_back(pair);
	}
	return vector;
	}

	// Construct a new Pair from \|character\| and the concatenation of \|new_range\|
	// and \|existing_set\|, and append it to \|pairs\|.
	void ConstructPairAndAppend(const Character& character,
	const Range& new_range,
	const StringSet& existing_set,
	PairVector* pairs) {
	Pair new_pair = {character, StringSet(1, new_range)};
	new_pair.set.insert(
	new_pair.set.end(), existing_set.begin(), existing_set.end());
	pairs->push_back(new_pair);
	}

	// Each pass over the PairVector strips one byte off the right-hand-side of the
	// characters and adds a range to the set on the right. For example, the first
	// pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0",
	// [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0",
	// [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0",
	// [\xa0-\xbf][\x80-\xbf]).
	void MoveRightMostCharToSet(PairVector* pairs) {
	PairVector new_pairs;
	PairVector::const_iterator it = pairs->begin();
	while (it != pairs->end() && it->character.empty()) {
	new_pairs.push_back(*it);
	++it;
	}
	CHECK(it != pairs->end());
	Character unconverted_bytes(it->character.begin(), it->character.end() - 1);
	Range new_range(it->character.back());
	StringSet converted = it->set;
	++it;
	while (it != pairs->end()) {
	const Pair& current_pair = *it++;
	if (current_pair.character.size() == unconverted_bytes.size() + 1 &&
	std::equal(unconverted_bytes.begin(),
	unconverted_bytes.end(),
	current_pair.character.begin()) &&
	converted == current_pair.set) {
	// The particular set of UTF-8 codepoints we are validating guarantees
	// that each byte range will be contiguous. This would not necessarily be
	// true for an arbitrary set of UTF-8 codepoints.
	DCHECK_EQ(new_range.to() + 1, current_pair.character.back());
	new_range.AddByte(current_pair.character.back());
	continue;
	}
	ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
	unconverted_bytes = Character(current_pair.character.begin(),
	current_pair.character.end() - 1);
	new_range = Range(current_pair.character.back());
	converted = current_pair.set;
	}
	ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
	new_pairs.swap(*pairs);
	}

	void MoveAllCharsToSets(PairVector* pairs) {
	// Since each pass of the function moves one character, and UTF-8 sequences
	// are at most 4 characters long, this simply runs the algorithm four times.
	for (int i = 0; i < 4; ++i) {
	MoveRightMostCharToSet(pairs);
	}
	#if DCHECK_IS_ON()
	for (PairVector::const_iterator it = pairs->begin(); it != pairs->end();
	++it) {
	DCHECK(it->character.empty());
	}
	#endif
	}

	// Logs the generated string sets in regular-expression style, ie. [\x00-\x7f],
	// [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the
	// algorithm is working. Use the command-line option
	// --vmodule=build_utf8_validator_tables=1 to see this output.
	void LogStringSets(const PairVector& pairs) {
	for (const auto& pair_it : pairs) {
	std::string set_as_string;
	for (auto set_it = pair_it.set.begin(); set_it != pair_it.set.end();
	++set_it) {
	set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]",
	static_cast<int>(set_it->from()),
	static_cast<int>(set_it->to()));
	}
	VLOG(1) << set_as_string;
	}
	}

	// A single state in the state machine is represented by a sorted vector of
	// start bytes and target states. All input bytes in the range between the start
	// byte and the next entry in the vector (or 0xFF) result in a transition to the
	// target state.
	struct StateRange {
	uint8_t from;
	uint8_t target_state;
	};

	typedef std::vector<StateRange> State;

	// Generates a state where all bytes go to state 1 (invalid). This is also used
	// as an initialiser for other states (since bytes from outside the desired
	// range are invalid).
	State GenerateInvalidState() {
	const StateRange range = {0, 1};
	return State(1, range);
	}

	// A map from a state (ie. a set of strings which will match from this state) to
	// a number (which is an index into the array of states).
	typedef std::map<StringSet, uint8_t> StateMap;

	// Create a new state corresponding to \|set\|, add it \|states\| and \|state_map\|
	// and return the index it was given in \|states\|.
	uint8_t MakeState(const StringSet& set,
	std::vector<State>* states,
	StateMap* state_map) {
	DCHECK(!set.empty());
	const Range& range = set.front();
	const StringSet rest(set.begin() + 1, set.end());
	const StateMap::const_iterator where = state_map->find(rest);
	const uint8_t target_state = where == state_map->end()
	? MakeState(rest, states, state_map)
	: where->second;
	DCHECK_LT(0, range.from());
	DCHECK_LT(range.to(), 0xFF);
	const StateRange new_state_initializer[] = {
	{0, 1},
	{range.from(), target_state},
	{static_cast<uint8_t>(range.to() + 1), 1}};
	states->push_back(
	State(new_state_initializer,
	new_state_initializer + base::size(new_state_initializer)));
	const uint8_t new_state_number =
	base::checked_cast<uint8_t>(states->size() - 1);
	CHECK(state_map->insert(std::make_pair(set, new_state_number)).second);
	return new_state_number;
	}

	std::vector<State> GenerateStates(const PairVector& pairs) {
	// States 0 and 1 are the initial/valid state and invalid state, respectively.
	std::vector<State> states(2, GenerateInvalidState());
	StateMap state_map;
	state_map.insert(std::make_pair(StringSet(), 0));
	for (auto it = pairs.begin(); it != pairs.end(); ++it) {
	DCHECK(it->character.empty());
	DCHECK(!it->set.empty());
	const Range& range = it->set.front();
	const StringSet rest(it->set.begin() + 1, it->set.end());
	const StateMap::const_iterator where = state_map.find(rest);
	const uint8_t target_state = where == state_map.end()
	? MakeState(rest, &states, &state_map)
	: where->second;
	if (states[0].back().from == range.from()) {
	DCHECK_EQ(1, states[0].back().target_state);
	states[0].back().target_state = target_state;
	DCHECK_LT(range.to(), 0xFF);
	const StateRange new_range = {static_cast<uint8_t>(range.to() + 1), 1};
	states[0].push_back(new_range);
	} else {
	DCHECK_LT(range.to(), 0xFF);
	const StateRange new_range_initializer[] = {
	{range.from(), target_state},
	{static_cast<uint8_t>(range.to() + 1), 1}};
	states[0].insert(
	states[0].end(), new_range_initializer,
	new_range_initializer + base::size(new_range_initializer));
	}
	}
	return states;
	}

	// Output the generated states as a C++ table. Two tricks are used to compact
	// the table: each state in the table starts with a shift value which indicates
	// how many bits we can discard from the right-hand-side of the byte before
	// doing the table lookup. Secondly, only the state-transitions for bytes
	// with the top-bit set are included in the table; bytes without the top-bit set
	// are just ASCII and are handled directly by the code.
	void PrintStates(const std::vector<State>& states, FILE* stream) {
	// First calculate the start-offset of each state. This allows the state
	// machine to jump directly to the correct offset, avoiding an extra
	// indirection. State 0 starts at offset 0.
	std::vector<uint8_t> state_offset(1, 0);
	std::vector<uint8_t> shifts;
	uint8_t pos = 0;

	for (const auto& state_it : states) {
	// We want to set \|shift\| to the (0-based) index of the least-significant
	// set bit in any of the ranges for this state, since this tells us how many
	// bits we can discard and still determine what range a byte lies in. Sadly
	// it appears that ffs() is not portable, so we do it clumsily.
	uint8_t shift = 7;
	for (auto range_it = state_it.begin(); range_it != state_it.end();
	++range_it) {
	while (shift > 0 && range_it->from % (1 << shift) != 0) {
	--shift;
	}
	}
	shifts.push_back(shift);
	pos += 1 + (1 << (7 - shift));
	state_offset.push_back(pos);
	}

	DCHECK_EQ(129, state_offset[1]);

	fputs(kProlog, stream);
	TablePrinter table_printer(stream);

	for (uint8_t state_index = 0; state_index < states.size(); ++state_index) {
	const uint8_t shift = shifts[state_index];
	uint8_t next_range = 0;
	uint8_t target_state = 1;
	fprintf(stream,
	" // State %d, offset 0x%02x\n",
	static_cast<int>(state_index),
	static_cast<int>(state_offset[state_index]));
	table_printer.PrintValue(shift);
	for (int i = 0; i < 0x100; i += (1 << shift)) {
	if (next_range < states[state_index].size() &&
	states[state_index][next_range].from == i) {
	target_state = states[state_index][next_range].target_state;
	++next_range;
	}
	if (i >= 0x80) {
	table_printer.PrintValue(state_offset[target_state]);
	}
	}
	table_printer.NewLine();
	}

	fputs(kEpilog, stream);
	}

	} // namespace

	int main(int argc, char* argv[]) {
	base::CommandLine::Init(argc, argv);
	logging::LoggingSettings settings;
	settings.logging_dest =
	logging::LOG_TO_SYSTEM_DEBUG_LOG \| logging::LOG_TO_STDERR;
	logging::InitLogging(settings);
	if (base::CommandLine::ForCurrentProcess()->HasSwitch("help")) {
	fwrite(kHelpText, 1, base::size(kHelpText), stdout);
	exit(EXIT_SUCCESS);
	}
	base::FilePath filename =
	base::CommandLine::ForCurrentProcess()->GetSwitchValuePath("output");

	FILE* output = stdout;
	if (!filename.empty()) {
	output = base::OpenFile(filename, "wb");
	if (!output)
	PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe()
	<< "' for writing";
	}

	// Step 1: Enumerate the characters
	PairVector pairs = InitializeCharacters();
	// Step 2: Convert to sets.
	MoveAllCharsToSets(&pairs);
	if (VLOG_IS_ON(1)) {
	LogStringSets(pairs);
	}
	// Step 3: Generate states.
	std::vector<State> states = GenerateStates(pairs);
	// Step 4/5: Print output
	PrintStates(states, output);

	if (!filename.empty()) {
	if (!base::CloseFile(output))
	PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe()
	<< "'";
	}

	return EXIT_SUCCESS;
	}