// third_party/boost/include/boost/spirit/home/support/char_encoding/unicode/create_tables.cpp - webm/webmlive - Git at Google

 /*=============================================================================
     Copyright (c) 2001-2011 Joel de Guzman

     Distributed under the Boost Software License, Version 1.0. (See accompanying
     file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 =============================================================================*/
 #include <boost/config/warning_disable.hpp>
 #include <boost/spirit/include/qi.hpp>
 #include <boost/spirit/include/phoenix.hpp>
 #include <boost/unordered_map.hpp>
 #include <boost/algorithm/string/trim.hpp>
 #include <boost/cstdint.hpp>
 #include <boost/foreach.hpp>
 #include <boost/array.hpp>
 #include <boost/scoped_array.hpp>
 #include <boost/range/iterator_range.hpp>

 #include <algorithm>
 #include <cstddef>
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <iterator>
 #include <map>
 #include <string>
 #include <utility>
 #include <vector>

 // We place the data here. Each line comprises various fields
 typedef std::vector<std::string> ucd_line;
 typedef std::vector<ucd_line> ucd_vector;
 typedef std::vector<ucd_line>::iterator ucd_iterator;

 // spirit and phoenix using declarations
 using boost::spirit::qi::parse;
 using boost::spirit::qi::hex;
 using boost::spirit::qi::char_;
 using boost::spirit::qi::eol;
 using boost::spirit::qi::rule;
 using boost::spirit::qi::omit;
 using boost::spirit::qi::_1;
 using boost::spirit::qi::_val;
 using boost::phoenix::push_back;
 using boost::phoenix::ref;

 // basic unsigned types
 using boost::uint8_t;
 using boost::uint16_t;
 using boost::uint32_t;

 // a char range
 // A character range described by its first and last code point.
 struct ucd_range
 {
     ucd_range(uint32_t start, uint32_t finish)
         : start(start)
         , finish(finish)
     {
     }

     // Ordering used when ucd_range serves as a multimap key: only the first
     // code point takes part in the comparison, `finish` is ignored.
     friend bool operator<(ucd_range const& a, ucd_range const& b)
     {
         return b.start > a.start;
     }

     uint32_t start;
     uint32_t finish;
 };

 // Holds the contents of one Unicode Character Database (UCD) text file as a
 // list of lines, each split into its ';'-separated fields, and copies a
 // chosen field into a table indexed by code point.
 class ucd_info
 {
 public:

     // Reads and tokenizes `filename`.
     //
     // '#' comments (up to the end of the line) and empty lines are consumed
     // by the grammar and never stored. If the file cannot be opened an error
     // is written to std::cerr and the object is left without any lines; if
     // the grammar stops before the end of the file a warning is written and
     // the lines parsed so far are kept.
     ucd_info(char const* filename)
     {
         std::ifstream in(filename, std::ios_base::in);
         if (!in)
         {
             std::cerr << "Error: Could not open input file: "
                 << filename << std::endl;
             return;
         }

         std::string data;               // the complete file contents
         in.unsetf(std::ios::skipws);    // white space is significant here
         std::copy(
             std::istream_iterator<char>(in),
             std::istream_iterator<char>(),
             std::back_inserter(data));

         typedef std::string::const_iterator iterator_type;
         iterator_type f = data.begin();
         iterator_type l = data.end();

         // end of line, optionally preceded by a '#' comment
         rule<iterator_type> endl = -('#' >> *(char_-eol)) >> eol;
         // one field: everything up to the next ';' or the end of the line
         rule<iterator_type, std::string()> field = *(char_-(';'|endl)) >> (';'|&endl);
         rule<iterator_type, ucd_line()> line = +(field-endl) >> endl;
         rule<iterator_type, std::vector<ucd_line>()> file = +(endl | line[push_back(_val, _1)]);

         // parse() advances f to the position where matching stopped, so
         // anything left over means part of the file was not understood.
         parse(f, l, file, info);
         if (f != l)
         {
             std::cerr << "Warning: Could not parse all of input file: "
                 << filename << std::endl;
         }
     }

     // Copies column `field` of every stored line into `data`, which is
     // indexed by code point.
     //
     // Column 0 of a line is read as a hexadecimal code point or as a
     // "XXXX..YYYY" range; a UnicodeData.txt style pair of lines whose second
     // column contains "First>" / "Last>" is treated as one range. Lines whose
     // first column cannot be parsed are skipped.
     //
     //  collect_properties == true:  the column is looked up in the property
     //      map and, when known, its value is OR-ed into data[i].
     //  collect_properties == false: the column is parsed as a hexadecimal
     //      number and stored in data[i]; an empty column stores 0.
     //
     // NOTE(review): `data` is assumed to be large enough for every code point
     // named in the file; its size cannot be checked from here.
     template <typename Array>
     void collect(Array& data, int field, bool collect_properties = true) const
     {
         BOOST_ASSERT(!info.empty());
         BOOST_ASSERT(field >= 0);
         ucd_vector::const_iterator current = info.begin();
         ucd_vector::const_iterator const end = info.end();

         for (; current != end; ++current)
         {
             if (current->empty())
                 continue;

             std::string range = (*current)[0];
             boost::trim(range);

             std::string::const_iterator f = range.begin();
             std::string::const_iterator l = range.end();

             // get the code-point range; without a leading hex number there
             // is nothing meaningful to store for this line
             uint32_t start = 0;
             uint32_t finish = 0;
             if (!parse(f, l, hex[ref(start) = ref(finish) = _1] >> -(".." >> hex[ref(finish) = _1])))
                 continue;

             // special case for UnicodeData.txt ranges:
             if (current->size() > 1
                 && (*current)[1].find("First>") != std::string::npos)
             {
                 ++current;
                 BOOST_ASSERT(current != end);
                 if (current == end)
                     break;  // "First>" without a following line
                 BOOST_ASSERT(current->size() > 1
                     && (*current)[1].find("Last>") != std::string::npos);

                 std::string last = current->empty() ? std::string() : (*current)[0];
                 boost::trim(last);
                 std::string::const_iterator lf = last.begin();
                 std::string::const_iterator ll = last.end();

                 if (!parse(lf, ll, hex[ref(finish) = _1]))
                     continue;
             }

             std::string code;
             if (field >= 0 && field < int(current->size()))
                 code = (*current)[field];
             boost::trim(code);

             if (collect_properties) // code for properties
             {
                 // Only collect properties we are interested in. The lookup
                 // does not depend on the code point, so do it once per line.
                 if (!ignore_property(code))
                 {
                     int const value = map_property(code);
                     for (uint32_t i = start; i <= finish; ++i)
                         data[i] |= value;
                 }
             }
             else // code for actual numeric values
             {
                 for (uint32_t i = start; i <= finish; ++i)
                 {
                     if (code.empty())
                     {
                         data[i] = 0; // signal that this code maps to itself
                     }
                     else
                     {
                         std::string::const_iterator cf = code.begin();
                         std::string::const_iterator cl = code.end();
                         parse(cf, cl, hex, data[i]);
                     }
                 }
             }
         }
     }

 private:

     // True when `p` is not one of the property names in the property map.
     static bool ignore_property(std::string const& p)
     {
         // We don't handle all properties
         std::map<std::string, int> const& pm = get_property_map();
         return pm.find(p) == pm.end();
     }

     // Value registered for property name `p`; `p` must be a known property.
     static int
     map_property(std::string const& p)
     {
         std::map<std::string, int>& pm = get_property_map();
         std::map<std::string, int>::const_iterator i = pm.find(p);
         BOOST_ASSERT(i != pm.end());
         return i->second;
     }

     // Lazily filled map from property name to the value stored in the
     // tables. General_Category and Script values overlap numerically.
     static std::map<std::string, int>&
     get_property_map()
     {
         // The properties we are interested in:
         static std::map<std::string, int> map;
         if (map.empty())
         {
             struct named_value
             {
                 char const* name;
                 int value;
             };

             static named_value const fixed[] =
             {
                 // General_Category
                 {"Lu", 0}, {"Ll", 1}, {"Lt", 2}, {"Lm", 3}, {"Lo", 4},
                 {"Mn", 8}, {"Me", 9}, {"Mc", 10},
                 {"Nd", 16}, {"Nl", 17}, {"No", 18},
                 {"Zs", 24}, {"Zl", 25}, {"Zp", 26},
                 {"Cc", 32}, {"Cf", 33}, {"Co", 34}, {"Cs", 35}, {"Cn", 36},
                 {"Pd", 40}, {"Ps", 41}, {"Pe", 42}, {"Pc", 43},
                 {"Po", 44}, {"Pi", 45}, {"Pf", 46},
                 {"Sm", 48}, {"Sc", 49}, {"Sk", 50}, {"So", 51},

                 // Derived Properties.
                 {"Alphabetic", 64},
                 {"Uppercase", 128},
                 {"Lowercase", 256},
                 {"White_Space", 512},
                 {"Hex_Digit", 1024},
                 {"Noncharacter_Code_Point", 2048},
                 {"Default_Ignorable_Code_Point", 4096}
             };

             // Script: the value of each name is its position in this list.
             static char const* const scripts[] =
             {
                 /*  0 */ "Arabic", "Imperial_Aramaic", "Armenian", "Avestan",
                 /*  4 */ "Balinese", "Bamum", "Bengali", "Bopomofo",
                 /*  8 */ "Braille", "Buginese", "Buhid", "Canadian_Aboriginal",
                 /* 12 */ "Carian", "Cham", "Cherokee", "Coptic",
                 /* 16 */ "Cypriot", "Cyrillic", "Devanagari", "Deseret",
                 /* 20 */ "Egyptian_Hieroglyphs", "Ethiopic", "Georgian", "Glagolitic",
                 /* 24 */ "Gothic", "Greek", "Gujarati", "Gurmukhi",
                 /* 28 */ "Hangul", "Han", "Hanunoo", "Hebrew",
                 /* 32 */ "Hiragana", "Katakana_Or_Hiragana", "Old_Italic", "Javanese",
                 /* 36 */ "Kayah_Li", "Katakana", "Kharoshthi", "Khmer",
                 /* 40 */ "Kannada", "Kaithi", "Tai_Tham", "Lao",
                 /* 44 */ "Latin", "Lepcha", "Limbu", "Linear_B",
                 /* 48 */ "Lisu", "Lycian", "Lydian", "Malayalam",
                 /* 52 */ "Mongolian", "Meetei_Mayek", "Myanmar", "Nko",
                 /* 56 */ "Ogham", "Ol_Chiki", "Old_Turkic", "Oriya",
                 /* 60 */ "Osmanya", "Phags_Pa", "Inscriptional_Pahlavi", "Phoenician",
                 /* 64 */ "Inscriptional_Parthian", "Rejang", "Runic", "Samaritan",
                 /* 68 */ "Old_South_Arabian", "Saurashtra", "Shavian", "Sinhala",
                 /* 72 */ "Sundanese", "Syloti_Nagri", "Syriac", "Tagbanwa",
                 /* 76 */ "Tai_Le", "New_Tai_Lue", "Tamil", "Tai_Viet",
                 /* 80 */ "Telugu", "Tifinagh", "Tagalog", "Thaana",
                 /* 84 */ "Thai", "Tibetan", "Ugaritic", "Vai",
                 /* 88 */ "Old_Persian", "Cuneiform", "Yi", "Inherited",
                 /* 92 */ "Common", "Unknown"
             };

             for (unsigned n = 0; n < sizeof(fixed) / sizeof(fixed[0]); ++n)
                 map[fixed[n].name] = fixed[n].value;
             for (unsigned n = 0; n < sizeof(scripts) / sizeof(scripts[0]); ++n)
                 map[scripts[n]] = int(n);
         }
         return map;
     }

     ucd_vector info;
 };

 template <typename T, uint32_t block_size_ = 256>
 class ucd_table_builder
 {
 public:

     static uint32_t const block_size = block_size_;
     static uint32_t const full_span = 0x110000;
     typedef T value_type;

     ucd_table_builder() : p(new T[full_span])
     {
         for (uint32_t i = 0; i < full_span; ++i)
             p[i] = 0;
     }

     void collect(char const* filename, int field, bool collect_properties = true)
     {
         std::cout << "collecting " << filename << std::endl;
         ucd_info info(filename);
         info.collect(p, field, collect_properties);
     }

     void build(std::vector<uint8_t>& stage1, std::vector<T const*>& stage2)
     {
         std::cout << "building tables" << std::endl;
         std::map<block_ptr, std::vector<T const*> > blocks;
         for (T const* i = p.get(); i < (p.get() + full_span); i += block_size)
             blocks[block_ptr(i)].push_back(i);

         // Not enough bits to store the block indices.
         BOOST_ASSERT(blocks.size() < (1 << (sizeof(uint8_t) * 8)));

         typedef std::pair<block_ptr, std::vector<T const*> > blocks_value_type;
         std::map<T const*, std::vector<T const*> > sorted_blocks;
         BOOST_FOREACH(blocks_value_type const& val, blocks)
         {
             sorted_blocks[val.first.p] = val.second;
         }

         stage1.clear();
         stage1.reserve(full_span / block_size);
         stage1.resize(full_span / block_size);
         stage2.clear();
         stage2.reserve(blocks.size());

         typedef std::pair<T const*, std::vector<T const*> > sorted_blocks_value_type;
         BOOST_FOREACH(sorted_blocks_value_type const& val, sorted_blocks)
         {
             stage2.push_back(val.first);
             BOOST_FOREACH(T const* val2, val.second)
             {
                 stage1[(val2 - p.get()) / block_size] = stage2.size() - 1;
             }
         }
     }

 private:

     struct block_ptr
     {
         block_ptr(T const* p) : p(p) {}

         friend bool operator<(block_ptr a, block_ptr b)
         {
             return std::lexicographical_compare(
                 a.p, a.p + block_size, b.p, b.p + block_size);
         }

         T const* p;
     };

     boost::scoped_array<T> p;
 };

 // Writes `tab` space characters to `out`, one at a time; writes nothing
 // when `tab` is zero or negative.
 template <typename Out>
 void print_tab(Out& out, int tab)
 {
     int remaining = tab;
     while (remaining > 0)
     {
         out << ' ';
         --remaining;
     }
 }

 // Writes the elements of `c` to `out` as a comma-separated list of integers.
 //
 //  trailing_comma: when true, ", " and a line break follow the last element
 //  width:          field width applied to every element
 //  group:          number of elements per output line
 //
 // Every line starts with four spaces. Elements are converted to int before
 // being written. `c` is expected to hold more than one element.
 template <typename Out, typename C>
 void print_table(Out& out, C const& c, bool trailing_comma, int width = 4, int group = 16)
 {
     // C is a template parameter, so its nested type needs `typename`.
     typedef typename C::size_type size_type;

     int const tab = 4;
     size_type const size = c.size();
     BOOST_ASSERT(size > 1);
     if (size == 0)
         return;     // keeps c[0] below in bounds when assertions are off

     print_tab(out, tab);
     out << std::setw(width) << int(c[0]);
     for (size_type i = 1; i < size; ++i)
     {
         out << ", ";
         // group <= 0 never breaks the line (and never divides by zero)
         if (group > 0 && (i % group) == 0)
         {
             out << std::endl;
             print_tab(out, tab);
         }
         out << std::setw(width) << int(c[i]);
     }

     if (trailing_comma)
         out << ", " << std::endl;
 }

 // Writes the fixed preamble of a generated header to `out`: the license
 // banner, the <boost/cstdint.hpp> include and the opening of the
 // boost::spirit::ucd::detail namespace. No line break follows the final '{'.
 template <typename Out>
 void print_head(Out& out)
 {
     static char const* const pieces[] =
     {
         "/*=============================================================================\n",
         "    Copyright (c) 2001-2011 Joel de Guzman\n",
         "\n",
         "    Distributed under the Boost Software License, Version 1.0. (See accompanying\n",
         "    file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)\n",
         "\n",
         "    AUTOGENERATED. DO NOT EDIT!!!\n",
         "==============================================================================*/\n",
         "#include <boost/cstdint.hpp>\n",
         "\n",
         "namespace boost { namespace spirit { namespace ucd { namespace detail\n",
         "{"
     };

     for (unsigned n = 0; n < sizeof(pieces) / sizeof(pieces[0]); ++n)
         out << pieces[n];
 }

 // Writes the closing of a generated header to `out`: a line break followed
 // by the four braces that close the namespaces opened by print_head. The
 // trailing comment names the namespace print_head actually opens
 // (boost::spirit::ucd::detail).
 template <typename Out>
 void print_tail(Out& out)
 {
     out
         << "\n"
         << "}}}} // namespace boost::spirit::ucd::detail\n"
         ;
 }

 // Returns the spelling of the Boost fixed-width unsigned integer type that
 // occupies `size` bytes (1, 2, 4 or 8). Any other size trips BOOST_ASSERT
 // and yields a null pointer.
 char const* get_int_type_name(int size)
 {
     switch (size)
     {
         case 1: return "::boost::uint8_t";
         case 2: return "::boost::uint16_t";
         case 4: return "::boost::uint32_t";
         case 8: return "::boost::uint64_t";    // a 64-bit integer is 8 bytes
         default: BOOST_ASSERT(false); return 0; // invalid size
     }
 }

 // Emits one complete generated header to `out`.
 //
 // The header consists of the preamble from print_head, the `<name>_stage1`
 // array, the `<name>_stage2` array (one commented section per block), an
 // inline `<name>_lookup` function combining both, and the closing written by
 // print_tail. Progress and size information goes to std::cout.
 //
 //  builder:     supplies block_size, value_type and build()
 //  field_width: column width used for the stage2 values
 //  name:        prefix of the generated identifiers
 template <typename Out, typename Builder>
 void print_file(Out& out, Builder& builder, int field_width, char const* name)
 {
     typedef typename Builder::value_type value_type;
     typedef std::vector<value_type const*> block_list;
     typedef typename block_list::size_type block_index;

     std::cout << "Generating " << name << " tables" << std::endl;

     uint32_t const block_size = Builder::block_size;
     print_head(out);

     std::vector<uint8_t> stage1;
     block_list stage2;
     builder.build(stage1, stage2);

     std::cout << "Block Size: " << block_size << std::endl;
     std::cout << "Total Bytes: "
         << stage1.size()+(stage2.size()*block_size*sizeof(value_type))
         << std::endl;

     // first stage: block number -> index of the block in stage2
     out << "\n";
     out << "    static const ::boost::uint8_t " << name << "_stage1[] = {\n";
     out << "\n";
     print_table(out, stage1, false, 3);

     char const* int_name = get_int_type_name(sizeof(value_type));

     // second stage: the distinct blocks, written back to back
     out << "\n";
     out << "    };";
     out << "\n";
     out << "\n";
     out << "    static const " << int_name << ' ' << name << "_stage2[] = {";

     block_index const block_count = stage2.size();
     for (block_index n = 0; n != block_count; ++n)
     {
         value_type const* const first = stage2[n];
         out << "\n\n    // block " << n << std::endl;
         // every block but the last is followed by a comma
         print_table(out,
             boost::iterator_range<value_type const*>(first, first+block_size),
             n + 1 != block_count, field_width);
     }

     out << "\n";
     out << "    };";
     out << "\n";

     // lookup function tying the two stages together
     out << "\n";
     out << "    inline " << int_name << ' ' << name << "_lookup(::boost::uint32_t ch)\n";
     out << "    {\n";
     out << "        ::boost::uint32_t block_offset = " << name << "_stage1[ch / " << block_size << "] * " << block_size << ";\n";
     out << "        return " << name << "_stage2[block_offset + ch % " << block_size << "];\n";
     out << "    }\n";

     print_tail(out);
 }

 int main()
 {
     // The category tables
     {
         std::ofstream out("category_table.hpp");
         ucd_table_builder<uint16_t, 256> builder;
         builder.collect("UnicodeData.txt", 2);
         builder.collect("DerivedCoreProperties.txt", 1);
         builder.collect("PropList.txt", 1);
         print_file(out, builder, 4, "category");
     }

     // The script tables
     {
         std::ofstream out("script_table.hpp");
         ucd_table_builder<uint8_t, 256> builder;
         builder.collect("Scripts.txt", 1);
         print_file(out, builder, 3, "script");
     }

     // The lowercase tables
     {
         std::ofstream out("lowercase_table.hpp");
         ucd_table_builder<uint32_t, 256> builder;
         builder.collect("UnicodeData.txt", 13, false);
         print_file(out, builder, 6, "lowercase");
     }

     // The uppercase tables
     {
         std::ofstream out("uppercase_table.hpp");
         ucd_table_builder<uint32_t, 256> builder;
         builder.collect("UnicodeData.txt", 12, false);
         print_file(out, builder, 6, "uppercase");
     }

     return 0;
 }
	/*=============================================================================
	Copyright (c) 2001-2011 Joel de Guzman

	Distributed under the Boost Software License, Version 1.0. (See accompanying
	file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
	=============================================================================*/
	#include <boost/config/warning_disable.hpp>
	#include <boost/spirit/include/qi.hpp>
	#include <boost/spirit/include/phoenix.hpp>
	#include <boost/unordered_map.hpp>
	#include <boost/algorithm/string/trim.hpp>
	#include <boost/cstdint.hpp>
	#include <boost/foreach.hpp>
	#include <boost/array.hpp>
	#include <boost/scoped_array.hpp>
	#include <boost/range/iterator_range.hpp>

	#include <iostream>
	#include <fstream>
	#include <vector>
	#include <algorithm>
	#include <string>
	#include <map>

	// We place the data here. Each line comprises various fields
	typedef std::vector<std::string> ucd_line;
	typedef std::vector<ucd_line> ucd_vector;
	typedef std::vector<ucd_line>::iterator ucd_iterator;

	// spirit and phoenix using declarations
	using boost::spirit::qi::parse;
	using boost::spirit::qi::hex;
	using boost::spirit::qi::char_;
	using boost::spirit::qi::eol;
	using boost::spirit::qi::rule;
	using boost::spirit::qi::omit;
	using boost::spirit::qi::_1;
	using boost::spirit::qi::_val;
	using boost::phoenix::push_back;
	using boost::phoenix::ref;

	// basic unsigned types
	using boost::uint8_t;
	using boost::uint16_t;
	using boost::uint32_t;

	// a char range
	// A character range described by its first and last code point.
	struct ucd_range
	{
	    ucd_range(uint32_t start, uint32_t finish)
	        : start(start)
	        , finish(finish)
	    {
	    }

	    // Ordering used when ucd_range serves as a multimap key: only the first
	    // code point takes part in the comparison, `finish` is ignored.
	    friend bool operator<(ucd_range const& a, ucd_range const& b)
	    {
	        return b.start > a.start;
	    }

	    uint32_t start;
	    uint32_t finish;
	};

	// Holds the contents of one Unicode Character Database (UCD) text file as a
	// list of lines, each split into its ';'-separated fields, and copies a
	// chosen field into a table indexed by code point.
	class ucd_info
	{
	public:

	    // Reads and tokenizes `filename`.
	    //
	    // '#' comments (up to the end of the line) and empty lines are consumed
	    // by the grammar and never stored. If the file cannot be opened an error
	    // is written to std::cerr and the object is left without any lines; if
	    // the grammar stops before the end of the file a warning is written and
	    // the lines parsed so far are kept.
	    ucd_info(char const* filename)
	    {
	        std::ifstream in(filename, std::ios_base::in);
	        if (!in)
	        {
	            std::cerr << "Error: Could not open input file: "
	                << filename << std::endl;
	            return;
	        }

	        std::string data;               // the complete file contents
	        in.unsetf(std::ios::skipws);    // white space is significant here
	        std::copy(
	            std::istream_iterator<char>(in),
	            std::istream_iterator<char>(),
	            std::back_inserter(data));

	        typedef std::string::const_iterator iterator_type;
	        iterator_type f = data.begin();
	        iterator_type l = data.end();

	        // end of line, optionally preceded by a '#' comment
	        rule<iterator_type> endl = -('#' >> *(char_-eol)) >> eol;
	        // one field: everything up to the next ';' or the end of the line
	        rule<iterator_type, std::string()> field = *(char_-(';'|endl)) >> (';'|&endl);
	        rule<iterator_type, ucd_line()> line = +(field-endl) >> endl;
	        rule<iterator_type, std::vector<ucd_line>()> file = +(endl | line[push_back(_val, _1)]);

	        // parse() advances f to the position where matching stopped, so
	        // anything left over means part of the file was not understood.
	        parse(f, l, file, info);
	        if (f != l)
	        {
	            std::cerr << "Warning: Could not parse all of input file: "
	                << filename << std::endl;
	        }
	    }

	    // Copies column `field` of every stored line into `data`, which is
	    // indexed by code point.
	    //
	    // Column 0 of a line is read as a hexadecimal code point or as a
	    // "XXXX..YYYY" range; a UnicodeData.txt style pair of lines whose second
	    // column contains "First>" / "Last>" is treated as one range. Lines whose
	    // first column cannot be parsed are skipped.
	    //
	    //  collect_properties == true:  the column is looked up in the property
	    //      map and, when known, its value is OR-ed into data[i].
	    //  collect_properties == false: the column is parsed as a hexadecimal
	    //      number and stored in data[i]; an empty column stores 0.
	    //
	    // NOTE(review): `data` is assumed to be large enough for every code point
	    // named in the file; its size cannot be checked from here.
	    template <typename Array>
	    void collect(Array& data, int field, bool collect_properties = true) const
	    {
	        BOOST_ASSERT(!info.empty());
	        BOOST_ASSERT(field >= 0);
	        ucd_vector::const_iterator current = info.begin();
	        ucd_vector::const_iterator const end = info.end();

	        for (; current != end; ++current)
	        {
	            if (current->empty())
	                continue;

	            std::string range = (*current)[0];
	            boost::trim(range);

	            std::string::const_iterator f = range.begin();
	            std::string::const_iterator l = range.end();

	            // get the code-point range; without a leading hex number there
	            // is nothing meaningful to store for this line
	            uint32_t start = 0;
	            uint32_t finish = 0;
	            if (!parse(f, l, hex[ref(start) = ref(finish) = _1] >> -(".." >> hex[ref(finish) = _1])))
	                continue;

	            // special case for UnicodeData.txt ranges:
	            if (current->size() > 1
	                && (*current)[1].find("First>") != std::string::npos)
	            {
	                ++current;
	                BOOST_ASSERT(current != end);
	                if (current == end)
	                    break;  // "First>" without a following line
	                BOOST_ASSERT(current->size() > 1
	                    && (*current)[1].find("Last>") != std::string::npos);

	                std::string last = current->empty() ? std::string() : (*current)[0];
	                boost::trim(last);
	                std::string::const_iterator lf = last.begin();
	                std::string::const_iterator ll = last.end();

	                if (!parse(lf, ll, hex[ref(finish) = _1]))
	                    continue;
	            }

	            std::string code;
	            if (field >= 0 && field < int(current->size()))
	                code = (*current)[field];
	            boost::trim(code);

	            if (collect_properties) // code for properties
	            {
	                // Only collect properties we are interested in. The lookup
	                // does not depend on the code point, so do it once per line.
	                if (!ignore_property(code))
	                {
	                    int const value = map_property(code);
	                    for (uint32_t i = start; i <= finish; ++i)
	                        data[i] |= value;
	                }
	            }
	            else // code for actual numeric values
	            {
	                for (uint32_t i = start; i <= finish; ++i)
	                {
	                    if (code.empty())
	                    {
	                        data[i] = 0; // signal that this code maps to itself
	                    }
	                    else
	                    {
	                        std::string::const_iterator cf = code.begin();
	                        std::string::const_iterator cl = code.end();
	                        parse(cf, cl, hex, data[i]);
	                    }
	                }
	            }
	        }
	    }

	private:

	    // True when `p` is not one of the property names in the property map.
	    static bool ignore_property(std::string const& p)
	    {
	        // We don't handle all properties
	        std::map<std::string, int> const& pm = get_property_map();
	        return pm.find(p) == pm.end();
	    }

	    // Value registered for property name `p`; `p` must be a known property.
	    static int
	    map_property(std::string const& p)
	    {
	        std::map<std::string, int>& pm = get_property_map();
	        std::map<std::string, int>::const_iterator i = pm.find(p);
	        BOOST_ASSERT(i != pm.end());
	        return i->second;
	    }

	    // Lazily filled map from property name to the value stored in the
	    // tables. General_Category and Script values overlap numerically.
	    static std::map<std::string, int>&
	    get_property_map()
	    {
	        // The properties we are interested in:
	        static std::map<std::string, int> map;
	        if (map.empty())
	        {
	            // General_Category
	            map["Lu"] = 0;
	            map["Ll"] = 1;
	            map["Lt"] = 2;
	            map["Lm"] = 3;
	            map["Lo"] = 4;

	            map["Mn"] = 8;
	            map["Me"] = 9;
	            map["Mc"] = 10;

	            map["Nd"] = 16;
	            map["Nl"] = 17;
	            map["No"] = 18;

	            map["Zs"] = 24;
	            map["Zl"] = 25;
	            map["Zp"] = 26;

	            map["Cc"] = 32;
	            map["Cf"] = 33;
	            map["Co"] = 34;
	            map["Cs"] = 35;
	            map["Cn"] = 36;

	            map["Pd"] = 40;
	            map["Ps"] = 41;
	            map["Pe"] = 42;
	            map["Pc"] = 43;
	            map["Po"] = 44;
	            map["Pi"] = 45;
	            map["Pf"] = 46;

	            map["Sm"] = 48;
	            map["Sc"] = 49;
	            map["Sk"] = 50;
	            map["So"] = 51;

	            // Derived Properties.
	            map["Alphabetic"] = 64;
	            map["Uppercase"] = 128;
	            map["Lowercase"] = 256;
	            map["White_Space"] = 512;
	            map["Hex_Digit"] = 1024;
	            map["Noncharacter_Code_Point"] = 2048;
	            map["Default_Ignorable_Code_Point"] = 4096;

	            // Script
	            map["Arabic"] = 0;
	            map["Imperial_Aramaic"] = 1;
	            map["Armenian"] = 2;
	            map["Avestan"] = 3;
	            map["Balinese"] = 4;
	            map["Bamum"] = 5;
	            map["Bengali"] = 6;
	            map["Bopomofo"] = 7;
	            map["Braille"] = 8;
	            map["Buginese"] = 9;
	            map["Buhid"] = 10;
	            map["Canadian_Aboriginal"] = 11;
	            map["Carian"] = 12;
	            map["Cham"] = 13;
	            map["Cherokee"] = 14;
	            map["Coptic"] = 15;
	            map["Cypriot"] = 16;
	            map["Cyrillic"] = 17;
	            map["Devanagari"] = 18;
	            map["Deseret"] = 19;
	            map["Egyptian_Hieroglyphs"] = 20;
	            map["Ethiopic"] = 21;
	            map["Georgian"] = 22;
	            map["Glagolitic"] = 23;
	            map["Gothic"] = 24;
	            map["Greek"] = 25;
	            map["Gujarati"] = 26;
	            map["Gurmukhi"] = 27;
	            map["Hangul"] = 28;
	            map["Han"] = 29;
	            map["Hanunoo"] = 30;
	            map["Hebrew"] = 31;
	            map["Hiragana"] = 32;
	            map["Katakana_Or_Hiragana"] = 33;
	            map["Old_Italic"] = 34;
	            map["Javanese"] = 35;
	            map["Kayah_Li"] = 36;
	            map["Katakana"] = 37;
	            map["Kharoshthi"] = 38;
	            map["Khmer"] = 39;
	            map["Kannada"] = 40;
	            map["Kaithi"] = 41;
	            map["Tai_Tham"] = 42;
	            map["Lao"] = 43;
	            map["Latin"] = 44;
	            map["Lepcha"] = 45;
	            map["Limbu"] = 46;
	            map["Linear_B"] = 47;
	            map["Lisu"] = 48;
	            map["Lycian"] = 49;
	            map["Lydian"] = 50;
	            map["Malayalam"] = 51;
	            map["Mongolian"] = 52;
	            map["Meetei_Mayek"] = 53;
	            map["Myanmar"] = 54;
	            map["Nko"] = 55;
	            map["Ogham"] = 56;
	            map["Ol_Chiki"] = 57;
	            map["Old_Turkic"] = 58;
	            map["Oriya"] = 59;
	            map["Osmanya"] = 60;
	            map["Phags_Pa"] = 61;
	            map["Inscriptional_Pahlavi"] = 62;
	            map["Phoenician"] = 63;
	            map["Inscriptional_Parthian"] = 64;
	            map["Rejang"] = 65;
	            map["Runic"] = 66;
	            map["Samaritan"] = 67;
	            map["Old_South_Arabian"] = 68;
	            map["Saurashtra"] = 69;
	            map["Shavian"] = 70;
	            map["Sinhala"] = 71;
	            map["Sundanese"] = 72;
	            map["Syloti_Nagri"] = 73;
	            map["Syriac"] = 74;
	            map["Tagbanwa"] = 75;
	            map["Tai_Le"] = 76;
	            map["New_Tai_Lue"] = 77;
	            map["Tamil"] = 78;
	            map["Tai_Viet"] = 79;
	            map["Telugu"] = 80;
	            map["Tifinagh"] = 81;
	            map["Tagalog"] = 82;
	            map["Thaana"] = 83;
	            map["Thai"] = 84;
	            map["Tibetan"] = 85;
	            map["Ugaritic"] = 86;
	            map["Vai"] = 87;
	            map["Old_Persian"] = 88;
	            map["Cuneiform"] = 89;
	            map["Yi"] = 90;
	            map["Inherited"] = 91;
	            map["Common"] = 92;
	            map["Unknown"] = 93;
	        }
	        return map;
	    }

	    ucd_vector info;
	};

	template <typename T, uint32_t block_size_ = 256>
	class ucd_table_builder
	{
	public:

	static uint32_t const block_size = block_size_;
	static uint32_t const full_span = 0x110000;
	typedef T value_type;

	ucd_table_builder() : p(new T[full_span])
	{
	for (uint32_t i = 0; i < full_span; ++i)
	p[i] = 0;
	}

	void collect(char const* filename, int field, bool collect_properties = true)
	{
	std::cout << "collecting " << filename << std::endl;
	ucd_info info(filename);
	info.collect(p, field, collect_properties);
	}

	void build(std::vector<uint8_t>& stage1, std::vector<T const*>& stage2)
	{
	std::cout << "building tables" << std::endl;
	std::map<block_ptr, std::vector<T const*> > blocks;
	for (T const* i = p.get(); i < (p.get() + full_span); i += block_size)
	blocks[block_ptr(i)].push_back(i);

	// Not enough bits to store the block indices.
	BOOST_ASSERT(blocks.size() < (1 << (sizeof(uint8_t) * 8)));

	typedef std::pair<block_ptr, std::vector<T const*> > blocks_value_type;
	std::map<T const, std::vector<T const> > sorted_blocks;
	BOOST_FOREACH(blocks_value_type const& val, blocks)
	{
	sorted_blocks[val.first.p] = val.second;
	}

	stage1.clear();
	stage1.reserve(full_span / block_size);
	stage1.resize(full_span / block_size);
	stage2.clear();
	stage2.reserve(blocks.size());

	typedef std::pair<T const, std::vector<T const> > sorted_blocks_value_type;
	BOOST_FOREACH(sorted_blocks_value_type const& val, sorted_blocks)
	{
	stage2.push_back(val.first);
	BOOST_FOREACH(T const* val2, val.second)
	{
	stage1[(val2 - p.get()) / block_size] = stage2.size() - 1;
	}
	}
	}

	private:

	struct block_ptr
	{
	block_ptr(T const* p) : p(p) {}

	friend bool operator<(block_ptr a, block_ptr b)
	{
	return std::lexicographical_compare(
	a.p, a.p + block_size, b.p, b.p + block_size);
	}

	T const* p;
	};

	boost::scoped_array<T> p;
	};

	// Writes `tab` space characters to `out`, one at a time; writes nothing
	// when `tab` is zero or negative.
	template <typename Out>
	void print_tab(Out& out, int tab)
	{
	    int remaining = tab;
	    while (remaining > 0)
	    {
	        out << ' ';
	        --remaining;
	    }
	}

	// Writes the elements of `c` to `out` as a comma-separated list of integers.
	//
	//  trailing_comma: when true, ", " and a line break follow the last element
	//  width:          field width applied to every element
	//  group:          number of elements per output line
	//
	// Every line starts with four spaces. Elements are converted to int before
	// being written. `c` is expected to hold more than one element.
	template <typename Out, typename C>
	void print_table(Out& out, C const& c, bool trailing_comma, int width = 4, int group = 16)
	{
	    // C is a template parameter, so its nested type needs `typename`.
	    typedef typename C::size_type size_type;

	    int const tab = 4;
	    size_type const size = c.size();
	    BOOST_ASSERT(size > 1);
	    if (size == 0)
	        return;     // keeps c[0] below in bounds when assertions are off

	    print_tab(out, tab);
	    out << std::setw(width) << int(c[0]);
	    for (size_type i = 1; i < size; ++i)
	    {
	        out << ", ";
	        // group <= 0 never breaks the line (and never divides by zero)
	        if (group > 0 && (i % group) == 0)
	        {
	            out << std::endl;
	            print_tab(out, tab);
	        }
	        out << std::setw(width) << int(c[i]);
	    }

	    if (trailing_comma)
	        out << ", " << std::endl;
	}

	// Writes the fixed preamble of a generated header to `out`: the license
	// banner, the <boost/cstdint.hpp> include and the opening of the
	// boost::spirit::ucd::detail namespace. No line break follows the final '{'.
	template <typename Out>
	void print_head(Out& out)
	{
	    static char const* const pieces[] =
	    {
	        "/*=============================================================================\n",
	        " Copyright (c) 2001-2011 Joel de Guzman\n",
	        "\n",
	        " Distributed under the Boost Software License, Version 1.0. (See accompanying\n",
	        " file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)\n",
	        "\n",
	        " AUTOGENERATED. DO NOT EDIT!!!\n",
	        "==============================================================================*/\n",
	        "#include <boost/cstdint.hpp>\n",
	        "\n",
	        "namespace boost { namespace spirit { namespace ucd { namespace detail\n",
	        "{"
	    };

	    for (unsigned n = 0; n < sizeof(pieces) / sizeof(pieces[0]); ++n)
	        out << pieces[n];
	}

	// Writes the closing of a generated header to `out`: a line break followed
	// by the four braces that close the namespaces opened by print_head. The
	// trailing comment names the namespace print_head actually opens
	// (boost::spirit::ucd::detail).
	template <typename Out>
	void print_tail(Out& out)
	{
	    out
	        << "\n"
	        << "}}}} // namespace boost::spirit::ucd::detail\n"
	        ;
	}

	// Returns the spelling of the Boost fixed-width unsigned integer type that
	// occupies `size` bytes (1, 2, 4 or 8). Any other size trips BOOST_ASSERT
	// and yields a null pointer.
	char const* get_int_type_name(int size)
	{
	    switch (size)
	    {
	        case 1: return "::boost::uint8_t";
	        case 2: return "::boost::uint16_t";
	        case 4: return "::boost::uint32_t";
	        case 8: return "::boost::uint64_t";    // a 64-bit integer is 8 bytes
	        default: BOOST_ASSERT(false); return 0; // invalid size
	    }
	}

	// Emits one complete generated header to `out`.
	//
	// The header consists of the preamble from print_head, the `<name>_stage1`
	// array, the `<name>_stage2` array (one commented section per block), an
	// inline `<name>_lookup` function combining both, and the closing written by
	// print_tail. Progress and size information goes to std::cout.
	//
	//  builder:     supplies block_size, value_type and build()
	//  field_width: column width used for the stage2 values
	//  name:        prefix of the generated identifiers
	template <typename Out, typename Builder>
	void print_file(Out& out, Builder& builder, int field_width, char const* name)
	{
	    std::cout << "Generating " << name << " tables" << std::endl;

	    uint32_t const block_size = Builder::block_size;
	    typedef typename Builder::value_type value_type;
	    print_head(out);

	    std::vector<uint8_t> stage1;
	    std::vector<value_type const*> stage2;
	    builder.build(stage1, stage2);
	    std::cout << "Block Size: " << block_size << std::endl;
	    // one byte per stage1 entry plus the storage of every stage2 block
	    std::cout << "Total Bytes: "
	        << stage1.size()+(stage2.size()*block_size*sizeof(value_type))
	        << std::endl;

	    // first stage: block number -> index of the block in stage2
	    out
	        << "\n"
	        << " static const ::boost::uint8_t " << name << "_stage1[] = {\n"
	        << "\n"
	        ;

	    print_table(out, stage1, false, 3);
	    char const* int_name = get_int_type_name(sizeof(value_type));

	    // second stage: the distinct blocks, written back to back
	    out
	        << "\n"
	        << " };"
	        << "\n"
	        << "\n"
	        << " static const " << int_name << ' ' << name << "_stage2[] = {"
	        ;

	    int block_n = 0;
	    for (int i = 0; i < int(stage2.size()); ++i)
	    {
	        value_type const* p = stage2[i];
	        // every block but the last is followed by a comma
	        bool last = (i+1 == int(stage2.size()));
	        out << "\n\n // block " << block_n++ << std::endl;
	        print_table(out,
	            boost::iterator_range<value_type const*>(p, p+block_size), !last, field_width);
	    }

	    out
	        << "\n"
	        << " };"
	        << "\n"
	        ;

	    // lookup function tying the two stages together
	    out
	        << "\n"
	        << " inline " << int_name << ' ' << name << "_lookup(::boost::uint32_t ch)\n"
	        << " {\n"
	        << " ::boost::uint32_t block_offset = " << name << "_stage1[ch / " << block_size << "] * " << block_size << ";\n"
	        << " return " << name << "_stage2[block_offset + ch % " << block_size << "];\n"
	        << " }\n"
	        ;

	    print_tail(out);
	}

	int main()
	{
	// The category tables
	{
	std::ofstream out("category_table.hpp");
	ucd_table_builder<uint16_t, 256> builder;
	builder.collect("UnicodeData.txt", 2);
	builder.collect("DerivedCoreProperties.txt", 1);
	builder.collect("PropList.txt", 1);
	print_file(out, builder, 4, "category");
	}

	// The script tables
	{
	std::ofstream out("script_table.hpp");
	ucd_table_builder<uint8_t, 256> builder;
	builder.collect("Scripts.txt", 1);
	print_file(out, builder, 3, "script");
	}

	// The lowercase tables
	{
	std::ofstream out("lowercase_table.hpp");
	ucd_table_builder<uint32_t, 256> builder;
	builder.collect("UnicodeData.txt", 13, false);
	print_file(out, builder, 6, "lowercase");
	}

	// The uppercase tables
	{
	std::ofstream out("uppercase_table.hpp");
	ucd_table_builder<uint32_t, 256> builder;
	builder.collect("UnicodeData.txt", 12, false);
	print_file(out, builder, 6, "uppercase");
	}

	return 0;
	}