/*=============================================================================
Copyright (c) 2001-2011 Joel de Guzman
Distributed under the Boost Software License, Version 1.0. (See accompanying
file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
=============================================================================*/
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/unordered_map.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <boost/cstdint.hpp>
#include <boost/foreach.hpp>
#include <boost/array.hpp>
#include <boost/scoped_array.hpp>
#include <boost/range/iterator_range.hpp>

#include <algorithm>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <map>
#include <string>
#include <vector>
// We place the data here. Each line comprises various fields | |
typedef std::vector<std::string> ucd_line; | |
typedef std::vector<ucd_line> ucd_vector; | |
typedef std::vector<ucd_line>::iterator ucd_iterator; | |
// spirit and phoenix using declarations | |
using boost::spirit::qi::parse; | |
using boost::spirit::qi::hex; | |
using boost::spirit::qi::char_; | |
using boost::spirit::qi::eol; | |
using boost::spirit::qi::rule; | |
using boost::spirit::qi::omit; | |
using boost::spirit::qi::_1; | |
using boost::spirit::qi::_val; | |
using boost::phoenix::push_back; | |
using boost::phoenix::ref; | |
// basic unsigned types | |
using boost::uint8_t; | |
using boost::uint16_t; | |
using boost::uint32_t; | |
// A character range described by its start and finish code points.
struct ucd_range
{
    uint32_t start;     // code point the range starts at
    uint32_t finish;    // code point the range finishes at

    ucd_range(uint32_t start, uint32_t finish)
      : start(start)
      , finish(finish)
    {
    }

    // Ordering looks at the start of the range only; it is provided so
    // that ucd_range can be used as a multimap key.
    friend bool operator<(ucd_range const& a, ucd_range const& b)
    {
        return b.start > a.start;
    }
};
class ucd_info | |
{ | |
public: | |
ucd_info(char const* filename) | |
{ | |
std::ifstream in(filename, std::ios_base::in); | |
if (!in) | |
{ | |
std::cerr << "Error: Could not open input file: " | |
<< filename << std::endl; | |
} | |
else | |
{ | |
std::string data; // We will read the contents here. | |
in.unsetf(std::ios::skipws); // No white space skipping! | |
std::copy( | |
std::istream_iterator<char>(in), | |
std::istream_iterator<char>(), | |
std::back_inserter(data)); | |
typedef std::string::const_iterator iterator_type; | |
iterator_type f = data.begin(); | |
iterator_type l = data.end(); | |
rule<iterator_type> endl = -('#' >> *(char_-eol)) >> eol; | |
rule<iterator_type, std::string()> field = *(char_-(';'|endl)) >> (';'|&endl); | |
rule<iterator_type, ucd_line()> line = +(field-endl) >> endl; | |
rule<iterator_type, std::vector<ucd_line>()> file = +(endl | line[push_back(_val, _1)]); | |
parse(f, l, file, info); | |
} | |
} | |
template <typename Array> | |
void collect(Array& data, int field, bool collect_properties = true) const | |
{ | |
BOOST_ASSERT(!info.empty()); | |
ucd_vector::const_iterator current = info.begin(); | |
ucd_vector::const_iterator end = info.end(); | |
while (current != end) | |
{ | |
std::string range = (*current)[0]; | |
boost::trim(range); | |
std::string::const_iterator f = range.begin(); | |
std::string::const_iterator l = range.end(); | |
// get the code-point range | |
uint32_t start; | |
uint32_t finish; | |
parse(f, l, hex[ref(start) = ref(finish) = _1] >> -(".." >> hex[ref(finish) = _1])); | |
// special case for UnicodeData.txt ranges: | |
if ((*current)[1].find("First>") != std::string::npos) | |
{ | |
++current; | |
BOOST_ASSERT(current != end); | |
BOOST_ASSERT((*current)[1].find("Last>") != std::string::npos); | |
std::string range = (*current)[0]; | |
boost::trim(range); | |
f = range.begin(); | |
l = range.end(); | |
parse(f, l, hex[ref(finish) = _1]); | |
} | |
std::string code; | |
if (field < int(current->size())) | |
code = (*current)[field]; | |
boost::trim(code); | |
// Only collect properties we are interested in | |
if (collect_properties) // code for properties | |
{ | |
if (!ignore_property(code)) | |
{ | |
for (uint32_t i = start; i <= finish; ++i) | |
data[i] |= map_property(code); | |
} | |
} | |
else // code for actual numeric values | |
{ | |
for (uint32_t i = start; i <= finish; ++i) | |
{ | |
if (code.empty()) | |
{ | |
data[i] = 0; // signal that this code maps to itself | |
} | |
else | |
{ | |
f = code.begin(); | |
l = code.end(); | |
parse(f, l, hex, data[i]); | |
} | |
} | |
} | |
++current; | |
} | |
} | |
private: | |
static bool ignore_property(std::string const& p) | |
{ | |
// We don't handle all properties | |
std::map<std::string, int>& pm = get_property_map(); | |
std::map<std::string, int>::iterator i = pm.find(p); | |
return i == pm.end(); | |
} | |
static int | |
map_property(std::string const& p) | |
{ | |
std::map<std::string, int>& pm = get_property_map(); | |
std::map<std::string, int>::iterator i = pm.find(p); | |
BOOST_ASSERT(i != pm.end()); | |
return i->second; | |
} | |
static std::map<std::string, int>& | |
get_property_map() | |
{ | |
// The properties we are interested in: | |
static std::map<std::string, int> map; | |
if (map.empty()) | |
{ | |
// General_Category | |
map["Lu"] = 0; | |
map["Ll"] = 1; | |
map["Lt"] = 2; | |
map["Lm"] = 3; | |
map["Lo"] = 4; | |
map["Mn"] = 8; | |
map["Me"] = 9; | |
map["Mc"] = 10; | |
map["Nd"] = 16; | |
map["Nl"] = 17; | |
map["No"] = 18; | |
map["Zs"] = 24; | |
map["Zl"] = 25; | |
map["Zp"] = 26; | |
map["Cc"] = 32; | |
map["Cf"] = 33; | |
map["Co"] = 34; | |
map["Cs"] = 35; | |
map["Cn"] = 36; | |
map["Pd"] = 40; | |
map["Ps"] = 41; | |
map["Pe"] = 42; | |
map["Pc"] = 43; | |
map["Po"] = 44; | |
map["Pi"] = 45; | |
map["Pf"] = 46; | |
map["Sm"] = 48; | |
map["Sc"] = 49; | |
map["Sk"] = 50; | |
map["So"] = 51; | |
// Derived Properties. | |
map["Alphabetic"] = 64; | |
map["Uppercase"] = 128; | |
map["Lowercase"] = 256; | |
map["White_Space"] = 512; | |
map["Hex_Digit"] = 1024; | |
map["Noncharacter_Code_Point"] = 2048; | |
map["Default_Ignorable_Code_Point"] = 4096; | |
// Script | |
map["Arabic"] = 0; | |
map["Imperial_Aramaic"] = 1; | |
map["Armenian"] = 2; | |
map["Avestan"] = 3; | |
map["Balinese"] = 4; | |
map["Bamum"] = 5; | |
map["Bengali"] = 6; | |
map["Bopomofo"] = 7; | |
map["Braille"] = 8; | |
map["Buginese"] = 9; | |
map["Buhid"] = 10; | |
map["Canadian_Aboriginal"] = 11; | |
map["Carian"] = 12; | |
map["Cham"] = 13; | |
map["Cherokee"] = 14; | |
map["Coptic"] = 15; | |
map["Cypriot"] = 16; | |
map["Cyrillic"] = 17; | |
map["Devanagari"] = 18; | |
map["Deseret"] = 19; | |
map["Egyptian_Hieroglyphs"] = 20; | |
map["Ethiopic"] = 21; | |
map["Georgian"] = 22; | |
map["Glagolitic"] = 23; | |
map["Gothic"] = 24; | |
map["Greek"] = 25; | |
map["Gujarati"] = 26; | |
map["Gurmukhi"] = 27; | |
map["Hangul"] = 28; | |
map["Han"] = 29; | |
map["Hanunoo"] = 30; | |
map["Hebrew"] = 31; | |
map["Hiragana"] = 32; | |
map["Katakana_Or_Hiragana"] = 33; | |
map["Old_Italic"] = 34; | |
map["Javanese"] = 35; | |
map["Kayah_Li"] = 36; | |
map["Katakana"] = 37; | |
map["Kharoshthi"] = 38; | |
map["Khmer"] = 39; | |
map["Kannada"] = 40; | |
map["Kaithi"] = 41; | |
map["Tai_Tham"] = 42; | |
map["Lao"] = 43; | |
map["Latin"] = 44; | |
map["Lepcha"] = 45; | |
map["Limbu"] = 46; | |
map["Linear_B"] = 47; | |
map["Lisu"] = 48; | |
map["Lycian"] = 49; | |
map["Lydian"] = 50; | |
map["Malayalam"] = 51; | |
map["Mongolian"] = 52; | |
map["Meetei_Mayek"] = 53; | |
map["Myanmar"] = 54; | |
map["Nko"] = 55; | |
map["Ogham"] = 56; | |
map["Ol_Chiki"] = 57; | |
map["Old_Turkic"] = 58; | |
map["Oriya"] = 59; | |
map["Osmanya"] = 60; | |
map["Phags_Pa"] = 61; | |
map["Inscriptional_Pahlavi"] = 62; | |
map["Phoenician"] = 63; | |
map["Inscriptional_Parthian"] = 64; | |
map["Rejang"] = 65; | |
map["Runic"] = 66; | |
map["Samaritan"] = 67; | |
map["Old_South_Arabian"] = 68; | |
map["Saurashtra"] = 69; | |
map["Shavian"] = 70; | |
map["Sinhala"] = 71; | |
map["Sundanese"] = 72; | |
map["Syloti_Nagri"] = 73; | |
map["Syriac"] = 74; | |
map["Tagbanwa"] = 75; | |
map["Tai_Le"] = 76; | |
map["New_Tai_Lue"] = 77; | |
map["Tamil"] = 78; | |
map["Tai_Viet"] = 79; | |
map["Telugu"] = 80; | |
map["Tifinagh"] = 81; | |
map["Tagalog"] = 82; | |
map["Thaana"] = 83; | |
map["Thai"] = 84; | |
map["Tibetan"] = 85; | |
map["Ugaritic"] = 86; | |
map["Vai"] = 87; | |
map["Old_Persian"] = 88; | |
map["Cuneiform"] = 89; | |
map["Yi"] = 90; | |
map["Inherited"] = 91; | |
map["Common"] = 92; | |
map["Unknown"] = 93; | |
} | |
return map; | |
} | |
ucd_vector info; | |
}; | |
template <typename T, uint32_t block_size_ = 256> | |
class ucd_table_builder | |
{ | |
public: | |
static uint32_t const block_size = block_size_; | |
static uint32_t const full_span = 0x110000; | |
typedef T value_type; | |
ucd_table_builder() : p(new T[full_span]) | |
{ | |
for (uint32_t i = 0; i < full_span; ++i) | |
p[i] = 0; | |
} | |
void collect(char const* filename, int field, bool collect_properties = true) | |
{ | |
std::cout << "collecting " << filename << std::endl; | |
ucd_info info(filename); | |
info.collect(p, field, collect_properties); | |
} | |
void build(std::vector<uint8_t>& stage1, std::vector<T const*>& stage2) | |
{ | |
std::cout << "building tables" << std::endl; | |
std::map<block_ptr, std::vector<T const*> > blocks; | |
for (T const* i = p.get(); i < (p.get() + full_span); i += block_size) | |
blocks[block_ptr(i)].push_back(i); | |
// Not enough bits to store the block indices. | |
BOOST_ASSERT(blocks.size() < (1 << (sizeof(uint8_t) * 8))); | |
typedef std::pair<block_ptr, std::vector<T const*> > blocks_value_type; | |
std::map<T const*, std::vector<T const*> > sorted_blocks; | |
BOOST_FOREACH(blocks_value_type const& val, blocks) | |
{ | |
sorted_blocks[val.first.p] = val.second; | |
} | |
stage1.clear(); | |
stage1.reserve(full_span / block_size); | |
stage1.resize(full_span / block_size); | |
stage2.clear(); | |
stage2.reserve(blocks.size()); | |
typedef std::pair<T const*, std::vector<T const*> > sorted_blocks_value_type; | |
BOOST_FOREACH(sorted_blocks_value_type const& val, sorted_blocks) | |
{ | |
stage2.push_back(val.first); | |
BOOST_FOREACH(T const* val2, val.second) | |
{ | |
stage1[(val2 - p.get()) / block_size] = stage2.size() - 1; | |
} | |
} | |
} | |
private: | |
struct block_ptr | |
{ | |
block_ptr(T const* p) : p(p) {} | |
friend bool operator<(block_ptr a, block_ptr b) | |
{ | |
return std::lexicographical_compare( | |
a.p, a.p + block_size, b.p, b.p + block_size); | |
} | |
T const* p; | |
}; | |
boost::scoped_array<T> p; | |
}; | |
// Writes `tab` space characters to `out`; nothing is written when `tab`
// is zero or negative.
template <typename Out>
void print_tab(Out& out, int tab)
{
    int remaining = tab;
    while (remaining > 0)
    {
        out << ' ';
        --remaining;
    }
}
// Writes the elements of `c` to `out` as a comma separated list of integers.
//
// out:            output stream
// c:              container providing size(), operator[] and size_type;
//                 it is expected to hold more than one element
// trailing_comma: when true, ", " and a line break follow the last element
// width:          field width every element is printed with
// group:          number of elements per output line
//
// Every output line is indented by four spaces.
template <typename Out, typename C>
void print_table(Out& out, C const& c, bool trailing_comma, int width = 4, int group = 16)
{
    // C::size_type is a dependent type and has to be introduced with
    // `typename`.
    typedef typename C::size_type size_type;

    int const tab = 4;
    size_type const size = c.size();
    BOOST_ASSERT(size > 1);

    print_tab(out, tab);
    out << std::setw(width) << int(c[0]);
    for (size_type i = 1; i < size; ++i)
    {
        out << ", ";
        if ((i % group) == 0)
        {
            // start a new line after every `group` elements
            out << std::endl;
            print_tab(out, tab);
        }
        out << std::setw(width) << int(c[i]);
    }

    if (trailing_comma)
        out << ", " << std::endl;
}
// Writes the fixed preamble of a generated header to `out`: the licence
// comment, the <boost/cstdint.hpp> include and the opening of the
// boost::spirit::ucd::detail namespace.
template <typename Out>
void print_head(Out& out)
{
    char const* const pieces[] =
    {
        "/*=============================================================================\n",
        " Copyright (c) 2001-2011 Joel de Guzman\n",
        "\n",
        " Distributed under the Boost Software License, Version 1.0. (See accompanying\n",
        " file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)\n",
        "\n",
        " AUTOGENERATED. DO NOT EDIT!!!\n",
        "==============================================================================*/\n",
        "#include <boost/cstdint.hpp>\n",
        "\n",
        "namespace boost { namespace spirit { namespace ucd { namespace detail\n",
        "{"
    };

    // the pieces are inserted one by one, in order
    char const* const* const stop = pieces + sizeof(pieces) / sizeof(pieces[0]);
    for (char const* const* piece = pieces; piece != stop; ++piece)
        out << *piece;
}
// Writes the end of a generated header to `out`: the four braces closing
// the namespaces opened by print_head, followed by a comment naming them.
// The comment names boost::spirit::ucd::detail, the namespace print_head
// actually opens.
template <typename Out>
void print_tail(Out& out)
{
    out
        << "\n"
        << "}}}} // namespace boost::spirit::ucd::detail\n"
        ;
}
char const* get_int_type_name(int size) | |
{ | |
switch (size) | |
{ | |
case 1: return "::boost::uint8_t"; | |
case 2: return "::boost::uint16_t"; | |
case 4: return "::boost::uint32_t"; | |
case 5: return "::boost::uint64_t"; | |
default: BOOST_ASSERT(false); return 0; // invalid size | |
}; | |
} | |
template <typename Out, typename Builder> | |
void print_file(Out& out, Builder& builder, int field_width, char const* name) | |
{ | |
std::cout << "Generating " << name << " tables" << std::endl; | |
uint32_t const block_size = Builder::block_size; | |
typedef typename Builder::value_type value_type; | |
print_head(out); | |
std::vector<uint8_t> stage1; | |
std::vector<value_type const*> stage2; | |
builder.build(stage1, stage2); | |
std::cout << "Block Size: " << block_size << std::endl; | |
std::cout << "Total Bytes: " | |
<< stage1.size()+(stage2.size()*block_size*sizeof(value_type)) | |
<< std::endl; | |
out | |
<< "\n" | |
<< " static const ::boost::uint8_t " << name << "_stage1[] = {\n" | |
<< "\n" | |
; | |
print_table(out, stage1, false, 3); | |
char const* int_name = get_int_type_name(sizeof(value_type)); | |
out | |
<< "\n" | |
<< " };" | |
<< "\n" | |
<< "\n" | |
<< " static const " << int_name << ' ' << name << "_stage2[] = {" | |
; | |
int block_n = 0; | |
for (int i = 0; i < int(stage2.size()); ++i) | |
{ | |
value_type const* p = stage2[i]; | |
bool last = (i+1 == stage2.size()); | |
out << "\n\n // block " << block_n++ << std::endl; | |
print_table(out, | |
boost::iterator_range<value_type const*>(p, p+block_size), !last, field_width); | |
} | |
out | |
<< "\n" | |
<< " };" | |
<< "\n" | |
; | |
out | |
<< "\n" | |
<< " inline " << int_name << ' ' << name << "_lookup(::boost::uint32_t ch)\n" | |
<< " {\n" | |
<< " ::boost::uint32_t block_offset = " << name << "_stage1[ch / " << block_size << "] * " << block_size << ";\n" | |
<< " return " << name << "_stage2[block_offset + ch % " << block_size << "];\n" | |
<< " }\n" | |
; | |
print_tail(out); | |
} | |
int main() | |
{ | |
// The category tables | |
{ | |
std::ofstream out("category_table.hpp"); | |
ucd_table_builder<uint16_t, 256> builder; | |
builder.collect("UnicodeData.txt", 2); | |
builder.collect("DerivedCoreProperties.txt", 1); | |
builder.collect("PropList.txt", 1); | |
print_file(out, builder, 4, "category"); | |
} | |
// The script tables | |
{ | |
std::ofstream out("script_table.hpp"); | |
ucd_table_builder<uint8_t, 256> builder; | |
builder.collect("Scripts.txt", 1); | |
print_file(out, builder, 3, "script"); | |
} | |
// The lowercase tables | |
{ | |
std::ofstream out("lowercase_table.hpp"); | |
ucd_table_builder<uint32_t, 256> builder; | |
builder.collect("UnicodeData.txt", 13, false); | |
print_file(out, builder, 6, "lowercase"); | |
} | |
// The uppercase tables | |
{ | |
std::ofstream out("uppercase_table.hpp"); | |
ucd_table_builder<uint32_t, 256> builder; | |
builder.collect("UnicodeData.txt", 12, false); | |
print_file(out, builder, 6, "uppercase"); | |
} | |
return 0; | |
} |