| // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "third_party/hunspell/google/bdict_writer.h" |
| |
| #include "base/logging.h" |
| #include "base/strings/stringprintf.h" |
| #include "third_party/hunspell/google/bdict.h" |
| |
| namespace hunspell { |
| |
| // Represents one node the word trie in memory. This does not have to be very |
| // efficient since it is only used when building. |
| class DicNode { |
| public: |
| enum StorageType { |
| UNDEFINED, // Uninitialized storage type. |
| LEAF, // Word with no additional string data. |
| LEAFMORE, // Word with additional suffix following. |
| LIST16, // List of sub-nodes with 16-bit relative offsets. |
| LIST8, // List of sub-nodes with 8-bit relative offsets. |
| LOOKUP32, // Lookup table with 32-bit absolute offsets. |
| LOOKUP16, // LOokup table with 16-bit relative offsets. |
| }; |
| |
| DicNode() : addition(0), storage(UNDEFINED) { |
| } |
| |
| ~DicNode() { |
| for (size_t i = 0; i < children.size(); i++) |
| delete children[i]; |
| } |
| |
| bool is_leaf() const { return children.empty(); } |
| |
| // When non-zero, this character is the additional level that this |
| // node represents. This will be 0 for some leaf nodes when there is no |
| // addition and for the root node. |
| char addition; |
| |
| std::vector<DicNode*> children; |
| |
| // When there are no children, this is a leaf node and this "addition string" |
| // is appended to the result. When there are children, this will be empty. |
| std::string leaf_addition; |
| |
| // For leaf nodes, this are the indices into the affix table. |
| std::vector<int> affix_indices; |
| |
| // Initially uninitialized, ComputeStorage() will fill this in with the |
| // desired serialization method. |
| StorageType storage; |
| }; |
| |
| namespace { |
| |
| void SerializeTrie(const DicNode* node, std::string* output); |
| |
| // Returns true if the nth character in the given word is |ch|. Will return |
| // false when there is no nth character. Note that this will also match an |
| // implicit NULL at the end of the string. |
| bool NthCharacterIs(const std::string& word, size_t n, char ch) { |
| if (word.length() < n) // Want to allow n == length() to catch the NULL. |
| return false; |
| return word.c_str()[n] == ch; // Use c_str() to get NULL terminator. |
| } |
| |
| // Recursively build the trie data structure for the range in the |words| list |
| // in [begin, end). It is assumed that all words in that range will have the |
| // same |node_depth - 2| characters at the beginning. This node will key off of |
| // the |node_depth - 1| character, with a special case for the root. |
| // |
| // |prefix_chars| is how deep this node is in the trie (and corresponds to how |
| // many letters of the word we will skip). The root level will have |
| // |prefix_chars| of 0. |
| // |
| // The given |node| will be filled with the data. The return value is the |
| // index into the |words| vector of the next word to process. It will be |
| // equal to |end| when all words have been consumed. |
| size_t BuildTrie(const BDictWriter::WordList& words, |
| size_t begin, size_t end, |
| size_t node_depth, DicNode* node) { |
| // Find the prefix that this node represents. |
| const std::string& begin_str = words[begin].first; |
| if (begin_str.length() < node_depth) { |
| // Singleton. |
| node->addition = 0; |
| node->affix_indices = words[begin].second; |
| return begin + 1; |
| } |
| |
| // Now find the range of words sharing this prefix. |
| size_t match_count; |
| if (node_depth == 0 && begin == 0) { |
| // Special case the root node. |
| match_count = end - begin; |
| node->addition = 0; |
| } else { |
| match_count = 0; |
| node->addition = begin_str[node_depth - 1]; |
| // We know the strings should have [0, node_depth-1) characters at the |
| // beginning already matching, so we only need to check the new one. |
| while (begin + match_count < end && |
| NthCharacterIs(words[begin + match_count].first, |
| node_depth - 1, node->addition)) |
| match_count++; |
| } |
| |
| if (match_count == 1) { |
| // Just found a leaf node with no other words sharing its prefix. Save any |
| // remaining characters and we're done. |
| node->affix_indices = words[begin].second; |
| node->leaf_addition = begin_str.substr(node_depth); |
| return begin + 1; |
| } |
| |
| // We have a range of words, add them as children of this node. |
| size_t i = begin; |
| while (i < begin + match_count) { |
| DicNode* cur = new DicNode; |
| i = BuildTrie(words, i, begin + match_count, node_depth + 1, cur); |
| node->children.push_back(cur); |
| } |
| |
| return begin + match_count; |
| } |
| |
| // Lookup tables are complicated. They can have a magic 0th entry not counted |
| // in the table dimensions, and also have indices only for the used sub-range. |
| // This function will compute the starting point and size of a lookup table, |
| // in addition to whether it should have the magic 0th entry for the given |
| // list of child nodes. |
| void ComputeLookupStrategyDetails(const std::vector<DicNode*>& children, |
| bool* has_0th_entry, |
| int* first_item, |
| int* list_size) { |
| *has_0th_entry = false; |
| *first_item = 0; |
| *list_size = 0; |
| if (children.empty()) |
| return; |
| |
| size_t first_offset = 0; |
| if (children[0]->addition == 0) { |
| *has_0th_entry = true; |
| first_offset++; |
| } |
| |
| if (children.size() == first_offset) |
| return; |
| |
| *first_item = static_cast<unsigned char>(children[first_offset]->addition); |
| unsigned char last_item = children[children.size() - 1]->addition; |
| *list_size = last_item - *first_item + 1; |
| } |
| |
| // Recursively fills in the storage strategy for this node and each of its |
| // children. This must be done before actually serializing because the storage |
| // mode will depend on the size of the children. |
| size_t ComputeTrieStorage(DicNode* node) { |
| if (node->is_leaf()) { |
| // The additional affix list holds affixes when there is more than one. Each |
| // entry is two bytes, plus an additional FFFF terminator. |
| size_t supplimentary_size = 0; |
| if (node->affix_indices[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) { |
| // We cannot store the first affix ID of the affix list into a leaf node. |
| // In this case, we have to store all the affix IDs and a terminator |
| // into a supplimentary list. |
| supplimentary_size = node->affix_indices.size() * 2 + 2; |
| } else if (node->affix_indices.size() > 1) { |
| // We can store the first affix ID of the affix list into a leaf node. |
| // In this case, we need to store the remaining affix IDs and a |
| // terminator into a supplimentary list. |
| supplimentary_size = node->affix_indices.size() * 2; |
| } |
| |
| if (node->leaf_addition.empty()) { |
| node->storage = DicNode::LEAF; |
| return 2 + supplimentary_size; |
| } |
| node->storage = DicNode::LEAFMORE; |
| // Signature & affix (2) + null for leaf_addition (1) = 3 |
| return 3 + node->leaf_addition.size() + supplimentary_size; |
| } |
| |
| // Recursively compute the size of the children for non-leaf nodes. |
| size_t child_size = 0; |
| for (size_t i = 0; i < node->children.size(); i++) |
| child_size += ComputeTrieStorage(node->children[i]); |
| |
| // Fixed size is only 1 byte which is the ID byte and the count combined. |
| static const int kListHeaderSize = 1; |
| |
| // Lists can only store up to 16 items. |
| static const size_t kListThreshold = 16; |
| if (node->children.size() < kListThreshold && child_size <= 0xFF) { |
| node->storage = DicNode::LIST8; |
| return kListHeaderSize + node->children.size() * 2 + child_size; |
| } |
| |
| if (node->children.size() < kListThreshold && child_size <= 0xFFFF) { |
| node->storage = DicNode::LIST16; |
| // Fixed size is one byte plus 3 for each table entry. |
| return kListHeaderSize + node->children.size() * 3 + child_size; |
| } |
| |
| static const int kTableHeaderSize = 2; // Type + table size. |
| |
| bool has_0th_item; |
| int first_table_item, table_item_count; |
| ComputeLookupStrategyDetails(node->children, &has_0th_item, |
| &first_table_item, &table_item_count); |
| if (child_size + kTableHeaderSize + (has_0th_item ? 2 : 0) + |
| table_item_count * 2 < 0xFFFF) { |
| // Use 16-bit addressing since the children will fit. |
| node->storage = DicNode::LOOKUP16; |
| return kTableHeaderSize + (has_0th_item ? 2 : 0) + table_item_count * 2 + |
| child_size; |
| } |
| |
| // Use 32-bit addressing as a last resort. |
| node->storage = DicNode::LOOKUP32; |
| return kTableHeaderSize + (has_0th_item ? 4 : 0) + table_item_count * 4 + |
| child_size; |
| } |
| |
| // Serializes the given node when it is DicNode::LEAF* to the output. |
| void SerializeLeaf(const DicNode* node, std::string* output) { |
| // The low 6 bits of the ID byte are the high 6 bits of the first affix ID. |
| int first_affix = node->affix_indices.size() ? node->affix_indices[0] : 0; |
| |
| // We may store the first value with the node or in the supplimentary list. |
| size_t first_affix_in_supplimentary_list = 1; |
| if (first_affix > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) { |
| // There are not enough bits for this value, move it to the supplimentary |
| // list where there are more bits per value. |
| first_affix_in_supplimentary_list = 0; |
| first_affix = BDict::FIRST_AFFIX_IS_UNUSED; |
| } |
| |
| unsigned char id_byte = (first_affix >> 8) & |
| BDict::LEAF_NODE_FIRST_BYTE_AFFIX_MASK; |
| |
| // The next two bits indicates an additional string and more affixes. |
| if (node->storage == DicNode::LEAFMORE) |
| id_byte |= BDict::LEAF_NODE_ADDITIONAL_VALUE; |
| if (node->affix_indices.size() > 1 || first_affix_in_supplimentary_list == 0) |
| id_byte |= BDict::LEAF_NODE_FOLLOWING_VALUE; |
| output->push_back(id_byte); |
| |
| // Following is the low 8 bits of the affix index. |
| output->push_back(first_affix & 0xff); |
| |
| // Handle the optional addition with NULL terminator. |
| if (node->storage == DicNode::LEAFMORE) { |
| for (size_t i = 0; i < node->leaf_addition.size() + 1; i++) |
| output->push_back(node->leaf_addition.c_str()[i]); |
| } |
| |
| // Handle any following affixes. We already wrote the 0th one. |
| if (node->affix_indices.size() > first_affix_in_supplimentary_list) { |
| for (size_t i = first_affix_in_supplimentary_list; |
| i < node->affix_indices.size() && i < BDict::MAX_AFFIXES_PER_WORD; |
| i++) { |
| output->push_back(static_cast<char>(node->affix_indices[i] & 0xFF)); |
| output->push_back( |
| static_cast<char>((node->affix_indices[i] >> 8) & 0xFF)); |
| } |
| |
| // Terminator for affix list. We use 0xFFFF. |
| output->push_back(static_cast<unsigned char>(0xFF)); |
| output->push_back(static_cast<unsigned char>(0xFF)); |
| } |
| } |
| |
| // Serializes the given node when it is DicNode::LIST* to the output. |
| void SerializeList(const DicNode* node, std::string* output) { |
| bool is_8_bit = node->storage == DicNode::LIST8; |
| unsigned char id_byte = BDict::LIST_NODE_TYPE_VALUE | |
| (is_8_bit ? 0 : BDict::LIST_NODE_16BIT_VALUE); |
| id_byte |= node->children.size(); // We assume the size is < 4 bits. |
| output->push_back(id_byte); |
| |
| // Reserve enough room for the lookup table (either 2 or 3 bytes per entry). |
| int bytes_per_entry = (is_8_bit ? 2 : 3); |
| size_t table_begin = output->size(); |
| output->resize(output->size() + node->children.size() * bytes_per_entry); |
| size_t children_begin = output->size(); |
| |
| for (size_t i = 0; i < node->children.size(); i++) { |
| // First is the character this entry represents. |
| (*output)[table_begin + i * bytes_per_entry] = node->children[i]->addition; |
| |
| // Next is the 8- or 16-bit offset. |
| size_t offset = output->size() - children_begin; |
| if (is_8_bit) { |
| DCHECK(offset <= 0xFF); |
| (*output)[table_begin + i * bytes_per_entry + 1] = |
| static_cast<char>(offset & 0xFF); |
| } else { |
| unsigned short* output16 = reinterpret_cast<unsigned short*>( |
| &(*output)[table_begin + i * bytes_per_entry + 1]); |
| *output16 = static_cast<unsigned short>(offset); |
| } |
| |
| // Now append the children's data. |
| SerializeTrie(node->children[i], output); |
| } |
| } |
| |
| // Serializes the given node when it is DicNode::LOOKUP* to the output. |
| void SerializeLookup(const DicNode* node, std::string* output) { |
| unsigned char id_byte = BDict::LOOKUP_NODE_TYPE_VALUE; |
| |
| bool has_0th_item; |
| int first_table_item, table_item_count; |
| ComputeLookupStrategyDetails(node->children, &has_0th_item, |
| &first_table_item, &table_item_count); |
| |
| // Set the extra bits in the ID byte. |
| bool is_32_bit = (node->storage == DicNode::LOOKUP32); |
| if (is_32_bit) |
| id_byte |= BDict::LOOKUP_NODE_32BIT_VALUE; |
| if (has_0th_item) |
| id_byte |= BDict::LOOKUP_NODE_0TH_VALUE; |
| |
| size_t begin_offset = output->size(); |
| |
| output->push_back(id_byte); |
| output->push_back(static_cast<char>(first_table_item)); |
| output->push_back(static_cast<char>(table_item_count)); |
| |
| // Save room for the lookup table and the optional 0th item. |
| int bytes_per_entry = (is_32_bit ? 4 : 2); |
| size_t zeroth_item_offset = output->size(); |
| if (has_0th_item) |
| output->resize(output->size() + bytes_per_entry); |
| size_t table_begin = output->size(); |
| output->resize(output->size() + table_item_count * bytes_per_entry); |
| |
| // Append the children. |
| for (size_t i = 0; i < node->children.size(); i++) { |
| size_t offset = output->size(); |
| |
| // Compute the location at which we'll store the offset of the child data. |
| // We may be writing the magic 0th item. |
| size_t offset_offset; |
| if (i == 0 && has_0th_item) { |
| offset_offset = zeroth_item_offset; |
| } else { |
| int table_index = static_cast<unsigned char>(node->children[i]->addition) - first_table_item; |
| offset_offset = table_begin + table_index * bytes_per_entry; |
| } |
| |
| // Write the offset. |
| if (is_32_bit) { |
| // Use 32-bit absolute offsets. |
| // FIXME(brettw) use bit cast. |
| unsigned* offset32 = reinterpret_cast<unsigned*>(&(*output)[offset_offset]); |
| *offset32 = static_cast<unsigned>(output->size()); |
| } else { |
| // Use 16-bit relative offsets. |
| unsigned short* offset16 = reinterpret_cast<unsigned short*>(&(*output)[offset_offset]); |
| *offset16 = static_cast<unsigned short>(output->size() - begin_offset); |
| } |
| |
| SerializeTrie(node->children[i], output); |
| } |
| } |
| |
| // Recursively serializes this node and all of its children to the output. |
| void SerializeTrie(const DicNode* node, std::string* output) { |
| if (node->storage == DicNode::LEAF || |
| node->storage == DicNode::LEAFMORE) { |
| SerializeLeaf(node, output); |
| } else if (node->storage == DicNode::LIST8 || |
| node->storage == DicNode::LIST16) { |
| SerializeList(node, output); |
| } else if (node->storage == DicNode::LOOKUP16 || |
| node->storage == DicNode::LOOKUP32) { |
| SerializeLookup(node, output); |
| } |
| } |
| |
| // Serializes the given list of strings with 0 bytes separating them. The end |
| // will be marked by a double-0. |
| void SerializeStringListNullTerm(const std::vector<std::string>& strings, |
| std::string* output) { |
| for (size_t i = 0; i < strings.size(); i++) { |
| // Can't tolerate empty strings since the'll mark the end. |
| if (strings[i].empty()) |
| output->push_back(' '); |
| else |
| output->append(strings[i]); |
| output->push_back(0); |
| } |
| output->push_back(0); |
| } |
| |
| void SerializeReplacements( |
| const std::vector< std::pair<std::string, std::string> >& repl, |
| std::string* output) { |
| for (size_t i = 0; i < repl.size(); i++) { |
| output->append(repl[i].first); |
| output->push_back(0); |
| output->append(repl[i].second); |
| output->push_back(0); |
| } |
| output->push_back(0); |
| } |
| |
| } // namespace |
| |
| BDictWriter::BDictWriter() : trie_root_(NULL) { |
| } |
| |
| BDictWriter::~BDictWriter() { |
| delete trie_root_; |
| } |
| |
| void BDictWriter::SetWords(const WordList& words) { |
| trie_root_ = new DicNode; |
| BuildTrie(words, 0, words.size(), 0, trie_root_); |
| } |
| |
| std::string BDictWriter::GetBDict() const { |
| std::string ret; |
| |
| // Save room for the header. This will be populated at the end. |
| ret.resize(sizeof(hunspell::BDict::Header)); |
| |
| // Serialize the affix portion. |
| size_t aff_offset = ret.size(); |
| SerializeAff(&ret); |
| |
| // Serialize the dictionary words. |
| size_t dic_offset = ret.size(); |
| ret.reserve(ret.size() + ComputeTrieStorage(trie_root_)); |
| SerializeTrie(trie_root_, &ret); |
| |
| // Fill the header last, now that we have the data. |
| hunspell::BDict::Header* header = |
| reinterpret_cast<hunspell::BDict::Header*>(&ret[0]); |
| header->signature = hunspell::BDict::SIGNATURE; |
| header->major_version = hunspell::BDict::MAJOR_VERSION; |
| header->minor_version = hunspell::BDict::MINOR_VERSION; |
| header->aff_offset = static_cast<uint32>(aff_offset); |
| header->dic_offset = static_cast<uint32>(dic_offset); |
| |
| // Write the MD5 digest of the affix information and the dictionary words at |
| // the end of the BDic header. |
| if (header->major_version >= 2) |
| base::MD5Sum(&ret[aff_offset], ret.size() - aff_offset, &header->digest); |
| |
| return ret; |
| } |
| |
| void BDictWriter::SerializeAff(std::string* output) const { |
| // Reserve enough room for the header. |
| size_t header_offset = output->size(); |
| output->resize(output->size() + sizeof(hunspell::BDict::AffHeader)); |
| |
| // Write the comment. |
| output->push_back('\n'); |
| output->append(comment_); |
| output->push_back('\n'); |
| |
| // We need a magic first AF line that lists the number of following ones. |
| size_t affix_group_offset = output->size(); |
| output->append(base::StringPrintf("AF %d", |
| static_cast<int>(affix_groups_.size()))); |
| output->push_back(0); |
| SerializeStringListNullTerm(affix_groups_, output); |
| |
| size_t affix_rule_offset = output->size(); |
| SerializeStringListNullTerm(affix_rules_, output); |
| |
| size_t rep_offset = output->size(); |
| SerializeReplacements(replacements_, output); |
| |
| size_t other_offset = output->size(); |
| SerializeStringListNullTerm(other_commands_, output); |
| |
| // Add the header now that we know the offsets. |
| hunspell::BDict::AffHeader* header = |
| reinterpret_cast<hunspell::BDict::AffHeader*>(&(*output)[header_offset]); |
| header->affix_group_offset = static_cast<uint32>(affix_group_offset); |
| header->affix_rule_offset = static_cast<uint32>(affix_rule_offset); |
| header->rep_offset = static_cast<uint32>(rep_offset); |
| header->other_offset = static_cast<uint32>(other_offset); |
| } |
| |
| } // namespace hunspell |