third_party/hunspell/google/bdict_writer.cc - chromium/src - Git at Google

 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "third_party/hunspell/google/bdict_writer.h"

 #include "base/logging.h"
 #include "base/strings/stringprintf.h"
 #include "third_party/hunspell/google/bdict.h"

 namespace hunspell {

 // Represents one node the word trie in memory. This does not have to be very
 // efficient since it is only used when building.
 class DicNode {
  public:
   enum StorageType {
     UNDEFINED,  // Uninitialized storage type.
     LEAF,       // Word with no additional string data.
     LEAFMORE,   // Word with additional suffix following.
     LIST16,     // List of sub-nodes with 16-bit relative offsets.
     LIST8,      // List of sub-nodes with 8-bit relative offsets.
     LOOKUP32,   // Lookup table with 32-bit absolute offsets.
     LOOKUP16,   // LOokup table with 16-bit relative offsets.
   };

   DicNode() : addition(0), storage(UNDEFINED) {
   }

   ~DicNode() {
     for (size_t i = 0; i < children.size(); i++)
       delete children[i];
   }

   bool is_leaf() const { return children.empty(); }

   // When non-zero, this character is the additional level that this
   // node represents. This will be 0 for some leaf nodes when there is no
   // addition and for the root node.
   char addition;

   std::vector<DicNode*> children;

   // When there are no children, this is a leaf node and this "addition string"
   // is appended to the result. When there are children, this will be empty.
   std::string leaf_addition;

   // For leaf nodes, this are the indices into the affix table.
   std::vector<int> affix_indices;

   // Initially uninitialized, ComputeStorage() will fill this in with the
   // desired serialization method.
   StorageType storage;
 };

 namespace {

 void SerializeTrie(const DicNode* node, std::string* output);

 // Returns true if the nth character in the given word is |ch|. Will return
 // false when there is no nth character. Note that this will also match an
 // implicit NULL at the end of the string.
 bool NthCharacterIs(const std::string& word, size_t n, char ch) {
   if (word.length() < n)  // Want to allow n == length() to catch the NULL.
     return false;
   return word.c_str()[n] == ch;  // Use c_str() to get NULL terminator.
 }

 // Recursively build the trie data structure for the range in the |words| list
 // in [begin, end). It is assumed that all words in that range will have the
 // same |node_depth - 2| characters at the beginning. This node will key off of
 // the |node_depth - 1| character, with a special case for the root.
 //
 // |prefix_chars| is how deep this node is in the trie (and corresponds to how
 // many letters of the word we will skip). The root level will have
 // |prefix_chars| of 0.
 //
 // The given |node| will be filled with the data. The return value is the
 // index into the |words| vector of the next word to process. It will be
 // equal to |end| when all words have been consumed.
 size_t BuildTrie(const BDictWriter::WordList& words,
                  size_t begin, size_t end,
                  size_t node_depth, DicNode* node) {
   // Find the prefix that this node represents.
   const std::string& begin_str = words[begin].first;
   if (begin_str.length() < node_depth) {
     // Singleton.
     node->addition = 0;
     node->affix_indices = words[begin].second;
     return begin + 1;
   }

   // Now find the range of words sharing this prefix.
   size_t match_count;
   if (node_depth == 0 && begin == 0) {
     // Special case the root node.
     match_count = end - begin;
     node->addition = 0;
   } else {
     match_count = 0;
     node->addition = begin_str[node_depth - 1];
     // We know the strings should have [0, node_depth-1) characters at the
     // beginning already matching, so we only need to check the new one.
     while (begin + match_count < end &&
            NthCharacterIs(words[begin + match_count].first,
                           node_depth - 1, node->addition))
       match_count++;
   }

   if (match_count == 1) {
     // Just found a leaf node with no other words sharing its prefix. Save any
     // remaining characters and we're done.
     node->affix_indices = words[begin].second;
     node->leaf_addition = begin_str.substr(node_depth);
     return begin + 1;
   }

   // We have a range of words, add them as children of this node.
   size_t i = begin;
   while (i < begin + match_count) {
     DicNode* cur = new DicNode;
     i = BuildTrie(words, i, begin + match_count, node_depth + 1, cur);
     node->children.push_back(cur);
   }

   return begin + match_count;
 }

 // Lookup tables are complicated. They can have a magic 0th entry not counted
 // in the table dimensions, and also have indices only for the used sub-range.
 // This function will compute the starting point and size of a lookup table,
 // in addition to whether it should have the magic 0th entry for the given
 // list of child nodes.
 void ComputeLookupStrategyDetails(const std::vector<DicNode*>& children,
                                   bool* has_0th_entry,
                                   int* first_item,
                                   int* list_size) {
   *has_0th_entry = false;
   *first_item = 0;
   *list_size = 0;
   if (children.empty())
     return;

   size_t first_offset = 0;
   if (children[0]->addition == 0) {
     *has_0th_entry = true;
     first_offset++;
   }

   if (children.size() == first_offset)
     return;

   *first_item = static_cast<unsigned char>(children[first_offset]->addition);
   unsigned char last_item = children[children.size() - 1]->addition;
   *list_size = last_item - *first_item + 1;
 }

 // Recursively fills in the storage strategy for this node and each of its
 // children. This must be done before actually serializing because the storage
 // mode will depend on the size of the children.
 size_t ComputeTrieStorage(DicNode* node) {
   if (node->is_leaf()) {
     // The additional affix list holds affixes when there is more than one. Each
     // entry is two bytes, plus an additional FFFF terminator.
     size_t supplimentary_size = 0;
     if (node->affix_indices[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
       // We cannot store the first affix ID of the affix list into a leaf node.
       // In this case, we have to store all the affix IDs and a terminator
       // into a supplimentary list.
       supplimentary_size = node->affix_indices.size() * 2 + 2;
     } else if (node->affix_indices.size() > 1) {
       // We can store the first affix ID of the affix list into a leaf node.
       // In this case, we need to store the remaining affix IDs and a
       // terminator into a supplimentary list.
       supplimentary_size = node->affix_indices.size() * 2;
     }

     if (node->leaf_addition.empty()) {
       node->storage = DicNode::LEAF;
       return 2 + supplimentary_size;
     }
     node->storage = DicNode::LEAFMORE;
     // Signature & affix (2) + null for leaf_addition (1) = 3
     return 3 + node->leaf_addition.size() + supplimentary_size;
   }

   // Recursively compute the size of the children for non-leaf nodes.
   size_t child_size = 0;
   for (size_t i = 0; i < node->children.size(); i++)
     child_size += ComputeTrieStorage(node->children[i]);

   // Fixed size is only 1 byte which is the ID byte and the count combined.
   static const int kListHeaderSize = 1;

   // Lists can only store up to 16 items.
   static const size_t kListThreshold = 16;
   if (node->children.size() < kListThreshold && child_size <= 0xFF) {
     node->storage = DicNode::LIST8;
     return kListHeaderSize + node->children.size() * 2 + child_size;
   }

   if (node->children.size() < kListThreshold && child_size <= 0xFFFF) {
     node->storage = DicNode::LIST16;
     // Fixed size is one byte plus 3 for each table entry.
     return kListHeaderSize + node->children.size() * 3 + child_size;
   }

   static const int kTableHeaderSize = 2;  // Type + table size.

   bool has_0th_item;
   int first_table_item, table_item_count;
   ComputeLookupStrategyDetails(node->children, &has_0th_item,
                                &first_table_item, &table_item_count);
   if (child_size + kTableHeaderSize + (has_0th_item ? 2 : 0) +
       table_item_count * 2 < 0xFFFF) {
     // Use 16-bit addressing since the children will fit.
     node->storage = DicNode::LOOKUP16;
     return kTableHeaderSize + (has_0th_item ? 2 : 0) + table_item_count * 2 +
         child_size;
   }

   // Use 32-bit addressing as a last resort.
   node->storage = DicNode::LOOKUP32;
   return kTableHeaderSize + (has_0th_item ? 4 : 0) + table_item_count * 4 +
       child_size;
 }

 // Serializes the given node when it is DicNode::LEAF* to the output.
 void SerializeLeaf(const DicNode* node, std::string* output) {
   // The low 6 bits of the ID byte are the high 6 bits of the first affix ID.
   int first_affix = node->affix_indices.size() ? node->affix_indices[0] : 0;

   // We may store the first value with the node or in the supplimentary list.
   size_t first_affix_in_supplimentary_list = 1;
   if (first_affix > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
     // There are not enough bits for this value, move it to the supplimentary
     // list where there are more bits per value.
     first_affix_in_supplimentary_list = 0;
     first_affix = BDict::FIRST_AFFIX_IS_UNUSED;
   }

   unsigned char id_byte = (first_affix >> 8) &
       BDict::LEAF_NODE_FIRST_BYTE_AFFIX_MASK;

   // The next two bits indicates an additional string and more affixes.
   if (node->storage == DicNode::LEAFMORE)
     id_byte |= BDict::LEAF_NODE_ADDITIONAL_VALUE;
   if (node->affix_indices.size() > 1 || first_affix_in_supplimentary_list == 0)
     id_byte |= BDict::LEAF_NODE_FOLLOWING_VALUE;
   output->push_back(id_byte);

   // Following is the low 8 bits of the affix index.
   output->push_back(first_affix & 0xff);

   // Handle the optional addition with NULL terminator.
   if (node->storage == DicNode::LEAFMORE) {
     for (size_t i = 0; i < node->leaf_addition.size() + 1; i++)
       output->push_back(node->leaf_addition.c_str()[i]);
   }

   // Handle any following affixes. We already wrote the 0th one.
   if (node->affix_indices.size() > first_affix_in_supplimentary_list) {
     for (size_t i = first_affix_in_supplimentary_list;
          i < node->affix_indices.size() && i < BDict::MAX_AFFIXES_PER_WORD;
          i++) {
       output->push_back(static_cast<char>(node->affix_indices[i] & 0xFF));
       output->push_back(
           static_cast<char>((node->affix_indices[i] >> 8) & 0xFF));
     }

     // Terminator for affix list. We use 0xFFFF.
     output->push_back(static_cast<unsigned char>(0xFF));
     output->push_back(static_cast<unsigned char>(0xFF));
   }
 }

 // Serializes the given node when it is DicNode::LIST* to the output.
 void SerializeList(const DicNode* node, std::string* output) {
   bool is_8_bit = node->storage == DicNode::LIST8;
   unsigned char id_byte = BDict::LIST_NODE_TYPE_VALUE |
       (is_8_bit ? 0 : BDict::LIST_NODE_16BIT_VALUE);
   id_byte |= node->children.size();  // We assume the size is < 4 bits.
   output->push_back(id_byte);

   // Reserve enough room for the lookup table (either 2 or 3 bytes per entry).
   int bytes_per_entry = (is_8_bit ? 2 : 3);
   size_t table_begin = output->size();
   output->resize(output->size() + node->children.size() * bytes_per_entry);
   size_t children_begin = output->size();

   for (size_t i = 0; i < node->children.size(); i++) {
     // First is the character this entry represents.
     (*output)[table_begin + i * bytes_per_entry] = node->children[i]->addition;

     // Next is the 8- or 16-bit offset.
     size_t offset = output->size() - children_begin;
     if (is_8_bit) {
       DCHECK(offset <= 0xFF);
       (*output)[table_begin + i * bytes_per_entry + 1] =
           static_cast<char>(offset & 0xFF);
     } else {
       unsigned short* output16 = reinterpret_cast<unsigned short*>(
           &(*output)[table_begin + i * bytes_per_entry + 1]);
       *output16 = static_cast<unsigned short>(offset);
     }

     // Now append the children's data.
     SerializeTrie(node->children[i], output);
   }
 }

 // Serializes the given node when it is DicNode::LOOKUP* to the output.
 void SerializeLookup(const DicNode* node, std::string* output) {
   unsigned char id_byte = BDict::LOOKUP_NODE_TYPE_VALUE;

   bool has_0th_item;
   int first_table_item, table_item_count;
   ComputeLookupStrategyDetails(node->children, &has_0th_item,
                                &first_table_item, &table_item_count);

   // Set the extra bits in the ID byte.
   bool is_32_bit = (node->storage == DicNode::LOOKUP32);
   if (is_32_bit)
     id_byte |= BDict::LOOKUP_NODE_32BIT_VALUE;
   if (has_0th_item)
     id_byte |= BDict::LOOKUP_NODE_0TH_VALUE;

   size_t begin_offset = output->size();

   output->push_back(id_byte);
   output->push_back(static_cast<char>(first_table_item));
   output->push_back(static_cast<char>(table_item_count));

   // Save room for the lookup table and the optional 0th item.
   int bytes_per_entry = (is_32_bit ? 4 : 2);
   size_t zeroth_item_offset = output->size();
   if (has_0th_item)
     output->resize(output->size() + bytes_per_entry);
   size_t table_begin = output->size();
   output->resize(output->size() + table_item_count * bytes_per_entry);

   // Append the children.
   for (size_t i = 0; i < node->children.size(); i++) {
     size_t offset = output->size();

     // Compute the location at which we'll store the offset of the child data.
     // We may be writing the magic 0th item.
     size_t offset_offset;
     if (i == 0 && has_0th_item) {
       offset_offset = zeroth_item_offset;
     } else {
       int table_index = static_cast<unsigned char>(node->children[i]->addition) - first_table_item;
       offset_offset = table_begin + table_index * bytes_per_entry;
     }

     // Write the offset.
     if (is_32_bit) {
       // Use 32-bit absolute offsets.
       // FIXME(brettw) use bit cast.
       unsigned* offset32 = reinterpret_cast<unsigned*>(&(*output)[offset_offset]);
       *offset32 = static_cast<unsigned>(output->size());
     } else {
       // Use 16-bit relative offsets.
       unsigned short* offset16 = reinterpret_cast<unsigned short*>(&(*output)[offset_offset]);
       *offset16 = static_cast<unsigned short>(output->size() - begin_offset);
     }

     SerializeTrie(node->children[i], output);
   }
 }

 // Recursively serializes this node and all of its children to the output.
 void SerializeTrie(const DicNode* node, std::string* output) {
   if (node->storage == DicNode::LEAF ||
       node->storage == DicNode::LEAFMORE) {
     SerializeLeaf(node, output);
   } else if (node->storage == DicNode::LIST8 ||
              node->storage == DicNode::LIST16) {
     SerializeList(node, output);
   } else if (node->storage == DicNode::LOOKUP16 ||
              node->storage == DicNode::LOOKUP32) {
     SerializeLookup(node, output);
   }
 }

 // Serializes the given list of strings with 0 bytes separating them. The end
 // will be marked by a double-0.
 void SerializeStringListNullTerm(const std::vector<std::string>& strings,
                                  std::string* output) {
   for (size_t i = 0; i < strings.size(); i++) {
     // Can't tolerate empty strings since the'll mark the end.
     if (strings[i].empty())
       output->push_back(' ');
     else
       output->append(strings[i]);
     output->push_back(0);
   }
   output->push_back(0);
 }

 void SerializeReplacements(
     const std::vector< std::pair<std::string, std::string> >& repl,
     std::string* output) {
   for (size_t i = 0; i < repl.size(); i++) {
     output->append(repl[i].first);
     output->push_back(0);
     output->append(repl[i].second);
     output->push_back(0);
   }
   output->push_back(0);
 }

 }  // namespace

 BDictWriter::BDictWriter() : trie_root_(NULL) {
 }

 BDictWriter::~BDictWriter() {
   delete trie_root_;
 }

 void BDictWriter::SetWords(const WordList& words) {
   trie_root_ = new DicNode;
   BuildTrie(words, 0, words.size(), 0, trie_root_);
 }

 std::string BDictWriter::GetBDict() const {
   std::string ret;

   // Save room for the header. This will be populated at the end.
   ret.resize(sizeof(hunspell::BDict::Header));

   // Serialize the affix portion.
   size_t aff_offset = ret.size();
   SerializeAff(&ret);

   // Serialize the dictionary words.
   size_t dic_offset = ret.size();
   ret.reserve(ret.size() + ComputeTrieStorage(trie_root_));
   SerializeTrie(trie_root_, &ret);

   // Fill the header last, now that we have the data.
   hunspell::BDict::Header* header =
       reinterpret_cast<hunspell::BDict::Header*>(&ret[0]);
   header->signature = hunspell::BDict::SIGNATURE;
   header->major_version = hunspell::BDict::MAJOR_VERSION;
   header->minor_version = hunspell::BDict::MINOR_VERSION;
   header->aff_offset = static_cast<uint32>(aff_offset);
   header->dic_offset = static_cast<uint32>(dic_offset);

   // Write the MD5 digest of the affix information and the dictionary words at
   // the end of the BDic header.
   if (header->major_version >= 2)
     base::MD5Sum(&ret[aff_offset], ret.size() - aff_offset, &header->digest);

   return ret;
 }

 void BDictWriter::SerializeAff(std::string* output) const {
   // Reserve enough room for the header.
   size_t header_offset = output->size();
   output->resize(output->size() + sizeof(hunspell::BDict::AffHeader));

   // Write the comment.
   output->push_back('\n');
   output->append(comment_);
   output->push_back('\n');

   // We need a magic first AF line that lists the number of following ones.
   size_t affix_group_offset = output->size();
   output->append(base::StringPrintf("AF %d",
                                     static_cast<int>(affix_groups_.size())));
   output->push_back(0);
   SerializeStringListNullTerm(affix_groups_, output);

   size_t affix_rule_offset = output->size();
   SerializeStringListNullTerm(affix_rules_, output);

   size_t rep_offset = output->size();
   SerializeReplacements(replacements_, output);

   size_t other_offset = output->size();
   SerializeStringListNullTerm(other_commands_, output);

   // Add the header now that we know the offsets.
   hunspell::BDict::AffHeader* header =
       reinterpret_cast<hunspell::BDict::AffHeader*>(&(*output)[header_offset]);
   header->affix_group_offset = static_cast<uint32>(affix_group_offset);
   header->affix_rule_offset = static_cast<uint32>(affix_rule_offset);
   header->rep_offset = static_cast<uint32>(rep_offset);
   header->other_offset = static_cast<uint32>(other_offset);
 }

 }  // namespace hunspell
	// Copyright (c) 2011 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "third_party/hunspell/google/bdict_writer.h"

	#include "base/logging.h"
	#include "base/strings/stringprintf.h"
	#include "third_party/hunspell/google/bdict.h"

	namespace hunspell {

	// Represents one node the word trie in memory. This does not have to be very
	// efficient since it is only used when building.
	class DicNode {
	public:
	enum StorageType {
	UNDEFINED, // Uninitialized storage type.
	LEAF, // Word with no additional string data.
	LEAFMORE, // Word with additional suffix following.
	LIST16, // List of sub-nodes with 16-bit relative offsets.
	LIST8, // List of sub-nodes with 8-bit relative offsets.
	LOOKUP32, // Lookup table with 32-bit absolute offsets.
	LOOKUP16, // LOokup table with 16-bit relative offsets.
	};

	DicNode() : addition(0), storage(UNDEFINED) {
	}

	~DicNode() {
	for (size_t i = 0; i < children.size(); i++)
	delete children[i];
	}

	bool is_leaf() const { return children.empty(); }

	// When non-zero, this character is the additional level that this
	// node represents. This will be 0 for some leaf nodes when there is no
	// addition and for the root node.
	char addition;

	std::vector<DicNode*> children;

	// When there are no children, this is a leaf node and this "addition string"
	// is appended to the result. When there are children, this will be empty.
	std::string leaf_addition;

	// For leaf nodes, this are the indices into the affix table.
	std::vector<int> affix_indices;

	// Initially uninitialized, ComputeStorage() will fill this in with the
	// desired serialization method.
	StorageType storage;
	};

	namespace {

	void SerializeTrie(const DicNode* node, std::string* output);

	// Returns true if the nth character in the given word is \|ch\|. Will return
	// false when there is no nth character. Note that this will also match an
	// implicit NULL at the end of the string.
	bool NthCharacterIs(const std::string& word, size_t n, char ch) {
	if (word.length() < n) // Want to allow n == length() to catch the NULL.
	return false;
	return word.c_str()[n] == ch; // Use c_str() to get NULL terminator.
	}

	// Recursively build the trie data structure for the range in the \|words\| list
	// in [begin, end). It is assumed that all words in that range will have the
	// same \|node_depth - 2\| characters at the beginning. This node will key off of
	// the \|node_depth - 1\| character, with a special case for the root.
	//
	// \|prefix_chars\| is how deep this node is in the trie (and corresponds to how
	// many letters of the word we will skip). The root level will have
	// \|prefix_chars\| of 0.
	//
	// The given \|node\| will be filled with the data. The return value is the
	// index into the \|words\| vector of the next word to process. It will be
	// equal to \|end\| when all words have been consumed.
	size_t BuildTrie(const BDictWriter::WordList& words,
	size_t begin, size_t end,
	size_t node_depth, DicNode* node) {
	// Find the prefix that this node represents.
	const std::string& begin_str = words[begin].first;
	if (begin_str.length() < node_depth) {
	// Singleton.
	node->addition = 0;
	node->affix_indices = words[begin].second;
	return begin + 1;
	}

	// Now find the range of words sharing this prefix.
	size_t match_count;
	if (node_depth == 0 && begin == 0) {
	// Special case the root node.
	match_count = end - begin;
	node->addition = 0;
	} else {
	match_count = 0;
	node->addition = begin_str[node_depth - 1];
	// We know the strings should have [0, node_depth-1) characters at the
	// beginning already matching, so we only need to check the new one.
	while (begin + match_count < end &&
	NthCharacterIs(words[begin + match_count].first,
	node_depth - 1, node->addition))
	match_count++;
	}

	if (match_count == 1) {
	// Just found a leaf node with no other words sharing its prefix. Save any
	// remaining characters and we're done.
	node->affix_indices = words[begin].second;
	node->leaf_addition = begin_str.substr(node_depth);
	return begin + 1;
	}

	// We have a range of words, add them as children of this node.
	size_t i = begin;
	while (i < begin + match_count) {
	DicNode* cur = new DicNode;
	i = BuildTrie(words, i, begin + match_count, node_depth + 1, cur);
	node->children.push_back(cur);
	}

	return begin + match_count;
	}

	// Lookup tables are complicated. They can have a magic 0th entry not counted
	// in the table dimensions, and also have indices only for the used sub-range.
	// This function will compute the starting point and size of a lookup table,
	// in addition to whether it should have the magic 0th entry for the given
	// list of child nodes.
	void ComputeLookupStrategyDetails(const std::vector<DicNode*>& children,
	bool* has_0th_entry,
	int* first_item,
	int* list_size) {
	*has_0th_entry = false;
	*first_item = 0;
	*list_size = 0;
	if (children.empty())
	return;

	size_t first_offset = 0;
	if (children[0]->addition == 0) {
	*has_0th_entry = true;
	first_offset++;
	}

	if (children.size() == first_offset)
	return;

	*first_item = static_cast<unsigned char>(children[first_offset]->addition);
	unsigned char last_item = children[children.size() - 1]->addition;
	list_size = last_item - first_item + 1;
	}

	// Recursively fills in the storage strategy for this node and each of its
	// children. This must be done before actually serializing because the storage
	// mode will depend on the size of the children.
	size_t ComputeTrieStorage(DicNode* node) {
	if (node->is_leaf()) {
	// The additional affix list holds affixes when there is more than one. Each
	// entry is two bytes, plus an additional FFFF terminator.
	size_t supplimentary_size = 0;
	if (node->affix_indices[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
	// We cannot store the first affix ID of the affix list into a leaf node.
	// In this case, we have to store all the affix IDs and a terminator
	// into a supplimentary list.
	supplimentary_size = node->affix_indices.size() * 2 + 2;
	} else if (node->affix_indices.size() > 1) {
	// We can store the first affix ID of the affix list into a leaf node.
	// In this case, we need to store the remaining affix IDs and a
	// terminator into a supplimentary list.
	supplimentary_size = node->affix_indices.size() * 2;
	}

	if (node->leaf_addition.empty()) {
	node->storage = DicNode::LEAF;
	return 2 + supplimentary_size;
	}
	node->storage = DicNode::LEAFMORE;
	// Signature & affix (2) + null for leaf_addition (1) = 3
	return 3 + node->leaf_addition.size() + supplimentary_size;
	}

	// Recursively compute the size of the children for non-leaf nodes.
	size_t child_size = 0;
	for (size_t i = 0; i < node->children.size(); i++)
	child_size += ComputeTrieStorage(node->children[i]);

	// Fixed size is only 1 byte which is the ID byte and the count combined.
	static const int kListHeaderSize = 1;

	// Lists can only store up to 16 items.
	static const size_t kListThreshold = 16;
	if (node->children.size() < kListThreshold && child_size <= 0xFF) {
	node->storage = DicNode::LIST8;
	return kListHeaderSize + node->children.size() * 2 + child_size;
	}

	if (node->children.size() < kListThreshold && child_size <= 0xFFFF) {
	node->storage = DicNode::LIST16;
	// Fixed size is one byte plus 3 for each table entry.
	return kListHeaderSize + node->children.size() * 3 + child_size;
	}

	static const int kTableHeaderSize = 2; // Type + table size.

	bool has_0th_item;
	int first_table_item, table_item_count;
	ComputeLookupStrategyDetails(node->children, &has_0th_item,
	&first_table_item, &table_item_count);
	if (child_size + kTableHeaderSize + (has_0th_item ? 2 : 0) +
	table_item_count * 2 < 0xFFFF) {
	// Use 16-bit addressing since the children will fit.
	node->storage = DicNode::LOOKUP16;
	return kTableHeaderSize + (has_0th_item ? 2 : 0) + table_item_count * 2 +
	child_size;
	}

	// Use 32-bit addressing as a last resort.
	node->storage = DicNode::LOOKUP32;
	return kTableHeaderSize + (has_0th_item ? 4 : 0) + table_item_count * 4 +
	child_size;
	}

	// Serializes the given node when it is DicNode::LEAF* to the output.
	void SerializeLeaf(const DicNode* node, std::string* output) {
	// The low 6 bits of the ID byte are the high 6 bits of the first affix ID.
	int first_affix = node->affix_indices.size() ? node->affix_indices[0] : 0;

	// We may store the first value with the node or in the supplimentary list.
	size_t first_affix_in_supplimentary_list = 1;
	if (first_affix > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
	// There are not enough bits for this value, move it to the supplimentary
	// list where there are more bits per value.
	first_affix_in_supplimentary_list = 0;
	first_affix = BDict::FIRST_AFFIX_IS_UNUSED;
	}

	unsigned char id_byte = (first_affix >> 8) &
	BDict::LEAF_NODE_FIRST_BYTE_AFFIX_MASK;

	// The next two bits indicates an additional string and more affixes.
	if (node->storage == DicNode::LEAFMORE)
	id_byte \|= BDict::LEAF_NODE_ADDITIONAL_VALUE;
	if (node->affix_indices.size() > 1 \|\| first_affix_in_supplimentary_list == 0)
	id_byte \|= BDict::LEAF_NODE_FOLLOWING_VALUE;
	output->push_back(id_byte);

	// Following is the low 8 bits of the affix index.
	output->push_back(first_affix & 0xff);

	// Handle the optional addition with NULL terminator.
	if (node->storage == DicNode::LEAFMORE) {
	for (size_t i = 0; i < node->leaf_addition.size() + 1; i++)
	output->push_back(node->leaf_addition.c_str()[i]);
	}

	// Handle any following affixes. We already wrote the 0th one.
	if (node->affix_indices.size() > first_affix_in_supplimentary_list) {
	for (size_t i = first_affix_in_supplimentary_list;
	i < node->affix_indices.size() && i < BDict::MAX_AFFIXES_PER_WORD;
	i++) {
	output->push_back(static_cast<char>(node->affix_indices[i] & 0xFF));
	output->push_back(
	static_cast<char>((node->affix_indices[i] >> 8) & 0xFF));
	}

	// Terminator for affix list. We use 0xFFFF.
	output->push_back(static_cast<unsigned char>(0xFF));
	output->push_back(static_cast<unsigned char>(0xFF));
	}
	}

	// Serializes the given node when it is DicNode::LIST* to the output.
	void SerializeList(const DicNode* node, std::string* output) {
	bool is_8_bit = node->storage == DicNode::LIST8;
	unsigned char id_byte = BDict::LIST_NODE_TYPE_VALUE \|
	(is_8_bit ? 0 : BDict::LIST_NODE_16BIT_VALUE);
	id_byte \|= node->children.size(); // We assume the size is < 4 bits.
	output->push_back(id_byte);

	// Reserve enough room for the lookup table (either 2 or 3 bytes per entry).
	int bytes_per_entry = (is_8_bit ? 2 : 3);
	size_t table_begin = output->size();
	output->resize(output->size() + node->children.size() * bytes_per_entry);
	size_t children_begin = output->size();

	for (size_t i = 0; i < node->children.size(); i++) {
	// First is the character this entry represents.
	(output)[table_begin + i bytes_per_entry] = node->children[i]->addition;

	// Next is the 8- or 16-bit offset.
	size_t offset = output->size() - children_begin;
	if (is_8_bit) {
	DCHECK(offset <= 0xFF);
	(output)[table_begin + i bytes_per_entry + 1] =
	static_cast<char>(offset & 0xFF);
	} else {
	unsigned short* output16 = reinterpret_cast<unsigned short*>(
	&(output)[table_begin + i bytes_per_entry + 1]);
	*output16 = static_cast<unsigned short>(offset);
	}

	// Now append the children's data.
	SerializeTrie(node->children[i], output);
	}
	}

	// Serializes the given node when it is DicNode::LOOKUP* to the output.
	void SerializeLookup(const DicNode* node, std::string* output) {
	unsigned char id_byte = BDict::LOOKUP_NODE_TYPE_VALUE;

	bool has_0th_item;
	int first_table_item, table_item_count;
	ComputeLookupStrategyDetails(node->children, &has_0th_item,
	&first_table_item, &table_item_count);

	// Set the extra bits in the ID byte.
	bool is_32_bit = (node->storage == DicNode::LOOKUP32);
	if (is_32_bit)
	id_byte \|= BDict::LOOKUP_NODE_32BIT_VALUE;
	if (has_0th_item)
	id_byte \|= BDict::LOOKUP_NODE_0TH_VALUE;

	size_t begin_offset = output->size();

	output->push_back(id_byte);
	output->push_back(static_cast<char>(first_table_item));
	output->push_back(static_cast<char>(table_item_count));

	// Save room for the lookup table and the optional 0th item.
	int bytes_per_entry = (is_32_bit ? 4 : 2);
	size_t zeroth_item_offset = output->size();
	if (has_0th_item)
	output->resize(output->size() + bytes_per_entry);
	size_t table_begin = output->size();
	output->resize(output->size() + table_item_count * bytes_per_entry);

	// Append the children.
	for (size_t i = 0; i < node->children.size(); i++) {
	size_t offset = output->size();

	// Compute the location at which we'll store the offset of the child data.
	// We may be writing the magic 0th item.
	size_t offset_offset;
	if (i == 0 && has_0th_item) {
	offset_offset = zeroth_item_offset;
	} else {
	int table_index = static_cast<unsigned char>(node->children[i]->addition) - first_table_item;
	offset_offset = table_begin + table_index * bytes_per_entry;
	}

	// Write the offset.
	if (is_32_bit) {
	// Use 32-bit absolute offsets.
	// FIXME(brettw) use bit cast.
	unsigned* offset32 = reinterpret_cast<unsigned>(&(output)[offset_offset]);
	*offset32 = static_cast<unsigned>(output->size());
	} else {
	// Use 16-bit relative offsets.
	unsigned short* offset16 = reinterpret_cast<unsigned short>(&(output)[offset_offset]);
	*offset16 = static_cast<unsigned short>(output->size() - begin_offset);
	}

	SerializeTrie(node->children[i], output);
	}
	}

	// Recursively serializes this node and all of its children to the output.
	void SerializeTrie(const DicNode* node, std::string* output) {
	if (node->storage == DicNode::LEAF \|\|
	node->storage == DicNode::LEAFMORE) {
	SerializeLeaf(node, output);
	} else if (node->storage == DicNode::LIST8 \|\|
	node->storage == DicNode::LIST16) {
	SerializeList(node, output);
	} else if (node->storage == DicNode::LOOKUP16 \|\|
	node->storage == DicNode::LOOKUP32) {
	SerializeLookup(node, output);
	}
	}

	// Serializes the given list of strings with 0 bytes separating them. The end
	// will be marked by a double-0.
	void SerializeStringListNullTerm(const std::vector<std::string>& strings,
	std::string* output) {
	for (size_t i = 0; i < strings.size(); i++) {
	// Can't tolerate empty strings since the'll mark the end.
	if (strings[i].empty())
	output->push_back(' ');
	else
	output->append(strings[i]);
	output->push_back(0);
	}
	output->push_back(0);
	}

	void SerializeReplacements(
	const std::vector< std::pair<std::string, std::string> >& repl,
	std::string* output) {
	for (size_t i = 0; i < repl.size(); i++) {
	output->append(repl[i].first);
	output->push_back(0);
	output->append(repl[i].second);
	output->push_back(0);
	}
	output->push_back(0);
	}

	} // namespace

	BDictWriter::BDictWriter() : trie_root_(NULL) {
	}

	BDictWriter::~BDictWriter() {
	delete trie_root_;
	}

	void BDictWriter::SetWords(const WordList& words) {
	trie_root_ = new DicNode;
	BuildTrie(words, 0, words.size(), 0, trie_root_);
	}

	std::string BDictWriter::GetBDict() const {
	std::string ret;

	// Save room for the header. This will be populated at the end.
	ret.resize(sizeof(hunspell::BDict::Header));

	// Serialize the affix portion.
	size_t aff_offset = ret.size();
	SerializeAff(&ret);

	// Serialize the dictionary words.
	size_t dic_offset = ret.size();
	ret.reserve(ret.size() + ComputeTrieStorage(trie_root_));
	SerializeTrie(trie_root_, &ret);

	// Fill the header last, now that we have the data.
	hunspell::BDict::Header* header =
	reinterpret_cast<hunspell::BDict::Header*>(&ret[0]);
	header->signature = hunspell::BDict::SIGNATURE;
	header->major_version = hunspell::BDict::MAJOR_VERSION;
	header->minor_version = hunspell::BDict::MINOR_VERSION;
	header->aff_offset = static_cast<uint32>(aff_offset);
	header->dic_offset = static_cast<uint32>(dic_offset);

	// Write the MD5 digest of the affix information and the dictionary words at
	// the end of the BDic header.
	if (header->major_version >= 2)
	base::MD5Sum(&ret[aff_offset], ret.size() - aff_offset, &header->digest);

	return ret;
	}

	void BDictWriter::SerializeAff(std::string* output) const {
	// Reserve enough room for the header.
	size_t header_offset = output->size();
	output->resize(output->size() + sizeof(hunspell::BDict::AffHeader));

	// Write the comment.
	output->push_back('\n');
	output->append(comment_);
	output->push_back('\n');

	// We need a magic first AF line that lists the number of following ones.
	size_t affix_group_offset = output->size();
	output->append(base::StringPrintf("AF %d",
	static_cast<int>(affix_groups_.size())));
	output->push_back(0);
	SerializeStringListNullTerm(affix_groups_, output);

	size_t affix_rule_offset = output->size();
	SerializeStringListNullTerm(affix_rules_, output);

	size_t rep_offset = output->size();
	SerializeReplacements(replacements_, output);

	size_t other_offset = output->size();
	SerializeStringListNullTerm(other_commands_, output);

	// Add the header now that we know the offsets.
	hunspell::BDict::AffHeader* header =
	reinterpret_cast<hunspell::BDict::AffHeader>(&(output)[header_offset]);
	header->affix_group_offset = static_cast<uint32>(affix_group_offset);
	header->affix_rule_offset = static_cast<uint32>(affix_rule_offset);
	header->rep_offset = static_cast<uint32>(rep_offset);
	header->other_offset = static_cast<uint32>(other_offset);
	}

	} // namespace hunspell