| // Copyright 2006-2008 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "chrome/tools/convert_dict/dic_reader.h" |
| |
| #include <stddef.h> |
| |
| #include <algorithm> |
| #include <set> |
| |
| #include "base/files/file_util.h" |
| #include "base/strings/string_util.h" |
| #include "chrome/tools/convert_dict/aff_reader.h" |
| #include "chrome/tools/convert_dict/hunspell_reader.h" |
| |
| namespace convert_dict { |
| |
| namespace { |
| |
| // Maps each unique word to the unique affix group IDs associated with it. |
| typedef std::map<std::string, std::set<int> > WordSet; |
| |
| void SplitDicLine(const std::string& line, std::vector<std::string>* output) { |
| // We split the line on a slash not preceded by a backslash. A slash at the |
| // beginning of the line is not a separator either. |
| size_t slash_index = line.size(); |
| for (size_t i = 0; i < line.size(); i++) { |
| if (line[i] == '/' && i > 0 && line[i - 1] != '\\') { |
| slash_index = i; |
| break; |
| } |
| } |
| |
| output->clear(); |
| |
| // Everything before the slash index is the first term. We also need to |
| // convert all escaped slashes ("\/" sequences) to regular slashes. |
| std::string word = line.substr(0, slash_index); |
| base::ReplaceSubstringsAfterOffset(&word, 0, "\\/", "/"); |
| output->push_back(word); |
| |
| // Everything (if anything) after the slash is the second. |
| if (slash_index < line.size() - 1) |
| output->push_back(line.substr(slash_index + 1)); |
| } |
| |
| // This function reads words from a .dic file, or a .dic_delta file. Note that |
| // we read 'all' the words in the file, irrespective of the word count given |
| // in the first non empty line of a .dic file. Also note that, for a .dic_delta |
| // file, the first line actually does _not_ have the number of words. In order |
| // to control this, we use the |file_has_word_count_in_the_first_line| |
| // parameter to tell this method whether the first non empty line in the file |
| // contains the number of words or not. If it does, skip the first line. If it |
| // does not, then the first line contains a word. |
| bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader, |
| const char* file_type, const char* encoding, |
| bool file_has_word_count_in_the_first_line) { |
| int line_number = 0; |
| while (!feof(file)) { |
| std::string line = ReadLine(file); |
| line_number++; |
| StripComment(&line); |
| if (line.empty()) |
| continue; |
| |
| if (file_has_word_count_in_the_first_line) { |
| // Skip the first nonempty line, this is the line count. We don't bother |
| // with it and just read all the lines. |
| file_has_word_count_in_the_first_line = false; |
| continue; |
| } |
| |
| std::vector<std::string> split; |
| SplitDicLine(line, &split); |
| if (split.empty() || split.size() > 2) { |
| printf("Line %d has extra slashes in the %s file\n", line_number, |
| file_type); |
| return false; |
| } |
| |
| // The first part is the word, the second (optional) part is the affix. We |
| // always use UTF-8 as the encoding to simplify life. |
| std::string utf8word; |
| std::string encoding_string(encoding); |
| if (encoding_string == "UTF-8") { |
| utf8word = split[0]; |
| } else if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) { |
| printf("Unable to convert line %d from %s to UTF-8 in the %s file\n", |
| line_number, encoding, file_type); |
| return false; |
| } |
| |
| // We always convert the affix to an index. 0 means no affix. |
| int affix_index = 0; |
| if (split.size() == 2) { |
| // Got a rule, which is the stuff after the slash. The line may also have |
| // an optional term separated by a tab. This is the morphological |
| // description. We don't care about this (it is used in the tests to |
| // generate a nice dump), so we remove it. |
| size_t split1_tab_offset = split[1].find('\t'); |
| if (split1_tab_offset != std::string::npos) |
| split[1] = split[1].substr(0, split1_tab_offset); |
| |
| if (aff_reader->has_indexed_affixes()) |
| affix_index = atoi(split[1].c_str()); |
| else |
| affix_index = aff_reader->GetAFIndexForAFString(split[1]); |
| } |
| |
| // Discard the morphological description if it is attached to the first |
| // token. (It is attached to the first token if a word doesn't have affix |
| // rules.) |
| size_t word_tab_offset = utf8word.find('\t'); |
| if (word_tab_offset != std::string::npos) |
| utf8word = utf8word.substr(0, word_tab_offset); |
| |
| auto found = word_set->find(utf8word); |
| std::set<int> affix_vector; |
| affix_vector.insert(affix_index); |
| |
| if (found == word_set->end()) |
| word_set->insert(std::make_pair(utf8word, affix_vector)); |
| else |
| found->second.insert(affix_index); |
| } |
| |
| return true; |
| } |
| |
| } // namespace |
| |
| DicReader::DicReader(const base::FilePath& path) { |
| file_ = base::OpenFile(path, "r"); |
| |
| base::FilePath additional_path = |
| path.ReplaceExtension(FILE_PATH_LITERAL("dic_delta")); |
| additional_words_file_ = base::OpenFile(additional_path, "r"); |
| |
| if (additional_words_file_) |
| printf("Reading %" PRFilePath " ...\n", additional_path.value().c_str()); |
| else |
| printf("%" PRFilePath " not found.\n", additional_path.value().c_str()); |
| } |
| |
| DicReader::~DicReader() { |
| if (file_) |
| base::CloseFile(file_); |
| if (additional_words_file_) |
| base::CloseFile(additional_words_file_); |
| } |
| |
| bool DicReader::Read(AffReader* aff_reader) { |
| if (!file_) |
| return false; |
| |
| WordSet word_set; |
| |
| // Add words from the dic file to the word set. |
| // Note that the first line is the word count in the file. |
| if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", |
| aff_reader->encoding(), true)) |
| return false; |
| |
| // Add words from the .dic_delta file to the word set, if it exists. |
| // The first line is the first word to add. Word count line is not present. |
| // NOTE: These additional words should be encoded as UTF-8. |
| if (additional_words_file_) { |
| PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta", |
| "UTF-8", false); |
| } |
| // Make sure the words are sorted, they may be unsorted in the input. |
| for (auto word = word_set.begin(); word != word_set.end(); ++word) { |
| std::vector<int> affixes; |
| for (auto aff = word->second.begin(); aff != word->second.end(); ++aff) |
| affixes.push_back(*aff); |
| |
| // Double check that the affixes are sorted. This isn't strictly necessary |
| // but it's nice for the file to have a fixed layout. |
| std::sort(affixes.begin(), affixes.end()); |
| std::reverse(affixes.begin(), affixes.end()); |
| words_.push_back(std::make_pair(word->first, affixes)); |
| } |
| |
| // Double-check that the words are sorted. |
| std::sort(words_.begin(), words_.end()); |
| return true; |
| } |
| |
| } // namespace convert_dict |