// blob: 51a13d09c9fa5963105d9eacfee4584a931633b4
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/tools/convert_dict/aff_reader.h"
#include <stddef.h>
#include <algorithm>
#include "base/files/file_util.h"
#include "base/i18n/icu_string_conversions.h"
#include "base/strings/string_split.h"
#include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h"
#include "chrome/tools/convert_dict/hunspell_reader.h"
namespace convert_dict {
namespace {
// Case-sensitive prefix test: returns true when |str| starts with the
// NUL-terminated ASCII string |with|. An empty |with| matches every string.
bool StringBeginsWith(const std::string& str, const char* with) {
  std::string::const_iterator it = str.begin();
  for (const char* p = with; *p != '\0'; ++p, ++it) {
    // Either |str| ran out before the prefix did, or the characters differ.
    if (it == str.end() || *it != *p)
      return false;
  }
  return true;
}
// Collapses every run of consecutive ' ' characters in |str| to a single
// space, in place. Only the space character is affected; tabs and other
// whitespace are left alone.
//
// The string is compacted in one pass (read index / write index) and
// shrunk once at the end, so the cost is linear in the length of |str|
// instead of paying for an erase() on every duplicate space.
void CollapseDuplicateSpaces(std::string* str) {
  size_t write = 0;
  bool prev_space = false;
  for (size_t read = 0; read < str->size(); ++read) {
    const char c = (*str)[read];
    const bool is_space = (c == ' ');
    // Drop a space that directly follows another space.
    if (is_space && prev_space)
      continue;
    (*str)[write++] = c;
    prev_space = is_space;
  }
  str->resize(write);
}
// Writes "ERROR: " followed by the printf-style formatted message and a
// newline to stdout, then ends the process with exit status 1. This
// function does not return.
void Panic(const char* fmt, ...) {
  fputs("ERROR: ", stdout);

  va_list args;
  va_start(args, fmt);
  vprintf(fmt, args);
  va_end(args);

  putchar('\n');
  exit(1);
}
} // namespace
// Opens the .aff file at |path| for reading. The result of the open is not
// checked here; Read() returns false when no file handle is available.
AffReader::AffReader(const base::FilePath& path)
    : has_indexed_affixes_(false) {
  // Assume Latin1 until a "SET" line in the file says otherwise.
  encoding_ = "ISO8859-1";
  file_ = base::OpenFile(path, "r");
}
// Closes the .aff file if one is held.
AffReader::~AffReader() {
  if (!file_)
    return;
  base::CloseFile(file_);
}
// Reads the .aff file line by line and dispatches each command to the
// matching handler. Returns false when there is no open file, true once the
// whole file has been consumed. Unsupported commands (IGNORE,
// COMPLEXPREFIXES) terminate the program through Panic().
bool AffReader::Read() {
  if (!file_)
    return false;

  // TODO(brettw) handle byte order mark.
  bool seen_command = false;
  bool seen_af_count = false;
  bool seen_rep_count = false;
  has_indexed_affixes_ = false;

  while (!feof(file_)) {
    std::string line = ReadLine(file_);

    // '#' lines that come before the first command are kept verbatim as the
    // intro comment.
    const bool is_comment_line = !line.empty() && line[0] == '#';
    if (is_comment_line && !seen_command) {
      intro_comment_ += line;
      intro_comment_ += '\n';
      continue;
    }

    StripComment(&line);
    if (line.empty())
      continue;
    seen_command = true;

    if (StringBeginsWith(line, "SET ")) {
      // Character set encoding.
      encoding_ = line.substr(4);
      TrimLine(&encoding_);
      continue;
    }

    if (StringBeginsWith(line, "AF ")) {
      // Affix alias. The first AF line only carries the number of aliases
      // that follow, so it is skipped.
      has_indexed_affixes_ = true;
      if (!seen_af_count) {
        seen_af_count = true;
      } else {
        std::string group = line.substr(3);
        AddAffixGroup(&group);
      }
      continue;
    }

    if (StringBeginsWith(line, "SFX ") || StringBeginsWith(line, "PFX ")) {
      AddAffix(&line);
      continue;
    }

    if (StringBeginsWith(line, "REP ")) {
      // As with AF, the first REP line is just a count and is skipped.
      if (!seen_rep_count) {
        seen_rep_count = true;
      } else {
        std::string replacement = line.substr(4);
        AddReplacement(&replacement);
      }
      continue;
    }

    if (StringBeginsWith(line, "TRY ") || StringBeginsWith(line, "MAP ")) {
      HandleEncodedCommand(line);
      continue;
    }

    if (StringBeginsWith(line, "IGNORE ")) {
      Panic("We don't support the IGNORE command yet. This would change how "
            "we would insert things in our lookup table.");
      continue;
    }

    if (StringBeginsWith(line, "COMPLEXPREFIXES ")) {
      Panic("We don't support the COMPLEXPREFIXES command yet. This would "
            "mean we have to insert words backwards as well (I think)");
      continue;
    }

    // Everything else is passed through untouched.
    HandleRawCommand(line);
  }

  return true;
}
bool AffReader::EncodingToUTF8(const std::string& encoded,
std::string* utf8) const {
base::string16 word;
if (!base::CodepageToUTF16(encoded, encoding(),
base::OnStringConversionError::FAIL, &word))
return false;
*utf8 = base::UTF16ToUTF8(word);
return true;
}
// Returns the index stored for |af_string| in affix_groups_. When the string
// is not registered yet, a copy of it is handed to AddAffixGroup() and the
// index that call returns is passed back.
int AffReader::GetAFIndexForAFString(const std::string& af_string) {
  const auto existing = affix_groups_.find(af_string);
  if (existing == affix_groups_.end()) {
    // AddAffixGroup() takes a mutable string, so work on a copy.
    std::string group_copy = af_string;
    return AddAffixGroup(&group_copy);
  }
  return existing->second;
}
// Flattens the affix group map into a list ordered by group index. Each entry
// is the group text prefixed with "AF " so the parser can read it back later.
// The group with 1-based index N ends up at position N - 1.
std::vector<std::string> AffReader::GetAffixGroups() const {
  // The largest index in use determines how long the list has to be.
  int highest_id = 0;
  for (const auto& group : affix_groups_)
    highest_id = std::max(highest_id, group.second);

  std::vector<std::string> indexed;
  indexed.resize(highest_id);
  for (const auto& group : affix_groups_) {
    // Stored indices are 1-based; vector slots are 0-based.
    indexed[group.second - 1] = "AF " + group.first;
  }
  return indexed;
}
// Registers the affix group |rule| and returns its index. |rule| is trimmed
// in place before being used as the key.
//
// A new group receives the next 1-based index, which matches the way
// Hunspell refers to the numbers. If the trimmed rule is already registered,
// the map is left unchanged and the index it already has is returned, so the
// returned value always names a group that is actually stored.
int AffReader::AddAffixGroup(std::string* rule) {
  TrimLine(rule);

  const int next_id = static_cast<int>(affix_groups_.size()) + 1;
  // insert() does nothing when the key exists; in both cases the returned
  // iterator points at the entry that holds the authoritative index.
  const auto inserted = affix_groups_.insert(std::make_pair(*rule, next_id));
  return inserted.first->second;
}
// Processes one "SFX "/"PFX " line. |rule| is trimmed and its runs of spaces
// are collapsed in place. If the line contains at least three spaces, the
// tail of the line is rewritten (morph rule folding, flag aliasing) and
// converted to UTF-8. In every case the resulting line is appended to
// affix_rules_. A tail that cannot be converted to UTF-8 ends the program via
// Panic().
void AffReader::AddAffix(std::string* rule) {
TrimLine(rule);
CollapseDuplicateSpaces(rule);
// These lines have two forms:
// AFX D Y 4 <- First line, lists how many affixes for "D" there are.
// AFX D 0 d e <- Following lines.
// We want to ensure the two last groups on the last line are encoded in
// UTF-8, and we want to make sure that the affix identifier "D" is *not*
// encoded, since that's basically an 8-bit identifier.
// Count to the third space. Everything after that will be re-encoded. This
// will re-encode the number on the first line, but that will be a NOP. If
// there are not that many groups, we won't reencode it, but pass it through.
int found_spaces = 0;
// Characters seen since the most recent space, i.e. the field that ends at
// the space currently being examined.
std::string token;
for (size_t i = 0; i < rule->length(); i++) {
if ((*rule)[i] == ' ') {
found_spaces++;
if (found_spaces == 3) {
// By default the rewritten tail starts at the third space itself.
size_t part_start = i;
std::string part;
if (token[0] != 'Y' && token[0] != 'N') {
// This token represents a stripping prefix or suffix, which is
// either a length or a string to be replaced.
// We also reencode them to UTF-8.
part_start = i - token.length();
}
part = rule->substr(part_start); // From here to end.
// NOTE(review): this matches a '-' anywhere in the tail, not only a
// standalone "-" field; confirm that is intended.
if (part.find('-') != std::string::npos) {
// This rule has a morph rule used by old Hungarian dictionaries.
// When a line has a morph rule, its format becomes as listed below.
// AFX D 0 d e - M
// To make hunspell work more happily, replace this morph rule with
// a compound flag as listed below.
// AFX D 0 d/M e
std::vector<std::string> tokens = base::SplitString(
part, " ", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
// tokens[3] (the "-" field in the form above) is dropped and tokens[4]
// becomes the flag. Tails with fewer than five fields stay as they are.
if (tokens.size() >= 5) {
part = base::StringPrintf("%s %s/%s %s",
tokens[0].c_str(),
tokens[1].c_str(),
tokens[4].c_str(),
tokens[2].c_str());
}
}
size_t slash_index = part.find('/');
// The flag rewrite below is skipped when has_indexed_affixes() is true
// (presumably: the file supplied its own AF alias lines).
if (slash_index != std::string::npos && !has_indexed_affixes()) {
// This can also have a rule string associated with it following a
// slash. For example:
// PFX P 0 foo/Y .
// The "Y" is a flag. For example, the aff file might have a line:
// COMPOUNDFLAG Y
// so that means that this prefix would be a compound one.
//
// It expects these rules to use the same alias rules as the .dic
// file. We've forced it to use aliases, which is a numerical index
// instead of these character flags, and this needs to be consistent.
std::string before_flags = part.substr(0, slash_index + 1);
// After the slash are both the flags, then whitespace, then the part
// that tells us what to strip.
std::vector<std::string> after_slash = base::SplitString(
part.substr(slash_index + 1), " ",
base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
// Panic() exits the process, so the indexing of after_slash further down
// is never reached with an empty vector.
if (after_slash.size() == 0) {
Panic("Found 0 terms after slash in affix rule '%s', "
"but need at least 2.",
part.c_str());
}
if (after_slash.size() == 1) {
printf("WARNING: Found 1 term after slash in affix rule '%s', "
"but expected at least 2. Adding '.'.\n",
part.c_str());
after_slash.push_back(".");
}
// Note that we may get a third term here which is the morphological
// description of this rule. This happens in the tests only, so we can
// just ignore it.
// The character flags are replaced by the numeric index of their group.
part = base::StringPrintf("%s%d %s",
before_flags.c_str(),
GetAFIndexForAFString(after_slash[0]),
after_slash[1].c_str());
}
// Reencode from here
std::string reencoded;
if (!EncodingToUTF8(part, &reencoded))
Panic("Cannot encode affix rule part '%s' to utf8.", part.c_str());
// Keep the untouched head of the line and attach the converted tail.
*rule = rule->substr(0, part_start) + reencoded;
// The rest of the line was handled in one go; stop scanning.
break;
}
token.clear();
} else {
token.push_back((*rule)[i]);
}
}
affix_rules_.push_back(*rule);
}
// Parses the body of a "REP" line (the text after the command name) and
// appends the resulting (from, to) pair to replacements_. |rule| is trimmed
// and space-collapsed in place, then converted to UTF-8. Ends the program via
// Panic() when the conversion fails or when no space separates the two parts.
void AffReader::AddReplacement(std::string* rule) {
  TrimLine(rule);
  CollapseDuplicateSpaces(rule);

  std::string utf8rule;
  if (!EncodingToUTF8(*rule, &utf8rule))
    Panic("Cannot encode replacement rule '%s' to utf8.", rule->c_str());

  // Key and value are separated by the first space.
  const size_t separator = utf8rule.find(' ');
  if (separator == std::string::npos)
    Panic("Did not find a space in '%s'.", utf8rule.c_str());

  std::string from = utf8rule.substr(0, separator);
  std::string to = utf8rule.substr(separator + 1);

  // Most aff files write a literal space as '_' because the line itself is
  // split on spaces; turn those back into real spaces.
  std::replace(from.begin(), from.end(), '_', ' ');
  std::replace(to.begin(), to.end(), '_', ' ');

  replacements_.push_back(std::make_pair(from, to));
}
// Appends |line| unchanged to other_commands_. Read() sends every command it
// has no dedicated handling for through this function.
void AffReader::HandleRawCommand(const std::string& line) {
other_commands_.push_back(line);
}
// Converts |line| from the file's encoding to UTF-8 and appends the converted
// text to other_commands_. Ends the program via Panic() when the conversion
// fails.
void AffReader::HandleEncodedCommand(const std::string& line) {
  std::string converted;
  const bool ok = EncodingToUTF8(line, &converted);
  if (!ok)
    Panic("Cannot encode command '%s' to utf8.", line.c_str());
  other_commands_.push_back(converted);
}
} // namespace convert_dict