blob: 7dc5b4f3fd1a2511f28c1d8fabe3ebf554645de1 [file] [log] [blame]
// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/spellcheck_worditerator.h"
#include <map>
#include <string>
#include "base/basictypes.h"
#include "base/string_util.h"
#include "third_party/icu38/public/common/unicode/uchar.h"
#include "third_party/icu38/public/common/unicode/unorm.h"
#include "third_party/icu38/public/common/unicode/uscript.h"
#include "third_party/icu38/public/common/unicode/uset.h"
#include "third_party/icu38/public/i18n/unicode/ulocdata.h"
// Builds the default attribute tables: every script starts out as a non-word
// script and the contraction table is filled with the Unicode mid-word
// punctuation characters listed below.
SpellcheckCharAttribute::SpellcheckCharAttribute() {
  InitializeScriptTable();

  // Many dictionaries treat numbers and contractions as words and treat
  // USCRIPT_COMMON characters as word characters. The SpellcheckWordIterator
  // class nevertheless treats USCRIPT_COMMON characters as non-word
  // characters, so that contraction characters stay strictly distinguished
  // from word characters.
  SetWordScript(USCRIPT_COMMON, false);

  // Code points that may appear inside a contraction. These are the
  // 'MidLetter' and 'MidNumLet' entries of the word-break property list
  // published by Unicode, Inc.:
  // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
  static const UChar32 kContractionCodePoints[] = {
    0x003A,  // MidLetter # COLON
    0x00B7,  // MidLetter # MIDDLE DOT
    0x0387,  // MidLetter # GREEK ANO TELEIA
    0x05F4,  // MidLetter # HEBREW PUNCTUATION GERSHAYIM
    0x2027,  // MidLetter # HYPHENATION POINT
    0xFE13,  // MidLetter # PRESENTATION FORM FOR VERTICAL COLON
    0xFE55,  // MidLetter # SMALL COLON
    0xFF1A,  // MidLetter # FULLWIDTH COLON
    0x0027,  // MidNumLet # APOSTROPHE
    0x002E,  // MidNumLet # FULL STOP
    0x2018,  // MidNumLet # LEFT SINGLE QUOTATION MARK
    0x2019,  // MidNumLet # RIGHT SINGLE QUOTATION MARK
    0x2024,  // MidNumLet # ONE DOT LEADER
    0xFE52,  // MidNumLet # SMALL FULL STOP
    0xFF07,  // MidNumLet # FULLWIDTH APOSTROPHE
    0xFF0E,  // MidNumLet # FULLWIDTH FULL STOP
  };

  // Register each code point in the lookup table used by IsContractionChar().
  const size_t code_point_count = arraysize(kContractionCodePoints);
  for (size_t index = 0; index != code_point_count; ++index)
    middle_letters_[kContractionCodePoints[index]] = true;
}
// Destructor. The body is intentionally empty: no explicit cleanup is
// performed here.
SpellcheckCharAttribute::~SpellcheckCharAttribute() {
}
// Sets the default language for this object.
// This function retrieves the exemplar set to set up the default character
// attributes.
void SpellcheckCharAttribute::SetDefaultLanguage(const std::wstring& language) {
// Retrieves the locale data of the given language.
std::string language_encoded;
WideToCodepage(language, "us-ascii", OnStringUtilConversionError::SKIP,
&language_encoded);
UErrorCode status = U_ZERO_ERROR;
ULocaleData* locale_data = ulocdata_open(language_encoded.c_str(), &status);
if (U_FAILURE(status))
return;
// Retrieves the exemplar set of the given language and update the
// character-attribute table to treat its characters as word characters.
USet* exemplar_set = uset_open(1, 0);
ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD,
&status);
ulocdata_close(locale_data);
if (U_SUCCESS(status)) {
int length = uset_size(exemplar_set);
for (int i = 0; i < length; ++i) {
UChar32 character = uset_charAt(exemplar_set, i);
SetWordScript(GetScriptCode(character), true);
}
}
uset_close(exemplar_set);
}
// Tells whether |character| counts as a word character for the selected
// dictionary: its script must be registered as a word script and it must not
// be a digit.
bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const {
  const UScriptCode script = GetScriptCode(character);
  if (!IsWordScript(script))
    return false;
  return !u_isdigit(character);
}
// Tells whether |character| is one of the characters that may appear inside
// a contraction, i.e. whether it is registered (with a true value) in the
// |middle_letters_| table.
bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const {
  const std::map<UChar32, bool>::const_iterator entry =
      middle_letters_.find(character);
  return entry != middle_letters_.end() && entry->second;
}
// Resets the script-attribute table so that no script is treated as a word
// script.
void SpellcheckCharAttribute::InitializeScriptTable() {
  const size_t table_size = arraysize(script_attributes_);
  for (size_t index = 0; index != table_size; ++index)
    script_attributes_[index] = false;
}
// Looks up the ICU script code of |character|.
// Returns USCRIPT_INVALID_CODE when ICU reports a failure.
UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const {
  UErrorCode icu_status = U_ZERO_ERROR;
  const UScriptCode detected = uscript_getScript(character, &icu_status);
  if (U_FAILURE(icu_status))
    return USCRIPT_INVALID_CODE;
  return detected;
}
// Records in the script-attribute table whether the script identified by
// |script_code| is used by the selected dictionary (|in_use|).
// Script codes that fall outside the table are silently ignored.
void SpellcheckCharAttribute::SetWordScript(const int script_code,
                                            bool in_use) {
  const bool in_table =
      script_code >= 0 &&
      static_cast<size_t>(script_code) < arraysize(script_attributes_);
  if (in_table)
    script_attributes_[script_code] = in_use;
}
// Tells whether the script identified by |script_code| is registered as a
// script used by the selected dictionary. Script codes that fall outside the
// table are reported as unused.
bool SpellcheckCharAttribute::IsWordScript(
    const UScriptCode script_code) const {
  const bool in_table =
      script_code >= 0 &&
      static_cast<size_t>(script_code) < arraysize(script_attributes_);
  return in_table && script_attributes_[script_code];
}
// Creates an iterator that is not attached to any text yet: the text and
// attribute pointers are NULL, the length and position are zero, and
// contractions are disallowed. Initialize() assigns all of these members.
SpellcheckWordIterator::SpellcheckWordIterator()
: word_(NULL),
length_(0),
position_(0),
allow_contraction_(false),
attribute_(NULL) {
}
// Destructor. The body is intentionally empty: the pointers stored by
// Initialize() are not released here.
SpellcheckWordIterator::~SpellcheckWordIterator() {
}
// Attaches this iterator to a text and rewinds it to the beginning.
// |attribute| supplies the character classification, |word| points at the
// text to be split (|length| UTF-16 code units), and |allow_contraction|
// selects whether contraction characters may be part of a segment.
// Only the pointers are stored; neither the text nor the attribute object is
// copied.
void SpellcheckWordIterator::Initialize(
    const SpellcheckCharAttribute* attribute,
    const char16* word,
    size_t length,
    bool allow_contraction) {
  // Classification settings.
  attribute_ = attribute;
  allow_contraction_ = allow_contraction;

  // Text to iterate over, starting from its first code unit.
  word_ = word;
  length_ = static_cast<int>(length);
  position_ = 0;
}
// Retrieves the next word (or contraction) from the text.
// When a contraction is enclosed with contraction characters (e.g. 'isn't',
// 'rock'n'roll'), the enclosing characters at the beginning and the end must
// be discarded, but the contraction itself must never be split.
// To handle this case easily, a segment consisting of word characters and
// contraction characters is extracted first, and then the contraction
// characters at the beginning and the end of that segment are discarded.
//
// On success, |word_string| receives the normalized word, and |word_start|
// and |word_length| receive the position and length (in UTF-16 code units)
// of the word in the original text. Returns false when no word is left, or
// when normalizing the word fails.
bool SpellcheckWordIterator::GetNextWord(string16* word_string,
                                         int* word_start,
                                         int* word_length) {
  // Reset the outputs. Note that clear() is required here: empty() merely
  // queries the string and would leave a previous word in place.
  word_string->clear();
  *word_start = 0;
  *word_length = 0;
  while (position_ < length_) {
    int segment_start = 0;
    int segment_end = 0;
    // GetSegment() always advances |position_|, so this loop terminates.
    GetSegment(&segment_start, &segment_end);
    TrimSegment(segment_start, segment_end, word_start, word_length);
    // A segment made only of non-word characters trims to an empty word;
    // skip it and try the next segment.
    if (*word_length > 0)
      return Normalize(*word_start, *word_length, word_string);
  }
  return false;
}
// Retrieves a segment consisting of word characters (and contraction
// characters if the |allow_contraction_| value is true).
// When the current position refers to a non-word character, this function
// returns a non-empty segment consisting of the character itself. In this
// case, the TrimSegment() function discards the character and returns an
// empty word (i.e. |word_length| == 0).
void SpellcheckWordIterator::GetSegment(int* segment_start,
int* segment_end) {
int position = position_;
while (position < length_) {
UChar32 character;
U16_NEXT(word_, position, length_, character);
if (!attribute_->IsWordChar(character)) {
if (!allow_contraction_ || !attribute_->IsContractionChar(character))
break;
}
}
*segment_start = position_;
*segment_end = position;
position_ = position;
}
// Discards non-word characters at the beginning and the end of the segment
// [|segment_start|, |segment_end|).
// When the segment contains at least one word character, |word_start|
// receives the offset of the first one and |word_length| receives the number
// of UTF-16 code units up to and including the last one. When it contains no
// word character, neither output is written, so callers must initialize
// |*word_length| to zero beforehand to detect that case.
void SpellcheckWordIterator::TrimSegment(int segment_start,
                                         int segment_end,
                                         int* word_start,
                                         int* word_length) const {
  // Skip leading non-word characters.
  while (segment_start < segment_end) {
    UChar32 character;
    int segment_next = segment_start;
    U16_NEXT(word_, segment_next, segment_end, character);
    if (attribute_->IsWordChar(character)) {
      *word_start = segment_start;
      break;
    }
    segment_start = segment_next;
  }
  // Skip trailing non-word characters. The comparison is strict: once the
  // segment has been trimmed to empty there is nothing left to inspect, and
  // U16_PREV() would otherwise read the code unit located before
  // |segment_start| (i.e. outside the segment, and outside the buffer for an
  // empty segment at offset 0).
  while (segment_end > segment_start) {
    UChar32 character;
    int segment_prev = segment_end;
    U16_PREV(word_, segment_start, segment_prev, character);
    if (attribute_->IsWordChar(character)) {
      *word_length = segment_end - segment_start;
      break;
    }
    segment_end = segment_prev;
  }
}
// Normalizes a non-terminated string into its canonical form so that
// a spellchecker object can check spellings of words which contain ligatures,
// full-width letters, etc.
// USCRIPT_LATIN does not only consists of US-ASCII and ISO/IEC 8859-1, but
// also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin,
// etc. For its details, please read the script table in
// "http://www.unicode.org/Public/UNIDATA/Scripts.txt".
bool SpellcheckWordIterator::Normalize(int input_start,
int input_length,
string16* output_string) const {
// Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/"
// does not only write NFKD and NFKC can compose ligatures into their ASCII
// alternatives, but also write NFKC keeps accents of characters.
// Therefore, NFKC seems to be the best option for hunspell.
// To use NKFC for normalization, the length of the output string is mostly
// equal to the one of the input string. (One exception is ligatures.)
// To avoid the unorm_normalize() function from being called always twice,
// we temporarily allocate |input_length| + 1 characters to the output string
// and call the function with it. We re-allocate the output string
// only if it cannot store the normalized string, i.e. the output string is
// longer than the input one.
const char16* input_string = &word_[input_start];
UErrorCode error_code = U_ZERO_ERROR;
int output_length = input_length + 1;
char16* output_buffer = WriteInto(output_string, output_length);
output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
output_buffer, output_length, &error_code);
if (error_code == U_BUFFER_OVERFLOW_ERROR) {
error_code = U_ZERO_ERROR;
output_buffer = WriteInto(output_string, ++output_length);
output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
output_buffer, output_length, &error_code);
}
return (error_code == U_ZERO_ERROR);
}