src/chrome/browser/spellcheck_worditerator.cc - chromium/src - Git at Google

 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "chrome/browser/spellcheck_worditerator.h"

 #include <map>
 #include <string>

 #include "base/basictypes.h"
 #include "base/string_util.h"

 #include "third_party/icu38/public/common/unicode/uchar.h"
 #include "third_party/icu38/public/common/unicode/unorm.h"
 #include "third_party/icu38/public/common/unicode/uscript.h"
 #include "third_party/icu38/public/common/unicode/uset.h"
 #include "third_party/icu38/public/i18n/unicode/ulocdata.h"

 // Builds the default character attributes: no script is marked as a word
 // script and the contraction-character table is populated.
 SpellcheckCharAttribute::SpellcheckCharAttribute() {
   // Start from a table in which every script is marked as unused.
   InitializeScriptTable();

   // Many dictionaries treat numbers and contractions as words and treat
   // USCRIPT_COMMON characters as word characters. The SpellcheckWordIterator
   // class nevertheless treats USCRIPT_COMMON characters as non-word
   // characters so that contraction characters stay strictly distinguished
   // from word characters.
   SetWordScript(USCRIPT_COMMON, false);

   // Code points used inside contractions. These are the 'MidLetter' and
   // 'MidNumLet' entries of the word-break property list provided by
   // Unicode, Inc.:
   //   http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
   static const UChar32 kContractionCodePoints[] = {
       0x003A,  // MidLetter # COLON
       0x00B7,  // MidLetter # MIDDLE DOT
       0x0387,  // MidLetter # GREEK ANO TELEIA
       0x05F4,  // MidLetter # HEBREW PUNCTUATION GERSHAYIM
       0x2027,  // MidLetter # HYPHENATION POINT
       0xFE13,  // MidLetter # PRESENTATION FORM FOR VERTICAL COLON
       0xFE55,  // MidLetter # SMALL COLON
       0xFF1A,  // MidLetter # FULLWIDTH COLON
       0x0027,  // MidNumLet # APOSTROPHE
       0x002E,  // MidNumLet # FULL STOP
       0x2018,  // MidNumLet # LEFT SINGLE QUOTATION MARK
       0x2019,  // MidNumLet # RIGHT SINGLE QUOTATION MARK
       0x2024,  // MidNumLet # ONE DOT LEADER
       0xFE52,  // MidNumLet # SMALL FULL STOP
       0xFF07,  // MidNumLet # FULLWIDTH APOSTROPHE
       0xFF0E,  // MidNumLet # FULLWIDTH FULL STOP
   };
   const UChar32* const table_end =
       kContractionCodePoints + arraysize(kContractionCodePoints);
   for (const UChar32* entry = kContractionCodePoints; entry != table_end;
        ++entry) {
     middle_letters_[*entry] = true;
   }
 }

 // The destructor body is intentionally empty.
 SpellcheckCharAttribute::~SpellcheckCharAttribute() {}

 // Sets the default language for this object.
 // This function retrieves the exemplar set to set up the default character
 // attributes.
 void SpellcheckCharAttribute::SetDefaultLanguage(const std::wstring& language) {
   // Retrieves the locale data of the given language.
   std::string language_encoded;
   WideToCodepage(language, "us-ascii", OnStringUtilConversionError::SKIP,
                  &language_encoded);
   UErrorCode status = U_ZERO_ERROR;
   ULocaleData* locale_data = ulocdata_open(language_encoded.c_str(), &status);
   if (U_FAILURE(status))
     return;

   // Retrieves the exemplar set of the given language and update the
   // character-attribute table to treat its characters as word characters.
   USet* exemplar_set = uset_open(1, 0);
   ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD,
                           &status);
   ulocdata_close(locale_data);
   if (U_SUCCESS(status)) {
     int length = uset_size(exemplar_set);
     for (int i = 0; i < length; ++i) {
       UChar32 character = uset_charAt(exemplar_set, i);
       SetWordScript(GetScriptCode(character), true);
     }
   }
   uset_close(exemplar_set);
 }

 // Tells whether |character| is a word character for the selected dictionary:
 // its script must be marked as a word script and it must not be a digit.
 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const {
   if (!IsWordScript(GetScriptCode(character)))
     return false;
   return !u_isdigit(character);
 }

 // Tells whether |character| is one of the characters used by contractions,
 // i.e. whether the contraction table holds a true entry for it.
 bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const {
   const std::map<UChar32, bool>::const_iterator entry =
       middle_letters_.find(character);
   return entry != middle_letters_.end() && entry->second;
 }

 // Resets the script table so that every script is marked as unused.
 void SpellcheckCharAttribute::InitializeScriptTable() {
   size_t remaining = arraysize(script_attributes_);
   while (remaining > 0)
     script_attributes_[--remaining] = false;
 }

 // Looks up the ICU script code of |character|. Returns USCRIPT_INVALID_CODE
 // when ICU reports a failure for the lookup.
 UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const {
   UErrorCode error = U_ZERO_ERROR;
   const UScriptCode script = uscript_getScript(character, &error);
   if (U_FAILURE(error))
     return USCRIPT_INVALID_CODE;
   return script;
 }

 // Records in the script table whether the script |script_code| is used by
 // the selected dictionary (|in_use|). Codes that fall outside the table,
 // including negative ones, are ignored.
 void SpellcheckCharAttribute::SetWordScript(const int script_code,
                                             bool in_use) {
   const bool in_table =
       script_code >= 0 &&
       static_cast<size_t>(script_code) < arraysize(script_attributes_);
   if (in_table)
     script_attributes_[script_code] = in_use;
 }

 // Tells whether the script |script_code| is marked as used by the selected
 // dictionary. Codes that fall outside the table, including negative ones,
 // are reported as unused.
 bool SpellcheckCharAttribute::IsWordScript(
     const UScriptCode script_code) const {
   const bool in_table =
       script_code >= 0 &&
       static_cast<size_t>(script_code) < arraysize(script_attributes_);
   return in_table && script_attributes_[script_code];
 }

 // Creates an iterator that is not attached to any text: no buffer, zero
 // length and position, contractions disabled and no character attributes.
 // Initialize() overwrites all five members.
 SpellcheckWordIterator::SpellcheckWordIterator()
     : word_(NULL), length_(0), position_(0),
       allow_contraction_(false), attribute_(NULL) {}

 // The destructor body is intentionally empty; |word_| and |attribute_| are
 // not released here.
 SpellcheckWordIterator::~SpellcheckWordIterator() {}

 // Attaches this iterator to the |length| UTF-16 code units starting at |word|
 // and rewinds it to the beginning of that text.
 // |attribute| supplies the character classification and |allow_contraction|
 // selects whether contraction characters may be part of a segment. Only the
 // pointers are stored; neither the text nor the attributes are copied.
 void SpellcheckWordIterator::Initialize(
     const SpellcheckCharAttribute* attribute,
     const char16* word,
     size_t length,
     bool allow_contraction) {
   // Classification rules.
   attribute_ = attribute;
   allow_contraction_ = allow_contraction;

   // Text to scan. Positions are kept as int, hence the narrowing cast.
   word_ = word;
   length_ = static_cast<int>(length);
   position_ = 0;
 }

 // Retrieves the next word (or contraction) from the text.
 // When a contraction is enclosed with contraction characters (e.g. 'isn't',
 // 'rock'n'roll'), we should discard the beginning and the end of the
 // contraction but we should never split the contraction.
 // To handle this case easily, we first extract a segment consisting of word
 // characters and contraction characters, and then discard the non-word
 // characters at the beginning and the end of the extracted segment.
 //
 // On return, |word_start| and |word_length| locate the word in the original
 // text (in UTF-16 code units) and |word_string| receives its normalized form.
 // Returns the result of Normalize() for the word that was found, or false
 // when the end of the text is reached without finding a word; in that case
 // |word_string| is empty and |word_start| and |word_length| are 0.
 bool SpellcheckWordIterator::GetNextWord(string16* word_string,
                                          int* word_start,
                                          int* word_length) {
   // Reset the outputs so that nothing stale is reported when no word is
   // found. Note that clear() empties the string, whereas empty() would only
   // query it and leave the previous contents in place.
   word_string->clear();
   *word_start = 0;
   *word_length = 0;
   while (position_ < length_) {
     int segment_start = 0;
     int segment_end = 0;
     // GetSegment() advances |position_|, so this loop makes progress even
     // when the segment turns out to contain no word character.
     GetSegment(&segment_start, &segment_end);
     TrimSegment(segment_start, segment_end, word_start, word_length);
     if (*word_length > 0)
       return Normalize(*word_start, *word_length, word_string);
   }

   return false;
 }

 // Retrieves a segment consisting of word characters (and contraction
 // characters if the |allow_contraction_| value is true).
 // When the current position refers to a non-word character, this function
 // returns a non-empty segment consisting of the character itself. In this
 // case, the TrimSegment() function discards the character and returns an
 // empty word (i.e. |word_length| == 0).
 void SpellcheckWordIterator::GetSegment(int* segment_start,
                                         int* segment_end) {
   int position = position_;
   while (position <  length_) {
     UChar32 character;
     U16_NEXT(word_, position, length_, character);
     if (!attribute_->IsWordChar(character)) {
       if (!allow_contraction_ || !attribute_->IsContractionChar(character))
         break;
     }
   }
   *segment_start = position_;
   *segment_end = position;
   position_ = position;
 }

 // Discards non-word characters at the beginning and the end of the given
 // segment, [|segment_start|, |segment_end|) in UTF-16 code units.
 // When the segment contains a word character, |word_start| receives the
 // offset of the first word character and |word_length| the number of code
 // units up to and including the last word character. When it contains none,
 // neither output is written, so callers must initialize them beforehand.
 void SpellcheckWordIterator::TrimSegment(int segment_start,
                                          int segment_end,
                                          int* word_start,
                                          int* word_length) const {
   // Move |segment_start| forward to the first word character.
   while (segment_start < segment_end) {
     UChar32 character;
     int segment_next = segment_start;
     U16_NEXT(word_, segment_next, segment_end, character);
     if (attribute_->IsWordChar(character)) {
       *word_start = segment_start;
       break;
     }
     segment_start = segment_next;
   }
   // Move |segment_end| backward past trailing non-word characters.
   // The comparison is strict: U16_PREV() reads the code unit just before
   // |segment_prev|, so running it on an empty range would read
   // word_[segment_start - 1], which lies outside the range being trimmed (and
   // before the buffer when |segment_start| is 0).
   while (segment_end > segment_start) {
     UChar32 character;
     int segment_prev = segment_end;
     U16_PREV(word_, segment_start, segment_prev, character);
     if (attribute_->IsWordChar(character)) {
       *word_length = segment_end - segment_start;
       break;
     }
     segment_end = segment_prev;
   }
 }

 // Normalizes a non-terminated string into its canonical form so that
 // a spellchecker object can check spellings of words which contain ligatures,
 // full-width letters, etc.
 // USCRIPT_LATIN does not only consists of US-ASCII and ISO/IEC 8859-1, but
 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin,
 // etc. For its details, please read the script table in
 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt".
 bool SpellcheckWordIterator::Normalize(int input_start,
                                        int input_length,
                                        string16* output_string) const {
   // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/"
   // does not only write NFKD and NFKC can compose ligatures into their ASCII
   // alternatives, but also write NFKC keeps accents of characters.
   // Therefore, NFKC seems to be the best option for hunspell.
   // To use NKFC for normalization, the length of the output string is mostly
   // equal to the one of the input string. (One exception is ligatures.)
   // To avoid the unorm_normalize() function from being called always twice,
   // we temporarily allocate |input_length| + 1 characters to the output string
   // and call the function with it. We re-allocate the output string
   // only if it cannot store the normalized string, i.e. the output string is
   // longer than the input one.
   const char16* input_string = &word_[input_start];
   UErrorCode error_code = U_ZERO_ERROR;
   int output_length = input_length + 1;
   char16* output_buffer = WriteInto(output_string, output_length);
   output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
                                   output_buffer, output_length, &error_code);
   if (error_code == U_BUFFER_OVERFLOW_ERROR) {
     error_code = U_ZERO_ERROR;
     output_buffer = WriteInto(output_string, ++output_length);
     output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
                                     output_buffer, output_length, &error_code);
   }
   return (error_code == U_ZERO_ERROR);
 }
	// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "chrome/browser/spellcheck_worditerator.h"

	#include <map>
	#include <string>

	#include "base/basictypes.h"
	#include "base/string_util.h"

	#include "third_party/icu38/public/common/unicode/uchar.h"
	#include "third_party/icu38/public/common/unicode/unorm.h"
	#include "third_party/icu38/public/common/unicode/uscript.h"
	#include "third_party/icu38/public/common/unicode/uset.h"
	#include "third_party/icu38/public/i18n/unicode/ulocdata.h"

	// Builds the default character attributes: no script is marked as a word
	// script and the contraction-character table is populated.
	SpellcheckCharAttribute::SpellcheckCharAttribute() {
	  // Start from a table in which every script is marked as unused.
	  InitializeScriptTable();

	  // Many dictionaries treat numbers and contractions as words and treat
	  // USCRIPT_COMMON characters as word characters. The SpellcheckWordIterator
	  // class nevertheless treats USCRIPT_COMMON characters as non-word
	  // characters so that contraction characters stay strictly distinguished
	  // from word characters.
	  SetWordScript(USCRIPT_COMMON, false);

	  // Code points used inside contractions. These are the 'MidLetter' and
	  // 'MidNumLet' entries of the word-break property list provided by
	  // Unicode, Inc.:
	  //   http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
	  static const UChar32 kContractionCodePoints[] = {
	      0x003A,  // MidLetter # COLON
	      0x00B7,  // MidLetter # MIDDLE DOT
	      0x0387,  // MidLetter # GREEK ANO TELEIA
	      0x05F4,  // MidLetter # HEBREW PUNCTUATION GERSHAYIM
	      0x2027,  // MidLetter # HYPHENATION POINT
	      0xFE13,  // MidLetter # PRESENTATION FORM FOR VERTICAL COLON
	      0xFE55,  // MidLetter # SMALL COLON
	      0xFF1A,  // MidLetter # FULLWIDTH COLON
	      0x0027,  // MidNumLet # APOSTROPHE
	      0x002E,  // MidNumLet # FULL STOP
	      0x2018,  // MidNumLet # LEFT SINGLE QUOTATION MARK
	      0x2019,  // MidNumLet # RIGHT SINGLE QUOTATION MARK
	      0x2024,  // MidNumLet # ONE DOT LEADER
	      0xFE52,  // MidNumLet # SMALL FULL STOP
	      0xFF07,  // MidNumLet # FULLWIDTH APOSTROPHE
	      0xFF0E,  // MidNumLet # FULLWIDTH FULL STOP
	  };
	  const UChar32* const table_end =
	      kContractionCodePoints + arraysize(kContractionCodePoints);
	  for (const UChar32* entry = kContractionCodePoints; entry != table_end;
	       ++entry) {
	    middle_letters_[*entry] = true;
	  }
	}

	// The destructor body is intentionally empty.
	SpellcheckCharAttribute::~SpellcheckCharAttribute() {}

	// Sets the default language for this object.
	// This function retrieves the exemplar set to set up the default character
	// attributes.
	void SpellcheckCharAttribute::SetDefaultLanguage(const std::wstring& language) {
	// Retrieves the locale data of the given language.
	std::string language_encoded;
	WideToCodepage(language, "us-ascii", OnStringUtilConversionError::SKIP,
	&language_encoded);
	UErrorCode status = U_ZERO_ERROR;
	ULocaleData* locale_data = ulocdata_open(language_encoded.c_str(), &status);
	if (U_FAILURE(status))
	return;

	// Retrieves the exemplar set of the given language and update the
	// character-attribute table to treat its characters as word characters.
	USet* exemplar_set = uset_open(1, 0);
	ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD,
	&status);
	ulocdata_close(locale_data);
	if (U_SUCCESS(status)) {
	int length = uset_size(exemplar_set);
	for (int i = 0; i < length; ++i) {
	UChar32 character = uset_charAt(exemplar_set, i);
	SetWordScript(GetScriptCode(character), true);
	}
	}
	uset_close(exemplar_set);
	}

	// Tells whether |character| is a word character for the selected dictionary:
	// its script must be marked as a word script and it must not be a digit.
	bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const {
	  if (!IsWordScript(GetScriptCode(character)))
	    return false;
	  return !u_isdigit(character);
	}

	// Tells whether |character| is one of the characters used by contractions,
	// i.e. whether the contraction table holds a true entry for it.
	bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const {
	  const std::map<UChar32, bool>::const_iterator entry =
	      middle_letters_.find(character);
	  return entry != middle_letters_.end() && entry->second;
	}

	// Resets the script table so that every script is marked as unused.
	void SpellcheckCharAttribute::InitializeScriptTable() {
	  size_t remaining = arraysize(script_attributes_);
	  while (remaining > 0)
	    script_attributes_[--remaining] = false;
	}

	// Looks up the ICU script code of |character|. Returns USCRIPT_INVALID_CODE
	// when ICU reports a failure for the lookup.
	UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const {
	  UErrorCode error = U_ZERO_ERROR;
	  const UScriptCode script = uscript_getScript(character, &error);
	  if (U_FAILURE(error))
	    return USCRIPT_INVALID_CODE;
	  return script;
	}

	// Updates the mapping table from an ICU script code to its attribute, i.e.
	// whether or not the script |script_code| is used by the selected dictionary
	// (|in_use|). Codes that fall outside the table, including negative ones,
	// are ignored.
	void SpellcheckCharAttribute::SetWordScript(const int script_code,
	                                            bool in_use) {
	  // The operator below is a plain logical OR; the escaped form "\|\|" is not
	  // valid C++.
	  if (script_code < 0 ||
	      static_cast<size_t>(script_code) >= arraysize(script_attributes_))
	    return;
	  script_attributes_[script_code] = in_use;
	}

	// Returns whether or not the script |script_code| is marked as used by the
	// selected dictionary. Codes that fall outside the table, including negative
	// ones, are reported as unused.
	bool SpellcheckCharAttribute::IsWordScript(
	    const UScriptCode script_code) const {
	  // The operator below is a plain logical OR; the escaped form "\|\|" is not
	  // valid C++.
	  if (script_code < 0 ||
	      static_cast<size_t>(script_code) >= arraysize(script_attributes_))
	    return false;
	  return script_attributes_[script_code];
	}

	// Creates an iterator that is not attached to any text: no buffer, zero
	// length and position, contractions disabled and no character attributes.
	// Initialize() overwrites all five members.
	SpellcheckWordIterator::SpellcheckWordIterator()
	    : word_(NULL), length_(0), position_(0),
	      allow_contraction_(false), attribute_(NULL) {}

	// The destructor body is intentionally empty; |word_| and |attribute_| are
	// not released here.
	SpellcheckWordIterator::~SpellcheckWordIterator() {}

	// Attaches this iterator to the |length| UTF-16 code units starting at |word|
	// and rewinds it to the beginning of that text.
	// |attribute| supplies the character classification and |allow_contraction|
	// selects whether contraction characters may be part of a segment. Only the
	// pointers are stored; neither the text nor the attributes are copied.
	void SpellcheckWordIterator::Initialize(
	    const SpellcheckCharAttribute* attribute,
	    const char16* word,
	    size_t length,
	    bool allow_contraction) {
	  // Classification rules.
	  attribute_ = attribute;
	  allow_contraction_ = allow_contraction;

	  // Text to scan. Positions are kept as int, hence the narrowing cast.
	  word_ = word;
	  length_ = static_cast<int>(length);
	  position_ = 0;
	}

	// Retrieves the next word (or contraction) from the text.
	// When a contraction is enclosed with contraction characters (e.g. 'isn't',
	// 'rock'n'roll'), we should discard the beginning and the end of the
	// contraction but we should never split the contraction.
	// To handle this case easily, we first extract a segment consisting of word
	// characters and contraction characters, and then discard the non-word
	// characters at the beginning and the end of the extracted segment.
	//
	// On return, |word_start| and |word_length| locate the word in the original
	// text (in UTF-16 code units) and |word_string| receives its normalized form.
	// Returns the result of Normalize() for the word that was found, or false
	// when the end of the text is reached without finding a word; in that case
	// |word_string| is empty and |word_start| and |word_length| are 0.
	bool SpellcheckWordIterator::GetNextWord(string16* word_string,
	                                         int* word_start,
	                                         int* word_length) {
	  // Reset the outputs so that nothing stale is reported when no word is
	  // found. Note that clear() empties the string, whereas empty() would only
	  // query it and leave the previous contents in place.
	  word_string->clear();
	  *word_start = 0;
	  *word_length = 0;
	  while (position_ < length_) {
	    int segment_start = 0;
	    int segment_end = 0;
	    // GetSegment() advances |position_|, so this loop makes progress even
	    // when the segment turns out to contain no word character.
	    GetSegment(&segment_start, &segment_end);
	    TrimSegment(segment_start, segment_end, word_start, word_length);
	    // Normalize() takes the offset and length by value, so the output
	    // pointers must be dereferenced here.
	    if (*word_length > 0)
	      return Normalize(*word_start, *word_length, word_string);
	  }

	  return false;
	}

	// Retrieves the segment that starts at the current position and advances the
	// current position to the end of that segment.
	// The segment consists of word characters (and contraction characters if the
	// |allow_contraction_| value is true) plus the character that terminated the
	// scan, if any. Consequently, when the current position refers to a non-word
	// character, this function returns a non-empty segment consisting of the
	// character itself. In this case, the TrimSegment() function discards the
	// character and returns an empty word (i.e. |word_length| == 0).
	void SpellcheckWordIterator::GetSegment(int* segment_start,
	                                        int* segment_end) {
	  int position = position_;
	  while (position < length_) {
	    UChar32 character;
	    // U16_NEXT() decodes one code point and moves |position| past it.
	    U16_NEXT(word_, position, length_, character);
	    if (!attribute_->IsWordChar(character)) {
	      // A non-word character ends the segment unless contractions are
	      // allowed and it is a contraction character.
	      if (!allow_contraction_ || !attribute_->IsContractionChar(character))
	        break;
	    }
	  }
	  *segment_start = position_;
	  *segment_end = position;
	  position_ = position;
	}

	// Discards non-word characters at the beginning and the end of the given
	// segment, [|segment_start|, |segment_end|) in UTF-16 code units.
	// When the segment contains a word character, |word_start| receives the
	// offset of the first word character and |word_length| the number of code
	// units up to and including the last word character. When it contains none,
	// neither output is written, so callers must initialize them beforehand.
	void SpellcheckWordIterator::TrimSegment(int segment_start,
	                                         int segment_end,
	                                         int* word_start,
	                                         int* word_length) const {
	  // Move |segment_start| forward to the first word character.
	  while (segment_start < segment_end) {
	    UChar32 character;
	    int segment_next = segment_start;
	    U16_NEXT(word_, segment_next, segment_end, character);
	    if (attribute_->IsWordChar(character)) {
	      *word_start = segment_start;
	      break;
	    }
	    segment_start = segment_next;
	  }
	  // Move |segment_end| backward past trailing non-word characters.
	  // The comparison is strict: U16_PREV() reads the code unit just before
	  // |segment_prev|, so running it on an empty range would read
	  // word_[segment_start - 1], which lies outside the range being trimmed (and
	  // before the buffer when |segment_start| is 0).
	  while (segment_end > segment_start) {
	    UChar32 character;
	    int segment_prev = segment_end;
	    U16_PREV(word_, segment_start, segment_prev, character);
	    if (attribute_->IsWordChar(character)) {
	      *word_length = segment_end - segment_start;
	      break;
	    }
	    segment_end = segment_prev;
	  }
	}

	// Normalizes a non-terminated string into its canonical form so that
	// a spellchecker object can check spellings of words which contain ligatures,
	// full-width letters, etc.
	// USCRIPT_LATIN does not only consists of US-ASCII and ISO/IEC 8859-1, but
	// also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin,
	// etc. For its details, please read the script table in
	// "http://www.unicode.org/Public/UNIDATA/Scripts.txt".
	bool SpellcheckWordIterator::Normalize(int input_start,
	int input_length,
	string16* output_string) const {
	// Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/"
	// does not only write NFKD and NFKC can compose ligatures into their ASCII
	// alternatives, but also write NFKC keeps accents of characters.
	// Therefore, NFKC seems to be the best option for hunspell.
	// To use NKFC for normalization, the length of the output string is mostly
	// equal to the one of the input string. (One exception is ligatures.)
	// To avoid the unorm_normalize() function from being called always twice,
	// we temporarily allocate \|input_length\| + 1 characters to the output string
	// and call the function with it. We re-allocate the output string
	// only if it cannot store the normalized string, i.e. the output string is
	// longer than the input one.
	const char16* input_string = &word_[input_start];
	UErrorCode error_code = U_ZERO_ERROR;
	int output_length = input_length + 1;
	char16* output_buffer = WriteInto(output_string, output_length);
	output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
	output_buffer, output_length, &error_code);
	if (error_code == U_BUFFER_OVERFLOW_ERROR) {
	error_code = U_ZERO_ERROR;
	output_buffer = WriteInto(output_string, ++output_length);
	output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
	output_buffer, output_length, &error_code);
	}
	return (error_code == U_ZERO_ERROR);
	}