| // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifndef CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H__ |
| #define CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H__ |
| |
| #include <map> |
| #include <string> |
| |
| #include "base/basictypes.h" |
| #include "base/string16.h" |
| |
| #include "unicode/uscript.h" |
| |
| // A class which handles character attributes which depend on a spellchecker |
| // and its dictionary. |
| // This class is used by the SpellcheckWordIterator class to determine whether |
| // or not a character is one used by the spellchecker and its dictionary. |
| class SpellcheckCharAttribute { |
| public: |
| SpellcheckCharAttribute(); |
| |
| ~SpellcheckCharAttribute(); |
| |
| // Sets the default language of the spellchecker. This controls which |
| // characters are considered parts of words of the given language. |
| void SetDefaultLanguage(const std::wstring& language); |
| |
| // Returns whether or not the given character is a character used by the |
| // selected dictionary. |
| // Parameters |
| // * character [in] (UChar32) |
| // Represents the Unicode character to be checked. |
| // Return values |
| // * true |
| // The given character is a word character. |
| // * false |
| // The given character is not a word character. |
| bool IsWordChar(UChar32 character) const; |
| |
| // Returns whether or not the given character is a character used by |
| // contractions. |
| // Parameters |
| // * character [in] (UChar32) |
| // Represents the Unicode character to be checked. |
| // Return values |
| // * true |
| // The given character is a character used by contractions. |
| // * false |
| // The given character is not a character used by contractions. |
| bool IsContractionChar(UChar32 character) const; |
| |
| private: |
| // Initializes the script mapping table. |
| void InitializeScriptTable(); |
| |
| // Retrieves the ICU script code of the given character. |
| UScriptCode GetScriptCode(UChar32 character) const; |
| |
| // Updates the entry for the given script code in the mapping table. |
| void SetWordScript(const int script_code, bool in_use); |
| |
| // Returns whether or not the given script is used by the selected |
| // dictionary. |
| bool IsWordScript(const UScriptCode script_code) const; |
| |
| private: |
| // Represents a mapping table from a script code to a boolean value |
| // representing whether or not the script is used by the selected dictionary. |
| bool script_attributes_[USCRIPT_CODE_LIMIT]; |
| |
| // Represents a table, keyed by character, of characters used by contractions. |
| std::map<UChar32, bool> middle_letters_; |
| |
| DISALLOW_EVIL_CONSTRUCTORS(SpellcheckCharAttribute); |
| }; |
| |
| // A class which implements methods for finding the location of word boundaries |
| // used by the Spellchecker class. |
| // This class is implemented on the following assumptions: |
| // * An input string is encoded in UTF-16 (i.e. it may contain surrogate |
| // pairs), and; |
| // * The length of a string is the number of UTF-16 characters in the string |
| // (i.e. the length of a non-BMP character becomes two). |
| class SpellcheckWordIterator { |
| public: |
| SpellcheckWordIterator(); |
| |
| ~SpellcheckWordIterator(); |
| |
| // Initializes a word-iterator object. |
| // Parameters |
| // * attribute [in] (const SpellcheckCharAttribute*) |
| // Represents a set of character attributes used for filtering out |
| // non-word characters. |
| // * word [in] (const char16*) |
| // Represents a string from which this object extracts words. |
| // (This string does not have to be NUL-terminated.) |
| // * length [in] (size_t) |
| // Represents the length of the given string, in UTF-16 characters. |
| // This value should not include terminating NUL characters. |
| // * allow_contraction [in] (bool) |
| // Represents a flag to control whether or not this object should split a |
| // possible contraction (e.g. "isn't", "in'n'out", etc.) |
| // Return values |
| // * none (this method returns void and cannot report failures). |
| // NOTE(review): |attribute| and |word| are presumably kept in the |
| // |attribute_| and |word_| raw pointers, so they would have to outlive |
| // this object -- confirm against the implementation. |
| void Initialize(const SpellcheckCharAttribute* attribute, |
| const char16* word, |
| size_t length, |
| bool allow_contraction); |
| |
| // Retrieves a word (or a contraction). |
| // Parameters |
| // * word_string [out] (string16*) |
| // Represents a word (or a contraction) whose spelling is to be checked. |
| // This |word_string| has already been normalized to its canonical form |
| // (i.e. ligatures decomposed, full-width latin characters replaced with |
| // their ASCII alternatives, etc.) so that a SpellChecker object can check |
| // its spelling without any additional operations. |
| // On the other hand, a substring of the input string |
| // string16 str(&word[word_start], word_length); |
| // represents the non-normalized version of this extracted word. |
| // * word_start [out] (int*) |
| // Represents the offset of this word from the beginning of the input |
| // string, in UTF-16 characters. |
| // * word_length [out] (int*) |
| // Represents the length of an extracted word before normalization, in |
| // UTF-16 characters. |
| // When the input string contains ligatures, this value may not be equal |
| // to the length of the |word_string|. |
| // Return values |
| // * true |
| // Found a word (or a contraction) whose spelling is to be checked. |
| // * false |
| // No more words or contractions to be checked were found. |
| bool GetNextWord(string16* word_string, |
| int* word_start, |
| int* word_length); |
| |
| private: |
| // Retrieves a segment consisting of word characters (and contraction |
| // characters if the |allow_contraction| value is true). |
| void GetSegment(int* segment_start, |
| int* segment_end); |
| |
| // Discards non-word characters at the beginning and the end of the given |
| // segment. |
| void TrimSegment(int segment_start, |
| int segment_end, |
| int* word_start, |
| int* word_length) const; |
| |
| // Normalizes the given segment of the |word_| variable and writes its |
| // canonical form to the |output_string|. |
| bool Normalize(int input_start, |
| int input_length, |
| string16* output_string) const; |
| |
| private: |
| // The pointer to the input string from which we are extracting words. |
| const char16* word_; |
| |
| // The length of the original string. |
| int length_; |
| |
| // The current position in the original string. |
| int position_; |
| |
| // The flag to control whether or not this object should extract possible |
| // contractions. |
| bool allow_contraction_; |
| |
| // The character attributes used for filtering out non-word characters. |
| const SpellcheckCharAttribute* attribute_; |
| |
| DISALLOW_EVIL_CONSTRUCTORS(SpellcheckWordIterator); |
| }; |
| |
| #endif // CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H__ |
| |