// blob: 776331463bd0fdbc051ddba989af60a7a14b672f [file] [log] [blame]
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
#define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
#include <map>
#include <string>
#include "base/basictypes.h"
#include "base/string16.h"
#include "unicode/uscript.h"
// Holds the character attributes that depend on a spellchecker and its
// dictionary.
// SpellcheckWordIterator consults this class to decide whether or not a
// character is one used by the spellchecker and its dictionary.
class SpellcheckCharAttribute {
 public:
  SpellcheckCharAttribute();
  ~SpellcheckCharAttribute();

  // Sets the default language of the spellchecker. The language controls
  // which characters are considered parts of words.
  void SetDefaultLanguage(const std::string& language);

  // Tells whether |character| (a Unicode character) is a word character,
  // i.e. a character used by the selected dictionary.
  // Returns true when it is a word character and false when it is not.
  bool IsWordChar(UChar32 character) const;

  // Tells whether |character| (a Unicode character) is a character used by
  // contractions.
  // Returns true when it is such a character and false when it is not.
  bool IsContractionChar(UChar32 character) const;

 private:
  // --- Helpers ---

  // Initializes the script mapping table (|script_attributes_|).
  void InitializeScriptTable();

  // Retrieves the ICU script code of |character|.
  UScriptCode GetScriptCode(UChar32 character) const;

  // Updates the mapping-table entry for |script_code| to |in_use|.
  void SetWordScript(const int script_code, bool in_use);

  // Tells whether the script identified by |script_code| is used by the
  // selected dictionary.
  bool IsWordScript(const UScriptCode script_code) const;

  // --- State ---

  // Mapping table from a script code to a flag telling whether or not that
  // script is used by the selected dictionary. The table has
  // USCRIPT_CODE_LIMIT entries.
  bool script_attributes_[USCRIPT_CODE_LIMIT];

  // Table of the characters used by contractions.
  std::map<UChar32, bool> middle_letters_;

  DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute);
};
// Finds the word boundaries used by the Spellchecker class.
// The implementation relies on two assumptions:
//   * The input string is encoded in UTF-16 (so it may contain surrogate
//     pairs); and
//   * Every length is counted in UTF-16 characters (so a non-BMP character
//     has a length of two).
class SpellcheckWordIterator {
 public:
  SpellcheckWordIterator();
  ~SpellcheckWordIterator();

  // Initializes this word-iterator object.
  //   |attribute|          [in] The set of character attributes used for
  //                        filtering out non-word characters.
  //   |word|               [in] The string from which this object extracts
  //                        words. It does not have to be NUL-terminated.
  //   |length|             [in] The length of |word| in UTF-16 characters,
  //                        not counting any terminating NUL characters.
  //   |allow_contraction|  [in] Controls how this object treats a possible
  //                        contraction (e.g. "isn't", "in'n'out", etc.); see
  //                        GetSegment(), which keeps contraction characters in
  //                        a segment only when this value is true.
  // This method is declared void, so it reports neither success nor failure
  // to its caller.
  // NOTE(review): |word_| and |attribute_| are raw pointers, so |word| and
  // |attribute| presumably have to outlive this iterator -- confirm against
  // the implementation.
  // NOTE(review): |length| is a size_t while |length_| is an int -- confirm
  // how the implementation handles lengths that do not fit in an int.
  void Initialize(const SpellcheckCharAttribute* attribute,
                  const char16* word,
                  size_t length,
                  bool allow_contraction);

  // Retrieves the next word (or contraction) whose spelling should be checked.
  //   |word_string|  [out] The extracted word (or contraction), already
  //                  normalized to its canonical form (i.e. ligatures are
  //                  decomposed, full-width latin characters are replaced
  //                  with their ASCII alternatives, etc.) so that a
  //                  SpellChecker object can check its spelling without any
  //                  additional operations. The non-normalized version of the
  //                  same word is the substring of the input string given by
  //                    string16 str(&word[word_start], word_length);
  //   |word_start|   [out] The offset of the word from the beginning of the
  //                  input string, in UTF-16 characters.
  //   |word_length|  [out] The length of the extracted word before
  //                  normalization, in UTF-16 characters. When the input
  //                  contains ligatures, this may differ from the length of
  //                  |word_string|.
  // Returns true when a word (or a contraction) was found, and false when
  // there are no more words or contractions to check.
  bool GetNextWord(string16* word_string,
                   int* word_start,
                   int* word_length);

 private:
  // --- Helpers ---

  // Retrieves a segment consisting of word characters (and contraction
  // characters if the |allow_contraction| value is true), writing its bounds
  // to |segment_start| and |segment_end|.
  void GetSegment(int* segment_start,
                  int* segment_end);

  // Discards the non-word characters at the beginning and the end of the
  // segment [|segment_start|, |segment_end|] and writes the result to
  // |word_start| and |word_length|.
  // NOTE(review): whether |segment_end| is inclusive or exclusive is not
  // visible from this header -- confirm against the implementation.
  void TrimSegment(int segment_start,
                   int segment_end,
                   int* word_start,
                   int* word_length) const;

  // Normalizes the segment of |word_| that starts at |input_start| and spans
  // |input_length| UTF-16 characters, and writes its canonical form to
  // |output_string|.
  // NOTE(review): the meaning of the returned bool is not documented here;
  // presumably true means the normalization succeeded -- confirm.
  bool Normalize(int input_start,
                 int input_length,
                 string16* output_string) const;

  // --- State ---

  // The pointer to the input string from which we are extracting words.
  const char16* word_;

  // The length of the original string.
  int length_;

  // The current position in the original string.
  int position_;

  // The flag to control whether or not this object should extract possible
  // contractions.
  bool allow_contraction_;

  // The character attributes used for filtering out non-word characters.
  const SpellcheckCharAttribute* attribute_;

  DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator);
};
#endif // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_