// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
| |
//
// Author: dsites@google.com (Dick Sites)
//
| |
| |
| #ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ |
| #define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ |
| |
| #include "integral_types.h" |
| #include "langspan.h" |
| #include "offsetmap.h" |
| |
| namespace CLD2 { |
| |
| static const int kMaxScriptBuffer = 40960; |
| static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2; |
| static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room |
| static const int kWithinScriptTail = 32; // Stop at word space in last |
| // N bytes of script buffer |
| |
| |
| static inline bool IsContinuationByte(char c) { |
| return static_cast<signed char>(c) < -64; |
| } |
| |
| // Gets lscript number for letters; always returns |
| // 0 (common script) for non-letters |
| int GetUTF8LetterScriptNum(const char* src); |
| |
| // Update src pointer to point to next quadgram, +2..+5 |
| // Looks at src[0..4] |
| const char* AdvanceQuad(const char* src); |
| |
| |
| class ScriptScanner { |
| public: |
| ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text); |
| ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text, |
| bool any_text, bool any_script); |
| ~ScriptScanner(); |
| |
| // Copy next run of same-script non-tag letters to buffer [NUL terminated] |
| bool GetOneScriptSpan(LangSpan* span); |
| |
| // Force Latin and Cyrillic scripts to be lowercase |
| void LowerScriptSpan(LangSpan* span); |
| |
| // Copy next run of same-script non-tag letters to buffer [NUL terminated] |
| // Force Latin and Cyrillic scripts to be lowercase |
| bool GetOneScriptSpanLower(LangSpan* span); |
| |
| // Copy next run of non-tag characters to buffer [NUL terminated] |
| // This just removes tags and removes entities |
| // Buffer has leading space |
| bool GetOneTextSpan(LangSpan* span); |
| |
| // Maps byte offset in most recent GetOneScriptSpan/Lower |
| // span->text [0..text_bytes] into an additional byte offset from |
| // span->offset, to get back to corresponding text in the original |
| // input buffer. |
| // text_offset must be the first byte |
| // of a UTF-8 character, or just beyond the last character. Normally this |
| // routine is called with the first byte of an interesting range and |
| // again with the first byte of the following range. |
| int MapBack(int text_offset); |
| |
| const char* GetBufferStart() {return start_byte_;}; |
| |
| private: |
| // Skip over tags and non-letters |
| int SkipToFrontOfSpan(const char* src, int len, int* script); |
| |
| const char* start_byte_; // Starting byte of buffer to scan |
| const char* next_byte_; // First unscanned byte |
| int byte_length_; // Bytes left |
| |
| bool is_plain_text_; // true fo text, false for HTML |
| char* script_buffer_; // Holds text with expanded entities |
| char* script_buffer_lower_; // Holds lowercased text |
| bool letters_marks_only_; // To distinguish scriptspan of one |
| // letters/marks vs. any mixture of text |
| bool one_script_only_; // To distinguish scriptspan of one |
| // script vs. any mixture of scripts |
| int exit_state_; // For tag parser kTagParseTbl_0, based |
| // on letters_marks_only_ |
| public : |
| // Expose for debugging |
| OffsetMap map2original_; // map from script_buffer_ to buffer |
| OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_ |
| }; |
| |
| } // namespace CLD2 |
| |
| #endif // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ |
| |