| // Copyright 2013 Google Inc. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // |
| // State Table follower for scanning UTF-8 strings without converting to |
| // 32- or 16-bit Unicode values. |
| // |
| // Author: dsites@google.com (Dick Sites) |
| // |
| |
| #ifndef UTIL_UTF8_UTF8STATETABLE_H_ |
| #define UTIL_UTF8_UTF8STATETABLE_H_ |
| |
| #include <string> |
| #include "integral_types.h" // for uint8, uint32, uint16 |
| #include "stringpiece.h" |
| |
| |
| namespace CLD2 { |
| |
| class OffsetMap; |
| |
| |
| // These four-byte entries compactly encode how many bytes 0..255 to delete |
| // in making a string replacement, how many bytes to add 0..255, and the offset |
| // 0..64k-1 of the replacement string in remap_string. |
| struct RemapEntry { |
| uint8 delete_bytes; |
| uint8 add_bytes; |
| uint16 bytes_offset; |
| }; |
| |
| // Exit type codes for state tables. All but the first get stuffed into |
| // signed one-byte entries. The first is only generated by executable code. |
| // To distinguish from next-state entries, these must be contiguous and |
| // all <= kExitNone |
| typedef enum { |
| kExitDstSpaceFull = 239, |
| kExitIllegalStructure, // 240 |
| kExitOK, // 241 |
| kExitReject, // ... |
| kExitReplace1, |
| kExitReplace2, |
| kExitReplace3, |
| kExitReplace21, |
| kExitReplace31, |
| kExitReplace32, |
| kExitReplaceOffset1, |
| kExitReplaceOffset2, |
| kExitReplace1S0, |
| kExitSpecial, |
| kExitDoAgain, |
| kExitRejectAlt, |
| kExitNone // 255 |
| } ExitReason; |
| |
| typedef enum { |
| kExitDstSpaceFull_2 = 32767, // 0x7fff |
| kExitIllegalStructure_2, // 32768 0x8000 |
| kExitOK_2, // 32769 0x8001 |
| kExitReject_2, // ... |
| kExitReplace1_2, |
| kExitReplace2_2, |
| kExitReplace3_2, |
| kExitReplace21_2, |
| kExitReplace31_2, |
| kExitReplace32_2, |
| kExitReplaceOffset1_2, |
| kExitReplaceOffset2_2, |
| kExitReplace1S0_2, |
| kExitSpecial_2, |
| kExitDoAgain_2, |
| kExitRejectAlt_2, |
| kExitNone_2 // 32783 0x800f |
| } ExitReason_2; |
| |
| |
| // This struct represents one entire state table. The three initialized byte |
| // areas are state_table, remap_base, and remap_string. state0 and state0_size |
| // give the byte offset and length within state_table of the initial state -- |
| // table lookups are expected to start and end in this state, but for |
| // truncated UTF-8 strings, may end in a different state. These allow a quick |
| // test for that condition. entry_shift is 8 for tables subscripted by a full |
| // byte value and 6 for space-optimized tables subscripted by only six |
| // significant bits in UTF-8 continuation bytes. |
| typedef struct { |
| const uint32 state0; |
| const uint32 state0_size; |
| const uint32 total_size; |
| const int max_expand; |
| const int entry_shift; |
| const int bytes_per_entry; |
| const uint32 losub; |
| const uint32 hiadd; |
| const uint8* state_table; |
| const RemapEntry* remap_base; |
| const uint8* remap_string; |
| const uint8* fast_state; |
| } UTF8StateMachineObj; |
| |
| // Near-duplicate declaration for tables with two-byte entries |
| typedef struct { |
| const uint32 state0; |
| const uint32 state0_size; |
| const uint32 total_size; |
| const int max_expand; |
| const int entry_shift; |
| const int bytes_per_entry; |
| const uint32 losub; |
| const uint32 hiadd; |
| const unsigned short* state_table; |
| const RemapEntry* remap_base; |
| const uint8* remap_string; |
| const uint8* fast_state; |
| } UTF8StateMachineObj_2; |
| |
| |
| typedef UTF8StateMachineObj UTF8PropObj; |
| typedef UTF8StateMachineObj UTF8ScanObj; |
| typedef UTF8StateMachineObj UTF8ReplaceObj; |
| typedef UTF8StateMachineObj_2 UTF8PropObj_2; |
| typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2; |
| // NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2; |
| |
| |
| // Look up property of one UTF-8 character and advance over it |
| // Return 0 if input length is zero |
| // Return 0 and advance one byte if input is ill-formed |
| uint8 UTF8GenericProperty(const UTF8PropObj* st, |
| const uint8** src, |
| int* srclen); |
| |
| // Look up property of one UTF-8 character (assumed to be valid). |
| // (This is a faster version of UTF8GenericProperty.) |
| bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src); |
| |
| |
| // BigOneByte versions are needed for tables > 240 states, but most |
| // won't need the TwoByte versions. |
| |
| // Look up property of one UTF-8 character and advance over it |
| // Return 0 if input length is zero |
| // Return 0 and advance one byte if input is ill-formed |
| uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, |
| const uint8** src, |
| int* srclen); |
| |
| |
| // TwoByte versions are needed for tables > 240 states that don't fit onto |
| // BigOneByte -- rare ultimate fallback |
| |
| // Look up property of one UTF-8 character (assumed to be valid). |
| // (This is a faster version of UTF8GenericProperty.) |
| bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src); |
| |
| // Look up property of one UTF-8 character and advance over it |
| // Return 0 if input length is zero |
| // Return 0 and advance one byte if input is ill-formed |
| uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st, |
| const uint8** src, |
| int* srclen); |
| |
| // Look up property of one UTF-8 character (assumed to be valid). |
| // (This is a faster version of UTF8GenericProperty.) |
| bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src); |
| |
| // Scan a UTF-8 stringpiece based on a state table. |
| // Always scan complete UTF-8 characters |
| // Set number of bytes scanned. Return reason for exiting |
| int UTF8GenericScan(const UTF8ScanObj* st, |
| const StringPiece& str, |
| int* bytes_consumed); |
| |
| |
| |
| // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece |
| // and doing text replacements. |
| // Always scan complete UTF-8 characters |
| // Set number of bytes consumed from input, number filled to output. |
| // Return reason for exiting |
| // Also writes an optional OffsetMap. Pass NULL to skip writing one. |
| int UTF8GenericReplace(const UTF8ReplaceObj* st, |
| const StringPiece& istr, |
| StringPiece& ostr, |
| bool is_plain_text, |
| int* bytes_consumed, |
| int* bytes_filled, |
| int* chars_changed, |
| OffsetMap* offsetmap); |
| |
| // Older version without offsetmap |
| int UTF8GenericReplace(const UTF8ReplaceObj* st, |
| const StringPiece& istr, |
| StringPiece& ostr, |
| bool is_plain_text, |
| int* bytes_consumed, |
| int* bytes_filled, |
| int* chars_changed); |
| |
| // Older version without is_plain_text or offsetmap |
| int UTF8GenericReplace(const UTF8ReplaceObj* st, |
| const StringPiece& istr, |
| StringPiece& ostr, |
| int* bytes_consumed, |
| int* bytes_filled, |
| int* chars_changed); |
| |
| |
| // TwoByte version is needed for tables > about 256 states, such |
| // as the table for full Unicode 4.1 canonical + compatibility mapping |
| |
| // Scan a UTF-8 stringpiece based on state table with two-byte entries, |
| // copying to output stringpiece |
| // and doing text replacements. |
| // Always scan complete UTF-8 characters |
| // Set number of bytes consumed from input, number filled to output. |
| // Return reason for exiting |
| // Also writes an optional OffsetMap. Pass NULL to skip writing one. |
| int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, |
| const StringPiece& istr, |
| StringPiece& ostr, |
| bool is_plain_text, |
| int* bytes_consumed, |
| int* bytes_filled, |
| int* chars_changed, |
| OffsetMap* offsetmap); |
| |
| // Older version without offsetmap |
| int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, |
| const StringPiece& istr, |
| StringPiece& ostr, |
| bool is_plain_text, |
| int* bytes_consumed, |
| int* bytes_filled, |
| int* chars_changed); |
| |
| // Older version without is_plain_text or offsetmap |
| int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, |
| const StringPiece& istr, |
| StringPiece& ostr, |
| int* bytes_consumed, |
| int* bytes_filled, |
| int* chars_changed); |
| |
| |
| static const unsigned char kUTF8LenTbl[256] = { |
| 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
| |
| 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
| 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, |
| 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4 |
| }; |
| |
| inline int UTF8OneCharLen(const char* in) { |
| return kUTF8LenTbl[*reinterpret_cast<const uint8*>(in)]; |
| } |
| |
| // Adjust a stringpiece to encompass complete UTF-8 characters. |
| // The data pointer will be increased by 0..3 bytes to get to a character |
| // boundary, and the length will then be decreased by 0..3 bytes |
| // to encompass the last complete character. |
| // This is useful especially when a UTF-8 string must be put into a fixed- |
| // maximum-size buffer cleanly, such as a MySQL buffer. |
| void UTF8TrimToChars(StringPiece* istr); |
| |
| } // End namespace CLD2 |
| |
| #endif // UTIL_UTF8_UTF8STATETABLE_H_ |