// internal/utf8statetable.h - external/cld2 - Git at Google

 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 //
 // State Table follower for scanning UTF-8 strings without converting to
 // 32- or 16-bit Unicode values.
 //
 // Author: dsites@google.com (Dick Sites)
 //

 #ifndef UTIL_UTF8_UTF8STATETABLE_H_
 #define UTIL_UTF8_UTF8STATETABLE_H_

 #include <string>
 #include "integral_types.h"             // for uint8, uint32, uint16
 #include "stringpiece.h"


 namespace CLD2 {

 class OffsetMap;


 // These four-byte entries compactly encode how many bytes 0..255 to delete
 // in making a string replacement, how many bytes to add 0..255, and the offset
 // 0..64k-1 of the replacement string in remap_string.
 struct RemapEntry {
   uint8 delete_bytes;   // Bytes to delete when making the replacement (0..255)
   uint8 add_bytes;      // Bytes to add when making the replacement (0..255)
   uint16 bytes_offset;  // Offset of the replacement string in remap_string (0..64k-1)
 };

 // Exit type codes for state tables. All but the first get stuffed into
 // signed one-byte entries. The first is only generated by executable code.
 // To distinguish from next-state entries, these must be contiguous and
 // all <= kExitNone
 typedef enum {
   // Generated only by executable code, never stored in a state table.
   kExitDstSpaceFull     = 239,

   // The remaining codes occupy the contiguous range 240..255.
   kExitIllegalStructure = 240,
   kExitOK               = 241,
   kExitReject           = 242,
   kExitReplace1         = 243,
   kExitReplace2         = 244,
   kExitReplace3         = 245,
   kExitReplace21        = 246,
   kExitReplace31        = 247,
   kExitReplace32        = 248,
   kExitReplaceOffset1   = 249,
   kExitReplaceOffset2   = 250,
   kExitReplace1S0       = 251,
   kExitSpecial          = 252,
   kExitDoAgain          = 253,
   kExitRejectAlt        = 254,
   kExitNone             = 255
 } ExitReason;

 // Exit codes with the same names and order as ExitReason, renumbered to
 // start at 0x7fff; the _2 suffix matches the two-byte-entry table types.
 typedef enum {
   kExitDstSpaceFull_2     = 32767,  // 0x7fff
   kExitIllegalStructure_2 = 32768,  // 0x8000
   kExitOK_2               = 32769,  // 0x8001
   kExitReject_2           = 32770,
   kExitReplace1_2         = 32771,
   kExitReplace2_2         = 32772,
   kExitReplace3_2         = 32773,
   kExitReplace21_2        = 32774,
   kExitReplace31_2        = 32775,
   kExitReplace32_2        = 32776,
   kExitReplaceOffset1_2   = 32777,
   kExitReplaceOffset2_2   = 32778,
   kExitReplace1S0_2       = 32779,
   kExitSpecial_2          = 32780,
   kExitDoAgain_2          = 32781,
   kExitRejectAlt_2        = 32782,
   kExitNone_2             = 32783   // 0x800f
 } ExitReason_2;


 // This struct represents one entire state table. The three initialized byte
 // areas are state_table, remap_base, and remap_string. state0 and state0_size
 // give the byte offset and length within state_table of the initial state --
 // table lookups are expected to start and end in this state, but for
 // truncated UTF-8 strings, may end in a different state. These allow a quick
 // test for that condition. entry_shift is 8 for tables subscripted by a full
 // byte value and 6 for space-optimized tables subscripted by only six
 // significant bits in UTF-8 continuation bytes.
 typedef struct {
   const uint32 state0;           // Byte offset within state_table of the initial state
   const uint32 state0_size;      // Byte length of the initial state within state_table
   const uint32 total_size;       // NOTE(review): presumably total size of state_table -- confirm
   const int max_expand;          // NOTE(review): presumably worst-case growth from replacements -- confirm
   const int entry_shift;         // 8: subscripted by a full byte value;
                                  // 6: space-optimized, subscripted by the six
                                  // significant bits of a UTF-8 continuation byte
   const int bytes_per_entry;     // NOTE(review): presumably 1 for this one-byte layout -- confirm
   const uint32 losub;            // Meaning not documented in this header; see the implementation
   const uint32 hiadd;            // Meaning not documented in this header; see the implementation
   const uint8* state_table;      // State table proper, stored as one-byte entries
   const RemapEntry* remap_base;  // Replacement descriptors (see RemapEntry)
   const uint8* remap_string;     // Replacement bytes addressed by RemapEntry::bytes_offset
   const uint8* fast_state;       // NOTE(review): purpose not visible in this header -- confirm
 } UTF8StateMachineObj;

 // Near-duplicate declaration for tables with two-byte entries
 // Same field list as UTF8StateMachineObj; only state_table differs, being
 // declared as unsigned short entries instead of uint8.
 typedef struct {
   const uint32 state0;                // Offset of the initial state within state_table
   const uint32 state0_size;           // Length of the initial state within state_table
   const uint32 total_size;            // NOTE(review): presumably total size of state_table -- confirm
   const int max_expand;               // NOTE(review): presumably worst-case growth from replacements -- confirm
   const int entry_shift;              // NOTE(review): presumably 8 or 6 as in UTF8StateMachineObj -- confirm
   const int bytes_per_entry;          // NOTE(review): presumably 2 for this two-byte layout -- confirm
   const uint32 losub;                 // Meaning not documented in this header; see the implementation
   const uint32 hiadd;                 // Meaning not documented in this header; see the implementation
   const unsigned short* state_table;  // State table proper, stored as two-byte entries
   const RemapEntry* remap_base;       // Replacement descriptors (see RemapEntry)
   const uint8* remap_string;          // Replacement bytes addressed by RemapEntry::bytes_offset
   const uint8* fast_state;            // NOTE(review): purpose not visible in this header -- confirm
 } UTF8StateMachineObj_2;


 // Role-specific aliases. Property lookup, scanning and replacement all use
 // the same table layout; the alias only names the intended use.
 typedef UTF8StateMachineObj UTF8PropObj;
 typedef UTF8StateMachineObj UTF8ScanObj;
 typedef UTF8StateMachineObj UTF8ReplaceObj;
 // Aliases for tables with two-byte entries.
 typedef UTF8StateMachineObj_2 UTF8PropObj_2;
 typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;
 // NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2;


 // Look up property of one UTF-8 character and advance over it
 // Return 0 if input length is zero
 // Return 0 and advance one byte if input is ill-formed
 // NOTE(review): *srclen is presumably reduced by the number of bytes that
 // *src is advanced -- confirm against the implementation.
 uint8 UTF8GenericProperty(const UTF8PropObj* st,
                           const uint8** src,
                           int* srclen);

 // Look up property of one UTF-8 character (assumed to be valid).
 // (This is a faster version of UTF8GenericProperty.)
 // src is taken by value, so the caller's pointer is not advanced.
 bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);


 // BigOneByte versions are needed for tables > 240 states, but most
 // won't need the TwoByte versions.

 // Look up property of one UTF-8 character and advance over it
 // Return 0 if input length is zero
 // Return 0 and advance one byte if input is ill-formed
 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
                           const uint8** src,
                           int* srclen);

 // Look up property of one UTF-8 character (assumed to be valid).
 // (This is a faster version of UTF8GenericProperty.)
 bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);


 // TwoByte versions are needed for tables > 240 states that don't fit onto
 // BigOneByte -- rare ultimate fallback

 // Look up property of one UTF-8 character and advance over it
 // Return 0 if input length is zero
 // Return 0 and advance one byte if input is ill-formed
 uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
                           const uint8** src,
                           int* srclen);

 // Look up property of one UTF-8 character (assumed to be valid).
 // (This is a faster version of UTF8GenericProperty.)
 bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);

 // Scan a UTF-8 stringpiece based on a state table.
 // Always scan complete UTF-8 characters
 // Set number of bytes scanned. Return reason for exiting
 //
 //   st              state table driving the scan
 //   str             input text to scan
 //   bytes_consumed  out: number of bytes scanned
 //
 // NOTE(review): the int result presumably carries an ExitReason value --
 // confirm against the implementation.
 int UTF8GenericScan(const UTF8ScanObj* st,
                     const StringPiece& str,
                     int* bytes_consumed);


 // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
 //   and doing text replacements.
 // Always scan complete UTF-8 characters
 // Set number of bytes consumed from input, number filled to output.
 // Return reason for exiting
 // Also writes an optional OffsetMap. Pass NULL to skip writing one.
 //
 //   st              state table driving the scan and replacements
 //   istr            input text
 //   ostr            output stringpiece that receives the copied/replaced text
 //   is_plain_text   NOTE(review): effect not visible in this header -- confirm
 //   bytes_consumed  out: number of bytes consumed from istr
 //   bytes_filled    out: number of bytes filled to ostr
 //   chars_changed   out: NOTE(review): presumably count of replaced
 //                   characters -- confirm
 //   offsetmap       optional out: OffsetMap to write, or NULL to skip
 //
 // NOTE(review): the int result presumably carries an ExitReason value --
 // confirm against the implementation.
 int UTF8GenericReplace(const UTF8ReplaceObj* st,
                     const StringPiece& istr,
                     StringPiece& ostr,
                     bool is_plain_text,
                     int* bytes_consumed,
                     int* bytes_filled,
                     int* chars_changed,
                     OffsetMap* offsetmap);

 // Older version without offsetmap
 // Same parameters as the full overload, minus offsetmap.
 int UTF8GenericReplace(const UTF8ReplaceObj* st,
                     const StringPiece& istr,
                     StringPiece& ostr,
                     bool is_plain_text,
                     int* bytes_consumed,
                     int* bytes_filled,
                     int* chars_changed);

 // Older version without is_plain_text or offsetmap
 // NOTE(review): the is_plain_text value this overload implies is not
 // visible in this header -- confirm.
 int UTF8GenericReplace(const UTF8ReplaceObj* st,
                     const StringPiece& istr,
                     StringPiece& ostr,
                     int* bytes_consumed,
                     int* bytes_filled,
                     int* chars_changed);


 // TwoByte version is needed for tables > about 256 states, such
 // as the table for full Unicode 4.1 canonical + compatibility mapping

 // Scan a UTF-8 stringpiece based on state table with two-byte entries,
 //   copying to output stringpiece
 //   and doing text replacements.
 // Always scan complete UTF-8 characters
 // Set number of bytes consumed from input, number filled to output.
 // Return reason for exiting
 // Also writes an optional OffsetMap. Pass NULL to skip writing one.
 //
 //   st              two-byte-entry state table driving the replacements
 //   istr            input text
 //   ostr            output stringpiece that receives the copied/replaced text
 //   is_plain_text   NOTE(review): effect not visible in this header -- confirm
 //   bytes_consumed  out: number of bytes consumed from istr
 //   bytes_filled    out: number of bytes filled to ostr
 //   chars_changed   out: NOTE(review): presumably count of replaced
 //                   characters -- confirm
 //   offsetmap       optional out: OffsetMap to write, or NULL to skip
 //
 // NOTE(review): the int result presumably carries an ExitReason_2 value --
 // confirm against the implementation.
 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
                     const StringPiece& istr,
                     StringPiece& ostr,
                     bool is_plain_text,
                     int* bytes_consumed,
                     int* bytes_filled,
                     int* chars_changed,
                     OffsetMap* offsetmap);

 // Older version without offsetmap
 // Same parameters as the full overload, minus offsetmap.
 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
                     const StringPiece& istr,
                     StringPiece& ostr,
                     bool is_plain_text,
                     int* bytes_consumed,
                     int* bytes_filled,
                     int* chars_changed);

 // Older version without is_plain_text or offsetmap
 // NOTE(review): the is_plain_text value this overload implies is not
 // visible in this header -- confirm.
 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
                     const StringPiece& istr,
                     StringPiece& ostr,
                     int* bytes_consumed,
                     int* bytes_filled,
                     int* chars_changed);


 // Length in bytes of a UTF-8 character, indexed by the value of its first
 // byte. One row per 16 byte values; the row's starting value is on the right.
 static const unsigned char kUTF8LenTbl[256] = {
   // 0x00..0xBF all map to 1 (this includes the continuation-byte range
   // 0x80..0xBF, so a stray continuation byte counts as one byte).
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x00
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x10
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x20
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x30
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x40
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x50
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x60
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x70
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x80
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x90
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0xA0
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0xB0
   // 0xC0..0xDF map to 2.
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xC0
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xD0
   // 0xE0..0xEF map to 3.
   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xE0
   // 0xF0..0xFF map to 4.
   4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4   // 0xF0
 };

 // Returns the kUTF8LenTbl entry for the first byte at `in`, i.e. the byte
 // length of the UTF-8 character starting there. Only that one byte is read.
 inline int UTF8OneCharLen(const char* in) {
   // Convert to an unsigned byte so values >= 0x80 index the table correctly
   // even where plain char is signed.
   const unsigned char lead_byte = static_cast<unsigned char>(*in);
   return kUTF8LenTbl[lead_byte];
 }

 // Adjust a stringpiece to encompass complete UTF-8 characters.
 // The data pointer will be increased by 0..3 bytes to get to a character
 // boundary, and the length will then be decreased by 0..3 bytes
 // to encompass the last complete character.
 // This is useful especially when a UTF-8 string must be put into a fixed-
 // maximum-size buffer cleanly, such as a MySQL buffer.
 // The adjustment is made in place on *istr; nothing is returned.
 void UTF8TrimToChars(StringPiece* istr);

 }       // End namespace CLD2

 #endif  // UTIL_UTF8_UTF8STATETABLE_H_
	// Copyright 2013 Google Inc. All Rights Reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	//
	// State Table follower for scanning UTF-8 strings without converting to
	// 32- or 16-bit Unicode values.
	//
	// Author: dsites@google.com (Dick Sites)
	//

	#ifndef UTIL_UTF8_UTF8STATETABLE_H_
	#define UTIL_UTF8_UTF8STATETABLE_H_

	#include <string>
	#include "integral_types.h" // for uint8, uint32, uint16
	#include "stringpiece.h"


	namespace CLD2 {

	class OffsetMap;


	// These four-byte entries compactly encode how many bytes 0..255 to delete
	// in making a string replacement, how many bytes to add 0..255, and the offset
	// 0..64k-1 of the replacement string in remap_string.
	struct RemapEntry {
	uint8 delete_bytes;   // Bytes to delete when making the replacement (0..255)
	uint8 add_bytes;      // Bytes to add when making the replacement (0..255)
	uint16 bytes_offset;  // Offset of the replacement string in remap_string (0..64k-1)
	};

	// Exit type codes for state tables. All but the first get stuffed into
	// signed one-byte entries. The first is only generated by executable code.
	// To distinguish from next-state entries, these must be contiguous and
	// all <= kExitNone
	typedef enum {
	  // Generated only by executable code, never stored in a state table.
	  kExitDstSpaceFull     = 239,

	  // The remaining codes occupy the contiguous range 240..255.
	  kExitIllegalStructure = 240,
	  kExitOK               = 241,
	  kExitReject           = 242,
	  kExitReplace1         = 243,
	  kExitReplace2         = 244,
	  kExitReplace3         = 245,
	  kExitReplace21        = 246,
	  kExitReplace31        = 247,
	  kExitReplace32        = 248,
	  kExitReplaceOffset1   = 249,
	  kExitReplaceOffset2   = 250,
	  kExitReplace1S0       = 251,
	  kExitSpecial          = 252,
	  kExitDoAgain          = 253,
	  kExitRejectAlt        = 254,
	  kExitNone             = 255
	} ExitReason;

	// Exit codes with the same names and order as ExitReason, renumbered to
	// start at 0x7fff; the _2 suffix matches the two-byte-entry table types.
	typedef enum {
	  kExitDstSpaceFull_2     = 32767,  // 0x7fff
	  kExitIllegalStructure_2 = 32768,  // 0x8000
	  kExitOK_2               = 32769,  // 0x8001
	  kExitReject_2           = 32770,
	  kExitReplace1_2         = 32771,
	  kExitReplace2_2         = 32772,
	  kExitReplace3_2         = 32773,
	  kExitReplace21_2        = 32774,
	  kExitReplace31_2        = 32775,
	  kExitReplace32_2        = 32776,
	  kExitReplaceOffset1_2   = 32777,
	  kExitReplaceOffset2_2   = 32778,
	  kExitReplace1S0_2       = 32779,
	  kExitSpecial_2          = 32780,
	  kExitDoAgain_2          = 32781,
	  kExitRejectAlt_2        = 32782,
	  kExitNone_2             = 32783   // 0x800f
	} ExitReason_2;


	// This struct represents one entire state table. The three initialized byte
	// areas are state_table, remap_base, and remap_string. state0 and state0_size
	// give the byte offset and length within state_table of the initial state --
	// table lookups are expected to start and end in this state, but for
	// truncated UTF-8 strings, may end in a different state. These allow a quick
	// test for that condition. entry_shift is 8 for tables subscripted by a full
	// byte value and 6 for space-optimized tables subscripted by only six
	// significant bits in UTF-8 continuation bytes.
	typedef struct {
	const uint32 state0;           // Byte offset within state_table of the initial state
	const uint32 state0_size;      // Byte length of the initial state within state_table
	const uint32 total_size;       // NOTE(review): presumably total size of state_table -- confirm
	const int max_expand;          // NOTE(review): presumably worst-case growth from replacements -- confirm
	const int entry_shift;         // 8: subscripted by a full byte value;
	                               // 6: space-optimized, subscripted by the six
	                               // significant bits of a UTF-8 continuation byte
	const int bytes_per_entry;     // NOTE(review): presumably 1 for this one-byte layout -- confirm
	const uint32 losub;            // Meaning not documented in this header; see the implementation
	const uint32 hiadd;            // Meaning not documented in this header; see the implementation
	const uint8* state_table;      // State table proper, stored as one-byte entries
	const RemapEntry* remap_base;  // Replacement descriptors (see RemapEntry)
	const uint8* remap_string;     // Replacement bytes addressed by RemapEntry::bytes_offset
	const uint8* fast_state;       // NOTE(review): purpose not visible in this header -- confirm
	} UTF8StateMachineObj;

	// Near-duplicate declaration for tables with two-byte entries
	// Same field list as UTF8StateMachineObj; only state_table differs, being
	// declared as unsigned short entries instead of uint8.
	typedef struct {
	const uint32 state0;                // Offset of the initial state within state_table
	const uint32 state0_size;           // Length of the initial state within state_table
	const uint32 total_size;            // NOTE(review): presumably total size of state_table -- confirm
	const int max_expand;               // NOTE(review): presumably worst-case growth from replacements -- confirm
	const int entry_shift;              // NOTE(review): presumably 8 or 6 as in UTF8StateMachineObj -- confirm
	const int bytes_per_entry;          // NOTE(review): presumably 2 for this two-byte layout -- confirm
	const uint32 losub;                 // Meaning not documented in this header; see the implementation
	const uint32 hiadd;                 // Meaning not documented in this header; see the implementation
	const unsigned short* state_table;  // State table proper, stored as two-byte entries
	const RemapEntry* remap_base;       // Replacement descriptors (see RemapEntry)
	const uint8* remap_string;          // Replacement bytes addressed by RemapEntry::bytes_offset
	const uint8* fast_state;            // NOTE(review): purpose not visible in this header -- confirm
	} UTF8StateMachineObj_2;


	// Role-specific aliases. Property lookup, scanning and replacement all use
	// the same table layout; the alias only names the intended use.
	typedef UTF8StateMachineObj UTF8PropObj;
	typedef UTF8StateMachineObj UTF8ScanObj;
	typedef UTF8StateMachineObj UTF8ReplaceObj;
	// Aliases for tables with two-byte entries.
	typedef UTF8StateMachineObj_2 UTF8PropObj_2;
	typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;
	// NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2;


	// Look up property of one UTF-8 character and advance over it
	// Return 0 if input length is zero
	// Return 0 and advance one byte if input is ill-formed
	// NOTE(review): *srclen is presumably reduced by the number of bytes that
	// *src is advanced -- confirm against the implementation.
	uint8 UTF8GenericProperty(const UTF8PropObj* st,
	const uint8** src,
	int* srclen);

	// Look up property of one UTF-8 character (assumed to be valid).
	// (This is a faster version of UTF8GenericProperty.)
	// src is taken by value, so the caller's pointer is not advanced.
	bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);


	// BigOneByte versions are needed for tables > 240 states, but most
	// won't need the TwoByte versions.

	// Look up property of one UTF-8 character and advance over it
	// Return 0 if input length is zero
	// Return 0 and advance one byte if input is ill-formed
	uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
	const uint8** src,
	int* srclen);

	// Look up property of one UTF-8 character (assumed to be valid).
	// (This is a faster version of UTF8GenericProperty.)
	bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);


	// TwoByte versions are needed for tables > 240 states that don't fit onto
	// BigOneByte -- rare ultimate fallback

	// Look up property of one UTF-8 character and advance over it
	// Return 0 if input length is zero
	// Return 0 and advance one byte if input is ill-formed
	uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
	const uint8** src,
	int* srclen);

	// Look up property of one UTF-8 character (assumed to be valid).
	// (This is a faster version of UTF8GenericProperty.)
	bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);

	// Scan a UTF-8 stringpiece based on a state table.
	// Always scan complete UTF-8 characters
	// Set number of bytes scanned. Return reason for exiting
	//
	//   st              state table driving the scan
	//   str             input text to scan
	//   bytes_consumed  out: number of bytes scanned
	//
	// NOTE(review): the int result presumably carries an ExitReason value --
	// confirm against the implementation.
	int UTF8GenericScan(const UTF8ScanObj* st,
	const StringPiece& str,
	int* bytes_consumed);



	// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
	// and doing text replacements.
	// Always scan complete UTF-8 characters
	// Set number of bytes consumed from input, number filled to output.
	// Return reason for exiting
	// Also writes an optional OffsetMap. Pass NULL to skip writing one.
	//
	//   st              state table driving the scan and replacements
	//   istr            input text
	//   ostr            output stringpiece that receives the copied/replaced text
	//   is_plain_text   NOTE(review): effect not visible in this header -- confirm
	//   bytes_consumed  out: number of bytes consumed from istr
	//   bytes_filled    out: number of bytes filled to ostr
	//   chars_changed   out: NOTE(review): presumably count of replaced
	//                   characters -- confirm
	//   offsetmap       optional out: OffsetMap to write, or NULL to skip
	//
	// NOTE(review): the int result presumably carries an ExitReason value --
	// confirm against the implementation.
	int UTF8GenericReplace(const UTF8ReplaceObj* st,
	const StringPiece& istr,
	StringPiece& ostr,
	bool is_plain_text,
	int* bytes_consumed,
	int* bytes_filled,
	int* chars_changed,
	OffsetMap* offsetmap);

	// Older version without offsetmap
	// Same parameters as the full overload, minus offsetmap.
	int UTF8GenericReplace(const UTF8ReplaceObj* st,
	const StringPiece& istr,
	StringPiece& ostr,
	bool is_plain_text,
	int* bytes_consumed,
	int* bytes_filled,
	int* chars_changed);

	// Older version without is_plain_text or offsetmap
	// NOTE(review): the is_plain_text value this overload implies is not
	// visible in this header -- confirm.
	int UTF8GenericReplace(const UTF8ReplaceObj* st,
	const StringPiece& istr,
	StringPiece& ostr,
	int* bytes_consumed,
	int* bytes_filled,
	int* chars_changed);


	// TwoByte version is needed for tables > about 256 states, such
	// as the table for full Unicode 4.1 canonical + compatibility mapping

	// Scan a UTF-8 stringpiece based on state table with two-byte entries,
	// copying to output stringpiece
	// and doing text replacements.
	// Always scan complete UTF-8 characters
	// Set number of bytes consumed from input, number filled to output.
	// Return reason for exiting
	// Also writes an optional OffsetMap. Pass NULL to skip writing one.
	//
	//   st              two-byte-entry state table driving the replacements
	//   istr            input text
	//   ostr            output stringpiece that receives the copied/replaced text
	//   is_plain_text   NOTE(review): effect not visible in this header -- confirm
	//   bytes_consumed  out: number of bytes consumed from istr
	//   bytes_filled    out: number of bytes filled to ostr
	//   chars_changed   out: NOTE(review): presumably count of replaced
	//                   characters -- confirm
	//   offsetmap       optional out: OffsetMap to write, or NULL to skip
	//
	// NOTE(review): the int result presumably carries an ExitReason_2 value --
	// confirm against the implementation.
	int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
	const StringPiece& istr,
	StringPiece& ostr,
	bool is_plain_text,
	int* bytes_consumed,
	int* bytes_filled,
	int* chars_changed,
	OffsetMap* offsetmap);

	// Older version without offsetmap
	// Same parameters as the full overload, minus offsetmap.
	int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
	const StringPiece& istr,
	StringPiece& ostr,
	bool is_plain_text,
	int* bytes_consumed,
	int* bytes_filled,
	int* chars_changed);

	// Older version without is_plain_text or offsetmap
	// NOTE(review): the is_plain_text value this overload implies is not
	// visible in this header -- confirm.
	int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
	const StringPiece& istr,
	StringPiece& ostr,
	int* bytes_consumed,
	int* bytes_filled,
	int* chars_changed);


	// Length in bytes of a UTF-8 character, indexed by the value of its first
	// byte: 0x00..0xBF -> 1, 0xC0..0xDF -> 2, 0xE0..0xEF -> 3, 0xF0..0xFF -> 4.
	// The continuation-byte range 0x80..0xBF maps to 1, so a stray continuation
	// byte counts as one byte.
	static const unsigned char kUTF8LenTbl[256] = {
	  // 0x00..0x7F
	  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

	  // 0x80..0xFF
	  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
	  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
	};

	// Returns the kUTF8LenTbl entry for the first byte at `in`, i.e. the byte
	// length of the UTF-8 character starting there. Only that one byte is read.
	inline int UTF8OneCharLen(const char* in) {
	  // View the byte as unsigned so values >= 0x80 index the table correctly
	  // even where plain char is signed. The cast target must be a pointer and
	  // must be dereferenced: the table is indexed by the byte, not the address.
	  return kUTF8LenTbl[*reinterpret_cast<const unsigned char*>(in)];
	}

	// Adjust a stringpiece to encompass complete UTF-8 characters.
	// The data pointer will be increased by 0..3 bytes to get to a character
	// boundary, and the length will then be decreased by 0..3 bytes
	// to encompass the last complete character.
	// This is useful especially when a UTF-8 string must be put into a fixed-
	// maximum-size buffer cleanly, such as a MySQL buffer.
	// The adjustment is made in place on *istr; nothing is returned.
	void UTF8TrimToChars(StringPiece* istr);

	} // End namespace CLD2

	#endif // UTIL_UTF8_UTF8STATETABLE_H_