| //------------------------------------------------------------------------------------------------------- |
| // Copyright (C) Microsoft. All rights reserved. |
| // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information. |
| //------------------------------------------------------------------------------------------------------- |
| #pragma once |
| |
| #ifdef _WIN32 |
| #include <windows.h> |
| #include <wtypes.h> |
| #else |
// TODO: Abstract out into its own file
| #include "pal.h" |
| #include "inc/rt/palrt.h" |
| #include <stdint.h> |
| #endif |
| |
| |
| // Utf8Codex.h needs to be self contained, so these type defs are duplicated from CommonTypeDefs.h |
| #ifdef _WIN32 |
| typedef WCHAR char16; |
| #define _u(s) L##s |
| #else |
| #define _u(s) u##s |
| #endif |
| |
| typedef char16 wchar; |
| |
// Fallback for Unused(var): unless a definition already exists, the macro simply expands
// to its argument unchanged.
#if !defined(Unused)
#define Unused(var) var
#endif
| |
| #ifndef _WIN32 |
// Templates are defined here in order to avoid a dependency on the C++
// <type_traits> header file, or on compiler-specific constructs.
extern "C++" {

// Maps a size in bytes (S) to the signed integer type of exactly that size.
// The primary template is deliberately left undefined: asking for a size that
// has no specialization below is a compile-time error.
template <size_t S>
struct _ENUM_FLAG_INTEGER_FOR_SIZE;

template <>
struct _ENUM_FLAG_INTEGER_FOR_SIZE<1>
{
    typedef int8_t type;
};

template <>
struct _ENUM_FLAG_INTEGER_FOR_SIZE<2>
{
    typedef int16_t type;
};

template <>
struct _ENUM_FLAG_INTEGER_FOR_SIZE<4>
{
    typedef int32_t type;
};

// 8-byte enums (for example `enum E : uint64_t`) need a mapping as well;
// without it DEFINE_ENUM_FLAG_OPERATORS cannot be applied to them.
template <>
struct _ENUM_FLAG_INTEGER_FOR_SIZE<8>
{
    typedef int64_t type;
};

// Used as an approximation of std::underlying_type<T>: yields the signed
// integer type whose size equals sizeof(T).
template <class T>
struct _ENUM_FLAG_SIZED_INTEGER
{
    typedef typename _ENUM_FLAG_INTEGER_FOR_SIZE<sizeof(T)>::type type;
};

}
| |
// DEFINE_ENUM_FLAG_OPERATORS(ENUMTYPE) emits inline bitwise operators for ENUMTYPE:
// |, &, ^ and ~ by value, plus the compound forms |=, &= and ^=.
// The by-value operators convert their operands to _ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type,
// apply the integer operator and convert the result back to ENUMTYPE.
// The compound operators reinterpret the left operand's storage as that integer type,
// update it in place and return a reference to the same object.
#define DEFINE_ENUM_FLAG_OPERATORS(ENUMTYPE) \
extern "C++" { \
inline ENUMTYPE operator | (ENUMTYPE a, ENUMTYPE b) \
{ \
    return static_cast<ENUMTYPE>( \
        static_cast<_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type>(a) | \
        static_cast<_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type>(b)); \
} \
inline ENUMTYPE &operator |= (ENUMTYPE &a, ENUMTYPE b) \
{ \
    return reinterpret_cast<ENUMTYPE &>( \
        reinterpret_cast<_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type &>(a) |= \
        static_cast<_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type>(b)); \
} \
inline ENUMTYPE operator & (ENUMTYPE a, ENUMTYPE b) \
{ \
    return static_cast<ENUMTYPE>( \
        static_cast<_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type>(a) & \
        static_cast<_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type>(b)); \
} \
inline ENUMTYPE &operator &= (ENUMTYPE &a, ENUMTYPE b) \
{ \
    return reinterpret_cast<ENUMTYPE &>( \
        reinterpret_cast<_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type &>(a) &= \
        static_cast<_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type>(b)); \
} \
inline ENUMTYPE operator ~ (ENUMTYPE a) \
{ \
    return static_cast<ENUMTYPE>( \
        ~static_cast<_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type>(a)); \
} \
inline ENUMTYPE operator ^ (ENUMTYPE a, ENUMTYPE b) \
{ \
    return static_cast<ENUMTYPE>( \
        static_cast<_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type>(a) ^ \
        static_cast<_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type>(b)); \
} \
inline ENUMTYPE &operator ^= (ENUMTYPE &a, ENUMTYPE b) \
{ \
    return reinterpret_cast<ENUMTYPE &>( \
        reinterpret_cast<_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type &>(a) ^= \
        static_cast<_ENUM_FLAG_SIZED_INTEGER<ENUMTYPE>::type>(b)); \
} \
}
| |
| #endif |
| |
| typedef unsigned __int32 uint32; |
| // charcount_t represents a count of characters in a String |
| // It is unsigned and the maximum value is (INT_MAX-1) |
| typedef uint32 charcount_t; |
| |
| typedef BYTE utf8char_t; |
| typedef const utf8char_t CUTF8; |
| typedef utf8char_t* LPUTF8; |
| typedef const utf8char_t *LPCUTF8; |
| |
// Per Unicode 4.0 an unknown character should be converted to the replacement mark, U+FFFD.
// UNICODE_UNKNOWN_CHAR_MARK is that value as an integer constant;
// UNICODE_TCHAR_UKNOWN_CHAR_MARK (spelling kept as-is, callers depend on the name) is the
// same value written as a _T() character literal.
#define UNICODE_UNKNOWN_CHAR_MARK 0xFFFD
#define UNICODE_TCHAR_UKNOWN_CHAR_MARK _T('\xFFFD')
| |
| namespace utf8 |
| { |
| |
// Terminology -
//   Code point      - An ordinal value mapped to a standard ideograph as defined by ISO/IEC 10646-1. Here
//                     also referred to as a UCS code point but can also often be referred to as a UNICODE
//                     code point.
//   UTF-8           - An encoding of UCS code points as defined by RFC-3629.
//   UTF-16          - An encoding of UCS code points as defined by RFC-2781. Used as a synonym for UNICODE or
//                     UCS-2. This is technically incorrect but usually harmless. This file assumes char16 *
//                     maps to a UTF-16LE (little-endian) encoded sequence of words.
//   Unit            - The unit of encoding. For UTF-8 it is a byte (octet). For UTF-16 it is a word (two octets).
//   Valid           - A UTF-8 byte sequence conforming to RFC-3629.
//   Well-formed     - A sequence of bytes that conforms to the encoding pattern of UTF-8 but might be too long or
//                     otherwise invalid. For example C0 80 is a well-formed but invalid encoding of U+0000.
//   Start byte      - A byte that can start a well-formed UTF-8 sequence.
//   Lead byte       - A byte that can start a well-formed multi-unit sequence but not a single byte sequence.
//   Trail byte      - A byte that can appear after a lead-byte in a well-formed multi-unit sequence.
//   Surrogate pair  - A UTF-16 word pair to encode characters outside the Unicode base plane as defined by
//                     RFC-2781. Two char16 values are used to encode one UCS code point.
//   character index - The index into a UTF-16 sequence.
//   byte index      - The index into a UTF-8 sequence.
| |
| // Return the number of bytes needed to encode the given character (ignoring surrogate pairs) |
| inline size_t EncodedSize(char16 ch) |
| { |
| if (ch < 0x0080) return 1; |
| if (ch < 0x0800) return 2; |
| return 3; |
| } |
| |
| enum DecodeOptions |
| { |
| doDefault = 0x00, |
| doAllowThreeByteSurrogates = 0x01, // Allow invalid 3 byte encodings as would be encoded by CSEU-8 |
| doChunkedEncoding = 0x02, // For sequences at the end of a buffer do not advance into incomplete sequences |
| // If incomplete UTF-8 sequence is encountered at the end of a buffer, this |
| // option will cause Decode() to not advance the ptr value and DecodeTail to |
| // move the pointer back one position so it again points to where c1 was read by |
| // Decode(). In effect, incomplete sequences are treated as if end pointed to the |
| // beginning incomplete sequence instead of in the middle of it. |
| doSecondSurrogatePair = 0x04, // A previous call to DecodeTail returned the first word of a UTF-16 |
| // surrogate pair. The second call will return the second word and reset |
| // this 'option'. |
| doAllowInvalidWCHARs = 0x08, // Don't replace invalid wide chars with 0xFFFD |
| }; |
| DEFINE_ENUM_FLAG_OPERATORS(DecodeOptions); |
| |
| // Decode the trail bytes after the UTF8 lead byte c1 but returning 0xFFFD if trail bytes are expected after end. |
| _At_(ptr, _In_reads_(end - ptr) _Post_satisfies_(ptr >= _Old_(ptr) - 1 && ptr <= end)) |
| char16 DecodeTail(char16 c1, LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options, bool *chunkEndsAtTruncatedSequence = nullptr); |
| |
| // Decode the UTF8 sequence into a UTF16 encoding. Code points outside the Unicode base plain will generate |
| // surrogate pairs, using the 'doSecondSurrogatePair' option to remember the first word has already been returned. |
| // If ptr == end 0x0000 is emitted. If ptr < end but the lead byte of the UTF8 sequence |
| // expects trail bytes past end then 0xFFFD are emitted until ptr == end. |
| _At_(ptr, _In_reads_(end - ptr) _Post_satisfies_(ptr >= _Old_(ptr) && ptr <= end)) |
| inline char16 Decode(LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options, bool *chunkEndsAtTruncatedSequence = nullptr) |
| { |
| if (ptr >= end) return 0; |
| utf8char_t c1 = *ptr++; |
| if (c1 < 0x80) return static_cast<char16>(c1); |
| return DecodeTail(c1, ptr, end, options, chunkEndsAtTruncatedSequence); |
| } |
| |
| // Encode ch into a UTF8 sequence ignoring surrogate pairs (which are encoded as two |
| // separate code points). Use Encode() instead of EncodeFull() directly because it |
| // special cases ASCII to avoid a call the most common characters. |
| LPUTF8 EncodeFull(char16 ch, __out_ecount(3) LPUTF8 ptr); |
| |
| // Encode a surrogate pair into a utf8 sequence |
| LPUTF8 EncodeSurrogatePair(char16 surrogateHigh, char16 surrogateLow, __out_ecount(4) LPUTF8 ptr); |
| |
| // Encode ch into a UTF8 sequence ignoring surrogate pairs (which are encoded as two |
| // separate code points). |
| inline LPUTF8 Encode(char16 ch, __out_ecount(3) LPUTF8 ptr) |
| { |
| if (ch < 0x80) |
| { |
| *ptr = static_cast<utf8char_t>(ch); |
| return ptr + 1; |
| } |
| return EncodeFull(ch, ptr); |
| } |
| |
| // Encode ch into a UTF8 sequence while being aware of surrogate pairs. |
| inline LPUTF8 EncodeTrueUtf8(char16 ch, const char16** source, charcount_t* cch, __out_ecount((*cch + 1) * 3) LPUTF8 ptr) |
| { |
| if (ch < 0x80) |
| { |
| *ptr = static_cast<utf8char_t>(ch); |
| return ptr + 1; |
| } |
| else if (ch < 0xD800 || (ch >= 0xE000 && ch <= 0xFFFF)) |
| { |
| return EncodeFull(ch, ptr); |
| } |
| |
| // We're now decoding a surrogate pair. If the input is malformed (eg. low surrogate is absent) |
| // we'll instead encode the unicode replacement character as utf8 |
| if (*cch > 0) |
| { |
| char16 surrogateHigh = ch; |
| char16 surrogateLow = **source; |
| |
| // Validate that the surrogate code units are within the appropriate |
| // ranges for high and low surrogates |
| if ((surrogateHigh >= 0xD800 && surrogateHigh <= 0xDBFF) && |
| (surrogateLow >= 0xDC00 && surrogateLow <= 0xDFFF)) |
| { |
| LPUTF8 retptr = EncodeSurrogatePair(surrogateHigh, surrogateLow, ptr); |
| |
| // SAL analysis gets confused if we call EncodeSurrogatePair after |
| // modifying cch |
| |
| // Consume the low surrogate |
| *source = *source + 1; |
| *cch = *cch - 1; |
| |
| return retptr; |
| } |
| } |
| |
| // Invalid input: insert the unicode replacement character instead |
| ptr[0] = 0xEF; |
| ptr[1] = 0xBF; |
| ptr[2] = 0xBD; |
| return ptr + 3; |
| } |
| |
| // Return true if ch is a lead byte of a UTF8 multi-unit sequence. |
| inline bool IsLeadByte(utf8char_t ch) |
| { |
| return ch >= 0xC0; |
| } |
| |
| // Return true if ch is a byte that starts a well-formed UTF8 sequence (i.e. is an ASCII character or a valid UTF8 lead byte) |
| inline bool IsStartByte(utf8char_t ch) |
| { |
| return ch < 0x80 || ch >= 0xC0; |
| } |
| |
| // Returns true if ch is a UTF8 multi-unit sequence trail byte. |
| inline bool IsTrailByte(utf8char_t ch) |
| { |
| return (ch & 0xC0) == 0x80; |
| } |
| |
| // Returns true if ptr points to a well-formed UTF8 |
| inline bool IsCharStart(LPCUTF8 ptr) |
| { |
| return IsStartByte(*ptr); |
| } |
| |
| // Return the start of the next well-formed UTF-8 sequence. Use NextChar() instead of |
| // NextCharFull() since NextChar() avoid a call if ptr references a single byte sequence. |
| LPCUTF8 NextCharFull(LPCUTF8 ptr); |
| |
| // Return the start of the next well-formed UTF-8 sequence. |
| inline LPCUTF8 NextChar(LPCUTF8 ptr) |
| { |
| if (*ptr < 0x80) return ptr + 1; |
| return NextCharFull(ptr); |
| } |
| |
| // Return the start of the previous well-formed UTF-8 sequence prior to start or start if |
| // if ptr is already start or no well-formed sequence starts a start. Use PrevChar() instead of |
| // PrevCharFull() since PrevChar() avoids a call if the previous sequence is a single byte |
| // sequence. |
| LPCUTF8 PrevCharFull(LPCUTF8 ptr, LPCUTF8 start); |
| |
| // Return the start of the previous well-formed UTF-8 sequence prior to start or start if |
| // if ptr is already start or no well-formed sequence starts a start. |
| inline LPCUTF8 PrevChar(LPCUTF8 ptr, LPCUTF8 start) |
| { |
| if (ptr > start && *(ptr - 1) < 0x80) return ptr - 1; |
| return PrevCharFull(ptr, start); |
| } |
| |
| // Decode cb bytes from ptr to into buffer returning the number of characters converted and written to buffer |
| _Ret_range_(0, pbEnd - _Old_(pbUtf8)) |
| size_t DecodeUnitsInto(_Out_writes_(pbEnd - pbUtf8) char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault, bool *chunkEndsAtTruncatedSequence = nullptr); |
| |
| // Decode cb bytes from ptr to into buffer returning the number of characters converted and written to buffer (excluding the null terminator) |
| size_t DecodeUnitsIntoAndNullTerminate(__out_ecount(pbEnd - pbUtf8 + 1) __nullterminated char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault, bool *chunkEndsAtTruncatedSequence = nullptr); |
| |
| size_t DecodeUnitsIntoAndNullTerminateNoAdvance(__out_ecount(pbEnd - pbUtf8 + 1) __nullterminated char16 *buffer, LPCUTF8 pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault, bool *chunkEndsAtTruncatedSequence = nullptr); |
| |
| // Encode a UTF-8 sequence into a UTF-8 sequence (which is just a memcpy). This is included for convenience in templates |
| // when the character encoding is a template parameter. |
| __range(cch, cch) |
| inline size_t EncodeInto(__out_ecount(cch) utf8char_t *buffer, const utf8char_t *source, size_t cch) |
| { |
| memcpy_s(buffer, cch * sizeof(utf8char_t), source, cch * sizeof(utf8char_t)); |
| return cch; |
| } |
| |
| // Encode a UTF16-LE sequence of cch words into a UTF-8 sequence returning the number of bytes needed. |
| // Since a UTF16 encoding can take up to 3 bytes buffer must refer to a buffer at least 3 times larger |
| // than cch. |
| // Returns the number of bytes copied into the buffer. |
| __range(0, cch * 3) |
| size_t EncodeInto(__out_ecount(cch * 3) LPUTF8 buffer, __in_ecount(cch) const char16 *source, charcount_t cch); |
| |
| // Like EncodeInto but ensures that buffer[return value] == 0. |
| __range(0, cch * 3) |
| size_t EncodeIntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch); |
| |
| // Like EncodeInto but ensures that buffer[return value] == 0. |
| __range(0, cch * 3) |
| size_t EncodeTrueUtf8IntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch); |
| |
| // Returns true if the pch refers to a UTF-16LE encoding of the given UTF-8 encoding bch. |
| bool CharsAreEqual(LPCOLESTR pch, LPCUTF8 bch, LPCUTF8 end, DecodeOptions options = doDefault); |
| |
| // Convert the character index into a byte index. |
| size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, const charcount_t cchIndex, size_t cbStartIndex, charcount_t cchStartIndex, DecodeOptions options = doDefault); |
| size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, const charcount_t cchIndex, DecodeOptions options = doDefault); |
| |
| // Convert byte index into character index |
| charcount_t ByteIndexIntoCharacterIndex(__in_ecount(cbIndex) LPCUTF8 pch, size_t cbIndex, DecodeOptions options = doDefault); |
| } |