| //------------------------------------------------------------------------------------------------------- |
| // Copyright (C) Microsoft. All rights reserved. |
| // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information. |
| //------------------------------------------------------------------------------------------------------- |
| #include "Utf8Codex.h" |
| |
| #ifndef _WIN32 |
| #undef _Analysis_assume_ |
| #define _Analysis_assume_(expr) |
| #endif |
| |
| #ifdef _MSC_VER |
| //============================= |
| // Disabled Warnings |
| //============================= |
| |
| #pragma warning(push) |
| |
| #pragma warning(disable: 4127) // constant expression for template parameter |
| #endif |
| |
| extern void CodexAssert(bool condition); |
| |
| namespace utf8 |
| { |
| const unsigned int mAlignmentMask = 0x3; |
| |
| inline bool IsAligned(LPCUTF8 pch) |
| { |
| return (reinterpret_cast<size_t>(pch) & mAlignmentMask) == 0; |
| } |
| |
| inline bool IsAligned(LPCOLESTR pch) |
| { |
| return (reinterpret_cast<size_t>(pch) & mAlignmentMask) == 0; |
| } |
| |
| inline bool ShouldFastPath(LPCUTF8 pb, LPCOLESTR pch) |
| { |
| return (reinterpret_cast<size_t>(pb) & mAlignmentMask) == 0 && (reinterpret_cast<size_t>(pch) & mAlignmentMask) == 0; |
| } |
| |
| inline size_t EncodedBytes(char16 prefix) |
| { |
| CodexAssert(0 == (prefix & 0xFF00)); // prefix must really be a byte. We use char16 for as a convenience for the API. |
| |
| // The number of bytes in an UTF8 encoding is determined by the 4 high-order bits of the first byte. |
| // 0xxx -> 1 |
| // 10xx -> 1 (invalid) |
| // 110x -> 2 |
| // 1110 -> 3 |
| // 1111 -> 4 |
| |
| // If this value is XOR with 0xF0 and shift 3 bits to the right it can be used as an |
| // index into a 16 element 2 bit array encoded as a uint32 of n - 1 where n is the number |
| // of bits in the encoding. |
| |
| // The XOR prefix bits mapped to n - 1. |
| // 1xxx -> 00 (8 - 15) |
| // 01xx -> 00 (4 - 7) |
| // 001x -> 01 (2 - 3) |
| // 0001 -> 10 (1) |
| // 0000 -> 11 (0) |
| |
| // This produces the following bit sequence: |
| // 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 |
| // 00 00 00 00 00 00 00 00 00 00 00 00 01 01 10 11 |
| // which is 0x5B |
| |
| return ((0x5B >> (((prefix ^ 0xF0) >> 3) & 0x1E)) & 0x03) + 1; |
| } |
| |
| const char16 g_chUnknown = char16(UNICODE_UNKNOWN_CHAR_MARK); |
| const char16 WCH_UTF16_HIGH_FIRST = char16(0xd800); |
| const char16 WCH_UTF16_HIGH_LAST = char16(0xdbff); |
| const char16 WCH_UTF16_LOW_FIRST = char16(0xdc00); |
| const char16 WCH_UTF16_LOW_LAST = char16(0xdfff); |
| |
| inline BOOL InRange(const char16 ch, const char16 chMin, const char16 chMax) |
| { |
| return (unsigned)(ch - chMin) <= (unsigned)(chMax - chMin); |
| } |
| |
| inline BOOL IsValidWideChar(const char16 ch) |
| { |
| return (ch < 0xfdd0) || ((ch > 0xfdef) && (ch <= 0xffef)) || ((ch >= 0xfff9) && (ch <= 0xfffd)); |
| } |
| |
| inline BOOL IsHighSurrogateChar(char16 ch) |
| { |
| return InRange( ch, WCH_UTF16_HIGH_FIRST, WCH_UTF16_HIGH_LAST ); |
| } |
| |
| inline BOOL IsLowSurrogateChar(char16 ch) |
| { |
| return InRange( ch, WCH_UTF16_LOW_FIRST, WCH_UTF16_LOW_LAST ); |
| } |
| |
| _At_(ptr, _In_reads_(end - ptr) _Post_satisfies_(ptr >= _Old_(ptr) - 1 && ptr <= end)) |
| inline char16 DecodeTail(char16 c1, LPCUTF8& ptr, LPCUTF8 end, DecodeOptions& options, bool *chunkEndsAtTruncatedSequence) |
| { |
| char16 ch = 0; |
| BYTE c2, c3, c4; |
| |
| switch (EncodedBytes(c1)) |
| { |
| case 1: |
| if (c1 < 0x80) return c1; |
| |
| if ((options & doSecondSurrogatePair) != 0) |
| { |
| // We're in the middle of decoding a surrogate pair from a four-byte utf8 sequence. |
| // The high word has already been returned, but without advancing ptr, which was on byte 1. |
| // ptr was then advanced externally when reading c1, which is byte 1, so ptr is now on byte 2. |
| // byte 1 must have been a continuation byte, hence will be in case 1. |
| ptr--; // back to byte 1 |
| c1 = ptr[-1]; // the original first byte |
| |
| // ptr is now on c2. We must also have c3 and c4, otherwise doSecondSurrogatePair won't set. |
| _Analysis_assume_(ptr + 2 < end); |
| goto LFourByte; |
| } |
| |
| // 10xxxxxx (trail byte appearing in a lead byte position |
| return g_chUnknown; |
| |
| case 2: |
| // Look for an overlong utf-8 sequence. |
| if (ptr >= end) |
| { |
| if ((options & doChunkedEncoding) != 0) |
| { |
| // The is a sequence that spans a chunk, push ptr back to the beginning of the sequence. |
| ptr--; |
| |
| if (chunkEndsAtTruncatedSequence) |
| { |
| *chunkEndsAtTruncatedSequence = true; |
| } |
| } |
| return g_chUnknown; |
| } |
| c2 = *ptr++; |
| // 110XXXXx 10xxxxxx |
| // UTF16 | UTF8 1st byte 2nd byte |
| // U+0080..U+07FF | C2..DF 80..BF |
| if ( |
| InRange(c1, 0xC2, 0xDF) |
| && InRange(c2, 0x80, 0xBF) |
| ) |
| { |
| ch |= WCHAR(c1 & 0x1f) << 6; // 0x0080 - 0x07ff |
| ch |= WCHAR(c2 & 0x3f); |
| if (!IsValidWideChar(ch) && ((options & doAllowInvalidWCHARs) == 0)) |
| ch = g_chUnknown; |
| } |
| else |
| { |
| ptr--; |
| ch = g_chUnknown; |
| } |
| break; |
| |
| case 3: |
| // 1110XXXX 10Xxxxxx 10xxxxxx |
| // Look for overlong utf-8 sequence. |
| if (ptr + 1 >= end) |
| { |
| if ((options & doChunkedEncoding) != 0) |
| { |
| // The is a sequence that spans a chunk, push ptr back to the beginning of the sequence. |
| ptr--; |
| |
| if (chunkEndsAtTruncatedSequence) |
| { |
| *chunkEndsAtTruncatedSequence = true; |
| } |
| } |
| |
| return g_chUnknown; |
| } |
| |
| // UTF16 | UTF8 1st byte 2nd byte 3rd byte |
| // U+0800..U+0FFF | E0 A0..BF 80..BF |
| // U+1000..U+CFFF | E1..EC 80..BF 80..BF |
| // U+D000..U+D7FF | ED 80..9F 80..BF |
| // U+E000..U+FFFF | EE..EF 80..BF 80..BF |
| c2 = ptr[0]; |
| c3 = ptr[1]; |
| if ( |
| // any following be true |
| (c1 == 0xE0 |
| && InRange(c2, 0xA0, 0xBF) |
| && InRange(c3, 0x80, 0xBF)) |
| || |
| (InRange(c1, 0xE1, 0xEC) |
| && InRange(c2, 0x80, 0xBF) |
| && InRange(c3, 0x80, 0xBF)) |
| || |
| (c1 == 0xED |
| && InRange(c2, 0x80, 0x9F) |
| && InRange(c3, 0x80, 0xBF)) |
| || |
| (InRange(c1, 0xEE, 0xEF) |
| && InRange(c2, 0x80, 0xBF) |
| && InRange(c3, 0x80, 0xBF)) |
| || |
| (((options & doAllowThreeByteSurrogates) != 0) |
| && |
| c1 == 0xED |
| && InRange(c2, 0x80, 0xBF) |
| && InRange(c3, 0x80, 0xBF) |
| ) |
| ) |
| { |
| ch = WCHAR(c1 & 0x0f) << 12; // 0x0800 - 0xffff |
| ch |= WCHAR(c2 & 0x3f) << 6; // 0x0080 - 0x07ff |
| ch |= WCHAR(c3 & 0x3f); |
| if (!IsValidWideChar(ch) && ((options & (doAllowThreeByteSurrogates | doAllowInvalidWCHARs)) == 0)) |
| ch = g_chUnknown; |
| ptr += 2; |
| } |
| else |
| { |
| ch = g_chUnknown; |
| // Windows OS 1713952. Only drop the illegal leading byte |
| // Retry next byte. |
| // ptr is already advanced. |
| } |
| break; |
| |
| case 4: |
| LFourByte: |
| // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx or 11111xxx .... |
| // NOTE: 11111xxx is not supported |
| if (ptr + 2 >= end) |
| { |
| if ((options & doChunkedEncoding) != 0) |
| { |
| // The is a sequence that spans a chunk, push ptr back to the beginning of the sequence. |
| ptr--; |
| |
| if (chunkEndsAtTruncatedSequence) |
| { |
| *chunkEndsAtTruncatedSequence = true; |
| } |
| } |
| |
| ch = g_chUnknown; |
| break; |
| } |
| |
| c2 = ptr[0]; |
| c3 = ptr[1]; |
| c4 = ptr[2]; |
| |
| // UTF16 | UTF8 1st byte 2nd byte 3rd byte 4th byte |
| // U+10000..U+3FFFF | F0 90..BF 80..BF 80..BF |
| // U+40000..U+FFFFF | F1..F3 80..BF 80..BF 80..BF |
| // U+100000..U+10FFFF | F4 80..8F 80..BF 80..BF |
| if (! // NOT Unicode well-formed byte sequences |
| ( |
| // any following be true |
| (c1 == 0xF0 |
| && InRange(c2, 0x90,0xBF) |
| && InRange(c3, 0x80,0xBF) |
| && InRange(c4, 0x80,0xBF)) |
| || |
| (InRange(c1, 0xF1, 0xF3) |
| && InRange(c2, 0x80,0xBF) |
| && InRange(c3, 0x80,0xBF) |
| && InRange(c4, 0x80,0xBF)) |
| || |
| (c1 == 0xF4 |
| && InRange(c2, 0x80,0x8F) |
| && InRange(c3, 0x80,0xBF) |
| && InRange(c4, 0x80,0xBF)) |
| ) |
| ) |
| { |
| // Windows OS 1713952. Only drop the illegal leading byte. |
| // Retry next byte. |
| // ptr is already advanced 1. |
| ch = g_chUnknown; |
| break; |
| } |
| |
| if ((options & doSecondSurrogatePair) == 0) |
| { |
| // Decode high 10 bits of utf-8 20 bit char |
| ch = WCHAR(c1 & 0x07) << 2; |
| ch |= WCHAR(c2 & 0x30) >> 4; |
| ch = (ch - 1) << 6; // ch == 0000 00ww ww00 0000 |
| ch |= WCHAR(c2 & 0x0f) << 2; // ch == 0000 00ww wwzz zz00 |
| ch |= WCHAR(c3 & 0x30) >> 4; // ch == 0000 00ww wwzz zzyy |
| // Encode first word of utf-16 surrogate pair |
| ch += 0xD800; |
| // Remember next call must return second word |
| options = (DecodeOptions)(options | doSecondSurrogatePair); |
| // Leave ptr on byte 1, this way: |
| // - callers who test that ptr has been advanced by utf8::Decode will see progress for |
| // both words of the surrogate pair. |
| // - callers who calculate the number of multi-unit chars by subtracting after from before ptr |
| // will accumulate 0 for first word and 2 for second, thus utf8 chars equals 2 utf16 chars + 2 |
| // multi-unit chars, as it should be. |
| } |
| else |
| { |
| // Decode low 10 bits of utf-8 20 bit char |
| ch = WCHAR(c3 & 0x0f) << 6; // ch == 0000 00yy yy00 0000 |
| ch |= WCHAR(c4 & 0x3f); // ch == 0000 00yy yyxx xxxx |
| // Encode second word of utf-16 surrogate pair |
| ch += 0xDC00; |
| // We're done with this char |
| options = (DecodeOptions)(options & ~doSecondSurrogatePair); |
| ptr += 3; // remember, got here by subtracting one from ptr in case 1, so effective increment is 2 |
| } |
| break; |
| } |
| |
| return ch; |
| } |
| |
| LPUTF8 EncodeFull(char16 ch, __out_ecount(3) LPUTF8 ptr) |
| { |
| if( ch < 0x0080 ) |
| { |
| // One byte |
| *ptr++ = static_cast< utf8char_t >(ch); |
| } |
| else if( ch < 0x0800 ) |
| { |
| // Two bytes : 110yyyxx 10xxxxxx |
| *ptr++ = static_cast<utf8char_t>(ch >> 6) | 0xc0; |
| *ptr++ = static_cast<utf8char_t>(ch & 0x3F) | 0x80; |
| } |
| else |
| { |
| // Three bytes : 1110yyyy 10yyyyxx 10xxxxxx |
| *ptr++ = static_cast<utf8char_t>(ch >> 12) | 0xE0; |
| *ptr++ = static_cast<utf8char_t>((ch >> 6) & 0x3F) | 0x80; |
| *ptr++ = static_cast<utf8char_t>(ch & 0x3F) | 0x80; |
| } |
| |
| return ptr; |
| } |
| |
| _Use_decl_annotations_ |
| LPUTF8 EncodeSurrogatePair(char16 surrogateHigh, char16 surrogateLow, LPUTF8 ptr) |
| { |
| // A unicode codepoint is encoded into a surrogate pair by doing the following: |
| // subtract 0x10000 from the codepoint |
| // Split the resulting value into the high-ten bits and low-ten bits |
| // Add 0xD800 to the high ten bits, and 0xDC00 to the low ten bits |
| // Below, we want to decode the surrogate pair to its original codepoint |
| // So we do the above process in reverse |
| uint32 highTen = (surrogateHigh - 0xD800); |
| uint32 lowTen = (surrogateLow - 0xDC00); |
| uint32 codepoint = 0x10000 + ((highTen << 10) | lowTen); |
| |
| // This is the maximum valid unicode codepoint |
| // This should be ensured anyway since you can't encode a value higher |
| // than this as a surrogate pair, so we assert this here |
| CodexAssert(codepoint <= 0x10FFFF); |
| |
| // Now we need to encode the code point into utf-8 |
| // Codepoints in the range that gets encoded into a surrogate pair |
| // gets encoded into 4 bytes under utf8 |
| // Since the codepoint can be represented by 21 bits, the encoding |
| // does the following: first 3 bits in the first byte, the next 6 in the |
| // second, the next six in the third, and the last six in the 4th byte |
| *ptr++ = static_cast<utf8char_t>(codepoint >> 18) | 0xF0; |
| *ptr++ = static_cast<utf8char_t>((codepoint >> 12) & 0x3F) | 0x80; |
| *ptr++ = static_cast<utf8char_t>((codepoint >> 6) & 0x3F) | 0x80; |
| *ptr++ = static_cast<utf8char_t>(codepoint & 0x3F) | 0x80; |
| |
| return ptr; |
| } |
| |
| LPCUTF8 NextCharFull(LPCUTF8 ptr) |
| { |
| return ptr + EncodedBytes(*ptr); |
| } |
| |
| LPCUTF8 PrevCharFull(LPCUTF8 ptr, LPCUTF8 start) |
| { |
| if (ptr > start) |
| { |
| LPCUTF8 current = ptr - 1; |
| while (current > start && (*current & 0xC0) == 0x80) |
| current--; |
| if (NextChar(current) == ptr) |
| return current; |
| |
| // It is not a valid encoding, just go back one character. |
| return ptr - 1; |
| } |
| else |
| return ptr; |
| } |
| |
| _Use_decl_annotations_ |
| size_t DecodeUnitsInto(char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options, bool *chunkEndsAtTruncatedSequence) |
| { |
| DecodeOptions localOptions = options; |
| |
| if (chunkEndsAtTruncatedSequence) |
| { |
| *chunkEndsAtTruncatedSequence = false; |
| } |
| |
| LPCUTF8 p = pbUtf8; |
| char16 *dest = buffer; |
| |
| if (!ShouldFastPath(p, dest)) goto LSlowPath; |
| |
| LFastPath: |
| while (p + 3 < pbEnd) |
| { |
| unsigned bytes = *(unsigned *)p; |
| if ((bytes & 0x80808080) != 0) goto LSlowPath; |
| ((uint32 *)dest)[0] = (char16(bytes) & 0x00FF) | ((char16(bytes) & 0xFF00) << 8); |
| ((uint32 *)dest)[1] = (char16(bytes >> 16) & 0x00FF) | ((char16(bytes >> 16) & 0xFF00) << 8); |
| p += 4; |
| dest += 4; |
| } |
| |
| LSlowPath: |
| while (p < pbEnd) |
| { |
| LPCUTF8 s = p; |
| char16 chDest = Decode(p, pbEnd, localOptions, chunkEndsAtTruncatedSequence); |
| |
| if (s < p) |
| { |
| // We decoded the character, store it |
| *dest++ = chDest; |
| } |
| else |
| { |
| // Nothing was converted. This might happen at the end of a buffer with doChunkedEncoding. |
| break; |
| } |
| |
| if (ShouldFastPath(p, dest)) goto LFastPath; |
| } |
| |
| pbUtf8 = p; |
| |
| return dest - buffer; |
| } |
| |
| _Use_decl_annotations_ |
| size_t DecodeUnitsIntoAndNullTerminate(char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options, bool *chunkEndsAtTruncatedSequence) |
| { |
| size_t result = DecodeUnitsInto(buffer, pbUtf8, pbEnd, options, chunkEndsAtTruncatedSequence); |
| buffer[result] = 0; |
| return result; |
| } |
| |
| _Use_decl_annotations_ |
| size_t DecodeUnitsIntoAndNullTerminateNoAdvance(char16 *buffer, LPCUTF8 pbUtf8, LPCUTF8 pbEnd, DecodeOptions options, bool *chunkEndsAtTruncatedSequence) |
| { |
| return DecodeUnitsIntoAndNullTerminate(buffer, pbUtf8, pbEnd, options, chunkEndsAtTruncatedSequence); |
| } |
| |
| bool CharsAreEqual(LPCOLESTR pch, LPCUTF8 bch, LPCUTF8 end, DecodeOptions options) |
| { |
| DecodeOptions localOptions = options; |
| while (bch < end) |
| { |
| if (*pch++ != utf8::Decode(bch, end, localOptions)) |
| { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| template <bool cesu8Encoding> |
| __range(0, cchIn * 3) |
| size_t EncodeIntoImpl(__out_ecount(cchIn * 3) LPUTF8 buffer, __in_ecount(cchIn) const char16 *source, charcount_t cchIn) |
| { |
| charcount_t cch = cchIn; // SAL analysis gets confused by EncodeTrueUtf8's dest buffer requirement unless we alias cchIn with a local |
| LPUTF8 dest = buffer; |
| |
| if (!ShouldFastPath(dest, source)) goto LSlowPath; |
| |
| LFastPath: |
| while (cch >= 4) |
| { |
| uint32 first = ((const uint32 *)source)[0]; |
| if ( (first & 0xFF80FF80) != 0) goto LSlowPath; |
| uint32 second = ((const uint32 *)source)[1]; |
| if ( (second & 0xFF80FF80) != 0) goto LSlowPath; |
| *(uint32 *)dest = (first & 0x0000007F) | ((first & 0x007F0000) >> 8) | ((second & 0x0000007f) << 16) | ((second & 0x007F0000) << 8); |
| dest += 4; |
| source += 4; |
| cch -= 4; |
| } |
| |
| LSlowPath: |
| if (cesu8Encoding) |
| { |
| while (cch-- > 0) |
| { |
| dest = Encode(*source++, dest); |
| if (ShouldFastPath(dest, source)) goto LFastPath; |
| } |
| } |
| else |
| { |
| while (cch-- > 0) |
| { |
| // We increment the source pointer here since at least one utf16 code unit is read here |
| // If the code unit turns out to be the high surrogate in a surrogate pair, then |
| // EncodeTrueUtf8 will consume the low surrogate code unit too by decrementing cch |
| // and incrementing source |
| dest = EncodeTrueUtf8(*source++, &source, &cch, dest); |
| if (ShouldFastPath(dest, source)) goto LFastPath; |
| } |
| } |
| |
| return dest - buffer; |
| } |
| |
| __range(0, cch * 3) |
| size_t EncodeInto(__out_ecount(cch * 3) LPUTF8 buffer, __in_ecount(cch) const char16 *source, charcount_t cch) |
| { |
| return EncodeIntoImpl<true>(buffer, source, cch); |
| } |
| |
| __range(0, cch * 3) |
| size_t EncodeIntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch) |
| { |
| size_t result = EncodeInto(buffer, source, cch); |
| buffer[result] = 0; |
| return result; |
| } |
| |
| __range(0, cch * 3) |
| size_t EncodeTrueUtf8IntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch) |
| { |
| size_t result = EncodeIntoImpl<false>(buffer, source, cch); |
| buffer[result] = 0; |
| return result; |
| } |
| |
| // Convert the character index into a byte index. |
| size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, charcount_t cchIndex, DecodeOptions options) |
| { |
| return CharacterIndexToByteIndex(pch, cbLength, cchIndex, 0, 0, options); |
| } |
| |
| size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, const charcount_t cchIndex, size_t cbStartIndex, charcount_t cchStartIndex, DecodeOptions options) |
| { |
| DecodeOptions localOptions = options; |
| LPCUTF8 pchCurrent = pch + cbStartIndex; |
| LPCUTF8 pchEnd = pch + cbLength; |
| LPCUTF8 pchEndMinus4 = pch + (cbLength - 4); |
| charcount_t i = cchIndex - cchStartIndex; |
| |
| // Avoid using a reinterpret_cast to start a misaligned read. |
| if (!IsAligned(pchCurrent)) goto LSlowPath; |
| LFastPath: |
| // Skip 4 bytes at a time. |
| while (pchCurrent < pchEndMinus4 && i > 4) |
| { |
| uint32 ch4 = *reinterpret_cast<const uint32 *>(pchCurrent); |
| if ((ch4 & 0x80808080) == 0) |
| { |
| pchCurrent += 4; |
| i -= 4; |
| } |
| else break; |
| } |
| |
| LSlowPath: |
| while (pchCurrent < pchEnd && i > 0) |
| { |
| Decode(pchCurrent, pchEnd, localOptions); |
| i--; |
| |
| // Try to return to the fast path avoiding misaligned reads. |
| if (i > 4 && IsAligned(pchCurrent)) goto LFastPath; |
| } |
| return i > 0 ? cbLength : pchCurrent - pch; |
| } |
| |
| // Convert byte index into character index |
| charcount_t ByteIndexIntoCharacterIndex(__in_ecount(cbIndex) LPCUTF8 pch, size_t cbIndex, DecodeOptions options) |
| { |
| DecodeOptions localOptions = options; |
| LPCUTF8 pchCurrent = pch; |
| LPCUTF8 pchEnd = pch + cbIndex; |
| LPCUTF8 pchEndMinus4 = pch + (cbIndex - 4); |
| charcount_t i = 0; |
| |
| // Avoid using a reinterpret_cast to start a misaligned read. |
| if (!IsAligned(pchCurrent)) goto LSlowPath; |
| |
| LFastPath: |
| // Skip 4 bytes at a time. |
| while (pchCurrent < pchEndMinus4) |
| { |
| uint32 ch4 = *reinterpret_cast<const uint32 *>(pchCurrent); |
| if ((ch4 & 0x80808080) == 0) |
| { |
| pchCurrent += 4; |
| i += 4; |
| } |
| else break; |
| } |
| |
| LSlowPath: |
| while (pchCurrent < pchEnd) |
| { |
| LPCUTF8 s = pchCurrent; |
| Decode(pchCurrent, pchEnd, localOptions); |
| if (s == pchCurrent) break; |
| i++; |
| |
| // Try to return to the fast path avoiding misaligned reads. |
| if (IsAligned(pchCurrent)) goto LFastPath; |
| } |
| |
| return i; |
| } |
| |
| } // namespace utf8 |
| |
| #ifdef _MSC_VER |
| #pragma warning(pop) |
| #endif |