| // |
| // Copyright (c) Microsoft. All rights reserved. |
| // Licensed under the MIT license. See LICENSE file in the project root for full license information. |
| // |
| |
| /*++ |
| |
| |
| |
| Module Name: |
| |
| unicode/utf8.c |
| |
| Abstract: |
| Functions to encode and decode UTF-8 strings |
| |
| Revision History: |
| |
| |
| |
| --*/ |
| |
| #include "pal/utf8.h" |
| #include "pal/dbgmsg.h" |
| #include "pal/unicode_data.h" |
| |
| // |
| // Constant Declarations. |
| // |
| |
| #define ASCII 0x007f |
| |
| #define UTF8_2_MAX 0x07ff // max UTF8 2-byte sequence (32 * 64 = 2048) |
| #define UTF8_1ST_OF_2 0xc0 // 110x xxxx |
| #define UTF8_1ST_OF_3 0xe0 // 1110 xxxx |
| #define UTF8_1ST_OF_4 0xf0 // 1111 xxxx |
| #define UTF8_TRAIL 0x80 // 10xx xxxx |
| |
| #define HIGHER_6_BIT(u) ((u) >> 12) |
| #define MIDDLE_6_BIT(u) (((u) & 0x0fc0) >> 6) |
| #define LOWER_6_BIT(u) ((u) & 0x003f) |
| |
| #define BIT7(a) ((a) & 0x80) |
| #define BIT6(a) ((a) & 0x40) |
| |
| #define HIGH_SURROGATE_START 0xd800 |
| #define HIGH_SURROGATE_END 0xdbff |
| #define LOW_SURROGATE_START 0xdc00 |
| #define LOW_SURROGATE_END 0xdfff |
| |
| |
| //////////////////////////////////////////////////////////////////////////// |
| // |
| // UTF8ToUnicode |
| // |
| // Maps a UTF-8 character string to its wide character string counterpart. |
| // |
| //////////////////////////////////////////////////////////////////////////// |
| |
| int UTF8ToUnicode( |
| LPCSTR lpSrcStr, |
| int cchSrc, |
| LPWSTR lpDestStr, |
| int cchDest, |
| DWORD dwFlags |
| ) |
| { |
| int nTB = 0; // # trail bytes to follow |
| int cchWC = 0; // # of Unicode code points generated |
| CONST BYTE* pUTF8 = (CONST BYTE*)lpSrcStr; |
| DWORD dwUnicodeChar = 0; // Our character with room for full surrogate char |
| BOOL bSurrogatePair = FALSE; // Indicate we're collecting a surrogate pair |
| BOOL bCheckInvalidBytes = (dwFlags & MB_ERR_INVALID_CHARS); |
| BYTE UTF8; |
| |
| // Note that we can't test destination buffer length here because we may have to |
| // iterate through thousands of broken characters which won't be output, even though |
| // the buffer has no more room. |
| while (cchSrc--) |
| { |
| // |
| // See if there are any trail bytes. |
| // |
| if (BIT7(*pUTF8) == 0) |
| { |
| // |
| // Found ASCII. |
| // |
| if (cchDest) |
| { |
| // In this function always test buffer size before using it |
| if (cchWC >= cchDest) |
| { |
| // Error: Buffer too small, we didn't process this character |
| SetLastError(ERROR_INSUFFICIENT_BUFFER); |
| return (0); |
| } |
| lpDestStr[cchWC] = (WCHAR)*pUTF8; |
| } |
| nTB = bSurrogatePair = 0; |
| cchWC++; |
| } |
| else if (BIT6(*pUTF8) == 0) |
| { |
| // |
| // Found a trail byte. |
| // Note : Ignore the trail byte if there was no lead byte. |
| // |
| if (nTB != 0) |
| { |
| // |
| // Decrement the trail byte counter. |
| // |
| nTB--; |
| |
| // Add room for trail byte and add the trail byte falue |
| dwUnicodeChar <<= 6; |
| dwUnicodeChar |= LOWER_6_BIT(*pUTF8); |
| |
| // If we're done then we may need to store the data |
| if (nTB == 0) |
| { |
| if (bSurrogatePair) |
| { |
| if (cchDest) |
| { |
| if ((cchWC + 1) >= cchDest) |
| { |
| // Error: Buffer too small, we didn't process this character |
| SetLastError(ERROR_INSUFFICIENT_BUFFER); |
| return (0); |
| } |
| |
| lpDestStr[cchWC] = (WCHAR) |
| (((dwUnicodeChar - 0x10000) >> 10) + HIGH_SURROGATE_START); |
| |
| lpDestStr[cchWC+1] = (WCHAR) |
| ((dwUnicodeChar - 0x10000)%0x400 + LOW_SURROGATE_START); |
| } |
| |
| // |
| // End of sequence. Advance the output counter, turn off surrogateness |
| // |
| cchWC += 2; |
| bSurrogatePair = FALSE; |
| } |
| else |
| { |
| if (cchDest) |
| { |
| |
| if (cchWC >= cchDest) |
| { |
| // Error: Buffer too small, we didn't process this character |
| SetLastError(ERROR_INSUFFICIENT_BUFFER); |
| return (0); |
| } |
| |
| lpDestStr[cchWC] = (WCHAR)dwUnicodeChar; |
| } |
| |
| // |
| // End of sequence. Advance the output counter. |
| // |
| cchWC++; |
| } |
| |
| } |
| |
| } |
| else |
| { |
| if (bCheckInvalidBytes) |
| { |
| SetLastError(ERROR_NO_UNICODE_TRANSLATION); |
| return (0); |
| } |
| |
| // error - not expecting a trail byte. That is, there is a trailing byte without leading byte. |
| bSurrogatePair = FALSE; |
| } |
| } |
| else |
| { |
| // |
| // Found a lead byte. |
| // |
| if (nTB > 0) |
| { |
| // error - A leading byte before the previous sequence is completed. |
| if (bCheckInvalidBytes) |
| { |
| SetLastError(ERROR_NO_UNICODE_TRANSLATION); |
| return (0); |
| } |
| // |
| // Error - previous sequence not finished. |
| // |
| nTB = 0; |
| bSurrogatePair = FALSE; |
| // Put this character back so that we can start over another sequence. |
| cchSrc++; |
| pUTF8--; |
| } |
| else |
| { |
| // |
| // Calculate the number of bytes to follow. |
| // Look for the first 0 from left to right. |
| // |
| UTF8 = *pUTF8; |
| while (BIT7(UTF8) != 0) |
| { |
| UTF8 <<= 1; |
| nTB++; |
| } |
| |
| // Recover the data from the byte |
| UTF8 >>= nTB; |
| |
| // |
| // Check for non-shortest form. |
| // |
| switch (nTB) |
| { |
| case 1: |
| nTB = 0; |
| break; |
| case 2: |
| // Make sure that bit 8 ~ bit 11 is not all zero. |
| // 110XXXXx 10xxxxxx |
| if ((*pUTF8 & 0x1e) == 0) |
| { |
| nTB = 0; |
| } |
| break; |
| case 3: |
| // Look ahead to check for non-shortest form. |
| // 1110XXXX 10Xxxxxx 10xxxxxx |
| if (cchSrc >= 2) |
| { |
| if (((*pUTF8 & 0x0f) == 0) && (*(pUTF8 + 1) & 0x20) == 0) |
| { |
| nTB = 0; |
| } |
| } |
| break; |
| case 4: |
| // |
| // This is a surrogate unicode pair |
| // |
| if (cchSrc >= 3) |
| { |
| WORD word = (((WORD)*pUTF8) << 8) | *(pUTF8 + 1); |
| // Look ahead to check for non-shortest form. |
| // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx |
| // Check if the 5 X bits are all zero. |
| // 0x0730 == 00000111 00110000 |
| if ( (word & 0x0730) == 0 || |
| // If the 21st bit is 1, we have extra work |
| ( (word & 0x0400) == 0x0400 && |
| // The 21st bit is 1. |
| // Make sure that the resulting Unicode is within the valid surrogate range. |
| // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point range |
| // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF. |
| // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20 |
| // bit are all zero. |
| // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx, |
| // XXXXX can only be 10000. |
| // 0x0330 = 0000 0011 0011 0000 |
| (word & 0x0330) != 0 ) ) |
| { |
| // Not shortest form |
| nTB = 0; |
| } |
| else |
| { |
| // A real surrogate pair |
| bSurrogatePair = TRUE; |
| } |
| } |
| break; |
| default: |
| // |
| // If the bits is greater than 4, this is an invalid |
| // UTF8 lead byte. |
| // |
| nTB = 0; |
| break; |
| } |
| |
| if (nTB != 0) |
| { |
| // |
| // Store the value from the first byte and decrement |
| // the number of bytes to follow. |
| // |
| dwUnicodeChar = UTF8; |
| nTB--; |
| } else |
| { |
| if (bCheckInvalidBytes) |
| { |
| SetLastError(ERROR_NO_UNICODE_TRANSLATION); |
| return (0); |
| } |
| } |
| } |
| } |
| pUTF8++; |
| } |
| |
| if ((bCheckInvalidBytes && nTB != 0) || (cchWC == 0)) |
| { |
| // About (cchWC == 0): |
| // Because we now throw away non-shortest form, it is possible that we generate 0 chars. |
| // In this case, we have to set error to ERROR_NO_UNICODE_TRANSLATION so that we conform |
| // to the spec of MultiByteToWideChar. |
| SetLastError(ERROR_NO_UNICODE_TRANSLATION); |
| return (0); |
| } |
| |
| // |
| // Return the number of Unicode characters written. |
| // |
| return (cchWC); |
| } |
| |
| //////////////////////////////////////////////////////////////////////////// |
| // |
| // UnicodeToUTF8 |
| // |
| // Maps a Unicode character string to its UTF-8 string counterpart. |
| // |
| //////////////////////////////////////////////////////////////////////////// |
| |
| int UnicodeToUTF8( |
| LPCWSTR lpSrcStr, |
| int cchSrc, |
| LPSTR lpDestStr, |
| int cchDest) |
| { |
| LPCWSTR lpWC = lpSrcStr; |
| int cchU8 = 0; // # of UTF8 chars generated |
| DWORD dwSurrogateChar; |
| WCHAR wchHighSurrogate = 0; |
| BOOL bHandled; |
| |
| |
| while ((cchSrc--) && ((cchDest == 0) || (cchU8 < cchDest))) |
| { |
| bHandled = FALSE; |
| |
| // |
| // Check if high surrogate is available |
| // |
| if ((*lpWC >= HIGH_SURROGATE_START) && (*lpWC <= HIGH_SURROGATE_END)) |
| { |
| if (cchDest) |
| { |
| // Another high surrogate, then treat the 1st as normal |
| // Unicode character. |
| if (wchHighSurrogate) |
| { |
| if ((cchU8 + 2) < cchDest) |
| { |
| lpDestStr[cchU8++] = UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate); |
| lpDestStr[cchU8++] = UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate); |
| lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate); |
| } |
| else |
| { |
| // not enough buffer |
| cchSrc++; |
| break; |
| } |
| } |
| } |
| else |
| { |
| cchU8 += 3; |
| } |
| wchHighSurrogate = *lpWC; |
| bHandled = TRUE; |
| } |
| |
| if (!bHandled && wchHighSurrogate) |
| { |
| if ((*lpWC >= LOW_SURROGATE_START) && (*lpWC <= LOW_SURROGATE_END)) |
| { |
| // wheee, valid surrogate pairs |
| |
| if (cchDest) |
| { |
| if ((cchU8 + 3) < cchDest) |
| { |
| dwSurrogateChar = (((wchHighSurrogate-0xD800) << 10) + (*lpWC - 0xDC00) + 0x10000); |
| |
| lpDestStr[cchU8++] = (UTF8_1ST_OF_4 | |
| (unsigned char)(dwSurrogateChar >> 18)); // 3 bits from 1st byte |
| |
| lpDestStr[cchU8++] = (UTF8_TRAIL | |
| (unsigned char)((dwSurrogateChar >> 12) & 0x3f)); // 6 bits from 2nd byte |
| |
| lpDestStr[cchU8++] = (UTF8_TRAIL | |
| (unsigned char)((dwSurrogateChar >> 6) & 0x3f)); // 6 bits from 3rd byte |
| |
| lpDestStr[cchU8++] = (UTF8_TRAIL | |
| (unsigned char)(0x3f & dwSurrogateChar)); // 6 bits from 4th byte |
| } |
| else |
| { |
| // not enough buffer |
| cchSrc++; |
| break; |
| } |
| } |
| else |
| { |
| // we already counted 3 previously (in high surrogate) |
| cchU8 ++; |
| } |
| |
| bHandled = TRUE; |
| } |
| else |
| { |
| // Bad Surrogate pair : ERROR |
| // Just process wchHighSurrogate , and the code below will |
| // process the current code point |
| if (cchDest) |
| { |
| if ((cchU8 + 2) < cchDest) |
| { |
| lpDestStr[cchU8++] = UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate); |
| lpDestStr[cchU8++] = UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate); |
| lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate); |
| } |
| else |
| { |
| // not enough buffer |
| cchSrc++; |
| break; |
| } |
| } |
| } |
| |
| wchHighSurrogate = 0; |
| } |
| |
| if (!bHandled) |
| { |
| if (*lpWC <= ASCII) |
| { |
| // |
| // Found ASCII. |
| // |
| if (cchDest) |
| { |
| if (cchU8 < cchDest) |
| { |
| lpDestStr[cchU8] = (char)*lpWC; |
| } |
| else |
| { |
| // |
| // Error - buffer too small. |
| // |
| cchSrc++; |
| break; |
| } |
| } |
| cchU8++; |
| } |
| else if (*lpWC <= UTF8_2_MAX) |
| { |
| // |
| // Found 2 byte sequence if < 0x07ff (11 bits). |
| // |
| if (cchDest) |
| { |
| if ((cchU8 + 1) < cchDest) |
| { |
| // |
| // Use upper 5 bits in first byte. |
| // Use lower 6 bits in second byte. |
| // |
| lpDestStr[cchU8++] = UTF8_1ST_OF_2 | (*lpWC >> 6); |
| lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(*lpWC); |
| } |
| else |
| { |
| // |
| // Error - buffer too small. |
| // |
| cchSrc++; |
| break; |
| } |
| } |
| else |
| { |
| cchU8 += 2; |
| } |
| } |
| else |
| { |
| // |
| // Found 3 byte sequence. |
| // |
| if (cchDest) |
| { |
| if ((cchU8 + 2) < cchDest) |
| { |
| // |
| // Use upper 4 bits in first byte. |
| // Use middle 6 bits in second byte. |
| // Use lower 6 bits in third byte. |
| // |
| lpDestStr[cchU8++] = UTF8_1ST_OF_3 | HIGHER_6_BIT(*lpWC); |
| lpDestStr[cchU8++] = UTF8_TRAIL | MIDDLE_6_BIT(*lpWC); |
| lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(*lpWC); |
| } |
| else |
| { |
| // |
| // Error - buffer too small. |
| // |
| cchSrc++; |
| break; |
| } |
| } |
| else |
| { |
| cchU8 += 3; |
| } |
| } |
| } |
| |
| lpWC++; |
| } |
| |
| // |
| // If the last character was a high surrogate, then handle it as a normal |
| // unicode character. |
| // |
| if ((cchSrc < 0) && (wchHighSurrogate != 0)) |
| { |
| if (cchDest) |
| { |
| if ((cchU8 + 2) < cchDest) |
| { |
| lpDestStr[cchU8++] = UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate); |
| lpDestStr[cchU8++] = UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate); |
| lpDestStr[cchU8++] = UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate); |
| } |
| else |
| { |
| cchSrc++; |
| } |
| } |
| } |
| |
| // |
| // Make sure the destination buffer was large enough. |
| // |
| if (cchDest && (cchSrc >= 0)) |
| { |
| SetLastError(ERROR_INSUFFICIENT_BUFFER); |
| return (0); |
| } |
| |
| // |
| // Return the number of UTF-8 characters written. |
| // |
| return (cchU8); |
| } |