| #include <newlib.h> |
| #include <stdlib.h> |
| #include <locale.h> |
| #include "mbctype.h" |
| #include <wchar.h> |
| #include <string.h> |
| #include <errno.h> |
| #include "local.h" |
| |
| int (*__mbtowc) (struct _reent *, wchar_t *, const char *, size_t, |
| const char *, mbstate_t *) |
| #ifdef __CYGWIN__ |
| = __utf8_mbtowc; |
| #else |
| = __ascii_mbtowc; |
| #endif |
| |
| int |
| _DEFUN (_mbtowc_r, (r, pwc, s, n, state), |
| struct _reent *r _AND |
| wchar_t *pwc _AND |
| const char *s _AND |
| size_t n _AND |
| mbstate_t *state) |
| { |
| return __mbtowc (r, pwc, s, n, __locale_charset (), state); |
| } |
| |
| int |
| _DEFUN (__ascii_mbtowc, (r, pwc, s, n, charset, state), |
| struct _reent *r _AND |
| wchar_t *pwc _AND |
| const char *s _AND |
| size_t n _AND |
| const char *charset _AND |
| mbstate_t *state) |
| { |
| wchar_t dummy; |
| unsigned char *t = (unsigned char *)s; |
| |
| if (pwc == NULL) |
| pwc = &dummy; |
| |
| if (s == NULL) |
| return 0; |
| |
| if (n == 0) |
| return -2; |
| |
| *pwc = (wchar_t)*t; |
| |
| if (*t == '\0') |
| return 0; |
| |
| return 1; |
| } |
| |
| #ifdef _MB_CAPABLE |
/* Class of a single input byte; selects the column of the JIS tables.  */
typedef enum
{
  ESCAPE,     /* ESC_CHAR */
  DOLLAR,     /* '$' */
  BRACKET,    /* '(' */
  AT,         /* '@' */
  B,          /* 'B' */
  J,          /* 'J' */
  NUL,        /* '\0' */
  JIS_CHAR,   /* any other byte for which _isjis () is true */
  OTHER,      /* everything else */
  JIS_C_NUM   /* number of classes (table column count) */
} JIS_CHAR_TYPE;

/* Decoder state; selects the row of the JIS tables.  */
typedef enum
{
  ASCII,      /* single-byte mode */
  JIS,        /* double-byte mode, between characters */
  A_ESC,      /* ASCII mode, ESC seen */
  A_ESC_DL,   /* ASCII mode, ESC '$' seen */
  JIS_1,      /* double-byte mode, first byte of a character seen */
  J_ESC,      /* double-byte mode, ESC seen */
  J_ESC_BR,   /* double-byte mode, ESC '(' seen */
  INV,        /* invalid sequence */
  JIS_S_NUM   /* number of states (table row count) */
} JIS_STATE;

/* What the decoder does for a given (state, byte class) pair.  */
typedef enum
{
  COPY_A,     /* deliver a single-byte character */
  COPY_J1,    /* remember the first byte of a double-byte character */
  COPY_J2,    /* combine the remembered byte with this one and deliver */
  MAKE_A,     /* escape back to ASCII finished; skip past it */
  NOOP,       /* nothing to do for this byte */
  EMPTY,      /* NUL terminator reached */
  ERROR       /* invalid input */
} JIS_ACTION;
| |
/**************************************************************************************
 * State/action tables for processing the JIS encoding.
 * Where possible, switches to JIS are grouped with the JIS characters that follow
 * them, and switches to ASCII are grouped with the preceding JIS characters. Thus,
 * the maximum returned length is
 * 2 (switch to JIS) + 2 (JIS characters) + 2 (switch back to ASCII) = 6.
 * NOTE(review): the escape sequences these tables recognize (ESC $ @ / ESC $ B and
 * ESC ( B / ESC ( J) are three bytes long, so the 2 + 2 + 2 figure above should be
 * re-checked.
 *************************************************************************************/
| |
| static JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = { |
| /* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTHER */ |
| /* ASCII */ { A_ESC, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII }, |
| /* JIS */ { J_ESC, JIS_1, JIS_1, JIS_1, JIS_1, JIS_1, INV, JIS_1, INV }, |
| /* A_ESC */ { ASCII, A_ESC_DL, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII }, |
| /* A_ESC_DL */{ ASCII, ASCII, ASCII, JIS, JIS, ASCII, ASCII, ASCII, ASCII }, |
| /* JIS_1 */ { INV, JIS, JIS, JIS, JIS, JIS, INV, JIS, INV }, |
| /* J_ESC */ { INV, INV, J_ESC_BR, INV, INV, INV, INV, INV, INV }, |
| /* J_ESC_BR */{ INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV }, |
| }; |
| |
| static JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = { |
| /* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTHER */ |
| /* ASCII */ { NOOP, COPY_A, COPY_A, COPY_A, COPY_A, COPY_A, EMPTY, COPY_A, COPY_A}, |
| /* JIS */ { NOOP, COPY_J1, COPY_J1, COPY_J1, COPY_J1, COPY_J1, ERROR, COPY_J1, ERROR }, |
| /* A_ESC */ { COPY_A, NOOP, COPY_A, COPY_A, COPY_A, COPY_A, COPY_A, COPY_A, COPY_A}, |
| /* A_ESC_DL */{ COPY_A, COPY_A, COPY_A, NOOP, NOOP, COPY_A, COPY_A, COPY_A, COPY_A}, |
| /* JIS_1 */ { ERROR, COPY_J2, COPY_J2, COPY_J2, COPY_J2, COPY_J2, ERROR, COPY_J2, ERROR }, |
| /* J_ESC */ { ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR }, |
| /* J_ESC_BR */{ ERROR, ERROR, ERROR, ERROR, MAKE_A, MAKE_A, ERROR, ERROR, ERROR }, |
| }; |
| |
/* We reuse the mbstate_t __count field for the more complex encodings and use it to store a state value. */
| #define __state __count |
| |
| #ifdef _MB_EXTENDED_CHARSETS_ISO |
| int |
| _DEFUN (__iso_mbtowc, (r, pwc, s, n, charset, state), |
| struct _reent *r _AND |
| wchar_t *pwc _AND |
| const char *s _AND |
| size_t n _AND |
| const char *charset _AND |
| mbstate_t *state) |
| { |
| wchar_t dummy; |
| unsigned char *t = (unsigned char *)s; |
| |
| if (pwc == NULL) |
| pwc = &dummy; |
| |
| if (s == NULL) |
| return 0; |
| |
| if (n == 0) |
| return -2; |
| |
| if (*t >= 0xa0) |
| { |
| int iso_idx = __iso_8859_index (charset + 9); |
| if (iso_idx >= 0) |
| { |
| *pwc = __iso_8859_conv[iso_idx][*t - 0xa0]; |
| if (*pwc == 0) /* Invalid character */ |
| { |
| r->_errno = EILSEQ; |
| return -1; |
| } |
| return 1; |
| } |
| } |
| |
| *pwc = (wchar_t) *t; |
| |
| if (*t == '\0') |
| return 0; |
| |
| return 1; |
| } |
| #endif /* _MB_EXTENDED_CHARSETS_ISO */ |
| |
| #ifdef _MB_EXTENDED_CHARSETS_WINDOWS |
| int |
| _DEFUN (__cp_mbtowc, (r, pwc, s, n, charset, state), |
| struct _reent *r _AND |
| wchar_t *pwc _AND |
| const char *s _AND |
| size_t n _AND |
| const char *charset _AND |
| mbstate_t *state) |
| { |
| wchar_t dummy; |
| unsigned char *t = (unsigned char *)s; |
| |
| if (pwc == NULL) |
| pwc = &dummy; |
| |
| if (s == NULL) |
| return 0; |
| |
| if (n == 0) |
| return -2; |
| |
| if (*t >= 0x80) |
| { |
| int cp_idx = __cp_index (charset + 2); |
| if (cp_idx >= 0) |
| { |
| *pwc = __cp_conv[cp_idx][*t - 0x80]; |
| if (*pwc == 0) /* Invalid character */ |
| { |
| r->_errno = EILSEQ; |
| return -1; |
| } |
| return 1; |
| } |
| } |
| |
| *pwc = (wchar_t)*t; |
| |
| if (*t == '\0') |
| return 0; |
| |
| return 1; |
| } |
| #endif /* _MB_EXTENDED_CHARSETS_WINDOWS */ |
| |
| int |
| _DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state), |
| struct _reent *r _AND |
| wchar_t *pwc _AND |
| const char *s _AND |
| size_t n _AND |
| const char *charset _AND |
| mbstate_t *state) |
| { |
| wchar_t dummy; |
| unsigned char *t = (unsigned char *)s; |
| int ch; |
| int i = 0; |
| |
| if (pwc == NULL) |
| pwc = &dummy; |
| |
| if (s == NULL) |
| return 0; |
| |
| if (n == 0) |
| return -2; |
| |
| if (state->__count == 0) |
| ch = t[i++]; |
| else |
| ch = state->__value.__wchb[0]; |
| |
| if (ch == '\0') |
| { |
| *pwc = 0; |
| state->__count = 0; |
| return 0; /* s points to the null character */ |
| } |
| |
| if (ch <= 0x7f) |
| { |
| /* single-byte sequence */ |
| state->__count = 0; |
| *pwc = ch; |
| return 1; |
| } |
| if (ch >= 0xc0 && ch <= 0xdf) |
| { |
| /* two-byte sequence */ |
| state->__value.__wchb[0] = ch; |
| if (state->__count == 0) |
| state->__count = 1; |
| else if (n < (size_t)-1) |
| ++n; |
| if (n < 2) |
| return -2; |
| ch = t[i++]; |
| if (ch < 0x80 || ch > 0xbf) |
| { |
| r->_errno = EILSEQ; |
| return -1; |
| } |
| if (state->__value.__wchb[0] < 0xc2) |
| { |
| /* overlong UTF-8 sequence */ |
| r->_errno = EILSEQ; |
| return -1; |
| } |
| state->__count = 0; |
| *pwc = (wchar_t)((state->__value.__wchb[0] & 0x1f) << 6) |
| | (wchar_t)(ch & 0x3f); |
| return i; |
| } |
| if (ch >= 0xe0 && ch <= 0xef) |
| { |
| /* three-byte sequence */ |
| wchar_t tmp; |
| state->__value.__wchb[0] = ch; |
| if (state->__count == 0) |
| state->__count = 1; |
| else if (n < (size_t)-1) |
| ++n; |
| if (n < 2) |
| return -2; |
| ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1]; |
| if (state->__value.__wchb[0] == 0xe0 && ch < 0xa0) |
| { |
| /* overlong UTF-8 sequence */ |
| r->_errno = EILSEQ; |
| return -1; |
| } |
| if (ch < 0x80 || ch > 0xbf) |
| { |
| r->_errno = EILSEQ; |
| return -1; |
| } |
| state->__value.__wchb[1] = ch; |
| if (state->__count == 1) |
| state->__count = 2; |
| else if (n < (size_t)-1) |
| ++n; |
| if (n < 3) |
| return -2; |
| ch = t[i++]; |
| if (ch < 0x80 || ch > 0xbf) |
| { |
| r->_errno = EILSEQ; |
| return -1; |
| } |
| state->__count = 0; |
| tmp = (wchar_t)((state->__value.__wchb[0] & 0x0f) << 12) |
| | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 6) |
| | (wchar_t)(ch & 0x3f); |
| *pwc = tmp; |
| return i; |
| } |
| if (ch >= 0xf0 && ch <= 0xf4) |
| { |
| /* four-byte sequence */ |
| wint_t tmp; |
| state->__value.__wchb[0] = ch; |
| if (state->__count == 0) |
| state->__count = 1; |
| else if (n < (size_t)-1) |
| ++n; |
| if (n < 2) |
| return -2; |
| ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1]; |
| if ((state->__value.__wchb[0] == 0xf0 && ch < 0x90) |
| || (state->__value.__wchb[0] == 0xf4 && ch >= 0x90)) |
| { |
| /* overlong UTF-8 sequence or result is > 0x10ffff */ |
| r->_errno = EILSEQ; |
| return -1; |
| } |
| if (ch < 0x80 || ch > 0xbf) |
| { |
| r->_errno = EILSEQ; |
| return -1; |
| } |
| state->__value.__wchb[1] = ch; |
| if (state->__count == 1) |
| state->__count = 2; |
| else if (n < (size_t)-1) |
| ++n; |
| if (n < 3) |
| return -2; |
| ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2]; |
| if (ch < 0x80 || ch > 0xbf) |
| { |
| r->_errno = EILSEQ; |
| return -1; |
| } |
| state->__value.__wchb[2] = ch; |
| if (state->__count == 2) |
| state->__count = 3; |
| else if (n < (size_t)-1) |
| ++n; |
| if (state->__count == 3 && sizeof(wchar_t) == 2) |
| { |
| /* On systems which have wchar_t being UTF-16 values, the value |
| doesn't fit into a single wchar_t in this case. So what we |
| do here is to store the state with a special value of __count |
| and return the first half of a surrogate pair. The first |
| three bytes of a UTF-8 sequence are enough to generate the |
| first half of a UTF-16 surrogate pair. As return value we |
| choose to return the number of bytes actually read up to |
| here. |
| The second half of the surrogate pair is returned in case we |
| recognize the special __count value of four, and the next |
| byte is actually a valid value. See below. */ |
| tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18) |
| | (wint_t)((state->__value.__wchb[1] & 0x3f) << 12) |
| | (wint_t)((state->__value.__wchb[2] & 0x3f) << 6); |
| state->__count = 4; |
| *pwc = 0xd800 | ((tmp - 0x10000) >> 10); |
| return i; |
| } |
| if (n < 4) |
| return -2; |
| ch = t[i++]; |
| if (ch < 0x80 || ch > 0xbf) |
| { |
| r->_errno = EILSEQ; |
| return -1; |
| } |
| tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18) |
| | (wint_t)((state->__value.__wchb[1] & 0x3f) << 12) |
| | (wint_t)((state->__value.__wchb[2] & 0x3f) << 6) |
| | (wint_t)(ch & 0x3f); |
| if (state->__count == 4 && sizeof(wchar_t) == 2) |
| /* Create the second half of the surrogate pair for systems with |
| wchar_t == UTF-16 . */ |
| *pwc = 0xdc00 | (tmp & 0x3ff); |
| else |
| *pwc = tmp; |
| state->__count = 0; |
| return i; |
| } |
| |
| r->_errno = EILSEQ; |
| return -1; |
| } |
| |
| /* Cygwin defines its own doublebyte charset conversion functions |
| because the underlying OS requires wchar_t == UTF-16. */ |
| #ifndef __CYGWIN__ |
| int |
| _DEFUN (__sjis_mbtowc, (r, pwc, s, n, charset, state), |
| struct _reent *r _AND |
| wchar_t *pwc _AND |
| const char *s _AND |
| size_t n _AND |
| const char *charset _AND |
| mbstate_t *state) |
| { |
| wchar_t dummy; |
| unsigned char *t = (unsigned char *)s; |
| int ch; |
| int i = 0; |
| |
| if (pwc == NULL) |
| pwc = &dummy; |
| |
| if (s == NULL) |
| return 0; /* not state-dependent */ |
| |
| if (n == 0) |
| return -2; |
| |
| ch = t[i++]; |
| if (state->__count == 0) |
| { |
| if (_issjis1 (ch)) |
| { |
| state->__value.__wchb[0] = ch; |
| state->__count = 1; |
| if (n <= 1) |
| return -2; |
| ch = t[i++]; |
| } |
| } |
| if (state->__count == 1) |
| { |
| if (_issjis2 (ch)) |
| { |
| *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)ch; |
| state->__count = 0; |
| return i; |
| } |
| else |
| { |
| r->_errno = EILSEQ; |
| return -1; |
| } |
| } |
| |
| *pwc = (wchar_t)*t; |
| |
| if (*t == '\0') |
| return 0; |
| |
| return 1; |
| } |
| |
| int |
| _DEFUN (__eucjp_mbtowc, (r, pwc, s, n, charset, state), |
| struct _reent *r _AND |
| wchar_t *pwc _AND |
| const char *s _AND |
| size_t n _AND |
| const char *charset _AND |
| mbstate_t *state) |
| { |
| wchar_t dummy; |
| unsigned char *t = (unsigned char *)s; |
| int ch; |
| int i = 0; |
| |
| if (pwc == NULL) |
| pwc = &dummy; |
| |
| if (s == NULL) |
| return 0; |
| |
| if (n == 0) |
| return -2; |
| |
| ch = t[i++]; |
| if (state->__count == 0) |
| { |
| if (_iseucjp1 (ch)) |
| { |
| state->__value.__wchb[0] = ch; |
| state->__count = 1; |
| if (n <= 1) |
| return -2; |
| ch = t[i++]; |
| } |
| } |
| if (state->__count == 1) |
| { |
| if (_iseucjp2 (ch)) |
| { |
| if (state->__value.__wchb[0] == 0x8f) |
| { |
| state->__value.__wchb[1] = ch; |
| state->__count = 2; |
| if (n <= i) |
| return -2; |
| ch = t[i++]; |
| } |
| else |
| { |
| *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)ch; |
| state->__count = 0; |
| return i; |
| } |
| } |
| else |
| { |
| r->_errno = EILSEQ; |
| return -1; |
| } |
| } |
| if (state->__count == 2) |
| { |
| if (_iseucjp2 (ch)) |
| { |
| *pwc = (((wchar_t)state->__value.__wchb[1]) << 8) |
| + (wchar_t)(ch & 0x7f); |
| state->__count = 0; |
| return i; |
| } |
| else |
| { |
| r->_errno = EILSEQ; |
| return -1; |
| } |
| } |
| |
| *pwc = (wchar_t)*t; |
| |
| if (*t == '\0') |
| return 0; |
| |
| return 1; |
| } |
| |
| int |
| _DEFUN (__jis_mbtowc, (r, pwc, s, n, charset, state), |
| struct _reent *r _AND |
| wchar_t *pwc _AND |
| const char *s _AND |
| size_t n _AND |
| const char *charset _AND |
| mbstate_t *state) |
| { |
| wchar_t dummy; |
| unsigned char *t = (unsigned char *)s; |
| JIS_STATE curr_state; |
| JIS_ACTION action; |
| JIS_CHAR_TYPE ch; |
| unsigned char *ptr; |
| unsigned int i; |
| int curr_ch; |
| |
| if (pwc == NULL) |
| pwc = &dummy; |
| |
| if (s == NULL) |
| { |
| state->__state = ASCII; |
| return 1; /* state-dependent */ |
| } |
| |
| if (n == 0) |
| return -2; |
| |
| curr_state = state->__state; |
| ptr = t; |
| |
| for (i = 0; i < n; ++i) |
| { |
| curr_ch = t[i]; |
| switch (curr_ch) |
| { |
| case ESC_CHAR: |
| ch = ESCAPE; |
| break; |
| case '$': |
| ch = DOLLAR; |
| break; |
| case '@': |
| ch = AT; |
| break; |
| case '(': |
| ch = BRACKET; |
| break; |
| case 'B': |
| ch = B; |
| break; |
| case 'J': |
| ch = J; |
| break; |
| case '\0': |
| ch = NUL; |
| break; |
| default: |
| if (_isjis (curr_ch)) |
| ch = JIS_CHAR; |
| else |
| ch = OTHER; |
| } |
| |
| action = JIS_action_table[curr_state][ch]; |
| curr_state = JIS_state_table[curr_state][ch]; |
| |
| switch (action) |
| { |
| case NOOP: |
| break; |
| case EMPTY: |
| state->__state = ASCII; |
| *pwc = (wchar_t)0; |
| return 0; |
| case COPY_A: |
| state->__state = ASCII; |
| *pwc = (wchar_t)*ptr; |
| return (i + 1); |
| case COPY_J1: |
| state->__value.__wchb[0] = t[i]; |
| break; |
| case COPY_J2: |
| state->__state = JIS; |
| *pwc = (((wchar_t)state->__value.__wchb[0]) << 8) + (wchar_t)(t[i]); |
| return (i + 1); |
| case MAKE_A: |
| ptr = (unsigned char *)(t + i + 1); |
| break; |
| case ERROR: |
| default: |
| r->_errno = EILSEQ; |
| return -1; |
| } |
| |
| } |
| |
| state->__state = curr_state; |
| return -2; /* n < bytes needed */ |
| } |
| #endif /* !__CYGWIN__*/ |
| #endif /* _MB_CAPABLE */ |