| /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
| /* |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License as published by |
| * the Free Software Foundation; either version 2 of the License, or |
| * (at your option) any later version. |
| * |
| * This program is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| * GNU General Public License for more details: |
| * |
| * Copyright (C) 2010 Red Hat, Inc. |
| */ |
| |
| #include <config.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <unistd.h> |
| #include <string.h> |
| #include <ctype.h> |
| |
| #include "mm-charsets.h" |
| #include "mm-utils.h" |
| |
| typedef struct { |
| const char *gsm_name; |
| const char *other_name; |
| const char *iconv_from_name; |
| const char *iconv_to_name; |
| MMModemCharset charset; |
| } CharsetEntry; |
| |
| static CharsetEntry charset_map[] = { |
| { "UTF-8", "UTF8", "UTF-8", "UTF-8//TRANSLIT", MM_MODEM_CHARSET_UTF8 }, |
| { "UCS2", NULL, "UCS-2BE", "UCS-2BE//TRANSLIT", MM_MODEM_CHARSET_UCS2 }, |
| { "IRA", "ASCII", "ASCII", "ASCII//TRANSLIT", MM_MODEM_CHARSET_IRA }, |
| { "GSM", NULL, NULL, NULL, MM_MODEM_CHARSET_GSM }, |
| { "8859-1", NULL, "ISO8859-1", "ISO8859-1//TRANSLIT", MM_MODEM_CHARSET_8859_1 }, |
| { "PCCP437", "CP437", "CP437", "CP437//TRANSLIT", MM_MODEM_CHARSET_PCCP437 }, |
| { "PCDN", "CP850", "CP850", "CP850//TRANSLIT", MM_MODEM_CHARSET_PCDN }, |
| { "HEX", NULL, NULL, NULL, MM_MODEM_CHARSET_HEX }, |
| { NULL, NULL, NULL, NULL, MM_MODEM_CHARSET_UNKNOWN } |
| }; |
| |
| const char * |
| mm_modem_charset_to_string (MMModemCharset charset) |
| { |
| CharsetEntry *iter = &charset_map[0]; |
| |
| g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL); |
| |
| while (iter->gsm_name) { |
| if (iter->charset == charset) |
| return iter->gsm_name; |
| iter++; |
| } |
| g_warn_if_reached (); |
| return NULL; |
| } |
| |
| MMModemCharset |
| mm_modem_charset_from_string (const char *string) |
| { |
| CharsetEntry *iter = &charset_map[0]; |
| |
| g_return_val_if_fail (string != NULL, MM_MODEM_CHARSET_UNKNOWN); |
| |
| while (iter->gsm_name) { |
| if (strcasestr (string, iter->gsm_name)) |
| return iter->charset; |
| if (iter->other_name && strcasestr (string, iter->other_name)) |
| return iter->charset; |
| iter++; |
| } |
| return MM_MODEM_CHARSET_UNKNOWN; |
| } |
| |
| static const char * |
| charset_iconv_to (MMModemCharset charset) |
| { |
| CharsetEntry *iter = &charset_map[0]; |
| |
| g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL); |
| |
| while (iter->gsm_name) { |
| if (iter->charset == charset) |
| return iter->iconv_to_name; |
| iter++; |
| } |
| g_warn_if_reached (); |
| return NULL; |
| } |
| |
| static const char * |
| charset_iconv_from (MMModemCharset charset) |
| { |
| CharsetEntry *iter = &charset_map[0]; |
| |
| g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL); |
| |
| while (iter->gsm_name) { |
| if (iter->charset == charset) |
| return iter->iconv_from_name; |
| iter++; |
| } |
| g_warn_if_reached (); |
| return NULL; |
| } |
| |
| gboolean |
| mm_modem_charset_byte_array_append (GByteArray *array, |
| const char *utf8, |
| gboolean quoted, |
| MMModemCharset charset) |
| { |
| const char *iconv_to; |
| char *converted; |
| GError *error = NULL; |
| gsize written = 0; |
| |
| g_return_val_if_fail (array != NULL, FALSE); |
| g_return_val_if_fail (utf8 != NULL, FALSE); |
| |
| iconv_to = charset_iconv_to (charset); |
| g_return_val_if_fail (iconv_to != NULL, FALSE); |
| |
| converted = g_convert (utf8, -1, iconv_to, "UTF-8", NULL, &written, &error); |
| if (!converted) { |
| if (error) { |
| g_warning ("%s: failed to convert '%s' to %s character set: (%d) %s", |
| __func__, utf8, iconv_to, |
| error->code, error->message); |
| g_error_free (error); |
| } |
| return FALSE; |
| } |
| |
| if (quoted) |
| g_byte_array_append (array, (const guint8 *) "\"", 1); |
| g_byte_array_append (array, (const guint8 *) converted, written); |
| if (quoted) |
| g_byte_array_append (array, (const guint8 *) "\"", 1); |
| |
| g_free (converted); |
| return TRUE; |
| } |
| |
| char * |
| mm_modem_charset_hex_to_utf8 (const char *src, MMModemCharset charset) |
| { |
| char *unconverted, *converted; |
| const char *iconv_from; |
| gsize unconverted_len = 0; |
| GError *error = NULL; |
| |
| g_return_val_if_fail (src != NULL, NULL); |
| g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL); |
| |
| iconv_from = charset_iconv_from (charset); |
| g_return_val_if_fail (iconv_from != NULL, FALSE); |
| |
| unconverted = utils_hexstr2bin (src, &unconverted_len); |
| if (!unconverted) |
| return NULL; |
| |
| if (charset == MM_MODEM_CHARSET_UTF8 || charset == MM_MODEM_CHARSET_IRA) |
| return unconverted; |
| |
| converted = g_convert (unconverted, unconverted_len, |
| "UTF-8//TRANSLIT", iconv_from, |
| NULL, NULL, &error); |
| if (!converted || error) { |
| g_clear_error (&error); |
| converted = NULL; |
| } |
| |
| g_free (unconverted); |
| |
| return converted; |
| } |
| |
| char * |
| mm_modem_charset_utf8_to_hex (const char *src, MMModemCharset charset) |
| { |
| gsize converted_len = 0; |
| char *converted; |
| const char *iconv_to; |
| GError *error = NULL; |
| gchar *hex; |
| |
| g_return_val_if_fail (src != NULL, NULL); |
| g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL); |
| |
| iconv_to = charset_iconv_from (charset); |
| g_return_val_if_fail (iconv_to != NULL, FALSE); |
| |
| if (charset == MM_MODEM_CHARSET_UTF8 || charset == MM_MODEM_CHARSET_IRA) |
| return g_strdup (src); |
| |
| converted = g_convert (src, strlen (src), |
| iconv_to, "UTF-8//TRANSLIT", |
| NULL, &converted_len, &error); |
| if (!converted || error) { |
| g_clear_error (&error); |
| g_free (converted); |
| return NULL; |
| } |
| |
| /* Get hex representation of the string */ |
| hex = utils_bin2hexstr ((guint8 *)converted, converted_len); |
| g_free (converted); |
| return hex; |
| } |
| |
| /* GSM 03.38 encoding conversion stuff */ |
| |
| #define GSM_DEF_ALPHABET_SIZE 128 |
| #define GSM_EXT_ALPHABET_SIZE 10 |
| |
| typedef struct GsmUtf8Mapping { |
| gchar chars[3]; |
| guint8 len; |
| guint8 gsm; /* only used for extended GSM charset */ |
| } GsmUtf8Mapping; |
| |
| #define ONE(a) { {a, 0x00, 0x00}, 1, 0 } |
| #define TWO(a, b) { {a, b, 0x00}, 2, 0 } |
| |
| /** |
| * gsm_def_utf8_alphabet: |
| * |
| * Mapping from GSM default alphabet to UTF-8. |
| * |
| * ETSI GSM 03.38, version 6.0.1, section 6.2.1; Default alphabet. Mapping to UCS-2. |
| * Mapping according to http://unicode.org/Public/MAPPINGS/ETSI/GSM0338.TXT |
| */ |
| static const GsmUtf8Mapping gsm_def_utf8_alphabet[GSM_DEF_ALPHABET_SIZE] = { |
| /* @ £ $ ¥ */ |
| ONE(0x40), TWO(0xc2, 0xa3), ONE(0x24), TWO(0xc2, 0xa5), |
| /* è é ù ì */ |
| TWO(0xc3, 0xa8), TWO(0xc3, 0xa9), TWO(0xc3, 0xb9), TWO(0xc3, 0xac), |
| /* ò Ç \n Ø */ |
| TWO(0xc3, 0xb2), TWO(0xc3, 0x87), ONE(0x0a), TWO(0xc3, 0x98), |
| /* ø \r Å å */ |
| TWO(0xc3, 0xb8), ONE(0x0d), TWO(0xc3, 0x85), TWO(0xc3, 0xa5), |
| /* Δ _ Φ Γ */ |
| TWO(0xce, 0x94), ONE(0x5f), TWO(0xce, 0xa6), TWO(0xce, 0x93), |
| /* Λ Ω ΠΨ */ |
| TWO(0xce, 0x9b), TWO(0xce, 0xa9), TWO(0xce, 0xa0), TWO(0xce, 0xa8), |
| /* Σ Θ Ξ Escape Code */ |
| TWO(0xce, 0xa3), TWO(0xce, 0x98), TWO(0xce, 0x9e), ONE(0xa0), |
| /* Æ æ ß É */ |
| TWO(0xc3, 0x86), TWO(0xc3, 0xa6), TWO(0xc3, 0x9f), TWO(0xc3, 0x89), |
| /* ' ' ! " # */ |
| ONE(0x20), ONE(0x21), ONE(0x22), ONE(0x23), |
| /* ¤ % & ' */ |
| TWO(0xc2, 0xa4), ONE(0x25), ONE(0x26), ONE(0x27), |
| /* ( ) * + */ |
| ONE(0x28), ONE(0x29), ONE(0x2a), ONE(0x2b), |
| /* , - . / */ |
| ONE(0x2c), ONE(0x2d), ONE(0x2e), ONE(0x2f), |
| /* 0 1 2 3 */ |
| ONE(0x30), ONE(0x31), ONE(0x32), ONE(0x33), |
| /* 4 5 6 7 */ |
| ONE(0x34), ONE(0x35), ONE(0x36), ONE(0x37), |
| /* 8 9 : ; */ |
| ONE(0x38), ONE(0x39), ONE(0x3a), ONE(0x3b), |
| /* < = > ? */ |
| ONE(0x3c), ONE(0x3d), ONE(0x3e), ONE(0x3f), |
| /* ¡ A B C */ |
| TWO(0xc2, 0xa1), ONE(0x41), ONE(0x42), ONE(0x43), |
| /* D E F G */ |
| ONE(0x44), ONE(0x45), ONE(0x46), ONE(0x47), |
| /* H I J K */ |
| ONE(0x48), ONE(0x49), ONE(0x4a), ONE(0x4b), |
| /* L M N O */ |
| ONE(0x4c), ONE(0x4d), ONE(0x4e), ONE(0x4f), |
| /* P Q R S */ |
| ONE(0x50), ONE(0x51), ONE(0x52), ONE(0x53), |
| /* T U V W */ |
| ONE(0x54), ONE(0x55), ONE(0x56), ONE(0x57), |
| /* X Y Z Ä */ |
| ONE(0x58), ONE(0x59), ONE(0x5a), TWO(0xc3, 0x84), |
| /* Ö Ñ Ü § */ |
| TWO(0xc3, 0x96), TWO(0xc3, 0x91), TWO(0xc3, 0x9c), TWO(0xc2, 0xa7), |
| /* ¿ a b c */ |
| TWO(0xc2, 0xbf), ONE(0x61), ONE(0x62), ONE(0x63), |
| /* d e f g */ |
| ONE(0x64), ONE(0x65), ONE(0x66), ONE(0x67), |
| /* h i j k */ |
| ONE(0x68), ONE(0x69), ONE(0x6a), ONE(0x6b), |
| /* l m n o */ |
| ONE(0x6c), ONE(0x6d), ONE(0x6e), ONE(0x6f), |
| /* p q r s */ |
| ONE(0x70), ONE(0x71), ONE(0x72), ONE(0x73), |
| /* t u v w */ |
| ONE(0x74), ONE(0x75), ONE(0x76), ONE(0x77), |
| /* x y z ä */ |
| ONE(0x78), ONE(0x79), ONE(0x7a), TWO(0xc3, 0xa4), |
| /* ö ñ ü à */ |
| TWO(0xc3, 0xb6), TWO(0xc3, 0xb1), TWO(0xc3, 0xbc), TWO(0xc3, 0xa0) |
| }; |
| |
| static guint8 |
| gsm_def_char_to_utf8 (const guint8 gsm, guint8 out_utf8[2]) |
| { |
| g_return_val_if_fail (gsm < GSM_DEF_ALPHABET_SIZE, 0); |
| memcpy (&out_utf8[0], &gsm_def_utf8_alphabet[gsm].chars[0], gsm_def_utf8_alphabet[gsm].len); |
| return gsm_def_utf8_alphabet[gsm].len; |
| } |
| |
| static gboolean |
| utf8_to_gsm_def_char (const char *utf8, guint32 len, guint8 *out_gsm) |
| { |
| int i; |
| |
| if (len > 0 && len < 4) { |
| for (i = 0; i < GSM_DEF_ALPHABET_SIZE; i++) { |
| if (gsm_def_utf8_alphabet[i].len == len) { |
| if (memcmp (&gsm_def_utf8_alphabet[i].chars[0], utf8, len) == 0) { |
| *out_gsm = i; |
| return TRUE; |
| } |
| } |
| } |
| } |
| return FALSE; |
| } |
| |
| |
| #define EONE(a, g) { {a, 0x00, 0x00}, 1, g } |
| #define ETHR(a, b, c, g) { {a, b, c}, 3, g } |
| |
| /** |
| * gsm_ext_utf8_alphabet: |
| * |
| * Mapping from GSM extended alphabet to UTF-8. |
| * |
| */ |
| static const GsmUtf8Mapping gsm_ext_utf8_alphabet[GSM_EXT_ALPHABET_SIZE] = { |
| /* form feed ^ { } */ |
| EONE(0x0c, 0x0a), EONE(0x5e, 0x14), EONE(0x7b, 0x28), EONE(0x7d, 0x29), |
| /* \ [ ~ ] */ |
| EONE(0x5c, 0x2f), EONE(0x5b, 0x3c), EONE(0x7e, 0x3d), EONE(0x5d, 0x3e), |
| /* | € */ |
| EONE(0x7c, 0x40), ETHR(0xe2, 0x82, 0xac, 0x65) |
| }; |
| |
| #define GSM_ESCAPE_CHAR 0x1b |
| |
| static guint8 |
| gsm_ext_char_to_utf8 (const guint8 gsm, guint8 out_utf8[3]) |
| { |
| int i; |
| |
| for (i = 0; i < GSM_EXT_ALPHABET_SIZE; i++) { |
| if (gsm == gsm_ext_utf8_alphabet[i].gsm) { |
| memcpy (&out_utf8[0], &gsm_ext_utf8_alphabet[i].chars[0], gsm_ext_utf8_alphabet[i].len); |
| return gsm_ext_utf8_alphabet[i].len; |
| } |
| } |
| return 0; |
| } |
| |
| static gboolean |
| utf8_to_gsm_ext_char (const char *utf8, guint32 len, guint8 *out_gsm) |
| { |
| int i; |
| |
| if (len > 0 && len < 4) { |
| for (i = 0; i < GSM_EXT_ALPHABET_SIZE; i++) { |
| if (gsm_ext_utf8_alphabet[i].len == len) { |
| if (memcmp (&gsm_ext_utf8_alphabet[i].chars[0], utf8, len) == 0) { |
| *out_gsm = gsm_ext_utf8_alphabet[i].gsm; |
| return TRUE; |
| } |
| } |
| } |
| } |
| return FALSE; |
| } |
| |
| guint8 * |
| mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32 len) |
| { |
| int i; |
| GByteArray *utf8; |
| |
| g_return_val_if_fail (gsm != NULL, NULL); |
| g_return_val_if_fail (len < 4096, NULL); |
| |
| /* worst case initial length */ |
| utf8 = g_byte_array_sized_new (len * 2 + 1); |
| |
| for (i = 0; i < len; i++) { |
| guint8 uchars[4]; |
| guint8 ulen; |
| |
| if (gsm[i] == GSM_ESCAPE_CHAR) { |
| /* Extended alphabet, decode next char */ |
| ulen = gsm_ext_char_to_utf8 (gsm[i+1], uchars); |
| if (ulen) |
| i += 1; |
| } else { |
| /* Default alphabet */ |
| ulen = gsm_def_char_to_utf8 (gsm[i], uchars); |
| } |
| |
| if (ulen) |
| g_byte_array_append (utf8, &uchars[0], ulen); |
| else |
| g_byte_array_append (utf8, (guint8 *) "?", 1); |
| } |
| |
| g_byte_array_append (utf8, (guint8 *) "\0", 1); /* NULL terminator */ |
| return g_byte_array_free (utf8, FALSE); |
| } |
| |
| guint8 * |
| mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len) |
| { |
| GByteArray *gsm; |
| const char *c = utf8, *next = c; |
| static const guint8 gesc = GSM_ESCAPE_CHAR; |
| int i = 0; |
| |
| g_return_val_if_fail (utf8 != NULL, NULL); |
| g_return_val_if_fail (out_len != NULL, NULL); |
| g_return_val_if_fail (g_utf8_validate (utf8, -1, NULL), NULL); |
| |
| /* worst case initial length */ |
| gsm = g_byte_array_sized_new (g_utf8_strlen (utf8, -1) * 2 + 1); |
| |
| if (*utf8 == 0x00) { |
| /* Zero-length string */ |
| g_byte_array_append (gsm, (guint8 *) "\0", 1); |
| *out_len = 0; |
| return g_byte_array_free (gsm, FALSE); |
| } |
| |
| while (next && *next) { |
| guint8 gch = 0x3f; /* 0x3f == '?' */ |
| |
| next = g_utf8_next_char (c); |
| |
| /* Try escaped chars first, then default alphabet */ |
| if (utf8_to_gsm_ext_char (c, next - c, &gch)) { |
| /* Add the escape char */ |
| g_byte_array_append (gsm, &gesc, 1); |
| g_byte_array_append (gsm, &gch, 1); |
| } else if (utf8_to_gsm_def_char (c, next - c, &gch)) |
| g_byte_array_append (gsm, &gch, 1); |
| |
| c = next; |
| i++; |
| } |
| |
| *out_len = gsm->len; |
| return g_byte_array_free (gsm, FALSE); |
| } |
| |
| static gboolean |
| gsm_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) |
| { |
| guint8 gsm; |
| |
| *out_clen = 1; |
| if (utf8_to_gsm_def_char (utf8, ulen, &gsm)) |
| return TRUE; |
| if (utf8_to_gsm_ext_char (utf8, ulen, &gsm)) { |
| *out_clen = 2; |
| return TRUE; |
| } |
| return FALSE; |
| } |
| |
| static gboolean |
| ira_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) |
| { |
| *out_clen = 1; |
| return (ulen == 1); |
| } |
| |
| static gboolean |
| ucs2_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) |
| { |
| *out_clen = 2; |
| return (c <= 0xFFFF); |
| } |
| |
| static gboolean |
| iso88591_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) |
| { |
| *out_clen = 1; |
| return (c <= 0xFF); |
| } |
| |
| static gboolean |
| pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) |
| { |
| static const gunichar t[] = { |
| 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, |
| 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x00c9, 0x00e6, |
| 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, 0x00d6, 0x00dc, |
| 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192, 0x00e1, 0x00ed, 0x00f3, 0x00fa, |
| 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, |
| 0x00a1, 0x00ab, 0x00bb, 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, |
| 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, |
| 0x2510, 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, |
| 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, 0x2568, |
| 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, 0x2518, |
| 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, 0x03b1, 0x00df, 0x0393, |
| 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4, 0x03a6, 0x0398, 0x03a9, 0x03b4, |
| 0x221e, 0x03c6, 0x03b5, 0x2229, 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, |
| 0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, |
| 0x25a0, 0x00a0 |
| }; |
| int i; |
| |
| *out_clen = 1; |
| |
| if (c <= 0x7F) |
| return TRUE; |
| for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) { |
| if (c == t[i]) |
| return TRUE; |
| } |
| return FALSE; |
| } |
| |
| static gboolean |
| pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) |
| { |
| static const gunichar t[] = { |
| 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, |
| 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x00c9, 0x00e6, |
| 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, 0x00d6, 0x00dc, |
| 0x00f8, 0x00a3, 0x00d8, 0x00d7, 0x0192, 0x00e1, 0x00ed, 0x00f3, 0x00fa, |
| 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x00ae, 0x00ac, 0x00bd, 0x00bc, |
| 0x00a1, 0x00ab, 0x00bb, 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, |
| 0x00c2, 0x00c0, 0x00a9, 0x2563, 0x2551, 0x2557, 0x255d, 0x00a2, 0x00a5, |
| 0x2510, 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x00e3, 0x00c3, |
| 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4, 0x00f0, |
| 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x0131, 0x00cd, 0x00ce, 0x00cf, 0x2518, |
| 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580, 0x00d3, 0x00df, 0x00d4, |
| 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe, 0x00de, 0x00da, 0x00db, 0x00d9, |
| 0x00fd, 0x00dd, 0x00af, 0x00b4, 0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, |
| 0x00a7, 0x00f7, 0x00b8, 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, |
| 0x25a0, 0x00a0 |
| }; |
| int i; |
| |
| *out_clen = 1; |
| |
| if (c <= 0x7F) |
| return TRUE; |
| for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) { |
| if (c == t[i]) |
| return TRUE; |
| } |
| return FALSE; |
| } |
| |
| typedef struct { |
| MMModemCharset cs; |
| gboolean (*func) (gunichar c, const char *utf8, gsize ulen, guint *out_clen); |
| guint charsize; |
| } SubsetEntry; |
| |
| SubsetEntry subset_table[] = { |
| { MM_MODEM_CHARSET_GSM, gsm_is_subset }, |
| { MM_MODEM_CHARSET_IRA, ira_is_subset }, |
| { MM_MODEM_CHARSET_UCS2, ucs2_is_subset }, |
| { MM_MODEM_CHARSET_8859_1, iso88591_is_subset }, |
| { MM_MODEM_CHARSET_PCCP437, pccp437_is_subset }, |
| { MM_MODEM_CHARSET_PCDN, pcdn_is_subset }, |
| { MM_MODEM_CHARSET_UNKNOWN, NULL }, |
| }; |
| |
| /** |
| * mm_charset_get_encoded_len: |
| * |
| * @utf8: UTF-8 valid string |
| * @charset: the #MMModemCharset to check the length of @utf8 in |
| * @out_unsupported: on return, number of characters of @utf8 that are not fully |
| * representable in @charset |
| * |
| * Returns: the size in bytes of the string if converted from UTF-8 into @charset. |
| **/ |
| guint |
| mm_charset_get_encoded_len (const char *utf8, |
| MMModemCharset charset, |
| guint *out_unsupported) |
| { |
| const char *p = utf8, *next; |
| guint len = 0, unsupported = 0; |
| SubsetEntry *e; |
| |
| g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, 0); |
| g_return_val_if_fail (utf8 != NULL, 0); |
| |
| if (charset == MM_MODEM_CHARSET_UTF8) |
| return strlen (utf8); |
| |
| /* Find the charset in our subset table */ |
| for (e = &subset_table[0]; |
| e->cs != charset && e->cs != MM_MODEM_CHARSET_UNKNOWN; |
| e++); |
| g_return_val_if_fail (e->cs != MM_MODEM_CHARSET_UNKNOWN, 0); |
| |
| while (*p) { |
| gunichar c; |
| const char *end; |
| guint clen = 0; |
| |
| c = g_utf8_get_char_validated (p, -1); |
| g_return_val_if_fail (c != (gunichar) -1, 0); |
| end = next = g_utf8_find_next_char (p, NULL); |
| if (end == NULL) { |
| /* Find the end... */ |
| end = p; |
| while (*end++); |
| } |
| |
| if (!e->func (c, p, (end - p), &clen)) |
| unsupported++; |
| len += clen; |
| p = next; |
| } |
| |
| if (out_unsupported) |
| *out_unsupported = unsupported; |
| return len; |
| } |
| |
| guint8 * |
| gsm_unpack (const guint8 *gsm, |
| guint32 num_septets, |
| guint8 start_offset, /* in _bits_ */ |
| guint32 *out_unpacked_len) |
| { |
| GByteArray *unpacked; |
| int i; |
| |
| unpacked = g_byte_array_sized_new (num_septets + 1); |
| |
| for (i = 0; i < num_septets; i++) { |
| guint8 bits_here, bits_in_next, octet, offset, c; |
| guint32 start_bit; |
| |
| start_bit = start_offset + (i * 7); /* Overall bit offset of char in buffer */ |
| offset = start_bit % 8; /* Offset to start of char in this byte */ |
| bits_here = offset ? (8 - offset) : 7; |
| bits_in_next = 7 - bits_here; |
| |
| /* Grab bits in the current byte */ |
| octet = gsm[start_bit / 8]; |
| c = (octet >> offset) & (0xFF >> (8 - bits_here)); |
| |
| /* Grab any bits that spilled over to next byte */ |
| if (bits_in_next) { |
| octet = gsm[(start_bit / 8) + 1]; |
| c |= (octet & (0xFF >> (8 - bits_in_next))) << bits_here; |
| } |
| g_byte_array_append (unpacked, &c, 1); |
| } |
| |
| *out_unpacked_len = unpacked->len; |
| return g_byte_array_free (unpacked, FALSE); |
| } |
| |
| guint8 * |
| gsm_pack (const guint8 *src, |
| guint32 src_len, |
| guint8 start_offset, |
| guint32 *out_packed_len) |
| { |
| guint8 *packed; |
| guint octet = 0, lshift, plen; |
| int i = 0; |
| |
| g_return_val_if_fail (start_offset < 8, NULL); |
| |
| plen = (src_len * 7) + start_offset; /* total length in bits */ |
| if (plen % 8) |
| plen += 8; |
| plen /= 8; /* now in bytes */ |
| |
| packed = g_malloc0 (plen); |
| |
| for (i = 0, lshift = start_offset; i < src_len; i++) { |
| packed[octet] |= (src[i] & 0x7F) << lshift; |
| if (lshift > 1) { |
| /* Grab the lost bits and add to next octet */ |
| g_assert (octet + 1 < plen); |
| packed[octet + 1] = (src[i] & 0x7F) >> (8 - lshift); |
| } |
| if (lshift) |
| octet++; |
| lshift = lshift ? lshift - 1 : 7; |
| } |
| |
| if (out_packed_len) |
| *out_packed_len = plen; |
| return packed; |
| } |
| |
| /* We do all our best to get the given string, which is possibly given in the |
| * specified charset, to UTF8. It may happen that the given string is really |
| * the hex representation of the charset-encoded string, so we need to cope with |
| * that case. */ |
| gchar * |
| mm_charset_take_and_convert_to_utf8 (gchar *str, |
| MMModemCharset charset) |
| { |
| gchar *utf8 = NULL; |
| |
| if (!str) |
| return NULL; |
| |
| switch (charset) { |
| case MM_MODEM_CHARSET_UNKNOWN: |
| g_warn_if_reached (); |
| utf8 = str; |
| break; |
| |
| case MM_MODEM_CHARSET_HEX: |
| /* We'll assume that the HEX string is really valid ASCII at the end */ |
| utf8 = str; |
| break; |
| |
| case MM_MODEM_CHARSET_GSM: |
| case MM_MODEM_CHARSET_8859_1: |
| case MM_MODEM_CHARSET_PCCP437: |
| case MM_MODEM_CHARSET_PCDN: { |
| const gchar *iconv_from; |
| GError *error = NULL; |
| |
| iconv_from = charset_iconv_from (charset); |
| utf8 = g_convert (str, strlen (str), |
| "UTF-8//TRANSLIT", iconv_from, |
| NULL, NULL, &error); |
| if (!utf8 || error) { |
| g_clear_error (&error); |
| utf8 = NULL; |
| } |
| |
| g_free (str); |
| break; |
| } |
| |
| case MM_MODEM_CHARSET_UCS2: { |
| gsize len; |
| gboolean possibly_hex = TRUE; |
| |
| /* If the string comes in hex-UCS-2, len needs to be a multiple of 4 */ |
| len = strlen (str); |
| if ((len < 4) || ((len % 4) != 0)) |
| possibly_hex = FALSE; |
| else { |
| const gchar *p = str; |
| |
| /* All chars in the string must be hex */ |
| while (*p && possibly_hex) |
| possibly_hex = isxdigit (*p++); |
| } |
| |
| /* If we get UCS-2, we expect the HEX representation of the string */ |
| if (possibly_hex) { |
| utf8 = mm_modem_charset_hex_to_utf8 (str, charset); |
| if (!utf8) { |
| /* If we couldn't convert the string as HEX-UCS-2, try to see if |
| * the string is valid UTF-8 itself. */ |
| utf8 = str; |
| } else |
| g_free (str); |
| } else |
| /* If we already know it's not hex, try to use the string as it is */ |
| utf8 = str; |
| |
| break; |
| } |
| |
| /* If the given charset is ASCII or UTF8, we really expect the final string |
| * already here */ |
| case MM_MODEM_CHARSET_IRA: |
| case MM_MODEM_CHARSET_UTF8: |
| utf8 = str; |
| break; |
| } |
| |
| /* Validate UTF-8 always before returning. This result will be exposed in DBus |
| * very likely... */ |
| if (!g_utf8_validate (utf8, -1, NULL)) { |
| /* Better return NULL than an invalid UTF-8 string */ |
| g_free (utf8); |
| utf8 = NULL; |
| } |
| |
| return utf8; |
| } |
| |
| /* We do all our best to convert the given string, which comes in UTF-8, to the |
| * specified charset. It may be that the output string needs to be the hex |
| * representation of the charset-encoded string, so we need to cope with that |
| * case. */ |
| gchar * |
| mm_utf8_take_and_convert_to_charset (gchar *str, |
| MMModemCharset charset) |
| { |
| gchar *encoded = NULL; |
| |
| if (!str) |
| return NULL; |
| |
| /* Validate UTF-8 always before converting */ |
| if (!g_utf8_validate (str, -1, NULL)) { |
| /* Better return NULL than an invalid encoded string */ |
| g_free (str); |
| return NULL; |
| } |
| |
| switch (charset) { |
| case MM_MODEM_CHARSET_UNKNOWN: |
| g_warn_if_reached (); |
| encoded = str; |
| break; |
| |
| case MM_MODEM_CHARSET_HEX: |
| /* FIXME: What encoding is this? */ |
| g_warn_if_reached (); |
| encoded = str; |
| break; |
| |
| case MM_MODEM_CHARSET_GSM: |
| case MM_MODEM_CHARSET_8859_1: |
| case MM_MODEM_CHARSET_PCCP437: |
| case MM_MODEM_CHARSET_PCDN: { |
| const gchar *iconv_to; |
| GError *error = NULL; |
| |
| iconv_to = charset_iconv_from (charset); |
| encoded = g_convert (str, strlen (str), |
| iconv_to, "UTF-8", |
| NULL, NULL, &error); |
| if (!encoded || error) { |
| g_clear_error (&error); |
| encoded = NULL; |
| } |
| |
| g_free (str); |
| break; |
| } |
| |
| case MM_MODEM_CHARSET_UCS2: { |
| const gchar *iconv_to; |
| gsize encoded_len = 0; |
| GError *error = NULL; |
| gchar *hex; |
| |
| iconv_to = charset_iconv_from (charset); |
| encoded = g_convert (str, strlen (str), |
| iconv_to, "UTF-8", |
| NULL, &encoded_len, &error); |
| if (!encoded || error) { |
| g_clear_error (&error); |
| encoded = NULL; |
| } |
| |
| /* Get hex representation of the string */ |
| hex = utils_bin2hexstr ((guint8 *)encoded, encoded_len); |
| g_free (encoded); |
| encoded = hex; |
| g_free (str); |
| break; |
| } |
| |
| /* If the given charset is ASCII or UTF8, we really expect the final string |
| * already here. */ |
| case MM_MODEM_CHARSET_IRA: |
| case MM_MODEM_CHARSET_UTF8: |
| encoded = str; |
| break; |
| } |
| |
| return encoded; |
| } |