/* src/mm-charsets.c - chromiumos/third_party/modemmanager-next - Git at Google */

 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
 /*
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  * (at your option) any later version.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details:
  *
  * Copyright (C) 2010 Red Hat, Inc.
  */

 #include <config.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <string.h>
 #include <ctype.h>

 #include "mm-charsets.h"
 #include "mm-utils.h"

 /*
  * One row of the charset lookup table: associates a modem charset value
  * with the names used to recognize it and with the codeset names that the
  * conversion helpers in this file hand to g_convert().
  */
 typedef struct {
     const char *gsm_name;         /* primary name; a NULL here terminates charset_map */
     const char *other_name;       /* optional alternative name, may be NULL */
     const char *iconv_from_name;  /* value returned by charset_iconv_from(); may be NULL */
     const char *iconv_to_name;    /* value returned by charset_iconv_to(); may be NULL */
     MMModemCharset charset;       /* charset this row describes */
 } CharsetEntry;

 /*
  * Lookup table of the charsets known to this file.
  *
  * Columns are: primary name, alternative name, iconv "from" codeset,
  * iconv "to" codeset, charset value. The lookup loops in this file walk the
  * table until they reach the row whose primary name is NULL, so that row
  * must stay last. The GSM and HEX rows carry no iconv codeset names.
  */
 static CharsetEntry charset_map[] = {
     { "UTF-8",   "UTF8",   "UTF-8",     "UTF-8//TRANSLIT",     MM_MODEM_CHARSET_UTF8 },
     { "UCS2",    NULL,     "UCS-2BE",   "UCS-2BE//TRANSLIT",   MM_MODEM_CHARSET_UCS2 },
     { "IRA",     "ASCII",  "ASCII",     "ASCII//TRANSLIT",     MM_MODEM_CHARSET_IRA },
     { "GSM",     NULL,     NULL,        NULL,                  MM_MODEM_CHARSET_GSM },
     { "8859-1",  NULL,     "ISO8859-1", "ISO8859-1//TRANSLIT", MM_MODEM_CHARSET_8859_1 },
     { "PCCP437", "CP437",  "CP437",     "CP437//TRANSLIT",     MM_MODEM_CHARSET_PCCP437 },
     { "PCDN",    "CP850",  "CP850",     "CP850//TRANSLIT",     MM_MODEM_CHARSET_PCDN },
     { "HEX",     NULL,     NULL,        NULL,                  MM_MODEM_CHARSET_HEX },
     /* Terminator row */
     { NULL,      NULL,     NULL,        NULL,                  MM_MODEM_CHARSET_UNKNOWN }
 };

 /*
  * Returns the primary name of @charset as listed in charset_map, or NULL
  * (after emitting a warning) when the charset has no row in the table.
  * The returned string is owned by the table and must not be freed.
  */
 const char *
 mm_modem_charset_to_string (MMModemCharset charset)
 {
     CharsetEntry *entry;

     g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL);

     for (entry = charset_map; entry->gsm_name != NULL; entry++) {
         if (entry->charset == charset)
             return entry->gsm_name;
     }

     /* Walked the whole table without a match */
     g_warn_if_reached ();
     return NULL;
 }

 /*
  * Looks for any known charset name inside @string (case-insensitive
  * substring match) and returns the charset of the first table row whose
  * primary or alternative name is found. Returns MM_MODEM_CHARSET_UNKNOWN
  * when nothing matches.
  */
 MMModemCharset
 mm_modem_charset_from_string (const char *string)
 {
     CharsetEntry *entry;

     g_return_val_if_fail (string != NULL, MM_MODEM_CHARSET_UNKNOWN);

     for (entry = charset_map; entry->gsm_name != NULL; entry++) {
         /* Primary name is tried before the optional alias */
         if (strcasestr (string, entry->gsm_name) != NULL)
             return entry->charset;
         if (entry->other_name == NULL)
             continue;
         if (strcasestr (string, entry->other_name) != NULL)
             return entry->charset;
     }

     return MM_MODEM_CHARSET_UNKNOWN;
 }

 /*
  * Returns the "to" codeset name stored in charset_map for @charset. The
  * result is NULL for rows that carry no such name, and also (with a
  * warning) when @charset is not present in the table.
  */
 static const char *
 charset_iconv_to (MMModemCharset charset)
 {
     CharsetEntry *entry;

     g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL);

     for (entry = charset_map; entry->gsm_name != NULL; entry++) {
         if (entry->charset == charset)
             return entry->iconv_to_name;
     }

     /* Walked the whole table without a match */
     g_warn_if_reached ();
     return NULL;
 }

 /*
  * Returns the "from" codeset name stored in charset_map for @charset. The
  * result is NULL for rows that carry no such name, and also (with a
  * warning) when @charset is not present in the table.
  */
 static const char *
 charset_iconv_from (MMModemCharset charset)
 {
     CharsetEntry *entry;

     g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL);

     for (entry = charset_map; entry->gsm_name != NULL; entry++) {
         if (entry->charset == charset)
             return entry->iconv_from_name;
     }

     /* Walked the whole table without a match */
     g_warn_if_reached ();
     return NULL;
 }

 /*
  * Converts @utf8 to @charset and appends the converted bytes to @array,
  * wrapped in double quotes when @quoted is set.
  *
  * Returns TRUE on success. Returns FALSE, leaving @array untouched, when
  * the charset has no "to" codeset name or when the conversion fails (a
  * warning is logged if g_convert() reported an error).
  */
 gboolean
 mm_modem_charset_byte_array_append (GByteArray *array,
                                     const char *utf8,
                                     gboolean quoted,
                                     MMModemCharset charset)
 {
     static const guint8 quote_mark[] = { '"' };
     GError *error = NULL;
     gsize n_bytes = 0;
     const char *iconv_to;
     char *encoded;

     g_return_val_if_fail (array != NULL, FALSE);
     g_return_val_if_fail (utf8 != NULL, FALSE);

     iconv_to = charset_iconv_to (charset);
     g_return_val_if_fail (iconv_to != NULL, FALSE);

     encoded = g_convert (utf8, -1, iconv_to, "UTF-8", NULL, &n_bytes, &error);
     if (encoded == NULL) {
         if (error != NULL) {
             g_warning ("%s: failed to convert '%s' to %s character set: (%d) %s",
                        __func__, utf8, iconv_to,
                        error->code, error->message);
             g_error_free (error);
         }
         return FALSE;
     }

     if (quoted) {
         g_byte_array_append (array, quote_mark, 1);
         g_byte_array_append (array, (const guint8 *) encoded, n_bytes);
         g_byte_array_append (array, quote_mark, 1);
     } else {
         g_byte_array_append (array, (const guint8 *) encoded, n_bytes);
     }

     g_free (encoded);
     return TRUE;
 }

 /*
  * Decodes the hex string @src into bytes and converts those bytes from
  * @charset to UTF-8.
  *
  * For the UTF8 and IRA charsets the decoded bytes are returned as they are,
  * without any conversion. Returns a newly allocated string, or NULL when
  * the charset has no "from" codeset name, the hex decoding fails or the
  * conversion fails.
  */
 char *
 mm_modem_charset_hex_to_utf8 (const char *src, MMModemCharset charset)
 {
     GError *error = NULL;
     gsize raw_len = 0;
     const char *iconv_from;
     char *raw;
     char *utf8;

     g_return_val_if_fail (src != NULL, NULL);
     g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL);

     iconv_from = charset_iconv_from (charset);
     g_return_val_if_fail (iconv_from != NULL, NULL);

     raw = utils_hexstr2bin (src, &raw_len);
     if (raw == NULL)
         return NULL;

     /* These two are handed back exactly as decoded */
     if (charset == MM_MODEM_CHARSET_IRA || charset == MM_MODEM_CHARSET_UTF8)
         return raw;

     utf8 = g_convert (raw, raw_len,
                       "UTF-8//TRANSLIT", iconv_from,
                       NULL, NULL, &error);
     g_free (raw);

     if (utf8 == NULL || error != NULL) {
         g_clear_error (&error);
         return NULL;
     }

     return utf8;
 }

 /*
  * Converts the UTF-8 string @src to @charset and returns the hex
  * representation of the converted bytes.
  *
  * For the UTF8 and IRA charsets a plain copy of @src is returned instead
  * (no hex encoding is applied). Returns a newly allocated string, or NULL
  * when the charset has no codeset name or the conversion fails.
  */
 char *
 mm_modem_charset_utf8_to_hex (const char *src, MMModemCharset charset)
 {
     GError *error = NULL;
     gsize encoded_len = 0;
     const char *iconv_to;
     char *encoded;
     gchar *result;

     g_return_val_if_fail (src != NULL, NULL);
     g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL);

     /* The target codeset is taken from the "from" column of the table */
     iconv_to = charset_iconv_from (charset);
     g_return_val_if_fail (iconv_to != NULL, NULL);

     if (charset == MM_MODEM_CHARSET_IRA || charset == MM_MODEM_CHARSET_UTF8)
         return g_strdup (src);

     encoded = g_convert (src, strlen (src),
                          iconv_to, "UTF-8//TRANSLIT",
                          NULL, &encoded_len, &error);
     if (encoded == NULL || error != NULL) {
         g_clear_error (&error);
         g_free (encoded);
         return NULL;
     }

     /* Hex-encode the converted bytes, then drop the intermediate buffer */
     result = utils_bin2hexstr ((guint8 *) encoded, encoded_len);
     g_free (encoded);
     return result;
 }

 /* GSM 03.38 encoding conversion stuff */

 /* Number of entries in the default and extended alphabet tables below */
 #define GSM_DEF_ALPHABET_SIZE 128
 #define GSM_EXT_ALPHABET_SIZE 10

 /* One GSM <-> UTF-8 mapping: the UTF-8 bytes of a single character */
 typedef struct GsmUtf8Mapping {
     gchar chars[3];  /* UTF-8 bytes; unused trailing bytes are 0x00 */
     guint8 len;      /* number of bytes of chars[] in use */
     guint8 gsm;  /* only used for extended GSM charset */
 } GsmUtf8Mapping;

 /* Initializers for default-alphabet entries of one and two UTF-8 bytes;
  * the gsm field is left at 0 */
 #define ONE(a)     { {a, 0x00, 0x00}, 1, 0 }
 #define TWO(a, b)  { {a, b,    0x00}, 2, 0 }

 /**
  * gsm_def_utf8_alphabet:
  *
  * UTF-8 encoding of every character of the GSM default alphabet. The array
  * index is the GSM character value (0x00..0x7f); each row of the
  * initializer below covers four consecutive values, starting at the value
  * given in the row comment.
  *
  * Source: ETSI GSM 03.38, version 6.0.1, section 6.2.1 (default alphabet,
  * mapping to UCS-2), following
  * http://unicode.org/Public/MAPPINGS/ETSI/GSM0338.TXT
  *
  * The slot for the escape code (0x1b) holds the single byte 0xa0.
  */
 static const GsmUtf8Mapping gsm_def_utf8_alphabet[GSM_DEF_ALPHABET_SIZE] = {
     /* 0x00:  @       £                $                ¥   */
     ONE(0x40),       TWO(0xc2, 0xa3), ONE(0x24),       TWO(0xc2, 0xa5),
     /* 0x04:  è       é                ù                ì   */
     TWO(0xc3, 0xa8), TWO(0xc3, 0xa9), TWO(0xc3, 0xb9), TWO(0xc3, 0xac),
     /* 0x08:  ò       Ç                \n               Ø   */
     TWO(0xc3, 0xb2), TWO(0xc3, 0x87), ONE(0x0a),       TWO(0xc3, 0x98),
     /* 0x0c:  ø       \r               Å                å   */
     TWO(0xc3, 0xb8), ONE(0x0d),       TWO(0xc3, 0x85), TWO(0xc3, 0xa5),
     /* 0x10:  Δ       _                Φ                Γ   */
     TWO(0xce, 0x94), ONE(0x5f),       TWO(0xce, 0xa6), TWO(0xce, 0x93),
     /* 0x14:  Λ       Ω                Π                Ψ   */
     TWO(0xce, 0x9b), TWO(0xce, 0xa9), TWO(0xce, 0xa0), TWO(0xce, 0xa8),
     /* 0x18:  Σ       Θ                Ξ                Escape Code */
     TWO(0xce, 0xa3), TWO(0xce, 0x98), TWO(0xce, 0x9e), ONE(0xa0),
     /* 0x1c:  Æ       æ                ß                É   */
     TWO(0xc3, 0x86), TWO(0xc3, 0xa6), TWO(0xc3, 0x9f), TWO(0xc3, 0x89),
     /* 0x20:  ' '     !                "                #   */
     ONE(0x20),       ONE(0x21),       ONE(0x22),       ONE(0x23),
     /* 0x24:  ¤       %                &                '   */
     TWO(0xc2, 0xa4), ONE(0x25),       ONE(0x26),       ONE(0x27),
     /* 0x28:  (       )                *                +   */
     ONE(0x28),       ONE(0x29),       ONE(0x2a),       ONE(0x2b),
     /* 0x2c:  ,       -                .                /   */
     ONE(0x2c),       ONE(0x2d),       ONE(0x2e),       ONE(0x2f),
     /* 0x30:  0       1                2                3   */
     ONE(0x30),       ONE(0x31),       ONE(0x32),       ONE(0x33),
     /* 0x34:  4       5                6                7   */
     ONE(0x34),       ONE(0x35),       ONE(0x36),       ONE(0x37),
     /* 0x38:  8       9                :                ;   */
     ONE(0x38),       ONE(0x39),       ONE(0x3a),       ONE(0x3b),
     /* 0x3c:  <       =                >                ?   */
     ONE(0x3c),       ONE(0x3d),       ONE(0x3e),       ONE(0x3f),
     /* 0x40:  ¡       A                B                C   */
     TWO(0xc2, 0xa1), ONE(0x41),       ONE(0x42),       ONE(0x43),
     /* 0x44:  D       E                F                G   */
     ONE(0x44),       ONE(0x45),       ONE(0x46),       ONE(0x47),
     /* 0x48:  H       I                J                K   */
     ONE(0x48),       ONE(0x49),       ONE(0x4a),       ONE(0x4b),
     /* 0x4c:  L       M                N                O   */
     ONE(0x4c),       ONE(0x4d),       ONE(0x4e),       ONE(0x4f),
     /* 0x50:  P       Q                R                S   */
     ONE(0x50),       ONE(0x51),       ONE(0x52),       ONE(0x53),
     /* 0x54:  T       U                V                W   */
     ONE(0x54),       ONE(0x55),       ONE(0x56),       ONE(0x57),
     /* 0x58:  X       Y                Z                Ä   */
     ONE(0x58),       ONE(0x59),       ONE(0x5a),       TWO(0xc3, 0x84),
     /* 0x5c:  Ö       Ñ                Ü                §   */
     TWO(0xc3, 0x96), TWO(0xc3, 0x91), TWO(0xc3, 0x9c), TWO(0xc2, 0xa7),
     /* 0x60:  ¿       a                b                c   */
     TWO(0xc2, 0xbf), ONE(0x61),       ONE(0x62),       ONE(0x63),
     /* 0x64:  d       e                f                g   */
     ONE(0x64),       ONE(0x65),       ONE(0x66),       ONE(0x67),
     /* 0x68:  h       i                j                k   */
     ONE(0x68),       ONE(0x69),       ONE(0x6a),       ONE(0x6b),
     /* 0x6c:  l       m                n                o   */
     ONE(0x6c),       ONE(0x6d),       ONE(0x6e),       ONE(0x6f),
     /* 0x70:  p       q                r                s   */
     ONE(0x70),       ONE(0x71),       ONE(0x72),       ONE(0x73),
     /* 0x74:  t       u                v                w   */
     ONE(0x74),       ONE(0x75),       ONE(0x76),       ONE(0x77),
     /* 0x78:  x       y                z                ä   */
     ONE(0x78),       ONE(0x79),       ONE(0x7a),       TWO(0xc3, 0xa4),
     /* 0x7c:  ö       ñ                ü                à   */
     TWO(0xc3, 0xb6), TWO(0xc3, 0xb1), TWO(0xc3, 0xbc), TWO(0xc3, 0xa0)
 };

 /*
  * Copies the UTF-8 bytes of default-alphabet character @gsm into @out_utf8
  * and returns how many bytes were written. Returns 0 without writing
  * anything when @gsm is outside the default alphabet table.
  */
 static guint8
 gsm_def_char_to_utf8 (const guint8 gsm, guint8 out_utf8[2])
 {
     const GsmUtf8Mapping *entry;

     g_return_val_if_fail (gsm < GSM_DEF_ALPHABET_SIZE, 0);

     entry = &gsm_def_utf8_alphabet[gsm];
     memcpy (out_utf8, entry->chars, entry->len);
     return entry->len;
 }

 /*
  * Searches the default alphabet for the UTF-8 character made of the first
  * @len bytes of @utf8. On a match stores the table index (the GSM value)
  * in @out_gsm and returns TRUE; otherwise returns FALSE and leaves
  * @out_gsm untouched. Only lengths 1 to 3 are searched.
  */
 static gboolean
 utf8_to_gsm_def_char (const char *utf8, guint32 len, guint8 *out_gsm)
 {
     int idx;

     if (len == 0 || len >= 4)
         return FALSE;

     for (idx = 0; idx < GSM_DEF_ALPHABET_SIZE; idx++) {
         const GsmUtf8Mapping *entry = &gsm_def_utf8_alphabet[idx];

         /* Compare bytes only when the encoded lengths agree */
         if (entry->len != len)
             continue;
         if (memcmp (&entry->chars[0], utf8, len) != 0)
             continue;

         *out_gsm = idx;
         return TRUE;
     }

     return FALSE;
 }


 /* Initializers for extended-alphabet entries: one or three UTF-8 bytes
  * followed by the GSM value g stored in the gsm field */
 #define EONE(a, g)        { {a, 0x00, 0x00}, 1, g }
 #define ETHR(a, b, c, g)  { {a, b,    c},    3, g }

 /**
  * gsm_ext_utf8_alphabet:
  *
  * Mapping from GSM extended alphabet to UTF-8.
  *
  * Unlike the default alphabet table, this table is not indexed by GSM
  * value: each entry carries its GSM value in the gsm field and the lookup
  * helpers scan the entries one by one.
  */
 static const GsmUtf8Mapping gsm_ext_utf8_alphabet[GSM_EXT_ALPHABET_SIZE] = {
     /* form feed      ^                 {                 }  */
     EONE(0x0c, 0x0a), EONE(0x5e, 0x14), EONE(0x7b, 0x28), EONE(0x7d, 0x29),
     /* \              [                 ~                 ]  */
     EONE(0x5c, 0x2f), EONE(0x5b, 0x3c), EONE(0x7e, 0x3d), EONE(0x5d, 0x3e),
     /* |              €                                      */
     EONE(0x7c, 0x40), ETHR(0xe2, 0x82, 0xac, 0x65)
 };

 /* GSM value that announces that the following value belongs to the
  * extended alphabet */
 #define GSM_ESCAPE_CHAR 0x1b

 /*
  * Looks up extended-alphabet value @gsm. On a match copies its UTF-8 bytes
  * (1 or 3 of them) into @out_utf8 and returns the byte count; returns 0
  * when @gsm is not part of the extended alphabet.
  */
 static guint8
 gsm_ext_char_to_utf8 (const guint8 gsm, guint8 out_utf8[3])
 {
     const GsmUtf8Mapping *entry = gsm_ext_utf8_alphabet;
     const GsmUtf8Mapping *const table_end = gsm_ext_utf8_alphabet + GSM_EXT_ALPHABET_SIZE;

     for (; entry < table_end; entry++) {
         if (entry->gsm != gsm)
             continue;

         memcpy (out_utf8, entry->chars, entry->len);
         return entry->len;
     }

     return 0;
 }

 /*
  * Searches the extended alphabet for the UTF-8 character made of the first
  * @len bytes of @utf8. On a match stores the entry's GSM value in @out_gsm
  * and returns TRUE; otherwise returns FALSE and leaves @out_gsm untouched.
  * Only lengths 1 to 3 are searched.
  */
 static gboolean
 utf8_to_gsm_ext_char (const char *utf8, guint32 len, guint8 *out_gsm)
 {
     int idx;

     if (len == 0 || len >= 4)
         return FALSE;

     for (idx = 0; idx < GSM_EXT_ALPHABET_SIZE; idx++) {
         const GsmUtf8Mapping *entry = &gsm_ext_utf8_alphabet[idx];

         /* Compare bytes only when the encoded lengths agree */
         if (entry->len != len)
             continue;
         if (memcmp (&entry->chars[0], utf8, len) != 0)
             continue;

         *out_gsm = entry->gsm;
         return TRUE;
     }

     return FALSE;
 }

 /*
  * Converts @len unpacked GSM characters (one per byte of @gsm) to a
  * NUL-terminated UTF-8 string.
  *
  * An escape character followed by a known extended-alphabet value yields
  * the corresponding extended character. Anything that cannot be decoded is
  * replaced by '?': a value outside the default alphabet, an escape followed
  * by an unknown value, or an escape that is the very last byte of the
  * input.
  *
  * Returns a newly allocated buffer that the caller must free, or NULL when
  * @gsm is NULL or @len is 4096 or more.
  */
 guint8 *
 mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32 len)
 {
     guint32 i;
     GByteArray *utf8;

     g_return_val_if_fail (gsm != NULL, NULL);
     g_return_val_if_fail (len < 4096, NULL);

     /* Initial size guess only; the array grows on demand */
     utf8 = g_byte_array_sized_new (len * 2 + 1);

     for (i = 0; i < len; i++) {
         guint8 uchars[4];
         guint8 ulen = 0;

         if (gsm[i] == GSM_ESCAPE_CHAR) {
             /* Extended alphabet, decode next char. A trailing escape has no
              * next char: never read past the end of the input for it. */
             if (i + 1 < len)
                 ulen = gsm_ext_char_to_utf8 (gsm[i + 1], uchars);
             if (ulen)
                 i += 1;
         } else {
             /* Default alphabet */
             ulen = gsm_def_char_to_utf8 (gsm[i], uchars);
         }

         if (ulen)
             g_byte_array_append (utf8, &uchars[0], ulen);
         else
             g_byte_array_append (utf8, (guint8 *) "?", 1);
     }

     g_byte_array_append (utf8, (guint8 *) "\0", 1);  /* NUL terminator */
     return g_byte_array_free (utf8, FALSE);
 }

 /*
  * Converts the UTF-8 string @utf8 to unpacked GSM characters (one per
  * byte).
  *
  * Characters of the extended alphabet are emitted as the escape character
  * followed by their GSM value. Characters that exist in neither alphabet
  * are emitted as '?' (0x3f).
  *
  * @out_len receives the number of bytes produced. Returns a newly allocated
  * buffer that the caller must free, or NULL when an argument is NULL or
  * @utf8 is not valid UTF-8. Only the empty-string result carries a
  * trailing NUL byte; in every other case rely on @out_len.
  */
 guint8 *
 mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len)
 {
     GByteArray *gsm;
     const char *c, *next;
     static const guint8 gesc = GSM_ESCAPE_CHAR;

     g_return_val_if_fail (utf8 != NULL, NULL);
     g_return_val_if_fail (out_len != NULL, NULL);
     g_return_val_if_fail (g_utf8_validate (utf8, -1, NULL), NULL);

     /* Initial size guess only; the array grows on demand */
     gsm = g_byte_array_sized_new (g_utf8_strlen (utf8, -1) * 2 + 1);

     if (*utf8 == 0x00) {
         /* Zero-length string */
         g_byte_array_append (gsm, (guint8 *) "\0", 1);
         *out_len = 0;
         return g_byte_array_free (gsm, FALSE);
     }

     for (c = utf8; *c; c = next) {
         guint8 gch = 0x3f;  /* 0x3f == '?' */

         next = g_utf8_next_char (c);

         /* Try escaped chars first, then default alphabet */
         if (utf8_to_gsm_ext_char (c, next - c, &gch)) {
             /* Add the escape char */
             g_byte_array_append (gsm, &gesc, 1);
             g_byte_array_append (gsm, &gch, 1);
         } else if (utf8_to_gsm_def_char (c, next - c, &gch)) {
             g_byte_array_append (gsm, &gch, 1);
         } else {
             /* Not representable: neither lookup touched gch, so it still
              * holds '?'. Emit it instead of silently dropping the char. */
             g_byte_array_append (gsm, &gch, 1);
         }
     }

     *out_len = gsm->len;
     return g_byte_array_free (gsm, FALSE);
 }

 /*
  * Subset check for the GSM charset: reports whether the UTF-8 character at
  * @utf8 (@ulen bytes) exists in the default or the extended alphabet.
  * @out_clen is set to 2 for extended-alphabet characters and to 1 in every
  * other case, including unsupported characters. @c is not used.
  */
 static gboolean
 gsm_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
 {
     guint8 ignored;

     if (utf8_to_gsm_def_char (utf8, ulen, &ignored)) {
         *out_clen = 1;
         return TRUE;
     }

     if (utf8_to_gsm_ext_char (utf8, ulen, &ignored)) {
         /* Escape character plus the character itself */
         *out_clen = 2;
         return TRUE;
     }

     *out_clen = 1;
     return FALSE;
 }

 /*
  * Subset check for the IRA charset: a character is supported when its
  * UTF-8 encoding is exactly one byte long. @out_clen is always 1.
  * @c and @utf8 are not used.
  */
 static gboolean
 ira_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
 {
     const gboolean single_byte = (ulen == 1);

     *out_clen = 1;
     return single_byte;
 }

 /*
  * Subset check for the UCS2 charset: a character is supported when its
  * code point does not exceed 0xFFFF. @out_clen is always 2.
  * @utf8 and @ulen are not used.
  */
 static gboolean
 ucs2_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
 {
     const gboolean fits = (c <= 0xFFFF);

     *out_clen = 2;
     return fits;
 }

 /*
  * Subset check for the 8859-1 charset: a character is supported when its
  * code point does not exceed 0xFF. @out_clen is always 1.
  * @utf8 and @ulen are not used.
  */
 static gboolean
 iso88591_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
 {
     const gboolean fits = (c <= 0xFF);

     *out_clen = 1;
     return fits;
 }

 /*
  * Subset check for the PCCP437 charset: a character is supported when its
  * code point is at most 0x7F or is listed in the table below.
  * @out_clen is always 1. @utf8 and @ulen are not used.
  */
 static gboolean
 pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
 {
     /* Code points above 0x7F accepted for this charset */
     static const gunichar t[] = {
         0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea,
         0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x00c9, 0x00e6,
         0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, 0x00d6, 0x00dc,
         0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192, 0x00e1, 0x00ed, 0x00f3, 0x00fa,
         0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc,
         0x00a1, 0x00ab, 0x00bb, 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561,
         0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b,
         0x2510, 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f,
         0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, 0x2568,
         0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, 0x2518,
         0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, 0x03b1, 0x00df, 0x0393,
         0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4, 0x03a6, 0x0398, 0x03a9, 0x03b4,
         0x221e, 0x03c6, 0x03b5, 0x2229, 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320,
         0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2,
         0x25a0, 0x00a0
     };
     const gunichar *candidate;
     const gunichar *const table_end = t + (sizeof (t) / sizeof (t[0]));

     *out_clen = 1;

     if (c <= 0x7F)
         return TRUE;

     for (candidate = t; candidate < table_end; candidate++) {
         if (*candidate == c)
             return TRUE;
     }

     return FALSE;
 }

 /*
  * Subset check for the PCDN charset: a character is supported when its
  * code point is at most 0x7F or is listed in the table below.
  * @out_clen is always 1. @utf8 and @ulen are not used.
  */
 static gboolean
 pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
 {
     /* Code points above 0x7F accepted for this charset */
     static const gunichar t[] = {
         0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea,
         0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x00c9, 0x00e6,
         0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, 0x00d6, 0x00dc,
         0x00f8, 0x00a3, 0x00d8, 0x00d7, 0x0192, 0x00e1, 0x00ed, 0x00f3, 0x00fa,
         0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x00ae, 0x00ac, 0x00bd, 0x00bc,
         0x00a1, 0x00ab, 0x00bb, 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1,
         0x00c2, 0x00c0, 0x00a9, 0x2563, 0x2551, 0x2557, 0x255d, 0x00a2, 0x00a5,
         0x2510, 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x00e3, 0x00c3,
         0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4, 0x00f0,
         0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x0131, 0x00cd, 0x00ce, 0x00cf, 0x2518,
         0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580, 0x00d3, 0x00df, 0x00d4,
         0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe, 0x00de, 0x00da, 0x00db, 0x00d9,
         0x00fd, 0x00dd, 0x00af, 0x00b4, 0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6,
         0x00a7, 0x00f7, 0x00b8, 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2,
         0x25a0, 0x00a0
     };
     const gunichar *candidate;
     const gunichar *const table_end = t + (sizeof (t) / sizeof (t[0]));

     *out_clen = 1;

     if (c <= 0x7F)
         return TRUE;

     for (candidate = t; candidate < table_end; candidate++) {
         if (*candidate == c)
             return TRUE;
     }

     return FALSE;
 }

 /*
  * Pairs a charset with the function that decides whether a single
  * character can be represented in it.
  */
 typedef struct {
     MMModemCharset cs;  /* charset this entry applies to */
     /* Returns TRUE when the character is representable; stores the
      * character's encoded size in @out_clen */
     gboolean (*func) (gunichar c, const char *utf8, gsize ulen, guint *out_clen);
     guint charsize;     /* not set by the subset_table initializers in this file */
 } SubsetEntry;

 /*
  * Per-charset subset checkers used by mm_charset_get_encoded_len(). The
  * search there stops at the MM_MODEM_CHARSET_UNKNOWN entry, which must stay
  * last. The charsize field is left zero-initialized for every entry.
  */
 SubsetEntry subset_table[] = {
     { MM_MODEM_CHARSET_GSM,     gsm_is_subset },
     { MM_MODEM_CHARSET_IRA,     ira_is_subset },
     { MM_MODEM_CHARSET_UCS2,    ucs2_is_subset },
     { MM_MODEM_CHARSET_8859_1,  iso88591_is_subset },
     { MM_MODEM_CHARSET_PCCP437, pccp437_is_subset },
     { MM_MODEM_CHARSET_PCDN,    pcdn_is_subset },
     /* Terminator entry */
     { MM_MODEM_CHARSET_UNKNOWN, NULL },
 };

 /**
  * mm_charset_get_encoded_len:
  *
  * @utf8: UTF-8 valid string
  * @charset: the #MMModemCharset to check the length of @utf8 in
  * @out_unsupported: (optional) on return, number of characters of @utf8 that
  * are not fully representable in @charset. Set to 0 as soon as the arguments
  * have been accepted, so it holds a defined value on every later return
  * path, including the UTF-8 shortcut.
  *
  * For %MM_MODEM_CHARSET_UTF8 the result is simply the byte length of @utf8.
  * For the other charsets every character is passed to the matching checker
  * of the subset table and the sizes the checkers report are added up.
  *
  * Returns: the size in bytes of the string if converted from UTF-8 into
  * @charset, or 0 if @charset has no subset checker or @utf8 contains an
  * invalid or truncated UTF-8 sequence.
  **/
 guint
 mm_charset_get_encoded_len (const char *utf8,
                             MMModemCharset charset,
                             guint *out_unsupported)
 {
     const char *p = utf8;
     guint len = 0, unsupported = 0;
     SubsetEntry *e;

     g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, 0);
     g_return_val_if_fail (utf8 != NULL, 0);

     /* Never leave the caller's counter uninitialized */
     if (out_unsupported)
         *out_unsupported = 0;

     if (charset == MM_MODEM_CHARSET_UTF8)
         return strlen (utf8);

     /* Find the charset in our subset table */
     for (e = &subset_table[0];
          e->cs != charset && e->cs != MM_MODEM_CHARSET_UNKNOWN;
          e++);
     g_return_val_if_fail (e->cs != MM_MODEM_CHARSET_UNKNOWN, 0);

     while (*p) {
         gunichar c;
         const char *next;
         guint clen = 0;

         /* (gunichar) -1 flags an invalid sequence, (gunichar) -2 a sequence
          * cut short by the end of the string; neither is a character */
         c = g_utf8_get_char_validated (p, -1);
         g_return_val_if_fail (c != (gunichar) -1 && c != (gunichar) -2, 0);

         next = g_utf8_find_next_char (p, NULL);
         if (next == NULL) {
             /* Defensive only: treat the rest of the string as one char and
              * stop at the terminating NUL */
             next = p + strlen (p);
         }

         if (!e->func (c, p, (gsize) (next - p), &clen))
             unsupported++;
         len += clen;
         p = next;
     }

     if (out_unsupported)
         *out_unsupported = unsupported;
     return len;
 }

 /*
  * Unpacks @num_septets 7-bit values from the packed buffer @gsm, starting
  * @start_offset bits into the buffer, into one byte per value.
  *
  * @out_unpacked_len receives the number of bytes produced. Returns a newly
  * allocated buffer that the caller must free. The caller must make sure
  * @gsm is large enough for the requested number of septets.
  */
 guint8 *
 gsm_unpack (const guint8 *gsm,
             guint32 num_septets,
             guint8 start_offset,  /* in _bits_ */
             guint32 *out_unpacked_len)
 {
     GByteArray *septets;
     int n;

     septets = g_byte_array_sized_new (num_septets + 1);

     for (n = 0; n < num_septets; n++) {
         /* Overall bit position of this septet, split into byte + shift */
         const guint32 bit_pos = start_offset + (n * 7);
         const guint32 byte_idx = bit_pos / 8;
         const guint8 shift = bit_pos % 8;
         /* How many of the 7 bits live in this byte and in the next one */
         const guint8 low_bits = shift ? (8 - shift) : 7;
         const guint8 high_bits = 7 - low_bits;
         guint8 value;

         value = (gsm[byte_idx] >> shift) & (0xFF >> (8 - low_bits));

         /* Pick up whatever spilled over into the following byte */
         if (high_bits)
             value |= (gsm[byte_idx + 1] & (0xFF >> (8 - high_bits))) << low_bits;

         g_byte_array_append (septets, &value, 1);
     }

     *out_unpacked_len = septets->len;
     return g_byte_array_free (septets, FALSE);
 }

 /*
  * Packs the low 7 bits of each of the @src_len bytes of @src into a
  * contiguous bit stream that begins @start_offset bits (0..7) into the
  * output buffer.
  *
  * @out_packed_len, when given, receives the size of the packed buffer in
  * bytes. Returns a newly allocated, zero-initialized buffer that the caller
  * must free, or NULL when @start_offset is 8 or more.
  */
 guint8 *
 gsm_pack (const guint8 *src,
           guint32 src_len,
           guint8 start_offset,
           guint32 *out_packed_len)
 {
     guint8 *out;
     guint octet = 0, shift, plen;
     int idx;

     g_return_val_if_fail (start_offset < 8, NULL);

     /* Size in bits first, then rounded up to whole bytes */
     plen = (src_len * 7) + start_offset;
     if (plen % 8)
         plen += 8;
     plen /= 8;

     out = g_malloc0 (plen);

     shift = start_offset;
     for (idx = 0; idx < src_len; idx++) {
         const guint8 septet = src[idx] & 0x7F;

         out[octet] |= septet << shift;

         if (shift > 1) {
             /* The top bits did not fit: they open the next octet */
             g_assert (octet + 1 < plen);
             out[octet + 1] = septet >> (8 - shift);
         }

         if (shift == 0) {
             /* Septet sat in bits 0..6; the next one starts at bit 7 */
             shift = 7;
         } else {
             octet++;
             shift--;
         }
     }

     if (out_packed_len)
         *out_packed_len = plen;
     return out;
 }

 /* We do all our best to get the given string, which is possibly given in the
  * specified charset, to UTF8. It may happen that the given string is really
  * the hex representation of the charset-encoded string, so we need to cope with
  * that case.
  *
  * Ownership of @str is taken: it is either returned as the result or freed.
  * Returns a valid UTF-8 string that the caller must free, or NULL when @str
  * is NULL, the conversion fails or the result is not valid UTF-8. */
 gchar *
 mm_charset_take_and_convert_to_utf8 (gchar *str,
                                      MMModemCharset charset)
 {
     gchar *utf8 = NULL;

     if (!str)
         return NULL;

     switch (charset) {
     case MM_MODEM_CHARSET_UNKNOWN:
         g_warn_if_reached ();
         utf8 = str;
         break;

     case MM_MODEM_CHARSET_HEX:
         /* We'll assume that the HEX string is really valid ASCII at the end */
         utf8 = str;
         break;

     case MM_MODEM_CHARSET_GSM:
     case MM_MODEM_CHARSET_8859_1:
     case MM_MODEM_CHARSET_PCCP437:
     case MM_MODEM_CHARSET_PCDN: {
         const gchar *iconv_from;
         GError *error = NULL;

         iconv_from = charset_iconv_from (charset);
         /* Without a codeset name there is nothing to convert with; treat it
          * as a failed conversion instead of handing NULL to g_convert() */
         if (iconv_from)
             utf8 = g_convert (str, strlen (str),
                               "UTF-8//TRANSLIT", iconv_from,
                               NULL, NULL, &error);
         if (!utf8 || error) {
             g_clear_error (&error);
             g_free (utf8);
             utf8 = NULL;
         }

         g_free (str);
         break;
     }

     case MM_MODEM_CHARSET_UCS2: {
         gsize len;
         gboolean possibly_hex = TRUE;

         /* If the string comes in hex-UCS-2, len needs to be a multiple of 4 */
         len = strlen (str);
         if ((len < 4) || ((len % 4) != 0))
             possibly_hex = FALSE;
         else {
             const gchar *p = str;

             /* All chars in the string must be hex. The cast keeps bytes
              * above 0x7f in the range isxdigit() is defined for. */
             while (*p && possibly_hex)
                 possibly_hex = isxdigit ((unsigned char) *p++) ? TRUE : FALSE;
         }

         /* If we get UCS-2, we expect the HEX representation of the string */
         if (possibly_hex) {
             utf8 = mm_modem_charset_hex_to_utf8 (str, charset);
             if (!utf8) {
                 /* If we couldn't convert the string as HEX-UCS-2, try to see if
                  * the string is valid UTF-8 itself. */
                 utf8 = str;
             } else
                 g_free (str);
         } else
             /* If we already know it's not hex, try to use the string as it is */
             utf8 = str;

         break;
     }

     /* If the given charset is ASCII or UTF8, we really expect the final string
      * already here */
     case MM_MODEM_CHARSET_IRA:
     case MM_MODEM_CHARSET_UTF8:
         utf8 = str;
         break;
     }

     /* Validate UTF-8 always before returning. This result will be exposed in DBus
      * very likely... A failed conversion leaves utf8 at NULL, which must not
      * reach g_utf8_validate(). */
     if (utf8 && !g_utf8_validate (utf8, -1, NULL)) {
         /* Better return NULL than an invalid UTF-8 string */
         g_free (utf8);
         utf8 = NULL;
     }

     return utf8;
 }

 /* We do all our best to convert the given string, which comes in UTF-8, to the
  * specified charset. It may be that the output string needs to be the hex
  * representation of the charset-encoded string, so we need to cope with that
  * case.
  *
  * Ownership of @str is taken: it is either returned as the result or freed.
  * For UCS2 the result is the hex representation of the converted bytes.
  * Returns a string that the caller must free, or NULL when @str is NULL,
  * is not valid UTF-8, or cannot be converted. */
 gchar *
 mm_utf8_take_and_convert_to_charset (gchar *str,
                                      MMModemCharset charset)
 {
     gchar *encoded = NULL;

     if (!str)
         return NULL;

     /* Validate UTF-8 always before converting */
     if (!g_utf8_validate (str, -1, NULL)) {
         /* Better return NULL than an invalid encoded string */
         g_free (str);
         return NULL;
     }

     switch (charset) {
     case MM_MODEM_CHARSET_UNKNOWN:
         g_warn_if_reached ();
         encoded = str;
         break;

     case MM_MODEM_CHARSET_HEX:
         /* FIXME: What encoding is this? */
         g_warn_if_reached ();
         encoded = str;
         break;

     case MM_MODEM_CHARSET_GSM:
     case MM_MODEM_CHARSET_8859_1:
     case MM_MODEM_CHARSET_PCCP437:
     case MM_MODEM_CHARSET_PCDN: {
         const gchar *iconv_to;
         GError *error = NULL;

         iconv_to = charset_iconv_from (charset);
         /* Without a codeset name there is nothing to convert with; treat it
          * as a failed conversion instead of handing NULL to g_convert() */
         if (iconv_to)
             encoded = g_convert (str, strlen (str),
                                  iconv_to, "UTF-8",
                                  NULL, NULL, &error);
         if (!encoded || error) {
             g_clear_error (&error);
             g_free (encoded);
             encoded = NULL;
         }

         g_free (str);
         break;
     }

     case MM_MODEM_CHARSET_UCS2: {
         const gchar *iconv_to;
         gsize encoded_len = 0;
         GError *error = NULL;

         iconv_to = charset_iconv_from (charset);
         if (iconv_to)
             encoded = g_convert (str, strlen (str),
                                  iconv_to, "UTF-8",
                                  NULL, &encoded_len, &error);
         if (!encoded || error) {
             g_clear_error (&error);
             g_free (encoded);
             encoded = NULL;
         }

         /* Get hex representation of the string, but only when there are
          * converted bytes to encode: a failed conversion yields NULL */
         if (encoded) {
             gchar *hex;

             hex = utils_bin2hexstr ((guint8 *) encoded, encoded_len);
             g_free (encoded);
             encoded = hex;
         }

         g_free (str);
         break;
     }

     /* If the given charset is ASCII or UTF8, we really expect the final string
      * already here. */
     case MM_MODEM_CHARSET_IRA:
     case MM_MODEM_CHARSET_UTF8:
         encoded = str;
         break;
     }

     return encoded;
 }