blob: 0df99471c61453373dd86ca7ac4b3eec6a42548a [file]
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details:
*
* Copyright (C) 2010 Red Hat, Inc.
* Copyright (C) 2020 Aleksander Morgado <aleksander@aleksander.es>
*/
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#define _LIBMM_INSIDE_MM
#include <libmm-glib.h>
#include "mm-charsets.h"
#include "mm-log.h"
/* Common fallback character when transliteration is enabled */
static const gchar *translit_fallback = "?";
/******************************************************************************/
/* Expected charset settings */
typedef struct {
MMModemCharset charset;
const gchar *gsm_name;
const gchar *other_name;
const gchar *iconv_name;
} CharsetSettings;
static const CharsetSettings charset_settings[] = {
{ MM_MODEM_CHARSET_UTF8, "UTF-8", "UTF8", "UTF-8" },
{ MM_MODEM_CHARSET_UCS2, "UCS2", NULL, "UCS-2BE" },
{ MM_MODEM_CHARSET_IRA, "IRA", "ASCII", "ASCII" },
{ MM_MODEM_CHARSET_GSM, "GSM", NULL, NULL },
{ MM_MODEM_CHARSET_8859_1, "8859-1", NULL, "ISO8859-1" },
{ MM_MODEM_CHARSET_PCCP437, "PCCP437", "CP437", "CP437" },
{ MM_MODEM_CHARSET_PCDN, "PCDN", "CP850", "CP850" },
{ MM_MODEM_CHARSET_UTF16, "UTF-16", "UTF16", "UTF-16BE" },
};
MMModemCharset
mm_modem_charset_from_string (const gchar *string)
{
guint i;
g_return_val_if_fail (string != NULL, MM_MODEM_CHARSET_UNKNOWN);
for (i = 0; i < G_N_ELEMENTS (charset_settings); i++) {
if (strcasestr (string, charset_settings[i].gsm_name))
return charset_settings[i].charset;
if (charset_settings[i].other_name && strcasestr (string, charset_settings[i].other_name))
return charset_settings[i].charset;
}
return MM_MODEM_CHARSET_UNKNOWN;
}
static const CharsetSettings *
lookup_charset_settings (MMModemCharset charset)
{
guint i;
g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL);
for (i = 0; i < G_N_ELEMENTS (charset_settings); i++) {
if (charset_settings[i].charset == charset)
return &charset_settings[i];
}
g_warn_if_reached ();
return NULL;
}
const gchar *
mm_modem_charset_to_string (MMModemCharset charset)
{
const CharsetSettings *settings;
settings = lookup_charset_settings (charset);
return settings ? settings->gsm_name : NULL;
}
/******************************************************************************/
/* GSM 03.38 encoding conversion stuff */
#define GSM_DEF_ALPHABET_SIZE 128
#define GSM_EXT_ALPHABET_SIZE 10
typedef struct GsmUtf8Mapping {
gchar chars[3];
guint8 len;
guint8 gsm; /* only used for extended GSM charset */
} GsmUtf8Mapping;
#define ONE(a) { {a, 0x00, 0x00}, 1, 0 }
#define TWO(a, b) { {a, b, 0x00}, 2, 0 }
/**
* gsm_def_utf8_alphabet:
*
* Mapping from GSM default alphabet to UTF-8.
*
* ETSI GSM 03.38, version 6.0.1, section 6.2.1; Default alphabet. Mapping to UCS-2.
* Mapping according to http://unicode.org/Public/MAPPINGS/ETSI/GSM0338.TXT
*/
static const GsmUtf8Mapping gsm_def_utf8_alphabet[GSM_DEF_ALPHABET_SIZE] = {
/* @ £ $ ¥ */
ONE(0x40), TWO(0xc2, 0xa3), ONE(0x24), TWO(0xc2, 0xa5),
/* è é ù ì */
TWO(0xc3, 0xa8), TWO(0xc3, 0xa9), TWO(0xc3, 0xb9), TWO(0xc3, 0xac),
/* ò Ç \n Ø */
TWO(0xc3, 0xb2), TWO(0xc3, 0x87), ONE(0x0a), TWO(0xc3, 0x98),
/* ø \r Å å */
TWO(0xc3, 0xb8), ONE(0x0d), TWO(0xc3, 0x85), TWO(0xc3, 0xa5),
/* Δ _ Φ Γ */
TWO(0xce, 0x94), ONE(0x5f), TWO(0xce, 0xa6), TWO(0xce, 0x93),
/* Λ Ω Π Ψ */
TWO(0xce, 0x9b), TWO(0xce, 0xa9), TWO(0xce, 0xa0), TWO(0xce, 0xa8),
/* Σ Θ Ξ Escape Code */
TWO(0xce, 0xa3), TWO(0xce, 0x98), TWO(0xce, 0x9e), ONE(0xa0),
/* Æ æ ß É */
TWO(0xc3, 0x86), TWO(0xc3, 0xa6), TWO(0xc3, 0x9f), TWO(0xc3, 0x89),
/* ' ' ! " # */
ONE(0x20), ONE(0x21), ONE(0x22), ONE(0x23),
/* ¤ % & ' */
TWO(0xc2, 0xa4), ONE(0x25), ONE(0x26), ONE(0x27),
/* ( ) * + */
ONE(0x28), ONE(0x29), ONE(0x2a), ONE(0x2b),
/* , - . / */
ONE(0x2c), ONE(0x2d), ONE(0x2e), ONE(0x2f),
/* 0 1 2 3 */
ONE(0x30), ONE(0x31), ONE(0x32), ONE(0x33),
/* 4 5 6 7 */
ONE(0x34), ONE(0x35), ONE(0x36), ONE(0x37),
/* 8 9 : ; */
ONE(0x38), ONE(0x39), ONE(0x3a), ONE(0x3b),
/* < = > ? */
ONE(0x3c), ONE(0x3d), ONE(0x3e), ONE(0x3f),
/* ¡ A B C */
TWO(0xc2, 0xa1), ONE(0x41), ONE(0x42), ONE(0x43),
/* D E F G */
ONE(0x44), ONE(0x45), ONE(0x46), ONE(0x47),
/* H I J K */
ONE(0x48), ONE(0x49), ONE(0x4a), ONE(0x4b),
/* L M N O */
ONE(0x4c), ONE(0x4d), ONE(0x4e), ONE(0x4f),
/* P Q R S */
ONE(0x50), ONE(0x51), ONE(0x52), ONE(0x53),
/* T U V W */
ONE(0x54), ONE(0x55), ONE(0x56), ONE(0x57),
/* X Y Z Ä */
ONE(0x58), ONE(0x59), ONE(0x5a), TWO(0xc3, 0x84),
/* Ö Ñ Ü § */
TWO(0xc3, 0x96), TWO(0xc3, 0x91), TWO(0xc3, 0x9c), TWO(0xc2, 0xa7),
/* ¿ a b c */
TWO(0xc2, 0xbf), ONE(0x61), ONE(0x62), ONE(0x63),
/* d e f g */
ONE(0x64), ONE(0x65), ONE(0x66), ONE(0x67),
/* h i j k */
ONE(0x68), ONE(0x69), ONE(0x6a), ONE(0x6b),
/* l m n o */
ONE(0x6c), ONE(0x6d), ONE(0x6e), ONE(0x6f),
/* p q r s */
ONE(0x70), ONE(0x71), ONE(0x72), ONE(0x73),
/* t u v w */
ONE(0x74), ONE(0x75), ONE(0x76), ONE(0x77),
/* x y z ä */
ONE(0x78), ONE(0x79), ONE(0x7a), TWO(0xc3, 0xa4),
/* ö ñ ü à */
TWO(0xc3, 0xb6), TWO(0xc3, 0xb1), TWO(0xc3, 0xbc), TWO(0xc3, 0xa0)
};
static guint8
gsm_def_char_to_utf8 (const guint8 gsm,
guint8 out_utf8[2])
{
g_return_val_if_fail (gsm < GSM_DEF_ALPHABET_SIZE, 0);
memcpy (&out_utf8[0], &gsm_def_utf8_alphabet[gsm].chars[0], gsm_def_utf8_alphabet[gsm].len);
return gsm_def_utf8_alphabet[gsm].len;
}
static gboolean
utf8_to_gsm_def_char (const gchar *utf8,
guint32 len,
guint8 *out_gsm)
{
gint i;
if (len > 0 && len < 4) {
for (i = 0; i < GSM_DEF_ALPHABET_SIZE; i++) {
if (gsm_def_utf8_alphabet[i].len == len) {
if (memcmp (&gsm_def_utf8_alphabet[i].chars[0], utf8, len) == 0) {
*out_gsm = i;
return TRUE;
}
}
}
}
return FALSE;
}
static gboolean
translit_gsm_nul_byte (GByteArray *gsm)
{
guint i;
guint n_replaces = 0;
for (i = 0; i < gsm->len; i++) {
if (gsm->data[i] == 0x00) {
utf8_to_gsm_def_char (translit_fallback, strlen (translit_fallback), &gsm->data[i]);
n_replaces++;
}
}
return (n_replaces > 0);
}
#define EONE(a, g) { {a, 0x00, 0x00}, 1, g }
#define ETHR(a, b, c, g) { {a, b, c}, 3, g }
/**
* gsm_ext_utf8_alphabet:
*
* Mapping from GSM extended alphabet to UTF-8.
*
*/
static const GsmUtf8Mapping gsm_ext_utf8_alphabet[GSM_EXT_ALPHABET_SIZE] = {
/* form feed ^ { } */
EONE(0x0c, 0x0a), EONE(0x5e, 0x14), EONE(0x7b, 0x28), EONE(0x7d, 0x29),
/* \ [ ~ ] */
EONE(0x5c, 0x2f), EONE(0x5b, 0x3c), EONE(0x7e, 0x3d), EONE(0x5d, 0x3e),
/* | € */
EONE(0x7c, 0x40), ETHR(0xe2, 0x82, 0xac, 0x65)
};
#define GSM_ESCAPE_CHAR 0x1b
static guint8
gsm_ext_char_to_utf8 (const guint8 gsm,
guint8 out_utf8[3])
{
int i;
for (i = 0; i < GSM_EXT_ALPHABET_SIZE; i++) {
if (gsm == gsm_ext_utf8_alphabet[i].gsm) {
memcpy (&out_utf8[0], &gsm_ext_utf8_alphabet[i].chars[0], gsm_ext_utf8_alphabet[i].len);
return gsm_ext_utf8_alphabet[i].len;
}
}
return 0;
}
static gboolean
utf8_to_gsm_ext_char (const gchar *utf8,
guint32 len,
guint8 *out_gsm)
{
int i;
if (len > 0 && len < 4) {
for (i = 0; i < GSM_EXT_ALPHABET_SIZE; i++) {
if (gsm_ext_utf8_alphabet[i].len == len) {
if (memcmp (&gsm_ext_utf8_alphabet[i].chars[0], utf8, len) == 0) {
*out_gsm = gsm_ext_utf8_alphabet[i].gsm;
return TRUE;
}
}
}
}
return FALSE;
}
static guint8
utf8_to_gsm_char (const gchar *utf8,
guint32 len,
guint8 *out_gsm)
{
if (utf8_to_gsm_def_char (utf8, len, out_gsm))
return 1;
if (utf8_to_gsm_ext_char (utf8, len, out_gsm))
return 2;
return 0;
}
static guint8 *
charset_gsm_unpacked_to_utf8 (const guint8 *gsm,
guint32 len,
gboolean translit,
GError **error)
{
g_autoptr(GByteArray) utf8 = NULL;
guint i;
g_return_val_if_fail (gsm != NULL, NULL);
g_return_val_if_fail (len < 4096, NULL);
/* worst case initial length */
utf8 = g_byte_array_sized_new (len * 2 + 1);
for (i = 0; i < len; i++) {
guint8 uchars[4];
guint8 ulen = 0;
/*
* 0x00 is NULL (when followed only by 0x00 up to the
* end of (fixed byte length) message, possibly also up to
* FORM FEED. But 0x00 is also the code for COMMERCIAL AT
* when some other character (CARRIAGE RETURN if nothing else)
* comes after the 0x00.
* http://unicode.org/Public/MAPPINGS/ETSI/GSM0338.TXT
*
* So, if we find a '@' (0x00) and all the next chars after that
* are also 0x00, we can consider the string finished already.
*/
if (gsm[i] == 0x00) {
gsize j;
for (j = i + 1; j < len; j++) {
if (gsm[j] != 0x00)
break;
}
if (j == len)
break;
}
if (gsm[i] == GSM_ESCAPE_CHAR) {
/* Extended alphabet, decode next char */
if (i + 1 < len) {
ulen = gsm_ext_char_to_utf8 (gsm[i + 1], uchars);
if (ulen)
i += 1;
}
} else {
/* Default alphabet */
ulen = gsm_def_char_to_utf8 (gsm[i], uchars);
}
if (ulen)
g_byte_array_append (utf8, &uchars[0], ulen);
else if (translit)
g_byte_array_append (utf8, (guint8 *) translit_fallback, strlen (translit_fallback));
else {
g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
"Invalid conversion from GSM7");
return NULL;
}
}
/* Always make sure returned string is NUL terminated */
g_byte_array_append (utf8, (guint8 *) "\0", 1);
return g_byte_array_free (g_steal_pointer (&utf8), FALSE);
}
static guint8 *
charset_utf8_to_unpacked_gsm (const gchar *utf8,
gboolean translit,
guint32 *out_len,
GError **error)
{
g_autoptr(GByteArray) gsm = NULL;
const gchar *c;
const gchar *next;
static const guint8 gesc = GSM_ESCAPE_CHAR;
if (!utf8 || !g_utf8_validate (utf8, -1, NULL)) {
g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
"Couldn't convert UTF-8 to GSM: input UTF-8 validation failed");
return NULL;
}
/* worst case initial length */
gsm = g_byte_array_sized_new (g_utf8_strlen (utf8, -1) * 2 + 1);
if (*utf8 == 0x00) {
/* Zero-length string */
g_byte_array_append (gsm, (guint8 *) "\0", 1);
if (out_len)
*out_len = 0;
return g_byte_array_free (g_steal_pointer (&gsm), FALSE);
}
next = utf8;
c = utf8;
while (next && *next) {
guint8 gch = 0x3f; /* 0x3f == '?' */
next = g_utf8_next_char (c);
/* Try escaped chars first, then default alphabet */
if (utf8_to_gsm_ext_char (c, next - c, &gch)) {
/* Add the escape char */
g_byte_array_append (gsm, &gesc, 1);
g_byte_array_append (gsm, &gch, 1);
} else if (utf8_to_gsm_def_char (c, next - c, &gch)) {
g_byte_array_append (gsm, &gch, 1);
} else if (translit) {
/* add ? */
g_byte_array_append (gsm, &gch, 1);
} else {
g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
"Couldn't convert UTF-8 char to GSM");
return NULL;
}
c = next;
}
/* Output length doesn't consider terminating NUL byte */
if (out_len)
*out_len = gsm->len;
/* Always make sure returned string is NUL terminated */
g_byte_array_append (gsm, (guint8 *) "\0", 1);
return g_byte_array_free (g_steal_pointer (&gsm), FALSE);
}
/******************************************************************************/
/* Checks to see whether conversion to a target charset may be done without
* any loss. */
static gboolean
gsm_is_subset (gunichar c,
const gchar *utf8,
gsize ulen)
{
guint8 gsm;
if (utf8_to_gsm_def_char (utf8, ulen, &gsm))
return TRUE;
if (utf8_to_gsm_ext_char (utf8, ulen, &gsm))
return TRUE;
return FALSE;
}
static gboolean
ira_is_subset (gunichar c,
const gchar *utf8,
gsize ulen)
{
return (ulen == 1);
}
static gboolean
ucs2_is_subset (gunichar c,
const gchar *utf8,
gsize ulen)
{
return (c <= 0xFFFF);
}
static gboolean
utf16_is_subset (gunichar c,
const gchar *utf8,
gsize ulen)
{
return TRUE;
}
static gboolean
iso88591_is_subset (gunichar c,
const gchar *utf8,
gsize ulen)
{
return (c <= 0xFF);
}
static gboolean
pccp437_is_subset (gunichar c,
const gchar *utf8,
gsize ulen)
{
static const gunichar t[] = {
0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea,
0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x00c9, 0x00e6,
0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, 0x00d6, 0x00dc,
0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192, 0x00e1, 0x00ed, 0x00f3, 0x00fa,
0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc,
0x00a1, 0x00ab, 0x00bb, 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561,
0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b,
0x2510, 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f,
0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, 0x2568,
0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, 0x2518,
0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, 0x03b1, 0x00df, 0x0393,
0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4, 0x03a6, 0x0398, 0x03a9, 0x03b4,
0x221e, 0x03c6, 0x03b5, 0x2229, 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320,
0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2,
0x25a0, 0x00a0
};
guint i;
if (c <= 0x7F)
return TRUE;
for (i = 0; i < G_N_ELEMENTS (t); i++) {
if (c == t[i])
return TRUE;
}
return FALSE;
}
static gboolean
pcdn_is_subset (gunichar c,
const gchar *utf8,
gsize ulen)
{
static const gunichar t[] = {
0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea,
0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x00c9, 0x00e6,
0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, 0x00d6, 0x00dc,
0x00f8, 0x00a3, 0x00d8, 0x00d7, 0x0192, 0x00e1, 0x00ed, 0x00f3, 0x00fa,
0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x00ae, 0x00ac, 0x00bd, 0x00bc,
0x00a1, 0x00ab, 0x00bb, 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1,
0x00c2, 0x00c0, 0x00a9, 0x2563, 0x2551, 0x2557, 0x255d, 0x00a2, 0x00a5,
0x2510, 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x00e3, 0x00c3,
0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4, 0x00f0,
0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x0131, 0x00cd, 0x00ce, 0x00cf, 0x2518,
0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580, 0x00d3, 0x00df, 0x00d4,
0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe, 0x00de, 0x00da, 0x00db, 0x00d9,
0x00fd, 0x00dd, 0x00af, 0x00b4, 0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6,
0x00a7, 0x00f7, 0x00b8, 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2,
0x25a0, 0x00a0
};
guint i;
if (c <= 0x7F)
return TRUE;
for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) {
if (c == t[i])
return TRUE;
}
return FALSE;
}
typedef struct {
MMModemCharset cs;
gboolean (*func) (gunichar c,
const gchar *utf8,
gsize ulen);
} SubsetEntry;
const SubsetEntry subset_table[] = {
{ MM_MODEM_CHARSET_GSM, gsm_is_subset },
{ MM_MODEM_CHARSET_IRA, ira_is_subset },
{ MM_MODEM_CHARSET_UCS2, ucs2_is_subset },
{ MM_MODEM_CHARSET_UTF16, utf16_is_subset },
{ MM_MODEM_CHARSET_8859_1, iso88591_is_subset },
{ MM_MODEM_CHARSET_PCCP437, pccp437_is_subset },
{ MM_MODEM_CHARSET_PCDN, pcdn_is_subset },
};
gboolean
mm_charset_can_convert_to (const gchar *utf8,
MMModemCharset charset)
{
const gchar *p;
guint i;
g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, FALSE);
g_return_val_if_fail (utf8 != NULL, FALSE);
if (charset == MM_MODEM_CHARSET_UTF8)
return TRUE;
/* Find the charset in our subset table */
for (i = 0; i < G_N_ELEMENTS (subset_table); i++) {
if (subset_table[i].cs == charset)
break;
}
g_return_val_if_fail (i < G_N_ELEMENTS (subset_table), FALSE);
p = utf8;
while (*p) {
gunichar c;
const char *end;
c = g_utf8_get_char_validated (p, -1);
g_return_val_if_fail (c != (gunichar) -1, 0);
end = g_utf8_find_next_char (p, NULL);
if (end == NULL) {
/* Find the string terminating NULL */
end = p;
while (*++end);
}
if (!subset_table[i].func (c, p, (end - p)))
return FALSE;
p = end;
}
return TRUE;
}
/******************************************************************************/
/* GSM-7 pack/unpack operations */
guint8 *
mm_charset_gsm_unpack (const guint8 *gsm,
guint32 num_septets,
guint8 start_offset, /* in _bits_ */
guint32 *out_unpacked_len)
{
GByteArray *unpacked;
guint i;
unpacked = g_byte_array_sized_new (num_septets + 1);
for (i = 0; i < num_septets; i++) {
guint8 bits_here, bits_in_next, octet, offset, c;
guint32 start_bit;
start_bit = start_offset + (i * 7); /* Overall bit offset of char in buffer */
offset = start_bit % 8; /* Offset to start of char in this byte */
bits_here = offset ? (8 - offset) : 7;
bits_in_next = 7 - bits_here;
/* Grab bits in the current byte */
octet = gsm[start_bit / 8];
c = (octet >> offset) & (0xFF >> (8 - bits_here));
/* Grab any bits that spilled over to next byte */
if (bits_in_next) {
octet = gsm[(start_bit / 8) + 1];
c |= (octet & (0xFF >> (8 - bits_in_next))) << bits_here;
}
g_byte_array_append (unpacked, &c, 1);
}
*out_unpacked_len = unpacked->len;
return g_byte_array_free (unpacked, FALSE);
}
guint8 *
mm_charset_gsm_pack (const guint8 *src,
guint32 src_len,
guint8 start_offset,
guint32 *out_packed_len)
{
guint8 *packed;
guint octet = 0, lshift, plen;
guint i = 0;
g_return_val_if_fail (start_offset < 8, NULL);
plen = (src_len * 7) + start_offset; /* total length in bits */
if (plen % 8)
plen += 8;
plen /= 8; /* now in bytes */
packed = g_malloc0 (plen);
for (i = 0, lshift = start_offset; i < src_len; i++) {
packed[octet] |= (src[i] & 0x7F) << lshift;
if (lshift > 1) {
/* Grab the lost bits and add to next octet */
g_assert (octet + 1 < plen);
packed[octet + 1] = (src[i] & 0x7F) >> (8 - lshift);
}
if (lshift)
octet++;
lshift = lshift ? lshift - 1 : 7;
}
if (out_packed_len)
*out_packed_len = plen;
return packed;
}
/*****************************************************************************/
/* Main conversion functions */
static guint8 *
charset_iconv_from_utf8 (const gchar *utf8,
const CharsetSettings *settings,
gboolean translit,
guint *out_size,
GError **error)
{
g_autoptr(GError) inner_error = NULL;
gsize bytes_written = 0;
g_autofree guint8 *encoded = NULL;
encoded = (guint8 *) g_convert (utf8, -1,
settings->iconv_name, "UTF-8",
NULL, &bytes_written, &inner_error);
if (encoded) {
if (out_size)
*out_size = (guint) bytes_written;
return g_steal_pointer (&encoded);
}
if (!translit) {
g_propagate_error (error, g_steal_pointer (&inner_error));
g_prefix_error (error, "Couldn't convert from UTF-8 to %s: ", settings->gsm_name);
return NULL;
}
encoded = (guint8 *) g_convert_with_fallback (utf8, -1,
settings->iconv_name, "UTF-8", translit_fallback,
NULL, &bytes_written, error);
if (encoded) {
if (out_size)
*out_size = (guint) bytes_written;
return g_steal_pointer (&encoded);
}
g_prefix_error (error, "Couldn't convert from UTF-8 to %s with translit: ", settings->gsm_name);
return NULL;
}
GByteArray *
mm_modem_charset_bytearray_from_utf8 (const gchar *utf8,
MMModemCharset charset,
gboolean translit,
GError **error)
{
const CharsetSettings *settings;
guint8 *encoded = NULL;
guint encoded_size = 0;
settings = lookup_charset_settings (charset);
if (charset == MM_MODEM_CHARSET_UNKNOWN) {
g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
"Cannot convert from UTF-8: unknown target charset");
return NULL;
}
switch (charset) {
case MM_MODEM_CHARSET_GSM:
encoded = charset_utf8_to_unpacked_gsm (utf8, translit, &encoded_size, error);
break;
case MM_MODEM_CHARSET_IRA:
case MM_MODEM_CHARSET_8859_1:
case MM_MODEM_CHARSET_UTF8:
case MM_MODEM_CHARSET_UCS2:
case MM_MODEM_CHARSET_PCCP437:
case MM_MODEM_CHARSET_PCDN:
case MM_MODEM_CHARSET_UTF16:
encoded = charset_iconv_from_utf8 (utf8, settings, translit, &encoded_size, error);
break;
case MM_MODEM_CHARSET_UNKNOWN:
default:
g_assert_not_reached ();
}
return g_byte_array_new_take (encoded, encoded_size);
}
gchar *
mm_modem_charset_str_from_utf8 (const gchar *utf8,
MMModemCharset charset,
gboolean translit,
GError **error)
{
g_autoptr(GByteArray) bytearray = NULL;
if (charset == MM_MODEM_CHARSET_UNKNOWN) {
g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
"Cannot convert from UTF-8: unknown target charset");
return NULL;
}
bytearray = mm_modem_charset_bytearray_from_utf8 (utf8, charset, translit, error);
if (!bytearray)
return NULL;
switch (charset) {
case MM_MODEM_CHARSET_GSM:
/* Note: strings encoded in unpacked GSM-7 can be used as plain
* strings as long as the string doesn't contain character '@', which
* is the one encoded as 0x00. At this point, we perform transliteration
* of the NUL bytes in the GSM-7 bytearray, and we fail the operation
* if one or more replacements were done and transliteration wasn't
* requested */
if (translit_gsm_nul_byte (bytearray) && !translit) {
g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
"Cannot convert to GSM-7 string: transliteration required for embedded '@'");
return NULL;
}
/* fall through */
case MM_MODEM_CHARSET_IRA:
case MM_MODEM_CHARSET_8859_1:
case MM_MODEM_CHARSET_UTF8:
case MM_MODEM_CHARSET_PCCP437:
case MM_MODEM_CHARSET_PCDN:
return (gchar *) g_byte_array_free (g_steal_pointer (&bytearray), FALSE);
case MM_MODEM_CHARSET_UCS2:
case MM_MODEM_CHARSET_UTF16:
return mm_utils_bin2hexstr (bytearray->data, bytearray->len);
default:
case MM_MODEM_CHARSET_UNKNOWN:
g_assert_not_reached ();
}
}
static gchar *
charset_iconv_to_utf8 (const guint8 *data,
guint32 len,
const CharsetSettings *settings,
gboolean translit,
GError **error)
{
g_autoptr(GError) inner_error = NULL;
g_autofree gchar *utf8 = NULL;
utf8 = g_convert ((const gchar *) data, len,
"UTF-8",
settings->iconv_name,
NULL, NULL, &inner_error);
if (utf8)
return g_steal_pointer (&utf8);
if (!translit) {
g_propagate_error (error, g_steal_pointer (&inner_error));
g_prefix_error (error, "Couldn't convert from %s to UTF-8: ", settings->gsm_name);
return NULL;
}
utf8 = g_convert_with_fallback ((const gchar *) data, len,
"UTF-8", settings->iconv_name, translit_fallback,
NULL, NULL, error);
if (utf8)
return g_steal_pointer (&utf8);
g_prefix_error (error, "Couldn't convert from %s to UTF-8 with translit: ", settings->gsm_name);
return NULL;
}
gchar *
mm_modem_charset_bytearray_to_utf8 (GByteArray *bytearray,
MMModemCharset charset,
gboolean translit,
GError **error)
{
const CharsetSettings *settings;
g_autofree gchar *utf8 = NULL;
if (charset == MM_MODEM_CHARSET_UNKNOWN) {
g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
"Cannot convert from UTF-8: unknown target charset");
return NULL;
}
settings = lookup_charset_settings (charset);
switch (charset) {
case MM_MODEM_CHARSET_GSM:
utf8 = (gchar *) charset_gsm_unpacked_to_utf8 (bytearray->data,
bytearray->len,
translit,
error);
break;
case MM_MODEM_CHARSET_IRA:
case MM_MODEM_CHARSET_UTF8:
case MM_MODEM_CHARSET_8859_1:
case MM_MODEM_CHARSET_PCCP437:
case MM_MODEM_CHARSET_PCDN:
case MM_MODEM_CHARSET_UCS2:
case MM_MODEM_CHARSET_UTF16:
utf8 = charset_iconv_to_utf8 (bytearray->data,
bytearray->len,
settings,
translit,
error);
break;
case MM_MODEM_CHARSET_UNKNOWN:
default:
g_assert_not_reached ();
}
if (!utf8) {
g_prefix_error (error, "Invalid conversion from %s to UTF-8: ", settings->gsm_name);
return NULL;
}
if (!g_utf8_validate (utf8, -1, NULL)) {
g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_FAILED,
"Invalid conversion from %s: invalid UTF-8", settings->gsm_name);
return NULL;
}
return g_steal_pointer (&utf8);
}
gchar *
mm_modem_charset_str_to_utf8 (const gchar *str,
gssize len,
MMModemCharset charset,
gboolean translit,
GError **error)
{
g_autoptr(GByteArray) bytearray = NULL;
if (charset == MM_MODEM_CHARSET_UNKNOWN) {
g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
"Cannot convert from UTF-8: unknown target charset");
return NULL;
}
/* Note: if the input string is GSM-7 encoded and it contains the '@'
* character, using -1 to indicate string length won't work properly,
* as '@' is encoded as 0x00. Whenever possible, if using GSM-7,
* give a proper len value or otherwise use the bytearray_to_utf8()
* method instead. */
if (len < 0)
len = strlen (str);
switch (charset) {
case MM_MODEM_CHARSET_GSM:
case MM_MODEM_CHARSET_IRA:
case MM_MODEM_CHARSET_8859_1:
case MM_MODEM_CHARSET_UTF8:
case MM_MODEM_CHARSET_PCCP437:
case MM_MODEM_CHARSET_PCDN:
bytearray = g_byte_array_sized_new (len);
g_byte_array_append (bytearray, (const guint8 *)str, len);
break;
case MM_MODEM_CHARSET_UCS2:
case MM_MODEM_CHARSET_UTF16: {
guint8 *bin = NULL;
gsize bin_len;
bin = (guint8 *) mm_utils_hexstr2bin (str, len, &bin_len, error);
if (!bin)
return NULL;
bytearray = g_byte_array_new_take (bin, bin_len);
break;
}
case MM_MODEM_CHARSET_UNKNOWN:
default:
g_assert_not_reached ();
}
return mm_modem_charset_bytearray_to_utf8 (bytearray, charset, translit, error);
}
/******************************************************************************/
/* Runtime charset support via iconv() */
void
mm_modem_charsets_init (void)
{
/* As test string, something we can convert to/from all the encodings */
static const gchar *default_test_str = "ModemManager";
guint i;
mm_obj_dbg (NULL, "[charsets] detecting platform iconv() support...");
for (i = 0; i < G_N_ELEMENTS (charset_settings); i++) {
g_autofree guint8 *enc = NULL;
guint enc_size;
g_autofree gchar *dec = NULL;
if (!charset_settings[i].iconv_name)
continue;
enc = charset_iconv_from_utf8 (default_test_str,
&charset_settings[i],
FALSE,
&enc_size,
NULL);
if (!enc) {
mm_obj_dbg (NULL, "[charsets] %s: iconv conversion to charset not supported", charset_settings[i].iconv_name);
continue;
}
dec = charset_iconv_to_utf8 (enc,
enc_size,
&charset_settings[i],
FALSE,
NULL);
if (!enc) {
mm_obj_dbg (NULL, "[charsets] %s: iconv conversion from charset not supported", charset_settings[i].iconv_name);
continue;
}
mm_obj_dbg (NULL, "[charsets] %s: iconv conversion to/from charset is supported", charset_settings[i].iconv_name);
}
}
static gchar **
util_split_text_gsm7 (const gchar *text,
gsize text_len,
gpointer log_object)
{
g_autoptr(GPtrArray) chunks = NULL;
const gchar *walker;
const char *end;
const gchar *chunk_start;
glong encoded_chunk_length;
glong total_encoded_chunk_length;
chunks = g_ptr_array_new_with_free_func ((GDestroyNotify)g_free);
walker = text;
chunk_start = text;
encoded_chunk_length = 0;
total_encoded_chunk_length = 0;
while (walker && *walker) {
guint8 symbol[2] = {0, 0};
glong written_bytes = 0;
end = g_utf8_find_next_char (walker, NULL);
if (end == NULL) {
/* Find the string terminating NULL */
end = walker;
while (*++end);
}
written_bytes = utf8_to_gsm_char (walker, (end - walker), symbol);
/* If more than one chunk is needed, these have to be of 140 - 6 = 134
* bytes each, as additional space is needed for the UDH header.
* That means up to 153 input characters can be packed:
* 134 * 8 = 1072; 1072/7=153.14
*/
if ((encoded_chunk_length + written_bytes) > 153) {
g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
chunk_start = walker;
encoded_chunk_length = written_bytes;
} else
encoded_chunk_length += written_bytes;
total_encoded_chunk_length += written_bytes;
walker = g_utf8_next_char (walker);
}
/* No splitting needed? */
if (total_encoded_chunk_length <= 160) {
gchar **out;
out = g_new0 (gchar *, 2);
out[0] = g_strdup (text);
return out;
}
/* Otherwise, we do need the splitted chunks. Add the last one
* with contents plus the last trailing NULL */
g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
g_ptr_array_add (chunks, NULL);
return (gchar **) g_ptr_array_free (g_steal_pointer (&chunks), FALSE);
}
static gchar **
util_split_text_utf16_or_ucs2 (const gchar *text,
gsize text_len,
gpointer log_object)
{
g_autoptr(GPtrArray) chunks = NULL;
const gchar *walker;
const gchar *chunk_start;
glong encoded_chunk_length;
glong total_encoded_chunk_length;
chunks = g_ptr_array_new_with_free_func ((GDestroyNotify)g_free);
walker = text;
chunk_start = text;
encoded_chunk_length = 0;
total_encoded_chunk_length = 0;
while (walker && *walker) {
g_autofree gunichar2 *unichar2 = NULL;
glong unichar2_written = 0;
glong unichar2_written_bytes = 0;
gunichar single;
single = g_utf8_get_char (walker);
unichar2 = g_ucs4_to_utf16 (&single, 1, NULL, &unichar2_written, NULL);
g_assert (unichar2_written > 0);
/* When splitting for UCS-2 encoding, only one single unichar2 will be
* written, because all codepoints represented in UCS2 fit in the BMP.
* When splitting for UTF-16, though, we may end up writing one or two
* unichar2 (without or with surrogate pairs), because UTF-16 covers the
* whole Unicode spectrum. */
unichar2_written_bytes = (unichar2_written * sizeof (gunichar2));
if ((encoded_chunk_length + unichar2_written_bytes) > 134) {
g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
chunk_start = walker;
encoded_chunk_length = unichar2_written_bytes;
} else
encoded_chunk_length += unichar2_written_bytes;
total_encoded_chunk_length += unichar2_written_bytes;
walker = g_utf8_next_char (walker);
}
/* We have split the original string in chunks, where each chunk
* does not require more than 134 bytes when encoded in UTF-16.
* As a special case now, we consider the case that no splitting
* is necessary, i.e. if the total amount of bytes after encoding
* in UTF-16 is less or equal than 140. */
if (total_encoded_chunk_length <= 140) {
gchar **out;
out = g_new0 (gchar *, 2);
out[0] = g_strdup (text);
return out;
}
/* Otherwise, we do need the splitted chunks. Add the last one
* with contents plus the last trailing NULL */
g_ptr_array_add (chunks, g_strndup (chunk_start, walker - chunk_start));
g_ptr_array_add (chunks, NULL);
return (gchar **) g_ptr_array_free (g_steal_pointer (&chunks), FALSE);
}
gchar **
mm_charset_util_split_text (const gchar *text,
MMModemCharset *charset,
gpointer log_object)
{
if (!text)
return NULL;
/* Some info about the rules for splitting.
*
* The User Data can be up to 140 bytes in the SMS part:
* 0) If we only need one chunk, it can be of up to 140 bytes.
* If we need more than one chunk, these have to be of 140 - 6 = 134
* bytes each, as we need place for the UDH header.
* 1) If we're using GSM7 encoding, this gives us up to 160 characters,
* as we can pack 160 characters of 7bits each into 140 bytes.
* 160 * 7 = 140 * 8 = 1120.
* If we only have 134 bytes allowed, that would mean that we can pack
* up to 153 input characters:
* 134 * 8 = 1072; 1072/7=153.14
* 2) If we're using UCS2 encoding, we can pack up to 70 characters in
* 140 bytes (each with 2 bytes), or up to 67 characters in 134 bytes.
* 3) If we're using UTF-16 encoding (instead of UCS2), the amount of
* characters we can pack is variable, depends on how the characters
* are encoded in UTF-16 (e.g. if there are characters out of the BMP
* we'll need surrogate pairs and a single character will need 4 bytes
* instead of 2).
*
* This method does the split of the input string into N strings, so that
* each of the strings can be placed in a SMS part.
*/
/* Check if we can do GSM encoding */
if (mm_charset_can_convert_to (text, MM_MODEM_CHARSET_GSM)) {
*charset = MM_MODEM_CHARSET_GSM;
return util_split_text_gsm7 (text, strlen (text), log_object);
}
/* Otherwise fallback to report UCS-2 and split supporting UTF-16 */
*charset = MM_MODEM_CHARSET_UTF16;
return util_split_text_utf16_or_ucs2 (text, strlen (text), log_object);
}