blob: dc02e8e2848777301244eae001dc0f731fe9f9e8 [file] [log] [blame]
/*
* symbols.c -- Implements functions handling symbols conversion,
* including punctuation, for Speech Dispatcher
*
* Copyright (C) 2001,2002,2003, 2007, 2017 Brailcom, o.p.s
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
/* Based off NVDA's symbols replacement code (GPLv2+):
* https://github.com/nvaccess/nvda/blob/master/source/characterProcessing.py
*
* OVERVIEW
*
* This file contains all of the logic related to reading, processing and
* using NVDA symbols replacement files. It should be 100% compatible with
* NVDA's equivalent.
*
* The files are read by the speech_symbols_load() family of functions.
* Each symbol is loaded into a SpeechSymbol structure, and the symbols of
* a file (both simple and complex) are loaded into a SpeechSymbols (note the
* plural form) structure.
*
* The loaded symbols are compiled into GLib PCRE regular expressions
* (originally a Python one, but they are compatible enough) and converted to
* a fully usable form into a list of SpeechSymbolProcessor. These processors
* are then usable to pre-process an input text with
* speech_symbols_processor_process_text().
*
* The loading steps are automatically handled when calling
* speech_symbols_processor_new(). To avoid re-processing files more than
* once even if they are used by different SpeechSymbolProcessor, the loaded
* files are cached as SpeechSymbols into the G_symbols_dicts global variable.
* Similarly, lists of SpeechSymbolProcessor are cached into the
* G_processors global variable.
*
* The caches are automatically loaded when looking up an entry with either
* get_locale_speech_symbols() (for SpeechSymbols lists) or
* get_locale_speech_symbols_processor() (for SpeechSymbolProcessor lists).
* This loading is aware of locale strings syntax and will fallback on the
* language code alone if the language-country combo isn't found.
*
* WARNING: this module is NOT thread-safe. Most notably, the caches are not
* thread-safe, so the public API insert_symbols() shouldn't be called from
* several threads at once. This should not be an issue, as it is supposed to
* be called from the speak thread only.
*
* This file is mostly a 1:1 translation of NVDA's python code doing the same
* thing, with slight simplifications or adaptations for C, and removal of
* unused features like loading user-specific symbols files.
*/
/*
* TODO:
* - support NUL byte representation. However, they aren't properly handled
* in the rest of SPD, so it's not so important.
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <ctype.h>
#include "symbols.h"
/* This denotes the position of some SSML tags.
 * One entry describes one block of consecutive tags that
 * escape_ssml_text() moved out of the text. */
struct tags {
gsize pos; /* Its position in the text (byte offset in the tag-free text) */
gssize shift; /* How much its position is shifted by the current replacements */
gint deferrable; /* Whether it is fine to defer the tag (e.g. a mark or comment) */
gchar *tags; /* The content of the tags (copy of the raw tag text) */
};
/* Speech symbol preserve modes */
typedef enum {
/* Not specified; speech_symbol_new() starts with this value */
SYMPRES_INVALID = -1,
SYMPRES_NEVER = 0, /* Never preserve the symbol */
SYMPRES_ALWAYS = 1, /* Always preserve the symbol */
SYMPRES_NOREP = 2, /* Only preserve the symbol if it is not being
replaced; i.e. the user has set symbol level
lower than the level of this symbol */
SYMPRES_LITERAL = 3 /* Replace literally, without any spacing
addition */
} SymPresMode;
/* Represents a single symbol, and how it should be handled. */
typedef struct {
/* Symbol identifier; also used as the key of the hash table holding the
 * symbol, which is why speech_symbol_free() does not free it. */
char *identifier;
/* Regular expression of a complex symbol; left NULL for simple symbols */
char *pattern;
/* Replacement text, or NULL when none was specified */
char *replacement;
/* Symbol level; SYMLVL_INVALID until specified */
SymLvl level;
/* Preserve mode; SYMPRES_INVALID until specified */
SymPresMode preserve;
/* Display name, taken from the trailing "#" comment field of the line */
char *display_name;
} SpeechSymbol;
/* Represents all symbols in a symbols file.
 * This is roughly an internal representation of the symbols files. */
typedef struct {
/* Name of the file the symbols were loaded from */
gchar *source;
/* Ordered list of [identifier(string), pattern(string)]; each element is a
 * NULL-terminated string vector as returned by g_strsplit() */
GSList *complex_symbols;
/* table of identifier(string):symbol(SpeechSymbol) */
GHashTable *symbols;
} SpeechSymbols;
/* Describes a name->value translation for a field that should be loaded
 * as an integer. Arrays of these are searched by
 * speech_symbols_load_int_field(). */
typedef struct {
const char *name; /* Textual form as found in the symbols file */
int value; /* Integer value the name translates to */
} IntFieldDesc;
/* Represents a loaded and cached set of symbols in a usable form */
typedef struct {
/* Name of the symbols file this processor was built for */
gchar *source;
struct tags *tags; /* tags attached to the text */
gint ntags; /* number of elements in tags array */
GRegex *regex; /* compiled regular expression for parsing input */
GRegex **multi_chars_regex; /* array of compiled regular expression for simple multi-char symbols */
gint nmulti_chars_regex; /* number of elements in multi_chars_regex */
/* Table of identifier(string):symbol(SpeechSymbol).
 * Indexes are pointers to symbol->identifier. */
GHashTable *symbols;
/* list of SpeechSymbol (weak pointers to entries in @c symbols) */
GSList *complex_list;
/* Level requested by user */
SymLvl level;
/* Level to be supported */
SymLvl support_level;
} SpeechSymbolProcessor;
/* Map of locale code to arbitrary data. */
typedef GHashTable LocaleMap;
/* Callback used by locale_map_fetch() to build the data for @c locale and
 * @c file when no entry exists yet; returns NULL when nothing could be created. */
typedef gpointer (*LocaleMapCreateDataFunc) (const gchar *locale, const gchar *file);
/* globals for caching */
/* Map of SpeechSymbols, indexed by their locale and file.
 * Created lazily by get_locale_speech_symbols(). */
static LocaleMap *G_symbols_dicts = NULL;
/* Map of SpeechSymbolProcessor lists, indexed by their locale */
static LocaleMap *G_processors = NULL;
/* List of files to load; filled by symbols_preprocessing_add_file() */
static GSList *symbols_files;
SymLvl str2SymLvl(const char *str)
{
SymLvl punct;
if (!strcmp(str, "no"))
punct = SYMLVL_NO;
else if (!strcmp(str, "none"))
punct = SYMLVL_NONE;
else if (!strcmp(str, "all"))
punct = SYMLVL_ALL;
else if (!strcmp(str, "char"))
punct = SYMLVL_CHAR;
else
punct = SYMLVL_INVALID;
return punct;
}
/*----------------------------- Locale data map -----------------------------*/
/* Creates an empty locale map.
 * Keys are strings owned by the map (released with g_free()); values are
 * released with @p value_destroy. */
static LocaleMap *locale_map_new(GDestroyNotify value_destroy)
{
	LocaleMap *map;

	map = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, value_destroy);

	return map;
}
/* Looks up the data stored for @p locale, or for the "locale file" pair when
 * @p file is not NULL. Returns NULL when there is no such entry. */
static gpointer locale_map_lookup(LocaleMap *map, const gchar *locale, const gchar *file)
{
	gchar *composite;
	gpointer found;

	if (!file)
		return g_hash_table_lookup(map, locale);

	/* Per-file entries are indexed by "<locale> <file>" */
	composite = g_strdup_printf("%s %s", locale, file);
	found = g_hash_table_lookup(map, composite);
	g_free(composite);

	return found;
}
/* Fetches or creates a locale item for the map.
 * If @c locale contains a country and data for the whole locale is not found,
 * tries to load the data for the language alone.
 *
 * @p file may be NULL; when it is not, entries are indexed by the
 * "locale file" pair, exactly as locale_map_lookup() expects, so that a
 * created item is actually found again by later lookups and does not replace
 * the item of another file of the same locale.
 *
 * Returns the cached or newly created item (owned by the map), or NULL if
 * @p create failed for both the full locale and the language alone. */
static gpointer locale_map_fetch(LocaleMap *map, const gchar *locale, const gchar *file,
				 LocaleMapCreateDataFunc create)
{
	guint i;

	for (i = 0; i < 2; i++) {
		gpointer value;
		gchar *l;

		if (i == 0) {
			/* First attempt: the locale as given */
			l = g_strdup(locale);
		} else {
			/* Second attempt: the language part alone */
			gchar **parts = g_strsplit_set(locale, "_-", 2);

			if (!parts[0] || !parts[1]) {
				/* no delimiters, no need to try again */
				g_strfreev(parts);
				break;
			}
			l = g_strdup(parts[0]);
			g_strfreev(parts);
		}

		value = locale_map_lookup(map, l, file);
		if (!value) {
			/* try to create */
			value = create(l, file);
			if (value) {
				/* The key must match the one built by
				 * locale_map_lookup(); the map takes ownership of it. */
				gchar *key = file ? g_strdup_printf("%s %s", l, file)
						  : g_strdup(l);

				g_hash_table_insert(map, key, value);
			}
		}
		g_free(l);

		if (value)
			return value;
	}

	return NULL;
}
/*--------------------- Escaping xml tags in ssml text ----------------------*/
/*
* We need not ever speak the SSML syntax, so we need to skip the tags.
*
* For lookbehind and lookahead rules to be able to run, we have to really
* remove the tags from the text, but we want to remember where they were.
*
* We thus build an array of the positions of the tags, that the replacement
* function will update, so we know where to put back the tags.
*
* Alongside, we also have to untranslate/translate the xml entities for tag characters.
*/
/* Move tags off from the text.
 *
 * Returns a newly allocated copy of @p text with all "<...>" tags removed and
 * the &quot; &apos; &lt; &gt; &amp; entities decoded to their character.
 *
 * Consecutive tags without any text in between are merged into one block.
 * For each block, an entry is stored in a malloc()ed array returned through
 * @p tags_ret (number of entries in @p ntags_ret): the offset in the returned
 * text where the block has to be put back, a copy of the raw tag text, and
 * whether all tags of the block are deferrable ("mark", "/mark", "mark/" or
 * "!--"). The shift field of the entries is not set by this function.
 *
 * Quoted attribute values ('...' or "...") are skipped, so a '>' inside them
 * does not end the tag. */
static gchar *escape_ssml_text(const gchar *text, struct tags **tags_ret, gint *ntags_ret)
{
const gchar *cur, *curtag = NULL;
struct tags *tags;
GString *str;
gchar name[7]; /* Current tag name, only need to recognize against "mark", "/mark", "!--" for now */
gsize namepos = 0;
int filling_tag; /* Whether we are stacking tags, or text */
int in_tag; /* Whether we are within a tag */
int in_tag_name; /* Whether we are within the name part of a tag */
int in_apos; /* Whether we are within a '' string in a tag */
int in_quote; /* Whether we are within a "" string in a tag */
gint ntags;
/* First count how many blocks of tags we will have */
filling_tag = 0;
in_tag = 0;
in_tag_name = 0;
in_apos = 0;
in_quote = 0;
ntags = 0;
for (cur = text; *cur; cur++) {
guchar c = *cur;
if (!in_tag) {
if (c == '<') {
in_tag = 1;
/* Only the first tag of a run of tags opens a new block */
if (!filling_tag) {
ntags++;
filling_tag = 1;
}
} else {
/* Some text, switch to text */
filling_tag = 0;
}
} else {
if (in_apos) {
if (c == '\'')
in_apos = 0;
} else if (in_quote) {
if (c == '"')
in_quote = 0;
} else if (c == '\'') {
in_apos = 1;
} else if (c == '"') {
in_quote = 1;
} else if (c == '>') {
in_tag = 0;
}
}
}
/* We can now allocate the array of blocks of tags and restart over, this time filling text and tags */
/* NOTE(review): the malloc() result is not checked before being written to */
tags = malloc(ntags * sizeof(*tags));
filling_tag = 0;
in_tag = 0;
in_apos = 0;
in_quote = 0;
ntags = 0;
str = g_string_sized_new(strlen(text));
for (cur = text; *cur; cur++) {
guchar c = *cur;
if (!in_tag) {
if (c == '<') {
in_tag = 1;
in_tag_name = 1;
namepos = 0;
if (!filling_tag) {
/* Note the tags position in the text */
tags[ntags].pos = str->len;
/* A priori only deferrable tags */
tags[ntags].deferrable = 1;
curtag = cur;
filling_tag = 1;
}
} else {
if (filling_tag) {
/* Some text, dump the tags and switch to text */
tags[ntags].tags = g_strndup(curtag, cur - curtag);
ntags++;
filling_tag = 0;
}
if (c == '&') {
/* Unescape ssml character sequences */
/* cur is advanced to the last character of the entity; the loop
 * increment then steps past it */
if (!strncmp(cur, "&quot;", 6)) {
cur += 5;
g_string_append_c(str, '"');
} else if (!strncmp(cur, "&apos;", 6)) {
cur += 5;
g_string_append_c(str, '\'');
} else if (!strncmp(cur, "&lt;", 4)) {
cur += 3;
g_string_append_c(str, '<');
} else if (!strncmp(cur, "&gt;", 4)) {
cur += 3;
g_string_append_c(str, '>');
} else if (!strncmp(cur, "&amp;", 5)) {
cur += 4;
g_string_append_c(str, '&');
} else
/* Unknown entity, keep the '&' as such */
g_string_append_c(str, c);
} else {
/* Pure text, append as such */
g_string_append_c(str, c);
}
}
} else {
if (in_apos) {
if (c == '\'')
in_apos = 0;
} else if (in_quote) {
if (c == '"')
in_quote = 0;
} else if (c == '\'') {
in_apos = 1;
} else if (c == '"') {
in_quote = 1;
} else {
if (in_tag_name) {
if (c == '>' || c == ' ' || c == '\t' || c == '\r' || c == '\n') {
/* End of the tag name, see whether it is a deferrable one */
in_tag_name = 0;
name[namepos] = '\0';
if (strcmp(name, "mark")
&& strcmp(name, "/mark")
&& strcmp(name, "mark/")
&& strcmp(name, "!--")) {
/* This is a non-deferrable tag */
tags[ntags].deferrable = 0;
}
} else {
/* Longer names are truncated, which is enough to tell
 * them apart from the names compared above */
if (namepos < sizeof(name) - 1) {
name[namepos++] = c;
}
}
}
if (c == '>')
in_tag = 0;
}
}
}
/* Trailing tags content */
if (filling_tag) {
tags[ntags].tags = g_strndup(curtag, cur - curtag);
ntags++;
}
*tags_ret = tags;
*ntags_ret = ntags;
/* Hand over the character data; only the GString wrapper is released */
return g_string_free(str, FALSE);
}
/* Put back tags into the text.
 *
 * Builds a newly allocated string from @p text in which the characters
 * " ' < > & are re-escaped as XML entities, and each block of @p tags is
 * re-inserted in front of the character at its recorded position. Blocks that
 * were not reached while walking the text are appended at the end.
 *
 * This consumes @p tags: the content of each block and the array itself are
 * released, so the caller must not use them afterwards. */
static gchar *unescape_ssml_text(const gchar *text, struct tags *tags, gint ntags)
{
	GString *str;
	const gchar *cur;
	struct tags *curtags = tags;

	str = g_string_sized_new(strlen(text));

	for (cur = text; *cur; cur++) {
		while (ntags && (gsize) (cur - text) == curtags->pos) {
			/* We reached the position of a block of tags, put them back */
			g_string_append(str, curtags->tags);
			/* The block content is not needed any more */
			g_free(curtags->tags);
			curtags++;
			ntags--;
		}

		/* Re-escape ssml character sequences */
		switch (*cur) {
		case '"':
			g_string_append(str, "&quot;");
			break;
		case '\'':
			g_string_append(str, "&apos;");
			break;
		case '<':
			g_string_append(str, "&lt;");
			break;
		case '>':
			g_string_append(str, "&gt;");
			break;
		case '&':
			g_string_append(str, "&amp;");
			break;
		default:
			g_string_append_c(str, *cur);
			break;
		}
	}

	while (ntags) {
		/* Trailing tags */
		g_string_append(str, curtags->tags);
		g_free(curtags->tags);
		curtags++;
		ntags--;
	}

	/* The array was allocated with malloc() by escape_ssml_text() */
	free(tags);

	return g_string_free(str, FALSE);
}
/*----------------- Speech symbol representation and loading ----------------*/
static SpeechSymbol *speech_symbol_new(void)
{
SpeechSymbol *sym = g_slice_alloc(sizeof *sym);
sym->identifier = NULL;
sym->pattern = NULL;
sym->replacement = NULL;
sym->level = SYMLVL_INVALID;
sym->preserve = SYMPRES_INVALID;
sym->display_name = NULL;
return sym;
}
/* Releases a symbol allocated by speech_symbol_new() together with the
 * strings it owns. */
static void speech_symbol_free(SpeechSymbol *sym)
{
	/* The identifier is deliberately left alone: it is the hash table key
	 * and is released by the table itself. */
	g_free(sym->display_name);
	g_free(sym->replacement);
	g_free(sym->pattern);

	g_slice_free1(sizeof *sym, sym);
}
/* Tells whether a line carries no data: returns 1 when it starts with '#'
 * (comment) and non-zero when it only contains ASCII whitespace, 0 otherwise. */
static int skip_line(const char *line)
{
	const char *p = line;

	if (p[0] == '#')
		return 1;

	/* Blank line if nothing remains after the leading whitespace */
	for (; g_ascii_isspace(*p); p++)
		;

	return *p == 0;
}
/* Cuts a single line buffer in place at its first \r or \n, if any */
static void strip_newline(char *line)
{
	/* strcspn() gives the length of the part free of line terminators, which
	 * is the index of the first terminator or of the final NUL byte */
	line[strcspn(line, "\r\n")] = 0;
}
/* Parses an "identifier\tpattern" line and records it in the complex symbols
 * of @p ss. Entries are prepended, so the list ends up in reverse file order.
 * Returns 0 on success, -1 if the line does not hold exactly two fields. */
static int speech_symbols_load_complex_symbol(SpeechSymbols *ss, const char *line)
{
	char **fields = g_strsplit(line, "\t", 2);

	if (g_strv_length(fields) == 2) {
		/* The list takes ownership of the string vector */
		ss->complex_symbols = g_slist_prepend(ss->complex_symbols, fields);
		return 0;
	}

	g_strfreev(fields);
	return -1;
}
/* Searches the @p map_len entries of @p map for one named exactly @p name.
 * On success stores its value into the integer pointed to by @p value and
 * returns 0; returns -1, leaving @p value untouched, when there is none. */
static int speech_symbols_load_int_field(IntFieldDesc *map, guint map_len,
					 const char *name, int *value)
{
	const IntFieldDesc *entry = map;
	const IntFieldDesc *const end = map + map_len;

	while (entry != end) {
		if (!strcmp(entry->name, name)) {
			*value = entry->value;
			return 0;
		}
		entry++;
	}

	return -1;
}
/* Loads a symbol line into symbols
 * syntax is:
 * identifier "\t" replacement [ "\t" level [ "\t" preserve ] ] [ "\t#" comment ]
 *
 * A replacement of "-" means "not specified". The identifier may start with
 * a backslash escape (\t \n \r \f \v \# \\); \0 is rejected as NUL bytes are
 * not supported yet.
 *
 * If the identifier was already defined in @p ss, the new definition replaces
 * the previous one.
 *
 * Returns 0 on success, -1 if the line is invalid (nothing is stored then). */
static int speech_symbols_load_symbol(SpeechSymbols *ss, const char *line)
{
	char **parts = g_strsplit(line, "\t", -1);
	guint len = g_strv_length(parts);
	char *display_name = NULL;
	char *identifier = NULL;
	char *replacement = NULL;
	int level = SYMLVL_INVALID;
	int pres_mode = SYMPRES_INVALID;
	SpeechSymbol *sym;

	/* last field, if commented: display name */
	if (len > 0 && parts[len - 1][0] == '#') {
		/* Regardless of how many fields there are,
		 * if the last field is a comment, it is the display name. */
		const char *p;

		/* Steal the string from the vector, so that it is not freed
		 * along with the other parts */
		display_name = parts[len - 1];
		parts[--len] = NULL;

		/* Drop the '#' and the whitespace following it */
		p = display_name + 1;
		while (g_ascii_isspace(*p))
			p++;
		memmove(display_name, p, strlen(p) + 1);
	}

	/* 4th field (optional): preserve */
	if (len > 3) {
		IntFieldDesc map[] = {
			{ "-", SYMPRES_NEVER },
			{ "never", SYMPRES_NEVER },
			{ "always", SYMPRES_ALWAYS },
			{ "norep", SYMPRES_NOREP },
			{ "literal", SYMPRES_LITERAL },
		};

		if (speech_symbols_load_int_field(map, G_N_ELEMENTS(map),
						  parts[3], &pres_mode) < 0)
			goto err;
	}

	/* 3rd field (optional): level */
	if (len > 2) {
		IntFieldDesc map[] = {
			{ "-", SYMLVL_NONE },
			{ "none", SYMLVL_NONE },
			{ "some", SYMLVL_SOME },
			{ "most", SYMLVL_MOST },
			{ "all", SYMLVL_ALL },
			{ "char", SYMLVL_CHAR },
		};

		if (speech_symbols_load_int_field(map, G_N_ELEMENTS(map),
						  parts[2], &level) < 0)
			goto err;
	}

	/* missing required fields */
	if (len < 2 || !parts[0] || !parts[0][0])
		goto err;

	/* 2nd field: replacement */
	if (strcmp(parts[1], "-") == 0)
		replacement = NULL;
	else
		replacement = g_strdup(parts[1]);

	/* 1st field: identifier */
	if (parts[0][0] == '\\' && parts[0][1]) {
		/* Drop the backslash and translate the escaped character */
		identifier = g_strdup(parts[0] + 1);
		switch (identifier[0]) {
		case '0':
			/* FIXME: support this */
			MSG2(1, "symbols", "Loading NUL byte entry is not yet supported");
			goto err;
		case 't': identifier[0] = '\t'; break;
		case 'n': identifier[0] = '\n'; break;
		case 'r': identifier[0] = '\r'; break;
		case 'f': identifier[0] = '\f'; break;
		case 'v': identifier[0] = '\v'; break;
		case '#':
		case '\\':
			/* nothing to do */
			break;
		default:
			/* Unknown escape: the character is kept as such */
			break;
		}
	} else
		identifier = g_strdup(parts[0]);

	sym = speech_symbol_new();
	sym->identifier = identifier;
	sym->replacement = replacement;
	sym->level = level;
	sym->preserve = pres_mode;
	sym->display_name = display_name;

	/* Use replace rather than insert: when the identifier is already in the
	 * table, insert would free the key passed here and keep the old one,
	 * leaving sym->identifier dangling. Replace frees the old key instead,
	 * so the stored key always is sym->identifier. */
	g_hash_table_replace(ss->symbols, sym->identifier, sym);

	g_strfreev(parts);

	return 0;

err:
	g_free(display_name);
	g_free(identifier);
	g_free(replacement);
	g_strfreev(parts);

	return -1;
}
/* Reads the symbols file @p filename into @p ss.
 *
 * The file is made of "complexSymbols:" and "symbols:" sections; the former
 * is only recognized when @p allow_complex is set. Blank and '#' lines are
 * ignored, and lines that cannot be parsed are reported and skipped.
 *
 * Returns 0 once the file was read, -1 if it could not be opened. */
static int speech_symbols_load(SpeechSymbols *ss, const char *filename, gboolean allow_complex)
{
	FILE *stream;
	char *buf = NULL;
	size_t cap = 0;
	unsigned char bom[3];
	gboolean has_bom;
	/* parser of the lines of the section being read, NULL before any section */
	int (*parse_line) (SpeechSymbols *, const char *) = NULL;

	stream = fopen(filename, "r");
	if (stream == NULL) {
		/* A missing file is the common case, only shout on odd errors */
		int level = (errno != ENOENT) ? 1 : 5;

		MSG2(level, "symbols", "Failed to open file '%s': %s", filename, g_strerror(errno));
		return -1;
	}

	/* skip UTF-8 BOM if present, otherwise start over from the beginning */
	has_bom = fread(bom, sizeof *bom, sizeof bom, stream) == sizeof bom &&
		  bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF;
	if (!has_bom)
		fseek(stream, 0, SEEK_SET);

	while (getline(&buf, &cap, stream) >= 0) {
		if (skip_line(buf))
			continue;

		strip_newline(buf);

		/* Section headers switch the parser for the following lines */
		if (allow_complex && strcmp(buf, "complexSymbols:") == 0) {
			parse_line = speech_symbols_load_complex_symbol;
			continue;
		}
		if (strcmp(buf, "symbols:") == 0) {
			parse_line = speech_symbols_load_symbol;
			continue;
		}

		/* Data line: invalid outside of a section or if parsing fails */
		if (parse_line == NULL || parse_line(ss, buf) < 0) {
			MSG2(1, "symbols", "Invalid line in file %s: %s",
			     filename, buf);
		}
	}

	free(buf);
	fclose(stream);

	MSG2(1, "symbols", "Loaded file '%s'", filename);

	return 0;
}
/* Releases a SpeechSymbols and everything it owns: the symbols table, the
 * complex symbols string vectors and the source name. */
static void speech_symbols_free(SpeechSymbols *ss)
{
	GDestroyNotify free_strv = (GDestroyNotify) g_strfreev;

	g_slist_free_full(ss->complex_symbols, free_strv);
	g_hash_table_destroy(ss->symbols);
	g_free(ss->source);

	g_free(ss);
}
/* Loads a symbols file for @p locale.
 *
 * The file is searched, in this order, in:
 * - <user_conf_dir>/locale/<locale>/<file>
 * - <user_conf_dir>/locale/<file>
 * - LOCALE_DATA/<locale>/<file>
 * and the first one that can be opened is used.
 *
 * Returns a SpeechSymbols*, or NULL on error. */
static gpointer speech_symbols_new(const gchar *locale, const gchar *file)
{
	SpeechSymbols *ss = g_malloc(sizeof *ss);
	gchar *path;
	int ret;

	ss->complex_symbols = NULL;
	ss->source = NULL;
	ss->symbols = g_hash_table_new_full(g_str_hash, g_str_equal,
					    g_free,
					    (GDestroyNotify) speech_symbol_free);

	path = g_build_filename(SpeechdOptions.user_conf_dir, "locale", locale, file, NULL);
	MSG2(5, "symbols", "Trying to load %s for '%s' from '%s/locale'", file, locale, SpeechdOptions.user_conf_dir);
	ret = speech_symbols_load(ss, path, TRUE);

	if (ret < 0) {
		/* Release the previous candidate before building the next one */
		g_free(path);
		path = g_build_filename(SpeechdOptions.user_conf_dir, "locale", file, NULL);
		MSG2(5, "symbols", "Trying to load %s from '%s/locale'", file, SpeechdOptions.user_conf_dir);
		ret = speech_symbols_load(ss, path, TRUE);
	}

	if (ret < 0) {
		g_free(path);
		path = g_build_filename(LOCALE_DATA, locale, file, NULL);
		MSG2(5, "symbols", "Trying to load %s for '%s' from '%s'", file, locale, path);
		ret = speech_symbols_load(ss, path, TRUE);
	}

	if (ret >= 0) {
		MSG2(5, "symbols", "Successful");
		/* The elements are added to the start of the list in
		 * speech_symbols_load_complex_symbol() for better speed (as adding to
		 * the end requires walking the whole list), but we want them in the
		 * order they are in the file, so reverse the list. */
		ss->complex_symbols = g_slist_reverse(ss->complex_symbols);
		ss->source = g_strdup(file);
	} else {
		/* Nothing loaded in the end */
		MSG2(5, "symbols", "Failed");
		speech_symbols_free(ss);
		ss = NULL;
	}

	g_free(path);

	return ss;
}
static SpeechSymbols *get_locale_speech_symbols(const gchar *locale, const gchar *file)
{
if (!G_symbols_dicts) {
G_symbols_dicts = locale_map_new((GDestroyNotify) speech_symbols_free);
}
return locale_map_fetch(G_symbols_dicts, locale, file, speech_symbols_new);
}
void symbols_preprocessing_add_file(const char *name)
{
MSG2(5, "symbols", "Will load symbol file %s", name);
symbols_files = g_slist_append(symbols_files, g_strdup(name));
}
/*------------------ Speech symbol compilation & processing -----------------*/
/* sort function sorting strings by length, longest first.
 * @p a and @p b are NUL-terminated strings. Returns a negative value if @p a
 * is longer than @p b, a positive value if it is shorter, 0 if the lengths
 * are equal. */
static gint list_sort_string_longest_first(gconstpointer a, gconstpointer b)
{
	size_t len_a = strlen(a);
	size_t len_b = strlen(b);

	/* Compare explicitly instead of subtracting: the difference of two
	 * unsigned lengths does not necessarily fit in a gint. */
	if (len_b > len_a)
		return 1;
	if (len_b < len_a)
		return -1;
	return 0;
}
/* Releases a processor: its compiled regular expressions, its symbols table,
 * the complex symbols list (not its elements, which belong to the table) and
 * its source name. */
static void speech_symbols_processor_free(SpeechSymbolProcessor *ssp)
{
	gint idx = 0;

	if (ssp->regex != NULL)
		g_regex_unref(ssp->regex);

	while (idx < ssp->nmulti_chars_regex) {
		g_regex_unref(ssp->multi_chars_regex[idx]);
		idx++;
	}
	g_free(ssp->multi_chars_regex);

	g_slist_free(ssp->complex_list);
	if (ssp->symbols != NULL)
		g_hash_table_unref(ssp->symbols);

	g_free(ssp->source);
	g_free(ssp);
}
/* Releases a list of SpeechSymbolProcessor: every processor of the list, and
 * the list nodes themselves. @p sspl may be NULL (empty list). */
static void speech_symbols_processor_list_free(GSList *sspl)
{
	g_slist_free_full(sspl, (GDestroyNotify) speech_symbols_processor_free);
}
/* Loads and compiles speech symbols conversions for @p locale.
 *
 * @p syms are the symbols loaded for the locale, or NULL to use the "base"
 * symbols of @p file only. The "base" symbols are always added as a second
 * source when they can be loaded. @p locale itself is only used in messages.
 *
 * The symbols of all sources are merged (earlier sources take priority,
 * unspecified fields are inherited from later sources), then compiled into:
 * - ssp->regex, matching trailing spaces, repeated characters, complex
 *   symbols and single-character simple symbols (plus the multi-character
 *   simple symbols when there are complex symbols);
 * - ssp->multi_chars_regex, matching the multi-character simple symbols by
 *   chunks of at most 1000, only when there is no complex symbol.
 *
 * The tags, ntags, level and support_level fields of the result are not
 * initialized here.
 *
 * Returns a SpeechSymbolProcessor*, or NULL on error */
static SpeechSymbolProcessor *speech_symbols_processor_new(const char *locale, SpeechSymbols *syms, const char *file)
{
SpeechSymbolProcessor *ssp = NULL;
SpeechSymbols *ssbase;
GHashTableIter iter;
gpointer key, value;
GString *characters;
GSList *multi_chars_list = NULL;
gchar *escaped;
GString *escaped_multi;
GString *pattern;
GError *error = NULL;
GSList *sources = NULL;
GSList *node;
/* These characters are special within a regexp character set, so they are
 * noted aside and appended with an explicit backslash later on */
int has_dash = 0;
int has_rbracket = 0;
int has_circum = 0;
if (syms)
sources = g_slist_append(sources, syms);
/* Always use the base. */
ssbase = get_locale_speech_symbols("base", file);
if (ssbase)
sources = g_slist_append(sources, ssbase);
ssp = g_malloc(sizeof *ssp);
ssp->multi_chars_regex = NULL;
ssp->nmulti_chars_regex = 0;
ssp->source = g_strdup(file);
/* The computed symbol information from all sources. */
ssp->symbols = g_hash_table_new_full(g_str_hash, g_str_equal,
g_free,
(GDestroyNotify) speech_symbol_free);
/* An indexable list of complex symbols for use in building/executing the regexp. */
ssp->complex_list = NULL;
/* Add all complex symbols first, as they take priority. */
for (node = sources; node; node = node->next) {
SpeechSymbols *syms = node->data;
GSList *node2;
for (node2 = syms->complex_symbols; node2; node2 = node2->next) {
SpeechSymbol *sym;
/* [identifier, pattern] string vector */
gchar **key_val = node2->data;
if (g_hash_table_contains(ssp->symbols, key_val[0])) {
/* Already defined */
continue;
}
sym = speech_symbol_new();
sym->identifier = g_strdup(key_val[0]);
sym->pattern = g_strdup(key_val[1]);
g_hash_table_insert(ssp->symbols, sym->identifier, sym);
ssp->complex_list = g_slist_prepend(ssp->complex_list, sym);
}
}
/* Elements are added at the start for performance, but we want them in the original order */
ssp->complex_list = g_slist_reverse(ssp->complex_list);
/* Supplement the data for complex symbols and add all simple symbols. */
characters = g_string_new(NULL);
for (node = sources; node; node = node->next) {
SpeechSymbols *syms = node->data;
g_hash_table_iter_init(&iter, syms->symbols);
while (g_hash_table_iter_next(&iter, &key, &value)) {
const SpeechSymbol *source_sym = value;
SpeechSymbol *sym;
sym = g_hash_table_lookup(ssp->symbols, key);
/* NOTE(review): symbols only defined in the base source are not added
 * here, the base merely completes symbols which are already known. When
 * the base is the only source, no simple symbol is thus created at all;
 * confirm this is intended. */
if (!sym && syms != ssbase) {
/* This is a new simple symbol.
 * (All complex symbols have already been added.) */
sym = speech_symbol_new();
sym->identifier = g_strdup(key);
g_hash_table_insert(ssp->symbols, sym->identifier, sym);
if (strlen(sym->identifier) == 1) {
switch (sym->identifier[0]) {
case '-':
has_dash = 1;
break;
case ']':
has_rbracket = 1;
break;
case '^':
has_circum = 1;
break;
default:
g_string_append_c(characters, sym->identifier[0]);
}
} else {
/* Weak pointer: the string belongs to the symbols table */
multi_chars_list = g_slist_prepend(multi_chars_list, sym->identifier);
}
}
if (sym) {
/* If fields weren't explicitly specified, inherit the value from later sources. */
if (sym->replacement == NULL)
sym->replacement = g_strdup(source_sym->replacement);
if (sym->level == SYMLVL_INVALID)
sym->level = source_sym->level;
if (sym->preserve == SYMPRES_INVALID)
sym->preserve = source_sym->preserve;
if (sym->display_name == NULL)
sym->display_name = g_strdup(source_sym->display_name);
}
}
}
/* Set defaults for any fields not explicitly set. */
g_hash_table_iter_init(&iter, ssp->symbols);
while (g_hash_table_iter_next(&iter, &key, &value)) {
SpeechSymbol *sym = value;
if (!sym->replacement) {
/* Symbols without a replacement specified are useless. */
MSG2(2, "symbols", "Replacement not defined "
"in locale %s for symbol: %s",
locale, sym->identifier);
/* Drop the weak pointer before the table frees the symbol */
ssp->complex_list = g_slist_remove(ssp->complex_list, sym);
g_hash_table_iter_remove(&iter);
continue;
}
if (sym->level == SYMLVL_INVALID)
sym->level = SYMLVL_ALL;
if (sym->preserve == SYMPRES_INVALID)
sym->preserve = SYMPRES_NEVER;
if (sym->display_name == NULL)
sym->display_name = g_strdup(sym->identifier);
}
/* build the regex. */
/* Make characters into a regexp character set. */
escaped = g_regex_escape_string(characters->str, characters->len);
/* From now on, characters holds the "[...]" set, or remains empty if there
 * is no single-character symbol */
g_string_truncate(characters, 0);
if (*escaped || has_dash || has_rbracket || has_circum) {
g_string_append_printf(characters, "[%s", escaped);
if (has_dash)
g_string_append_printf(characters, "\\-");
if (has_rbracket)
g_string_append_printf(characters, "\\]");
if (has_circum)
g_string_append_printf(characters, "\\^");
g_string_append_c(characters, ']');
}
g_free(escaped);
/* The simple symbols must be ordered longest first so that the longer symbols will match.*/
multi_chars_list = g_slist_sort(multi_chars_list, list_sort_string_longest_first);
/* TODO: check the syntax is compatible with GLib */
pattern = g_string_new(NULL);
/* Strip repeated spaces from the end of the line to stop them from being picked up by repeated. */
g_string_append(pattern, "(?P<rstripSpace> +$)");
/* Repeated characters: more than 3 repeats. */
if (characters->len) {
g_string_append_c(pattern, '|');
g_string_append_printf(pattern, "(?P<repeated>(?P<repTmp>%s)(?P=repTmp){3,})", characters->str);
}
/* Complex symbols.
 * Each complex symbol has its own named group so we know which symbol matched. */
/* Groups are named c0, c1, ... after the index in complex_list */
guint i = 0;
for (node = ssp->complex_list; node; node = node->next, i++) {
SpeechSymbol *sym = node->data;
g_string_append_c(pattern, '|');
g_string_append_printf(pattern, "(?P<c%u>%s)", i, sym->pattern);
}
/* Simple symbols.
 * These are all handled in one named group.
 * Because the symbols are just text, we know which symbol matched just by looking at the matched text. */
escaped_multi = g_string_new(NULL);
if (ssp->complex_list)
/* We have some complex symbols. Keep the multi-characters rules along it */
for (node = multi_chars_list; node; node = node->next) {
escaped = g_regex_escape_string(node->data, -1);
if (escaped_multi->len > 0)
g_string_append_c(escaped_multi, '|');
g_string_append(escaped_multi, escaped);
g_free(escaped);
}
/* The "simple" group is only emitted if it has at least one alternative */
if ((escaped_multi->len && ssp->complex_list) || characters->len) {
g_string_append_c(pattern, '|');
g_string_append_printf(pattern, "(?P<simple>");
if (escaped_multi->len && ssp->complex_list)
g_string_append_printf(pattern, "%s", escaped_multi->str);
if (escaped_multi->len && ssp->complex_list && characters->len)
g_string_append_printf(pattern, "|");
if (characters->len)
g_string_append_printf(pattern, "%s", characters->str);
g_string_append_printf(pattern, ")");
}
g_string_free(escaped_multi, TRUE);
g_string_free(characters, TRUE);
MSG2(5, "symbols", "building regex: %s", pattern->str);
ssp->regex = g_regex_new(pattern->str, G_REGEX_OPTIMIZE, 0, &error);
if (!ssp->regex) {
/* if regex compilation failed, bail out */
MSG2(1, "symbols", "ERROR compiling regular expression: %s. "
"This is likely due to an invalid complex "
"symbol regular expression in locale %s.",
error->message, locale);
g_error_free(error);
speech_symbols_processor_free(ssp);
ssp = NULL;
goto out;
}
/* Reuse the pattern buffer for the multi-character regexps */
g_string_truncate(pattern, 0);
/* Number of symbols in the pattern being accumulated */
gint nsymbols = 0;
/* Simple symbols. */
if (!ssp->complex_list)
/* We have no complex symbols. We can handle them in one named group,
 * but possibly in several regexps to avoid the limitations of pcre.
 * Because the symbols are just text, we know which symbol matched
 * just by looking at the matched text. */
for (node = multi_chars_list; node; node = node->next) {
if (!nsymbols)
g_string_append_printf(pattern, "(?P<simple>");
else
g_string_append_c(pattern, '|');
escaped = g_regex_escape_string(node->data, -1);
g_string_append(pattern, escaped);
g_free(escaped);
nsymbols++;
if (nsymbols == 1000 || !node->next) {
/* Already large pattern, or end of list, flush pattern */
g_string_append_printf(pattern, ")");
MSG2(5, "symbols", "building regex: %s", pattern->str);
GRegex *regex = g_regex_new(pattern->str, G_REGEX_OPTIMIZE, 0, &error);
if (!regex) {
/* if regex compilation failed, bail out */
MSG2(1, "symbols", "ERROR compiling regular expression: %s. "
"This is likely due to an invalid complex "
"symbol regular expression in locale %s.",
error->message, locale);
g_error_free(error);
speech_symbols_processor_free(ssp);
ssp = NULL;
goto out;
}
/* Grow the array by one element and store the new regexp */
ssp->nmulti_chars_regex++;
ssp->multi_chars_regex = g_realloc(ssp->multi_chars_regex, ssp->nmulti_chars_regex * sizeof(ssp->multi_chars_regex[0]));
ssp->multi_chars_regex[ssp->nmulti_chars_regex-1] = regex;
g_string_truncate(pattern, 0);
nsymbols = 0;
}
}
out:
/* Only the list nodes are released: the strings and the sources are owned
 * by the symbols table and by the symbols cache */
g_slist_free(multi_chars_list);
g_string_free(pattern, TRUE);
g_slist_free(sources);
return ssp;
}
/* Builds the list of symbols processors of @p locale: one processor per
 * registered symbols file which could be loaded either for the locale or as
 * "base", in the order the files were registered.
 * @p file is not used.
 * Returns a GSList of SpeechSymbolProcessor*, NULL (empty list) if no
 * processor could be built. */
static gpointer speech_symbols_processor_list_new(const char *locale, const char *file)
{
	GSList *processors = NULL;
	GSList *it;
	gchar **lang_parts;

	/* Only used to show the language-only fallback in the messages below */
	lang_parts = g_strsplit_set(locale, "_-", 2);

	MSG2(2, "symbols", "Loading symbols for locale '%s', will try:", locale);
	MSG2(2, "symbols", "%s/locale/%s", SpeechdOptions.user_conf_dir, locale);
	MSG2(2, "symbols", "%s/locale", SpeechdOptions.user_conf_dir);
	MSG2(2, "symbols", LOCALE_DATA "/%s", locale);
	MSG2(2, "symbols", "%s/locale/%s", SpeechdOptions.user_conf_dir, lang_parts[0]);
	MSG2(2, "symbols", LOCALE_DATA "/%s", lang_parts[0]);
	MSG2(2, "symbols", "and also as base:");
	MSG2(2, "symbols", "%s/locale/base", SpeechdOptions.user_conf_dir);
	MSG2(2, "symbols", LOCALE_DATA "/base");

	g_strfreev(lang_parts);

	for (it = symbols_files; it != NULL; it = it->next) {
		char *dic = it->data;
		SpeechSymbols *locale_syms;
		SpeechSymbolProcessor *proc;

		MSG2(2, "symbols", "Loading '%s'", (char*) dic);

		locale_syms = get_locale_speech_symbols(locale, dic);
		if (!locale_syms && !get_locale_speech_symbols("base", dic)) {
			/* Neither the locale nor the base provide this file */
			MSG2(1, "symbols", "Failed to load symbols '%s' for locale '%s'",
			     (char*) dic, locale);
			continue;
		}

		/* When locale_syms is NULL, the processor includes only "base" */
		proc = speech_symbols_processor_new(locale, locale_syms, dic);
		if (proc)
			processors = g_slist_prepend(processors, proc);
	}

	/* The elements are added to the start of the list for better speed (as
	 * adding to the end requires walking the whole list), but we want them
	 * in the order they are in the config, so reverse the list. */
	return g_slist_reverse(processors);
}
/* Returns the text captured by the group @p name, as a newly allocated
 * string, or NULL if the fetch yielded nothing or an empty string.
 * FIXME: handle empty groups? (e.g. with only lookaheads/lookbehinds) */
static gchar *fetch_named_matching(const GMatchInfo *match_info, const gchar *name)
{
	gchar *text = g_match_info_fetch_named(match_info, name);

	if (text == NULL || text[0] != '\0')
		return text;

	/* Empty capture: report it the same way as no capture at all */
	g_free(text);
	return NULL;
}
/* Kind of regular expression group that matched; the first three are named
 * after the "rstripSpace", "repeated" and "simple" groups of the regexp. */
enum group {
RSTRIPSPACE,
REPEATED,
SIMPLE,
COMPLEX,
};
/* Binary search for the first block of tags positioned strictly after @p pos,
 * among the entries of @p tags from index @p firsttag (included) to
 * @p endtag (excluded). Returns its index, or the end of the searched range
 * if there is none. */
static gint find_nexttag(struct tags *tags, gint pos, gint firsttag, gint endtag)
{
	for (;;) {
		gint middletag;

		if (endtag == firsttag)
			/* Empty range, none here */
			return endtag;

		if (tags[firsttag].pos > pos)
			/* That's it already */
			return firsttag;

		middletag = (firsttag + 1 + endtag) / 2;
		if (middletag == endtag)
			/* Only firsttag was left, and it is not after pos */
			return endtag;

		/* Narrow down to the half which contains the answer */
		if (tags[middletag].pos > pos)
			endtag = middletag;
		else
			firsttag = middletag;
	}
}
/* Appends @p replacement to @p result, expanding its backslash sequences:
 * "\\" gives a backslash, and "\N" (N being a single digit) gives the text of
 * the sub-pattern number @p pos + N of @p match_info.
 * Any other escaped character is reported and appended as such, and a
 * trailing lone backslash is reported and dropped.
 * Always returns 1. */
static int replace_groups(const GMatchInfo *match_info, GString *result, char *replacement, gint pos)
{
	int in_escape = 0;
	char c;

	while ((c = *replacement++)) {
		if (!in_escape) {
			if (c == '\\')
				in_escape = 1;
			else
				g_string_append_c(result, c);
		} else {
			if (c == '\\')
				g_string_append_c(result, '\\');
			else if (c >= '0' && c <= '9') {
				gchar *res = g_match_info_fetch(match_info, pos + (c - '0'));

				if (res) {
					g_string_append(result, res);
					/* The fetched string is ours to release */
					g_free(res);
				} else {
					MSG2(1, "symbols", "Unmatched reference \\%c", c);
				}
			} else {
				MSG2(1, "symbols", "Invalid reference \\%c", c);
				g_string_append_c(result, c);
			}
			in_escape = 0;
		}
	}

	if (in_escape)
		MSG2(1, "symbols", "Unterminated backslash");

	return 1;
}
/* Regular expression callback for applying replacements.
 *
 * Called for each match found by g_regex_replace_eval(); user_data is the
 * SpeechSymbolProcessor being applied.  The function:
 *  1. finds out which named group captured the match ("rstripSpace",
 *     "repeated", "simple", or one of the complex "c<N>" groups);
 *  2. checks the tags located inside the match: if one of them is not
 *     deferrable, the captured text is copied unchanged, otherwise these
 *     tags are moved to the end of the match;
 *  3. appends to `result` what should stand in place of the match;
 *  4. records in ssp->tags[].shift by how much the text grew or shrank, so
 *     that the caller can fix tag positions afterwards.
 *
 * Always returns FALSE, which for a GRegexEvalCallback means "continue the
 * replacement process". */
static gboolean regex_eval(const GMatchInfo *match_info, GString *result, gpointer user_data)
{
	SpeechSymbolProcessor *ssp = user_data;
	gchar *capture;
	enum group captured_group;
	gchar *group_0;
	gint start = -1, end = -1;
	gint prevlen = result->len, shift;
	gint nexttag, curtag, deferrable;
	gsize capture_len;
	guint i = 0;
	SpeechSymbol *sym = NULL;
	gint pos = 0;

	/* First see what we captured */
	/* FIXME: Python regex API allows to find the name of the group that
	 * matched.  As GRegex doesn't have that, what we do here is try
	 * and fetch the groups we know, and see if they matched.
	 * This is not very optimal, but how can we avoid that? */
	if ((capture = fetch_named_matching(match_info, "rstripSpace"))) {
		captured_group = RSTRIPSPACE;
	} else if ((capture = fetch_named_matching(match_info, "repeated"))) {
		captured_group = REPEATED;
	} else if ((capture = fetch_named_matching(match_info, "simple"))) {
		captured_group = SIMPLE;
	} else {
		/* Complex symbol. */
		GSList *node;

		for (node = ssp->complex_list; !sym && node; node = node->next, i++) {
			gchar *group_name = g_strdup_printf("c%u", i);
			gchar **all;
			gint idx;

			/* Drop a capture left over from a previous iteration
			 * that did not lead to a symbol (NULL on first pass) */
			g_free(capture);
			capture = fetch_named_matching(match_info, group_name);
			g_free(group_name);
			if (!capture)
				continue;

			/* Find out the index of the match: the first group
			 * (after group 0) that captured some text */
			all = g_match_info_fetch_all(match_info);
			pos = -1;
			for (idx = 1; all && all[idx]; idx++) {
				if (all[idx][0]) {
					pos = idx;
					break;
				}
			}
			g_strfreev(all);

			if (pos != -1)
				sym = node->data;
			if (sym)
				/* Leave now so that i keeps designating this group */
				break;
		}
		captured_group = COMPLEX;
	}

	/* Now check where that lies among tags */
	g_match_info_fetch_pos(match_info, 0, &start, &end);
	nexttag = find_nexttag(ssp->tags, start, 0, ssp->ntags);

	/* Check whether the contained tags are deferrable */
	deferrable = 1;
	for (curtag = nexttag; curtag < ssp->ntags; curtag++) {
		if (ssp->tags[curtag].pos >= end)
			/* Don't care about the rest */
			break;
		/* This block of tags is within the group */
		if (!ssp->tags[curtag].deferrable) {
			/* Oops, these tags can't be deferred */
			deferrable = 0;
			break;
		}
	}

	if (!deferrable) {
		group_0 = g_match_info_fetch(match_info, 0);
		MSG2(1, "symbols", "tags '%s' within group |%s| (at %d..%d), not replacing group :/",
		     ssp->tags[curtag].tags, group_0, start, end);
		g_free(group_0);
		/* capture is NULL when no complex group captured any text */
		if (capture)
			g_string_append(result, capture);
		g_free(capture);
		return FALSE;
	}

	/* Defer these tags */
	for (curtag = nexttag; curtag < ssp->ntags; curtag++) {
		if (ssp->tags[curtag].pos >= end)
			/* Don't care about the rest */
			break;
		/* This block of tags is within the group, defer it after the group */
		MSG2(5, "symbols", "deferring tags '%s' to %d", ssp->tags[curtag].tags, end);
		ssp->tags[curtag].pos = end;
	}

	/* Ok, now replace */
	if (captured_group == RSTRIPSPACE) {
		MSG2(5, "symbols", "replacing <rstripSpace>");
		/* nothing to do, just don't add it in the result */
	} else if (captured_group == REPEATED) {
		/* Repeated character. */
		/* NOTE(review): only the first byte is used as lookup key and
		 * strlen() counts bytes, so this presumes the "repeated" group
		 * only ever matches single-byte characters -- confirm. */
		char ch[2] = { capture[0], 0 };
		SpeechSymbol *repeated_sym = g_hash_table_lookup(ssp->symbols, ch);

		MSG2(5, "symbols", "replacing <repeated>");
		/* this should never happen, but be on the safe side and check it */
		if (!repeated_sym)
			goto symbol_error;

		if (ssp->level >= repeated_sym->level) {
			g_string_append_printf(result, " %lu %s ", (unsigned long) strlen(capture), repeated_sym->replacement);
		} else {
			g_string_append_c(result, ' ');
		}
	} else {
		const gchar *prefix, *suffix;

		/* One of the defined symbols. */
		if (captured_group == SIMPLE) {
			/* Simple symbol. */
			sym = g_hash_table_lookup(ssp->symbols, capture);
			MSG2(5, "symbols", "replacing <simple>");
		} else {
			g_assert(captured_group == COMPLEX);
			/* Complex symbol, sym and i already set */
			MSG2(5, "symbols", "replacing <c%u> (complex symbol)", i);
		}
		/* this should never happen, but be on the safe side and check it */
		if (!sym)
			goto symbol_error;

		MSG2(5, "symbols", "replacing sym |%s| (lvl=%d, preserve=%d)",
		     sym->identifier, sym->level, sym->preserve);

		if (sym->preserve == SYMPRES_LITERAL)
			prefix = "";
		else
			prefix = " ";

		if (sym->preserve == SYMPRES_ALWAYS ||
		    (sym->preserve == SYMPRES_NOREP && ssp->level < sym->level))
			suffix = capture;
		else if (sym->preserve == SYMPRES_LITERAL)
			suffix = "";
		else
			suffix = " ";

		if (sym->level > ssp->support_level) {
			/* Leave it to the module */
			g_string_append(result, capture);
		} else if (ssp->level >= sym->level && sym->replacement) {
			g_string_append(result, prefix);
			MSG2(5, "symbols", "replacing with %s", sym->replacement);
			replace_groups(match_info, result, sym->replacement, pos);
			g_string_append(result, suffix);
		} else {
			g_string_append(result, suffix);
		}
	}
	goto out;

symbol_error:
	group_0 = g_match_info_fetch(match_info, 0);
	MSG2(1, "symbols", "WARNING: no symbol for match |%s| (at %d..%d), this shouldn't happen.",
	     group_0, start, end);
	g_free(group_0);

out:
	/* capture is NULL when we get here through symbol_error without any
	 * complex group having captured text: do not hand that to strlen() */
	capture_len = capture ? strlen(capture) : 0;
	/* content has grown (or shrunk) by this amount */
	shift = (result->len - prevlen) - capture_len;
	if (nexttag < ssp->ntags)
		/* Update positions of tags beyond this */
		ssp->tags[nexttag].shift += shift;
	g_free(capture);
	return FALSE;
}
/* Processes some input and converts symbols in it.
 *
 * sspl is the list of SpeechSymbolProcessor to apply in order, level is the
 * requested symbol level and support_level the level up to which we handle
 * symbols ourselves; both are stored in each processor before running it.
 * When ssml_mode is SPD_DATA_SSML, the input goes through
 * escape_ssml_text() first and through unescape_ssml_text() at the end, and
 * tag positions are updated after each processor has run.
 *
 * For each processor, its main regex is applied, then each of its
 * multi_chars_regex in turn.  A regex that fails is logged and skipped, the
 * text being left as it was before that step.  At SYMLVL_CHAR, processing
 * stops as soon as a processor leaves more than one character.
 *
 * Returns the resulting string. */
static gchar *speech_symbols_processor_process_text(GSList *sspl, const gchar *input, SymLvl level, SymLvl support_level, SPDDataMode ssml_mode)
{
	gchar *text;
	gchar *processed;
	struct tags *tags = NULL;
	gint ntags = 0, i;
	GError *error = NULL;

	if (ssml_mode == SPD_DATA_SSML) {
		text = escape_ssml_text(input, &tags, &ntags);
		MSG2(5, "symbols", "escaped ssml '%s' to '%s'", input, text);
	} else {
		text = g_strdup(input);
	}

	for ( ; sspl; sspl = sspl->next) {
		SpeechSymbolProcessor *ssp = sspl->data;
		gint step;

		if (ssml_mode == SPD_DATA_SSML) {
			for (i = 0; i < ntags; i++)
				tags[i].shift = 0;
			ssp->tags = tags;
			ssp->ntags = ntags;
		} else
			ssp->ntags = 0;

		ssp->level = level;
		ssp->support_level = support_level;

		MSG2(5, "symbols", "translating complex symbols and characters");
		processed = g_regex_replace_eval(ssp->regex, text, -1, 0, 0, regex_eval, ssp, &error);
		if (!processed) {
			MSG2(1, "symbols", "ERROR applying regex: %s",
			     error ? error->message : "unknown error");
			/* Reset to NULL too: the same GError pointer is handed
			 * to the following calls, which must not see a freed
			 * error. */
			g_clear_error(&error);
			continue;
		}

		MSG2(5, "symbols", "'%s' translated '%s' to '%s'", ssp->source, text, processed);
		g_free(text);
		text = processed;

		/* NOTE(review): tag positions are only updated once all passes
		 * of this processor are done, while matches of the passes below
		 * are located in already-transformed text -- confirm this is
		 * intended. */
		for (step = 0; step < ssp->nmulti_chars_regex; step++) {
			MSG2(5, "symbols", "translating multi-characters step %d", step);
			processed = g_regex_replace_eval(ssp->multi_chars_regex[step], text, -1, 0, 0, regex_eval, ssp, &error);
			if (!processed) {
				MSG2(1, "symbols", "ERROR applying regex: %s",
				     error ? error->message : "unknown error");
				g_clear_error(&error);
			} else {
				MSG2(5, "symbols", "'%s' translated '%s' to '%s'", ssp->source, text, processed);
				g_free(text);
				text = processed;
			}
		}

		if (ssml_mode == SPD_DATA_SSML) {
			/* This accumulates the shifts of all previous replacements */
			gssize shift = 0;

			/* Apply new tags positions */
			for (i = 0; i < ntags; i++) {
				shift += tags[i].shift;
				tags[i].pos += shift;
			}
		}

		/* Look at text rather than processed: the latter is NULL when
		 * the last multi-characters step failed, while text always
		 * holds the latest successful result. */
		if (level == SYMLVL_CHAR && g_utf8_strlen(text, -1) > 1)
			/* This translated it, avoid letting other rules continue expanding! */
			break;
	}

	if (ssml_mode == SPD_DATA_SSML) {
		processed = unescape_ssml_text(text, tags, ntags);
		MSG2(5, "symbols", "unescaped ssml '%s' to '%s'", text, processed);
		g_free(text);
	} else
		processed = text;

	return processed;
}
/* Gets a possibly cached list of processors for the given locale.
 *
 * The G_processors map is created on first use; the lookup itself, and the
 * creation of a missing entry through speech_symbols_processor_list_new(),
 * are delegated to locale_map_fetch(). */
static GSList *get_locale_speech_symbols_processor(const gchar *locale)
{
	if (G_processors == NULL)
		G_processors = locale_map_new((GDestroyNotify) speech_symbols_processor_list_free);

	return locale_map_fetch(G_processors, locale, NULL, speech_symbols_processor_list_new);
}
/*----------------------------------- API -----------------------------------*/
/* Process some text, converting symbols according to desired pronunciation.
 *
 * Looks up the processors for `locale`; for regional variants of English
 * ("en_XX" / "en-XX") without processors of their own, falls back to "en".
 *
 * Returns the result of speech_symbols_processor_process_text(), or NULL
 * when no processor is available for the locale. */
static gchar *process_speech_symbols(const gchar *locale, const gchar *text, SymLvl level, SymLvl support_level, SPDDataMode ssml_mode)
{
	GSList *sspl = get_locale_speech_symbols_processor(locale);

	/* fallback to English if there's no processor for the locale.
	 * The separator is compared explicitly: strchr("_-", locale[2]) also
	 * succeeds on the terminating NUL, which made plain "en" count as a
	 * variant and get looked up a second time for nothing. */
	if (!sspl && g_str_has_prefix(locale, "en") &&
	    (locale[2] == '_' || locale[2] == '-'))
		sspl = get_locale_speech_symbols_processor("en");

	if (!sspl)
		return NULL;

	return speech_symbols_processor_process_text(sspl, text, level, support_level, ssml_mode);
}
/* Convert the symbols of a message according to its punctuation settings.
 *
 * The symbol level is derived from the message's punctuation mode, or is
 * SYMLVL_CHAR for SPD_MSGTYPE_CHAR messages.  The message language is turned
 * into a locale name by replacing the first '-' with '_' and upper-casing
 * what follows (e.g. "en-us" becomes "en_US").
 *
 * When processing succeeds, msg->buf is replaced by the processed text.  In
 * that case the punctuation mode is additionally set to SPD_PUNCT_NONE if
 * the replacement was fully performed here, and a character message that
 * expanded to more than one character is turned into a text message.
 *
 * punct_missing, when non-zero, forces handling all levels ourselves
 * regardless of the symbols_preprocessing setting. */
void insert_symbols(TSpeechDMessage *msg, int punct_missing)
{
	gchar *processed;
	SymLvl level = SYMLVL_NONE;
	SymLvl support_level = msg->settings.symbols_preprocessing;
	const char *language = msg->settings.msg_settings.voice.language;
	char *locale, *dash;

	/* strdup(NULL) is undefined and strdup() itself may fail; without a
	 * locale there is nothing we can look up, so leave the message as is */
	locale = language ? strdup(language) : NULL;
	if (!locale) {
		MSG2(1, "symbols", "No usable language for the message, not processing symbols");
		return;
	}

	if (punct_missing && support_level < SYMLVL_ALL)
		/* The user preferred to let some modules handle some punctuation,
		 * but this module doesn't support it, so force handling it ourself. */
		support_level = SYMLVL_ALL;

	switch (msg->settings.msg_settings.punctuation_mode) {
	case SPD_PUNCT_ALL: level = SYMLVL_ALL; break;
	case SPD_PUNCT_MOST: level = SYMLVL_MOST; break;
	case SPD_PUNCT_SOME: level = SYMLVL_SOME; break;
	case SPD_PUNCT_NONE: level = SYMLVL_NONE; break;
	default: break;	/* keep SYMLVL_NONE */
	}

	if (msg->settings.type == SPD_MSGTYPE_CHAR)
		level = SYMLVL_CHAR;

	dash = strchr(locale, '-');
	if (dash) {
		char *c;

		*dash = '_';
		for (c = dash + 1; *c; c++)
			/* <ctype.h> functions are only defined for unsigned
			 * char values (and EOF) */
			*c = (char) toupper((unsigned char) *c);
	}

	MSG2(5, "symbols", "processing at level %d, supporting level %d", level, support_level);
	processed = process_speech_symbols(locale,
		msg->buf, level, support_level, msg->settings.ssml_mode);

	if (processed) {
		MSG2(5, "symbols", "before: |%s|", msg->buf);
		g_free(msg->buf);
		msg->buf = processed;
		MSG2(5, "symbols", "after: |%s|", msg->buf);

		if (support_level >= level)
			/* if we performed the replacement, don't let the module speak it again */
			msg->settings.msg_settings.punctuation_mode = SPD_PUNCT_NONE;

		/* if we provide a character description file, don't let the module spell it */
		if (msg->settings.type == SPD_MSGTYPE_CHAR &&
		    g_utf8_strlen(processed, -1) > 1)
			msg->settings.type = SPD_MSGTYPE_TEXT;
	}

	free(locale);
}