| |
| /* |
| * symbols.c -- Implements functions handling symbols conversion, |
| * including punctuation, for Speech Dispatcher |
| * |
| * Copyright (C) 2001,2002,2003, 2007, 2017 Brailcom, o.p.s |
| * |
| * This is free software; you can redistribute it and/or modify it |
| * under the terms of the GNU General Public License as published by |
| * the Free Software Foundation; either version 2, or (at your option) |
| * any later version. |
| * |
| * This software is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * General Public License for more details. |
| * |
| * You should have received a copy of the GNU General Public License |
| * along with this program. If not, see <https://www.gnu.org/licenses/>. |
| */ |
| |
| /* Based off NVDA's symbols replacement code (GPLv2+): |
| * https://github.com/nvaccess/nvda/blob/master/source/characterProcessing.py |
| * |
| * OVERVIEW |
| * |
| * This file contains all of the logic related to reading, processing and |
| * using NVDA symbols replacement files. It should be 100% compatible with |
| * NVDA's equivalent. |
| * |
| * The files are read by the speech_symbols_load() family of functions. |
| * Each symbol is loaded into a SpeechSymbol structure, and the symbols of |
| * a file (both simple and complex) are loaded into a SpeechSymbols (note the |
| * plural form) structure. |
| * |
| * The loaded symbols are compiled into GLib PCRE regular expressions |
| * (originally a Python one, but they are compatible enough) and converted to |
| * a fully usable form into a list of SpeechSymbolProcessor. These processors |
| * are then usable to pre-process an input text with |
| * speech_symbols_processor_process_text(). |
| * |
| * The loading steps are automatically handled when calling |
| * speech_symbols_processor_new(). To avoid re-processing files more than |
| * once even if they are used by different SpeechSymbolProcessor, the loaded |
| * files are cached as SpeechSymbols into the G_symbols_dicts global variable. |
| * Similarly, lists of SpeechSymbolProcessor are cached into the |
| * G_processors global variable. |
| * |
| * The caches are automatically loaded when looking up an entry with either |
| * get_locale_speech_symbols() (for SpeechSymbols lists) or |
| * get_locale_speech_symbols_processor() (for SpeechSymbolProcessor lists). |
| * This loading is aware of locale strings syntax and will fallback on the |
| * language code alone if the language-country combo isn't found. |
| * |
| * WARNING: this module is NOT thread-safe. Most notably, the caches are not |
| * thread-safe, so the public API insert_symbols() shouldn't be called from |
| * several threads at once. This should not be an issue, as it is supposed to |
| * be called from the speak thread only. |
| * |
| * This file is mostly a 1:1 translation of NVDA's python code doing the same |
| * thing, with slight simplifications or adaptations for C, and removal of |
| * unused features like loading user-specific symbols files. |
| */ |
| |
| /* |
| * TODO: |
| * - support NUL byte representation. However, they aren't properly handled |
| * in the rest of SPD, so it's not so important. |
| */ |
| |
| #ifdef HAVE_CONFIG_H |
| #include <config.h> |
| #endif |
| |
| #include <ctype.h> |
| |
| #include "symbols.h" |
| |
| /* This denotes the position of some SSML tags */ |
| struct tags { |
| gsize pos; /* Its position in the text */ |
| gssize shift; /* How much its position is shifted by the current replacements */ |
| gint deferrable; /* Whether it is fine to defer the tag (e.g. a mark or comment) */ |
| gchar *tags; /* The content of the tags */ |
| }; |
| |
/* Preserve modes of a speech symbol */
typedef enum {
	/* No mode set */
	SYMPRES_INVALID = -1,
	/* Never preserve the symbol */
	SYMPRES_NEVER = 0,
	/* Always preserve the symbol */
	SYMPRES_ALWAYS = 1,
	/* Only preserve the symbol if it is not being replaced; i.e. the
	 * user has set symbol level lower than the level of this symbol */
	SYMPRES_NOREP = 2,
	/* Replace literally, without any spacing addition */
	SYMPRES_LITERAL = 3
} SymPresMode;
| |
| /* Represents a single symbol, and how it should be handled. */ |
| typedef struct { |
| char *identifier; |
| char *pattern; |
| char *replacement; |
| SymLvl level; |
| SymPresMode preserve; |
| char *display_name; |
| } SpeechSymbol; |
| |
| /* Represents all symbols in a symbols file. |
| * This is roughly an internal representation of the symbols files. */ |
| typedef struct { |
| gchar *source; |
| /* Ordered list of [identifier(string), pattern(string)] */ |
| GSList *complex_symbols; |
| /* table of identifier(string):symbol(SpeechSymbol) */ |
| GHashTable *symbols; |
| } SpeechSymbols; |
| |
/* One name->value translation entry, for a field that should be loaded
 * as an integer. */
typedef struct {
	/* Spelling of the value in the symbols file */
	const char *name;
	/* Integer the name translates to */
	int value;
} IntFieldDesc;
| |
| /* Represents a loaded and cached set of symbols in a usable form */ |
| typedef struct { |
| gchar *source; |
| |
| struct tags *tags; /* tags attached to the text */ |
| gint ntags; /* number of elements in tags array */ |
| |
| GRegex *regex; /* compiled regular expression for parsing input */ |
| GRegex **multi_chars_regex; /* array of compiled regular expression for simple multi-char symbols */ |
| gint nmulti_chars_regex; /* number of elements in multi_chars_regex */ |
| /* Table of identifier(string):symbol(SpeechSymbol). |
| * Indexes are pointers to symbol->identifier. */ |
| GHashTable *symbols; |
| /* list of SpeechSymbol (weak pointers to entries in @c symbols) */ |
| GSList *complex_list; |
| |
| /* Level requested by user */ |
| SymLvl level; |
| /* Level to be supported */ |
| SymLvl support_level; |
| } SpeechSymbolProcessor; |
| |
| /* Map of locale code to arbitrary data. */ |
| typedef GHashTable LocaleMap; |
| typedef gpointer (*LocaleMapCreateDataFunc) (const gchar *locale, const gchar *file); |
| |
| /* globals for caching */ |
| |
| /* Map of SpeechSymbols, indexed by their locale and file */ |
| static LocaleMap *G_symbols_dicts = NULL; |
| /* Map of SpeechSymbolProcessor lists, indexed by their locale */ |
| static LocaleMap *G_processors = NULL; |
| |
| /* List of files to load */ |
| static GSList *symbols_files; |
| |
| SymLvl str2SymLvl(const char *str) |
| { |
| SymLvl punct; |
| |
| if (!strcmp(str, "no")) |
| punct = SYMLVL_NO; |
| else if (!strcmp(str, "none")) |
| punct = SYMLVL_NONE; |
| else if (!strcmp(str, "all")) |
| punct = SYMLVL_ALL; |
| else if (!strcmp(str, "char")) |
| punct = SYMLVL_CHAR; |
| else |
| punct = SYMLVL_INVALID; |
| |
| return punct; |
| } |
| |
| /*----------------------------- Locale data map -----------------------------*/ |
| |
| static LocaleMap *locale_map_new(GDestroyNotify value_destroy) |
| { |
| return g_hash_table_new_full(g_str_hash, g_str_equal, g_free, value_destroy); |
| } |
| |
| static gpointer locale_map_lookup(LocaleMap *map, const gchar *locale, const gchar *file) |
| { |
| if (file) { |
| gchar *str = g_strdup_printf("%s %s", locale, file); |
| gpointer res = g_hash_table_lookup(map, str); |
| g_free(str); |
| return res; |
| } else { |
| return g_hash_table_lookup(map, locale); |
| } |
| } |
| |
| /* Fetches or creates a locale item for the map. |
| * If @c locale contains a country and data for the whole locale is not found, |
| * tries to load the data for the language alone. */ |
| static gpointer locale_map_fetch(LocaleMap *map, const gchar *locale, const gchar *file, |
| LocaleMapCreateDataFunc create) |
| { |
| guint i; |
| |
| for (i = 0; i < 2; i++) { |
| gpointer value; |
| gchar *l; |
| |
| if (i == 0) { |
| value = locale_map_lookup(map, locale, file); |
| l = g_strdup(locale); |
| } else { |
| gchar **parts = g_strsplit_set(locale, "_-", 2); |
| if (!parts[0] || !parts[1]) { |
| /* no delimiters, no need to try again */ |
| g_strfreev(parts); |
| continue; |
| } |
| l = g_strdup(parts[0]); |
| value = locale_map_lookup(map, l, file); |
| g_strfreev(parts); |
| } |
| if (value) { |
| g_free(l); |
| return value; |
| } |
| /* try to create */ |
| value = create(l, file); |
| if (value) { |
| g_hash_table_insert(map, l, value); |
| return value; |
| } |
| g_free(l); |
| } |
| |
| return NULL; |
| } |
| |
| /*--------------------- Escaping xml tags in ssml text ----------------------*/ |
| |
| /* |
| * We need not ever speak the SSML syntax, so we need to skip the tags. |
| * |
| * For lookbehind and lookahead rules to be able to run, we have to really |
| * remove the tags from the text, but we want to remember where they were. |
| * |
| * We thus build an array of the positions of the tags, that the replacement |
| * function will update, so we know where to put back the tags. |
| * |
| * Alongside, we also have to untranslate/translate the xml entities for tag characters. |
| */ |
| |
| /* Move tags off from the text */ |
| static gchar *escape_ssml_text(const gchar *text, struct tags **tags_ret, gint *ntags_ret) |
| { |
| const gchar *cur, *curtag = NULL; |
| struct tags *tags; |
| GString *str; |
| gchar name[7]; /* Current tag name, only need to recognize against "mark", "/mark", "!--" for now */ |
| gsize namepos = 0; |
| |
| int filling_tag; /* Whether we are stack tags, or text */ |
| int in_tag; /* Whether we are within a tag */ |
| int in_tag_name; /* Whether we are within the name part of a tag */ |
| int in_apos; /* Whether we are within a '' string in a tag */ |
| int in_quote; /* Whether we are within a "" string in a tag */ |
| gint ntags; |
| |
| /* First count how many blocks of tags we will have */ |
| filling_tag = 0; |
| in_tag = 0; |
| in_tag_name = 0; |
| in_apos = 0; |
| in_quote = 0; |
| ntags = 0; |
| |
| for (cur = text; *cur; cur++) { |
| guchar c = *cur; |
| |
| if (!in_tag) { |
| if (c == '<') { |
| in_tag = 1; |
| if (!filling_tag) { |
| ntags++; |
| filling_tag = 1; |
| } |
| } else { |
| /* Some text, switch to text */ |
| filling_tag = 0; |
| } |
| } else { |
| if (in_apos) { |
| if (c == '\'') |
| in_apos = 0; |
| } else if (in_quote) { |
| if (c == '"') |
| in_quote = 0; |
| } else if (c == '\'') { |
| in_apos = 1; |
| } else if (c == '"') { |
| in_quote = 1; |
| } else if (c == '>') { |
| in_tag = 0; |
| } |
| } |
| } |
| |
| /* We can now allocate the array of blocks of tags and restart over, this time filling text and tags */ |
| tags = malloc(ntags * sizeof(*tags)); |
| |
| filling_tag = 0; |
| in_tag = 0; |
| in_apos = 0; |
| in_quote = 0; |
| ntags = 0; |
| |
| str = g_string_sized_new(strlen(text)); |
| |
| for (cur = text; *cur; cur++) { |
| guchar c = *cur; |
| |
| if (!in_tag) { |
| if (c == '<') { |
| in_tag = 1; |
| in_tag_name = 1; |
| namepos = 0; |
| if (!filling_tag) { |
| /* Note the tags position in the text */ |
| tags[ntags].pos = str->len; |
| /* A priori only deferrable tags */ |
| tags[ntags].deferrable = 1; |
| curtag = cur; |
| filling_tag = 1; |
| } |
| } else { |
| if (filling_tag) { |
| /* Some text, dump the tags and switch to text */ |
| tags[ntags].tags = g_strndup(curtag, cur - curtag); |
| ntags++; |
| filling_tag = 0; |
| } |
| |
| if (c == '&') { |
| /* Unescape ssml character sequences */ |
| if (!strncmp(cur, """, 6)) { |
| cur += 5; |
| g_string_append_c(str, '"'); |
| } else if (!strncmp(cur, "'", 6)) { |
| cur += 5; |
| g_string_append_c(str, '\''); |
| } else if (!strncmp(cur, "<", 4)) { |
| cur += 3; |
| g_string_append_c(str, '<'); |
| } else if (!strncmp(cur, ">", 4)) { |
| cur += 3; |
| g_string_append_c(str, '>'); |
| } else if (!strncmp(cur, "&", 5)) { |
| cur += 4; |
| g_string_append_c(str, '&'); |
| } else |
| g_string_append_c(str, c); |
| } else { |
| /* Pure text, append as such */ |
| g_string_append_c(str, c); |
| } |
| } |
| } else { |
| if (in_apos) { |
| if (c == '\'') |
| in_apos = 0; |
| } else if (in_quote) { |
| if (c == '"') |
| in_quote = 0; |
| } else if (c == '\'') { |
| in_apos = 1; |
| } else if (c == '"') { |
| in_quote = 1; |
| } else { |
| if (in_tag_name) { |
| if (c == '>' || c == ' ' || c == '\t' || c == '\r' || c == '\n') { |
| in_tag_name = 0; |
| name[namepos] = '\0'; |
| if (strcmp(name, "mark") |
| && strcmp(name, "/mark") |
| && strcmp(name, "mark/") |
| && strcmp(name, "!--")) { |
| /* This is a non-deferrable tag */ |
| tags[ntags].deferrable = 0; |
| } |
| } else { |
| if (namepos < sizeof(name) - 1) { |
| name[namepos++] = c; |
| } |
| } |
| } |
| if (c == '>') |
| in_tag = 0; |
| } |
| } |
| } |
| /* Trailing tags content */ |
| if (filling_tag) { |
| tags[ntags].tags = g_strndup(curtag, cur - curtag); |
| ntags++; |
| } |
| |
| *tags_ret = tags; |
| *ntags_ret = ntags; |
| |
| return g_string_free(str, FALSE); |
| } |
| |
| /* Put back tags into the text */ |
| static gchar *unescape_ssml_text(const gchar *text, struct tags *tags, gint ntags) |
| { |
| GString *str; |
| const gchar *cur; |
| struct tags *curtags = tags; |
| |
| str = g_string_sized_new(strlen(text)); |
| |
| for (cur = text; *cur; cur++) { |
| guchar c; |
| |
| while (ntags && cur - text == curtags->pos) { |
| /* We reached the position of a block of tags, put them back */ |
| g_string_append(str, curtags->tags); |
| curtags++; |
| ntags--; |
| } |
| |
| c = *cur; |
| |
| /* Re-escape ssml character sequences */ |
| if (c == '"') |
| g_string_append(str, """); |
| else if (c == '\'') |
| g_string_append(str, "'"); |
| else if (c == '<') |
| g_string_append(str, "<"); |
| else if (c == '>') |
| g_string_append(str, ">"); |
| else if (c == '&') |
| g_string_append(str, "&"); |
| else |
| g_string_append_c(str, c); |
| } |
| |
| while (ntags) { |
| /* Trailing tags */ |
| g_string_append(str, curtags->tags); |
| curtags++; |
| ntags--; |
| } |
| |
| free(tags); |
| |
| return g_string_free(str, FALSE); |
| } |
| |
| /*----------------- Speech symbol representation and loading ----------------*/ |
| |
| static SpeechSymbol *speech_symbol_new(void) |
| { |
| SpeechSymbol *sym = g_slice_alloc(sizeof *sym); |
| |
| sym->identifier = NULL; |
| sym->pattern = NULL; |
| sym->replacement = NULL; |
| sym->level = SYMLVL_INVALID; |
| sym->preserve = SYMPRES_INVALID; |
| sym->display_name = NULL; |
| |
| return sym; |
| } |
| |
| static void speech_symbol_free(SpeechSymbol *sym) |
| { |
| /* sym->identifier is the key, thus freed by hash table */ |
| g_free(sym->pattern); |
| g_free(sym->replacement); |
| g_free(sym->display_name); |
| g_slice_free1(sizeof *sym, sym); |
| } |
| |
/* Tells whether a line should be skipped.
 * Returns non-zero for a line starting with '#' (comment) and for a line
 * holding nothing but whitespace (blank), 0 otherwise. */
static int skip_line(const char *line)
{
	const char *p = line;

	if (p[0] == '#')
		return 1;

	for (; g_ascii_isspace(*p); p++)
		;

	return *p == '\0';
}
| |
/* Cuts a single line buffer at its first \r or \n, if it has one */
static void strip_newline(char *line)
{
	line[strcspn(line, "\r\n")] = '\0';
}
| |
| /* Loads an "identifier\tpattern" line into complex_symbols */ |
| static int speech_symbols_load_complex_symbol(SpeechSymbols *ss, const char *line) |
| { |
| char **parts = g_strsplit(line, "\t", 2); |
| |
| if (g_strv_length(parts) != 2) { |
| g_strfreev(parts); |
| return -1; |
| } |
| |
| ss->complex_symbols = g_slist_prepend(ss->complex_symbols, parts); |
| |
| return 0; |
| } |
| |
| /* Finds the entry in @p map that corresponds to @p name, and put its value |
| * into the integer pointer to by @p value */ |
| static int speech_symbols_load_int_field(IntFieldDesc *map, guint map_len, |
| const char *name, int *value) |
| { |
| guint i; |
| |
| for (i = 0; i < map_len; i++) { |
| if (strcmp(map[i].name, name) == 0) { |
| *value = map[i].value; |
| return 0; |
| } |
| } |
| |
| return -1; |
| } |
| |
| /* Loads a symbol line into symbols |
| * syntax is: |
| * identifier "\t" replacement [ "\t" level [ "\t" preserve ] [ "\t#" comment ] */ |
| static int speech_symbols_load_symbol(SpeechSymbols *ss, const char *line) |
| { |
| char **parts = g_strsplit(line, "\t", -1); |
| guint len = g_strv_length(parts); |
| char *display_name = NULL; |
| char *identifier = NULL; |
| char *replacement = NULL; |
| int level = SYMLVL_INVALID; |
| int pres_mode = SYMPRES_INVALID; |
| SpeechSymbol *sym; |
| |
| /* last field, if commented: display name */ |
| if (len > 0 && parts[len - 1][0] == '#') { |
| /* Regardless of how many fields there are, |
| * if the last field is a comment, it is the display name. */ |
| const char *p; |
| |
| display_name = parts[len - 1]; |
| parts[--len] = NULL; |
| |
| p = display_name + 1; |
| while (g_ascii_isspace(*p)) |
| p++; |
| memmove(display_name, p, strlen(p) + 1); |
| } |
| |
| /* 4th field (optional): preserve */ |
| if (len > 3) { |
| IntFieldDesc map[] = { |
| { "-", SYMPRES_NEVER }, |
| { "never", SYMPRES_NEVER }, |
| { "always", SYMPRES_ALWAYS }, |
| { "norep", SYMPRES_NOREP }, |
| { "literal", SYMPRES_LITERAL }, |
| }; |
| |
| if (speech_symbols_load_int_field(map, G_N_ELEMENTS(map), |
| parts[3], &pres_mode) < 0) |
| goto err; |
| } |
| |
| /* 3rd field (optional): level */ |
| if (len > 2) { |
| IntFieldDesc map[] = { |
| { "-", SYMLVL_NONE }, |
| { "none", SYMLVL_NONE }, |
| { "some", SYMLVL_SOME }, |
| { "most", SYMLVL_MOST }, |
| { "all", SYMLVL_ALL }, |
| { "char", SYMLVL_CHAR }, |
| }; |
| |
| if (speech_symbols_load_int_field(map, G_N_ELEMENTS(map), |
| parts[2], &level) < 0) |
| goto err; |
| } |
| |
| /* missing required fields */ |
| if (len < 2 || !parts[0] || !parts[0][0]) |
| goto err; |
| |
| /* 2nd field: replacement */ |
| if (strcmp(parts[1], "-") == 0) |
| replacement = NULL; |
| else |
| replacement = g_strdup(parts[1]); |
| |
| /* 1st field: identifier */ |
| if (parts[0][0] == '\\' && parts[0][1]) { |
| identifier = g_strdup(parts[0] + 1); |
| switch (identifier[0]) { |
| case '0': |
| identifier[0] = '\0'; |
| /* FIXME: support this */ |
| MSG2(1, "symbols", "Loading NUL byte entry is not yet supported"); |
| goto err; |
| break; |
| case 't': identifier[0] = '\t'; break; |
| case 'n': identifier[0] = '\n'; break; |
| case 'r': identifier[0] = '\r'; break; |
| case 'f': identifier[0] = '\f'; break; |
| case 'v': identifier[0] = '\v'; break; |
| case '#': |
| case '\\': |
| /* nothing to do */ |
| break; |
| } |
| } else |
| identifier = g_strdup(parts[0]); |
| |
| sym = speech_symbol_new(); |
| sym->identifier = identifier; |
| sym->replacement = replacement; |
| sym->level = level; |
| sym->preserve = pres_mode; |
| sym->display_name = display_name; |
| |
| g_hash_table_insert(ss->symbols, sym->identifier, sym); |
| |
| g_strfreev(parts); |
| |
| return 0; |
| |
| err: |
| g_free(display_name); |
| g_free(identifier); |
| g_free(replacement); |
| g_strfreev(parts); |
| |
| return -1; |
| } |
| |
| /* Loads a symbols.dic file into @p ss */ |
| static int speech_symbols_load(SpeechSymbols *ss, const char *filename, gboolean allow_complex) |
| { |
| FILE *fp; |
| char *line = NULL; |
| size_t n = 0; |
| unsigned char bom[3]; |
| /* line parsing callback for the current section */ |
| int (*handler) (SpeechSymbols *, const char *) = NULL; |
| |
| fp = fopen(filename, "r"); |
| if (!fp) { |
| int level = 5; /* Common case, avoid shouting */ |
| if (errno != ENOENT) |
| /* Odd error, shout */ |
| level = 1; |
| MSG2(level, "symbols", "Failed to open file '%s': %s", filename, g_strerror(errno)); |
| return -1; |
| } |
| |
| /* skip UTF-8 BOM if present */ |
| if (fread(bom, sizeof *bom, sizeof bom, fp) != sizeof bom || |
| bom[0] != 0xEF || bom[1] != 0xBB || bom[2] != 0xBF) |
| fseek(fp, 0, SEEK_SET); |
| |
| while (getline(&line, &n, fp) >= 0) { |
| if (skip_line(line)) |
| continue; |
| strip_newline(line); |
| |
| if (allow_complex && strcmp(line, "complexSymbols:") == 0) { |
| handler = speech_symbols_load_complex_symbol; |
| } else if (strcmp(line, "symbols:") == 0) { |
| handler = speech_symbols_load_symbol; |
| } else if (!handler || handler(ss, line) < 0) { |
| MSG2(1, "symbols", "Invalid line in file %s: %s", |
| filename, line); |
| } |
| } |
| |
| free(line); |
| fclose(fp); |
| |
| MSG2(1, "symbols", "Loaded file '%s'", filename); |
| |
| return 0; |
| } |
| |
| static void speech_symbols_free(SpeechSymbols *ss) |
| { |
| g_slist_free_full(ss->complex_symbols, (GDestroyNotify) g_strfreev); |
| g_hash_table_destroy(ss->symbols); |
| g_free(ss->source); |
| g_free(ss); |
| } |
| |
| /* Loads a symbols file for @p locale. |
| * Returns a SpeechSymbols*, or NULL on error. */ |
| static gpointer speech_symbols_new(const gchar *locale, const gchar *file) |
| { |
| SpeechSymbols *ss = g_malloc(sizeof *ss); |
| gchar *path; |
| int ret; |
| |
| ss->complex_symbols = NULL; |
| ss->source = NULL; |
| ss->symbols = g_hash_table_new_full(g_str_hash, g_str_equal, |
| g_free, |
| (GDestroyNotify) speech_symbol_free); |
| |
| path = g_build_filename(SpeechdOptions.user_conf_dir, "locale", locale, file, NULL); |
| MSG2(5, "symbols", "Trying to load %s for '%s' from '%s/locale'", file, locale, SpeechdOptions.user_conf_dir); |
| ret = speech_symbols_load(ss, path, TRUE); |
| if (ret < 0) { |
| path = g_build_filename(SpeechdOptions.user_conf_dir, "locale", file, NULL); |
| MSG2(5, "symbols", "Trying to load %s from '%s/locale'", file, SpeechdOptions.user_conf_dir); |
| ret = speech_symbols_load(ss, path, TRUE); |
| } |
| if (ret < 0) { |
| path = g_build_filename(LOCALE_DATA, locale, file, NULL); |
| MSG2(5, "symbols", "Trying to load %s for '%s' from '%s'", file, locale, path); |
| ret = speech_symbols_load(ss, path, TRUE); |
| } |
| if (ret >= 0) { |
| MSG2(5, "symbols", "Successful"); |
| /* The elements are added to the start of the list in |
| * speech_symbols_load_complex_symbol() for better speed (as adding to |
| * the end requires walking the whole list), but we want them in the |
| * order they are in the file, so reverse the list. */ |
| ss->complex_symbols = g_slist_reverse(ss->complex_symbols); |
| ss->source = g_strdup(file); |
| } else { |
| /* Nothing loaded in the end */ |
| MSG2(5, "symbols", "Failed"); |
| speech_symbols_free(ss); |
| ss = NULL; |
| } |
| g_free(path); |
| |
| return ss; |
| } |
| |
| static SpeechSymbols *get_locale_speech_symbols(const gchar *locale, const gchar *file) |
| { |
| if (!G_symbols_dicts) { |
| G_symbols_dicts = locale_map_new((GDestroyNotify) speech_symbols_free); |
| } |
| |
| return locale_map_fetch(G_symbols_dicts, locale, file, speech_symbols_new); |
| } |
| |
| void symbols_preprocessing_add_file(const char *name) |
| { |
| MSG2(5, "symbols", "Will load symbol file %s", name); |
| symbols_files = g_slist_append(symbols_files, g_strdup(name)); |
| } |
| |
| /*------------------ Speech symbol compilation & processing -----------------*/ |
| |
| /* sort function sorting strings by length, longest first */ |
| static gint list_sort_string_longest_first(gconstpointer a, gconstpointer b) |
| { |
| return strlen(b) - strlen(a); |
| } |
| |
| static void speech_symbols_processor_free(SpeechSymbolProcessor *ssp) |
| { |
| gint i; |
| if (ssp->regex) |
| g_regex_unref(ssp->regex); |
| for (i = 0; i < ssp->nmulti_chars_regex; i++) |
| g_regex_unref(ssp->multi_chars_regex[i]); |
| g_free(ssp->multi_chars_regex); |
| g_slist_free(ssp->complex_list); |
| if (ssp->symbols) |
| g_hash_table_unref(ssp->symbols); |
| g_free(ssp->source); |
| g_free(ssp); |
| } |
| |
| static void speech_symbols_processor_list_free(GSList *sspl) |
| { |
| GSList *e; |
| for (e = sspl; e; e = e->next) |
| speech_symbols_processor_free(e->data); |
| } |
| |
| /* Loads and compiles speech symbols conversions for @p locale. |
| * Returns a SpeechSymbolProcessor*, or NULL on error */ |
| static SpeechSymbolProcessor *speech_symbols_processor_new(const char *locale, SpeechSymbols *syms, const char *file) |
| { |
| SpeechSymbolProcessor *ssp = NULL; |
| SpeechSymbols *ssbase; |
| GHashTableIter iter; |
| gpointer key, value; |
| GString *characters; |
| GSList *multi_chars_list = NULL; |
| gchar *escaped; |
| GString *escaped_multi; |
| GString *pattern; |
| GError *error = NULL; |
| GSList *sources = NULL; |
| GSList *node; |
| int has_dash = 0; |
| int has_rbracket = 0; |
| int has_circum = 0; |
| |
| if (syms) |
| sources = g_slist_append(sources, syms); |
| /* Always use the base. */ |
| ssbase = get_locale_speech_symbols("base", file); |
| if (ssbase) |
| sources = g_slist_append(sources, ssbase); |
| |
| ssp = g_malloc(sizeof *ssp); |
| ssp->multi_chars_regex = NULL; |
| ssp->nmulti_chars_regex = 0; |
| |
| ssp->source = g_strdup(file); |
| /* The computed symbol information from all sources. */ |
| ssp->symbols = g_hash_table_new_full(g_str_hash, g_str_equal, |
| g_free, |
| (GDestroyNotify) speech_symbol_free); |
| /* An indexable list of complex symbols for use in building/executing the regexp. */ |
| ssp->complex_list = NULL; |
| |
| /* Add all complex symbols first, as they take priority. */ |
| for (node = sources; node; node = node->next) { |
| SpeechSymbols *syms = node->data; |
| GSList *node2; |
| |
| for (node2 = syms->complex_symbols; node2; node2 = node2->next) { |
| SpeechSymbol *sym; |
| gchar **key_val = node2->data; |
| |
| if (g_hash_table_contains(ssp->symbols, key_val[0])) { |
| /* Already defined */ |
| continue; |
| } |
| |
| sym = speech_symbol_new(); |
| sym->identifier = g_strdup(key_val[0]); |
| sym->pattern = g_strdup(key_val[1]); |
| g_hash_table_insert(ssp->symbols, sym->identifier, sym); |
| ssp->complex_list = g_slist_prepend(ssp->complex_list, sym); |
| } |
| } |
| /* Elements are added at the start for performance, but we want them in the original order */ |
| ssp->complex_list = g_slist_reverse(ssp->complex_list); |
| |
| /* Supplement the data for complex symbols and add all simple symbols. */ |
| characters = g_string_new(NULL); |
| for (node = sources; node; node = node->next) { |
| SpeechSymbols *syms = node->data; |
| |
| g_hash_table_iter_init(&iter, syms->symbols); |
| while (g_hash_table_iter_next(&iter, &key, &value)) { |
| const SpeechSymbol *source_sym = value; |
| SpeechSymbol *sym; |
| |
| sym = g_hash_table_lookup(ssp->symbols, key); |
| if (!sym && syms != ssbase) { |
| /* This is a new simple symbol. |
| * (All complex symbols have already been added.) */ |
| sym = speech_symbol_new(); |
| sym->identifier = g_strdup(key); |
| g_hash_table_insert(ssp->symbols, sym->identifier, sym); |
| if (strlen(sym->identifier) == 1) { |
| switch (sym->identifier[0]) { |
| case '-': |
| has_dash = 1; |
| break; |
| case ']': |
| has_rbracket = 1; |
| break; |
| case '^': |
| has_circum = 1; |
| break; |
| default: |
| g_string_append_c(characters, sym->identifier[0]); |
| } |
| } else { |
| multi_chars_list = g_slist_prepend(multi_chars_list, sym->identifier); |
| } |
| } |
| if (sym) { |
| /* If fields weren't explicitly specified, inherit the value from later sources. */ |
| if (sym->replacement == NULL) |
| sym->replacement = g_strdup(source_sym->replacement); |
| if (sym->level == SYMLVL_INVALID) |
| sym->level = source_sym->level; |
| if (sym->preserve == SYMPRES_INVALID) |
| sym->preserve = source_sym->preserve; |
| if (sym->display_name == NULL) |
| sym->display_name = g_strdup(source_sym->display_name); |
| } |
| } |
| } |
| |
| /* Set defaults for any fields not explicitly set. */ |
| g_hash_table_iter_init(&iter, ssp->symbols); |
| while (g_hash_table_iter_next(&iter, &key, &value)) { |
| SpeechSymbol *sym = value; |
| |
| if (!sym->replacement) { |
| /* Symbols without a replacement specified are useless. */ |
| MSG2(2, "symbols", "Replacement not defined " |
| "in locale %s for symbol: %s", |
| locale, sym->identifier); |
| ssp->complex_list = g_slist_remove(ssp->complex_list, sym); |
| g_hash_table_iter_remove(&iter); |
| continue; |
| } |
| if (sym->level == SYMLVL_INVALID) |
| sym->level = SYMLVL_ALL; |
| if (sym->preserve == SYMPRES_INVALID) |
| sym->preserve = SYMPRES_NEVER; |
| if (sym->display_name == NULL) |
| sym->display_name = g_strdup(sym->identifier); |
| } |
| |
| /* build the regex. */ |
| |
| /* Make characters into a regexp character set. */ |
| escaped = g_regex_escape_string(characters->str, characters->len); |
| g_string_truncate(characters, 0); |
| if (*escaped || has_dash || has_rbracket || has_circum) { |
| g_string_append_printf(characters, "[%s", escaped); |
| if (has_dash) |
| g_string_append_printf(characters, "\\-"); |
| if (has_rbracket) |
| g_string_append_printf(characters, "\\]"); |
| if (has_circum) |
| g_string_append_printf(characters, "\\^"); |
| g_string_append_c(characters, ']'); |
| } |
| g_free(escaped); |
| |
| /* The simple symbols must be ordered longest first so that the longer symbols will match.*/ |
| multi_chars_list = g_slist_sort(multi_chars_list, list_sort_string_longest_first); |
| |
| /* TODO: check the syntax is compatible with GLib */ |
| pattern = g_string_new(NULL); |
| /* Strip repeated spaces from the end of the line to stop them from being picked up by repeated. */ |
| g_string_append(pattern, "(?P<rstripSpace> +$)"); |
| /* Repeated characters: more than 3 repeats. */ |
| if (characters->len) { |
| g_string_append_c(pattern, '|'); |
| g_string_append_printf(pattern, "(?P<repeated>(?P<repTmp>%s)(?P=repTmp){3,})", characters->str); |
| } |
| /* Complex symbols. |
| * Each complex symbol has its own named group so we know which symbol matched. */ |
| guint i = 0; |
| for (node = ssp->complex_list; node; node = node->next, i++) { |
| SpeechSymbol *sym = node->data; |
| g_string_append_c(pattern, '|'); |
| g_string_append_printf(pattern, "(?P<c%u>%s)", i, sym->pattern); |
| } |
| |
| /* Simple symbols. |
| * These are all handled in one named group. |
| * Because the symbols are just text, we know which symbol matched just by looking at the matched text. */ |
| escaped_multi = g_string_new(NULL); |
| if (ssp->complex_list) |
| /* We have some complex symbols. Keep the multi-characters rules along it */ |
| for (node = multi_chars_list; node; node = node->next) { |
| escaped = g_regex_escape_string(node->data, -1); |
| if (escaped_multi->len > 0) |
| g_string_append_c(escaped_multi, '|'); |
| g_string_append(escaped_multi, escaped); |
| g_free(escaped); |
| } |
| |
| if ((escaped_multi->len && ssp->complex_list) || characters->len) { |
| g_string_append_c(pattern, '|'); |
| g_string_append_printf(pattern, "(?P<simple>"); |
| if (escaped_multi->len && ssp->complex_list) |
| g_string_append_printf(pattern, "%s", escaped_multi->str); |
| if (escaped_multi->len && ssp->complex_list && characters->len) |
| g_string_append_printf(pattern, "|"); |
| if (characters->len) |
| g_string_append_printf(pattern, "%s", characters->str); |
| g_string_append_printf(pattern, ")"); |
| } |
| g_string_free(escaped_multi, TRUE); |
| g_string_free(characters, TRUE); |
| |
| MSG2(5, "symbols", "building regex: %s", pattern->str); |
| ssp->regex = g_regex_new(pattern->str, G_REGEX_OPTIMIZE, 0, &error); |
| if (!ssp->regex) { |
| /* if regex compilation failed, bail out */ |
| MSG2(1, "symbols", "ERROR compiling regular expression: %s. " |
| "This is likely due to an invalid complex " |
| "symbol regular expression in locale %s.", |
| error->message, locale); |
| g_error_free(error); |
| speech_symbols_processor_free(ssp); |
| ssp = NULL; |
| goto out; |
| } |
| |
| g_string_truncate(pattern, 0); |
| |
| gint nsymbols = 0; |
| |
| /* Simple symbols. */ |
| if (!ssp->complex_list) |
| /* We have no complex symbols. We can handle them in one named group, |
| * but possibly in several regexps to avoid the limitations of pcre. |
| * Because the symbols are just text, we know which symbol matched |
| * just by looking at the matched text. */ |
| for (node = multi_chars_list; node; node = node->next) { |
| if (!nsymbols) |
| g_string_append_printf(pattern, "(?P<simple>"); |
| else |
| g_string_append_c(pattern, '|'); |
| escaped = g_regex_escape_string(node->data, -1); |
| g_string_append(pattern, escaped); |
| g_free(escaped); |
| nsymbols++; |
| |
| if (nsymbols == 1000 || !node->next) { |
| /* Already large pattern, or end of list, flush pattern */ |
| g_string_append_printf(pattern, ")"); |
| |
| MSG2(5, "symbols", "building regex: %s", pattern->str); |
| GRegex *regex = g_regex_new(pattern->str, G_REGEX_OPTIMIZE, 0, &error); |
| if (!regex) { |
| /* if regex compilation failed, bail out */ |
| MSG2(1, "symbols", "ERROR compiling regular expression: %s. " |
| "This is likely due to an invalid complex " |
| "symbol regular expression in locale %s.", |
| error->message, locale); |
| g_error_free(error); |
| speech_symbols_processor_free(ssp); |
| ssp = NULL; |
| goto out; |
| } |
| |
| ssp->nmulti_chars_regex++; |
| ssp->multi_chars_regex = g_realloc(ssp->multi_chars_regex, ssp->nmulti_chars_regex * sizeof(ssp->multi_chars_regex[0])); |
| ssp->multi_chars_regex[ssp->nmulti_chars_regex-1] = regex; |
| |
| g_string_truncate(pattern, 0); |
| nsymbols = 0; |
| } |
| } |
| |
| out: |
| g_slist_free(multi_chars_list); |
| |
| g_string_free(pattern, TRUE); |
| g_slist_free(sources); |
| |
| return ssp; |
| } |
| |
| /* Loads and compiles speech symbols conversions for @p locale. |
| * Returns a SpeechSymbolProcessor*, or NULL on error */ |
| static gpointer speech_symbols_processor_list_new(const char *locale, const char *file) |
| { |
| SpeechSymbolProcessor *ssp; |
| SpeechSymbols *ss; |
| GSList *sspl = NULL; |
| GSList *node; |
| |
| gchar **parts = g_strsplit_set(locale, "_-", 2); |
| MSG2(2, "symbols", "Loading symbols for locale '%s', will try:", locale); |
| MSG2(2, "symbols", "%s/locale/%s", SpeechdOptions.user_conf_dir, locale); |
| MSG2(2, "symbols", "%s/locale", SpeechdOptions.user_conf_dir); |
| MSG2(2, "symbols", LOCALE_DATA "/%s", locale); |
| MSG2(2, "symbols", "%s/locale/%s", SpeechdOptions.user_conf_dir, parts[0]); |
| MSG2(2, "symbols", LOCALE_DATA "/%s", parts[0]); |
| MSG2(2, "symbols", "and also as base:"); |
| MSG2(2, "symbols", "%s/locale/base", SpeechdOptions.user_conf_dir); |
| MSG2(2, "symbols", LOCALE_DATA "/base"); |
| g_strfreev(parts); |
| |
| for (node = symbols_files; node; node = node->next) { |
| MSG2(2, "symbols", "Loading '%s'", (char*) node->data); |
| ss = get_locale_speech_symbols(locale, node->data); |
| if (ss) { |
| ssp = speech_symbols_processor_new(locale, ss, (char*) node->data); |
| if (ssp) |
| sspl = g_slist_prepend(sspl, ssp); |
| } else { |
| ss = get_locale_speech_symbols("base", node->data); |
| if (ss) { |
| /* Let speech_symbols_processor_new include only "base" */ |
| ssp = speech_symbols_processor_new(locale, NULL, (char*) node->data); |
| if (ssp) |
| sspl = g_slist_prepend(sspl, ssp); |
| } else |
| MSG2(1, "symbols", "Failed to load symbols '%s' for locale '%s'", |
| (char*) node->data, locale); |
| } |
| } |
| |
| /* The elements are added to the start of the list for better speed (as |
| * adding to the end requires walking the whole list), but we want them |
| * in the order they are in the config, so reverse the list. */ |
| sspl = g_slist_reverse(sspl); |
| |
| return sspl; |
| } |
| |
| /* Fetch a named group that matched. |
| * FIXME: handle empty groups? (e.g. with only lookaheads/lookbehinds) */ |
| static gchar *fetch_named_matching(const GMatchInfo *match_info, const gchar *name) |
| { |
| gchar *capture = g_match_info_fetch_named(match_info, name); |
| |
| if (capture && !*capture) { |
| g_free(capture); |
| capture = NULL; |
| } |
| |
| return capture; |
| } |
| |
/* Kind of named group that produced a match in regex_eval() */
enum group {
	RSTRIPSPACE = 0,	/* the "rstripSpace" group */
	REPEATED = 1,		/* the "repeated" group */
	SIMPLE = 2,		/* the "simple" group */
	COMPLEX = 3,		/* none of the above: a "c<N>" complex symbol group */
};
| |
| /* Look for the first block of tags strictly after pos, among tags between firsttag and lasttag */ |
| static gint find_nexttag(struct tags *tags, gint pos, gint firsttag, gint endtag) |
| { |
| gint middletag; |
| |
| if (endtag == firsttag) |
| /* None here */ |
| return endtag; |
| |
| if (tags[firsttag].pos > pos) |
| /* That's it already */ |
| return firsttag; |
| |
| middletag = (firsttag + 1 + endtag) / 2; |
| if (middletag == endtag) |
| /* None */ |
| return endtag; |
| |
| if (tags[middletag].pos > pos) |
| return find_nexttag(tags, pos, firsttag, middletag); |
| else |
| return find_nexttag(tags, pos, middletag, endtag); |
| } |
| |
| static int replace_groups(const GMatchInfo *match_info, GString *result, char *replacement, gint pos) |
| { |
| int in_escape = 0; |
| char c; |
| |
| while ((c = *replacement++)) { |
| if (!in_escape) { |
| if (c == '\\') |
| in_escape = 1; |
| else |
| g_string_append_c(result, c); |
| } else { |
| if (c == '\\') |
| g_string_append_c(result, '\\'); |
| else if (c >= '0' && c <= '9') { |
| gchar *res = g_match_info_fetch(match_info, pos + (c - '0')); |
| if (res) |
| g_string_append(result, res); |
| else |
| MSG2(1, "symbols", "Unmatched reference \\%c", c); |
| } else { |
| MSG2(1, "symbols", "Invalid reference \\%c", c); |
| g_string_append_c(result, c); |
| } |
| in_escape = 0; |
| } |
| } |
| if (in_escape) |
| MSG2(1, "symbols", "Unterminated backslash"); |
| |
| return 1; |
| } |
| |
| /* Regular expression callback for applying replacements */ |
| static gboolean regex_eval(const GMatchInfo *match_info, GString *result, gpointer user_data) |
| { |
| SpeechSymbolProcessor *ssp = user_data; |
| gchar *capture; |
| enum group captured_group; |
| gchar *group_0; |
| gint start = -1, end = -1; |
| gint prevlen = result->len, shift; |
| gint nexttag, curtag, deferrable; |
| guint i = 0; |
| SpeechSymbol *sym = NULL; |
| gint pos = 0; |
| |
| /* First see what we captured */ |
| |
| /* FIXME: Python regex API allows to find the name of the group that |
| * matched. As GRegex doesn't have that, what we do here is try |
| * and fetch the groups we know, and see if they matched. |
| * This is not very optimal, but how can we avoid that? */ |
| |
| if ((capture = fetch_named_matching(match_info, "rstripSpace"))) { |
| captured_group = RSTRIPSPACE; |
| } else if ((capture = fetch_named_matching(match_info, "repeated"))) { |
| captured_group = REPEATED; |
| } else if ((capture = fetch_named_matching(match_info, "simple"))) { |
| captured_group = SIMPLE; |
| } else { |
| /* Complex symbol. */ |
| GSList *node; |
| |
| for (node = ssp->complex_list; !sym && node; node = node->next, i++) { |
| gchar *group_name = g_strdup_printf("c%u", i); |
| |
| if ((capture = fetch_named_matching(match_info, group_name))) { |
| gchar **all = g_match_info_fetch_all(match_info); |
| gint i; |
| |
| pos = -1; |
| /* Find out the index of the match */ |
| for (i = 1; all[i]; i++) { |
| if (all[i][0]) { |
| pos = i; |
| break; |
| } |
| } |
| g_strfreev(all); |
| |
| if (pos != -1) |
| sym = node->data; |
| } |
| g_free(group_name); |
| |
| if (sym) |
| break; |
| } |
| |
| captured_group = COMPLEX; |
| } |
| |
| /* Now check where that lies among tags */ |
| |
| g_match_info_fetch_pos(match_info, 0, &start, &end); |
| |
| nexttag = find_nexttag(ssp->tags, start, 0, ssp->ntags); |
| |
| /* Check whether the contained tags are deferrable */ |
| deferrable = 1; |
| for (curtag = nexttag; curtag < ssp->ntags; curtag++) { |
| if (ssp->tags[curtag].pos >= end) |
| /* Don't care about the rest */ |
| break; |
| /* This block of tags is within the group */ |
| if (!ssp->tags[curtag].deferrable) { |
| /* Oops, these tags can't be deferred */ |
| deferrable = 0; |
| break; |
| } |
| } |
| |
| if (!deferrable) { |
| group_0 = g_match_info_fetch(match_info, 0); |
| MSG2(1, "symbols", "tags '%s' within group |%s| (at %d..%d), not replacing group :/", |
| ssp->tags[curtag].tags, group_0, start, end); |
| g_free(group_0); |
| |
| g_string_append(result, capture); |
| g_free(capture); |
| |
| return FALSE; |
| } |
| |
| /* Defer these tags */ |
| for (curtag = nexttag; curtag < ssp->ntags; curtag++) { |
| if (ssp->tags[curtag].pos >= end) |
| /* Don't care about the rest */ |
| break; |
| /* This block of tags is within the group, defer it after the group */ |
| MSG2(5, "symbols", "deferring tags '%s' to %d", ssp->tags[curtag].tags, end); |
| ssp->tags[curtag].pos = end; |
| } |
| |
| /* Ok, now replace */ |
| if (captured_group == RSTRIPSPACE) { |
| MSG2(5, "symbols", "replacing <rstripSpace>"); |
| /* nothing to do, just don't add it in the result */ |
| } else if (captured_group == REPEATED) { |
| /* Repeated character. */ |
| char ch[2] = { capture[0], 0 }; |
| SpeechSymbol *sym = g_hash_table_lookup(ssp->symbols, ch); |
| |
| MSG2(5, "symbols", "replacing <repeated>"); |
| |
| /* this should never happen, but be on the safe side and check it */ |
| if (!sym) |
| goto symbol_error; |
| |
| if (ssp->level >= sym->level) { |
| g_string_append_printf(result, " %lu %s ", (unsigned long) strlen(capture), sym->replacement); |
| } else { |
| g_string_append_c(result, ' '); |
| } |
| } else { |
| const gchar *prefix, *suffix; |
| |
| /* One of the defined symbols. **/ |
| if (captured_group == SIMPLE) { |
| /* Simple symbol. */ |
| sym = g_hash_table_lookup(ssp->symbols, capture); |
| MSG2(5, "symbols", "replacing <simple>"); |
| } else { |
| g_assert(captured_group == COMPLEX); |
| /* Complex symbol, sym and i already set */ |
| MSG2(5, "symbols", "replacing <c%u> (complex symbol)", i); |
| } |
| |
| /* this should never happen, but be on the safe side and check it */ |
| if (!sym) |
| goto symbol_error; |
| |
| MSG2(5, "symbols", "replacing sym |%s| (lvl=%d, preserve=%d)", |
| sym->identifier, sym->level, sym->preserve); |
| |
| if (sym->preserve == SYMPRES_LITERAL) |
| prefix = ""; |
| else |
| prefix = " "; |
| |
| if (sym->preserve == SYMPRES_ALWAYS || |
| (sym->preserve == SYMPRES_NOREP && ssp->level < sym->level)) |
| suffix = capture; |
| else if (sym->preserve == SYMPRES_LITERAL) |
| suffix = ""; |
| else |
| suffix = " "; |
| |
| if (sym->level > ssp->support_level) { |
| /* Leave it to the module */ |
| g_string_append(result, capture); |
| } else if (ssp->level >= sym->level && sym->replacement) { |
| g_string_append(result, prefix); |
| MSG2(5, "symbols", "replacing with %s", sym->replacement); |
| replace_groups(match_info, result, sym->replacement, pos); |
| g_string_append(result, suffix); |
| } else { |
| g_string_append(result, suffix); |
| } |
| } |
| |
| goto out; |
| |
| symbol_error: |
| group_0 = g_match_info_fetch(match_info, 0); |
| MSG2(1, "symbols", "WARNING: no symbol for match |%s| (at %d..%d), this shouldn't happen.", |
| group_0, start, end); |
| g_free(group_0); |
| |
| out: |
| /* content has grown (or shrunk) by this amount */ |
| shift = (result->len - prevlen) - strlen(capture); |
| |
| if (nexttag < ssp->ntags) |
| /* Update positions of tags beyond this */ |
| ssp->tags[nexttag].shift += shift; |
| |
| g_free(capture); |
| |
| return FALSE; |
| } |
| |
| /* Processes some input and converts symbols in it */ |
| static gchar *speech_symbols_processor_process_text(GSList *sspl, const gchar *input, SymLvl level, SymLvl support_level, SPDDataMode ssml_mode) |
| { |
| gchar *text; |
| gchar *processed; |
| struct tags *tags = NULL; |
| gint ntags = 0, i; |
| GError *error = NULL; |
| |
| if (ssml_mode == SPD_DATA_SSML) { |
| text = escape_ssml_text(input, &tags, &ntags); |
| MSG2(5, "symbols", "escaped ssml '%s' to '%s'", input, text); |
| } else { |
| text = g_strdup(input); |
| } |
| |
| for ( ; sspl; sspl = sspl->next) { |
| SpeechSymbolProcessor *ssp = sspl->data; |
| |
| if (ssml_mode == SPD_DATA_SSML) { |
| for (i = 0; i < ntags; i++) |
| tags[i].shift = 0; |
| ssp->tags = tags; |
| ssp->ntags = ntags; |
| } else |
| ssp->ntags = 0; |
| |
| ssp->level = level; |
| ssp->support_level = support_level; |
| MSG2(5, "symbols", "translating complex symbols and characters"); |
| processed = g_regex_replace_eval(ssp->regex, text, -1, 0, 0, regex_eval, ssp, &error); |
| if (!processed) { |
| MSG2(1, "symbols", "ERROR applying regex: %s", error->message); |
| g_error_free(error); |
| } else { |
| MSG2(5, "symbols", "'%s' translated '%s' to '%s'", ssp->source, text, processed); |
| g_free(text); |
| text = processed; |
| |
| gint i; |
| for (i = 0; i < ssp->nmulti_chars_regex; i++) { |
| MSG2(5, "symbols", "translating multi-characters step %d", i); |
| processed = g_regex_replace_eval(ssp->multi_chars_regex[i], text, -1, 0, 0, regex_eval, ssp, &error); |
| if (!processed) { |
| MSG2(1, "symbols", "ERROR applying regex: %s", error->message); |
| g_error_free(error); |
| } else { |
| MSG2(5, "symbols", "'%s' translated '%s' to '%s'", ssp->source, text, processed); |
| g_free(text); |
| text = processed; |
| } |
| } |
| |
| if (ssml_mode == SPD_DATA_SSML) { |
| /* This accumulates the shifts of all previous replacements */ |
| gssize shift = 0; |
| |
| /* Apply new tags positions */ |
| for (i = 0; i < ntags; i++) { |
| shift += tags[i].shift; |
| tags[i].pos += shift; |
| } |
| } |
| |
| if (level == SYMLVL_CHAR && g_utf8_strlen(processed, -1) > 1) |
| /* This translated it, avoid letting other rules continue expanding! */ |
| break; |
| } |
| } |
| |
| if (ssml_mode == SPD_DATA_SSML) { |
| processed = unescape_ssml_text(text, tags, ntags); |
| MSG2(5, "symbols", "unescaped ssml '%s' to '%s'", text, processed); |
| g_free(text); |
| } else |
| processed = text; |
| |
| return processed; |
| } |
| |
| /* Gets a possibly cached processor for the given locale */ |
| static GSList *get_locale_speech_symbols_processor(const gchar *locale) |
| { |
| if (!G_processors) { |
| G_processors = locale_map_new((GDestroyNotify) speech_symbols_processor_list_free); |
| } |
| |
| return locale_map_fetch(G_processors, locale, NULL, speech_symbols_processor_list_new); |
| } |
| |
| /*----------------------------------- API -----------------------------------*/ |
| |
| /* Process some text, converting symbols according to desired pronunciation. */ |
| static gchar *process_speech_symbols(const gchar *locale, const gchar *text, SymLvl level, SymLvl support_level, SPDDataMode ssml_mode) |
| { |
| GSList *sspl; |
| |
| sspl = get_locale_speech_symbols_processor(locale); |
| /* fallback to English if there's no processor for the locale */ |
| if (!sspl && g_str_has_prefix(locale, "en") && strchr("_-", locale[2])) |
| sspl = get_locale_speech_symbols_processor("en"); |
| if (!sspl) |
| return NULL; |
| |
| return speech_symbols_processor_process_text(sspl, text, level, support_level, ssml_mode); |
| } |
| |
| void insert_symbols(TSpeechDMessage *msg, int punct_missing) |
| { |
| gchar *processed; |
| SymLvl level = SYMLVL_NONE; |
| SymLvl support_level = msg->settings.symbols_preprocessing; |
| char *locale = strdup(msg->settings.msg_settings.voice.language), *dash; |
| |
| if (punct_missing && support_level < SYMLVL_ALL) |
| /* The user preferred to let some modules handle some punctuation, |
| * but this module doesn't support it, so force handling it ourself. */ |
| support_level = SYMLVL_ALL; |
| |
| switch (msg->settings.msg_settings.punctuation_mode) { |
| case SPD_PUNCT_ALL: level = SYMLVL_ALL; break; |
| case SPD_PUNCT_MOST: level = SYMLVL_MOST; break; |
| case SPD_PUNCT_SOME: level = SYMLVL_SOME; break; |
| case SPD_PUNCT_NONE: level = SYMLVL_NONE; break; |
| } |
| |
| if (msg->settings.type == SPD_MSGTYPE_CHAR) |
| level = SYMLVL_CHAR; |
| |
| dash = strchr(locale, '-'); |
| if (dash) |
| { |
| char *c; |
| *dash = '_'; |
| for (c = dash + 1; *c; c++) |
| *c = toupper(*c); |
| } |
| |
| MSG2(5, "symbols", "processing at level %d, supporting level %d", level, support_level); |
| processed = process_speech_symbols(locale, |
| msg->buf, level, support_level, msg->settings.ssml_mode); |
| if (processed) { |
| MSG2(5, "symbols", "before: |%s|", msg->buf); |
| g_free(msg->buf); |
| msg->buf = processed; |
| MSG2(5, "symbols", "after: |%s|", msg->buf); |
| if (support_level >= level) |
| /* if we performed the replacement, don't let the module speak it again */ |
| msg->settings.msg_settings.punctuation_mode = SPD_PUNCT_NONE; |
| |
| /* if we provide a character description file, don't let the module spell it */ |
| if (msg->settings.type == SPD_MSGTYPE_CHAR) |
| if (g_utf8_strlen(processed, -1) > 1) |
| msg->settings.type = SPD_MSGTYPE_TEXT; |
| } |
| |
| free(locale); |
| } |