blob: ac1f4a2175b6d065a4e65bf1e9eae5d00bffae29 [file] [log] [blame]
#!/usr/bin/python3
import os
import re
import sys
import math
import codecs
import xml.etree.ElementTree as etree
class Emoji:
    """One dictionary line split into its four captured fields.

    The constructor takes a regex match object and stores capture groups
    1 to 4 as ``emoji``, ``pronunciation``, ``codepoints`` and ``comment``.
    ``str()`` joins the fields back together in the
    ``<emoji><pronunciation>// [<codepoints>]<comment>`` layout.
    """

    def __init__(self, m):
        # Groups 1..4 of the match, in order.
        captured = m.group(1, 2, 3, 4)
        self.emoji, self.pronunciation, self.codepoints, self.comment = captured

    def __repr__(self):
        # Debug form: every field is shown through repr() so tabs stay visible.
        template = "Emoji(emoji={0}, pronunciation={1}, codepoints={2}, comment={3})"
        values = (self.emoji, self.pronunciation, self.codepoints, self.comment)
        return template.format(*[repr(value) for value in values])

    def __str__(self):
        # Dictionary-line form; the "// [" and "]" delimiters are re-inserted
        # around the codepoints field.
        values = (self.emoji, self.pronunciation, self.codepoints, self.comment)
        return "{0}{1}// [{2}]{3}".format(*values)
def read_annotations(filename):
    """Yield ``(cp, text)`` for every ``type="tts"`` annotation in an XML file.

    The file is parsed with ElementTree.  Only ``<annotation>`` elements that
    are direct children of an ``<annotations>`` element directly under the
    root are considered.  ``cp`` is the element's ``cp`` attribute (a missing
    attribute raises ``KeyError``) and ``text`` is the element text, which
    may be ``None`` for an empty element.
    """
    root = etree.parse(filename).getroot()
    for group in root.findall("annotations"):
        for node in group.findall("annotation"):
            attributes = node.attrib
            # Skip annotations whose type is absent or anything other than "tts".
            if attributes.get("type", "") != "tts":
                continue
            yield attributes["cp"], node.text
def read_emoji(filename, encoding="utf-8"):
    """Yield the entries of an emoji dictionary file, one per input line.

    Lines are yielded with their line terminator removed.  A line is yielded
    unchanged as a ``str`` when it is blank, starts with ``//`` (line
    comment), starts with ``$`` (flags only), or does not match the entry
    pattern.  Every other line is yielded as an ``Emoji`` built from the
    regex match.

    Args:
        filename: path of the dictionary file to read.
        encoding: text encoding of the file (default ``"utf-8"``).
    """
    re_emoji = re.compile(r"^([^ \t]*)([^/]*)// \[([^\]]*)\](.*)$")
    # Built-in open() is used instead of codecs.open(): in text mode it applies
    # universal-newline translation, so files with CRLF line endings no longer
    # leave a stray "\r" at the end of each yielded line (which would break
    # exact comparisons done by callers on comment lines).
    with open(filename, "r", encoding=encoding) as f:
        for line in f:
            line = line.rstrip("\n")
            # Blank lines, line comments and flags-only lines pass through as-is.
            if not line.strip() or line.startswith(("//", "$")):
                yield line
                continue
            m = re_emoji.match(line)
            if m:
                yield Emoji(m)
            else:
                # Not an emoji entry: pass the raw line through.
                yield line
def find_langname(lang):
    """Return the display name declared in the language file called ``lang``.

    The ``espeak-ng-data`` directory under the current working directory is
    walked recursively.  Each file whose name equals ``lang`` is read as
    UTF-8, and the text following ``"name "`` on the first line that starts
    with that prefix is returned.

    Args:
        lang: file name to look for (the language identifier).

    Returns:
        The name as a ``str``, or ``None`` when no matching file with a
        ``name`` line is found.
    """
    prefix = "name "
    espeak_data_path = os.path.join(os.getcwd(), "espeak-ng-data")
    for root, _dirnames, filenames in os.walk(espeak_data_path):
        if lang not in filenames:
            continue
        with open(os.path.join(root, lang), "r", encoding="utf-8") as f:
            for line in f:
                line = line.rstrip("\n")
                if line.startswith(prefix):
                    # Strip only the leading keyword; a str.replace() here
                    # would also delete "name " occurring inside the name.
                    return line[len(prefix):]
    return None
def normalize(text):
    """Return ``text`` with quotation marks and separators cleaned up.

    Applied in order:
      * U+201E (double low-9 quotation mark), U+201C (left double quotation
        mark) and the ASCII double quote are removed;
      * U+2018 (left single quotation mark) becomes an ASCII apostrophe;
      * U+00B7 (middle dot) becomes a space;
      * every ``"| "`` sequence is removed.

    The non-ASCII characters are written as ``\\u`` escapes so the literals
    cannot be corrupted when the file is re-encoded.
    """
    replacements = (
        ("\u201E", ""),
        ("\u201C", ""),
        ("\"", ""),
        ("\u2018", "'"),
        ("\u00B7", " "),
        ("| ", ""),  # alternatives, e.g. af "vonkstok | skitterstokkie"
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text
# Command line: <emoji-dict> <lang> <cldr-path>
# Fail with a usage message instead of a bare IndexError when arguments are
# missing.
if len(sys.argv) < 4:
    sys.exit("usage: {0} <emoji-dict> <lang> <cldr-path>".format(sys.argv[0]))

emoji_dict = sys.argv[1]
lang = sys.argv[2]
cldr_path = sys.argv[3]

# Annotation sources, in load order; files that do not exist are skipped.
filenames = [
    os.path.join(cldr_path, "common", "annotations", "{0}.xml".format(lang)),
    os.path.join(cldr_path, "common", "annotationsDerived", "{0}.xml".format(lang)),
    os.path.join("data", "annotationsEspeak", "{0}.xml".format(lang))
]

# cp -> tts text.  A later file overrides an earlier one for the same cp.
annotations = {}
for filename in filenames:
    if os.path.exists(filename):
        annotations.update(read_annotations(filename))

for entry in read_emoji(emoji_dict):
    if isinstance(entry, Emoji):
        # Look the emoji up with U+FE0F (variation selector-16) removed.
        translation = annotations.get(entry.emoji.replace("\uFE0F", ""))
        if translation:
            translation = normalize(translation)
            # Work out how wide the original pronunciation field is so the
            # trailing "//" comment can be kept at the same position.
            # NOTE(review): the arithmetic assumes 8-column tab stops and
            # that the field starts with exactly one leading tab -- confirm
            # against the dictionary layout.
            length = len(entry.pronunciation.strip())
            tabs = entry.pronunciation.count('\t') - 1
            first_tab = 8 - (length % 8)
            tab_length = length + first_tab + ((tabs - 1) * 8)
            new_length = len(translation)
            # Always emit at least one tab: a translation that reaches or
            # exceeds the original width would otherwise be glued directly
            # onto the "//" of the comment.
            new_tabs = max(1, math.ceil((tab_length - new_length) / 8))
            entry.pronunciation = "\t{0}{1}".format(translation, "\t" * int(new_tabs))
        else:
            # No translation available: keep the entry but comment it out.
            entry.comment += " (no translation)"
            entry = "//{0}".format(entry)
    elif entry == "// Emoji and Other Symbol pronunciations for English":
        # Fall back to the language code when no language name was found,
        # rather than printing the literal text "None".
        langname = find_langname(lang) or lang
        entry = "// Emoji and Other Symbol pronunciations for {0}".format(langname)
    elif entry == "// 2. common/annotations/en.xml (CLDR)":
        entry = "// 2. common/annotations/{0}.xml (CLDR)".format(lang)
    print(entry)