blob: fb07f66a45fe27aab4031e8af47508578d58504d [file] [log] [blame]
# Copyright 2021 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Pseudolocale translations for chrome."""
import re
import string
from collections import namedtuple
from grit import lazy_re
from grit import tclib
# Maps plain ASCII characters to an accented or look-alike replacement.
# Not every character has an entry (for example there is no 'B' or 'r'), so
# look-ups are expected to fall back to the original character.
ACCENTED_STRINGS = {
    # Punctuation and symbols.
    '!': '\u00a1', '$': '\u20ac', '?': '\u00bf',
    # Upper-case letters.
    'A': '\u00c5', 'C': '\u00c7', 'D': '\u00d0', 'E': '\u00c9',
    'G': '\u011c', 'H': '\u0124', 'I': '\u00ce', 'J': '\u0134',
    'K': '\u0136', 'L': '\u013b', 'N': '\u00d1', 'O': '\u00d6',
    'P': '\u00de', 'R': '\u00ae', 'S': '\u0160', 'T': '\u0162',
    'U': '\u00db', 'W': '\u0174', 'Y': '\u00dd', 'Z': '\u017d',
    # Lower-case letters.
    'a': '\u00e5', 'c': '\u00e7', 'd': '\u00f0', 'e': '\u00e9',
    'f': '\u0192', 'g': '\u011d', 'h': '\u0125', 'i': '\u00ee',
    'j': '\u0135', 'k': '\u0137', 'l': '\u013c', 'n': '\u00f1',
    'o': '\u00f6', 'p': '\u00fe', 's': '\u0161', 't': '\u0163',
    'u': '\u00fb', 'w': '\u0175', 'y': '\u00fd', 'z': '\u017e',
}

# Filler words appended to pseudo-long translations, used cyclically. The
# first entry carries a leading '- '.
NUMBERS = [
    '- one',
    'two',
    'three',
    'four',
    'five',
    'six',
    'seven',
    'eight',
    'nine',
    'ten',
]
# Text substituted for each tclib.Placeholder while a message is parsed. It
# has the shape of a variable, so the parser keeps it intact.
PLACEHOLDER_STRING = '{PLACEHOLDER_VARIABLE}'

# Bidirectional control characters used to force letters to render RTL:
# U+202E (right-to-left override) and U+202C (pop directional formatting).
RLO = '\u202e'
PDF = '\u202c'

# A run of word characters that are neither ASCII digits nor underscores.
ALPHABETIC_RUN = lazy_re.compile(r'([^\W0-9_]+)')
# A run of non-whitespace that begins and ends on a word boundary.
WORD = lazy_re.compile(r'\b\S+\b')
class Node:
    """Base class for the nodes of a parsed message's syntax tree.

    Subclasses that rely on `_MatchPattern` must define a class attribute
    `pattern` (a compiled regex); this base class does not provide one.
    """

    # Class-level flag; presumably marks nodes whose text may be translated.
    # It is not consulted by the code in this class.
    translatable = False
    # Text emitted after the node's children by ToString().
    after = ''

    def __init__(self, text, children=None):
        self.text = text
        # A fresh list per instance so that nodes never share children.
        self.children = children if children is not None else []

    def GetNumWords(self):
        """Returns an approximate worst-case (maximum) word count for the
        subtree rooted at this node."""
        total = 0
        for child in self.children:
            total += child.GetNumWords()
        return total

    def Transform(self, fn):
        """Applies fn in place to the translatable text within the subtree.

        The base implementation only forwards the call to every child.

        Args:
          fn: Callable[[str], str]
        """
        for child in self.children:
            child.Transform(fn)

    def ToString(self):
        """Returns the subtree flattened back into message text, suitable for
        creating a translation from."""
        body = ''.join(child.ToString() for child in self.children)
        return f'{self.text}{body}{self.after}'

    def __repr__(self):
        # Debugging aid: leaves print on one line, parents print their
        # children indented underneath.
        name = self.__class__.__name__
        if not self.children:
            return f'{name} {self.text!r}'
        indented = []
        for child in self.children:
            for line in repr(child).split('\n'):
                indented.append(' ' + line)
        body = '\n'.join(indented)
        header = f'{name}[before={self.text!r}, after={self.after!r}'
        return f'{header}\n{body}\n]'

    @classmethod
    def _MatchPattern(cls, text):
        """Tries cls.pattern at the start of text.

        Returns:
          (node, leftover) on a match, where node wraps the matched text;
          (None, text) otherwise.
        """
        found = cls.pattern.match(text)
        if found is None:
            return None, text
        matched = found.group(0)
        return cls(matched), text[len(matched):]

    @classmethod
    def Parse(cls, text):
        """Matches the node against the text, consuming the part that matches.

        Args:
          text: str

        Returns:
          (Optional[Node], str). (node, leftover) if the text starts with
          something matching the node, otherwise (None, text).
        """
        return cls._MatchPattern(text)
class HtmlTag(Node):
    """A single HTML tag, opening or closing (eg. <a href='...'> or </span>).

    The HTML structure is irrelevant here, so tags do not form a tree: a tag
    has no children and open/close tags are not linked. Tags are lexed only
    so that their contents are left untouched.

    The pattern covers ordinary html/xhtml tags. It does not handle comments,
    CDATA sections, or the unescaping tags (script, style, xmp, listing); if
    any of those appear in messages, something is wrong.

    NOTE: tag and attribute names are matched with \\w only, so names
    containing a hyphen are not recognised as tags by this pattern.
    """
    pattern = lazy_re.compile(
        r'^</?[a-z]\w*'  # '<' or '</' followed by the tag name
        r'(?:\s+\w+(?:\s*=\s*'  # attribute name, optionally followed by '='
        r'(?:[^\s"\'>]+|"[^\"]*"|\'[^\']*\'))?'  # unquoted or quoted value
        r')*\s*/?>',  # end of the attribute list, optional '/', then '>'
        re.DOTALL | re.IGNORECASE)
class RawText(Node):
    """Regular message text; its text is what Transform() rewrites."""

    # The first character may be anything except a brace, which lets a stray
    # '<' or '$' through only at the very start. Because this type is tried
    # after HTML tags and variables have failed to match, such a leading
    # character is known not to start either of them.
    pattern = lazy_re.compile(r'^[^{}][^{}<$]*', re.DOTALL)

    def GetNumWords(self):
        """Returns the number of WORD matches in the text."""
        words = WORD.findall(self.text)
        return len(words)

    def Transform(self, fn):
        """Replaces the node's text with fn(text)."""
        self.text = fn(self.text)
class BasicVariable(Node):
    """A variable reference such as {NAME} or ${NAME}.

    Usually found inside a plural option, but also used to carry the
    stand-in text for message placeholders.
    """
    pattern = lazy_re.compile(r'^\$?{[a-zA-Z0-9_]+}')

    def GetNumWords(self):
        """Counts the variable as exactly one word."""
        return 1
class PluralOption(Node):
    """Represents a single option of a plural selection.

    eg. =1 {singular option here}

    Only the selectors `=<digits>` and `other` are recognised by the pattern.
    """
    pattern = lazy_re.compile(r'^(=[0-9]+|other)\s*{')
    # Closing brace of the option; ToString() emits it after the children.
    after = '}\n'

    @classmethod
    def Parse(cls, text):
        """Parses one plural option from the start of text.

        Unlike Node.Parse, a failure to match is an error here: the caller
        has already established that an option must follow.

        Args:
          text: str

        Returns:
          (PluralOption, str): the option node and the text left after its
          closing brace.

        Raises:
          AssertionError: if text does not start with an option selector, or
            the option body is not followed by '}'. The offending text is
            attached to the error.
        """
        node, text = cls._MatchPattern(text)
        assert node is not None, text
        body, text = NodeSequence.Parse(text)
        assert body is not None, text
        # NodeSequence.Parse hands back a bare node when the body holds a
        # single node, so normalise both shapes to a list of children.
        if isinstance(body, NodeSequence):
            node.children = body.children
        else:
            node.children = [body]
        # Report the remaining text on failure, as the other parser
        # assertions do.
        assert text.startswith('}'), text
        return node, text[1:]
class Plural(Node):
    """Represents a set of options for plurals.

    eg. {VARIABLE, plural, =1 {singular} other {plural}}

    The node's text is the header up to (and including) an optional
    `offset:N`; its children are the PluralOption nodes.
    """
    pattern = lazy_re.compile(r'^{[A-Za-z0-9_]+,\s*plural,\s*(offset:\d+\s*)?',
                              re.S)
    # Closing brace of the whole plural expression.
    after = '}'

    @classmethod
    def Parse(cls, text):
        """Parses a plural expression from the start of text.

        Args:
          text: str

        Returns:
          (Plural, leftover) if text starts with a plural header, otherwise
          (None, text).

        Raises:
          AssertionError: if the header is followed by something that is not
            a well-formed plural option.
        """
        node, text = cls._MatchPattern(text)
        if node is None:
            return None, text
        while not text.startswith('}'):
            child, text = PluralOption.Parse(text)
            assert child is not None, text
            node.children.append(child)
            # Options may be separated by arbitrary whitespace.
            text = text.lstrip()
        assert text.startswith('}'), text
        return node, text[1:]

    def GetNumWords(self):
        """Returns the word count of the longest option.

        Only one option is ever displayed, so the worst case is the maximum
        over the options rather than their sum. A plural without options
        (eg. `{N, plural, }`, which Parse accepts) counts as zero words
        instead of raising ValueError from max() on an empty sequence.
        """
        return max((child.GetNumWords() for child in self.children),
                   default=0)
class NodeSequence(Node):
    """Represents a series of nodes.

    eg. hello {VAR} ->
        NodeSequence([RawText('hello '), BasicVariable('{VAR}')])
    """
    # Node types tried, in this order, at each position of the text.
    child_types = [HtmlTag, BasicVariable, Plural, RawText]

    def __init__(self, children):
        # A sequence has no text of its own, only children.
        super().__init__('', children)

    @classmethod
    def Parse(cls, text):
        """Parses as many consecutive nodes as possible from the start of text.

        Args:
          text: str

        Returns:
          (Node, leftover). The node is the single parsed node when exactly
          one was found, otherwise a NodeSequence wrapping all of them.

        Raises:
          AssertionError: if no node at all could be parsed.
        """
        nodes = []
        previous = None
        # Keep going for as long as the last pass consumed some text.
        while previous != text:
            previous = text
            for node_type in cls.child_types:
                node, text = node_type.Parse(text)
                if node is None:
                    continue
                nodes.append(node)
                break
        assert nodes, text
        result = nodes[0] if len(nodes) == 1 else cls(nodes)
        return result, text
def BuildTree(text):
    """Parses text into a syntax tree and returns the root node.

    Raises:
      AssertionError: if some trailing part of text could not be parsed; the
        unparsed remainder is attached to the error.
    """
    tree, remainder = NodeSequence.Parse(text)
    assert not remainder, remainder
    return tree
def BuildTreeFromMessage(message):
    """Builds a syntax tree from a message.

    Every tclib.Placeholder in the message content is replaced by
    PLACEHOLDER_STRING before parsing.

    Args:
      message: object whose GetContent() yields text parts and
        tclib.Placeholder instances.

    Returns:
      (tree, placeholders): the parsed tree and the substituted placeholders
      in the order they occurred.
    """
    pieces = []
    placeholders = []
    for part in message.GetContent():
        if not isinstance(part, tclib.Placeholder):
            pieces.append(part)
            continue
        placeholders.append(part)
        pieces.append(PLACEHOLDER_STRING)
    return BuildTree(''.join(pieces)), placeholders
def ToTranslation(tree, placeholders):
    """Converts the tree back into a translation.

    Each occurrence of PLACEHOLDER_STRING in the flattened tree is swapped
    back, in order, for the corresponding entry of placeholders.

    Args:
      tree: root node of the (possibly transformed) syntax tree.
      placeholders: the placeholders that were substituted out, in order.

    Returns:
      tclib.Translation()
    """
    remaining = tree.ToString()
    assert remaining.count(PLACEHOLDER_STRING) == len(placeholders)
    translation = tclib.Translation()
    for placeholder in placeholders:
        # Split off everything up to the next stand-in token.
        leading, _, remaining = remaining.partition(PLACEHOLDER_STRING)
        if leading:
            translation.AppendText(leading)
        translation.AppendPlaceholder(placeholder)
    if remaining:
        translation.AppendText(remaining)
    return translation
def PseudoLongStringMessage(message):
    """Returns a pseudo-long string (en-XA) translation of the message.

    Characters listed in ACCENTED_STRINGS are replaced by their accented
    form, and one filler word from NUMBERS (cycling) is appended per word
    counted in the original message.

    Args:
      message: tclib.Message()

    Returns:
      tclib.Translation()
    """
    tree, placeholders = BuildTreeFromMessage(message)
    # Count the words first: the count may differ once the text has been
    # transformed.
    word_count = tree.GetNumWords()
    accent_table = str.maketrans(ACCENTED_STRINGS)
    tree.Transform(lambda text: text.translate(accent_table))
    translation = ToTranslation(tree, placeholders)
    padding = [NUMBERS[i % len(NUMBERS)] for i in range(word_count)]
    translation.AppendText(' ' + ' '.join(padding))
    return translation
def PseudoRTLMessage(message):
    """Returns a pseudo-RTL (ar-XB) translation of the message.

    Every run matched by ALPHABETIC_RUN in the translatable text is wrapped
    between the RLO and PDF control characters.

    Args:
      message: tclib.Message()

    Returns:
      tclib.Translation()
    """
    tree, placeholders = BuildTreeFromMessage(message)

    def _WrapRun(match):
        return RLO + match.group() + PDF

    def _WrapLetters(text):
        return ALPHABETIC_RUN.sub(_WrapRun, text)

    tree.Transform(_WrapLetters)
    return ToTranslation(tree, placeholders)