| """JsLex: a lexer for Javascript""" |
| # Originally from https://bitbucket.org/ned/jslex |
| import re |
| |
| class Tok(object): |
| """ |
| A specification for a token class. |
| """ |
| num = 0 |
| |
| def __init__(self, name, regex, next=None): |
| self.id = Tok.num |
| Tok.num += 1 |
| self.name = name |
| self.regex = regex |
| self.next = next |
| |
| def literals(choices, prefix="", suffix=""): |
| """ |
| Create a regex from a space-separated list of literal `choices`. |
| |
| If provided, `prefix` and `suffix` will be attached to each choice |
| individually. |
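
    For example:

    >>> literals("if else", suffix=r"\b")
    'if\\b|else\\b'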
    """
    return "|".join(prefix + re.escape(c) + suffix for c in choices.split())


class Lexer(object):
    r"""
    A generic multi-state regex-based lexer.
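
    `states` maps each state name to a list of `Tok`s, and `first` names
    the starting state. A minimal sketch with two throwaway token specs
    (these `Tok`s are illustrative only, not part of JsLexer below):

    >>> toks = [Tok("ws", r"\s+"), Tok("word", r"\w+")]
    >>> list(Lexer({'start': toks}, 'start').lex("one two"))
    [('word', 'one'), ('ws', ' '), ('word', 'two')]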
| """ |
| |
| def __init__(self, states, first): |
| self.regexes = {} |
| self.toks = {} |
| |
| for state, rules in states.items(): |
| parts = [] |
| for tok in rules: |
| groupid = "t%d" % tok.id |
| self.toks[groupid] = tok |
| parts.append("(?P<%s>%s)" % (groupid, tok.regex)) |
| self.regexes[state] = re.compile("|".join(parts), re.MULTILINE|re.VERBOSE) |
| |
| self.state = first |

    def lex(self, text):
        """
        Lexically analyze `text`.

        Yields pairs (`name`, `tokentext`).
        """
        end = len(text)
        state = self.state
        regexes = self.regexes
        toks = self.toks
        start = 0

        while start < end:
            for match in regexes[state].finditer(text, start):
                name = match.lastgroup
                tok = toks[name]
                toktext = match.group(name)
                start += len(toktext)
                yield (tok.name, toktext)

                if tok.next:
                    # This token switches lexer state: break so the outer
                    # loop restarts finditer with the new state's regex,
                    # picking up at the updated `start` position.
                    state = tok.next
                    break

        self.state = state


class JsLexer(Lexer):
    """
    A Javascript lexer

    >>> lexer = JsLexer()
    >>> list(lexer.lex("a = 1"))
    [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]
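
    A quick check of the two slash-handling states: after a value a slash
    is punctuation, elsewhere it opens a regex literal.

    >>> [tok for name, tok in JsLexer().lex("a / /b/")]
    ['a', ' ', '/', ' ', '/b/']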

    This doesn't properly handle non-ASCII characters in the Javascript
    source.
    """

    # Because these tokens are matched as alternatives in a regex, longer
    # possibilities must appear in the list before shorter ones, for example,
    # '>>' before '>'.
    #
    # Note that we don't have to detect malformed Javascript, only properly
    # lex correct Javascript, so much of this is simplified.

    # Details of Javascript lexical structure are taken from
    # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf

    # A useful explanation of automatic semicolon insertion is at
    # http://inimino.org/~inimino/blog/javascript_semicolons

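    # To make the slash ambiguity concrete: in `a = b / c` the slash follows
    # an identifier, so the lexer is in the 'div' state and reads division,
    # while in `a = b + /c/` the slash follows '+', which switches to the
    # 'reg' state, so it opens a regex literal.
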
    both_before = [
        Tok("comment", r"/\*(.|\n)*?\*/"),
        Tok("linecomment", r"//.*?$"),
        Tok("ws", r"\s+"),
        Tok("keyword", literals("""
                                break case catch class const continue debugger
                                default delete do else enum export extends
                                finally for function if import in instanceof
                                new return super switch this throw try typeof
                                var void while with
                                """, suffix=r"\b"), next='reg'),
        Tok("reserved", literals("null true false", suffix=r"\b"), next='div'),
        Tok("id", r"""
                  ([a-zA-Z_$]|\\u[0-9a-fA-F]{4})      # first char
                  ([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})*  # rest chars
                  """, next='div'),
        Tok("hnum", r"0[xX][0-9a-fA-F]+", next='div'),
        Tok("onum", r"0[0-7]+"),
        Tok("dnum", r"""
                    ( (0|[1-9][0-9]*)     # DecimalIntegerLiteral
                      \.                  # dot
                      [0-9]*              # DecimalDigits-opt
                      ([eE][-+]?[0-9]+)?  # ExponentPart-opt
                    |
                      \.                  # dot
                      [0-9]+              # DecimalDigits
                      ([eE][-+]?[0-9]+)?  # ExponentPart-opt
                    |
                      (0|[1-9][0-9]*)     # DecimalIntegerLiteral
                      ([eE][-+]?[0-9]+)?  # ExponentPart-opt
                    )
                    """, next='div'),
        Tok("punct", literals("""
                              >>>= === !== >>> <<= >>= <= >= == != << >> &&
                              || += -= *= %= &= |= ^=
                              """), next='reg'),
        Tok("punct", literals("++ -- ) ]"), next='div'),
        Tok("punct", literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next='reg'),
        Tok("string", r'"([^"\\]|(\\(.|\n)))*?"', next='div'),
        Tok("string", r"'([^'\\]|(\\(.|\n)))*?'", next='div'),
    ]

    both_after = [
        Tok("other", r"."),
    ]

    states = {
        'div':  # slash will mean division
            both_before + [
                Tok("punct", literals("/= /"), next='reg'),
            ] + both_after,

        'reg':  # slash will mean regex
            both_before + [
                Tok("regex",
                    r"""
                    /                   # opening slash
                    # First character is..
                    (   [^*\\/[]        # anything but * \ / or [
                    |   \\.             # or an escape sequence
                    |   \[              # or a class, which has
                            (   [^\]\\] # anything but \ or ]
                            |   \\.     # or an escape sequence
                            )*          # many times
                        \]
                    )
                    # Following characters are same, except for excluding a star
                    (   [^\\/[]         # anything but \ / or [
                    |   \\.             # or an escape sequence
                    |   \[              # or a class, which has
                            (   [^\]\\] # anything but \ or ]
                            |   \\.     # or an escape sequence
                            )*          # many times
                        \]
                    )*                  # many times
                    /                   # closing slash
                    [a-zA-Z0-9]*        # trailing flags
                    """, next='div'),
            ] + both_after,
    }

    def __init__(self):
        super(JsLexer, self).__init__(self.states, 'reg')


def prepare_js_for_gettext(js):
    """
    Convert the Javascript source `js` into something resembling C for
    xgettext.

    What actually happens is that all the regex literals are replaced with
    "REGEX".
| """ |
| def escape_quotes(m): |
| """Used in a regex to properly escape double quotes.""" |
| s = m.group(0) |
| if s == '"': |
| return r'\"' |
| else: |
| return s |
| |
| lexer = JsLexer() |
| c = [] |
| for name, tok in lexer.lex(js): |
| if name == 'regex': |
| # C doesn't grok regexes, and they aren't needed for gettext, |
| # so just output a string instead. |
| tok = '"REGEX"'; |
| elif name == 'string': |
| # C doesn't have single-quoted strings, so make all strings |
| # double-quoted. |
| if tok.startswith("'"): |
| guts = re.sub(r"\\.|.", escape_quotes, tok[1:-1]) |
| tok = '"' + guts + '"' |
| elif name == 'id': |
| # C can't deal with Unicode escapes in identifiers. We don't |
| # need them for gettext anyway, so replace them with something |
| # innocuous |
| tok = tok.replace("\\", "U"); |
| c.append(tok) |
| return ''.join(c) |