# Regular expression patterns for C syntax.
#
# None of these patterns contains any capturing groups.  However, a number
# of them have capturing markers compatible with utils.set_capture_groups().
| |
| import textwrap |
| |
| |
| def _ind(text, level=1, edges='both'): |
| indent = ' ' * level |
| text = textwrap.indent(text, indent) |
| if edges == 'pre' or edges == 'both': |
| text = '\n' + indent + text.lstrip() |
| if edges == 'post' or edges == 'both': |
| text = text.rstrip() + '\n' + ' ' * (level - 1) |
| return text |
| |
| |
| ####################################### |
| # general |
| |
# A single hexadecimal digit.
HEX = r'(?: [0-9a-fA-F] )'

# A C character literal or a C string literal (verbose-mode pattern).
#
# The plain single-character alternative excludes the backslash so that an
# escaped quote such as '\'' is matched whole by the escape alternative
# rather than being cut short at the first closing quote.
STRING_LITERAL = textwrap.dedent(rf'''
    (?:
        # character literal
        (?:
            ['] [^'\\] [']
            |
            ['] \\ . [']
            |
            ['] \\x{HEX}{HEX} [']
            |
            ['] \\0\d\d [']
            |
            (?:
                ['] \\o[01]\d\d [']
                |
                ['] \\o2[0-4]\d [']
                |
                ['] \\o25[0-5] [']
            )
        )
        |
        # string literal
        (?:
            ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
        )
        # end string literal
    )
    ''')
| |
# Alternation of every word this module treats as a C keyword, wrapped in
# word boundaries.  Written in multi-line (verbose) form first.
_KEYWORD = textwrap.dedent(r'''
    (?:
        \b
        (?:
            auto |
            extern |
            register |
            static |
            _Thread_local |
            typedef |

            const |
            volatile |

            signed |
            unsigned |
            char |
            short |
            int |
            long |
            float |
            double |
            _Complex |
            void |

            struct |
            union |
            enum |

            goto |
            return |
            sizeof |
            break |
            continue |
            if |
            else |
            for |
            do |
            while |
            switch |
            case |
            default |
            entry
        )
        \b
    )
    ''')
# The public pattern embeds the multi-line form, bracketed by marker
# comments; it is built before _KEYWORD is compacted below.
KEYWORD = rf'''
    # keyword
    {_KEYWORD}
    # end keyword
    '''
# Remove all whitespace so _KEYWORD can be embedded in single-line patterns
# (it is used inside the lookaheads of STRICT_IDENTIFIER / ANON_IDENTIFIER).
_KEYWORD = ''.join(_KEYWORD.split())
| |
# A bare identifier; this pattern by itself does not exclude keywords.
IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
# We use a negative lookahead to filter out keywords.
STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
# Same as STRICT_IDENTIFIER, but also accepts an optional "-<digits>" suffix.
# NOTE(review): presumably for generated names of anonymous types -- confirm.
ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'
| |
| |
| ####################################### |
| # types |
| |
# A builtin ("simple") type, bounded by word boundaries.  The alternatives,
# in order: void; a bare signed/unsigned; _Complex with an optional
# float/double/long double before or after it; and an optionally
# signed/unsigned, optionally long/short base type.
SIMPLE_TYPE = textwrap.dedent(rf'''
    # simple type
    (?:
        \b
        (?:
            void
            |
            (?: signed | unsigned ) # implies int
            |
            (?:
                (?: (?: float | double | long\s+double ) \s+ )?
                _Complex
            )
            |
            (?:
                _Complex
                (?: \s+ (?: float | double | long\s+double ) )?
            )
            |
            (?:
                (?: (?: signed | unsigned ) \s+ )?
                (?: (?: long | short ) \s+ )?
                (?: char | short | int | long | float | double )
            )
        )
        \b
    )
    # end simple type
    ''')
| |
# The keyword that introduces a compound type: struct, union or enum.
COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'
| |
| |
| ####################################### |
| # variable declarations |
| |
# Storage-class specifiers, in the order they appear in STORAGE_CLASS.
_STORAGE = ['auto', 'register', 'static', 'extern', '_Thread_local']
# Any one storage-class specifier, as a whole word.
STORAGE_CLASS = r'(?: \b (?: ' + ' | '.join(_STORAGE) + r' ) \b )'
# Either type qualifier, as a whole word.
TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
# A pointer star, optionally followed by a type qualifier.
PTR_QUALIFIER = r'(?: [*] (?: \s* ' + TYPE_QUALIFIER + r' )? )'
| |
# A type specifier.  The alternatives, in order: a SIMPLE_TYPE; a
# typeof(...) form (any number of underscores around "typeof") wrapping an
# identifier optionally prefixed by "*" / "&"; a struct/union/enum keyword
# with an optional name; or a non-keyword identifier (a typedef name).
TYPE_SPEC = textwrap.dedent(rf'''
    # type spec
    (?:
        {_ind(SIMPLE_TYPE, 2)}
        |
        (?:
            [_]*typeof[_]*
            \s* [(]
            (?: \s* [*&] )*
            \s* {STRICT_IDENTIFIER}
            \s* [)]
        )
        |
        # reference to a compound type
        (?:
            {COMPOUND_TYPE_KIND}
            (?: \s* {ANON_IDENTIFIER} )?
        )
        |
        # reference to a typedef
        {STRICT_IDENTIFIER}
    )
    # end type spec
    ''')
| |
# A declarator: any number of pointer qualifiers followed by one of
#   * an identifier with optional array suffixes,
#   * the same wrapped in parentheses, or
#   * a function pointer: "( [*] name [arrays] ) ( params )".
# The "# <NAME>" comments inside the pattern are the capture markers the
# module header refers to; keep them intact when editing.
DECLARATOR = textwrap.dedent(rf'''
    # declarator (possibly abstract)
    (?:
        (?: {PTR_QUALIFIER} \s* )*
        (?:
            (?:
                (?: # <IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
            )
            |
            (?:
                [(] \s*
                (?: # <WRAPPED_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
                \s* [)]
            )
            |
            # func ptr
            (?:
                [(] (?: \s* {PTR_QUALIFIER} )? \s*
                (?: # <FUNC_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
                \s* [)]
                # We allow for a single level of paren nesting in parameters.
                \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
            )
        )
    )
    # end declarator
    ''')
| |
# A declaration head: optional storage class, optional type qualifier,
# a TYPE_SPEC and then a DECLARATOR.  Initializers and the terminating
# punctuation are not part of this pattern.
VAR_DECL = textwrap.dedent(rf'''
    # var decl (and typedef and func return type)
    (?:
        (?:
            (?: # <STORAGE>
                {STORAGE_CLASS}
            )
            \s*
        )?
        (?:
            (?: # <TYPE_QUAL>
                {TYPE_QUALIFIER}
            )
            \s*
        )?
        (?:
            (?: # <TYPE_SPEC>
                {_ind(TYPE_SPEC, 4)}
            )
        )
        \s*
        (?:
            (?: # <DECLARATOR>
                {_ind(DECLARATOR, 4)}
            )
        )
    )
    # end var decl
    ''')
| |
# The right-hand side of an initialization: an optional parenthesized
# prefix without nested parens, followed by one of
#   * one or more adjacent string/character literals,
#   * a "simple" run containing no comma, semicolon or opening brace
#     outside of literals, or
#   * a brace-delimited literal that must be followed by ";".
# Doubled braces ("{{" / "}}") are f-string escapes for literal braces.
INITIALIZER = textwrap.dedent(rf'''
    # initializer
    (?:
        (?:
            [(]
            # no nested parens (e.g. func ptr)
            [^)]*
            [)]
            \s*
        )?
        (?:
            # a string literal
            (?:
                (?: {_ind(STRING_LITERAL, 4)} \s* )*
                {_ind(STRING_LITERAL, 4)}
            )
            |

            # a simple initializer
            (?:
                (?:
                    [^'",;{{]*
                    {_ind(STRING_LITERAL, 4)}
                )*
                [^'",;{{]*
            )
            |

            # a struct/array literal
            (?:
                # We only expect compound initializers with
                # single-variable declarations.
                {{
                    (?:
                        [^'";]*?
                        {_ind(STRING_LITERAL, 5)}
                    )*
                    [^'";]*?
                }}
                (?= \s* ; ) # Note this lookahead.
            )
        )
    )
    # end initializer
    ''')
| |
| |
| ####################################### |
| # compound type declarations |
| |
# One step inside a struct/union body.  The alternatives, in order:
#   * the opening of an inline compound type ("struct [name] {"),
#   * a member: optional qualifier + TYPE_SPEC, optional DECLARATOR,
#     optional ": <size>" part, ended by "," or ";",
#   * the closing brace of the body.
# Doubled braces are f-string escapes for literal braces.
STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        # inline compound type decl
        (?:
            (?: # <COMPOUND_TYPE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <COMPOUND_TYPE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        (?:
            # typed member
            (?:
                # Technically it doesn't have to have a type...
                (?: # <SPECIFIER_QUALIFIER>
                    (?: {TYPE_QUALIFIER} \s* )?
                    {_ind(TYPE_SPEC, 5)}
                )
                (?:
                    # If it doesn't have a declarator then it will have
                    # a size and vice versa.
                    \s*
                    (?: # <DECLARATOR>
                        {_ind(DECLARATOR, 6)}
                    )
                )?
            )

            # sized member
            (?:
                \s* [:] \s*
                (?: # <SIZE>
                    # This is actually a "constant expression".
                    \d+
                    |
                    [^'",}}]+
                )
            )?
            \s*
            (?: # <ENDING>
                [,;]
            )
        )
        |
        (?:
            \s*
            (?: # <CLOSE>
                }}
            )
        )
    )
    ''')
| |
# One step inside an enum body: either the closing brace, or a member name
# with an optional "= <value>" part, ended by "," or by the closing brace.
# Doubled braces are f-string escapes for literal braces.
ENUM_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        (?:
            \s*
            (?: # <CLOSE>
                }}
            )
        )
        |
        (?:
            \s*
            (?: # <NAME>
                {IDENTIFIER}
            )
            (?:
                \s* = \s*
                (?: # <INIT>
                    # This is actually a "constant expression".
                    {_ind(STRING_LITERAL, 4)}
                    |
                    [^'",}}]+
                )
            )?
            \s*
            (?: # <ENDING>
                , | }}
            )
        )
    )
    ''')
| |
| |
| ####################################### |
| # statements |
| |
# The text of a simple statement: any run of characters that contains no
# quote, brace or semicolon outside of string/character literals.  The
# terminator itself is not consumed (the lookahead line is commented out
# inside the pattern).
SIMPLE_STMT_BODY = textwrap.dedent(rf'''
    # simple statement body
    (?:
        (?:
            [^'"{{}};]*
            {_ind(STRING_LITERAL, 3)}
        )*
        [^'"{{}};]*
        #(?= [;{{] ) # Note this lookahead.
    )
    # end simple statement body
    ''')
# A complete simple statement, terminated by ";".  The alternatives for
# the statement text, in order:
#   * "return" with an optional INITIALIZER-shaped value,
#   * an assignment to a (possibly dereferenced, possibly member-accessed,
#     possibly indexed) name with an INITIALIZER-shaped value,
#   * a catch-all "return ..." form,
#   * any SIMPLE_STMT_BODY.
# The member-access separator is a literal "." or "->"; the dot is written
# as "[.]" because a bare "." would match any character.
SIMPLE_STMT = textwrap.dedent(rf'''
    # simple statement
    (?:
        (?: # <SIMPLE_STMT>
            # stmt-inline "initializer"
            (?:
                return \b
                (?:
                    \s*
                    {_ind(INITIALIZER, 5)}
                )?
            )
            |
            # variable assignment
            (?:
                (?: [*] \s* )?
                (?:
                    {STRICT_IDENTIFIER} \s*
                    (?: [.] | -> ) \s*
                )*
                {STRICT_IDENTIFIER}
                (?: \s* \[ \s* \d+ \s* \] )?
                \s* = \s*
                {_ind(INITIALIZER, 4)}
            )
            |
            # catchall return statement
            (?:
                return \b
                (?:
                    (?:
                        [^'";]*
                        {_ind(STRING_LITERAL, 6)}
                    )*
                    \s* [^'";]*
                )?
            )
            |
            # simple statement
            (?:
                {_ind(SIMPLE_STMT_BODY, 4)}
            )
        )
        \s*
        (?: # <SIMPLE_ENDING>
            ;
        )
    )
    # end simple statement
    ''')
# The head of a compound statement, plus trailing whitespace.  The
# alternatives, in order:
#   * a bare "else" / "do",
#   * a label: "case <expr>", "default" or an identifier, followed by ":",
#   * "for" / "while" / "if" / "switch" that is followed by "(" (the paren
#     is only looked at, not consumed).
COMPOUND_STMT = textwrap.dedent(rf'''
    # compound statement
    (?:
        \b
        (?:
            (?:
                (?: # <COMPOUND_BARE>
                    else | do
                )
                \b
            )
            |
            (?:
                (?: # <COMPOUND_LABELED>
                    (?:
                        case \b
                        (?:
                            [^'":]*
                            {_ind(STRING_LITERAL, 7)}
                        )*
                        \s* [^'":]*
                    )
                    |
                    default
                    |
                    {STRICT_IDENTIFIER}
                )
                \s* [:]
            )
            |
            (?:
                (?: # <COMPOUND_PAREN>
                    for | while | if | switch
                )
                \s* (?= [(] ) # Note this lookahead.
            )
        )
        \s*
    )
    # end compound statement
    ''')
| |
| |
| ####################################### |
| # function bodies |
| |
# One step inside a function body.  The alternatives are tried in this
# order:
#   1. an empty statement (";"),
#   2. the opening of an inline struct/union/enum definition,
#   3. a variable declaration with optional initializer, ended by "," / ";",
#   4. the head of a compound statement (COMPOUND_STMT),
#   5. the opening brace of a block, with optional leading text,
#   6. a simple statement (SIMPLE_STMT),
#   7. the closing brace of a block.
# Doubled braces are f-string escapes for literal braces.
LOCAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?: # <EMPTY>
            ;
        )
        |
        # inline type decl
        (?:
            (?:
                (?: # <INLINE_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?: # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
            )? # </INLINE_PRE>
            (?: # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # var decl
        (?:
            (?: # <STORAGE>
                {STORAGE_CLASS}
            )? # </STORAGE>
            (?:
                \s*
                (?: # <VAR_DECL>
                    {_ind(VAR_DECL, 5)}
                )
            )
            (?:
                (?:
                    # initializer
                    # We expect only basic initializers.
                    \s* = \s*
                    (?: # <VAR_INIT>
                        {_ind(INITIALIZER, 6)}
                    )
                )?
                (?:
                    \s*
                    (?: # <VAR_ENDING>
                        [,;]
                    )
                )
            )
        )
        |
        {_ind(COMPOUND_STMT, 2)}
        |
        # start-of-block
        (?:
            (?: # <BLOCK_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 5)}
                )*
                [^'"{{}};]*
                # Presumably we will not see "== {{".
                [^\s='"{{}});]
                \s*
            )? # </BLOCK_LEADING>
            (?: # <BLOCK_OPEN>
                {{
            )
        )
        |
        {_ind(SIMPLE_STMT, 2)}
        |
        # end-of-block
        (?: # <BLOCK_CLOSE>
            }}
        )
    )
    ''')
| |
# A coarser function-body pattern that only singles out static variables.
# The alternatives, in order:
#   1. the opening of an inline struct/union/enum definition,
#   2. a "static" declaration, followed either by "= <initializer>" and one
#      of "," / ";" / "{", or directly by "," / ";",
#   3. everything else: any text (literals skipped as units) up to the
#      next "{", "}" or ";".
# Doubled braces are f-string escapes for literal braces.
LOCAL_STATICS = textwrap.dedent(rf'''
    (?:
        # inline type decl
        (?:
            (?:
                (?: # <INLINE_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?: # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
            )?
            (?: # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # var decl
        (?:
            # We only look for static variables.
            (?: # <STATIC_DECL>
                static \b
                (?: \s* {TYPE_QUALIFIER} )?
                \s* {_ind(TYPE_SPEC, 4)}
                \s* {_ind(DECLARATOR, 4)}
            )
            \s*
            (?:
                (?: # <STATIC_INIT>
                    = \s*
                    {_ind(INITIALIZER, 4)}
                    \s*
                    [,;{{]
                )
                |
                (?: # <STATIC_ENDING>
                    [,;]
                )
            )
        )
        |
        # everything else
        (?:
            (?: # <DELIM_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 4)}
                )*
                \s* [^'"{{}};]*
            )
            (?:
                (?: # <BLOCK_OPEN>
                    {{
                )
                |
                (?: # <BLOCK_CLOSE>
                    }}
                )
                |
                (?: # <STMT_END>
                    ;
                )
            )
        )
    )
    ''')
| |
| |
| ####################################### |
| # global declarations |
| |
# One top-level (file-scope) item.  The alternatives are tried in this
# order:
#   1. an empty statement (";"),
#   2. the opening of a struct/union/enum definition, with optional
#      leading text,
#   3. a compound-type reference followed by text up to one of
#      "=", ",", ";", "(" or "{" (the "bogus inline decl artifact"),
#   4. a typedef, with an optional parenthesized parameter list, ended
#      by ";",
#   5. a function declaration/definition (modern or old-style parameter
#      syntax) or a variable declaration with optional initializer.
# Doubled braces are f-string escapes for literal braces.
GLOBAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?: # <EMPTY>
            ;
        )
        |

        # compound type decl (maybe inline)
        (?:
            (?:
                (?: # <COMPOUND_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?: # <COMPOUND_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <COMPOUND_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # bogus inline decl artifact
        # This simplifies resolving the relative syntactic ambiguity of
        # inline structs.
        (?:
            (?: # <FORWARD_KIND>
                {COMPOUND_TYPE_KIND}
            )
            \s*
            (?: # <FORWARD_NAME>
                {ANON_IDENTIFIER}
            )
            (?: # <MAYBE_INLINE_ACTUAL>
                [^=,;({{[*\]]*
                [=,;({{]
            )
        )
        |

        # typedef
        (?:
            \b typedef \b \s*
            (?: # <TYPEDEF_DECL>
                {_ind(VAR_DECL, 4)}
            )
            (?:
                # We expect no inline type definitions in the parameters.
                \s* [(] \s*
                (?: # <TYPEDEF_FUNC_PARAMS>
                    [^{{;]*
                )
                \s* [)]
            )?
            \s* ;
        )
        |

        # func decl/definition & var decls
        # XXX dedicated pattern for funcs (more restricted)?
        (?:
            (?:
                (?: # <VAR_STORAGE>
                    {STORAGE_CLASS}
                )
                \s*
            )?
            (?:
                (?: # <FUNC_INLINE>
                    \b inline \b
                )
                \s*
            )?
            (?: # <VAR_DECL>
                {_ind(VAR_DECL, 4)}
            )
            (?:
                # func decl / definition
                (?:
                    (?:
                        # We expect no inline type definitions in the parameters.
                        \s* [(] \s*
                        (?: # <FUNC_PARAMS>
                            [^{{;]*
                        )
                        \s* [)] \s*
                        (?: # <FUNC_DELIM>
                            [{{;]
                        )
                    )
                    |
                    (?:
                        # This is some old-school syntax!
                        \s* [(] \s*
                        # We throw away the bare names:
                        {STRICT_IDENTIFIER}
                        (?: \s* , \s* {STRICT_IDENTIFIER} )*
                        \s* [)] \s*

                        # We keep the trailing param declarations:
                        (?: # <FUNC_LEGACY_PARAMS>
                            # There's at least one!
                            (?: {TYPE_QUALIFIER} \s* )?
                            {_ind(TYPE_SPEC, 7)}
                            \s*
                            {_ind(DECLARATOR, 7)}
                            \s* ;
                            (?:
                                \s*
                                (?: {TYPE_QUALIFIER} \s* )?
                                {_ind(TYPE_SPEC, 8)}
                                \s*
                                {_ind(DECLARATOR, 8)}
                                \s* ;
                            )*
                        )
                        \s* {{
                    )
                )
                |
                # var / typedef
                (?:
                    (?:
                        # initializer
                        # We expect only basic initializers.
                        \s* = \s*
                        (?: # <VAR_INIT>
                            {_ind(INITIALIZER, 6)}
                        )
                    )?
                    \s*
                    (?: # <VAR_ENDING>
                        [,;]
                    )
                )
            )
        )
    )
    ''')