| #!/usr/bin/env python |
| # |
| # Copyright 2007 Google Inc. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """Defines the parser for MapReduce FileInputReader's file format string.""" |
| |
| |
| |
| |
| |
| __all__ = ['parse'] |
| |
| import re |
| import tokenize |
| |
| from google.appengine.ext.mapreduce import file_formats |
| |
| |
def parse(format_string):
  """Turn a FileInputReader format string into file format objects.

  The string is tokenized and handed to the parser, which does all of its
  work while being constructed; this function only returns what it collected.

  Args:
    format_string: format_string from MapReduce FileInputReader.

  Returns:
    a list of file_formats._FileFormat objects.

  Raises:
    ValueError: when format_string parsing fails because of invalid syntax
      or semantics.
  """
  parser = _Parser(_Tokenizer(format_string))
  return parser.formats
| |
| |
| class _Parser(object): |
| """Parses a format string according to the following grammar. |
| |
| In Python's modified BNF notation. |
| format_string ::= parameterized_format ( "[" parameterized_format "]" )* |
| parameterized_format ::= format [ format_parameters ] |
| format_parameters ::= "(" format_paramter ("," format_parameter )* ")" |
| format_parameter ::= format_specific_parameter "=" parameter_value |
| format ::= (<letter>|<number>)+ |
| parameter_value ::= (<letter>|<number>|<punctuation>)+ |
| format_specific_parameter ::= (<letter>|<number>)+ |
| """ |
| |
| def __init__(self, tokenizer): |
| """Initialize. |
| |
| Args: |
| tokenizer: an instance of _Tokenizer. |
| |
| Raises: |
| ValueError: when parser couldn't consume all format_string. |
| """ |
| self.formats = [] |
| self._tokenizer = tokenizer |
| self._parse_format_string() |
| if tokenizer.remainder(): |
| raise ValueError('Extra chars after index -%d' % tokenizer.remainder()) |
| |
| def _add_format(self, format_name, kwargs): |
| """Add a format to result list. |
| |
| The format name will be resolved to its corresponding _FileFormat class. |
| kwargs will be passed to the class's __init___. |
| |
| Args: |
| format_name: name of the parsed format in str. |
| kwargs: a dict containing key word arguments for the format. |
| |
| Raises: |
| ValueError: when format_name is not supported or the kwargs are not |
| supported by the format. |
| """ |
| if format_name not in file_formats.FORMATS: |
| raise ValueError('Invalid format %s.' % format_name) |
| format_cls = file_formats.FORMATS[format_name] |
| for k in kwargs: |
| if k not in format_cls.ARGUMENTS: |
| raise ValueError('Invalid argument %s for format %s' % |
| (k, format_name)) |
| self.formats.append(format_cls.default_instance(**kwargs)) |
| |
| def _parse_format_string(self): |
| """Parses format_string.""" |
| self._parse_parameterized_format() |
| if self._tokenizer.consume_if('['): |
| self._parse_format_string() |
| self._tokenizer.consume(']') |
| |
| def _validate_string(self, text): |
| """Validates a string is composed of valid characters. |
| |
| Args: |
| text: any str to validate. |
| |
| Raises: |
| ValueError: when text contains illegal characters. |
| """ |
| if not re.match(tokenize.Name, text): |
| raise ValueError('%s should only contain ascii letters or digits.' % |
| text) |
| |
| def _parse_parameterized_format(self): |
| """Parses parameterized_format.""" |
| format_name = self._tokenizer.next() |
| self._validate_string(format_name) |
| |
| arguments = {} |
| |
| if self._tokenizer.consume_if('('): |
| arguments = self._parse_format_parameters() |
| self._tokenizer.consume(')') |
| |
| self._add_format(format_name, arguments) |
| |
| def _parse_format_parameters(self): |
| """Parses format_parameters. |
| |
| Returns: |
| a dict of parameter names to their values for this format. |
| |
| Raises: |
| ValueError: when the format_parameters have illegal syntax or semantics. |
| """ |
| arguments = {} |
| comma_exist = True |
| while self._tokenizer.peek() not in ')]': |
| if not comma_exist: |
| raise ValueError('Arguments should be separated by comma at index %d.' |
| % self._tokenizer.index) |
| key = self._tokenizer.next() |
| self._validate_string(key) |
| self._tokenizer.consume('=') |
| value = self._tokenizer.next() |
| comma_exist = self._tokenizer.consume_if(',') |
| if key in arguments: |
| raise ValueError('Argument %s defined more than once.' % key) |
| arguments[key] = value |
| return arguments |
| |
| |
| class _Tokenizer(object): |
| """Tokenizes a user supplied format string. |
| |
| A token is either a special character or a group of characters between |
| two special characters or the beginning or the end of format string. |
| Escape character can be used to escape special characters and itself. |
| """ |
| |
| SPECIAL_CHARS = '[]()=,' |
| ESCAPE_CHAR = '\\' |
| |
| def __init__(self, format_string): |
| """Initialize. |
| |
| Args: |
| format_string: user supplied format string for MapReduce InputReader. |
| """ |
| self.index = 0 |
| self._format_string = format_string |
| |
| def peek(self): |
| """Returns the next token with surrounding white spaces stripped. |
| |
| This method does not advance underlying buffer. |
| |
| Returns: |
| the next token with surrounding whitespaces stripped. |
| """ |
| return self.next(advance=False) |
| |
| def next(self, advance=True): |
| """Returns the next token with surrounding white spaces stripped. |
| |
| Args: |
| advance: boolean. True if underlying buffer should be advanced. |
| |
| Returns: |
| the next token with surrounding whitespaces stripped. |
| """ |
| escaped = False |
| token = '' |
| previous_index = self.index |
| while self.remainder(): |
| char = self._format_string[self.index] |
| if char == self.ESCAPE_CHAR: |
| if escaped: |
| token += char |
| self.index += 1 |
| escaped = False |
| else: |
| self.index += 1 |
| escaped = True |
| elif char in self.SPECIAL_CHARS and not escaped: |
| if not token.strip(): |
| self.index += 1 |
| token += char |
| break |
| else: |
| escaped = False |
| self.index += 1 |
| token += char |
| |
| if not advance: |
| self.index = previous_index |
| |
| return token.strip() |
| |
| def consume(self, expected_token): |
| """Consumes the next token which must match expectation. |
| |
| Args: |
| expected_token: the expected value of the next token. |
| |
| Raises: |
| ValueError: raised when the next token doesn't match expected_token. |
| """ |
| token = self.next() |
| if token != expected_token: |
| raise ValueError('Expect "%s" but got "%s" at offset %d' % |
| (expected_token, token, self.index)) |
| |
| def consume_if(self, token): |
| """Consumes the next token when it matches expectation. |
| |
| Args: |
| token: the expected next token. |
| |
| Returns: |
| True when next token matches the argument and is consumed. |
| False otherwise. |
| """ |
| if self.peek() == token: |
| self.consume(token) |
| return True |
| return False |
| |
| def remainder(self): |
| """Returns the number of bytes left to be processed.""" |
| return len(self._format_string) - self.index |
| |