#!/usr/bin/env python
#
# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Defines the parser for MapReduce FileInputReader's file format string."""
__all__ = ['parse']
import re
import tokenize
from google.appengine.ext.mapreduce import file_formats


def parse(format_string):
  """Parses format string.

  Args:
    format_string: format_string from MapReduce FileInputReader.

  Returns:
    a list of file_formats._FileFormat objects.

  Raises:
    ValueError: when format_string parsing fails because of invalid syntax
      or semantics.
  """
  tokenizer = _Tokenizer(format_string)
  return _Parser(tokenizer).formats
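
# A usage sketch (the format names are illustrative; which names and
# arguments actually exist depends on what is registered in
# file_formats.FORMATS):
#
#   formats = parse('zip[text(encoding=utf8)]')
#   # formats is a list of file_formats._FileFormat instances: a 'zip'
#   # format whose archive members are in turn read as utf8-encoded text.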


class _Parser(object):
  """Parses a format string according to the following grammar.

  In Python's modified BNF notation:

  format_string ::= parameterized_format ( "[" parameterized_format "]" )*
  parameterized_format ::= format [ format_parameters ]
  format_parameters ::= "(" format_parameter ("," format_parameter )* ")"
  format_parameter ::= format_specific_parameter "=" parameter_value
  format ::= (<letter>|<number>)+
  parameter_value ::= (<letter>|<number>|<punctuation>)+
  format_specific_parameter ::= (<letter>|<number>)+
  """

  def __init__(self, tokenizer):
    """Initialize.

    Args:
      tokenizer: an instance of _Tokenizer.

    Raises:
      ValueError: when the parser couldn't consume the entire format_string.
    """
    self.formats = []
    self._tokenizer = tokenizer
    self._parse_format_string()
    if tokenizer.remainder():
      # remainder() counts unconsumed characters from the end of the string,
      # hence the negative index in the message.
      raise ValueError('Extra chars after index -%d' % tokenizer.remainder())

  def _add_format(self, format_name, kwargs):
    """Adds a format to the result list.

    The format name will be resolved to its corresponding _FileFormat class,
    and kwargs will be passed to that class's __init__.

    Args:
      format_name: name of the parsed format in str.
      kwargs: a dict containing keyword arguments for the format.

    Raises:
      ValueError: when format_name is not supported or the kwargs are not
        supported by the format.
    """
    if format_name not in file_formats.FORMATS:
      raise ValueError('Invalid format %s.' % format_name)
    format_cls = file_formats.FORMATS[format_name]
    for k in kwargs:
      if k not in format_cls.ARGUMENTS:
        raise ValueError('Invalid argument %s for format %s' %
                         (k, format_name))
    self.formats.append(format_cls.default_instance(**kwargs))

  def _parse_format_string(self):
    """Parses format_string."""
    self._parse_parameterized_format()
    if self._tokenizer.consume_if('['):
      self._parse_format_string()
      self._tokenizer.consume(']')

  def _validate_string(self, text):
    """Validates that a string is composed of valid characters.

    Args:
      text: any str to validate.

    Raises:
      ValueError: when text contains illegal characters.
    """
    # Anchor the pattern at the end; a bare re.match would accept any string
    # that merely starts with a valid name.
    if not re.match(tokenize.Name + r'$', text):
      raise ValueError('%s should only contain ascii letters or digits.' %
                       text)

  def _parse_parameterized_format(self):
    """Parses parameterized_format."""
    format_name = self._tokenizer.next()
    self._validate_string(format_name)
    arguments = {}
    if self._tokenizer.consume_if('('):
      arguments = self._parse_format_parameters()
      self._tokenizer.consume(')')
    self._add_format(format_name, arguments)

  def _parse_format_parameters(self):
    """Parses format_parameters.

    Returns:
      a dict of parameter names to their values for this format.

    Raises:
      ValueError: when the format_parameters have illegal syntax or semantics.
    """
    arguments = {}
    comma_exist = True
    while self._tokenizer.peek() not in ')]':
      if not comma_exist:
        raise ValueError('Arguments should be separated by a comma at index '
                         '%d.' % self._tokenizer.index)
      key = self._tokenizer.next()
      self._validate_string(key)
      self._tokenizer.consume('=')
      value = self._tokenizer.next()
      comma_exist = self._tokenizer.consume_if(',')
      if key in arguments:
        raise ValueError('Argument %s defined more than once.' % key)
      arguments[key] = value
    return arguments
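
  # For example (assuming a 'text' format accepting an 'encoding' argument
  # is registered), parsing the parameter list of 'text(encoding=utf8)'
  # yields {'encoding': 'utf8'}.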


class _Tokenizer(object):
  """Tokenizes a user supplied format string.

  A token is either a special character, or a group of characters between
  two special characters (or between a special character and the beginning
  or end of the format string). The escape character can be used to escape
  special characters and itself.
  """

  SPECIAL_CHARS = '[]()=,'
  ESCAPE_CHAR = '\\'

  def __init__(self, format_string):
    """Initialize.

    Args:
      format_string: user supplied format string for MapReduce InputReader.
    """
    self.index = 0
    self._format_string = format_string

  def peek(self):
    """Returns the next token with surrounding whitespace stripped.

    This method does not advance the underlying buffer.

    Returns:
      the next token with surrounding whitespace stripped.
    """
    return self.next(advance=False)

  def next(self, advance=True):
    """Returns the next token with surrounding whitespace stripped.

    Args:
      advance: boolean. True if the underlying buffer should be advanced.

    Returns:
      the next token with surrounding whitespace stripped.
    """
    escaped = False
    token = ''
    previous_index = self.index
    while self.remainder():
      char = self._format_string[self.index]
      if char == self.ESCAPE_CHAR:
        if escaped:
          # An escaped escape character is a literal backslash.
          token += char
          self.index += 1
          escaped = False
        else:
          self.index += 1
          escaped = True
      elif char in self.SPECIAL_CHARS and not escaped:
        if not token.strip():
          # Nothing accumulated yet: the special character itself is the
          # token.
          self.index += 1
          token += char
        break
      else:
        escaped = False
        self.index += 1
        token += char
    if not advance:
      self.index = previous_index
    return token.strip()
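
  # For example, successive next() calls on 'text(encoding=utf8)' return
  # 'text', '(', 'encoding', '=', 'utf8', ')'. An escaped special character
  # such as '\,' is kept as a literal ',' inside a token instead of ending
  # the token.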

  def consume(self, expected_token):
    """Consumes the next token, which must match the expectation.

    Args:
      expected_token: the expected value of the next token.

    Raises:
      ValueError: raised when the next token doesn't match expected_token.
    """
    token = self.next()
    if token != expected_token:
      raise ValueError('Expect "%s" but got "%s" at offset %d' %
                       (expected_token, token, self.index))

  def consume_if(self, token):
    """Consumes the next token when it matches the expectation.

    Args:
      token: the expected next token.

    Returns:
      True when the next token matches the argument and is consumed.
      False otherwise.
    """
    if self.peek() == token:
      self.consume(token)
      return True
    return False

  def remainder(self):
    """Returns the number of bytes left to be processed."""
    return len(self._format_string) - self.index
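

# A minimal smoke-test sketch (not part of the original module). Running it
# requires the App Engine SDK on sys.path for the file_formats import above,
# but the tokenizer itself has no external dependencies.
if __name__ == '__main__':
  _tokenizer = _Tokenizer('zip[text(encoding=utf8)]')
  _tokens = []
  while _tokenizer.remainder():
    _tokens.append(_tokenizer.next())
  # Expected: ['zip', '[', 'text', '(', 'encoding', '=', 'utf8', ')', ']']
  print _tokens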