javalang/tokenizer.py - external/github.com/c2nes/javalang - Git at Google


 import re
 import unicodedata

 class LexerError(Exception):
     pass

 class JavaToken(object):
     def __init__(self, value, position=None, javadoc=None):
         self.value = value
         self.position = position
         self.javadoc = javadoc

     def __repr__(self):
         if self.position:
             return '%s "%s" line %d, position %d' % (
                 self.__class__.__name__, self.value, self.position[0], self.position[1]
                 )
         else:
             return '%s "%s"' % (self.__class__.__name__, self.value)

     def __str__(self):
         return repr(self)

     def __eq__(self, other):
         raise Exception("Direct comparison not allowed")

 class EndOfInput(JavaToken):
     pass

 class Keyword(JavaToken):
     VALUES = set(['abstract', 'assert', 'boolean', 'break', 'byte', 'case',
                   'catch', 'char', 'class', 'const', 'continue', 'default',
                   'do', 'double', 'else', 'enum', 'extends', 'final',
                   'finally', 'float', 'for', 'goto', 'if', 'implements',
                   'import', 'instanceof', 'int', 'interface', 'long', 'native',
                   'new', 'package', 'private', 'protected', 'public', 'return',
                   'short', 'static', 'strictfp', 'super', 'switch',
                   'synchronized', 'this', 'throw', 'throws', 'transient', 'try',
                   'void', 'volatile', 'while'])


 class Modifier(Keyword):
     VALUES = set(['abstract', 'final', 'native', 'private', 'protected',
                   'public', 'static', 'strictfp', 'synchronized', 'transient',
                   'volatile'])

 class BasicType(Keyword):
     VALUES = set(['boolean', 'byte', 'char', 'double',
                   'float', 'int', 'long', 'short'])

 class Literal(JavaToken):
     pass

 class Integer(Literal):
     pass

 class DecimalInteger(Literal):
     pass

 class OctalInteger(Integer):
     pass

 class BinaryInteger(Integer):
     pass

 class HexInteger(Integer):
     pass

 class FloatingPoint(Literal):
     pass

 class DecimalFloatingPoint(FloatingPoint):
     pass

 class HexFloatingPoint(FloatingPoint):
     pass

 class Boolean(Literal):
     VALUES = set(["true", "false"])

 class Character(Literal):
     pass

 class String(Literal):
     pass

 class Null(Literal):
     pass

 class Separator(JavaToken):
     VALUES = set(['(', ')', '{', '}', '[', ']', ';', ',', '.'])

 class Operator(JavaToken):
     MAX_LEN = 4
     VALUES = set(['>>>=', '>>=', '<<=',  '%=', '^=', '|=', '&=', '/=',
                   '*=', '-=', '+=', '<<', '--', '++', '||', '&&', '!=',
                   '>=', '<=', '==', '%', '^', '|', '&', '/', '*', '-',
                   '+', ':', '?', '~', '!', '<', '>', '=', '...'])

     # '>>>' and '>>' are excluded so that >> becomes two tokens and >>> becomes
     # three. This is done because we can not distinguish the operators >> and
     # >>> from the closing of multipel type parameter/argument lists when
     # lexing. The job of potentially recombining these symbols is left to the
     # parser

     INFIX = set(['||', '&&', '|', '^', '&', '==', '!=', '<', '>', '<=', '>=',
                  '<<', '>>', '>>>', '+', '-', '*', '/', '%'])

     PREFIX = set(['++', '--', '!', '~', '+', '-'])

     POSTFIX = set(['++', '--'])

     ASSIGNMENT = set(['=', '+=', '-=', '*=', '/=', '&=', '|=', '^=', '%=',
                       '<<=', '>>=', '>>>='])

     def is_infix(self):
         return self.value in self.INFIX

     def is_prefix(self):
         return self.value in self.PREFIX

     def is_postfix(self):
         return self.value in self.POSTFIX

     def is_assignment(self):
         return self.value in self.ASSIGNMENT

 class Annotation(JavaToken):
     pass

 class Identifier(JavaToken):
     pass

 class JavaTokenizer(object):

     IDENT_START_CATEGORIES = set(['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Pc', 'Sc'])

     IDENT_PART_CATEGORIES = set(['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mc', 'Mn', 'Nd', 'Nl', 'Pc', 'Sc'])

     def __init__(self, data):
         self.data = data

         self.current_line = 1
         self.start_of_line = 0

         self.operators = [set() for i in range(0, Operator.MAX_LEN)]

         for v in Operator.VALUES:
             self.operators[len(v) - 1].add(v)

         self.whitespace_consumer = re.compile(r'[^\s]')

         self.javadoc = None


     def reset(self):
         self.i = 0
         self.j = 0

     def consume_whitespace(self):
         match = self.whitespace_consumer.search(self.data, self.i + 1)

         if not match:
             self.i = self.length
             return

         i = match.start()

         start_of_line = self.data.rfind('\n', self.i, i)

         if start_of_line != -1:
             self.start_of_line = start_of_line
             self.current_line += self.data.count('\n', self.i, i)

         self.i = i

     def read_string(self):
         delim = self.data[self.i]

         state = 0
         j = self.i + 1
         length = self.length

         while True:
             if j >= length:
                 self.error('Unterminated character/string literal')

             if state == 0:
                 if self.data[j] == '\\':
                     state = 1
                 elif self.data[j] == delim:
                     break

             elif state == 1:
                 if self.data[j] in 'btnfru"\'\\':
                     state = 0
                 elif self.data[j] in '0123':
                     state = 2
                 elif self.data[j] in '01234567':
                     state = 3
                 else:
                     self.error('Illegal escape character', self.data[j])

             elif state == 2:
                 # Possibly long octal
                 if self.data[j] in '01234567':
                     state = 3
                 elif self.data[j] == '\\':
                     state = 1
                 elif self.data[j] == delim:
                     break

             elif state == 3:
                 state = 0

                 if self.data[j] == '\\':
                     state = 1
                 elif self.data[j] == delim:
                     break

             j += 1

         self.j = j + 1

     def try_operator(self):
         for l in range(min(self.length - self.i, Operator.MAX_LEN), 0, -1):
             if self.data[self.i:self.i + l] in self.operators[l - 1]:
                 self.j = self.i + l
                 return True
         return False

     def read_comment(self):
         if self.data[self.i + 1] == '/':
             i = self.data.find('\n', self.i + 2)

             if i == -1:
                 self.i = self.length
                 return

             i += 1

             self.start_of_line = i
             self.current_line += 1
             self.i = i

         else:
             i = self.data.find('*/', self.i + 2)

             if i == -1:
                 self.i = self.length
                 return

             i += 2

             self.start_of_line = i
             self.current_line += self.data.count('\n', self.i, i)
             self.i = i

     def try_javadoc_comment(self):
         if self.i + 2 >= self.length or self.data[self.i + 2] != '*':
             return False

         j = self.data.find('*/', self.i + 2)

         if j == -1:
             self.j = self.length
             return False

         j += 2

         self.start_of_line = j
         self.current_line += self.data.count('\n', self.i, j)
         self.j = j

         return True

     def read_decimal_float_or_integer(self):
         orig_i = self.i
         self.j = self.i

         self.read_decimal_integer()

         if self.data[self.j] not in '.eEfFdD':
             return DecimalInteger

         if self.data[self.j] == '.':
             self.i = self.j + 1
             self.read_decimal_integer()

         if self.data[self.j] in 'eE':
             self.j = self.j + 1

             if self.data[self.j] in '-+':
                 self.j = self.j + 1

             self.i = self.j
             self.read_decimal_integer()

         if self.data[self.j] in 'fFdD':
             self.j = self.j + 1

         self.i = orig_i
         return DecimalFloatingPoint

     def read_hex_integer_or_float(self):
         orig_i = self.i
         self.j = self.i + 2

         self.read_hex_integer()

         if self.data[self.j] not in '.pP':
             return HexInteger

         if self.data[self.j] == '.':
             self.j = self.j + 1
             self.read_digits('0123456789abcdefABCDEF')

         if self.data[self.j] in 'pP':
             self.j = self.j + 1
         else:
             self.error('Invalid hex float literal')

         if self.data[self.j] in '-+':
             self.j = self.j + 1

         self.i = self.j
         self.read_decimal_integer()

         if self.data[self.j] in 'fFdD':
             self.j = self.j + 1

         self.i = orig_i
         return HexFloatingPoint

     def read_digits(self, digits):
         tmp_i = 0
         c = None

         while True:
             c = self.data[self.j + tmp_i]

             if c in digits:
                 self.j += 1 + tmp_i
                 tmp_i = 0
             elif c == '_':
                 tmp_i += 1
             else:
                 break

         if c in 'lL':
             self.j += 1

     def read_decimal_integer(self):
         self.j = self.i
         self.read_digits('0123456789')

     def read_hex_integer(self):
         self.j = self.i + 2
         self.read_digits('0123456789abcdefABCDEF')

     def read_bin_integer(self):
         self.j = self.i + 2
         self.read_digits('01')

     def read_octal_integer(self):
         self.j = self.i + 1
         self.read_digits('01234567')

     def read_integer_or_float(self, c, c_next):
         if c == '0' and c_next in 'xX':
             return self.read_hex_integer_or_float()
         elif c == '0' and c_next in 'bB':
             self.read_bin_integer()
             return BinaryInteger
         elif c == '0' and c_next in '01234567':
             self.read_octal_integer()
             return OctalInteger
         else:
             return self.read_decimal_float_or_integer()

     def try_separator(self):
         if self.data[self.i] in Separator.VALUES:
             self.j = self.i + 1
             return True
         return False

     def decode_data(self):
         # Encodings to try in order
         codecs = ['utf_8', 'iso-8859-1']

         # If data is already unicode don't try to redecode
         if isinstance(self.data, unicode):
             return self.data

         for codec in codecs:
             try:
                 data = self.data.decode(codec)
                 return data
             except UnicodeDecodeError:
                 pass

         self.error('Could not decode input data')

     def is_java_identifier_start(self, c):
         return unicodedata.category(c) in self.IDENT_START_CATEGORIES

     def read_identifier(self):
         self.j = self.i + 1

         while unicodedata.category(self.data[self.j]) in self.IDENT_PART_CATEGORIES:
             self.j += 1

         ident = self.data[self.i:self.j]
         if ident in Keyword.VALUES:
             token_type = Keyword

             if ident in BasicType.VALUES:
                 token_type = BasicType
             elif ident in Modifier.VALUES:
                 token_type = Modifier

         elif ident in Boolean.VALUES:
             token_type = Boolean
         elif ident == 'null':
             token_type = Null
         else:
             token_type = Identifier

         return token_type

     def pre_tokenize(self):
         new_data = list()
         data = self.decode_data()

         i = 0
         j = 0
         length = len(data)

         NONE         = 0
         ELIGIBLE     = 1
         MARKER_FOUND = 2

         state = NONE

         while j < length:
             if state == NONE:
                 j = data.find('\\', j)

                 if j == -1:
                     j = length
                     break

                 state = ELIGIBLE

             elif state == ELIGIBLE:
                 c = data[j]

                 if c == 'u':
                     state = MARKER_FOUND
                     new_data.append(data[i:j - 1])
                 else:
                     state = NONE

             elif state == MARKER_FOUND:
                 c = data[j]

                 if c != 'u':
                     try:
                         escape_code = int(data[j:j+4], 16)
                     except ValueError:
                         self.error('Invalid unicode escape', data[j:j+4])

                     new_data.append(unichr(escape_code))

                     i = j + 4
                     j = i

                     state = NONE

                     continue

             j = j + 1

         new_data.append(data[i:])

         self.data = ''.join(new_data)
         self.length = len(self.data)

     def tokenize(self):
         self.reset()

         # Convert unicode escapes
         self.pre_tokenize()

         while self.i < self.length:
             token_type = None

             c = self.data[self.i]
             c_next = None
             startswith = c

             if self.i + 1 < self.length:
                 c_next = self.data[self.i + 1]
                 startswith = c + c_next

             if c.isspace():
                 self.consume_whitespace()
                 continue

             elif startswith in ("//", "/*"):
                 if startswith == "/*" self.try_javadoc_comment():
                     self.javadoc = self.data[self.i:self.j]
                     self.i = self.j
                 else:
                     self.read_comment()
                 continue

             elif startswith == '..' and self.try_operator():
                 # Ensure we don't mistake a '...' operator as a sequence of
                 # three '.' separators. This is done as an optimization instead
                 # of moving try_operator higher in the chain because operators
                 # aren't as common and try_operator is expensive
                 token_type = Operator

             elif c == '@':
                 token_type = Annotation
                 self.j = self.i + 1

             elif c == '.' and c_next.isdigit():
                 token_type = self.read_decimal_float_or_integer()

             elif self.try_separator():
                 token_type = Separator

             elif c in ("'", '"'):
                 token_type = String
                 self.read_string()

             elif c in '0123456789':
                 token_type = self.read_integer_or_float(c, c_next)

             elif self.is_java_identifier_start(c):
                 token_type = self.read_identifier()

             elif self.try_operator():
                 token_type = Operator

             else:
                 self.error('Could not process token', c)

             position = (self.current_line, self.i - self.start_of_line)
             token = token_type(self.data[self.i:self.j], position, self.javadoc)
             yield token

             if self.javadoc:
                 self.javadoc = None

             self.i = self.j

     def error(self, message, char=None):
         # Provide additional information in the errors message
         line_start = self.data.rfind('\n', 0, self.i) + 1
         line_end = self.data.find('\n', self.i)
         line = self.data[line_start:line_end].strip()

         line_number = self.current_line

         if not char:
             char = self.data[self.j]

         message = u'%s at "%s", line %s: %s' % (message, char, line_number, line)

         raise LexerError(message)

 def tokenize(code):
     tokenizer = JavaTokenizer(code)
     return tokenizer.tokenize()

 def reformat_tokens(tokens):
     indent = 0
     closed_block = False
     ident_last = False

     output = list()

     for token in tokens:
         if closed_block:
             closed_block = False
             indent -= 4

             output.append('\n')
             output.append(' ' * indent)
             output.append('}')

             if isinstance(token, (Literal, Keyword, Identifier)):
                 output.append('\n')
                 output.append(' ' * indent)

         if token.value == '{':
             indent += 4
             output.append(' {\n')
             output.append(' ' * indent)

         elif token.value == '}':
             closed_block = True

         elif token.value == ',':
             output.append(', ')

         elif isinstance(token, (Literal, Keyword, Identifier)):
             if ident_last:
                 # If the last token was a literla/keyword/identifer put a space in between
                 output.append(' ')
             ident_last = True
             output.append(token.value)

         elif isinstance(token, Operator):
             output.append(' ' + token.value + ' ')

         elif token.value == ';':
             output.append(';\n')
             output.append(' ' * indent)

         else:
             output.append(token.value)

         ident_last = isinstance(token, (Literal, Keyword, Identifier))

     if closed_block:
         output.append('\n}')

     output.append('\n')

     return ''.join(output)

	import re
	import unicodedata

	class LexerError(Exception):
	pass

	class JavaToken(object):
	def __init__(self, value, position=None, javadoc=None):
	self.value = value
	self.position = position
	self.javadoc = javadoc

	def __repr__(self):
	if self.position:
	return '%s "%s" line %d, position %d' % (
	self.__class__.__name__, self.value, self.position[0], self.position[1]
	)
	else:
	return '%s "%s"' % (self.__class__.__name__, self.value)

	def __str__(self):
	return repr(self)

	def __eq__(self, other):
	raise Exception("Direct comparison not allowed")

	class EndOfInput(JavaToken):
	pass

	class Keyword(JavaToken):
	VALUES = set(['abstract', 'assert', 'boolean', 'break', 'byte', 'case',
	'catch', 'char', 'class', 'const', 'continue', 'default',
	'do', 'double', 'else', 'enum', 'extends', 'final',
	'finally', 'float', 'for', 'goto', 'if', 'implements',
	'import', 'instanceof', 'int', 'interface', 'long', 'native',
	'new', 'package', 'private', 'protected', 'public', 'return',
	'short', 'static', 'strictfp', 'super', 'switch',
	'synchronized', 'this', 'throw', 'throws', 'transient', 'try',
	'void', 'volatile', 'while'])


	class Modifier(Keyword):
	VALUES = set(['abstract', 'final', 'native', 'private', 'protected',
	'public', 'static', 'strictfp', 'synchronized', 'transient',
	'volatile'])

	class BasicType(Keyword):
	VALUES = set(['boolean', 'byte', 'char', 'double',
	'float', 'int', 'long', 'short'])

	class Literal(JavaToken):
	pass

	class Integer(Literal):
	pass

	class DecimalInteger(Literal):
	pass

	class OctalInteger(Integer):
	pass

	class BinaryInteger(Integer):
	pass

	class HexInteger(Integer):
	pass

	class FloatingPoint(Literal):
	pass

	class DecimalFloatingPoint(FloatingPoint):
	pass

	class HexFloatingPoint(FloatingPoint):
	pass

	class Boolean(Literal):
	VALUES = set(["true", "false"])

	class Character(Literal):
	pass

	class String(Literal):
	pass

	class Null(Literal):
	pass

	class Separator(JavaToken):
	VALUES = set(['(', ')', '{', '}', '[', ']', ';', ',', '.'])

	class Operator(JavaToken):
	MAX_LEN = 4
	VALUES = set(['>>>=', '>>=', '<<=', '%=', '^=', '\|=', '&=', '/=',
	'*=', '-=', '+=', '<<', '--', '++', '\|\|', '&&', '!=',
	'>=', '<=', '==', '%', '^', '\|', '&', '/', '*', '-',
	'+', ':', '?', '~', '!', '<', '>', '=', '...'])

	# '>>>' and '>>' are excluded so that >> becomes two tokens and >>> becomes
	# three. This is done because we can not distinguish the operators >> and
	# >>> from the closing of multipel type parameter/argument lists when
	# lexing. The job of potentially recombining these symbols is left to the
	# parser

	INFIX = set(['\|\|', '&&', '\|', '^', '&', '==', '!=', '<', '>', '<=', '>=',
	'<<', '>>', '>>>', '+', '-', '*', '/', '%'])

	PREFIX = set(['++', '--', '!', '~', '+', '-'])

	POSTFIX = set(['++', '--'])

	ASSIGNMENT = set(['=', '+=', '-=', '*=', '/=', '&=', '\|=', '^=', '%=',
	'<<=', '>>=', '>>>='])

	def is_infix(self):
	return self.value in self.INFIX

	def is_prefix(self):
	return self.value in self.PREFIX

	def is_postfix(self):
	return self.value in self.POSTFIX

	def is_assignment(self):
	return self.value in self.ASSIGNMENT

	class Annotation(JavaToken):
	pass

	class Identifier(JavaToken):
	pass

	class JavaTokenizer(object):

	IDENT_START_CATEGORIES = set(['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Pc', 'Sc'])

	IDENT_PART_CATEGORIES = set(['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mc', 'Mn', 'Nd', 'Nl', 'Pc', 'Sc'])

	def __init__(self, data):
	self.data = data

	self.current_line = 1
	self.start_of_line = 0

	self.operators = [set() for i in range(0, Operator.MAX_LEN)]

	for v in Operator.VALUES:
	self.operators[len(v) - 1].add(v)

	self.whitespace_consumer = re.compile(r'[^\s]')

	self.javadoc = None


	def reset(self):
	self.i = 0
	self.j = 0

	def consume_whitespace(self):
	match = self.whitespace_consumer.search(self.data, self.i + 1)

	if not match:
	self.i = self.length
	return

	i = match.start()

	start_of_line = self.data.rfind('\n', self.i, i)

	if start_of_line != -1:
	self.start_of_line = start_of_line
	self.current_line += self.data.count('\n', self.i, i)

	self.i = i

	def read_string(self):
	delim = self.data[self.i]

	state = 0
	j = self.i + 1
	length = self.length

	while True:
	if j >= length:
	self.error('Unterminated character/string literal')

	if state == 0:
	if self.data[j] == '\\':
	state = 1
	elif self.data[j] == delim:
	break

	elif state == 1:
	if self.data[j] in 'btnfru"\'\\':
	state = 0
	elif self.data[j] in '0123':
	state = 2
	elif self.data[j] in '01234567':
	state = 3
	else:
	self.error('Illegal escape character', self.data[j])

	elif state == 2:
	# Possibly long octal
	if self.data[j] in '01234567':
	state = 3
	elif self.data[j] == '\\':
	state = 1
	elif self.data[j] == delim:
	break

	elif state == 3:
	state = 0

	if self.data[j] == '\\':
	state = 1
	elif self.data[j] == delim:
	break

	j += 1

	self.j = j + 1

	def try_operator(self):
	for l in range(min(self.length - self.i, Operator.MAX_LEN), 0, -1):
	if self.data[self.i:self.i + l] in self.operators[l - 1]:
	self.j = self.i + l
	return True
	return False

	def read_comment(self):
	if self.data[self.i + 1] == '/':
	i = self.data.find('\n', self.i + 2)

	if i == -1:
	self.i = self.length
	return

	i += 1

	self.start_of_line = i
	self.current_line += 1
	self.i = i

	else:
	i = self.data.find('*/', self.i + 2)

	if i == -1:
	self.i = self.length
	return

	i += 2

	self.start_of_line = i
	self.current_line += self.data.count('\n', self.i, i)
	self.i = i

	def try_javadoc_comment(self):
	if self.i + 2 >= self.length or self.data[self.i + 2] != '*':
	return False

	j = self.data.find('*/', self.i + 2)

	if j == -1:
	self.j = self.length
	return False

	j += 2

	self.start_of_line = j
	self.current_line += self.data.count('\n', self.i, j)
	self.j = j

	return True

	def read_decimal_float_or_integer(self):
	orig_i = self.i
	self.j = self.i

	self.read_decimal_integer()

	if self.data[self.j] not in '.eEfFdD':
	return DecimalInteger

	if self.data[self.j] == '.':
	self.i = self.j + 1
	self.read_decimal_integer()

	if self.data[self.j] in 'eE':
	self.j = self.j + 1

	if self.data[self.j] in '-+':
	self.j = self.j + 1

	self.i = self.j
	self.read_decimal_integer()

	if self.data[self.j] in 'fFdD':
	self.j = self.j + 1

	self.i = orig_i
	return DecimalFloatingPoint

	def read_hex_integer_or_float(self):
	orig_i = self.i
	self.j = self.i + 2

	self.read_hex_integer()

	if self.data[self.j] not in '.pP':
	return HexInteger

	if self.data[self.j] == '.':
	self.j = self.j + 1
	self.read_digits('0123456789abcdefABCDEF')

	if self.data[self.j] in 'pP':
	self.j = self.j + 1
	else:
	self.error('Invalid hex float literal')

	if self.data[self.j] in '-+':
	self.j = self.j + 1

	self.i = self.j
	self.read_decimal_integer()

	if self.data[self.j] in 'fFdD':
	self.j = self.j + 1

	self.i = orig_i
	return HexFloatingPoint

	def read_digits(self, digits):
	tmp_i = 0
	c = None

	while True:
	c = self.data[self.j + tmp_i]

	if c in digits:
	self.j += 1 + tmp_i
	tmp_i = 0
	elif c == '_':
	tmp_i += 1
	else:
	break

	if c in 'lL':
	self.j += 1

	def read_decimal_integer(self):
	self.j = self.i
	self.read_digits('0123456789')

	def read_hex_integer(self):
	self.j = self.i + 2
	self.read_digits('0123456789abcdefABCDEF')

	def read_bin_integer(self):
	self.j = self.i + 2
	self.read_digits('01')

	def read_octal_integer(self):
	self.j = self.i + 1
	self.read_digits('01234567')

	def read_integer_or_float(self, c, c_next):
	if c == '0' and c_next in 'xX':
	return self.read_hex_integer_or_float()
	elif c == '0' and c_next in 'bB':
	self.read_bin_integer()
	return BinaryInteger
	elif c == '0' and c_next in '01234567':
	self.read_octal_integer()
	return OctalInteger
	else:
	return self.read_decimal_float_or_integer()

	def try_separator(self):
	if self.data[self.i] in Separator.VALUES:
	self.j = self.i + 1
	return True
	return False

	def decode_data(self):
	# Encodings to try in order
	codecs = ['utf_8', 'iso-8859-1']

	# If data is already unicode don't try to redecode
	if isinstance(self.data, unicode):
	return self.data

	for codec in codecs:
	try:
	data = self.data.decode(codec)
	return data
	except UnicodeDecodeError:
	pass

	self.error('Could not decode input data')

	def is_java_identifier_start(self, c):
	return unicodedata.category(c) in self.IDENT_START_CATEGORIES

	def read_identifier(self):
	self.j = self.i + 1

	while unicodedata.category(self.data[self.j]) in self.IDENT_PART_CATEGORIES:
	self.j += 1

	ident = self.data[self.i:self.j]
	if ident in Keyword.VALUES:
	token_type = Keyword

	if ident in BasicType.VALUES:
	token_type = BasicType
	elif ident in Modifier.VALUES:
	token_type = Modifier

	elif ident in Boolean.VALUES:
	token_type = Boolean
	elif ident == 'null':
	token_type = Null
	else:
	token_type = Identifier

	return token_type

	def pre_tokenize(self):
	new_data = list()
	data = self.decode_data()

	i = 0
	j = 0
	length = len(data)

	NONE = 0
	ELIGIBLE = 1
	MARKER_FOUND = 2

	state = NONE

	while j < length:
	if state == NONE:
	j = data.find('\\', j)

	if j == -1:
	j = length
	break

	state = ELIGIBLE

	elif state == ELIGIBLE:
	c = data[j]

	if c == 'u':
	state = MARKER_FOUND
	new_data.append(data[i:j - 1])
	else:
	state = NONE

	elif state == MARKER_FOUND:
	c = data[j]

	if c != 'u':
	try:
	escape_code = int(data[j:j+4], 16)
	except ValueError:
	self.error('Invalid unicode escape', data[j:j+4])

	new_data.append(unichr(escape_code))

	i = j + 4
	j = i

	state = NONE

	continue

	j = j + 1

	new_data.append(data[i:])

	self.data = ''.join(new_data)
	self.length = len(self.data)

	def tokenize(self):
	self.reset()

	# Convert unicode escapes
	self.pre_tokenize()

	while self.i < self.length:
	token_type = None

	c = self.data[self.i]
	c_next = None
	startswith = c

	if self.i + 1 < self.length:
	c_next = self.data[self.i + 1]
	startswith = c + c_next

	if c.isspace():
	self.consume_whitespace()
	continue

	elif startswith in ("//", "/*"):
	if startswith == "/*" self.try_javadoc_comment():
	self.javadoc = self.data[self.i:self.j]
	self.i = self.j
	else:
	self.read_comment()
	continue

	elif startswith == '..' and self.try_operator():
	# Ensure we don't mistake a '...' operator as a sequence of
	# three '.' separators. This is done as an optimization instead
	# of moving try_operator higher in the chain because operators
	# aren't as common and try_operator is expensive
	token_type = Operator

	elif c == '@':
	token_type = Annotation
	self.j = self.i + 1

	elif c == '.' and c_next.isdigit():
	token_type = self.read_decimal_float_or_integer()

	elif self.try_separator():
	token_type = Separator

	elif c in ("'", '"'):
	token_type = String
	self.read_string()

	elif c in '0123456789':
	token_type = self.read_integer_or_float(c, c_next)

	elif self.is_java_identifier_start(c):
	token_type = self.read_identifier()

	elif self.try_operator():
	token_type = Operator

	else:
	self.error('Could not process token', c)

	position = (self.current_line, self.i - self.start_of_line)
	token = token_type(self.data[self.i:self.j], position, self.javadoc)
	yield token

	if self.javadoc:
	self.javadoc = None

	self.i = self.j

	def error(self, message, char=None):
	# Provide additional information in the errors message
	line_start = self.data.rfind('\n', 0, self.i) + 1
	line_end = self.data.find('\n', self.i)
	line = self.data[line_start:line_end].strip()

	line_number = self.current_line

	if not char:
	char = self.data[self.j]

	message = u'%s at "%s", line %s: %s' % (message, char, line_number, line)

	raise LexerError(message)

	def tokenize(code):
	tokenizer = JavaTokenizer(code)
	return tokenizer.tokenize()

	def reformat_tokens(tokens):
	indent = 0
	closed_block = False
	ident_last = False

	output = list()

	for token in tokens:
	if closed_block:
	closed_block = False
	indent -= 4

	output.append('\n')
	output.append(' ' * indent)
	output.append('}')

	if isinstance(token, (Literal, Keyword, Identifier)):
	output.append('\n')
	output.append(' ' * indent)

	if token.value == '{':
	indent += 4
	output.append(' {\n')
	output.append(' ' * indent)

	elif token.value == '}':
	closed_block = True

	elif token.value == ',':
	output.append(', ')

	elif isinstance(token, (Literal, Keyword, Identifier)):
	if ident_last:
	# If the last token was a literla/keyword/identifer put a space in between
	output.append(' ')
	ident_last = True
	output.append(token.value)

	elif isinstance(token, Operator):
	output.append(' ' + token.value + ' ')

	elif token.value == ';':
	output.append(';\n')
	output.append(' ' * indent)

	else:
	output.append(token.value)

	ident_last = isinstance(token, (Literal, Keyword, Identifier))

	if closed_block:
	output.append('\n}')

	output.append('\n')

	return ''.join(output)