#!/usr/bin/env python
#
# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""A simple tokenizer used for the Full Text Search API stub."""

import re

from google.appengine.datastore import document_pb
from google.appengine.api.search.stub import tokens


_WORD_SEPARATORS = [
    r'!', r'\"', r'%', r'\(', r'\)', r'\*', r',', r'\.', r'/', r'\:', r'=',
    r'>', r'\?', r'@', r'\[', r'\\', r'\]', r'\^', r'\`', r'\{', r'\|', r'\}',
    r'~', r'\t', r'\n', r'\f', r'\r', r' ', r'&', r'#', r'$', r';']

_WORD_SEPARATOR_RE = re.compile('|'.join(_WORD_SEPARATORS))


def _StripSeparators(value):
  """Remove special characters and collapse spaces."""
  return re.sub(r' [ ]*', ' ', re.sub(_WORD_SEPARATOR_RE, ' ', value))


def NormalizeString(value):
  """Lowers case, removes punctuation and collapses whitespace."""
  return _StripSeparators(value).lower().strip()
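

# Illustrative examples, added for clarity and not part of the original stub;
# they follow from the separator list and helpers defined above:
#   NormalizeString('Hello, World!')  -> 'hello world'
#   NormalizeString('a  b\tc')        -> 'a b c'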


class SimpleTokenizer(object):
  """A tokenizer which converts text to a normalized stream of tokens.

  Text normalization lowers case, removes punctuation and splits on whitespace.
  """

  def __init__(self, split_restricts=True, preserve_case=False):
    self._split_restricts = split_restricts
    self._preserve_case = preserve_case
    self._html_pattern = re.compile(r'<[^>]*>')

  def SetCase(self, value):
    # Lowercase unless this tokenizer preserves case; the hasattr guard keeps
    # the method safe on instances that lack the _preserve_case attribute.
    if hasattr(self, '_preserve_case') and self._preserve_case:
      return value
    else:
      return value.lower()

  def TokenizeText(self, text, token_position=0,
                   input_field_type=document_pb.FieldValue.TEXT):
    """Tokenizes the text into a sequence of Tokens."""
    return self._TokenizeForType(field_type=input_field_type,
                                 value=text, token_position=token_position)

  def TokenizeValue(self, field_value, token_position=0):
    """Tokenizes a document_pb.FieldValue into a sequence of Tokens."""
    if field_value.type() == document_pb.FieldValue.GEO:
      return self._TokenizeForType(field_type=field_value.type(),
                                   value=field_value.geo(),
                                   token_position=token_position)
    return self._TokenizeForType(field_type=field_value.type(),
                                 value=field_value.string_value(),
                                 token_position=token_position)
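
  # Note, added for clarity and not part of the original stub: a GEO
  # FieldValue is handed to _TokenizeForType as its geo() submessage, so it
  # yields a single tokens.GeoPoint built from lat()/lng(); every other field
  # type is tokenized from its string_value().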

  def _TokenizeString(self, value, field_type):
    """Splits value on separators, or returns it whole for ATOM fields."""
    value = self.SetCase(value)
    if field_type != document_pb.FieldValue.ATOM:
      if field_type == document_pb.FieldValue.HTML:
        value = self._StripHtmlTags(value)
      value = _StripSeparators(value)
      return value.split()
    else:
      return [value]

  def _StripHtmlTags(self, value):
    """Replace HTML tags with spaces."""
    return self._html_pattern.sub(' ', value)
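
  # Illustrative examples, added for clarity and not part of the original stub
  # (they assume the separator list and HTML pattern defined above):
  #   _TokenizeString('Foo, Bar:Baz', document_pb.FieldValue.TEXT)
  #       -> ['foo', 'bar', 'baz']
  #   _TokenizeString('Foo, Bar:Baz', document_pb.FieldValue.ATOM)
  #       -> ['foo, bar:baz']
  #   _StripHtmlTags('<b>Bold</b> text')  -> ' Bold  text'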

  def _TokenizeForType(self, field_type, value, token_position=0):
    """Tokenizes value into a sequence of Tokens."""
    if field_type == document_pb.FieldValue.NUMBER:
      return [tokens.Token(chars=value, position=token_position)]
    if field_type == document_pb.FieldValue.GEO:
      return [tokens.GeoPoint(latitude=value.lat(), longitude=value.lng(),
                              position=token_position)]

    tokens_found = []
    token_strings = []
    if not self._split_restricts:
      token_strings = self.SetCase(value).split()
    else:
      token_strings = self._TokenizeString(value, field_type)
    for token in token_strings:
      if ':' in token and self._split_restricts:
        # Break a field:value restrict into one token per component.
        for subtoken in token.split(':'):
          tokens_found.append(
              tokens.Token(chars=subtoken, position=token_position))
          token_position += 1
      elif '"' in token:
        # A leading or trailing quote produces an empty split, which becomes a
        # Quote token; nonempty pieces become ordinary tokens.
        for subtoken in token.split('"'):
          if not subtoken:
            tokens_found.append(
                tokens.Quote(chars='"', position=token_position))
          else:
            tokens_found.append(
                tokens.Token(chars=subtoken, position=token_position))
          token_position += 1
      else:
        tokens_found.append(tokens.Token(chars=token, position=token_position))
        token_position += 1
    return tokens_found
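

if __name__ == '__main__':
  # Illustrative usage sketch, not part of the original stub. It assumes the
  # App Engine SDK is on sys.path (so the document_pb and tokens imports above
  # resolve) and that tokens.Token exposes chars/position properties, as the
  # SDK stub does.
  print NormalizeString('Hello, World!')   # -> hello world
  for tok in SimpleTokenizer().TokenizeText('Hello, World!'):
    print tok.chars, tok.position          # -> hello 0, then world 1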