blob: 6c7b617d1e05008434bfd5f66a78de11b12347fa [file] [log] [blame]
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file or at
#
"""A set of functions that integrate the GAE search index with Monorail."""
import collections
import datetime
import logging
import re
from services import fulltext_helpers
import time
from proto import ast_pb2
from proto import tracker_pb2
# TODO(jrobbins): Consider re-implementing this whole file by using a
# BNF syntax specification and a parser generator or library.
# encodings
UTF8 = 'utf-8'

# Field types and operators
BOOL = tracker_pb2.FieldTypes.BOOL_TYPE
DATE = tracker_pb2.FieldTypes.DATE_TYPE
NUM = tracker_pb2.FieldTypes.INT_TYPE
TXT = tracker_pb2.FieldTypes.STR_TYPE

EQ = ast_pb2.QueryOp.EQ
NE = ast_pb2.QueryOp.NE
LT = ast_pb2.QueryOp.LT
GT = ast_pb2.QueryOp.GT
LE = ast_pb2.QueryOp.LE
GE = ast_pb2.QueryOp.GE
TEXT_HAS = ast_pb2.QueryOp.TEXT_HAS
NOT_TEXT_HAS = ast_pb2.QueryOp.NOT_TEXT_HAS
KEY_HAS = ast_pb2.QueryOp.KEY_HAS
IS_DEFINED = ast_pb2.QueryOp.IS_DEFINED
IS_NOT_DEFINED = ast_pb2.QueryOp.IS_NOT_DEFINED

# Mapping from user query comparison operators to our internal representation.
OPS = {
    ':': TEXT_HAS,
    '=': EQ,
    '!=': NE,
    '<': LT,
    '>': GT,
    '<=': LE,
    '>=': GE,
    }

# This is a partial regular expression that matches all of our comparison
# operators, such as =, !=, >, and <. Longer ones listed first so that the
# shorter ones don't cause premature matches.
OPS_PATTERN = '|'.join(
    map(re.escape, sorted(OPS.keys(), key=lambda op: -len(op))))
# This RE extracts search terms from a subquery string.
TERM_RE = re.compile(
r'(-?"[^"]+")|' # E.g., ["division by zero"]
r'(\S+(%s)[^ "]+)|' # E.g., [stars>10]
r'(\w+(%s)"[^"]+")|' # E.g., [summary:"memory leak"]
r'(-?[._\*\w][-._\*\w]+)' # E.g., [-workaround]
# This RE is used to further decompose a comparison term into prefix, op, and
# value. E.g., [stars>10] or [is:open] or [summary:"memory leak"]. The prefix
# can include a leading "-" to negate the comparison.
OP_RE = re.compile(
r'(?P<value>([-,.@>/_\*\w]+|"[^"]+"))$' %
# Predefined issue fields passed to the query parser.
_ISSUE_FIELDS_LIST = [
    (ast_pb2.ANY_FIELD, TXT),
    ('attachment', TXT),  # attachment file names
    ('attachments', NUM),  # number of attachment files
    ('blocked', BOOL),
    ('blockedon', TXT),
    ('blockedon_id', NUM),
    ('blocking', TXT),
    ('blocking_id', NUM),
    ('cc', TXT),
    ('cc_id', NUM),
    ('comment', TXT),
    ('commentby', TXT),
    ('commentby_id', NUM),
    ('component', TXT),
    ('component_id', NUM),
    ('description', TXT),
    ('id', NUM),
    ('label', TXT),
    ('label_id', NUM),
    ('mergedinto', NUM),
    ('open', BOOL),
    ('owner', TXT),
    ('owner_id', NUM),
    ('project', TXT),
    ('reporter', TXT),
    ('reporter_id', NUM),
    ('spam', BOOL),
    ('stars', NUM),
    ('starredby', TXT),
    ('starredby_id', NUM),
    ('status', TXT),
    ('status_id', NUM),
    ('summary', TXT),
    ]

# Date-valued fields.  These can also be used with the old codesite
# "-after"/"-before" suffixes handled in _ParseCond.
# NOTE(review): reconstructed list — verify against project history.
_DATE_FIELDS = (
    'closed',
    'modified',
    'opened',
    'ownermodified',
    'ownerlastvisit',
    'statusmodified',
    'componentmodified',
    )

# Add all _DATE_FIELDS to _ISSUE_FIELDS_LIST.
_ISSUE_FIELDS_LIST.extend((date_field, DATE) for date_field in _DATE_FIELDS)

# Maps the old codesite date-field suffixes to comparison operators.
_DATE_FIELD_SUFFIX_TO_OP = {
    '-after': '>',
    '-before': '<',
    }

# All built-in fields, as FieldDef protos keyed by field name.
BUILTIN_ISSUE_FIELDS = {
    f_name: tracker_pb2.FieldDef(field_name=f_name, field_type=f_type)
    for f_name, f_type in _ISSUE_FIELDS_LIST}
def ParseUserQuery(
    query, scope, builtin_fields, harmonized_config, warnings=None):
  """Parse a user query and return a set of structure terms.

  Args:
    query: string with user's query. E.g., 'Priority=High'.
    scope: string search terms that define the scope in which the
        query should be executed. They are expressed in the same
        user query language. E.g., adding the canned query.
    builtin_fields: dict {field_name: FieldDef(field_name, type)}
        mapping field names to FieldDef objects for built-in fields.
    harmonized_config: config for all the projects being searched.
        @@@ custom field name is not unique in cross project search.
         - custom_fields = {field_name: [fd, ...]}
         - query build needs to OR each possible interpretation
         - could be label in one project and field in another project.
        @@@ what about searching across all projects?
    warnings: optional list to accumulate warning messages.

  Returns:
    A QueryAST with conjunctions (usually just one), where each has a list of
    Condition PBs with op, fields, str_values and int_values. E.g., the query
    [priority=high leak OR stars>100] over open issues would return
    QueryAST(
        Conjunction(Condition(EQ, [open_fd], [], [1]),
                    Condition(EQ, [label_fd], ['priority-high'], []),
                    Condition(TEXT_HAS, any_field_fd, ['leak'], [])),
        Conjunction(Condition(EQ, [open_fd], [], [1]),
                    Condition(GT, [stars_fd], [], [100])))

  Raises:
    InvalidQueryError: If a problem was detected in the user's query.
  """
  if warnings is None:
    warnings = []
  if _HasParens(query):
    warnings.append('Parentheses are ignored in user queries.')
  if _HasParens(scope):
    warnings.append('Parentheses are ignored in saved queries.')

  # Convert the overall query into one or more OR'd subqueries.
  subqueries = query.split(' OR ')
  if len(subqueries) > 1:  # TODO(jrobbins): temporary limitation just for now.
    raise InvalidQueryError('Logical operator OR is not supported yet.')

  # Make a dictionary of all fields: built-in + custom in each project.
  combined_fields = collections.defaultdict(
      list, {field_name: [field_def]
             for field_name, field_def in builtin_fields.iteritems()})
  for fd in harmonized_config.field_defs:
    if fd.field_type != tracker_pb2.FieldTypes.ENUM_TYPE:
      # Only do non-enum fields because enums are stored as labels
      combined_fields[fd.field_name.lower()].append(fd)

  conjunctions = [
      _ParseConjunction(sq, scope, combined_fields, warnings)
      for sq in subqueries]
  return ast_pb2.QueryAST(conjunctions=conjunctions)
def _HasParens(s):
"""Return True if there are parentheses in the given string."""
# Monorail cannot handle parenthesized expressions, so we tell the
# user that immediately. Even inside a quoted string, the GAE search
# engine will not handle parens in TEXT-type fields.
return '(' in s or ')' in s
def _ParseConjunction(subquery, scope, fields, warnings):
"""Parse part of a user query into a Conjunction PB."""'Parsing sub query: %r in scope %r', subquery, scope)
scoped_query = ('%s %s' % (scope, subquery)).lower()
cond_strs = _ExtractConds(scoped_query)
conds = [_ParseCond(cond_str, fields, warnings) for cond_str in cond_strs]
return ast_pb2.Conjunction(conds=conds)
def _ParseCond(cond_str, fields, warnings):
"""Parse one user query condition string into a Condition PB."""
op_match = OP_RE.match(cond_str)
# Do not treat as key:value search terms if any of the special prefixes match.
special_prefixes_match = any(
cond_str.startswith(p) for p in fulltext_helpers.NON_OP_PREFIXES)
if op_match and not special_prefixes_match:
prefix ='prefix')
op ='op')
val ='value')
# Special case handling to continue to support old date query terms from
# codesite. See monorail:151 for more details.
if prefix.startswith(_DATE_FIELDS):
for date_suffix in _DATE_FIELD_SUFFIX_TO_OP:
if prefix.endswith(date_suffix):
prefix = prefix.rstrip(date_suffix)
op = _DATE_FIELD_SUFFIX_TO_OP[date_suffix]
return _ParseStructuredTerm(prefix, op, val, fields)
# Treat the cond as a full-text search term, which might be negated.
if cond_str.startswith('-'):
cond_str = cond_str[1:]
# Flag a potential user misunderstanding.
if cond_str.lower() in ('and', 'or', 'not'):
'The only supported boolean operator is OR (all capitals).')
return ast_pb2.MakeCond(
op, [BUILTIN_ISSUE_FIELDS[ast_pb2.ANY_FIELD]], [cond_str], [])
def _ParseStructuredTerm(prefix, op_str, value, fields):
  """Parse one user structured query term into an internal representation.

  Args:
    prefix: The query operator, usually a field name. E.g., summary. It can
        also be special operators like "is" to test boolean fields.
    op_str: the comparison operator. Usually ":" or "=", but can be any OPS.
    value: the value to compare against, e.g., term to find in that field.
    fields: dict {name_lower: [FieldDef, ...]} for built-in and custom fields.

  Returns:
    A Condition PB.
  """
  unquoted_value = value.strip('"')
  # Quick-OR is a convenient way to write one condition that matches any one of
  # multiple values, like set membership. E.g., [Priority=High,Critical].
  quick_or_vals = [v.strip() for v in unquoted_value.split(',')]

  # [is:open] and [-is:open] (and blocked/spam) test boolean fields.
  if ((prefix == 'is' or prefix == '-is') and
      unquoted_value in ['open', 'blocked', 'spam']):
    return ast_pb2.MakeCond(
        EQ, fields[unquoted_value], [], [int(prefix == 'is')])

  op = OPS[op_str]
  negate = False
  if prefix.startswith('-'):
    negate = True
    if op == EQ:
      op = NE
    elif op == TEXT_HAS:
      op = NOT_TEXT_HAS
    prefix = prefix[1:]

  # Search entries with or without any value in the specified field.
  if prefix == 'has':
    op = IS_NOT_DEFINED if negate else IS_DEFINED
    if unquoted_value in fields:  # Look for that field with any value.
      return ast_pb2.MakeCond(op, fields[unquoted_value], [], [])
    else:  # Look for any label with that prefix.
      return ast_pb2.MakeCond(op, fields['label'], [unquoted_value], [])

  if prefix in fields:  # search built-in and custom fields. E.g., summary.
    # Note: if first matching field is date-type, we assume they all are.
    # TODO(jrobbins): better handling for rare case where multiple projects
    # define the same custom field name, and one is a date and another is not.
    first_field = fields[prefix][0]
    if first_field.field_type == DATE:
      date_value = _ParseDateValue(unquoted_value)
      return ast_pb2.MakeCond(op, fields[prefix], [], [date_value])
    # Collect the integer interpretations of the values; non-numeric values
    # remain available only as strings.
    quick_or_ints = []
    for qov in quick_or_vals:
      try:
        quick_or_ints.append(int(qov))
      except ValueError:
        pass
    return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals, quick_or_ints)

  # Since it is not a field, treat it as labels, E.g., Priority.
  quick_or_labels = ['%s-%s' % (prefix, v) for v in quick_or_vals]
  # Convert substring match to key-value match if user typed 'foo:bar'.
  if op == TEXT_HAS:
    op = KEY_HAS
  return ast_pb2.MakeCond(op, fields['label'], quick_or_labels, [])
def _ExtractConds(query):
"""Parse a query string into a list of individual condition strings.
query: UTF-8 encoded search query string.
A list of query condition strings.
# Convert to unicode then search for distinct terms.
term_matches = TERM_RE.findall(query)
terms = []
for (phrase, word_label, _op1, phrase_label, _op2,
word) in term_matches:
# Case 1: Quoted phrases, e.g., ["hot dog"].
if phrase_label or phrase:
terms.append(phrase_label or phrase)
# Case 2: Comparisons
elif word_label:
special_prefixes_match = any(
word_label.startswith(p) for p in fulltext_helpers.NON_OP_PREFIXES)
match = OP_RE.match(word_label)
if match:
label ='prefix')
op ='op')
word ='value')
if special_prefixes_match:
# Do not include quotes if any of the special prefixes match because
# we do not want to treat the label as key:value search terms.
terms.append('%s%s%s' % (label, op, word))
terms.append('%s%s"%s"' % (label, op, word))
# It looked like a key:value cond, but not exactly, so treat it
# as fulltext search. It is probably a tiny bit of source code.
terms.append('"%s"' % word_label)
# Case 3: Simple words.
elif word:
logging.warn('Unexpected search term in %r', query)
return terms
def _ParseDateValue(val):
"""Convert the user-entered date into timestamp."""
# Support timestamp value such as opened>1437671476
return int(val)
except ValueError:
# TODO(jrobbins): future: take timezones into account.
# TODO(jrobbins): for now, explain to users that "today" is
# actually now: the current time, not 12:01am in their timezone.
# In fact, it is not very useful because everything in the system
# happened before the current time.
if val == 'today':
return _CalculatePastDate(0)
elif val.startswith('today-'):
days_ago = int(val.split('-')[1])
except ValueError:
days_ago = 0
return _CalculatePastDate(days_ago)
if '/' in val:
year, month, day = [int(x) for x in val.split('/')]
elif '-' in val:
year, month, day = [int(x) for x in val.split('-')]
return int(time.mktime(datetime.datetime(year, month, day).timetuple()))
except ValueError:
raise InvalidQueryError('Could not parse date')
def _CalculatePastDate(days_ago, now=None):
"""Calculates the timestamp N days ago from now."""
if now is None:
now = int(time.time())
ts = now - days_ago * 24 * 60 * 60
return ts
def CheckSyntax(query, harmonized_config, warnings=None):
  """Parse the given query and report the first error or None.

  Args:
    query: string with the user's query to validate.
    harmonized_config: config for all the projects being searched.
    warnings: optional list to accumulate warning messages.

  Returns:
    The error message string for the first problem found, or None if the
    query parsed cleanly.
  """
  try:
    ParseUserQuery(
        query, '', BUILTIN_ISSUE_FIELDS, harmonized_config, warnings=warnings)
  except InvalidQueryError as e:
    return e.message
  return None
class Error(Exception):
  """Base of the exception hierarchy for this package."""
class InvalidQueryError(Error):
  """Raised when the user's query cannot be parsed."""