blob: ab77f2ed3f1ba0d5eee0a9c3da22a92400c70117 [file] [log] [blame]
// Copyright 2014 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COMPONENTS_QUERY_PARSER_QUERY_PARSER_H_
#define COMPONENTS_QUERY_PARSER_QUERY_PARSER_H_
#include <stddef.h>
#include <memory>
#include <string>
#include <vector>
#include "components/query_parser/snippet.h"
namespace query_parser {
class QueryNodeList;
// Used by HasMatchIn.
struct QueryWord {
// The word to match against.
std::u16string word;
// The starting position of the word in the original text.
size_t position;
};
enum class MatchingAlgorithm {
// Only words long enough are considered for prefix search. Shorter words are
// considered for exact matches.
DEFAULT,
// All words are considered for a prefix search.
ALWAYS_PREFIX_SEARCH,
kMaxValue = ALWAYS_PREFIX_SEARCH,
};
using QueryWordVector = std::vector<query_parser::QueryWord>;
// `QueryNode` is used by `QueryParser` to represent the elements that
// constitute a query. While `QueryNode` is exposed by way of `ParseQuery`, it
// really isn't meant for external usage.
class QueryNode {
public:
virtual ~QueryNode() {}
// Serialize ourselves out to a string that can be passed to SQLite. Returns
// the number of words in this node.
virtual int AppendToSQLiteQuery(std::u16string* query) const = 0;
// Return true if this is a `QueryNodeWord`, false if it's a `QueryNodeList`.
virtual bool IsWord() const = 0;
// Returns true if this node matches `word`. If `exact` is true, the string
// must exactly match. Otherwise, this uses a starts-with comparison.
virtual bool Matches(const std::u16string& word, bool exact) const = 0;
// Returns true if this node matches at least one of the words in `words`. An
// entry is added to `match_positions` for all matching words giving the
// matching regions. Uses a starts-with comparison.
virtual bool HasMatchIn(const QueryWordVector& words,
Snippet::MatchPositions* match_positions) const = 0;
// Returns true if this node matches at least one of the words in `words`.
// If `exact` is true, at least one of the words must be an exact match.
virtual bool HasMatchIn(const QueryWordVector& words, bool exact) const = 0;
// Appends the words that make up this node in `words`.
virtual void AppendWords(std::vector<std::u16string>* words) const = 0;
};
using QueryNodeVector = std::vector<std::unique_ptr<query_parser::QueryNode>>;
// This class is used to parse queries entered into the history search into more
// normalized queries that can be passed to the SQLite backend.
class QueryParser {
public:
QueryParser() = delete;
QueryParser(const QueryParser&) = delete;
QueryParser& operator=(const QueryParser&) = delete;
~QueryParser() = delete;
// For CJK ideographs and Korean Hangul, even a single character
// can be useful in prefix matching, but that may give us too many
// false positives. Moreover, the current ICU word breaker gives us
// back every single Chinese character as a word so that there's no
// point doing anything for them and we only adjust the minimum length
// to 2 for Korean Hangul while using 3 for others. This is a temporary
// hack until we have a segmentation support.
static bool IsWordLongEnoughForPrefixSearch(
const std::u16string& word,
MatchingAlgorithm matching_algorithm);
// Parse a query into a SQLite query. The resulting query is placed in
// |sqlite_query| and the number of words is returned.
static int ParseQuery(const std::u16string& query,
MatchingAlgorithm matching_algorithm,
std::u16string* sqlite_query);
// Parses |query|, returning the words that make up it. Any words in quotes
// are put in |words| without the quotes. For example, the query text
// "foo bar" results in two entries being added to words, one for foo and one
// for bar.
static void ParseQueryWords(const std::u16string& query,
MatchingAlgorithm matching_algorithm,
std::vector<std::u16string>* words);
// Parses |query|, returning the nodes that constitute the valid words in the
// query. This is intended for later usage with DoesQueryMatch. Ownership of
// the nodes passes to the caller.
static void ParseQueryNodes(const std::u16string& query,
MatchingAlgorithm matching_algorithm,
QueryNodeVector* nodes);
// Returns true if all of the |find_nodes| are found in |find_in_text|.
// |find_nodes| should have been created by calling |ParseQuery()|. If all
// nodes were successfully found, each of the matching positions in the text
// is added to |match_positions|.
static bool DoesQueryMatch(const std::u16string& find_in_text,
const QueryNodeVector& find_nodes,
Snippet::MatchPositions* match_positions);
// Returns true if all of the |find_nodes| are found in |find_in_words|.
// |find_nodes| should have been created by calling |ParseQuery()|.
// If |exact| is set to true, only exact matches are considered matches.
static bool DoesQueryMatch(const QueryWordVector& find_in_words,
const QueryNodeVector& find_nodes,
bool exact = false);
// Extracts the words from |text|, placing each word into |words|.
// |text| must already be lowercased by the caller, as otherwise the output
// will NEVER match anything.
static void ExtractQueryWords(const std::u16string& text,
QueryWordVector* words);
// Sorts the match positions in |matches| by their first index, then
// coalesces any match positions that intersect each other.
static void SortAndCoalesceMatchPositions(Snippet::MatchPositions* matches);
private:
// Does the work of parsing |query|; creates nodes in |root| as appropriate.
// This is invoked from both of the ParseQuery methods.
static bool ParseQueryImpl(const std::u16string& query,
MatchingAlgorithm matching_algorithm,
QueryNodeList* root);
};
} // namespace query_parser
#endif // COMPONENTS_QUERY_PARSER_QUERY_PARSER_H_