blob: 385bd0ee2760bcbf91991f38ed3f7ce8b061fc21 [file] [log] [blame]
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "ui/app_list/search/tokenized_string_match.h"
#include <stddef.h>
#include <cmath>
#include "base/i18n/string_search.h"
#include "base/logging.h"
#include "base/macros.h"
#include "ui/app_list/search/tokenized_string_char_iterator.h"
namespace app_list {
namespace {
// The factors below are applied when the current char of query matches
// the current char of the text to be matched. Different factors are chosen
// based on where the match happens. kIsPrefixMultiplier is used when the
// matched portion is a prefix of both the query and the text, which implies
// that the matched chars are at the same position in query and text. This is
// the most preferred case thus it has the highest score. When the current char
// of the query and the text does not match, the algorithm moves to the next
// token in the text and try to match from there. kIsFrontOfWordMultipler will
// be used if the first char of the token matches the current char of the query.
// Otherwise, the match is considered as weak and kIsWeakHitMultiplier is
// used.
// Examples:
// Suppose the text to be matched is 'Google Chrome'.
// Query 'go' would yield kIsPrefixMultiplier for each char.
// Query 'gc' would use kIsPrefixMultiplier for 'g' and
// kIsFrontOfWordMultipler for 'c'.
// Query 'ch' would use kIsFrontOfWordMultipler for 'c' and
// kIsWeakHitMultiplier for 'h'.
// Query 'oo' does not match any prefix and would use the substring match
// fallback, thus kIsSubstringMultiplier is used for each char.
const double kIsPrefixMultiplier = 1.0;
const double kIsFrontOfWordMultipler = 0.8;
const double kIsWeakHitMultiplier = 0.6;
const double kIsSubstringMultiplier = 0.4;
// A relevance score that represents no match.
const double kNoMatchScore = 0.0;
// PrefixMatcher matches the chars of a given query as prefix of tokens in
// a given text or as prefix of the acronyms of those text tokens.
class PrefixMatcher {
public:
PrefixMatcher(const TokenizedString& query,
const TokenizedString& text)
: query_iter_(query),
text_iter_(text),
current_match_(gfx::Range::InvalidRange()),
current_relevance_(kNoMatchScore) {
}
// Invokes RunMatch to perform prefix match. Use |states_| as a stack to
// perform DFS (depth first search) so that all possible matches are
// attempted. Stops on the first full match and returns true. Otherwise,
// returns false to indicate no match.
bool Match() {
while (!RunMatch()) {
// No match found and no more states to try. Bail out.
if (states_.empty()) {
current_relevance_ = kNoMatchScore;
current_hits_.clear();
return false;
}
PopState();
// Skip restored match to try other possibilites.
AdvanceToNextTextToken();
}
if (current_match_.IsValid())
current_hits_.push_back(current_match_);
return true;
}
double relevance() const { return current_relevance_; }
const TokenizedStringMatch::Hits& hits() const { return current_hits_; }
private:
// Context record of a match.
struct State {
State() : relevance(kNoMatchScore) {}
State(double relevance,
const gfx::Range& current_match,
const TokenizedStringMatch::Hits& hits,
const TokenizedStringCharIterator& query_iter,
const TokenizedStringCharIterator& text_iter)
: relevance(relevance),
current_match(current_match),
hits(hits.begin(), hits.end()),
query_iter_state(query_iter.GetState()),
text_iter_state(text_iter.GetState()) {}
// The current score of the processed query chars.
double relevance;
// Current matching range.
gfx::Range current_match;
// Completed matching ranges of the processed query chars.
TokenizedStringMatch::Hits hits;
// States of the processed query and text chars.
TokenizedStringCharIterator::State query_iter_state;
TokenizedStringCharIterator::State text_iter_state;
};
typedef std::vector<State> States;
// Match chars from the query and text one by one. For each matching char,
// calculate relevance and matching ranges. And the current stats is
// recorded so that the match could be skipped later to try other
// possiblities. Repeat until any of the iterators run out. Return true if
// query iterator runs out, i.e. all chars in query are matched.
bool RunMatch() {
while (!query_iter_.end() && !text_iter_.end()) {
if (query_iter_.Get() == text_iter_.Get()) {
PushState();
if (query_iter_.GetArrayPos() == text_iter_.GetArrayPos())
current_relevance_ += kIsPrefixMultiplier;
else if (text_iter_.IsFirstCharOfToken())
current_relevance_ += kIsFrontOfWordMultipler;
else
current_relevance_ += kIsWeakHitMultiplier;
if (!current_match_.IsValid())
current_match_.set_start(text_iter_.GetArrayPos());
current_match_.set_end(text_iter_.GetArrayPos() +
text_iter_.GetCharSize());
query_iter_.NextChar();
text_iter_.NextChar();
} else {
AdvanceToNextTextToken();
}
}
return query_iter_.end();
}
// Skip to the next text token and close current match. Invoked when a
// mismatch happens or to skip a restored match.
void AdvanceToNextTextToken() {
if (current_match_.IsValid()) {
current_hits_.push_back(current_match_);
current_match_ = gfx::Range::InvalidRange();
}
text_iter_.NextToken();
}
void PushState() {
states_.push_back(State(current_relevance_,
current_match_,
current_hits_,
query_iter_,
text_iter_));
}
void PopState() {
DCHECK(!states_.empty());
State& last_match = states_.back();
current_relevance_ = last_match.relevance;
current_match_ = last_match.current_match;
current_hits_.swap(last_match.hits);
query_iter_.SetState(last_match.query_iter_state);
text_iter_.SetState(last_match.text_iter_state);
states_.pop_back();
}
TokenizedStringCharIterator query_iter_;
TokenizedStringCharIterator text_iter_;
States states_;
gfx::Range current_match_;
double current_relevance_;
TokenizedStringMatch::Hits current_hits_;
DISALLOW_COPY_AND_ASSIGN(PrefixMatcher);
};
} // namespace
TokenizedStringMatch::TokenizedStringMatch()
: relevance_(kNoMatchScore) {}
TokenizedStringMatch::~TokenizedStringMatch() {}
bool TokenizedStringMatch::Calculate(const TokenizedString& query,
const TokenizedString& text) {
relevance_ = kNoMatchScore;
hits_.clear();
PrefixMatcher matcher(query, text);
if (matcher.Match()) {
relevance_ = matcher.relevance();
hits_.assign(matcher.hits().begin(), matcher.hits().end());
}
// Substring match as a fallback.
if (relevance_ == kNoMatchScore) {
size_t substr_match_start = 0;
size_t substr_match_length = 0;
if (base::i18n::StringSearchIgnoringCaseAndAccents(query.text(),
text.text(),
&substr_match_start,
&substr_match_length)) {
relevance_ = kIsSubstringMultiplier * substr_match_length;
hits_.push_back(gfx::Range(substr_match_start,
substr_match_start + substr_match_length));
}
}
// Temper the relevance score with an exponential curve. Each point of
// relevance (roughly, each keystroke) is worth less than the last. This means
// that typing a few characters of a word is enough to promote matches very
// high, with any subsequent characters being worth comparatively less.
// TODO(mgiuca): This doesn't really play well with Omnibox results, since as
// you type more characters, the app/omnibox results tend to jump over each
// other.
relevance_ = 1.0 - std::pow(0.5, relevance_);
return relevance_ > kNoMatchScore;
}
bool TokenizedStringMatch::Calculate(const base::string16& query,
const base::string16& text) {
const TokenizedString tokenized_query(query);
const TokenizedString tokenized_text(text);
return Calculate(tokenized_query, tokenized_text);
}
} // namespace app_list