blob: cc11ec6f56c4807885e8aa866fca95d80f6fb2da [file] [log] [blame] [edit]
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/omnibox/browser/bookmark_provider.h"
#include <algorithm>
#include <functional>
#include <vector>
#include "base/cxx17_backports.h"
#include "base/feature_list.h"
#include "base/macros.h"
#include "base/metrics/field_trial_params.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/trace_event/trace_event.h"
#include "components/bookmarks/browser/bookmark_model.h"
#include "components/omnibox/browser/autocomplete_provider_client.h"
#include "components/omnibox/browser/autocomplete_result.h"
#include "components/omnibox/browser/omnibox_field_trial.h"
#include "components/omnibox/browser/titled_url_match_utils.h"
#include "components/omnibox/common/omnibox_features.h"
#include "components/prefs/pref_service.h"
#include "components/query_parser/snippet.h"
#include "components/search_engines/omnibox_focus_type.h"
#include "third_party/metrics_proto/omnibox_input_type.pb.h"
#include "url/url_constants.h"
using bookmarks::BookmarkNode;
using bookmarks::TitledUrlMatch;
using TitledUrlMatches = std::vector<TitledUrlMatch>;
namespace {
// for_each helper functor that calculates a match factor for each query term
// when calculating the final score.
//
// Calculate a 'factor' from 0 to the bookmark's title length for a match
// based on 1) how many characters match and 2) where the match is positioned.
class ScoringFunctor {
public:
// |title_length| is the length of the bookmark title against which this
// match will be scored.
explicit ScoringFunctor(size_t title_length)
: title_length_(static_cast<double>(title_length)),
scoring_factor_(0.0) {}
void operator()(const query_parser::Snippet::MatchPosition& match) {
double term_length = static_cast<double>(match.second - match.first);
scoring_factor_ +=
term_length * (title_length_ - match.first) / title_length_;
}
double ScoringFactor() { return scoring_factor_; }
private:
double title_length_;
double scoring_factor_;
};
} // namespace
// BookmarkProvider ------------------------------------------------------------
BookmarkProvider::BookmarkProvider(AutocompleteProviderClient* client)
: AutocompleteProvider(AutocompleteProvider::TYPE_BOOKMARK),
client_(client),
bookmark_model_(client ? client_->GetBookmarkModel() : nullptr) {}
void BookmarkProvider::Start(const AutocompleteInput& input,
bool minimal_changes) {
TRACE_EVENT0("omnibox", "BookmarkProvider::Start");
matches_.clear();
if (input.focus_type() != OmniboxFocusType::DEFAULT || input.text().empty())
return;
DoAutocomplete(input);
}
BookmarkProvider::~BookmarkProvider() {}
void BookmarkProvider::DoAutocomplete(const AutocompleteInput& input) {
// We may not have a bookmark model for some unit tests.
if (!bookmark_model_)
return;
// Retrieve enough bookmarks so that we have a reasonable probability of
// suggesting the one that the user desires.
const size_t kMaxBookmarkMatches = 50;
// GetBookmarksMatching returns bookmarks matching the user's
// search terms using the following rules:
// - The search text is broken up into search terms. Each term is searched
// for separately.
// - Term matches are always performed against the start of a word. 'def'
// will match against 'define' but not against 'indefinite'.
// - Terms must be at least three characters in length in order to perform
// partial word matches. Any term of lesser length will only be used as an
// exact match. 'def' will match against 'define' but 'de' will not match.
// (Unless either |IsShortBookmarkSuggestionsEnabled()| or
// |IsShortBookmarkSuggestionsByTotalInputLengthEnabled()| is true.)
// - A search containing multiple terms will return results with those words
// occurring in any order.
// - Terms enclosed in quotes comprises a phrase that must match exactly.
// - Multiple terms enclosed in quotes will require those exact words in that
// exact order to match.
//
// Please refer to the code for TitledUrlIndex::GetResultsMatching for
// complete details of how searches are performed against the user's
// bookmarks.
std::vector<TitledUrlMatch> matches =
GetMatchesWithBookmarkPaths(input, kMaxBookmarkMatches);
if (matches.empty())
return; // There were no matches.
const std::u16string fixed_up_input(FixupUserInput(input).second);
for (auto& bookmark_match : matches) {
if (OmniboxFieldTrial::ShouldDisableCGIParamMatching()) {
RemoveQueryParamKeyMatches(bookmark_match);
if (bookmark_match.title_match_positions.empty() &&
bookmark_match.url_match_positions.empty())
continue;
}
// Score the TitledUrlMatch. If its score is greater than 0 then the
// AutocompleteMatch is created and added to matches_.
int relevance = CalculateBookmarkMatchRelevance(bookmark_match);
if (relevance > 0) {
matches_.push_back(TitledUrlMatchToAutocompleteMatch(
bookmark_match, AutocompleteMatchType::BOOKMARK_TITLE, relevance,
this, client_->GetSchemeClassifier(), input, fixed_up_input));
}
}
// Sort and clip the resulting matches.
size_t num_matches = std::min(matches_.size(), provider_max_matches_);
std::partial_sort(matches_.begin(), matches_.begin() + num_matches,
matches_.end(), AutocompleteMatch::MoreRelevant);
matches_.resize(num_matches);
}
std::vector<TitledUrlMatch> BookmarkProvider::GetMatchesWithBookmarkPaths(
const AutocompleteInput& input,
size_t kMaxBookmarkMatches) {
// Determining whether the |kBookmarkPaths| feature had, or would have had, an
// impact for counterfactual logging is expensive as it requires invoking
// |BookmarkModel::GetBookmarksMatching()| twice. Therefore, the param
// |kBookmarkPathsCounterfactual| determines whether to counterfactual log:
// - When empty, counterfactual logging won't occur.
// - When set to "control" counterfactual logging will occur like usual; i.e.
// path matched bookmarks won't be returned but will be compared to
// determine if the feature triggered.
// - When set to "enabled", counterfactual logging will occur and path matched
// bookmarks will be returned.
std::string counterfactual =
OmniboxFieldTrial::kBookmarkPathsCounterfactual.Get();
bool match_paths = base::FeatureList::IsEnabled(omnibox::kBookmarkPaths) &&
counterfactual != "control";
query_parser::MatchingAlgorithm matching_algorithm =
GetMatchingAlgorithm(input);
if (counterfactual.empty()) {
return bookmark_model_->GetBookmarksMatching(
input.text(), kMaxBookmarkMatches, matching_algorithm, match_paths);
}
std::vector<TitledUrlMatch> matches_without_paths =
bookmark_model_->GetBookmarksMatching(input.text(), kMaxBookmarkMatches,
matching_algorithm);
std::vector<TitledUrlMatch> matches_with_paths =
bookmark_model_->GetBookmarksMatching(input.text(), kMaxBookmarkMatches,
matching_algorithm, true);
DCHECK_LE(matches_without_paths.size(), matches_with_paths.size());
// It's unnecessary to compare the matches themselves because all
// |matches_without_paths| should be contained in |matches_with_paths|.
if (matches_without_paths.size() != matches_with_paths.size()) {
client_->GetOmniboxTriggeredFeatureService()->TriggerFeature(
OmniboxTriggeredFeatureService::Feature::kBookmarkPaths);
}
return match_paths ? matches_with_paths : matches_without_paths;
}
query_parser::MatchingAlgorithm BookmarkProvider::GetMatchingAlgorithm(
AutocompleteInput input) {
if (OmniboxFieldTrial::IsShortBookmarkSuggestionsEnabled())
return query_parser::MatchingAlgorithm::ALWAYS_PREFIX_SEARCH;
if (OmniboxFieldTrial::
IsShortBookmarkSuggestionsByTotalInputLengthEnabled() &&
input.text().length() >=
OmniboxFieldTrial::
ShortBookmarkSuggestionsByTotalInputLengthThreshold()) {
client_->GetOmniboxTriggeredFeatureService()->TriggerFeature(
OmniboxTriggeredFeatureService::Feature::
kShortBookmarkSuggestionsByTotalInputLength);
return OmniboxFieldTrial::
kShortBookmarkSuggestionsByTotalInputLengthCounterfactual
.Get()
? query_parser::MatchingAlgorithm::DEFAULT
: query_parser::MatchingAlgorithm::ALWAYS_PREFIX_SEARCH;
}
return query_parser::MatchingAlgorithm::DEFAULT;
}
int BookmarkProvider::CalculateBookmarkMatchRelevance(
const TitledUrlMatch& bookmark_match) const {
// Summary on how a relevance score is determined for the match:
//
// For each match within the bookmark's title or URL (or both), calculate a
// 'factor', sum up those factors, then use the sum to figure out a value
// between the base score and the maximum score.
//
// The factor for each match is the product of:
//
// 1) how many characters in the bookmark's title/URL are part of this match.
// This is capped at the length of the bookmark's title
// to prevent terms that match in both the title and the URL from
// scoring too strongly.
//
// 2) where the match occurs within the bookmark's title or URL,
// giving more points for matches that appear earlier in the string:
// ((string_length - position of match start) / string_length).
//
// Example: Given a bookmark title of 'abcde fghijklm', with a title length
// of 14, and two different search terms, 'abcde' and 'fghij', with
// start positions of 0 and 6, respectively, 'abcde' will score higher
// (with a a partial factor of (14-0)/14 = 1.000 ) than 'fghij' (with
// a partial factor of (14-6)/14 = 0.571 ). (In this example neither
// term matches in the URL.)
//
// Once all match factors have been calculated they are summed. If there
// are no URL matches, the resulting sum will never be greater than the
// length of the bookmark title because of the way the bookmark model matches
// and removes overlaps. (In particular, the bookmark model only
// matches terms to the beginning of words and it removes all overlapping
// matches, keeping only the longest. Together these mean that each
// character is included in at most one match.) If there are matches in the
// URL, the sum can be greater.
//
// This sum is then normalized by the length of the bookmark title + 10
// and capped at 1.0. The +10 is to expand the scoring range so fewer
// bookmarks will hit the 1.0 cap and hence lose all ability to distinguish
// between these high-quality bookmarks.
//
// The normalized value is multiplied against the scoring range available,
// which is the difference between the minimum possible score and the maximum
// possible score. This product is added to the minimum possible score to
// give the preliminary score.
//
// If the preliminary score is less than the maximum possible score, 1199,
// it can be boosted up to that maximum possible score if the URL referenced
// by the bookmark is also referenced by any of the user's other bookmarks.
// A count of how many times the bookmark's URL is referenced is determined
// and, for each additional reference beyond the one for the bookmark being
// scored up to a maximum of three, the score is boosted by a fixed amount
// given by |kURLCountBoost|, below.
std::u16string title(bookmark_match.node->GetTitledUrlNodeTitle());
const GURL& url(bookmark_match.node->GetTitledUrlNodeUrl());
// Pretend empty titles are identical to the URL.
if (title.empty())
title = base::ASCIIToUTF16(url.spec());
ScoringFunctor title_position_functor =
for_each(bookmark_match.title_match_positions.begin(),
bookmark_match.title_match_positions.end(),
ScoringFunctor(title.size()));
ScoringFunctor url_position_functor =
for_each(bookmark_match.url_match_positions.begin(),
bookmark_match.url_match_positions.end(),
ScoringFunctor(
bookmark_match.node->GetTitledUrlNodeUrl().spec().length()));
const double title_match_strength = title_position_functor.ScoringFactor();
const double summed_factors = title_match_strength +
url_position_functor.ScoringFactor();
const double normalized_sum =
std::min(summed_factors / (title.size() + 10), 1.0);
// Bookmarks with javascript scheme ("bookmarklets") that do not have title
// matches get a lower base and lower maximum score because returning them
// for matches in their (often very long) URL looks stupid and is often not
// intended by the user.
const bool bookmarklet_without_title_match =
url.SchemeIs(url::kJavaScriptScheme) && (title_match_strength == 0.0);
const int kBaseBookmarkScore = bookmarklet_without_title_match ? 400 : 900;
const int kMaxBookmarkScore = bookmarklet_without_title_match ? 799 : 1199;
const double kBookmarkScoreRange =
static_cast<double>(kMaxBookmarkScore - kBaseBookmarkScore);
int relevance = static_cast<int>(normalized_sum * kBookmarkScoreRange) +
kBaseBookmarkScore;
// Don't waste any time searching for additional referenced URLs if we
// already have a perfect title match.
if (relevance >= kMaxBookmarkScore)
return relevance;
// Boost the score if the bookmark's URL is referenced by other bookmarks.
const int kURLCountBoost[4] = { 0, 75, 125, 150 };
std::vector<const BookmarkNode*> nodes;
bookmark_model_->GetNodesByURL(url, &nodes);
DCHECK_GE(std::min(base::size(kURLCountBoost), nodes.size()), 1U);
relevance +=
kURLCountBoost[std::min(base::size(kURLCountBoost), nodes.size()) - 1];
relevance = std::min(kMaxBookmarkScore, relevance);
return relevance;
}
void BookmarkProvider::RemoveQueryParamKeyMatches(TitledUrlMatch& match) {
const GURL& url = match.node->GetTitledUrlNodeUrl();
if (!url.has_query())
return;
// Remove any matches that are for query param keys. Since bookmark provider
// match positions are always at the beginning of words, we can just look at
// the preceding character for a '?' or '&' character.
base::EraseIf(match.url_match_positions,
[url](TitledUrlMatch::MatchPosition& position) {
size_t query_begin =
url.parsed_for_possibly_invalid_spec().query.begin;
return ((query_begin <= position.first) &&
(url.spec().at(position.first - 1) == '?' ||
url.spec().at(position.first - 1) == '&'));
});
}