blob: b6db99b1f20ce5aaa4b364737494529f4fea2157 [file] [log] [blame]
// Copyright 2021 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/history_clusters/core/on_device_clustering_backend.h"
#include <algorithm>
#include <iterator>
#include <set>
#include "base/containers/flat_map.h"
#include "base/i18n/case_conversion.h"
#include "base/metrics/histogram_functions.h"
#include "base/strings/string_util.h"
#include "base/task/task_runner_util.h"
#include "base/task/task_traits.h"
#include "base/task/thread_pool.h"
#include "components/history/core/browser/history_types.h"
#include "components/history_clusters/core/content_annotations_cluster_processor.h"
#include "components/history_clusters/core/content_visibility_cluster_finalizer.h"
#include "components/history_clusters/core/keyword_cluster_finalizer.h"
#include "components/history_clusters/core/noisy_cluster_finalizer.h"
#include "components/history_clusters/core/on_device_clustering_features.h"
#include "components/history_clusters/core/on_device_clustering_util.h"
#include "components/history_clusters/core/ranking_cluster_finalizer.h"
#include "components/history_clusters/core/similar_visit_deduper_cluster_finalizer.h"
#include "components/history_clusters/core/single_visit_cluster_finalizer.h"
#include "components/history_clusters/core/url_deduper_cluster_finalizer.h"
#include "components/optimization_guide/core/batch_entity_metadata_task.h"
#include "components/optimization_guide/core/entity_metadata_provider.h"
#include "components/search_engines/template_url_service.h"
#include "components/site_engagement/core/site_engagement_score_provider.h"
namespace history_clusters {
namespace {
// Returns the normalized URL for the search provider if the URL for the visit
// is a Search URL. Otherwise, returns nullopt.
absl::optional<GURL> GetNormalizedURLForSearchVisit(
const history::AnnotatedVisit& visit,
const TemplateURLService* template_url_service) {
if (!template_url_service)
return absl::nullopt;
const TemplateURL* template_url =
template_url_service->GetTemplateURLForHost(visit.url_row.url().host());
const SearchTermsData& search_terms_data =
template_url_service->search_terms_data();
std::u16string search_terms;
bool is_valid_search_url =
template_url &&
template_url->ExtractSearchTermsFromURL(
visit.url_row.url(), search_terms_data, &search_terms) &&
!search_terms.empty();
if (!is_valid_search_url)
return absl::nullopt;
const std::u16string& normalized_search_query =
base::i18n::ToLower(base::CollapseWhitespace(search_terms, false));
TemplateURLRef::SearchTermsArgs search_terms_args(normalized_search_query);
const TemplateURLRef& search_url_ref = template_url->url_ref();
if (!search_url_ref.SupportsReplacement(search_terms_data))
return absl::nullopt;
return GURL(
search_url_ref.ReplaceSearchTerms(search_terms_args, search_terms_data));
}
} // namespace
OnDeviceClusteringBackend::OnDeviceClusteringBackend(
TemplateURLService* template_url_service,
optimization_guide::EntityMetadataProvider* entity_metadata_provider,
site_engagement::SiteEngagementScoreProvider* engagement_score_provider)
: template_url_service_(template_url_service),
entity_metadata_provider_(entity_metadata_provider),
engagement_score_provider_(engagement_score_provider),
background_task_runner_(base::ThreadPool::CreateSequencedTaskRunner(
{base::MayBlock(), base::TaskPriority::USER_VISIBLE})) {}
OnDeviceClusteringBackend::~OnDeviceClusteringBackend() {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
}
void OnDeviceClusteringBackend::GetClusters(
ClustersCallback callback,
const std::vector<history::AnnotatedVisit>& visits) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
if (visits.empty()) {
std::move(callback).Run({});
return;
}
// Just start clustering without getting entity metadata if we don't have a
// provider to translate the entities.
if (!entity_metadata_provider_) {
OnBatchEntityMetadataRetrieved(/*completed_task=*/nullptr, visits,
/*entity_metadata_start=*/absl::nullopt,
std::move(callback),
/*entity_metadata_map=*/{});
return;
}
// Figure out what entity IDs we need to fetch metadata for.
base::flat_set<std::string> entity_ids;
for (const auto& visit : visits) {
for (const auto& entity :
visit.content_annotations.model_annotations.entities) {
entity_ids.insert(entity.id);
}
}
// Don't bother with getting entity metadata if there's nothing to get
// metadata for.
if (entity_ids.empty()) {
OnBatchEntityMetadataRetrieved(/*completed_task=*/nullptr, visits,
/*entity_metadata_start=*/absl::nullopt,
std::move(callback),
/*entity_metadata_map=*/{});
return;
}
base::UmaHistogramCounts1000("History.Clusters.Backend.BatchEntityLookupSize",
entity_ids.size());
// Fetch the metadata for the entity ID present in |visits|.
auto batch_entity_metadata_task =
std::make_unique<optimization_guide::BatchEntityMetadataTask>(
entity_metadata_provider_, entity_ids);
auto* batch_entity_metadata_task_ptr = batch_entity_metadata_task.get();
in_flight_batch_entity_metadata_tasks_.insert(
std::move(batch_entity_metadata_task));
batch_entity_metadata_task_ptr->Execute(base::BindOnce(
&OnDeviceClusteringBackend::OnBatchEntityMetadataRetrieved,
weak_ptr_factory_.GetWeakPtr(), batch_entity_metadata_task_ptr, visits,
base::TimeTicks::Now(), std::move(callback)));
}
void OnDeviceClusteringBackend::OnBatchEntityMetadataRetrieved(
optimization_guide::BatchEntityMetadataTask* completed_task,
const std::vector<history::AnnotatedVisit>& annotated_visits,
absl::optional<base::TimeTicks> entity_metadata_start,
ClustersCallback callback,
const base::flat_map<std::string, optimization_guide::EntityMetadata>&
entity_metadata_map) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
if (entity_metadata_start) {
base::UmaHistogramTimes("History.Clusters.Backend.BatchEntityLookupLatency",
*entity_metadata_start - base::TimeTicks::Now());
}
// Rewrite the visits based on the mapping and normalize URLs here.
std::vector<history::ClusterVisit> cluster_visits;
cluster_visits.reserve(annotated_visits.size());
for (const auto& visit : annotated_visits) {
history::ClusterVisit cluster_visit;
cluster_visit.annotated_visit = visit;
absl::optional<GURL> maybe_normalized_url =
GetNormalizedURLForSearchVisit(visit, template_url_service_);
cluster_visit.is_search_visit = maybe_normalized_url.has_value();
cluster_visit.normalized_url =
maybe_normalized_url.value_or(visit.url_row.url());
if (engagement_score_provider_) {
cluster_visit.engagement_score =
engagement_score_provider_->GetScore(visit.url_row.url());
}
// Rewrite the entities for the visit, but only if it is possible that we
// had additional metadata for it.
if (entity_metadata_provider_) {
cluster_visit.annotated_visit.content_annotations.model_annotations
.entities.clear();
cluster_visit.annotated_visit.content_annotations.model_annotations
.categories.clear();
base::flat_set<std::string> inserted_categories;
for (const auto& entity :
visit.content_annotations.model_annotations.entities) {
auto entity_metadata_it = entity_metadata_map.find(entity.id);
if (entity_metadata_it == entity_metadata_map.end()) {
continue;
}
history::VisitContentModelAnnotations::Category rewritten_entity =
entity;
rewritten_entity.id = entity_metadata_it->second.human_readable_name;
cluster_visit.annotated_visit.content_annotations.model_annotations
.entities.push_back(rewritten_entity);
for (const auto& category :
entity_metadata_it->second.human_readable_categories) {
if (inserted_categories.find(category.first) ==
inserted_categories.end()) {
cluster_visit.annotated_visit.content_annotations.model_annotations
.categories.push_back(
{category.first, static_cast<int>(100 * category.second)});
inserted_categories.insert(category.first);
}
}
}
if (visit.content_annotations.model_annotations
.page_topics_model_version <
features::GetMinPageTopicsModelVersionToUseContentVisibilityFrom()) {
// Override the visibility score to be as if the model was not evaluated
// if the version of the model being used does not exceed the min
// version that was not a random model.
cluster_visit.annotated_visit.content_annotations.model_annotations
.visibility_score = -1.0;
}
}
cluster_visits.push_back(cluster_visit);
}
// Mark the task as completed, which will destruct |entity_metadata_map|.
if (completed_task) {
auto it = in_flight_batch_entity_metadata_tasks_.find(completed_task);
if (it != in_flight_batch_entity_metadata_tasks_.end())
in_flight_batch_entity_metadata_tasks_.erase(it);
}
// Post the actual clustering work onto the thread pool, then reply on the
// calling sequence. This is to prevent UI jank.
background_task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
base::BindOnce(
&OnDeviceClusteringBackend::ClusterVisitsOnBackgroundThread,
base::Unretained(this), cluster_visits),
std::move(callback));
}
std::vector<history::Cluster>
OnDeviceClusteringBackend::ClusterVisitsOnBackgroundThread(
const std::vector<history::ClusterVisit>& visits) {
DCHECK(background_task_runner_->RunsTasksInCurrentSequence());
// TODO(crbug.com/1260145): All of these objects are "stateless" between
// requests for clusters. If there needs to be shared state, the entire
// backend needs to be refactored to separate these objects from the UI and
// background thread.
std::unique_ptr<Clusterer> clusterer = std::make_unique<Clusterer>();
// The cluster processors to be run.
std::vector<std::unique_ptr<ClusterProcessor>> cluster_processors;
// The cluster finalizers to be run.
std::vector<std::unique_ptr<ClusterFinalizer>> cluster_finalizers;
if (features::ContentClusteringEnabled()) {
cluster_processors.push_back(
std::make_unique<ContentAnnotationsClusterProcessor>());
}
cluster_finalizers.push_back(
std::make_unique<ContentVisibilityClusterFinalizer>());
if (features::ShouldDedupeSimilarVisits()) {
cluster_finalizers.push_back(
std::make_unique<SimilarVisitDeduperClusterFinalizer>());
} else {
// Otherwise, only dedupe based on normalized URL.
cluster_finalizers.push_back(
std::make_unique<UrlDeduperClusterFinalizer>());
}
cluster_finalizers.push_back(std::make_unique<RankingClusterFinalizer>());
if (features::ShouldHideSingleVisitClustersOnProminentUISurfaces()) {
cluster_finalizers.push_back(
std::make_unique<SingleVisitClusterFinalizer>());
}
// Add feature to turn on/off site engagement score filter.
if (engagement_score_provider_ && features::ShouldFilterNoisyClusters()) {
cluster_finalizers.push_back(std::make_unique<NoisyClusterFinalizer>());
}
cluster_finalizers.push_back(std::make_unique<KeywordClusterFinalizer>());
// Group visits into clusters.
std::vector<history::Cluster> clusters =
clusterer->CreateInitialClustersFromVisits(visits);
// Process clusters.
for (const auto& processor : cluster_processors) {
clusters = processor->ProcessClusters(clusters);
}
// Run finalizers that dedupe and score visits within a cluster and
// log several metrics about the result.
std::vector<int> keyword_sizes;
std::vector<int> visits_in_clusters;
for (auto& cluster : clusters) {
for (const auto& finalizer : cluster_finalizers) {
finalizer->FinalizeCluster(cluster);
}
visits_in_clusters.emplace_back(cluster.visits.size());
keyword_sizes.emplace_back(cluster.keywords.size());
}
// It's a bit strange that this is essentially a `ClusterProcessor` but has
// to operate after the finalizers.
SortClusters(&clusters);
if (!visits_in_clusters.empty()) {
// We check for empty to ensure the below code doesn't crash, but
// realistically this vector should never be empty.
base::UmaHistogramCounts1000("History.Clusters.Backend.ClusterSize.Min",
*std::min_element(visits_in_clusters.begin(),
visits_in_clusters.end()));
base::UmaHistogramCounts1000("History.Clusters.Backend.ClusterSize.Max",
*std::max_element(visits_in_clusters.begin(),
visits_in_clusters.end()));
}
if (!keyword_sizes.empty()) {
// We check for empty to ensure the below code doesn't crash, but
// realistically this vector should never be empty.
base::UmaHistogramCounts100(
"History.Clusters.Backend.NumKeywordsPerCluster.Min",
*std::min_element(keyword_sizes.begin(), keyword_sizes.end()));
base::UmaHistogramCounts100(
"History.Clusters.Backend.NumKeywordsPerCluster.Max",
*std::max_element(keyword_sizes.begin(), keyword_sizes.end()));
}
return clusters;
}
} // namespace history_clusters