blob: b2bb0d96df94adb45b550828d3620bf0e1db6913 [file] [log] [blame]
// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/history_clusters/ui/query_clusters_state.h"
#include <set>
#include <string>
#include "base/memory/ref_counted_delete_on_sequence.h"
#include "base/memory/weak_ptr.h"
#include "base/metrics/histogram_functions.h"
#include "base/ranges/algorithm.h"
#include "base/task/sequenced_task_runner.h"
#include "base/task/thread_pool.h"
#include "components/history_clusters/core/config.h"
#include "components/history_clusters/core/history_clusters_service.h"
#include "components/history_clusters/core/history_clusters_service_task_get_most_recent_clusters.h"
#include "components/history_clusters/core/history_clusters_util.h"
#include "components/image_service/image_service.h"
#include "url/gurl.h"
namespace history_clusters {
// Helper class that lives and is destroyed on the `sequenced_task_runner`,
// although it's created on the main thread. It allows us to store state that
// is only accessed on `sequenced_task_runner` that persists between batches.
class QueryClustersState::PostProcessor
: public base::RefCountedDeleteOnSequence<PostProcessor> {
public:
PostProcessor(scoped_refptr<base::SequencedTaskRunner> sequenced_task_runner,
const std::string query)
: base::RefCountedDeleteOnSequence<PostProcessor>(sequenced_task_runner),
query_(query) {}
PostProcessor(const PostProcessor& other) = delete;
PostProcessor& operator=(const PostProcessor& other) = delete;
std::vector<history::Cluster> PostProcess(
std::vector<history::Cluster> clusters) {
ApplySearchQuery(query_, clusters);
CullNonProminentOrDuplicateClusters(query_, clusters,
&seen_single_visit_cluster_urls_);
// Only sort after we figured out what we are showing.
SortClusters(&clusters);
// We have to do this AFTER applying the search query, because applying the
// search query re-scores matching visits to promote them above non-matching
// visits. Show 1-visit clusters only in query mode.
HideAndCullLowScoringVisits(clusters, query_.empty() ? 2 : 1);
// Do this AFTER we cull the low scoring visits, so those visits don't get
// their related searches coalesced onto the cluster level.
CoalesceRelatedSearches(clusters);
return clusters;
}
private:
friend class base::RefCountedDeleteOnSequence<PostProcessor>;
friend class base::DeleteHelper<PostProcessor>;
// Ref-counted object should only be deleted via ref-counting.
~PostProcessor() = default;
const std::string query_;
// URLs of single-visit non-prominent clusters we've already seen.
std::set<GURL> seen_single_visit_cluster_urls_;
};
QueryClustersState::QueryClustersState(
base::WeakPtr<HistoryClustersService> service,
base::WeakPtr<image_service::ImageService> image_service,
const std::string& query,
bool recluster)
: service_(service),
image_service_(image_service),
query_(query),
recluster_(recluster),
post_processing_task_runner_(base::ThreadPool::CreateSequencedTaskRunner(
{base::MayBlock(), base::TaskPriority::USER_VISIBLE})),
post_processing_state_(
base::MakeRefCounted<PostProcessor>(post_processing_task_runner_,
query)) {}
QueryClustersState::~QueryClustersState() = default;
void QueryClustersState::LoadNextBatchOfClusters(ResultCallback callback) {
if (!service_)
return;
base::TimeTicks query_start_time = base::TimeTicks::Now();
query_clusters_task_ = service_->QueryClusters(
ClusteringRequestSource::kJourneysPage,
/*begin_time=*/base::Time(), continuation_params_, recluster_,
base::BindOnce(&QueryClustersState::OnGotRawClusters,
weak_factory_.GetWeakPtr(), query_start_time,
std::move(callback)));
}
void QueryClustersState::OnGotRawClusters(
base::TimeTicks query_start_time,
ResultCallback callback,
std::vector<history::Cluster> clusters,
QueryClustersContinuationParams continuation_params) const {
// Post-process the clusters (expensive task) on an anonymous thread to
// prevent janks.
base::ElapsedTimer post_processing_timer; // Create here to time the task.
size_t clusters_from_backend_count = clusters.size();
post_processing_task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
base::BindOnce(&PostProcessor::PostProcess, post_processing_state_,
std::move(clusters)),
base::BindOnce(
&QueryClustersState::OnGotClusters, weak_factory_.GetMutableWeakPtr(),
std::move(post_processing_timer), clusters_from_backend_count,
query_start_time, std::move(callback), continuation_params));
}
void QueryClustersState::OnGotClusters(
base::ElapsedTimer post_processing_timer,
size_t clusters_from_backend_count,
base::TimeTicks query_start_time,
ResultCallback callback,
QueryClustersContinuationParams continuation_params,
std::vector<history::Cluster> clusters) {
base::UmaHistogramTimes("History.Clusters.ProcessClustersDuration",
post_processing_timer.Elapsed());
if (clusters_from_backend_count > 0) {
// Log the percentage of clusters that get filtered (e.g., 100 - % of
// clusters that remain).
base::UmaHistogramCounts100(
"History.Clusters.PercentClustersFilteredByQuery",
static_cast<int>(100 - (clusters.size() /
(1.0 * clusters_from_backend_count) * 100)));
}
continuation_params_ = continuation_params;
// In case no clusters came back, recursively ask for more here. We do this
// to fulfill the mojom contract where we always return at least one cluster,
// or we exhaust History. We don't do this in the service because of task
// tracker lifetime difficulty. In practice, this only happens when the user
// has a search query that doesn't match any of the clusters in the "page".
// https://crbug.com/1263728
//
// This is distinct from the "tall monitor" case because the page may already
// be full of clusters. In that case, the WebUI would not know to make another
// request for clusters.
if (clusters.empty() && !continuation_params.exhausted_all_visits) {
LoadNextBatchOfClusters(std::move(callback));
return;
}
// This feels like it belongs in `PostProcessor`, but this operates on the
// main thread, because the data needs to live on the main thread. Doing it
// on the task runner requires making heap copies, which probably costs more
// than just doing this simple computation on the main thread.
UpdateUniqueRawLabels(clusters);
if (GetConfig().images && image_service_) {
image_service_->PopulateEntityImagesFor(
std::move(clusters),
base::BindOnce(&QueryClustersState::OnGotImagedClusters,
weak_factory_.GetWeakPtr(), query_start_time,
std::move(callback), std::move(continuation_params)));
} else {
OnGotImagedClusters(query_start_time, std::move(callback),
std::move(continuation_params), std::move(clusters));
}
}
void QueryClustersState::OnGotImagedClusters(
base::TimeTicks query_start_time,
ResultCallback callback,
QueryClustersContinuationParams continuation_params,
std::vector<history::Cluster> clusters) {
size_t clusters_size = clusters.size();
bool is_continuation = number_clusters_sent_to_page_ > 0;
std::move(callback).Run(query_, std::move(clusters),
!continuation_params.exhausted_all_visits,
is_continuation);
number_clusters_sent_to_page_ += clusters_size;
// Log metrics after delivering the results to the page.
base::TimeDelta service_latency = base::TimeTicks::Now() - query_start_time;
base::UmaHistogramTimes("History.Clusters.ServiceLatency", service_latency);
}
void QueryClustersState::UpdateUniqueRawLabels(
const std::vector<history::Cluster>& clusters) {
// Skip this computation when there's a search query.
if (!query_.empty())
return;
for (const auto& cluster : clusters) {
if (!cluster.raw_label)
return;
const auto& raw_label_value = cluster.raw_label.value();
// Warning: N^2 algorithm below. If this ends up scaling poorly, it can be
// optimized by adding a map that tracks which labels have been seen
// already.
auto it = base::ranges::find(raw_label_counts_so_far_, raw_label_value,
&LabelCount::first);
if (it == raw_label_counts_so_far_.end()) {
it = raw_label_counts_so_far_.insert(it,
std::make_pair(raw_label_value, 0));
}
it->second++;
}
}
} // namespace history_clusters