components/history_clusters/ui/query_clusters_state.cc - chromium/src - Git at Google

 // Copyright 2022 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "components/history_clusters/ui/query_clusters_state.h"

 #include <set>
 #include <string>

 #include "base/memory/ref_counted_delete_on_sequence.h"
 #include "base/memory/weak_ptr.h"
 #include "base/metrics/histogram_functions.h"
 #include "base/ranges/algorithm.h"
 #include "base/task/sequenced_task_runner.h"
 #include "base/task/thread_pool.h"
 #include "components/history_clusters/core/config.h"
 #include "components/history_clusters/core/history_clusters_service.h"
 #include "components/history_clusters/core/history_clusters_service_task_get_most_recent_clusters.h"
 #include "components/history_clusters/core/history_clusters_util.h"
 #include "components/image_service/image_service.h"
 #include "url/gurl.h"

 namespace history_clusters {

 // Helper class that lives and is destroyed on the `sequenced_task_runner`,
 // although it's created on the main thread. It allows us to store state that
 // is only accessed on `sequenced_task_runner` that persists between batches.
 class QueryClustersState::PostProcessor
     : public base::RefCountedDeleteOnSequence<PostProcessor> {
  public:
   PostProcessor(scoped_refptr<base::SequencedTaskRunner> sequenced_task_runner,
                 const std::string query)
       : base::RefCountedDeleteOnSequence<PostProcessor>(sequenced_task_runner),
         query_(query) {}
   PostProcessor(const PostProcessor& other) = delete;
   PostProcessor& operator=(const PostProcessor& other) = delete;

   std::vector<history::Cluster> PostProcess(
       std::vector<history::Cluster> clusters) {
     ApplySearchQuery(query_, clusters);
     CullNonProminentOrDuplicateClusters(query_, clusters,
                                         &seen_single_visit_cluster_urls_);

     // Only sort after we figured out what we are showing.
     SortClusters(&clusters);

     // We have to do this AFTER applying the search query, because applying the
     // search query re-scores matching visits to promote them above non-matching
     // visits. Show 1-visit clusters only in query mode.
     HideAndCullLowScoringVisits(clusters, query_.empty() ? 2 : 1);
     // Do this AFTER we cull the low scoring visits, so those visits don't get
     // their related searches coalesced onto the cluster level.
     CoalesceRelatedSearches(clusters);
     return clusters;
   }

  private:
   friend class base::RefCountedDeleteOnSequence<PostProcessor>;
   friend class base::DeleteHelper<PostProcessor>;

   // Ref-counted object should only be deleted via ref-counting.
   ~PostProcessor() = default;

   const std::string query_;

   // URLs of single-visit non-prominent clusters we've already seen.
   std::set<GURL> seen_single_visit_cluster_urls_;
 };

 QueryClustersState::QueryClustersState(
     base::WeakPtr<HistoryClustersService> service,
     base::WeakPtr<image_service::ImageService> image_service,
     const std::string& query,
     bool recluster)
     : service_(service),
       image_service_(image_service),
       query_(query),
       recluster_(recluster),
       post_processing_task_runner_(base::ThreadPool::CreateSequencedTaskRunner(
           {base::MayBlock(), base::TaskPriority::USER_VISIBLE})),
       post_processing_state_(
           base::MakeRefCounted<PostProcessor>(post_processing_task_runner_,
                                               query)) {}

 QueryClustersState::~QueryClustersState() = default;

 void QueryClustersState::LoadNextBatchOfClusters(ResultCallback callback) {
   if (!service_)
     return;

   base::TimeTicks query_start_time = base::TimeTicks::Now();
   query_clusters_task_ = service_->QueryClusters(
       ClusteringRequestSource::kJourneysPage,
       /*begin_time=*/base::Time(), continuation_params_, recluster_,
       base::BindOnce(&QueryClustersState::OnGotRawClusters,
                      weak_factory_.GetWeakPtr(), query_start_time,
                      std::move(callback)));
 }

 void QueryClustersState::OnGotRawClusters(
     base::TimeTicks query_start_time,
     ResultCallback callback,
     std::vector<history::Cluster> clusters,
     QueryClustersContinuationParams continuation_params) const {
   // Post-process the clusters (expensive task) on an anonymous thread to
   // prevent janks.
   base::ElapsedTimer post_processing_timer;  // Create here to time the task.

   size_t clusters_from_backend_count = clusters.size();
   post_processing_task_runner_->PostTaskAndReplyWithResult(
       FROM_HERE,
       base::BindOnce(&PostProcessor::PostProcess, post_processing_state_,
                      std::move(clusters)),
       base::BindOnce(
           &QueryClustersState::OnGotClusters, weak_factory_.GetMutableWeakPtr(),
           std::move(post_processing_timer), clusters_from_backend_count,
           query_start_time, std::move(callback), continuation_params));
 }

 void QueryClustersState::OnGotClusters(
     base::ElapsedTimer post_processing_timer,
     size_t clusters_from_backend_count,
     base::TimeTicks query_start_time,
     ResultCallback callback,
     QueryClustersContinuationParams continuation_params,
     std::vector<history::Cluster> clusters) {
   base::UmaHistogramTimes("History.Clusters.ProcessClustersDuration",
                           post_processing_timer.Elapsed());

   if (clusters_from_backend_count > 0) {
     // Log the percentage of clusters that get filtered (e.g., 100 - % of
     // clusters that remain).
     base::UmaHistogramCounts100(
         "History.Clusters.PercentClustersFilteredByQuery",
         static_cast<int>(100 - (clusters.size() /
                                 (1.0 * clusters_from_backend_count) * 100)));
   }

   continuation_params_ = continuation_params;

   // In case no clusters came back, recursively ask for more here. We do this
   // to fulfill the mojom contract where we always return at least one cluster,
   // or we exhaust History. We don't do this in the service because of task
   // tracker lifetime difficulty. In practice, this only happens when the user
   // has a search query that doesn't match any of the clusters in the "page".
   // https://crbug.com/1263728
   //
   // This is distinct from the "tall monitor" case because the page may already
   // be full of clusters. In that case, the WebUI would not know to make another
   // request for clusters.
   if (clusters.empty() && !continuation_params.exhausted_all_visits) {
     LoadNextBatchOfClusters(std::move(callback));
     return;
   }

   // This feels like it belongs in `PostProcessor`, but this operates on the
   // main thread, because the data needs to live on the main thread. Doing it
   // on the task runner requires making heap copies, which probably costs more
   // than just doing this simple computation on the main thread.
   UpdateUniqueRawLabels(clusters);

   if (GetConfig().images && image_service_) {
     image_service_->PopulateEntityImagesFor(
         std::move(clusters),
         base::BindOnce(&QueryClustersState::OnGotImagedClusters,
                        weak_factory_.GetWeakPtr(), query_start_time,
                        std::move(callback), std::move(continuation_params)));
   } else {
     OnGotImagedClusters(query_start_time, std::move(callback),
                         std::move(continuation_params), std::move(clusters));
   }
 }

 void QueryClustersState::OnGotImagedClusters(
     base::TimeTicks query_start_time,
     ResultCallback callback,
     QueryClustersContinuationParams continuation_params,
     std::vector<history::Cluster> clusters) {
   size_t clusters_size = clusters.size();
   bool is_continuation = number_clusters_sent_to_page_ > 0;
   std::move(callback).Run(query_, std::move(clusters),
                           !continuation_params.exhausted_all_visits,
                           is_continuation);

   number_clusters_sent_to_page_ += clusters_size;

   // Log metrics after delivering the results to the page.
   base::TimeDelta service_latency = base::TimeTicks::Now() - query_start_time;
   base::UmaHistogramTimes("History.Clusters.ServiceLatency", service_latency);
 }

 void QueryClustersState::UpdateUniqueRawLabels(
     const std::vector<history::Cluster>& clusters) {
   // Skip this computation when there's a search query.
   if (!query_.empty())
     return;

   for (const auto& cluster : clusters) {
     if (!cluster.raw_label)
       return;

     const auto& raw_label_value = cluster.raw_label.value();
     // Warning: N^2 algorithm below. If this ends up scaling poorly, it can be
     // optimized by adding a map that tracks which labels have been seen
     // already.
     auto it = base::ranges::find(raw_label_counts_so_far_, raw_label_value,
                                  &LabelCount::first);
     if (it == raw_label_counts_so_far_.end()) {
       it = raw_label_counts_so_far_.insert(it,
                                            std::make_pair(raw_label_value, 0));
     }
     it->second++;
   }
 }

 }  // namespace history_clusters
	// Copyright 2022 The Chromium Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "components/history_clusters/ui/query_clusters_state.h"

	#include <set>
	#include <string>

	#include "base/memory/ref_counted_delete_on_sequence.h"
	#include "base/memory/weak_ptr.h"
	#include "base/metrics/histogram_functions.h"
	#include "base/ranges/algorithm.h"
	#include "base/task/sequenced_task_runner.h"
	#include "base/task/thread_pool.h"
	#include "components/history_clusters/core/config.h"
	#include "components/history_clusters/core/history_clusters_service.h"
	#include "components/history_clusters/core/history_clusters_service_task_get_most_recent_clusters.h"
	#include "components/history_clusters/core/history_clusters_util.h"
	#include "components/image_service/image_service.h"
	#include "url/gurl.h"

	namespace history_clusters {

	// Helper class that lives and is destroyed on the `sequenced_task_runner`,
	// although it's created on the main thread. It allows us to store state that
	// is only accessed on `sequenced_task_runner` that persists between batches.
	class QueryClustersState::PostProcessor
	: public base::RefCountedDeleteOnSequence<PostProcessor> {
	public:
	PostProcessor(scoped_refptr<base::SequencedTaskRunner> sequenced_task_runner,
	const std::string query)
	: base::RefCountedDeleteOnSequence<PostProcessor>(sequenced_task_runner),
	query_(query) {}
	PostProcessor(const PostProcessor& other) = delete;
	PostProcessor& operator=(const PostProcessor& other) = delete;

	std::vector<history::Cluster> PostProcess(
	std::vector<history::Cluster> clusters) {
	ApplySearchQuery(query_, clusters);
	CullNonProminentOrDuplicateClusters(query_, clusters,
	&seen_single_visit_cluster_urls_);

	// Only sort after we figured out what we are showing.
	SortClusters(&clusters);

	// We have to do this AFTER applying the search query, because applying the
	// search query re-scores matching visits to promote them above non-matching
	// visits. Show 1-visit clusters only in query mode.
	HideAndCullLowScoringVisits(clusters, query_.empty() ? 2 : 1);
	// Do this AFTER we cull the low scoring visits, so those visits don't get
	// their related searches coalesced onto the cluster level.
	CoalesceRelatedSearches(clusters);
	return clusters;
	}

	private:
	friend class base::RefCountedDeleteOnSequence<PostProcessor>;
	friend class base::DeleteHelper<PostProcessor>;

	// Ref-counted object should only be deleted via ref-counting.
	~PostProcessor() = default;

	const std::string query_;

	// URLs of single-visit non-prominent clusters we've already seen.
	std::set<GURL> seen_single_visit_cluster_urls_;
	};

	QueryClustersState::QueryClustersState(
	base::WeakPtr<HistoryClustersService> service,
	base::WeakPtr<image_service::ImageService> image_service,
	const std::string& query,
	bool recluster)
	: service_(service),
	image_service_(image_service),
	query_(query),
	recluster_(recluster),
	post_processing_task_runner_(base::ThreadPool::CreateSequencedTaskRunner(
	{base::MayBlock(), base::TaskPriority::USER_VISIBLE})),
	post_processing_state_(
	base::MakeRefCounted<PostProcessor>(post_processing_task_runner_,
	query)) {}

	QueryClustersState::~QueryClustersState() = default;

	void QueryClustersState::LoadNextBatchOfClusters(ResultCallback callback) {
	if (!service_)
	return;

	base::TimeTicks query_start_time = base::TimeTicks::Now();
	query_clusters_task_ = service_->QueryClusters(
	ClusteringRequestSource::kJourneysPage,
	/begin_time=/base::Time(), continuation_params_, recluster_,
	base::BindOnce(&QueryClustersState::OnGotRawClusters,
	weak_factory_.GetWeakPtr(), query_start_time,
	std::move(callback)));
	}

	void QueryClustersState::OnGotRawClusters(
	base::TimeTicks query_start_time,
	ResultCallback callback,
	std::vector<history::Cluster> clusters,
	QueryClustersContinuationParams continuation_params) const {
	// Post-process the clusters (expensive task) on an anonymous thread to
	// prevent janks.
	base::ElapsedTimer post_processing_timer; // Create here to time the task.

	size_t clusters_from_backend_count = clusters.size();
	post_processing_task_runner_->PostTaskAndReplyWithResult(
	FROM_HERE,
	base::BindOnce(&PostProcessor::PostProcess, post_processing_state_,
	std::move(clusters)),
	base::BindOnce(
	&QueryClustersState::OnGotClusters, weak_factory_.GetMutableWeakPtr(),
	std::move(post_processing_timer), clusters_from_backend_count,
	query_start_time, std::move(callback), continuation_params));
	}

	void QueryClustersState::OnGotClusters(
	base::ElapsedTimer post_processing_timer,
	size_t clusters_from_backend_count,
	base::TimeTicks query_start_time,
	ResultCallback callback,
	QueryClustersContinuationParams continuation_params,
	std::vector<history::Cluster> clusters) {
	base::UmaHistogramTimes("History.Clusters.ProcessClustersDuration",
	post_processing_timer.Elapsed());

	if (clusters_from_backend_count > 0) {
	// Log the percentage of clusters that get filtered (e.g., 100 - % of
	// clusters that remain).
	base::UmaHistogramCounts100(
	"History.Clusters.PercentClustersFilteredByQuery",
	static_cast<int>(100 - (clusters.size() /
	(1.0 * clusters_from_backend_count) * 100)));
	}

	continuation_params_ = continuation_params;

	// In case no clusters came back, recursively ask for more here. We do this
	// to fulfill the mojom contract where we always return at least one cluster,
	// or we exhaust History. We don't do this in the service because of task
	// tracker lifetime difficulty. In practice, this only happens when the user
	// has a search query that doesn't match any of the clusters in the "page".
	// https://crbug.com/1263728
	//
	// This is distinct from the "tall monitor" case because the page may already
	// be full of clusters. In that case, the WebUI would not know to make another
	// request for clusters.
	if (clusters.empty() && !continuation_params.exhausted_all_visits) {
	LoadNextBatchOfClusters(std::move(callback));
	return;
	}

	// This feels like it belongs in `PostProcessor`, but this operates on the
	// main thread, because the data needs to live on the main thread. Doing it
	// on the task runner requires making heap copies, which probably costs more
	// than just doing this simple computation on the main thread.
	UpdateUniqueRawLabels(clusters);

	if (GetConfig().images && image_service_) {
	image_service_->PopulateEntityImagesFor(
	std::move(clusters),
	base::BindOnce(&QueryClustersState::OnGotImagedClusters,
	weak_factory_.GetWeakPtr(), query_start_time,
	std::move(callback), std::move(continuation_params)));
	} else {
	OnGotImagedClusters(query_start_time, std::move(callback),
	std::move(continuation_params), std::move(clusters));
	}
	}

	void QueryClustersState::OnGotImagedClusters(
	base::TimeTicks query_start_time,
	ResultCallback callback,
	QueryClustersContinuationParams continuation_params,
	std::vector<history::Cluster> clusters) {
	size_t clusters_size = clusters.size();
	bool is_continuation = number_clusters_sent_to_page_ > 0;
	std::move(callback).Run(query_, std::move(clusters),
	!continuation_params.exhausted_all_visits,
	is_continuation);

	number_clusters_sent_to_page_ += clusters_size;

	// Log metrics after delivering the results to the page.
	base::TimeDelta service_latency = base::TimeTicks::Now() - query_start_time;
	base::UmaHistogramTimes("History.Clusters.ServiceLatency", service_latency);
	}

	void QueryClustersState::UpdateUniqueRawLabels(
	const std::vector<history::Cluster>& clusters) {
	// Skip this computation when there's a search query.
	if (!query_.empty())
	return;

	for (const auto& cluster : clusters) {
	if (!cluster.raw_label)
	return;

	const auto& raw_label_value = cluster.raw_label.value();
	// Warning: N^2 algorithm below. If this ends up scaling poorly, it can be
	// optimized by adding a map that tracks which labels have been seen
	// already.
	auto it = base::ranges::find(raw_label_counts_so_far_, raw_label_value,
	&LabelCount::first);
	if (it == raw_label_counts_so_far_.end()) {
	it = raw_label_counts_so_far_.insert(it,
	std::make_pair(raw_label_value, 0));
	}
	it->second++;
	}
	}

	} // namespace history_clusters