// blob: b6a15c1bfa7ef0e2f8f45de1ddd05f412edd2a33 [file] [log] [blame]
// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COMPONENTS_OPTIMIZATION_GUIDE_CONTENT_BROWSER_PAGE_CONTENT_ANNOTATIONS_SERVICE_H_
#define COMPONENTS_OPTIMIZATION_GUIDE_CONTENT_BROWSER_PAGE_CONTENT_ANNOTATIONS_SERVICE_H_
#include <stdint.h>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "base/containers/lru_cache.h"
#include "base/files/file_path.h"
#include "base/functional/callback_forward.h"
#include "base/hash/hash.h"
#include "base/memory/raw_ptr.h"
#include "base/memory/weak_ptr.h"
#include "base/observer_list.h"
#include "base/observer_list_types.h"
#include "base/scoped_observation.h"
#include "base/task/cancelable_task_tracker.h"
#include "base/task/sequenced_task_runner.h"
#include "base/time/time.h"
#include "components/continuous_search/browser/search_result_extractor_client.h"
#include "components/continuous_search/browser/search_result_extractor_client_status.h"
#include "components/continuous_search/common/public/mojom/continuous_search.mojom.h"
#include "components/history/core/browser/history_service_observer.h"
#include "components/history/core/browser/history_types.h"
#include "components/history/core/browser/url_row.h"
#include "components/keyed_service/core/keyed_service.h"
#include "components/omnibox/browser/autocomplete_provider_client.h"
#include "components/omnibox/browser/zero_suggest_cache_service.h"
#include "components/optimization_guide/content/browser/page_content_annotator.h"
#include "components/optimization_guide/core/entity_metadata_provider.h"
#include "components/optimization_guide/core/model_info.h"
#include "components/optimization_guide/core/page_content_annotations_common.h"
#include "components/optimization_guide/machine_learning_tflite_buildflags.h"
#include "components/optimization_guide/proto/page_entities_metadata.pb.h"
#include "components/optimization_guide/proto/salient_image_metadata.pb.h"
#include "components/search_engines/template_url_service.h"
#include "third_party/abseil-cpp/absl/types/optional.h"
#include "url/gurl.h"
class OptimizationGuideLogger;
namespace history {
class HistoryService;
} // namespace history
namespace leveldb_proto {
class ProtoDatabaseProvider;
} // namespace leveldb_proto
namespace optimization_guide {
class OptimizationGuideModelProvider;
class PageContentAnnotationsModelManager;
class PageContentAnnotationsServiceBrowserTest;
class PageContentAnnotationsValidator;
class PageContentAnnotationsWebContentsObserver;
// The information used by HistoryService to identify a visit to a URL.
//
// |Comp| below keys a visit on |visit_id| when it is set and on
// (|nav_entry_timestamp|, |url|) otherwise. |navigation_id| and
// |text_to_annotate| never take part in the ordering.
struct HistoryVisit {
  HistoryVisit();
  HistoryVisit(base::Time nav_entry_timestamp, GURL url, int64_t navigation_id);
  explicit HistoryVisit(history::VisitID visit_id);
  ~HistoryVisit();
  HistoryVisit(const HistoryVisit&);

  // Timestamp and URL of the visit. Used as the ordering key by |Comp| when
  // neither visit being compared has a |visit_id|.
  base::Time nav_entry_timestamp;
  GURL url;

  // Defaults to 0. Presumably the ID of the navigation that produced the
  // visit - confirm against the callers that populate it.
  int64_t navigation_id = 0;

  // The history visit ID, when known. Takes precedence over the timestamp and
  // URL when ordering visits with |Comp|.
  absl::optional<history::VisitID> visit_id;

  // Optional text associated with the visit. Presumably the text handed to
  // the annotation models - confirm against the implementation.
  absl::optional<std::string> text_to_annotate;

  // Strict weak ordering over HistoryVisit, usable as the comparator of an
  // ordered container:
  //   1. A visit without a |visit_id| sorts before any visit that has one.
  //   2. Two visits that both have a |visit_id| are ordered by that ID.
  //   3. Two visits that both lack a |visit_id| are ordered by
  //      |nav_entry_timestamp| and then by |url|.
  struct Comp {
    bool operator()(const HistoryVisit& lhs, const HistoryVisit& rhs) const {
      const bool lhs_has_id = lhs.visit_id.has_value();
      const bool rhs_has_id = rhs.visit_id.has_value();
      if (lhs_has_id != rhs_has_id) {
        // Exactly one side has an ID. The ID-less visit must compare less in
        // BOTH argument orders; answering `false` for one order and falling
        // through to the timestamp comparison for the other would make
        // equivalence non-transitive and break ordered containers.
        return rhs_has_id;
      }
      if (lhs_has_id) {
        return *lhs.visit_id < *rhs.visit_id;
      }
      if (lhs.nav_entry_timestamp != rhs.nav_entry_timestamp) {
        return lhs.nav_entry_timestamp < rhs.nav_entry_timestamp;
      }
      return lhs.url < rhs.url;
    }
  };
};
// The type of page content annotations stored in the history database.
//
// The numeric values are spelled out explicitly; per the note at the end of
// the enum they are mirrored by PageContentAnnotationsStorageType in
// optimization/histograms.xml, so existing values should not be changed.
enum class PageContentAnnotationsType {
  kUnknown = 0,
  // Results from executing the models on page content or annotations received
  // from the remote Optimization Guide service.
  kModelAnnotations = 1,
  // Related searches for the Google Search Results page.
  kRelatedSearches = 2,
  // Metadata for "search-like" pages.
  kSearchMetadata = 3,
  // Metadata received from the remote Optimization Guide service.
  kRemoteMetadata = 4,
  // Misspelled legacy name for |kRemoteMetadata|. It has the same value and
  // is kept only so that existing callers keep compiling; prefer
  // |kRemoteMetadata| in new code.
  kRemoteMetdata = kRemoteMetadata,
  // Salient image metadata.
  kSalientImageMetadata = 5,
  // New entries should be added to the PageContentAnnotationsStorageType in
  // optimization/histograms.xml.
};
// A KeyedService that annotates page content.
//
// In addition to exposing the annotation entry points below, this class
// implements EntityMetadataProvider and observes history::HistoryService and
// ZeroSuggestCacheService.
class PageContentAnnotationsService : public KeyedService,
public EntityMetadataProvider,
public history::HistoryServiceObserver,
public ZeroSuggestCacheService::Observer {
public:
// Observer interface to listen for PageContentAnnotations for page loads.
// Annotations will be sent for each page load for the registered annotation
// type.
class PageContentAnnotationsObserver : public base::CheckedObserver {
public:
// Receives the annotation |result| for the page load of |url|.
virtual void OnPageContentAnnotated(
const GURL& url,
const PageContentAnnotationsResult& result) = 0;
};
// Takes ownership of |autocomplete_provider_client| (it is passed as a
// std::unique_ptr). The remaining pointer arguments are raw pointers; see the
// member comments below for the lifetime requirements that are recorded for
// the history and template URL services.
PageContentAnnotationsService(
std::unique_ptr<AutocompleteProviderClient> autocomplete_provider_client,
const std::string& application_locale,
OptimizationGuideModelProvider* optimization_guide_model_provider,
history::HistoryService* history_service,
TemplateURLService* template_url_service,
ZeroSuggestCacheService* zero_suggest_cache_service,
leveldb_proto::ProtoDatabaseProvider* database_provider,
const base::FilePath& database_dir,
OptimizationGuideLogger* optimization_guide_logger,
scoped_refptr<base::SequencedTaskRunner> background_task_runner);
~PageContentAnnotationsService() override;
// Not copyable or copy-assignable.
PageContentAnnotationsService(const PageContentAnnotationsService&) = delete;
PageContentAnnotationsService& operator=(
const PageContentAnnotationsService&) = delete;
// This is the main entry point for page content annotations by external
// callers. Callers must call |RequestAndNotifyWhenModelAvailable| as close to
// session start as possible to allow time for the model file to be
// downloaded.
void BatchAnnotate(BatchAnnotationCallback callback,
const std::vector<std::string>& inputs,
AnnotationType annotation_type);
// Requests that the model for the given |type| be loaded in the background
// and then runs |callback| with true when the model is ready to execute. If
// the model is ready now, the callback is run immediately. If the model file
// will never be available, the callback is run with false.
void RequestAndNotifyWhenModelAvailable(
AnnotationType type,
base::OnceCallback<void(bool)> callback);
// Returns the model info for the given annotation type, if the model file is
// available.
absl::optional<ModelInfo> GetModelInfoForType(AnnotationType type) const;
// EntityMetadataProvider:
void GetMetadataForEntityId(
const std::string& entity_id,
EntityMetadataRetrievedCallback callback) override;
// history::HistoryServiceObserver:
void OnURLVisited(history::HistoryService* history_service,
const history::URLRow& url_row,
const history::VisitRow& visit_row) override;
// Overrides the PageContentAnnotator for testing. See
// test_page_content_annotator.h for an implementation designed for testing.
void OverridePageContentAnnotatorForTesting(PageContentAnnotator* annotator);
// Specifies whether PageContentAnnotationsService should extract "related
// searches" data from the ZPS response cache.
bool ShouldExtractRelatedSearchesFromZPSCache();
// ZeroSuggestCacheService::Observer:
void OnZeroSuggestResponseUpdated(
const std::string& page_url,
const ZeroSuggestCacheService::CacheEntry& response) override;
// Callback used to extract "related searches" data from cached ZPS responses.
// |url_result| is the history::QueryURLResult the callback is run with.
void ExtractRelatedSearchesFromZeroSuggestResponse(
const ZeroSuggestCacheService::CacheEntry& response,
history::QueryURLResult url_result);
// Adds or removes PageContentAnnotations observers for |annotation_type|.
void AddObserver(AnnotationType annotation_type,
PageContentAnnotationsObserver* observer);
void RemoveObserver(AnnotationType annotation_type,
PageContentAnnotationsObserver* observer);
// Returns the logger held in |optimization_guide_logger_|.
OptimizationGuideLogger* optimization_guide_logger() const {
return optimization_guide_logger_;
}
private:
// The declarations in this section only exist when the TFLite library is
// part of the build.
#if BUILDFLAG(BUILD_WITH_TFLITE_LIB)
// Callback invoked when a single |visit| has been annotated.
void OnPageContentAnnotated(
const HistoryVisit& visit,
const absl::optional<history::VisitContentModelAnnotations>&
content_annotations);
// Maybe calls |AnnotateVisitBatch| to start a new batch of content
// annotations. Returns true if a new batch is started. Returns false if a
// batch is already running, or if the batch queue is not full.
bool MaybeStartAnnotateVisitBatch();
// Runs the page annotation models available to |model_manager_| on all the
// visits within |current_visit_annotation_batch_|.
void AnnotateVisitBatch();
// Runs when a single annotation job of |type| is completed and |batch_result|
// can be merged into |merge_to_output|. |signal_merge_complete_callback|
// should be run last as it is a |base::BarrierClosure| that may trigger
// |OnBatchVisitsAnnotated| to run.
void OnAnnotationBatchComplete(
AnnotationType type,
std::vector<absl::optional<history::VisitContentModelAnnotations>>*
merge_to_output,
base::OnceClosure signal_merge_complete_callback,
const std::vector<BatchAnnotationResult>& batch_result);
// Callback run after all annotation types in |annotation_types_to_execute_|
// have been completed for all of |current_visit_annotation_batch_|.
void OnBatchVisitsAnnotated(
std::unique_ptr<
std::vector<absl::optional<history::VisitContentModelAnnotations>>>
merged_annotation_outputs);
// Owns the model manager referred to by the comments in this section.
std::unique_ptr<PageContentAnnotationsModelManager> model_manager_;
#endif
// The annotator to use for requests to |BatchAnnotate| and |Annotate|. In
// prod, this is simply |model_manager_.get()| but is set as a separate
// pointer here in order to be overridable for testing (see
// |OverridePageContentAnnotatorForTesting|).
raw_ptr<PageContentAnnotator> annotator_ = nullptr;
// The WCO friend class is used to keep the `Annotate` API internal to
// OptGuide. Callers should use `BatchAnnotate` instead.
friend class PageContentAnnotationsWebContentsObserver;
friend class PageContentAnnotationsServiceBrowserTest;
// Requests that |visit| be annotated.
//
// When finished annotating, it will store the relevant information in
// History Service.
//
// NOTE(review): earlier wording of this comment referred to |text| and
// |web_contents| parameters that the signature does not have; presumably the
// text now travels in |visit.text_to_annotate| - confirm against the
// implementation.
//
// Virtualized for testing.
virtual void Annotate(const HistoryVisit& visit);
// Requests |search_result_extractor_client_| to extract related searches from
// the Google SRP DOM associated with |web_contents|.
//
// Once finished, it will store the related searches in History Service.
//
// NOTE(review): content::WebContents is not declared in this header itself;
// presumably the declaration comes from one of the includes - confirm.
//
// Virtualized for testing.
virtual void ExtractRelatedSearches(const HistoryVisit& visit,
content::WebContents* web_contents);
// Callback invoked when related searches have been extracted for |visit|.
// |status| and |results| are the outcome reported by the extraction.
void OnRelatedSearchesExtracted(
const HistoryVisit& visit,
continuous_search::SearchResultExtractorClientStatus status,
continuous_search::mojom::CategoryResultsPtr results);
// Persist |page_entities_metadata| for |visit| in |history_service_|.
//
// Virtualized for testing.
virtual void PersistRemotePageMetadata(
const HistoryVisit& visit,
const proto::PageEntitiesMetadata& page_entities_metadata);
// Persist |salient_image_metadata| for |visit| in |history_service_|.
//
// Virtualized for testing.
virtual void PersistSalientImageMetadata(
const HistoryVisit& visit,
const proto::SalientImageMetadata& salient_image_metadata);
// Called when entity metadata for |entity_id| that had weight |weight| on
// page with |url| has been retrieved. |entity_metadata| is optional and may
// therefore be empty.
void OnEntityMetadataRetrieved(
const GURL& url,
const std::string& entity_id,
int weight,
const absl::optional<EntityMetadata>& entity_metadata);
// Callback that is handed a history::VisitID; used by |QueryURL| and
// |OnURLQueried| to write the bound content annotations.
using PersistAnnotationsCallback = base::OnceCallback<void(history::VisitID)>;
// Queries |history_service_| for all the visits to the visited URL of
// |visit|. |callback| will be invoked to write the bound content annotations
// to |history_service_| once the visits to the given URL have returned. The
// |annotation_type| of data to be stored in History Service is passed along
// for metrics purposes.
void QueryURL(const HistoryVisit& visit,
PersistAnnotationsCallback callback,
PageContentAnnotationsType annotation_type);
// Callback invoked when |history_service_| has returned results for the
// visits to a URL. In turn invokes |callback| to write the bound content
// annotations to |history_service_|.
void OnURLQueried(const HistoryVisit& visit,
PersistAnnotationsCallback callback,
PageContentAnnotationsType annotation_type,
history::QueryURLResult url_result);
// Notifies the observers registered for |annotation_type| of
// |page_content_annotations_result| for |url|.
void NotifyPageContentAnnotatedObservers(
AnnotationType annotation_type,
const GURL& url,
const PageContentAnnotationsResult& page_content_annotations_result);
// Provider client instance used when parsing cached ZPS response data.
std::unique_ptr<AutocompleteProviderClient> autocomplete_provider_client_;
// The minimum score that an allowlisted page category must have for it to be
// persisted.
const int min_page_category_score_to_persist_;
// The history service to write content annotations to. Not owned. Guaranteed
// to outlive |this|.
const raw_ptr<history::HistoryService> history_service_;
// Not owned and must outlive |this|. Can be nullptr in tests only.
const raw_ptr<TemplateURLService> template_url_service_;
// The scoped observation to the HistoryService.
base::ScopedObservation<history::HistoryService,
PageContentAnnotationsService>
history_service_observation_{this};
// The task tracker to keep track of tasks to query |history_service_|.
base::CancelableTaskTracker history_service_task_tracker_;
// The zero suggest cache service used to fetch cached ZPS response data.
const raw_ptr<ZeroSuggestCacheService> zero_suggest_cache_service_;
// The scoped observation to the ZeroSuggestCacheService.
base::ScopedObservation<ZeroSuggestCacheService,
PageContentAnnotationsService>
zero_suggest_cache_service_observation_{this};
// The client of continuous_search::mojom::SearchResultExtractor interface
// used for extracting data from the main frame of Google SRP |web_contents|.
continuous_search::SearchResultExtractorClient
search_result_extractor_client_;
// An LRU cache keeping track of the visits that have been requested for
// annotation. If the requested visit is in this cache, the models will not be
// requested for another annotation on the same visit. Keys are compared with
// HistoryVisit::Comp.
base::LRUCache<HistoryVisit, bool, HistoryVisit::Comp>
last_annotated_history_visits_;
// An LRU cache of the annotation results for visits, keyed by text. If the
// text of the visit is in the cache, the cached model annotations will be
// used.
base::HashingLRUCache<std::string, history::VisitContentModelAnnotations>
annotated_text_cache_;
// The set of visits to be annotated, this is added to by Annotate requests
// from the web content observer. These will be annotated when the set is full
// and annotations can be scheduled with minimal impact to browsing.
std::vector<HistoryVisit> visits_to_annotate_;
// The set of |AnnotationType|'s to run on each of |visits_to_annotate_|.
std::vector<AnnotationType> annotation_types_to_execute_;
// The batch of visits being annotated. If this is empty, it is assumed that
// no visits are actively being annotated and a new batch can be started.
std::vector<HistoryVisit> current_visit_annotation_batch_;
// Set during the constructor if the corresponding command line or feature
// flags are set.
std::unique_ptr<PageContentAnnotationsValidator> validator_;
// The logger returned by |optimization_guide_logger()|. Held as a raw_ptr;
// presumably not owned - confirm against the constructor.
raw_ptr<OptimizationGuideLogger> optimization_guide_logger_ = nullptr;
// Observers of PageContentAnnotations that have been registered per
// AnnotationType.
std::map<AnnotationType, base::ObserverList<PageContentAnnotationsObserver>>
page_content_annotations_observers_;
// Factory for weak pointers to |this|. Declared last, following the usual
// WeakPtrFactory convention, so that it is destroyed before the other
// members.
base::WeakPtrFactory<PageContentAnnotationsService> weak_ptr_factory_{this};
};
} // namespace optimization_guide
#endif // COMPONENTS_OPTIMIZATION_GUIDE_CONTENT_BROWSER_PAGE_CONTENT_ANNOTATIONS_SERVICE_H_