blob: ab47b74f09d552a7afcd50d86d718133aaca2658 [file] [log] [blame]
// Copyright 2021 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/optimization_guide/content/browser/page_content_annotations_service.h"

#include <algorithm>
#include <cmath>
#include <iterator>
#include <string>
#include <utility>
#include <vector>

#include "base/callback_helpers.h"
#include "base/metrics/histogram_functions.h"
#include "base/metrics/histogram_macros_local.h"
#include "base/rand_util.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/default_tick_clock.h"
#include "base/timer/timer.h"
#include "components/history/core/browser/history_service.h"
#include "components/leveldb_proto/public/proto_database_provider.h"
#include "components/optimization_guide/core/local_page_entities_metadata_provider.h"
#include "components/optimization_guide/core/noisy_metrics_recorder.h"
#include "components/optimization_guide/core/optimization_guide_enums.h"
#include "components/optimization_guide/core/optimization_guide_features.h"
#include "components/optimization_guide/core/optimization_guide_model_provider.h"
#include "components/optimization_guide/core/optimization_guide_switches.h"
#include "content/public/browser/navigation_entry.h"
#include "content/public/browser/web_contents.h"
#include "services/metrics/public/cpp/metrics_utils.h"
#include "services/metrics/public/cpp/ukm_builders.h"
#include "services/metrics/public/cpp/ukm_recorder.h"
#include "services/metrics/public/cpp/ukm_source.h"
#include "services/metrics/public/cpp/ukm_source_id.h"
#if BUILDFLAG(BUILD_WITH_TFLITE_LIB)
#include "components/optimization_guide/content/browser/page_content_annotations_model_manager.h"
#endif
namespace optimization_guide {
namespace {
// Emits |status| to the
// "OptimizationGuide.PageContentAnnotationsService.ContentAnnotationsStorageStatus"
// enumeration histogram. Callers must not pass kUnknown.
void LogPageContentAnnotationsStorageStatus(
    PageContentAnnotationsStorageStatus status) {
  static constexpr char kStorageStatusHistogram[] =
      "OptimizationGuide.PageContentAnnotationsService."
      "ContentAnnotationsStorageStatus";

  DCHECK_NE(status, PageContentAnnotationsStorageStatus::kUnknown);
  base::UmaHistogramEnumeration(kStorageStatusHistogram, status);
}
#if BUILDFLAG(BUILD_WITH_TFLITE_LIB)
// Records the visibility score of the provided visit as a RAPPOR-style
// (noised) record to UKM.
//
// Nothing is recorded when |content_annotations| is empty or when its
// visibility score is negative. Otherwise the score is scaled by 100, mapped
// onto one of 2^NumBitsForRAPPORMetrics() linearly spaced buckets, run through
// NoisyMetricsRecorder and written as the VisibilityScore metric of a
// PageContentAnnotations UKM event whose source id is derived from
// |visit.navigation_id|.
void MaybeRecordVisibilityUKM(
    const HistoryVisit& visit,
    const absl::optional<history::VisitContentModelAnnotations>&
        content_annotations) {
  if (!content_annotations)
    return;
  if (content_annotations->visibility_score < 0)
    return;

  const int64_t score =
      static_cast<int64_t>(100 * content_annotations->visibility_score);

  // We want 2^|num_bits| buckets, linearly spaced. The bucket index is later
  // carried in a uint32_t, so the bit count has to stay below 32.
  const auto num_bits = optimization_guide::features::NumBitsForRAPPORMetrics();
  DCHECK_LT(static_cast<uint32_t>(num_bits), 32u);
  const int64_t num_buckets = int64_t{1} << num_bits;
  DCHECK_GT(num_buckets, 0);

  // Integer floor of score / (100 / num_buckets). Valid indices are
  // [0, num_buckets - 1]; a score of 100 (or anything above it) would
  // otherwise land on |num_buckets|, which does not fit in |num_bits| bits, so
  // it is folded into the top bucket.
  const int64_t clamped_bucket =
      std::min<int64_t>(score * num_buckets / 100, num_buckets - 1);
  const uint32_t bucketed_score = static_cast<uint32_t>(clamped_bucket);
  DCHECK_LT(bucketed_score, num_buckets);

  const uint32_t noisy_score = NoisyMetricsRecorder().GetNoisyMetric(
      optimization_guide::features::NoiseProbabilityForRAPPORMetrics(),
      bucketed_score, num_bits);

  const ukm::SourceId ukm_source_id = ukm::ConvertToSourceId(
      visit.navigation_id, ukm::SourceIdType::NAVIGATION_ID);
  ukm::builders::PageContentAnnotations(ukm_source_id)
      .SetVisibilityScore(static_cast<int64_t>(noisy_score))
      .Record(ukm::UkmRecorder::Get());
}
#endif /* BUILDFLAG(BUILD_WITH_TFLITE_LIB) */
// Word list from which RunBatchAnnotationValidation() draws random pairs to
// build its dummy "<word>-<word>.com" validation inputs.
constexpr const char* kRandomWords[] = {
    "interesting", "chunky",    "maniacal", "tickle",   "lettuce",
    "obsequious",  "stir",      "bless",    "colossal", "squealing",
    "elegant",     "ambitious", "eight",    "frighten", "descriptive",
    "pretty",      "curly",     "regular",  "uneven",   "heap",
};

// Number of entries in |kRandomWords|. Derived from the array itself so the
// two cannot drift apart when words are added or removed.
constexpr size_t kCountRandomWords = std::size(kRandomWords);
} // namespace
// Constructs the service.
//
// - Sizes the recently-requested-visit cache and the annotated-text cache from
//   feature parameters.
// - When built with TFLite, creates the model manager and uses it as the
//   annotator.
// - When the local page entities metadata provider feature is enabled, creates
//   and initializes that provider with |database_provider|, |database_dir| and
//   |background_task_runner|.
// - When batch annotation validation is enabled, requests the relevant model
//   and schedules RunBatchAnnotationValidation() after the configured startup
//   delay.
//
// |optimization_guide_model_provider| and |history_service| must be non-null.
PageContentAnnotationsService::PageContentAnnotationsService(
    const std::string& application_locale,
    OptimizationGuideModelProvider* optimization_guide_model_provider,
    history::HistoryService* history_service,
    leveldb_proto::ProtoDatabaseProvider* database_provider,
    const base::FilePath& database_dir,
    scoped_refptr<base::SequencedTaskRunner> background_task_runner)
    : last_annotated_history_visits_(
          features::MaxContentAnnotationRequestsCached()),
      annotated_text_cache_(features::MaxVisitAnnotationCacheSize()) {
  DCHECK(optimization_guide_model_provider);
  DCHECK(history_service);
  history_service_ = history_service;

#if BUILDFLAG(BUILD_WITH_TFLITE_LIB)
  model_manager_ = std::make_unique<PageContentAnnotationsModelManager>(
      application_locale, optimization_guide_model_provider);
  annotator_ = model_manager_.get();
#endif

  if (features::UseLocalPageEntitiesMetadataProvider()) {
    local_page_entities_metadata_provider_ =
        std::make_unique<LocalPageEntitiesMetadataProvider>();
    // This is the only consumer of the task runner, so hand over our
    // reference instead of taking an extra one.
    local_page_entities_metadata_provider_->Initialize(
        database_provider, database_dir, std::move(background_task_runner));
  }

  if (features::BatchAnnotationsValidationEnabled()) {
    // Normally the caller would do this, but we are our own caller.
    RequestAndNotifyWhenModelAvailable(
        features::BatchAnnotationsValidationUsePageTopics()
            ? AnnotationType::kPageTopics
            : AnnotationType::kContentVisibility,
        base::DoNothing());

    validation_timer_ = std::make_unique<base::OneShotTimer>(
        base::DefaultTickClock::GetInstance());
    // The timer fires exactly once, so a once-callback is the right shape.
    validation_timer_->Start(
        FROM_HERE, features::BatchAnnotationValidationStartupDelay(),
        base::BindOnce(
            &PageContentAnnotationsService::RunBatchAnnotationValidation,
            weak_ptr_factory_.GetWeakPtr()));
  }
}
// Defaulted destructor: no teardown is performed here beyond the implicit
// destruction of the members.
PageContentAnnotationsService::~PageContentAnnotationsService() = default;
// Requests annotation of |visit|.
//
// A visit that was already submitted (per |last_annotated_history_visits_|) is
// ignored. In builds without TFLite nothing further happens. Otherwise, for a
// visit that carries text:
//   - if the text is in |annotated_text_cache_|, the cached annotations are
//     reported right away via OnPageContentAnnotated();
//   - else the visit is appended to |visits_to_annotate_|. Once that queue
//     reaches features::AnnotateVisitBatchSize() it either becomes the active
//     batch (when no batch is running) or its oldest entry is dropped.
void PageContentAnnotationsService::Annotate(const HistoryVisit& visit) {
  if (last_annotated_history_visits_.Peek(visit) !=
      last_annotated_history_visits_.end()) {
    // We have already been requested to annotate this visit, so don't submit
    // for re-annotation.
    return;
  }
  last_annotated_history_visits_.Put(visit, true);

#if BUILDFLAG(BUILD_WITH_TFLITE_LIB)
  if (!visit.text_to_annotate)
    return;

  // Used for testing.
  LOCAL_HISTOGRAM_BOOLEAN(
      "PageContentAnnotations.AnnotateVisit.AnnotationRequested", true);

  auto it = annotated_text_cache_.Peek(*visit.text_to_annotate);
  if (it != annotated_text_cache_.end()) {
    // We already have annotations for the text of this visit, so return those
    // immediately rather than re-executing the model.
    //
    // TODO(crbug.com/1291275): If the model was updated, the cached value could
    // be stale so we should invalidate the cache on model updates.
    OnPageContentAnnotated(visit, it->second);
    base::UmaHistogramBoolean(
        "OptimizationGuide.PageContentAnnotations.AnnotateVisitResultCached",
        true);
    return;
  }

  if (switches::ShouldLogPageContentAnnotationsInput()) {
    LOG(ERROR) << "Adding annotation job: \n"
               << "URL: " << visit.url << "\n"
               << "Text: " << visit.text_to_annotate.value_or(std::string());
  }

  visits_to_annotate_.emplace_back(visit);
  base::UmaHistogramBoolean(
      "OptimizationGuide.PageContentAnnotations.AnnotateVisitResultCached",
      false);

  if (visits_to_annotate_.size() >= features::AnnotateVisitBatchSize()) {
    if (current_visit_annotation_batch_.empty()) {
      // Used for testing.
      LOCAL_HISTOGRAM_BOOLEAN(
          "PageContentAnnotations.AnnotateVisit.BatchAnnotationStarted", true);
      current_visit_annotation_batch_ = std::move(visits_to_annotate_);
      // A moved-from container is only guaranteed to be in a valid but
      // unspecified state; reset it explicitly because the queue keeps being
      // appended to by later calls.
      visits_to_annotate_.clear();
      AnnotateVisitBatch();
      return;
    }

    // The queue is full and a batch annotation is actively being done, so
    // we will remove the "oldest" visit.
    visits_to_annotate_.erase(visits_to_annotate_.begin());
    // Used for testing.
    LOCAL_HISTOGRAM_BOOLEAN(
        "PageContentAnnotations.AnnotateVisit.QueueFullVisitDropped", true);
  }

  // Used for testing.
  LOCAL_HISTOGRAM_BOOLEAN(
      "PageContentAnnotations.AnnotateVisit.AnnotationRequestQueued", true);
#endif
}
#if BUILDFLAG(BUILD_WITH_TFLITE_LIB)
// Submits the last visit of |current_visit_annotation_batch_| to the model
// manager. OnBatchVisitAnnotated() receives the result, pops that visit and
// calls back into this method until the batch is drained.
//
// Does nothing when the StopHistoryVisitBatchAnnotateForTesting switch is set.
void PageContentAnnotationsService::AnnotateVisitBatch() {
  DCHECK(!current_visit_annotation_batch_.empty());
  if (switches::StopHistoryVisitBatchAnnotateForTesting()) {
    // Code beyond this is tested in multiple places. This just ensures the
    // calls up to this point can be more easily configured.
    return;
  }

  while (!current_visit_annotation_batch_.empty()) {
    // Deliberately a copy: the bound callback needs its own HistoryVisit, and
    // the batch entry is popped once the annotation completes.
    auto visit = current_visit_annotation_batch_.back();
    DCHECK(visit.text_to_annotate);
    if (!visit.text_to_annotate) {
      // Should not happen (Annotate() only queues visits that have text), but
      // if it does, drop the visit and move on. Leaving it in place would mean
      // nothing is dispatched and nothing is ever popped, so the batch would
      // never drain and no later batch could start.
      current_visit_annotation_batch_.pop_back();
      continue;
    }

    model_manager_->Annotate(
        *(visit.text_to_annotate),
        base::BindOnce(&PageContentAnnotationsService::OnBatchVisitAnnotated,
                       weak_ptr_factory_.GetWeakPtr(), visit));
    return;
  }
}
// Completion handler for one visit of the active batch: forwards the result to
// OnPageContentAnnotated(), removes the visit from the back of
// |current_visit_annotation_batch_| and, if visits remain, starts the next
// annotation.
void PageContentAnnotationsService::OnBatchVisitAnnotated(
    const HistoryVisit& visit,
    const absl::optional<history::VisitContentModelAnnotations>&
        content_annotations) {
  OnPageContentAnnotated(visit, content_annotations);

  // The visit that just finished is expected to be the one at the back.
  DCHECK_EQ(visit.navigation_id,
            current_visit_annotation_batch_.back().navigation_id);
  current_visit_annotation_batch_.pop_back();

  if (current_visit_annotation_batch_.empty())
    return;
  AnnotateVisitBatch();
}
#endif
// Test-only hook: replaces the annotator used by BatchAnnotate() and
// BatchAnnotatePageTopics() with |annotator|. The pointer is stored as-is.
void PageContentAnnotationsService::OverridePageContentAnnotatorForTesting(
    PageContentAnnotator* annotator) {
  this->annotator_ = annotator;
}
// static
// Converts |url| into the string form used as page topics input: the
// lower-cased host, without a leading "www.", and with every '-', '_', '.'
// and '+' replaced by a space.
std::string PageContentAnnotationsService::StringInputForPageTopicsDomain(
    const GURL& url) {
  std::string domain = base::ToLowerASCII(url.host());

  // Strip the 'www.' if it exists.
  if (base::StartsWith(domain, "www.")) {
    domain.erase(0, 4);
  }

  // Single in-place pass over the host; no temporary containers needed.
  for (char& c : domain) {
    if (c == '-' || c == '_' || c == '.' || c == '+') {
      c = ' ';
    }
  }
  return domain;
}
// Runs a page topics batch annotation over |inputs|.
//
// Each URL is first converted with StringInputForPageTopicsDomain(). If no
// annotator is set, |callback| is run immediately with empty results for those
// converted inputs; otherwise the request is handed to the annotator with
// AnnotationType::kPageTopics.
void PageContentAnnotationsService::BatchAnnotatePageTopics(
    BatchAnnotationCallback callback,
    const std::vector<GURL>& inputs) {
  std::vector<std::string> domains;
  // The output size is known up front; avoid repeated reallocation.
  domains.reserve(inputs.size());
  for (const GURL& url : inputs) {
    domains.emplace_back(StringInputForPageTopicsDomain(url));
  }

  if (!annotator_) {
    std::move(callback).Run(CreateEmptyBatchAnnotationResults(domains));
    return;
  }
  annotator_->Annotate(std::move(callback), domains,
                       AnnotationType::kPageTopics);
}
// Runs a batch annotation of |annotation_type| over |inputs|.
//
// Page topics requests must go through BatchAnnotatePageTopics() instead. If
// no annotator is set, |callback| is run immediately with empty results for
// |inputs|.
void PageContentAnnotationsService::BatchAnnotate(
    BatchAnnotationCallback callback,
    const std::vector<std::string>& inputs,
    AnnotationType annotation_type) {
  DCHECK_NE(annotation_type, AnnotationType::kPageTopics)
      << "Please use |BatchAnnotatePageTopics| instead";

  if (annotator_) {
    annotator_->Annotate(std::move(callback), inputs, annotation_type);
    return;
  }

  // No annotator available: answer with empty results.
  std::move(callback).Run(CreateEmptyBatchAnnotationResults(inputs));
}
// Returns the model manager's ModelInfo for |type|. In builds without TFLite
// there is no model manager and the result is always absl::nullopt.
absl::optional<ModelInfo> PageContentAnnotationsService::GetModelInfoForType(
    AnnotationType type) const {
#if !BUILDFLAG(BUILD_WITH_TFLITE_LIB)
  return absl::nullopt;
#else
  DCHECK(model_manager_);
  return model_manager_->GetModelInfoForType(type);
#endif
}
// Forwards the request for the |type| model, together with |callback|, to the
// model manager. In builds without TFLite |callback| is run immediately with
// false.
void PageContentAnnotationsService::RequestAndNotifyWhenModelAvailable(
    AnnotationType type,
    base::OnceCallback<void(bool)> callback) {
#if !BUILDFLAG(BUILD_WITH_TFLITE_LIB)
  std::move(callback).Run(false);
#else
  DCHECK(model_manager_);
  model_manager_->RequestAndNotifyWhenModelAvailable(type, std::move(callback));
#endif
}
// Looks up the history entry matching |visit| and, once found, stores the
// normalized URL and search terms from |search_metadata| for it through
// HistoryService::AddSearchMetadataForVisit.
void PageContentAnnotationsService::PersistSearchMetadata(
    const HistoryVisit& visit,
    const SearchMetadata& search_metadata) {
  auto persist_callback = base::BindOnce(
      &history::HistoryService::AddSearchMetadataForVisit,
      history_service_->AsWeakPtr(), search_metadata.normalized_url,
      search_metadata.search_terms);
  QueryURL(visit, std::move(persist_callback));
}
// Asks the search result extractor client for the related-searches results of
// |web_contents|. The outcome is delivered to OnRelatedSearchesExtracted()
// together with |visit|.
void PageContentAnnotationsService::ExtractRelatedSearches(
    const HistoryVisit& visit,
    content::WebContents* web_contents) {
  auto on_extracted = base::BindOnce(
      &PageContentAnnotationsService::OnRelatedSearchesExtracted,
      weak_ptr_factory_.GetWeakPtr(), visit);
  search_result_extractor_client_.RequestData(
      web_contents, {continuous_search::mojom::ResultType::kRelatedSearches},
      std::move(on_extracted));
}
#if BUILDFLAG(BUILD_WITH_TFLITE_LIB)
// Handles the annotation result for |visit|.
//
// Always logs whether annotations were produced. When they were: caches them
// under the visit's text (unless already cached), records the visibility UKM
// and, if writing to the history service is enabled, persists them for the
// matching history visit.
void PageContentAnnotationsService::OnPageContentAnnotated(
    const HistoryVisit& visit,
    const absl::optional<history::VisitContentModelAnnotations>&
        content_annotations) {
  const bool was_annotated = content_annotations.has_value();
  base::UmaHistogramBoolean(
      "OptimizationGuide.PageContentAnnotationsService.ContentAnnotated",
      was_annotated);
  if (!was_annotated)
    return;

  // Only insert when the text is not cached yet.
  const auto& annotated_text = *visit.text_to_annotate;
  const bool already_cached =
      annotated_text_cache_.Peek(annotated_text) != annotated_text_cache_.end();
  if (!already_cached) {
    annotated_text_cache_.Put(annotated_text, *content_annotations);
  }

  MaybeRecordVisibilityUKM(visit, content_annotations);

  if (features::ShouldWriteContentAnnotationsToHistoryService()) {
    QueryURL(visit,
             base::BindOnce(
                 &history::HistoryService::AddContentModelAnnotationsForVisit,
                 history_service_->AsWeakPtr(), *content_annotations));
  }
}
#endif
// Receives the result of a related-searches extraction for |visit|.
//
// Logs whether the extraction succeeded. On success, collects the
// whitespace-collapsed UTF-8 titles of the first result group of type
// kRelatedSearches and, when that list is non-empty and writing to the history
// service is enabled, persists it for the matching history visit.
void PageContentAnnotationsService::OnRelatedSearchesExtracted(
    const HistoryVisit& visit,
    continuous_search::SearchResultExtractorClientStatus status,
    continuous_search::mojom::CategoryResultsPtr results) {
  const bool success =
      status == continuous_search::SearchResultExtractorClientStatus::kSuccess;
  base::UmaHistogramBoolean(
      "OptimizationGuide.PageContentAnnotationsService."
      "RelatedSearchesExtracted",
      success);
  if (!success) {
    return;
  }

  std::vector<std::string> related_searches;
  for (const auto& group : results->groups) {
    if (group->type != continuous_search::mojom::ResultType::kRelatedSearches) {
      continue;
    }
    related_searches.reserve(group->results.size());
    for (const auto& result : group->results) {
      related_searches.push_back(
          base::UTF16ToUTF8(base::CollapseWhitespace(result->title, true)));
    }
    // Only the first related-searches group is used.
    break;
  }

  if (related_searches.empty()) {
    return;
  }
  if (!features::ShouldWriteContentAnnotationsToHistoryService()) {
    return;
  }

  // |related_searches| is not used past this point, so move it into the
  // callback rather than copying the whole vector.
  QueryURL(visit,
           base::BindOnce(&history::HistoryService::AddRelatedSearchesForVisit,
                          history_service_->AsWeakPtr(),
                          std::move(related_searches)));
}
// Queries the history service for |visit.url| (including its visits). The
// result is delivered to OnURLQueried() along with |visit| and |callback|; the
// request is tracked by |history_service_task_tracker_|.
void PageContentAnnotationsService::QueryURL(
    const HistoryVisit& visit,
    PersistAnnotationsCallback callback) {
  auto on_url_queried =
      base::BindOnce(&PageContentAnnotationsService::OnURLQueried,
                     weak_ptr_factory_.GetWeakPtr(), visit,
                     std::move(callback));
  history_service_->QueryURL(visit.url, /*want_visits=*/true,
                             std::move(on_url_queried),
                             &history_service_task_tracker_);
}
// Receives the history query result for |visit.url|.
//
// If the query failed, logs kNoVisitsForUrl. Otherwise runs |callback| with
// the id of the first returned visit whose time equals
// |visit.nav_entry_timestamp| and logs kSuccess, or logs
// kSpecificVisitForUrlNotFound when no returned visit matches.
void PageContentAnnotationsService::OnURLQueried(
    const HistoryVisit& visit,
    PersistAnnotationsCallback callback,
    history::QueryURLResult url_result) {
  if (!url_result.success) {
    LogPageContentAnnotationsStorageStatus(
        PageContentAnnotationsStorageStatus::kNoVisitsForUrl);
    return;
  }

  bool did_store_content_annotations = false;
  for (const auto& visit_for_url : url_result.visits) {
    if (visit.nav_entry_timestamp != visit_for_url.visit_time)
      continue;

    // |callback| is a once-callback, so stop at the first match.
    std::move(callback).Run(visit_for_url.visit_id);
    did_store_content_annotations = true;
    break;
  }

  LogPageContentAnnotationsStorageStatus(
      did_store_content_annotations
          ? PageContentAnnotationsStorageStatus::kSuccess
          : PageContentAnnotationsStorageStatus::kSpecificVisitForUrlNotFound);
}
// Retrieves the metadata for |entity_id| and passes it to |callback|.
//
// The local page entities metadata provider is used when its feature is
// enabled. Otherwise the model manager is used in TFLite builds, and in
// non-TFLite builds |callback| is run with absl::nullopt.
void PageContentAnnotationsService::GetMetadataForEntityId(
    const std::string& entity_id,
    EntityMetadataRetrievedCallback callback) {
  if (!features::UseLocalPageEntitiesMetadataProvider()) {
#if BUILDFLAG(BUILD_WITH_TFLITE_LIB)
    model_manager_->GetMetadataForEntityId(entity_id, std::move(callback));
#else
    std::move(callback).Run(absl::nullopt);
#endif
    return;
  }

  DCHECK(local_page_entities_metadata_provider_);
  local_page_entities_metadata_provider_->GetMetadataForEntityId(
      entity_id, std::move(callback));
}
void PageContentAnnotationsService::PersistRemotePageEntities(
const HistoryVisit& history_visit,
const std::vector<history::VisitContentModelAnnotations::Category>&
entities) {
history::VisitContentModelAnnotations annotations;
annotations.entities = entities;
QueryURL(history_visit,
base::BindOnce(
&history::HistoryService::AddContentModelAnnotationsForVisit,
history_service_->AsWeakPtr(), annotations));
}
void PageContentAnnotationsService::RunBatchAnnotationValidation() {
DCHECK(features::BatchAnnotationsValidationEnabled());
DCHECK(validation_timer_);
validation_timer_.reset();
std::vector<std::string> dummy_inputs;
dummy_inputs.reserve(features::BatchAnnotationsValidationBatchSize());
for (size_t i = 0; i < features::BatchAnnotationsValidationBatchSize(); i++) {
const char* word1 = kRandomWords[base::RandGenerator(kCountRandomWords)];
const char* word2 = kRandomWords[base::RandGenerator(kCountRandomWords)];
dummy_inputs.emplace_back(base::StringPrintf("%s-%s.com", word1, word2));
}
LOCAL_HISTOGRAM_COUNTS_100(
"OptimizationGuide.PageContentAnnotationsService.ValidationRun",
dummy_inputs.size());
BatchAnnotate(base::DoNothing(), dummy_inputs,
features::BatchAnnotationsValidationUsePageTopics()
? AnnotationType::kPageTopics
: AnnotationType::kContentVisibility);
}
// static
// Builds a HistoryVisit from the timestamp of |web_contents|'s last committed
// navigation entry, its last committed URL, and |navigation_id|.
HistoryVisit PageContentAnnotationsService::CreateHistoryVisitFromWebContents(
    content::WebContents* web_contents,
    int64_t navigation_id) {
  const base::Time entry_timestamp =
      web_contents->GetController().GetLastCommittedEntry()->GetTimestamp();
  return HistoryVisit(entry_timestamp, web_contents->GetLastCommittedURL(),
                      navigation_id);
}
// Special members of HistoryVisit. The default constructor, destructor and
// copy constructor are compiler-generated.
HistoryVisit::HistoryVisit() = default;

// Creates a visit with the given navigation entry timestamp, URL and
// navigation id; any other members keep their default values.
HistoryVisit::HistoryVisit(base::Time nav_entry_timestamp,
                           GURL url,
                           int64_t navigation_id) {
  this->nav_entry_timestamp = nav_entry_timestamp;
  // |url| is taken by value, so move it into place instead of copying again.
  this->url = std::move(url);
  this->navigation_id = navigation_id;
}

HistoryVisit::~HistoryVisit() = default;

HistoryVisit::HistoryVisit(const HistoryVisit&) = default;
} // namespace optimization_guide