blob: 2443c9fbc3e3744b8dd3d27096191d29fbd96b55 [file]
// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/page_content_annotations/content/annotate_page_content_request.h"
#include <utility>
#include "base/check.h"
#include "base/command_line.h"
#include "base/feature_list.h"
#include "base/functional/bind.h"
#include "base/metrics/histogram_functions.h"
#include "base/metrics/histogram_macros.h"
#include "base/task/task_traits.h"
#include "base/task/thread_pool.h"
#include "base/time/time.h"
#include "base/timer/elapsed_timer.h"
#include "base/trace_event/trace_event.h"
#include "build/build_config.h"
#include "components/content_extraction/content/browser/inner_text.h"
#include "components/history/core/browser/features.h"
#include "components/optimization_guide/content/browser/page_content_proto_provider.h"
#include "components/optimization_guide/content/browser/page_context_eligibility.h"
#include "components/page_content_annotations/content/page_content_extraction_service.h"
#include "components/page_content_annotations/core/page_content_annotations_features.h"
#include "components/page_content_annotations/core/page_content_annotations_switches.h"
#include "components/page_content_annotations/core/page_content_extraction_types.h"
#include "components/pdf/common/constants.h"
#include "content/public/browser/browser_thread.h"
#include "content/public/browser/navigation_handle.h"
#include "content/public/browser/page.h"
#include "net/http/http_response_headers.h"
#include "pdf/buildflags.h"
#include "services/metrics/public/cpp/metrics_utils.h"
#include "services/metrics/public/cpp/ukm_builders.h"
#include "services/network/public/mojom/url_response_head.mojom.h"
#include "url/gurl.h"
#if BUILDFLAG(ENABLE_PDF)
#include "components/pdf/browser/pdf_document_helper.h"
#endif // BUILDFLAG(ENABLE_PDF)
namespace page_content_annotations {
namespace {
#if BUILDFLAG(ENABLE_PDF)
// Records the page count of a PDF to UKM against `source_id`, bucketed with
// ukm::GetExponentialBucketMinForCounts1000(). Nothing is recorded when
// `status` is kFailed. `bytes` is part of the callback signature but is not
// read here.
void RecordPdfPageCountMetrics(
    ukm::SourceId source_id,
    pdf::mojom::PdfListener::GetPdfBytesStatus status,
    const std::vector<uint8_t>& bytes,
    uint32_t page_count) {
  using Status = pdf::mojom::PdfListener::GetPdfBytesStatus;
  if (status != Status::kFailed) {
    ukm::builders::OptimizationGuide_AnnotatedPdfContent builder(source_id);
    builder.SetPdfPageCount(
        ukm::GetExponentialBucketMinForCounts1000(page_count));
    builder.Record(ukm::UkmRecorder::Get());
  }
}
#endif // BUILDFLAG(ENABLE_PDF)
// Logs the elapsed time of `timer` and whether `result` holds a value to the
// OnDemand latency/success histograms, then returns `result` unchanged so the
// function can be chained in front of the caller's callback.
std::optional<ExtractedPageContentResult>
RecordAndReturnOnDemandExtractionResult(
    base::ElapsedTimer timer,
    std::optional<ExtractedPageContentResult> result) {
  const base::TimeDelta latency = timer.Elapsed();
  const bool succeeded = result.has_value();

  base::UmaHistogramTimes(
      "OptimizationGuide.PageContentExtraction.OnDemand.Latency", latency);
  base::UmaHistogramBoolean(
      "OptimizationGuide.PageContentExtraction.OnDemand.Success", succeeded);
  return result;
}
} // namespace
// static
// Builds the AIPageContentOptions from the current feature configuration
// (mode, critical-path flag, optional ad exclusion) and returns a new
// AnnotatedPageContentRequest that owns them.
std::unique_ptr<AnnotatedPageContentRequest>
AnnotatedPageContentRequest::Create(
    content::WebContents* web_contents,
    PageContentExtractionService& page_content_extraction_service,
    FetchPageContextCallback fetch_page_context_callback,
    GetTabIdCallback get_tab_id_callback) {
  namespace pca_features = page_content_annotations::features;

  auto options = blink::mojom::AIPageContentOptions::New();

  // Any mode string other than "actionable" falls back to the default mode.
  if (pca_features::AnnotatedPageContentMode() == "actionable") {
    options->mode = blink::mojom::AIPageContentMode::kActionableElements;
  } else {
    options->mode = blink::mojom::AIPageContentMode::kDefault;
  }

  options->on_critical_path =
      pca_features::IsAnnotatedPageContentOnCriticalPath();

  if (pca_features::ShouldAnnotatedPageContentExcludeAdRelated()) {
    auto non_salient_config = blink::mojom::NonSalientContentConfig::New();
    non_salient_config->exclude_ad_related = true;
    options->non_salient_content_config = std::move(non_salient_config);
  }

  return std::make_unique<AnnotatedPageContentRequest>(
      web_contents, page_content_extraction_service, std::move(options),
      std::move(fetch_page_context_callback), std::move(get_tab_id_callback));
}
// Stores the collaborators and feature-derived settings, then asynchronously
// requests the page-context eligibility API.
AnnotatedPageContentRequest::AnnotatedPageContentRequest(
    content::WebContents* web_contents,
    PageContentExtractionService& page_content_extraction_service,
    blink::mojom::AIPageContentOptionsPtr request,
    FetchPageContextCallback fetch_page_context_callback,
    GetTabIdCallback get_tab_id_callback)
    : page_content_extraction_service_(page_content_extraction_service),
      web_contents_(web_contents),
      request_(std::move(request)),
      delay_(features::GetAnnotatedPageContentCaptureDelay()),
      include_inner_text_(
          features::ShouldAnnotatedPageContentStudyIncludeInnerText()),
      fetch_page_context_callback_(std::move(fetch_page_context_callback)),
      get_tab_id_callback_(std::move(get_tab_id_callback)) {
  // PageContextEligibility::Get() is run on the thread pool (best-effort,
  // may block) so it does not block the constructing sequence. The reply is
  // bound to a weak pointer from `weak_factory_`, so it is dropped if those
  // weak pointers are invalidated before it runs.
  auto on_api_loaded = base::BindOnce(
      &AnnotatedPageContentRequest::OnPageContextEligibilityAPILoaded,
      weak_factory_.GetWeakPtr());
  base::ThreadPool::PostTaskAndReplyWithResult(
      FROM_HERE, {base::TaskPriority::BEST_EFFORT, base::MayBlock()},
      base::BindOnce(&optimization_guide::PageContextEligibility::Get),
      std::move(on_api_loaded));
}
// Any on-demand callbacks still pending at destruction are run with an empty
// result so that no caller is left waiting.
AnnotatedPageContentRequest::~AnnotatedPageContentRequest() {
  const std::optional<ExtractedPageContentResult> no_result;
  ResolveAllCallbacksWith(no_result);
}
void AnnotatedPageContentRequest::PrimaryPageChanged() {
ResetForNewNavigation();
}
// Handles committed same-document navigations in the primary main frame.
// Those that update history (and are not on a 404 document when
// kVisitedLinksOn404 is enabled) reset the state and schedule a fresh
// extraction right away.
void AnnotatedPageContentRequest::DidFinishNavigation(
    content::NavigationHandle* navigation_handle) {
  // Cross-document navigations are handled in PrimaryPageChanged, so only
  // committed same-document navigations of the primary main frame continue.
  const bool is_committed_same_document_in_main_frame =
      navigation_handle->IsInPrimaryMainFrame() &&
      navigation_handle->IsSameDocument() &&
      navigation_handle->HasCommitted();
  if (!is_committed_same_document_in_main_frame) {
    return;
  }

  // Heuristic: trade off update frequency against coverage of single-page
  // apps. A navigation that will appear in the browser history is likely a
  // significant change in page state; others are ignored.
  if (!navigation_handle->ShouldUpdateHistory()) {
    return;
  }

  if (base::FeatureList::IsEnabled(history::kVisitedLinksOn404)) {
    // With this flag, navigations with a 404 status become eligible for
    // History, but 404s should still be ignored here. A same-document
    // navigation has no network request of its own, so the status is read
    // from the response that produced the current document rather than from
    // the `NavigationHandle`.
    const auto* last_response_head =
        navigation_handle->GetRenderFrameHost()->GetLastResponseHead();
    const bool has_headers =
        last_response_head && last_response_head->headers;
    if (!has_headers ||
        last_response_head->headers->response_code() == 404) {
      return;
    }
  }

  ResetForNewNavigation();

  // Same-document navigations have no reliable load or FCP signal, so the
  // content is assumed ready as soon as the navigation commits.
  waiting_for_load_ = false;
  waiting_for_fcp_ = false;
  MaybeScheduleExtraction();
}
// Marks the page as loaded once the main document has fired `load` and
// nothing in the WebContents is still loading, then tries to schedule
// extraction.
void AnnotatedPageContentRequest::DidStopLoading() {
  // The main frame's Document must have finished loading, and every subframe
  // currently in the FrameTree must have finished as well.
  const bool everything_loaded =
      web_contents_->IsDocumentOnLoadCompletedInPrimaryMainFrame() &&
      !web_contents_->IsLoading();
  if (!everything_loaded) {
    return;
  }

  // PDFs and hidden tabs don't provide a reliable FirstContentfulPaint
  // signal, so the FCP wait is skipped for them (and when the testing switch
  // is present).
  const bool skip_fcp_wait =
      web_contents_->GetContentsMimeType() == pdf::kPDFMimeType ||
      web_contents_->GetVisibility() == content::Visibility::HIDDEN ||
      base::CommandLine::ForCurrentProcess()->HasSwitch(
          switches::kPageContentAnnotationsSkipFCPWaitForTesting);
  if (skip_fcp_wait) {
    waiting_for_fcp_ = false;
  }

  waiting_for_load_ = false;
  MaybeScheduleExtraction();
}
void AnnotatedPageContentRequest::OnFirstContentfulPaintInPrimaryMainFrame() {
waiting_for_fcp_ = false;
MaybeScheduleExtraction();
}
void AnnotatedPageContentRequest::ResetForNewNavigation() {
lifecycle_ = Lifecycle::kNavigated;
waiting_for_fcp_ = true;
waiting_for_load_ = true;
cached_content_ = std::nullopt;
ResolveAllCallbacksWith(std::nullopt);
// Drop pending extraction request for the previous page, if any.
weak_factory_.InvalidateWeakPtrs();
page_content_extraction_service_->OnNewNavigation(
get_tab_id_callback_.Run(web_contents_), web_contents_);
}
// If ShouldScheduleExtraction(`on_hide`) allows it, moves to kScheduled and
// posts OnExtractionTimerFired() to the UI thread after `delay_`. The task is
// bound to a weak pointer, so it is cancelled if the weak pointers are
// invalidated first.
void AnnotatedPageContentRequest::MaybeScheduleExtraction(bool on_hide) {
  if (ShouldScheduleExtraction(on_hide)) {
    lifecycle_ = Lifecycle::kScheduled;
    auto fire_timer =
        base::BindOnce(&AnnotatedPageContentRequest::OnExtractionTimerFired,
                       weak_factory_.GetWeakPtr());
    content::GetUIThreadTaskRunner()->PostDelayedTask(
        FROM_HERE, std::move(fire_timer), delay_);
  }
}
// Runs when the capture delay has elapsed. Extraction starts only if the
// request is still in the kScheduled state; if a navigation happened during
// the delay, it is skipped.
void AnnotatedPageContentRequest::OnExtractionTimerFired() {
  const bool still_scheduled = lifecycle_ == Lifecycle::kScheduled;
  if (still_scheduled) {
    StartExtraction();
  }
}
// Moves to kRunning and dispatches by content type: PDFs only have their page
// count requested (when PDF support is compiled in); everything else goes
// through the regular annotated-page-content fetch.
// NOTE(review): when ENABLE_PDF is off, a PDF leaves `lifecycle_` at kRunning
// until the next navigation — confirm this is intended.
void AnnotatedPageContentRequest::StartExtraction() {
  lifecycle_ = Lifecycle::kRunning;

  const bool is_pdf =
      web_contents_->GetContentsMimeType() == pdf::kPDFMimeType;
  if (!is_pdf) {
    RequestAnnotatedPageContentSync();
    return;
  }

#if BUILDFLAG(ENABLE_PDF)
  RequestPdfPageCount();
#endif  // BUILDFLAG(ENABLE_PDF)
}
void AnnotatedPageContentRequest::RequestAnnotatedPageContentSync() {
TRACE_EVENT0("browser",
"AnnotatedPageContentRequest::RequestAnnotatedPageContentSync");
// Note: This is not fetching pdfs since we do not want to cache pdfs in disk
// in PageContentCache.
FetchPageContextOptions options;
options.annotated_page_content_options = request_->Clone();
if (features::kPageContentCacheEnableScreenshot.Get()) {
ScreenshotOptions::ScreenshotCollectionOptions
screenshot_collection_options;
screenshot_collection_options.screenshot_image_format =
ScreenshotOptions::ScreenshotImageFormat::kPng;
options.screenshot_options =
ScreenshotOptions::ViewportOnly(std::nullopt, std::nullopt);
}
fetch_page_context_callback_.Run(
*web_contents_, options, /*progress_listener=*/nullptr,
base::BindOnce(&AnnotatedPageContentRequest::OnPageContextFetched,
weak_factory_.GetWeakPtr()));
if (include_inner_text_) {
content_extraction::GetInnerText(
*web_contents_->GetPrimaryMainFrame(), std::nullopt,
base::BindOnce(&AnnotatedPageContentRequest::OnInnerTextReceived,
weak_factory_.GetWeakPtr(), base::TimeTicks::Now()));
}
}
bool AnnotatedPageContentRequest::ShouldScheduleExtraction(bool on_hide) const {
auto triggering_mode = features::GetPageContentExtractionTriggeringMode();
// If the page is not loaded, the extraction would not work.
if (waiting_for_fcp_ || waiting_for_load_) {
return false;
}
if (lifecycle_ == Lifecycle::kScheduled ||
lifecycle_ == Lifecycle::kRunning) {
// Already scheduled or running, no need to duplicate.
return false;
}
bool trigger_on_hide =
triggering_mode ==
features::PageContentExtractionTriggeringMode::kOnHidden ||
triggering_mode ==
features::PageContentExtractionTriggeringMode::kOnLoadAndHidden;
if (trigger_on_hide) {
// We trigger extraction any time the page transitions to hidden, or if the
// page finished loading while already in the background.
bool newly_hidden =
on_hide || (lifecycle_ == Lifecycle::kNavigated && is_hidden_);
if (newly_hidden) {
CHECK(is_hidden_);
return true;
}
}
if (lifecycle_ != Lifecycle::kNavigated) {
return false;
}
bool trigger_on_load =
triggering_mode ==
features::PageContentExtractionTriggeringMode::kOnLoad ||
triggering_mode ==
features::PageContentExtractionTriggeringMode::kOnLoadAndHidden;
if (trigger_on_load || !on_demand_callbacks_.empty()) {
return true;
}
return false;
}
// Receives the result of the page-context fetch. On failure (no value, null
// result, or no annotated page content) all pending on-demand callbacks are
// resolved with no result. On success the content is forwarded to the
// extraction service, cached together with its extraction time, eligibility
// and screenshot, and then delivered to the pending on-demand callbacks.
void AnnotatedPageContentRequest::OnPageContextFetched(
    FetchPageContextResultCallbackArg result) {
  lifecycle_ = Lifecycle::kExtracted;

  if (!result.has_value() || !result.value() ||
      !result.value()->annotated_page_content_result.has_value()) {
    ResolveAllCallbacksWith(std::nullopt);
    return;
  }

  base::Time extraction_time = base::Time::Now();

  // The screenshot is optional; an empty vector is used when there is none.
  std::vector<uint8_t> screenshot_data;
  if (result.value()->screenshot_result.has_value()) {
    screenshot_data =
        std::move(result.value()->screenshot_result.value().screenshot_data);
  }

  auto page_content_result =
      std::move(result.value()->annotated_page_content_result);

  // Evaluate eligibility while `page_content_result` is still fully intact.
  // Its `proto` member is moved out below, and doing this first means the
  // frame-metadata helper can never observe a moved-from member, regardless
  // of which parts of the result it reads.
  // When no eligibility API has been loaded the content is treated as
  // eligible.
  GURL url = web_contents_->GetLastCommittedURL();
  bool is_eligible_for_server_upload =
      !page_context_eligibility_ ||
      optimization_guide::IsPageContextEligible(
          url.GetHost(), url.GetPath(),
          optimization_guide::GetFrameMetadataFromPageContent(
              *page_content_result),
          page_context_eligibility_);

  auto ref_counted_content =
      base::MakeRefCounted<RefCountedAnnotatedPageContent>(
          std::move(page_content_result->proto));

  page_content_extraction_service_->OnPageContentExtracted(
      web_contents_->GetPrimaryPage(), ref_counted_content, screenshot_data,
      get_tab_id_callback_.Run(web_contents_));

  cached_content_ = ExtractedPageContentResult(
      std::move(ref_counted_content), extraction_time,
      is_eligible_for_server_upload, std::move(screenshot_data));
  ResolveAllCallbacksWith(cached_content_);
}
// Records metrics for an inner-text request that was started at
// `start_time`: the total latency and the text length divided by 1024.
// A null `result` records nothing.
void AnnotatedPageContentRequest::OnInnerTextReceived(
    base::TimeTicks start_time,
    std::unique_ptr<content_extraction::InnerTextResult> result) {
  if (result) {
    const base::TimeDelta total_latency = base::TimeTicks::Now() - start_time;
    UMA_HISTOGRAM_TIMES("OptimizationGuide.InnerText.TotalLatency",
                        total_latency);
    UMA_HISTOGRAM_CUSTOM_COUNTS("OptimizationGuide.InnerText.TotalSize2",
                                result->inner_text.length() / 1024, 10, 5000,
                                50);
  }
}
#if BUILDFLAG(ENABLE_PDF)
// For a PDF, registers to be told when the document has finished loading so
// that the page count can then be queried. Does nothing if no
// PDFDocumentHelper exists for the WebContents.
// NOTE(review): in that case `lifecycle_` is left as set by the caller —
// confirm this is acceptable.
void AnnotatedPageContentRequest::RequestPdfPageCount() {
  CHECK_EQ(pdf::kPDFMimeType, web_contents_->GetContentsMimeType());

  if (auto* helper =
          pdf::PDFDocumentHelper::MaybeGetForWebContents(web_contents_)) {
    auto on_load_complete =
        base::BindOnce(&AnnotatedPageContentRequest::OnPdfDocumentLoadComplete,
                       weak_factory_.GetWeakPtr());
    helper->RegisterForDocumentLoadComplete(std::move(on_load_complete));
  }
}
void AnnotatedPageContentRequest::OnPdfDocumentLoadComplete() {
CHECK_EQ(pdf::kPDFMimeType, web_contents_->GetContentsMimeType());
lifecycle_ = Lifecycle::kExtracted;
auto* pdf_helper =
pdf::PDFDocumentHelper::MaybeGetForWebContents(web_contents_);
if (pdf_helper) {
// Fetch zero PDF bytes to just receive the total page count.
pdf_helper->GetPdfBytes(
/*size_limit=*/0,
base::BindOnce(
&RecordPdfPageCountMetrics,
web_contents_->GetPrimaryMainFrame()->GetPageUkmSourceId()));
}
// Requests for PDFs are synchronously rejected in
// RefreshExtractedPageContentAndEligibilityForPage. Therefore, they never get
// added to the on_demand_callbacks_ queue, so it will always be empty here.
CHECK(on_demand_callbacks_.empty());
}
#endif // BUILDFLAG(ENABLE_PDF)
// Reply to the thread-pool task that called PageContextEligibility::Get().
// Stores the returned pointer, which OnPageContextFetched() consults when
// computing server-upload eligibility.
void AnnotatedPageContentRequest::OnPageContextEligibilityAPILoaded(
    optimization_guide::PageContextEligibility* page_context_eligibility) {
  this->page_context_eligibility_ = page_context_eligibility;
}
std::optional<ExtractedPageContentResult>
AnnotatedPageContentRequest::GetCachedContentAndEligibility() {
return cached_content_;
}
// Returns the server-upload eligibility stored with the cached content, or
// std::nullopt when there is no cached content.
std::optional<bool> AnnotatedPageContentRequest::GetServerUploadEligibility() {
  if (!cached_content_) {
    return std::nullopt;
  }
  return std::make_optional(cached_content_->is_eligible_for_server_upload);
}
// Handles an on-demand request for fresh page content. PDFs are rejected
// immediately with no result. Otherwise `callback` is queued (wrapped so that
// latency and success are recorded) and, if it is the only queued request,
// an extraction is scheduled or started depending on the current lifecycle
// state.
void AnnotatedPageContentRequest::
    RefreshExtractedPageContentAndEligibilityForPage(
        GetExtractedPageContentAndEligibilityCallback callback) {
  const bool is_pdf =
      web_contents_->GetContentsMimeType() == pdf::kPDFMimeType;
  base::UmaHistogramBoolean(
      "OptimizationGuide.PageContentExtraction.OnDemand.IsPDF", is_pdf);

  // PDFs get special handling: only a page-count metric is saved for them
  // and no AnnotatedPageContent is extracted.
  if (is_pdf) {
    CHECK(!cached_content_.has_value());
    std::move(callback).Run(std::nullopt);
    return;
  }

  base::UmaHistogramEnumeration(
      "OptimizationGuide.PageContentExtraction.OnDemand.StateAtRequest",
      lifecycle_);

  // The timer starts now, so the recorded latency covers the wait in the
  // queue as well as the extraction itself.
  on_demand_callbacks_.push_back(
      base::BindOnce(&RecordAndReturnOnDemandExtractionResult,
                     base::ElapsedTimer())
          .Then(std::move(callback)));

  // This on-demand request must coordinate with the automatic extraction.
  // Only the first queued request needs to kick anything off.
  if (on_demand_callbacks_.size() != 1) {
    return;
  }

  switch (lifecycle_) {
    case Lifecycle::kNavigated:
      // The initial extraction has not been scheduled. Force it to be
      // scheduled if the page is ready. This is a no-op unless the
      // triggering mode is "on hidden" only.
      // TODO(b/490161242): Consider shortening the delay based on how long
      // ago the page navigated.
      MaybeScheduleExtraction();
      break;
    case Lifecycle::kExtracted:
      // The previous extraction is complete. Start a new one immediately.
      StartExtraction();
      break;
    case Lifecycle::kScheduled:
    case Lifecycle::kRunning:
      // Already scheduled or running, wait for it.
      break;
  }
}
void AnnotatedPageContentRequest::ResolveAllCallbacksWith(
const std::optional<ExtractedPageContentResult>& result) {
if (on_demand_callbacks_.empty()) {
return;
}
base::UmaHistogramCounts100(
"OptimizationGuide.PageContentExtraction.OnDemand."
"PendingCallbacksBatched",
on_demand_callbacks_.size());
auto callbacks = std::exchange(on_demand_callbacks_, {});
for (auto& callback : callbacks) {
// TODO(b/490161242): Consider wrapping the screenshot data (or the whole
// ExtractedPageContentResult) in a scoped_refptr to avoid copying for each
// of the callbacks.
std::move(callback).Run(result);
}
}
// Tracks hidden/not-hidden transitions. On an actual change the extraction
// service is notified, and when the page became hidden an extraction may be
// scheduled. Visibility changes that do not alter the hidden state are
// ignored.
void AnnotatedPageContentRequest::OnVisibilityChanged(
    content::Visibility visibility) {
  const bool now_hidden = visibility == content::Visibility::HIDDEN;
  if (now_hidden == is_hidden_) {
    return;
  }
  is_hidden_ = now_hidden;

  page_content_extraction_service_->OnVisibilityChanged(
      get_tab_id_callback_.Run(web_contents_), web_contents_, visibility);

  if (now_hidden) {
    MaybeScheduleExtraction(/*on_hide=*/true);
  }
}
} // namespace page_content_annotations