blob: 7c660b532bac74f841a573227d3e65affa0e38ca [file] [log] [blame]
// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/page_content_annotations/annotate_page_content_request.h"
#include "base/metrics/histogram_macros.h"
#include "base/task/task_traits.h"
#include "base/task/thread_pool.h"
#include "base/trace_event/trace_event.h"
#include "chrome/browser/page_content_annotations/page_content_extraction_service.h"
#include "chrome/browser/page_content_annotations/page_content_extraction_service_factory.h"
#include "chrome/browser/page_content_annotations/page_content_extraction_types.h"
#include "chrome/browser/profiles/profile.h"
#include "components/history/core/browser/features.h"
#include "components/optimization_guide/content/browser/page_content_proto_provider.h"
#include "components/optimization_guide/content/browser/page_context_eligibility.h"
#include "components/page_content_annotations/core/page_content_annotations_features.h"
#include "components/pdf/common/constants.h"
#include "content/public/browser/browser_thread.h"
#include "content/public/browser/navigation_handle.h"
#include "net/http/http_response_headers.h"
#include "pdf/buildflags.h"
#include "services/metrics/public/cpp/metrics_utils.h"
#include "services/metrics/public/cpp/ukm_builders.h"
#include "services/network/public/mojom/url_response_head.mojom.h"
#if BUILDFLAG(ENABLE_PDF)
#include "components/pdf/browser/pdf_document_helper.h"
#endif // BUILDFLAG(ENABLE_PDF)
namespace page_content_annotations {
namespace {
#if BUILDFLAG(ENABLE_PDF)
// Records a PDF's page count to UKM against `source_id`, bucketed via
// ukm::GetExponentialBucketMinForCounts1000(). Used as the reply callback for
// PDFDocumentHelper::GetPdfBytes(); `bytes` is part of that callback's
// signature but is not read here. Nothing is recorded when `status` is
// kFailed.
void RecordPdfPageCountMetrics(
    ukm::SourceId source_id,
    pdf::mojom::PdfListener::GetPdfBytesStatus status,
    const std::vector<uint8_t>& bytes,
    uint32_t page_count) {
  using Status = pdf::mojom::PdfListener::GetPdfBytesStatus;
  if (status != Status::kFailed) {
    const auto bucketed_page_count =
        ukm::GetExponentialBucketMinForCounts1000(page_count);
    ukm::builders::OptimizationGuide_AnnotatedPdfContent(source_id)
        .SetPdfPageCount(bucketed_page_count)
        .Record(ukm::UkmRecorder::Get());
  }
}
#endif // BUILDFLAG(ENABLE_PDF)
} // namespace
// static
// Factory: builds a request for `web_contents` with extraction options in
// kDefault mode. Whether extraction runs on the critical path is taken from
// the IsAnnotatedPageContentOnCriticalPath() feature helper.
std::unique_ptr<AnnotatedPageContentRequest>
AnnotatedPageContentRequest::Create(content::WebContents* web_contents) {
  blink::mojom::AIPageContentOptionsPtr options =
      blink::mojom::AIPageContentOptions::New();
  options->mode = blink::mojom::AIPageContentMode::kDefault;
  options->on_critical_path = page_content_annotations::features::
      IsAnnotatedPageContentOnCriticalPath();
  return std::make_unique<AnnotatedPageContentRequest>(web_contents,
                                                       std::move(options));
}
// Stores the target `web_contents` and the extraction options, snapshots the
// capture delay and the inner-text study flag from the feature helpers, and
// kicks off an asynchronous load of the PageContextEligibility API.
AnnotatedPageContentRequest::AnnotatedPageContentRequest(
    content::WebContents* web_contents,
    blink::mojom::AIPageContentOptionsPtr request)
    : web_contents_(web_contents),
      request_(std::move(request)),
      delay_(page_content_annotations::features::
                 GetAnnotatedPageContentCaptureDelay()),
      include_inner_text_(
          page_content_annotations::features::
              ShouldAnnotatedPageContentStudyIncludeInnerText()) {
  // PageContextEligibility::Get is posted as a best-effort thread-pool task
  // that is allowed to block, so it does not run on the constructing thread.
  // The result comes back through OnPageContextEligibilityAPILoaded(), bound
  // to a weak pointer so the reply is dropped if that pointer is invalidated
  // first.
  auto load_eligibility_api =
      base::BindOnce(&optimization_guide::PageContextEligibility::Get);
  auto on_eligibility_api_loaded = base::BindOnce(
      &AnnotatedPageContentRequest::OnPageContextEligibilityAPILoaded,
      weak_factory_.GetWeakPtr());
  base::ThreadPool::PostTaskAndReplyWithResult(
      FROM_HERE, {base::TaskPriority::BEST_EFFORT, base::MayBlock()},
      std::move(load_eligibility_api), std::move(on_eligibility_api_loaded));
}
// Defaulted: no teardown is performed beyond destroying the members.
AnnotatedPageContentRequest::~AnnotatedPageContentRequest() = default;
// Discards all per-page extraction state by delegating to
// ResetForNewNavigation(). As noted in DidFinishNavigation(), this is the path
// that handles cross-document navigations.
void AnnotatedPageContentRequest::PrimaryPageChanged() {
ResetForNewNavigation();
}
// Handles committed same-document navigations in the primary main frame
// (e.g. single-page-app route changes). When such a navigation looks
// significant, the per-page state is reset and a fresh extraction is
// scheduled immediately.
void AnnotatedPageContentRequest::DidFinishNavigation(
    content::NavigationHandle* navigation_handle) {
  // Only committed, same-document, primary-main-frame navigations are handled
  // here; cross-document navigations are handled in PrimaryPageChanged.
  if (!navigation_handle->IsInPrimaryMainFrame() ||
      !navigation_handle->IsSameDocument() ||
      !navigation_handle->HasCommitted()) {
    return;
  }

  // Heuristic that trades off how often the content is refreshed against
  // having coverage for single-page apps in the data: a navigation that will
  // appear in the browser history is likely a significant change in page
  // state, so only those trigger a re-extraction.
  if (!navigation_handle->ShouldUpdateHistory()) {
    return;
  }

  if (base::FeatureList::IsEnabled(history::kVisitedLinksOn404)) {
    // With this flag on, navigations with a 404 status become eligible for
    // History, but 404s should still be ignored here. A same-document
    // navigation has no network request of its own and therefore no response
    // code, so the response that produced the current document is inspected
    // instead of the `NavigationHandle`.
    const auto* response_head =
        navigation_handle->GetRenderFrameHost()->GetLastResponseHead();
    const bool has_headers = response_head && response_head->headers;
    if (!has_headers || response_head->headers->response_code() == 404) {
      return;
    }
  }

  ResetForNewNavigation();

  // Load and FCP signals are not reliable for same-document navigations, so
  // the content is assumed to be ready as soon as the navigation commits.
  waiting_for_load_ = false;
  waiting_for_fcp_ = false;
  MaybeScheduleExtraction();
}
// Marks the "load" precondition as satisfied once both the main document and
// the WebContents as a whole have finished loading, then attempts to schedule
// extraction.
void AnnotatedPageContentRequest::DidStopLoading() {
  // Two conditions must hold before proceeding: the main frame's Document has
  // completed its load, and (after that `load` event) every subframe
  // currently in the FrameTree has finished loading too.
  const bool main_document_loaded =
      web_contents_->IsDocumentOnLoadCompletedInPrimaryMainFrame();
  if (!main_document_loaded || web_contents_->IsLoading()) {
    return;
  }

  waiting_for_load_ = false;

  // PDFs never deliver a FirstContentfulPaint signal, so do not wait for one
  // on these Documents.
  const bool is_pdf =
      web_contents_->GetContentsMimeType() == pdf::kPDFMimeType;
  if (is_pdf) {
    waiting_for_fcp_ = false;
  }

  MaybeScheduleExtraction();
}
// Marks the first-contentful-paint precondition as satisfied and then attempts
// to schedule extraction (which only proceeds if the load precondition has
// been met as well; see ShouldScheduleExtraction()).
void AnnotatedPageContentRequest::OnFirstContentfulPaintInPrimaryMainFrame() {
waiting_for_fcp_ = false;
MaybeScheduleExtraction();
}
// Returns the request to its initial per-page state: pending lifecycle, both
// readiness signals outstanding, and no cached content.
void AnnotatedPageContentRequest::ResetForNewNavigation() {
  // Cancel any extraction work still in flight for the previous page.
  // NOTE(review): this invalidates every weak pointer handed out by
  // `weak_factory_`, including the one bound to
  // OnPageContextEligibilityAPILoaded() in the constructor if that reply has
  // not run yet — confirm this is intended.
  weak_factory_.InvalidateWeakPtrs();

  cached_content_.reset();
  waiting_for_load_ = true;
  waiting_for_fcp_ = true;
  lifecycle_ = Lifecycle::kPending;
}
// Schedules the delayed extraction task for the current page if the request
// is still pending and all readiness signals have arrived (see
// ShouldScheduleExtraction()). The lifecycle moves to kScheduled so repeated
// calls are no-ops.
//
// PDF documents are routed to RequestPdfPageCount() when PDF support is
// compiled in; with ENABLE_PDF off nothing is posted for them. Every other
// document is routed to RequestAnnotatedPageContentSync(). Both tasks are
// posted to the UI thread after the same `delay_` captured at construction,
// and are bound to a weak pointer so ResetForNewNavigation() can cancel them.
void AnnotatedPageContentRequest::MaybeScheduleExtraction() {
  if (!ShouldScheduleExtraction()) {
    return;
  }
  lifecycle_ = Lifecycle::kScheduled;

  if (web_contents_->GetContentsMimeType() == pdf::kPDFMimeType) {
#if BUILDFLAG(ENABLE_PDF)
    // Use the cached `delay_` rather than re-reading the feature parameter so
    // the PDF and non-PDF paths always agree on the capture delay.
    content::GetUIThreadTaskRunner()->PostDelayedTask(
        FROM_HERE,
        base::BindOnce(&AnnotatedPageContentRequest::RequestPdfPageCount,
                       weak_factory_.GetWeakPtr()),
        delay_);
#endif  // BUILDFLAG(ENABLE_PDF)
  } else {
    content::GetUIThreadTaskRunner()->PostDelayedTask(
        FROM_HERE,
        base::BindOnce(
            &AnnotatedPageContentRequest::RequestAnnotatedPageContentSync,
            weak_factory_.GetWeakPtr()),
        delay_);
  }
}
void AnnotatedPageContentRequest::RequestAnnotatedPageContentSync() {
TRACE_EVENT0("browser",
"AnnotatedPageContentRequest::RequestAnnotatedPageContentSync");
optimization_guide::GetAIPageContent(
web_contents_, request_.Clone(),
base::BindOnce(&AnnotatedPageContentRequest::OnPageContentReceived,
weak_factory_.GetWeakPtr()));
if (include_inner_text_) {
content_extraction::GetInnerText(
*web_contents_->GetPrimaryMainFrame(), std::nullopt,
base::BindOnce(&AnnotatedPageContentRequest::OnInnerTextReceived,
weak_factory_.GetWeakPtr(), base::TimeTicks::Now()));
}
}
// Returns true only when the request is still in the kPending state and
// neither the first-contentful-paint nor the load signal is outstanding.
bool AnnotatedPageContentRequest::ShouldScheduleExtraction() const {
  const bool is_pending = lifecycle_ == Lifecycle::kPending;
  return is_pending && !waiting_for_fcp_ && !waiting_for_load_;
}
// Reply for the annotated-page-content request. Marks the lifecycle as kDone
// regardless of outcome. When `page_content` holds a value, the extracted
// proto is forwarded to the profile's PageContentExtractionService and then
// cached, together with its server-upload eligibility, in `cached_content_`.
void AnnotatedPageContentRequest::OnPageContentReceived(
    std::optional<optimization_guide::AIPageContentResult> page_content) {
  lifecycle_ = Lifecycle::kDone;
  if (!page_content) {
    return;
  }

  Profile* profile =
      Profile::FromBrowserContext(web_contents_->GetBrowserContext());
  // NOTE(review): the service pointer is dereferenced unchecked; this assumes
  // the factory never returns null for profiles that create this request —
  // confirm against the factory.
  auto* page_content_extraction_service = page_content_annotations::
      PageContentExtractionServiceFactory::GetForProfile(profile);
  page_content_extraction_service->OnPageContentExtracted(
      web_contents_->GetPrimaryPage(), page_content->proto);

  // Bind by const reference: only the host and path are read below, so there
  // is no need to copy the whole GURL.
  const GURL& url = web_contents_->GetLastCommittedURL();

  // If the eligibility API is not available (`page_context_eligibility_` is
  // null), the content is treated as eligible. This must be computed before
  // the proto is moved out of `page_content` below.
  const bool is_eligible_for_server_upload =
      !page_context_eligibility_ ||
      optimization_guide::IsPageContextEligible(
          url.host(), url.path(),
          optimization_guide::GetFrameMetadataFromPageContent(*page_content),
          page_context_eligibility_);

  cached_content_ = ExtractedPageContentResult{
      .page_content = std::move(page_content->proto),
      .is_eligible_for_server_upload = is_eligible_for_server_upload};
}
// Reply for the inner-text request. When a result is present, records the
// time elapsed since `start_time` and the inner text's length divided by 1024
// to UMA. A null `result` records nothing.
void AnnotatedPageContentRequest::OnInnerTextReceived(
    base::TimeTicks start_time,
    std::unique_ptr<content_extraction::InnerTextResult> result) {
  if (result) {
    UMA_HISTOGRAM_TIMES("OptimizationGuide.InnerText.TotalLatency",
                        base::TimeTicks::Now() - start_time);
    UMA_HISTOGRAM_CUSTOM_COUNTS("OptimizationGuide.InnerText.TotalSize2",
                                result->inner_text.length() / 1024, 10, 5000,
                                50);
  }
}
#if BUILDFLAG(ENABLE_PDF)
// Delayed task for PDF documents. Registers OnPdfDocumentLoadComplete() with
// the WebContents' PDFDocumentHelper, if one exists; otherwise does nothing.
// The current document must still be a PDF (enforced with a CHECK).
void AnnotatedPageContentRequest::RequestPdfPageCount() {
  CHECK_EQ(pdf::kPDFMimeType, web_contents_->GetContentsMimeType());

  auto* helper = pdf::PDFDocumentHelper::MaybeGetForWebContents(web_contents_);
  if (!helper) {
    return;
  }

  helper->RegisterForDocumentLoadComplete(
      base::BindOnce(&AnnotatedPageContentRequest::OnPdfDocumentLoadComplete,
                     weak_factory_.GetWeakPtr()));
}
// Callback registered by RequestPdfPageCount(). Asks the PDFDocumentHelper
// (if one still exists) for the PDF bytes with a zero size limit, and routes
// the reply to RecordPdfPageCountMetrics() keyed by the primary main frame's
// page UKM source id.
void AnnotatedPageContentRequest::OnPdfDocumentLoadComplete() {
  CHECK_EQ(pdf::kPDFMimeType, web_contents_->GetContentsMimeType());

  auto* helper = pdf::PDFDocumentHelper::MaybeGetForWebContents(web_contents_);
  if (!helper) {
    return;
  }

  // A size limit of zero is requested because only the total page count that
  // accompanies the reply is of interest, not the bytes themselves.
  const auto ukm_source_id =
      web_contents_->GetPrimaryMainFrame()->GetPageUkmSourceId();
  helper->GetPdfBytes(
      /*size_limit=*/0,
      base::BindOnce(&RecordPdfPageCountMetrics, ukm_source_id));
}
#endif // BUILDFLAG(ENABLE_PDF)
// Reply for the PageContextEligibility::Get task posted from the constructor.
// Stores the returned pointer as-is (it may be null); OnPageContentReceived()
// treats a null pointer as "eligible".
void AnnotatedPageContentRequest::OnPageContextEligibilityAPILoaded(
optimization_guide::PageContextEligibility* page_context_eligibility) {
page_context_eligibility_ = page_context_eligibility;
}
// Returns a copy of the content cached by OnPageContentReceived() for the
// current page, or std::nullopt if nothing has been cached since the last
// ResetForNewNavigation().
std::optional<ExtractedPageContentResult>
AnnotatedPageContentRequest::GetCachedContentAndEligibility() {
return cached_content_;
}
} // namespace page_content_annotations