blob: 8639c4392cd4a392fafef6134852735215818673 [file] [log] [blame]
// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "pdf/pdfium/pdfium_on_demand_searchifier.h"
#include <algorithm>
#include <utility>
#include "base/check.h"
#include "base/containers/contains.h"
#include "base/metrics/histogram_functions.h"
#include "base/task/single_thread_task_runner.h"
#include "pdf/pdfium/pdfium_searchify.h"
namespace {
// Delays inserted between page searchify tasks so that other PDF tasks get
// more priority.
//
// `kSearchifyPageDelay` is the regular delay. It is used when posting the
// first `SearchifyNextPage` task for a newly scheduled page, when retrying a
// commit while the page is busy, and between pages when at least one queued
// page is visible.
//
// `kSearchifyPageLongDelay` is used between pages when none of the queued
// pages is visible, i.e. the next task seems to be not urgent and it's helpful
// to reduce CPU load.
constexpr base::TimeDelta kSearchifyPageDelay = base::Milliseconds(100);
constexpr base::TimeDelta kSearchifyPageLongDelay = base::Milliseconds(300);
} // namespace
namespace chrome_pdf {
// Bundles the OCR output for a single image:
//  - `image_index`: index of the image object on the page; it is later passed
//    to `FPDFPage_GetObject()` when the results are committed.
//  - `annotation`: the OCR annotation; it is moved into this object.
//  - `image_size`: size of the bitmap that was handed to OCR.
PDFiumOnDemandSearchifier::OcrResult::OcrResult(
int image_index,
screen_ai::mojom::VisualAnnotationPtr annotation,
const gfx::Size& image_size)
: image_index(image_index),
annotation(std::move(annotation)),
image_size(image_size) {}
// Move construction is defaulted and declared `noexcept`.
PDFiumOnDemandSearchifier::OcrResult::OcrResult(
PDFiumOnDemandSearchifier::OcrResult&& other) noexcept = default;
PDFiumOnDemandSearchifier::OcrResult::~OcrResult() = default;
// Keeps a reference to `engine`, which the searchifier uses to access the
// document and its pages and to report busy/idle state changes.
// NOTE(review): `raw_ref<>::from_ptr()` presumably requires `engine` to be
// non-null -- confirm against the `raw_ref` documentation.
PDFiumOnDemandSearchifier::PDFiumOnDemandSearchifier(PDFiumEngine* engine)
: engine_(raw_ref<PDFiumEngine>::from_ptr(engine)) {}
PDFiumOnDemandSearchifier::~PDFiumOnDemandSearchifier() = default;
// Supplies the OCR callbacks and starts processing. Must be called exactly
// once, while the searchifier is idle, and with two non-null callbacks. Pages
// that were scheduled before this call start being processed right away.
void PDFiumOnDemandSearchifier::Start(
    GetOcrMaxImageDimensionCallbackAsync get_max_dimension_callback,
    PerformOcrCallbackAsync perform_ocr_callback) {
  // Both callbacks are mandatory.
  CHECK(perform_ocr_callback);
  CHECK(get_max_dimension_callback);
  CHECK_EQ(state_, State::kIdle);

  // A repeated call would find the callbacks already stored.
  CHECK(!get_max_dimension_callback_);
  CHECK(!perform_ocr_callback_);

  font_ = CreateFont(engine_->doc());
  get_max_dimension_callback_ = std::move(get_max_dimension_callback);
  perform_ocr_callback_ = std::move(perform_ocr_callback);

  // `SchedulePage()` only queues pages while the OCR callback is missing, so
  // kick off processing here if anything is already waiting.
  const bool has_pending_pages = !pages_queue_.empty();
  if (has_pending_pages) {
    SearchifyNextPage();
  }
}
// Reply handler for the max image dimension request issued by
// `SearchifyNextPage()`. Stores the (non-zero) dimension and resumes page
// processing.
void PDFiumOnDemandSearchifier::OnGotOcrMaxImageDimension(
    uint32_t max_image_dimension) {
  // A state change while waiting for the max image dimension indicates that
  // OCR got disconnected and cannot be used, so the reply is dropped.
  const bool still_waiting = state_ == State::kWaitingForResults;
  if (!still_waiting) {
    return;
  }

  CHECK(max_image_dimension);
  max_image_dimension_ = max_image_dimension;

  // Go back to idle before continuing with the queued pages.
  state_ = State::kIdle;
  SearchifyNextPage();
}
// Reacts to the OCR service getting disconnected. Only a disconnection that
// happens while results are pending is treated as fatal.
void PDFiumOnDemandSearchifier::OnOcrDisconnected() {
  switch (state_) {
    // Nothing to do in these states:
    //  - `kIdle`: if another request comes up, the OCR provider will try to
    //    connect to the service again.
    //  - `kWaitingForPageAvailability`: if waiting for the page takes long,
    //    the OCR service may shut down to release resources. Disconnection is
    //    expected then, and the service will reconnect on the next request.
    //  - `kFailed`: this is the end state and the searchifier accepts no
    //    requests after it.
    case State::kIdle:
    case State::kWaitingForPageAvailability:
    case State::kFailed:
      return;

    // Losing the connection while waiting for results is taken to mean that
    // OCR cannot be used anymore: drop the current page and every pending
    // request, then move to the failed state.
    case State::kWaitingForResults:
      ClearCurrentPage();
      pages_queue_.clear();
      state_ = State::kFailed;
      engine_->OnSearchifyStateChange(/*busy=*/false);
      return;
  }
  NOTREACHED();
}
bool PDFiumOnDemandSearchifier::IsPageScheduled(uint32_t page_index) const {
if (current_page_ && current_page_->index() == page_index) {
return true;
}
return base::Contains(pages_queue_, page_index);
}
// Adds `page_index` to the queue unless it is already scheduled, and posts a
// delayed `SearchifyNextPage` task if the searchifier is idle and has been
// started. Must not be called after the searchifier has failed.
void PDFiumOnDemandSearchifier::SchedulePage(uint32_t page_index) {
  CHECK_NE(state_, State::kFailed);

  if (IsPageScheduled(page_index)) {
    return;
  }

  // Going from "no work at all" to "some work": report busy to the engine.
  const bool nothing_in_flight = !current_page_ && pages_queue_.empty();
  if (nothing_in_flight && state_ == State::kIdle) {
    engine_->OnSearchifyStateChange(/*busy=*/true);
  }

  pages_queue_.push_back(page_index);

  // Queueing is all that is needed when processing is already under way, or
  // when the OCR callback has not been provided yet.
  const bool processing_in_progress =
      state_ == State::kWaitingForResults ||
      state_ == State::kWaitingForPageAvailability;
  const bool ocr_unavailable = perform_ocr_callback_.is_null();
  if (processing_in_progress || ocr_unavailable) {
    return;
  }

  CHECK_EQ(state_, State::kIdle);
  auto searchify_task =
      base::BindOnce(&PDFiumOnDemandSearchifier::SearchifyNextPage,
                     weak_factory_.GetWeakPtr());
  base::SingleThreadTaskRunner::GetCurrentDefault()->PostDelayedTask(
      FROM_HERE, std::move(searchify_task), kSearchifyPageDelay);

  // Leaving `kIdle` here avoids posting `SearchifyNextPage` more than once.
  state_ = State::kWaitingForResults;
}
// Takes the next page off the queue and starts OCR on its images. On the
// first run it requests the max image dimension instead and returns; when the
// queue is empty it goes back to idle and reports not-busy to the engine.
void PDFiumOnDemandSearchifier::SearchifyNextPage() {
  // Do not proceed if OCR got disconnected.
  if (state_ == State::kFailed) {
    return;
  }

  // The max image dimension callback is still set only if it has not been
  // asked yet; ask it before performing any OCR.
  if (get_max_dimension_callback_) {
    auto on_got_dimension =
        base::BindOnce(&PDFiumOnDemandSearchifier::OnGotOcrMaxImageDimension,
                       weak_factory_.GetWeakPtr());
    std::move(get_max_dimension_callback_).Run(std::move(on_got_dimension));
    state_ = State::kWaitingForResults;
    return;
  }

  // Nothing left to process.
  if (pages_queue_.empty()) {
    state_ = State::kIdle;
    engine_->OnSearchifyStateChange(/*busy=*/false);
    return;
  }

  state_ = State::kWaitingForResults;
  current_page_ = engine_->GetPage(pages_queue_.front());
  CHECK(current_page_);

  // Sample the loaded state before `GetPage()` below may load the page;
  // `ClearCurrentPage()` uses it to decide whether to ask for an unload.
  current_page_was_loaded_ = current_page_->page() != nullptr;
  pages_queue_.pop_front();

  // Load the page if needed.
  current_page_->GetPage();

  // Start collecting OCR results for this page's images from scratch.
  current_page_image_object_indices_ = current_page_->GetImageObjectIndices();
  current_page_ocr_results_.clear();
  current_page_ocr_results_.reserve(current_page_image_object_indices_.size());
  SearchifyNextImage();
}
void PDFiumOnDemandSearchifier::SearchifyNextImage() {
CHECK(current_page_);
std::optional<BitmapResult> bitmap_result = GetNextBitmap();
if (bitmap_result.has_value()) {
const auto& bitmap = bitmap_result.value().bitmap;
perform_ocr_callback_.Run(
bitmap, base::BindOnce(&PDFiumOnDemandSearchifier::OnGotOcrResult,
weak_factory_.GetWeakPtr(),
bitmap_result.value().image_index,
gfx::Size(bitmap.width(), bitmap.height())));
return;
}
// Report metric only once for each page.
CHECK(!current_page_->IsPageSearchified());
base::UmaHistogramBoolean("PDF.SearchifyAddedText",
!current_page_ocr_results_.empty());
CommitResultsToPage();
}
void PDFiumOnDemandSearchifier::CommitResultsToPage() {
CHECK(state_ == State::kWaitingForResults ||
state_ == State::kWaitingForPageAvailability);
// Ignore the results if the page got unloaded before committing them.
if (!current_page_) {
current_page_ocr_results_.clear();
}
if (!current_page_ocr_results_.empty()) {
// If the page is being painted or cannot be unloaded, wait.
if (!current_page_->PageCanBeUnloaded() ||
engine_->IsPageScheduledForPaint(current_page_->index())) {
if (state_ == State::kWaitingForResults) {
state_ = State::kWaitingForPageAvailability;
}
base::SingleThreadTaskRunner::GetCurrentDefault()->PostDelayedTask(
FROM_HERE,
base::BindOnce(&PDFiumOnDemandSearchifier::CommitResultsToPage,
weak_factory_.GetWeakPtr()),
kSearchifyPageDelay);
return;
}
// Reload page if needed.
FPDF_PAGE page = current_page_->GetPage();
bool added_text = false;
for (auto& result : current_page_ocr_results_) {
FPDF_PAGEOBJECT image = FPDFPage_GetObject(page, result.image_index);
added_text |=
AddTextOnImage(engine_->doc(), page, font_.get(), image,
std::move(result.annotation), result.image_size);
}
current_page_ocr_results_.clear();
current_page_->OnSearchifyGotOcrResult(added_text);
current_page_->ReloadTextPage();
if (!FPDFPage_GenerateContent(page)) {
LOG(ERROR) << "Failed to generate content";
}
}
// `kWaitingForPageAvailability` is only set by this function, hence change
// the state back to `kWaitingForResults` in case it is changed.
state_ = State::kWaitingForResults;
ClearCurrentPage();
// Searchify next page.
// If none of the scheduled pages are visible, post the task with more delay
// to reduce CPU load.
bool long_delay =
std::ranges::none_of(pages_queue_, [this](uint32_t page_index) {
return this->engine_->IsPageVisible(page_index);
});
base::SingleThreadTaskRunner::GetCurrentDefault()->PostDelayedTask(
FROM_HERE,
base::BindOnce(&PDFiumOnDemandSearchifier::SearchifyNextPage,
weak_factory_.GetWeakPtr()),
long_delay ? kSearchifyPageLongDelay : kSearchifyPageDelay);
}
// Pops image indices off the back of the current page's pending list until
// one yields a bitmap that draws something, and returns that bitmap together
// with its index. Returns `std::nullopt` once the list is exhausted.
std::optional<PDFiumOnDemandSearchifier::BitmapResult>
PDFiumOnDemandSearchifier::GetNextBitmap() {
  auto& pending_indices = current_page_image_object_indices_;
  while (!pending_indices.empty()) {
    const int image_index = pending_indices.back();
    pending_indices.pop_back();

    SkBitmap bitmap =
        current_page_->GetImageForOcr(image_index, max_image_dimension_);
    if (bitmap.drawsNothing()) {
      // Nothing to run OCR on; try the next image.
      continue;
    }
    return BitmapResult{bitmap, image_index};
  }
  return std::nullopt;
}
// Reply handler for one OCR request. `image_index` and `image_size` are the
// values bound by `SearchifyNextImage()`; `annotation` is the OCR output and
// may be null, in which case nothing is recorded for this image.
void PDFiumOnDemandSearchifier::OnGotOcrResult(
    int image_index,
    const gfx::Size& image_size,
    screen_ai::mojom::VisualAnnotationPtr annotation) {
  CHECK_EQ(state_, State::kWaitingForResults);
  CHECK(current_page_);

  // Remember that OCR ran at least once, whether or not it produced anything.
  performed_ocr_ = true;

  if (annotation) {
    current_page_ocr_results_.emplace_back(image_index, std::move(annotation),
                                           image_size);
  }

  // Continue with the remaining images of the current page.
  SearchifyNextImage();
}
// Forgets the current page. If the page was not loaded when the searchifier
// picked it up, the engine is asked to unload it again.
void PDFiumOnDemandSearchifier::ClearCurrentPage() {
  const bool loaded_by_searchifier = current_page_ && !current_page_was_loaded_;
  if (loaded_by_searchifier) {
    engine_->MaybeUnloadPage(current_page_->index());
  }
  current_page_ = nullptr;
}
} // namespace chrome_pdf