blob: ac0ab4a070c8fde55ca389029302f313ce9d3460 [file] [log] [blame]
// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "pdf/pdfium/pdfium_text_fragment_finder.h"
#include <optional>
#include <string>
#include <vector>
#include "base/containers/span.h"
#include "base/functional/bind.h"
#include "base/memory/raw_ptr.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "components/shared_highlighting/core/common/text_fragment.h"
#include "pdf/pdfium/pdfium_engine.h"
#include "pdf/pdfium/pdfium_range.h"
namespace chrome_pdf {
namespace {
// Adds the `prefix_result` to the list of prefixes found in the PDF.
void AddTextFragmentPrefixResult(
std::vector<PDFiumRange>& text_fragment_prefixes,
PDFiumRange prefix_result) {
text_fragment_prefixes.emplace_back(std::move(prefix_result));
}
// Search callback for suffix candidates. Stores `suffix_result` in
// `text_fragment_suffix` only when every character between the end of
// `before_suffix_range` and the start of `suffix_result` is Unicode
// whitespace. Once `text_fragment_suffix` holds a value, later candidates are
// ignored.
void AddTextFragmentSuffixResult(
    PDFiumEngine* engine,
    std::optional<PDFiumRange>& text_fragment_suffix,
    const PDFiumRange& before_suffix_range,
    PDFiumRange suffix_result) {
  // TODO(crbug.com/393166468): Modify TextSearch() to return one result rather
  // than all of them.
  if (text_fragment_suffix.has_value()) {
    // A suitable suffix has already been recorded.
    return;
  }

  // Build the range that spans the gap between the preceding range and this
  // candidate, on the page of the preceding range.
  const int gap_start =
      before_suffix_range.char_index() + before_suffix_range.char_count();
  const int gap_length = suffix_result.char_index() - gap_start;
  const PDFiumRange gap(engine->GetPage(before_suffix_range.page_index()),
                        gap_start, gap_length);

  bool gap_is_whitespace = true;
  for (const auto& ch : gap.GetText()) {
    if (!base::IsUnicodeWhitespace(ch)) {
      gap_is_whitespace = false;
      break;
    }
  }

  if (gap_is_whitespace) {
    text_fragment_suffix = std::move(suffix_result);
  }
}
// Searches for the suffix text of `fragment` on the page of `end_range`,
// beginning at the first character index after `end_range`. Candidates are
// filtered by AddTextFragmentSuffixResult(). Returns the accepted suffix
// range, or std::nullopt when no candidate is accepted.
std::optional<PDFiumRange> FindTextFragmentSuffix(
    PDFiumEngine* engine,
    const shared_highlighting::TextFragment& fragment,
    const PDFiumRange& end_range) {
  std::optional<PDFiumRange> found_suffix;
  const int first_index_after_end =
      end_range.char_index() + end_range.char_count();
  engine->SearchForFragment(
      base::UTF8ToUTF16(fragment.suffix()),
      /*character_to_start_searching_from=*/first_index_after_end,
      /*last_character_index_to_search=*/-1,
      /*page_to_search=*/end_range.page_index(),
      base::BindRepeating(&AddTextFragmentSuffixResult, engine,
                          std::ref(found_suffix), std::ref(end_range)));
  return found_suffix;
}
// Search callback for `text_start` candidates. Appends `start_result` to
// `text_fragment_starts` when it satisfies the constraints of `fragment`:
//  - If `prefix_range` is provided, `start_result` must begin at or after the
//    end of the prefix, and only Unicode whitespace may separate the two.
//  - If `fragment` has a suffix but no `text_end`, the suffix must be found
//    after `start_result`; the outcome of that search is written to
//    `text_fragment_suffix`. When `text_end` is present the suffix is not
//    checked here.
// A candidate that fails either check is dropped.
void AddTextFragmentStartResult(
    PDFiumEngine* engine,
    std::vector<PDFiumRange>& text_fragment_starts,
    std::optional<PDFiumRange>& text_fragment_suffix,
    const shared_highlighting::TextFragment& fragment,
    std::optional<const PDFiumRange> prefix_range,
    PDFiumRange start_result) {
  if (prefix_range) {
    const int prefix_end =
        prefix_range->char_index() + prefix_range->char_count();
    const int boundary_count = start_result.char_index() - prefix_end;

    // The search that produces `start_result` may begin inside the prefix, so
    // a candidate can start before the prefix ends. Such a candidate overlaps
    // the prefix and cannot follow it; reject it explicitly instead of
    // building a boundary range with a negative character count.
    if (boundary_count < 0) {
      return;
    }

    // Only whitespace may sit between the prefix and the candidate start.
    const auto boundary =
        PDFiumRange(engine->GetPage(start_result.page_index()), prefix_end,
                    boundary_count);
    for (const auto& c : boundary.GetText()) {
      if (!base::IsUnicodeWhitespace(c)) {
        return;
      }
    }
  }

  // Without a `text_end`, the suffix (if any) must directly follow the start
  // text, so validate it now.
  if (fragment.text_end().empty() && !fragment.suffix().empty()) {
    text_fragment_suffix =
        FindTextFragmentSuffix(engine, fragment, start_result);
    if (!text_fragment_suffix) {
      return;
    }
  }

  text_fragment_starts.emplace_back(std::move(start_result));
}
// Search callback for `text_end` candidates. Stores `end_result` in
// `text_fragment_end` unless an end has already been recorded. When `fragment`
// has a suffix, the candidate is accepted only if FindTextFragmentSuffix()
// finds the suffix after it; the outcome of that search is written to
// `text_fragment_suffix`.
void AddTextFragmentEndResult(PDFiumEngine* engine,
                              std::optional<PDFiumRange>& text_fragment_end,
                              std::optional<PDFiumRange>& text_fragment_suffix,
                              const shared_highlighting::TextFragment& fragment,
                              PDFiumRange end_result) {
  if (text_fragment_end.has_value()) {
    // An acceptable end was already recorded by an earlier candidate.
    return;
  }

  const bool suffix_required = !fragment.suffix().empty();
  if (suffix_required) {
    text_fragment_suffix = FindTextFragmentSuffix(engine, fragment, end_result);
    if (!text_fragment_suffix.has_value()) {
      return;
    }
  }

  text_fragment_end = std::move(end_result);
}
} // namespace
// Creates a finder that runs its searches through `engine`, which is stored in
// `engine_`.
PDFiumTextFragmentFinder::PDFiumTextFragmentFinder(PDFiumEngine* engine)
: engine_(engine) {}
// Defaulted destructor; no explicit cleanup is performed here.
PDFiumTextFragmentFinder::~PDFiumTextFragmentFinder() = default;
// Runs a text fragment search for each entry of `text_fragments` and returns
// the highlight ranges that were produced. Each entry is parsed with
// shared_highlighting::TextFragment::FromEscapedString() and must parse
// successfully (enforced with CHECK). The accumulated highlights are moved out
// of `text_fragment_highlights_` into the return value.
std::vector<PDFiumRange> PDFiumTextFragmentFinder::FindTextFragments(
    base::span<const std::string> text_fragments) {
  text_fragment_highlights_.clear();

  for (const std::string& escaped_fragment : text_fragments) {
    const auto text_fragment =
        shared_highlighting::TextFragment::FromEscapedString(escaped_fragment);
    CHECK(text_fragment.has_value());
    StartTextFragmentSearch(*text_fragment);
  }

  return std::move(text_fragment_highlights_);
}
// Entry point for searching a single `fragment`. Resets the per-fragment
// search state, then starts from the prefix when the fragment has one and from
// the `text_start` value otherwise.
void PDFiumTextFragmentFinder::StartTextFragmentSearch(
    const shared_highlighting::TextFragment& fragment) {
  // Drop everything left over from a previous search.
  text_fragment_suffix_.reset();
  text_fragment_end_.reset();
  text_fragment_starts_.clear();
  text_fragment_prefixes_.clear();
  last_unsearched_page_ = 0;

  // This may be called before `engine_` has any page information (i.e. before
  // the first call to LoadDocument has happened); there is nothing to search
  // in that case.
  if (engine_->GetNumberOfPages() == 0) {
    return;
  }

  if (fragment.prefix().empty()) {
    // No prefix: begin with the text fragment start value, which is a required
    // value of the fragment.
    FindTextFragmentStart(fragment);
  } else {
    // A prefix is present: begin the search there, from the first page.
    FindTextFragmentPrefix(fragment, /*page_to_start_search_from=*/0);
  }
}
// Searches page by page, beginning at `page_to_start_search_from`, for the
// prefix text of `fragment`. As soon as a page yields at least one prefix
// match, the search moves on to FindTextFragmentStart() and this function
// returns. `last_unsearched_page_` is updated before each page is searched so
// that a later retry can resume on the following page.
void PDFiumTextFragmentFinder::FindTextFragmentPrefix(
    const shared_highlighting::TextFragment& fragment,
    int page_to_start_search_from) {
  text_fragment_prefixes_.clear();
  const auto prefix_text = base::UTF8ToUTF16(fragment.prefix());

  int page = page_to_start_search_from;
  while (page < engine_->GetNumberOfPages()) {
    last_unsearched_page_ = page + 1;
    engine_->SearchForFragment(
        prefix_text,
        /*character_to_start_searching_from=*/0,
        /*last_character_index_to_search=*/-1, page,
        base::BindRepeating(&AddTextFragmentPrefixResult,
                            std::ref(text_fragment_prefixes_)));
    if (!text_fragment_prefixes_.empty()) {
      FindTextFragmentStart(fragment);
      return;
    }
    ++page;
  }
}
// Searches for the `text_start` value of `fragment`. Without recorded prefix
// matches every page is searched; otherwise the search runs once per recorded
// prefix, on that prefix's page. When starts are found the search continues
// in FindTextFragmentEnd(). When none are found after a prefix, the prefix
// search is retried on the pages that have not been searched yet.
void PDFiumTextFragmentFinder::FindTextFragmentStart(
    const shared_highlighting::TextFragment& fragment) {
  text_fragment_starts_.clear();
  const auto start_text = base::UTF8ToUTF16(fragment.text_start());

  if (text_fragment_prefixes_.empty()) {
    // No prefixes were recorded, so none were expected as part of the text
    // fragment. Searching for the start term itself on every page is enough.
    for (int page = 0; page < engine_->GetNumberOfPages(); ++page) {
      engine_->SearchForFragment(
          start_text,
          /*character_to_start_searching_from=*/0,
          /*last_character_index_to_search=*/-1, page,
          base::BindRepeating(&AddTextFragmentStartResult, engine_,
                              std::ref(text_fragment_starts_),
                              std::ref(text_fragment_suffix_), fragment,
                              std::nullopt));
    }

    // With no starts, the text fragment does not exist in the text of this
    // PDF and the search stops here. Otherwise continue; if the fragment has
    // no text end value, FindTextFragmentEnd() concludes the search.
    if (!text_fragment_starts_.empty()) {
      FindTextFragmentEnd(fragment);
    }
    return;
  }

  // Prefixes were recorded: check each one for a `text_start` value that comes
  // after it as expected.
  for (const auto& prefix_match : text_fragment_prefixes_) {
    engine_->SearchForFragment(
        start_text,
        /*character_to_start_searching_from=*/prefix_match.char_index(),
        /*last_character_index_to_search=*/-1, prefix_match.page_index(),
        base::BindRepeating(&AddTextFragmentStartResult, engine_,
                            std::ref(text_fragment_starts_),
                            std::ref(text_fragment_suffix_), fragment,
                            prefix_match));

    // Move on to the next prefix until one of them yields a start. Once a
    // start exists, continue the search; if the fragment has no text end
    // value, FindTextFragmentEnd() concludes it.
    if (!text_fragment_starts_.empty()) {
      FindTextFragmentEnd(fragment);
      return;
    }
  }

  // The `text_start` value was not found after any recorded prefix. If pages
  // remain unsearched, look for the prefix again there in case the fragment is
  // actually on one of them.
  const bool should_retry_prefix =
      text_fragment_starts_.empty() && !fragment.prefix().empty() &&
      last_unsearched_page_ < engine_->GetNumberOfPages();
  if (should_retry_prefix) {
    FindTextFragmentPrefix(fragment, last_unsearched_page_);
  }
}
// Searches for the `text_end` value of `fragment` after each recorded start.
// If the fragment has no `text_end`, the search is finished immediately. On
// the first start that yields an end, that start becomes the only entry in
// `text_fragment_starts_` and the search is finished. If no end is found and
// the fragment has a prefix, the prefix search is retried on the pages that
// have not been searched yet.
void PDFiumTextFragmentFinder::FindTextFragmentEnd(
    const shared_highlighting::TextFragment& fragment) {
  if (fragment.text_end().empty()) {
    FinishTextFragmentSearch();
    return;
  }

  text_fragment_end_.reset();
  const auto end_text = base::UTF8ToUTF16(fragment.text_end());

  for (const auto& candidate_start : text_fragment_starts_) {
    const int first_index_after_start =
        candidate_start.char_index() + candidate_start.char_count();
    engine_->SearchForFragment(
        end_text,
        /*character_to_start_searching_from=*/first_index_after_start,
        /*last_character_index_to_search=*/-1,
        /*page_to_search=*/candidate_start.page_index(),
        base::BindRepeating(&AddTextFragmentEndResult, engine_,
                            std::ref(text_fragment_end_),
                            std::ref(text_fragment_suffix_), fragment));
    if (!text_fragment_end_) {
      continue;
    }

    // An end was found: keep only the start range that was used for this
    // search and wrap up.
    text_fragment_starts_ = {candidate_start};
    FinishTextFragmentSearch();
    return;
  }

  // No text end was found. If the fragment has a prefix and pages remain
  // unsearched, look for the prefix again there.
  const bool should_retry_prefix =
      !text_fragment_end_ && !fragment.prefix().empty() &&
      last_unsearched_page_ < engine_->GetNumberOfPages();
  if (should_retry_prefix) {
    FindTextFragmentPrefix(fragment, last_unsearched_page_);
  }
}
void PDFiumTextFragmentFinder::FinishTextFragmentSearch() {
if (text_fragment_starts_.empty()) {
return;
}
PDFiumRange highlight = text_fragment_starts_[0];
if (text_fragment_end_) {
// The search for `text_fragment_end_` always starts at the character
// index that would be represented by `highlight.char_index() +
// highlight.char_count()`. Because of this
// `text_fragment_end_->char_index()` should always be greater than
// `highlight.char_index()`.
CHECK_GT(text_fragment_end_->char_index(), highlight.char_index());
base::CheckedNumeric<int> new_char_count = text_fragment_end_->char_index();
new_char_count -= highlight.char_index();
new_char_count += text_fragment_end_->char_count();
highlight.SetCharCount(new_char_count.ValueOrDie());
}
text_fragment_highlights_.emplace_back(std::move(highlight));
}
} // namespace chrome_pdf