blob: c4e3e6160d8ab646f933c4d9cdfdb1ee4ca67e95 [file] [log] [blame]
// Copyright 2010 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "pdf/pdfium/pdfium_range.h"
#include <string>
#include "base/check_op.h"
#include "base/containers/span.h"
#include "base/feature_list.h"
#include "base/strings/string_util.h"
#include "pdf/accessibility_structs.h"
#include "pdf/pdfium/pdfium_api_string_buffer_adapter.h"
#include "third_party/pdfium/public/fpdf_searchex.h"
#include "ui/gfx/geometry/point.h"
#include "ui/gfx/geometry/rect.h"
#include "ui/gfx/geometry/rect_f.h"
namespace chrome_pdf {
namespace {
// Enables AccessibilityTextRunInfo-based screen rects.
// TODO(crbug.com/40448046): Remove this kill switch after a safe rollout.
BASE_FEATURE(kPdfAccessibilityTextRunInfoScreenRects,
"PdfAccessibilityTextRunInfoScreenRects",
base::FEATURE_ENABLED_BY_DEFAULT);
void AdjustForBackwardsRange(int& index, int& count) {
if (count < 0) {
count *= -1;
index -= count - 1;
}
}
// Struct with only the text run info needed for PDFiumRange::GetScreenRects().
struct ScreenRectTextRunInfo {
gfx::Rect screen_rect;
size_t char_count;
};
// Returns a ratio between [0, 1].
float GetVerticalOverlap(const gfx::Rect& rect1, const gfx::Rect& rect2) {
CHECK(!rect1.IsEmpty());
CHECK(!rect2.IsEmpty());
gfx::Rect union_rect = rect1;
union_rect.Union(rect2);
if (union_rect.height() == rect1.height() ||
union_rect.height() == rect2.height()) {
return 1.0f;
}
gfx::Rect intersect_rect = rect1;
intersect_rect.Intersect(rect2);
return static_cast<float>(intersect_rect.height()) / union_rect.height();
}
// Returns true if there is sufficient horizontal and vertical overlap.
bool ShouldMergeHorizontalRects(const ScreenRectTextRunInfo& text_run1,
const ScreenRectTextRunInfo& text_run2) {
static constexpr float kVerticalOverlapThreshold = 0.8f;
const gfx::Rect& rect1 = text_run1.screen_rect;
const gfx::Rect& rect2 = text_run2.screen_rect;
if (GetVerticalOverlap(rect1, rect2) < kVerticalOverlapThreshold) {
return false;
}
static constexpr float kHorizontalWidthFactor = 1.0f;
const float average_width1 =
kHorizontalWidthFactor * rect1.width() / text_run1.char_count;
const float average_width2 =
kHorizontalWidthFactor * rect2.width() / text_run2.char_count;
const float rect1_left = rect1.x() - average_width1;
const float rect1_right = rect1.right() + average_width1;
const float rect2_left = rect2.x() - average_width2;
const float rect2_right = rect2.right() + average_width2;
return rect1_left < rect2_right && rect1_right > rect2_left;
}
// Since PDFiumPage::GetTextRunInfo() can end a text run for a variety of
// reasons, post-process the collected text run data and merge rectangles.
std::vector<gfx::Rect> MergeAdjacentRects(
base::span<ScreenRectTextRunInfo> text_runs) {
std::vector<gfx::Rect> results;
const ScreenRectTextRunInfo* previous_text_run = nullptr;
gfx::Rect current_screen_rect;
for (const auto& text_run : text_runs) {
if (previous_text_run) {
// TODO(crbug.com/40448046): Improve vertical text handling.
// For now, treat all text as horizontal, as that is the majority of the
// text. Also, PDFiumPage::GetTextPage() has bugs in its heuristics where
// it mistakenly reports horizontal text as vertical.
if (ShouldMergeHorizontalRects(*previous_text_run, text_run)) {
current_screen_rect.Union(text_run.screen_rect);
} else {
results.push_back(current_screen_rect);
current_screen_rect = text_run.screen_rect;
}
} else {
current_screen_rect = text_run.screen_rect;
}
previous_text_run = &text_run;
}
if (!current_screen_rect.IsEmpty()) {
results.push_back(current_screen_rect);
}
return results;
}
} // namespace
bool IsIgnorableCharacter(char16_t c) {
return c == kZeroWidthSpace || c == kPDFSoftHyphenMarker;
}
// static
PDFiumRange PDFiumRange::AllTextOnPage(PDFiumPage* page) {
return PDFiumRange(page, 0, page->GetCharCount());
}
PDFiumRange::PDFiumRange(PDFiumPage* page, int char_index, int char_count)
: page_unload_preventer_(page),
page_(page),
char_index_(char_index),
char_count_(char_count) {
DCHECK(page_);
// Ensure page load, while `page_unload_preventer_` prevents page unload.
// This prevents GetScreenRects() from triggering page loads, which can have
// surprising side effects, considering GetScreenRects() is const.
[[maybe_unused]] FPDF_TEXTPAGE text_page = page_->GetTextPage();
#if DCHECK_IS_ON()
AdjustForBackwardsRange(char_index, char_count);
DCHECK_LE(char_count, FPDFText_CountChars(text_page));
#endif
}
PDFiumRange::PDFiumRange(const PDFiumRange&) = default;
PDFiumRange& PDFiumRange::operator=(const PDFiumRange&) = default;
PDFiumRange::PDFiumRange(PDFiumRange&&) noexcept = default;
PDFiumRange& PDFiumRange::operator=(PDFiumRange&&) noexcept = default;
PDFiumRange::~PDFiumRange() = default;
void PDFiumRange::SetCharCount(int char_count) {
if (char_count == char_count_) {
return;
}
char_count_ = char_count;
#if DCHECK_IS_ON()
int dummy_index = 0;
AdjustForBackwardsRange(dummy_index, char_count);
DCHECK_LE(char_count, FPDFText_CountChars(page_->GetTextPage()));
#endif
cached_screen_rects_point_ = gfx::Point();
cached_screen_rects_zoom_ = 0;
}
const std::vector<gfx::Rect>& PDFiumRange::GetScreenRects(
const gfx::Point& point,
double zoom,
PageOrientation orientation) const {
if (point == cached_screen_rects_point_ &&
zoom == cached_screen_rects_zoom_) {
return cached_screen_rects_;
}
cached_screen_rects_.clear();
cached_screen_rects_point_ = point;
cached_screen_rects_zoom_ = zoom;
if (char_count_ == 0) {
return cached_screen_rects_;
}
FPDF_TEXTPAGE text_page = page_->GetTextPage();
if (!text_page) {
return cached_screen_rects_;
}
int char_index = char_index_;
int char_count = char_count_;
AdjustForBackwardsRange(char_index, char_count);
DCHECK_GE(char_index, 0) << " start: " << char_index_
<< " count: " << char_count_;
DCHECK_LT(char_index, FPDFText_CountChars(text_page))
<< " start: " << char_index_ << " count: " << char_count_;
if (!base::FeatureList::IsEnabled(kPdfAccessibilityTextRunInfoScreenRects)) {
int count = FPDFText_CountRects(text_page, char_index, char_count);
for (int i = 0; i < count; ++i) {
double left;
double top;
double right;
double bottom;
FPDFText_GetRect(text_page, i, &left, &top, &right, &bottom);
gfx::Rect rect = page_->PageToScreen(point, zoom, left, top, right,
bottom, orientation);
if (rect.IsEmpty()) {
continue;
}
cached_screen_rects_.push_back(rect);
}
return cached_screen_rects_;
}
std::vector<ScreenRectTextRunInfo> text_runs;
const int end_char_index = char_index + char_count;
bool reached_end = false;
while (!reached_end) {
// Should not fail because `text_page` is non-null and `char_index` is
// always in range.
std::optional<AccessibilityTextRunInfo> text_run_info =
page_->GetTextRunInfo(char_index);
CHECK(text_run_info.has_value());
// Figure out how many characters to process in the for-loop below, and
// determine if this while-loop iteration reached the end of the range.
int next_char_index = char_index + text_run_info.value().len;
reached_end = next_char_index >= end_char_index;
if (reached_end) {
next_char_index = end_char_index;
}
// Do not use the bounds from `text_run_info`, as those are in the wrong
// coordinate system. Calculate it here instead.
gfx::Rect text_run_rect;
for (int i = char_index; i < next_char_index; ++i) {
// Use the loose rectangle, which gives the selection a bit more padding.
// In comparison, the rectangle from FPDFText_GetCharBox() surrounds the
// glyph too tightly.
//
// Should not fail because `text_page` is non-null, `i` is always in
// range, and the out-parameter is non-null.
FS_RECTF rect;
bool got_rect = FPDFText_GetLooseCharBox(text_page, i, &rect);
CHECK(got_rect);
// Check for empty `rect` and skip. PDFiumPage::PageToScreen() may round
// an empty `rect` to a 1x1 `screen_rect`, which is hard to distinguish
// from an actual 1x1 `rect`.
if (rect.left == rect.right || rect.top == rect.bottom) {
continue;
}
gfx::Rect screen_rect =
page_->PageToScreen(point, zoom, rect.left, rect.top, rect.right,
rect.bottom, orientation);
text_run_rect.Union(screen_rect);
}
if (!text_run_rect.IsEmpty()) {
text_runs.emplace_back(text_run_rect, next_char_index - char_index);
}
char_index = next_char_index;
}
cached_screen_rects_ = MergeAdjacentRects(text_runs);
return cached_screen_rects_;
}
std::u16string PDFiumRange::GetText() const {
int index = char_index_;
int count = char_count_;
std::u16string result;
if (count == 0)
return result;
AdjustForBackwardsRange(index, count);
if (count > 0) {
// Note that the `expected_size` value includes the NUL terminator.
//
// Cannot set `check_expected_size` to true here because the fix to
// https://crbug.com/pdfium/1139 made it such that FPDFText_GetText() is
// not always consistent with FPDFText_CountChars() and may trim characters.
//
// Instead, treat `count` as the requested count, but use the size of
// `result` as the source of truth for how many characters
// FPDFText_GetText() actually wrote out.
PDFiumAPIStringBufferAdapter<std::u16string> api_string_adapter(
&result, /*expected_size=*/count + 1, /*check_expected_size=*/false);
unsigned short* data =
reinterpret_cast<unsigned short*>(api_string_adapter.GetData());
int written = FPDFText_GetText(page_->GetTextPage(), index, count, data);
// FPDFText_GetText() returns 0 on failure. Never negative value.
DCHECK_GE(written, 0);
api_string_adapter.Close(written);
const gfx::RectF page_bounds = page_->GetCroppedRect();
std::u16string in_bound_text;
in_bound_text.reserve(result.size());
// If FPDFText_GetText() trimmed off characters, figure out how many were
// trimmed from the front. Store the result in `index_offset`, so the
// IsCharInPageBounds() calls below can have the correct index.
CHECK_GE(static_cast<size_t>(count), result.size());
size_t trimmed_count = static_cast<size_t>(count) - result.size();
int index_offset = 0;
while (trimmed_count) {
if (FPDFText_GetTextIndexFromCharIndex(page_->GetTextPage(),
index + index_offset) >= 0) {
break;
}
--trimmed_count;
++index_offset;
}
for (size_t i = 0; i < result.size(); ++i) {
// Filter out characters outside the page bounds, which are semantically
// not part of the page.
if (page_->IsCharInPageBounds(index + index_offset + i, page_bounds))
in_bound_text += result[i];
}
result = in_bound_text;
std::erase_if(result, IsIgnorableCharacter);
}
return result;
}
} // namespace chrome_pdf