Tagged PDFs: Retrieves the entire structure tree from the PDF (7/7)
This continues the abandoned CL previously reviewed at
https://crrev.com/c/6339009. Together with the preparation series, this
CL rebases the original work, addresses the reviewer’s comments, and
fixes failures in pdf_unittests and browser_tests. Moreover, it adds
testing for GetStructureTree and the AccessibilityStructureElement
class.
Original message:
The entire structure tree containing information about the organization
and semantics of the PDF is retrieved and sent to the accessibility
layer. This will be used to build the correct accessibility nodes in the
AX tree.
Design doc at go/chrome-tagged-pdfs
AX-Relnotes: n/a.
Bug: 40707542
Change-Id: Ibcb15dfb330a69dffb3b0fdaa27a4db719f0f94e
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6939209
Commit-Queue: Tiago Vignatti <tvignatti@igalia.com>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Auto-Submit: Tiago Vignatti <tvignatti@igalia.com>
Cr-Commit-Position: refs/heads/main@{#1528096}
diff --git a/pdf/accessibility_structs.cc b/pdf/accessibility_structs.cc
index 0aaca54..ad24ee58 100644
--- a/pdf/accessibility_structs.cc
+++ b/pdf/accessibility_structs.cc
@@ -4,8 +4,67 @@
#include "pdf/accessibility_structs.h"
+#include "base/containers/fixed_flat_map.h"
+
namespace chrome_pdf {
+// String-to-`PdfTagType` lookup table; keep entries in `PdfTagType` enum order.
+// TODO(crbug.com/40707542): Consider moving this map to a shared location for
+// use also by PDF printing.
+constexpr auto kStringToPdfTagTypeMap =
+ base::MakeFixedFlatMap<std::string_view, PdfTagType>(
+ {{"", PdfTagType::kNone},
+ {"Document", PdfTagType::kDocument},
+ {"Part", PdfTagType::kPart},
+ {"Art", PdfTagType::kArt},
+ {"Sect", PdfTagType::kSect},
+ {"Div", PdfTagType::kDiv},
+ {"BlockQuote", PdfTagType::kBlockQuote},
+ {"Caption", PdfTagType::kCaption},
+ {"TOC", PdfTagType::kTOC},
+ {"TOCI", PdfTagType::kTOCI},
+ {"Index", PdfTagType::kIndex},
+ {"P", PdfTagType::kP},
+ {"H", PdfTagType::kH},
+ {"H1", PdfTagType::kH1},
+ {"H2", PdfTagType::kH2},
+ {"H3", PdfTagType::kH3},
+ {"H4", PdfTagType::kH4},
+ {"H5", PdfTagType::kH5},
+ {"H6", PdfTagType::kH6},
+ {"L", PdfTagType::kL},
+ {"LI", PdfTagType::kLI},
+ {"Lbl", PdfTagType::kLbl},
+ {"LBody", PdfTagType::kLBody},
+ {"Table", PdfTagType::kTable},
+ {"TR", PdfTagType::kTR},
+ {"TH", PdfTagType::kTH},
+ {"THead", PdfTagType::kTHead},
+ {"TBody", PdfTagType::kTBody},
+ {"TFoot", PdfTagType::kTFoot},
+ {"TD", PdfTagType::kTD},
+ {"Span", PdfTagType::kSpan},
+ {"Link", PdfTagType::kLink},
+ {"Figure", PdfTagType::kFigure},
+ {"Formula", PdfTagType::kFormula},
+ {"Form", PdfTagType::kForm}});
+// The map must have as many entries as there are values before `kUnknown`.
+static_assert(kStringToPdfTagTypeMap.size() ==
+ static_cast<size_t>(PdfTagType::kUnknown));
+
+PdfTagType PdfTagTypeFromString(const std::string& tag_type) {
+  // Any string without an entry in the map is reported as `kUnknown`.
+  const auto match = kStringToPdfTagTypeMap.find(tag_type);
+  if (match == kStringToPdfTagTypeMap.end()) {
+    return PdfTagType::kUnknown;
+  }
+  return match->second;
+}
+
+// Exposes the file-local lookup table. The return type is spelled with
+// `decltype` so that it always names the table's exact type.
+const decltype(kStringToPdfTagTypeMap)& GetPdfTagTypeMap() {
+  return kStringToPdfTagTypeMap;
+}
+
AccessibilityTextStyleInfo::AccessibilityTextStyleInfo() = default;
AccessibilityTextStyleInfo::AccessibilityTextStyleInfo(
@@ -81,6 +140,10 @@
AccessibilityImageInfo::~AccessibilityImageInfo() = default;
+AccessibilityStructureElement::AccessibilityStructureElement() = default;
+
+AccessibilityStructureElement::~AccessibilityStructureElement() = default;
+
AccessibilityDocInfo::AccessibilityDocInfo() = default;
AccessibilityDocInfo::~AccessibilityDocInfo() = default;
diff --git a/pdf/accessibility_structs.h b/pdf/accessibility_structs.h
index 812dc702..c4296ed 100644
--- a/pdf/accessibility_structs.h
+++ b/pdf/accessibility_structs.h
@@ -10,6 +10,8 @@
#include <string>
#include <vector>
+#include "base/containers/fixed_flat_map.h"
+#include "base/memory/raw_ptr.h"
#include "pdf/page_character_index.h"
#include "third_party/skia/include/core/SkBitmap.h"
#include "ui/gfx/geometry/point.h"
@@ -90,6 +92,7 @@
uint32_t len = 0;
// One of various types defined in a PDF tag, such as "Span", "P", "H1", "LI",
// etc.
+ // TODO(crbug.com/40707542): Remove in favor of AccessibilityStructureElement.
std::string tag_type;
gfx::RectF bounds;
AccessibilityTextDirection direction = AccessibilityTextDirection::kNone;
@@ -107,6 +110,7 @@
~AccessibilityImageInfo();
// Alternate text for the image provided by PDF.
+ // TODO(crbug.com/40707542): Remove in favor of AccessibilityStructureElement.
std::string alt_text;
// We anchor the image to a char index, this denotes the text run before
@@ -121,17 +125,101 @@
int32_t page_object_index;
};
+// Indicates the semantic meaning of each `AccessibilityStructureElement`. Such
+// elements can either be associated with `AccessibilityTextRunInfo`s, e.g. when
+// a piece of text is a heading, or stand on their own, e.g. an element
+// representing a table row.
+//
+// Please keep the below enum as close as possible to the list defined in the
+// PDF Specification, ISO 32000-1:2008, table 333.
+//
+// TODO(crbug.com/40707542): Consider moving this definition to a shared
+// location for use also by PDF printing.
+enum class PdfTagType {
+ kNone, // Not present.
+ kDocument,
+ kPart,
+ kArt, // Article.
+ kSect, // Section.
+ kDiv, // Division.
+ kBlockQuote,
+ kCaption,
+ kTOC, // Table of contents.
+ kTOCI, // Table of contents entry.
+ kIndex,
+ kP, // Paragraph.
+ kH, // Heading.
+ kH1,
+ kH2,
+ kH3,
+ kH4,
+ kH5,
+ kH6,
+ kL, // List.
+ kLI, // List item.
+ kLbl, // List marker.
+ kLBody, // List item body.
+ kTable,
+ kTR, // Table row.
+ kTH, // Table header cell.
+ kTHead, // Table row group header.
+ kTBody, // Table row group body.
+ kTFoot, // Table row group footer.
+ kTD, // Table data cell.
+ kSpan,
+ kLink,
+ kFigure,
+ kFormula,
+ kForm,
+ kUnknown, // Unrecognized. Must stay last: its value doubles as a count.
+};
+
+// Given a string containing a PDF tag type, such as "H1", returns the
+// corresponding enum value, such as `PdfTagType::kH1`.
+PdfTagType PdfTagTypeFromString(const std::string& tag_type);
+
+// Returns the PDF tag type string-to-enum map.
+const base::fixed_flat_map<std::string_view, PdfTagType, 35>&
+GetPdfTagTypeMap();
+
+// Represents a node in the PDF's structure tree. This tree represents the
+// logical organization of the text inside the PDF, e.g. when data is placed in
+// a table, or points are placed inside a bulleted list. This should result in
+// additional "structural nodes" being added to the accessibility tree, or in
+// existing nodes getting new accessibility roles / attributes.
+struct AccessibilityStructureElement {
+ AccessibilityStructureElement();
+ AccessibilityStructureElement(const AccessibilityStructureElement&) = delete;
+ AccessibilityStructureElement& operator=(
+ const AccessibilityStructureElement&) = delete;
+ ~AccessibilityStructureElement();
+
+ // Trailing comments indicate corresponding PDF spec dictionary keys.
+ PdfTagType type = PdfTagType::kNone; // /S
+ std::string language; // /Lang
+ std::string alt_text; // /Alt
+ std::string abbreviation_expansion; // /E
+ std::string actual_text; // /ActualText
+ // Non-owning pointers to the text runs tied to this element; may be empty.
+ std::vector<raw_ptr<AccessibilityTextRunInfo, VectorExperimental>>
+ associated_text_runs_if_available;
+ // Owned subtrees (entries may be null), then the non-owning parent link.
+ std::vector<std::unique_ptr<AccessibilityStructureElement>> children;
+ raw_ptr<AccessibilityStructureElement> parent = nullptr;
+};
+
struct AccessibilityDocInfo {
AccessibilityDocInfo();
AccessibilityDocInfo(const AccessibilityDocInfo&) = delete;
AccessibilityDocInfo& operator=(const AccessibilityDocInfo&) = delete;
~AccessibilityDocInfo();
- friend constexpr bool operator==(const AccessibilityDocInfo&,
- const AccessibilityDocInfo&) = default;
+ friend bool operator==(const AccessibilityDocInfo&,
+ const AccessibilityDocInfo&) = default;
uint32_t page_count = 0;
bool is_tagged = false;
+ // In PdfViewWebPlugin this is set only for tagged PDFs while the `kPdfTags`
+ // feature is enabled. NOTE(review): the defaulted operator== compares this
+ // member by pointer value rather than by tree contents; confirm intended.
+ std::unique_ptr<AccessibilityStructureElement> structure_tree_root;
bool text_accessible = false;
bool text_copyable = false;
};
diff --git a/pdf/pdf_view_web_plugin.cc b/pdf/pdf_view_web_plugin.cc
index 6264a8d..955744e 100644
--- a/pdf/pdf_view_web_plugin.cc
+++ b/pdf/pdf_view_web_plugin.cc
@@ -3103,6 +3103,9 @@
doc_info->page_count = engine_->GetNumberOfPages();
if (base::FeatureList::IsEnabled(chrome_pdf::features::kPdfTags)) {
doc_info->is_tagged = engine_->IsPDFDocTagged();
+ if (doc_info->is_tagged) {
+ doc_info->structure_tree_root = engine_->GetStructureTree();
+ }
}
doc_info->text_accessible =
engine_->HasPermission(DocumentPermission::kCopyAccessible);
diff --git a/pdf/pdfium/accessibility_unittest.cc b/pdf/pdfium/accessibility_unittest.cc
index f5c527c..99613c4 100644
--- a/pdf/pdfium/accessibility_unittest.cc
+++ b/pdf/pdfium/accessibility_unittest.cc
@@ -8,6 +8,11 @@
#include <string>
#include "base/compiler_specific.h"
+#include "base/containers/fixed_flat_map.h"
+#include "base/containers/flat_map.h"
+#include "base/strings/strcat.h"
+#include "base/strings/string_number_conversions.h"
+#include "base/strings/string_util.h"
#include "base/test/scoped_feature_list.h"
#include "base/types/zip.h"
#include "pdf/accessibility_structs.h"
@@ -26,6 +31,61 @@
using AccessibilityTest = PDFiumTestBase;
+// Test-only inverse of `PdfTagTypeFromString()`: returns the string registered
+// for `tag_type` in `GetPdfTagTypeMap()`, or "Unknown" when the map has no
+// entry for it. The enum is taken by value since it is cheap to copy.
+std::string_view PdfTagTypeToString(PdfTagType tag_type) {
+  // Built once, on first use, by inverting the string-to-enum map.
+  static const auto kPdfTagTypeToStringMap = []() {
+    base::flat_map<PdfTagType, std::string_view> reverse_map;
+    for (const auto& [str, type] : GetPdfTagTypeMap()) {
+      reverse_map[type] = str;
+    }
+    return reverse_map;
+  }();
+
+  if (auto iter = kPdfTagTypeToStringMap.find(tag_type);
+      iter != kPdfTagTypeToStringMap.end()) {
+    return iter->second;
+  }
+  return "Unknown";
+}
+
+// Serializes `element` and its descendants into a multi-line string for test
+// expectations. Each element becomes one line listing its /S type followed by
+// any non-empty attributes; every level of nesting prefixes the line with
+// another "++". Null children are skipped.
+std::string AccessibilityStructureElementToString(
+    const AccessibilityStructureElement& element) {
+  static constexpr std::string_view kLevelPrefix = "\n++";
+
+  std::string result = base::StrCat({"/S /", PdfTagTypeToString(element.type)});
+
+  // Appends " <key> (<value>)" for string attributes that are set.
+  const auto append_if_set = [&result](std::string_view key,
+                                       const std::string& value) {
+    if (value.empty()) {
+      return;
+    }
+    base::StrAppend(&result, {" ", key, " (", value, ")"});
+  };
+  append_if_set("/Lang", element.language);
+  append_if_set("/Alt", element.alt_text);
+  append_if_set("/E", element.abbreviation_expansion);
+  append_if_set("/ActualText", element.actual_text);
+
+  const auto& text_runs = element.associated_text_runs_if_available;
+  if (!text_runs.empty()) {
+    base::StrAppend(&result, {" AssociatedTextRunLens={"});
+    for (const AccessibilityTextRunInfo* run : text_runs) {
+      base::StrAppend(&result, {" ", base::NumberToString(run->len)});
+    }
+    base::StrAppend(&result, {" }"});
+  }
+
+  for (const auto& child : element.children) {
+    if (child) {
+      std::string subtree = AccessibilityStructureElementToString(*child);
+      // Push every line of the child's output one level deeper.
+      base::ReplaceChars(subtree, "\n", kLevelPrefix, &subtree);
+      base::StrAppend(&result, {kLevelPrefix, subtree});
+    }
+  }
+  return result;
+}
+
float GetExpectedBoundsWidth(bool using_test_fonts, size_t i, float expected) {
return (using_test_fonts && i == 0) ? 85.333336f : expected;
}
@@ -127,6 +187,34 @@
});
}
+TEST_P(AccessibilityTest, AccessibilityStructureTree) {
+ base::test::ScopedFeatureList pdf_tags;
+ pdf_tags.InitAndEnableFeature(features::kPdfTags);
+
+ TestClient client;
+ std::unique_ptr<PDFiumEngine> engine =
+ InitializeEngine(&client, FILE_PATH_LITERAL("tags.pdf"));
+ ASSERT_TRUE(engine);
+ ASSERT_EQ(1, engine->GetNumberOfPages());
+
+ std::unique_ptr<AccessibilityStructureElement> doc_structure =
+ engine->GetStructureTree();
+ ASSERT_TRUE(doc_structure);
+ // Each "++" in the expectation marks one level of nesting below the root.
+ static constexpr char kExpectedStructureTree[] =
+ "/S /Document\n"
+ "++/S /Part\n"
+ "++++/S /Document /Lang (en-US)\n"
+ "++++++/S /Art\n"
+ "++++++/S /BlockQuote\n"
+ "++++++/S /P\n"
+ "++++++/S /H1\n"
+ "++++++/S /H2";
+
+ EXPECT_EQ(kExpectedStructureTree,
+ AccessibilityStructureElementToString(*doc_structure));
+}
+
TEST_P(AccessibilityTest, GetAccessibilityPageWithTags) {
base::test::ScopedFeatureList pdf_tags;
pdf_tags.InitAndEnableFeature(features::kPdfTags);
diff --git a/pdf/pdfium/pdfium_engine.cc b/pdf/pdfium/pdfium_engine.cc
index de2ec3d..5ad64cd2 100644
--- a/pdf/pdfium/pdfium_engine.cc
+++ b/pdf/pdfium/pdfium_engine.cc
@@ -1432,6 +1432,23 @@
return FPDFCatalog_IsTagged(doc());
}
+// Builds a synthetic `kDocument` root holding one child slot per page, in page
+// order. A slot is null when the page returns no structure tree.
+std::unique_ptr<AccessibilityStructureElement> PDFiumEngine::GetStructureTree()
+    const {
+  auto document_root = std::make_unique<AccessibilityStructureElement>();
+  document_root->type = PdfTagType::kDocument;
+  // TODO(crbug.com/40707542): Get the /Lang string from
+  // AccessibilityStructureElement.
+  document_root->children.reserve(pages_.size());
+  for (const std::unique_ptr<PDFiumPage>& page : pages_) {
+    std::unique_ptr<AccessibilityStructureElement> page_root =
+        page->GetStructureTree();
+    if (page_root) {
+      page_root->parent = document_root.get();
+    }
+    // Null results are pushed as well, so child i corresponds to page i.
+    document_root->children.push_back(std::move(page_root));
+  }
+  return document_root;
+}
+
uint32_t PDFiumEngine::GetLoadedByteSize() {
return doc_loader_->GetDocumentSize();
}
diff --git a/pdf/pdfium/pdfium_engine.h b/pdf/pdfium/pdfium_engine.h
index 7d3b0506..e3300f9 100644
--- a/pdf/pdfium/pdfium_engine.h
+++ b/pdf/pdfium/pdfium_engine.h
@@ -379,6 +379,10 @@
// elements, e.g. headings and table cells.
virtual bool IsPDFDocTagged() const;
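+ // Returns a newly built tree describing the logical organization of the PDF.
+ // The root is a `kDocument` element with one child per page; a child is null
+ // when that page has no structure tree to report.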
+ std::unique_ptr<AccessibilityStructureElement> GetStructureTree() const;
+
virtual uint32_t GetLoadedByteSize();
// Copies data from `doc_loader_` into `buffer` starting from `offset`.
diff --git a/pdf/pdfium/pdfium_page.cc b/pdf/pdfium/pdfium_page.cc
index 2a23d24..fe05918 100644
--- a/pdf/pdfium/pdfium_page.cc
+++ b/pdf/pdfium/pdfium_page.cc
@@ -570,6 +570,96 @@
return chars;
}
+// Builds this page's structure tree. Returns nullptr when the page is not
+// available or PDFium has no structure tree for it; otherwise returns a
+// synthetic `kPart` root whose children mirror the tree's top-level structure
+// elements. A child slot is null when PDFium reports no element at that index
+// or when the element was already visited.
+std::unique_ptr<AccessibilityStructureElement> PDFiumPage::GetStructureTree() {
+  if (!available_) {
+    return nullptr;
+  }
+
+  ScopedFPDFStructTree structure_tree(FPDF_StructTree_GetForPage(GetPage()));
+  if (!structure_tree) {
+    return nullptr;
+  }
+
+  auto tree_root = std::make_unique<AccessibilityStructureElement>();
+  tree_root->type = PdfTagType::kPart;
+  std::set<FPDF_STRUCTELEMENT> visited_elements;
+  const int tree_children_count =
+      FPDF_StructTree_CountChildren(structure_tree.get());
+  CHECK_GE(tree_children_count, 0);
+  tree_root->children.resize(tree_children_count);
+  for (int i = 0; i < tree_children_count; ++i) {
+    FPDF_STRUCTELEMENT tree_child =
+        FPDF_StructTree_GetChildAtIndex(structure_tree.get(), i);
+    if (!tree_child) {
+      // GetStructureSubtree() CHECKs its argument, so a missing child has to be
+      // skipped here, just like the child loop in GetStructureSubtree() does.
+      continue;
+    }
+    tree_root->children[i] = GetStructureSubtree(tree_child, visited_elements);
+    if (tree_root->children[i]) {
+      tree_root->children[i]->parent = tree_root.get();
+    }
+  }
+  return tree_root;
+}
+
+// Converts `element` and, recursively, its children into an
+// `AccessibilityStructureElement` subtree. `element` must not be null. Returns
+// nullptr if `element` is already in `visited_elements`, which keeps malformed
+// trees that reference an element more than once from being walked twice.
+std::unique_ptr<AccessibilityStructureElement> PDFiumPage::GetStructureSubtree(
+    FPDF_STRUCTELEMENT element,
+    std::set<FPDF_STRUCTELEMENT>& visited_elements) {
+  CHECK(element);
+  if (!visited_elements.insert(element).second) {
+    return nullptr;
+  }
+
+  auto tree_node = std::make_unique<AccessibilityStructureElement>();
+  // Each converted string is assigned straight into the node, so no extra
+  // std::string copy is made.
+  tree_node->actual_text = base::UTF16ToUTF8(CallPDFiumWideStringBufferApi(
+      base::BindRepeating(&FPDF_StructElement_GetActualText, element),
+      /*check_expected_size=*/true));
+  tree_node->alt_text = base::UTF16ToUTF8(CallPDFiumWideStringBufferApi(
+      base::BindRepeating(&FPDF_StructElement_GetAltText, element),
+      /*check_expected_size=*/true));
+  const std::string tag_type = base::UTF16ToUTF8(CallPDFiumWideStringBufferApi(
+      base::BindRepeating(&FPDF_StructElement_GetType, element),
+      /*check_expected_size=*/true));
+  tree_node->type = PdfTagTypeFromString(tag_type);
+  tree_node->language = base::UTF16ToUTF8(CallPDFiumWideStringBufferApi(
+      base::BindRepeating(&FPDF_StructElement_GetLang, element),
+      /*check_expected_size=*/true));
+
+  if (base::FeatureList::IsEnabled(chrome_pdf::features::kPdfTags)) {
+    // Only the element's first marked content ID is consulted.
+    int marked_content_id = -1;
+    if (FPDF_StructElement_GetMarkedContentIdCount(element) > 0) {
+      marked_content_id =
+          FPDF_StructElement_GetMarkedContentIdAtIndex(element, 0);
+    }
+    if (marked_content_id >= 0) {
+      auto text_runs_iter =
+          marked_content_id_to_text_runs_map_.find(marked_content_id);
+      if (text_runs_iter != marked_content_id_to_text_runs_map_.end()) {
+        const std::vector<size_t>& text_run_indices = text_runs_iter->second;
+        auto& associated_runs = tree_node->associated_text_runs_if_available;
+        associated_runs.reserve(text_run_indices.size());
+        // NOTE(review): these are addresses of `text_runs_` entries, so they
+        // are presumably only valid while `text_runs_` is left unmodified;
+        // confirm against the lifetime of the returned tree.
+        for (size_t text_run_index : text_run_indices) {
+          associated_runs.push_back(&text_runs_[text_run_index]);
+        }
+      }
+
+      // TODO(crbug.com/40707542): Add `associated_image_if_available` field to
+      // `AccessibilityStructureElement` and populate it here by looking up
+      // `marked_content_id` in `marked_content_id_to_images_map_`.
+    }
+  }
+
+  const int children_count = FPDF_StructElement_CountChildren(element);
+  CHECK_GE(children_count, 0);
+  // One slot per child; slots for missing or already visited children stay
+  // null.
+  tree_node->children.resize(children_count);
+  for (int i = 0; i < children_count; ++i) {
+    FPDF_STRUCTELEMENT child = FPDF_StructElement_GetChildAtIndex(element, i);
+    if (!child) {
+      continue;
+    }
+    tree_node->children[i] = GetStructureSubtree(child, visited_elements);
+    if (tree_node->children[i]) {
+      tree_node->children[i]->parent = tree_node.get();
+    }
+  }
+  return tree_node;
+}
+
std::optional<AccessibilityTextRunInfo> PDFiumPage::GetTextRunInfoAt(
int start_char_index) {
FPDF_PAGE page = GetPage();
diff --git a/pdf/pdfium/pdfium_page.h b/pdf/pdfium/pdfium_page.h
index af15801..04e4a11 100644
--- a/pdf/pdfium/pdfium_page.h
+++ b/pdf/pdfium/pdfium_page.h
@@ -49,6 +49,7 @@
struct AccessibilityHighlightInfo;
struct AccessibilityImageInfo;
struct AccessibilityLinkInfo;
+struct AccessibilityStructureElement;
struct AccessibilityTextFieldInfo;
struct AccessibilityTextRunInfo;
@@ -90,6 +91,10 @@
// Get all the chars from the page.
std::vector<AccessibilityCharInfo> GetCharInfo();
+ // Returns a newly built structure tree describing the logical organization
+ // of the current PDF page, or nullptr if the page is not available or has no
+ // structure tree.
+ std::unique_ptr<AccessibilityStructureElement> GetStructureTree();
+
// Gets all the text runs from the page.
std::vector<AccessibilityTextRunInfo> GetTextRunInfo();
@@ -482,6 +487,17 @@
FPDF_STRUCTELEMENT current_element,
std::set<FPDF_STRUCTELEMENT>& visited_elements);
+ // Traverses a structure element and its subtree recursively and extracts
+ // each element's type, language, alt text and actual text, storing them in a
+ // corresponding hierarchy of `AccessibilityStructureElement`s. Also
+ // associates each element with the text runs recorded in
+ // `marked_content_id_to_text_runs_map_` for its first marked content ID;
+ // images in `marked_content_id_to_images_map_` are not handled yet. Uses
+ // `visited_elements` to guard against malformed structure trees.
+ std::unique_ptr<AccessibilityStructureElement> GetStructureSubtree(
+ FPDF_STRUCTELEMENT element,
+ std::set<FPDF_STRUCTELEMENT>& visited_elements);
+
bool PopulateFormFieldProperties(FPDF_ANNOTATION annot,
FormField* form_field);