| // Copyright 2025 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "components/pdf/renderer/pdf_accessibility_tree_builder_structure.h" |
| |
| #include <algorithm> |
| #include <optional> |
| #include <string> |
| #include <vector> |
| |
| #include "base/strings/strcat.h" |
| #include "components/pdf/renderer/pdf_accessibility_tree_builder.h" |
| #include "pdf/accessibility_structs.h" |
| #include "pdf/pdf_accessibility_constants_helper.h" |
| #include "ui/accessibility/ax_enums.mojom-shared.h" |
| #include "ui/accessibility/ax_node_data.h" |
| |
| namespace pdf { |
| |
| namespace { |
| |
| // Adds text runs as inline text box children of static_text_node. Returns the |
| // accumulated text string. |
| std::string AddTextRunsToStaticText( |
| PdfAccessibilityTreeBuilder& builder, |
| ui::AXNodeData* static_text_node, |
| const chrome_pdf::UnassociatedTextRunRange& range) { |
| const auto& text_runs = builder.text_runs(); |
| std::string accumulated_text; |
| |
| for (size_t run_idx = range.start; run_idx <= range.end; ++run_idx) { |
| const chrome_pdf::AccessibilityTextRunInfo& text_run = text_runs[run_idx]; |
| chrome_pdf::PageCharacterIndex page_char_index = { |
| builder.page_index(), builder.text_run_start_indices()[run_idx]}; |
| |
| ui::AXNodeData* inline_text_box = |
| builder.CreateInlineTextBoxNode(text_run, page_char_index); |
| static_text_node->child_ids.push_back(inline_text_box->id); |
| |
| static_text_node->relative_bounds.bounds.Union( |
| inline_text_box->relative_bounds.bounds); |
| base::StrAppend(&accumulated_text, {inline_text_box->GetStringAttribute( |
| ax::mojom::StringAttribute::kName)}); |
| } |
| |
| return accumulated_text; |
| } |
| |
| // Creates a paragraph node containing the text runs in the range. |
| ui::AXNodeData* CreateParagraphFromTextRunRange( |
| PdfAccessibilityTreeBuilder& builder, |
| const chrome_pdf::UnassociatedTextRunRange& range) { |
| ui::AXNodeData* container_node = builder.CreateAndAppendNode( |
| ax::mojom::Role::kParagraph, ax::mojom::Restriction::kReadOnly); |
| |
| chrome_pdf::PageCharacterIndex page_char_index = { |
| builder.page_index(), builder.text_run_start_indices()[range.start]}; |
| ui::AXNodeData* static_text_node = |
| builder.CreateStaticTextNode(page_char_index); |
| container_node->child_ids.push_back(static_text_node->id); |
| |
| std::string accumulated_text = |
| AddTextRunsToStaticText(builder, static_text_node, range); |
| |
| static_text_node->AddStringAttribute(ax::mojom::StringAttribute::kName, |
| accumulated_text); |
| container_node->relative_bounds.bounds = |
| static_text_node->relative_bounds.bounds; |
| |
| return container_node; |
| } |
| |
| } // namespace |
| |
| PdfAccessibilityTreeBuilderStructure::PdfAccessibilityTreeBuilderStructure( |
| PdfAccessibilityTreeBuilder& builder, |
| const chrome_pdf::AccessibilityStructureElement* structure_tree_root) |
| : builder_(builder), structure_tree_root_(structure_tree_root) {} |
| |
| PdfAccessibilityTreeBuilderStructure::~PdfAccessibilityTreeBuilderStructure() = |
| default; |
| |
| void PdfAccessibilityTreeBuilderStructure::BuildPageTree() { |
| InsertUnassociatedTextRunsAtStart(); |
| |
| WalkStructureTree(structure_tree_root_, builder_->page_node()); |
| } |
| |
| // static |
| bool PdfAccessibilityTreeBuilderStructure::StructureTreeHasContent( |
| const chrome_pdf::AccessibilityStructureElement* pdf_struct_element) { |
| if (!pdf_struct_element) { |
| return false; |
| } |
| |
| // Check if this element has direct content. |
| if (!pdf_struct_element->associated_text_runs_if_available.empty() || |
| pdf_struct_element->associated_image_if_available) { |
| return true; |
| } |
| |
| for (const auto& child : pdf_struct_element->children) { |
| if (StructureTreeHasContent(child.get())) { |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| ui::AXNodeData* PdfAccessibilityTreeBuilderStructure::CreateNodeWithTextContent( |
| ui::AXNodeData* parent_node, |
| ax::mojom::Role role, |
| base::span<const raw_ptr<chrome_pdf::AccessibilityTextRunInfo, |
| VectorExperimental>> text_runs) { |
| // Create container node with the specified role (e.g., kParagraph, kHeading). |
| ui::AXNodeData* container_node = |
| builder_->CreateAndAppendNode(role, ax::mojom::Restriction::kReadOnly); |
| parent_node->child_ids.push_back(container_node->id); |
| |
| // Find indices of the associated text runs in the page's text run list. |
| std::vector<size_t> text_run_indices; |
| for (const auto& text_run_ptr : text_runs) { |
| auto it = std::ranges::find_if( |
| builder_->text_runs(), [&text_run_ptr](const auto& tr) { |
| return tr.start_index == text_run_ptr->start_index; |
| }); |
| if (it != builder_->text_runs().end()) { |
| size_t idx = std::distance(builder_->text_runs().begin(), it); |
| text_run_indices.push_back(idx); |
| } |
| } |
| |
| if (text_run_indices.empty()) { |
| return container_node; |
| } |
| |
| // Create static text node as child of container. |
| chrome_pdf::PageCharacterIndex page_char_index = { |
| builder_->page_index(), |
| builder_->text_run_start_indices()[text_run_indices[0]]}; |
| ui::AXNodeData* static_text_node = |
| builder_->CreateStaticTextNode(page_char_index); |
| container_node->child_ids.push_back(static_text_node->id); |
| |
| // Create inline text box nodes for each associated text run. |
| std::string accumulated_text; |
| for (size_t text_run_index : text_run_indices) { |
| const chrome_pdf::AccessibilityTextRunInfo& text_run = |
| (builder_->text_runs())[text_run_index]; |
| page_char_index.char_index = |
| builder_->text_run_start_indices()[text_run_index]; |
| |
| ui::AXNodeData* inline_text_box = |
| builder_->CreateInlineTextBoxNode(text_run, page_char_index); |
| static_text_node->child_ids.push_back(inline_text_box->id); |
| |
| static_text_node->relative_bounds.bounds.Union( |
| inline_text_box->relative_bounds.bounds); |
| accumulated_text += |
| inline_text_box->GetStringAttribute(ax::mojom::StringAttribute::kName); |
| } |
| |
| // If there are any unassociated text runs immediately after the associated |
| // text runs, add them each as sibling text box nodes. |
| // TODO(crbug.com/40707542): Consider using heuristics to determine whether |
| // unassociated text should be siblings or in a separate container. |
| auto range = FindUnassociatedTextRunRangeAtIndex(text_run_indices.back() + 1); |
| if (range) { |
| std::string text = |
| AddTextRunsToStaticText(*builder_, static_text_node, *range); |
| base::StrAppend(&accumulated_text, {text}); |
| } |
| |
| static_text_node->AddStringAttribute(ax::mojom::StringAttribute::kName, |
| accumulated_text); |
| container_node->relative_bounds.bounds = |
| static_text_node->relative_bounds.bounds; |
| |
| return container_node; |
| } |
| |
| ui::AXNodeData* |
| PdfAccessibilityTreeBuilderStructure::CreateNodeWithImageContent( |
| ui::AXNodeData* parent_node, |
| const chrome_pdf::AccessibilityImageInfo& image_info) { |
| ui::AXNodeData* image_node = builder_->CreateImageNode(image_info); |
| parent_node->child_ids.push_back(image_node->id); |
| return image_node; |
| } |
| |
| void PdfAccessibilityTreeBuilderStructure::WalkStructureTree( |
| const chrome_pdf::AccessibilityStructureElement* pdf_struct_element, |
| ui::AXNodeData* parent_node) { |
| if (!pdf_struct_element) { |
| return; |
| } |
| |
| // For purely structural container types (Part, Div), skip content check |
| // and recurse directly to children. This avoids O(n²) redundant checks |
| // for deeply nested Part/Div structures. |
| if (pdf_struct_element->type == chrome_pdf::PdfTagType::kPart || |
| pdf_struct_element->type == chrome_pdf::PdfTagType::kDiv) { |
| for (const auto& child : pdf_struct_element->children) { |
| WalkStructureTree(child.get(), parent_node); |
| } |
| return; |
| } |
| |
| // Skip empty elements entirely. |
| if (!StructureTreeHasContent(pdf_struct_element)) { |
| return; |
| } |
| |
| // Check if this element has direct content. |
| bool has_text = |
| !pdf_struct_element->associated_text_runs_if_available.empty(); |
| bool has_image = !!pdf_struct_element->associated_image_if_available; |
| |
| // Map PDF tag to accessibility role, except kDocument which is mapped to |
| // kGenericContainer to avoid introducing a redundant Document node in the |
| // accessibility tree. |
| const ax::mojom::Role role = |
| pdf_struct_element->type == chrome_pdf::PdfTagType::kDocument |
| ? ax::mojom::Role::kGenericContainer |
| : chrome_pdf::AXRoleFromPdfTagType(pdf_struct_element->type); |
| |
| // Handle elements with both text and image content (from different MCIDs). |
| if (has_text && has_image) { |
| // Create container node with semantic role (e.g., Paragraph) |
| ui::AXNodeData* container_node = CreateNodeWithTextContent( |
| parent_node, role, |
| pdf_struct_element->associated_text_runs_if_available); |
| |
| if (!pdf_struct_element->language.empty()) { |
| container_node->AddStringAttribute(ax::mojom::StringAttribute::kLanguage, |
| pdf_struct_element->language); |
| } |
| if (!pdf_struct_element->abbreviation_expansion.empty()) { |
| container_node->AddStringAttribute( |
| ax::mojom::StringAttribute::kDescription, |
| pdf_struct_element->abbreviation_expansion); |
| } |
| |
| // Add image as additional child of the container. |
| chrome_pdf::AccessibilityImageInfo modified_image = |
| *pdf_struct_element->associated_image_if_available; |
| if (!pdf_struct_element->alt_text.empty()) { |
| modified_image.alt_text = pdf_struct_element->alt_text; |
| } |
| CreateNodeWithImageContent(container_node, modified_image); |
| |
| // TODO(crbug.com/40707542): Handle pdf_struct_element->actual_text as text |
| // override. |
| |
| for (const auto& child : pdf_struct_element->children) { |
| WalkStructureTree(child.get(), container_node); |
| } |
| return; |
| } |
| |
| // Handle elements with text content only. |
| if (has_text) { |
| ui::AXNodeData* node_data = CreateNodeWithTextContent( |
| parent_node, role, |
| pdf_struct_element->associated_text_runs_if_available); |
| |
| if (!pdf_struct_element->alt_text.empty()) { |
| node_data->AddStringAttribute(ax::mojom::StringAttribute::kDescription, |
| pdf_struct_element->alt_text); |
| } else if (!pdf_struct_element->abbreviation_expansion.empty()) { |
| node_data->AddStringAttribute(ax::mojom::StringAttribute::kDescription, |
| pdf_struct_element->abbreviation_expansion); |
| } |
| if (!pdf_struct_element->language.empty()) { |
| node_data->AddStringAttribute(ax::mojom::StringAttribute::kLanguage, |
| pdf_struct_element->language); |
| } |
| |
| // TODO(crbug.com/40707542): Handle pdf_struct_element->actual_text as text |
| // override. |
| |
| for (const auto& child : pdf_struct_element->children) { |
| WalkStructureTree(child.get(), node_data); |
| } |
| return; |
| } |
| |
| // Handle elements with image content. |
| if (has_image) { |
| // Use alt_text from structure element (authoritative per PDF spec). |
| chrome_pdf::AccessibilityImageInfo modified_image = |
| *pdf_struct_element->associated_image_if_available; |
| modified_image.alt_text = pdf_struct_element->alt_text; |
| |
| // PDF images can appear in two contexts: |
| // (i) Figure elements: These are standalone images that should have a |
| // semantic Figure container (role="figure") with an Image child |
| // (role="img"). E.g. photos, diagrams. This matches HTML |
| // <figure><img></figure> semantics. The Figure container holds the alt text |
| // and can have additional children like captions. |
| // |
| // (ii) Non-Figure images: Images that are part of other structural elements |
| // and should be created as direct Image nodes (role="img") within their |
| // parent container. E.g. link elements (clickable images/buttons with |
| // role="link" with img child), formula elements (equations with role="math" |
| // with img child). |
| |
| if (pdf_struct_element->type == chrome_pdf::PdfTagType::kFigure) { |
| // Create a Figure container node with role="figure". |
| ui::AXNodeData* figure_node = builder_->CreateAndAppendNode( |
| ax::mojom::Role::kFigure, ax::mojom::Restriction::kReadOnly); |
| parent_node->child_ids.push_back(figure_node->id); |
| |
| // Add alt text and language to the Figure container (not the image |
| // child). The Figure is the semantic element, like HTML <figure>. |
| if (!pdf_struct_element->alt_text.empty()) { |
| figure_node->AddStringAttribute(ax::mojom::StringAttribute::kName, |
| pdf_struct_element->alt_text); |
| } |
| if (!pdf_struct_element->abbreviation_expansion.empty()) { |
| figure_node->AddStringAttribute( |
| ax::mojom::StringAttribute::kDescription, |
| pdf_struct_element->abbreviation_expansion); |
| } |
| if (!pdf_struct_element->language.empty()) { |
| figure_node->AddStringAttribute(ax::mojom::StringAttribute::kLanguage, |
| pdf_struct_element->language); |
| } |
| |
| ui::AXNodeData* image_node = |
| CreateNodeWithImageContent(figure_node, modified_image); |
| |
| figure_node->relative_bounds.bounds = image_node->relative_bounds.bounds; |
| |
| // Process any children of the Figure (e.g., captions). |
| for (const auto& child : pdf_struct_element->children) { |
| WalkStructureTree(child.get(), figure_node); |
| } |
| } else { |
| // Non-Figure image: Create the image directly within the parent element. |
| // The parent's role (Link, Formula, etc.) provides semantic context. |
| ui::AXNodeData* image_node = |
| CreateNodeWithImageContent(parent_node, modified_image); |
| |
| if (!pdf_struct_element->abbreviation_expansion.empty()) { |
| image_node->AddStringAttribute( |
| ax::mojom::StringAttribute::kDescription, |
| pdf_struct_element->abbreviation_expansion); |
| } |
| if (!pdf_struct_element->language.empty()) { |
| image_node->AddStringAttribute(ax::mojom::StringAttribute::kLanguage, |
| pdf_struct_element->language); |
| } |
| |
| for (const auto& child : pdf_struct_element->children) { |
| WalkStructureTree(child.get(), image_node); |
| } |
| } |
| return; |
| } |
| |
| // Handle empty semantic containers (create container and recurse). |
| ui::AXNodeData* container = |
| builder_->CreateAndAppendNode(role, ax::mojom::Restriction::kReadOnly); |
| parent_node->child_ids.push_back(container->id); |
| |
| if (!pdf_struct_element->alt_text.empty()) { |
| container->AddStringAttribute(ax::mojom::StringAttribute::kDescription, |
| pdf_struct_element->alt_text); |
| } else if (!pdf_struct_element->abbreviation_expansion.empty()) { |
| container->AddStringAttribute(ax::mojom::StringAttribute::kDescription, |
| pdf_struct_element->abbreviation_expansion); |
| } |
| if (!pdf_struct_element->language.empty()) { |
| container->AddStringAttribute(ax::mojom::StringAttribute::kLanguage, |
| pdf_struct_element->language); |
| } |
| |
| for (const auto& child : pdf_struct_element->children) { |
| WalkStructureTree(child.get(), container); |
| } |
| } |
| |
| std::optional<chrome_pdf::UnassociatedTextRunRange> |
| PdfAccessibilityTreeBuilderStructure::FindUnassociatedTextRunRangeAtIndex( |
| size_t range_start) { |
| auto ranges = structure_tree_root_->unassociated_text_run_ranges_for_page; |
| if (ranges.empty()) { |
| return std::nullopt; |
| } |
| |
| auto range = std::ranges::lower_bound( |
| ranges, range_start, {}, &chrome_pdf::UnassociatedTextRunRange::start); |
| |
| if (range != ranges.end() && range->start == range_start) { |
| return *range; |
| } |
| |
| return std::nullopt; |
| } |
| |
| void PdfAccessibilityTreeBuilderStructure::InsertUnassociatedTextRunsAtStart() { |
| auto range = FindUnassociatedTextRunRangeAtIndex(0); |
| if (!range) { |
| return; |
| } |
| |
| ui::AXNodeData* container = |
| CreateParagraphFromTextRunRange(*builder_, *range); |
| |
| builder_->page_node()->child_ids.insert( |
| builder_->page_node()->child_ids.begin(), container->id); |
| } |
| |
| } // namespace pdf |