// Source blob: 825c0dfbc25b95f001575ec19321b5fb11b9f1e9
// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef THIRD_PARTY_BLINK_RENDERER_MODULES_CONTENT_EXTRACTION_DOCUMENT_CHUNKER_H_
#define THIRD_PARTY_BLINK_RENDERER_MODULES_CONTENT_EXTRACTION_DOCUMENT_CHUNKER_H_
#include "third_party/blink/renderer/core/dom/node.h"
#include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
#include "third_party/blink/renderer/platform/wtf/vector.h"
namespace blink {
class HTMLIFrameElement;
// Returns true if the content of `iframe_element` should be included for
// inner text or document passages.
// The criteria used to decide are not visible from this declaration; see the
// definition. Note that HTMLIFrameElement is only forward-declared in this
// header, so callers need its full definition from elsewhere.
bool ShouldContentExtractionIncludeIFrame(const HTMLIFrameElement& iframe_element);
// Chunks documents into text passages. Each passage contains either a single
// node of text, or the text of the node and its siblings and descendants if the
// total number of words is less than max_words_per_aggregate_passage. This is
// done by recursively walking the document tree, gathering the content of
// individual text nodes ("segments") and then aggregating these into longer
// strings ("passages"), each containing whitespace-joined segments from zero or
// more siblings and descendants.
class DocumentChunker {
public:
// Parameters:
// max_words_per_aggregate_passage: Maximum number of words in a passage
// comprised of multiple nodes. A passage with text from only a single
// node may exceed this max.
// greedily_aggregate_sibling_nodes: If true, sibling nodes are greedily
// aggregated into passages under max_words_per_aggregate_passage words. If
// false, each sibling node is output as a separate passage if they cannot
// all be combined into a single passage under
// max_words_per_aggregate_passage words.
// max_passages: Not described by the original comment. Presumably an upper
// bound on the number of passages produced; how it is enforced is not
// visible in this header -- TODO confirm against the implementation.
// min_words_per_passage: Not described by the original comment. Presumably
// the minimum handed to PassageList::AddPassageForNode, which only adds
// passages containing more words than that minimum -- TODO confirm.
DocumentChunker(size_t max_words_per_aggregate_passage,
bool greedily_aggregate_sibling_nodes,
uint32_t max_passages,
uint32_t min_words_per_passage);
// Chunks the node and its descendants into text passages.
// `tree` is the root node of the subtree to chunk.
// Returns a vector of text passages.
Vector<String> Chunk(const Node& tree);
private:
// Forward declaration: PassageList::AddPassageForNode takes an AggregateNode,
// while AggregateNode itself holds a PassageList by value.
struct AggregateNode;
// A list of finished aggregations of text segments, built from the leaves up.
struct PassageList {
// Creates and adds a text passage for the input node, if it is non-empty,
// and contains more words than the given minimum.
// `node` supplies the text for the passage (presumably through
// AggregateNode::CreatePassage -- confirm in the implementation).
// `min_words_per_passage` is the word-count minimum described above.
void AddPassageForNode(const AggregateNode& node,
size_t min_words_per_passage);
// Extends this PassageList from another given |passage_list|.
// |passage_list| is taken by const reference, so it is not modified.
void Extend(const PassageList& passage_list);
// Passages are completed aggregations of text segments. It is possible
// for a single passage to exceed max_words_per_aggregate_passage but the
// aggregation process tries to avoid it. This has an inline capacity of
// 32 to avoid excessive per-node reallocations during the recursive walk.
Vector<String, 32> passages;
};
// Contains aggregate information about a node and its descendants.
struct AggregateNode {
// Returns true if |node| can be added without exceeding |max_words|.
// NOTE(review): described as a query but not declared const; confirm
// whether it mutates this AggregateNode or could be const-qualified.
bool Fits(const AggregateNode& node, size_t max_words);
// Adds the input node to this AggregateNode.
// Which fields of |node| are merged is not visible here; presumably its
// segments and word count -- confirm in the implementation.
void AddNode(const AggregateNode& node);
// Returns a text passage built from joined |segments|.
String CreatePassage() const;
// Segments of text that are part of this AggregateNode.
// These are accumulated as work in progress toward creating full passages.
// Uses the same inline capacity (32) as PassageList::passages.
Vector<String, 32> segments;
// Total number of words in |segments|.
size_t num_words = 0;
// Completed passages for this node and its descendants.
PassageList passage_list;
};
// Recursively processes a node and its descendants, returning early if
// a maximum |depth| is reached.
// |node| is the node to process and |depth| its recursion depth; the depth
// limit itself is not declared in this header.
// |passage_count| is not described by the original comment. Presumably the
// number of passages already produced, checked against max_passages_ --
// TODO confirm against the implementation.
// Returns the AggregateNode for |node|, returned by value.
AggregateNode ProcessNode(const Node& node,
int depth,
uint32_t passage_count);
// The members below share names with the constructor parameters (plus a
// trailing underscore) and presumably store those arguments unchanged.
// See the constructor comment for max_words_per_aggregate_passage.
size_t max_words_per_aggregate_passage_;
// See the constructor comment for greedily_aggregate_sibling_nodes.
bool greedily_aggregate_sibling_nodes_;
// See the constructor comment for max_passages.
uint32_t max_passages_;
// See the constructor comment for min_words_per_passage.
uint32_t min_words_per_passage_;
};
} // namespace blink
#endif // THIRD_PARTY_BLINK_RENDERER_MODULES_CONTENT_EXTRACTION_DOCUMENT_CHUNKER_H_