ios/web/annotations/resources/text_extractor.ts - codesearch/chromium/src - Git at Google

 // Copyright 2023 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 /**
  * @fileoverview Interface used to extract visible text on the page, add extra
  * text at the ends, and pass it on to a consumer.
  */

 import type {TextWithSymbolIndex} from '//ios/web/annotations/resources/text_dom_utils.js';
 import {nextLeaf, previousLeaf} from '//ios/web/annotations/resources/text_dom_utils.js';
 import type {TextNodeVisitor} from '//ios/web/annotations/resources/text_intersection_observer.js';

 // TODO(crbug.com/40936184): investigate concatenation of nodes and RTL
 // languages.

 // Separator string inserted into the extracted text. Intent detection should
 // not match across it.
 const SECTION_BREAK = ' ‡ ';

 // Default upper bound on the number of characters of surrounding text that is
 // added at each end of the extracted text.
 const EXTRA_CHARACTERS_AT_END = 128;

 // Node names of the elements considered inline. Entering or leaving any other
 // element separates the surrounding text with a space.
 const KNOWN_INLINE_ELEMENTS = new Set<string>([
   'A', 'ABBR', 'B', 'CITE', 'CODE', 'I', 'DFN', 'EM', 'MARK', 'SMALL', 'SPAN',
   'STRONG', 'SUB', 'SUP', 'VAR',
 ]);

 // Pairs a text node with an index: the position at which that node's text
 // starts within the full extracted text. Breaks and spacers inserted in the
 // extracted text have no `TextSection`, and neither do text nodes that are
 // empty or contain only spaces and newlines.
 export class TextSection {
   // The node is held through a `WeakRef`, so a section alone does not keep
   // its node alive.
   private sourceTextNode: WeakRef<TextWithSymbolIndex>;

   constructor(textNode: TextWithSymbolIndex, public index: number) {
     this.sourceTextNode = new WeakRef(textNode);
   }

   // The source text node, or `null` once the weak reference no longer
   // resolves.
   get textNode(): TextWithSymbolIndex|null {
     return this.sourceTextNode.deref() ?? null;
   }
 }

 // Callback signature for consumers of extracted text: it is called with a
 // `TextChunk` and returns nothing.
 export interface TextChunkConsumer {
   (chunk: TextChunk): void;
 }

 // A piece of extracted text together with the sections needed to map an index
 // in that text back to the node the text came from.
 export class TextChunk {
   // Text accumulated by successive calls to `add`.
   text: string = '';
   // Sections accumulated by successive calls to `add`.
   sections: TextSection[] = [];

   // `firstNodeOffset` is the offset to the first character in the first
   // `TextSection`. The text before the offset is not included in `text`.
   // The offset will be subtracted from the index of the first node when
   // calling the section enumerator. `visibleStart` and `visibleEnd` define the
   // range in which any annotation having at least one character in will be
   // decorated.
   constructor(
       public firstNodeOffset: number, public visibleStart: number,
       public visibleEnd: number) {}

   // Appends `sections` to the current list and `text` to the current text.
   // Each incoming section's `index` is shifted, in place, by the length of the
   // text already held, so it ends up relative to the whole chunk text. Note
   // that `firstNodeOffset` is independent of this.
   add(sections: TextSection[], text: string): void {
     const offset = this.text.length;
     // Sections are appended one at a time instead of with
     // `push(...sections)`: spreading a very large array into call arguments
     // can exceed the engine's argument limit and throw a `RangeError`.
     const count = sections.length;
     for (let i = 0; i < count; i++) {
       const section = sections[i]!;
       section.index += offset;
       this.sections.push(section);
     }
     this.text += text;
   }
 }

 // A `TextNodeVisitor` that assembles the visited text. It adds spaces and
 // section breaks where needed, concatenates prefix and suffix text (of at most
 // `extraCharactersAtEnd` characters) at each end, and passes the resulting
 // `TextChunk` to the consumer.
 export class TextExtractor implements TextNodeVisitor {
   // `consumer` is called from `end()` with the assembled chunk.
   // `extraCharactersAtEnd` bounds the length of the prefix and of the postfix.
   // `sectionBreak` is the text pushed by `invisibleNode()` to separate text
   // that should not be read as contiguous.
   constructor(
       private consumer: TextChunkConsumer,
       private extraCharactersAtEnd = EXTRA_CHARACTERS_AT_END,
       private sectionBreak = SECTION_BREAK) {}

   // Pieces of text (node text, spacers and breaks) gathered since `begin()`.
   private parts: string[] = [];
   // One section for each text node whose text was pushed to `parts`.
   private sections: TextSection[] = [];

   // `true` when a text break has been added. A text break is meant to replace
   // non visible or invalid nodes to avoid creating false context by combining
   // text before and after the break. Starts `true` so that no break is pushed
   // before the first text.
   private broken = true;
   // Current index, it is equivalent to `this.parts.join('').length`.
   private index = 0;

   // `true` when a space has already been added between text. Starts `true` so
   // that no spacer is pushed before the first text.
   spaced = true;

   // Mark: TextNodeVisitor

   // Resets all accumulated state so a new extraction can start.
   begin(): void {
     this.parts = [];
     this.sections = [];
     this.broken = true;
     this.spaced = true;
     this.index = 0;
   }

   // Records the text of a visible text node along with a section pointing at
   // it. A node containing only whitespace adds at most one spacer and no
   // section.
   visibleTextNode(textNode: Text): void {
     if (textNode.textContent!.trim()) {
       this.parts.push(textNode.textContent!);
       this.sections.push(new TextSection(textNode, this.index));
       this.index += textNode.textContent!.length;
       this.broken = false;
       this.spaced = false;
     } else {
       this.addSpaceIfNeeded();
     }
   }

   // Entering an element that is not a known inline element separates the
   // surrounding text with a space.
   enterVisibleNode(node: Node): void {
     if (node instanceof Element && !KNOWN_INLINE_ELEMENTS.has(node.nodeName)) {
       this.addSpaceIfNeeded();
     }
   }

   // Leaving an element that is not a known inline element separates the
   // surrounding text with a space.
   leaveVisibleNode(node: Node): void {
     if (node instanceof Element && !KNOWN_INLINE_ELEMENTS.has(node.nodeName)) {
       this.addSpaceIfNeeded();
     }
   }

   // Handles a node that is not visible: comments are ignored, blank text nodes
   // count as a spacer, and anything else inserts a section break unless one
   // was just inserted.
   invisibleNode(node: Node): void {
     if (node.nodeType === Node.COMMENT_NODE) {
       // Completely ignore comments.
     } else if (
         node.nodeType === Node.TEXT_NODE &&
         (!node.textContent || !node.textContent.trim())) {
       // Skip empty text nodes. They are not real breaks.
       this.addSpaceIfNeeded();
     } else if (!this.broken) {
       // Text section break, no section registered.
       this.parts.push(this.sectionBreak);
       this.index += this.sectionBreak.length;
       this.broken = true;
       this.spaced = false;
     }
   }

   // Builds the final `TextChunk` (prefix, collected text, postfix) and hands
   // it to the consumer. Does nothing when no section was collected.
   end(): void {
     // If there's no new text, cancel extraction. It doesn't make sense
     // to send prefix and suffix characters and send those two ends.
     if (this.sections.length === 0) {
       return;
     }

     // To catch an address on multiple line scrolling in, the extra 'window'
     // (rootMargin) isn't enough, it just pushes the problem below or above the
     // viewport. This solves the issue by always adding extra text before
     // and after, regardless of that text's visibility and not removing it
     // from the DOM observer or intersection observer.
     const firstNode: Node = this.sections[0]!.textNode!;
     const [firstNodeOffset, prefixText, prefixSections] =
         this.extractPrefix(firstNode);
     const lastNode: Node = this.sections[this.sections.length - 1]!.textNode!;
     const [postfixText, postfixSections] =
         this.extractPostfix(lastNode, this.spaced);
     // `join` is used rather than `''.concat(...this.parts)`: `parts` grows
     // with the number of visited nodes and spreading a very large array into
     // call arguments can throw a `RangeError`.
     const text = this.parts.join('');
     // The visible range covers exactly the collected text, which sits between
     // the prefix and the postfix.
     const chunk = new TextChunk(
         firstNodeOffset, prefixText.length, prefixText.length + text.length);
     chunk.add(prefixSections, prefixText);
     chunk.add(this.sections, text);
     chunk.add(postfixSections, postfixText);
     this.consumer(chunk);
   }

   // Mark: Private API

   // Adds a single space between parts if there was none.
   private addSpaceIfNeeded() {
     if (!this.spaced) {
       // Spacer, no section registered.
       this.parts.push(' ');
       this.index++;
       this.spaced = true;
     }
   }

   // Extracts up to `extraCharactersAtEnd` before `beforeNode`. Returns the
   // offset to the first character in the first section, the combined text
   // and the array of `TextSection`. In case nothing can be found, a zero
   // offset, empty text and empty sections are returned.
   private extractPrefix(beforeNode: Node): [number, string, TextSection[]] {
     // Both arrays are filled walking backward and reversed at the end. The
     // text starts with the spacer that will separate the prefix from the
     // collected text.
     const sections: TextSection[] = [];
     const parts: string[] = [' '];
     // Leave space for a space and start `index` at 1 from the end. `index` is
     // the number of characters still available; it counts down.
     let index = this.extraCharactersAtEnd - 1;
     // Keep track of latest offset and since the traversal is backward, the
     // last value will be for the first node.
     let offset = 0;
     let spaced = true;
     let node: Node|null = previousLeaf(beforeNode, /* breakAtInvalid= */ true);
     while (node && index > 0) {
       if (node.nodeType === Node.TEXT_NODE && node.textContent &&
           node.textContent.trim()) {
         // Keep only the tail of the node's text when it does not fit.
         const textLength = node.textContent.length;
         const minLength = Math.min(index, textLength);
         offset = textLength - minLength;
         parts.push(node.textContent.substring(offset));
         sections.push(new TextSection(node as Text, index - minLength));
         index -= minLength;
         spaced = false;
       } else if (!spaced) {
         parts.push(' ');
         index--;
         spaced = true;
       }
       node = previousLeaf(node, /* breakAtInvalid= */ true);
     }
     if (sections.length > 0) {
       sections.reverse();
       parts.reverse();
       const text = parts.join('');
       // index will be > 0 if there wasn't enough text, so adjust the
       // sections to match the `text`.
       if (index > 0) {
         for (const section of sections) {
           section.index -= index;
         }
       }
       return [offset, text, sections];
     }
     return [0, '', []];
   }

   // Extracts up to `extraCharactersAtEnd` after `afterNode`. Returns the
   // combined text and the array of `TextSection`. `alreadySpaced` tells
   // whether the collected text already ends with a spacer; if not, the
   // postfix starts with one. In case nothing can be found, empty text and
   // sections are returned.
   private extractPostfix(afterNode: Node, alreadySpaced: boolean):
       [string, TextSection[]] {
     const sections: TextSection[] = [];
     const parts: string[] = [];
     let index = 0;
     let spaced = alreadySpaced;
     if (!alreadySpaced) {
       parts.push(' ');
       index++;
       spaced = true;
     }
     const maxChars = alreadySpaced ? this.extraCharactersAtEnd :
                                      this.extraCharactersAtEnd - 1;
     let node: Node|null = nextLeaf(afterNode, /* breakAtInvalid= */ true);
     while (node && index < maxChars) {
       if (node.nodeType === Node.TEXT_NODE && node.textContent &&
           node.textContent.trim()) {
         // Keep only the head of the node's text when it does not fit.
         const textLength = node.textContent.length;
         const minLength = Math.min(maxChars - index, textLength);
         parts.push(node.textContent.substring(0, minLength));
         sections.push(new TextSection(node as Text, index));
         index += minLength;
         spaced = false;
       } else if (!spaced) {
         parts.push(' ');
         index++;
         spaced = true;
       }
       node = nextLeaf(node, /* breakAtInvalid= */ true);
     }
     if (sections.length > 0) {
       const text = parts.join('');
       return [text, sections];
     }
     return ['', []];
   }
 }
	// Copyright 2023 The Chromium Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	/**
	* @fileoverview Interface used to extract visible text on the page, add extra
	* text at the ends, and pass it on to a consumer.
	*/

	import type {TextWithSymbolIndex} from '//ios/web/annotations/resources/text_dom_utils.js';
	import {nextLeaf, previousLeaf} from '//ios/web/annotations/resources/text_dom_utils.js';
	import type {TextNodeVisitor} from '//ios/web/annotations/resources/text_intersection_observer.js';

	// TODO(crbug.com/40936184): investigate concatenation of nodes and RTL
	// languages.

	// Separator string inserted into the extracted text. Intent detection should
	// not match across it.
	const SECTION_BREAK = ' ‡ ';

	// Default upper bound on the number of characters of surrounding text that is
	// added at each end of the extracted text.
	const EXTRA_CHARACTERS_AT_END = 128;

	// Node names of the elements considered inline. Entering or leaving any other
	// element separates the surrounding text with a space.
	const KNOWN_INLINE_ELEMENTS = new Set<string>([
	  'A', 'ABBR', 'B', 'CITE', 'CODE', 'I', 'DFN', 'EM', 'MARK', 'SMALL', 'SPAN',
	  'STRONG', 'SUB', 'SUP', 'VAR',
	]);

	// Pairs a text node with an index: the position at which that node's text
	// starts within the full extracted text. Breaks and spacers inserted in the
	// extracted text have no `TextSection`, and neither do text nodes that are
	// empty or contain only spaces and newlines.
	export class TextSection {
	  // The node is held through a `WeakRef`, so a section alone does not keep
	  // its node alive.
	  private sourceTextNode: WeakRef<TextWithSymbolIndex>;

	  constructor(textNode: TextWithSymbolIndex, public index: number) {
	    this.sourceTextNode = new WeakRef<TextWithSymbolIndex>(textNode);
	  }

	  // The source text node, or `null` once the weak reference no longer
	  // resolves.
	  get textNode(): TextWithSymbolIndex|null {
	    return this.sourceTextNode.deref() || null;
	  }
	}

	// Callback signature for consumers of extracted text: it is called with a
	// `TextChunk` and returns nothing.
	export interface TextChunkConsumer {
	(chunk: TextChunk): void;
	}

	// A piece of extracted text together with the sections needed to map an index
	// in that text back to the node the text came from.
	export class TextChunk {
	  // Text accumulated by successive calls to `add`.
	  text: string = '';
	  // Sections accumulated by successive calls to `add`.
	  sections: TextSection[] = [];

	  // `firstNodeOffset` is the offset to the first character in the first
	  // `TextSection`. The text before the offset is not included in `text`.
	  // The offset will be subtracted from the index of the first node when
	  // calling the section enumerator. `visibleStart` and `visibleEnd` define the
	  // range in which any annotation having at least one character in will be
	  // decorated.
	  constructor(
	      public firstNodeOffset: number, public visibleStart: number,
	      public visibleEnd: number) {}

	  // Appends `sections` to the current list and `text` to the current text.
	  // Each incoming section's `index` is shifted, in place, by the length of the
	  // text already held, so it ends up relative to the whole chunk text. Note
	  // that `firstNodeOffset` is independent of this.
	  add(sections: TextSection[], text: string): void {
	    const offset = this.text.length;
	    // Sections are appended one at a time instead of with
	    // `push(...sections)`: spreading a very large array into call arguments
	    // can exceed the engine's argument limit and throw a `RangeError`.
	    const count = sections.length;
	    for (let i = 0; i < count; i++) {
	      const section = sections[i]!;
	      section.index += offset;
	      this.sections.push(section);
	    }
	    this.text += text;
	  }
	}

	// A `TextNodeVisitor` that assembles the visited text. It adds spaces and
	// section breaks where needed, concatenates prefix and suffix text (of at most
	// `extraCharactersAtEnd` characters) at each end, and passes the resulting
	// `TextChunk` to the consumer.
	export class TextExtractor implements TextNodeVisitor {
	  // `consumer` is called from `end()` with the assembled chunk.
	  // `extraCharactersAtEnd` bounds the length of the prefix and of the postfix.
	  // `sectionBreak` is the text pushed by `invisibleNode()` to separate text
	  // that should not be read as contiguous.
	  constructor(
	      private consumer: TextChunkConsumer,
	      private extraCharactersAtEnd = EXTRA_CHARACTERS_AT_END,
	      private sectionBreak = SECTION_BREAK) {}

	  // Pieces of text (node text, spacers and breaks) gathered since `begin()`.
	  private parts: string[] = [];
	  // One section for each text node whose text was pushed to `parts`.
	  private sections: TextSection[] = [];

	  // `true` when a text break has been added. A text break is meant to replace
	  // non visible or invalid nodes to avoid creating false context by combining
	  // text before and after the break. Starts `true` so that no break is pushed
	  // before the first text.
	  private broken = true;
	  // Current index, it is equivalent to `this.parts.join('').length`.
	  private index = 0;

	  // `true` when a space has already been added between text. Starts `true` so
	  // that no spacer is pushed before the first text.
	  spaced = true;

	  // Mark: TextNodeVisitor

	  // Resets all accumulated state so a new extraction can start.
	  begin(): void {
	    this.parts = [];
	    this.sections = [];
	    this.broken = true;
	    this.spaced = true;
	    this.index = 0;
	  }

	  // Records the text of a visible text node along with a section pointing at
	  // it. A node containing only whitespace adds at most one spacer and no
	  // section.
	  visibleTextNode(textNode: Text): void {
	    if (textNode.textContent!.trim()) {
	      this.parts.push(textNode.textContent!);
	      this.sections.push(new TextSection(textNode, this.index));
	      this.index += textNode.textContent!.length;
	      this.broken = false;
	      this.spaced = false;
	    } else {
	      this.addSpaceIfNeeded();
	    }
	  }

	  // Entering an element that is not a known inline element separates the
	  // surrounding text with a space.
	  enterVisibleNode(node: Node): void {
	    if (node instanceof Element && !KNOWN_INLINE_ELEMENTS.has(node.nodeName)) {
	      this.addSpaceIfNeeded();
	    }
	  }

	  // Leaving an element that is not a known inline element separates the
	  // surrounding text with a space.
	  leaveVisibleNode(node: Node): void {
	    if (node instanceof Element && !KNOWN_INLINE_ELEMENTS.has(node.nodeName)) {
	      this.addSpaceIfNeeded();
	    }
	  }

	  // Handles a node that is not visible: comments are ignored, blank text nodes
	  // count as a spacer, and anything else inserts a section break unless one
	  // was just inserted.
	  invisibleNode(node: Node): void {
	    if (node.nodeType === Node.COMMENT_NODE) {
	      // Completely ignore comments.
	    } else if (
	        node.nodeType === Node.TEXT_NODE &&
	        (!node.textContent || !node.textContent.trim())) {
	      // Skip empty text nodes. They are not real breaks.
	      this.addSpaceIfNeeded();
	    } else if (!this.broken) {
	      // Text section break, no section registered.
	      this.parts.push(this.sectionBreak);
	      this.index += this.sectionBreak.length;
	      this.broken = true;
	      this.spaced = false;
	    }
	  }

	  // Builds the final `TextChunk` (prefix, collected text, postfix) and hands
	  // it to the consumer. Does nothing when no section was collected.
	  end(): void {
	    // If there's no new text, cancel extraction. It doesn't make sense
	    // to send prefix and suffix characters and send those two ends.
	    if (this.sections.length === 0) {
	      return;
	    }

	    // To catch an address on multiple line scrolling in, the extra 'window'
	    // (rootMargin) isn't enough, it just pushes the problem below or above the
	    // viewport. This solves the issue by always adding extra text before
	    // and after, regardless of that text's visibility and not removing it
	    // from the DOM observer or intersection observer.
	    const firstNode: Node = this.sections[0]!.textNode!;
	    const [firstNodeOffset, prefixText, prefixSections] =
	        this.extractPrefix(firstNode);
	    const lastNode: Node = this.sections[this.sections.length - 1]!.textNode!;
	    const [postfixText, postfixSections] =
	        this.extractPostfix(lastNode, this.spaced);
	    // `join` is used rather than `''.concat(...this.parts)`: `parts` grows
	    // with the number of visited nodes and spreading a very large array into
	    // call arguments can throw a `RangeError`.
	    const text = this.parts.join('');
	    // The visible range covers exactly the collected text, which sits between
	    // the prefix and the postfix.
	    const chunk = new TextChunk(
	        firstNodeOffset, prefixText.length, prefixText.length + text.length);
	    chunk.add(prefixSections, prefixText);
	    chunk.add(this.sections, text);
	    chunk.add(postfixSections, postfixText);
	    this.consumer(chunk);
	  }

	  // Mark: Private API

	  // Adds a single space between parts if there was none.
	  private addSpaceIfNeeded() {
	    if (!this.spaced) {
	      // Spacer, no section registered.
	      this.parts.push(' ');
	      this.index++;
	      this.spaced = true;
	    }
	  }

	  // Extracts up to `extraCharactersAtEnd` before `beforeNode`. Returns the
	  // offset to the first character in the first section, the combined text
	  // and the array of `TextSection`. In case nothing can be found, a zero
	  // offset, empty text and empty sections are returned.
	  private extractPrefix(beforeNode: Node): [number, string, TextSection[]] {
	    // Both arrays are filled walking backward and reversed at the end. The
	    // text starts with the spacer that will separate the prefix from the
	    // collected text.
	    const sections: TextSection[] = [];
	    const parts: string[] = [' '];
	    // Leave space for a space and start `index` at 1 from the end. `index` is
	    // the number of characters still available; it counts down.
	    let index = this.extraCharactersAtEnd - 1;
	    // Keep track of latest offset and since the traversal is backward, the
	    // last value will be for the first node.
	    let offset = 0;
	    let spaced = true;
	    let node: Node|null = previousLeaf(beforeNode, /* breakAtInvalid= */ true);
	    while (node && index > 0) {
	      if (node.nodeType === Node.TEXT_NODE && node.textContent &&
	          node.textContent.trim()) {
	        // Keep only the tail of the node's text when it does not fit.
	        const textLength = node.textContent.length;
	        const minLength = Math.min(index, textLength);
	        offset = textLength - minLength;
	        parts.push(node.textContent.substring(offset));
	        sections.push(new TextSection(node as Text, index - minLength));
	        index -= minLength;
	        spaced = false;
	      } else if (!spaced) {
	        parts.push(' ');
	        index--;
	        spaced = true;
	      }
	      node = previousLeaf(node, /* breakAtInvalid= */ true);
	    }
	    if (sections.length > 0) {
	      sections.reverse();
	      parts.reverse();
	      const text = parts.join('');
	      // index will be > 0 if there wasn't enough text, so adjust the
	      // sections to match the `text`.
	      if (index > 0) {
	        for (const section of sections) {
	          section.index -= index;
	        }
	      }
	      return [offset, text, sections];
	    }
	    return [0, '', []];
	  }

	  // Extracts up to `extraCharactersAtEnd` after `afterNode`. Returns the
	  // combined text and the array of `TextSection`. `alreadySpaced` tells
	  // whether the collected text already ends with a spacer; if not, the
	  // postfix starts with one. In case nothing can be found, empty text and
	  // sections are returned.
	  private extractPostfix(afterNode: Node, alreadySpaced: boolean):
	      [string, TextSection[]] {
	    const sections: TextSection[] = [];
	    const parts: string[] = [];
	    let index = 0;
	    let spaced = alreadySpaced;
	    if (!alreadySpaced) {
	      parts.push(' ');
	      index++;
	      spaced = true;
	    }
	    const maxChars = alreadySpaced ? this.extraCharactersAtEnd :
	                                     this.extraCharactersAtEnd - 1;
	    let node: Node|null = nextLeaf(afterNode, /* breakAtInvalid= */ true);
	    while (node && index < maxChars) {
	      if (node.nodeType === Node.TEXT_NODE && node.textContent &&
	          node.textContent.trim()) {
	        // Keep only the head of the node's text when it does not fit.
	        const textLength = node.textContent.length;
	        const minLength = Math.min(maxChars - index, textLength);
	        parts.push(node.textContent.substring(0, minLength));
	        sections.push(new TextSection(node as Text, index));
	        index += minLength;
	        spaced = false;
	      } else if (!spaced) {
	        parts.push(' ');
	        index++;
	        spaced = true;
	      }
	      node = nextLeaf(node, /* breakAtInvalid= */ true);
	    }
	    if (sections.length > 0) {
	      const text = parts.join('');
	      return [text, sections];
	    }
	    return ['', []];
	  }
	}