blob: d28c080b600b6ce949c953e841602907aed686d2 [file] [log] [blame]
// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
import {ParagraphUtils} from './paragraph_utils.js';
// Utilities for processing words within strings and nodes.
export class WordUtils {
/**
* Searches through text starting at an index to find the next word's
* start boundary.
* @param {string|undefined} text The string to search through
* @param {number} indexAfter The index into text at which to start
* searching.
* @param {ParagraphUtils.NodeGroupItem} nodeGroupItem The node whose name we
* are searching through.
* @param {boolean?} ignoreStartChar When set to true, the search will only
* consider the index within the input node and ignore
* nodeGroupItem.startChar offsets. This is useful when we only search
* within the input nodeGroupItem, instead of the parent nodeGroup.
* @return {number} The index of the next word's start
*/
static getNextWordStart(
text, indexAfter, nodeGroupItem, ignoreStartChar = false) {
if (nodeGroupItem.hasInlineText && nodeGroupItem.node.children.length > 0) {
const startChar = ignoreStartChar ? 0 : nodeGroupItem.startChar;
const node = ParagraphUtils.findInlineTextNodeByCharacterIndex(
nodeGroupItem.node, indexAfter - startChar);
const startCharInParent = ParagraphUtils.getStartCharIndexInParent(node);
for (var i = 0; i < node.wordStarts.length; i++) {
if (node.wordStarts[i] + startChar + startCharInParent < indexAfter) {
continue;
}
return node.wordStarts[i] + startChar + startCharInParent;
}
// Default: We are just off the edge of this node.
return node.name.length + startChar + startCharInParent;
} else {
// Try to parse using a regex, which is imperfect.
// Fall back to the given index if we can't find a match.
return WordUtils.nextWordHelper(
text, indexAfter, WordUtils.WORD_START_REGEXP, indexAfter);
}
}
/**
* Searches through text starting at an index to find the next word's
* end boundary.
* @param {string|undefined} text The string to search through
* @param {number} indexAfter The index into text at which to start
* searching.
* @param {ParagraphUtils.NodeGroupItem} nodeGroupItem The node whose name we
* are searching through.
* @param {boolean?} ignoreStartChar When set to true, the search will only
* consider the index within the input node and ignore
* nodeGroupItem.startChar offsets. This is useful when we only search
* within the input nodeGroupItem, instead of the parent nodeGroup.
* @return {number} The index of the next word's end
*/
static getNextWordEnd(
text, indexAfter, nodeGroupItem, ignoreStartChar = false) {
if (nodeGroupItem.hasInlineText && nodeGroupItem.node.children.length > 0) {
const startChar = ignoreStartChar ? 0 : nodeGroupItem.startChar;
const node = ParagraphUtils.findInlineTextNodeByCharacterIndex(
nodeGroupItem.node, indexAfter - startChar + 1);
const startCharInParent = ParagraphUtils.getStartCharIndexInParent(node);
for (var i = 0; i < node.wordEnds.length; i++) {
if (node.wordEnds[i] + startChar + startCharInParent - 1 < indexAfter) {
continue;
}
const result = node.wordEnds[i] + startChar + startCharInParent;
return text.length > result ? result : text.length;
}
// Default.
return text.length;
} else {
// Try to parse using a regex, which is imperfect.
// Fall back to the full length of the text if we can't find a match.
return WordUtils.nextWordHelper(
text, indexAfter, WordUtils.WORD_END_REGEXP, text.length - 1) +
1;
}
}
/**
* Searches through text to find the first index of a regular expression
* after a given starting index. Returns a default value if no match is
* found.
* @param {string|undefined} text The string to search through
* @param {number} indexAfter The index at which to start searching
* @param {RegExp} re A regular expression to search for
* @param {number} defaultValue The default value to return if no
match is found.
* @return {number} The index found by the regular expression, or -1
* if none found.
*/
static nextWordHelper(text, indexAfter, re, defaultValue) {
if (text === undefined) {
return defaultValue;
}
const result = re.exec(text.substr(indexAfter));
if (result != null && result.length > 0) {
return indexAfter + result.index;
}
return defaultValue;
}
}
/**
* Regular expression to find the start of the next word after a word boundary.
* We cannot use \b\W to find the next word because it does not match many
* unicode characters.
* @type {RegExp}
*/
WordUtils.WORD_START_REGEXP = /\b\S/;
/**
* Regular expression to find the end of the next word, which is followed by
* whitespace. We cannot use \w\b to find the end of the previous word because
* \w does not know about many unicode characters.
* @type {RegExp}
*/
WordUtils.WORD_END_REGEXP = /\S\s/;