// blob: 41b50f83ca1ac2ccc30079a006b64cb06a264911 [file] [log] [blame]
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "base/macros.h"
#include "third_party/cld_3/src/src/nnet_language_identifier.h"
#include "ui/accessibility/ax_enums.mojom.h"
#include "ui/accessibility/ax_export.h"
namespace ui {
// Forward declarations. AXNode is only used by pointer or reference in the
// declarations below, so its full definition is not needed in this header.
class AXNode;
// NOTE(review): AXTree is not referenced anywhere in the visible part of this
// header - confirm whether a user of it exists outside this view.
class AXTree;
// This module implements language detection enabling Chrome to automatically
// detect the language for runs of text within the page.
//
// Node-level language detection runs once per page after the load complete
// event. This involves two passes:
//   *Detect* walks the tree from the given root using cld3 to detect up to 3
//            potential languages per node. A ranked list is created enumerating
//            all potential languages on a page.
//   *Label*  re-walks the tree, assigning a language to each node considering
//            the potential languages from the detect phase, page level
//            statistics, and the assigned languages of ancestor nodes.
//
// Optionally an embedder may run *sub-node* language detection which attempts
// to assign languages for runs of text within a node, potentially down to the
// individual character level. This is useful in cases where a single paragraph
// involves switching between multiple languages, and where the speech engine
// doesn't automatically switch voices to handle different character sets.
//
// Due to the potentially small lengths of text runs involved this tends to be
// lower in accuracy, and works best when a node is composed of multiple
// languages with easily distinguishable scripts.
// AXLanguageInfo represents the local language detection data for all text
// within an AXNode. Stored on AXNode.
struct AX_EXPORT AXLanguageInfo {
// This is the final language we have assigned for this node during the
// 'label' step, it is the result of merging:
// a) The detected language for this node
// b) The declared lang attribute on this node
// c) the (recursive) language of the parent (detected or declared).
// This will be the empty string if no language was assigned during label
// phase.
// IETF BCP 47 Language code (rfc5646).
// examples:
// 'de'
// 'de-DE'
// 'en'
// 'en-US'
// 'es-ES'
// This should not be read directly by clients of AXNode, instead clients
// should call AXNode::GetLanguage().
std::string language;
// Detected languages for this node sorted as returned by
// FindTopNMostFreqLangs, which sorts in decreasing order of probability,
// filtered to remove any unreliable results.
std::vector<std::string> detected_languages;
// Each AXLanguageSpan contains a language, a probability, and start and end
// indices. The indices are used to specify the substring that contains the
// associated language. The string which the indices are relative to is not
// included in this structure.
// Also, the indices are relative to a Utf8 string.
// See documentation on GetLanguageAnnotationForStringAttribute for details
// on how to associate this object with a string.
struct AX_EXPORT AXLanguageSpan {
int start_index;
int end_index;
std::string language;
float probability;
// A single AXLanguageInfoStats instance is stored on each AXTree and contains
// statistics on detected languages for all the AXNodes in that tree.
// We rely on these tree-level statistics when labelling individual nodes, to
// provide extra signals to increase our confidence in assigning a detected
// language.
// The Label step will only assign a detected language to a node if that
// language is one of the most frequent languages on the page.
// For example, if a single node has detected_languages (in order of probability
// assigned by cld_3): da-DK, en-AU, fr-FR, but the page statistics overall
// indicate that the page is generally in en-AU and ja-JP, it is more likely to
// be a mis-recognition of Danish than an accurate assignment, so we assign
// en-AU instead of da-DK.
class AX_EXPORT AXLanguageInfoStats {
// Adjust our statistics to add provided detected languages.
void Add(const std::vector<std::string>& languages);
// Fetch the score for a given language.
int GetScore(const std::string& lang) const;
// Check if a given language is within the top results.
bool CheckLanguageWithinTop(const std::string& lang);
// Store a count of the occurrences of a given language.
std::unordered_map<std::string, unsigned int> lang_counts_;
// Cache of last calculated top language results.
// A vector of pairs of (score, language) sorted by descending score.
std::vector<std::pair<unsigned int, std::string>> top_results_;
// Boolean recording that we have not mutated the statistics since last
// calculating top results, setting this to false will cause recalculation
// when the results are next fetched.
bool top_results_valid_;
void InvalidateTopResults();
void GenerateTopResults();
// AXLanguageDetectionManager manages all of the context needed for language
// detection within an AXTree.
class AX_EXPORT AXLanguageDetectionManager {
// Detect language for each node in the subtree rooted at the given node.
// This is the first pass in detection and labelling.
// This only detects the language, it does not label it, for that see
// LabelLanguageForSubtree.
void DetectLanguageForSubtree(AXNode* subtree_root);
// Label language for each node in the subtree rooted at the given node.
// This is the second pass in detection and labelling.
// This will label the language, but relies on the earlier detection phase
// having already completed.
void LabelLanguageForSubtree(AXNode* subtree_root);
// Sub-node language detection for a given string attribute.
// For example, if a node has name: "My name is Fred", then calling
// GetLanguageAnnotationForStringAttribute(*node, ax::mojom::StringAttribute::
// kName) would return language detection information about "My name is Fred".
std::vector<AXLanguageSpan> GetLanguageAnnotationForStringAttribute(
const AXNode& node,
ax::mojom::StringAttribute attr);
// TODO(chrishall): should this be stored by pointer or value?
AXLanguageInfoStats lang_info_stats;
void DetectLanguageForSubtreeInternal(AXNode* subtree_root);
void LabelLanguageForSubtreeInternal(AXNode* subtree_root);
// This language identifier is constructed with a default minimum byte length
// of chrome_lang_id::NNetLanguageIdentifier::kMinNumBytesToConsider and is
// used for detecting page-level languages.
chrome_lang_id::NNetLanguageIdentifier language_identifier_;
// This language identifier is constructed with a minimum byte length of
// kShortTextIdentifierMinByteLength so it can be used for detecting languages
// of shorter text (e.g. one character).
chrome_lang_id::NNetLanguageIdentifier short_text_language_identifier_;
} // namespace ui