// blob: fa500612cde1fd0dce67c31a07704eb62c7ee626 [file] [log] [blame]
// Copyright 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "ui/accessibility/ax_language_info.h"
#include <algorithm>
#include <functional>
#include "base/command_line.h"
#include "ui/accessibility/accessibility_switches.h"
#include "ui/accessibility/ax_enums.mojom.h"
#include "ui/accessibility/ax_tree.h"
namespace ui {
namespace {

// Upper bound on the number of distinct languages assigned across a whole
// page: only the page's top 3 scoring languages may be assigned to a node.
constexpr int kMaxDetectedLanguagesPerPage = 3;

// Upper bound on the number of languages cld3 is asked to report for each
// piece of text handed to it. The ML team recommended 3 as a good starting
// point.
constexpr int kMaxDetectedLanguagesPerSpan = 3;

}  // namespace
// AXLanguageInfo needs no custom construction or teardown logic; both special
// members are defined out of line here and simply use the compiler-generated
// behaviour.
AXLanguageInfo::AXLanguageInfo() = default;
AXLanguageInfo::~AXLanguageInfo() = default;
AXLanguageInfoStats::AXLanguageInfoStats() : top_results_valid_(false) {}
AXLanguageInfoStats::~AXLanguageInfoStats() = default;
AXLanguageInfoStats::GetLanguageIdentifier() {
return language_identifier_;
void AXLanguageInfoStats::Add(const std::vector<std::string>& languages) {
// Assign languages with higher probability a higher score.
// TODO(chrishall): consider more complex scoring
size_t score = kMaxDetectedLanguagesPerSpan;
for (const auto& lang : languages) {
lang_counts_[lang] += score;
// Returns the accumulated score recorded for |lang|, or 0 if |lang| has never
// been recorded.
int AXLanguageInfoStats::GetScore(const std::string& lang) const {
  const auto lang_count_it = lang_counts_.find(lang);
  if (lang_count_it == lang_counts_.end()) {
    return 0;
  }
  return lang_count_it->second;
}
// Marks the cached top results as stale so that they are rebuilt the next
// time they are needed.
void AXLanguageInfoStats::InvalidateTopResults() {
  top_results_valid_ = false;
}
// Check if a given language is within the top results.
bool AXLanguageInfoStats::CheckLanguageWithinTop(const std::string& lang) {
if (!top_results_valid_) {
for (const auto& item : top_results_) {
if (lang == item.second) {
return true;
return false;
void AXLanguageInfoStats::GenerateTopResults() {
for (const auto& item : lang_counts_) {
top_results_.emplace_back(item.second, item.first);
// Since we store the pair as (score, language) the default operator> on pairs
// does our sort appropriately.
// Sort in descending order.
std::sort(top_results_.begin(), top_results_.end(),
std::greater<std::pair<unsigned int, std::string>>());
// Resize down to remove all values greater than the N we are considering.
top_results_valid_ = true;
static void DetectLanguageForSubtreeInternal(AXNode* node, class AXTree* tree);

// Detect language for a subtree rooted at the given node.
//
// Does nothing unless the experimental accessibility language detection
// feature is enabled. Creates |tree->language_info_stats| on first use and
// then runs detection over every node beneath |subtree_root|.
void DetectLanguageForSubtree(AXNode* subtree_root, class AXTree* tree) {
  TRACE_EVENT0("accessibility", "AXLanguageInfo::DetectLanguageForSubtree");
  if (!::switches::IsExperimentalAccessibilityLanguageDetectionEnabled()) {
    // If the feature is not enabled there is nothing to detect; returning
    // here is as successful as we could have been.
    return;
  }

  if (!tree->language_info_stats) {
    tree->language_info_stats.reset(new AXLanguageInfoStats());
  }

  DetectLanguageForSubtreeInternal(subtree_root, tree);
}
// Detect language for a subtree rooted at the given node
// will not check feature flag.
//
// For each text node the detected, reliable languages are stored on the
// node's AXLanguageInfo and fed into the tree-wide statistics. The function
// then recurses into every child of |node|.
static void DetectLanguageForSubtreeInternal(AXNode* node, class AXTree* tree) {
  if (node->IsText()) {
    AXLanguageInfoStats* lang_info_stats = tree->language_info_stats.get();

    AXLanguageInfo* lang_info = node->GetLanguageInfo();
    if (!lang_info) {
      // TODO(chrishall): consider space optimisations.
      // Currently we keep these language info instances around until
      // destruction of the containing node, this is due to us treating AXNode
      // as otherwise read-only and so we store any detected language
      // information on lang info.
      lang_info = new AXLanguageInfo();
      // NOTE(review): hand the new instance to the node so that it is neither
      // leaked nor invisible to later GetLanguageInfo() calls. This assumes
      // AXNode::SetLanguageInfo takes ownership of the pointer - confirm
      // against ax_node.h.
      node->SetLanguageInfo(lang_info);
    } else {
      // Detection is being re-run for this node; discard the previous
      // results so they are not duplicated.
      lang_info->detected_languages.clear();
    }

    chrome_lang_id::NNetLanguageIdentifier& language_identifier =
        lang_info_stats->GetLanguageIdentifier();

    // TODO(chrishall): implement strategy for nodes which are too small to get
    // reliable language detection results. Consider combination of
    // concatenation and bubbling up results.
    const auto& text =
        node->GetStringAttribute(ax::mojom::StringAttribute::kName);

    const auto results = language_identifier.FindTopNMostFreqLangs(
        text, kMaxDetectedLanguagesPerSpan);

    for (const auto& res : results) {
      // The output of FindTopNMostFreqLangs is already sorted by byte count,
      // this seems good enough for now.
      // Only consider results which are 'reliable', this will also remove
      // 'unknown'.
      if (res.is_reliable) {
        lang_info->detected_languages.push_back(res.language);
      }
    }

    // Feed this node's detections into the tree-wide scores.
    lang_info_stats->Add(lang_info->detected_languages);
  }

  // TODO(chrishall): refactor this as textnodes only ever have inline text
  // boxen as children. This means we don't need to recurse except for
  // inheritance which can be handled elsewhere.
  for (AXNode* child : node->children()) {
    DetectLanguageForSubtreeInternal(child, tree);
  }
}
static void LabelLanguageForSubtreeInternal(AXNode* node, class AXTree* tree);

// Label language for each node in the subtree rooted at the given node.
// This relies on DetectLanguageForSubtree having already been run.
//
// Returns true on success, including when the feature is disabled and there
// is nothing to do. Returns false if no detection statistics exist on |tree|.
bool LabelLanguageForSubtree(AXNode* subtree_root, class AXTree* tree) {
  TRACE_EVENT0("accessibility", "AXLanguageInfo::LabelLanguageForSubtree");

  if (!::switches::IsExperimentalAccessibilityLanguageDetectionEnabled()) {
    // If feature is not enabled we still return success as we were as
    // successful as we could have been.
    return true;
  }

  if (!tree->language_info_stats) {
    // Detection has not been performed, error, the user is holding this wrong.
    // DetectLanguageForSubtree must always be called on a given subtree before
    // LabelLanguageForSubtree is called.
    LOG(FATAL) << "LabelLanguageForSubtree run before DetectLanguageForSubtree";
    return false;
  }

  LabelLanguageForSubtreeInternal(subtree_root, tree);
  return true;
}
// Assigns a language to |node| (when it carries language info that is still
// unlabelled) and then recurses into each of its children. Does not check the
// feature flag and assumes |tree->language_info_stats| already exists.
static void LabelLanguageForSubtreeInternal(AXNode* node, class AXTree* tree) {
  AXLanguageInfo* lang_info = node->GetLanguageInfo();

  // lang_info is only attached by Detect when it thinks a node is interesting,
  // the presence of lang_info means that Detect expects the node to end up with
  // a language specified.
  //
  // If the lang_info->language is already set then we have no more work to do
  // for this node.
  if (lang_info && lang_info->language.empty()) {
    AXLanguageInfoStats* lang_info_stats = tree->language_info_stats.get();

    // Take the first detected language that is also one of the page's top
    // languages, then stop so that a later match cannot overwrite it.
    for (const auto& lang : lang_info->detected_languages) {
      if (lang_info_stats->CheckLanguageWithinTop(lang)) {
        lang_info->language = lang;
        break;
      }
    }

    // TODO(chrishall): consider obeying the author declared lang tag in some
    // cases, either based on proximity or based on common language detection
    // error cases.

    // If language is still empty then we failed to detect a language from
    // this node, we will instead try construct a language from other sources
    // including any lang attribute and any language from the parent tree.
    if (lang_info->language.empty()) {
      const auto& lang_attr =
          node->GetStringAttribute(ax::mojom::StringAttribute::kLanguage);
      if (!lang_attr.empty()) {
        lang_info->language = lang_attr;
      } else {
        // We call GetLanguage() on our parent which will return a detected
        // language if it has one, otherwise it will search up the tree for a
        // kLanguage attribute.
        //
        // This means that lang attributes are inherited indefinitely but
        // detected language is only inherited one level.
        //
        // Currently we only attach detected language to text nodes, once we
        // start attaching detected language on other nodes we need to rethink
        // this. We may want to attach detected language information once we
        // consider combining multiple smaller text nodes into one larger one.
        //
        // TODO(chrishall): reconsider detected language inheritance.
        AXNode* parent = node->parent();
        if (parent) {
          const auto& parent_lang = parent->GetLanguage();
          if (!parent_lang.empty()) {
            lang_info->language = parent_lang;
          }
        }
      }
    }
  }

  for (AXNode* child : node->children()) {
    LabelLanguageForSubtreeInternal(child, tree);
  }
}
} // namespace ui