blob: cf0915626218a5474bce9cb2ee99b8e5821fa8db [file] [log] [blame]
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "base/macros.h"
#include "ui/accessibility/ax_export.h"
#include "third_party/cld_3/src/src/nnet_language_identifier.h"
namespace ui {
class AXNode;
class AXTree;
// This module implements language detection enabling Chrome to automatically
// detect the language for spans of text within the page without relying on any
// declared attributes.
// Language detection relies on two key data structures:
// AXLanguageInfo represents the local language detection data for an AXNode.
// AXLanguageInfoStats represents the 'global' (tree-level) language detection
// data for all nodes within an AXTree.
// Language detection is implemented as a two-pass process to reduce the
// assignment of spurious languages.
// After the first pass no languages have been assigned to AXNode(s), this is
// left to the second pass so that we can take use tree-level statistics to
// better inform the local language assigned.
// The first pass 'Detect' (entry point DetectLanguageForSubtree) walks the
// subtree from a given AXNode and attempts to detect the language of any text
// found. It records results in an instance of AXLanguageInfo which it stores on
// the AXNode, it also records statistics on the languages found in the
// AXLanguageInfoStats instance associated with each AXTree.
// The second pass 'Label' (entry point LabelLanguageForSubtree) walks the
// subtree from a given AXNode and attempts to find an appropriate language to
// associate with each AXNode based on a combination of the local detection
// results (AXLanguageInfo) and the global stats (AXLanguageInfoStats).
// An instance of AXLanguageInfo is used to record the detected and assigned
// languages for a single AXNode, this data is entirely local to the AXNode.
struct AX_EXPORT AXLanguageInfo {
// This is the final language we have assigned for this node during the
// 'label' step, it is the result of merging:
// a) The detected language for this node
// b) The declared lang attribute on this node
// c) the (recursive) language of the parent (detected or declared).
// This will be the empty string if no language was assigned during label
// phase.
// IETF BCP 47 Language code (rfc5646).
// examples:
// 'de'
// 'de-DE'
// 'en'
// 'en-US'
// 'es-ES'
// This should not be read directly by clients of AXNode, instead clients
// should call AXNode::GetLanguage().
std::string language;
// Detected languages for this node sorted as returned by
// FindTopNMostFreqLangs, which sorts in decreasing order of probability,
// filtered to remove any unreliable results.
std::vector<std::string> detected_languages;
// A single AXLanguageInfoStats instance is stored on each AXTree and represents
// the language detection statistics for every AXNode within that AXTree.
// We rely on these tree-level statistics to avoid spurious language detection
// assignments.
// The Label step will only assign a detected language to a node if that
// language is one of the dominant languages on the page.
class AX_EXPORT AXLanguageInfoStats {
// Adjust our statistics to add provided detected languages.
void Add(const std::vector<std::string>& languages);
// Fetch the score for a given language.
int GetScore(const std::string& lang) const;
// Check if a given language is within the top results.
bool CheckLanguageWithinTop(const std::string& lang);
chrome_lang_id::NNetLanguageIdentifier& GetLanguageIdentifier();
// Store a count of the occurrences of a given language.
std::unordered_map<std::string, unsigned int> lang_counts_;
// Cache of last calculated top language results.
// A vector of pairs of (score, language) sorted by descending score.
std::vector<std::pair<unsigned int, std::string>> top_results_;
// Boolean recording that we have not mutated the statistics since last
// calculating top results, setting this to false will cause recalculation
// when the results are next fetched.
bool top_results_valid_;
void InvalidateTopResults();
// Populate top_results_.
void GenerateTopResults();
chrome_lang_id::NNetLanguageIdentifier language_identifier_;
// Detect language for each node in the subtree rooted at the given node.
// This is the first pass in detection and labelling.
// This only detects the language, it does not label it, for that see
// LabelLanguageForSubtree.
AX_EXPORT void DetectLanguageForSubtree(AXNode* subtree_root,
class AXTree* tree);
// Label language for each node in the subtree rooted at the given node.
// This is the second pass in detection and labelling.
// This will label the language, but relies on the earlier detection phase
// having already completed.
// returns boolean indicating success.
AX_EXPORT bool LabelLanguageForSubtree(AXNode* subtree_root,
class AXTree* tree);
} // namespace ui