// blob: 857deedc94dfa077580e773b1f22d80c9e4617ef [file] [log] [blame]
// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/ash/accessibility/dictation.h"
#include "ash/components/audio/sounds.h"
#include "ash/constants/ash_pref_names.h"
#include "base/containers/fixed_flat_map.h"
#include "base/containers/flat_map.h"
#include "base/metrics/histogram_functions.h"
#include "base/metrics/metrics_hashes.h"
#include "base/strings/string_piece.h"
#include "base/strings/utf_string_conversions.h"
#include "base/timer/timer.h"
#include "chrome/browser/ash/accessibility/accessibility_manager.h"
#include "chrome/browser/browser_process.h"
#include "chrome/browser/profiles/profile.h"
#include "chrome/browser/speech/network_speech_recognizer.h"
#include "chrome/browser/speech/on_device_speech_recognizer.h"
#include "chrome/common/pref_names.h"
#include "components/language/core/browser/pref_names.h"
#include "components/language/core/common/locale_util.h"
#include "components/prefs/pref_service.h"
#include "components/soda/soda_installer.h"
#include "content/public/browser/browser_task_traits.h"
#include "content/public/browser/browser_thread.h"
#include "content/public/browser/storage_partition.h"
#include "services/audio/public/cpp/sounds/sounds_manager.h"
#include "services/network/public/cpp/shared_url_loader_factory.h"
#include "ui/accessibility/accessibility_features.h"
#include "ui/base/ime/chromeos/extension_ime_util.h"
#include "ui/base/ime/chromeos/ime_bridge.h"
#include "ui/base/ime/chromeos/ime_input_context_handler_interface.h"
#include "ui/base/ime/chromeos/input_method_util.h"
#include "ui/base/ime/composition_text.h"
namespace ash {
namespace {
// Length of timeout to cancel recognition if there's no speech heard.
// The network value is the initial setting made in the Dictation constructor;
// the device value is used for on-device recognition, and also for network
// recognition when the "Dictation listening" feature is enabled (see
// Dictation::OnToggleDictation).
constexpr base::TimeDelta kNetworkNoSpeechTimeout =
    base::TimeDelta::FromSeconds(5);
constexpr base::TimeDelta kDeviceNoSpeechTimeout =
    base::TimeDelta::FromSeconds(10);

// Length of timeout to cancel recognition if no different results are received.
// As above, there is one value per recognizer type.
constexpr base::TimeDelta kNetworkNoNewSpeechTimeout =
    base::TimeDelta::FromSeconds(2);
constexpr base::TimeDelta kDeviceNoNewSpeechTimeout =
    base::TimeDelta::FromSeconds(5);

// Locale returned when no other language or locale can be determined.
constexpr char kDefaultProfileLocale[] = "en-US";
// Works out the language or locale the user is currently working in.
// Candidates are tried in this order, and the first non-empty one wins:
//   1. the language of the current input method stored in |profile|'s prefs,
//   2. the application locale,
//   3. kDefaultProfileLocale.
std::string GetUserLangOrLocaleFromSystem(Profile* profile) {
  // The pref stores an input method ID, which must be translated into a
  // language identifier.
  const std::vector<std::string> current_ime_ids = {
      profile->GetPrefs()->GetString(::prefs::kLanguageCurrentInputMethod)};
  std::vector<std::string> ime_languages;
  chromeos::input_method::InputMethodManager::Get()
      ->GetInputMethodUtil()
      ->GetLanguageCodesFromInputMethodIds(current_ime_ids, &ime_languages);

  if (!ime_languages.empty() && !ime_languages.front().empty())
    return ime_languages.front();

  // No usable IME language was found; try the application locale next.
  const std::string app_locale = g_browser_process->GetApplicationLocale();
  if (!app_locale.empty())
    return app_locale;

  return kDefaultProfileLocale;
}
// Returns the locale Dictation should recognize speech in for |profile|.
//
// With the offline Dictation feature enabled, this is the locale stored in
// the user's Dictation locale preference. That pref is expected to hold a
// supported, non-empty locale, since it is set using
// DetermineDefaultSupportedLocale() as soon as Dictation is enabled (assuming
// supported languages are never removed from the list). Otherwise the locale
// is derived from the system via GetUserLangOrLocaleFromSystem().
std::string GetUserLocale(Profile* profile) {
  std::string locale =
      features::IsExperimentalAccessibilityDictationOfflineEnabled()
          ? profile->GetPrefs()->GetString(prefs::kAccessibilityDictationLocale)
          : GetUserLangOrLocaleFromSystem(profile);
  DCHECK(!locale.empty());
  return locale;
}
std::string GetSupportedLocale(const std::string& lang_or_locale) {
if (lang_or_locale.empty())
return std::string();
// Map of language code to supported locale for the open web API.
// Chrome OS does not support Chinese languages with "cmn", so this
// map also includes a map from Open Speech API "cmn" languages to
// their equivalent default locale.
static constexpr auto kLangsToDefaultLocales =
base::MakeFixedFlatMap<base::StringPiece, base::StringPiece>(
{{"af", "af-ZA"}, {"am", "am-ET"},
{"ar", "ar-001"}, {"az", "az-AZ"},
{"bg", "bg-BG"}, {"bn", "bn-IN"},
{"bs", "bs-BA"}, {"ca", "ca-ES"},
{"cs", "cs-CZ"}, {"da", "da-DK"},
{"de", "de-DE"}, {"el", "el-GR"},
{"en", "en-US"}, {"es", "es-ES"},
{"et", "et-EE"}, {"eu", "eu-ES"},
{"fa", "fa-IR"}, {"fi", "fi-FI"},
{"fil", "fil-PH"}, {"fr", "fr-FR"},
{"gl", "gl-ES"}, {"gu", "gu-IN"},
{"he", "iw-IL"}, {"hi", "hi-IN"},
{"hr", "hr-HR"}, {"hu", "hu-HU"},
{"hy", "hy-AM"}, {"id", "id-ID"},
{"is", "is-IS"}, {"it", "it-IT"},
{"iw", "iw-IL"}, {"ja", "ja-JP"},
{"jv", "jv-ID"}, {"ka", "ka-GE"},
{"kk", "kk-KZ"}, {"km", "km-KH"},
{"kn", "kn-IN"}, {"ko", "ko-KR"},
{"lo", "lo-LA"}, {"lt", "lt-LT"},
{"lv", "lv-LV"}, {"mk", "mk-MK"},
{"ml", "ml-IN"}, {"mn", "mn-MN"},
{"mo", "ro-RO"}, {"mr", "mr-IN"},
{"ms", "ms-MY"}, {"my", "my-MM"},
{"ne", "ne-NP"}, {"nl", "nl-NL"},
{"no", "no-NO"}, {"pa", "pa-Guru-IN"},
{"pl", "pl-PL"}, {"pt", "pt-BR"},
{"ro", "ro-RO"}, {"ru", "ru-RU"},
{"si", "si-LK"}, {"sk", "sk-SK"},
{"sl", "sl-SI"}, {"sq", "sq-AL"},
{"sr", "sr-RS"}, {"su", "su-ID"},
{"sv", "sv-SE"}, {"sw", "sw-TZ"},
{"ta", "ta-IN"}, {"te", "te-IN"},
{"tl", "fil-PH"}, {"th", "th-TH"},
{"tr", "tr-TR"}, {"uk", "uk-UA"},
{"ur", "ur-PK"}, {"uz", "uz-UZ"},
{"vi", "vi-VN"}, {"yue", "yue-Hant-HK"},
{"zh", "zh-CN"}, {"zu", "zu-ZA"},
{"zh-cmn-CN", "zh-CN"}, {"zh-cmn", "zh-CN"},
{"zh-cmn-Hans", "zh-CN"}, {"zh-cmn-Hans-CN", "zh-CN"},
{"cmn-CN", "zh-CN"}, {"cmn-Hans", "zh-CN"},
{"cmn-Hans-CN", "zh-CN"}, {"cmn-Hant-TW", "zh-TW"},
{"zh-cmn-TW", "zh-TW"}, {"zh-cmn-Hant-TW", "zh-TW"},
{"cmn-TW", "zh-TW"}});
// First check if this is a language code supported in the map above.
auto* iter = kLangsToDefaultLocales.find(lang_or_locale);
if (iter != kLangsToDefaultLocales.end())
return std::string(iter->second);
// If it's only a language code, we can return early, because no other
// language-only codes are supported.
std::pair<base::StringPiece, base::StringPiece> lang_and_locale_pair =
language::SplitIntoMainAndTail(lang_or_locale);
if (lang_and_locale_pair.second.size() == 0)
return std::string();
// The code is a supported locale. Return itself.
// Note that it doesn't matter if the supported locale is online or offline.
if (base::Contains(Dictation::GetAllSupportedLocales(), lang_or_locale))
return lang_or_locale;
// Finally, get the language code from the locale and try to use it to map
// to a default locale. For example, "en-XX" should map to "en-US" if "en-XX"
// does not exist.
iter = kLangsToDefaultLocales.find(lang_and_locale_pair.first);
if (iter != kLangsToDefaultLocales.end())
return std::string(iter->second);
return std::string();
}
// Fetches the input context handler currently held by the IME bridge. Callers
// should look it up each time rather than cache it: the context may change
// during the session, even if the IME engine does not change, because remote
// mojo applications have their own instance of InputMethodChromeOS. See
// comment on InputMethodBridge.
ui::IMEInputContextHandlerInterface* GetInputContext() {
  auto* const ime_bridge = ui::IMEBridge::Get();
  return ime_bridge->GetInputContextHandler();
}
} // namespace
// static
// Builds a map from every locale Dictation can recognize to a flag telling
// whether that locale is supported offline. All Web Speech locales start out
// as false (not offline). When the offline Dictation feature is enabled, each
// language reported by the SODA installer is added, or overwritten, with true.
const base::flat_map<std::string, bool> Dictation::GetAllSupportedLocales() {
  static constexpr const char* kWebSpeechSupportedLocales[] = {
      "af-ZA",       "am-ET",      "ar-AE", "ar-BH", "ar-DZ", "ar-EG", "ar-IL",
      "ar-IQ",       "ar-JO",      "ar-KW", "ar-LB", "ar-MA", "ar-OM", "ar-PS",
      "ar-QA",       "ar-SA",      "ar-TN", "ar-YE", "az-AZ", "bg-BG", "bn-BD",
      "bn-IN",       "bs-BA",      "ca-ES", "cs-CZ", "da-DK", "de-AT", "de-CH",
      "de-DE",       "el-GR",      "en-AU", "en-CA", "en-GB", "en-GH", "en-HK",
      "en-IE",       "en-IN",      "en-KE", "en-NG", "en-NZ", "en-PH", "en-PK",
      "en-SG",       "en-TZ",      "en-US", "en-ZA", "es-AR", "es-BO", "es-CL",
      "es-CO",       "es-CR",      "es-DO", "es-EC", "es-ES", "es-GT", "es-HN",
      "es-MX",       "es-NI",      "es-PA", "es-PE", "es-PR", "es-PY", "es-SV",
      "es-US",       "es-UY",      "es-VE", "et-EE", "eu-ES", "fa-IR", "fi-FI",
      "fil-PH",      "fr-BE",      "fr-CA", "fr-CH", "fr-FR", "gl-ES", "gu-IN",
      "hi-IN",       "hr-HR",      "hu-HU", "hy-AM", "id-ID", "is-IS", "it-CH",
      "it-IT",       "iw-IL",      "ja-JP", "jv-ID", "ka-GE", "kk-KZ", "km-KH",
      "kn-IN",       "ko-KR",      "lo-LA", "lt-LT", "lv-LV", "mk-MK", "ml-IN",
      "mn-MN",       "mr-IN",      "ms-MY", "my-MM", "ne-NP", "nl-BE", "nl-NL",
      "no-NO",       "pa-Guru-IN", "pl-PL", "pt-BR", "pt-PT", "ro-RO", "ru-RU",
      "si-LK",       "sk-SK",      "sl-SI", "sq-AL", "sr-RS", "su-ID", "sv-SE",
      "sw-KE",       "sw-TZ",      "ta-IN", "ta-LK", "ta-MY", "ta-SG", "te-IN",
      "th-TH",       "tr-TR",      "uk-UA", "ur-IN", "ur-PK", "uz-UZ", "vi-VN",
      "yue-Hant-HK", "zh-CN",      "zh-TW", "zu-ZA", "ar-001"};

  base::flat_map<std::string, bool> supported_locales;
  for (const char* locale : kWebSpeechSupportedLocales) {
    // By default these languages are not supported offline.
    supported_locales[locale] = false;
  }

  if (features::IsExperimentalAccessibilityDictationOfflineEnabled()) {
    const std::vector<std::string> offline_languages =
        speech::SodaInstaller::GetInstance()->GetAvailableLanguages();
    // Iterate by const reference so each language string is not copied.
    for (const std::string& language : offline_languages) {
      // These are supported offline.
      supported_locales[language] = true;
    }
  }
  return supported_locales;
}
// static
// Chooses the supported locale Dictation should default to for |profile|.
// A |new_user| (enabling Dictation for the first time) starts from the
// application locale; a returning user starts from the language they have
// been using, as reported by GetUserLangOrLocaleFromSystem(). The starting
// point is then mapped to a supported locale, with kDefaultProfileLocale as
// the fallback when no mapping exists.
std::string Dictation::DetermineDefaultSupportedLocale(Profile* profile,
                                                       bool new_user) {
  const std::string lang_or_locale =
      new_user ? g_browser_process->GetApplicationLocale()
               : GetUserLangOrLocaleFromSystem(profile);

  std::string supported_locale = GetSupportedLocale(lang_or_locale);
  if (supported_locale.empty())
    return kDefaultProfileLocale;
  return supported_locale;
}
// Creates a Dictation instance for |profile| with the recognizer off, an
// empty composition and the network timeouts as initial values. If an input
// context with an input method currently exists, this object starts observing
// that input method.
Dictation::Dictation(Profile* profile)
    : current_state_(SPEECH_RECOGNIZER_OFF),
      composition_(std::make_unique<ui::CompositionText>()),
      profile_(profile),
      no_speech_timeout_(kNetworkNoSpeechTimeout),
      no_new_speech_timeout_(kNetworkNoNewSpeechTimeout) {
  ui::IMEInputContextHandlerInterface* input_context = GetInputContext();
  if (!input_context)
    return;
  if (auto* input_method = input_context->GetInputMethod())
    input_method->AddObserver(this);
}
// Stops observing the input method of the current input context, if there is
// one.
// NOTE(review): the input context is looked up again here, and the comment on
// GetInputContext() says it may change during the session, so this may not be
// the same input method the constructor registered with - confirm.
Dictation::~Dictation() {
  ui::IMEInputContextHandlerInterface* input_context = GetInputContext();
  if (!input_context)
    return;
  if (auto* input_method = input_context->GetInputMethod())
    input_method->RemoveObserver(this);
}
// Toggles Dictation. Returns true if a speech recognizer was created (i.e.
// Dictation is starting), and false if Dictation was turned off or could not
// be started because SODA is still downloading for the user's language.
bool Dictation::OnToggleDictation() {
  // An existing recognizer means Dictation is on, so this toggle turns it off.
  if (speech_recognizer_) {
    DictationOff();
    return false;
  }

  has_committed_text_ = false;
  const std::string locale = GetUserLocale(profile_);

  // Log the locale used with LocaleCodeISO639 values.
  base::UmaHistogramSparse("Accessibility.CrosDictation.Language",
                           base::HashMetricName(locale));

  speech::SodaInstaller* soda_installer = speech::SodaInstaller::GetInstance();
  const bool offline_enabled =
      features::IsExperimentalAccessibilityDictationOfflineEnabled();

  // Don't allow Dictation to be used while SODA is downloading.
  if (offline_enabled &&
      soda_installer->IsSodaDownloading(speech::GetLanguageCode(locale))) {
    audio::SoundsManager::Get()->Play(
        static_cast<int>(Sound::kDictationCancel));
    return false;
  }

  // On-device recognition is behind a flag and then only available if
  // SODA is installed on-device.
  const bool use_on_device =
      offline_enabled &&
      OnDeviceSpeechRecognizer::IsOnDeviceSpeechRecognizerAvailable(locale);

  if (use_on_device) {
    speech_recognizer_ = std::make_unique<OnDeviceSpeechRecognizer>(
        weak_ptr_factory_.GetWeakPtr(), profile_, locale,
        /*recognition_mode_ime=*/true, /*enable_formatting=*/false);
    no_speech_timeout_ = kDeviceNoSpeechTimeout;
    no_new_speech_timeout_ = kDeviceNoNewSpeechTimeout;
  } else {
    speech_recognizer_ = std::make_unique<NetworkSpeechRecognizer>(
        weak_ptr_factory_.GetWeakPtr(),
        profile_->GetDefaultStoragePartition()
            ->GetURLLoaderFactoryForBrowserProcessIOThread(),
        profile_->GetPrefs()->GetString(language::prefs::kAcceptLanguages),
        locale);
    // The network recognizer borrows the longer device timeout when the
    // "listening" feature is enabled.
    if (features::IsExperimentalAccessibilityDictationListeningEnabled()) {
      no_speech_timeout_ = kDeviceNoSpeechTimeout;
    } else {
      no_speech_timeout_ = kNetworkNoSpeechTimeout;
    }
    no_new_speech_timeout_ = kNetworkNoNewSpeechTimeout;
  }
  used_on_device_speech_ = use_on_device;
  base::UmaHistogramBoolean("Accessibility.CrosDictation.UsedOnDeviceSpeech",
                            use_on_device);

  // Restart the clock that measures how long this listening session lasts.
  listening_duration_timer_ = base::ElapsedTimer();
  return true;
}
// Handles a recognition result. |transcription| becomes the pending
// composition text. Non-final results restart the timeout and are shown as
// composition text (unless ChromeVox is on). Final results restart the
// no-speech timeout and are either committed (listening feature enabled) or
// end the Dictation session. |word_offsets| is not used.
void Dictation::OnSpeechResult(
    const std::u16string& transcription,
    bool is_final,
    const absl::optional<media::SpeechRecognitionResult>& word_offsets) {
  // If the first character of text isn't a space, add a space before it.
  // NetworkSpeechRecognizer adds the preceding space but
  // OnDeviceSpeechRecognizer does not. This is also done in
  // CaptionBubbleModel::CommitPartialText.
  // TODO(crbug.com/1055150): This feature is launching for English first.
  // Make sure spacing is correct for all languages.
  const bool needs_leading_space = has_committed_text_ &&
                                   !transcription.empty() &&
                                   transcription.front() != u' ';
  composition_->text =
      needs_leading_space ? u" " + transcription : transcription;

  const bool listening_enabled =
      features::IsExperimentalAccessibilityDictationListeningEnabled();

  if (!is_final) {
    // Any new or changed text restarts the timer to give the user more time
    // to speak. (The timer is recording the amount of time since the most
    // recent utterance.)
    StartSpeechTimeout(listening_enabled ? no_speech_timeout_
                                         : no_new_speech_timeout_);

    // If ChromeVox is enabled, we don't want to show intermediate results.
    if (AccessibilityManager::Get()->IsSpokenFeedbackEnabled())
      return;

    if (ui::IMEInputContextHandlerInterface* input_context =
            GetInputContext()) {
      input_context->UpdateCompositionText(*composition_, 0, true);
    }
    return;
  }

  // A final result also restarts the timer.
  StartSpeechTimeout(no_speech_timeout_);
  if (listening_enabled) {
    CommitCurrentText();
  } else {
    // Turn off after finalized speech.
    DictationOff();
  }
}
// Intentionally a no-op: Dictation does nothing with the sound |level|.
void Dictation::OnSpeechSoundLevelChanged(int16_t level) {}
// Reacts to a state change of the speech recognizer and records the resulting
// state in |current_state_|. Errors, and a READY state that is not the
// initial one, turn Dictation off and are recorded as SPEECH_RECOGNIZER_OFF.
void Dictation::OnSpeechRecognitionStateChanged(
    SpeechRecognizerStatus new_state) {
  SpeechRecognizerStatus resulting_state = new_state;

  switch (new_state) {
    case SPEECH_RECOGNIZER_RECOGNIZING:
      // If we are starting to listen to audio, play a tone for the user.
      audio::SoundsManager::Get()->Play(
          static_cast<int>(Sound::kDictationStart));
      // Start a timeout to ensure if no speech happens we will eventually
      // turn ourselves off.
      StartSpeechTimeout(no_speech_timeout_);
      break;

    case SPEECH_RECOGNIZER_ERROR:
      DictationOff();
      resulting_state = SPEECH_RECOGNIZER_OFF;
      break;

    case SPEECH_RECOGNIZER_READY:
      if (current_state_ == SPEECH_RECOGNIZER_OFF && speech_recognizer_) {
        // The SpeechRecognizer was initialized after being created, and
        // is ready to start recognizing speech.
        speech_recognizer_->Start();
      } else {
        // This state is only reached when nothing has been said for a fixed
        // time. In this case, the expected behavior is for dictation to
        // terminate.
        DictationOff();
        resulting_state = SPEECH_RECOGNIZER_OFF;
      }
      break;

    default:
      // Every other state is simply recorded.
      break;
  }

  current_state_ = resulting_state;
}
// Turns Dictation off when the text input state changes for a |client| that
// has a focus reason. A null |client|, or one whose focus reason is
// FOCUS_REASON_NONE, is ignored.
void Dictation::OnTextInputStateChanged(const ui::TextInputClient* client) {
  const bool client_has_focus_reason =
      client && client->GetFocusReason() !=
                    ui::TextInputClient::FocusReason::FOCUS_REASON_NONE;
  if (client_has_focus_reason)
    DictationOff();
}
// Ends the current Dictation session. The state is always reset and the
// speech timeout stopped. If a recognizer exists, this also schedules a commit
// of the pending text, plays the end or cancel sound, notifies accessibility
// observers, destroys the recognizer and records the listening duration.
void Dictation::DictationOff() {
  current_state_ = SPEECH_RECOGNIZER_OFF;
  StopSpeechTimeout();
  if (!speech_recognizer_)
    return;

  // Post commit text delayed to avoid a dcheck.
  content::GetUIThreadTaskRunner({})->PostTask(
      FROM_HERE, base::BindOnce(&Dictation::CommitCurrentText,
                                weak_ptr_factory_.GetWeakPtr()));

  // Pending text means something was dictated; otherwise the session is
  // treated as cancelled.
  const auto closing_sound = composition_->text.empty()
                                 ? Sound::kDictationCancel
                                 : Sound::kDictationEnd;
  audio::SoundsManager::Get()->Play(static_cast<int>(closing_sound));

  AccessibilityStatusEventDetails details(
      AccessibilityNotificationType::kToggleDictation, false /* enabled */);
  AccessibilityManager::Get()->NotifyAccessibilityStatusChanged(details);
  speech_recognizer_.reset();

  // Duration matches the lifetime of the speech recognizer.
  const char* const duration_histogram =
      used_on_device_speech_
          ? "Accessibility.CrosDictation.ListeningDuration.OnDeviceRecognition"
          : "Accessibility.CrosDictation.ListeningDuration.NetworkRecognition";
  base::UmaHistogramLongTimes(duration_histogram,
                              listening_duration_timer_.Elapsed());
}
// Commits the pending composition text to the current input context, placing
// the cursor after the inserted text, and then clears the pending text. Does
// nothing when no text is pending. If there is no input context, the text is
// still marked as committed and cleared.
void Dictation::CommitCurrentText() {
  std::u16string& pending_text = composition_->text;
  if (pending_text.empty())
    return;

  has_committed_text_ = true;
  if (ui::IMEInputContextHandlerInterface* input_context = GetInputContext()) {
    input_context->CommitText(
        pending_text,
        ui::TextInputClient::InsertTextCursorBehavior::kMoveCursorAfterText);
  }
  pending_text.clear();
}
// Starts |speech_timeout_| so that OnSpeechTimeout() is called after
// |timeout_duration|. The callback is bound to a weak pointer to this object.
void Dictation::StartSpeechTimeout(base::TimeDelta timeout_duration) {
  auto on_timeout = base::BindOnce(&Dictation::OnSpeechTimeout,
                                   weak_ptr_factory_.GetWeakPtr());
  speech_timeout_.Start(FROM_HERE, timeout_duration, std::move(on_timeout));
}
// Stops the speech timeout timer started by StartSpeechTimeout().
void Dictation::StopSpeechTimeout() {
speech_timeout_.Stop();
}
// Callback bound by StartSpeechTimeout(); turns Dictation off when the speech
// timeout fires.
void Dictation::OnSpeechTimeout() {
DictationOff();
}
} // namespace ash