blob: be18e8097c0a0ba7108bafbf84918cb120a9ad10 [file] [log] [blame]
// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/speech/extension_api/tts_extension_api.h"
#include <stddef.h>
#include <memory>
#include <string>
#include <utility>
#include "base/lazy_instance.h"
#include "base/metrics/histogram_macros.h"
#include "base/metrics/user_metrics.h"
#include "base/values.h"
#include "build/build_config.h"
#include "chrome/browser/profiles/profile.h"
#include "chrome/browser/speech/extension_api/tts_engine_extension_api.h"
#include "chrome/browser/speech/extension_api/tts_extension_api_constants.h"
#include "content/public/browser/tts_controller.h"
#include "content/public/browser/tts_platform.h"
#include "extensions/browser/event_router.h"
#include "extensions/browser/extension_function_registry.h"
#include "extensions/browser/extension_host.h"
#include "extensions/browser/process_manager.h"
#include "third_party/blink/public/mojom/speech/speech_synthesis.mojom.h"
#include "ui/base/l10n/l10n_util.h"
#if BUILDFLAG(IS_CHROMEOS)
#include "chrome/browser/speech/extension_api/tts_engine_extension_observer_chromeos_factory.h"
#include "chrome/common/extensions/extension_constants.h"
#endif // BUILDFLAG(IS_CHROMEOS)
namespace constants = tts_extension_api_constants;
#if BUILDFLAG(IS_CHROMEOS)
namespace {
// Identifies which ChromeOS component triggered a text-to-speech utterance.
//
// These values are logged to UMA. Entries should not be renumbered and
// numeric values should never be reused. Please keep in sync with
// "TextToSpeechSource" in
// src/tools/metrics/histograms/metadata/accessibility/enums.xml.
// LINT.IfChange(UMATextToSpeechSource)
enum class UMATextToSpeechSource {
// Any caller that is neither ChromeVox nor Select-to-speak.
kOther = 0,
// The request's source URL host is the ChromeVox extension id.
kChromeVox = 1,
// The request's source URL host is the Select-to-speak extension id.
kSelectToSpeak = 2,
// Alias of the highest-valued entry; move it when a new entry is appended.
kMaxValue = kSelectToSpeak,
};
// LINT.ThenChange(/tools/metrics/histograms/metadata/accessibility/enums.xml:TextToSpeechSource)
} // namespace
#endif // BUILDFLAG(IS_CHROMEOS)
namespace events {

// Event name used when dispatching per-utterance TTS events to an extension.
constexpr char kOnEvent[] = "tts.onEvent";

// Event name used when broadcasting that the set of voices has changed.
constexpr char kOnVoicesChanged[] = "tts.onVoicesChanged";

}  // namespace events
// Maps a content::TtsEventType to the string constant used for it in the
// extension API. Every handled enumerator returns its matching
// constants::kEventType* value; any other value is treated as unreachable.
[[nodiscard]] std::string_view TtsEventTypeToString(
    content::TtsEventType event_type) {
  switch (event_type) {
    // Utterance lifecycle.
    case content::TTS_EVENT_START:
      return constants::kEventTypeStart;
    case content::TTS_EVENT_END:
      return constants::kEventTypeEnd;

    // Progress within the utterance.
    case content::TTS_EVENT_WORD:
      return constants::kEventTypeWord;
    case content::TTS_EVENT_SENTENCE:
      return constants::kEventTypeSentence;
    case content::TTS_EVENT_MARKER:
      return constants::kEventTypeMarker;

    // Abnormal termination.
    case content::TTS_EVENT_INTERRUPTED:
      return constants::kEventTypeInterrupted;
    case content::TTS_EVENT_CANCELLED:
      return constants::kEventTypeCancelled;
    case content::TTS_EVENT_ERROR:
      return constants::kEventTypeError;

    // Playback control.
    case content::TTS_EVENT_PAUSE:
      return constants::kEventTypePause;
    case content::TTS_EVENT_RESUME:
      return constants::kEventTypeResume;

    default:
      NOTREACHED();
  }
}
// Inverse of TtsEventTypeToString(): maps an event-type string constant to its
// content::TtsEventType. A string that matches none of the
// constants::kEventType* values is treated as unreachable.
content::TtsEventType TtsEventTypeFromString(std::string_view str) {
  if (str == constants::kEventTypeStart) {
    return content::TTS_EVENT_START;
  }
  if (str == constants::kEventTypeEnd) {
    return content::TTS_EVENT_END;
  }
  if (str == constants::kEventTypeWord) {
    return content::TTS_EVENT_WORD;
  }
  if (str == constants::kEventTypeSentence) {
    return content::TTS_EVENT_SENTENCE;
  }
  if (str == constants::kEventTypeMarker) {
    return content::TTS_EVENT_MARKER;
  }
  if (str == constants::kEventTypeInterrupted) {
    return content::TTS_EVENT_INTERRUPTED;
  }
  if (str == constants::kEventTypeCancelled) {
    return content::TTS_EVENT_CANCELLED;
  }
  if (str == constants::kEventTypeError) {
    return content::TTS_EVENT_ERROR;
  }
  if (str == constants::kEventTypePause) {
    return content::TTS_EVENT_PAUSE;
  }
  if (str == constants::kEventTypeResume) {
    return content::TTS_EVENT_RESUME;
  }

  // No known event-type name matched.
  NOTREACHED();
}
namespace extensions {
namespace {
// Relays TTS events for a single utterance to the extension that asked for it.
// An instance is created per utterance and is deleted once the utterance
// receives any final event.
class TtsExtensionEventHandler : public content::UtteranceEventDelegate {
 public:
  explicit TtsExtensionEventHandler(const std::string& extension_id)
      : extension_id_(extension_id) {}

  // Packages the event as a dictionary and dispatches it to `extension_id_`
  // as a "tts.onEvent" extension event. Nothing is dispatched when the
  // utterance has a negative source id, or when the utterance lists desired
  // event types and `event_type` is not among them.
  void OnTtsEvent(content::TtsUtterance* utterance,
                  content::TtsEventType event_type,
                  int char_index,
                  int length,
                  const std::string& error_message) override {
    const int source_id = utterance->GetSrcId();
    if (source_id < 0) {
      return;
    }

    // An empty desired set means "no filtering".
    const base::flat_set<content::TtsEventType>& wanted_types =
        utterance->GetDesiredEventTypes();
    const bool filtered_out =
        !wanted_types.empty() &&
        wanted_types.find(event_type) == wanted_types.end();
    if (filtered_out) {
      return;
    }

    base::Value::Dict payload;
    // Negative positions are omitted from the payload.
    if (char_index >= 0) {
      payload.Set(constants::kCharIndexKey, char_index);
    }
    if (length >= 0) {
      payload.Set(constants::kLengthKey, length);
    }
    payload.Set(constants::kEventTypeKey, TtsEventTypeToString(event_type));
    // The error message is only attached to error events.
    if (event_type == content::TTS_EVENT_ERROR) {
      payload.Set(constants::kErrorMessageKey, error_message);
    }
    payload.Set(constants::kSrcIdKey, source_id);
    payload.Set(constants::kIsFinalEventKey, utterance->IsFinished());

    base::Value::List event_args;
    event_args.Append(std::move(payload));

    auto tts_event = std::make_unique<extensions::Event>(
        ::extensions::events::TTS_ON_EVENT, ::events::kOnEvent,
        std::move(event_args), utterance->GetBrowserContext());
    tts_event->event_url = utterance->GetSrcUrl();

    extensions::EventRouter* router =
        extensions::EventRouter::Get(utterance->GetBrowserContext());
    router->DispatchEventToExtension(extension_id_, std::move(tts_event));
  }

 private:
  // Id of the extension that called speak(); it is the recipient of every
  // event dispatched by this handler.
  const std::string extension_id_;
};
} // namespace
// Validates a speak request, acknowledges it, and hands a fully configured
// utterance to the TTS controller.
//
// args()[0] is the text to speak (required string). args()[1], when present
// and a dictionary, supplies options. An option with the wrong type fails
// EXTENSION_FUNCTION_VALIDATE; a value outside its accepted range (text
// length, lang syntax, rate, pitch, volume) produces an error response.
// On success the response is sent before the utterance is queued, and the
// function returns AlreadyResponded().
ExtensionFunction::ResponseAction TtsSpeakFunction::Run() {
  // Longest accepted utterance, compared against std::string::size().
  constexpr size_t kMaxUtteranceLength = 32768;

  EXTENSION_FUNCTION_VALIDATE(args().size() >= 1);
  EXTENSION_FUNCTION_VALIDATE(args()[0].is_string());
  const std::string& text = args()[0].GetString();
  if (text.size() > kMaxUtteranceLength) {
    return RespondNow(Error(constants::kErrorUtteranceTooLong));
  }

  // The options argument is optional; a missing or non-dictionary second
  // argument leaves `options` empty.
  base::Value::Dict options;
  if (args().size() >= 2 && args()[1].is_dict()) {
    options = args()[1].GetDict().Clone();
  }

  std::string voice_name;
  if (const base::Value* voice_name_value =
          options.Find(constants::kVoiceNameKey)) {
    EXTENSION_FUNCTION_VALIDATE(voice_name_value->is_string());
    voice_name = voice_name_value->GetString();
  }

  std::string lang;
  if (const base::Value* lang_value = options.Find(constants::kLangKey)) {
    EXTENSION_FUNCTION_VALIDATE(lang_value->is_string());
    lang = lang_value->GetString();
  }
  if (!lang.empty() && !l10n_util::IsValidLocaleSyntax(lang)) {
    return RespondNow(Error(constants::kErrorInvalidLang));
  }

  // Rate, pitch and volume keep the "not set" sentinel unless the caller
  // supplied a value; supplied values must be numeric and within range.
  double rate = blink::mojom::kSpeechSynthesisDoublePrefNotSet;
  if (const base::Value* rate_value = options.Find(constants::kRateKey)) {
    const auto requested_rate = rate_value->GetIfDouble();
    EXTENSION_FUNCTION_VALIDATE(requested_rate.has_value());
    rate = *requested_rate;
    if (rate < 0.1 || rate > 10.0) {
      return RespondNow(Error(constants::kErrorInvalidRate));
    }
  }

  double pitch = blink::mojom::kSpeechSynthesisDoublePrefNotSet;
  if (const base::Value* pitch_value = options.Find(constants::kPitchKey)) {
    const auto requested_pitch = pitch_value->GetIfDouble();
    EXTENSION_FUNCTION_VALIDATE(requested_pitch.has_value());
    pitch = *requested_pitch;
    if (pitch < 0.0 || pitch > 2.0) {
      return RespondNow(Error(constants::kErrorInvalidPitch));
    }
  }

  double volume = blink::mojom::kSpeechSynthesisDoublePrefNotSet;
  if (const base::Value* volume_value = options.Find(constants::kVolumeKey)) {
    const auto requested_volume = volume_value->GetIfDouble();
    EXTENSION_FUNCTION_VALIDATE(requested_volume.has_value());
    volume = *requested_volume;
    if (volume < 0.0 || volume > 1.0) {
      return RespondNow(Error(constants::kErrorInvalidVolume));
    }
  }

  // Defaults to false, in which case the utterance clears the queue (see
  // SetShouldClearQueue below).
  bool can_enqueue = false;
  if (const base::Value* enqueue_value =
          options.Find(constants::kEnqueueKey)) {
    EXTENSION_FUNCTION_VALIDATE(enqueue_value->is_bool());
    can_enqueue = enqueue_value->GetBool();
  }

  // For both event-type lists: the key, if present, must hold a list;
  // non-string entries inside the list are skipped.
  base::flat_set<content::TtsEventType> required_event_types;
  if (options.contains(constants::kRequiredEventTypesKey)) {
    const base::Value::List* list =
        options.FindList(constants::kRequiredEventTypesKey);
    EXTENSION_FUNCTION_VALIDATE(list);
    for (const base::Value& entry : *list) {
      if (const std::string* event_type = entry.GetIfString()) {
        required_event_types.insert(TtsEventTypeFromString(*event_type));
      }
    }
  }

  base::flat_set<content::TtsEventType> desired_event_types;
  if (options.contains(constants::kDesiredEventTypesKey)) {
    const base::Value::List* list =
        options.FindList(constants::kDesiredEventTypesKey);
    EXTENSION_FUNCTION_VALIDATE(list);
    for (const base::Value& entry : *list) {
      if (const std::string* event_type = entry.GetIfString()) {
        desired_event_types.insert(TtsEventTypeFromString(*event_type));
      }
    }
  }

  std::string voice_extension_id;
  if (const base::Value* voice_extension_id_value =
          options.Find(constants::kExtensionIdKey)) {
    // Check the value's type, not just its presence: the pointer is already
    // known to be non-null here, and GetString() requires a string value.
    EXTENSION_FUNCTION_VALIDATE(voice_extension_id_value->is_string());
    voice_extension_id = voice_extension_id_value->GetString();
  }

  // A source id of -1 marks "none supplied"; the event handler does not
  // dispatch events for utterances with a negative source id.
  int src_id = -1;
  if (const base::Value* src_id_value = options.Find(constants::kSrcIdKey)) {
    EXTENSION_FUNCTION_VALIDATE(src_id_value->is_int());
    src_id = src_id_value->GetInt();
  }

#if BUILDFLAG(IS_CHROMEOS)
  // Record which component requested speech, keyed off the source URL host.
  UMATextToSpeechSource source = UMATextToSpeechSource::kOther;
  const std::string host = source_url().host();
  if (host == extension_misc::kSelectToSpeakExtensionId) {
    source = UMATextToSpeechSource::kSelectToSpeak;
  } else if (host == extension_misc::kChromeVoxExtensionId) {
    source = UMATextToSpeechSource::kChromeVox;
  }
  UMA_HISTOGRAM_ENUMERATION("TextToSpeech.Utterance.Source", source);
#endif  // BUILDFLAG(IS_CHROMEOS)

  // If we got this far, the arguments were all in the valid format, so
  // send the success response to the callback now - this ensures that
  // the callback response always arrives before events, which makes
  // the behavior more predictable and easier to write unit tests for too.
  Respond(NoArguments());

  // Prefer creating the utterance from the extension's background host
  // contents; fall back to the browser context when that is unavailable.
  std::unique_ptr<content::TtsUtterance> utterance;
  if (extension()) {
    extensions::ExtensionHost* extension_host =
        extensions::ProcessManager::Get(browser_context())
            ->GetBackgroundHostForExtension(extension()->id());
    if (extension_host && extension_host->host_contents()) {
      utterance =
          content::TtsUtterance::Create(extension_host->host_contents());
    }
  }
  if (!utterance) {
    utterance = content::TtsUtterance::Create(browser_context());
  }

  utterance->SetText(text);
  utterance->SetVoiceName(voice_name);
  utterance->SetSrcId(src_id);
  utterance->SetSrcUrl(source_url());
  utterance->SetLang(lang);
  utterance->SetContinuousParameters(rate, pitch, volume);
  utterance->SetShouldClearQueue(!can_enqueue);
  utterance->SetRequiredEventTypes(required_event_types);
  utterance->SetDesiredEventTypes(desired_event_types);
  utterance->SetEngineId(voice_extension_id);
  // `options` is moved last; every value needed above has been copied out.
  utterance->SetOptions(std::move(options));

  // Only calls made by an extension get an event delegate.
  if (extension()) {
    utterance->SetEventDelegate(
        std::make_unique<TtsExtensionEventHandler>(extension_id()));
  }

  content::TtsController* controller = content::TtsController::GetInstance();
  controller->SpeakOrEnqueue(std::move(utterance));
  return AlreadyResponded();
}
// Calls Stop() on the TTS controller with this call's source URL, then
// responds with no arguments.
ExtensionFunction::ResponseAction TtsStopSpeakingFunction::Run() {
  content::TtsController* const controller =
      content::TtsController::GetInstance();
  controller->Stop(source_url());
  return RespondNow(NoArguments());
}
// Calls Pause() on the TTS controller, then responds with no arguments.
ExtensionFunction::ResponseAction TtsPauseFunction::Run() {
  content::TtsController* const controller =
      content::TtsController::GetInstance();
  controller->Pause();
  return RespondNow(NoArguments());
}
// Calls Resume() on the TTS controller, then responds with no arguments.
ExtensionFunction::ResponseAction TtsResumeFunction::Run() {
  content::TtsController* const controller =
      content::TtsController::GetInstance();
  controller->Resume();
  return RespondNow(NoArguments());
}
// Responds to the function call with `speaking` as its single argument.
// NOTE(review): nothing in this file invokes this method; Run() below answers
// synchronously. Confirm whether it is still needed.
void TtsIsSpeakingFunction::OnIsSpeakingComplete(bool speaking) {
Respond(WithArguments(speaking));
}
// Responds immediately with the TTS controller's current IsSpeaking() value.
ExtensionFunction::ResponseAction TtsIsSpeakingFunction::Run() {
  const bool speaking = content::TtsController::GetInstance()->IsSpeaking();
  return RespondNow(WithArguments(speaking));
}
// Queries the TTS controller for the voices available to this browser context
// and source URL, and responds with them as a list of dictionaries. Each entry
// always carries the voice name, the remote flag and the supported event
// types; lang and extension id are included only when non-empty.
ExtensionFunction::ResponseAction TtsGetVoicesFunction::Run() {
  std::vector<content::VoiceData> available_voices;
  content::TtsController::GetInstance()->GetVoices(
      browser_context(), source_url(), &available_voices);

  base::Value::List voice_list;
  for (const content::VoiceData& voice : available_voices) {
    base::Value::Dict entry;
    entry.Set(constants::kVoiceNameKey, voice.name);
    entry.Set(constants::kRemoteKey, voice.remote);
    if (!voice.lang.empty()) {
      entry.Set(constants::kLangKey, voice.lang);
    }
    if (!voice.engine_id.empty()) {
      entry.Set(constants::kExtensionIdKey, voice.engine_id);
    }

    // Event types are reported by their extension-API string names.
    base::Value::List supported_events;
    for (const auto& event_type : voice.events) {
      supported_events.Append(TtsEventTypeToString(event_type));
    }
    entry.Set(constants::kEventTypesKey, std::move(supported_events));

    voice_list.Append(std::move(entry));
  }

  return RespondNow(WithArguments(std::move(voice_list)));
}
// Registers the TTS and TTS-engine extension functions, subscribes this object
// to voice-change notifications from the TTS controller, and observes the
// event router of `context` for "tts.onVoicesChanged" listeners.
TtsAPI::TtsAPI(content::BrowserContext* context) {
ExtensionFunctionRegistry& registry =
ExtensionFunctionRegistry::GetInstance();
// Functions used by TTS engine extensions.
registry.RegisterFunction<ExtensionTtsEngineUpdateVoicesFunction>();
registry.RegisterFunction<ExtensionTtsEngineSendTtsEventFunction>();
registry.RegisterFunction<ExtensionTtsEngineSendTtsAudioFunction>();
registry.RegisterFunction<ExtensionTtsEngineUpdateLanguageFunction>();
// Functions implemented in this file.
registry.RegisterFunction<TtsGetVoicesFunction>();
registry.RegisterFunction<TtsIsSpeakingFunction>();
registry.RegisterFunction<TtsSpeakFunction>();
registry.RegisterFunction<TtsStopSpeakingFunction>();
registry.RegisterFunction<TtsPauseFunction>();
registry.RegisterFunction<TtsResumeFunction>();
#if BUILDFLAG(IS_CHROMEOS)
// Ensure we're observing newly added engines for the given context.
// The call is made for its side effect; the return value is unused.
TtsEngineExtensionObserverChromeOSFactory::GetForProfile(
Profile::FromBrowserContext(context));
#endif // BUILDFLAG(IS_CHROMEOS)
// Both registrations below are undone in the destructor.
content::TtsController::GetInstance()->AddVoicesChangedDelegate(this);
event_router_ = EventRouter::Get(context);
event_router_->RegisterObserver(this, ::events::kOnVoicesChanged);
}
// Undoes the constructor's registrations: removes this object as a
// voices-changed delegate and as an event router observer.
TtsAPI::~TtsAPI() {
content::TtsController::GetInstance()->RemoveVoicesChangedDelegate(this);
event_router_->UnregisterObserver(this);
}
// File-static, lazily created factory for TtsAPI; exposed through
// TtsAPI::GetFactoryInstance().
static base::LazyInstance<
BrowserContextKeyedAPIFactory<TtsAPI>>::DestructorAtExit g_factory =
LAZY_INSTANCE_INITIALIZER;
// Returns a pointer to the factory held in `g_factory`.
BrowserContextKeyedAPIFactory<TtsAPI>* TtsAPI::GetFactoryInstance() {
return g_factory.Pointer();
}
// Broadcasts a "tts.onVoicesChanged" event with no arguments, but only while
// `broadcast_events_` is set (i.e. while the event has a listener).
void TtsAPI::OnVoicesChanged() {
  if (broadcast_events_) {
    event_router_->BroadcastEvent(std::make_unique<extensions::Event>(
        events::TTS_ON_VOICES_CHANGED, ::events::kOnVoicesChanged,
        base::Value::List()));
  }
}
// Re-evaluates whether voice-change events should be broadcast after a
// listener is added. `details` is not used.
void TtsAPI::OnListenerAdded(const EventListenerInfo& details) {
StartOrStopListeningForVoicesChanged();
}
// Re-evaluates whether voice-change events should be broadcast after a
// listener is removed. `details` is not used.
void TtsAPI::OnListenerRemoved(const EventListenerInfo& details) {
StartOrStopListeningForVoicesChanged();
}
// Sets `broadcast_events_` to whether the event router currently has any
// listener for "tts.onVoicesChanged"; OnVoicesChanged() consults this flag
// before broadcasting.
void TtsAPI::StartOrStopListeningForVoicesChanged() {
broadcast_events_ =
event_router_->HasEventListener(::events::kOnVoicesChanged);
}
} // namespace extensions