// blob: bf5d302bd3c7f26e35ffff3d74ff14c7cc167a32 [file] [log] [blame]
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "content/browser/speech/tts_controller_impl.h"
#include <stddef.h>
#include <string>
#include <vector>
#include "base/bind.h"
#include "base/containers/queue.h"
#include "base/json/json_reader.h"
#include "base/metrics/histogram_macros.h"
#include "base/metrics/user_metrics.h"
#include "base/values.h"
#include "build/build_config.h"
#include "content/public/browser/content_browser_client.h"
#include "content/public/common/service_manager_connection.h"
#include "services/data_decoder/public/cpp/safe_xml_parser.h"
#include "services/data_decoder/public/mojom/constants.mojom.h"
#include "services/data_decoder/public/mojom/xml_parser.mojom.h"
#include "services/service_manager/public/cpp/connector.h"
#include "third_party/blink/public/platform/web_speech_synthesis_constants.h"
namespace content {
// Sentinel passed as the |char_index| argument of a TTS event when no
// character position is available (e.g. for interrupted, cancelled and error
// events generated by this file).
const int kInvalidCharIndex = -1;
// Sentinel passed as the |length| argument of a TTS event when no length is
// available.
const int kInvalidLength = -1;
//
// VoiceData
//
VoiceData::VoiceData() : remote(false), native(false) {}
VoiceData::VoiceData(const VoiceData& other) = default;
VoiceData::~VoiceData() {}
//
// TtsController
//
// Returns the TTS controller. The public interface is backed by the
// TtsControllerImpl singleton, so this simply forwards to
// TtsControllerImpl::GetInstance().
TtsController* TtsController::GetInstance() {
return TtsControllerImpl::GetInstance();
}
// Buckets recorded in the "TextToSpeech.Event" histogram. Each entry
// corresponds to one TTS_EVENT_* type handled by
// TtsControllerImpl::OnTtsEvent().
// IMPORTANT!
// These values are written to logs. Do not renumber or delete
// existing items; add new entries to the end of the list.
enum class UMATextToSpeechEvent {
START = 0,
END = 1,
WORD = 2,
SENTENCE = 3,
MARKER = 4,
INTERRUPTED = 5,
CANCELLED = 6,
SPEECH_ERROR = 7,
PAUSE = 8,
RESUME = 9,
// This must always be the last enum. It's okay for its value to
// increase, but none of the other enum values may change.
// It is passed as the boundary argument of UMA_HISTOGRAM_ENUMERATION.
COUNT
};
//
// TtsControllerImpl
//
// static
// Returns the single TtsControllerImpl instance, which is managed by
// base::Singleton.
TtsControllerImpl* TtsControllerImpl::GetInstance() {
return base::Singleton<TtsControllerImpl>::get();
}
TtsControllerImpl::TtsControllerImpl()
: delegate_(nullptr),
current_utterance_(nullptr),
paused_(false),
tts_platform_(nullptr) {}
// Releases every utterance still owned by the controller without sending any
// events.
TtsControllerImpl::~TtsControllerImpl() {
  // The utterance being spoken (if any) is marked finished, then freed.
  if (TtsUtterance* active = current_utterance_) {
    active->Finish();
    delete active;
  }

  // Queued utterances are dropped silently as well.
  ClearUtteranceQueue(/*send_events=*/false);
}
// Takes ownership of |utterance| and either queues it or speaks it right
// away.
//  - While paused, the utterance always goes onto the queue. If it cannot be
//    enqueued, the whole queue (including it) is then flushed via Stop() and
//    the paused state is restored.
//  - While speaking, an utterance that can be enqueued waits in the queue.
//  - Otherwise any current speech is stopped and |utterance| is spoken now.
void TtsControllerImpl::SpeakOrEnqueue(TtsUtterance* utterance) {
  if (paused_) {
    utterance_queue_.push(utterance);
    if (!utterance->GetCanEnqueue()) {
      // Stop() clears |paused_|; flush the queue but stay paused.
      Stop();
      paused_ = true;
    }
    return;
  }

  if (IsSpeaking() && utterance->GetCanEnqueue()) {
    utterance_queue_.push(utterance);
    return;
  }

  Stop();
  SpeakNow(utterance);
}
// Stops speech unconditionally. Passing an empty GURL to the overload below
// disables its origin check.
void TtsControllerImpl::Stop() {
Stop(GURL());
}
void TtsControllerImpl::Stop(const GURL& source_url) {
base::RecordAction(base::UserMetricsAction("TextToSpeech.Stop"));
paused_ = false;
if (!source_url.is_empty() && current_utterance_ &&
current_utterance_->GetSrcUrl().GetOrigin() != source_url.GetOrigin())
return;
if (current_utterance_ && !current_utterance_->GetEngineId().empty()) {
if (GetTtsControllerDelegate()->GetTtsEngineDelegate())
GetTtsControllerDelegate()->GetTtsEngineDelegate()->Stop(
current_utterance_);
} else {
GetTtsPlatform()->ClearError();
GetTtsPlatform()->StopSpeaking();
}
if (current_utterance_)
current_utterance_->OnTtsEvent(TTS_EVENT_INTERRUPTED, kInvalidCharIndex,
kInvalidLength, std::string());
FinishCurrentUtterance();
ClearUtteranceQueue(true); // Send events.
}
void TtsControllerImpl::Pause() {
base::RecordAction(base::UserMetricsAction("TextToSpeech.Pause"));
paused_ = true;
if (current_utterance_ && !current_utterance_->GetEngineId().empty()) {
if (GetTtsControllerDelegate()->GetTtsEngineDelegate())
GetTtsControllerDelegate()->GetTtsEngineDelegate()->Pause(
current_utterance_);
} else if (current_utterance_) {
GetTtsPlatform()->ClearError();
GetTtsPlatform()->Pause();
}
}
void TtsControllerImpl::Resume() {
base::RecordAction(base::UserMetricsAction("TextToSpeech.Resume"));
paused_ = false;
if (current_utterance_ && !current_utterance_->GetEngineId().empty()) {
if (GetTtsControllerDelegate()->GetTtsEngineDelegate())
GetTtsControllerDelegate()->GetTtsEngineDelegate()->Resume(
current_utterance_);
} else if (current_utterance_) {
GetTtsPlatform()->ClearError();
GetTtsPlatform()->Resume();
} else {
SpeakNextUtterance();
}
}
void TtsControllerImpl::OnTtsEvent(int utterance_id,
TtsEventType event_type,
int char_index,
int length,
const std::string& error_message) {
// We may sometimes receive completion callbacks "late", after we've
// already finished the utterance (for example because another utterance
// interrupted or we got a call to Stop). This is normal and we can
// safely just ignore these events.
if (!current_utterance_ || utterance_id != current_utterance_->GetId()) {
return;
}
UMATextToSpeechEvent metric;
switch (event_type) {
case TTS_EVENT_START:
metric = UMATextToSpeechEvent::START;
break;
case TTS_EVENT_END:
metric = UMATextToSpeechEvent::END;
break;
case TTS_EVENT_WORD:
metric = UMATextToSpeechEvent::WORD;
break;
case TTS_EVENT_SENTENCE:
metric = UMATextToSpeechEvent::SENTENCE;
break;
case TTS_EVENT_MARKER:
metric = UMATextToSpeechEvent::MARKER;
break;
case TTS_EVENT_INTERRUPTED:
metric = UMATextToSpeechEvent::INTERRUPTED;
break;
case TTS_EVENT_CANCELLED:
metric = UMATextToSpeechEvent::CANCELLED;
break;
case TTS_EVENT_ERROR:
metric = UMATextToSpeechEvent::SPEECH_ERROR;
break;
case TTS_EVENT_PAUSE:
metric = UMATextToSpeechEvent::PAUSE;
break;
case TTS_EVENT_RESUME:
metric = UMATextToSpeechEvent::RESUME;
break;
default:
NOTREACHED();
return;
}
UMA_HISTOGRAM_ENUMERATION("TextToSpeech.Event", metric,
UMATextToSpeechEvent::COUNT);
current_utterance_->OnTtsEvent(event_type, char_index, length, error_message);
if (current_utterance_->IsFinished()) {
FinishCurrentUtterance();
SpeakNextUtterance();
}
}
// Appends all voices currently available to |out_voices|.
//
// |browser_context| is passed to the platform so it can load its built-in
// TTS engine, and to the TTS engine delegate to enumerate its voices. Engine
// voices are only collected when |browser_context| is non-null.
// |out_voices| receives platform voices first (if a platform implementation
// is available), followed by engine voices.
void TtsControllerImpl::GetVoices(BrowserContext* browser_context,
                                  std::vector<VoiceData>* out_voices) {
  TtsPlatform* tts_platform = GetTtsPlatform();
  if (tts_platform) {
    // Ensure we have all built-in voices loaded. This is a no-op if already
    // loaded.
    tts_platform->LoadBuiltInTtsEngine(browser_context);
    if (tts_platform->PlatformImplAvailable())
      tts_platform->GetVoices(out_voices);
  }

  if (!browser_context)
    return;

  // GetTtsControllerDelegate() may return nullptr (no content client or
  // browser client yet). Go through GetTtsEngineDelegate(), which handles
  // that case, rather than dereferencing the controller delegate directly.
  TtsEngineDelegate* engine_delegate = GetTtsEngineDelegate();
  if (engine_delegate)
    engine_delegate->GetVoices(browser_context, out_voices);
}
// True when the controller has a current utterance, or when the platform
// itself reports that it is speaking.
bool TtsControllerImpl::IsSpeaking() {
  if (current_utterance_)
    return true;
  return GetTtsPlatform()->IsSpeaking();
}
// Tells every registered VoicesChangedDelegate that the set of voices has
// changed. The notification is sent unconditionally.
void TtsControllerImpl::VoicesChanged() {
  for (auto& observer : voices_changed_delegates_) {
    observer.OnVoicesChanged();
  }
}
// Registers |delegate| to be notified by VoicesChanged().
void TtsControllerImpl::AddVoicesChangedDelegate(
VoicesChangedDelegate* delegate) {
voices_changed_delegates_.AddObserver(delegate);
}
// Unregisters |delegate| so VoicesChanged() no longer notifies it.
void TtsControllerImpl::RemoveVoicesChangedDelegate(
VoicesChangedDelegate* delegate) {
voices_changed_delegates_.RemoveObserver(delegate);
}
// Drops every utterance that reports events to |delegate|.
//
// Queued utterances using |delegate| are deleted; the others keep their
// relative order. If the current utterance uses |delegate|, it is detached
// from the delegate, its speech is stopped, it is released, and (unless
// paused) the next queued utterance is started.
void TtsControllerImpl::RemoveUtteranceEventDelegate(
    UtteranceEventDelegate* delegate) {
  // Take the whole queue and rebuild it without |delegate|'s utterances.
  base::queue<TtsUtterance*> pending = std::move(utterance_queue_);
  utterance_queue_ = base::queue<TtsUtterance*>();
  for (; !pending.empty(); pending.pop()) {
    TtsUtterance* candidate = pending.front();
    if (candidate->GetEventDelegate() == delegate) {
      // NOTE(review): unlike the other deletion sites in this file, Finish()
      // is not called before deleting here - confirm that is intended.
      delete candidate;
    } else {
      utterance_queue_.push(candidate);
    }
  }

  const bool current_uses_delegate =
      current_utterance_ && current_utterance_->GetEventDelegate() == delegate;
  if (!current_uses_delegate)
    return;

  // Detach first so the events generated below never reach |delegate|.
  current_utterance_->SetEventDelegate(nullptr);
  if (current_utterance_->GetEngineId().empty()) {
    TtsPlatform* platform = GetTtsPlatform();
    platform->ClearError();
    platform->StopSpeaking();
  } else {
    TtsEngineDelegate* engine_delegate =
        GetTtsControllerDelegate()->GetTtsEngineDelegate();
    if (engine_delegate)
      engine_delegate->Stop(current_utterance_);
  }

  FinishCurrentUtterance();
  if (!paused_)
    SpeakNextUtterance();
}
// Forwards |delegate| to the controller delegate. Does nothing when no
// controller delegate is available.
void TtsControllerImpl::SetTtsEngineDelegate(TtsEngineDelegate* delegate) {
  if (TtsControllerDelegate* controller_delegate = GetTtsControllerDelegate())
    controller_delegate->SetTtsEngineDelegate(delegate);
}
// Returns the engine delegate held by the controller delegate, or nullptr
// when no controller delegate is available.
TtsEngineDelegate* TtsControllerImpl::GetTtsEngineDelegate() {
  TtsControllerDelegate* controller_delegate = GetTtsControllerDelegate();
  return controller_delegate ? controller_delegate->GetTtsEngineDelegate()
                             : nullptr;
}
// Overrides the platform implementation returned by GetTtsPlatform(). The
// pointer is stored as-is; passing nullptr makes GetTtsPlatform() fall back
// to TtsPlatform::GetInstance() on its next call.
void TtsControllerImpl::SetTtsPlatform(TtsPlatform* tts_platform) {
tts_platform_ = tts_platform;
}
// Returns the number of utterances waiting in the queue. The utterance
// currently being spoken is not part of the queue and is not counted.
int TtsControllerImpl::QueueSize() {
return static_cast<int>(utterance_queue_.size());
}
// Returns the platform implementation in use. If none has been set yet, the
// result of TtsPlatform::GetInstance() is stored and returned.
TtsPlatform* TtsControllerImpl::GetTtsPlatform() {
  if (tts_platform_)
    return tts_platform_;

  tts_platform_ = TtsPlatform::GetInstance();
  return tts_platform_;
}
// Starts speaking |utterance| immediately. Takes ownership of |utterance|:
// on every path it is either stored as the current utterance or deleted.
//
// A voice is chosen through the controller delegate. If none matches, a
// default voice marked "native" is used. Utterance defaults are applied,
// usage metrics are recorded, and the utterance is then handed to either the
// TTS engine delegate (non-native voice) or the platform (native voice).
void TtsControllerImpl::SpeakNow(TtsUtterance* utterance) {
  TtsControllerDelegate* controller_delegate = GetTtsControllerDelegate();
  if (!controller_delegate) {
    // No voice can be selected without a controller delegate. The caller has
    // already handed over ownership, so release the utterance here instead of
    // leaking it. Finish() is called first, as at the other places in this
    // file that drop an utterance without sending events.
    utterance->Finish();
    delete utterance;
    return;
  }

  // Get all available voices and try to find a matching voice.
  std::vector<VoiceData> voices;
  GetVoices(utterance->GetBrowserContext(), &voices);

  // Get the best matching voice. If nothing matches, just set "native"
  // to true because that might trigger deferred loading of native voices.
  // TODO(katie): Move most of the GetMatchingVoice logic into content/ and
  // use the TTS controller delegate to get chrome-specific info as needed.
  int index = controller_delegate->GetMatchingVoice(utterance, voices);
  VoiceData voice;
  if (index >= 0)
    voice = voices[index];
  else
    voice.native = true;

  UpdateUtteranceDefaults(utterance);
  GetTtsPlatform()->WillSpeakUtteranceWithVoice(utterance, voice);

  base::RecordAction(base::UserMetricsAction("TextToSpeech.Speak"));
  UMA_HISTOGRAM_COUNTS_100000("TextToSpeech.Utterance.TextLength",
                              utterance->GetText().size());
  UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.FromExtensionAPI",
                        !utterance->GetSrcUrl().is_empty());
  UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasVoiceName",
                        !utterance->GetVoiceName().empty());
  UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasLang",
                        !utterance->GetLang().empty());
  UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasRate",
                        utterance->GetContinuousParameters().rate != 1.0);
  UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasPitch",
                        utterance->GetContinuousParameters().pitch != 1.0);
  UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.HasVolume",
                        utterance->GetContinuousParameters().volume != 1.0);
  UMA_HISTOGRAM_BOOLEAN("TextToSpeech.Utterance.Native", voice.native);

  if (!voice.native) {
#if !defined(OS_ANDROID)
    DCHECK(!voice.engine_id.empty());
    current_utterance_ = utterance;
    utterance->SetEngineId(voice.engine_id);
    TtsEngineDelegate* engine_delegate =
        controller_delegate->GetTtsEngineDelegate();
    if (engine_delegate)
      engine_delegate->Speak(utterance, voice);

    // A voice that never reports an end event gives the controller no signal
    // for completion, so the utterance is finished right away and the queue
    // moves on.
    bool sends_end_event =
        voice.events.find(TTS_EVENT_END) != voice.events.end();
    if (!sends_end_event) {
      utterance->Finish();
      delete utterance;
      current_utterance_ = nullptr;
      SpeakNextUtterance();
    }
#endif
  } else {
    // It's possible for certain platforms to send start events immediately
    // during |speak|, so the utterance must be current before Speak() runs.
    current_utterance_ = utterance;
    GetTtsPlatform()->ClearError();
    GetTtsPlatform()->Speak(utterance->GetId(), utterance->GetText(),
                            utterance->GetLang(), voice,
                            utterance->GetContinuousParameters(),
                            base::BindOnce(&TtsControllerImpl::OnSpeakFinished,
                                           base::Unretained(this), utterance));
  }
}
// Completion callback for the platform Speak() call issued by SpeakNow().
//
// |utterance| is the utterance that was handed to the platform.
// |success| reports whether the platform accepted the speech request; on
// success there is nothing to do.
//
// On failure the utterance stops being the current one. If the platform then
// reports that loading its built-in TTS engine is possible, the utterance is
// put back on the queue; otherwise it receives an error event carrying the
// platform's error text and is deleted.
void TtsControllerImpl::OnSpeakFinished(TtsUtterance* utterance, bool success) {
  if (success)
    return;

  // The callback holds a raw pointer and may run after the utterance was
  // stopped (and deleted) or replaced by a newer one. In that case
  // |utterance| must not be dereferenced, and the newer current utterance
  // must not be cleared, so only act while it is still the current utterance.
  // |utterance| is compared by address only here.
  if (current_utterance_ != utterance)
    return;
  current_utterance_ = nullptr;

  // If the native voice wasn't able to process this speech, see if
  // the browser has built-in TTS that isn't loaded yet.
  if (GetTtsPlatform()->LoadBuiltInTtsEngine(utterance->GetBrowserContext())) {
    utterance_queue_.push(utterance);
    return;
  }

  utterance->OnTtsEvent(TTS_EVENT_ERROR, kInvalidCharIndex, kInvalidLength,
                        GetTtsPlatform()->GetError());
  delete utterance;
}
void TtsControllerImpl::ClearUtteranceQueue(bool send_events) {
while (!utterance_queue_.empty()) {
TtsUtterance* utterance = utterance_queue_.front();
utterance_queue_.pop();
if (send_events)
utterance->OnTtsEvent(TTS_EVENT_CANCELLED, kInvalidCharIndex,
kInvalidLength, std::string());
else
utterance->Finish();
delete utterance;
}
}
// Releases the current utterance, if there is one. An utterance that has not
// finished yet is sent a TTS_EVENT_INTERRUPTED event before being deleted.
void TtsControllerImpl::FinishCurrentUtterance() {
  if (!current_utterance_)
    return;

  const bool still_running = !current_utterance_->IsFinished();
  if (still_running) {
    current_utterance_->OnTtsEvent(TTS_EVENT_INTERRUPTED, kInvalidCharIndex,
                                   kInvalidLength, std::string());
  }

  delete current_utterance_;
  current_utterance_ = nullptr;
}
void TtsControllerImpl::SpeakNextUtterance() {
if (paused_)
return;
// Start speaking the next utterance in the queue. Keep trying in case
// one fails but there are still more in the queue to try.
while (!utterance_queue_.empty() && !current_utterance_) {
TtsUtterance* utterance = utterance_queue_.front();
utterance_queue_.pop();
SpeakNow(utterance);
}
}
// Fills in rate, pitch and volume on |utterance|. On Chrome OS the controller
// delegate adjusts the values from prefs; elsewhere any value still equal to
// the "not set" marker is replaced by the corresponding Web Speech default.
void TtsControllerImpl::UpdateUtteranceDefaults(TtsUtterance* utterance) {
  const auto& current = utterance->GetContinuousParameters();
  double rate = current.rate;
  double pitch = current.pitch;
  double volume = current.volume;
#if defined(OS_CHROMEOS)
  GetTtsControllerDelegate()->UpdateUtteranceDefaultsFromPrefs(utterance, &rate,
                                                               &pitch, &volume);
#else
  // Keeps an explicitly set value, substitutes |fallback| for an unset one.
  const auto value_or_default = [](double value, double fallback) {
    return value == blink::kWebSpeechSynthesisDoublePrefNotSet ? fallback
                                                               : value;
  };
  rate = value_or_default(rate,
                          blink::kWebSpeechSynthesisDefaultTextToSpeechRate);
  pitch = value_or_default(pitch,
                           blink::kWebSpeechSynthesisDefaultTextToSpeechPitch);
  volume = value_or_default(
      volume, blink::kWebSpeechSynthesisDefaultTextToSpeechVolume);
#endif  // defined(OS_CHROMEOS)
  utterance->SetContinuousParameters(rate, pitch, volume);
}
// Returns the controller delegate, fetching it from the content browser
// client on first use and caching the result. Returns nullptr while no
// content client or browser client exists.
TtsControllerDelegate* TtsControllerImpl::GetTtsControllerDelegate() {
  if (!delegate_) {
    auto* client = GetContentClient();
    if (client && client->browser())
      delegate_ = client->browser()->GetTtsControllerDelegate();
  }
  return delegate_;
}
void TtsControllerImpl::StripSSML(
const std::string& utterance,
base::OnceCallback<void(const std::string&)> on_ssml_parsed) {
// Skip parsing and return if not xml.
if (utterance.find("<?xml") == std::string::npos) {
std::move(on_ssml_parsed).Run(utterance);
return;
}
// Get ServiceManagerConnection and Connector.
ServiceManagerConnection* service_manager_connection =
ServiceManagerConnection::GetForProcess();
CHECK(service_manager_connection);
service_manager::Connector* connector =
service_manager_connection->GetConnector();
CHECK(connector);
// Parse using safe, out-of-process Xml Parser.
data_decoder::ParseXml(connector, utterance,
base::BindOnce(&TtsControllerImpl::StripSSMLHelper,
utterance, std::move(on_ssml_parsed)));
}
// Completion handler for the ParseXml() call made by StripSSML().
//
// Runs |on_ssml_parsed| exactly once: with the text collected from the parsed
// document when it is valid XML whose root element is <speak>, and with the
// original |utterance| otherwise.
void TtsControllerImpl::StripSSMLHelper(
    const std::string& utterance,
    base::OnceCallback<void(const std::string&)> on_ssml_parsed,
    std::unique_ptr<base::Value> value,
    const base::Optional<std::string>& error_message) {
  // Parsing failed: fall back to the untouched input.
  const bool parse_failed = !value || error_message.has_value();
  if (parse_failed) {
    std::move(on_ssml_parsed).Run(utterance);
    return;
  }

  // Only documents rooted at <speak> are treated as SSML.
  std::string root_tag_name;
  data_decoder::GetXmlElementTagName(*value, &root_tag_name);
  if (root_tag_name != "speak") {
    std::move(on_ssml_parsed).Run(utterance);
    return;
  }

  // Walk the element tree, accumulating its text.
  std::string parsed_text;
  PopulateParsedText(&parsed_text, value.get());
  std::move(on_ssml_parsed).Run(parsed_text);
}
// Appends the text of |element| and, recursively, of all its descendants to
// |parsed_text|, parent before children. A null |element| is a no-op.
void TtsControllerImpl::PopulateParsedText(std::string* parsed_text,
                                           const base::Value* element) {
  DCHECK(parsed_text);
  if (!element)
    return;

  // Read the element's own text entry directly.
  // Note: data_decoder::GetXmlElementText is not used because it gets the
  // text of the element's first child, not the text of the element itself.
  const base::Value* own_text = element->FindKeyOfType(
      data_decoder::mojom::XmlParser::kTextKey, base::Value::Type::STRING);
  if (own_text)
    parsed_text->append(own_text->GetString());

  const base::Value* children = data_decoder::GetXmlElementChildren(*element);
  const bool has_child_list = children && children->is_list();
  if (!has_child_list)
    return;

  // Every child is visited because some text is nested inside other
  // elements, such as <emphasis> tags.
  for (const base::Value& child : children->GetList())
    PopulateParsedText(parsed_text, &child);
}
} // namespace content