blob: 17ad63286ed6e5e10acbf5ae83faf69e84d7c386 [file] [log] [blame]
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <math.h>
#include <objbase.h>
#include <sapi.h>
#include <stdint.h>
#include <wrl/client.h>
#include "base/bind.h"
#include "base/macros.h"
#include "base/memory/singleton.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_piece.h"
#include "base/strings/utf_string_conversions.h"
#include "base/values.h"
#include "base/win/scoped_co_mem.h"
#include "base/win/sphelper.h"
#include "content/browser/speech/tts_platform_impl.h"
#include "content/public/browser/tts_controller.h"
namespace content {
namespace {

// Key and value names looked up on an ISpObjectToken: the "Attributes" key is
// opened on a voice token and its "Language" string value is read from it.
constexpr wchar_t kAttributesKey[] = L"Attributes";
constexpr wchar_t kLanguageValue[] = L"Language";

}  // namespace
// Windows implementation of TtsPlatformImpl built on an ISpVoice speech
// synthesizer. Instances are obtained through GetInstance().
class TtsPlatformImplWin : public TtsPlatformImpl {
 public:
  // Returns the single instance of this class.
  static TtsPlatformImplWin* GetInstance();

  // Notification callback handed to the synthesizer; it forwards to
  // OnSpeechEvent() on the single instance.
  static void __stdcall SpeechEventCallback(WPARAM w_param, LPARAM l_param);

  // TtsPlatformImpl:
  bool PlatformImplAvailable() override { return true; }
  void Speak(int utterance_id,
             const std::string& utterance,
             const std::string& lang,
             const VoiceData& voice,
             const UtteranceContinuousParameters& params,
             base::OnceCallback<void(bool)> on_speak_finished) override;
  bool StopSpeaking() override;
  void Pause() override;
  void Resume() override;
  bool IsSpeaking() override;
  void GetVoices(std::vector<VoiceData>* out_voices) override;

 private:
  friend struct base::DefaultSingletonTraits<TtsPlatformImplWin>;

  TtsPlatformImplWin();
  ~TtsPlatformImplWin() override = default;

  // Drains pending synthesizer events and reports the ones belonging to the
  // current stream to the TtsController.
  void OnSpeechEvent();

  // Selects the synthesizer voice whose description equals |name|.
  void SetVoiceFromName(const std::string& name);

  // Second half of Speak(): receives the utterance text after SSML stripping
  // and hands it to the synthesizer.
  void ProcessSpeech(int utterance_id,
                     const std::string& lang,
                     const VoiceData& voice,
                     const UtteranceContinuousParameters& params,
                     base::OnceCallback<void(bool)> on_speak_finished,
                     const std::string& parsed_utterance);

  // The speech synthesizer; null when it could not be created.
  Microsoft::WRL::ComPtr<ISpVoice> speech_synthesizer_;

  // State of the current utterance only.
  // Text being spoken, without the XML prefix/suffix added around it.
  std::wstring utterance_;
  // Id reported with every event sent to the TtsController.
  int utterance_id_;
  // Length of the XML prefix; subtracted from positions reported in events.
  int prefix_len_;
  // Stream number filled in by ISpVoice::Speak(); events carrying a different
  // stream number are ignored.
  ULONG stream_number_;
  // Position and length taken from the most recent boundary events.
  int char_position_;
  int char_length_;
  // True between Pause() and the matching Resume()/StopSpeaking().
  bool paused_;

  // Voice name remembered by SetVoiceFromName() so that repeated requests for
  // the same voice skip the lookup.
  std::string last_voice_name_;

  base::WeakPtrFactory<TtsPlatformImplWin> weak_factory_;

  DISALLOW_COPY_AND_ASSIGN(TtsPlatformImplWin);
};
// static
// The platform implementation handed out here is the TtsPlatformImplWin
// instance.
TtsPlatformImpl* TtsPlatformImpl::GetInstance() {
  TtsPlatformImplWin* const windows_impl = TtsPlatformImplWin::GetInstance();
  return windows_impl;
}
// Entry point for speaking |utterance|. The text is first passed to
// TtsController::StripSSML(); the remaining arguments are bound to
// ProcessSpeech(), which receives the stripped text as its last argument.
// The bound receiver is a weak pointer to this object.
void TtsPlatformImplWin::Speak(
    int utterance_id,
    const std::string& utterance,
    const std::string& lang,
    const VoiceData& voice,
    const UtteranceContinuousParameters& params,
    base::OnceCallback<void(bool)> on_speak_finished) {
  // Everything except the parsed text is known now, so bind it up front.
  auto speak_stripped_text = base::BindOnce(
      &TtsPlatformImplWin::ProcessSpeech, weak_factory_.GetWeakPtr(),
      utterance_id, lang, voice, params, std::move(on_speak_finished));

  // Parse SSML, then continue in ProcessSpeech().
  TtsController::GetInstance()->StripSSML(utterance,
                                          std::move(speak_stripped_text));
}
// Applies the voice, rate, pitch and volume for one utterance and starts
// speaking it asynchronously.
//
// |utterance_id|      id reported with the events of this utterance.
// |lang|              unused here.
// |voice|             only |voice.name| is used, to select the voice.
// |params|            rate/pitch/volume; a negative value leaves the
//                     corresponding setting untouched.
// |on_speak_finished| run exactly once: with false when there is no
//                     synthesizer, otherwise with whether ISpVoice::Speak()
//                     returned S_OK.
// |parsed_utterance|  UTF-8 text to speak (SSML already stripped by caller).
void TtsPlatformImplWin::ProcessSpeech(
    int utterance_id,
    const std::string& lang,
    const VoiceData& voice,
    const UtteranceContinuousParameters& params,
    base::OnceCallback<void(bool)> on_speak_finished,
    const std::string& parsed_utterance) {
  if (!speech_synthesizer_.Get()) {
    std::move(on_speak_finished).Run(false);
    return;
  }

  SetVoiceFromName(voice.name);

  if (params.rate >= 0.0) {
    // Map our multiplicative range of 0.1x to 10.0x onto Microsoft's
    // linear range of -10 to 10:
    //   0.1 -> -10
    //   1.0 -> 0
    //   10.0 -> 10
    // Clamp while still in floating point: a rate of 0 yields -infinity, and
    // converting an out-of-range floating-point value to an integer is
    // undefined behaviour.
    double sapi_rate = 10 * log10(params.rate);
    if (sapi_rate < -10.0)
      sapi_rate = -10.0;
    else if (sapi_rate > 10.0)
      sapi_rate = 10.0;
    speech_synthesizer_->SetRate(static_cast<int32_t>(sapi_rate));
  }

  std::wstring prefix;
  std::wstring suffix;
  if (params.pitch >= 0.0) {
    // The TTS api allows a range of -10 to 10 for speech pitch.
    // TODO(dtseng): cleanup if we ever use any other properties that
    // require xml.
    std::wstring pitch_value = base::NumberToString16(params.pitch * 10 - 10);
    prefix = L"<pitch absmiddle=\"" + pitch_value + L"\">";
    suffix = L"</pitch>";
  }

  if (params.volume >= 0.0) {
    // The TTS api allows a range of 0 to 100 for speech volume. Cap the value
    // before the narrowing cast so an oversized volume cannot wrap around.
    double sapi_volume = params.volume * 100;
    if (sapi_volume > 100.0)
      sapi_volume = 100.0;
    speech_synthesizer_->SetVolume(static_cast<uint16_t>(sapi_volume));
  }

  // TODO(dmazzoni): convert SSML to SAPI xml. http://crbug.com/88072

  // Reset the per-utterance state before any event can refer to it.
  utterance_ = base::UTF8ToWide(parsed_utterance);
  utterance_id_ = utterance_id;
  char_position_ = 0;
  char_length_ = 0;

  // Event positions are relative to the merged string, so remember how much
  // was prepended in order to translate them back.
  std::wstring merged_utterance = prefix + utterance_ + suffix;
  prefix_len_ = static_cast<int>(prefix.size());

  HRESULT result = speech_synthesizer_->Speak(merged_utterance.c_str(),
                                              SPF_ASYNC, &stream_number_);
  std::move(on_speak_finished).Run((result == S_OK));
}
// Stops any speech in progress and leaves the synthesizer un-paused.
// Always returns true, including when there is no synthesizer.
bool TtsPlatformImplWin::StopSpeaking() {
  if (!speech_synthesizer_.Get())
    return true;

  // Forget the stream number first so that events still queued for the
  // stopped utterance are ignored.
  stream_number_ = 0;

  // Speaking nullptr with the purge flag discards whatever is queued.
  if (IsSpeaking()) {
    speech_synthesizer_->Speak(nullptr, SPF_ASYNC | SPF_PURGEBEFORESPEAK,
                               nullptr);
  }

  // Undo an outstanding Pause().
  if (paused_) {
    speech_synthesizer_->Resume();
    paused_ = false;
  }

  return true;
}
void TtsPlatformImplWin::Pause() {
if (speech_synthesizer_.Get() && utterance_id_ && !paused_) {
speech_synthesizer_->Pause();
paused_ = true;
TtsController::GetInstance()->OnTtsEvent(utterance_id_, TTS_EVENT_PAUSE,
char_position_, -1, "");
}
}
void TtsPlatformImplWin::Resume() {
if (speech_synthesizer_.Get() && utterance_id_ && paused_) {
speech_synthesizer_->Resume();
paused_ = false;
TtsController::GetInstance()->OnTtsEvent(utterance_id_, TTS_EVENT_RESUME,
char_position_, -1, "");
}
}
bool TtsPlatformImplWin::IsSpeaking() {
if (speech_synthesizer_.Get()) {
SPVOICESTATUS status;
HRESULT result = speech_synthesizer_->GetStatus(&status, NULL);
if (result == S_OK) {
if (status.dwRunningState == 0 || // 0 == waiting to speak
status.dwRunningState == SPRS_IS_SPEAKING) {
return true;
}
}
}
return false;
}
void TtsPlatformImplWin::GetVoices(std::vector<VoiceData>* out_voices) {
Microsoft::WRL::ComPtr<IEnumSpObjectTokens> voice_tokens;
unsigned long voice_count;
if (S_OK !=
SpEnumTokens(SPCAT_VOICES, NULL, NULL, voice_tokens.GetAddressOf()))
return;
if (S_OK != voice_tokens->GetCount(&voice_count))
return;
for (unsigned i = 0; i < voice_count; i++) {
VoiceData voice;
Microsoft::WRL::ComPtr<ISpObjectToken> voice_token;
if (S_OK != voice_tokens->Next(1, voice_token.GetAddressOf(), NULL))
return;
base::win::ScopedCoMem<WCHAR> description;
if (S_OK != SpGetDescription(voice_token.Get(), &description))
continue;
voice.name = base::WideToUTF8(description.get());
Microsoft::WRL::ComPtr<ISpDataKey> attributes;
if (S_OK != voice_token->OpenKey(kAttributesKey, attributes.GetAddressOf()))
continue;
base::win::ScopedCoMem<WCHAR> language;
if (S_OK == attributes->GetStringValue(kLanguageValue, &language)) {
int lcid_value;
base::HexStringToInt(base::WideToUTF8(language.get()), &lcid_value);
LCID lcid = MAKELCID(lcid_value, SORT_DEFAULT);
WCHAR locale_name[LOCALE_NAME_MAX_LENGTH] = {0};
LCIDToLocaleName(lcid, locale_name, LOCALE_NAME_MAX_LENGTH, 0);
voice.lang = base::WideToUTF8(locale_name);
}
voice.native = true;
voice.events.insert(TTS_EVENT_START);
voice.events.insert(TTS_EVENT_END);
voice.events.insert(TTS_EVENT_MARKER);
voice.events.insert(TTS_EVENT_WORD);
voice.events.insert(TTS_EVENT_SENTENCE);
voice.events.insert(TTS_EVENT_PAUSE);
voice.events.insert(TTS_EVENT_RESUME);
out_voices->push_back(voice);
}
}
void TtsPlatformImplWin::OnSpeechEvent() {
TtsController* controller = TtsController::GetInstance();
SPEVENT event;
while (S_OK == speech_synthesizer_->GetEvents(1, &event, NULL)) {
if (event.ulStreamNum != stream_number_)
continue;
switch (event.eEventId) {
case SPEI_START_INPUT_STREAM:
controller->OnTtsEvent(utterance_id_, TTS_EVENT_START, 0, -1,
std::string());
break;
case SPEI_END_INPUT_STREAM:
char_position_ = utterance_.size();
controller->OnTtsEvent(utterance_id_, TTS_EVENT_END, char_position_, 0,
std::string());
break;
case SPEI_TTS_BOOKMARK:
controller->OnTtsEvent(utterance_id_, TTS_EVENT_MARKER, char_position_,
-1, std::string());
break;
case SPEI_WORD_BOUNDARY:
char_position_ = static_cast<ULONG>(event.lParam) - prefix_len_;
char_length_ = static_cast<ULONG>(event.wParam);
controller->OnTtsEvent(utterance_id_, TTS_EVENT_WORD, char_position_,
char_length_, std::string());
break;
case SPEI_SENTENCE_BOUNDARY:
char_position_ = static_cast<ULONG>(event.lParam) - prefix_len_;
controller->OnTtsEvent(utterance_id_, TTS_EVENT_SENTENCE,
char_position_, -1, std::string());
break;
default:
break;
}
}
}
void TtsPlatformImplWin::SetVoiceFromName(const std::string& name) {
if (name.empty() || name == last_voice_name_)
return;
last_voice_name_ = name;
Microsoft::WRL::ComPtr<IEnumSpObjectTokens> voice_tokens;
unsigned long voice_count;
if (S_OK !=
SpEnumTokens(SPCAT_VOICES, NULL, NULL, voice_tokens.GetAddressOf()))
return;
if (S_OK != voice_tokens->GetCount(&voice_count))
return;
for (unsigned i = 0; i < voice_count; i++) {
Microsoft::WRL::ComPtr<ISpObjectToken> voice_token;
if (S_OK != voice_tokens->Next(1, voice_token.GetAddressOf(), NULL))
return;
base::win::ScopedCoMem<WCHAR> description;
if (S_OK != SpGetDescription(voice_token.Get(), &description))
continue;
if (name == base::WideToUTF8(description.get())) {
speech_synthesizer_->SetVoice(voice_token.Get());
break;
}
}
}
// Creates the speech synthesizer and, if that worked, subscribes to the
// events handled by OnSpeechEvent() and registers SpeechEventCallback() as
// the notification callback. When creation fails |speech_synthesizer_| stays
// null.
TtsPlatformImplWin::TtsPlatformImplWin()
    : utterance_id_(0),
      prefix_len_(0),
      stream_number_(0),
      char_position_(0),
      char_length_(0),  // Previously left uninitialized.
      paused_(false),
      weak_factory_(this) {
  ::CoCreateInstance(CLSID_SpVoice, nullptr, CLSCTX_ALL,
                     IID_PPV_ARGS(&speech_synthesizer_));
  if (speech_synthesizer_.Get()) {
    // The same mask is passed for both SetInterest() arguments.
    ULONGLONG event_mask =
        SPFEI(SPEI_START_INPUT_STREAM) | SPFEI(SPEI_TTS_BOOKMARK) |
        SPFEI(SPEI_WORD_BOUNDARY) | SPFEI(SPEI_SENTENCE_BOUNDARY) |
        SPFEI(SPEI_END_INPUT_STREAM);
    speech_synthesizer_->SetInterest(event_mask, event_mask);
    speech_synthesizer_->SetNotifyCallbackFunction(
        TtsPlatformImplWin::SpeechEventCallback, 0, 0);
  }
}
// static
// Returns the instance held by base::Singleton with LeakySingletonTraits.
TtsPlatformImplWin* TtsPlatformImplWin::GetInstance() {
  using InstanceHolder =
      base::Singleton<TtsPlatformImplWin,
                      base::LeakySingletonTraits<TtsPlatformImplWin>>;
  return InstanceHolder::get();
}
// static
// Notification callback registered with the synthesizer. Both parameters are
// ignored; the pending events are processed by the single instance.
void TtsPlatformImplWin::SpeechEventCallback(WPARAM w_param, LPARAM l_param) {
  TtsPlatformImplWin* const instance = GetInstance();
  instance->OnSpeechEvent();
}
} // namespace content