blob: 8dd2917024805257632ce69b0c9520ae3d206b98 [file] [log] [blame]
// Copyright 2018 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by the GPL license that can be
// found in the LICENSE file.
/**
* @fileoverview eSpeak text-to-speech extension.
*
* There are three components to the extension:
*
* The open-source eSpeak-NG engine is compiled using Emscripten into a
* WebAssembly module that's run in a web worker. The interface to it is
* stored in this.tts_. This class sends it asynchronous queries for
* a list of voices and commands to start generating speech. Callbacks
* from the engine queue up events like word callbacks, and generate
* audio data.
*
* The implementation of the chrome.ttsEngine API is how we get
* commands to speak and stop. We also use this API to publish the
* current set of enabled voices.
*
* Finally, the audio data generated by the engine is output using
 * a private mojo interface. This is an intermediate solution that will
 * serve as motivation for a new chrome.ttsEngine API.
*/
/**
 * Implements a Chrome text-to-speech engine on top of the eSpeak-NG
 * WebAssembly port. Bridges three components: the WASM engine (`this.tts_`,
 * which synthesizes audio in a worker), the chrome.ttsEngine API (speak/stop
 * requests in, progress events out), and a private mojo playback stream
 * (`this.ttsStream_`) that receives the raw audio buffers.
 */
class EspeakTtsEngine {
  constructor() {
    /**
     * The callback provided by a call to chrome.ttsEngine.onSpeak, which is
     * our channel to send status updates about the progress of speech.
     * Cleared on stop so late callbacks from the engine are dropped.
     * @private {?function(Object)}
     */
    this.ttsEngineApiCallback_ = null;

    /**
     * The string containing the text we're currently speaking.
     * @private {?string}
     */
    this.utterance_ = null;

    /**
     * An id to track the current utterance, for example to ensure audio data
     * for previous utterances is ignored. Incremented on every onSpeak call.
     * @private {number}
     */
    this.utteranceId_ = 0;

    /**
     * The handle to the WASM eSpeak-NG module.
     * @private {?Object}
     */
    this.tts_ = null;

    /**
     * Sample rate requested for eSpeak to generate audio. Declared here
     * Closure-style with no initial value; assigned in finishInitialization_
     * from the mojo playback stream.
     * @private {number|undefined}
     */
    this.sampleRate_;

    /**
     * The size, in samples, of each audio frame. When using an
     * AudioWorkletProcessor this is always 128 samples. Assigned in
     * finishInitialization_ from the mojo playback stream.
     * @private {number|undefined}
     */
    this.bufferLen_;

    /**
     * The current buffer of audio samples from the engine. When each
     * buffer fills up it's sent to the audio worklet processor thread.
     * @private {?Float32Array}
     */
    this.currentBuffer_ = null;

    /**
     * The current zero-based index into |this.currentBuffer_| where the
     * next audio sample should be written.
     * @private {number}
     */
    this.currentBufferIndex_ = 0;

    /**
     * Whether or not we're currently playing audio.
     * This is not set to true until after we've got at least one
     * buffer worth of samples from the engine and sent it to be played.
     * @private {boolean}
     */
    this.playing_ = false;

    /**
     * The voice data for the languages that are supported by eSpeak
     * and enabled by the user. The elements of this array are in the
     * format needed by the chrome.ttsEngine API.
     * @private {!Array<Object>}
     */
    this.ttsEngineApiVoices_ = [];

    /**
     * The data about each enabled language in eSpeak's format.
     * Used when we get a speech request that specifies a language
     * code but not a specific complete voice name.
     * @private {!Array<Object>}
     */
    this.internalVoiceInfos_ = [];

    /**
     * The data about every supported eSpeak language, in eSpeak's
     * format. Used by the options page to present all languages to
     * the user and let them choose which ones to enable.
     * @private {!Array<Object>}
     */
    this.langInfoArray_ = [];

    /**
     * Function to call when langInfoArray_ is populated.
     * @private {?function(!Array<Object>)}
     */
    this.pendingGetLangInfoArrayCallback_ = null;

    /**
     * The default voice to use when no metadata is provided at all.
     * Computed in finishUpdatingVoices_ from navigator.language.
     * @private {?string}
     */
    this.defaultEspeakVoiceName_ = null;

    /**
     * The timestamp when speech started. Used to fire events like
     * word callbacks.
     * @private {?Date}
     */
    this.startTime_ = null;

    /**
     * Callback to run after initialization (e.g. a deferred onSpeak
     * request that arrived before initialization completed).
     * @private {?function()}
     */
    this.pendingCallback_ = null;

    /**
     * True if initialization is done.
     * @private {boolean}
     */
    this.initialized_ = false;

    // Initialize the WASM module and call updateVoices when it's ready.
    this.tts_ = new eSpeakNG('js/espeakng.worker.js', (function() {
      // updateVoices() eventually calls this.finishInitialization_() (once
      // all per-language enabled checks resolve), which sets up the mojo
      // audio playback stream.
      this.updateVoices();
    }).bind(this));

    // Start listening to chrome.ttsEngine requests.
    chrome.ttsEngine.onSpeak.addListener(this.onSpeak.bind(this));
    chrome.ttsEngine.onStop.addListener(this.onStop.bind(this));
  }

  /**
   * Returns the array of all languages supported by eSpeak. Called by
   * the options page.
   * @param {function(!Array<Object>)} callback Invoked synchronously if the
   *     list is already loaded, otherwise once updateVoices populates it.
   */
  getLangInfoArray(callback) {
    if (this.langInfoArray_.length > 0) {
      callback(this.langInfoArray_);
      return;
    }
    // Not loaded yet; updateVoices invokes this when the list arrives.
    // NOTE(review): only one pending callback is kept, so a second caller
    // would overwrite the first — confirm the options page calls only once.
    this.pendingGetLangInfoArrayCallback_ = callback;
  };

  /**
   * Queries the engine for its voice list, filters it to the enabled
   * languages, and kicks off the remainder of initialization.
   * Called on startup and when the set of enabled voices may have changed.
   */
  updateVoices() {
    console.log('updateVoices');
    this.tts_.list_voices(
        (function(langInfoArray) {
          this.langInfoArray_ = langInfoArray;
          // Remove Japanese because the voice cannot pronounce many
          // characters.
          for (var i = 0; i < this.langInfoArray_.length; ++i) {
            if (this.langInfoArray_[i].identifier === 'jpx/ja') {
              this.langInfoArray_.splice(i, 1);
              break;
            }
          }
          this.ttsEngineApiVoices_ = [];
          this.internalVoiceInfos_ = [];
          // Countdown latch: each async isEspeakLanguageEnabled reply
          // decrements this; the last reply triggers finishInitialization_().
          // NOTE(review): if langInfoArray_ is empty, no replies arrive and
          // finishInitialization_() is never called — confirm the engine
          // always reports at least one language.
          this.langsRemaining_ = this.langInfoArray_.length;
          this.langInfoArray_.forEach(
              (function(langInfo) {
                isEspeakLanguageEnabled(
                    langInfo,
                    (function(enabled) {
                      if (enabled) {
                        // Advertised voice name, e.g. 'eSpeak English'.
                        var voiceName = 'eSpeak ' + langInfo.name;
                        // Entry in chrome.ttsEngine format.
                        var ttsEngineApiVoice = {
                          voiceName: voiceName,
                          lang: langInfo.languages[0].name,
                          remote: false,
                          eventTypes:
                              ['start', 'end', 'word', 'sentence', 'error']
                        };
                        this.ttsEngineApiVoices_.push(ttsEngineApiVoice);
                        // Parallel entry keeping eSpeak's own name and the
                        // full language list, for voice matching later.
                        var internalVoiceInfo = {
                          voiceName: voiceName,
                          espeakVoiceName: langInfo.name,
                          languages: langInfo.languages
                        };
                        this.internalVoiceInfos_.push(internalVoiceInfo);
                      }
                      this.langsRemaining_--;
                      if (this.langsRemaining_ == 0) {
                        this.finishInitialization_();
                      }
                    }).bind(this));
              }).bind(this));
          // Satisfy any options-page request that arrived before the list
          // was loaded.
          if (this.pendingGetLangInfoArrayCallback_) {
            this.pendingGetLangInfoArrayCallback_(this.langInfoArray_);
            this.pendingGetLangInfoArrayCallback_ = null;
          }
        }).bind(this));
  };

  /**
   * Called after asynchronously getting the list of languages
   * supported by eSpeak and filtering it to only the ones
   * currently enabled and converting that to the format expected
   * by the chrome.ttsEngine API.
   *
   * Calls chrome.ttsEngine.updateVoices to advertise the set of
   * voices currently enabled, and updates the default voice to
   * be used if someone requests speech without specifying any
   * other metadata.
   * @private
   */
  finishUpdatingVoices_() {
    console.log('finishUpdatingVoices_');
    chrome.ttsEngine.updateVoices(this.ttsEngineApiVoices_);
    console.log('Loaded ' + this.ttsEngineApiVoices_.length + ' voices');
    // Empty voice name forces a language-based match on the UI language.
    this.defaultEspeakVoiceName_ =
        this.getBestEspeakVoice('', navigator.language);
  };

  /**
   * Finish initialization by initializing the playback system.
   * This is the final step, then we can start speaking.
   * Creates the mojo playback stream and adopts its sample rate and buffer
   * size, then runs any speak request queued while initializing.
   * @private
   */
  finishInitialization_() {
    chrome.mojoPrivate.requireAsync('chromeos.tts.stream_factory')
        .then(factory => {
          factory.createPlaybackTtsStream().then(result => {
            this.ttsStream_ = result.stream;
            this.sampleRate_ = result.sampleRate;
            this.bufferLen_ = result.bufferSize;
            this.finishUpdatingVoices_();
            // Initialization is now complete.
            this.initialized_ = true;
            if (this.pendingCallback_) {
              this.pendingCallback_();
              this.pendingCallback_ = null;
            }
          });
        });
  }

  /**
   * Called by the client to stop speech.
   */
  onStop() {
    this.ttsStream_.stop();
    this.onStopInternal();
  }

  /**
   * Resets per-utterance state. Clearing ttsEngineApiCallback_ makes
   * processSamples and scheduleTimepointEvent ignore any late engine output.
   */
  onStopInternal() {
    this.pendingCallback_ = null;
    this.playing_ = false;
    this.ttsEngineApiCallback_ = null;
  };

  /**
   * Called by the client to start speech synthesis.
   *
   * @param {string} utterance The utterance to say.
   * @param {Object} options The options affecting the speech, like language,
   *     pitch, rate, etc.
   * @param {function(Object)} callback The function to receive progress
   *     events ({type, charIndex}) from the engine.
   */
  onSpeak(utterance, options, callback) {
    // If the mojo pipe to the audio stream was dropped, this page is stale;
    // close it so a fresh instance can be created.
    if (this.ttsStream_ && !this.ttsStream_.ptr.isBound()) {
      window.close();
      return;
    }
    console.log('Will speak: "' + utterance + '" lang="' + options.lang + '"');
    // Bumping the id invalidates audio callbacks from any prior utterance.
    this.utteranceId_++;
    this.pendingCallback_ = null;
    if (!this.initialized_) {
      // We got a call to speak before we're initialized. Enqueue this until
      // we're ready.
      this.pendingCallback_ =
          this.onSpeak.bind(this, utterance, options, callback);
      return;
    }
    this.ttsEngineApiCallback_ = callback;
    this.utterance_ = utterance;
    var espeakVoiceName =
        this.getBestEspeakVoice(options.voiceName, options.lang);
    this.tts_.set_voice(espeakVoiceName);
    this.tts_.set_systemSampleRate(this.sampleRate_);

    // Chrome TTS rates range from 0.1 to 10.0, with 1.0 as the default.
    // eSpeak rates range from 80 to 450, with 175 as the default.
    var rate = Math.min(Math.max(Math.floor(options.rate * 175), 80), 450);
    this.tts_.set_rate(rate);

    // Chrome TTS pitches range from 0.0 to 2.0, with 1.0 as the default.
    // eSpeak pitches range from 0 to 99, with 50 as the default.
    var pitch = Math.min(Math.max(Math.floor(options.pitch * 50), 0), 99);
    this.tts_.set_pitch(pitch);

    // Volume is clamped to [0, 1] and applied on the playback stream, not
    // in the engine.
    var volume = Math.min(Math.max(options.volume, 0.0), 1.0);
    this.ttsStream_.setVolume(volume);

    // Observe playback-side events (start/timepoint/end) over mojo and
    // forward them to the chrome.ttsEngine callback. |owner| captures
    // |this| for the anonymous observer class below.
    const owner = this;
    this.ttsStream_.play().then(ttsEventObserver => {
      new mojo.Binding(chromeos.tts.mojom.TtsEventObserver, new class {
        onStart() {
          owner.scheduleTimepointEvent(0, 'start');
        }
        onTimepoint(charIndex) {
          owner.scheduleTimepointEvent(charIndex, 'word');
        }
        onEnd() {
          owner.scheduleTimepointEvent(owner.utterance_.length, 'end');
        }
        onError() {}
      }(), ttsEventObserver.eventObserver);
    });

    // Receives audio + timing events from the WASM engine. Bound below with
    // the utterance id current at synthesis start, so stale callbacks for a
    // superseded utterance are ignored.
    var handleEvents =
        function(utteranceId, samples, events) {
          if (this.utteranceId_ != utteranceId) {
            return;
          }
          // 'end' type events can still be followed by audio data. This is
          // a more reliable way the native code signals end: a callback
          // with no events array at all.
          let isEnd = events === undefined;
          let charIndex = -1;
          if (events) {
            // Only the last event's text position is kept; the switch is
            // currently a no-op for all event types.
            for (const event of events) {
              charIndex = event.text_position;
              switch (event.type) {
                case 'sentence':
                case 'word':
                case 'end':
                  break;
              }
            }
          }
          var sampleArray = new Float32Array(samples);
          this.processSamples(sampleArray, charIndex, isEnd);
        }
    this.tts_.synthesize(
        utterance, (handleEvents).bind(this, this.utteranceId_));
  };

  /**
   * Given a voice name and language, determine the best eSpeak voice
   * to use. Sometimes a partial match needs to be used, for example
   * because the language requested is just 'en', but eSpeak has
   * 'en-US', 'en-GB', etc.
   * @param {string} desiredVoiceName Full advertised voice name
   *     ('eSpeak <name>'), possibly empty.
   * @param {string} desiredLang Language code, compared case-insensitively.
   *     NOTE(review): assumed to be a string; onSpeak passes options.lang
   *     which may be undefined — confirm callers always supply a language.
   * @return {?string} The chosen voice name, or the default if no match.
   */
  getBestEspeakVoice(desiredVoiceName, desiredLang) {
    var exactMatchEspeakVoiceName = null;
    var langMatchEspeakVoiceName = null;
    var langMatch = '';
    this.internalVoiceInfos_.forEach(
        (function(voice) {
          if (desiredVoiceName == voice.voiceName)
            exactMatchEspeakVoiceName = voice.espeakVoiceName;
          voice.languages.forEach(
              (function(lang) {
                if (desiredLang.toLowerCase() == lang.name.toLowerCase() &&
                    lang.name.length > langMatch.length) {
                  langMatch = lang.name;
                  // NOTE(review): the exact-match branch above stores
                  // voice.espeakVoiceName, but this stores voice.voiceName
                  // (the 'eSpeak <name>' form). Confirm set_voice accepts
                  // both forms; otherwise this looks inconsistent.
                  langMatchEspeakVoiceName = voice.voiceName;
                }
              }).bind(this));
        }).bind(this));
    if (exactMatchEspeakVoiceName) return exactMatchEspeakVoiceName;
    if (langMatchEspeakVoiceName) return langMatchEspeakVoiceName;
    return this.defaultEspeakVoiceName_;
  };

  /**
   * Called when we get samples back from the WebAssembly TTS engine.
   * Fills up buffers with samples, and as each buffer fills, sends it to be
   * played.
   *
   * If these are the first samples for a new utterance, marks playback as
   * started and records the start time.
   * @param {!Float32Array} samples Raw audio samples from the engine.
   * @param {number} charIndex Text position for this chunk, or -1 if none;
   *     attached to the first buffer sent, then cleared.
   * @param {boolean} isEnd True if this is the final chunk of the utterance.
   */
  processSamples(samples, charIndex, isEnd) {
    // No callback means we were stopped; drop late audio.
    if (!this.ttsEngineApiCallback_) {
      return;
    }

    var i = 0;
    var len = samples.length;
    var didSendBuffers = false;
    while (i < len) {
      // Copy as much as fits into the current fixed-size buffer.
      var chunkLen =
          Math.min(this.bufferLen_ - this.currentBufferIndex_, len - i);
      if (!this.currentBuffer_) {
        this.currentBuffer_ = new Float32Array(this.bufferLen_);
      }
      this.currentBuffer_.set(
          samples.subarray(i, i + chunkLen), this.currentBufferIndex_);
      i += chunkLen;
      this.currentBufferIndex_ += chunkLen;
      if (this.currentBufferIndex_ == this.bufferLen_) {
        this.ttsStream_.sendAudioBuffer(this.currentBuffer_, charIndex, isEnd);
        didSendBuffers = true;
        charIndex = -1;
        this.currentBufferIndex_ = 0;
        this.currentBuffer_ = null;
      }
    }

    // Push final buffer, not complete.
    if (isEnd && this.currentBufferIndex_ > 0) {
      this.ttsStream_.sendAudioBuffer(this.currentBuffer_, charIndex, isEnd);
      didSendBuffers = true;
      this.currentBufferIndex_ = 0;
      this.currentBuffer_ = null;
    }

    // Connect and start playback if this is the
    // first buffer.
    if (didSendBuffers && !this.playing_) {
      this.playing_ = true;
      this.startTime_ = new Date();
    }
  };

  /**
   * Schedules an event to be fired indicating progress of speech synthesis.
   * Silently dropped after a stop (no callback) or for negative positions.
   * @param {number} textPosition Zero-based character index into the
   *     utterance.
   * @param {string} eventType One of the chrome.ttsEngine event types, e.g.
   *     'start', 'word', 'end'.
   */
  scheduleTimepointEvent(textPosition, eventType) {
    if (!this.ttsEngineApiCallback_) {
      return;
    }

    if (textPosition < 0) {
      return;
    }

    this.ttsEngineApiCallback_({'type': eventType, 'charIndex': textPosition});
  }
}
// With a 'split' incognito manifest, the extension system runs two copies of
// this extension — one in an incognito context, one not — while guest mode
// runs only the incognito copy. Instantiate the engine in exactly one
// context so it is never doubled: when both copies run, the incognito one
// survives and the regular-context page closes itself.
const {incognito} =
    /** @type {{incognito: (string|undefined)}} */ (
        chrome.runtime.getManifest());
const isRedundantCopy =
    incognito === 'split' && !chrome.extension.inIncognitoContext;
if (isRedundantCopy) {
  window.close();
} else {
  window.engine = new EspeakTtsEngine();
}