// Copyright 2018 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by the GPL license that can be
// found in the LICENSE file.
/**
* @fileoverview eSpeak text-to-speech extension.
*
* There are three components to the extension:
*
* The open-source eSpeak-NG engine is compiled using Emscripten into a
* WebAssembly module that's run in a web worker. The interface to it is
* stored in this.tts_. This class sends it asynchronous queries for
* a list of voices and commands to start generating speech. Callbacks
* from the engine deliver progress events (like word boundaries) and
* the generated audio data.
*
* The implementation of the chrome.ttsEngine API is how we get
* commands to speak and stop. We also use this API to publish the
* current set of enabled voices.
*
* Finally, the audio data generated by the engine is output using
* the Web Audio API by creating an AudioWorkletNode with a companion
* AudioWorkletProcessor that runs in the audio thread, providing
* low-latency real-time audio output that's unaffected by any contention
* on the main thread.
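*
* The resulting data flow: the eSpeak-NG worker produces audio
* samples, processSamples() packs them into 128-sample buffers and
* posts them to the AudioWorkletNode, which feeds the gain node
* (volume control) and then the audio context destination.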
*/
class EspeakTtsEngine {
constructor() {
/**
* The callback provided by a call to chrome.ttsEngine.onSpeak, which is
* our channel to send status updates about the progress of speech.
*/
this.ttsEngineApiCallback_ = null;
/**
* The string containing the text we're currently speaking.
*/
this.utterance_ = null;
/**
* The handle to the WASM eSpeak-NG module.
*/
this.tts_ = null;
/**
* The size, in samples, of each audio frame. When using an
* AudioWorkletProcessor this is always 128 samples.
*/
this.bufferLen_ = 128;
/**
* The current buffer of audio samples from the engine. When each
* buffer fills up it's sent to the audio worklet processor thread.
*/
this.currentBuffer_ = null;
/**
* The current zero-based index into |this.currentBuffer_| where the
* next audio sample should be written.
*/
this.currentBufferIndex_ = 0;
/**
* The audio context, needed by any app using the Web Audio API.
*/
this.context_ = new AudioContext();
/**
* A node that allows us to adjust the output volume. The
* AudioWorkletNode connects to the gain node, and the gain node
* connects to the audio context destination.
*/
this.gainNode_ = this.context_.createGain();
/**
* Whether or not we're currently playing audio via the Web Audio
* API. This is not set to true until we've received at least one
* buffer's worth of samples from the engine and sent it to the
* AudioWorkletProcessor.
*/
this.playing_ = false;
/**
* The voice data for the languages that are supported by eSpeak
* and enabled by the user. The elements of this array are in the
* format needed by the chrome.ttsEngine API.
*/
this.ttsEngineApiVoices_ = [];
/**
* The data about each enabled language in eSpeak's format.
* Used when we get a speech request that specifies a language
* code but not a specific complete voice name.
*/
this.internalVoiceInfos_ = [];
/**
* The data about every supported eSpeak language, in eSpeak's
* format. Used by the options page to present all languages to
* the user and let them choose which ones to enable.
*/
this.langInfoArray_ = [];
/**
* Function to call when langInfoArray_ is populated.
*/
this.pendingGetLangInfoArrayCallback_ = null;
/**
* The default voice to use when no metadata is provided at all.
*/
this.defaultEspeakVoiceName_ = null;
/**
* The timestamp when speech started. Used to fire events like
* word callbacks.
*/
this.startTime_ = null;
/**
* The queue of events that need to be scheduled to fire when
* audio playback of the utterance begins.
*/
this.eventQueue_ = [];
/**
* The timeout IDs (from calling window.setTimeout) of event
* callbacks that have already been scheduled. These are cleared
* when we get a call to stop speech.
*/
this.timeoutIds_ = [];
/**
* Callback to run after initialization.
*/
this.pendingCallback_ = null;
/**
* True if initialization is done.
*/
this.initialized_ = false;
// Initialize the WASM module and call updateVoices when it's ready.
this.tts_ = new eSpeakNG('js/espeakng.worker.js', (function() {
// This will call this.finishInitialization_() when done,
// which will initialize the audio worklet.
this.updateVoices();
}).bind(this));
// Start listening to chrome.ttsEngine requests.
chrome.ttsEngine.onSpeak.addListener(this.onSpeak.bind(this));
chrome.ttsEngine.onStop.addListener(this.onStop.bind(this));
}
/**
* Return the array of all languages supported by eSpeak. Called by
* the options page.
*/
getLangInfoArray(callback) {
if (this.langInfoArray_.length > 0) {
callback(this.langInfoArray_);
return;
}
this.pendingGetLangInfoArrayCallback_ = callback;
};
/**
* Function called on startup and when the set of enabled voices
* may have changed.
*/
updateVoices() {
console.log('updateVoices');
this.tts_.list_voices((function (langInfoArray) {
this.langInfoArray_ = langInfoArray;
// Remove Japanese because the voice cannot pronounce many characters.
for (var i = 0; i < this.langInfoArray_.length; ++i) {
if (this.langInfoArray_[i].identifier === 'jpx/ja') {
this.langInfoArray_.splice(i, 1);
break;
}
}
this.ttsEngineApiVoices_ = [];
this.internalVoiceInfos_ = [];
this.langsRemaining_ = this.langInfoArray_.length;
this.langInfoArray_.forEach((function(langInfo) {
isEspeakLanguageEnabled(langInfo, (function(enabled) {
if (enabled) {
var voiceName = 'eSpeak ' + langInfo.name;
var ttsEngineApiVoice = {
voiceName: voiceName,
lang: langInfo.languages[0].name,
remote: false,
eventTypes: [
'start', 'end', 'word', 'sentence', 'error'
]};
this.ttsEngineApiVoices_.push(ttsEngineApiVoice);
var internalVoiceInfo = {
voiceName: voiceName,
espeakVoiceName: langInfo.name,
languages: langInfo.languages
};
this.internalVoiceInfos_.push(internalVoiceInfo);
}
this.langsRemaining_--;
if (this.langsRemaining_ == 0) {
this.finishUpdatingVoices_();
}
}).bind(this));
}).bind(this));
if (this.pendingGetLangInfoArrayCallback_) {
this.pendingGetLangInfoArrayCallback_(this.langInfoArray_);
this.pendingGetLangInfoArrayCallback_ = null;
}
}).bind(this));
};
/**
* Called after asynchronously getting the list of languages
* supported by eSpeak, filtering it to only the ones currently
* enabled, and converting the result to the format expected by
* the chrome.ttsEngine API.
*
* Calls chrome.ttsEngine.updateVoices to advertise the set of
* voices currently enabled, and updates the default voice to
* be used if someone requests speech without specifying any
* other metadata.
*/
finishUpdatingVoices_() {
console.log('finishUpdatingVoices_');
chrome.ttsEngine.updateVoices(this.ttsEngineApiVoices_);
console.log('Loaded ' + this.ttsEngineApiVoices_.length +
' voices');
this.defaultEspeakVoiceName_ = this.getBestEspeakVoice(
'', navigator.language);
this.finishInitialization_();
};
/**
* Finish initialization by initializing the audio worklet.
* This is the final step; once it completes we can start speaking.
*/
finishInitialization_() {
// Initialize the audio worklet and hook up the gain node.
this.context_.audioWorklet.addModule('streaming_worklet_processor.js').then((function() {
this.streamingNode_ = new AudioWorkletNode(this.context_, 'streaming-worklet-processor');
this.streamingNode_.port.onmessage = this.onWorkletProcessorMessage.bind(this);
this.gainNode_.connect(this.context_.destination);
// Initialization is now complete.
this.initialized_ = true;
if (this.pendingCallback_) {
this.pendingCallback_();
this.pendingCallback_ = null;
}
}).bind(this));
};
/**
* Called by the client to stop speech.
*/
onStop() {
this.pendingCallback_ = null;
this.playing_ = false;
this.streamingNode_.disconnect();
this.streamingNode_.port.postMessage({
'command': 'clearBuffers'
});
this.ttsEngineApiCallback_ = null;
this.eventQueue_ = [];
this.timeoutIds_.forEach(function(timeoutId) {
window.clearTimeout(timeoutId);
});
// Drop the cleared timeout IDs so they don't accumulate.
this.timeoutIds_ = [];
};
/**
* Called by the client to start speech synthesis.
*
* @param {string} utterance The utterance to say.
* @param {object} options The options affecting the speech, like language,
* pitch, rate, etc.
* @param {function(object)} callback The function to receive messages from the
* engine.
*/
onSpeak(utterance, options, callback) {
console.log('Will speak: "' + utterance + '" lang="' + options.lang + '"');
this.pendingCallback_ = null;
if (!this.initialized_) {
// We got a call to speak before we're initialized. Enqueue this until
// we're ready.
this.pendingCallback_ = this.onSpeak.bind(this, utterance, options, callback);
return;
}
this.onStop();
this.ttsEngineApiCallback_ = callback;
this.utterance_ = utterance;
var espeakVoiceName = this.getBestEspeakVoice(
options.voiceName, options.lang);
this.tts_.set_voice(espeakVoiceName);
this.tts_.set_systemSampleRate(this.context_.sampleRate);
// Chrome TTS rates range from 0.1 to 10.0, with 1.0 as the default.
// eSpeak rates range from 80 to 450, with 175 as the default.
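// For example, options.rate of 0.5 maps to floor(0.5 * 175) = 87,
// and a rate of 10.0 is clamped to eSpeak's maximum of 450.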
var rate = Math.min(Math.max(Math.floor(options.rate * 175), 80), 450);
this.tts_.set_rate(rate);
// Chrome TTS pitches range from 0.0 to 2.0, with 1.0 as the default.
// eSpeak pitches range from 0 to 99, with 50 as the default.
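// For example, options.pitch of 2.0 maps to floor(2.0 * 50) = 100,
// which is clamped to eSpeak's maximum of 99.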
var pitch = Math.min(Math.max(Math.floor(options.pitch * 50), 0), 99);
this.tts_.set_pitch(pitch);
var volume = Math.min(Math.max(options.volume, 0.0), 1.0);
this.gainNode_.gain.value = volume;
this.tts_.synthesize(
utterance,
(function(samples, events) {
var isEnd = false;
if (events) {
events.forEach((function(event) {
switch (event.type) {
case 'sentence':
case 'word':
this.scheduleTimepointEvent(
event.audio_position,
event.text_position - 1,
event.type);
break;
case 'end':
isEnd = true;
break;
}
}).bind(this));
}
var sampleArray = new Float32Array(samples);
this.processSamples(sampleArray, isEnd);
}).bind(this));
};
/**
* Given a voice name and language, determine the best eSpeak voice
* to use. Sometimes a partial match needs to be used, for example
* because the language requested is just 'en', but eSpeak has
* 'en-US', 'en-GB', etc.
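*
* For example, a request for lang 'en-GB' matches an eSpeak voice
* that lists 'en-gb', since language matching is case-insensitive.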
*/
getBestEspeakVoice(desiredVoiceName, desiredLang) {
var exactMatchEspeakVoiceName = null;
var langMatchEspeakVoiceName = null;
var langMatch = '';
this.internalVoiceInfos_.forEach((function(voice) {
if (desiredVoiceName == voice.voiceName)
exactMatchEspeakVoiceName = voice.espeakVoiceName;
voice.languages.forEach((function(lang) {
if (desiredLang.toLowerCase() == lang.name.toLowerCase() &&
lang.name.length > langMatch.length) {
langMatch = lang.name;
langMatchEspeakVoiceName = voice.espeakVoiceName;
}
}).bind(this));
}).bind(this));
if (exactMatchEspeakVoiceName)
return exactMatchEspeakVoiceName;
if (langMatchEspeakVoiceName)
return langMatchEspeakVoiceName;
return this.defaultEspeakVoiceName_;
};
/**
* Called when we get samples back from the WebAssembly TTS engine.
* Fills up buffers with samples, and as each buffer fills, sends it
* to the AudioWorkletProcessor running in another thread.
*
* If these are the first samples for a new utterance, starts speech
* by connecting the audio node.
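*
* For example, if the engine delivers 1000 samples to an empty
* buffer, this sends seven full 128-sample buffers (896 samples) and
* holds the remaining 104 samples until the next callback or the end
* of the utterance.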
*/
processSamples(samples, isEnd) {
if (!this.ttsEngineApiCallback_) {
return;
}
var i = 0;
var len = samples.length;
var didSendBuffers = false;
while (i < len) {
var chunkLen = Math.min(this.bufferLen_ - this.currentBufferIndex_,
len - i);
if (!this.currentBuffer_) {
this.currentBuffer_ = new Float32Array(this.bufferLen_);
}
this.currentBuffer_.set(samples.subarray(i, i + chunkLen),
this.currentBufferIndex_);
i += chunkLen;
this.currentBufferIndex_ += chunkLen;
if (this.currentBufferIndex_ == this.bufferLen_) {
this.streamingNode_.port.postMessage({
'command': 'addBuffer',
'buffer': this.currentBuffer_
});
didSendBuffers = true;
this.currentBufferIndex_ = 0;
this.currentBuffer_ = null;
}
}
// Send the final, partial buffer if the utterance has ended.
if (isEnd && this.currentBufferIndex_ > 0) {
this.streamingNode_.port.postMessage({
'command': 'addBuffer',
'buffer': this.currentBuffer_
});
didSendBuffers = true;
this.currentBufferIndex_ = 0;
this.currentBuffer_ = null;
}
// Connect and start playback if this is the
// first buffer.
if (didSendBuffers && !this.playing_) {
this.streamingNode_.connect(this.gainNode_);
this.playing_ = true;
this.ttsEngineApiCallback_({
'type': 'start',
'charIndex': 0
});
this.startTime_ = new Date();
this.scheduleQueuedTimepointEvents();
}
};
/**
* Called when we get a message from the worklet processor in
* the audio thread. Currently the only message is that it has
* run out of audio buffers. When that happens we fire the 'end'
* event and stop speaking.
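*
* The message protocol with the processor: the main thread posts
* {command: 'addBuffer', buffer} and {command: 'clearBuffers'}
* messages, and the processor (see streaming_worklet_processor.js)
* posts back {type: 'empty'} once it has played all queued buffers.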
*/
onWorkletProcessorMessage(event) {
if (!this.ttsEngineApiCallback_) {
// Do nothing if we've already stopped.
return;
}
if (event.data.type == 'empty') {
this.ttsEngineApiCallback_({
'type': 'end',
'charIndex': this.utterance_.length
});
this.onStop();
}
};
/**
* Schedules an event to be fired indicating progress of speech synthesis.
* If audio playback has already started, schedule it now based on the
* desired time and actual elapsed time so far. If audio playback has not
* started yet, put it in a queue.
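*
* For example, a word event at audio position 500 ms, scheduled when
* 300 ms of audio have already played, fires 200 ms from now.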
*/
scheduleTimepointEvent(audioPositionMillis,
textPosition,
eventType) {
if (textPosition < 0 || audioPositionMillis <= 0) {
return;
}
if (!this.playing_) {
this.eventQueue_.push({'audioPositionMillis': audioPositionMillis,
'textPosition': textPosition,
'eventType': eventType});
return;
}
var currentTimeMillis = (new Date()) - this.startTime_;
var deltaTimeMillis = audioPositionMillis - currentTimeMillis;
if (deltaTimeMillis < -100) {
console.log('Skipping event ' + eventType + ' in the past');
// Skip it if it's too much in the past.
return;
} else if (deltaTimeMillis < 2) {
// It's basically now, fire it.
this.ttsEngineApiCallback_({
'type': eventType,
'charIndex': textPosition
});
return;
}
var timeoutId = window.setTimeout((function() {
this.ttsEngineApiCallback_({
'type': eventType,
'charIndex': textPosition
});
}).bind(this), deltaTimeMillis);
this.timeoutIds_.push(timeoutId);
}
/**
* If any audio events were queued up because they were generated
* before audio playback had started, schedule them now.
*/
scheduleQueuedTimepointEvents() {
console.assert(this.playing_);
var events = this.eventQueue_;
this.eventQueue_ = [];
events.forEach((function(event) {
this.scheduleTimepointEvent(
event.audioPositionMillis,
event.textPosition,
event.eventType);
}).bind(this));
}
}
window.engine = new EspeakTtsEngine();