| // Copyright 2018 The Chromium OS Authors. All rights reserved. |
// Use of this source code is governed by the GPL license that can be
| // found in the LICENSE file. |
| |
| /** |
| * @fileoverview eSpeak text-to-speech extension. |
| * |
| * There are three components to the extension: |
| * |
| * The open-source eSpeak-NG engine is compiled using Emscripten into a |
| * WebAssembly module that's run in a web worker. The interface to it is |
| * stored in this.tts_. This class sends it asynchronous queries for |
| * a list of voices and commands to start generating speech. Callbacks |
| * from the engine queue up events like word callbacks, and generate |
| * audio data. |
| * |
| * The implementation of the chrome.ttsEngine API is how we get |
| * commands to speak and stop. We also use this API to publish the |
| * current set of enabled voices. |
| * |
| * Finally, the audio data generated by the engine is output using |
 * the Web Audio API by creating an AudioWorkletNode with a companion
 * AudioWorkletProcessor that runs in the audio thread, providing
| * low-latency real-time audio output that's unaffected by any contention |
| * on the main thread. |
| */ |
| class EspeakTtsEngine { |
| constructor() { |
| /** |
     * The callback passed to our chrome.ttsEngine.onSpeak listener,
     * which is our channel for sending status updates about the
     * progress of speech.
| */ |
| this.ttsEngineApiCallback_ = null; |
| |
| /** |
| * The string containing the text we're currently speaking. |
| */ |
| this.utterance_ = null; |
| |
| /** |
     * The handle to the WASM eSpeak-NG module.
| */ |
| this.tts_ = null; |
| |
| /** |
| * The size, in samples, of each audio frame. When using an |
| * AudioWorkletProcessor this is always 128 samples. |
| */ |
| this.bufferLen_ = 128; |
| |
| /** |
| * The current buffer of audio samples from the engine. When each |
| * buffer fills up it's sent to the audio worklet processor thread. |
| */ |
| this.currentBuffer_ = null; |
| |
| /** |
| * The current zero-based index into |this.currentBuffer_| where the |
| * next audio sample should be written. |
| */ |
| this.currentBufferIndex_ = 0; |
| |
| /** |
| * The audio context, needed by any app using the Web Audio API. |
| */ |
| this.context_ = new AudioContext(); |
| |
| /** |
| * A node that allows us to adjust the output volume. The |
| * AudioWorkletNode connects to the gain node, and the gain node |
| * connects to the audio context destination. |
| */ |
| this.gainNode_ = this.context_.createGain(); |
| |
| /** |
     * Whether or not we're currently playing audio via the Web Audio
     * API. This is not set to true until we've received at least one
     * buffer's worth of samples from the engine and sent it to the
     * AudioWorkletProcessor.
| */ |
| this.playing_ = false; |
| |
| /** |
| * The voice data for the languages that are supported by eSpeak |
| * and enabled by the user. The elements of this array are in the |
| * format needed by the chrome.ttsEngine API. |
| */ |
| this.ttsEngineApiVoices_ = []; |
| |
| /** |
| * The data about each enabled language in eSpeak's format. |
| * Used when we get a speech request that specifies a language |
| * code but not a specific complete voice name. |
| */ |
| this.internalVoiceInfos_ = []; |
| |
| /** |
| * The data about every supported eSpeak language, in eSpeak's |
| * format. Used by the options page to present all languages to |
| * the user and let them choose which ones to enable. |
| */ |
| this.langInfoArray_ = []; |
| |
| /** |
| * Function to call when langInfoArray_ is populated. |
| */ |
| this.pendingGetLangInfoArrayCallback_ = null; |
| |
| /** |
| * The default voice to use when no metadata is provided at all. |
| */ |
| this.defaultEspeakVoiceName_ = null; |
| |
| /** |
| * The timestamp when speech started. Used to fire events like |
| * word callbacks. |
| */ |
| this.startTime_ = null; |
| |
| /** |
| * The queue of events that need to be scheduled to fire when |
| * audio playback of the utterance begins. |
| */ |
| this.eventQueue_ = []; |
| |
| /** |
| * The timeout IDs (from calling window.setTimeout) of event |
| * callbacks that have already been scheduled. These are cleared |
| * when we get a call to stop speech. |
| */ |
| this.timeoutIds_ = []; |
| |
| /** |
| * Callback to run after initialization. |
| */ |
| this.pendingCallback_ = null; |
| |
| /** |
| * True if initialization is done. |
| */ |
| this.initialized_ = false; |
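
    /**
     * The number of languages whose enabled state is still being
     * checked asynchronously. Set and decremented in updateVoices().
     */
    this.langsRemaining_ = 0;

    /**
     * The AudioWorkletNode that streams buffers of audio samples to
     * the audio thread. Created in finishInitialization_().
     */
    this.streamingNode_ = null;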
| |
| // Initialize the WASM module and call updateVoices when it's ready. |
| this.tts_ = new eSpeakNG('js/espeakng.worker.js', (function() { |
| // This will call this.finishInitialization_() when done, |
| // which will initialize the audio worklet. |
| this.updateVoices(); |
| }).bind(this)); |
| |
| // Start listening to chrome.ttsEngine requests. |
| chrome.ttsEngine.onSpeak.addListener(this.onSpeak.bind(this)); |
| chrome.ttsEngine.onStop.addListener(this.onStop.bind(this)); |
| } |
| |
| /** |
| * Return the array of all languages supported by eSpeak. Called by |
| * the options page. |
| */ |
| getLangInfoArray(callback) { |
| if (this.langInfoArray_.length > 0) { |
| callback(this.langInfoArray_); |
| return; |
| } |
| |
| this.pendingGetLangInfoArrayCallback_ = callback; |
| }; |
| |
| /** |
| * Function called on startup and when the set of enabled voices |
| * may have changed. |
| */ |
| updateVoices() { |
| console.log('updateVoices'); |
    this.tts_.list_voices((function(langInfoArray) {
      this.langInfoArray_ = langInfoArray;
      // Remove Japanese because the voice cannot pronounce many characters.
      for (var i = 0; i < this.langInfoArray_.length; ++i) {
        if (this.langInfoArray_[i].identifier === 'jpx/ja') {
          this.langInfoArray_.splice(i, 1);
          break;
        }
      }
| this.ttsEngineApiVoices_ = []; |
| this.internalVoiceInfos_ = []; |
| this.langsRemaining_ = this.langInfoArray_.length; |
| this.langInfoArray_.forEach((function(langInfo) { |
| isEspeakLanguageEnabled(langInfo, (function(enabled) { |
| if (enabled) { |
| var voiceName = 'eSpeak ' + langInfo.name; |
| var ttsEngineApiVoice = { |
| voiceName: voiceName, |
| lang: langInfo.languages[0].name, |
| remote: false, |
| eventTypes: [ |
| 'start', 'end', 'word', 'sentence', 'error' |
| ]}; |
| this.ttsEngineApiVoices_.push(ttsEngineApiVoice); |
| |
| var internalVoiceInfo = { |
| voiceName: voiceName, |
| espeakVoiceName: langInfo.name, |
| languages: langInfo.languages |
| }; |
| this.internalVoiceInfos_.push(internalVoiceInfo); |
| } |
| |
| this.langsRemaining_--; |
| if (this.langsRemaining_ == 0) { |
| this.finishUpdatingVoices_(); |
| } |
| }).bind(this)); |
| }).bind(this)); |
| if (this.pendingGetLangInfoArrayCallback_) { |
| this.pendingGetLangInfoArrayCallback_(this.langInfoArray_); |
| this.pendingGetLangInfoArrayCallback_ = null; |
| } |
| }).bind(this)); |
| }; |
| |
| /** |
| * Called after asynchronously getting the list of languages |
| * supported by eSpeak and filtering it to only the ones |
| * currently enabled and converting that to the format expected |
| * by the chrome.ttsEngine API. |
| * |
| * Calls chrome.ttsEngine.updateVoices to advertise the set of |
| * voices currently enabled, and updates the default voice to |
| * be used if someone requests speech without specifying any |
| * other metadata. |
| */ |
| finishUpdatingVoices_() { |
| console.log('finishUpdatingVoices_'); |
| chrome.ttsEngine.updateVoices(this.ttsEngineApiVoices_); |
| console.log('Loaded ' + this.ttsEngineApiVoices_.length + |
| ' voices'); |
| |
| this.defaultEspeakVoiceName_ = this.getBestEspeakVoice( |
| '', navigator.language); |
| |
| this.finishInitialization_(); |
| }; |
| |
| /** |
| * Finish initialization by initializing the audio worklet. |
| * This is the final step, then we can start speaking. |
| */ |
| finishInitialization_() { |
| // Initialize the audio worklet and hook up the gain node. |
    this.context_.audioWorklet.addModule(
        'streaming_worklet_processor.js').then((function() {
      this.streamingNode_ = new AudioWorkletNode(
          this.context_, 'streaming-worklet-processor');
      this.streamingNode_.port.onmessage =
          this.onWorkletProcessorMessage.bind(this);
      this.gainNode_.connect(this.context_.destination);
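      // Note: the streaming node itself is connected to the gain node
      // lazily, in processSamples(), once the first buffer of audio
      // for an utterance is ready.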
| |
| // Initialization is now complete. |
| this.initialized_ = true; |
| if (this.pendingCallback_) { |
| this.pendingCallback_(); |
| this.pendingCallback_ = null; |
| } |
| }).bind(this)); |
| }; |
| |
| /** |
| * Called by the client to stop speech. |
| */ |
| onStop() { |
| this.pendingCallback_ = null; |
| |
    this.playing_ = false;
    if (this.streamingNode_) {
      this.streamingNode_.disconnect();
      this.streamingNode_.port.postMessage({
        'command': 'clearBuffers'
      });
    }
| this.ttsEngineApiCallback_ = null; |
| this.eventQueue_ = []; |
    this.timeoutIds_.forEach(function(timeoutId) {
      window.clearTimeout(timeoutId);
    });
    this.timeoutIds_ = [];
  }
| |
| /** |
| * Called by the client to start speech synthesis. |
| * |
| * @param {string} utterance The utterance to say. |
| * @param {object} options The options affecting the speech, like language, |
| * pitch, rate, etc. |
| * @param {function(object)} callback The function to receive messages from the |
| * engine. |
| */ |
| onSpeak(utterance, options, callback) { |
| console.log('Will speak: "' + utterance + '" lang="' + options.lang + '"'); |
| |
| this.pendingCallback_ = null; |
| |
| if (!this.initialized_) { |
| // We got a call to speak before we're initialized. Enqueue this until |
| // we're ready. |
| this.pendingCallback_ = this.onSpeak.bind(this, utterance, options, callback); |
| return; |
| } |
| |
| this.onStop(); |
| |
| this.ttsEngineApiCallback_ = callback; |
| this.utterance_ = utterance; |
| |
| var espeakVoiceName = this.getBestEspeakVoice( |
| options.voiceName, options.lang); |
| this.tts_.set_voice(espeakVoiceName); |
| |
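    // Ask the engine to produce samples at the AudioContext's output
    // sample rate, so no resampling is needed at playback time.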
| this.tts_.set_systemSampleRate(this.context_.sampleRate); |
| |
| // Chrome TTS rates range from 0.1 to 10.0, with 1.0 as the default. |
| // eSpeak rates range from 80 to 450, with 175 as the default. |
| var rate = Math.min(Math.max(Math.floor(options.rate * 175), 80), 450); |
| this.tts_.set_rate(rate); |
| |
| // Chrome TTS pitches range from 0.0 to 2.0, with 1.0 as the default. |
| // eSpeak pitches range from 0 to 99, with 50 as the default. |
| var pitch = Math.min(Math.max(Math.floor(options.pitch * 50), 0), 99); |
| this.tts_.set_pitch(pitch); |
| |
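    // Chrome TTS volumes range from 0.0 to 1.0, with 1.0 as the default.
    // Volume is applied via the gain node rather than by scaling samples.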
| var volume = Math.min(Math.max(options.volume, 0.0), 1.0); |
| this.gainNode_.gain.value = volume; |
| |
| this.tts_.synthesize( |
| utterance, |
| (function(samples, events) { |
| var isEnd = false; |
| if (events) { |
| events.forEach((function(event) { |
| switch (event.type) { |
| case 'sentence': |
| case 'word': |
| this.scheduleTimepointEvent( |
| event.audio_position, |
| event.text_position - 1, |
| event.type); |
| break; |
| case 'end': |
| isEnd = true; |
| break; |
| } |
| }).bind(this)); |
| } |
| var sampleArray = new Float32Array(samples); |
| this.processSamples(sampleArray, isEnd); |
| }).bind(this)); |
| }; |
| |
| /** |
| * Given a voice name and language, determine the best eSpeak voice |
| * to use. Sometimes a partial match needs to be used, for example |
| * because the language requested is just 'en', but eSpeak has |
| * 'en-US', 'en-GB', etc. |
| */ |
| getBestEspeakVoice(desiredVoiceName, desiredLang) { |
| var exactMatchEspeakVoiceName = null; |
| var langMatchEspeakVoiceName = null; |
| var langMatch = ''; |
| this.internalVoiceInfos_.forEach((function(voice) { |
| if (desiredVoiceName == voice.voiceName) |
| exactMatchEspeakVoiceName = voice.espeakVoiceName; |
| voice.languages.forEach((function(lang) { |
| if (desiredLang.toLowerCase() == lang.name.toLowerCase() && |
| lang.name.length > langMatch.length) { |
| langMatch = lang.name; |
          langMatchEspeakVoiceName = voice.espeakVoiceName;
| } |
| }).bind(this)); |
| }).bind(this)); |
| |
| if (exactMatchEspeakVoiceName) |
| return exactMatchEspeakVoiceName; |
| |
| if (langMatchEspeakVoiceName) |
| return langMatchEspeakVoiceName; |
| |
| return this.defaultEspeakVoiceName_; |
| }; |
| |
| /** |
| * Called when we get samples back from the WebAssembly TTS engine. |
| * Fills up buffers with samples, and as each buffer fills, sends it |
| * to the AudioWorkletProcessor running in another thread. |
| * |
| * If these are the first samples for a new utterance, starts speech |
   * by connecting the audio node.
| */ |
| processSamples(samples, isEnd) { |
| if (!this.ttsEngineApiCallback_) { |
| return; |
| } |
| |
| var i = 0; |
| var len = samples.length; |
| |
| var didSendBuffers = false; |
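    // Repack the engine's variable-length sample array into fixed-size
    // |bufferLen_|-sample buffers for the worklet processor.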
| while (i < len) { |
| var chunkLen = Math.min(this.bufferLen_ - this.currentBufferIndex_, |
| len - i); |
| if (!this.currentBuffer_) { |
| this.currentBuffer_ = new Float32Array(this.bufferLen_); |
| } |
| this.currentBuffer_.set(samples.subarray(i, i + chunkLen), |
| this.currentBufferIndex_); |
| i += chunkLen; |
| this.currentBufferIndex_ += chunkLen; |
| if (this.currentBufferIndex_ == this.bufferLen_) { |
| this.streamingNode_.port.postMessage({ |
| 'command': 'addBuffer', |
| 'buffer': this.currentBuffer_ |
| }); |
| didSendBuffers = true; |
| |
| this.currentBufferIndex_ = 0; |
| this.currentBuffer_ = null; |
| } |
| } |
| |
    // At the end of the utterance, send the final, partially-filled
    // buffer (its unwritten tail is zero-filled, i.e. silence).
| if (isEnd && this.currentBufferIndex_ > 0) { |
| this.streamingNode_.port.postMessage({ |
| 'command': 'addBuffer', |
| 'buffer': this.currentBuffer_ |
| }); |
| didSendBuffers = true; |
| this.currentBufferIndex_ = 0; |
| this.currentBuffer_ = null; |
| } |
| |
| // Connect and start playback if this is the |
| // first buffer. |
| if (didSendBuffers && !this.playing_) { |
| this.streamingNode_.connect(this.gainNode_); |
| this.playing_ = true; |
| this.ttsEngineApiCallback_({ |
| 'type': 'start', |
| 'charIndex': 0 |
| }); |
| this.startTime_ = new Date(); |
| this.scheduleQueuedTimepointEvents(); |
| } |
| }; |
| |
| /** |
| * Called when we get a message from the worklet processor in |
| * the audio thread. Currently the only message is that it has |
| * run out of audio buffers. When that happens we fire the 'end' |
| * event and stop speaking. |
| */ |
| onWorkletProcessorMessage(event) { |
| if (!this.ttsEngineApiCallback_) { |
| // Do nothing if we've already stopped. |
| return; |
| } |
| |
| if (event.data.type == 'empty') { |
| this.ttsEngineApiCallback_({ |
| 'type': 'end', |
| 'charIndex': this.utterance_.length |
| }); |
| this.onStop(); |
| } |
| }; |
| |
| /** |
| * Schedules an event to be fired indicating progress of speech synthesis. |
| * If audio playback has already started, schedule it now based on the |
| * desired time and actual elapsed time so far. If audio playback has not |
| * started yet, put it in a queue. |
| */ |
| scheduleTimepointEvent(audioPositionMillis, |
| textPosition, |
| eventType) { |
| |
| if (textPosition < 0 || audioPositionMillis <= 0) { |
| return; |
| } |
| |
| if (!this.playing_) { |
| this.eventQueue_.push({'audioPositionMillis': audioPositionMillis, |
| 'textPosition': textPosition, |
| 'eventType': eventType}); |
| return; |
| } |
| |
| var currentTimeMillis = (new Date()) - this.startTime_; |
| var deltaTimeMillis = audioPositionMillis - currentTimeMillis; |
| if (deltaTimeMillis < -100) { |
| console.log('Skipping event ' + eventType + ' in the past'); |
| // Skip it if it's too much in the past. |
| return; |
| } else if (deltaTimeMillis < 2) { |
| // It's basically now, fire it. |
| this.ttsEngineApiCallback_({ |
| 'type': eventType, |
| 'charIndex': textPosition |
| }); |
| return; |
| } |
| |
| var timeoutId = window.setTimeout((function() { |
| this.ttsEngineApiCallback_({ |
| 'type': eventType, |
| 'charIndex': textPosition |
| }); |
| }).bind(this), deltaTimeMillis); |
| this.timeoutIds_.push(timeoutId); |
| } |
| |
| /** |
   * If any audio events were queued up because they were generated
| * before audio playback had started, schedule them now. |
| */ |
| scheduleQueuedTimepointEvents() { |
| console.assert(this.playing_); |
| var events = this.eventQueue_; |
| this.eventQueue_ = []; |
| events.forEach((function(event) { |
| this.scheduleTimepointEvent( |
| event.audioPositionMillis, |
| event.textPosition, |
| event.eventType); |
| }).bind(this)); |
| } |
| } |
| |
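// Create the singleton engine instance at extension startup. The
// constructor registers the chrome.ttsEngine listeners immediately;
// a speak request that arrives before initialization completes is
// deferred via |pendingCallback_|.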
| window.engine = new EspeakTtsEngine(); |
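
// Example client usage via the chrome.tts API (the voice name below is
// illustrative; actual names have the form 'eSpeak ' + language name):
//
//   chrome.tts.speak('Hello world', {
//     voiceName: 'eSpeak English',
//     rate: 1.0,
//     pitch: 1.0,
//     volume: 1.0,
//     onEvent: (event) => console.log(event.type, event.charIndex)
//   });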