| // Copyright 2018 The Chromium OS Authors. All rights reserved. |
// Use of this source code is governed by the GPL license that can be
| // found in the LICENSE file. |
| |
| /** |
| * @fileoverview eSpeak text-to-speech extension. |
| * |
| * There are three components to the extension: |
| * |
| * The open-source eSpeak-NG engine is compiled using Emscripten into a |
| * WebAssembly module that's run in a web worker. The interface to it is |
| * stored in this.tts_. This class sends it asynchronous queries for |
| * a list of voices and commands to start generating speech. Callbacks |
| * from the engine queue up events like word callbacks, and generate |
| * audio data. |
| * |
| * The implementation of the chrome.ttsEngine API is how we get |
| * commands to speak and stop. We also use this API to publish the |
| * current set of enabled voices. |
| * |
 * Finally, the audio data generated by the engine is output using
 * a private mojo interface. This is an intermediate solution that will
 * serve as motivation for a new chrome.TtsEngine API.
| */ |
class EspeakTtsEngine {
  constructor() {
    /**
     * The callback provided by a call to chrome.ttsEngine.onSpeak, which is
     * our channel to send status updates about the progress of speech.
     */
    this.ttsEngineApiCallback_ = null;

    /**
     * The string containing the text we're currently speaking.
     */
    this.utterance_ = null;

    /**
     * An id to track the current utterance, used e.g. to ensure audio data
     * for previous utterances is ignored.
     */
    this.utteranceId_ = 0;

    /**
     * The handle to the WASM Espeak-NG module.
     */
    this.tts_ = null;

    /**
     * Sample rate requested for Espeak to generate audio. Populated from the
     * playback stream in finishInitialization_().
     */
    this.sampleRate_;

    /**
     * The size, in samples, of each audio frame. When using an
     * AudioWorkletProcessor this is always 128 samples. Populated from the
     * playback stream in finishInitialization_().
     */
    this.bufferLen_;

    /**
     * The current buffer of audio samples from the engine. When each
     * buffer fills up it's sent to the audio worklet processor thread.
     */
    this.currentBuffer_ = null;

    /**
     * The current zero-based index into |this.currentBuffer_| where the
     * next audio sample should be written.
     */
    this.currentBufferIndex_ = 0;

    /**
     * Whether or not we're currently playing audio.
     * This is not set to true until after we've got at least one
     * buffer worth of samples from the engine and sent it to be played.
     */
    this.playing_ = false;

    /**
     * The voice data for the languages that are supported by eSpeak
     * and enabled by the user. The elements of this array are in the
     * format needed by the chrome.ttsEngine API.
     */
    this.ttsEngineApiVoices_ = [];

    /**
     * The data about each enabled language in eSpeak's format.
     * Used when we get a speech request that specifies a language
     * code but not a specific complete voice name.
     */
    this.internalVoiceInfos_ = [];

    /**
     * The data about every supported eSpeak language, in eSpeak's
     * format. Used by the options page to present all languages to
     * the user and let them choose which ones to enable.
     */
    this.langInfoArray_ = [];

    /**
     * Function to call when langInfoArray_ is populated.
     */
    this.pendingGetLangInfoArrayCallback_ = null;

    /**
     * The eSpeak-internal name of the default voice to use when no
     * metadata is provided at all.
     */
    this.defaultEspeakVoiceName_ = null;

    /**
     * The timestamp when speech started. Used to fire events like
     * word callbacks.
     */
    this.startTime_ = null;

    /**
     * Callback to run after initialization (e.g. a speak request that
     * arrived before we were ready).
     */
    this.pendingCallback_ = null;

    /**
     * True if initialization is done.
     */
    this.initialized_ = false;

    // Initialize the WASM module and call updateVoices when it's ready.
    // updateVoices() eventually calls finishInitialization_(), which
    // sets up the audio playback stream.
    this.tts_ = new eSpeakNG('js/espeakng.worker.js', () => {
      this.updateVoices();
    });

    // Start listening to chrome.ttsEngine requests.
    chrome.ttsEngine.onSpeak.addListener(this.onSpeak.bind(this));
    chrome.ttsEngine.onStop.addListener(this.onStop.bind(this));
  }

  /**
   * Return the array of all languages supported by eSpeak. Called by
   * the options page. If the list isn't loaded yet, the callback is
   * saved and invoked from updateVoices() once the list arrives.
   *
   * @param {function(Array)} callback Receives the language info array.
   */
  getLangInfoArray(callback) {
    if (this.langInfoArray_.length > 0) {
      callback(this.langInfoArray_);
      return;
    }

    this.pendingGetLangInfoArrayCallback_ = callback;
  }

  /**
   * Function called on startup and when the set of enabled voices
   * may have changed. Queries the engine for its voice list, filters
   * it down to the enabled languages, and finishes initialization once
   * every language has been checked.
   */
  updateVoices() {
    console.log('updateVoices');
    this.tts_.list_voices((langInfoArray) => {
      this.langInfoArray_ = langInfoArray;
      // Remove Japanese because the voice cannot pronounce many characters.
      for (let i = 0; i < this.langInfoArray_.length; ++i) {
        if (this.langInfoArray_[i].identifier === 'jpx/ja') {
          this.langInfoArray_.splice(i, 1);
          break;
        }
      }
      this.ttsEngineApiVoices_ = [];
      this.internalVoiceInfos_ = [];
      this.langsRemaining_ = this.langInfoArray_.length;
      if (this.langsRemaining_ === 0) {
        // No languages at all: the per-language callbacks below will never
        // fire, so finish initialization directly rather than hanging.
        this.finishInitialization_();
      }
      this.langInfoArray_.forEach((langInfo) => {
        isEspeakLanguageEnabled(langInfo, (enabled) => {
          if (enabled) {
            const voiceName = 'eSpeak ' + langInfo.name;
            const ttsEngineApiVoice = {
              voiceName: voiceName,
              lang: langInfo.languages[0].name,
              remote: false,
              eventTypes: ['start', 'end', 'word', 'sentence', 'error']
            };
            this.ttsEngineApiVoices_.push(ttsEngineApiVoice);

            const internalVoiceInfo = {
              voiceName: voiceName,
              espeakVoiceName: langInfo.name,
              languages: langInfo.languages
            };
            this.internalVoiceInfos_.push(internalVoiceInfo);
          }

          this.langsRemaining_--;
          if (this.langsRemaining_ === 0) {
            this.finishInitialization_();
          }
        });
      });
      if (this.pendingGetLangInfoArrayCallback_) {
        this.pendingGetLangInfoArrayCallback_(this.langInfoArray_);
        this.pendingGetLangInfoArrayCallback_ = null;
      }
    });
  }

  /**
   * Called after asynchronously getting the list of languages
   * supported by eSpeak and filtering it to only the ones
   * currently enabled and converting that to the format expected
   * by the chrome.ttsEngine API.
   *
   * Calls chrome.ttsEngine.updateVoices to advertise the set of
   * voices currently enabled, and updates the default voice to
   * be used if someone requests speech without specifying any
   * other metadata.
   */
  finishUpdatingVoices_() {
    console.log('finishUpdatingVoices_');
    chrome.ttsEngine.updateVoices(this.ttsEngineApiVoices_);
    console.log('Loaded ' + this.ttsEngineApiVoices_.length + ' voices');

    this.defaultEspeakVoiceName_ =
        this.getBestEspeakVoice('', navigator.language);
  }

  /**
   * Finish initialization by initializing the playback system.
   * This is the final step, then we can start speaking. Records the
   * stream's sample rate and buffer size, publishes the voices, and
   * runs any speak request that was queued while initializing.
   */
  finishInitialization_() {
    chrome.mojoPrivate.requireAsync('chromeos.tts.stream_factory')
        .then(factory => {
          factory.createPlaybackTtsStream().then(result => {
            this.ttsStream_ = result.stream;
            this.sampleRate_ = result.sampleRate;
            this.bufferLen_ = result.bufferSize;

            this.finishUpdatingVoices_();

            // Initialization is now complete.
            this.initialized_ = true;
            if (this.pendingCallback_) {
              this.pendingCallback_();
              this.pendingCallback_ = null;
            }
          });
        });
  }

  /**
   * Called by the client to stop speech.
   */
  onStop() {
    this.ttsStream_.stop();
    this.onStopInternal();
  }

  /**
   * Clears all in-progress speech state without touching the stream.
   */
  onStopInternal() {
    this.pendingCallback_ = null;

    this.playing_ = false;
    this.ttsEngineApiCallback_ = null;
  }

  /**
   * Called by the client to start speech synthesis.
   *
   * @param {string} utterance The utterance to say.
   * @param {object} options The options affecting the speech, like language,
   *     pitch, rate, etc.
   * @param {function(object)} callback The function to receive messages from
   *     the engine.
   */
  onSpeak(utterance, options, callback) {
    if (this.ttsStream_ && !this.ttsStream_.ptr.isBound()) {
      // The browser-side stream went away; shut down this context.
      window.close();
      return;
    }

    console.log('Will speak: "' + utterance + '" lang="' + options.lang + '"');
    this.utteranceId_++;
    this.pendingCallback_ = null;

    if (!this.initialized_) {
      // We got a call to speak before we're initialized. Enqueue this until
      // we're ready.
      this.pendingCallback_ =
          this.onSpeak.bind(this, utterance, options, callback);
      return;
    }

    this.ttsEngineApiCallback_ = callback;
    this.utterance_ = utterance;

    const espeakVoiceName =
        this.getBestEspeakVoice(options.voiceName, options.lang);
    this.tts_.set_voice(espeakVoiceName);

    this.tts_.set_systemSampleRate(this.sampleRate_);

    // Chrome TTS rates range from 0.1 to 10.0, with 1.0 as the default.
    // eSpeak rates range from 80 to 450, with 175 as the default.
    const rate = Math.min(Math.max(Math.floor(options.rate * 175), 80), 450);
    this.tts_.set_rate(rate);

    // Chrome TTS pitches range from 0.0 to 2.0, with 1.0 as the default.
    // eSpeak pitches range from 0 to 99, with 50 as the default.
    const pitch = Math.min(Math.max(Math.floor(options.pitch * 50), 0), 99);
    this.tts_.set_pitch(pitch);

    const volume = Math.min(Math.max(options.volume, 0.0), 1.0);
    this.ttsStream_.setVolume(volume);
    const owner = this;
    this.ttsStream_.play().then(ttsEventObserver => {
      // Bridge mojo playback events to chrome.ttsEngine events.
      new mojo.Binding(chromeos.tts.mojom.TtsEventObserver, new class {
        onStart() {
          owner.scheduleTimepointEvent(0, 'start');
        }

        onTimepoint(charIndex) {
          owner.scheduleTimepointEvent(charIndex, 'word');
        }

        onEnd() {
          owner.scheduleTimepointEvent(owner.utterance_.length, 'end');
        }

        onError() {}
      }(), ttsEventObserver.eventObserver);
    });

    // Capture the id so late audio from a superseded utterance is dropped.
    const utteranceId = this.utteranceId_;
    const handleEvents = (samples, events) => {
      if (this.utteranceId_ !== utteranceId) {
        return;
      }

      // An undefined events list is how the native code reliably signals
      // the end of synthesis; 'end' type events can still be followed by
      // audio data.
      const isEnd = events === undefined;
      let charIndex = -1;

      if (events) {
        // Only the text position of the most recent event is consumed; it
        // is attached to the next audio buffer sent to the stream.
        for (const event of events) {
          charIndex = event.text_position;
        }
      }
      this.processSamples(new Float32Array(samples), charIndex, isEnd);
    };

    this.tts_.synthesize(utterance, handleEvents);
  }

  /**
   * Given a voice name and language, determine the best eSpeak voice
   * to use. Sometimes a partial match needs to be used, for example
   * because the language requested is just 'en', but eSpeak has
   * 'en-US', 'en-GB', etc.
   *
   * @param {string|undefined} desiredVoiceName Full chrome.ttsEngine voice
   *     name (e.g. 'eSpeak English'), if any.
   * @param {string|undefined} desiredLang Language code, if any.
   * @return {?string} The eSpeak-internal voice name, suitable for
   *     passing to this.tts_.set_voice().
   */
  getBestEspeakVoice(desiredVoiceName, desiredLang) {
    let exactMatchEspeakVoiceName = null;
    let langMatchEspeakVoiceName = null;
    let langMatch = '';
    // options.lang is optional in the chrome.ttsEngine API; treat a missing
    // language as "no language match possible" rather than throwing.
    const wantLang = (desiredLang || '').toLowerCase();
    this.internalVoiceInfos_.forEach((voice) => {
      if (desiredVoiceName === voice.voiceName) {
        exactMatchEspeakVoiceName = voice.espeakVoiceName;
      }
      voice.languages.forEach((lang) => {
        if (wantLang === lang.name.toLowerCase() &&
            lang.name.length > langMatch.length) {
          langMatch = lang.name;
          // Return the eSpeak-internal name, consistent with the exact-match
          // case above; the chrome-facing voiceName ('eSpeak ...') is not a
          // valid engine voice.
          langMatchEspeakVoiceName = voice.espeakVoiceName;
        }
      });
    });

    if (exactMatchEspeakVoiceName) {
      return exactMatchEspeakVoiceName;
    }

    if (langMatchEspeakVoiceName) {
      return langMatchEspeakVoiceName;
    }

    return this.defaultEspeakVoiceName_;
  }

  /**
   * Called when we get samples back from the WebAssembly TTS engine.
   * Fills up buffers with samples, and as each buffer fills, sends it to be
   * played.
   *
   * If these are the first samples for a new utterance, marks playback
   * as started.
   *
   * @param {Float32Array} samples Audio samples from the engine.
   * @param {number} charIndex Text position to attach to the next buffer,
   *     or -1 for none.
   * @param {boolean} isEnd True if this is the final batch of samples.
   */
  processSamples(samples, charIndex, isEnd) {
    if (!this.ttsEngineApiCallback_) {
      // Speech was stopped; drop the audio.
      return;
    }

    let i = 0;
    const len = samples.length;

    let didSendBuffers = false;
    while (i < len) {
      const chunkLen =
          Math.min(this.bufferLen_ - this.currentBufferIndex_, len - i);
      if (!this.currentBuffer_) {
        this.currentBuffer_ = new Float32Array(this.bufferLen_);
      }
      this.currentBuffer_.set(
          samples.subarray(i, i + chunkLen), this.currentBufferIndex_);
      i += chunkLen;
      this.currentBufferIndex_ += chunkLen;
      if (this.currentBufferIndex_ === this.bufferLen_) {
        this.ttsStream_.sendAudioBuffer(this.currentBuffer_, charIndex, isEnd);
        didSendBuffers = true;
        // The text position is only attached to one buffer per batch.
        charIndex = -1;

        this.currentBufferIndex_ = 0;
        this.currentBuffer_ = null;
      }
    }

    // Push the final, partially-filled buffer.
    if (isEnd && this.currentBufferIndex_ > 0) {
      this.ttsStream_.sendAudioBuffer(this.currentBuffer_, charIndex, isEnd);
      didSendBuffers = true;
      this.currentBufferIndex_ = 0;
      this.currentBuffer_ = null;
    }

    // Record playback start on the first buffer sent.
    if (didSendBuffers && !this.playing_) {
      this.playing_ = true;
      this.startTime_ = new Date();
    }
  }

  /**
   * Schedules an event to be fired indicating progress of speech synthesis.
   *
   * @param {number} textPosition Zero-based character index; negative
   *     positions are ignored.
   * @param {string} eventType A chrome.ttsEngine event type, e.g. 'start',
   *     'word', 'end'.
   */
  scheduleTimepointEvent(textPosition, eventType) {
    if (!this.ttsEngineApiCallback_) {
      return;
    }

    if (textPosition < 0) {
      return;
    }
    this.ttsEngineApiCallback_({'type': eventType, 'charIndex': textPosition});
  }
}
| |
// In 'split' manifest mode, the extension system runs two copies of the
// extension: one in an incognito context and one not. In guest mode, the
// extension system runs only the incognito copy. To prevent doubling of
// this extension, start the engine in exactly one context.
const manifest =
    /** @type {{incognito: (string|undefined)}} */ (
        chrome.runtime.getManifest());
const isDuplicateContext =
    manifest.incognito == 'split' && !chrome.extension.inIncognitoContext;
if (isDuplicateContext) {
  window.close();
} else {
  window.engine = new EspeakTtsEngine();
}