| // Copyright 2018 The Chromium OS Authors. All rights reserved. |
// Use of this source code is governed by the GPL license that can be
| // found in the LICENSE file. |
| |
| /** |
| * @fileoverview eSpeak text-to-speech extension. |
| * |
| * There are three components to the extension: |
| * |
| * The open-source eSpeak-NG engine is compiled using Emscripten into a |
| * WebAssembly module that's run in a web worker. The interface to it is |
| * stored in this.tts_. This class sends it asynchronous queries for |
| * a list of voices and commands to start generating speech. Callbacks |
| * from the engine queue up events like word callbacks, and generate |
| * audio data. |
| * |
| * The implementation of the chrome.ttsEngine API is how we get |
| * commands to speak and stop. We also use this API to publish the |
| * current set of enabled voices. |
| * |
| * Finally, the audio data generated by the engine is output using |
 * the Web Audio API by creating an AudioWorkletNode with a companion
 * AudioWorkletProcessor that runs in the audio thread, providing
| * low-latency real-time audio output that's unaffected by any contention |
| * on the main thread. |
| */ |
| class EspeakTtsEngine { |
| constructor() { |
| /** |
     * The callback passed to our chrome.ttsEngine.onSpeak listener,
     * which is our channel for sending status updates about the
     * progress of speech.
| */ |
| this.ttsEngineApiCallback_ = null; |
| |
| /** |
| * The string containing the text we're currently speaking. |
| */ |
| this.utterance_ = null; |
| |
| /** |
     * The handle to the WASM eSpeak-NG module.
| */ |
| this.tts_ = null; |
| |
| /** |
| * The size, in samples, of each audio frame. When using an |
| * AudioWorkletProcessor this is always 128 samples. |
| */ |
| this.bufferLen_ = 128; |
| |
| /** |
| * The current buffer of audio samples from the engine. When each |
| * buffer fills up it's sent to the audio worklet processor thread. |
| */ |
| this.currentBuffer_ = null; |
| |
| /** |
| * The current zero-based index into |this.currentBuffer_| where the |
| * next audio sample should be written. |
| */ |
| this.currentBufferIndex_ = 0; |
| |
| /** |
| * The audio context, needed by any app using the Web Audio API. |
| */ |
| this.context_ = new AudioContext(); |
| |
| /** |
| * A node that allows us to adjust the output volume. The |
| * AudioWorkletNode connects to the gain node, and the gain node |
| * connects to the audio context destination. |
| */ |
| this.gainNode_ = this.context_.createGain(); |
| |
| /** |
     * Whether or not we're currently playing audio via the Web Audio
     * API. This is not set to true until we've received at least one
     * buffer's worth of samples from the engine and sent it to the
     * AudioWorkletProcessor.
| */ |
| this.playing_ = false; |
| |
| /** |
| * The voice data for the languages that are supported by eSpeak |
| * and enabled by the user. The elements of this array are in the |
| * format needed by the chrome.ttsEngine API. |
| */ |
| this.ttsEngineApiVoices_ = []; |
| |
| /** |
| * The data about each enabled language in eSpeak's format. |
| * Used when we get a speech request that specifies a language |
| * code but not a specific complete voice name. |
| */ |
| this.internalVoiceInfos_ = []; |
| |
| /** |
| * The data about every supported eSpeak language, in eSpeak's |
| * format. Used by the options page to present all languages to |
| * the user and let them choose which ones to enable. |
| */ |
| this.langInfoArray_ = []; |
| |
| /** |
| * Function to call when langInfoArray_ is populated. |
| */ |
| this.pendingGetLangInfoArrayCallback_ = null; |
| |
| /** |
| * The default voice to use when no metadata is provided at all. |
| */ |
| this.defaultEspeakVoiceName_ = null; |
| |
| /** |
| * The timestamp when speech started. Used to fire events like |
| * word callbacks. |
| */ |
| this.startTime_ = null; |
| |
| /** |
| * The queue of events that need to be scheduled to fire when |
| * audio playback of the utterance begins. |
| */ |
| this.eventQueue_ = []; |
| |
| /** |
| * The timeout IDs (from calling window.setTimeout) of event |
| * callbacks that have already been scheduled. These are cleared |
| * when we get a call to stop speech. |
| */ |
| this.timeoutIds_ = []; |
| |
| /** |
| * Callback to run after initialization. |
| */ |
| this.pendingCallback_ = null; |
| |
| /** |
| * True if initialization is done. |
| */ |
| this.initialized_ = false; |
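
    /**
     * The number of languages whose enabled state is still being
     * checked asynchronously. Set and decremented in updateVoices().
     */
    this.langsRemaining_ = 0;

    /**
     * The AudioWorkletNode that streams buffers of audio samples to
     * the audio thread. Created in finishInitialization_().
     */
    this.streamingNode_ = null;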
| |
| // Initialize the WASM module and call updateVoices when it's ready. |
| this.tts_ = new eSpeakNG('js/espeakng.worker.js', (function() { |
| // This will call this.finishInitialization_() when done, |
| // which will initialize the audio worklet. |
| this.updateVoices(); |
| }).bind(this)); |
| |
| // Start listening to chrome.ttsEngine requests. |
| chrome.ttsEngine.onSpeak.addListener(this.onSpeak.bind(this)); |
| chrome.ttsEngine.onStop.addListener(this.onStop.bind(this)); |
| } |
| |
| /** |
| * Return the array of all languages supported by eSpeak. Called by |
| * the options page. |
| */ |
| getLangInfoArray(callback) { |
| if (this.langInfoArray_.length > 0) { |
| callback(this.langInfoArray_); |
| return; |
| } |
| |
| this.pendingGetLangInfoArrayCallback_ = callback; |
| }; |
| |
| /** |
| * Function called on startup and when the set of enabled voices |
| * may have changed. |
| */ |
| updateVoices() { |
| console.log('updateVoices'); |
    this.tts_.list_voices((function(langInfoArray) {
      this.langInfoArray_ = langInfoArray;
      // Remove Japanese because the voice cannot pronounce many characters.
      for (var i = 0; i < this.langInfoArray_.length; ++i) {
        if (this.langInfoArray_[i].identifier === 'jpx/ja') {
          this.langInfoArray_.splice(i, 1);
          break;
        }
      }
| this.ttsEngineApiVoices_ = []; |
| this.internalVoiceInfos_ = []; |
| this.langsRemaining_ = this.langInfoArray_.length; |
| this.langInfoArray_.forEach((function(langInfo) { |
| isEspeakLanguageEnabled(langInfo, (function(enabled) { |
| if (enabled) { |
| var voiceName = 'eSpeak ' + langInfo.name; |
| var ttsEngineApiVoice = { |
| voiceName: voiceName, |
| lang: langInfo.languages[0].name, |
| remote: false, |
| eventTypes: [ |
| 'start', 'end', 'word', 'sentence', 'error' |
| ]}; |
| this.ttsEngineApiVoices_.push(ttsEngineApiVoice); |
| |
| var internalVoiceInfo = { |
| voiceName: voiceName, |
| espeakVoiceName: langInfo.name, |
| languages: langInfo.languages |
| }; |
| this.internalVoiceInfos_.push(internalVoiceInfo); |
| } |
| |
| this.langsRemaining_--; |
| if (this.langsRemaining_ == 0) { |
| this.finishUpdatingVoices_(); |
| } |
| }).bind(this)); |
| }).bind(this)); |
| if (this.pendingGetLangInfoArrayCallback_) { |
| this.pendingGetLangInfoArrayCallback_(this.langInfoArray_); |
| this.pendingGetLangInfoArrayCallback_ = null; |
| } |
| }).bind(this)); |
| }; |
| |
| /** |
| * Called after asynchronously getting the list of languages |
| * supported by eSpeak and filtering it to only the ones |
| * currently enabled and converting that to the format expected |
| * by the chrome.ttsEngine API. |
| * |
| * Calls chrome.ttsEngine.updateVoices to advertise the set of |
| * voices currently enabled, and updates the default voice to |
| * be used if someone requests speech without specifying any |
| * other metadata. |
| */ |
| finishUpdatingVoices_() { |
| console.log('finishUpdatingVoices_'); |
| chrome.ttsEngine.updateVoices(this.ttsEngineApiVoices_); |
| console.log('Loaded ' + this.ttsEngineApiVoices_.length + |
| ' voices'); |
| |
| this.defaultEspeakVoiceName_ = this.getBestEspeakVoice( |
| '', navigator.language); |
| |
| this.finishInitialization_(); |
| }; |
| |
| /** |
| * Finish initialization by initializing the audio worklet. |
| * This is the final step, then we can start speaking. |
| */ |
| finishInitialization_() { |
| // Initialize the audio worklet and hook up the gain node. |
    this.context_.audioWorklet.addModule(
        'streaming_worklet_processor.js').then((function() {
      this.streamingNode_ = new AudioWorkletNode(
          this.context_, 'streaming-worklet-processor');
      this.streamingNode_.port.onmessage =
          this.onWorkletProcessorMessage.bind(this);
      this.gainNode_.connect(this.context_.destination);
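      // Note: the streaming node itself is connected to the gain node
      // lazily, in processSamples(), once the first buffer of audio
      // for an utterance is ready.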
| |
| // Initialization is now complete. |
| this.initialized_ = true; |
| if (this.pendingCallback_) { |
| this.pendingCallback_(); |
| this.pendingCallback_ = null; |
| } |
| }).bind(this)); |
| }; |
| |
| /** |
| * Called by the client to stop speech. |
| */ |
| onStop() { |
| this.pendingCallback_ = null; |
| |
    this.playing_ = false;
    if (this.streamingNode_) {
      this.streamingNode_.disconnect();
      this.streamingNode_.port.postMessage({
        'command': 'clearBuffers'
      });
    }
| this.ttsEngineApiCallback_ = null; |
| this.eventQueue_ = []; |
    this.timeoutIds_.forEach(function(timeoutId) {
      window.clearTimeout(timeoutId);
    });
    this.timeoutIds_ = [];
  }
| |
| /** |
| * Called by the client to start speech synthesis. |
| * |
| * @param {string} utterance The utterance to say. |
| * @param {object} options The options affecting the speech, like language, |
| * pitch, rate, etc. |
| * @param {function(object)} callback The function to receive messages from the |
| * engine. |
| */ |
| onSpeak(utterance, options, callback) { |
| console.log('Will speak: "' + utterance + '" lang="' + options.lang + '"'); |
| |
| this.pendingCallback_ = null; |
| |
| if (!this.initialized_) { |
| // We got a call to speak before we're initialized. Enqueue this until |
| // we're ready. |
| this.pendingCallback_ = this.onSpeak.bind(this, utterance, options, callback); |
| return; |
| } |
| |
| this.onStop(); |
| |
| this.ttsEngineApiCallback_ = callback; |
| this.utterance_ = utterance; |
| |
| var espeakVoiceName = this.getBestEspeakVoice( |
| options.voiceName, options.lang); |
| this.tts_.set_voice(espeakVoiceName); |
| |
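    // Ask the engine to produce samples at the AudioContext's output
    // sample rate, so no resampling is needed at playback time.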
| this.tts_.set_systemSampleRate(this.context_.sampleRate); |
| |
| // Chrome TTS rates range from 0.1 to 10.0, with 1.0 as the default. |
| // eSpeak rates range from 80 to 450, with 175 as the default. |
| var rate = Math.min(Math.max(Math.floor(options.rate * 175), 80), 450); |
| this.tts_.set_rate(rate); |
| |
| // Chrome TTS pitches range from 0.0 to 2.0, with 1.0 as the default. |
| // eSpeak pitches range from 0 to 99, with 50 as the default. |
| var pitch = Math.min(Math.max(Math.floor(options.pitch * 50), 0), 99); |
| this.tts_.set_pitch(pitch); |
| |
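    // Chrome TTS volumes range from 0.0 to 1.0, with 1.0 as the default.
    // Volume is applied via the gain node rather than by scaling samples.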
| var volume = Math.min(Math.max(options.volume, 0.0), 1.0); |
| this.gainNode_.gain.value = volume; |
| |
| this.tts_.synthesize( |
| utterance, |
| (function(samples, events) { |
| var isEnd = false; |
| if (events) { |
| events.forEach((function(event) { |
| switch (event.type) { |
| case 'sentence': |
| case 'word': |
| this.scheduleTimepointEvent( |
| event.audio_position, |
| event.text_position - 1, |
| event.type); |
| break; |
| case 'end': |
| isEnd = true; |
| break; |
| } |
| }).bind(this)); |
| } |
| var sampleArray = new Float32Array(samples); |
| this.processSamples(sampleArray, isEnd); |
| }).bind(this)); |
| }; |
| |
| /** |
| * Given a voice name and language, determine the best eSpeak voice |
| * to use. Sometimes a partial match needs to be used, for example |
| * because the language requested is just 'en', but eSpeak has |
| * 'en-US', 'en-GB', etc. |
| */ |
| getBestEspeakVoice(desiredVoiceName, desiredLang) { |
| var exactMatchEspeakVoiceName = null; |
| var langMatchEspeakVoiceName = null; |
| var langMatch = ''; |
| this.internalVoiceInfos_.forEach((function(voice) { |
| if (desiredVoiceName == voice.voiceName) |
| exactMatchEspeakVoiceName = voice.espeakVoiceName; |
| voice.languages.forEach((function(lang) { |
| if (desiredLang.toLowerCase() == lang.name.toLowerCase() && |
| lang.name.length > langMatch.length) { |
| langMatch = lang.name; |
          langMatchEspeakVoiceName = voice.espeakVoiceName;
| } |
| }).bind(this)); |
| }).bind(this)); |
| |
| if (exactMatchEspeakVoiceName) |
| return exactMatchEspeakVoiceName; |
| |
| if (langMatchEspeakVoiceName) |
| return langMatchEspeakVoiceName; |
| |
| return this.defaultEspeakVoiceName_; |
| }; |
| |
| /** |
| * Called when we get samples back from the WebAssembly TTS engine. |
| * Fills up buffers with samples, and as each buffer fills, sends it |
| * to the AudioWorkletProcessor running in another thread. |
| * |
| * If these are the first samples for a new utterance, starts speech |
   * by connecting the audio node.
| */ |
| processSamples(samples, isEnd) { |
| if (!this.ttsEngineApiCallback_) { |
| return; |
| } |
| |
| var i = 0; |
| var len = samples.length; |
| |
| var didSendBuffers = false; |
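    // Repack the engine's variable-length sample array into fixed-size
    // |bufferLen_|-sample buffers for the worklet processor.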
| while (i < len) { |
| var chunkLen = Math.min(this.bufferLen_ - this.currentBufferIndex_, |
| len - i); |
| if (!this.currentBuffer_) { |
| this.currentBuffer_ = new Float32Array(this.bufferLen_); |
| } |
| this.currentBuffer_.set(samples.subarray(i, i + chunkLen), |
| this.currentBufferIndex_); |
| i += chunkLen; |
| this.currentBufferIndex_ += chunkLen; |
| if (this.currentBufferIndex_ == this.bufferLen_) { |
| this.streamingNode_.port.postMessage({ |
| 'command': 'addBuffer', |
| 'buffer': this.currentBuffer_ |
| }); |
| didSendBuffers = true; |
| |
| this.currentBufferIndex_ = 0; |
| this.currentBuffer_ = null; |
| } |
| } |
| |
    // At the end of the utterance, send the final, partially-filled
    // buffer (its unwritten tail is zero-filled, i.e. silence).
| if (isEnd && this.currentBufferIndex_ > 0) { |
| this.streamingNode_.port.postMessage({ |
| 'command': 'addBuffer', |
| 'buffer': this.currentBuffer_ |
| }); |
| didSendBuffers = true; |
| this.currentBufferIndex_ = 0; |
| this.currentBuffer_ = null; |
| } |
| |
| // Connect and start playback if this is the |
| // first buffer. |
| if (didSendBuffers && !this.playing_) { |
| this.streamingNode_.connect(this.gainNode_); |
| this.playing_ = true; |
| this.ttsEngineApiCallback_({ |
| 'type': 'start', |
| 'charIndex': 0 |
| }); |
| this.startTime_ = new Date(); |
| this.scheduleQueuedTimepointEvents(); |
| } |
| }; |
| |
| /** |
| * Called when we get a message from the worklet processor in |
| * the audio thread. Currently the only message is that it has |
| * run out of audio buffers. When that happens we fire the 'end' |
| * event and stop speaking. |
| */ |
| onWorkletProcessorMessage(event) { |
| if (!this.ttsEngineApiCallback_) { |
| // Do nothing if we've already stopped. |
| return; |
| } |
| |
| if (event.data.type == 'empty') { |
| this.ttsEngineApiCallback_({ |
| 'type': 'end', |
| 'charIndex': this.utterance_.length |
| }); |
| this.onStop(); |
| } |
| }; |
| |
| /** |
| * Schedules an event to be fired indicating progress of speech synthesis. |
| * If audio playback has already started, schedule it now based on the |
| * desired time and actual elapsed time so far. If audio playback has not |
| * started yet, put it in a queue. |
| */ |
| scheduleTimepointEvent(audioPositionMillis, |
| textPosition, |
| eventType) { |
| |
| if (textPosition < 0 || audioPositionMillis <= 0) { |
| return; |
| } |
| |
| if (!this.playing_) { |
| this.eventQueue_.push({'audioPositionMillis': audioPositionMillis, |
| 'textPosition': textPosition, |
| 'eventType': eventType}); |
| return; |
| } |
| |
| var currentTimeMillis = (new Date()) - this.startTime_; |
| var deltaTimeMillis = audioPositionMillis - currentTimeMillis; |
| if (deltaTimeMillis < -100) { |
| console.log('Skipping event ' + eventType + ' in the past'); |
| // Skip it if it's too much in the past. |
| return; |
| } else if (deltaTimeMillis < 2) { |
| // It's basically now, fire it. |
| this.ttsEngineApiCallback_({ |
| 'type': eventType, |
| 'charIndex': textPosition |
| }); |
| return; |
| } |
| |
| var timeoutId = window.setTimeout((function() { |
| this.ttsEngineApiCallback_({ |
| 'type': eventType, |
| 'charIndex': textPosition |
| }); |
| }).bind(this), deltaTimeMillis); |
| this.timeoutIds_.push(timeoutId); |
| } |
| |
| /** |
   * If any audio events were queued up because they were generated
| * before audio playback had started, schedule them now. |
| */ |
| scheduleQueuedTimepointEvents() { |
| console.assert(this.playing_); |
| var events = this.eventQueue_; |
| this.eventQueue_ = []; |
| events.forEach((function(event) { |
| this.scheduleTimepointEvent( |
| event.audioPositionMillis, |
| event.textPosition, |
| event.eventType); |
| }).bind(this)); |
| } |
| } |
| |
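// Create the singleton engine instance at extension startup. The
// constructor registers the chrome.ttsEngine listeners immediately;
// a speak request that arrives before initialization completes is
// deferred via |pendingCallback_|.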
| window.engine = new EspeakTtsEngine(); |
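
// Example client usage via the chrome.tts API (the voice name below is
// illustrative; actual names have the form 'eSpeak ' + language name):
//
//   chrome.tts.speak('Hello world', {
//     voiceName: 'eSpeak English',
//     rate: 1.0,
//     pitch: 1.0,
//     volume: 1.0,
//     onEvent: (event) => console.log(event.type, event.charIndex)
//   });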