| // Copyright (c) 2012 The Chromium OS Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| // |
| // Pico specific implementation of the TtsEngine interface defined in |
| // tts_engine.h. |
| |
| #include "pico/pico_tts_engine.h" |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <math.h> |
| |
| #include "base/string_number_conversions.h" |
| |
| #include "log.h" |
| |
// Evaluates the expression X exactly once. If the result is not PICO_OK, logs
// the current line number and makes the *enclosing function* return
// TTS_FAILURE. Use it as a statement terminated by a semicolon:
//   FAILERR(pico_someCall(...));
// The body is wrapped in do { ... } while (0) so the macro expands to a
// single statement that requires the trailing semicolon; a dangling "else"
// form would silently absorb the following statement if the semicolon were
// ever forgotten.
#define FAILERR(X)                            \
  do {                                        \
    if (PICO_OK != (X)) {                     \
      LOG(ERROR) << "Fail line " << __LINE__; \
      return TTS_FAILURE;                     \
    }                                         \
  } while (0)
| |
| using std::string; |
| |
| namespace speech_synthesis { |
| |
// Keys under which the rate, pitch and volume settings are stored in
// properties_ (see SetRate(), SetPitch(), SetVolume() and AppendProperties()).
const char* PROP_RATE = "rate";
const char* PROP_PITCH = "pitch";
const char* PROP_VOLUME = "volume";

// Size in bytes of the block that Init() allocates and hands to
// pico_initialize().
const int PICO_MEM_SIZE = 2500000;
// Name under which InitVoice() registers the Pico voice definition and which
// CleanResources() releases again.
const pico_Char * PICO_VOICE_NAME =
    reinterpret_cast<const pico_Char *>("PicoVoice");
| |
| PicoTtsEngine::PicoTtsEngine(const std::string& base_path) |
| : base_path_(base_path), |
| mem_area_(NULL), |
| system_(NULL), |
| engine_(NULL), |
| ta_resource_(NULL), |
| sg_resource_(NULL), |
| receiver_(NULL) { |
| } |
| |
| PicoTtsEngine::~PicoTtsEngine() { |
| Shutdown(); |
| } |
| |
| // Unloads the Pico engine and any loaded Pico resources, but does not |
| // shut down. |
| void PicoTtsEngine::CleanResources(void) { |
| if (engine_) { |
| pico_disposeEngine(system_, &engine_); |
| pico_releaseVoiceDefinition(system_, PICO_VOICE_NAME); |
| engine_ = NULL; |
| } |
| if (ta_resource_) { |
| pico_unloadResource(system_, &ta_resource_); |
| ta_resource_ = NULL; |
| } |
| if (sg_resource_) { |
| pico_unloadResource(system_, &sg_resource_); |
| sg_resource_ = NULL; |
| } |
| |
| current_voice_index_ = -1; |
| } |
| |
| // Initializes the engine for the specified voice. |
| tts_result PicoTtsEngine::InitVoice(int voice_index) { |
| if (voice_index < 0 || voice_index >= GetVoiceCount()) { |
| LOG(INFO) << "Voice index out of range: " << voice_index; |
| return TTS_FAILURE; |
| } |
| const PicoTtsVoice * voice = &voices_[voice_index]; |
| |
| pico_Char ta_resource_name[PICO_MAX_RESOURCE_NAME_SIZE]; |
| pico_Char sg_resource_name[PICO_MAX_RESOURCE_NAME_SIZE]; |
| |
| string tafile = base_path_ + voice->ta_lingware; |
| string sgfile = base_path_ + voice->sg_lingware; |
| const pico_Char *ta_filename = |
| reinterpret_cast<const pico_Char *>(tafile.c_str()); |
| const pico_Char *sg_filename = |
| reinterpret_cast<const pico_Char *>(sgfile.c_str()); |
| |
| FAILERR(pico_loadResource(system_, ta_filename, &ta_resource_)); |
| FAILERR(pico_loadResource(system_, sg_filename, &sg_resource_)); |
| FAILERR(pico_getResourceName(system_, ta_resource_, |
| reinterpret_cast<char *>(ta_resource_name))); |
| FAILERR(pico_getResourceName(system_, sg_resource_, |
| reinterpret_cast<char *>(sg_resource_name))); |
| FAILERR(pico_createVoiceDefinition(system_, PICO_VOICE_NAME)); |
| FAILERR(pico_addResourceToVoiceDefinition( |
| system_, PICO_VOICE_NAME, ta_resource_name)); |
| FAILERR(pico_addResourceToVoiceDefinition( |
| system_, PICO_VOICE_NAME, sg_resource_name)); |
| pico_newEngine(system_, PICO_VOICE_NAME, &engine_); |
| current_voice_index_ = voice_index; |
| |
| return TTS_SUCCESS; |
| } |
| |
| // Initialize TTS engine. |
| tts_result PicoTtsEngine::Init() { |
| LOG(INFO) << "Start."; |
| LoadVoices(base_path_ + "tts_support.xml"); |
| mem_area_ = malloc(PICO_MEM_SIZE); |
| if (!mem_area_) { |
| LOG(ERROR) << "Failed to allocate memory for Pico system"; |
| return TTS_FAILURE; |
| } |
| |
| FAILERR(pico_initialize(mem_area_, PICO_MEM_SIZE, &system_)); |
| // Set the first language in the data file as the default. |
| FAILERR(InitVoice(0)); |
| LOG(INFO) << "Init done."; |
| return TTS_SUCCESS; |
| } |
| |
| // Shuts down the TTS engine, cleans up resources. |
| tts_result PicoTtsEngine::Shutdown() { |
| CleanResources(); |
| if (system_) { |
| pico_terminate(&system_); |
| system_ = NULL; |
| } |
| if (mem_area_) { |
| free(mem_area_); |
| mem_area_ = NULL; |
| } |
| return TTS_SUCCESS; |
| } |
| |
| tts_result PicoTtsEngine::Stop() { |
| // TODO(fergus): use PICO_RESET_SOFT here instead? |
| pico_resetEngine(engine_, PICO_RESET_FULL); |
| return TTS_SUCCESS; |
| } |
| |
| int PicoTtsEngine::GetVoiceCount() { |
| return static_cast<int>(voices_.size()); |
| } |
| |
| const TtsVoice* PicoTtsEngine::GetVoiceInfo(int voice_index) { |
| if (voice_index >= 0 && voice_index < GetVoiceCount()) { |
| return &voices_[voice_index]; |
| } else { |
| return NULL; |
| } |
| } |
| |
| tts_result PicoTtsEngine::SetVoice(int voice_index) { |
| if (current_voice_index_ != voice_index) { |
| CleanResources(); |
| return InitVoice(voice_index); |
| } else { |
| return TTS_SUCCESS; |
| } |
| } |
| |
| void PicoTtsEngine::SetReceiver(TtsDataReceiver* receiver) { |
| receiver_ = receiver; |
| } |
| |
| // Sets the property for the engine. |
| tts_result PicoTtsEngine::SetProperty(const char *property, const char *value) { |
| if (properties_.count(property) > 0) { |
| properties_[property] = value; |
| return TTS_SUCCESS; |
| } else { |
| return TTS_PROPERTY_UNSUPPORTED; |
| } |
| } |
| |
| tts_result PicoTtsEngine::SetProperty(const char *property, float value) { |
| SetProperty(property, base::IntToString(value).c_str()); |
| return TTS_SUCCESS; |
| } |
| |
| tts_result PicoTtsEngine::SetRate(float rate) { |
| if (rate < 0 || rate > 5) { |
| return TTS_VALUE_INVALID; |
| } |
| rate = rate < 0.2 ? 0.2 : rate; |
| return SetProperty(PROP_RATE, rate * 100); |
| } |
| |
| tts_result PicoTtsEngine::SetPitch(float pitch) { |
| if (pitch < 0 || pitch > 2) { |
| return TTS_VALUE_INVALID; |
| } |
| pitch = pitch < 0.5 ? 0.5 : pitch; |
| return SetProperty(PROP_PITCH, pitch * 100); |
| } |
| |
| tts_result PicoTtsEngine::SetVolume(float volume) { |
| if (volume < 0 || volume > 5) { |
| return TTS_VALUE_INVALID; |
| } |
| return SetProperty(PROP_VOLUME, volume * 100); |
| } |
| |
| tts_result PicoTtsEngine::GetProperty(const char *property, |
| const char **value) { |
| std::map<string, string>::const_iterator iter = |
| properties_.find(property); |
| if (iter != properties_.end()) { |
| if (value != NULL) { |
| (*value) = iter->second.c_str(); |
| } |
| return TTS_SUCCESS; |
| } |
| return TTS_PROPERTY_UNSUPPORTED; |
| } |
| |
| int PicoTtsEngine::GetSampleRate() { |
| return voices_[current_voice_index_].sample_rate; |
| } |
| |
| tts_result PicoTtsEngine::SynthesizeText(const char* text, |
| int16_t* audio_buffer, |
| int audio_buffer_size, |
| int* out_total_samples) { |
| if (out_total_samples != NULL) { |
| *out_total_samples = 0; |
| } |
| |
| string synth_text = ""; |
| AppendProperties(text, &synth_text); |
| |
| int text_pos = 0; |
| const pico_Char* text_ptr = |
| reinterpret_cast<const pico_Char*>(synth_text.c_str()); |
| int text_buffer_len = synth_text.size() + 1; |
| while (text_pos < text_buffer_len) { |
| pico_Int16 text_bytes_consumed = 0; |
| if (PICO_OK != pico_putTextUtf8( |
| engine_, text_ptr, text_buffer_len - text_pos, |
| &text_bytes_consumed)) { |
| return TTS_FAILURE; |
| } |
| |
| int out_samples; |
| tts_result result = GetAudioFromTts( |
| audio_buffer, audio_buffer_size, &out_samples); |
| if (out_total_samples != NULL) { |
| *out_total_samples += out_samples; |
| } |
| |
| if (result != TTS_SUCCESS) { |
| return result; |
| } |
| |
| text_pos += text_bytes_consumed; |
| text_ptr += text_bytes_consumed; |
| } |
| return TTS_SUCCESS; |
| } |
| |
| // This method appends the SSML tags for the supported properties if their |
| // values are different from the default values. |
| void PicoTtsEngine::AppendProperties(const char *text, string *synth_text) { |
| int rate_level_ = floor(atof(properties_[PROP_RATE].c_str())); |
| int pitch_level_ = floor(atof(properties_[PROP_PITCH].c_str())); |
| int volume_level_ = floor(atof(properties_[PROP_VOLUME].c_str())); |
| |
| if (rate_level_ < PICO_MIN_RATE || rate_level_ > PICO_MAX_RATE) { |
| LOG(WARNING) << "Rate is outside the allowed range."; |
| } |
| if (pitch_level_ < PICO_MIN_PITCH || pitch_level_ > PICO_MAX_PITCH) { |
| LOG(WARNING) << "Pitch is outside the allowed range."; |
| } |
| if (volume_level_ < PICO_MIN_VOL || volume_level_ > PICO_MAX_VOL) { |
| LOG(WARNING) << "Volume is outside the allowed range."; |
| } |
| |
| *synth_text = ""; |
| |
| // Append opening tags |
| if (rate_level_ != PICO_DEF_RATE) { |
| *synth_text += "<speed level='" + properties_[PROP_RATE] + "'>"; |
| } |
| if (pitch_level_ != PICO_DEF_PITCH) { |
| *synth_text += "<pitch level='" + properties_[PROP_PITCH] + "'>"; |
| } |
| if (volume_level_ != PICO_DEF_VOL) { |
| *synth_text += "<volume level='" + properties_[PROP_VOLUME] + "'>"; |
| } |
| // Append text |
| *synth_text += text; |
| // Append closing tags in the reverse order |
| if (volume_level_ != PICO_DEF_VOL) { |
| *synth_text += "</volume>"; |
| } |
| if (pitch_level_ != PICO_DEF_PITCH) { |
| *synth_text += "</pitch>"; |
| } |
| if (rate_level_ != PICO_DEF_RATE) { |
| *synth_text += "</speed>"; |
| } |
| } |
| |
// Upper bound on the number of consecutive pico_getData() calls in
// GetAudioFromTts() that return zero bytes while the engine still reports
// PICO_STEP_BUSY. Once the bound is exceeded GetAudioFromTts() stops polling
// and returns TTS_FAILURE.
//
// max_iterations_without_apparent_progress is a hack to prevent infinite loops.
// This needs to be more than 200 to pass simple tests such as hello world.
// TODO(fergus): we should fix the underlying bug <http://b/2501315> in the
// //third_party/pico sources, and then delete all the code relating to
// max_iterations_without_apparent_progress.
int PicoTtsEngine::max_iterations_without_apparent_progress = 10000;
| |
| tts_result PicoTtsEngine::GetAudioFromTts(int16_t* audio_buffer, |
| int audio_buffer_size, |
| int* out_total_samples) { |
| int total_samples_output = 0; |
| int status; |
| tts_callback_status callback_status = TTS_CALLBACK_CONTINUE; |
| pico_Int16 data_type = PICO_DATA_PCM_16BIT; |
| uint32_t sample_rate = voices_[current_voice_index_].sample_rate; |
| int iterations_without_apparent_progress = 0; |
| while (1) { |
| pico_Int16 bytes_received = 0; |
| data_type = 0; |
| int8_t* buffer_ptr = reinterpret_cast<int8_t *>(audio_buffer); |
| pico_Int16 buffer_size_bytes = audio_buffer_size * sizeof(pico_Int16); |
| |
| status = pico_getData(engine_, buffer_ptr, buffer_size_bytes, |
| &bytes_received, &data_type); |
| |
| if (status != PICO_STEP_ERROR && bytes_received > 0) { |
| if (data_type != PICO_DATA_PCM_16BIT) { |
| break; |
| } |
| |
| int samples_output = bytes_received / sizeof(const pico_Int16); |
| total_samples_output += samples_output; |
| |
| // make the callback here...note that it's important to call this |
| // method even if no data was received. |
| if (receiver_) { |
| callback_status = |
| receiver_->Receive(sample_rate, 1, audio_buffer, |
| samples_output, TTS_SYNTH_PENDING); |
| if (callback_status != TTS_CALLBACK_CONTINUE) { |
| break; |
| } |
| } |
| } |
| if (status != PICO_STEP_BUSY) { |
| break; |
| } |
| if (bytes_received == 0) { |
| iterations_without_apparent_progress++; |
| if (iterations_without_apparent_progress > |
| max_iterations_without_apparent_progress) { |
| break; |
| } |
| } else { |
| iterations_without_apparent_progress = 0; |
| } |
| }; |
| |
| if (out_total_samples != NULL) { |
| *out_total_samples = total_samples_output; |
| } |
| |
| if (status == PICO_STEP_ERROR || |
| callback_status == TTS_CALLBACK_ERROR || |
| data_type != PICO_DATA_PCM_16BIT || |
| iterations_without_apparent_progress > |
| max_iterations_without_apparent_progress) { |
| return TTS_FAILURE; |
| } |
| |
| if (callback_status != TTS_CALLBACK_HALT) { |
| // Send a zero-length packet to tell the destination receiver that |
| // we're done. |
| callback_status = receiver_->Receive(sample_rate, 1, NULL, 0, |
| TTS_SYNTH_DONE); |
| if (callback_status == TTS_CALLBACK_ERROR) { |
| return TTS_FAILURE; |
| } |
| } |
| |
| return TTS_SUCCESS; |
| } |
| |
| } // namespace speech_synthesis |