blob: f5b7304eb4bba89969afd304215f59a1569384f3 [file] [log] [blame]
// Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// Pico specific implementation of the TtsEngine interface defined in
// tts_engine.h.
#include "pico/pico_tts_engine.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "base/string_number_conversions.h"
#include "log.h"
// Evaluates the Pico call X once. If the result is not PICO_OK, logs the line
// number of the call site and makes the enclosing function return
// TTS_FAILURE. Use it as a full statement: FAILERR(pico_call(...));
#define FAILERR(X) \
  do { \
    if (PICO_OK != (X)) { \
      LOG(ERROR) << "Fail line " << __LINE__; \
      return TTS_FAILURE; \
    } \
  } while (0)
using std::string;
namespace speech_synthesis {
// Keys used to look up the speech properties in properties_ (see SetRate(),
// SetPitch(), SetVolume() and AppendProperties()).
const char* PROP_RATE = "rate";
const char* PROP_PITCH = "pitch";
const char* PROP_VOLUME = "volume";
// Size passed to malloc() and pico_initialize() for the memory area that the
// Pico system works in.
const int PICO_MEM_SIZE = 2500000;
// Name of the single Pico voice definition this engine creates in InitVoice()
// and releases in CleanResources().
const pico_Char * PICO_VOICE_NAME =
reinterpret_cast<const pico_Char *>("PicoVoice");
// Creates an engine that is not yet initialized; call Init() before use.
// |base_path| is the prefix prepended to the voice list file name and to the
// lingware file names when they are loaded.
PicoTtsEngine::PicoTtsEngine(const std::string& base_path)
    : base_path_(base_path),
      mem_area_(NULL),
      system_(NULL),
      engine_(NULL),
      ta_resource_(NULL),
      sg_resource_(NULL),
      receiver_(NULL) {
  // No voice is loaded yet. Use the same sentinel CleanResources() stores so
  // that SetVoice() and GetSampleRate() never read an indeterminate index.
  // Assigned in the body (rather than the initializer list) so the member
  // declaration order in the header does not matter.
  current_voice_index_ = -1;
}
// Releases everything the engine still holds by running the regular
// shutdown sequence.
PicoTtsEngine::~PicoTtsEngine() {
  this->Shutdown();
}
// Disposes the Pico engine, releases the voice definition and unloads both
// lingware resources if they are present, then marks that no voice is
// selected. The Pico system itself is left running; Shutdown() handles that.
void PicoTtsEngine::CleanResources(void) {
  // The voice definition is only released together with an existing engine.
  if (engine_ != NULL) {
    pico_disposeEngine(system_, &engine_);
    pico_releaseVoiceDefinition(system_, PICO_VOICE_NAME);
    engine_ = NULL;
  }

  // Text-analysis resource.
  if (ta_resource_ != NULL) {
    pico_unloadResource(system_, &ta_resource_);
    ta_resource_ = NULL;
  }

  // Signal-generation resource.
  if (sg_resource_ != NULL) {
    pico_unloadResource(system_, &sg_resource_);
    sg_resource_ = NULL;
  }

  current_voice_index_ = -1;
}
// Initializes the engine for the specified voice.
//
// Loads the two lingware files of voices_[voice_index] (file names are
// prefixed with base_path_), registers them under PICO_VOICE_NAME and creates
// the Pico engine for that voice definition.
//
// Returns TTS_SUCCESS and records |voice_index| as the current voice on
// success. Returns TTS_FAILURE if |voice_index| is out of range or any Pico
// call fails; in that case current_voice_index_ is left untouched and any
// resources that were already loaded stay in ta_resource_/sg_resource_ for
// CleanResources() to unload.
tts_result PicoTtsEngine::InitVoice(int voice_index) {
  if (voice_index < 0 || voice_index >= GetVoiceCount()) {
    LOG(INFO) << "Voice index out of range: " << voice_index;
    return TTS_FAILURE;
  }
  const PicoTtsVoice* voice = &voices_[voice_index];

  // Keep the path strings alive for as long as the raw pointers are in use.
  const string tafile = base_path_ + voice->ta_lingware;
  const string sgfile = base_path_ + voice->sg_lingware;
  const pico_Char* ta_filename =
      reinterpret_cast<const pico_Char*>(tafile.c_str());
  const pico_Char* sg_filename =
      reinterpret_cast<const pico_Char*>(sgfile.c_str());

  FAILERR(pico_loadResource(system_, ta_filename, &ta_resource_));
  FAILERR(pico_loadResource(system_, sg_filename, &sg_resource_));

  pico_Char ta_resource_name[PICO_MAX_RESOURCE_NAME_SIZE];
  pico_Char sg_resource_name[PICO_MAX_RESOURCE_NAME_SIZE];
  FAILERR(pico_getResourceName(system_, ta_resource_,
                               reinterpret_cast<char*>(ta_resource_name)));
  FAILERR(pico_getResourceName(system_, sg_resource_,
                               reinterpret_cast<char*>(sg_resource_name)));

  FAILERR(pico_createVoiceDefinition(system_, PICO_VOICE_NAME));

  // From here on the voice definition exists. CleanResources() only releases
  // it when an engine was created, so a failure below has to undo it here;
  // otherwise the next InitVoice() would find the name already taken.
  if (PICO_OK != pico_addResourceToVoiceDefinition(
                     system_, PICO_VOICE_NAME, ta_resource_name) ||
      PICO_OK != pico_addResourceToVoiceDefinition(
                     system_, PICO_VOICE_NAME, sg_resource_name) ||
      // The engine creation status used to be ignored, which let a failed
      // creation report success with engine_ unusable.
      PICO_OK != pico_newEngine(system_, PICO_VOICE_NAME, &engine_)) {
    LOG(ERROR) << "Fail line " << __LINE__;
    engine_ = NULL;
    pico_releaseVoiceDefinition(system_, PICO_VOICE_NAME);
    return TTS_FAILURE;
  }

  current_voice_index_ = voice_index;
  return TTS_SUCCESS;
}
// Initialize TTS engine.
//
// Loads the voice list from <base_path_>tts_support.xml, allocates the Pico
// memory area, initializes the Pico system and selects voice 0.
// Returns TTS_SUCCESS on success and TTS_FAILURE if the allocation, the Pico
// initialization or the voice setup fails. Anything allocated before a
// failure is released by Shutdown() (which the destructor also runs).
tts_result PicoTtsEngine::Init() {
  LOG(INFO) << "Start.";

  // Release whatever an earlier Init() set up so that calling Init() again
  // does not overwrite (and thereby leak) mem_area_ and system_. This is a
  // no-op on a freshly constructed engine.
  Shutdown();

  LoadVoices(base_path_ + "tts_support.xml");

  mem_area_ = malloc(PICO_MEM_SIZE);
  if (!mem_area_) {
    LOG(ERROR) << "Failed to allocate memory for Pico system";
    return TTS_FAILURE;
  }
  FAILERR(pico_initialize(mem_area_, PICO_MEM_SIZE, &system_));

  // Set the first language in the data file as the default.
  // InitVoice() returns a tts_result, not a Pico status, so it is compared
  // against TTS_SUCCESS instead of being run through FAILERR.
  if (InitVoice(0) != TTS_SUCCESS) {
    LOG(ERROR) << "Failed to initialize the default voice";
    return TTS_FAILURE;
  }

  LOG(INFO) << "Init done.";
  return TTS_SUCCESS;
}
// Tears the engine down completely: unloads the voice resources, terminates
// the Pico system if one exists and frees its memory area. Always returns
// TTS_SUCCESS.
tts_result PicoTtsEngine::Shutdown() {
  CleanResources();

  if (system_ != NULL) {
    pico_terminate(&system_);
    system_ = NULL;
  }

  if (mem_area_ != NULL) {
    free(mem_area_);
    mem_area_ = NULL;
  }

  return TTS_SUCCESS;
}
// Performs a full reset of the Pico engine. The status returned by the reset
// call is not inspected; this method always reports TTS_SUCCESS.
tts_result PicoTtsEngine::Stop() {
  // TODO(fergus): use PICO_RESET_SOFT here instead?
  static_cast<void>(pico_resetEngine(engine_, PICO_RESET_FULL));
  return TTS_SUCCESS;
}
// Returns the number of entries currently held in voices_.
int PicoTtsEngine::GetVoiceCount() {
  const int voice_count = static_cast<int>(voices_.size());
  return voice_count;
}
// Returns a pointer to the voice description stored at |voice_index|, or
// NULL when the index lies outside [0, GetVoiceCount()).
const TtsVoice* PicoTtsEngine::GetVoiceInfo(int voice_index) {
  const bool in_range = voice_index >= 0 && voice_index < GetVoiceCount();
  if (!in_range) {
    return NULL;
  }
  return &voices_[voice_index];
}
// Switches the engine to the voice at |voice_index|.
//
// Returns TTS_SUCCESS immediately when that voice is already current.
// Returns TTS_FAILURE without touching the currently loaded voice when the
// index is out of range. Otherwise the current voice is unloaded and the
// result of InitVoice() for the new one is returned.
tts_result PicoTtsEngine::SetVoice(int voice_index) {
  if (current_voice_index_ == voice_index) {
    return TTS_SUCCESS;
  }

  // Validate before tearing anything down: InitVoice() would reject the index
  // anyway, but only after CleanResources() had already destroyed the working
  // engine.
  if (voice_index < 0 || voice_index >= GetVoiceCount()) {
    LOG(INFO) << "Voice index out of range: " << voice_index;
    return TTS_FAILURE;
  }

  CleanResources();
  return InitVoice(voice_index);
}
// Stores |receiver| as the object whose Receive() method is called with
// audio produced by GetAudioFromTts(). The pointer is stored as given.
void PicoTtsEngine::SetReceiver(TtsDataReceiver* receiver) {
  this->receiver_ = receiver;
}
// Sets the property for the engine.
//
// Only properties that already have an entry in properties_ can be set.
// Returns TTS_SUCCESS after storing |value|, TTS_PROPERTY_UNSUPPORTED when
// |property| is NULL or has no entry, and TTS_VALUE_INVALID when |value| is
// NULL.
tts_result PicoTtsEngine::SetProperty(const char *property, const char *value) {
  // Constructing a std::string from a NULL pointer is undefined behaviour, so
  // reject NULL arguments before they reach the map.
  if (property == NULL) {
    return TTS_PROPERTY_UNSUPPORTED;
  }
  if (value == NULL) {
    return TTS_VALUE_INVALID;
  }

  // Single lookup instead of count() followed by operator[].
  std::map<string, string>::iterator entry = properties_.find(property);
  if (entry == properties_.end()) {
    return TTS_PROPERTY_UNSUPPORTED;
  }
  entry->second = value;
  return TTS_SUCCESS;
}
// Numeric convenience overload: stores |value|, truncated toward zero to an
// integer, as the decimal string value of |property|.
// Returns whatever the string overload returns, so an unsupported property is
// reported to the caller instead of being masked as TTS_SUCCESS.
tts_result PicoTtsEngine::SetProperty(const char *property, float value) {
  // base::IntToString takes an int; make the narrowing explicit.
  const int integral_value = static_cast<int>(value);
  return SetProperty(property, base::IntToString(integral_value).c_str());
}
// Sets the speaking rate. |rate| must lie in [0, 5]; values below 0.2 are
// raised to 0.2. The rate is stored under PROP_RATE multiplied by 100.
// Returns TTS_VALUE_INVALID for values outside the range (including NaN),
// otherwise the result of SetProperty().
tts_result PicoTtsEngine::SetRate(float rate) {
  // Written as a negated "inside range" test so that NaN, for which every
  // comparison is false, is rejected instead of reaching the int conversion.
  if (!(rate >= 0 && rate <= 5)) {
    return TTS_VALUE_INVALID;
  }
  if (rate < 0.2f) {
    rate = 0.2f;
  }
  return SetProperty(PROP_RATE, rate * 100);
}
// Sets the pitch. |pitch| must lie in [0, 2]; values below 0.5 are raised to
// 0.5. The pitch is stored under PROP_PITCH multiplied by 100.
// Returns TTS_VALUE_INVALID for values outside the range (including NaN),
// otherwise the result of SetProperty().
tts_result PicoTtsEngine::SetPitch(float pitch) {
  // Written as a negated "inside range" test so that NaN, for which every
  // comparison is false, is rejected instead of reaching the int conversion.
  if (!(pitch >= 0 && pitch <= 2)) {
    return TTS_VALUE_INVALID;
  }
  if (pitch < 0.5f) {
    pitch = 0.5f;
  }
  return SetProperty(PROP_PITCH, pitch * 100);
}
// Sets the volume. |volume| must lie in [0, 5]. The volume is stored under
// PROP_VOLUME multiplied by 100.
// Returns TTS_VALUE_INVALID for values outside the range (including NaN),
// otherwise the result of SetProperty().
tts_result PicoTtsEngine::SetVolume(float volume) {
  // Written as a negated "inside range" test so that NaN, for which every
  // comparison is false, is rejected instead of reaching the int conversion.
  if (!(volume >= 0 && volume <= 5)) {
    return TTS_VALUE_INVALID;
  }
  return SetProperty(PROP_VOLUME, volume * 100);
}
// Looks up the stored value of |property|.
//
// On success returns TTS_SUCCESS and, if |value| is not NULL, points *value
// at the string held inside properties_; that pointer refers to the map's
// own storage, so it should be used before the property is changed again.
// Returns TTS_PROPERTY_UNSUPPORTED when |property| is NULL or has no entry.
tts_result PicoTtsEngine::GetProperty(const char *property,
                                      const char **value) {
  // A NULL key would construct a std::string from NULL (undefined behaviour).
  if (property == NULL) {
    return TTS_PROPERTY_UNSUPPORTED;
  }

  std::map<string, string>::const_iterator iter =
      properties_.find(property);
  if (iter == properties_.end()) {
    return TTS_PROPERTY_UNSUPPORTED;
  }

  if (value != NULL) {
    (*value) = iter->second.c_str();
  }
  return TTS_SUCCESS;
}
int PicoTtsEngine::GetSampleRate() {
return voices_[current_voice_index_].sample_rate;
}
// Synthesizes |text| and delivers the audio through GetAudioFromTts().
//
// The text is first wrapped in property tags by AppendProperties() and then
// fed to the Pico engine piece by piece; after every piece the audio that is
// available is pulled into |audio_buffer| (capacity |audio_buffer_size|
// samples). If |out_total_samples| is not NULL it receives the sum of the
// sample counts reported by GetAudioFromTts().
//
// Returns TTS_SUCCESS once the whole text has been consumed, TTS_FAILURE if
// |text| is NULL, the engine rejects the text or makes no progress, or the
// first non-success result of GetAudioFromTts().
tts_result PicoTtsEngine::SynthesizeText(const char* text,
                                         int16_t* audio_buffer,
                                         int audio_buffer_size,
                                         int* out_total_samples) {
  if (out_total_samples != NULL) {
    *out_total_samples = 0;
  }
  if (text == NULL) {
    // AppendProperties() would append a NULL C string (undefined behaviour).
    return TTS_FAILURE;
  }

  string synth_text;
  AppendProperties(text, &synth_text);

  // pico_putTextUtf8() receives the size as a pico_Int16, so a single call
  // must never be offered more than this many bytes; larger remainders used
  // to be narrowed implicitly and could turn negative or wrap around.
  const int kMaxChunkBytes = 0x7FFF;

  const pico_Char* text_ptr =
      reinterpret_cast<const pico_Char*>(synth_text.c_str());
  // "+ 1" so the terminating NUL is handed to the engine as well.
  const int text_buffer_len = static_cast<int>(synth_text.size()) + 1;
  int text_pos = 0;

  while (text_pos < text_buffer_len) {
    const int remaining = text_buffer_len - text_pos;
    const pico_Int16 chunk_bytes = static_cast<pico_Int16>(
        remaining > kMaxChunkBytes ? kMaxChunkBytes : remaining);

    pico_Int16 text_bytes_consumed = 0;
    if (PICO_OK != pico_putTextUtf8(engine_, text_ptr, chunk_bytes,
                                    &text_bytes_consumed)) {
      return TTS_FAILURE;
    }

    int out_samples = 0;
    const tts_result result = GetAudioFromTts(
        audio_buffer, audio_buffer_size, &out_samples);
    if (out_total_samples != NULL) {
      *out_total_samples += out_samples;
    }
    if (result != TTS_SUCCESS) {
      return result;
    }

    // Neither text accepted nor audio produced: repeating the same call
    // cannot make progress, so fail instead of looping forever.
    if (text_bytes_consumed <= 0 && out_samples == 0) {
      return TTS_FAILURE;
    }

    text_pos += text_bytes_consumed;
    text_ptr += text_bytes_consumed;
  }
  return TTS_SUCCESS;
}
// This method appends the SSML tags for the supported properties if their
// values are different from the default values.
void PicoTtsEngine::AppendProperties(const char *text, string *synth_text) {
int rate_level_ = floor(atof(properties_[PROP_RATE].c_str()));
int pitch_level_ = floor(atof(properties_[PROP_PITCH].c_str()));
int volume_level_ = floor(atof(properties_[PROP_VOLUME].c_str()));
if (rate_level_ < PICO_MIN_RATE || rate_level_ > PICO_MAX_RATE) {
LOG(WARNING) << "Rate is outside the allowed range.";
}
if (pitch_level_ < PICO_MIN_PITCH || pitch_level_ > PICO_MAX_PITCH) {
LOG(WARNING) << "Pitch is outside the allowed range.";
}
if (volume_level_ < PICO_MIN_VOL || volume_level_ > PICO_MAX_VOL) {
LOG(WARNING) << "Volume is outside the allowed range.";
}
*synth_text = "";
// Append opening tags
if (rate_level_ != PICO_DEF_RATE) {
*synth_text += "<speed level='" + properties_[PROP_RATE] + "'>";
}
if (pitch_level_ != PICO_DEF_PITCH) {
*synth_text += "<pitch level='" + properties_[PROP_PITCH] + "'>";
}
if (volume_level_ != PICO_DEF_VOL) {
*synth_text += "<volume level='" + properties_[PROP_VOLUME] + "'>";
}
// Append text
*synth_text += text;
// Append closing tags in the reverse order
if (volume_level_ != PICO_DEF_VOL) {
*synth_text += "</volume>";
}
if (pitch_level_ != PICO_DEF_PITCH) {
*synth_text += "</pitch>";
}
if (rate_level_ != PICO_DEF_RATE) {
*synth_text += "</speed>";
}
}
// max_iterations_without_apparent_progress is a hack to prevent infinite loops.
// GetAudioFromTts() counts consecutive pico_getData() calls that return zero
// bytes while the engine still reports PICO_STEP_BUSY; once the count exceeds
// this value it stops polling and returns TTS_FAILURE.
// This needs to be more than 200 to pass simple tests such as hello world.
// TODO(fergus): we should fix the underlying bug <http://b/2501315> in the
// //third_party/pico sources, and then delete all the code relating to
// max_iterations_without_apparent_progress.
int PicoTtsEngine::max_iterations_without_apparent_progress = 10000;
// Pulls synthesized audio out of the Pico engine until it stops reporting
// PICO_STEP_BUSY, forwarding every non-empty chunk to receiver_ (if one is
// set) with TTS_SYNTH_PENDING.
//
// |audio_buffer| is the scratch buffer each chunk is written to and
// |audio_buffer_size| its capacity in 16-bit samples. If |out_total_samples|
// is not NULL it receives the number of samples fetched by this call.
//
// Returns TTS_FAILURE when the arguments are unusable, no voice is selected,
// the engine reports PICO_STEP_ERROR, the data is not 16-bit PCM, the
// receiver reports TTS_CALLBACK_ERROR, or the engine stays busy without
// producing data for more than max_iterations_without_apparent_progress
// polls. Otherwise a zero-length TTS_SYNTH_DONE packet is sent to the
// receiver (unless it asked to halt) and TTS_SUCCESS is returned.
tts_result PicoTtsEngine::GetAudioFromTts(int16_t* audio_buffer,
                                          int audio_buffer_size,
                                          int* out_total_samples) {
  if (out_total_samples != NULL) {
    *out_total_samples = 0;
  }
  if (audio_buffer == NULL || audio_buffer_size <= 0 ||
      current_voice_index_ < 0 || current_voice_index_ >= GetVoiceCount()) {
    return TTS_FAILURE;
  }

  // pico_getData() takes the buffer size in bytes as a pico_Int16. Offer at
  // most as many whole samples as fit into that type; without the clamp a
  // large buffer was narrowed implicitly and could yield a negative or
  // wrapped size.
  const int kBytesPerSample = static_cast<int>(sizeof(pico_Int16));
  const int kMaxFetchSamples = 0x7FFF / kBytesPerSample;
  const int fetch_samples = audio_buffer_size < kMaxFetchSamples
                                ? audio_buffer_size
                                : kMaxFetchSamples;
  const pico_Int16 buffer_size_bytes =
      static_cast<pico_Int16>(fetch_samples * kBytesPerSample);
  int8_t* const buffer_ptr = reinterpret_cast<int8_t*>(audio_buffer);

  const uint32_t sample_rate = voices_[current_voice_index_].sample_rate;

  int total_samples_output = 0;
  int status = PICO_STEP_ERROR;
  tts_callback_status callback_status = TTS_CALLBACK_CONTINUE;
  pico_Int16 data_type = PICO_DATA_PCM_16BIT;
  int iterations_without_apparent_progress = 0;

  while (true) {
    pico_Int16 bytes_received = 0;
    data_type = 0;
    status = pico_getData(engine_, buffer_ptr, buffer_size_bytes,
                          &bytes_received, &data_type);

    if (status != PICO_STEP_ERROR && bytes_received > 0) {
      if (data_type != PICO_DATA_PCM_16BIT) {
        break;
      }
      const int samples_output = bytes_received / kBytesPerSample;
      total_samples_output += samples_output;
      // Hand the chunk to the receiver. This only happens for chunks that
      // actually contain data.
      if (receiver_ != NULL) {
        callback_status =
            receiver_->Receive(sample_rate, 1, audio_buffer,
                               samples_output, TTS_SYNTH_PENDING);
        if (callback_status != TTS_CALLBACK_CONTINUE) {
          break;
        }
      }
    }

    if (status != PICO_STEP_BUSY) {
      break;
    }

    // Still busy: track how long the engine has gone without producing data
    // so a stuck engine cannot keep this loop spinning forever.
    if (bytes_received == 0) {
      ++iterations_without_apparent_progress;
      if (iterations_without_apparent_progress >
          max_iterations_without_apparent_progress) {
        break;
      }
    } else {
      iterations_without_apparent_progress = 0;
    }
  }

  if (out_total_samples != NULL) {
    *out_total_samples = total_samples_output;
  }

  if (status == PICO_STEP_ERROR ||
      callback_status == TTS_CALLBACK_ERROR ||
      data_type != PICO_DATA_PCM_16BIT ||
      iterations_without_apparent_progress >
          max_iterations_without_apparent_progress) {
    return TTS_FAILURE;
  }

  // Send a zero-length packet to tell the destination receiver that we're
  // done. receiver_ is optional (the loop above already tolerates NULL), so
  // it must be checked here as well instead of being dereferenced blindly.
  if (callback_status != TTS_CALLBACK_HALT && receiver_ != NULL) {
    callback_status = receiver_->Receive(sample_rate, 1, NULL, 0,
                                         TTS_SYNTH_DONE);
    if (callback_status == TTS_CALLBACK_ERROR) {
      return TTS_FAILURE;
    }
  }
  return TTS_SUCCESS;
}
} // namespace speech_synthesis