// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "content/browser/speech/speech_recognition_engine.h"
#include <algorithm>
#include <vector>
#include "base/big_endian.h"
#include "base/bind.h"
#include "base/rand_util.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "content/browser/speech/audio_buffer.h"
#include "content/browser/speech/proto/google_streaming_api.pb.h"
#include "google_apis/google_api_keys.h"
#include "mojo/public/c/system/types.h"
#include "mojo/public/cpp/bindings/binding_set.h"
#include "net/base/escape.h"
#include "net/base/load_flags.h"
#include "net/traffic_annotation/network_traffic_annotation.h"
#include "services/network/public/cpp/shared_url_loader_factory.h"
#include "services/network/public/cpp/simple_url_loader.h"
#include "services/network/public/mojom/chunked_data_pipe_getter.mojom.h"
#include "third_party/blink/public/mojom/speech/speech_recognition_error.mojom.h"
#include "third_party/blink/public/mojom/speech/speech_recognition_result.mojom.h"
namespace content {
namespace {
const char kWebServiceBaseUrl[] =
"https://www.google.com/speech-api/full-duplex/v1";
const char kDownstreamUrl[] = "/down?";
const char kUpstreamUrl[] = "/up?";
// Used to override |kWebServiceBaseUrl| when non-null; only set in tests.
const char* web_service_base_url_for_tests = nullptr;
// This matches the maximum maxAlternatives value supported by the server.
const uint32_t kMaxMaxAlternatives = 30;
// Maximum amount of data written per Mojo write.
const uint32_t kMaxUploadWrite = 128 * 1024;
// TODO(hans): Remove this and other logging when we don't need it anymore.
void DumpResponse(const std::string& response) {
DVLOG(1) << "------------";
proto::SpeechRecognitionEvent event;
if (!event.ParseFromString(response)) {
DVLOG(1) << "Parse failed!";
return;
}
if (event.has_status())
DVLOG(1) << "STATUS\t" << event.status();
if (event.has_endpoint())
DVLOG(1) << "ENDPOINT\t" << event.endpoint();
for (int i = 0; i < event.result_size(); ++i) {
DVLOG(1) << "RESULT #" << i << ":";
const proto::SpeechRecognitionResult& res = event.result(i);
if (res.has_final())
DVLOG(1) << " final:\t" << res.final();
if (res.has_stability())
DVLOG(1) << " STABILITY:\t" << res.stability();
for (int j = 0; j < res.alternative_size(); ++j) {
const proto::SpeechRecognitionAlternative& alt =
res.alternative(j);
if (alt.has_confidence())
DVLOG(1) << " CONFIDENCE:\t" << alt.confidence();
if (alt.has_transcript())
DVLOG(1) << " TRANSCRIPT:\t" << alt.transcript();
}
}
}
const int kDefaultConfigSampleRate = 8000;
const int kDefaultConfigBitsPerSample = 16;
const uint32_t kDefaultMaxHypotheses = 1;
} // namespace
// Streams sound data up to the server.
class SpeechRecognitionEngine::UpstreamLoader
: public network::mojom::ChunkedDataPipeGetter {
public:
UpstreamLoader(std::unique_ptr<network::ResourceRequest> resource_request,
net::NetworkTrafficAnnotationTag upstream_traffic_annotation,
network::mojom::URLLoaderFactory* url_loader_factory,
SpeechRecognitionEngine* speech_recognition_engine)
: speech_recognition_engine_(speech_recognition_engine) {
// Attach a chunked upload body.
network::mojom::ChunkedDataPipeGetterPtr data_pipe;
binding_set_.AddBinding(this, mojo::MakeRequest(&data_pipe));
resource_request->request_body = new network::ResourceRequestBody();
resource_request->request_body->SetToChunkedDataPipe(std::move(data_pipe));
simple_url_loader_ = network::SimpleURLLoader::Create(
std::move(resource_request), upstream_traffic_annotation);
simple_url_loader_->DownloadToStringOfUnboundedSizeUntilCrashAndDie(
url_loader_factory,
base::BindOnce(&UpstreamLoader::OnComplete, base::Unretained(this)));
}
~UpstreamLoader() override = default;
void OnComplete(std::unique_ptr<std::string> response_body) {
int response_code = -1;
if (simple_url_loader_->ResponseInfo() &&
simple_url_loader_->ResponseInfo()->headers) {
response_code =
simple_url_loader_->ResponseInfo()->headers->response_code();
}
speech_recognition_engine_->OnUpstreamDataComplete(response_body != nullptr,
response_code);
}
void AppendChunkToUpload(const std::string& data, bool is_last_chunk) {
DCHECK(!has_last_chunk_);
upload_body_ += data;
if (is_last_chunk) {
// Send the size before the rest of the body. While it doesn't matter much,
// if the other side receives the size before the last chunk, which Mojo
// does not guarantee, some protocols can merge the data and the last chunk
// itself into a single frame.
has_last_chunk_ = is_last_chunk;
if (get_size_callback_)
std::move(get_size_callback_).Run(net::OK, upload_body_.size());
}
SendData();
}
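// Illustrative call sequence (a sketch, not from the original code): if a
// GetSize() call arrives before the body is complete, its callback is held
// in |get_size_callback_| and run once the last chunk is appended:
//
//   loader->AppendChunkToUpload("abc", false);  // buffered; size unknown
//   loader->AppendChunkToUpload("def", true);   // runs callback (net::OK, 6)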
private:
void OnUploadPipeWriteable(MojoResult unused) { SendData(); }
// Attempts to send more of the upload body, if more data is available, and
// |upload_pipe_| is valid.
void SendData() {
DCHECK_LE(upload_position_, upload_body_.size());
if (!upload_pipe_.is_valid())
return;
// Nothing more to write yet, or done writing everything.
if (upload_position_ == upload_body_.size())
return;
// The result is capped at kMaxUploadWrite, so narrowing it to a uint32_t is
// safe.
uint32_t write_bytes = std::min(upload_body_.length() - upload_position_,
static_cast<size_t>(kMaxUploadWrite));
MojoResult result =
upload_pipe_->WriteData(upload_body_.data() + upload_position_,
&write_bytes, MOJO_WRITE_DATA_FLAG_NONE);
// Wait for the pipe to have more capacity available, if needed.
if (result == MOJO_RESULT_SHOULD_WAIT) {
upload_pipe_watcher_->ArmOrNotify();
return;
}
// Do nothing on pipe closure - depend on the SimpleURLLoader to notice the
// other pipes being closed on error. Can reach this point if there's a
// retry, for instance, so cannot draw any conclusions here.
if (result != MOJO_RESULT_OK)
return;
upload_position_ += write_bytes;
// If more data is available, arm the watcher again. Don't write again in a
// loop, even if WriteData would allow it, to avoid blocking the current
// thread.
if (upload_position_ < upload_body_.size())
upload_pipe_watcher_->ArmOrNotify();
}
// mojom::ChunkedDataPipeGetter implementation:
void GetSize(GetSizeCallback get_size_callback) override {
if (has_last_chunk_) {
std::move(get_size_callback).Run(net::OK, upload_body_.size());
} else {
get_size_callback_ = std::move(get_size_callback);
}
}
void StartReading(mojo::ScopedDataPipeProducerHandle pipe) override {
// Delete the existing pipe, if any.
upload_pipe_watcher_.reset();
upload_pipe_ = std::move(pipe);
upload_pipe_watcher_ = std::make_unique<mojo::SimpleWatcher>(
FROM_HERE, mojo::SimpleWatcher::ArmingPolicy::MANUAL);
upload_pipe_watcher_->Watch(
upload_pipe_.get(), MOJO_HANDLE_SIGNAL_WRITABLE,
base::BindRepeating(&UpstreamLoader::OnUploadPipeWriteable,
base::Unretained(this)));
upload_position_ = 0;
// Will attempt to start sending the request body, if any data is available.
SendData();
}
// Partial upload body. The entire body has to be cached in memory, in case
// it has to be replayed.
std::string upload_body_;
// Current position in |upload_body_|. All bytes before this point have been
// written to |upload_pipe_|.
size_t upload_position_ = 0;
// Whether |upload_body_| is complete.
bool has_last_chunk_ = false;
// Current pipe being used to send the |upload_body_| to the URLLoader.
mojo::ScopedDataPipeProducerHandle upload_pipe_;
// Watches |upload_pipe_| for writeability.
std::unique_ptr<mojo::SimpleWatcher> upload_pipe_watcher_;
// If non-null, invoked once the size of the upload is known.
network::mojom::ChunkedDataPipeGetter::GetSizeCallback get_size_callback_;
SpeechRecognitionEngine* const speech_recognition_engine_;
std::unique_ptr<network::SimpleURLLoader> simple_url_loader_;
mojo::BindingSet<network::mojom::ChunkedDataPipeGetter> binding_set_;
DISALLOW_COPY_AND_ASSIGN(UpstreamLoader);
};
// Streams response data from the server to the SpeechRecognitionEngine.
class SpeechRecognitionEngine::DownstreamLoader
: public network::SimpleURLLoaderStreamConsumer {
public:
DownstreamLoader(std::unique_ptr<network::ResourceRequest> resource_request,
net::NetworkTrafficAnnotationTag downstream_traffic_annotation,
network::mojom::URLLoaderFactory* url_loader_factory,
SpeechRecognitionEngine* speech_recognition_engine)
: speech_recognition_engine_(speech_recognition_engine) {
simple_url_loader_ = network::SimpleURLLoader::Create(
std::move(resource_request), downstream_traffic_annotation);
simple_url_loader_->DownloadAsStream(url_loader_factory, this);
}
~DownstreamLoader() override = default;
// SimpleURLLoaderStreamConsumer implementation:
void OnDataReceived(base::StringPiece string_piece,
base::OnceClosure resume) override {
speech_recognition_engine_->OnDownstreamDataReceived(string_piece);
std::move(resume).Run();
}
void OnComplete(bool success) override {
int response_code = -1;
if (simple_url_loader_->ResponseInfo() &&
simple_url_loader_->ResponseInfo()->headers) {
response_code =
simple_url_loader_->ResponseInfo()->headers->response_code();
}
speech_recognition_engine_->OnDownstreamDataComplete(success,
response_code);
}
void OnRetry(base::OnceClosure start_retry) override {
// Retries are not enabled for these requests.
NOTREACHED();
}
private:
SpeechRecognitionEngine* const speech_recognition_engine_;
std::unique_ptr<network::SimpleURLLoader> simple_url_loader_;
DISALLOW_COPY_AND_ASSIGN(DownstreamLoader);
};
SpeechRecognitionEngine::Config::Config()
: filter_profanities(false),
continuous(true),
interim_results(true),
max_hypotheses(kDefaultMaxHypotheses),
audio_sample_rate(kDefaultConfigSampleRate),
audio_num_bits_per_sample(kDefaultConfigBitsPerSample) {}
SpeechRecognitionEngine::Config::~Config() {}
const int SpeechRecognitionEngine::kAudioPacketIntervalMs = 100;
const int SpeechRecognitionEngine::kWebserviceStatusNoError = 0;
const int SpeechRecognitionEngine::kWebserviceStatusErrorNoMatch = 5;
SpeechRecognitionEngine::SpeechRecognitionEngine(
scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory,
const std::string& accept_language)
: shared_url_loader_factory_(std::move(shared_url_loader_factory)),
accept_language_(accept_language),
got_last_definitive_result_(false),
is_dispatching_event_(false),
use_framed_post_data_(false),
state_(STATE_IDLE) {}
SpeechRecognitionEngine::~SpeechRecognitionEngine() {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
}
void SpeechRecognitionEngine::set_web_service_base_url_for_tests(
const char* base_url_for_tests) {
web_service_base_url_for_tests = base_url_for_tests;
}
void SpeechRecognitionEngine::SetConfig(const Config& config) {
config_ = config;
}
void SpeechRecognitionEngine::StartRecognition() {
FSMEventArgs event_args(EVENT_START_RECOGNITION);
DispatchEvent(event_args);
}
void SpeechRecognitionEngine::EndRecognition() {
FSMEventArgs event_args(EVENT_END_RECOGNITION);
DispatchEvent(event_args);
}
void SpeechRecognitionEngine::TakeAudioChunk(const AudioChunk& data) {
FSMEventArgs event_args(EVENT_AUDIO_CHUNK);
event_args.audio_data = &data;
DispatchEvent(event_args);
}
void SpeechRecognitionEngine::AudioChunksEnded() {
FSMEventArgs event_args(EVENT_AUDIO_CHUNKS_ENDED);
DispatchEvent(event_args);
}
void SpeechRecognitionEngine::OnUpstreamDataComplete(bool success,
int response_code) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
DVLOG(1) << "Upstream complete success: " << success
<< " response_code: " << response_code;
if (!success) {
FSMEventArgs event_args(EVENT_UPSTREAM_ERROR);
DispatchEvent(event_args);
return;
}
// Do nothing on clean completion of upstream request.
}
void SpeechRecognitionEngine::OnDownstreamDataReceived(
base::StringPiece new_response_data) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
DVLOG(1) << "Downstream length: " << new_response_data.size();
// The downstream response is organized in chunks, whose size is determined
// by a 4-byte prefix, transparently handled by the ChunkedByteBuffer class.
// Such chunks are sent by the speech recognition webservice over the HTTP
// downstream channel using HTTP chunked transfer (unrelated to our chunks).
// This function is called every time an HTTP chunk is received by the
// URL loader. However, there is no particular correspondence between our
// protocol chunks and HTTP chunks: a single HTTP chunk can contain a
// portion of one protocol chunk, or several protocol chunks together.
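// For illustration (hypothetical bytes, assuming the 4-byte prefix is
// big-endian): the byte stream
//   0x00 0x00 0x00 0x02 'h' 'i' 0x00 0x00 0x00 0x01 '!'
// splits into two protocol chunks, "hi" and "!", no matter how those bytes
// were grouped into HTTP chunks on the wire.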
chunked_byte_buffer_.Append(new_response_data);
// A single HTTP chunk can contain more than one protocol chunk, hence the
// loop.
while (chunked_byte_buffer_.HasChunks()) {
FSMEventArgs event_args(EVENT_DOWNSTREAM_RESPONSE);
event_args.response = chunked_byte_buffer_.PopChunk();
DCHECK(event_args.response.get());
DumpResponse(std::string(event_args.response->begin(),
event_args.response->end()));
DispatchEvent(event_args);
}
}
void SpeechRecognitionEngine::OnDownstreamDataComplete(bool success,
int response_code) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
DVLOG(1) << "Downstream complete success: " << success
<< " response_code: " << response_code;
if (!success) {
FSMEventArgs event_args(EVENT_DOWNSTREAM_ERROR);
DispatchEvent(event_args);
return;
}
FSMEventArgs event_args(EVENT_DOWNSTREAM_CLOSED);
DispatchEvent(event_args);
}
bool SpeechRecognitionEngine::IsRecognitionPending() const {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
return state_ != STATE_IDLE;
}
int SpeechRecognitionEngine::GetDesiredAudioChunkDurationMs() const {
return kAudioPacketIntervalMs;
}
// ----------------------- Core FSM implementation ---------------------------
void SpeechRecognitionEngine::DispatchEvent(
const FSMEventArgs& event_args) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
DCHECK_LE(event_args.event, EVENT_MAX_VALUE);
DCHECK_LE(state_, STATE_MAX_VALUE);
// Event dispatching must be sequential; otherwise it would break the
// assumptions of the finite state automaton model.
DCHECK(!is_dispatching_event_);
is_dispatching_event_ = true;
state_ = ExecuteTransitionAndGetNextState(event_args);
is_dispatching_event_ = false;
}
SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::ExecuteTransitionAndGetNextState(
const FSMEventArgs& event_args) {
const FSMEvent event = event_args.event;
switch (state_) {
case STATE_IDLE:
switch (event) {
case EVENT_START_RECOGNITION:
return ConnectBothStreams(event_args);
case EVENT_END_RECOGNITION:
// Note that AUDIO_CHUNK and AUDIO_CHUNKS_ENDED events can remain enqueued
// in case of abort, so we just silently drop them here.
case EVENT_AUDIO_CHUNK:
case EVENT_AUDIO_CHUNKS_ENDED:
// DOWNSTREAM_CLOSED can be received if we end up here due to an error.
case EVENT_DOWNSTREAM_CLOSED:
return DoNothing(event_args);
case EVENT_UPSTREAM_ERROR:
case EVENT_DOWNSTREAM_ERROR:
case EVENT_DOWNSTREAM_RESPONSE:
return NotFeasible(event_args);
}
break;
case STATE_BOTH_STREAMS_CONNECTED:
switch (event) {
case EVENT_AUDIO_CHUNK:
return TransmitAudioUpstream(event_args);
case EVENT_DOWNSTREAM_RESPONSE:
return ProcessDownstreamResponse(event_args);
case EVENT_AUDIO_CHUNKS_ENDED:
return CloseUpstreamAndWaitForResults(event_args);
case EVENT_END_RECOGNITION:
return AbortSilently(event_args);
case EVENT_UPSTREAM_ERROR:
case EVENT_DOWNSTREAM_ERROR:
case EVENT_DOWNSTREAM_CLOSED:
return AbortWithError(event_args);
case EVENT_START_RECOGNITION:
return NotFeasible(event_args);
}
break;
case STATE_WAITING_DOWNSTREAM_RESULTS:
switch (event) {
case EVENT_DOWNSTREAM_RESPONSE:
return ProcessDownstreamResponse(event_args);
case EVENT_DOWNSTREAM_CLOSED:
return RaiseNoMatchErrorIfGotNoResults(event_args);
case EVENT_END_RECOGNITION:
return AbortSilently(event_args);
case EVENT_UPSTREAM_ERROR:
case EVENT_DOWNSTREAM_ERROR:
return AbortWithError(event_args);
case EVENT_START_RECOGNITION:
case EVENT_AUDIO_CHUNK:
case EVENT_AUDIO_CHUNKS_ENDED:
return NotFeasible(event_args);
}
break;
}
return NotFeasible(event_args);
}
// ----------- Contract for all the FSM evolution functions below -------------
// - Are guaranteed to be executed in the same thread (IO, except for tests);
// - Are guaranteed to be not reentrant (themselves and each other);
// - event_args members are guaranteed to be stable during the call;
SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::ConnectBothStreams(const FSMEventArgs&) {
DCHECK(!upstream_loader_.get());
DCHECK(!downstream_loader_.get());
encoder_.reset(new AudioEncoder(config_.audio_sample_rate,
config_.audio_num_bits_per_sample));
DCHECK(encoder_.get());
const std::string request_key = GenerateRequestKey();
// Only use the framed post data format when a preamble needs to be logged.
use_framed_post_data_ = (config_.preamble &&
!config_.preamble->sample_data.empty() &&
!config_.auth_token.empty() &&
!config_.auth_scope.empty());
if (use_framed_post_data_) {
preamble_encoder_.reset(new AudioEncoder(
config_.preamble->sample_rate,
config_.preamble->sample_depth * 8));
}
const char* web_service_base_url = !web_service_base_url_for_tests
? kWebServiceBaseUrl
: web_service_base_url_for_tests;
// Set up the downstream loader.
std::vector<std::string> downstream_args;
downstream_args.push_back(
"key=" + net::EscapeQueryParamValue(google_apis::GetAPIKey(), true));
downstream_args.push_back("pair=" + request_key);
downstream_args.push_back("output=pb");
GURL downstream_url(std::string(web_service_base_url) +
std::string(kDownstreamUrl) +
base::JoinString(downstream_args, "&"));
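// With hypothetical values for the API key and request key, the resulting
// URL looks like:
//   https://www.google.com/speech-api/full-duplex/v1/down?key=<API_KEY>&pair=<REQUEST_KEY>&output=pb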
net::NetworkTrafficAnnotationTag downstream_traffic_annotation =
net::DefineNetworkTrafficAnnotation("speech_recognition_downstream", R"(
semantics {
sender: "Speech Recognition"
description:
"Chrome provides translation from speech audio recorded with a "
"microphone to text, by using the Google speech recognition web "
"service. Audio is sent to Google's servers (upstream) and text is "
"returned (downstream). This network request (downstream) sends an "
"id for getting the text response. Then the (upstream) request "
"sends the audio data along with the id. When the server has "
"finished processing the audio data and produced a text response, "
"it replies to this request."
trigger:
"The user chooses to start the recognition by clicking the "
"microphone icon in the Google search field."
data: "A unique random id for this speech recognition request."
destination: GOOGLE_OWNED_SERVICE
}
policy {
cookies_allowed: NO
setting:
"The user must allow the browser to access the microphone in a "
"permission prompt. This is set per site (hostname pattern). In "
"the content settings menu, microphone access can be turned off "
"for all sites and site specific settings can be changed."
chrome_policy {
AudioCaptureAllowed {
policy_options {mode: MANDATORY}
AudioCaptureAllowed: false
}
}
chrome_policy {
AudioCaptureAllowedUrls {
policy_options {mode: MANDATORY}
AudioCaptureAllowedUrls: {}
}
}
})");
auto downstream_request = std::make_unique<network::ResourceRequest>();
downstream_request->load_flags = net::LOAD_DO_NOT_SAVE_COOKIES |
net::LOAD_DO_NOT_SEND_COOKIES |
net::LOAD_DO_NOT_SEND_AUTH_DATA;
downstream_request->url = downstream_url;
downstream_loader_ = std::make_unique<DownstreamLoader>(
std::move(downstream_request), downstream_traffic_annotation,
shared_url_loader_factory_.get(), this);
// Set up the upstream loader.
// TODO(hans): Support for user-selected grammars.
std::vector<std::string> upstream_args;
upstream_args.push_back("key=" +
net::EscapeQueryParamValue(google_apis::GetAPIKey(), true));
upstream_args.push_back("pair=" + request_key);
upstream_args.push_back("output=pb");
upstream_args.push_back(
"lang=" + net::EscapeQueryParamValue(GetAcceptedLanguages(), true));
upstream_args.push_back(
config_.filter_profanities ? "pFilter=2" : "pFilter=0");
if (config_.max_hypotheses > 0U) {
uint32_t max_alternatives =
std::min(kMaxMaxAlternatives, config_.max_hypotheses);
upstream_args.push_back("maxAlternatives=" +
base::UintToString(max_alternatives));
}
upstream_args.push_back("app=chromium");
for (const blink::mojom::SpeechRecognitionGrammar& grammar :
config_.grammars) {
std::string grammar_value(base::NumberToString(grammar.weight) + ":" +
grammar.url.spec());
upstream_args.push_back(
"grammar=" + net::EscapeQueryParamValue(grammar_value, true));
}
if (config_.continuous)
upstream_args.push_back("continuous");
else
upstream_args.push_back("endpoint=1");
if (config_.interim_results)
upstream_args.push_back("interim");
if (!config_.auth_token.empty() && !config_.auth_scope.empty()) {
upstream_args.push_back(
"authScope=" + net::EscapeQueryParamValue(config_.auth_scope, true));
upstream_args.push_back(
"authToken=" + net::EscapeQueryParamValue(config_.auth_token, true));
}
if (use_framed_post_data_) {
std::string audio_format;
if (preamble_encoder_)
audio_format = preamble_encoder_->GetMimeType() + ",";
audio_format += encoder_->GetMimeType();
upstream_args.push_back(
"audioFormat=" + net::EscapeQueryParamValue(audio_format, true));
}
GURL upstream_url(std::string(web_service_base_url) +
std::string(kUpstreamUrl) +
base::JoinString(upstream_args, "&"));
net::NetworkTrafficAnnotationTag upstream_traffic_annotation =
net::DefineNetworkTrafficAnnotation("speech_recognition_upstream", R"(
semantics {
sender: "Speech Recognition"
description:
"Chrome provides translation from speech audio recorded with a "
"microphone to text, by using the Google speech recognition web "
"service. Audio is sent to Google's servers (upstream) and text is "
"returned (downstream)."
trigger:
"The user chooses to start the recognition by clicking the "
"microphone icon in the Google search field."
data:
"Audio recorded with the microphone, and the unique id of "
"downstream speech recognition request."
destination: GOOGLE_OWNED_SERVICE
}
policy {
cookies_allowed: NO
setting:
"The user must allow the browser to access the microphone in a "
"permission prompt. This is set per site (hostname pattern). In "
"the content settings menu, microphone access can be turned off "
"for all sites and site specific settings can be changed."
chrome_policy {
AudioCaptureAllowed {
policy_options {mode: MANDATORY}
AudioCaptureAllowed: false
}
}
chrome_policy {
AudioCaptureAllowedUrls {
policy_options {mode: MANDATORY}
AudioCaptureAllowedUrls: {}
}
}
})");
auto upstream_request = std::make_unique<network::ResourceRequest>();
upstream_request->url = upstream_url;
upstream_request->method = "POST";
upstream_request->referrer = GURL(config_.origin_url);
upstream_request->load_flags = net::LOAD_DO_NOT_SAVE_COOKIES |
net::LOAD_DO_NOT_SEND_COOKIES |
net::LOAD_DO_NOT_SEND_AUTH_DATA;
if (use_framed_post_data_) {
upstream_request->headers.SetHeader(net::HttpRequestHeaders::kContentType,
"application/octet-stream");
} else {
upstream_request->headers.SetHeader(net::HttpRequestHeaders::kContentType,
encoder_->GetMimeType());
}
upstream_loader_ = std::make_unique<UpstreamLoader>(
std::move(upstream_request), upstream_traffic_annotation,
shared_url_loader_factory_.get(), this);
if (preamble_encoder_) {
// Encode and send preamble right away.
scoped_refptr<AudioChunk> chunk = new AudioChunk(
reinterpret_cast<const uint8_t*>(config_.preamble->sample_data.data()),
config_.preamble->sample_data.size(), config_.preamble->sample_depth);
preamble_encoder_->Encode(*chunk);
preamble_encoder_->Flush();
scoped_refptr<AudioChunk> encoded_data(
preamble_encoder_->GetEncodedDataAndClear());
UploadAudioChunk(encoded_data->AsString(), FRAME_PREAMBLE_AUDIO, false);
}
return STATE_BOTH_STREAMS_CONNECTED;
}
SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::TransmitAudioUpstream(
const FSMEventArgs& event_args) {
DCHECK(upstream_loader_.get());
DCHECK(event_args.audio_data.get());
const AudioChunk& audio = *(event_args.audio_data.get());
DCHECK_EQ(audio.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
encoder_->Encode(audio);
scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
UploadAudioChunk(encoded_data->AsString(), FRAME_RECOGNITION_AUDIO, false);
return state_;
}
SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::ProcessDownstreamResponse(
const FSMEventArgs& event_args) {
DCHECK(event_args.response.get());
proto::SpeechRecognitionEvent ws_event;
if (!ws_event.ParseFromString(std::string(event_args.response->begin(),
event_args.response->end())))
return AbortWithError(event_args);
if (ws_event.has_status()) {
switch (ws_event.status()) {
case proto::SpeechRecognitionEvent::STATUS_SUCCESS:
break;
case proto::SpeechRecognitionEvent::STATUS_NO_SPEECH:
return Abort(blink::mojom::SpeechRecognitionErrorCode::kNoSpeech);
case proto::SpeechRecognitionEvent::STATUS_ABORTED:
return Abort(blink::mojom::SpeechRecognitionErrorCode::kAborted);
case proto::SpeechRecognitionEvent::STATUS_AUDIO_CAPTURE:
return Abort(blink::mojom::SpeechRecognitionErrorCode::kAudioCapture);
case proto::SpeechRecognitionEvent::STATUS_NETWORK:
return Abort(blink::mojom::SpeechRecognitionErrorCode::kNetwork);
case proto::SpeechRecognitionEvent::STATUS_NOT_ALLOWED:
return Abort(blink::mojom::SpeechRecognitionErrorCode::kNotAllowed);
case proto::SpeechRecognitionEvent::STATUS_SERVICE_NOT_ALLOWED:
return Abort(
blink::mojom::SpeechRecognitionErrorCode::kServiceNotAllowed);
case proto::SpeechRecognitionEvent::STATUS_BAD_GRAMMAR:
return Abort(blink::mojom::SpeechRecognitionErrorCode::kBadGrammar);
case proto::SpeechRecognitionEvent::STATUS_LANGUAGE_NOT_SUPPORTED:
return Abort(
blink::mojom::SpeechRecognitionErrorCode::kLanguageNotSupported);
}
}
if (!config_.continuous && ws_event.has_endpoint() &&
ws_event.endpoint() == proto::SpeechRecognitionEvent::END_OF_UTTERANCE) {
delegate_->OnSpeechRecognitionEngineEndOfUtterance();
}
std::vector<blink::mojom::SpeechRecognitionResultPtr> results;
for (int i = 0; i < ws_event.result_size(); ++i) {
const proto::SpeechRecognitionResult& ws_result = ws_event.result(i);
results.push_back(blink::mojom::SpeechRecognitionResult::New());
blink::mojom::SpeechRecognitionResultPtr& result = results.back();
result->is_provisional = !(ws_result.has_final() && ws_result.final());
if (!result->is_provisional)
got_last_definitive_result_ = true;
for (int j = 0; j < ws_result.alternative_size(); ++j) {
const proto::SpeechRecognitionAlternative& ws_alternative =
ws_result.alternative(j);
blink::mojom::SpeechRecognitionHypothesisPtr hypothesis =
blink::mojom::SpeechRecognitionHypothesis::New();
if (ws_alternative.has_confidence())
hypothesis->confidence = ws_alternative.confidence();
else if (ws_result.has_stability())
hypothesis->confidence = ws_result.stability();
DCHECK(ws_alternative.has_transcript());
// TODO(hans): Perhaps the transcript should be required in the proto?
if (ws_alternative.has_transcript())
hypothesis->utterance = base::UTF8ToUTF16(ws_alternative.transcript());
result->hypotheses.push_back(std::move(hypothesis));
}
}
if (results.size()) {
delegate_->OnSpeechRecognitionEngineResults(results);
}
return state_;
}
SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::RaiseNoMatchErrorIfGotNoResults(
const FSMEventArgs& event_args) {
if (!got_last_definitive_result_) {
// Provide an empty result to signal that recognition ended without errors,
// but also without any further results.
delegate_->OnSpeechRecognitionEngineResults(
std::vector<blink::mojom::SpeechRecognitionResultPtr>());
}
return AbortSilently(event_args);
}
SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::CloseUpstreamAndWaitForResults(
const FSMEventArgs&) {
DCHECK(upstream_loader_.get());
DCHECK(encoder_.get());
DVLOG(1) << "Closing upstream.";
// The encoder requires a non-empty final buffer, so we encode a packet of
// silence in case the encoder has no pending data.
size_t sample_count =
config_.audio_sample_rate * kAudioPacketIntervalMs / 1000;
scoped_refptr<AudioChunk> dummy_chunk = new AudioChunk(
sample_count * sizeof(int16_t), encoder_->GetBitsPerSample() / 8);
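// Worked example with the default config: at 8000 Hz and a 100 ms packet
// interval, sample_count = 8000 * 100 / 1000 = 800, so the dummy chunk is
// 800 * sizeof(int16_t) = 1600 zeroed bytes fed to the encoder.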
encoder_->Encode(*dummy_chunk.get());
encoder_->Flush();
scoped_refptr<AudioChunk> encoded_dummy_data =
encoder_->GetEncodedDataAndClear();
DCHECK(!encoded_dummy_data->IsEmpty());
encoder_.reset();
UploadAudioChunk(encoded_dummy_data->AsString(),
FRAME_RECOGNITION_AUDIO,
true);
got_last_definitive_result_ = false;
return STATE_WAITING_DOWNSTREAM_RESULTS;
}
SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::CloseDownstream(const FSMEventArgs&) {
DCHECK(!upstream_loader_.get());
DCHECK(downstream_loader_.get());
DVLOG(1) << "Closing downstream.";
downstream_loader_.reset();
return STATE_IDLE;
}
SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::AbortSilently(const FSMEventArgs&) {
return Abort(blink::mojom::SpeechRecognitionErrorCode::kNone);
}
SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::AbortWithError(const FSMEventArgs&) {
return Abort(blink::mojom::SpeechRecognitionErrorCode::kNetwork);
}
SpeechRecognitionEngine::FSMState SpeechRecognitionEngine::Abort(
blink::mojom::SpeechRecognitionErrorCode error_code) {
DVLOG(1) << "Aborting with error " << error_code;
if (error_code != blink::mojom::SpeechRecognitionErrorCode::kNone) {
delegate_->OnSpeechRecognitionEngineError(
blink::mojom::SpeechRecognitionError(
error_code, blink::mojom::SpeechAudioErrorDetails::kNone));
}
downstream_loader_.reset();
upstream_loader_.reset();
encoder_.reset();
return STATE_IDLE;
}
SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::DoNothing(const FSMEventArgs&) {
return state_;
}
SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::NotFeasible(const FSMEventArgs& event_args) {
NOTREACHED() << "Unfeasible event " << event_args.event
<< " in state " << state_;
return state_;
}
std::string SpeechRecognitionEngine::GetAcceptedLanguages() const {
std::string langs = config_.language;
if (langs.empty() && !accept_language_.empty()) {
// If no language is provided, use the first entry of the Accept-Language
// list. If that list is empty, default to "en-US". Example contents of the
// list: "es,en-GB;q=0.8", "".
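// Worked example: with accept_language_ == "es,en-GB;q=0.8", the first
// separator is the ',' after "es", so |langs| becomes "es".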
size_t separator = accept_language_.find_first_of(",;");
if (separator != std::string::npos)
langs = accept_language_.substr(0, separator);
}
if (langs.empty())
langs = "en-US";
return langs;
}
// TODO(primiano): Is there any utility in the codebase that already does this?
std::string SpeechRecognitionEngine::GenerateRequestKey() const {
const int64_t kKeepLowBytes = 0x00000000FFFFFFFFLL;
const int64_t kKeepHighBytes = 0xFFFFFFFF00000000LL;
// Keep only the least significant bits of the timestamp, to reduce the
// probability of collisions.
int64_t key = (base::Time::Now().ToInternalValue() & kKeepLowBytes) |
(base::RandUint64() & kKeepHighBytes);
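// Sketch of the resulting layout (illustrative only):
//   key = [ random : high 32 bits | timestamp : low 32 bits ]
// HexEncode below then renders the 8 bytes of |key| as a 16-character hex
// string.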
return base::HexEncode(reinterpret_cast<void*>(&key), sizeof(key));
}
void SpeechRecognitionEngine::UploadAudioChunk(const std::string& data,
FrameType type,
bool is_final) {
if (use_framed_post_data_) {
std::string frame(data.size() + 8, 0);
base::WriteBigEndian(&frame[0], static_cast<uint32_t>(data.size()));
base::WriteBigEndian(&frame[4], static_cast<uint32_t>(type));
frame.replace(8, data.size(), data);
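// Resulting frame layout, with both header fields written big-endian:
//   [ payload size : 4 bytes ][ frame type : 4 bytes ][ payload : data.size() bytes ]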
upstream_loader_->AppendChunkToUpload(frame, is_final);
} else {
upstream_loader_->AppendChunkToUpload(data, is_final);
}
}
SpeechRecognitionEngine::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
: event(event_value) {
}
SpeechRecognitionEngine::FSMEventArgs::~FSMEventArgs() {
}
} // namespace content