// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "content/browser/speech/network_speech_recognition_engine_impl.h"
#include <algorithm>
#include <memory>
#include <string_view>
#include <vector>
#include "base/functional/bind.h"
#include "base/metrics/histogram_functions.h"
#include "base/numerics/byte_conversions.h"
#include "base/numerics/safe_conversions.h"
#include "base/rand_util.h"
#include "base/strings/escape.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "components/speech/audio_buffer.h"
#include "content/public/browser/google_streaming_api.pb.h"
#include "google_apis/google_api_keys.h"
#include "media/base/audio_timestamp_helper.h"
#include "media/mojo/mojom/speech_recognition_error.mojom.h"
#include "media/mojo/mojom/speech_recognition_result.mojom.h"
#include "mojo/public/c/system/types.h"
#include "mojo/public/cpp/bindings/receiver_set.h"
#include "net/base/load_flags.h"
#include "net/traffic_annotation/network_traffic_annotation.h"
#include "services/network/public/cpp/shared_url_loader_factory.h"
namespace content {
namespace {
const char kWebServiceBaseUrl[] =
"https://www.google.com/speech-api/full-duplex/v1";
const char kDownstreamUrl[] = "/down?";
const char kUpstreamUrl[] = "/up?";
constexpr char kWebSpeechAudioDuration[] = "Accessibility.WebSpeech.Duration";
// Used to override |kWebServiceBaseUrl| when non-null; only set in tests.
const char* web_service_base_url_for_tests = nullptr;
// The maximum value of the maxAlternatives parameter; matches the limit
// supported by the server.
const uint32_t kMaxMaxAlternatives = 30;
// TODO(hans): Remove this and other logging when we don't need it anymore.
void DumpResponse(const std::string& response) {
DVLOG(1) << "------------";
proto::SpeechRecognitionEvent event;
if (!event.ParseFromString(response)) {
DVLOG(1) << "Parse failed!";
return;
}
if (event.has_status()) {
DVLOG(1) << "STATUS\t" << event.status();
}
if (event.has_endpoint()) {
DVLOG(1) << "ENDPOINT\t" << event.endpoint();
}
for (int i = 0; i < event.result_size(); ++i) {
DVLOG(1) << "RESULT #" << i << ":";
const proto::SpeechRecognitionResult& res = event.result(i);
if (res.has_final()) {
DVLOG(1) << " final:\t" << res.final();
}
if (res.has_stability()) {
DVLOG(1) << " STABILITY:\t" << res.stability();
}
for (int j = 0; j < res.alternative_size(); ++j) {
const proto::SpeechRecognitionAlternative& alt = res.alternative(j);
if (alt.has_confidence()) {
DVLOG(1) << " CONFIDENCE:\t" << alt.confidence();
}
if (alt.has_transcript()) {
DVLOG(1) << " TRANSCRIPT:\t" << alt.transcript();
}
}
}
}
const int kDefaultConfigSampleRate = 8000;
const int kDefaultConfigBitsPerSample = 16;
const uint32_t kDefaultMaxHypotheses = 1;
} // namespace
NetworkSpeechRecognitionEngineImpl::Config::Config()
: max_hypotheses(kDefaultMaxHypotheses),
audio_sample_rate(kDefaultConfigSampleRate),
audio_num_bits_per_sample(kDefaultConfigBitsPerSample) {}
NetworkSpeechRecognitionEngineImpl::Config::~Config() = default;
const int NetworkSpeechRecognitionEngineImpl::kAudioPacketIntervalMs = 100;
const int NetworkSpeechRecognitionEngineImpl::kWebserviceStatusNoError = 0;
const int NetworkSpeechRecognitionEngineImpl::kWebserviceStatusErrorNoMatch = 5;
NetworkSpeechRecognitionEngineImpl::NetworkSpeechRecognitionEngineImpl(
scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory)
: shared_url_loader_factory_(std::move(shared_url_loader_factory)) {}
NetworkSpeechRecognitionEngineImpl::~NetworkSpeechRecognitionEngineImpl() {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
}
void NetworkSpeechRecognitionEngineImpl::set_web_service_base_url_for_tests(
const char* base_url_for_tests) {
web_service_base_url_for_tests = base_url_for_tests;
}
void NetworkSpeechRecognitionEngineImpl::SetConfig(const Config& config) {
config_ = config;
}
bool NetworkSpeechRecognitionEngineImpl::IsRecognitionPending() const {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
return state_ != STATE_IDLE;
}
void NetworkSpeechRecognitionEngineImpl::StartRecognition() {
upstream_audio_duration_ = base::TimeDelta();
FSMEventArgs event_args(EVENT_START_RECOGNITION);
DispatchEvent(event_args);
}
void NetworkSpeechRecognitionEngineImpl::UpdateRecognitionContext(
const media::SpeechRecognitionRecognitionContext& recognition_context) {
Abort(media::mojom::SpeechRecognitionErrorCode::kPhrasesNotSupported);
}
void NetworkSpeechRecognitionEngineImpl::EndRecognition() {
base::UmaHistogramLongTimes100(kWebSpeechAudioDuration,
upstream_audio_duration_);
FSMEventArgs event_args(EVENT_END_RECOGNITION);
DispatchEvent(event_args);
}
void NetworkSpeechRecognitionEngineImpl::TakeAudioChunk(
const AudioChunk& data) {
FSMEventArgs event_args(EVENT_AUDIO_CHUNK);
event_args.audio_data = &data;
DispatchEvent(event_args);
}
void NetworkSpeechRecognitionEngineImpl::AudioChunksEnded() {
FSMEventArgs event_args(EVENT_AUDIO_CHUNKS_ENDED);
DispatchEvent(event_args);
}
void NetworkSpeechRecognitionEngineImpl::OnUpstreamDataComplete(
bool success,
int response_code) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
DVLOG(1) << "Upstream complete success: " << success
<< " response_code: " << response_code;
if (!success) {
FSMEventArgs event_args(EVENT_UPSTREAM_ERROR);
DispatchEvent(event_args);
return;
}
// Do nothing on clean completion of the upstream request.
}
void NetworkSpeechRecognitionEngineImpl::OnDownstreamDataReceived(
std::string_view new_response_data) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
DVLOG(1) << "Downstream length: " << new_response_data.size();
// The downstream response is organized in chunks, whose size is determined
// by a 4-byte prefix, transparently handled by the ChunkedByteBuffer class.
// Such chunks are sent by the speech recognition web service over the HTTP
// downstream channel using HTTP chunked transfer encoding (unrelated to our
// chunks). This function is called every time an HTTP chunk is received by
// the URL loader. There is no particular alignment between our protocol
// chunks and HTTP chunks: a single HTTP chunk can contain a portion of one
// protocol chunk, or several protocol chunks together.
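// E.g., assuming the usual big-endian length prefix, a protocol chunk
// carrying a 5-byte message arrives on the wire as
//   00 00 00 05 | <5 payload bytes>
// possibly split across, or coalesced with, other HTTP chunks.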
chunked_byte_buffer_.Append(new_response_data);
// A single HTTP chunk can contain more than one protocol chunk, hence the
// loop.
while (chunked_byte_buffer_.HasChunks()) {
FSMEventArgs event_args(EVENT_DOWNSTREAM_RESPONSE);
event_args.response = chunked_byte_buffer_.PopChunk();
DCHECK(event_args.response.get());
DumpResponse(
std::string(event_args.response->begin(), event_args.response->end()));
DispatchEvent(event_args);
}
}
void NetworkSpeechRecognitionEngineImpl::OnDownstreamDataComplete(
bool success,
int response_code) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
DVLOG(1) << "Downstream complete success: " << success
<< " response_code: " << response_code;
if (!success) {
FSMEventArgs event_args(EVENT_DOWNSTREAM_ERROR);
DispatchEvent(event_args);
return;
}
FSMEventArgs event_args(EVENT_DOWNSTREAM_CLOSED);
DispatchEvent(event_args);
}
int NetworkSpeechRecognitionEngineImpl::GetDesiredAudioChunkDurationMs() const {
return kAudioPacketIntervalMs;
}
// ----------------------- Core FSM implementation ---------------------------
void NetworkSpeechRecognitionEngineImpl::DispatchEvent(
const FSMEventArgs& event_args) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
DCHECK_LE(event_args.event, EVENT_MAX_VALUE);
DCHECK_LE(state_, STATE_MAX_VALUE);
// Event dispatching must be sequential; otherwise it would violate the
// assumptions of the finite-state machine model.
DCHECK(!is_dispatching_event_);
is_dispatching_event_ = true;
state_ = ExecuteTransitionAndGetNextState(event_args);
is_dispatching_event_ = false;
}
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::ExecuteTransitionAndGetNextState(
const FSMEventArgs& event_args) {
const FSMEvent event = event_args.event;
switch (state_) {
case STATE_IDLE:
switch (event) {
case EVENT_START_RECOGNITION:
return ConnectBothStreams(event_args);
case EVENT_END_RECOGNITION:
// Note: EVENT_AUDIO_CHUNK and EVENT_AUDIO_CHUNKS_ENDED can remain enqueued
// after an abort, so we just silently drop them here.
case EVENT_AUDIO_CHUNK:
case EVENT_AUDIO_CHUNKS_ENDED:
// DOWNSTREAM_CLOSED can be received if we end up here due to an error.
case EVENT_DOWNSTREAM_CLOSED:
return DoNothing(event_args);
case EVENT_UPSTREAM_ERROR:
case EVENT_DOWNSTREAM_ERROR:
case EVENT_DOWNSTREAM_RESPONSE:
return NotFeasible(event_args);
}
break;
case STATE_BOTH_STREAMS_CONNECTED:
switch (event) {
case EVENT_AUDIO_CHUNK:
return TransmitAudioUpstream(event_args);
case EVENT_DOWNSTREAM_RESPONSE:
return ProcessDownstreamResponse(event_args);
case EVENT_AUDIO_CHUNKS_ENDED:
return CloseUpstreamAndWaitForResults(event_args);
case EVENT_END_RECOGNITION:
return AbortSilently(event_args);
case EVENT_UPSTREAM_ERROR:
case EVENT_DOWNSTREAM_ERROR:
case EVENT_DOWNSTREAM_CLOSED:
return AbortWithError(event_args);
case EVENT_START_RECOGNITION:
return NotFeasible(event_args);
}
break;
case STATE_WAITING_DOWNSTREAM_RESULTS:
switch (event) {
case EVENT_DOWNSTREAM_RESPONSE:
return ProcessDownstreamResponse(event_args);
case EVENT_DOWNSTREAM_CLOSED:
return RaiseNoMatchErrorIfGotNoResults(event_args);
case EVENT_END_RECOGNITION:
return AbortSilently(event_args);
case EVENT_UPSTREAM_ERROR:
case EVENT_DOWNSTREAM_ERROR:
return AbortWithError(event_args);
case EVENT_START_RECOGNITION:
case EVENT_AUDIO_CHUNK:
case EVENT_AUDIO_CHUNKS_ENDED:
return NotFeasible(event_args);
}
break;
}
return NotFeasible(event_args);
}
// ----------- Contract for all the FSM evolution functions below -------------
// - They are guaranteed to be executed on the same sequence (IO, except for
// tests);
// - They are guaranteed not to be reentrant (with themselves or each other);
// - event_args members are guaranteed to be stable during the call.
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::ConnectBothStreams(const FSMEventArgs&) {
DCHECK(!upstream_loader_.get());
DCHECK(!downstream_loader_.get());
encoder_ = std::make_unique<AudioEncoder>(config_.audio_sample_rate,
config_.audio_num_bits_per_sample);
DCHECK(encoder_.get());
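// The same randomly generated key is sent as the "pair" parameter on both
// the upstream and the downstream requests, so the server can match the two
// halves of the full-duplex session.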
const std::string request_key = GenerateRequestKey();
// Only use the framed post data format when a preamble needs to be logged.
use_framed_post_data_ =
(config_.preamble && !config_.preamble->sample_data.empty() &&
!config_.auth_token.empty() && !config_.auth_scope.empty());
if (use_framed_post_data_) {
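// |sample_depth| is expressed in bytes per sample, while the encoder takes
// bits per sample, hence the multiplication by 8.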
preamble_encoder_ = std::make_unique<AudioEncoder>(
config_.preamble->sample_rate, config_.preamble->sample_depth * 8);
}
const char* web_service_base_url = !web_service_base_url_for_tests
? kWebServiceBaseUrl
: web_service_base_url_for_tests;
// Setup downstream fetcher.
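// The resulting URL looks like (illustrative):
// .../speech-api/full-duplex/v1/down?key=<API key>&pair=<request key>&output=pb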
std::vector<std::string> downstream_args;
downstream_args.push_back(
"key=" + base::EscapeQueryParamValue(google_apis::GetAPIKey(), true));
downstream_args.push_back("pair=" + request_key);
downstream_args.push_back("output=pb");
GURL downstream_url(std::string(web_service_base_url) +
std::string(kDownstreamUrl) +
base::JoinString(downstream_args, "&"));
net::NetworkTrafficAnnotationTag downstream_traffic_annotation =
net::DefineNetworkTrafficAnnotation("speech_recognition_downstream", R"(
semantics {
sender: "Speech Recognition"
description:
"Chrome provides translation from speech audio recorded with a "
"microphone to text, by using the Google speech recognition web "
"service. Audio is sent to Google's servers (upstream) and text is "
"returned (downstream). This network request (downstream) sends an "
"id for getting the text response. Then the (upstream) request "
"sends the audio data along with the id. When the server has "
"finished processing the audio data and produced a text response, "
"it replies to this request."
trigger:
"The user chooses to start the recognition by clicking the "
"microphone icon of the pages using Web SpeechRecognition API."
internal {
contacts {
email: "chrome-media-ux@google.com"
}
}
user_data {
type: USER_CONTENT
}
data: "A unique random id for this speech recognition request."
destination: GOOGLE_OWNED_SERVICE
last_reviewed: "2024-2-21"
}
policy {
cookies_allowed: NO
setting:
"The user must allow the browser to access the microphone in a "
"permission prompt. This is set per site (hostname pattern). In "
"the site settings menu, microphone access can be turned off "
"for all sites and site specific settings can be changed."
chrome_policy {
AudioCaptureAllowed {
policy_options {mode: MANDATORY}
AudioCaptureAllowed: false
}
}
chrome_policy {
AudioCaptureAllowedUrls {
policy_options {mode: MANDATORY}
AudioCaptureAllowedUrls: {}
}
}
})");
auto downstream_request = std::make_unique<network::ResourceRequest>();
downstream_request->credentials_mode = network::mojom::CredentialsMode::kOmit;
downstream_request->url = downstream_url;
downstream_loader_ = std::make_unique<speech::DownstreamLoader>(
std::move(downstream_request), downstream_traffic_annotation,
shared_url_loader_factory_.get(), this);
// Setup upstream fetcher.
// TODO(hans): Support for user-selected grammars.
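// The resulting URL looks like (illustrative, with default config values):
// .../v1/up?key=<API key>&pair=<request key>&output=pb&lang=en-US&pFilter=0
//     &maxAlternatives=1&app=chromium&endpoint=1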
std::vector<std::string> upstream_args;
upstream_args.push_back(
"key=" + base::EscapeQueryParamValue(google_apis::GetAPIKey(), true));
upstream_args.push_back("pair=" + request_key);
upstream_args.push_back("output=pb");
upstream_args.push_back("lang=" +
base::EscapeQueryParamValue(config_.language, true));
upstream_args.push_back(config_.filter_profanities ? "pFilter=2"
: "pFilter=0");
if (config_.max_hypotheses > 0U) {
uint32_t max_alternatives =
std::min(kMaxMaxAlternatives, config_.max_hypotheses);
upstream_args.push_back("maxAlternatives=" +
base::NumberToString(max_alternatives));
}
upstream_args.push_back("app=chromium");
for (const media::mojom::SpeechRecognitionGrammar& grammar :
config_.grammars) {
std::string grammar_value(base::NumberToString(grammar.weight) + ":" +
grammar.url.spec());
upstream_args.push_back("grammar=" +
base::EscapeQueryParamValue(grammar_value, true));
}
if (config_.continuous) {
upstream_args.push_back("continuous");
} else {
upstream_args.push_back("endpoint=1");
}
if (config_.interim_results) {
upstream_args.push_back("interim");
}
if (!config_.auth_token.empty() && !config_.auth_scope.empty()) {
upstream_args.push_back(
"authScope=" + base::EscapeQueryParamValue(config_.auth_scope, true));
upstream_args.push_back(
"authToken=" + base::EscapeQueryParamValue(config_.auth_token, true));
}
if (use_framed_post_data_) {
std::string audio_format;
if (preamble_encoder_) {
audio_format = preamble_encoder_->GetMimeType() + ",";
}
audio_format += encoder_->GetMimeType();
upstream_args.push_back("audioFormat=" +
base::EscapeQueryParamValue(audio_format, true));
}
GURL upstream_url(std::string(web_service_base_url) +
std::string(kUpstreamUrl) +
base::JoinString(upstream_args, "&"));
net::NetworkTrafficAnnotationTag upstream_traffic_annotation =
net::DefineNetworkTrafficAnnotation("speech_recognition_upstream", R"(
semantics {
sender: "Speech Recognition"
description:
"Chrome provides translation from speech audio recorded with a "
"microphone to text, by using the Google speech recognition web "
"service. Audio is sent to Google's servers (upstream) and text is "
"returned (downstream)."
trigger:
"The user chooses to start the recognition by clicking the "
"microphone icon of the pages using Web SpeechRecognition API."
internal {
contacts {
email: "chrome-media-ux@google.com"
}
}
user_data {
type: USER_CONTENT
}
data:
"Audio recorded with the microphone, and the unique id of "
"downstream speech recognition request."
destination: GOOGLE_OWNED_SERVICE
last_reviewed: "2024-2-21"
}
policy {
cookies_allowed: NO
setting:
"The user must allow the browser to access the microphone in a "
"permission prompt. This is set per site (hostname pattern). In "
"the site settings menu, microphone access can be turned off "
"for all sites and site specific settings can be changed."
chrome_policy {
AudioCaptureAllowed {
policy_options {mode: MANDATORY}
AudioCaptureAllowed: false
}
}
chrome_policy {
AudioCaptureAllowedUrls {
policy_options {mode: MANDATORY}
AudioCaptureAllowedUrls: {}
}
}
})");
auto upstream_request = std::make_unique<network::ResourceRequest>();
upstream_request->url = upstream_url;
upstream_request->method = "POST";
upstream_request->referrer = GURL(config_.origin_url);
upstream_request->credentials_mode = network::mojom::CredentialsMode::kOmit;
if (use_framed_post_data_) {
upstream_request->headers.SetHeader(net::HttpRequestHeaders::kContentType,
"application/octet-stream");
} else {
upstream_request->headers.SetHeader(net::HttpRequestHeaders::kContentType,
encoder_->GetMimeType());
}
upstream_loader_ = std::make_unique<speech::UpstreamLoader>(
std::move(upstream_request), upstream_traffic_annotation,
shared_url_loader_factory_.get(), this);
if (preamble_encoder_) {
// Encode and send preamble right away.
scoped_refptr<AudioChunk> chunk = new AudioChunk(
reinterpret_cast<const uint8_t*>(config_.preamble->sample_data.data()),
config_.preamble->sample_data.size(), config_.preamble->sample_depth);
preamble_encoder_->Encode(*chunk);
preamble_encoder_->Flush();
scoped_refptr<AudioChunk> encoded_data(
preamble_encoder_->GetEncodedDataAndClear());
UploadAudioChunk(encoded_data->AsString(), FRAME_PREAMBLE_AUDIO, false);
}
return STATE_BOTH_STREAMS_CONNECTED;
}
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::TransmitAudioUpstream(
const FSMEventArgs& event_args) {
DCHECK(upstream_loader_.get());
DCHECK(event_args.audio_data.get());
const AudioChunk& audio = *(event_args.audio_data.get());
base::TimeDelta duration = media::AudioTimestampHelper::FramesToTime(
audio.NumSamples(), config_.audio_sample_rate);
upstream_audio_duration_ += duration;
DCHECK_EQ(audio.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
encoder_->Encode(audio);
scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
UploadAudioChunk(encoded_data->AsString(), FRAME_RECOGNITION_AUDIO, false);
return state_;
}
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::ProcessDownstreamResponse(
const FSMEventArgs& event_args) {
DCHECK(event_args.response.get());
proto::SpeechRecognitionEvent ws_event;
if (!ws_event.ParseFromString(std::string(event_args.response->begin(),
event_args.response->end()))) {
return AbortWithError(event_args);
}
if (ws_event.has_status()) {
switch (ws_event.status()) {
case proto::SpeechRecognitionEvent::STATUS_SUCCESS:
break;
case proto::SpeechRecognitionEvent::STATUS_NO_SPEECH:
return Abort(media::mojom::SpeechRecognitionErrorCode::kNoSpeech);
case proto::SpeechRecognitionEvent::STATUS_ABORTED:
return Abort(media::mojom::SpeechRecognitionErrorCode::kAborted);
case proto::SpeechRecognitionEvent::STATUS_AUDIO_CAPTURE:
return Abort(media::mojom::SpeechRecognitionErrorCode::kAudioCapture);
case proto::SpeechRecognitionEvent::STATUS_NETWORK:
return Abort(media::mojom::SpeechRecognitionErrorCode::kNetwork);
case proto::SpeechRecognitionEvent::STATUS_NOT_ALLOWED:
return Abort(media::mojom::SpeechRecognitionErrorCode::kNotAllowed);
case proto::SpeechRecognitionEvent::STATUS_SERVICE_NOT_ALLOWED:
return Abort(
media::mojom::SpeechRecognitionErrorCode::kServiceNotAllowed);
case proto::SpeechRecognitionEvent::STATUS_BAD_GRAMMAR:
return Abort(media::mojom::SpeechRecognitionErrorCode::kBadGrammar);
case proto::SpeechRecognitionEvent::STATUS_LANGUAGE_NOT_SUPPORTED:
return Abort(
media::mojom::SpeechRecognitionErrorCode::kLanguageNotSupported);
}
}
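// In non-continuous mode the server performs the endpointing ("endpoint=1"
// is sent on the upstream URL) and signals END_OF_UTTERANCE once the user
// stops speaking.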
if (!config_.continuous && ws_event.has_endpoint() &&
ws_event.endpoint() == proto::SpeechRecognitionEvent::END_OF_UTTERANCE) {
delegate_->OnSpeechRecognitionEngineEndOfUtterance();
}
std::vector<media::mojom::WebSpeechRecognitionResultPtr> results;
for (int i = 0; i < ws_event.result_size(); ++i) {
const proto::SpeechRecognitionResult& ws_result = ws_event.result(i);
results.push_back(media::mojom::WebSpeechRecognitionResult::New());
media::mojom::WebSpeechRecognitionResultPtr& result = results.back();
result->is_provisional = !(ws_result.has_final() && ws_result.final());
if (!result->is_provisional) {
got_last_definitive_result_ = true;
}
for (int j = 0; j < ws_result.alternative_size(); ++j) {
const proto::SpeechRecognitionAlternative& ws_alternative =
ws_result.alternative(j);
media::mojom::SpeechRecognitionHypothesisPtr hypothesis =
media::mojom::SpeechRecognitionHypothesis::New();
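// Prefer the per-alternative confidence; fall back to the result-level
// stability estimate when the alternative carries no confidence.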
if (ws_alternative.has_confidence()) {
hypothesis->confidence = ws_alternative.confidence();
} else if (ws_result.has_stability()) {
hypothesis->confidence = ws_result.stability();
}
DCHECK(ws_alternative.has_transcript());
// TODO(hans): Perhaps the transcript should be required in the proto?
if (ws_alternative.has_transcript()) {
hypothesis->utterance = base::UTF8ToUTF16(ws_alternative.transcript());
}
result->hypotheses.push_back(std::move(hypothesis));
}
}
if (!results.empty()) {
delegate_->OnSpeechRecognitionEngineResults(results);
}
return state_;
}
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::RaiseNoMatchErrorIfGotNoResults(
const FSMEventArgs& event_args) {
if (!got_last_definitive_result_) {
// Provide an empty result to notify that recognition ended with no errors,
// but also without producing any further results.
delegate_->OnSpeechRecognitionEngineResults(
std::vector<media::mojom::WebSpeechRecognitionResultPtr>());
}
return AbortSilently(event_args);
}
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::CloseUpstreamAndWaitForResults(
const FSMEventArgs&) {
DCHECK(upstream_loader_.get());
DCHECK(encoder_.get());
DVLOG(1) << "Closing upstream.";
// The encoder requires a non-empty final buffer, so we encode a packet of
// silence in case the encoder has no data pending.
size_t sample_count =
config_.audio_sample_rate * GetDesiredAudioChunkDurationMs() / 1000;
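// With the default config (8000 Hz, 100 ms packets) this is 800 samples,
// i.e. 1600 zero bytes of 16-bit PCM.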
scoped_refptr<AudioChunk> dummy_chunk = new AudioChunk(
sample_count * sizeof(int16_t), encoder_->GetBitsPerSample() / 8);
encoder_->Encode(*dummy_chunk.get());
encoder_->Flush();
scoped_refptr<AudioChunk> encoded_dummy_data =
encoder_->GetEncodedDataAndClear();
DCHECK(!encoded_dummy_data->IsEmpty());
encoder_.reset();
UploadAudioChunk(encoded_dummy_data->AsString(), FRAME_RECOGNITION_AUDIO,
true);
got_last_definitive_result_ = false;
return STATE_WAITING_DOWNSTREAM_RESULTS;
}
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::CloseDownstream(const FSMEventArgs&) {
DCHECK(!upstream_loader_.get());
DCHECK(downstream_loader_.get());
DVLOG(1) << "Closing downstream.";
downstream_loader_.reset();
return STATE_IDLE;
}
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::AbortSilently(const FSMEventArgs&) {
return Abort(media::mojom::SpeechRecognitionErrorCode::kNone);
}
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::AbortWithError(const FSMEventArgs&) {
return Abort(media::mojom::SpeechRecognitionErrorCode::kNetwork);
}
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::Abort(
media::mojom::SpeechRecognitionErrorCode error_code) {
DVLOG(1) << "Aborting with error " << error_code;
if (error_code != media::mojom::SpeechRecognitionErrorCode::kNone) {
delegate_->OnSpeechRecognitionEngineError(
media::mojom::SpeechRecognitionError(
error_code, media::mojom::SpeechAudioErrorDetails::kNone));
}
downstream_loader_.reset();
upstream_loader_.reset();
encoder_.reset();
return STATE_IDLE;
}
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::DoNothing(const FSMEventArgs&) {
return state_;
}
NetworkSpeechRecognitionEngineImpl::FSMState
NetworkSpeechRecognitionEngineImpl::NotFeasible(
const FSMEventArgs& event_args) {
NOTREACHED() << "Unfeasible event " << event_args.event << " in state "
<< state_;
}
// TODO(primiano): Is there any utility in the codebase that already does this?
std::string NetworkSpeechRecognitionEngineImpl::GenerateRequestKey() const {
const int64_t kKeepLowBytes = 0x00000000FFFFFFFFLL;
const int64_t kKeepHighBytes = 0xFFFFFFFF00000000LL;
// Keep only the least significant 32 bits of the timestamp and take the high
// 32 bits from a random number, to reduce the probability of collisions.
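// HexEncode() then turns the 8-byte key into a 16-character hex string,
// e.g. "1A2B3C4D5E6F7081" (illustrative).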
int64_t key = (base::Time::Now().ToInternalValue() & kKeepLowBytes) |
(base::RandUint64() & kKeepHighBytes);
return base::HexEncode(reinterpret_cast<void*>(&key), sizeof(key));
}
void NetworkSpeechRecognitionEngineImpl::UploadAudioChunk(
const std::string& data,
FrameType type,
bool is_final) {
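// Framed format: each payload is preceded by an 8-byte header, i.e. a 4-byte
// big-endian payload length followed by a 4-byte big-endian frame type. For
// example, a 3-byte payload with frame type 1 (illustrative value) is sent as
//   00 00 00 03 | 00 00 00 01 | <3 payload bytes>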
if (use_framed_post_data_) {
std::string frame(data.size() + 8u, char{0});
auto frame_span = base::as_writable_byte_span(frame);
frame_span.subspan<0u, 4u>().copy_from(
base::U32ToBigEndian(static_cast<uint32_t>(data.size())));
frame_span.subspan<4u, 4u>().copy_from(
base::U32ToBigEndian(base::checked_cast<uint32_t>(type)));
frame.replace(8u, data.size(), data);
upstream_loader_->AppendChunkToUpload(frame, is_final);
} else {
upstream_loader_->AppendChunkToUpload(data, is_final);
}
}
NetworkSpeechRecognitionEngineImpl::FSMEventArgs::FSMEventArgs(
FSMEvent event_value)
: event(event_value) {}
NetworkSpeechRecognitionEngineImpl::FSMEventArgs::~FSMEventArgs() = default;
} // namespace content