| // Copyright 2024 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "content/browser/speech/network_speech_recognition_engine_impl.h" |
| |
| #include <algorithm> |
| #include <memory> |
| #include <string_view> |
| #include <vector> |
| |
| #include "base/functional/bind.h" |
| #include "base/metrics/histogram_functions.h" |
| #include "base/numerics/byte_conversions.h" |
| #include "base/numerics/safe_conversions.h" |
| #include "base/rand_util.h" |
| #include "base/strings/escape.h" |
| #include "base/strings/string_number_conversions.h" |
| #include "base/strings/string_util.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "base/time/time.h" |
| #include "components/speech/audio_buffer.h" |
| #include "content/public/browser/google_streaming_api.pb.h" |
| #include "google_apis/google_api_keys.h" |
| #include "media/base/audio_timestamp_helper.h" |
| #include "media/mojo/mojom/speech_recognition_error.mojom.h" |
| #include "media/mojo/mojom/speech_recognition_result.mojom.h" |
| #include "mojo/public/c/system/types.h" |
| #include "mojo/public/cpp/bindings/receiver_set.h" |
| #include "net/base/load_flags.h" |
| #include "net/traffic_annotation/network_traffic_annotation.h" |
| #include "services/network/public/cpp/shared_url_loader_factory.h" |
| |
| namespace content { |
| namespace { |
| |
| const char kWebServiceBaseUrl[] = |
| "https://www.google.com/speech-api/full-duplex/v1"; |
| const char kDownstreamUrl[] = "/down?"; |
| const char kUpstreamUrl[] = "/up?"; |
| |
| constexpr char kWebSpeechAudioDuration[] = "Accessibility.WebSpeech.Duration"; |
| |
| // Overrides |kWebServiceBaseUrl| when non-null; set only in tests. |
| const char* web_service_base_url_for_tests = nullptr; |
| |
| // This matches the maximum maxAlternatives value supported by the server. |
| const uint32_t kMaxMaxAlternatives = 30; |
| |
| // TODO(hans): Remove this and other logging when we don't need it anymore. |
| void DumpResponse(const std::string& response) { |
| DVLOG(1) << "------------"; |
| proto::SpeechRecognitionEvent event; |
| if (!event.ParseFromString(response)) { |
| DVLOG(1) << "Parse failed!"; |
| return; |
| } |
| if (event.has_status()) { |
| DVLOG(1) << "STATUS\t" << event.status(); |
| } |
| if (event.has_endpoint()) { |
| DVLOG(1) << "ENDPOINT\t" << event.endpoint(); |
| } |
| for (int i = 0; i < event.result_size(); ++i) { |
| DVLOG(1) << "RESULT #" << i << ":"; |
| const proto::SpeechRecognitionResult& res = event.result(i); |
| if (res.has_final()) { |
| DVLOG(1) << " final:\t" << res.final(); |
| } |
| if (res.has_stability()) { |
| DVLOG(1) << " STABILITY:\t" << res.stability(); |
| } |
| for (int j = 0; j < res.alternative_size(); ++j) { |
| const proto::SpeechRecognitionAlternative& alt = res.alternative(j); |
| if (alt.has_confidence()) { |
| DVLOG(1) << " CONFIDENCE:\t" << alt.confidence(); |
| } |
| if (alt.has_transcript()) { |
| DVLOG(1) << " TRANSCRIPT:\t" << alt.transcript(); |
| } |
| } |
| } |
| } |
| |
| const int kDefaultConfigSampleRate = 8000; |
| const int kDefaultConfigBitsPerSample = 16; |
| const uint32_t kDefaultMaxHypotheses = 1; |
| |
| } // namespace |
| |
| NetworkSpeechRecognitionEngineImpl::Config::Config() |
| : max_hypotheses(kDefaultMaxHypotheses), |
| audio_sample_rate(kDefaultConfigSampleRate), |
| audio_num_bits_per_sample(kDefaultConfigBitsPerSample) {} |
| |
| NetworkSpeechRecognitionEngineImpl::Config::~Config() = default; |
| |
| const int NetworkSpeechRecognitionEngineImpl::kAudioPacketIntervalMs = 100; |
| const int NetworkSpeechRecognitionEngineImpl::kWebserviceStatusNoError = 0; |
| const int NetworkSpeechRecognitionEngineImpl::kWebserviceStatusErrorNoMatch = 5; |
| |
| NetworkSpeechRecognitionEngineImpl::NetworkSpeechRecognitionEngineImpl( |
| scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory) |
| : shared_url_loader_factory_(std::move(shared_url_loader_factory)) {} |
| |
| NetworkSpeechRecognitionEngineImpl::~NetworkSpeechRecognitionEngineImpl() { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| } |
| |
| void NetworkSpeechRecognitionEngineImpl::set_web_service_base_url_for_tests( |
| const char* base_url_for_tests) { |
| web_service_base_url_for_tests = base_url_for_tests; |
| } |
| |
| void NetworkSpeechRecognitionEngineImpl::SetConfig(const Config& config) { |
| config_ = config; |
| } |
| |
| bool NetworkSpeechRecognitionEngineImpl::IsRecognitionPending() const { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| return state_ != STATE_IDLE; |
| } |
| |
| void NetworkSpeechRecognitionEngineImpl::StartRecognition() { |
| upstream_audio_duration_ = base::TimeDelta(); |
| FSMEventArgs event_args(EVENT_START_RECOGNITION); |
| DispatchEvent(event_args); |
| } |
| |
| void NetworkSpeechRecognitionEngineImpl::UpdateRecognitionContext( |
| const media::SpeechRecognitionRecognitionContext& recognition_context) { |
| Abort(media::mojom::SpeechRecognitionErrorCode::kPhrasesNotSupported); |
| } |
| |
| void NetworkSpeechRecognitionEngineImpl::EndRecognition() { |
| base::UmaHistogramLongTimes100(kWebSpeechAudioDuration, |
| upstream_audio_duration_); |
| |
| FSMEventArgs event_args(EVENT_END_RECOGNITION); |
| DispatchEvent(event_args); |
| } |
| |
| void NetworkSpeechRecognitionEngineImpl::TakeAudioChunk( |
| const AudioChunk& data) { |
| FSMEventArgs event_args(EVENT_AUDIO_CHUNK); |
| event_args.audio_data = &data; |
| DispatchEvent(event_args); |
| } |
| |
| void NetworkSpeechRecognitionEngineImpl::AudioChunksEnded() { |
| FSMEventArgs event_args(EVENT_AUDIO_CHUNKS_ENDED); |
| DispatchEvent(event_args); |
| } |
| |
| void NetworkSpeechRecognitionEngineImpl::OnUpstreamDataComplete( |
| bool success, |
| int response_code) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| |
| DVLOG(1) << "Upstream complete success: " << success |
| << " response_code: " << response_code; |
| |
| if (!success) { |
| FSMEventArgs event_args(EVENT_UPSTREAM_ERROR); |
| DispatchEvent(event_args); |
| return; |
| } |
| |
| // Do nothing on clean completion of the upstream request. |
| } |
| |
| void NetworkSpeechRecognitionEngineImpl::OnDownstreamDataReceived( |
| std::string_view new_response_data) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| |
| DVLOG(1) << "Downstream length: " << new_response_data.size(); |
| |
| // The downstream response is organized in protocol chunks, whose size is |
| // given by a 4-byte prefix that the ChunkedByteBuffer class handles |
| // transparently. These chunks are sent by the speech recognition web |
| // service over the HTTP downstream channel using HTTP chunked transfer |
| // encoding (unrelated to our protocol chunks). This function is called |
| // every time an HTTP chunk is received by the URL loader; there is no |
| // particular alignment between protocol chunks and HTTP chunks, so a |
| // single HTTP chunk can carry a fragment of one protocol chunk or several |
| // whole ones. |
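| // For example, a protocol chunk carrying a 20-byte serialized |
| // SpeechRecognitionEvent arrives as a 4-byte length prefix announcing 20 |
| // bytes followed by the 20 payload bytes; ChunkedByteBuffer reassembles |
| // protocol chunks across HTTP chunk boundaries. |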
| chunked_byte_buffer_.Append(new_response_data); |
| |
| // A single HTTP chunk can contain more than one protocol chunk, hence the |
| // loop. |
| while (chunked_byte_buffer_.HasChunks()) { |
| FSMEventArgs event_args(EVENT_DOWNSTREAM_RESPONSE); |
| event_args.response = chunked_byte_buffer_.PopChunk(); |
| DCHECK(event_args.response.get()); |
| DumpResponse( |
| std::string(event_args.response->begin(), event_args.response->end())); |
| DispatchEvent(event_args); |
| } |
| } |
| |
| void NetworkSpeechRecognitionEngineImpl::OnDownstreamDataComplete( |
| bool success, |
| int response_code) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| |
| DVLOG(1) << "Downstream complete success: " << success |
| << " response_code: " << response_code; |
| |
| if (!success) { |
| FSMEventArgs event_args(EVENT_DOWNSTREAM_ERROR); |
| DispatchEvent(event_args); |
| return; |
| } |
| |
| FSMEventArgs event_args(EVENT_DOWNSTREAM_CLOSED); |
| DispatchEvent(event_args); |
| } |
| |
| int NetworkSpeechRecognitionEngineImpl::GetDesiredAudioChunkDurationMs() const { |
| return kAudioPacketIntervalMs; |
| } |
| |
| // ----------------------- Core FSM implementation --------------------------- |
| |
| void NetworkSpeechRecognitionEngineImpl::DispatchEvent( |
| const FSMEventArgs& event_args) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| DCHECK_LE(event_args.event, EVENT_MAX_VALUE); |
| DCHECK_LE(state_, STATE_MAX_VALUE); |
| |
| // Event dispatching must be sequential; otherwise it would break the rules |
| // and assumptions of the finite-state automaton model. |
| DCHECK(!is_dispatching_event_); |
| is_dispatching_event_ = true; |
| |
| state_ = ExecuteTransitionAndGetNextState(event_args); |
| |
| is_dispatching_event_ = false; |
| } |
| |
| NetworkSpeechRecognitionEngineImpl::FSMState |
| NetworkSpeechRecognitionEngineImpl::ExecuteTransitionAndGetNextState( |
| const FSMEventArgs& event_args) { |
| const FSMEvent event = event_args.event; |
| switch (state_) { |
| case STATE_IDLE: |
| switch (event) { |
| case EVENT_START_RECOGNITION: |
| return ConnectBothStreams(event_args); |
| case EVENT_END_RECOGNITION: |
| // Note that AUDIO_CHUNK and AUDIO_CHUNKS_ENDED events can remain enqueued |
| // in case of abort, so we just silently drop them here. |
| case EVENT_AUDIO_CHUNK: |
| case EVENT_AUDIO_CHUNKS_ENDED: |
| // DOWNSTREAM_CLOSED can be received if we end up here due to an error. |
| case EVENT_DOWNSTREAM_CLOSED: |
| return DoNothing(event_args); |
| case EVENT_UPSTREAM_ERROR: |
| case EVENT_DOWNSTREAM_ERROR: |
| case EVENT_DOWNSTREAM_RESPONSE: |
| return NotFeasible(event_args); |
| } |
| break; |
| case STATE_BOTH_STREAMS_CONNECTED: |
| switch (event) { |
| case EVENT_AUDIO_CHUNK: |
| return TransmitAudioUpstream(event_args); |
| case EVENT_DOWNSTREAM_RESPONSE: |
| return ProcessDownstreamResponse(event_args); |
| case EVENT_AUDIO_CHUNKS_ENDED: |
| return CloseUpstreamAndWaitForResults(event_args); |
| case EVENT_END_RECOGNITION: |
| return AbortSilently(event_args); |
| case EVENT_UPSTREAM_ERROR: |
| case EVENT_DOWNSTREAM_ERROR: |
| case EVENT_DOWNSTREAM_CLOSED: |
| return AbortWithError(event_args); |
| case EVENT_START_RECOGNITION: |
| return NotFeasible(event_args); |
| } |
| break; |
| case STATE_WAITING_DOWNSTREAM_RESULTS: |
| switch (event) { |
| case EVENT_DOWNSTREAM_RESPONSE: |
| return ProcessDownstreamResponse(event_args); |
| case EVENT_DOWNSTREAM_CLOSED: |
| return RaiseNoMatchErrorIfGotNoResults(event_args); |
| case EVENT_END_RECOGNITION: |
| return AbortSilently(event_args); |
| case EVENT_UPSTREAM_ERROR: |
| case EVENT_DOWNSTREAM_ERROR: |
| return AbortWithError(event_args); |
| case EVENT_START_RECOGNITION: |
| case EVENT_AUDIO_CHUNK: |
| case EVENT_AUDIO_CHUNKS_ENDED: |
| return NotFeasible(event_args); |
| } |
| break; |
| } |
| return NotFeasible(event_args); |
| } |
| |
| // ----------- Contract for all the FSM evolution functions below ------------- |
| // - They are guaranteed to run on the same sequence (the IO thread, except |
| //   in tests); |
| // - They are guaranteed not to be reentrant (with themselves or each other); |
| // - event_args members are guaranteed to be stable during the call. |
| |
| NetworkSpeechRecognitionEngineImpl::FSMState |
| NetworkSpeechRecognitionEngineImpl::ConnectBothStreams(const FSMEventArgs&) { |
| DCHECK(!upstream_loader_.get()); |
| DCHECK(!downstream_loader_.get()); |
| |
| encoder_ = std::make_unique<AudioEncoder>(config_.audio_sample_rate, |
| config_.audio_num_bits_per_sample); |
| DCHECK(encoder_.get()); |
| const std::string request_key = GenerateRequestKey(); |
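| // The server pairs the upstream and downstream requests through the shared |
| // "pair=<request key>" query parameter generated above. |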
| |
| // Use the framed POST data format only when a preamble needs to be logged; |
| // this also requires the auth credentials to be present. |
| use_framed_post_data_ = |
| (config_.preamble && !config_.preamble->sample_data.empty() && |
| !config_.auth_token.empty() && !config_.auth_scope.empty()); |
| if (use_framed_post_data_) { |
| preamble_encoder_ = std::make_unique<AudioEncoder>( |
| config_.preamble->sample_rate, config_.preamble->sample_depth * 8); |
| } |
| |
| const char* web_service_base_url = !web_service_base_url_for_tests |
| ? kWebServiceBaseUrl |
| : web_service_base_url_for_tests; |
| |
| // Set up the downstream loader. |
| std::vector<std::string> downstream_args; |
| downstream_args.push_back( |
| "key=" + base::EscapeQueryParamValue(google_apis::GetAPIKey(), true)); |
| downstream_args.push_back("pair=" + request_key); |
| downstream_args.push_back("output=pb"); |
| GURL downstream_url(std::string(web_service_base_url) + |
| std::string(kDownstreamUrl) + |
| base::JoinString(downstream_args, "&")); |
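| // The resulting URL looks roughly like: |
| //   <base>/down?key=<API key>&pair=<request key>&output=pb |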
| |
| net::NetworkTrafficAnnotationTag downstream_traffic_annotation = |
| net::DefineNetworkTrafficAnnotation("speech_recognition_downstream", R"( |
| semantics { |
| sender: "Speech Recognition" |
| description: |
| "Chrome provides translation from speech audio recorded with a " |
| "microphone to text, by using the Google speech recognition web " |
| "service. Audio is sent to Google's servers (upstream) and text is " |
| "returned (downstream). This network request (downstream) sends an " |
| "id for getting the text response. Then the (upstream) request " |
| "sends the audio data along with the id. When the server has " |
| "finished processing the audio data and produced a text response, " |
| "it replies to this request." |
| trigger: |
| "The user chooses to start the recognition by clicking the " |
| "microphone icon of the pages using Web SpeechRecognition API." |
| internal { |
| contacts { |
| email: "chrome-media-ux@google.com" |
| } |
| } |
| user_data { |
| type: USER_CONTENT |
| } |
| data: "A unique random id for this speech recognition request." |
| destination: GOOGLE_OWNED_SERVICE |
| last_reviewed: "2024-2-21" |
| } |
| policy { |
| cookies_allowed: NO |
| setting: |
| "The user must allow the browser to access the microphone in a " |
| "permission prompt. This is set per site (hostname pattern). In " |
| "the site settings menu, microphone access can be turned off " |
| "for all sites and site specific settings can be changed." |
| chrome_policy { |
| AudioCaptureAllowed { |
| policy_options {mode: MANDATORY} |
| AudioCaptureAllowed: false |
| } |
| } |
| chrome_policy { |
| AudioCaptureAllowedUrls { |
| policy_options {mode: MANDATORY} |
| AudioCaptureAllowedUrls: {} |
| } |
| } |
| })"); |
| auto downstream_request = std::make_unique<network::ResourceRequest>(); |
| downstream_request->credentials_mode = network::mojom::CredentialsMode::kOmit; |
| downstream_request->url = downstream_url; |
| downstream_loader_ = std::make_unique<speech::DownstreamLoader>( |
| std::move(downstream_request), downstream_traffic_annotation, |
| shared_url_loader_factory_.get(), this); |
| |
| // Set up the upstream loader. |
| // TODO(hans): Support for user-selected grammars. |
| std::vector<std::string> upstream_args; |
| upstream_args.push_back( |
| "key=" + base::EscapeQueryParamValue(google_apis::GetAPIKey(), true)); |
| upstream_args.push_back("pair=" + request_key); |
| upstream_args.push_back("output=pb"); |
| upstream_args.push_back("lang=" + |
| base::EscapeQueryParamValue(config_.language, true)); |
| upstream_args.push_back(config_.filter_profanities ? "pFilter=2" |
| : "pFilter=0"); |
| if (config_.max_hypotheses > 0U) { |
| uint32_t max_alternatives = |
| std::min(kMaxMaxAlternatives, config_.max_hypotheses); |
| upstream_args.push_back("maxAlternatives=" + |
| base::NumberToString(max_alternatives)); |
| } |
| upstream_args.push_back("app=chromium"); |
| for (const media::mojom::SpeechRecognitionGrammar& grammar : |
| config_.grammars) { |
| std::string grammar_value(base::NumberToString(grammar.weight) + ":" + |
| grammar.url.spec()); |
| upstream_args.push_back("grammar=" + |
| base::EscapeQueryParamValue(grammar_value, true)); |
| } |
| if (config_.continuous) { |
| upstream_args.push_back("continuous"); |
| } else { |
| upstream_args.push_back("endpoint=1"); |
| } |
| if (config_.interim_results) { |
| upstream_args.push_back("interim"); |
| } |
| if (!config_.auth_token.empty() && !config_.auth_scope.empty()) { |
| upstream_args.push_back( |
| "authScope=" + base::EscapeQueryParamValue(config_.auth_scope, true)); |
| upstream_args.push_back( |
| "authToken=" + base::EscapeQueryParamValue(config_.auth_token, true)); |
| } |
| if (use_framed_post_data_) { |
| std::string audio_format; |
| if (preamble_encoder_) { |
| audio_format = preamble_encoder_->GetMimeType() + ","; |
| } |
| audio_format += encoder_->GetMimeType(); |
| upstream_args.push_back("audioFormat=" + |
| base::EscapeQueryParamValue(audio_format, true)); |
| } |
| |
| GURL upstream_url(std::string(web_service_base_url) + |
| std::string(kUpstreamUrl) + |
| base::JoinString(upstream_args, "&")); |
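| // The resulting URL looks roughly like (query args vary with the config): |
| //   <base>/up?key=<API key>&pair=<request key>&output=pb&lang=<language>&pFilter=0&maxAlternatives=1&app=chromium&endpoint=1 |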
| |
| net::NetworkTrafficAnnotationTag upstream_traffic_annotation = |
| net::DefineNetworkTrafficAnnotation("speech_recognition_upstream", R"( |
| semantics { |
| sender: "Speech Recognition" |
| description: |
| "Chrome provides translation from speech audio recorded with a " |
| "microphone to text, by using the Google speech recognition web " |
| "service. Audio is sent to Google's servers (upstream) and text is " |
| "returned (downstream)." |
| trigger: |
| "The user chooses to start the recognition by clicking the " |
| "microphone icon of the pages using Web SpeechRecognition API." |
| internal { |
| contacts { |
| email: "chrome-media-ux@google.com" |
| } |
| } |
| user_data { |
| type: USER_CONTENT |
| } |
| data: |
| "Audio recorded with the microphone, and the unique id of " |
| "downstream speech recognition request." |
| destination: GOOGLE_OWNED_SERVICE |
| last_reviewed: "2024-2-21" |
| } |
| policy { |
| cookies_allowed: NO |
| setting: |
| "The user must allow the browser to access the microphone in a " |
| "permission prompt. This is set per site (hostname pattern). In " |
| "the site settings menu, microphone access can be turned off " |
| "for all sites and site specific settings can be changed." |
| chrome_policy { |
| AudioCaptureAllowed { |
| policy_options {mode: MANDATORY} |
| AudioCaptureAllowed: false |
| } |
| } |
| chrome_policy { |
| AudioCaptureAllowedUrls { |
| policy_options {mode: MANDATORY} |
| AudioCaptureAllowedUrls: {} |
| } |
| } |
| })"); |
| |
| auto upstream_request = std::make_unique<network::ResourceRequest>(); |
| upstream_request->url = upstream_url; |
| upstream_request->method = "POST"; |
| upstream_request->referrer = GURL(config_.origin_url); |
| upstream_request->credentials_mode = network::mojom::CredentialsMode::kOmit; |
| if (use_framed_post_data_) { |
| upstream_request->headers.SetHeader(net::HttpRequestHeaders::kContentType, |
| "application/octet-stream"); |
| } else { |
| upstream_request->headers.SetHeader(net::HttpRequestHeaders::kContentType, |
| encoder_->GetMimeType()); |
| } |
| |
| upstream_loader_ = std::make_unique<speech::UpstreamLoader>( |
| std::move(upstream_request), upstream_traffic_annotation, |
| shared_url_loader_factory_.get(), this); |
| |
| if (preamble_encoder_) { |
| // Encode and send the preamble right away. |
| scoped_refptr<AudioChunk> chunk = new AudioChunk( |
| reinterpret_cast<const uint8_t*>(config_.preamble->sample_data.data()), |
| config_.preamble->sample_data.size(), config_.preamble->sample_depth); |
| preamble_encoder_->Encode(*chunk); |
| preamble_encoder_->Flush(); |
| scoped_refptr<AudioChunk> encoded_data( |
| preamble_encoder_->GetEncodedDataAndClear()); |
| UploadAudioChunk(encoded_data->AsString(), FRAME_PREAMBLE_AUDIO, false); |
| } |
| return STATE_BOTH_STREAMS_CONNECTED; |
| } |
| |
| NetworkSpeechRecognitionEngineImpl::FSMState |
| NetworkSpeechRecognitionEngineImpl::TransmitAudioUpstream( |
| const FSMEventArgs& event_args) { |
| DCHECK(upstream_loader_.get()); |
| DCHECK(event_args.audio_data.get()); |
| const AudioChunk& audio = *(event_args.audio_data.get()); |
| |
| base::TimeDelta duration = media::AudioTimestampHelper::FramesToTime( |
| audio.NumSamples(), config_.audio_sample_rate); |
| upstream_audio_duration_ += duration; |
| |
| DCHECK_EQ(audio.bytes_per_sample(), config_.audio_num_bits_per_sample / 8); |
| encoder_->Encode(audio); |
| scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear()); |
| UploadAudioChunk(encoded_data->AsString(), FRAME_RECOGNITION_AUDIO, false); |
| return state_; |
| } |
| |
| NetworkSpeechRecognitionEngineImpl::FSMState |
| NetworkSpeechRecognitionEngineImpl::ProcessDownstreamResponse( |
| const FSMEventArgs& event_args) { |
| DCHECK(event_args.response.get()); |
| |
| proto::SpeechRecognitionEvent ws_event; |
| if (!ws_event.ParseFromString(std::string(event_args.response->begin(), |
| event_args.response->end()))) { |
| return AbortWithError(event_args); |
| } |
| |
| if (ws_event.has_status()) { |
| switch (ws_event.status()) { |
| case proto::SpeechRecognitionEvent::STATUS_SUCCESS: |
| break; |
| case proto::SpeechRecognitionEvent::STATUS_NO_SPEECH: |
| return Abort(media::mojom::SpeechRecognitionErrorCode::kNoSpeech); |
| case proto::SpeechRecognitionEvent::STATUS_ABORTED: |
| return Abort(media::mojom::SpeechRecognitionErrorCode::kAborted); |
| case proto::SpeechRecognitionEvent::STATUS_AUDIO_CAPTURE: |
| return Abort(media::mojom::SpeechRecognitionErrorCode::kAudioCapture); |
| case proto::SpeechRecognitionEvent::STATUS_NETWORK: |
| return Abort(media::mojom::SpeechRecognitionErrorCode::kNetwork); |
| case proto::SpeechRecognitionEvent::STATUS_NOT_ALLOWED: |
| return Abort(media::mojom::SpeechRecognitionErrorCode::kNotAllowed); |
| case proto::SpeechRecognitionEvent::STATUS_SERVICE_NOT_ALLOWED: |
| return Abort( |
| media::mojom::SpeechRecognitionErrorCode::kServiceNotAllowed); |
| case proto::SpeechRecognitionEvent::STATUS_BAD_GRAMMAR: |
| return Abort(media::mojom::SpeechRecognitionErrorCode::kBadGrammar); |
| case proto::SpeechRecognitionEvent::STATUS_LANGUAGE_NOT_SUPPORTED: |
| return Abort( |
| media::mojom::SpeechRecognitionErrorCode::kLanguageNotSupported); |
| } |
| } |
| |
| if (!config_.continuous && ws_event.has_endpoint() && |
| ws_event.endpoint() == proto::SpeechRecognitionEvent::END_OF_UTTERANCE) { |
| delegate_->OnSpeechRecognitionEngineEndOfUtterance(); |
| } |
| |
| std::vector<media::mojom::WebSpeechRecognitionResultPtr> results; |
| for (int i = 0; i < ws_event.result_size(); ++i) { |
| const proto::SpeechRecognitionResult& ws_result = ws_event.result(i); |
| results.push_back(media::mojom::WebSpeechRecognitionResult::New()); |
| media::mojom::WebSpeechRecognitionResultPtr& result = results.back(); |
| result->is_provisional = !(ws_result.has_final() && ws_result.final()); |
| |
| if (!result->is_provisional) { |
| got_last_definitive_result_ = true; |
| } |
| |
| for (int j = 0; j < ws_result.alternative_size(); ++j) { |
| const proto::SpeechRecognitionAlternative& ws_alternative = |
| ws_result.alternative(j); |
| media::mojom::SpeechRecognitionHypothesisPtr hypothesis = |
| media::mojom::SpeechRecognitionHypothesis::New(); |
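| // Prefer the per-alternative confidence; otherwise fall back to the |
| // result-level stability score. |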
| if (ws_alternative.has_confidence()) { |
| hypothesis->confidence = ws_alternative.confidence(); |
| } else if (ws_result.has_stability()) { |
| hypothesis->confidence = ws_result.stability(); |
| } |
| DCHECK(ws_alternative.has_transcript()); |
| // TODO(hans): Perhaps the transcript should be required in the proto? |
| if (ws_alternative.has_transcript()) { |
| hypothesis->utterance = base::UTF8ToUTF16(ws_alternative.transcript()); |
| } |
| |
| result->hypotheses.push_back(std::move(hypothesis)); |
| } |
| } |
| if (!results.empty()) { |
| delegate_->OnSpeechRecognitionEngineResults(results); |
| } |
| |
| return state_; |
| } |
| |
| NetworkSpeechRecognitionEngineImpl::FSMState |
| NetworkSpeechRecognitionEngineImpl::RaiseNoMatchErrorIfGotNoResults( |
| const FSMEventArgs& event_args) { |
| if (!got_last_definitive_result_) { |
| // Provide an empty result to signal that recognition ended with no errors |
| // but also produced no further results. |
| delegate_->OnSpeechRecognitionEngineResults( |
| std::vector<media::mojom::WebSpeechRecognitionResultPtr>()); |
| } |
| return AbortSilently(event_args); |
| } |
| |
| NetworkSpeechRecognitionEngineImpl::FSMState |
| NetworkSpeechRecognitionEngineImpl::CloseUpstreamAndWaitForResults( |
| const FSMEventArgs&) { |
| DCHECK(upstream_loader_.get()); |
| DCHECK(encoder_.get()); |
| |
| DVLOG(1) << "Closing upstream."; |
| |
| // The encoder requires a non-empty final buffer, so we encode a packet of |
| // silence in case the encoder has no pending data. |
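| // E.g. with the default 8000 Hz sample rate and the 100 ms chunk interval, |
| // this is 8000 * 100 / 1000 = 800 samples of silence. |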
| size_t sample_count = |
| config_.audio_sample_rate * GetDesiredAudioChunkDurationMs() / 1000; |
| scoped_refptr<AudioChunk> dummy_chunk = new AudioChunk( |
| sample_count * sizeof(int16_t), encoder_->GetBitsPerSample() / 8); |
| encoder_->Encode(*dummy_chunk.get()); |
| encoder_->Flush(); |
| scoped_refptr<AudioChunk> encoded_dummy_data = |
| encoder_->GetEncodedDataAndClear(); |
| DCHECK(!encoded_dummy_data->IsEmpty()); |
| encoder_.reset(); |
| |
| UploadAudioChunk(encoded_dummy_data->AsString(), FRAME_RECOGNITION_AUDIO, |
| true); |
| got_last_definitive_result_ = false; |
| return STATE_WAITING_DOWNSTREAM_RESULTS; |
| } |
| |
| NetworkSpeechRecognitionEngineImpl::FSMState |
| NetworkSpeechRecognitionEngineImpl::CloseDownstream(const FSMEventArgs&) { |
| DCHECK(!upstream_loader_.get()); |
| DCHECK(downstream_loader_.get()); |
| |
| DVLOG(1) << "Closing downstream."; |
| downstream_loader_.reset(); |
| return STATE_IDLE; |
| } |
| |
| NetworkSpeechRecognitionEngineImpl::FSMState |
| NetworkSpeechRecognitionEngineImpl::AbortSilently(const FSMEventArgs&) { |
| return Abort(media::mojom::SpeechRecognitionErrorCode::kNone); |
| } |
| |
| NetworkSpeechRecognitionEngineImpl::FSMState |
| NetworkSpeechRecognitionEngineImpl::AbortWithError(const FSMEventArgs&) { |
| return Abort(media::mojom::SpeechRecognitionErrorCode::kNetwork); |
| } |
| |
| NetworkSpeechRecognitionEngineImpl::FSMState |
| NetworkSpeechRecognitionEngineImpl::Abort( |
| media::mojom::SpeechRecognitionErrorCode error_code) { |
| DVLOG(1) << "Aborting with error " << error_code; |
| |
| if (error_code != media::mojom::SpeechRecognitionErrorCode::kNone) { |
| delegate_->OnSpeechRecognitionEngineError( |
| media::mojom::SpeechRecognitionError( |
| error_code, media::mojom::SpeechAudioErrorDetails::kNone)); |
| } |
| downstream_loader_.reset(); |
| upstream_loader_.reset(); |
| encoder_.reset(); |
| return STATE_IDLE; |
| } |
| |
| NetworkSpeechRecognitionEngineImpl::FSMState |
| NetworkSpeechRecognitionEngineImpl::DoNothing(const FSMEventArgs&) { |
| return state_; |
| } |
| |
| NetworkSpeechRecognitionEngineImpl::FSMState |
| NetworkSpeechRecognitionEngineImpl::NotFeasible( |
| const FSMEventArgs& event_args) { |
| NOTREACHED() << "Unfeasible event " << event_args.event << " in state " |
| << state_; |
| } |
| |
| // TODO(primiano): Is there any utility in the codebase that already does this? |
| std::string NetworkSpeechRecognitionEngineImpl::GenerateRequestKey() const { |
| const int64_t kKeepLowBytes = 0x00000000FFFFFFFFLL; |
| const int64_t kKeepHighBytes = 0xFFFFFFFF00000000LL; |
| |
| // Keep only the least significant 32 bits of the timestamp and fill the |
| // most significant 32 bits with random data, to reduce the probability of |
| // collisions. |
| int64_t key = (base::Time::Now().ToInternalValue() & kKeepLowBytes) | |
| (base::RandUint64() & kKeepHighBytes); |
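| // The 8-byte key is hex-encoded into a 16-character string such as |
| // "1A2B3C4D5E6F7081" (bytes are encoded in memory order). |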
| return base::HexEncode(reinterpret_cast<void*>(&key), sizeof(key)); |
| } |
| |
| void NetworkSpeechRecognitionEngineImpl::UploadAudioChunk( |
| const std::string& data, |
| FrameType type, |
| bool is_final) { |
| if (use_framed_post_data_) { |
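| // Each frame is laid out as a 4-byte big-endian payload length, a 4-byte |
| // big-endian frame type, and then the payload itself. |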
| std::string frame(data.size() + 8u, char{0}); |
| auto frame_span = base::as_writable_byte_span(frame); |
| frame_span.subspan<0u, 4u>().copy_from( |
| base::U32ToBigEndian(static_cast<uint32_t>(data.size()))); |
| frame_span.subspan<4u, 4u>().copy_from( |
| base::U32ToBigEndian(base::checked_cast<uint32_t>(type))); |
| frame.replace(8u, data.size(), data); |
| upstream_loader_->AppendChunkToUpload(frame, is_final); |
| } else { |
| upstream_loader_->AppendChunkToUpload(data, is_final); |
| } |
| } |
| |
| NetworkSpeechRecognitionEngineImpl::FSMEventArgs::FSMEventArgs( |
| FSMEvent event_value) |
| : event(event_value) {} |
| |
| NetworkSpeechRecognitionEngineImpl::FSMEventArgs::~FSMEventArgs() = default; |
| |
| } // namespace content |