// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/browser/speech/speech_recognition_engine.h"

#include <algorithm>
#include <memory>
#include <vector>

#include "base/big_endian.h"
#include "base/bind.h"
#include "base/metrics/histogram_functions.h"
#include "base/rand_util.h"
#include "base/strings/escape.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "content/browser/speech/audio_buffer.h"
#include "content/public/browser/google_streaming_api.pb.h"
#include "google_apis/google_api_keys.h"
#include "media/base/audio_timestamp_helper.h"
#include "mojo/public/c/system/types.h"
#include "mojo/public/cpp/bindings/receiver_set.h"
#include "net/base/load_flags.h"
#include "net/traffic_annotation/network_traffic_annotation.h"
#include "services/network/public/cpp/shared_url_loader_factory.h"
#include "third_party/blink/public/mojom/speech/speech_recognition_error.mojom.h"
#include "third_party/blink/public/mojom/speech/speech_recognition_result.mojom.h"

namespace content {
namespace {

const char kWebServiceBaseUrl[] =
    "https://www.google.com/speech-api/full-duplex/v1";
const char kDownstreamUrl[] = "/down?";
const char kUpstreamUrl[] = "/up?";

constexpr char kWebSpeechAudioDuration[] = "Accessibility.WebSpeech.Duration";

// Used to override |kWebServiceBaseUrl| when non-null, only set in tests.
const char* web_service_base_url_for_tests = nullptr;

// This matches the maximum maxAlternatives value supported by the server.
const uint32_t kMaxMaxAlternatives = 30;

// TODO(hans): Remove this and other logging when we don't need it anymore.
void DumpResponse(const std::string& response) {
  DVLOG(1) << "------------";
  proto::SpeechRecognitionEvent event;
  if (!event.ParseFromString(response)) {
    DVLOG(1) << "Parse failed!";
    return;
  }
  if (event.has_status())
    DVLOG(1) << "STATUS\t" << event.status();
  if (event.has_endpoint())
    DVLOG(1) << "ENDPOINT\t" << event.endpoint();
  for (int i = 0; i < event.result_size(); ++i) {
    DVLOG(1) << "RESULT #" << i << ":";
    const proto::SpeechRecognitionResult& res = event.result(i);
    if (res.has_final())
      DVLOG(1) << "  final:\t" << res.final();
    if (res.has_stability())
      DVLOG(1) << "  STABILITY:\t" << res.stability();
    for (int j = 0; j < res.alternative_size(); ++j) {
      const proto::SpeechRecognitionAlternative& alt = res.alternative(j);
      if (alt.has_confidence())
        DVLOG(1) << "    CONFIDENCE:\t" << alt.confidence();
      if (alt.has_transcript())
        DVLOG(1) << "    TRANSCRIPT:\t" << alt.transcript();
    }
  }
}

const int kDefaultConfigSampleRate = 8000;
const int kDefaultConfigBitsPerSample = 16;
const uint32_t kDefaultMaxHypotheses = 1;

}  // namespace

SpeechRecognitionEngine::Config::Config()
    : filter_profanities(false),
      continuous(true),
      interim_results(true),
      max_hypotheses(kDefaultMaxHypotheses),
      audio_sample_rate(kDefaultConfigSampleRate),
      audio_num_bits_per_sample(kDefaultConfigBitsPerSample) {}

SpeechRecognitionEngine::Config::~Config() {}

const int SpeechRecognitionEngine::kAudioPacketIntervalMs = 100;
const int SpeechRecognitionEngine::kWebserviceStatusNoError = 0;
const int SpeechRecognitionEngine::kWebserviceStatusErrorNoMatch = 5;

SpeechRecognitionEngine::SpeechRecognitionEngine(
    scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory,
    const std::string& accept_language)
    : shared_url_loader_factory_(std::move(shared_url_loader_factory)),
      accept_language_(accept_language),
      got_last_definitive_result_(false),
      is_dispatching_event_(false),
      use_framed_post_data_(false),
      state_(STATE_IDLE) {}

SpeechRecognitionEngine::~SpeechRecognitionEngine() {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
}

void SpeechRecognitionEngine::set_web_service_base_url_for_tests(
    const char* base_url_for_tests) {
  web_service_base_url_for_tests = base_url_for_tests;
}

void SpeechRecognitionEngine::SetConfig(const Config& config) {
  config_ = config;
}

void SpeechRecognitionEngine::StartRecognition() {
  upstream_audio_duration_ = base::TimeDelta();
  FSMEventArgs event_args(EVENT_START_RECOGNITION);
  DispatchEvent(event_args);
}

void SpeechRecognitionEngine::EndRecognition() {
  base::UmaHistogramLongTimes100(kWebSpeechAudioDuration,
                                 upstream_audio_duration_);

  FSMEventArgs event_args(EVENT_END_RECOGNITION);
  DispatchEvent(event_args);
}

void SpeechRecognitionEngine::TakeAudioChunk(const AudioChunk& data) {
  FSMEventArgs event_args(EVENT_AUDIO_CHUNK);
  event_args.audio_data = &data;
  DispatchEvent(event_args);
}

void SpeechRecognitionEngine::AudioChunksEnded() {
  FSMEventArgs event_args(EVENT_AUDIO_CHUNKS_ENDED);
  DispatchEvent(event_args);
}

void SpeechRecognitionEngine::OnUpstreamDataComplete(bool success,
                                                     int response_code) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);

  DVLOG(1) << "Upstream complete success: " << success
           << " response_code: " << response_code;

  if (!success) {
    FSMEventArgs event_args(EVENT_UPSTREAM_ERROR);
    DispatchEvent(event_args);
    return;
  }

  // Do nothing on clean completion of upstream request.
}

void SpeechRecognitionEngine::OnDownstreamDataReceived(
    base::StringPiece new_response_data) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);

  DVLOG(1) << "Downstream length: " << new_response_data.size();

  // The downstream response is organized in chunks, whose size is determined
  // by a 4-byte prefix, transparently handled by the ChunkedByteBuffer class.
  // Such chunks are sent by the speech recognition webservice over the HTTP
  // downstream channel using HTTP chunked transfer (unrelated to our chunks).
  // This function is called every time an HTTP chunk is received by the
  // URL fetcher. However, there is no particular correspondence between our
  // protocol chunks and HTTP chunks: a single HTTP chunk can contain a
  // portion of one chunk, or even several chunks together.
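  //
  // A minimal sketch of the framing, assuming a big-endian 4-byte length
  // prefix: a protocol chunk carrying a 42-byte serialized
  // SpeechRecognitionEvent would arrive as
  //   [0x00 0x00 0x00 0x2A][42 bytes of protobuf payload]
  // possibly split across, or coalesced with, neighboring HTTP chunks.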
  chunked_byte_buffer_.Append(new_response_data);

  // A single HTTP chunk can contain more than one data chunk, thus the while.
  while (chunked_byte_buffer_.HasChunks()) {
    FSMEventArgs event_args(EVENT_DOWNSTREAM_RESPONSE);
    event_args.response = chunked_byte_buffer_.PopChunk();
    DCHECK(event_args.response.get());
    DumpResponse(std::string(event_args.response->begin(),
                             event_args.response->end()));
    DispatchEvent(event_args);
  }
}

void SpeechRecognitionEngine::OnDownstreamDataComplete(bool success,
                                                       int response_code) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);

  DVLOG(1) << "Downstream complete success: " << success
           << " response_code: " << response_code;

  if (!success) {
    FSMEventArgs event_args(EVENT_DOWNSTREAM_ERROR);
    DispatchEvent(event_args);
    return;
  }

  FSMEventArgs event_args(EVENT_DOWNSTREAM_CLOSED);
  DispatchEvent(event_args);
}

bool SpeechRecognitionEngine::IsRecognitionPending() const {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  return state_ != STATE_IDLE;
}

int SpeechRecognitionEngine::GetDesiredAudioChunkDurationMs() const {
  return kAudioPacketIntervalMs;
}

// -----------------------  Core FSM implementation ---------------------------

void SpeechRecognitionEngine::DispatchEvent(const FSMEventArgs& event_args) {
  DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
  DCHECK_LE(event_args.event, EVENT_MAX_VALUE);
  DCHECK_LE(state_, STATE_MAX_VALUE);

  // Event dispatching must be sequential, otherwise it will break all the
  // rules and the assumptions of the finite state automata model.
  DCHECK(!is_dispatching_event_);
  is_dispatching_event_ = true;

  state_ = ExecuteTransitionAndGetNextState(event_args);

  is_dispatching_event_ = false;
}

SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::ExecuteTransitionAndGetNextState(
    const FSMEventArgs& event_args) {
  const FSMEvent event = event_args.event;
  switch (state_) {
    case STATE_IDLE:
      switch (event) {
        case EVENT_START_RECOGNITION:
          return ConnectBothStreams(event_args);
        case EVENT_END_RECOGNITION:
        // Note AUDIO_CHUNK and AUDIO_CHUNKS_ENDED events can remain enqueued
        // in case of abort, so we just silently drop them here.
        case EVENT_AUDIO_CHUNK:
        case EVENT_AUDIO_CHUNKS_ENDED:
        // DOWNSTREAM_CLOSED can be received if we end up here due to an error.
        case EVENT_DOWNSTREAM_CLOSED:
          return DoNothing(event_args);
        case EVENT_UPSTREAM_ERROR:
        case EVENT_DOWNSTREAM_ERROR:
        case EVENT_DOWNSTREAM_RESPONSE:
          return NotFeasible(event_args);
      }
      break;
    case STATE_BOTH_STREAMS_CONNECTED:
      switch (event) {
        case EVENT_AUDIO_CHUNK:
          return TransmitAudioUpstream(event_args);
        case EVENT_DOWNSTREAM_RESPONSE:
          return ProcessDownstreamResponse(event_args);
        case EVENT_AUDIO_CHUNKS_ENDED:
          return CloseUpstreamAndWaitForResults(event_args);
        case EVENT_END_RECOGNITION:
          return AbortSilently(event_args);
        case EVENT_UPSTREAM_ERROR:
        case EVENT_DOWNSTREAM_ERROR:
        case EVENT_DOWNSTREAM_CLOSED:
          return AbortWithError(event_args);
        case EVENT_START_RECOGNITION:
          return NotFeasible(event_args);
      }
      break;
    case STATE_WAITING_DOWNSTREAM_RESULTS:
      switch (event) {
        case EVENT_DOWNSTREAM_RESPONSE:
          return ProcessDownstreamResponse(event_args);
        case EVENT_DOWNSTREAM_CLOSED:
          return RaiseNoMatchErrorIfGotNoResults(event_args);
        case EVENT_END_RECOGNITION:
          return AbortSilently(event_args);
        case EVENT_UPSTREAM_ERROR:
        case EVENT_DOWNSTREAM_ERROR:
          return AbortWithError(event_args);
        case EVENT_START_RECOGNITION:
        case EVENT_AUDIO_CHUNK:
        case EVENT_AUDIO_CHUNKS_ENDED:
          return NotFeasible(event_args);
      }
      break;
  }
  return NotFeasible(event_args);
}

// ----------- Contract for all the FSM evolution functions below -------------
//  - Are guaranteed to be executed on the same thread (IO, except for tests);
//  - Are guaranteed not to be reentrant (themselves and each other);
//  - event_args members are guaranteed to be stable during the call;

SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::ConnectBothStreams(const FSMEventArgs&) {
  DCHECK(!upstream_loader_.get());
  DCHECK(!downstream_loader_.get());

  encoder_ = std::make_unique<AudioEncoder>(config_.audio_sample_rate,
                                            config_.audio_num_bits_per_sample);
  DCHECK(encoder_.get());
  const std::string request_key = GenerateRequestKey();

  // Only use the framed post data format when a preamble needs to be logged.
  use_framed_post_data_ = (config_.preamble &&
                           !config_.preamble->sample_data.empty() &&
                           !config_.auth_token.empty() &&
                           !config_.auth_scope.empty());
  if (use_framed_post_data_) {
    preamble_encoder_ = std::make_unique<AudioEncoder>(
        config_.preamble->sample_rate, config_.preamble->sample_depth * 8);
  }

  const char* web_service_base_url = !web_service_base_url_for_tests
                                         ? kWebServiceBaseUrl
                                         : web_service_base_url_for_tests;

  // Setup downstream fetcher.
  std::vector<std::string> downstream_args;
  downstream_args.push_back(
      "key=" + base::EscapeQueryParamValue(google_apis::GetAPIKey(), true));
  downstream_args.push_back("pair=" + request_key);
  downstream_args.push_back("output=pb");
  GURL downstream_url(std::string(web_service_base_url) +
                      std::string(kDownstreamUrl) +
                      base::JoinString(downstream_args, "&"));
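  // The resulting downstream URL has the shape (key and pair values are
  // hypothetical placeholders):
  //   https://www.google.com/speech-api/full-duplex/v1/down?
  //       key=<api-key>&pair=<request-key>&output=pb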

  net::NetworkTrafficAnnotationTag downstream_traffic_annotation =
      net::DefineNetworkTrafficAnnotation("speech_recognition_downstream", R"(
        semantics {
          sender: "Speech Recognition"
          description:
            "Chrome provides translation from speech audio recorded with a "
            "microphone to text, by using the Google speech recognition web "
            "service. Audio is sent to Google's servers (upstream) and text is "
            "returned (downstream). This network request (downstream) sends an "
            "id for getting the text response. Then the (upstream) request "
            "sends the audio data along with the id. When the server has "
            "finished processing the audio data and produced a text response, "
            "it replies to this request."
          trigger:
            "The user chooses to start the recognition by clicking the "
            "microphone icon in the Google search field."
          data: "A unique random id for this speech recognition request."
          destination: GOOGLE_OWNED_SERVICE
        }
        policy {
          cookies_allowed: NO
          setting:
            "The user must allow the browser to access the microphone in a "
            "permission prompt. This is set per site (hostname pattern). In "
            "the content settings menu, microphone access can be turned off "
            "for all sites and site specific settings can be changed."
          chrome_policy {
            AudioCaptureAllowed {
              policy_options {mode: MANDATORY}
              AudioCaptureAllowed: false
            }
          }
          chrome_policy {
            AudioCaptureAllowedUrls {
              policy_options {mode: MANDATORY}
              AudioCaptureAllowedUrls: {}
            }
          }
        })");
  auto downstream_request = std::make_unique<network::ResourceRequest>();
  downstream_request->credentials_mode =
      network::mojom::CredentialsMode::kOmit;
  downstream_request->url = downstream_url;
  downstream_loader_ = std::make_unique<speech::DownstreamLoader>(
      std::move(downstream_request), downstream_traffic_annotation,
      shared_url_loader_factory_.get(), this);

  // Setup upstream fetcher.
  // TODO(hans): Support for user-selected grammars.
  std::vector<std::string> upstream_args;
  upstream_args.push_back(
      "key=" + base::EscapeQueryParamValue(google_apis::GetAPIKey(), true));
  upstream_args.push_back("pair=" + request_key);
  upstream_args.push_back("output=pb");
  upstream_args.push_back(
      "lang=" + base::EscapeQueryParamValue(GetAcceptedLanguages(), true));
  upstream_args.push_back(config_.filter_profanities ? "pFilter=2"
                                                     : "pFilter=0");
  if (config_.max_hypotheses > 0U) {
    uint32_t max_alternatives =
        std::min(kMaxMaxAlternatives, config_.max_hypotheses);
    upstream_args.push_back("maxAlternatives=" +
                            base::NumberToString(max_alternatives));
  }
  upstream_args.push_back("app=chromium");
  for (const blink::mojom::SpeechRecognitionGrammar& grammar :
       config_.grammars) {
    std::string grammar_value(base::NumberToString(grammar.weight) + ":" +
                              grammar.url.spec());
    upstream_args.push_back("grammar=" +
                            base::EscapeQueryParamValue(grammar_value, true));
  }
  if (config_.continuous)
    upstream_args.push_back("continuous");
  else
    upstream_args.push_back("endpoint=1");
  if (config_.interim_results)
    upstream_args.push_back("interim");
  if (!config_.auth_token.empty() && !config_.auth_scope.empty()) {
    upstream_args.push_back(
        "authScope=" + base::EscapeQueryParamValue(config_.auth_scope, true));
    upstream_args.push_back(
        "authToken=" + base::EscapeQueryParamValue(config_.auth_token, true));
  }
  if (use_framed_post_data_) {
    std::string audio_format;
    if (preamble_encoder_)
      audio_format = preamble_encoder_->GetMimeType() + ",";
    audio_format += encoder_->GetMimeType();
    upstream_args.push_back("audioFormat=" +
                            base::EscapeQueryParamValue(audio_format, true));
  }

  GURL upstream_url(std::string(web_service_base_url) +
                    std::string(kUpstreamUrl) +
                    base::JoinString(upstream_args, "&"));
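  // With the default Config, the resulting upstream URL has the shape
  // (key and pair values are hypothetical placeholders):
  //   https://www.google.com/speech-api/full-duplex/v1/up?
  //       key=<api-key>&pair=<request-key>&output=pb&lang=en-US&pFilter=0
  //       &maxAlternatives=1&app=chromium&continuous&interim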

  net::NetworkTrafficAnnotationTag upstream_traffic_annotation =
      net::DefineNetworkTrafficAnnotation("speech_recognition_upstream", R"(
        semantics {
          sender: "Speech Recognition"
          description:
            "Chrome provides translation from speech audio recorded with a "
            "microphone to text, by using the Google speech recognition web "
            "service. Audio is sent to Google's servers (upstream) and text is "
            "returned (downstream)."
          trigger:
            "The user chooses to start the recognition by clicking the "
            "microphone icon in the Google search field."
          data:
            "Audio recorded with the microphone, and the unique id of the "
            "downstream speech recognition request."
          destination: GOOGLE_OWNED_SERVICE
        }
        policy {
          cookies_allowed: NO
          setting:
            "The user must allow the browser to access the microphone in a "
            "permission prompt. This is set per site (hostname pattern). In "
            "the content settings menu, microphone access can be turned off "
            "for all sites and site specific settings can be changed."
          chrome_policy {
            AudioCaptureAllowed {
              policy_options {mode: MANDATORY}
              AudioCaptureAllowed: false
            }
          }
          chrome_policy {
            AudioCaptureAllowedUrls {
              policy_options {mode: MANDATORY}
              AudioCaptureAllowedUrls: {}
            }
          }
        })");

  auto upstream_request = std::make_unique<network::ResourceRequest>();
  upstream_request->url = upstream_url;
  upstream_request->method = "POST";
  upstream_request->referrer = GURL(config_.origin_url);
  upstream_request->credentials_mode = network::mojom::CredentialsMode::kOmit;
  if (use_framed_post_data_) {
    upstream_request->headers.SetHeader(net::HttpRequestHeaders::kContentType,
                                        "application/octet-stream");
  } else {
    upstream_request->headers.SetHeader(net::HttpRequestHeaders::kContentType,
                                        encoder_->GetMimeType());
  }

  upstream_loader_ = std::make_unique<speech::UpstreamLoader>(
      std::move(upstream_request), upstream_traffic_annotation,
      shared_url_loader_factory_.get(), this);

  if (preamble_encoder_) {
    // Encode and send the preamble right away.
    scoped_refptr<AudioChunk> chunk = new AudioChunk(
        reinterpret_cast<const uint8_t*>(config_.preamble->sample_data.data()),
        config_.preamble->sample_data.size(), config_.preamble->sample_depth);
    preamble_encoder_->Encode(*chunk);
    preamble_encoder_->Flush();
    scoped_refptr<AudioChunk> encoded_data(
        preamble_encoder_->GetEncodedDataAndClear());
    UploadAudioChunk(encoded_data->AsString(), FRAME_PREAMBLE_AUDIO, false);
  }
  return STATE_BOTH_STREAMS_CONNECTED;
}

SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::TransmitAudioUpstream(
    const FSMEventArgs& event_args) {
  DCHECK(upstream_loader_.get());
  DCHECK(event_args.audio_data.get());
  const AudioChunk& audio = *(event_args.audio_data.get());

  base::TimeDelta duration = media::AudioTimestampHelper::FramesToTime(
      audio.NumSamples(), config_.audio_sample_rate);
  upstream_audio_duration_ += duration;
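  // For example, a 100 ms packet at the default 8000 Hz sample rate carries
  // 800 frames, so FramesToTime(800, 8000) adds 100 ms.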

  DCHECK_EQ(audio.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
  encoder_->Encode(audio);
  scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
  UploadAudioChunk(encoded_data->AsString(), FRAME_RECOGNITION_AUDIO, false);
  return state_;
}

SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::ProcessDownstreamResponse(
    const FSMEventArgs& event_args) {
  DCHECK(event_args.response.get());

  proto::SpeechRecognitionEvent ws_event;
  if (!ws_event.ParseFromString(std::string(event_args.response->begin(),
                                            event_args.response->end())))
    return AbortWithError(event_args);

  if (ws_event.has_status()) {
    switch (ws_event.status()) {
      case proto::SpeechRecognitionEvent::STATUS_SUCCESS:
        break;
      case proto::SpeechRecognitionEvent::STATUS_NO_SPEECH:
        return Abort(blink::mojom::SpeechRecognitionErrorCode::kNoSpeech);
      case proto::SpeechRecognitionEvent::STATUS_ABORTED:
        return Abort(blink::mojom::SpeechRecognitionErrorCode::kAborted);
      case proto::SpeechRecognitionEvent::STATUS_AUDIO_CAPTURE:
        return Abort(blink::mojom::SpeechRecognitionErrorCode::kAudioCapture);
      case proto::SpeechRecognitionEvent::STATUS_NETWORK:
        return Abort(blink::mojom::SpeechRecognitionErrorCode::kNetwork);
      case proto::SpeechRecognitionEvent::STATUS_NOT_ALLOWED:
        return Abort(blink::mojom::SpeechRecognitionErrorCode::kNotAllowed);
      case proto::SpeechRecognitionEvent::STATUS_SERVICE_NOT_ALLOWED:
        return Abort(
            blink::mojom::SpeechRecognitionErrorCode::kServiceNotAllowed);
      case proto::SpeechRecognitionEvent::STATUS_BAD_GRAMMAR:
        return Abort(blink::mojom::SpeechRecognitionErrorCode::kBadGrammar);
      case proto::SpeechRecognitionEvent::STATUS_LANGUAGE_NOT_SUPPORTED:
        return Abort(
            blink::mojom::SpeechRecognitionErrorCode::kLanguageNotSupported);
    }
  }

  if (!config_.continuous && ws_event.has_endpoint() &&
      ws_event.endpoint() == proto::SpeechRecognitionEvent::END_OF_UTTERANCE) {
    delegate_->OnSpeechRecognitionEngineEndOfUtterance();
  }

  std::vector<blink::mojom::SpeechRecognitionResultPtr> results;
  for (int i = 0; i < ws_event.result_size(); ++i) {
    const proto::SpeechRecognitionResult& ws_result = ws_event.result(i);
    results.push_back(blink::mojom::SpeechRecognitionResult::New());
    blink::mojom::SpeechRecognitionResultPtr& result = results.back();
    result->is_provisional = !(ws_result.has_final() && ws_result.final());

    if (!result->is_provisional)
      got_last_definitive_result_ = true;

    for (int j = 0; j < ws_result.alternative_size(); ++j) {
      const proto::SpeechRecognitionAlternative& ws_alternative =
          ws_result.alternative(j);
      blink::mojom::SpeechRecognitionHypothesisPtr hypothesis =
          blink::mojom::SpeechRecognitionHypothesis::New();
      if (ws_alternative.has_confidence())
        hypothesis->confidence = ws_alternative.confidence();
      else if (ws_result.has_stability())
        hypothesis->confidence = ws_result.stability();
      DCHECK(ws_alternative.has_transcript());
      // TODO(hans): Perhaps the transcript should be required in the proto?
      if (ws_alternative.has_transcript())
        hypothesis->utterance = base::UTF8ToUTF16(ws_alternative.transcript());

      result->hypotheses.push_back(std::move(hypothesis));
    }
  }
  if (results.size()) {
    delegate_->OnSpeechRecognitionEngineResults(results);
  }

  return state_;
}

SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::RaiseNoMatchErrorIfGotNoResults(
    const FSMEventArgs& event_args) {
  if (!got_last_definitive_result_) {
    // Provide an empty result to notify that recognition ended with no
    // errors, but also with no further results.
    delegate_->OnSpeechRecognitionEngineResults(
        std::vector<blink::mojom::SpeechRecognitionResultPtr>());
  }
  return AbortSilently(event_args);
}

SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::CloseUpstreamAndWaitForResults(const FSMEventArgs&) {
  DCHECK(upstream_loader_.get());
  DCHECK(encoder_.get());

  DVLOG(1) << "Closing upstream.";

  // The encoder requires a non-empty final buffer, so we encode a packet of
  // silence in case the encoder has no data already.
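  // With the default config this is 8000 Hz * 100 ms / 1000 = 800 samples of
  // int16_t silence.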
  size_t sample_count =
      config_.audio_sample_rate * kAudioPacketIntervalMs / 1000;
  scoped_refptr<AudioChunk> dummy_chunk = new AudioChunk(
      sample_count * sizeof(int16_t), encoder_->GetBitsPerSample() / 8);
  encoder_->Encode(*dummy_chunk.get());
  encoder_->Flush();
  scoped_refptr<AudioChunk> encoded_dummy_data =
      encoder_->GetEncodedDataAndClear();
  DCHECK(!encoded_dummy_data->IsEmpty());
  encoder_.reset();

  UploadAudioChunk(encoded_dummy_data->AsString(), FRAME_RECOGNITION_AUDIO,
                   true);
  got_last_definitive_result_ = false;
  return STATE_WAITING_DOWNSTREAM_RESULTS;
}

SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::CloseDownstream(const FSMEventArgs&) {
  DCHECK(!upstream_loader_.get());
  DCHECK(downstream_loader_.get());

  DVLOG(1) << "Closing downstream.";
  downstream_loader_.reset();
  return STATE_IDLE;
}

SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::AbortSilently(const FSMEventArgs&) {
  return Abort(blink::mojom::SpeechRecognitionErrorCode::kNone);
}

SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::AbortWithError(const FSMEventArgs&) {
  return Abort(blink::mojom::SpeechRecognitionErrorCode::kNetwork);
}

SpeechRecognitionEngine::FSMState SpeechRecognitionEngine::Abort(
    blink::mojom::SpeechRecognitionErrorCode error_code) {
  DVLOG(1) << "Aborting with error " << error_code;

  if (error_code != blink::mojom::SpeechRecognitionErrorCode::kNone) {
    delegate_->OnSpeechRecognitionEngineError(
        blink::mojom::SpeechRecognitionError(
            error_code, blink::mojom::SpeechAudioErrorDetails::kNone));
  }
  downstream_loader_.reset();
  upstream_loader_.reset();
  encoder_.reset();
  return STATE_IDLE;
}

SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::DoNothing(const FSMEventArgs&) {
  return state_;
}

SpeechRecognitionEngine::FSMState
SpeechRecognitionEngine::NotFeasible(const FSMEventArgs& event_args) {
  NOTREACHED() << "Unfeasible event " << event_args.event << " in state "
               << state_;
  return state_;
}

std::string SpeechRecognitionEngine::GetAcceptedLanguages() const {
  std::string langs = config_.language;
  if (langs.empty() && !accept_language_.empty()) {
    // If no language is provided, use the first language from the accepted
    // language list; if that list is empty, fall back to "en-US" below.
    // Example of the contents of this list: "es,en-GB;q=0.8", "" (the first
    // case yields "es").
    size_t separator = accept_language_.find_first_of(",;");
    if (separator != std::string::npos)
      langs = accept_language_.substr(0, separator);
  }
  if (langs.empty())
    langs = "en-US";
  return langs;
}

// TODO(primiano): Is there any utility in the codebase that already does this?
std::string SpeechRecognitionEngine::GenerateRequestKey() const {
  const int64_t kKeepLowBytes = 0x00000000FFFFFFFFLL;
  const int64_t kKeepHighBytes = 0xFFFFFFFF00000000LL;

  // Just keep the least significant bits of the timestamp, in order to reduce
  // the probability of collisions.
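  // Illustrative layout of |key|: bits 63..32 come from base::RandUint64(),
  // bits 31..0 from the timestamp.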
  int64_t key = (base::Time::Now().ToInternalValue() & kKeepLowBytes) |
                (base::RandUint64() & kKeepHighBytes);
  return base::HexEncode(reinterpret_cast<void*>(&key), sizeof(key));
}

void SpeechRecognitionEngine::UploadAudioChunk(const std::string& data,
                                               FrameType type,
                                               bool is_final) {
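  // In framed mode, each chunk is wrapped in an 8-byte header before upload:
  //   [4-byte big-endian payload size][4-byte big-endian frame type][payload]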
  if (use_framed_post_data_) {
    std::string frame(data.size() + 8, 0);
    base::WriteBigEndian(&frame[0], static_cast<uint32_t>(data.size()));
    base::WriteBigEndian(&frame[4], static_cast<uint32_t>(type));
    frame.replace(8, data.size(), data);
    upstream_loader_->AppendChunkToUpload(frame, is_final);
  } else {
    upstream_loader_->AppendChunkToUpload(data, is_final);
  }
}

SpeechRecognitionEngine::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
    : event(event_value) {}

SpeechRecognitionEngine::FSMEventArgs::~FSMEventArgs() {}

}  // namespace content