| // Copyright 2020 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "chrome/services/speech/cloud_speech_recognition_client.h" |
| |
| #include "base/memory/ptr_util.h" |
| #include "base/metrics/histogram_functions.h" |
| #include "base/strings/string_number_conversions.h" |
| #include "base/strings/string_util.h" |
| #include "base/strings/stringprintf.h" |
| #include "content/public/browser/google_streaming_api.pb.h" |
| #include "google_apis/google_api_keys.h" |
| #include "mojo/public/cpp/bindings/receiver_set.h" |
| #include "net/base/escape.h" |
| #include "net/traffic_annotation/network_traffic_annotation.h" |
| #include "services/network/public/cpp/shared_url_loader_factory.h" |
| #include "services/network/public/cpp/simple_url_loader.h" |
| #include "services/network/public/cpp/simple_url_loader_stream_consumer.h" |
| #include "services/network/public/mojom/chunked_data_pipe_getter.mojom.h" |
| #include "url/gurl.h" |
| |
| namespace speech { |
| |
| // The maximum duration a stream can be open for. The Open Speech API supports 5 |
| // minutes of continuous recognition. |
| constexpr base::TimeDelta kMaximumStreamDuration = |
| base::TimeDelta::FromSeconds(295); |
| |
| // The Open Speech API will not return any recognition events if 30 seconds have |
| // elapsed since the last audio upload. |
| constexpr base::TimeDelta kMaximumPauseDuration = |
| base::TimeDelta::FromSeconds(28); |
| |
| constexpr char kWebServiceBaseUrl[] = |
| "https://www.google.com/speech-api/full-duplex/v1"; |
| constexpr char kDownstreamUrl[] = "/down"; |
| constexpr char kUpstreamUrl[] = "/up"; |
| |
| CloudSpeechRecognitionClient::CloudSpeechRecognitionClient( |
| OnRecognitionEventCallback callback, |
| base::WeakPtr<SpeechRecognitionServiceImpl> speech_recognition_service_impl) |
| : recognition_event_callback_(callback), |
| speech_recognition_service_impl_( |
| std::move(speech_recognition_service_impl)) { |
| ResetUrlLoaderFactory(); |
| } |
| |
| CloudSpeechRecognitionClient::~CloudSpeechRecognitionClient() { |
| base::UmaHistogramBoolean("Accessibility.LiveCaption.AudioPropertyChanged", |
| audio_property_changed_midstream_); |
| } |
| |
| bool CloudSpeechRecognitionClient::DidAudioPropertyChange(int sample_rate, |
| int channel_count) { |
| bool property_changed = |
| sample_rate != sample_rate_ || channel_count != channel_count_; |
| audio_property_changed_midstream_ |= property_changed; |
| return property_changed; |
| } |
| |
| void CloudSpeechRecognitionClient::Initialize(const CloudSpeechConfig& config) { |
| channel_count_ = config.channel_count; |
| sample_rate_ = config.sample_rate; |
| language_code_ = config.language_code; |
| is_initialized_ = true; |
| Reset(); |
| } |
| |
| void CloudSpeechRecognitionClient::OnDownstreamDataReceived( |
| base::StringPiece new_response_data) { |
| // The downstream response is organized in chunks, whose size is determined |
| // by a 4 bytes prefix, transparently handled by the ChunkedByteBuffer class. |
| // Such chunks are sent by the speech recognition webservice over the HTTP |
| // downstream channel using HTTP chunked transfer (unrelated to our chunks). |
| // This function is called every time an HTTP chunk is received by the |
| // url fetcher. However there isn't any particular matching between our |
| // protocol chunks and HTTP chunks, in the sense that a single HTTP chunk can |
| // contain a portion of one chunk or even more chunks together. |
| chunked_byte_buffer_.Append(new_response_data); |
| std::string result; |
| bool is_final = false; |
| |
| // A single HTTP chunk can contain more than one data chunk, thus the while. |
| while (chunked_byte_buffer_.HasChunks()) { |
| auto chunk = chunked_byte_buffer_.PopChunk(); |
| content::proto::SpeechRecognitionEvent event; |
| if (!event.ParseFromArray(chunk->data(), chunk->size() * sizeof(uint8_t))) { |
| DLOG(ERROR) << "Parsing of the recognition response failed."; |
| return; |
| } |
| |
| // A speech recognition event can have multiple recognition results in |
| // descending order of stability. Concatenate all of the recognition result |
| // parts to build the full transcription. |
| for (const auto& recognition_result : event.result()) { |
| is_final |= recognition_result.final(); |
| if (recognition_result.has_stability()) { |
| for (const auto& alternative : recognition_result.alternative()) { |
| if (alternative.has_transcript()) |
| result += alternative.transcript(); |
| } |
| } |
| } |
| |
| // Remove the leading whitespace that the Open Speech API automatically |
| // prepends because the captioning bubble will handle the formatting. |
| if (!result.empty() && result[0] == ' ') |
| result.erase(0, 1); |
| |
| // The Open Speech API returns an empty recognition event with |final| |
| // marked as true to indicate that the previous result returned was a final |
| // recognition result. |
| if (is_final && result.empty()) |
| result = previous_result_; |
| |
| previous_result_ = result; |
| recognition_event_callback().Run( |
| media::SpeechRecognitionResult(result, is_final)); |
| } |
| } |
| |
// Starts a fresh recognition stream by replacing |downstream_loader_| and
// |upstream_loader_| with newly created loaders.
//
// Must only be called after Initialize() (enforced by the DCHECK). Does
// nothing when |url_loader_factory_| is not bound, in which case the loaders
// are left untouched. Otherwise both |last_reset_| and |last_upload_| are
// restarted at the current time, a new random request key is generated, and
// the downstream and upstream requests are built with that same key in their
// `pair` query parameter. Both requests omit credentials and share a single
// traffic annotation.
//
// NOTE(review): |chunked_byte_buffer_| and |previous_result_| are not cleared
// here. Confirm that data left over from the previous stream cannot be mixed
// into the responses of the new one.
| void CloudSpeechRecognitionClient::Reset() { |
| DCHECK(is_initialized_); |
| // Return if the URL loader factory has not been set. |
| if (!url_loader_factory_) |
| return; |
| |
// Restart both timers that AddAudio() checks against the maximum stream and
// pause durations.
| last_reset_ = base::TimeTicks::Now(); |
| last_upload_ = base::TimeTicks::Now(); |
// One key per stream; it is sent with both the downstream and the upstream
// request below.
| const std::string request_key = base::UnguessableToken::Create().ToString(); |
| |
| // Setup downstream fetcher. |
| GURL downstream_url(base::StringPrintf( |
| "%s%s?key=%s&pair=%s&output=pb", kWebServiceBaseUrl, kDownstreamUrl, |
| net::EscapeQueryParamValue(google_apis::GetAPIKey(), true).c_str(), |
| net::EscapeQueryParamValue(request_key, true).c_str())); |
| |
// The annotation below is passed to both the downstream and the upstream
// loader.
| net::NetworkTrafficAnnotationTag traffic_annotation = |
| net::DefineNetworkTrafficAnnotation("cloud_speech_recognition", |
| R"( |
| semantics { |
| sender: "Speech Recognition" |
| description: |
| "Chrome provides transcription from output audio by using the " |
| "Google speech recognition web service. Audio is sent to Google's " |
| "servers (upstream) and text is returned (downstream). This " |
| "network request (downstream) sends an id for getting the text " |
| "response. Then the (upstream) request sends the audio data along " |
| "with the id. When the server has finished processing the audio " |
| "data and produced a text response, it replies to this request." |
| trigger: |
| "Generally triggered in direct response to a user playing a " |
| "media with audio." |
| data: "A unique random id for this speech recognition request and " |
| "the audio output stream." |
| destination: GOOGLE_OWNED_SERVICE |
| } |
| policy { |
| cookies_allowed: NO |
| setting: |
| "The Live Caption feature can be enabled/disabled in the Chrome " |
| "accessibility settings menu. The feature is disabled by default." |
| chrome_policy { |
| AudioCaptureAllowed { |
| policy_options {mode: MANDATORY} |
| AudioCaptureAllowed: false |
| } |
| } |
| chrome_policy { |
| AudioCaptureAllowedUrls { |
| policy_options {mode: MANDATORY} |
| AudioCaptureAllowedUrls: {} |
| } |
| } |
| })"); |
// The downstream request keeps the default method and carries no credentials.
// Assigning the new loader releases the previously held one.
| auto downstream_request = std::make_unique<network::ResourceRequest>(); |
| downstream_request->credentials_mode = network::mojom::CredentialsMode::kOmit; |
| downstream_request->url = downstream_url; |
| downstream_loader_ = std::make_unique<speech::DownstreamLoader>( |
| std::move(downstream_request), traffic_annotation, |
| url_loader_factory_.get(), this); |
| |
| // Setup upstream fetcher. |
// In addition to the key and pair id, the upstream URL carries the configured
// language code in its `lang` parameter.
| GURL upstream_url(base::StringPrintf( |
| "%s%s?key=%s&pair=%s&output=pb&lang=%s&pFilter=0&maxAlternatives=1&app=" |
| "chrome&continuous&interim", |
| kWebServiceBaseUrl, kUpstreamUrl, |
| net::EscapeQueryParamValue(google_apis::GetAPIKey(), true).c_str(), |
| net::EscapeQueryParamValue(request_key, true).c_str(), |
| net::EscapeQueryParamValue(language_code_, true).c_str())); |
| |
// The upstream request is a credential-less POST whose Content-Type header
// is "audio/l16; rate=" followed by the configured sample rate.
| auto upstream_request = std::make_unique<network::ResourceRequest>(); |
| upstream_request->url = upstream_url; |
| upstream_request->method = "POST"; |
| upstream_request->credentials_mode = network::mojom::CredentialsMode::kOmit; |
| upstream_request->headers.SetHeader( |
| net::HttpRequestHeaders::kContentType, |
| "audio/l16; rate=" + base::NumberToString(sample_rate_)); |
| upstream_loader_ = std::make_unique<speech::UpstreamLoader>( |
| std::move(upstream_request), traffic_annotation, |
| url_loader_factory_.get(), this); |
| } |
| |
| void CloudSpeechRecognitionClient::AddAudio(base::span<const char> chunk) { |
| DCHECK(is_initialized_); |
| base::TimeTicks now = base::TimeTicks::Now(); |
| if (now - last_reset_ > kMaximumStreamDuration || |
| now - last_upload_ > kMaximumPauseDuration) { |
| Reset(); |
| } |
| |
| last_upload_ = now; |
| upstream_loader_->AppendChunkToUpload(std::string(chunk.data(), chunk.size()), |
| false); |
| } |
| |
| void CloudSpeechRecognitionClient::SetUrlLoaderFactoryForTesting( |
| mojo::PendingRemote<network::mojom::URLLoaderFactory> factory) { |
| url_loader_factory_ = |
| mojo::Remote<network::mojom::URLLoaderFactory>(std::move(factory)); |
| } |
| |
| void CloudSpeechRecognitionClient::ResetUrlLoaderFactory() { |
| downstream_loader_.reset(); |
| upstream_loader_.reset(); |
| url_loader_factory_.reset(); |
| |
| if (!speech_recognition_service_impl_) |
| return; |
| |
| url_loader_factory_ = mojo::Remote<network::mojom::URLLoaderFactory>( |
| speech_recognition_service_impl_->GetUrlLoaderFactory()); |
| |
| url_loader_factory_.set_disconnect_handler( |
| base::BindOnce(&CloudSpeechRecognitionClient::ResetUrlLoaderFactory, |
| base::Unretained(this))); |
| |
| if (!is_initialized_) |
| return; |
| |
| Reset(); |
| } |
| |
| } // namespace speech |