// blob: 53c9368b9eb72178f12acfc1c74a11ebf75fe9b1 [file] [log] [blame]
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/renderer/media/chrome_speech_recognition_client.h"
#include <utility>
#include "base/metrics/field_trial_params.h"
#include "base/metrics/histogram_functions.h"
#include "content/public/renderer/render_frame.h"
#include "media/base/audio_bus.h"
#include "media/base/audio_parameters.h"
#include "media/base/bind_to_current_loop.h"
#include "media/base/channel_mixer.h"
#include "media/base/media_switches.h"
#include "media/mojo/mojom/media_types.mojom.h"
#include "third_party/blink/public/common/browser_interface_broker_proxy.h"
#include "third_party/blink/public/platform/web_string.h"
#include "third_party/blink/public/web/web_frame.h"
#include "third_party/blink/public/web/web_local_frame.h"
// Returns the list of blocked URLs configured through the Finch experiment
// parameter "blocked_websites" of the media::kLiveCaption feature. The
// parameter value is split on commas and each entry has surrounding whitespace
// trimmed. These websites provide captions by default and thus do not require
// the live caption feature.
std::vector<std::string> GetBlockedURLs() {
  const std::string blocked_websites_param =
      base::GetFieldTrialParamValueByFeature(media::kLiveCaption,
                                             "blocked_websites");
  return base::SplitString(blocked_websites_param, ",", base::TRIM_WHITESPACE,
                           base::SPLIT_WANT_ALL);
}
// Sets up the mojo connections used by this client:
//  - binds |speech_recognition_context_| and asks it to bind a recognizer,
//    with OnRecognizerBound() as the reply callback;
//  - requests the context and the caption host from |render_frame|'s browser
//    interface broker;
//  - records (and reports to UMA) whether the frame's security origin is in
//    the blocked list;
//  - installs the audio-forwarding callback and the disconnect handlers.
//
// |callback| is stored as the on-ready callback.
ChromeSpeechRecognitionClient::ChromeSpeechRecognitionClient(
    content::RenderFrame* render_frame,
    media::SpeechRecognitionClient::OnReadyCallback callback)
    : on_ready_callback_(std::move(callback)), blocked_urls_(GetBlockedURLs()) {
  // The context remote is bound here; its receiver end is handed to the
  // browser interface broker further down.
  auto context_receiver =
      speech_recognition_context_.BindNewPipeAndPassReceiver();

  // NOTE(review): base::Unretained(this) presumably relies on |this| owning
  // the remote so the reply cannot outlive it -- confirm against the header.
  auto recognizer_receiver =
      speech_recognition_recognizer_.BindNewPipeAndPassReceiver();
  auto client_remote =
      speech_recognition_client_receiver_.BindNewPipeAndPassRemote();
  auto on_recognizer_bound =
      base::BindOnce(&ChromeSpeechRecognitionClient::OnRecognizerBound,
                     base::Unretained(this));
  speech_recognition_context_->BindRecognizer(std::move(recognizer_receiver),
                                              std::move(client_remote),
                                              std::move(on_recognizer_bound));

  auto* const interface_broker = render_frame->GetBrowserInterfaceBroker();
  interface_broker->GetInterface(std::move(context_receiver));
  interface_broker->GetInterface(caption_host_.BindNewPipeAndPassReceiver());

  // Decide once, at construction, whether this frame's origin is blocked.
  const std::string frame_origin =
      render_frame->GetWebFrame()->GetSecurityOrigin().ToString().Utf8();
  is_website_blocked_ = IsUrlBlocked(frame_origin);
  base::UmaHistogramBoolean("Accessibility.LiveCaption.WebsiteBlocked",
                            is_website_blocked_);

  // AddAudio() forwards converted audio through this callback, which is bound
  // to the current loop.
  send_audio_callback_ = media::BindToCurrentLoop(base::BindRepeating(
      &ChromeSpeechRecognitionClient::SendAudioToSpeechRecognitionService,
      weak_factory_.GetWeakPtr()));

  auto on_context_disconnected = media::BindToCurrentLoop(
      base::BindOnce(&ChromeSpeechRecognitionClient::OnRecognizerDisconnected,
                     weak_factory_.GetWeakPtr()));
  speech_recognition_context_.set_disconnect_handler(
      std::move(on_context_disconnected));

  auto on_caption_host_disconnected =
      base::BindOnce(&ChromeSpeechRecognitionClient::OnCaptionHostDisconnected,
                     base::Unretained(this));
  caption_host_.set_disconnect_handler(std::move(on_caption_host_disconnected));
}
// Reply callback for BindRecognizer(). Stores whether the recognizer supports
// multichannel audio, marks the recognizer as bound, and runs the pending
// on-ready callback if one is set.
void ChromeSpeechRecognitionClient::OnRecognizerBound(
    bool is_multichannel_supported) {
  is_multichannel_supported_ = is_multichannel_supported;
  is_recognizer_bound_ = true;

  // Nothing to notify when no callback is pending.
  if (!on_ready_callback_)
    return;

  std::move(on_ready_callback_).Run();
}
// Disconnect handler installed on |speech_recognition_context_| by the
// constructor. Marks the recognizer as unbound, which makes
// IsSpeechRecognitionAvailable() return false, and reports the error to the
// caption host.
void ChromeSpeechRecognitionClient::OnRecognizerDisconnected() {
  is_recognizer_bound_ = false;
  caption_host_->OnError();
}
// Disconnect handler installed on |caption_host_| by the constructor. Clears
// the "browser is requesting transcription" flag, which makes
// IsSpeechRecognitionAvailable() return false.
void ChromeSpeechRecognitionClient::OnCaptionHostDisconnected() {
  is_browser_requesting_transcription_ = false;
}
// Defaulted out-of-line destructor: no custom teardown is performed here;
// cleanup is left to the destructors of the data members.
ChromeSpeechRecognitionClient::~ChromeSpeechRecognitionClient() = default;
// Converts |buffer| to interleaved signed 16-bit audio data and forwards it
// through |send_audio_callback_|. |buffer| must not be null.
void ChromeSpeechRecognitionClient::AddAudio(
    scoped_refptr<media::AudioBuffer> buffer) {
  DCHECK(buffer);
  media::mojom::AudioDataS16Ptr converted_audio =
      ConvertToAudioDataS16(std::move(buffer));
  send_audio_callback_.Run(std::move(converted_audio));
}
// Converts |audio_bus| to interleaved signed 16-bit audio data, tagged with
// |sample_rate|, and forwards it through |send_audio_callback_|.
// |channel_layout| is used if the channels have to be mixed down to mono.
// |audio_bus| must not be null.
void ChromeSpeechRecognitionClient::AddAudio(
    std::unique_ptr<media::AudioBus> audio_bus,
    int sample_rate,
    media::ChannelLayout channel_layout) {
  DCHECK(audio_bus);
  media::mojom::AudioDataS16Ptr converted_audio =
      ConvertToAudioDataS16(std::move(audio_bus), sample_rate, channel_layout);
  send_audio_callback_.Run(std::move(converted_audio));
}
// Returns true only when the website is not blocked, the browser is
// requesting transcription, and the recognizer is bound.
bool ChromeSpeechRecognitionClient::IsSpeechRecognitionAvailable() {
  // TODO(evliu): Check if SODA is available.
  if (is_website_blocked_)
    return false;
  if (!is_browser_requesting_transcription_)
    return false;
  return is_recognizer_bound_;
}
// Installs |callback| as the on-ready callback, replacing any callback that
// was previously stored. The callback is set by the owner of |this| and is
// executed when speech recognition becomes available; if speech recognition
// is already available, it is run right away.
void ChromeSpeechRecognitionClient::SetOnReadyCallback(
    SpeechRecognitionClient::OnReadyCallback callback) {
  on_ready_callback_ = std::move(callback);

  // Keep the callback pending unless it can be run immediately.
  if (!IsSpeechRecognitionAvailable() || !on_ready_callback_)
    return;

  std::move(on_ready_callback_).Run();
}
// Forwards a recognition |result| to the caption host as a
// TranscriptionResult carrying the transcription text and its finality flag.
// The host's reply is delivered to OnTranscriptionCallback().
void ChromeSpeechRecognitionClient::OnSpeechRecognitionRecognitionEvent(
    media::mojom::SpeechRecognitionResultPtr result) {
  auto transcription_result = chrome::mojom::TranscriptionResult::New(
      result->transcription, result->is_final);
  auto on_transcription_reply =
      base::BindOnce(&ChromeSpeechRecognitionClient::OnTranscriptionCallback,
                     base::Unretained(this));
  caption_host_->OnTranscription(std::move(transcription_result),
                                 std::move(on_transcription_reply));
}
// Reply to caption_host_->OnTranscription(). |success| becomes the new value
// of |is_browser_requesting_transcription_|, so a false reply makes
// IsSpeechRecognitionAvailable() return false.
void ChromeSpeechRecognitionClient::OnTranscriptionCallback(bool success) {
  is_browser_requesting_transcription_ = success;
}
// Copies every frame of |buffer| into |temp_audio_bus_|. The bus is reused
// when it already has the buffer's channel and frame counts and is
// reallocated otherwise.
void ChromeSpeechRecognitionClient::CopyBufferToTempAudioBus(
    const media::AudioBuffer& buffer) {
  const int channels = buffer.channel_count();
  const int frames = buffer.frame_count();

  const bool can_reuse_bus = temp_audio_bus_ &&
                             channels == temp_audio_bus_->channels() &&
                             frames == temp_audio_bus_->frames();
  if (!can_reuse_bus)
    temp_audio_bus_ = media::AudioBus::Create(channels, frames);

  buffer.ReadFrames(frames,
                    /* source_frame_offset */ 0, /* dest_frame_offset */ 0,
                    temp_audio_bus_.get());
}
// Prepares |monaural_audio_bus_| and |channel_mixer_| for mixing audio with
// the given |frame_count| and |channel_layout| down to a single channel.
//
// The mono bus is reallocated only when it is missing or its frame count
// differs from |frame_count|. The mixer is rebuilt only when it is missing or
// |channel_layout| differs from the layout it was last built for.
void ChromeSpeechRecognitionClient::ResetChannelMixer(
    int frame_count,
    media::ChannelLayout channel_layout) {
  if (!monaural_audio_bus_ || frame_count != monaural_audio_bus_->frames()) {
    monaural_audio_bus_ =
        media::AudioBus::Create(1 /* channels */, frame_count);
  }

  // Also create the mixer when none exists yet. Comparing layouts alone would
  // skip creation if the first |channel_layout| happened to equal the initial
  // value of |channel_layout_| (declared outside this file), and the callers
  // dereference |channel_mixer_| right after this call.
  if (!channel_mixer_ || channel_layout != channel_layout_) {
    channel_layout_ = channel_layout;
    channel_mixer_ = std::make_unique<media::ChannelMixer>(
        channel_layout, media::CHANNEL_LAYOUT_MONO);
  }
}
// Sends |audio_data| to the recognizer when speech recognition is currently
// available; otherwise the data is dropped. |audio_data| must not be null.
void ChromeSpeechRecognitionClient::SendAudioToSpeechRecognitionService(
    media::mojom::AudioDataS16Ptr audio_data) {
  DCHECK(audio_data);

  if (!IsSpeechRecognitionAvailable())
    return;

  speech_recognition_recognizer_->SendAudioToSpeechRecognitionService(
      std::move(audio_data));
}
// Converts |buffer| into an AudioDataS16 holding interleaved signed 16-bit
// samples, along with the buffer's frame count, sample rate and channel count.
//
// Three paths, checked in this order:
//  1. More than one channel and the recognizer does not support multichannel
//     audio: the channels are mixed down to mono and |channel_count| is 1.
//  2. The buffer's sample format is already kSampleFormatS16: the samples are
//     copied directly.
//  3. Otherwise the samples are converted through |temp_audio_bus_|.
media::mojom::AudioDataS16Ptr
ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
    scoped_refptr<media::AudioBuffer> buffer) {
  const int frame_count = buffer->frame_count();
  const int channel_count = buffer->channel_count();
  DCHECK_GT(frame_count, 0);
  DCHECK_GT(channel_count, 0);
  DCHECK_GT(buffer->sample_rate(), 0);

  auto signed_buffer = media::mojom::AudioDataS16::New();
  signed_buffer->channel_count = channel_count;
  signed_buffer->frame_count = frame_count;
  signed_buffer->sample_rate = buffer->sample_rate();

  // If multichannel audio is not supported by the speech recognition service,
  // mix the channels into a monaural channel before converting it.
  const bool needs_mono_mix = channel_count > 1 && !is_multichannel_supported_;
  if (needs_mono_mix) {
    signed_buffer->channel_count = 1;
    CopyBufferToTempAudioBus(*buffer);
    ResetChannelMixer(frame_count, buffer->channel_layout());
    signed_buffer->data.resize(frame_count);
    channel_mixer_->Transform(temp_audio_bus_.get(), monaural_audio_bus_.get());
    monaural_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
        monaural_audio_bus_->frames(), signed_buffer->data.data());
    return signed_buffer;
  }

  // If the audio is already in the interleaved signed int 16 format, directly
  // assign it to the buffer.
  if (buffer->sample_format() == media::SampleFormat::kSampleFormatS16) {
    int16_t* const first_sample =
        reinterpret_cast<int16_t*>(buffer->channel_data()[0]);
    int16_t* const past_last_sample =
        first_sample + frame_count * channel_count;
    signed_buffer->data.assign(first_sample, past_last_sample);
    return signed_buffer;
  }

  // Convert the raw audio to the interleaved signed int 16 sample type.
  CopyBufferToTempAudioBus(*buffer);
  signed_buffer->data.resize(frame_count * channel_count);
  temp_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
      temp_audio_bus_->frames(), signed_buffer->data.data());
  return signed_buffer;
}
// Converts |audio_bus| into an AudioDataS16 holding interleaved signed 16-bit
// samples, tagged with |sample_rate| and the bus's frame and channel counts.
//
// When the bus has more than one channel and the recognizer does not support
// multichannel audio, the channels are first mixed down to mono using
// |channel_layout| and the reported |channel_count| is 1.
media::mojom::AudioDataS16Ptr
ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
    std::unique_ptr<media::AudioBus> audio_bus,
    int sample_rate,
    media::ChannelLayout channel_layout) {
  const int frame_count = audio_bus->frames();
  const int channel_count = audio_bus->channels();
  DCHECK_GT(frame_count, 0);
  DCHECK_GT(channel_count, 0);

  auto signed_buffer = media::mojom::AudioDataS16::New();
  signed_buffer->channel_count = channel_count;
  signed_buffer->frame_count = frame_count;
  signed_buffer->sample_rate = sample_rate;

  // If multichannel audio is not supported by the speech recognition service,
  // mix the channels into a monaural channel before converting it.
  const bool needs_mono_mix = channel_count > 1 && !is_multichannel_supported_;
  if (needs_mono_mix) {
    signed_buffer->channel_count = 1;
    ResetChannelMixer(frame_count, channel_layout);
    signed_buffer->data.resize(frame_count);
    channel_mixer_->Transform(audio_bus.get(), monaural_audio_bus_.get());
    monaural_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
        monaural_audio_bus_->frames(), signed_buffer->data.data());
    return signed_buffer;
  }

  // No mixing needed: interleave all channels as they are.
  signed_buffer->data.resize(frame_count * channel_count);
  audio_bus->ToInterleaved<media::SignedInt16SampleTypeTraits>(
      frame_count, signed_buffer->data.data());
  return signed_buffer;
}
// Returns true when |url| is an exact match for an entry in |blocked_urls_|.
bool ChromeSpeechRecognitionClient::IsUrlBlocked(const std::string& url) const {
  const auto match = blocked_urls_.find(url);
  return match != blocked_urls_.end();
}