blob: fc157408dfe08b67045c0db8bccd4b0ca7e637b9 [file] [log] [blame]
// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "content/browser/speech/speech_recognition_dispatcher_host.h"
#include <memory>
#include "base/command_line.h"
#include "base/functional/bind.h"
#include "base/lazy_instance.h"
#include "content/browser/renderer_host/frame_tree_node.h"
#include "content/browser/renderer_host/render_frame_host_manager.h"
#include "content/browser/speech/speech_recognition_manager_impl.h"
#include "content/browser/speech/speech_recognition_session.h"
#include "content/browser/web_contents/web_contents_impl.h"
#include "content/public/browser/browser_context.h"
#include "content/public/browser/browser_thread.h"
#include "content/public/browser/content_browser_client.h"
#include "content/public/browser/render_frame_host.h"
#include "content/public/browser/speech_recognition_audio_forwarder_config.h"
#include "content/public/browser/speech_recognition_manager_delegate.h"
#include "content/public/browser/speech_recognition_session_config.h"
#include "content/public/browser/speech_recognition_session_context.h"
#include "content/public/browser/storage_partition.h"
#include "content/public/common/content_client.h"
#include "content/public/common/content_switches.h"
#include "ipc/constants.mojom.h"
#include "media/mojo/mojom/speech_recognizer.mojom.h"
#include "mojo/public/cpp/bindings/message.h"
#include "mojo/public/cpp/bindings/self_owned_receiver.h"
#include "services/network/public/cpp/shared_url_loader_factory.h"
namespace {
// Picks the language tag to use for a recognition session.
//
// |language| is the tag explicitly requested by the caller; when it is
// non-empty it is returned unchanged. Otherwise the first entry of
// |accept_language| is used, i.e. everything before the first ',' or ';'
// (example list contents: "es,en-GB;q=0.8", "es", ""). If neither source
// yields a language, the result defaults to "en-US".
std::string GetAcceptedLanguages(const std::string& language,
                                 const std::string& accept_language) {
  if (!language.empty()) {
    return language;
  }

  // find_first_of() returns npos when the list holds a single entry with no
  // separator (e.g. "es"). substr(0, npos) then yields the whole string, so a
  // lone entry is honoured instead of being discarded in favour of "en-US".
  std::string first_accepted =
      accept_language.substr(0, accept_language.find_first_of(",;"));
  if (first_accepted.empty()) {
    return "en-US";
  }
  return first_accepted;
}
} // namespace
namespace content {
// Stores the process and frame ids this host was created for; nothing else
// happens at construction time.
SpeechRecognitionDispatcherHost::SpeechRecognitionDispatcherHost(
    int render_process_id,
    int render_frame_id)
    : render_process_id_(render_process_id),
      render_frame_id_(render_frame_id) {
  // Keep this constructor trivial. Anything non-trivial should be initialized
  // lazily at the point of use (e.g. see the method
  // |SpeechRecognitionManager::GetInstance()|) or moved into an Init() method.
}
// static
// Builds a dispatcher host for the given process/frame ids and binds it to
// |receiver| as a self-owned receiver.
void SpeechRecognitionDispatcherHost::Create(
    int render_process_id,
    int render_frame_id,
    mojo::PendingReceiver<media::mojom::SpeechRecognizer> receiver) {
  auto dispatcher_host = std::make_unique<SpeechRecognitionDispatcherHost>(
      render_process_id, render_frame_id);
  mojo::MakeSelfOwnedReceiver(std::move(dispatcher_host), std::move(receiver));
}
// No explicit teardown is needed; members clean up after themselves.
SpeechRecognitionDispatcherHost::~SpeechRecognitionDispatcherHost() = default;
// Returns a weak pointer to this host, vended by |weak_factory_|.
base::WeakPtr<SpeechRecognitionDispatcherHost>
SpeechRecognitionDispatcherHost::AsWeakPtr() {
  base::WeakPtr<SpeechRecognitionDispatcherHost> weak_this =
      weak_factory_.GetWeakPtr();
  return weak_this;
}
// -------- media::mojom::SpeechRecognizer interface implementation ------------
// Handles a request to begin speech recognition. Runs on the IO thread.
//
// When |params| carries a valid |audio_forwarder|, the accompanying
// |channel_count| and |sample_rate| must both be positive; otherwise the
// message is reported as bad and the request is dropped. Valid requests are
// posted to the UI thread, where StartRequestOnUI() continues the work.
void SpeechRecognitionDispatcherHost::Start(
    media::mojom::StartSpeechRecognitionRequestParamsPtr params) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);

  if (params->audio_forwarder.is_valid()) {
    // |params| is supplied by the remote end of the mojo pipe, so invalid
    // values are rejected with ReportBadMessage() rather than CHECKed. A CHECK
    // here would abort this process on bad input and would make the
    // bad-message handling below unreachable.
    if (params->channel_count <= 0) {
      mojo::ReportBadMessage("Channel count must be positive.");
      return;
    }
    if (params->sample_rate <= 0) {
      mojo::ReportBadMessage("Sample rate must be positive.");
      return;
    }
  }

  GetUIThreadTaskRunner({})->PostTask(
      FROM_HERE,
      base::BindOnce(&SpeechRecognitionDispatcherHost::StartRequestOnUI,
                     AsWeakPtr(), render_process_id_, render_frame_id_,
                     std::move(params)));
}
// static
// UI-thread half of starting a recognition request. Looks up the requesting
// frame, collects the values that are read here on the UI thread (embedder
// frame ids, origin, URL loader factory, accept languages and whether the
// frame may use on-device recognition) and posts StartSessionOnIO() to the IO
// thread with them. Returns without posting if the frame no longer exists.
void SpeechRecognitionDispatcherHost::StartRequestOnUI(
    base::WeakPtr<SpeechRecognitionDispatcherHost>
        speech_recognition_dispatcher_host,
    int render_process_id,
    int render_frame_id,
    media::mojom::StartSpeechRecognitionRequestParamsPtr params) {
  DCHECK_CURRENTLY_ON(BrowserThread::UI);

  RenderFrameHostImpl* const frame_host =
      RenderFrameHostImpl::FromID(render_process_id, render_frame_id);
  if (frame_host == nullptr) {
    DLOG(ERROR) << "SRDH::OnStartRequest, invalid frame";
    return;
  }

  auto* const contents = static_cast<WebContentsImpl*>(
      WebContents::FromRenderFrameHost(frame_host));

  // Speech recognition is not handled once a page has been placed in the
  // BackForwardCache, so using the feature makes the page ineligible for it.
  // TODO(sreejakshetty): Make SpeechRecognition compatible with
  // BackForwardCache.
  frame_host->OnBackForwardCacheDisablingStickyFeatureUsed(
      blink::scheduler::WebSchedulerTrackedFeature::kSpeechRecognizer);

  // Requests coming from an inner WebContents or a guest carry the ids of the
  // outer WebContents / embedder frame as well, because the permission
  // decision is made against that context. Without an outer WebContents the
  // defaults below are passed through unchanged.
  int embedder_process_id = 0;
  int embedder_frame_id = IPC::mojom::kRoutingIdNone;
  if (WebContents* const outer_contents = contents->GetOuterWebContents()) {
    FrameTreeNode* const outer_delegate_node =
        contents->GetPrimaryMainFrame()
            ->frame_tree_node()
            ->render_manager()
            ->GetOuterDelegateNode();

    RenderFrameHost* embedder_frame = nullptr;
    if (outer_delegate_node != nullptr) {
      embedder_frame = outer_delegate_node->current_frame_host();
    } else {
      // The outer web contents is embedded using the browser plugin, so there
      // is no delegate node; use the outer contents' primary main frame.
      // TODO(avi): When the browser plugin is retired, remove this code.
      embedder_frame = outer_contents->GetPrimaryMainFrame();
    }

    embedder_process_id = embedder_frame->GetProcess()->GetDeprecatedID();
    DCHECK_NE(embedder_process_id, 0);
    embedder_frame_id = embedder_frame->GetRoutingID();
    DCHECK_NE(embedder_frame_id, IPC::mojom::kRoutingIdNone);
  }

  content::BrowserContext* const context = contents->GetBrowserContext();
  StoragePartition* const partition =
      context->GetStoragePartition(contents->GetSiteInstance());

  // Frames in the default storage partition may always use on-device
  // recognition; frames in any other partition may only do so when their last
  // committed URL is not http(s).
  const bool on_device_allowed =
      partition == context->GetDefaultStoragePartition() ||
      !frame_host->GetLastCommittedURL().SchemeIsHTTPOrHTTPS();

  GetIOThreadTaskRunner({})->PostTask(
      FROM_HERE,
      base::BindOnce(&SpeechRecognitionDispatcherHost::StartSessionOnIO,
                     speech_recognition_dispatcher_host, std::move(params),
                     embedder_process_id, embedder_frame_id,
                     frame_host->GetLastCommittedOrigin(),
                     partition->GetURLLoaderFactoryForBrowserProcessIOThread(),
                     GetContentClient()->browser()->GetAcceptLangs(context),
                     on_device_allowed));
}
// IO-thread half of starting a recognition request. Builds the session context
// and config from |params| and the values gathered on the UI thread, then
// creates the session through CreateSession(). Depending on the manager's
// on-device decision and on whether |params| carries an audio forwarder, the
// session endpoints are either handed to CreateSession() directly or bound to
// a SpeechRecognitionSession created here.
void SpeechRecognitionDispatcherHost::StartSessionOnIO(
    media::mojom::StartSpeechRecognitionRequestParamsPtr params,
    int embedder_render_process_id,
    int embedder_render_frame_id,
    const url::Origin& origin,
    std::unique_ptr<network::PendingSharedURLLoaderFactory>
        pending_shared_url_loader_factory,
    const std::string& accept_language,
    bool can_render_frame_use_on_device) {
  DCHECK_CURRENTLY_ON(BrowserThread::IO);

  // Identifies the requesting frame and the embedder ids passed in.
  SpeechRecognitionSessionContext session_context;
  session_context.render_process_id = render_process_id_;
  session_context.render_frame_id = render_frame_id_;
  session_context.embedder_render_process_id = embedder_render_process_id;
  session_context.embedder_render_frame_id = embedder_render_frame_id;
  session_context.security_origin = origin;

  SpeechRecognitionSessionConfig config;
  config.origin = origin;
  config.initial_context = session_context;
  config.language = GetAcceptedLanguages(params->language, accept_language);
  config.max_hypotheses = params->max_hypotheses;
  config.continuous = params->continuous;
  config.interim_results = params->interim_results;
  config.on_device = params->on_device;
  config.allow_cloud_fallback = params->allow_cloud_fallback;
  config.recognition_context = params->recognition_context;
  config.filter_profanities = false;
  config.shared_url_loader_factory = network::SharedURLLoaderFactory::Create(
      std::move(pending_shared_url_loader_factory));
  for (auto& grammar : params->grammars) {
    config.grammars.push_back(*grammar);
  }

  const bool run_in_speech_service =
      SpeechRecognitionManager::GetInstance()->UseOnDeviceSpeechRecognition(
          config) &&
      params->audio_forwarder.is_valid();

  if (run_in_speech_service) {
    // Use on-device speech recognition, bypassing the browser process. The
    // speech recognition session will live in the speech recognition service
    // process, so the session receiver and client are handed over as-is.
    CreateSession(config, std::move(params->session_receiver),
                  std::move(params->client),
                  std::make_optional<SpeechRecognitionAudioForwarderConfig>(
                      std::move(params->audio_forwarder), params->channel_count,
                      params->sample_rate),
                  can_render_frame_use_on_device);
    return;
  }

  // Cloud-based recognition, or recognition fed by microphone input: the
  // session object is created here in the browser and listens for events.
  auto browser_session =
      std::make_unique<SpeechRecognitionSession>(std::move(params->client));
  config.event_listener = browser_session->AsWeakPtr();

  const int new_session_id = CreateSession(
      config, mojo::NullReceiver(), mojo::NullRemote(),
      params->audio_forwarder.is_valid()
          ? std::make_optional<SpeechRecognitionAudioForwarderConfig>(
                std::move(params->audio_forwarder), params->channel_count,
                params->sample_rate)
          : std::nullopt,
      can_render_frame_use_on_device);
  DCHECK_NE(new_session_id, SpeechRecognitionManager::kSessionIDInvalid);
  browser_session->SetSessionId(new_session_id);

  // Bind the session to its receiver before asking the manager to start it.
  mojo::MakeSelfOwnedReceiver(std::move(browser_session),
                              std::move(params->session_receiver));
  SpeechRecognitionManager::GetInstance()->StartSession(new_session_id);
}
// Creates a recognition session on the active SpeechRecognitionManager and
// returns the id that manager's CreateSession() returns.
//
// If the active manager is the SpeechRecognitionManagerImpl singleton, the
// call also forwards |can_render_frame_use_on_device|. Any other manager (the
// "fake manager" case) is called through the base interface, which does not
// take that flag.
int SpeechRecognitionDispatcherHost::CreateSession(
    const SpeechRecognitionSessionConfig& config,
    mojo::PendingReceiver<media::mojom::SpeechRecognitionSession>
        session_receiver,
    mojo::PendingRemote<media::mojom::SpeechRecognitionSessionClient>
        client_remote,
    std::optional<SpeechRecognitionAudioForwarderConfig> audio_forwarder_config,
    bool can_render_frame_use_on_device) {
  auto* const active_manager = SpeechRecognitionManager::GetInstance();
  auto* const manager_impl = SpeechRecognitionManagerImpl::GetInstance();

  // NOTE(review): the forwarder config is re-wrapped from the lvalue via
  // make_optional rather than moved; presumably this depends on how
  // SpeechRecognitionAudioForwarderConfig is declared. Confirm before
  // changing it.
  if (active_manager == manager_impl) {
    return manager_impl->CreateSession(
        config, std::move(session_receiver), std::move(client_remote),
        audio_forwarder_config.has_value()
            ? std::make_optional<SpeechRecognitionAudioForwarderConfig>(
                  audio_forwarder_config.value())
            : std::nullopt,
        can_render_frame_use_on_device);
  }

  return active_manager->CreateSession(
      config, std::move(session_receiver), std::move(client_remote),
      audio_forwarder_config.has_value()
          ? std::make_optional<SpeechRecognitionAudioForwarderConfig>(
                audio_forwarder_config.value())
          : std::nullopt);
}
} // namespace content