// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/speech/speech_recognition_recognizer_client_impl.h"
#include <algorithm>
#include <utility>
#include "ash/constants/ash_features.h"
#include "base/strings/utf_string_conversions.h"
#include "chrome/browser/profiles/profile.h"
#include "chrome/browser/speech/cros_speech_recognition_service.h"
#include "chrome/browser/speech/cros_speech_recognition_service_factory.h"
#include "chrome/browser/speech/speech_recognizer_delegate.h"
#include "components/soda/soda_installer.h"
#include "content/public/browser/audio_service.h"
#include "content/public/browser/browser_thread.h"
#include "media/audio/audio_system.h"
#include "media/base/audio_parameters.h"
#include "media/base/bind_to_current_loop.h"
namespace {
// Sample rate used by content::SpeechRecognizerImpl, which is used
// by NetworkSpeechRecognizer.
static constexpr int kAudioSampleRate = 16000;
// This is about how many times we want the audio callback to happen per
// second. Web speech recognition happens about 10 times per second, so we
// take that conservative number here. We can increase it if it seems laggy.
static constexpr int kPollingTimesPerSecond = 10;
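// At the default sample rate this works out to 16000 / 10 = 1600 frames per
// buffer, i.e. roughly 100 ms of audio delivered per callback.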
media::AudioParameters GetAudioParameters(
const absl::optional<media::AudioParameters>& params,
bool is_multichannel_supported) {
if (params) {
media::AudioParameters result = params.value();
int sample_rate = params->sample_rate();
int frames_per_buffer = std::max(params->frames_per_buffer(),
sample_rate / kPollingTimesPerSecond);
media::ChannelLayoutConfig channel_layout_config =
is_multichannel_supported ? params->channel_layout_config()
: media::ChannelLayoutConfig::Mono();
result.Reset(params->format(), channel_layout_config, sample_rate,
frames_per_buffer);
return result;
}
static_assert(kAudioSampleRate % 100 == 0,
"Audio sample rate is not divisible by 100");
return media::AudioParameters(
media::AudioParameters::AUDIO_PCM_LOW_LATENCY,
is_multichannel_supported ? media::ChannelLayoutConfig::Stereo()
: media::ChannelLayoutConfig::Mono(),
kAudioSampleRate, kAudioSampleRate / kPollingTimesPerSecond);
}
} // namespace
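// Illustrative sketch (not part of this file's logic): a hypothetical caller
// choosing between recognizers might gate on the availability check below,
// falling back to NetworkSpeechRecognizer when on-device SODA is missing:
//   if (SpeechRecognitionRecognizerClientImpl::
//           IsOnDeviceSpeechRecognizerAvailable("en-US")) {
//     recognizer = std::make_unique<SpeechRecognitionRecognizerClientImpl>(
//         delegate, profile, std::move(options));
//   } else {
//     // Fall back to a NetworkSpeechRecognizer.
//   }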
bool SpeechRecognitionRecognizerClientImpl::IsOnDeviceSpeechRecognizerAvailable(
const std::string& language) {
if (!base::FeatureList::IsEnabled(ash::features::kOnDeviceSpeechRecognition))
return false;
speech::SodaInstaller* soda_installer = speech::SodaInstaller::GetInstance();
return soda_installer->IsSodaInstalled(speech::GetLanguageCode(language));
}
SpeechRecognitionRecognizerClientImpl::SpeechRecognitionRecognizerClientImpl(
const base::WeakPtr<SpeechRecognizerDelegate>& delegate,
Profile* profile,
media::mojom::SpeechRecognitionOptionsPtr options)
: SpeechRecognizer(delegate),
state_(SpeechRecognizerStatus::SPEECH_RECOGNIZER_OFF),
is_multichannel_supported_(false),
waiting_for_params_(false) {
DCHECK_CURRENTLY_ON(content::BrowserThread::UI);
DCHECK(options->language.has_value());
language_ = options->language.value();
// Connect the AudioSourceSpeechRecognitionContext & bind an
// AudioSourceFetcher recognizer.
CrosSpeechRecognitionServiceFactory::GetForProfile(profile)
->BindAudioSourceSpeechRecognitionContext(
audio_source_speech_recognition_context_
.BindNewPipeAndPassReceiver());
audio_source_speech_recognition_context_->BindAudioSourceFetcher(
audio_source_fetcher_.BindNewPipeAndPassReceiver(),
speech_recognition_client_receiver_.BindNewPipeAndPassRemote(),
std::move(options),
media::BindToCurrentLoop(base::BindOnce(
&SpeechRecognitionRecognizerClientImpl::OnRecognizerBound,
weak_factory_.GetWeakPtr())));
audio_source_speech_recognition_context_.set_disconnect_handler(
media::BindToCurrentLoop(base::BindOnce(
&SpeechRecognitionRecognizerClientImpl::OnRecognizerDisconnected,
weak_factory_.GetWeakPtr())));
}
SpeechRecognitionRecognizerClientImpl::
~SpeechRecognitionRecognizerClientImpl() {
audio_source_fetcher_->Stop();
audio_source_fetcher_.reset();
speech_recognition_client_receiver_.reset();
audio_source_speech_recognition_context_.reset();
}
void SpeechRecognitionRecognizerClientImpl::Start() {
// Get audio parameters from the AudioSystem, and use these to start
// recognition from the callback.
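// The parameters are delivered asynchronously to
// StartFetchingOnInputDeviceInfo(), which binds an audio stream factory and
// starts the AudioSourceFetcher.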
if (!audio_system_)
audio_system_ = content::CreateAudioSystemForAudioService();
waiting_for_params_ = true;
audio_system_->GetInputStreamParameters(
media::AudioDeviceDescription::kDefaultDeviceId,
base::BindOnce(&SpeechRecognitionRecognizerClientImpl::
StartFetchingOnInputDeviceInfo,
weak_factory_.GetWeakPtr()));
}
void SpeechRecognitionRecognizerClientImpl::Stop() {
audio_source_fetcher_->Stop();
UpdateStatus(SpeechRecognizerStatus::SPEECH_RECOGNITION_STOPPING);
}
void SpeechRecognitionRecognizerClientImpl::OnSpeechRecognitionRecognitionEvent(
const media::SpeechRecognitionResult& result,
OnSpeechRecognitionRecognitionEventCallback reply) {
DCHECK_CURRENTLY_ON(content::BrowserThread::UI);
// Returning true ensures that speech recognition continues.
std::move(reply).Run(true);
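// An empty transcription carries nothing to forward, so don't move to
// SPEECH_RECOGNIZER_IN_SPEECH or notify the delegate.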
if (!result.transcription.size())
return;
UpdateStatus(SpeechRecognizerStatus::SPEECH_RECOGNIZER_IN_SPEECH);
delegate()->OnSpeechResult(base::UTF8ToUTF16(result.transcription),
result.is_final, result);
}
void SpeechRecognitionRecognizerClientImpl::OnSpeechRecognitionError() {
UpdateStatus(SpeechRecognizerStatus::SPEECH_RECOGNIZER_ERROR);
}
void SpeechRecognitionRecognizerClientImpl::OnLanguageIdentificationEvent(
media::mojom::LanguageIdentificationEventPtr event) {
// TODO(b/260372471): pipe through language info.
}
void SpeechRecognitionRecognizerClientImpl::OnSpeechRecognitionStopped() {
UpdateStatus(SpeechRecognizerStatus::SPEECH_RECOGNIZER_READY);
delegate()->OnSpeechRecognitionStopped();
}
void SpeechRecognitionRecognizerClientImpl::OnRecognizerBound(
bool is_multichannel_supported) {
is_multichannel_supported_ = is_multichannel_supported;
UpdateStatus(SpeechRecognizerStatus::SPEECH_RECOGNIZER_READY);
}
void SpeechRecognitionRecognizerClientImpl::OnRecognizerDisconnected() {
UpdateStatus(SpeechRecognizerStatus::SPEECH_RECOGNIZER_ERROR);
}
void SpeechRecognitionRecognizerClientImpl::StartFetchingOnInputDeviceInfo(
const absl::optional<media::AudioParameters>& params) {
// waiting_for_params_ was set before requesting audio parameters from the
// AudioSystem, which returns here asynchronously. If it has been cleared in
// the meantime (e.g. by Stop() or an error), we should not start fetching.
if (!waiting_for_params_)
return;
waiting_for_params_ = false;
// Bind to an AudioSourceFetcher in the Speech Recognition service,
// passing the stream factory so it can listen to mic audio.
mojo::PendingRemote<media::mojom::AudioStreamFactory> stream_factory;
content::GetAudioServiceStreamFactoryBinder().Run(
stream_factory.InitWithNewPipeAndPassReceiver());
audio_source_fetcher_->Start(
std::move(stream_factory),
media::AudioDeviceDescription::kDefaultDeviceId,
GetAudioParameters(params, is_multichannel_supported_));
UpdateStatus(SpeechRecognizerStatus::SPEECH_RECOGNIZER_RECOGNIZING);
}
void SpeechRecognitionRecognizerClientImpl::UpdateStatus(
SpeechRecognizerStatus state) {
DCHECK_CURRENTLY_ON(content::BrowserThread::UI);
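// Clearing waiting_for_params_ on any state change cancels a pending start
// (see StartFetchingOnInputDeviceInfo()).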
waiting_for_params_ = false;
if (state_ == state)
return;
state_ = state;
// Since the |OnSpeechRecognitionStateChanged| call below can destroy |this|,
// it must be the last thing done here.
delegate()->OnSpeechRecognitionStateChanged(state);
}