chrome/browser/speech/cros_speech_recognition_service.cc - chromium/src - Git at Google

 // Copyright 2021 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "chrome/browser/speech/cros_speech_recognition_service.h"

 #include <memory>
 #include <string>
 #include <utility>

 #include "ash/constants/ash_features.h"
 #include "base/containers/flat_map.h"
 #include "base/files/file_path.h"
 #include "base/functional/bind.h"
 #include "base/notimplemented.h"
 #include "base/types/optional_util.h"
 #include "chrome/services/speech/audio_source_fetcher_impl.h"
 #include "chrome/services/speech/buildflags/buildflags.h"
 #include "chrome/services/speech/cros_speech_recognition_recognizer_impl.h"
 #include "components/live_caption/pref_names.h"
 #include "components/soda/constants.h"
 #include "components/soda/soda_installer.h"
 #include "components/user_prefs/user_prefs.h"
 #include "content/public/browser/browser_context.h"
 #include "content/public/browser/browser_task_traits.h"
 #include "content/public/browser/browser_thread.h"
 #include "content/public/browser/storage_partition.h"
 #include "media/base/media_switches.h"
 #include "media/mojo/mojom/speech_recognition.mojom-shared.h"
 #include "media/mojo/mojom/speech_recognition.mojom.h"
 #include "media/mojo/mojom/speech_recognition_service.mojom.h"
 #include "services/network/public/cpp/shared_url_loader_factory.h"

 #if BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)
 #include "chrome/services/speech/internal/server_based_recognition_recognizer.h"
 #endif  // BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)

 namespace speech {

 namespace {

 // Message handed to mojo::ReportBadMessage() when this service rejects the
 // SpeechRecognitionOptions of a bind request.
 // NOTE(review): the identifier is misspelled ("Recogntion"); renaming it
 // means updating every use in this file at the same time.
 constexpr char kInvalidSpeechRecogntionOptions[] =
     "Invalid SpeechRecognitionOptions provided";

 // Fills the two out-parameters from the SodaInstaller singleton:
 //   - `binary_path` receives the installer's SODA binary path.
 //   - `config_paths` gains (or overwrites) one entry per installed language,
 //     keyed by the language name and mapped to that language's path.
 void PopulateFilePaths(
     base::FilePath& binary_path,
     base::flat_map<std::string, base::FilePath>& config_paths) {
   speech::SodaInstaller* const installer =
       speech::SodaInstaller::GetInstance();
   binary_path = installer->GetSodaBinaryPath();

   for (const auto& code : installer->InstalledLanguages()) {
     // The name serves both as the lookup argument and as the map key.
     const std::string name = GetLanguageName(code);
     config_paths[name] = installer->GetLanguagePath(name);
   }
 }

 // Default recognizer factory: forwards every argument unchanged to the
 // CrosSpeechRecognitionRecognizerImpl constructor and returns the new
 // instance. `client` and `options` are consumed.
 std::unique_ptr<CrosSpeechRecognitionRecognizerImpl>
 CreateCrosSpeechRecognitionRecognizer(
     mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> client,
     media::mojom::SpeechRecognitionOptionsPtr options,
     const base::FilePath& binary_path,
     const base::flat_map<std::string, base::FilePath>& config_paths,
     const std::string& primary_language_name,
     const bool mask_offensive_words) {
   auto recognizer = std::make_unique<CrosSpeechRecognitionRecognizerImpl>(
       std::move(client), std::move(options), binary_path, config_paths,
       primary_language_name, mask_offensive_words);
   return recognizer;
 }

 }  // namespace

 // Forwards `context` to the ChromeSpeechRecognitionService base and installs
 // CreateCrosSpeechRecognitionRecognizer() as the recognizer factory.
 CrosSpeechRecognitionService::CrosSpeechRecognitionService(
     content::BrowserContext* context)
     : ChromeSpeechRecognitionService(context) {
   auto default_factory =
       base::BindRepeating(&CreateCrosSpeechRecognitionRecognizer);
   cros_speech_recognition_recognizer_cb_ = std::move(default_factory);
 }

 // Defaulted: cleanup is left to the base class and member destructors.
 CrosSpeechRecognitionService::~CrosSpeechRecognitionService() = default;

 // Adds `receiver` to `speech_recognition_contexts_`, paired with this service
 // (presumably a mojo receiver set declared in the header - confirm there).
 void CrosSpeechRecognitionService::BindSpeechRecognitionContext(
     mojo::PendingReceiver<media::mojom::SpeechRecognitionContext> receiver) {
   speech_recognition_contexts_.Add(this, std::move(receiver));
 }

 // Adds `receiver` to `audio_source_speech_recognition_contexts_`, paired with
 // this service (presumably a mojo receiver set declared in the header -
 // confirm there).
 void CrosSpeechRecognitionService::BindAudioSourceSpeechRecognitionContext(
     mojo::PendingReceiver<media::mojom::AudioSourceSpeechRecognitionContext>
         receiver) {
   audio_source_speech_recognition_contexts_.Add(this, std::move(receiver));
 }

 // Binds an on-device recognizer for Live Caption.
 //
 // A request that asks for server-based recognition, or whose client type is
 // not kLiveCaption, is reported as a bad message and `callback` is not run.
 // Otherwise CrosSpeechRecognitionRecognizerImpl::Create() is called for
 // `receiver`/`client` and `callback` is run with the result of
 // CrosSpeechRecognitionRecognizerImpl::IsMultichannelSupported().
 void CrosSpeechRecognitionService::BindRecognizer(
     mojo::PendingReceiver<media::mojom::SpeechRecognitionRecognizer> receiver,
     mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> client,
     media::mojom::SpeechRecognitionOptionsPtr options,
     BindRecognizerCallback callback) {
   // This binding is used by LiveCaption and it can't be server based
   // recognition.
   const bool is_on_device_live_caption =
       !options->is_server_based &&
       options->recognizer_client_type ==
           media::mojom::RecognizerClientType::kLiveCaption;
   if (!is_on_device_live_caption) {
     mojo::ReportBadMessage(kInvalidSpeechRecogntionOptions);
     return;
   }

   base::FilePath binary_path;
   base::flat_map<std::string, base::FilePath> config_paths;
   PopulateFilePaths(binary_path, config_paths);

   // The options should have locale set, but if they don't, pull from the prefs.
   const std::string language_name =
       options->language.has_value()
           ? *options->language
           : prefs::GetLiveCaptionLanguageCode(
                 user_prefs::UserPrefs::Get(context()));

   // TODO(crbug.com/40924425): Implement offensive word mask on ChromeOS so that
   // mask_offensive_words is not hard-coded.
   constexpr bool kMaskOffensiveWords = false;
   CrosSpeechRecognitionRecognizerImpl::Create(
       std::move(receiver), std::move(client), std::move(options), binary_path,
       config_paths, language_name, kMaskOffensiveWords);

   const bool multichannel_supported =
       CrosSpeechRecognitionRecognizerImpl::IsMultichannelSupported();
   std::move(callback).Run(multichannel_supported);
 }

 // Not implemented for this service: none of the arguments are used and the
 // body only invokes NOTIMPLEMENTED().
 void CrosSpeechRecognitionService::BindWebSpeechRecognizer(
     mojo::PendingReceiver<media::mojom::SpeechRecognitionSession>
         session_receiver,
     mojo::PendingRemote<media::mojom::SpeechRecognitionSessionClient>
         session_client,
     mojo::PendingReceiver<media::mojom::SpeechRecognitionAudioForwarder>
         audio_forwarder,
     int channel_count,
     int sample_rate,
     media::mojom::SpeechRecognitionOptionsPtr options,
     bool continuous) {
   NOTIMPLEMENTED();
 }

 // Binds an AudioSourceFetcher backed by either an on-device recognizer or,
 // when `options->is_server_based` is set, a server-based one.
 //
 // On-device: the SODA paths and the language are resolved on the calling
 // thread, creation of the fetcher is posted to the browser IO thread, and
 // `callback` is run with
 // CrosSpeechRecognitionRecognizerImpl::IsMultichannelSupported().
 //
 // Server based: only served when ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER
 // is compiled in and the ash feature is enabled; `callback` is then run with
 // false. In every other case the request is reported as a bad message and
 // `callback` is not run.
 void CrosSpeechRecognitionService::BindAudioSourceFetcher(
     mojo::PendingReceiver<media::mojom::AudioSourceFetcher> fetcher_receiver,
     mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> client,
     media::mojom::SpeechRecognitionOptionsPtr options,
     BindRecognizerCallback callback) {
   if (!options->is_server_based) {
     base::FilePath binary_path;
     base::flat_map<std::string, base::FilePath> config_paths;
     PopulateFilePaths(binary_path, config_paths);

     // The options should have locale set, but if they don't, pull from the
     // prefs.
     std::string language_name;
     if (options->language) {
       language_name = options->language.value();
     } else {
       PrefService* profile_prefs = user_prefs::UserPrefs::Get(context());
       language_name = prefs::GetLiveCaptionLanguageCode(profile_prefs);
     }
     // `mask_offensive_words` is always true for
     // `RecognizerClientType::kSchoolTools`.
     // TODO(crbug.com/40924425): Implement offensive word mask on ChromeOS for
     // live caption as well so that mask_offensive_words is set according to the
     // settings for `RecognizerClientType::kLiveCaption`.
     const bool mask_offensive_words =
         options->recognizer_client_type ==
         media::mojom::RecognizerClientType::kSchoolTools;
     // CrosSpeechRecognitionService runs on browser UI thread.
     // Create AudioSourceFetcher on browser IO thread to avoid UI jank.
     // Note that its CrosSpeechRecognitionRecognizer must also run
     // on the IO thread. If CrosSpeechRecognitionService is moved away from
     // browser UI thread, we can call AudioSourceFetcherImpl::Create directly.
     //
     // The paths and the language name are not needed after this point, so
     // they are moved into the task instead of being copied.
     // NOTE(review): the WeakPtr is obtained here but the bound method runs on
     // the IO thread; confirm this cross-thread use is intended.
     content::GetIOThreadTaskRunner({})->PostTask(
         FROM_HERE,
         base::BindOnce(
             &CrosSpeechRecognitionService::
                 CreateAudioSourceFetcherForOnDeviceRecognitionOnIOThread,
             weak_factory_.GetWeakPtr(), std::move(fetcher_receiver),
             std::move(client), std::move(options), std::move(binary_path),
             std::move(config_paths), std::move(language_name),
             mask_offensive_words));
     std::move(callback).Run(
         CrosSpeechRecognitionRecognizerImpl::IsMultichannelSupported());
     return;
   }
 #if BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)
   if (!ash::features::IsInternalServerSideSpeechRecognitionEnabled()) {
     // A request is made for a service that has not been enabled.
     mojo::ReportBadMessage(kInvalidSpeechRecogntionOptions);
     return;
   }
   content::GetIOThreadTaskRunner({})->PostTask(
       FROM_HERE,
       base::BindOnce(
           &CrosSpeechRecognitionService::
               CreateAudioSourceFetcherForServerBasedRecognitionOnIOThread,
           weak_factory_.GetWeakPtr(), std::move(fetcher_receiver),
           std::move(client), std::move(options),
           context()
               ->GetDefaultStoragePartition()
               ->GetURLLoaderFactoryForBrowserProcessIOThread()));
   std::move(callback).Run(/*is_multichannel_supported=*/false);
   return;
 #else
   // Server-based recognition is not compiled into this build.
   mojo::ReportBadMessage(kInvalidSpeechRecogntionOptions);
 #endif  // BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)
 }

 // Test hook: replaces `cros_speech_recognition_recognizer_cb_`, the factory
 // callback this service runs to create on-device recognizers.
 void CrosSpeechRecognitionService::
     SetCreateCrosSpeechRecognitionRecognizerCbForTesting(
         CreateCrosSpeechRecognitionRecognizerCb callback) {
   cros_speech_recognition_recognizer_cb_ = std::move(callback);
 }

 // Creates the AudioSourceFetcher for an on-device request. Must run on the
 // browser IO thread and `options` must not ask for server-based recognition
 // (both are DCHECKed). The recognizer comes from
 // `cros_speech_recognition_recognizer_cb_`, which consumes `client` and
 // `options`.
 void CrosSpeechRecognitionService::
     CreateAudioSourceFetcherForOnDeviceRecognitionOnIOThread(
         mojo::PendingReceiver<media::mojom::AudioSourceFetcher>
             fetcher_receiver,
         mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
             client,
         media::mojom::SpeechRecognitionOptionsPtr options,
         const base::FilePath& binary_path,
         const base::flat_map<std::string, base::FilePath>& config_paths,
         const std::string& primary_language_name,
         const bool mask_offensive_words) {
   DCHECK_CURRENTLY_ON(content::BrowserThread::IO);
   DCHECK(!options->is_server_based);

   // Build the recognizer first so the fetcher call below stays readable.
   auto recognizer = cros_speech_recognition_recognizer_cb_.Run(
       std::move(client), std::move(options), binary_path, config_paths,
       primary_language_name, mask_offensive_words);

   AudioSourceFetcherImpl::Create(
       std::move(fetcher_receiver), std::move(recognizer),
       CrosSpeechRecognitionRecognizerImpl::IsMultichannelSupported(),
       /*is_server_based=*/false);
 }

 // Creates the AudioSourceFetcher for a server-based request. The body is
 // compiled only when ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER is set;
 // otherwise this function does nothing. Must run on the browser IO thread
 // and `options` must ask for server-based recognition (both are DCHECKed).
 void CrosSpeechRecognitionService::
     CreateAudioSourceFetcherForServerBasedRecognitionOnIOThread(
         mojo::PendingReceiver<media::mojom::AudioSourceFetcher>
             fetcher_receiver,
         mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
             client,
         media::mojom::SpeechRecognitionOptionsPtr options,
         std::unique_ptr<network::PendingSharedURLLoaderFactory>
             pending_loader_factory) {
 #if BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)
   DCHECK_CURRENTLY_ON(content::BrowserThread::IO);
   DCHECK(options->is_server_based);

   // Turn the pending factory handed over from the calling thread into a
   // usable SharedURLLoaderFactory before building the recognizer.
   auto url_loader_factory = network::SharedURLLoaderFactory::Create(
       std::move(pending_loader_factory));
   auto recognizer = std::make_unique<ServerBasedRecognitionRecognizer>(
       std::move(client), std::move(options), std::move(url_loader_factory));

   AudioSourceFetcherImpl::Create(std::move(fetcher_receiver),
                                  std::move(recognizer),
                                  /*is_multichannel_supported=*/false,
                                  /*is_server_based=*/true);
 #endif  // BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)
 }

 }  // namespace speech
	// Copyright 2021 The Chromium Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "chrome/browser/speech/cros_speech_recognition_service.h"

	#include <memory>
	#include <string>
	#include <utility>

	#include "ash/constants/ash_features.h"
	#include "base/containers/flat_map.h"
	#include "base/files/file_path.h"
	#include "base/functional/bind.h"
	#include "base/notimplemented.h"
	#include "base/types/optional_util.h"
	#include "chrome/services/speech/audio_source_fetcher_impl.h"
	#include "chrome/services/speech/buildflags/buildflags.h"
	#include "chrome/services/speech/cros_speech_recognition_recognizer_impl.h"
	#include "components/live_caption/pref_names.h"
	#include "components/soda/constants.h"
	#include "components/soda/soda_installer.h"
	#include "components/user_prefs/user_prefs.h"
	#include "content/public/browser/browser_context.h"
	#include "content/public/browser/browser_task_traits.h"
	#include "content/public/browser/browser_thread.h"
	#include "content/public/browser/storage_partition.h"
	#include "media/base/media_switches.h"
	#include "media/mojo/mojom/speech_recognition.mojom-shared.h"
	#include "media/mojo/mojom/speech_recognition.mojom.h"
	#include "media/mojo/mojom/speech_recognition_service.mojom.h"
	#include "services/network/public/cpp/shared_url_loader_factory.h"

	#if BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)
	#include "chrome/services/speech/internal/server_based_recognition_recognizer.h"
	#endif // BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)

	namespace speech {

	namespace {

	// Message handed to mojo::ReportBadMessage() when this service rejects the
	// SpeechRecognitionOptions of a bind request.
	// NOTE(review): the identifier is misspelled ("Recogntion"); renaming it
	// means updating every use in this file at the same time.
	constexpr char kInvalidSpeechRecogntionOptions[] =
	"Invalid SpeechRecognitionOptions provided";

	// Fills the two out-parameters from the SodaInstaller singleton:
	//   - `binary_path` receives the installer's SODA binary path.
	//   - `config_paths` gains (or overwrites) one entry per installed language,
	//     keyed by the language name and mapped to that language's path.
	void PopulateFilePaths(
	    base::FilePath& binary_path,
	    base::flat_map<std::string, base::FilePath>& config_paths) {
	  speech::SodaInstaller* const installer =
	      speech::SodaInstaller::GetInstance();
	  binary_path = installer->GetSodaBinaryPath();

	  for (const auto& code : installer->InstalledLanguages()) {
	    // The name serves both as the lookup argument and as the map key.
	    const std::string name = GetLanguageName(code);
	    config_paths[name] = installer->GetLanguagePath(name);
	  }
	}

	// Default recognizer factory: forwards every argument unchanged to the
	// CrosSpeechRecognitionRecognizerImpl constructor and returns the new
	// instance. `client` and `options` are consumed.
	std::unique_ptr<CrosSpeechRecognitionRecognizerImpl>
	CreateCrosSpeechRecognitionRecognizer(
	    mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> client,
	    media::mojom::SpeechRecognitionOptionsPtr options,
	    const base::FilePath& binary_path,
	    const base::flat_map<std::string, base::FilePath>& config_paths,
	    const std::string& primary_language_name,
	    const bool mask_offensive_words) {
	  auto recognizer = std::make_unique<CrosSpeechRecognitionRecognizerImpl>(
	      std::move(client), std::move(options), binary_path, config_paths,
	      primary_language_name, mask_offensive_words);
	  return recognizer;
	}

	} // namespace

	// Forwards `context` to the ChromeSpeechRecognitionService base and installs
	// CreateCrosSpeechRecognitionRecognizer() as the recognizer factory.
	CrosSpeechRecognitionService::CrosSpeechRecognitionService(
	    content::BrowserContext* context)
	    : ChromeSpeechRecognitionService(context) {
	  auto default_factory =
	      base::BindRepeating(&CreateCrosSpeechRecognitionRecognizer);
	  cros_speech_recognition_recognizer_cb_ = std::move(default_factory);
	}

	// Defaulted: cleanup is left to the base class and member destructors.
	CrosSpeechRecognitionService::~CrosSpeechRecognitionService() = default;

	// Adds `receiver` to `speech_recognition_contexts_`, paired with this service
	// (presumably a mojo receiver set declared in the header - confirm there).
	void CrosSpeechRecognitionService::BindSpeechRecognitionContext(
	mojo::PendingReceiver<media::mojom::SpeechRecognitionContext> receiver) {
	speech_recognition_contexts_.Add(this, std::move(receiver));
	}

	// Adds `receiver` to `audio_source_speech_recognition_contexts_`, paired with
	// this service (presumably a mojo receiver set declared in the header -
	// confirm there).
	void CrosSpeechRecognitionService::BindAudioSourceSpeechRecognitionContext(
	mojo::PendingReceiver<media::mojom::AudioSourceSpeechRecognitionContext>
	receiver) {
	audio_source_speech_recognition_contexts_.Add(this, std::move(receiver));
	}

	// Binds an on-device recognizer for Live Caption.
	//
	// A request that asks for server-based recognition, or whose client type is
	// not kLiveCaption, is reported as a bad message and `callback` is not run.
	// Otherwise CrosSpeechRecognitionRecognizerImpl::Create() is called for
	// `receiver`/`client` and `callback` is run with the result of
	// CrosSpeechRecognitionRecognizerImpl::IsMultichannelSupported().
	void CrosSpeechRecognitionService::BindRecognizer(
	    mojo::PendingReceiver<media::mojom::SpeechRecognitionRecognizer> receiver,
	    mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> client,
	    media::mojom::SpeechRecognitionOptionsPtr options,
	    BindRecognizerCallback callback) {
	  // This binding is used by LiveCaption and it can't be server based
	  // recognition.
	  if (options->is_server_based ||
	      options->recognizer_client_type !=
	          media::mojom::RecognizerClientType::kLiveCaption) {
	    mojo::ReportBadMessage(kInvalidSpeechRecogntionOptions);
	    return;
	  }

	  base::FilePath binary_path;
	  base::flat_map<std::string, base::FilePath> config_paths;
	  // The options should have locale set, but if they don't, pull from the prefs.
	  std::string language_name;
	  if (options->language) {
	    language_name = options->language.value();
	  } else {
	    PrefService* profile_prefs = user_prefs::UserPrefs::Get(context());
	    language_name = prefs::GetLiveCaptionLanguageCode(profile_prefs);
	  }

	  PopulateFilePaths(binary_path, config_paths);

	  // TODO(crbug.com/40924425): Implement offensive word mask on ChromeOS so that
	  // mask_offensive_words is not hard-coded.
	  CrosSpeechRecognitionRecognizerImpl::Create(
	      std::move(receiver), std::move(client), std::move(options), binary_path,
	      config_paths, language_name, /*mask_offensive_words=*/false);
	  std::move(callback).Run(
	      CrosSpeechRecognitionRecognizerImpl::IsMultichannelSupported());
	}

	// Not implemented for this service: none of the arguments are used and the
	// body only invokes NOTIMPLEMENTED().
	void CrosSpeechRecognitionService::BindWebSpeechRecognizer(
	mojo::PendingReceiver<media::mojom::SpeechRecognitionSession>
	session_receiver,
	mojo::PendingRemote<media::mojom::SpeechRecognitionSessionClient>
	session_client,
	mojo::PendingReceiver<media::mojom::SpeechRecognitionAudioForwarder>
	audio_forwarder,
	int channel_count,
	int sample_rate,
	media::mojom::SpeechRecognitionOptionsPtr options,
	bool continuous) {
	NOTIMPLEMENTED();
	}

	// Binds an AudioSourceFetcher backed by either an on-device recognizer or,
	// when `options->is_server_based` is set, a server-based one.
	//
	// On-device: the SODA paths and the language are resolved on the calling
	// thread, creation of the fetcher is posted to the browser IO thread, and
	// `callback` is run with
	// CrosSpeechRecognitionRecognizerImpl::IsMultichannelSupported().
	//
	// Server based: only served when ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER
	// is compiled in and the ash feature is enabled; `callback` is then run with
	// false. In every other case the request is reported as a bad message and
	// `callback` is not run.
	void CrosSpeechRecognitionService::BindAudioSourceFetcher(
	    mojo::PendingReceiver<media::mojom::AudioSourceFetcher> fetcher_receiver,
	    mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> client,
	    media::mojom::SpeechRecognitionOptionsPtr options,
	    BindRecognizerCallback callback) {
	  if (!options->is_server_based) {
	    base::FilePath binary_path;
	    base::flat_map<std::string, base::FilePath> config_paths;
	    PopulateFilePaths(binary_path, config_paths);

	    // The options should have locale set, but if they don't, pull from the
	    // prefs.
	    std::string language_name;
	    if (options->language) {
	      language_name = options->language.value();
	    } else {
	      PrefService* profile_prefs = user_prefs::UserPrefs::Get(context());
	      language_name = prefs::GetLiveCaptionLanguageCode(profile_prefs);
	    }
	    // `mask_offensive_words` is always true for
	    // `RecognizerClientType::kSchoolTools`.
	    // TODO(crbug.com/40924425): Implement offensive word mask on ChromeOS for
	    // live caption as well so that mask_offensive_words is set according to the
	    // settings for `RecognizerClientType::kLiveCaption`.
	    const bool mask_offensive_words =
	        options->recognizer_client_type ==
	        media::mojom::RecognizerClientType::kSchoolTools;
	    // CrosSpeechRecognitionService runs on browser UI thread.
	    // Create AudioSourceFetcher on browser IO thread to avoid UI jank.
	    // Note that its CrosSpeechRecognitionRecognizer must also run
	    // on the IO thread. If CrosSpeechRecognitionService is moved away from
	    // browser UI thread, we can call AudioSourceFetcherImpl::Create directly.
	    //
	    // The paths and the language name are not needed after this point, so
	    // they are moved into the task instead of being copied.
	    // NOTE(review): the WeakPtr is obtained here but the bound method runs on
	    // the IO thread; confirm this cross-thread use is intended.
	    content::GetIOThreadTaskRunner({})->PostTask(
	        FROM_HERE,
	        base::BindOnce(
	            &CrosSpeechRecognitionService::
	                CreateAudioSourceFetcherForOnDeviceRecognitionOnIOThread,
	            weak_factory_.GetWeakPtr(), std::move(fetcher_receiver),
	            std::move(client), std::move(options), std::move(binary_path),
	            std::move(config_paths), std::move(language_name),
	            mask_offensive_words));
	    std::move(callback).Run(
	        CrosSpeechRecognitionRecognizerImpl::IsMultichannelSupported());
	    return;
	  }
	#if BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)
	  if (!ash::features::IsInternalServerSideSpeechRecognitionEnabled()) {
	    // A request is made for a service that has not been enabled.
	    mojo::ReportBadMessage(kInvalidSpeechRecogntionOptions);
	    return;
	  }
	  content::GetIOThreadTaskRunner({})->PostTask(
	      FROM_HERE,
	      base::BindOnce(
	          &CrosSpeechRecognitionService::
	              CreateAudioSourceFetcherForServerBasedRecognitionOnIOThread,
	          weak_factory_.GetWeakPtr(), std::move(fetcher_receiver),
	          std::move(client), std::move(options),
	          context()
	              ->GetDefaultStoragePartition()
	              ->GetURLLoaderFactoryForBrowserProcessIOThread()));
	  std::move(callback).Run(/*is_multichannel_supported=*/false);
	  return;
	#else
	  // Server-based recognition is not compiled into this build.
	  mojo::ReportBadMessage(kInvalidSpeechRecogntionOptions);
	#endif // BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)
	}

	// Test hook: replaces `cros_speech_recognition_recognizer_cb_`, the factory
	// callback this service runs to create on-device recognizers.
	void CrosSpeechRecognitionService::
	SetCreateCrosSpeechRecognitionRecognizerCbForTesting(
	CreateCrosSpeechRecognitionRecognizerCb callback) {
	cros_speech_recognition_recognizer_cb_ = std::move(callback);
	}

	// Creates the AudioSourceFetcher for an on-device request. Must run on the
	// browser IO thread and `options` must not ask for server-based recognition
	// (both are DCHECKed). The recognizer comes from
	// `cros_speech_recognition_recognizer_cb_`, which consumes `client` and
	// `options`.
	void CrosSpeechRecognitionService::
	    CreateAudioSourceFetcherForOnDeviceRecognitionOnIOThread(
	        mojo::PendingReceiver<media::mojom::AudioSourceFetcher>
	            fetcher_receiver,
	        mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
	            client,
	        media::mojom::SpeechRecognitionOptionsPtr options,
	        const base::FilePath& binary_path,
	        const base::flat_map<std::string, base::FilePath>& config_paths,
	        const std::string& primary_language_name,
	        const bool mask_offensive_words) {
	  DCHECK_CURRENTLY_ON(content::BrowserThread::IO);
	  DCHECK(!options->is_server_based);
	  AudioSourceFetcherImpl::Create(
	      std::move(fetcher_receiver),
	      cros_speech_recognition_recognizer_cb_.Run(
	          std::move(client), std::move(options), binary_path, config_paths,
	          primary_language_name, mask_offensive_words),
	      CrosSpeechRecognitionRecognizerImpl::IsMultichannelSupported(),
	      /*is_server_based=*/false);
	}

	// Creates the AudioSourceFetcher for a server-based request. The body is
	// compiled only when ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER is set;
	// otherwise this function does nothing. Must run on the browser IO thread
	// and `options` must ask for server-based recognition (both are DCHECKed).
	void CrosSpeechRecognitionService::
	    CreateAudioSourceFetcherForServerBasedRecognitionOnIOThread(
	        mojo::PendingReceiver<media::mojom::AudioSourceFetcher>
	            fetcher_receiver,
	        mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
	            client,
	        media::mojom::SpeechRecognitionOptionsPtr options,
	        std::unique_ptr<network::PendingSharedURLLoaderFactory>
	            pending_loader_factory) {
	#if BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)
	  DCHECK_CURRENTLY_ON(content::BrowserThread::IO);
	  DCHECK(options->is_server_based);
	  AudioSourceFetcherImpl::Create(
	      std::move(fetcher_receiver),
	      std::make_unique<ServerBasedRecognitionRecognizer>(
	          std::move(client), std::move(options),
	          network::SharedURLLoaderFactory::Create(
	              std::move(pending_loader_factory))),
	      /*is_multichannel_supported=*/false, /*is_server_based=*/true);
	#endif // BUILDFLAG(ENABLE_SERVER_BASED_RECOGNITION_RECOGNIZER)
	}

	} // namespace speech