|  | // Copyright 2020 The Chromium Authors | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | #include <algorithm> | 
|  |  | 
|  | #include "base/files/file_path.h" | 
|  | #include "base/files/file_util.h" | 
|  | #include "base/files/scoped_temp_dir.h" | 
|  | #include "base/notreached.h" | 
|  | #include "base/path_service.h" | 
|  | #include "base/sync_socket.h" | 
|  | #include "base/test/metrics/histogram_tester.h" | 
|  | #include "base/test/scoped_feature_list.h" | 
|  | #include "base/threading/thread_restrictions.h" | 
|  | #include "base/timer/timer.h" | 
|  | #include "build/build_config.h" | 
|  | #include "chrome/browser/browser_process.h" | 
|  | #include "chrome/browser/profiles/profile.h" | 
|  | #include "chrome/browser/profiles/profile_manager.h" | 
|  | #include "chrome/browser/speech/chrome_speech_recognition_service.h" | 
|  | #include "chrome/browser/ui/browser.h" | 
|  | #include "chrome/common/pref_names.h" | 
|  | #include "chrome/services/speech/soda/soda_test_paths.h" | 
|  | #include "chrome/services/speech/speech_recognition_recognizer_impl.h" | 
|  | #include "chrome/test/base/in_process_browser_test.h" | 
|  | #include "components/metrics/content/subprocess_metrics_provider.h" | 
|  | #include "components/prefs/pref_service.h" | 
|  | #include "components/soda/constants.h" | 
|  | #include "components/soda/pref_names.h" | 
|  | #include "content/public/browser/audio_service.h" | 
|  | #include "content/public/common/content_switches.h" | 
|  | #include "content/public/test/browser_test.h" | 
|  | #include "media/audio/audio_device_description.h" | 
|  | #include "media/audio/wav_audio_handler.h" | 
|  | #include "media/base/audio_bus.h" | 
|  | #include "media/base/media_switches.h" | 
|  | #include "media/mojo/mojom/audio_data.mojom.h" | 
|  | #include "media/mojo/mojom/audio_data_pipe.mojom.h" | 
|  | #include "media/mojo/mojom/audio_input_stream.mojom.h" | 
|  | #include "media/mojo/mojom/audio_stream_factory.mojom.h" | 
|  | #include "media/mojo/mojom/media_types.mojom.h" | 
|  | #include "media/mojo/mojom/speech_recognition_service.mojom.h" | 
|  | #include "sandbox/policy/switches.h" | 
|  | #include "services/audio/public/cpp/fake_stream_factory.h" | 
|  | #include "testing/gmock/include/gmock/gmock.h" | 
|  | #include "third_party/abseil-cpp/absl/utility/utility.h" | 
|  |  | 
|  | #if BUILDFLAG(IS_WIN) | 
|  | #include <windows.h> | 
|  | #else | 
|  | #include <unistd.h> | 
|  | #endif | 
|  |  | 
|  | using testing::StrictMock; | 
|  |  | 
|  | namespace speech { | 
|  |  | 
// Channel count of the test WAV audio; the recognizer under test reports that
// multichannel input is unsupported, so all test clips are mono.
constexpr int kExpectedChannelCount = 1;

// Size of the dummy shared-memory region TestStreamFactory hands back from
// CreateInputStream(). TODO: This should be derived from the audio parameters
// rather than hard-coded; the current value is brittle.
const size_t kShMemSize = 82240;
|  |  | 
|  | class MockStream : public media::mojom::AudioInputStream { | 
|  | public: | 
|  | MOCK_METHOD0(Record, void()); | 
|  | MOCK_METHOD1(SetVolume, void(double)); | 
|  | }; | 
|  |  | 
// Fake audio stream factory handed to AudioSourceFetcher. It captures the
// device id and audio parameters of the last CreateInputStream() call and
// returns a synthetic shared-memory region and sync socket so the fetcher can
// proceed without a real audio service.
class TestStreamFactory : public audio::FakeStreamFactory {
public:
TestStreamFactory() : stream_(), stream_receiver_(&stream_) {}
~TestStreamFactory() override = default;
// audio::FakeStreamFactory:
void CreateInputStream(
mojo::PendingReceiver<media::mojom::AudioInputStream> stream_receiver,
mojo::PendingRemote<media::mojom::AudioInputStreamClient> client,
mojo::PendingRemote<media::mojom::AudioInputStreamObserver> observer,
mojo::PendingRemote<media::mojom::AudioLog> log,
const std::string& device_id,
const media::AudioParameters& params,
uint32_t shared_memory_count,
bool enable_agc,
base::ReadOnlySharedMemoryRegion key_press_count_buffer,
media::mojom::AudioProcessingConfigPtr processing_config,
CreateInputStreamCallback created_callback) override {
// Remember what the caller asked for so tests can assert on it later.
device_id_ = device_id;
params_ = params;
// Drop any binding from a previous call before rebinding.
if (stream_receiver_.is_bound())
stream_receiver_.reset();
stream_receiver_.Bind(std::move(stream_receiver));
if (client_)
client_.reset();
// Keep the passed client alive to avoid binding errors.
client_.Bind(std::move(client));
// Reply with one end of a fresh socket pair plus a dummy shared-memory
// region of kShMemSize bytes, mimicking a successful stream creation.
base::SyncSocket socket1, socket2;
base::SyncSocket::CreatePair(&socket1, &socket2);
std::move(created_callback)
.Run({absl::in_place,
base::ReadOnlySharedMemoryRegion::Create(kShMemSize).region,
mojo::PlatformHandle(socket1.Take())},
false /*initially muted*/, base::UnguessableToken::Create());
}

// Binds the factory receiver and returns the remote end for the code under
// test to use.
mojo::PendingRemote<media::mojom::AudioStreamFactory> MakeRemote() {
return receiver_.BindNewPipeAndPassRemote();
}

// Spins a RunLoop, polling every 10ms, until CreateInputStream() has bound
// `stream_receiver_`. NOTE(review): `runner_` is a one-shot RunLoop, so a
// given factory instance only supports a single wait.
void WaitToCreateInputStream() {
if (stream_receiver_.is_bound())
return;
base::RepeatingTimer check_timer;
check_timer.Start(FROM_HERE, base::Milliseconds(10), this,
&TestStreamFactory::OnTimer);
runner_.Run();
}

// Mock stream that tests set Record() expectations on.
StrictMock<MockStream> stream_;
mojo::Remote<media::mojom::AudioInputStreamClient> client_;
mojo::Receiver<media::mojom::AudioInputStream> stream_receiver_;
// Arguments captured from the most recent CreateInputStream() call.
std::string device_id_;
absl::optional<media::AudioParameters> params_;

private:
// Quits the wait loop once the input stream has been created.
void OnTimer() {
if (stream_receiver_.is_bound())
runner_.Quit();
}

base::RunLoop runner_;
};
|  |  | 
// Browser-test fixture that exercises the Chrome speech recognition service
// end to end: it launches the real service, points it at the SODA test binary
// and language pack via prefs, and acts as the recognizer client that
// collects transcription results.
class SpeechRecognitionServiceTest
: public InProcessBrowserTest,
public media::mojom::SpeechRecognitionRecognizerClient {
public:
SpeechRecognitionServiceTest() {
// The caption-driven recognition path requires Live Caption to be enabled.
scoped_feature_list_.InitWithFeatures({media::kLiveCaption}, {});
}

SpeechRecognitionServiceTest(const SpeechRecognitionServiceTest&) = delete;
SpeechRecognitionServiceTest& operator=(const SpeechRecognitionServiceTest&) =
delete;

~SpeechRecognitionServiceTest() override = default;

// InProcessBrowserTest
void SetUp() override;

// media::mojom::SpeechRecognitionRecognizerClient
// Records each transcription into `recognition_results_` and replies with
// `is_client_requesting_speech_recognition_`.
void OnSpeechRecognitionRecognitionEvent(
const media::SpeechRecognitionResult& result,
OnSpeechRecognitionRecognitionEventCallback reply) override;
// The remaining client events are never expected by these tests; each
// implementation hits NOTREACHED().
void OnSpeechRecognitionStopped() override;
void OnSpeechRecognitionError() override;
void OnLanguageIdentificationEvent(
media::mojom::LanguageIdentificationEventPtr event) override;

// Disable the sandbox on Windows and MacOS as the sandboxes on those
// platforms have not been configured yet.
#if BUILDFLAG(IS_WIN) || BUILDFLAG(IS_MAC)
void SetUpCommandLine(base::CommandLine* command_line) override {
// Required for the utility process to access the directory containing the
// test files.
command_line->AppendSwitch(sandbox::policy::switches::kNoSandbox);
}
#endif

protected:
// Simulates the user closing the caption bubble: the next recognition event
// replies `success = false`, telling the service recognition is unwanted.
void CloseCaptionBubble() {
is_client_requesting_speech_recognition_ = false;
}
// Points local-state prefs at the SODA test binary and language pack.
void SetUpPrefs();
// Starts the service and binds a caption-mode recognizer.
void LaunchService();
// Starts the service and binds an IME-mode audio source fetcher.
void LaunchServiceWithAudioSourceFetcher();
// Streams `audio_data` to the recognizer in chunks of at most kMaxChunkSize
// frames, sleeping between chunks to simulate real-time capture.
void SendAudioChunk(const std::vector<int16_t>& audio_data,
media::WavAudioHandler* handler,
size_t kMaxChunkSize);

// The root directory for test files.
base::FilePath test_data_dir_;

base::test::ScopedFeatureList scoped_feature_list_;
// Mojo endpoints into the speech recognition service.
mojo::Remote<media::mojom::AudioSourceSpeechRecognitionContext>
audio_source_speech_recognition_context_;
mojo::Remote<media::mojom::SpeechRecognitionContext>
speech_recognition_context_;

mojo::Remote<media::mojom::SpeechRecognitionRecognizer>
speech_recognition_recognizer_;

mojo::Remote<media::mojom::AudioSourceFetcher> audio_source_fetcher_;

mojo::Receiver<media::mojom::SpeechRecognitionRecognizerClient>
speech_recognition_client_receiver_{this};

// Comma-stripped transcriptions, in the order they were received.
std::vector<std::string> recognition_results_;

// Reply value sent from OnSpeechRecognitionRecognitionEvent(); cleared by
// CloseCaptionBubble().
bool is_client_requesting_speech_recognition_ = true;
};
|  |  | 
|  | void SpeechRecognitionServiceTest::SetUp() { | 
|  | ASSERT_TRUE(base::PathService::Get(base::DIR_SOURCE_ROOT, &test_data_dir_)); | 
|  | InProcessBrowserTest::SetUp(); | 
|  | } | 
|  |  | 
|  | void SpeechRecognitionServiceTest::OnSpeechRecognitionRecognitionEvent( | 
|  | const media::SpeechRecognitionResult& result, | 
|  | OnSpeechRecognitionRecognitionEventCallback reply) { | 
|  | std::string transcription = result.transcription; | 
|  | // The language pack used by the MacOS builder is newer and has punctuation | 
|  | // enabled whereas the one used by the Linux builder does not. | 
|  | transcription.erase( | 
|  | std::remove(transcription.begin(), transcription.end(), ','), | 
|  | transcription.end()); | 
|  | recognition_results_.push_back(std::move(transcription)); | 
|  | std::move(reply).Run(is_client_requesting_speech_recognition_); | 
|  | } | 
|  |  | 
// None of the events below are expected during these tests, which only drive
// recognition by streaming audio chunks; hitting any of them is a failure.
void SpeechRecognitionServiceTest::OnSpeechRecognitionStopped() {
NOTREACHED();
}

void SpeechRecognitionServiceTest::OnSpeechRecognitionError() {
NOTREACHED();
}

void SpeechRecognitionServiceTest::OnLanguageIdentificationEvent(
media::mojom::LanguageIdentificationEventPtr event) {
NOTREACHED();
}
|  |  | 
// Points browser-process local state at the SODA test binary and the en-US
// language pack under the test data directory so the service loads them
// instead of a production installation.
void SpeechRecognitionServiceTest::SetUpPrefs() {
base::FilePath soda_binary_path;
#if BUILDFLAG(IS_WIN) || BUILDFLAG(IS_MAC)
soda_binary_path =
test_data_dir_.Append(base::FilePath(soda::kSodaResourcePath))
.Append(soda::kSodaTestBinaryRelativePath);
#else
// On other platforms, first verify the checked-in test binary exists, then
// use GetSodaTestBinaryPath() for the path actually handed to the service.
base::FilePath soda_test_binary_path =
test_data_dir_.Append(base::FilePath(soda::kSodaResourcePath))
.Append(soda::kSodaTestBinaryRelativePath);
DVLOG(0) << "SODA test path: " << soda_test_binary_path.value().c_str();
// Path existence checks do blocking disk I/O, which tests must opt into.
base::ScopedAllowBlockingForTesting allow_blocking;
ASSERT_TRUE(base::PathExists(soda_test_binary_path));

soda_binary_path = GetSodaTestBinaryPath();
DVLOG(0) << "SODA binary path: " << soda_binary_path.value().c_str();
ASSERT_TRUE(base::PathExists(soda_binary_path));
#endif
g_browser_process->local_state()->SetFilePath(prefs::kSodaBinaryPath,
soda_binary_path);
g_browser_process->local_state()->SetFilePath(
prefs::kSodaEnUsConfigPath,
test_data_dir_.Append(base::FilePath(soda::kSodaResourcePath))
.Append(soda::kSodaLanguagePackRelativePath));
}
|  |  | 
// Starts the speech recognition service and binds a Live Caption recognizer,
// blocking until the service reports its multichannel capability.
void SpeechRecognitionServiceTest::LaunchService() {
// Launch the Speech Recognition service.
auto* browser_context =
static_cast<content::BrowserContext*>(browser()->profile());
// NOTE(review): the service is allocated with raw `new` and never deleted —
// presumably leaked deliberately so the mojo pipes stay valid for the rest
// of the test; confirm this is intended.
auto* service = new ChromeSpeechRecognitionService(browser_context);

service->BindSpeechRecognitionContext(
speech_recognition_context_.BindNewPipeAndPassReceiver());

bool is_multichannel_supported = true;
auto run_loop = std::make_unique<base::RunLoop>();
// Bind the recognizer pipes used to send audio and receive results.
speech_recognition_context_->BindRecognizer(
speech_recognition_recognizer_.BindNewPipeAndPassReceiver(),
speech_recognition_client_receiver_.BindNewPipeAndPassRemote(),
media::mojom::SpeechRecognitionOptions::New(
media::mojom::SpeechRecognitionMode::kCaption,
/*enable_formatting=*/true, kUsEnglishLocale,
/*is_server_based=*/false,
media::mojom::RecognizerClientType::kLiveCaption),
base::BindOnce(
[](bool* p_is_multichannel_supported, base::RunLoop* run_loop,
bool is_multichannel_supported) {
*p_is_multichannel_supported = is_multichannel_supported;
run_loop->Quit();
},
&is_multichannel_supported, run_loop.get()));

// Wait for the BindRecognizer callback to run.
run_loop->Run();
// The recognizer is expected to report that multichannel is unsupported.
ASSERT_FALSE(is_multichannel_supported);
}
|  |  | 
// Starts the speech recognition service and binds an IME-mode audio source
// fetcher, blocking until the service reports its multichannel capability.
void SpeechRecognitionServiceTest::LaunchServiceWithAudioSourceFetcher() {
// Launch the Speech Recognition service.
auto* browser_context =
static_cast<content::BrowserContext*>(browser()->profile());
// NOTE(review): as in LaunchService(), the service is raw-`new`ed and never
// deleted — presumably an intentional test-lifetime leak; confirm.
auto* service = new ChromeSpeechRecognitionService(browser_context);

service->BindAudioSourceSpeechRecognitionContext(
audio_source_speech_recognition_context_.BindNewPipeAndPassReceiver());

bool is_multichannel_supported = true;
auto run_loop = std::make_unique<base::RunLoop>();
// Bind the recognizer pipes used to send audio and receive results.
audio_source_speech_recognition_context_->BindAudioSourceFetcher(
audio_source_fetcher_.BindNewPipeAndPassReceiver(),
speech_recognition_client_receiver_.BindNewPipeAndPassRemote(),
media::mojom::SpeechRecognitionOptions::New(
media::mojom::SpeechRecognitionMode::kIme,
/*enable_formatting=*/false, kUsEnglishLocale),
base::BindOnce(
[](bool* p_is_multichannel_supported, base::RunLoop* run_loop,
bool is_multichannel_supported) {
*p_is_multichannel_supported = is_multichannel_supported;
run_loop->Quit();
},
&is_multichannel_supported, run_loop.get()));
run_loop->Run();
// The fetcher is expected to report that multichannel is unsupported.
ASSERT_FALSE(is_multichannel_supported);
}
|  |  | 
|  | void SpeechRecognitionServiceTest::SendAudioChunk( | 
|  | const std::vector<int16_t>& audio_data, | 
|  | media::WavAudioHandler* handler, | 
|  | size_t kMaxChunkSize) { | 
|  | int chunk_start = 0; | 
|  | // Upload chunks of 1024 frames at a time. | 
|  | while (chunk_start < static_cast<int>(audio_data.size())) { | 
|  | int chunk_size = kMaxChunkSize < audio_data.size() - chunk_start | 
|  | ? kMaxChunkSize | 
|  | : audio_data.size() - chunk_start; | 
|  |  | 
|  | auto signed_buffer = media::mojom::AudioDataS16::New(); | 
|  | signed_buffer->channel_count = kExpectedChannelCount; | 
|  | signed_buffer->frame_count = chunk_size; | 
|  | signed_buffer->sample_rate = handler->sample_rate(); | 
|  | for (int i = 0; i < chunk_size; i++) { | 
|  | signed_buffer->data.push_back(audio_data[chunk_start + i]); | 
|  | } | 
|  |  | 
|  | speech_recognition_recognizer_->SendAudioToSpeechRecognitionService( | 
|  | std::move(signed_buffer)); | 
|  | chunk_start += chunk_size; | 
|  |  | 
|  | // Sleep for 20ms to simulate real-time audio. SODA requires audio | 
|  | // streaming in order to return events. | 
|  | #if BUILDFLAG(IS_WIN) | 
|  | ::Sleep(20); | 
|  | #else | 
|  | usleep(20000); | 
|  | #endif | 
|  | } | 
|  | } | 
|  |  | 
// Streams a test WAV clip through the recognizer twice and verifies the final
// transcription ("Hey Google Hey Google") plus the caption-bubble histograms.
IN_PROC_BROWSER_TEST_F(SpeechRecognitionServiceTest, RecognizePhrase) {
base::HistogramTester histograms;
SetUpPrefs();
LaunchService();

// Read the test audio file off disk (blocking I/O needs explicit opt-in).
std::string buffer;
auto audio_file =
test_data_dir_.Append(base::FilePath(soda::kSodaResourcePath))
.Append(base::FilePath(soda::kSodaTestAudioRelativePath));
{
base::ScopedAllowBlockingForTesting allow_blocking;
ASSERT_TRUE(base::PathExists(audio_file));
ASSERT_TRUE(base::ReadFileToString(audio_file, &buffer));
}

auto handler = media::WavAudioHandler::Create(buffer);
ASSERT_TRUE(handler.get());
ASSERT_EQ(handler->num_channels(), kExpectedChannelCount);

// Decode the WAV into an AudioBus, then interleave to 16-bit samples — the
// format SendAudioChunk() forwards to the service.
auto bus =
media::AudioBus::Create(kExpectedChannelCount, handler->total_frames());

size_t bytes_written = 0u;
ASSERT_TRUE(handler->CopyTo(bus.get(), 0, &bytes_written));

std::vector<int16_t> audio_data(bus->frames());
bus->ToInterleaved<media::SignedInt16SampleTypeTraits>(bus->frames(),
audio_data.data());

// Replay the clip twice so the final transcription contains the phrase
// twice.
constexpr size_t kMaxChunkSize = 1024;
constexpr int kReplayAudioCount = 2;
for (int i = 0; i < kReplayAudioCount; i++) {
SendAudioChunk(audio_data, handler.get(), kMaxChunkSize);
}

// Drop the recognizer pipe and drain pending tasks.
speech_recognition_recognizer_.reset();
base::RunLoop().RunUntilIdle();

// Sleep for 100ms to ensure SODA has returned real-time results.
#if BUILDFLAG(IS_WIN)
::Sleep(100);
#else
usleep(100000);
#endif
ASSERT_GT(static_cast<int>(recognition_results_.size()), kReplayAudioCount);
ASSERT_EQ(recognition_results_.back(), "Hey Google Hey Google");

// The caption bubble stayed "visible" throughout, so only the visible-time
// histogram should have a sample. NOTE(review): 2520ms presumably matches
// the total streamed audio duration — confirm against the test clip.
metrics::SubprocessMetricsProvider::MergeHistogramDeltasForTesting();
histograms.ExpectUniqueTimeSample(
SpeechRecognitionRecognizerImpl::kCaptionBubbleVisibleHistogramName,
base::Milliseconds(2520), 1);
histograms.ExpectTotalCount(
SpeechRecognitionRecognizerImpl::kCaptionBubbleHiddenHistogramName, 0);
}
|  |  | 
// Verifies that when the client stops requesting recognition (simulating the
// caption bubble being closed), subsequent audio is no longer transcribed,
// and that the visible/hidden caption-bubble histograms are recorded.
IN_PROC_BROWSER_TEST_F(SpeechRecognitionServiceTest,
ClosingCaptionBubbleStopsRecognition) {
base::HistogramTester histograms;
SetUpPrefs();
LaunchService();

// Read and decode the test WAV clip, as in RecognizePhrase.
std::string buffer;
auto audio_file =
test_data_dir_.Append(base::FilePath(soda::kSodaResourcePath))
.Append(base::FilePath(soda::kSodaTestAudioRelativePath));
{
base::ScopedAllowBlockingForTesting allow_blocking;
ASSERT_TRUE(base::PathExists(audio_file));
ASSERT_TRUE(base::ReadFileToString(audio_file, &buffer));
}

auto handler = media::WavAudioHandler::Create(buffer);
ASSERT_TRUE(handler.get());
ASSERT_EQ(handler->num_channels(), kExpectedChannelCount);

auto bus =
media::AudioBus::Create(kExpectedChannelCount, handler->total_frames());

size_t bytes_written = 0u;
ASSERT_TRUE(handler->CopyTo(bus.get(), 0, &bytes_written));

std::vector<int16_t> audio_data(bus->frames());
bus->ToInterleaved<media::SignedInt16SampleTypeTraits>(bus->frames(),
audio_data.data());
constexpr size_t kMaxChunkSize = 1024;

// Send an audio chunk to the service. It will output "Hey Google". When the
// client receives the result, it responds to the service with `success =
// true`, informing the speech recognition service that it still wants
// transcriptions.
SendAudioChunk(audio_data, handler.get(), kMaxChunkSize);
base::RunLoop().RunUntilIdle();

// Close caption bubble. This means that the next time the client receives a
// transcription, it will respond to the speech service with `success =
// false`, informing the speech recognition service that it is no longer
// requesting speech recognition.
CloseCaptionBubble();

// Send an audio chunk to the service. It will output "Hey Google". When the
// client receives the result, it responds to the service with `success =
// false`, informing the speech recognition service that it is no longer
// requesting speech recognition.
SendAudioChunk(audio_data, handler.get(), kMaxChunkSize);
base::RunLoop().RunUntilIdle();

// Send an audio chunk to the service. It does not get transcribed.
SendAudioChunk(audio_data, handler.get(), kMaxChunkSize);

// Drop the recognizer pipe and drain pending tasks.
speech_recognition_recognizer_.reset();
base::RunLoop().RunUntilIdle();

// Sleep for 100ms to ensure SODA has returned real-time results.
#if BUILDFLAG(IS_WIN)
::Sleep(100);
#else
usleep(100000);
#endif
ASSERT_GT(static_cast<int>(recognition_results_.size()), 3);
ASSERT_EQ(recognition_results_.back(), "Hey Google Hey Google");

// NOTE(review): 2520ms visible / 1260ms hidden presumably correspond to two
// clips transcribed vs. one clip after the bubble closed — confirm against
// the clip duration.
metrics::SubprocessMetricsProvider::MergeHistogramDeltasForTesting();
histograms.ExpectUniqueTimeSample(
SpeechRecognitionRecognizerImpl::kCaptionBubbleVisibleHistogramName,
base::Milliseconds(2520), 1);
histograms.ExpectUniqueTimeSample(
SpeechRecognitionRecognizerImpl::kCaptionBubbleHiddenHistogramName,
base::Milliseconds(1260), 1);
}
|  |  | 
|  | IN_PROC_BROWSER_TEST_F(SpeechRecognitionServiceTest, CreateAudioSourceFetcher) { | 
|  | base::HistogramTester histograms; | 
|  | SetUpPrefs(); | 
|  | LaunchServiceWithAudioSourceFetcher(); | 
|  |  | 
|  | // TODO(crbug.com/1185978): Check implementation / sandbox policy on Mac and | 
|  | // Windows. | 
|  | #if BUILDFLAG(IS_CHROMEOS) || BUILDFLAG(IS_LINUX) | 
|  | // Check that Start begins audio recording. | 
|  | // TODO(crbug.com/1173135): Try to mock audio input, maybe with | 
|  | // TestStreamFactory::stream_, to test end-to-end. | 
|  | std::string device_id = media::AudioDeviceDescription::kDefaultDeviceId; | 
|  | media::AudioParameters params(media::AudioParameters::AUDIO_PCM_LOW_LATENCY, | 
|  | media::ChannelLayoutConfig::Stereo(), 10000, | 
|  | 1000); | 
|  |  | 
|  | // Create a fake stream factory. | 
|  | std::unique_ptr<StrictMock<TestStreamFactory>> stream_factory = | 
|  | std::make_unique<StrictMock<TestStreamFactory>>(); | 
|  | EXPECT_CALL(stream_factory->stream_, Record()); | 
|  | audio_source_fetcher_->Start(stream_factory->MakeRemote(), device_id, params); | 
|  | stream_factory->WaitToCreateInputStream(); | 
|  |  | 
|  | EXPECT_EQ(device_id, stream_factory->device_id_); | 
|  | ASSERT_TRUE(stream_factory->params_); | 
|  | EXPECT_TRUE(params.Equals(stream_factory->params_.value())); | 
|  | #endif | 
|  |  | 
|  | audio_source_fetcher_->Stop(); | 
|  | base::RunLoop().RunUntilIdle(); | 
|  | } | 
|  |  | 
// Verifies that recognizer calls followed by an immediate disconnect — the
// pattern a compromised renderer could produce — do not crash the browser.
IN_PROC_BROWSER_TEST_F(SpeechRecognitionServiceTest, CompromisedRenderer) {
// Create temporary SODA files.
SetUpPrefs();
base::ScopedAllowBlockingForTesting allow_blocking;
// Build a plausible language-pack directory containing an empty config file
// and point the en-US config pref at it.
base::FilePath config_dir =
GetSodaLanguagePacksDirectory()
.AppendASCII(kUsEnglishLocale)
.Append(FILE_PATH_LITERAL("1.1.1"))
.Append(kSodaLanguagePackDirectoryRelativePath);
base::CreateDirectory(config_dir);
ASSERT_TRUE(base::PathExists(config_dir));
base::FilePath config_file_path =
config_dir.Append(FILE_PATH_LITERAL("config_file"));
// Write a zero-byte config file (0 bytes requested, 0 written).
ASSERT_EQ(base::WriteFile(config_file_path, nullptr, 0), 0);
ASSERT_TRUE(base::PathExists(config_file_path));
g_browser_process->local_state()->SetFilePath(prefs::kSodaEnUsConfigPath,
config_file_path);

// Launch the Speech Recognition service.
auto* browser_context =
static_cast<content::BrowserContext*>(browser()->profile());
auto* service = new ChromeSpeechRecognitionService(browser_context);
service->BindSpeechRecognitionContext(
speech_recognition_context_.BindNewPipeAndPassReceiver());

// Bind the recognizer pipes used to send audio and receive results.
auto run_loop = std::make_unique<base::RunLoop>();
speech_recognition_context_->BindRecognizer(
speech_recognition_recognizer_.BindNewPipeAndPassReceiver(),
speech_recognition_client_receiver_.BindNewPipeAndPassRemote(),
media::mojom::SpeechRecognitionOptions::New(
media::mojom::SpeechRecognitionMode::kCaption,
/*enable_formatting=*/true, kUsEnglishLocale,
/*is_server_based=*/false,
media::mojom::RecognizerClientType::kLiveCaption),
base::BindOnce([](base::RunLoop* run_loop,
bool is_multichannel_supported) { run_loop->Quit(); },
run_loop.get()));
run_loop->Run();

// Simulate a compromised renderer by changing the language and immediately
// resetting the recognizer and verify that the subsequent callbacks do not
// cause any crashes.
speech_recognition_recognizer_->OnLanguageChanged(kUsEnglishLocale);
speech_recognition_recognizer_.reset();
base::RunLoop().RunUntilIdle();
}
|  |  | 
|  | }  // namespace speech |