chrome/browser/speech/network_speech_recognizer.cc - chromium/src - Git at Google

 // Copyright 2014 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "chrome/browser/speech/network_speech_recognizer.h"

 #include <stddef.h>
 #include <stdint.h>

 #include <algorithm>
 #include <string>

 #include "base/functional/bind.h"
 #include "chrome/browser/speech/speech_recognizer_delegate.h"
 #include "content/public/browser/browser_task_traits.h"
 #include "content/public/browser/browser_thread.h"
 #include "content/public/browser/child_process_host.h"
 #include "content/public/browser/render_process_host.h"
 #include "content/public/browser/speech_recognition_event_listener.h"
 #include "content/public/browser/speech_recognition_manager.h"
 #include "content/public/browser/speech_recognition_session_config.h"
 #include "content/public/browser/speech_recognition_session_preamble.h"
 #include "media/mojo/mojom/speech_recognition_error.mojom.h"
 #include "services/network/public/cpp/shared_url_loader_factory.h"

 // Sentinel session id meaning "no speech recognition session is active".
 static constexpr int kInvalidSessionId = -1;

 // Speech recognizer listener. This is separate from SpeechRecognizer because
 // the speech recognition engine must function from the IO thread. Because of
 // this, the lifecycle of this class must be decoupled from the lifecycle of
 // SpeechRecognizer. To avoid circular references, this class has no reference
 // to SpeechRecognizer. Instead, it has a reference to the
 // SpeechRecognizerDelegate via a weak pointer that is only ever referenced from
 // the UI thread.
 //
 // Instances are thread-safe ref-counted and use the DeleteOnIOThread trait.
 // The object is constructed on the UI thread, while StartOnIOThread() and
 // StopOnIOThread() DCHECK that they run on the IO thread.
 class NetworkSpeechRecognizer::EventListener
     : public base::RefCountedThreadSafe<
           NetworkSpeechRecognizer::EventListener,
           content::BrowserThread::DeleteOnIOThread>,
       public content::SpeechRecognitionEventListener {
  public:
   // Must be called on the UI thread (DCHECKed). Stores the arguments and posts
   // a SPEECH_RECOGNIZER_READY state change to |delegate|.
   EventListener(const base::WeakPtr<SpeechRecognizerDelegate>& delegate,
                 std::unique_ptr<network::PendingSharedURLLoaderFactory>
                     pending_shared_url_loader_factory,
                 const std::string& locale);

   EventListener(const EventListener&) = delete;
   EventListener& operator=(const EventListener&) = delete;

   // Stops any session that is still active, then creates and starts a new
   // continuous, interim-results session for |locale_|. |auth_scope|,
   // |auth_token| and |preamble| are copied into the session config unchanged.
   void StartOnIOThread(
       const std::string& auth_scope,
       const std::string& auth_token,
       const scoped_refptr<content::SpeechRecognitionSessionPreamble>& preamble);
   // No-op when no session is active. Otherwise stops audio capture, aborts the
   // session and invalidates the weak pointers handed out by |weak_factory_|.
   void StopOnIOThread();

  private:
   // The deletion helpers need access to the private destructor.
   friend struct content::BrowserThread::DeleteOnThread<
       content::BrowserThread::IO>;
   friend class base::DeleteHelper<NetworkSpeechRecognizer::EventListener>;
   // Aborts the session if one is still active.
   ~EventListener() override;

   // Posts SpeechRecognizerDelegate::OnSpeechRecognitionStateChanged(new_state)
   // to the UI thread task runner, bound to |delegate_|.
   void NotifyRecognitionStateChanged(SpeechRecognizerStatus new_state);

   // Overridden from content::SpeechRecognitionEventListener:
   // These are always called on the IO thread.
   void OnRecognitionStart(int session_id) override;
   void OnRecognitionEnd(int session_id) override;
   void OnRecognitionResults(
       int session_id,
       const std::vector<media::mojom::WebSpeechRecognitionResultPtr>& results)
       override;
   void OnRecognitionError(
       int session_id,
       const media::mojom::SpeechRecognitionError& error) override;
   void OnSoundStart(int session_id) override;
   void OnSoundEnd(int session_id) override;
   void OnAudioLevelsChange(int session_id,
                            float volume,
                            float noise_volume) override;
   void OnAudioStart(int session_id) override;
   void OnAudioEnd(int session_id) override;

   // Only dereferenced from the UI thread, but copied on IO thread.
   base::WeakPtr<SpeechRecognizerDelegate> delegate_;

   // All remaining members only accessed from the IO thread.
   std::unique_ptr<network::PendingSharedURLLoaderFactory>
       pending_shared_url_loader_factory_;
   // Initialized from |pending_shared_url_loader_factory_| on first use.
   scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory_;
   // Copied into the session config's |language| field.
   std::string locale_;
   // Id returned by SpeechRecognitionManager::CreateSession(), or
   // kInvalidSessionId while no session is active.
   int session_;
   // Concatenated utterance text from the latest OnRecognitionResults() call.
   std::u16string last_result_str_;

   // Source of the weak pointer registered as the session's event listener;
   // invalidated by StopOnIOThread().
   base::WeakPtrFactory<EventListener> weak_factory_{this};
 };

 NetworkSpeechRecognizer::EventListener::EventListener(
     const base::WeakPtr<SpeechRecognizerDelegate>& delegate,
     std::unique_ptr<network::PendingSharedURLLoaderFactory>
         pending_shared_url_loader_factory,
     const std::string& locale)
     : delegate_(delegate),
       pending_shared_url_loader_factory_(
           std::move(pending_shared_url_loader_factory)),
       locale_(locale),
       session_(kInvalidSessionId) {
   DCHECK_CURRENTLY_ON(content::BrowserThread::UI);
   NotifyRecognitionStateChanged(SPEECH_RECOGNIZER_READY);
 }

 // Drops the delegate and aborts the recognition session if one is still
 // active.
 NetworkSpeechRecognizer::EventListener::~EventListener() {
   // Release the delegate first so that nothing below can call back into it.
   delegate_.reset();
   if (session_ == kInvalidSessionId)
     return;

   // Clear the stored id before handing it to the manager for aborting.
   const int live_session = session_;
   session_ = kInvalidSessionId;
   content::SpeechRecognitionManager::GetInstance()->AbortSession(live_session);
 }

 void NetworkSpeechRecognizer::EventListener::StartOnIOThread(
     const std::string& auth_scope,
     const std::string& auth_token,
     const scoped_refptr<content::SpeechRecognitionSessionPreamble>& preamble) {
   DCHECK_CURRENTLY_ON(content::BrowserThread::IO);
   if (session_ != kInvalidSessionId)
     StopOnIOThread();

   // Don't filter profanities. NetworkSpeechRecognizer is currently used by
   // Dictation which does not want to filter user input. If this needs to be
   // changed for other clients in the future, whether to filter should be passed
   // as a parameter to the speech recognizer instead of changed here.
   bool filter_profanities = false;
   content::SpeechRecognitionSessionConfig config;
   config.language = locale_;
   config.continuous = true;
   config.interim_results = true;
   config.max_hypotheses = 1;
   config.filter_profanities = filter_profanities;
   if (!shared_url_loader_factory_) {
     DCHECK(pending_shared_url_loader_factory_);
     shared_url_loader_factory_ = network::SharedURLLoaderFactory::Create(
         std::move(pending_shared_url_loader_factory_));
   }
   config.shared_url_loader_factory = shared_url_loader_factory_;
   config.event_listener = weak_factory_.GetWeakPtr();
   // kInvalidUniqueID is not a valid render process, so the speech permission
   // check allows the request through.
   config.initial_context.render_process_id =
       content::ChildProcessHost::kInvalidUniqueID;
   config.auth_scope = auth_scope;
   config.auth_token = auth_token;
   config.preamble = preamble;

   auto* speech_instance = content::SpeechRecognitionManager::GetInstance();
   session_ = speech_instance->CreateSession(config);
   speech_instance->StartSession(session_);
 }

 // Ends the active recognition session, if any. Must run on the IO thread.
 void NetworkSpeechRecognizer::EventListener::StopOnIOThread() {
   DCHECK_CURRENTLY_ON(content::BrowserThread::IO);
   if (session_ != kInvalidSessionId) {
     // Forget the id before calling into the manager so that a re-entrant call
     // to this method finds no session and returns immediately.
     const int ending_session = session_;
     session_ = kInvalidSessionId;

     auto* manager = content::SpeechRecognitionManager::GetInstance();
     manager->StopAudioCaptureForSession(ending_session);
     // The id is no longer stored anywhere, so the session is aborted here.
     manager->AbortSession(ending_session);
     weak_factory_.InvalidateWeakPtrs();
   }
 }

 // Reports |new_state| to the delegate by posting
 // OnSpeechRecognitionStateChanged to the UI thread task runner.
 void NetworkSpeechRecognizer::EventListener::NotifyRecognitionStateChanged(
     SpeechRecognizerStatus new_state) {
   auto notify_delegate = base::BindOnce(
       &SpeechRecognizerDelegate::OnSpeechRecognitionStateChanged, delegate_,
       new_state);
   content::GetUIThreadTaskRunner({})->PostTask(FROM_HERE,
                                                std::move(notify_delegate));
 }

 // Recognition has started: report the RECOGNIZING state to the delegate.
 // |session_id| is not used.
 void NetworkSpeechRecognizer::EventListener::OnRecognitionStart(
     int session_id) {
   NotifyRecognitionStateChanged(SPEECH_RECOGNIZER_RECOGNIZING);
 }

 // Recognition has ended: release the session, then report the READY state to
 // the delegate. |session_id| is not used.
 void NetworkSpeechRecognizer::EventListener::OnRecognitionEnd(int session_id) {
   // Clean up first so the state change is posted after the session is gone.
   StopOnIOThread();
   NotifyRecognitionStateChanged(SPEECH_RECOGNIZER_READY);
 }

 // Concatenates the first hypothesis of every entry in |results| and posts the
 // text to the delegate's OnSpeechResult on the UI thread, together with a flag
 // that is true when no entry is provisional. |session_id| is not used.
 void NetworkSpeechRecognizer::EventListener::OnRecognitionResults(
     int session_id,
     const std::vector<media::mojom::WebSpeechRecognitionResultPtr>& results) {
   std::u16string result_str;
   // The number of results with |is_provisional| false. If |final_count| ==
   // results.size(), then all results are non-provisional and the recognition is
   // complete.
   size_t final_count = 0;
   for (const auto& result : results) {
     if (!result->is_provisional)
       ++final_count;
     // A result without hypotheses contributes no text. Indexing element 0 of
     // an empty vector would be undefined behavior, so skip it.
     if (!result->hypotheses.empty())
       result_str += result->hypotheses[0]->utterance;
   }
   const bool is_final = final_count == results.size();

   // Keep a copy before the string is moved into the posted callback.
   last_result_str_ = result_str;

   // media::mojom::WebSpeechRecognitionResult doesn't have word offsets.
   content::GetUIThreadTaskRunner({})->PostTask(
       FROM_HERE,
       base::BindOnce(&SpeechRecognizerDelegate::OnSpeechResult, delegate_,
                      std::move(result_str), is_final,
                      /* full_result = */ std::nullopt));
 }

 // Ends the session on any error. Only network errors are surfaced to the
 // delegate as SPEECH_RECOGNIZER_ERROR; in every case the delegate is then told
 // the recognizer is READY again. |session_id| is not used.
 void NetworkSpeechRecognizer::EventListener::OnRecognitionError(
     int session_id,
     const media::mojom::SpeechRecognitionError& error) {
   StopOnIOThread();

   const bool is_network_error =
       error.code == media::mojom::SpeechRecognitionErrorCode::kNetwork;
   if (is_network_error)
     NotifyRecognitionStateChanged(SPEECH_RECOGNIZER_ERROR);

   NotifyRecognitionStateChanged(SPEECH_RECOGNIZER_READY);
 }

 // Sound was detected: report the IN_SPEECH state to the delegate.
 // |session_id| is not used.
 void NetworkSpeechRecognizer::EventListener::OnSoundStart(int session_id) {
   NotifyRecognitionStateChanged(SPEECH_RECOGNIZER_IN_SPEECH);
 }

 // Sound has ended: end the session, then report the RECOGNIZING state to the
 // delegate. |session_id| is not used.
 void NetworkSpeechRecognizer::EventListener::OnSoundEnd(int session_id) {
   // The session is released before the state change is posted.
   StopOnIOThread();
   NotifyRecognitionStateChanged(SPEECH_RECOGNIZER_RECOGNIZING);
 }

 // Converts the audio level reported by the engine into a 16-bit sound level
 // and posts it to the delegate's OnSpeechSoundLevelChanged on the UI thread.
 // The level is |volume| minus |noise_volume|, clamped to [0.0, 1.0] and scaled
 // to [0, INT16_MAX]. |session_id| is not used.
 void NetworkSpeechRecognizer::EventListener::OnAudioLevelsChange(
     int session_id,
     float volume,
     float noise_volume) {
   // Both |volume| and |noise_volume| are defined to be in the range [0.0, 1.0].
   // See: content/public/browser/speech_recognition_event_listener.h
   DCHECK_LE(0.0, volume);
   DCHECK_GE(1.0, volume);
   DCHECK_LE(0.0, noise_volume);
   DCHECK_GE(1.0, noise_volume);

   // Clamp on both sides. The upper bound matters wherever the DCHECKs above
   // are not enforced: a level above 1.0 would scale past INT16_MAX, and
   // converting such a float to int16_t is undefined behavior.
   const float signal_level =
       std::min(1.0f, std::max(0.0f, volume - noise_volume));
   const int16_t sound_level = static_cast<int16_t>(INT16_MAX * signal_level);
   content::GetUIThreadTaskRunner({})->PostTask(
       FROM_HERE,
       base::BindOnce(&SpeechRecognizerDelegate::OnSpeechSoundLevelChanged,
                      delegate_, sound_level));
 }

 void NetworkSpeechRecognizer::EventListener::OnAudioStart(int session_id) {
   // Intentionally empty: nothing is reported to the delegate for this event.
 }

 void NetworkSpeechRecognizer::EventListener::OnAudioEnd(int session_id) {
   // Intentionally empty: nothing is reported to the delegate for this event.
 }

 // Creates the recognizer on the UI thread together with the EventListener that
 // drives the recognition session. |delegate| is given both to the base class
 // and to the listener.
 NetworkSpeechRecognizer::NetworkSpeechRecognizer(
     const base::WeakPtr<SpeechRecognizerDelegate>& delegate,
     std::unique_ptr<network::PendingSharedURLLoaderFactory>
         pending_shared_url_loader_factory,
     const std::string& locale)
     : SpeechRecognizer(delegate),
       speech_event_listener_(new EventListener(
           delegate,
           std::move(pending_shared_url_loader_factory),
           locale)) {
   DCHECK_CURRENTLY_ON(content::BrowserThread::UI);
 }

 // Destroyed on the UI thread; requests that any running session be stopped.
 NetworkSpeechRecognizer::~NetworkSpeechRecognizer() {
   DCHECK_CURRENTLY_ON(content::BrowserThread::UI);
   // The delegate is cleared first so that stopping cannot lead to further
   // callbacks through this object.
   delegate().reset();
   Stop();
 }

 void NetworkSpeechRecognizer::Start() {
   DCHECK_CURRENTLY_ON(content::BrowserThread::UI);
   content::GetIOThreadTaskRunner({})->PostTask(
       FROM_HERE,
       base::BindOnce(&NetworkSpeechRecognizer::EventListener::StartOnIOThread,
                      speech_event_listener_, std::string() /* auth scope*/,
                      std::string() /* auth_token */, /* preamble */ nullptr));
 }

 void NetworkSpeechRecognizer::Stop() {
   DCHECK_CURRENTLY_ON(content::BrowserThread::UI);
   content::GetIOThreadTaskRunner({})->PostTask(
       FROM_HERE,
       base::BindOnce(&NetworkSpeechRecognizer::EventListener::StopOnIOThread,
                      speech_event_listener_));
 }
	// Copyright 2014 The Chromium Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "chrome/browser/speech/network_speech_recognizer.h"

	#include <stddef.h>
	#include <stdint.h>

	#include <algorithm>
	#include <string>

	#include "base/functional/bind.h"
	#include "chrome/browser/speech/speech_recognizer_delegate.h"
	#include "content/public/browser/browser_task_traits.h"
	#include "content/public/browser/browser_thread.h"
	#include "content/public/browser/child_process_host.h"
	#include "content/public/browser/render_process_host.h"
	#include "content/public/browser/speech_recognition_event_listener.h"
	#include "content/public/browser/speech_recognition_manager.h"
	#include "content/public/browser/speech_recognition_session_config.h"
	#include "content/public/browser/speech_recognition_session_preamble.h"
	#include "media/mojo/mojom/speech_recognition_error.mojom.h"
	#include "services/network/public/cpp/shared_url_loader_factory.h"

	// Sentinel session id meaning "no speech recognition session is active".
	static constexpr int kInvalidSessionId = -1;

	// Speech recognizer listener. This is separate from SpeechRecognizer because
	// the speech recognition engine must function from the IO thread. Because of
	// this, the lifecycle of this class must be decoupled from the lifecycle of
	// SpeechRecognizer. To avoid circular references, this class has no reference
	// to SpeechRecognizer. Instead, it has a reference to the
	// SpeechRecognizerDelegate via a weak pointer that is only ever referenced from
	// the UI thread.
	//
	// Instances are thread-safe ref-counted and use the DeleteOnIOThread trait.
	// The object is constructed on the UI thread, while StartOnIOThread() and
	// StopOnIOThread() DCHECK that they run on the IO thread.
	class NetworkSpeechRecognizer::EventListener
	: public base::RefCountedThreadSafe<
	NetworkSpeechRecognizer::EventListener,
	content::BrowserThread::DeleteOnIOThread>,
	public content::SpeechRecognitionEventListener {
	public:
	// Must be called on the UI thread (DCHECKed). Stores the arguments and posts
	// a SPEECH_RECOGNIZER_READY state change to |delegate|.
	EventListener(const base::WeakPtr<SpeechRecognizerDelegate>& delegate,
	std::unique_ptr<network::PendingSharedURLLoaderFactory>
	pending_shared_url_loader_factory,
	const std::string& locale);

	EventListener(const EventListener&) = delete;
	EventListener& operator=(const EventListener&) = delete;

	// Stops any session that is still active, then creates and starts a new
	// continuous, interim-results session for |locale_|. |auth_scope|,
	// |auth_token| and |preamble| are copied into the session config unchanged.
	void StartOnIOThread(
	const std::string& auth_scope,
	const std::string& auth_token,
	const scoped_refptr<content::SpeechRecognitionSessionPreamble>& preamble);
	// No-op when no session is active. Otherwise stops audio capture, aborts the
	// session and invalidates the weak pointers handed out by |weak_factory_|.
	void StopOnIOThread();

	private:
	// The deletion helpers need access to the private destructor.
	friend struct content::BrowserThread::DeleteOnThread<
	content::BrowserThread::IO>;
	friend class base::DeleteHelper<NetworkSpeechRecognizer::EventListener>;
	// Aborts the session if one is still active.
	~EventListener() override;

	// Posts SpeechRecognizerDelegate::OnSpeechRecognitionStateChanged(new_state)
	// to the UI thread task runner, bound to |delegate_|.
	void NotifyRecognitionStateChanged(SpeechRecognizerStatus new_state);

	// Overridden from content::SpeechRecognitionEventListener:
	// These are always called on the IO thread.
	void OnRecognitionStart(int session_id) override;
	void OnRecognitionEnd(int session_id) override;
	void OnRecognitionResults(
	int session_id,
	const std::vector<media::mojom::WebSpeechRecognitionResultPtr>& results)
	override;
	void OnRecognitionError(
	int session_id,
	const media::mojom::SpeechRecognitionError& error) override;
	void OnSoundStart(int session_id) override;
	void OnSoundEnd(int session_id) override;
	void OnAudioLevelsChange(int session_id,
	float volume,
	float noise_volume) override;
	void OnAudioStart(int session_id) override;
	void OnAudioEnd(int session_id) override;

	// Only dereferenced from the UI thread, but copied on IO thread.
	base::WeakPtr<SpeechRecognizerDelegate> delegate_;

	// All remaining members only accessed from the IO thread.
	std::unique_ptr<network::PendingSharedURLLoaderFactory>
	pending_shared_url_loader_factory_;
	// Initialized from |pending_shared_url_loader_factory_| on first use.
	scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory_;
	// Copied into the session config's |language| field.
	std::string locale_;
	// Id returned by SpeechRecognitionManager::CreateSession(), or
	// kInvalidSessionId while no session is active.
	int session_;
	// Concatenated utterance text from the latest OnRecognitionResults() call.
	std::u16string last_result_str_;

	// Source of the weak pointer registered as the session's event listener;
	// invalidated by StopOnIOThread().
	base::WeakPtrFactory<EventListener> weak_factory_{this};
	};

	NetworkSpeechRecognizer::EventListener::EventListener(
	const base::WeakPtr<SpeechRecognizerDelegate>& delegate,
	std::unique_ptr<network::PendingSharedURLLoaderFactory>
	pending_shared_url_loader_factory,
	const std::string& locale)
	: delegate_(delegate),
	pending_shared_url_loader_factory_(
	std::move(pending_shared_url_loader_factory)),
	locale_(locale),
	session_(kInvalidSessionId) {
	DCHECK_CURRENTLY_ON(content::BrowserThread::UI);
	NotifyRecognitionStateChanged(SPEECH_RECOGNIZER_READY);
	}

	// Drops the delegate and aborts the recognition session if one is still
	// active.
	NetworkSpeechRecognizer::EventListener::~EventListener() {
	  // Release the delegate first so that nothing below can call back into it.
	  delegate_.reset();
	  if (session_ == kInvalidSessionId)
	    return;

	  // Clear the stored id before handing it to the manager for aborting.
	  const int live_session = session_;
	  session_ = kInvalidSessionId;
	  content::SpeechRecognitionManager::GetInstance()->AbortSession(live_session);
	}

	void NetworkSpeechRecognizer::EventListener::StartOnIOThread(
	const std::string& auth_scope,
	const std::string& auth_token,
	const scoped_refptr<content::SpeechRecognitionSessionPreamble>& preamble) {
	DCHECK_CURRENTLY_ON(content::BrowserThread::IO);
	if (session_ != kInvalidSessionId)
	StopOnIOThread();

	// Don't filter profanities. NetworkSpeechRecognizer is currently used by
	// Dictation which does not want to filter user input. If this needs to be
	// changed for other clients in the future, whether to filter should be passed
	// as a parameter to the speech recognizer instead of changed here.
	bool filter_profanities = false;
	content::SpeechRecognitionSessionConfig config;
	config.language = locale_;
	config.continuous = true;
	config.interim_results = true;
	config.max_hypotheses = 1;
	config.filter_profanities = filter_profanities;
	if (!shared_url_loader_factory_) {
	DCHECK(pending_shared_url_loader_factory_);
	shared_url_loader_factory_ = network::SharedURLLoaderFactory::Create(
	std::move(pending_shared_url_loader_factory_));
	}
	config.shared_url_loader_factory = shared_url_loader_factory_;
	config.event_listener = weak_factory_.GetWeakPtr();
	// kInvalidUniqueID is not a valid render process, so the speech permission
	// check allows the request through.
	config.initial_context.render_process_id =
	content::ChildProcessHost::kInvalidUniqueID;
	config.auth_scope = auth_scope;
	config.auth_token = auth_token;
	config.preamble = preamble;

	auto* speech_instance = content::SpeechRecognitionManager::GetInstance();
	session_ = speech_instance->CreateSession(config);
	speech_instance->StartSession(session_);
	}

	// Ends the active recognition session, if any. Must run on the IO thread.
	void NetworkSpeechRecognizer::EventListener::StopOnIOThread() {
	  DCHECK_CURRENTLY_ON(content::BrowserThread::IO);
	  if (session_ != kInvalidSessionId) {
	    // Forget the id before calling into the manager so that a re-entrant call
	    // to this method finds no session and returns immediately.
	    const int ending_session = session_;
	    session_ = kInvalidSessionId;

	    auto* manager = content::SpeechRecognitionManager::GetInstance();
	    manager->StopAudioCaptureForSession(ending_session);
	    // The id is no longer stored anywhere, so the session is aborted here.
	    manager->AbortSession(ending_session);
	    weak_factory_.InvalidateWeakPtrs();
	  }
	}

	// Reports |new_state| to the delegate by posting
	// OnSpeechRecognitionStateChanged to the UI thread task runner.
	void NetworkSpeechRecognizer::EventListener::NotifyRecognitionStateChanged(
	    SpeechRecognizerStatus new_state) {
	  auto notify_delegate = base::BindOnce(
	      &SpeechRecognizerDelegate::OnSpeechRecognitionStateChanged, delegate_,
	      new_state);
	  content::GetUIThreadTaskRunner({})->PostTask(FROM_HERE,
	                                               std::move(notify_delegate));
	}

	// Recognition has started: report the RECOGNIZING state to the delegate.
	// |session_id| is not used.
	void NetworkSpeechRecognizer::EventListener::OnRecognitionStart(
	    int session_id) {
	  NotifyRecognitionStateChanged(SPEECH_RECOGNIZER_RECOGNIZING);
	}

	// Recognition has ended: release the session, then report the READY state to
	// the delegate. |session_id| is not used.
	void NetworkSpeechRecognizer::EventListener::OnRecognitionEnd(int session_id) {
	  // Clean up first so the state change is posted after the session is gone.
	  StopOnIOThread();
	  NotifyRecognitionStateChanged(SPEECH_RECOGNIZER_READY);
	}

	// Concatenates the first hypothesis of every entry in |results| and posts the
	// text to the delegate's OnSpeechResult on the UI thread, together with a flag
	// that is true when no entry is provisional. |session_id| is not used.
	void NetworkSpeechRecognizer::EventListener::OnRecognitionResults(
	    int session_id,
	    const std::vector<media::mojom::WebSpeechRecognitionResultPtr>& results) {
	  std::u16string result_str;
	  // The number of results with |is_provisional| false. If |final_count| ==
	  // results.size(), then all results are non-provisional and the recognition is
	  // complete.
	  size_t final_count = 0;
	  for (const auto& result : results) {
	    if (!result->is_provisional)
	      ++final_count;
	    // A result without hypotheses contributes no text. Indexing element 0 of
	    // an empty vector would be undefined behavior, so skip it.
	    if (!result->hypotheses.empty())
	      result_str += result->hypotheses[0]->utterance;
	  }
	  const bool is_final = final_count == results.size();

	  // Keep a copy before the string is moved into the posted callback.
	  last_result_str_ = result_str;

	  // media::mojom::WebSpeechRecognitionResult doesn't have word offsets.
	  content::GetUIThreadTaskRunner({})->PostTask(
	      FROM_HERE,
	      base::BindOnce(&SpeechRecognizerDelegate::OnSpeechResult, delegate_,
	                     std::move(result_str), is_final,
	                     /* full_result = */ std::nullopt));
	}

	// Ends the session on any error. Only network errors are surfaced to the
	// delegate as SPEECH_RECOGNIZER_ERROR; in every case the delegate is then told
	// the recognizer is READY again. |session_id| is not used.
	void NetworkSpeechRecognizer::EventListener::OnRecognitionError(
	    int session_id,
	    const media::mojom::SpeechRecognitionError& error) {
	  StopOnIOThread();

	  const bool is_network_error =
	      error.code == media::mojom::SpeechRecognitionErrorCode::kNetwork;
	  if (is_network_error)
	    NotifyRecognitionStateChanged(SPEECH_RECOGNIZER_ERROR);

	  NotifyRecognitionStateChanged(SPEECH_RECOGNIZER_READY);
	}

	// Sound was detected: report the IN_SPEECH state to the delegate.
	// |session_id| is not used.
	void NetworkSpeechRecognizer::EventListener::OnSoundStart(int session_id) {
	  NotifyRecognitionStateChanged(SPEECH_RECOGNIZER_IN_SPEECH);
	}

	// Sound has ended: end the session, then report the RECOGNIZING state to the
	// delegate. |session_id| is not used.
	void NetworkSpeechRecognizer::EventListener::OnSoundEnd(int session_id) {
	  // The session is released before the state change is posted.
	  StopOnIOThread();
	  NotifyRecognitionStateChanged(SPEECH_RECOGNIZER_RECOGNIZING);
	}

	// Converts the audio level reported by the engine into a 16-bit sound level
	// and posts it to the delegate's OnSpeechSoundLevelChanged on the UI thread.
	// The level is |volume| minus |noise_volume|, clamped to [0.0, 1.0] and scaled
	// to [0, INT16_MAX]. |session_id| is not used.
	void NetworkSpeechRecognizer::EventListener::OnAudioLevelsChange(
	    int session_id,
	    float volume,
	    float noise_volume) {
	  // Both |volume| and |noise_volume| are defined to be in the range [0.0, 1.0].
	  // See: content/public/browser/speech_recognition_event_listener.h
	  DCHECK_LE(0.0, volume);
	  DCHECK_GE(1.0, volume);
	  DCHECK_LE(0.0, noise_volume);
	  DCHECK_GE(1.0, noise_volume);

	  // Clamp on both sides. The upper bound matters wherever the DCHECKs above
	  // are not enforced: a level above 1.0 would scale past INT16_MAX, and
	  // converting such a float to int16_t is undefined behavior.
	  const float signal_level =
	      std::min(1.0f, std::max(0.0f, volume - noise_volume));
	  const int16_t sound_level = static_cast<int16_t>(INT16_MAX * signal_level);
	  content::GetUIThreadTaskRunner({})->PostTask(
	      FROM_HERE,
	      base::BindOnce(&SpeechRecognizerDelegate::OnSpeechSoundLevelChanged,
	                     delegate_, sound_level));
	}

	void NetworkSpeechRecognizer::EventListener::OnAudioStart(int session_id) {
	  // Intentionally empty: nothing is reported to the delegate for this event.
	}

	void NetworkSpeechRecognizer::EventListener::OnAudioEnd(int session_id) {
	  // Intentionally empty: nothing is reported to the delegate for this event.
	}

	// Creates the recognizer on the UI thread together with the EventListener that
	// drives the recognition session. |delegate| is given both to the base class
	// and to the listener.
	NetworkSpeechRecognizer::NetworkSpeechRecognizer(
	    const base::WeakPtr<SpeechRecognizerDelegate>& delegate,
	    std::unique_ptr<network::PendingSharedURLLoaderFactory>
	        pending_shared_url_loader_factory,
	    const std::string& locale)
	    : SpeechRecognizer(delegate),
	      speech_event_listener_(new EventListener(
	          delegate,
	          std::move(pending_shared_url_loader_factory),
	          locale)) {
	  DCHECK_CURRENTLY_ON(content::BrowserThread::UI);
	}

	// Destroyed on the UI thread; requests that any running session be stopped.
	NetworkSpeechRecognizer::~NetworkSpeechRecognizer() {
	  DCHECK_CURRENTLY_ON(content::BrowserThread::UI);
	  // The delegate is cleared first so that stopping cannot lead to further
	  // callbacks through this object.
	  delegate().reset();
	  Stop();
	}

	// Asks the listener to start a recognition session. The work is posted to the
	// IO thread; no auth scope, auth token or preamble is supplied.
	void NetworkSpeechRecognizer::Start() {
	  DCHECK_CURRENTLY_ON(content::BrowserThread::UI);
	  // Each argument carries its own comment. A single comment spanning two
	  // arguments would hide the comma between them and break the call.
	  content::GetIOThreadTaskRunner({})->PostTask(
	      FROM_HERE,
	      base::BindOnce(&NetworkSpeechRecognizer::EventListener::StartOnIOThread,
	                     speech_event_listener_,
	                     /* auth_scope */ std::string(),
	                     /* auth_token */ std::string(),
	                     /* preamble */ nullptr));
	}

	void NetworkSpeechRecognizer::Stop() {
	DCHECK_CURRENTLY_ON(content::BrowserThread::UI);
	content::GetIOThreadTaskRunner({})->PostTask(
	FROM_HERE,
	base::BindOnce(&NetworkSpeechRecognizer::EventListener::StopOnIOThread,
	speech_event_listener_));
	}