content/browser/speech/network_speech_recognition_engine_impl.h - chromium/src - Git at Google

 // Copyright 2024 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #ifndef CONTENT_BROWSER_SPEECH_NETWORK_SPEECH_RECOGNITION_ENGINE_IMPL_H_
 #define CONTENT_BROWSER_SPEECH_NETWORK_SPEECH_RECOGNITION_ENGINE_IMPL_H_

 #include <stdint.h>

 #include <memory>
 #include <string>
 #include <string_view>
 #include <vector>

 #include "base/memory/raw_ptr.h"
 #include "base/memory/ref_counted.h"
 #include "base/sequence_checker.h"
 #include "components/speech/audio_encoder.h"
 #include "components/speech/chunked_byte_buffer.h"
 #include "components/speech/downstream_loader.h"
 #include "components/speech/downstream_loader_client.h"
 #include "components/speech/upstream_loader.h"
 #include "components/speech/upstream_loader_client.h"
 #include "content/browser/speech/speech_recognition_engine.h"
 #include "content/common/content_export.h"
 #include "content/public/browser/speech_recognition_session_preamble.h"
 #include "media/mojo/mojom/speech_recognition_error.mojom.h"
 #include "media/mojo/mojom/speech_recognition_grammar.mojom.h"
 #include "media/mojo/mojom/speech_recognition_result.mojom.h"
 #include "services/network/public/cpp/simple_url_loader_stream_consumer.h"

 // Forward declarations for types this header refers to by name.
 class AudioChunk;

 // NOTE(review): |upstream_audio_duration_| further down is a base::TimeDelta
 // held by value, which needs the complete type rather than this forward
 // declaration. Presumably base/time/time.h arrives transitively through one of
 // the includes above -- confirm, and consider including it directly.
 namespace base {
 class TimeDelta;
 }

 namespace network {
 class SharedURLLoaderFactory;
 }

 namespace content {

 // This is the network implementation of `SpeechRecognitionEngine`, which
 // supports continuous recognition by interacting with the Google streaming
 // speech recognition webservice.
 //
 // This class establishes two HTTPS connections with the webservice for each
 // session, herein called "upstream" and "downstream". Audio chunks are sent on
 // the upstream by means of a chunked HTTP POST upload. Recognition results are
 // retrieved in a full-duplex fashion (i.e. while pushing audio on the upstream)
 // on the downstream by means of a chunked HTTP GET request. Pairing between the
 // two streams is handled through a randomly generated key, unique for each
 // request, which is passed in the &pair= arg to both stream request URLs. In
 // the case of a regular session, the upstream is closed when the audio capture
 // ends (notified through an |AudioChunksEnded| call) and the downstream waits
 // for a corresponding server closure (some late results may still arrive
 // after the upstream is closed). Both streams are guaranteed to be closed when
 // an |EndRecognition| call is issued.

 class CONTENT_EXPORT NetworkSpeechRecognitionEngineImpl
     : public SpeechRecognitionEngine,
       public speech::UpstreamLoaderClient,
       public speech::DownstreamLoaderClient {
  public:
   // Network engine configuration. Handed to the engine through |SetConfig|.
   struct CONTENT_EXPORT Config {
     // Both special members are only declared here; their definitions live
     // outside this header.
     Config();
     ~Config();

     std::string language;
     std::vector<media::mojom::SpeechRecognitionGrammar> grammars;
     bool filter_profanities = false;
     bool continuous = true;
     bool interim_results = true;
     // NOTE(review): unlike the flags above, |max_hypotheses|,
     // |audio_sample_rate| and |audio_num_bits_per_sample| carry no default
     // member initializer in this header. Presumably Config() assigns them in
     // the .cc file -- confirm, otherwise they start out indeterminate.
     uint32_t max_hypotheses;
     std::string origin_url;
     int audio_sample_rate;
     int audio_num_bits_per_sample;
     std::string auth_token;
     std::string auth_scope;
     // Ref-counted handle to the session preamble.
     scoped_refptr<SpeechRecognitionSessionPreamble> preamble;
   };

   // Duration of each audio packet. The "Ms" suffix suggests milliseconds; the
   // value itself is defined outside this header.
   static const int kAudioPacketIntervalMs;

   // |shared_url_loader_factory| is received by value as a ref-counted handle.
   // NOTE(review): presumably retained in |shared_url_loader_factory_| for
   // creating the loaders -- confirm in the .cc file.
   explicit NetworkSpeechRecognitionEngineImpl(
       scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory);

   // Not copyable.
   NetworkSpeechRecognitionEngineImpl(
       const NetworkSpeechRecognitionEngineImpl&) = delete;
   NetworkSpeechRecognitionEngineImpl& operator=(
       const NetworkSpeechRecognitionEngineImpl&) = delete;

   ~NetworkSpeechRecognitionEngineImpl() override;

   // Sets the URL requests are sent to for tests. Static, so the override is
   // not tied to any single engine instance.
   static void set_web_service_base_url_for_tests(
       const char* base_url_for_tests);

   // Supplies the configuration for this engine.
   // NOTE(review): presumably copied into |config_| and expected before
   // |StartRecognition| -- confirm against the .cc file.
   void SetConfig(const Config& config);
   // NOTE(review): presumably true while the FSM is outside STATE_IDLE --
   // confirm against the .cc file.
   bool IsRecognitionPending() const;

   // content::SpeechRecognitionEngine:
   void StartRecognition() override;
   void UpdateRecognitionContext(
       const media::SpeechRecognitionRecognitionContext& recognition_context)
       override;
   void EndRecognition() override;
   void TakeAudioChunk(const AudioChunk& data) override;
   void AudioChunksEnded() override;
   int GetDesiredAudioChunkDurationMs() const override;

  private:
   // NOTE(review): friendship is granted to the upstream *client* interface but
   // to the downstream *loader* class. The asymmetry looks unintentional --
   // confirm which types actually need private access.
   friend class speech::UpstreamLoaderClient;
   friend class speech::DownstreamLoader;

   // Response status codes from the speech recognition webservice. Values are
   // defined outside this header.
   static const int kWebserviceStatusNoError;
   static const int kWebserviceStatusErrorNoMatch;

   // Frame type for framed POST data. Do NOT change these. They must match
   // values the server expects.
   enum FrameType { FRAME_PREAMBLE_AUDIO = 0, FRAME_RECOGNITION_AUDIO = 1 };

   // Data types for the internal Finite State Machine (FSM).
   // STATE_MAX_VALUE aliases the last state and has to be updated by hand when
   // a state is appended.
   enum FSMState {
     STATE_IDLE = 0,
     STATE_BOTH_STREAMS_CONNECTED,
     STATE_WAITING_DOWNSTREAM_RESULTS,
     STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS
   };

   // Events fed into the FSM. EVENT_MAX_VALUE aliases the last event and has to
   // be updated by hand when an event is appended.
   enum FSMEvent {
     EVENT_END_RECOGNITION = 0,
     EVENT_START_RECOGNITION,
     EVENT_AUDIO_CHUNK,
     EVENT_AUDIO_CHUNKS_ENDED,
     EVENT_UPSTREAM_ERROR,
     EVENT_DOWNSTREAM_ERROR,
     EVENT_DOWNSTREAM_RESPONSE,
     EVENT_DOWNSTREAM_CLOSED,
     EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED
   };

   // Payload that accompanies an event through the FSM. Not copyable; the
   // handlers below receive it by const reference.
   struct FSMEventArgs {
     explicit FSMEventArgs(FSMEvent event_value);

     FSMEventArgs(const FSMEventArgs&) = delete;
     FSMEventArgs& operator=(const FSMEventArgs&) = delete;

     ~FSMEventArgs();

     // The event being dispatched.
     FSMEvent event;

     // In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|.
     scoped_refptr<const AudioChunk> audio_data;

     // In case of EVENT_DOWNSTREAM_RESPONSE, hold the current chunk bytes.
     std::unique_ptr<std::vector<uint8_t>> response;
   };

   // speech::UpstreamLoaderClient
   void OnUpstreamDataComplete(bool success, int response_code) override;

   // speech::DownstreamLoaderClient
   void OnDownstreamDataReceived(std::string_view new_response_data) override;
   void OnDownstreamDataComplete(bool success, int response_code) override;

   // Entry point for pushing any new external event into the recognizer FSM.
   void DispatchEvent(const FSMEventArgs& event_args);

   // Defines the behavior of the recognizer FSM, selecting the appropriate
   // transition according to the current state and event.
   FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args);

   // The methods below handle transitions of the recognizer FSM. Each returns
   // an FSMState, presumably the state the FSM moves to next.
   FSMState ConnectBothStreams(const FSMEventArgs& event_args);
   FSMState TransmitAudioUpstream(const FSMEventArgs& event_args);
   FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args);
   FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args);
   FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args);
   FSMState CloseDownstream(const FSMEventArgs& event_args);
   FSMState AbortSilently(const FSMEventArgs& event_args);
   FSMState AbortWithError(const FSMEventArgs& event_args);
   // Unlike its siblings, takes the error code directly instead of event args.
   FSMState Abort(media::mojom::SpeechRecognitionErrorCode error);
   FSMState DoNothing(const FSMEventArgs& event_args);
   FSMState NotFeasible(const FSMEventArgs& event_args);

   // Produces the key used to pair the upstream and downstream requests
   // (presumably the "&pair=" value described in the class comment -- confirm).
   std::string GenerateRequestKey() const;

   // Upload a single chunk of audio data. Handles both unframed and framed
   // upload formats, and uses the appropriate one.
   void UploadAudioChunk(const std::string& data, FrameType type, bool is_final);

   // NOTE: data members are constructed in declaration order and destroyed in
   // reverse, so reordering them changes setup/teardown order.

   // The total audio duration of the upstream request.
   base::TimeDelta upstream_audio_duration_;

   Config config_;
   std::unique_ptr<speech::UpstreamLoader> upstream_loader_;
   std::unique_ptr<speech::DownstreamLoader> downstream_loader_;
   scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory_;
   std::unique_ptr<AudioEncoder> encoder_;
   std::unique_ptr<AudioEncoder> preamble_encoder_;
   speech::ChunkedByteBuffer chunked_byte_buffer_;
   bool got_last_definitive_result_ = false;
   // NOTE(review): presumably guards |DispatchEvent| against re-entrant calls
   // -- confirm in the .cc file.
   bool is_dispatching_event_ = false;
   // NOTE(review): presumably selects the framed format in |UploadAudioChunk|
   // -- confirm in the .cc file.
   bool use_framed_post_data_ = false;
   // Current FSM state; a freshly constructed engine starts in STATE_IDLE.
   FSMState state_ = FSMState::STATE_IDLE;

   // Declares |sequence_checker_| via the macro from base/sequence_checker.h.
   SEQUENCE_CHECKER(sequence_checker_);
 };

 }  // namespace content

 #endif  // CONTENT_BROWSER_SPEECH_NETWORK_SPEECH_RECOGNITION_ENGINE_IMPL_H_
	// Copyright 2024 The Chromium Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#ifndef CONTENT_BROWSER_SPEECH_NETWORK_SPEECH_RECOGNITION_ENGINE_IMPL_H_
	#define CONTENT_BROWSER_SPEECH_NETWORK_SPEECH_RECOGNITION_ENGINE_IMPL_H_

	#include <stdint.h>

	#include <memory>
	#include <string>
	#include <string_view>
	#include <vector>

	#include "base/memory/raw_ptr.h"
	#include "base/memory/ref_counted.h"
	#include "base/sequence_checker.h"
	#include "components/speech/audio_encoder.h"
	#include "components/speech/chunked_byte_buffer.h"
	#include "components/speech/downstream_loader.h"
	#include "components/speech/downstream_loader_client.h"
	#include "components/speech/upstream_loader.h"
	#include "components/speech/upstream_loader_client.h"
	#include "content/browser/speech/speech_recognition_engine.h"
	#include "content/common/content_export.h"
	#include "content/public/browser/speech_recognition_session_preamble.h"
	#include "media/mojo/mojom/speech_recognition_error.mojom.h"
	#include "media/mojo/mojom/speech_recognition_grammar.mojom.h"
	#include "media/mojo/mojom/speech_recognition_result.mojom.h"
	#include "services/network/public/cpp/simple_url_loader_stream_consumer.h"

	// Forward declarations for types this header refers to by name.
	class AudioChunk;

	// NOTE(review): |upstream_audio_duration_| further down is a base::TimeDelta
	// held by value, which needs the complete type rather than this forward
	// declaration. Presumably base/time/time.h arrives transitively through one of
	// the includes above -- confirm, and consider including it directly.
	namespace base {
	class TimeDelta;
	}

	namespace network {
	class SharedURLLoaderFactory;
	}

	namespace content {

	// This is the network implementation of `SpeechRecognitionEngine`, which
	// supports continuous recognition by interacting with the Google streaming
	// speech recognition webservice.
	//
	// This class establishes two HTTPS connections with the webservice for each
	// session, herein called "upstream" and "downstream". Audio chunks are sent on
	// the upstream by means of a chunked HTTP POST upload. Recognition results are
	// retrieved in a full-duplex fashion (i.e. while pushing audio on the upstream)
	// on the downstream by means of a chunked HTTP GET request. Pairing between the
	// two streams is handled through a randomly generated key, unique for each
	// request, which is passed in the &pair= arg to both stream request URLs. In
	// the case of a regular session, the upstream is closed when the audio capture
	// ends (notified through an |AudioChunksEnded| call) and the downstream waits
	// for a corresponding server closure (some late results may still arrive
	// after the upstream is closed). Both streams are guaranteed to be closed when
	// an |EndRecognition| call is issued.

	class CONTENT_EXPORT NetworkSpeechRecognitionEngineImpl
	: public SpeechRecognitionEngine,
	public speech::UpstreamLoaderClient,
	public speech::DownstreamLoaderClient {
	public:
	// Network engine configuration. Handed to the engine through |SetConfig|.
	struct CONTENT_EXPORT Config {
	// Both special members are only declared here; their definitions live
	// outside this header.
	Config();
	~Config();

	std::string language;
	std::vector<media::mojom::SpeechRecognitionGrammar> grammars;
	bool filter_profanities = false;
	bool continuous = true;
	bool interim_results = true;
	// NOTE(review): unlike the flags above, |max_hypotheses|,
	// |audio_sample_rate| and |audio_num_bits_per_sample| carry no default
	// member initializer in this header. Presumably Config() assigns them in
	// the .cc file -- confirm, otherwise they start out indeterminate.
	uint32_t max_hypotheses;
	std::string origin_url;
	int audio_sample_rate;
	int audio_num_bits_per_sample;
	std::string auth_token;
	std::string auth_scope;
	// Ref-counted handle to the session preamble.
	scoped_refptr<SpeechRecognitionSessionPreamble> preamble;
	};

	// Duration of each audio packet. The "Ms" suffix suggests milliseconds; the
	// value itself is defined outside this header.
	static const int kAudioPacketIntervalMs;

	// |shared_url_loader_factory| is received by value as a ref-counted handle.
	// NOTE(review): presumably retained in |shared_url_loader_factory_| for
	// creating the loaders -- confirm in the .cc file.
	explicit NetworkSpeechRecognitionEngineImpl(
	scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory);

	// Not copyable.
	NetworkSpeechRecognitionEngineImpl(
	const NetworkSpeechRecognitionEngineImpl&) = delete;
	NetworkSpeechRecognitionEngineImpl& operator=(
	const NetworkSpeechRecognitionEngineImpl&) = delete;

	~NetworkSpeechRecognitionEngineImpl() override;

	// Sets the URL requests are sent to for tests. Static, so the override is
	// not tied to any single engine instance.
	static void set_web_service_base_url_for_tests(
	const char* base_url_for_tests);

	// Supplies the configuration for this engine.
	// NOTE(review): presumably copied into |config_| and expected before
	// |StartRecognition| -- confirm against the .cc file.
	void SetConfig(const Config& config);
	// NOTE(review): presumably true while the FSM is outside STATE_IDLE --
	// confirm against the .cc file.
	bool IsRecognitionPending() const;

	// content::SpeechRecognitionEngine:
	void StartRecognition() override;
	void UpdateRecognitionContext(
	const media::SpeechRecognitionRecognitionContext& recognition_context)
	override;
	void EndRecognition() override;
	void TakeAudioChunk(const AudioChunk& data) override;
	void AudioChunksEnded() override;
	int GetDesiredAudioChunkDurationMs() const override;

	private:
	// NOTE(review): friendship is granted to the upstream *client* interface but
	// to the downstream *loader* class. The asymmetry looks unintentional --
	// confirm which types actually need private access.
	friend class speech::UpstreamLoaderClient;
	friend class speech::DownstreamLoader;

	// Response status codes from the speech recognition webservice. Values are
	// defined outside this header.
	static const int kWebserviceStatusNoError;
	static const int kWebserviceStatusErrorNoMatch;

	// Frame type for framed POST data. Do NOT change these. They must match
	// values the server expects.
	enum FrameType { FRAME_PREAMBLE_AUDIO = 0, FRAME_RECOGNITION_AUDIO = 1 };

	// Data types for the internal Finite State Machine (FSM).
	// STATE_MAX_VALUE aliases the last state and has to be updated by hand when
	// a state is appended.
	enum FSMState {
	STATE_IDLE = 0,
	STATE_BOTH_STREAMS_CONNECTED,
	STATE_WAITING_DOWNSTREAM_RESULTS,
	STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS
	};

	// Events fed into the FSM. EVENT_MAX_VALUE aliases the last event and has to
	// be updated by hand when an event is appended.
	enum FSMEvent {
	EVENT_END_RECOGNITION = 0,
	EVENT_START_RECOGNITION,
	EVENT_AUDIO_CHUNK,
	EVENT_AUDIO_CHUNKS_ENDED,
	EVENT_UPSTREAM_ERROR,
	EVENT_DOWNSTREAM_ERROR,
	EVENT_DOWNSTREAM_RESPONSE,
	EVENT_DOWNSTREAM_CLOSED,
	EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED
	};

	// Payload that accompanies an event through the FSM. Not copyable; the
	// handlers below receive it by const reference.
	struct FSMEventArgs {
	explicit FSMEventArgs(FSMEvent event_value);

	FSMEventArgs(const FSMEventArgs&) = delete;
	FSMEventArgs& operator=(const FSMEventArgs&) = delete;

	~FSMEventArgs();

	// The event being dispatched.
	FSMEvent event;

	// In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|.
	scoped_refptr<const AudioChunk> audio_data;

	// In case of EVENT_DOWNSTREAM_RESPONSE, hold the current chunk bytes.
	std::unique_ptr<std::vector<uint8_t>> response;
	};

	// speech::UpstreamLoaderClient
	void OnUpstreamDataComplete(bool success, int response_code) override;

	// speech::DownstreamLoaderClient
	void OnDownstreamDataReceived(std::string_view new_response_data) override;
	void OnDownstreamDataComplete(bool success, int response_code) override;

	// Entry point for pushing any new external event into the recognizer FSM.
	void DispatchEvent(const FSMEventArgs& event_args);

	// Defines the behavior of the recognizer FSM, selecting the appropriate
	// transition according to the current state and event.
	FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args);

	// The methods below handle transitions of the recognizer FSM. Each returns
	// an FSMState, presumably the state the FSM moves to next.
	FSMState ConnectBothStreams(const FSMEventArgs& event_args);
	FSMState TransmitAudioUpstream(const FSMEventArgs& event_args);
	FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args);
	FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args);
	FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args);
	FSMState CloseDownstream(const FSMEventArgs& event_args);
	FSMState AbortSilently(const FSMEventArgs& event_args);
	FSMState AbortWithError(const FSMEventArgs& event_args);
	// Unlike its siblings, takes the error code directly instead of event args.
	FSMState Abort(media::mojom::SpeechRecognitionErrorCode error);
	FSMState DoNothing(const FSMEventArgs& event_args);
	FSMState NotFeasible(const FSMEventArgs& event_args);

	// Produces the key used to pair the upstream and downstream requests
	// (presumably the "&pair=" value described in the class comment -- confirm).
	std::string GenerateRequestKey() const;

	// Upload a single chunk of audio data. Handles both unframed and framed
	// upload formats, and uses the appropriate one.
	void UploadAudioChunk(const std::string& data, FrameType type, bool is_final);

	// NOTE: data members are constructed in declaration order and destroyed in
	// reverse, so reordering them changes setup/teardown order.

	// The total audio duration of the upstream request.
	base::TimeDelta upstream_audio_duration_;

	Config config_;
	std::unique_ptr<speech::UpstreamLoader> upstream_loader_;
	std::unique_ptr<speech::DownstreamLoader> downstream_loader_;
	scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory_;
	std::unique_ptr<AudioEncoder> encoder_;
	std::unique_ptr<AudioEncoder> preamble_encoder_;
	speech::ChunkedByteBuffer chunked_byte_buffer_;
	bool got_last_definitive_result_ = false;
	// NOTE(review): presumably guards |DispatchEvent| against re-entrant calls
	// -- confirm in the .cc file.
	bool is_dispatching_event_ = false;
	// NOTE(review): presumably selects the framed format in |UploadAudioChunk|
	// -- confirm in the .cc file.
	bool use_framed_post_data_ = false;
	// Current FSM state; a freshly constructed engine starts in STATE_IDLE.
	FSMState state_ = FSMState::STATE_IDLE;

	// Declares |sequence_checker_| via the macro from base/sequence_checker.h.
	SEQUENCE_CHECKER(sequence_checker_);
	};

	} // namespace content

	#endif // CONTENT_BROWSER_SPEECH_NETWORK_SPEECH_RECOGNITION_ENGINE_IMPL_H_