// content/browser/speech/speech_recognition_engine.h - chromium/src.git - Git at Google

 // Copyright 2012 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_
 #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_

 #include <stdint.h>

 #include <memory>
 #include <string>
 #include <vector>

 #include "base/memory/raw_ptr.h"
 #include "base/memory/ref_counted.h"
 #include "base/sequence_checker.h"
 #include "base/strings/string_piece.h"
 #include "base/time/time.h"
 #include "components/speech/chunked_byte_buffer.h"
 #include "components/speech/downstream_loader.h"
 #include "components/speech/downstream_loader_client.h"
 #include "components/speech/upstream_loader.h"
 #include "components/speech/upstream_loader_client.h"
 #include "content/browser/speech/audio_encoder.h"
 #include "content/common/content_export.h"
 #include "content/public/browser/speech_recognition_session_preamble.h"
 #include "services/network/public/cpp/simple_url_loader_stream_consumer.h"
 #include "third_party/blink/public/mojom/speech/speech_recognition_error.mojom.h"
 #include "third_party/blink/public/mojom/speech/speech_recognition_grammar.mojom.h"
 #include "third_party/blink/public/mojom/speech/speech_recognition_result.mojom.h"

 // base::TimeDelta is intentionally NOT forward-declared here: this header
 // stores a base::TimeDelta by value as a data member, which requires the
 // complete type. It is provided by including "base/time/time.h".

 // SharedURLLoaderFactory appears in this header only as the pointee of a
 // scoped_refptr, so a forward declaration is used instead of an include.
 namespace network {
 class SharedURLLoaderFactory;
 }

 namespace content {

 // Forward-declared: this header only refers to AudioChunk by const reference
 // and as the pointee of a scoped_refptr.
 class AudioChunk;
 // NOTE(review): nothing visible in this header refers to
 // content::SpeechRecognitionError (the delegate API below uses
 // blink::mojom::SpeechRecognitionError). This forward declaration looks
 // stale; confirm that no includer relies on it before removing it.
 struct SpeechRecognitionError;

 // A speech recognition engine supporting continuous recognition by means of
 // interaction with the Google streaming speech recognition webservice.
 //
 // This class establishes two HTTPS connections with the webservice for each
 // session, herein called "upstream" and "downstream". Audio chunks are sent on
 // the upstream by means of a chunked HTTP POST upload. Recognition results are
 // retrieved in a full-duplex fashion (i.e. while pushing audio on the upstream)
 // on the downstream by means of a chunked HTTP GET request. Pairing between the
 // two streams is handled through a randomly generated key, unique for each
 // request, which is passed in the &pair= arg to both stream request URLs. In
 // the case of a regular session, the upstream is closed when the audio capture
 // ends (notified through an |AudioChunksEnded| call) and the downstream waits
 // for a corresponding server closure (some late results can still arrive
 // after the upstream is closed). Both streams are guaranteed to be closed when
 // the |EndRecognition| call is issued.
 //
 // The expected call sequence is:
 // StartRecognition      Mandatory at beginning of SR.
 //   TakeAudioChunk      For every audio chunk pushed.
 //   AudioChunksEnded    Finalize the audio stream (omitted in case of errors).
 // EndRecognition        Mandatory at end of SR (even on errors).
 //
 // No delegate callbacks are performed before StartRecognition or after
 // EndRecognition. If a recognition was started, the caller can free the
 // SpeechRecognitionEngine only after calling EndRecognition.

 class CONTENT_EXPORT SpeechRecognitionEngine
     : public speech::UpstreamLoaderClient,
       public speech::DownstreamLoaderClient {
  public:
   // Receives the engine's output: recognition results, end-of-utterance
   // notifications and errors. Installed through set_delegate().
   class Delegate {
    public:
     // Called whenever a result is retrieved.
     virtual void OnSpeechRecognitionEngineResults(
         const std::vector<blink::mojom::SpeechRecognitionResultPtr>&
             results) = 0;
     // NOTE(review): presumably raised once the end of the current utterance
     // is detected; the trigger is not visible here - confirm in the .cc.
     virtual void OnSpeechRecognitionEngineEndOfUtterance() = 0;
     // Reports |error| to the delegate.
     virtual void OnSpeechRecognitionEngineError(
         const blink::mojom::SpeechRecognitionError& error) = 0;

    protected:
     // Protected, so outside code cannot delete a delegate via a Delegate*.
     virtual ~Delegate() {}
   };

   // Engine configuration.
   struct CONTENT_EXPORT Config {
     Config();
     ~Config();

     // None of the fields below has an in-class initializer; any default
     // values come from the out-of-line Config() constructor.
     // NOTE(review): field meanings are inferred from their names only
     // (e.g. |audio_sample_rate| presumably in Hz) - confirm against users.
     std::string language;
     std::vector<blink::mojom::SpeechRecognitionGrammar> grammars;
     bool filter_profanities;
     bool continuous;
     bool interim_results;
     uint32_t max_hypotheses;
     std::string origin_url;
     int audio_sample_rate;
     int audio_num_bits_per_sample;
     std::string auth_token;
     std::string auth_scope;
     scoped_refptr<SpeechRecognitionSessionPreamble> preamble;
   };

   // set_delegate detached from constructor for lazy dependency injection.
   // Stores |delegate| in |delegate_|, a non-owning raw_ptr.
   void set_delegate(Delegate* delegate) { delegate_ = delegate; }

   // Duration of each audio packet.
   // Declared here without a value; the definition lives out of line.
   static const int kAudioPacketIntervalMs;

   // |accept_language| is the default Accept-Language header.
   SpeechRecognitionEngine(
       scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory,
       const std::string& accept_language);

   // Not copyable or copy-assignable.
   SpeechRecognitionEngine(const SpeechRecognitionEngine&) = delete;
   SpeechRecognitionEngine& operator=(const SpeechRecognitionEngine&) = delete;

   ~SpeechRecognitionEngine() override;

   // Sets the URL requests are sent to for tests.
   static void set_web_service_base_url_for_tests(
       const char* base_url_for_tests);

   // Supplies the configuration.
   // NOTE(review): presumably must be called before StartRecognition() and is
   // copied into |config_| - confirm in the .cc.
   void SetConfig(const Config& config);
   // Mandatory at the beginning of a recognition session.
   void StartRecognition();
   // Mandatory at the end of a recognition session, even on errors. Per the
   // class comment, both streams are closed once this has been issued.
   void EndRecognition();
   // Pushes one audio chunk; called for every chunk captured.
   void TakeAudioChunk(const AudioChunk& data);
   // Finalizes the audio stream (omitted in case of errors).
   void AudioChunksEnded();
   // NOTE(review): presumably true while a session is in progress, i.e. the
   // FSM is not in STATE_IDLE - confirm in the .cc.
   bool IsRecognitionPending() const;
   // NOTE(review): presumably the chunk duration, in milliseconds, callers
   // should use for TakeAudioChunk() (cf. kAudioPacketIntervalMs) - confirm.
   int GetDesiredAudioChunkDurationMs() const;

  private:
   // Grant these speech:: classes access to the private members below.
   friend class speech::UpstreamLoaderClient;
   friend class speech::DownstreamLoader;

   // Non-owning pointer to the delegate installed through set_delegate().
   raw_ptr<Delegate> delegate_;

   // Response status codes from the speech recognition webservice.
   // Declared here without values; the definitions live out of line.
   static const int kWebserviceStatusNoError;
   static const int kWebserviceStatusErrorNoMatch;

   // Frame type for framed POST data. Do NOT change these. They must match
   // values the server expects.
   enum FrameType {
     FRAME_PREAMBLE_AUDIO = 0,
     FRAME_RECOGNITION_AUDIO = 1
   };

   // Data types for the internal Finite State Machine (FSM).
   // STATE_MAX_VALUE aliases the last real state.
   enum FSMState {
     STATE_IDLE = 0,
     STATE_BOTH_STREAMS_CONNECTED,
     STATE_WAITING_DOWNSTREAM_RESULTS,
     STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS
   };

   // Events fed into the FSM. EVENT_MAX_VALUE aliases the last real event.
   enum FSMEvent {
     EVENT_END_RECOGNITION = 0,
     EVENT_START_RECOGNITION,
     EVENT_AUDIO_CHUNK,
     EVENT_AUDIO_CHUNKS_ENDED,
     EVENT_UPSTREAM_ERROR,
     EVENT_DOWNSTREAM_ERROR,
     EVENT_DOWNSTREAM_RESPONSE,
     EVENT_DOWNSTREAM_CLOSED,
     EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED
   };

   // An FSM event together with its optional payload. Not copyable.
   struct FSMEventArgs {
     explicit FSMEventArgs(FSMEvent event_value);

     FSMEventArgs(const FSMEventArgs&) = delete;
     FSMEventArgs& operator=(const FSMEventArgs&) = delete;

     ~FSMEventArgs();

     // The event being dispatched.
     FSMEvent event;

     // In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|.
     scoped_refptr<const AudioChunk> audio_data;

     // In case of EVENT_DOWNSTREAM_RESPONSE, hold the current chunk bytes.
     std::unique_ptr<std::vector<uint8_t>> response;
   };

   // speech::UpstreamLoaderClient
   void OnUpstreamDataComplete(bool success, int response_code) override;

   // speech::DownstreamLoaderClient
   void OnDownstreamDataReceived(base::StringPiece new_response_data) override;
   void OnDownstreamDataComplete(bool success, int response_code) override;

   // Entry point for pushing any new external event into the recognizer FSM.
   void DispatchEvent(const FSMEventArgs& event_args);

   // Defines the behavior of the recognizer FSM, selecting the appropriate
   // transition according to the current state and event.
   FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args);

   // The methods below handle transitions of the recognizer FSM.
   // Each returns an FSMState, presumably the state to move to next. Abort()
   // differs in taking an error code instead of the event arguments.
   FSMState ConnectBothStreams(const FSMEventArgs& event_args);
   FSMState TransmitAudioUpstream(const FSMEventArgs& event_args);
   FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args);
   FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args);
   FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args);
   FSMState CloseDownstream(const FSMEventArgs& event_args);
   FSMState AbortSilently(const FSMEventArgs& event_args);
   FSMState AbortWithError(const FSMEventArgs& event_args);
   FSMState Abort(blink::mojom::SpeechRecognitionErrorCode error);
   FSMState DoNothing(const FSMEventArgs& event_args);
   FSMState NotFeasible(const FSMEventArgs& event_args);

   // NOTE(review): presumably build the Accept-Language value and the random
   // pairing key mentioned in the class comment - confirm in the .cc.
   std::string GetAcceptedLanguages() const;
   std::string GenerateRequestKey() const;

   // Upload a single chunk of audio data. Handles both unframed and framed
   // upload formats, and uses the appropriate one.
   void UploadAudioChunk(const std::string& data, FrameType type, bool is_final);

   // The total audio duration of the upstream request.
   base::TimeDelta upstream_audio_duration_;

   // NOTE(review): presumably the copy of the Config passed to SetConfig().
   Config config_;
   // Loaders for the "upstream" and "downstream" connections described in
   // the class comment.
   std::unique_ptr<speech::UpstreamLoader> upstream_loader_;
   std::unique_ptr<speech::DownstreamLoader> downstream_loader_;
   // NOTE(review): these two presumably hold the constructor arguments of the
   // same names - confirm in the .cc.
   scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory_;
   const std::string accept_language_;
   // NOTE(review): presumably encode the recognition audio and the preamble
   // audio respectively (cf. FrameType) - confirm in the .cc.
   std::unique_ptr<AudioEncoder> encoder_;
   std::unique_ptr<AudioEncoder> preamble_encoder_;
   // NOTE(review): presumably reassembles downstream bytes into complete
   // response chunks - confirm in the .cc.
   speech::ChunkedByteBuffer chunked_byte_buffer_;
   // NOTE(review): the flags below have no in-class initializers; they are
   // presumably set by the constructor. |is_dispatching_event_| looks like a
   // re-entrancy guard for DispatchEvent() and |use_framed_post_data_| like
   // the framed/unframed switch used by UploadAudioChunk() - confirm.
   bool got_last_definitive_result_;
   bool is_dispatching_event_;
   bool use_framed_post_data_;
   // Current state of the recognizer FSM.
   FSMState state_;

   // Declares |sequence_checker_| via the base/sequence_checker.h macro.
   SEQUENCE_CHECKER(sequence_checker_);
 };

 }  // namespace content

 #endif  // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_
	// Copyright 2012 The Chromium Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_
	#define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_

	#include <stdint.h>
	#include <memory>
	#include <string>
	#include <vector>

	#include "base/memory/raw_ptr.h"
	#include "base/memory/ref_counted.h"
	#include "base/sequence_checker.h"
	#include "base/strings/string_piece.h"
	#include "components/speech/chunked_byte_buffer.h"
	#include "components/speech/downstream_loader.h"
	#include "components/speech/downstream_loader_client.h"
	#include "components/speech/upstream_loader.h"
	#include "components/speech/upstream_loader_client.h"
	#include "content/browser/speech/audio_encoder.h"
	#include "content/common/content_export.h"
	#include "content/public/browser/speech_recognition_session_preamble.h"
	#include "services/network/public/cpp/simple_url_loader_stream_consumer.h"
	#include "third_party/blink/public/mojom/speech/speech_recognition_error.mojom.h"
	#include "third_party/blink/public/mojom/speech/speech_recognition_grammar.mojom.h"
	#include "third_party/blink/public/mojom/speech/speech_recognition_result.mojom.h"

	// base::TimeDelta is intentionally NOT forward-declared here: this header
	// stores a base::TimeDelta by value as a data member, which requires the
	// complete type. It is provided by including "base/time/time.h".

	// SharedURLLoaderFactory appears in this header only as the pointee of a
	// scoped_refptr, so a forward declaration is used instead of an include.
	namespace network {
	class SharedURLLoaderFactory;
	}

	namespace content {

	// Forward-declared: this header only refers to AudioChunk by const reference
	// and as the pointee of a scoped_refptr.
	class AudioChunk;
	// NOTE(review): nothing visible in this header refers to
	// content::SpeechRecognitionError (the delegate API below uses
	// blink::mojom::SpeechRecognitionError). This forward declaration looks
	// stale; confirm that no includer relies on it before removing it.
	struct SpeechRecognitionError;

	// A speech recognition engine supporting continuous recognition by means of
	// interaction with the Google streaming speech recognition webservice.
	//
	// This class establishes two HTTPS connections with the webservice for each
	// session, herein called "upstream" and "downstream". Audio chunks are sent on
	// the upstream by means of a chunked HTTP POST upload. Recognition results are
	// retrieved in a full-duplex fashion (i.e. while pushing audio on the upstream)
	// on the downstream by means of a chunked HTTP GET request. Pairing between the
	// two streams is handled through a randomly generated key, unique for each
	// request, which is passed in the &pair= arg to both stream request URLs. In
	// the case of a regular session, the upstream is closed when the audio capture
	// ends (notified through an |AudioChunksEnded| call) and the downstream waits
	// for a corresponding server closure (some late results can still arrive
	// after the upstream is closed). Both streams are guaranteed to be closed when
	// the |EndRecognition| call is issued.
	//
	// The expected call sequence is:
	// StartRecognition      Mandatory at beginning of SR.
	//   TakeAudioChunk      For every audio chunk pushed.
	//   AudioChunksEnded    Finalize the audio stream (omitted in case of errors).
	// EndRecognition        Mandatory at end of SR (even on errors).
	//
	// No delegate callbacks are performed before StartRecognition or after
	// EndRecognition. If a recognition was started, the caller can free the
	// SpeechRecognitionEngine only after calling EndRecognition.

	class CONTENT_EXPORT SpeechRecognitionEngine
	: public speech::UpstreamLoaderClient,
	public speech::DownstreamLoaderClient {
	public:
	// Receives the engine's output: recognition results, end-of-utterance
	// notifications and errors. Installed through set_delegate().
	class Delegate {
	public:
	// Called whenever a result is retrieved.
	virtual void OnSpeechRecognitionEngineResults(
	const std::vector<blink::mojom::SpeechRecognitionResultPtr>&
	results) = 0;
	// NOTE(review): presumably raised once the end of the current utterance
	// is detected; the trigger is not visible here - confirm in the .cc.
	virtual void OnSpeechRecognitionEngineEndOfUtterance() = 0;
	// Reports |error| to the delegate.
	virtual void OnSpeechRecognitionEngineError(
	const blink::mojom::SpeechRecognitionError& error) = 0;

	protected:
	// Protected, so outside code cannot delete a delegate via a Delegate*.
	virtual ~Delegate() {}
	};

	// Engine configuration.
	struct CONTENT_EXPORT Config {
	Config();
	~Config();

	// None of the fields below has an in-class initializer; any default
	// values come from the out-of-line Config() constructor.
	// NOTE(review): field meanings are inferred from their names only
	// (e.g. |audio_sample_rate| presumably in Hz) - confirm against users.
	std::string language;
	std::vector<blink::mojom::SpeechRecognitionGrammar> grammars;
	bool filter_profanities;
	bool continuous;
	bool interim_results;
	uint32_t max_hypotheses;
	std::string origin_url;
	int audio_sample_rate;
	int audio_num_bits_per_sample;
	std::string auth_token;
	std::string auth_scope;
	scoped_refptr<SpeechRecognitionSessionPreamble> preamble;
	};

	// set_delegate detached from constructor for lazy dependency injection.
	// Stores |delegate| in |delegate_|, a non-owning raw_ptr.
	void set_delegate(Delegate* delegate) { delegate_ = delegate; }

	// Duration of each audio packet.
	// Declared here without a value; the definition lives out of line.
	static const int kAudioPacketIntervalMs;

	// |accept_language| is the default Accept-Language header.
	SpeechRecognitionEngine(
	scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory,
	const std::string& accept_language);

	// Not copyable or copy-assignable.
	SpeechRecognitionEngine(const SpeechRecognitionEngine&) = delete;
	SpeechRecognitionEngine& operator=(const SpeechRecognitionEngine&) = delete;

	~SpeechRecognitionEngine() override;

	// Sets the URL requests are sent to for tests.
	static void set_web_service_base_url_for_tests(
	const char* base_url_for_tests);

	// Supplies the configuration.
	// NOTE(review): presumably must be called before StartRecognition() and is
	// copied into |config_| - confirm in the .cc.
	void SetConfig(const Config& config);
	// Mandatory at the beginning of a recognition session.
	void StartRecognition();
	// Mandatory at the end of a recognition session, even on errors. Per the
	// class comment, both streams are closed once this has been issued.
	void EndRecognition();
	// Pushes one audio chunk; called for every chunk captured.
	void TakeAudioChunk(const AudioChunk& data);
	// Finalizes the audio stream (omitted in case of errors).
	void AudioChunksEnded();
	// NOTE(review): presumably true while a session is in progress, i.e. the
	// FSM is not in STATE_IDLE - confirm in the .cc.
	bool IsRecognitionPending() const;
	// NOTE(review): presumably the chunk duration, in milliseconds, callers
	// should use for TakeAudioChunk() (cf. kAudioPacketIntervalMs) - confirm.
	int GetDesiredAudioChunkDurationMs() const;

	private:
	// Grant these speech:: classes access to the private members below.
	friend class speech::UpstreamLoaderClient;
	friend class speech::DownstreamLoader;

	// Non-owning pointer to the delegate installed through set_delegate().
	raw_ptr<Delegate> delegate_;

	// Response status codes from the speech recognition webservice.
	// Declared here without values; the definitions live out of line.
	static const int kWebserviceStatusNoError;
	static const int kWebserviceStatusErrorNoMatch;

	// Frame type for framed POST data. Do NOT change these. They must match
	// values the server expects.
	enum FrameType {
	FRAME_PREAMBLE_AUDIO = 0,
	FRAME_RECOGNITION_AUDIO = 1
	};

	// Data types for the internal Finite State Machine (FSM).
	// STATE_MAX_VALUE aliases the last real state.
	enum FSMState {
	STATE_IDLE = 0,
	STATE_BOTH_STREAMS_CONNECTED,
	STATE_WAITING_DOWNSTREAM_RESULTS,
	STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS
	};

	// Events fed into the FSM. EVENT_MAX_VALUE aliases the last real event.
	enum FSMEvent {
	EVENT_END_RECOGNITION = 0,
	EVENT_START_RECOGNITION,
	EVENT_AUDIO_CHUNK,
	EVENT_AUDIO_CHUNKS_ENDED,
	EVENT_UPSTREAM_ERROR,
	EVENT_DOWNSTREAM_ERROR,
	EVENT_DOWNSTREAM_RESPONSE,
	EVENT_DOWNSTREAM_CLOSED,
	EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED
	};

	// An FSM event together with its optional payload. Not copyable.
	struct FSMEventArgs {
	explicit FSMEventArgs(FSMEvent event_value);

	FSMEventArgs(const FSMEventArgs&) = delete;
	FSMEventArgs& operator=(const FSMEventArgs&) = delete;

	~FSMEventArgs();

	// The event being dispatched.
	FSMEvent event;

	// In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|.
	scoped_refptr<const AudioChunk> audio_data;

	// In case of EVENT_DOWNSTREAM_RESPONSE, hold the current chunk bytes.
	std::unique_ptr<std::vector<uint8_t>> response;
	};

	// speech::UpstreamLoaderClient
	void OnUpstreamDataComplete(bool success, int response_code) override;

	// speech::DownstreamLoaderClient
	void OnDownstreamDataReceived(base::StringPiece new_response_data) override;
	void OnDownstreamDataComplete(bool success, int response_code) override;

	// Entry point for pushing any new external event into the recognizer FSM.
	void DispatchEvent(const FSMEventArgs& event_args);

	// Defines the behavior of the recognizer FSM, selecting the appropriate
	// transition according to the current state and event.
	FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args);

	// The methods below handle transitions of the recognizer FSM.
	// Each returns an FSMState, presumably the state to move to next. Abort()
	// differs in taking an error code instead of the event arguments.
	FSMState ConnectBothStreams(const FSMEventArgs& event_args);
	FSMState TransmitAudioUpstream(const FSMEventArgs& event_args);
	FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args);
	FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args);
	FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args);
	FSMState CloseDownstream(const FSMEventArgs& event_args);
	FSMState AbortSilently(const FSMEventArgs& event_args);
	FSMState AbortWithError(const FSMEventArgs& event_args);
	FSMState Abort(blink::mojom::SpeechRecognitionErrorCode error);
	FSMState DoNothing(const FSMEventArgs& event_args);
	FSMState NotFeasible(const FSMEventArgs& event_args);

	// NOTE(review): presumably build the Accept-Language value and the random
	// pairing key mentioned in the class comment - confirm in the .cc.
	std::string GetAcceptedLanguages() const;
	std::string GenerateRequestKey() const;

	// Upload a single chunk of audio data. Handles both unframed and framed
	// upload formats, and uses the appropriate one.
	void UploadAudioChunk(const std::string& data, FrameType type, bool is_final);

	// The total audio duration of the upstream request.
	base::TimeDelta upstream_audio_duration_;

	// NOTE(review): presumably the copy of the Config passed to SetConfig().
	Config config_;
	// Loaders for the "upstream" and "downstream" connections described in
	// the class comment.
	std::unique_ptr<speech::UpstreamLoader> upstream_loader_;
	std::unique_ptr<speech::DownstreamLoader> downstream_loader_;
	// NOTE(review): these two presumably hold the constructor arguments of the
	// same names - confirm in the .cc.
	scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory_;
	const std::string accept_language_;
	// NOTE(review): presumably encode the recognition audio and the preamble
	// audio respectively (cf. FrameType) - confirm in the .cc.
	std::unique_ptr<AudioEncoder> encoder_;
	std::unique_ptr<AudioEncoder> preamble_encoder_;
	// NOTE(review): presumably reassembles downstream bytes into complete
	// response chunks - confirm in the .cc.
	speech::ChunkedByteBuffer chunked_byte_buffer_;
	// NOTE(review): the flags below have no in-class initializers; they are
	// presumably set by the constructor. |is_dispatching_event_| looks like a
	// re-entrancy guard for DispatchEvent() and |use_framed_post_data_| like
	// the framed/unframed switch used by UploadAudioChunk() - confirm.
	bool got_last_definitive_result_;
	bool is_dispatching_event_;
	bool use_framed_post_data_;
	// Current state of the recognizer FSM.
	FSMState state_;

	// Declares |sequence_checker_| via the base/sequence_checker.h macro.
	SEQUENCE_CHECKER(sequence_checker_);
	};

	} // namespace content

	#endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_