| // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_ |
| #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_ |
| |
| #include <stdint.h> |
| #include <memory> |
| #include <string> |
| #include <vector> |
| |
| #include "base/macros.h" |
| #include "base/memory/ref_counted.h" |
| #include "base/sequence_checker.h" |
| #include "base/strings/string_piece.h" |
| #include "content/browser/speech/audio_encoder.h" |
| #include "content/browser/speech/chunked_byte_buffer.h" |
| #include "content/common/content_export.h" |
| #include "content/public/browser/speech_recognition_session_preamble.h" |
| #include "services/network/public/cpp/simple_url_loader_stream_consumer.h" |
| #include "services/network/public/mojom/url_loader_factory.mojom.h" |
| #include "third_party/blink/public/mojom/speech/speech_recognition_error.mojom.h" |
| #include "third_party/blink/public/mojom/speech/speech_recognition_grammar.mojom.h" |
| #include "third_party/blink/public/mojom/speech/speech_recognition_result.mojom.h" |
| |
// Forward declaration only: this header refers to the factory solely through
// scoped_refptr<network::SharedURLLoaderFactory>, so the full definition is
// not needed here.
namespace network {
class SharedURLLoaderFactory;
}  // namespace network
| |
| namespace content { |
| |
// Forward declarations; the definitions are not visible in this header.
// AudioChunk is used below only by const reference and via scoped_refptr<>.
class AudioChunk;
// NOTE(review): nothing in this header refers to
// content::SpeechRecognitionError (the delegate interface uses
// blink::mojom::SpeechRecognitionError), so this declaration may be stale.
// Confirm that no includer relies on it before removing.
struct SpeechRecognitionError;
| |
| // A speech recognition engine supporting continuous recognition by means of |
| // interaction with the Google streaming speech recognition webservice. |
| // |
| // This class establishes two HTTPS connections with the webservice for each |
| // session, herein called "upstream" and "downstream". Audio chunks are sent on |
| // the upstream by means of a chunked HTTP POST upload. Recognition results are |
| // retrieved in a full-duplex fashion (i.e. while pushing audio on the upstream) |
| // on the downstream by means of a chunked HTTP GET request. Pairing between the |
| // two streams is handled through a randomly generated key, unique for each |
| // request, which is passed in the &pair= arg to both stream request URLs. In |
| // the case of a regular session, the upstream is closed when the audio capture |
| // ends (notified through an |AudioChunksEnded| call) and the downstream waits |
| // for a corresponding server closure (some late results may still arrive |
| // after the upstream is closed). Both streams are guaranteed to be closed |
| // when an |EndRecognition| call is issued. |
| // |
| // The expected call sequence is: |
| // StartRecognition Mandatory at beginning of SR. |
| // TakeAudioChunk For every audio chunk pushed. |
| // AudioChunksEnded Finalize the audio stream (omitted in case of errors). |
| // EndRecognition Mandatory at end of SR (even on errors). |
| // |
| // No delegate callbacks are performed before StartRecognition or after |
| // EndRecognition. If a recognition was started, the caller can free the |
| // SpeechRecognitionEngine only after calling EndRecognition. |
| |
| class CONTENT_EXPORT SpeechRecognitionEngine { |
| public: |
| class Delegate { |
| public: |
| // Called whenever a result is retrieved. |
| virtual void OnSpeechRecognitionEngineResults( |
| const std::vector<blink::mojom::SpeechRecognitionResultPtr>& |
| results) = 0; |
| virtual void OnSpeechRecognitionEngineEndOfUtterance() = 0; |
| virtual void OnSpeechRecognitionEngineError( |
| const blink::mojom::SpeechRecognitionError& error) = 0; |
| |
| protected: |
| virtual ~Delegate() {} |
| }; |
| |
| // Engine configuration. |
| struct CONTENT_EXPORT Config { |
| Config(); |
| ~Config(); |
| |
| std::string language; |
| std::vector<blink::mojom::SpeechRecognitionGrammar> grammars; |
| bool filter_profanities; |
| bool continuous; |
| bool interim_results; |
| uint32_t max_hypotheses; |
| std::string origin_url; |
| int audio_sample_rate; |
| int audio_num_bits_per_sample; |
| std::string auth_token; |
| std::string auth_scope; |
| scoped_refptr<SpeechRecognitionSessionPreamble> preamble; |
| }; |
| |
| // set_delegate detached from constructor for lazy dependency injection. |
| void set_delegate(Delegate* delegate) { delegate_ = delegate; } |
| |
| // Duration of each audio packet. |
| static const int kAudioPacketIntervalMs; |
| |
| // |accept_language| is the default Accept-Language header. |
| SpeechRecognitionEngine( |
| scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory, |
| const std::string& accept_language); |
| ~SpeechRecognitionEngine(); |
| |
| // Sets the URL requests are sent to for tests. |
| static void set_web_service_base_url_for_tests( |
| const char* base_url_for_tests); |
| |
| void SetConfig(const Config& config); |
| void StartRecognition(); |
| void EndRecognition(); |
| void TakeAudioChunk(const AudioChunk& data); |
| void AudioChunksEnded(); |
| bool IsRecognitionPending() const; |
| int GetDesiredAudioChunkDurationMs() const; |
| |
| private: |
| class UpstreamLoader; |
| class DownstreamLoader; |
| |
| Delegate* delegate_; |
| |
| // Response status codes from the speech recognition webservice. |
| static const int kWebserviceStatusNoError; |
| static const int kWebserviceStatusErrorNoMatch; |
| |
| // Frame type for framed POST data. Do NOT change these. They must match |
| // values the server expects. |
| enum FrameType { |
| FRAME_PREAMBLE_AUDIO = 0, |
| FRAME_RECOGNITION_AUDIO = 1 |
| }; |
| |
| // Data types for the internal Finite State Machine (FSM). |
| enum FSMState { |
| STATE_IDLE = 0, |
| STATE_BOTH_STREAMS_CONNECTED, |
| STATE_WAITING_DOWNSTREAM_RESULTS, |
| STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS |
| }; |
| |
| enum FSMEvent { |
| EVENT_END_RECOGNITION = 0, |
| EVENT_START_RECOGNITION, |
| EVENT_AUDIO_CHUNK, |
| EVENT_AUDIO_CHUNKS_ENDED, |
| EVENT_UPSTREAM_ERROR, |
| EVENT_DOWNSTREAM_ERROR, |
| EVENT_DOWNSTREAM_RESPONSE, |
| EVENT_DOWNSTREAM_CLOSED, |
| EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED |
| }; |
| |
| struct FSMEventArgs { |
| explicit FSMEventArgs(FSMEvent event_value); |
| ~FSMEventArgs(); |
| |
| FSMEvent event; |
| |
| // In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|. |
| scoped_refptr<const AudioChunk> audio_data; |
| |
| // In case of EVENT_DOWNSTREAM_RESPONSE, hold the current chunk bytes. |
| std::unique_ptr<std::vector<uint8_t>> response; |
| |
| private: |
| DISALLOW_COPY_AND_ASSIGN(FSMEventArgs); |
| }; |
| |
| void OnUpstreamDataComplete(bool success, int response_code); |
| |
| void OnDownstreamDataReceived(base::StringPiece new_response_data); |
| void OnDownstreamDataComplete(bool success, int response_code); |
| |
| // Entry point for pushing any new external event into the recognizer FSM. |
| void DispatchEvent(const FSMEventArgs& event_args); |
| |
| // Defines the behavior of the recognizer FSM, selecting the appropriate |
| // transition according to the current state and event. |
| FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args); |
| |
| // The methods below handle transitions of the recognizer FSM. |
| FSMState ConnectBothStreams(const FSMEventArgs& event_args); |
| FSMState TransmitAudioUpstream(const FSMEventArgs& event_args); |
| FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args); |
| FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args); |
| FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args); |
| FSMState CloseDownstream(const FSMEventArgs& event_args); |
| FSMState AbortSilently(const FSMEventArgs& event_args); |
| FSMState AbortWithError(const FSMEventArgs& event_args); |
| FSMState Abort(blink::mojom::SpeechRecognitionErrorCode error); |
| FSMState DoNothing(const FSMEventArgs& event_args); |
| FSMState NotFeasible(const FSMEventArgs& event_args); |
| |
| std::string GetAcceptedLanguages() const; |
| std::string GenerateRequestKey() const; |
| |
| // Upload a single chunk of audio data. Handles both unframed and framed |
| // upload formats, and uses the appropriate one. |
| void UploadAudioChunk(const std::string& data, FrameType type, bool is_final); |
| |
| Config config_; |
| std::unique_ptr<UpstreamLoader> upstream_loader_; |
| std::unique_ptr<DownstreamLoader> downstream_loader_; |
| scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory_; |
| const std::string accept_language_; |
| std::unique_ptr<AudioEncoder> encoder_; |
| std::unique_ptr<AudioEncoder> preamble_encoder_; |
| ChunkedByteBuffer chunked_byte_buffer_; |
| bool got_last_definitive_result_; |
| bool is_dispatching_event_; |
| bool use_framed_post_data_; |
| FSMState state_; |
| |
| SEQUENCE_CHECKER(sequence_checker_); |
| |
| DISALLOW_COPY_AND_ASSIGN(SpeechRecognitionEngine); |
| }; |
| |
| } // namespace content |
| |
| #endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_ |