// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_
#define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_

#include <stdint.h>

#include <memory>
#include <string>
#include <vector>

#include "base/macros.h"
#include "base/memory/ref_counted.h"
#include "base/sequence_checker.h"
#include "base/strings/string_piece.h"
#include "content/browser/speech/audio_encoder.h"
#include "content/browser/speech/chunked_byte_buffer.h"
#include "content/common/content_export.h"
#include "content/public/browser/speech_recognition_session_preamble.h"
#include "services/network/public/cpp/simple_url_loader_stream_consumer.h"
#include "services/network/public/mojom/url_loader_factory.mojom.h"
#include "third_party/blink/public/mojom/speech/speech_recognition_error.mojom.h"
#include "third_party/blink/public/mojom/speech/speech_recognition_grammar.mojom.h"
#include "third_party/blink/public/mojom/speech/speech_recognition_result.mojom.h"

namespace network {
class SharedURLLoaderFactory;
}  // namespace network

namespace content {

class AudioChunk;
struct SpeechRecognitionError;

// A speech recognition engine supporting continuous recognition by means of
// interaction with the Google streaming speech recognition webservice.
//
// This class establishes two HTTPS connections with the webservice for each
// session, herein called "upstream" and "downstream". Audio chunks are sent on
// the upstream by means of a chunked HTTP POST upload. Recognition results are
// retrieved in a full-duplex fashion (i.e. while pushing audio on the upstream)
// on the downstream by means of a chunked HTTP GET request. Pairing between the
// two streams is handled through a randomly generated key, unique for each
// request, which is passed in the &pair= arg to both stream request URLs. In
// the case of a regular session, the upstream is closed when the audio capture
// ends (notified through an |AudioChunksEnded| call) and the downstream waits
// for a corresponding server closure (some late results may still arrive after
// the upstream is closed). Both streams are guaranteed to be closed when the
// |EndRecognition| call is issued.
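//
// For illustration only (the endpoint paths and the key below are
// hypothetical; the real base URL is defined in the .cc file), the paired
// requests look like:
//
//   POST <base_url>/up?...&pair=09A8B7C6    (chunked audio upload)
//   GET  <base_url>/down?...&pair=09A8B7C6  (chunked results download)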
//
// The expected call sequence is:
//   StartRecognition   Mandatory at beginning of SR.
//   TakeAudioChunk     For every audio chunk pushed.
//   AudioChunksEnded   Finalize the audio stream (omitted in case of errors).
//   EndRecognition     Mandatory at end of SR (even on errors).
//
// No delegate callbacks are performed before StartRecognition or after
// EndRecognition. If a recognition was started, the caller can free the
// SpeechRecognitionEngine only after calling EndRecognition.
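//
// A minimal usage sketch, assuming |url_loader_factory|, |delegate|, |config|
// and |chunk| already exist:
//
//   SpeechRecognitionEngine engine(url_loader_factory, "en-US");
//   engine.set_delegate(&delegate);
//   engine.SetConfig(config);
//   engine.StartRecognition();
//   engine.TakeAudioChunk(*chunk);  // Repeated for every captured chunk.
//   engine.AudioChunksEnded();      // Omitted in case of errors.
//   engine.EndRecognition();        // Mandatory, even on errors.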
class CONTENT_EXPORT SpeechRecognitionEngine {
public:
class Delegate {
public:
// Called whenever a result is retrieved.
virtual void OnSpeechRecognitionEngineResults(
const std::vector<blink::mojom::SpeechRecognitionResultPtr>&
results) = 0;
virtual void OnSpeechRecognitionEngineEndOfUtterance() = 0;
virtual void OnSpeechRecognitionEngineError(
const blink::mojom::SpeechRecognitionError& error) = 0;
protected:
virtual ~Delegate() {}
};
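// A minimal delegate sketch (the class name is hypothetical and the result
// handler just logs; a real delegate would forward results to its session):
//
//   class LoggingDelegate : public SpeechRecognitionEngine::Delegate {
//    public:
//     void OnSpeechRecognitionEngineResults(
//         const std::vector<blink::mojom::SpeechRecognitionResultPtr>&
//             results) override {
//       VLOG(1) << "Got " << results.size() << " result(s).";
//     }
//     void OnSpeechRecognitionEngineEndOfUtterance() override {}
//     void OnSpeechRecognitionEngineError(
//         const blink::mojom::SpeechRecognitionError& error) override {}
//   };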
// Engine configuration.
struct CONTENT_EXPORT Config {
Config();
~Config();
std::string language;
std::vector<blink::mojom::SpeechRecognitionGrammar> grammars;
bool filter_profanities;
bool continuous;
bool interim_results;
uint32_t max_hypotheses;
std::string origin_url;
int audio_sample_rate;
int audio_num_bits_per_sample;
std::string auth_token;
std::string auth_scope;
scoped_refptr<SpeechRecognitionSessionPreamble> preamble;
};
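// A sketch of a typical configuration (all values below are illustrative,
// not defaults):
//
//   SpeechRecognitionEngine::Config config;
//   config.language = "en-US";
//   config.filter_profanities = false;
//   config.continuous = true;
//   config.interim_results = true;
//   config.max_hypotheses = 1;
//   config.audio_sample_rate = 16000;
//   config.audio_num_bits_per_sample = 16;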
// set_delegate() is kept separate from the constructor to allow lazy
// dependency injection.
void set_delegate(Delegate* delegate) { delegate_ = delegate; }
// Duration of each audio packet.
static const int kAudioPacketIntervalMs;
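// For example, assuming a 100 ms packet interval (the actual value is set in
// the .cc file) with 16000 Hz sampling and 16 bits per sample, each packet
// carries 16000 * (16 / 8) * 0.1 = 3200 bytes of audio.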
// |accept_language| is the default value for the Accept-Language header.
SpeechRecognitionEngine(
scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory,
const std::string& accept_language);
~SpeechRecognitionEngine();
// Sets the base URL that requests are sent to. For use in tests only.
static void set_web_service_base_url_for_tests(
const char* base_url_for_tests);
void SetConfig(const Config& config);
void StartRecognition();
void EndRecognition();
void TakeAudioChunk(const AudioChunk& data);
void AudioChunksEnded();
bool IsRecognitionPending() const;
int GetDesiredAudioChunkDurationMs() const;
private:
class UpstreamLoader;
class DownstreamLoader;
Delegate* delegate_;
// Response status codes from the speech recognition webservice.
static const int kWebserviceStatusNoError;
static const int kWebserviceStatusErrorNoMatch;
// Frame type for framed POST data. Do NOT change these. They must match
// values the server expects.
enum FrameType {
FRAME_PREAMBLE_AUDIO = 0,
FRAME_RECOGNITION_AUDIO = 1
};
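// Sketch of the framed layout, assuming each frame carries a 32-bit
// big-endian payload size and a 32-bit big-endian FrameType before the
// payload (the authoritative framing code lives in the .cc file):
//
//   [ uint32 payload size | uint32 FrameType | payload bytes ... ]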
// Data types for the internal Finite State Machine (FSM).
enum FSMState {
STATE_IDLE = 0,
STATE_BOTH_STREAMS_CONNECTED,
STATE_WAITING_DOWNSTREAM_RESULTS,
STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS
};
enum FSMEvent {
EVENT_END_RECOGNITION = 0,
EVENT_START_RECOGNITION,
EVENT_AUDIO_CHUNK,
EVENT_AUDIO_CHUNKS_ENDED,
EVENT_UPSTREAM_ERROR,
EVENT_DOWNSTREAM_ERROR,
EVENT_DOWNSTREAM_RESPONSE,
EVENT_DOWNSTREAM_CLOSED,
EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED
};
struct FSMEventArgs {
explicit FSMEventArgs(FSMEvent event_value);
~FSMEventArgs();
FSMEvent event;
// In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|.
scoped_refptr<const AudioChunk> audio_data;
// In case of EVENT_DOWNSTREAM_RESPONSE, holds the current chunk bytes.
std::unique_ptr<std::vector<uint8_t>> response;
private:
DISALLOW_COPY_AND_ASSIGN(FSMEventArgs);
};
void OnUpstreamDataComplete(bool success, int response_code);
void OnDownstreamDataReceived(base::StringPiece new_response_data);
void OnDownstreamDataComplete(bool success, int response_code);
// Entry point for pushing any new external event into the recognizer FSM.
void DispatchEvent(const FSMEventArgs& event_args);
// Defines the behavior of the recognizer FSM, selecting the appropriate
// transition according to the current state and event.
FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args);
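// As an illustration (the authoritative transition table lives in the .cc
// file; the pairings below are examples, not an exhaustive list):
//
//   STATE_BOTH_STREAMS_CONNECTED     + EVENT_AUDIO_CHUNK
//       -> TransmitAudioUpstream()
//   STATE_BOTH_STREAMS_CONNECTED     + EVENT_AUDIO_CHUNKS_ENDED
//       -> CloseUpstreamAndWaitForResults()
//   STATE_WAITING_DOWNSTREAM_RESULTS + EVENT_DOWNSTREAM_CLOSED
//       -> RaiseNoMatchErrorIfGotNoResults()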
// The methods below handle transitions of the recognizer FSM.
FSMState ConnectBothStreams(const FSMEventArgs& event_args);
FSMState TransmitAudioUpstream(const FSMEventArgs& event_args);
FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args);
FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args);
FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args);
FSMState CloseDownstream(const FSMEventArgs& event_args);
FSMState AbortSilently(const FSMEventArgs& event_args);
FSMState AbortWithError(const FSMEventArgs& event_args);
FSMState Abort(blink::mojom::SpeechRecognitionErrorCode error);
FSMState DoNothing(const FSMEventArgs& event_args);
FSMState NotFeasible(const FSMEventArgs& event_args);
std::string GetAcceptedLanguages() const;
std::string GenerateRequestKey() const;
// Uploads a single chunk of audio data, using the framed or unframed upload
// format as appropriate.
void UploadAudioChunk(const std::string& data, FrameType type, bool is_final);
Config config_;
std::unique_ptr<UpstreamLoader> upstream_loader_;
std::unique_ptr<DownstreamLoader> downstream_loader_;
scoped_refptr<network::SharedURLLoaderFactory> shared_url_loader_factory_;
const std::string accept_language_;
std::unique_ptr<AudioEncoder> encoder_;
std::unique_ptr<AudioEncoder> preamble_encoder_;
ChunkedByteBuffer chunked_byte_buffer_;
bool got_last_definitive_result_;
bool is_dispatching_event_;
bool use_framed_post_data_;
FSMState state_;
SEQUENCE_CHECKER(sequence_checker_);
DISALLOW_COPY_AND_ASSIGN(SpeechRecognitionEngine);
};

}  // namespace content

#endif  // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_ENGINE_H_