|  | // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | // The EnergyEndpointer class finds likely speech onset and offset points. | 
|  | // | 
|  | // The implementation described here is about the simplest possible. | 
|  | // It is based on timings of threshold crossings for overall signal | 
|  | // RMS. It is suitable for light weight applications. | 
|  | // | 
|  | // As written, the basic idea is that one specifies intervals that | 
|  | // must be occupied by super- and sub-threshold energy levels, and | 
|  | // defers decisions re onset and offset times until these | 
|  | // specifications have been met.  Three basic intervals are tested: an | 
|  | // onset window, a speech-on window, and an offset window.  We require | 
|  | // super-threshold to exceed some minimum total durations in the onset | 
|  | // and speech-on windows before declaring the speech onset time, and | 
|  | // we specify a required sub-threshold residency in the offset window | 
|  | // before declaring speech offset. As the various residency requirements are | 
|  | // met, the EnergyEndpointer instance assumes various states, and can return the | 
|  | // ID of these states to the client (see EpStatus below). | 
|  | // | 
|  | // The levels of the speech and background noise are continuously updated. It is | 
|  | // important that the background noise level be estimated initially for | 
|  | // robustness in noisy conditions. The first frames are assumed to be background | 
|  | // noise and a fast update rate is used for the noise level. The duration for | 
|  | // fast update is controlled by the fast_update_dur_ parameter. | 
|  | // | 
|  | // If used in noisy conditions, the endpointer should be started and run in the | 
|  | // EnvironmentEstimation mode, for at least 200ms, before switching to | 
|  | // UserInputMode. | 
|  | // Audio feedback contamination can appear in the input audio, if not cut | 
|  | // out or handled by echo cancellation. Audio feedback can trigger a false | 
|  | // accept. The false accepts can be ignored by setting | 
|  | // ep_contamination_rejection_period. | 
|  |  | 
|  | #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ | 
|  | #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ | 
|  |  | 
|  | #include <vector> | 
|  |  | 
|  | #include "base/basictypes.h" | 
|  | #include "base/memory/scoped_ptr.h" | 
|  | #include "content/browser/speech/endpointer/energy_endpointer_params.h" | 
|  | #include "content/common/content_export.h" | 
|  |  | 
|  | namespace content { | 
|  |  | 
// Status codes reported by the endpointer. Numbering begins at 10 and each
// subsequent enumerator is one greater than its predecessor; the values are
// spelled out here so the mapping is visible at a glance.
enum EpStatus {
  EP_PRE_SPEECH = 10,
  EP_POSSIBLE_ONSET = 11,
  EP_SPEECH_PRESENT = 12,
  EP_POSSIBLE_OFFSET = 13,
  EP_POST_SPEECH = 14,
};
|  |  | 
|  | class CONTENT_EXPORT EnergyEndpointer { | 
|  | public: | 
|  | // The default construction MUST be followed by Init(), before any | 
|  | // other use can be made of the instance. | 
|  | EnergyEndpointer(); | 
|  | virtual ~EnergyEndpointer(); | 
|  |  | 
|  | void Init(const EnergyEndpointerParams& params); | 
|  |  | 
|  | // Start the endpointer. This should be called at the beginning of a session. | 
|  | void StartSession(); | 
|  |  | 
|  | // Stop the endpointer. | 
|  | void EndSession(); | 
|  |  | 
|  | // Start environment estimation. Audio will be used for environment estimation | 
|  | // i.e. noise level estimation. | 
|  | void SetEnvironmentEstimationMode(); | 
|  |  | 
|  | // Start user input. This should be called when the user indicates start of | 
|  | // input, e.g. by pressing a button. | 
|  | void SetUserInputMode(); | 
|  |  | 
|  | // Computes the next input frame and modifies EnergyEndpointer status as | 
|  | // appropriate based on the computation. | 
|  | void ProcessAudioFrame(int64 time_us, | 
|  | const int16* samples, int num_samples, | 
|  | float* rms_out); | 
|  |  | 
|  | // Returns the current state of the EnergyEndpointer and the time | 
|  | // corresponding to the most recently computed frame. | 
|  | EpStatus Status(int64* status_time_us) const; | 
|  |  | 
|  | bool estimating_environment() const { | 
|  | return estimating_environment_; | 
|  | } | 
|  |  | 
|  | // Returns estimated noise level in dB. | 
|  | float GetNoiseLevelDb() const; | 
|  |  | 
|  | private: | 
|  | class HistoryRing; | 
|  |  | 
|  | // Resets the endpointer internal state.  If reset_threshold is true, the | 
|  | // state will be reset completely, including adaptive thresholds and the | 
|  | // removal of all history information. | 
|  | void Restart(bool reset_threshold); | 
|  |  | 
|  | // Update internal speech and noise levels. | 
|  | void UpdateLevels(float rms); | 
|  |  | 
|  | // Returns the number of frames (or frame number) corresponding to | 
|  | // the 'time' (in seconds). | 
|  | int TimeToFrame(float time) const; | 
|  |  | 
|  | EpStatus status_;  // The current state of this instance. | 
|  | float offset_confirm_dur_sec_;  // max on time allowed to confirm POST_SPEECH | 
|  | int64 endpointer_time_us_;  // Time of the most recently received audio frame. | 
|  | int64 fast_update_frames_; // Number of frames for initial level adaptation. | 
|  | int64 frame_counter_;  // Number of frames seen. Used for initial adaptation. | 
|  | float max_window_dur_;  // Largest search window size (seconds) | 
|  | float sample_rate_;  // Sampling rate. | 
|  |  | 
|  | // Ring buffers to hold the speech activity history. | 
|  | scoped_ptr<HistoryRing> history_; | 
|  |  | 
|  | // Configuration parameters. | 
|  | EnergyEndpointerParams params_; | 
|  |  | 
|  | // RMS which must be exceeded to conclude frame is speech. | 
|  | float decision_threshold_; | 
|  |  | 
|  | // Flag to indicate that audio should be used to estimate environment, prior | 
|  | // to receiving user input. | 
|  | bool estimating_environment_; | 
|  |  | 
|  | // Estimate of the background noise level. Used externally for UI feedback. | 
|  | float noise_level_; | 
|  |  | 
|  | // An adaptive threshold used to update decision_threshold_ when appropriate. | 
|  | float rms_adapt_; | 
|  |  | 
|  | // Start lag corresponds to the highest fundamental frequency. | 
|  | int start_lag_; | 
|  |  | 
|  | // End lag corresponds to the lowest fundamental frequency. | 
|  | int end_lag_; | 
|  |  | 
|  | // Time when mode switched from environment estimation to user input. This | 
|  | // is used to time forced rejection of audio feedback contamination. | 
|  | int64 user_input_start_time_us_; | 
|  |  | 
|  | DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer); | 
|  | }; | 
|  |  | 
|  | }  // namespace content | 
|  |  | 
|  | #endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ |