// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/speech/endpointer/endpointer.h"

#include <stdint.h>
#include <stdlib.h>

#include "base/memory/raw_ptr.h"
#include "base/types/fixed_array.h"
#include "components/speech/audio_buffer.h"
#include "testing/gtest/include/gtest/gtest.h"

namespace {
const int kFrameRate = 50;  // 20 ms long frames for AMR encoding.
}  // namespace

namespace speech {

class FrameProcessor {
 public:
  // Process a single frame of test audio samples.
  virtual EpStatus ProcessFrame(int64_t time,
                                int16_t* samples,
                                int frame_size) = 0;
};

void RunEndpointerEventsTest(FrameProcessor* processor, int sample_rate) {
  // Number of samples in one 20 ms frame.
  int frame_size = sample_rate / kFrameRate;
  base::FixedArray<int16_t> samples(frame_size);

  // We will create a white noise signal of 150 frames. The frames from 50 to
  // 100 will have more power, and the endpointer should fire on those frames.
  const int kNumFrames = 150;

  // Create a random sequence of samples.
  srand(1);
  float gain = 0.0;
  int64_t time = 0;
  for (int frame_count = 0; frame_count < kNumFrames; ++frame_count) {
    // The frames from 50 to 100 will have more power, and the endpointer
    // should detect those frames as speech.
    if ((frame_count >= 50) && (frame_count < 100)) {
      gain = 2000.0;
    } else {
      gain = 1.0;
    }
    // Create random samples with amplitude roughly in [-gain / 2, gain / 2].
    for (int i = 0; i < frame_size; ++i) {
      float rand_num = static_cast<float>(rand() - (RAND_MAX / 2)) /
                       static_cast<float>(RAND_MAX);
      samples[i] = static_cast<int16_t>(gain * rand_num);
    }

    EpStatus ep_status =
        processor->ProcessFrame(time, samples.data(), frame_size);
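    // Advance the frame timestamp by one frame duration, in microseconds.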
    time += static_cast<int64_t>(frame_size * (1e6 / sample_rate));

    // Check the endpointer state at representative frames: before the
    // high-energy burst, in the middle of it, and after it has ended.
    if (20 == frame_count) {
      EXPECT_EQ(EP_PRE_SPEECH, ep_status);
    }
    if (70 == frame_count) {
      EXPECT_EQ(EP_SPEECH_PRESENT, ep_status);
    }
    if (120 == frame_count) {
      EXPECT_EQ(EP_PRE_SPEECH, ep_status);
    }
  }
}

// This test instantiates and initializes a stand-alone endpointer module.
// The test creates frames of random noise and sends them to the endpointer
// module. The energy of the first 50 frames is low, followed by 50 high-energy
// frames, and another 50 low-energy frames. We test that the correct start
// and end frames were detected.
class EnergyEndpointerFrameProcessor : public FrameProcessor {
 public:
  explicit EnergyEndpointerFrameProcessor(EnergyEndpointer* endpointer)
      : endpointer_(endpointer) {}

  EpStatus ProcessFrame(int64_t time,
                        int16_t* samples,
                        int frame_size) override {
    endpointer_->ProcessAudioFrame(time, samples, frame_size, nullptr);
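    // Status() also reports a timestamp through |ep_time|; this test only
    // cares about the returned state.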
    int64_t ep_time;
    return endpointer_->Status(&ep_time);
  }

 private:
  raw_ptr<EnergyEndpointer> endpointer_;
};

TEST(EndpointerTest, TestEnergyEndpointerEvents) {
  const int sample_rate = 8000;  // 8 k samples per second for AMR encoding.

  // Initialize the endpointer and configure it. We specify the parameters
  // here for a 20 ms window and a 20 ms step size, which corresponds to
  // the narrowband AMR codec.
  EnergyEndpointerParams ep_config;
  ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate));
  ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate));
  ep_config.set_endpoint_margin(0.2f);
  ep_config.set_onset_window(0.15f);
  ep_config.set_speech_on_window(0.4f);
  ep_config.set_offset_window(0.15f);
  ep_config.set_onset_detect_dur(0.09f);
  ep_config.set_onset_confirm_dur(0.075f);
  ep_config.set_on_maintain_dur(0.10f);
  ep_config.set_offset_confirm_dur(0.12f);
  ep_config.set_decision_threshold(100.0f);
  EnergyEndpointer endpointer;
  endpointer.Init(ep_config);

  endpointer.StartSession();

  EnergyEndpointerFrameProcessor frame_processor(&endpointer);
  RunEndpointerEventsTest(&frame_processor, sample_rate);

  endpointer.EndSession();
}

// Test endpointer wrapper class.
class EndpointerFrameProcessor : public FrameProcessor {
 public:
  explicit EndpointerFrameProcessor(Endpointer* endpointer)
      : endpointer_(endpointer) {}

  EpStatus ProcessFrame(int64_t time,
                        int16_t* samples,
                        int frame_size) override {
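    // Wrap the raw samples in an AudioChunk: the second argument is the
    // length in bytes (two bytes per 16-bit sample) and the third is the
    // bytes-per-sample.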
    scoped_refptr<AudioChunk> frame(new AudioChunk(
        reinterpret_cast<uint8_t*>(samples), frame_size * 2, 2));
    endpointer_->ProcessAudio(*frame.get(), nullptr);
    int64_t ep_time;
    return endpointer_->Status(&ep_time);
  }

 private:
  raw_ptr<Endpointer> endpointer_;
};

TEST(EndpointerTest, TestEmbeddedEndpointerEvents) {
  const int sample_rate = 8000;  // 8 k samples per second for AMR encoding.

  Endpointer endpointer(sample_rate);
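  // The silence-length thresholds below are expressed in microseconds.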
  const int64_t kMicrosecondsPerMillisecond = 1000;
  const int64_t short_timeout = 300 * kMicrosecondsPerMillisecond;
  endpointer.set_speech_input_possibly_complete_silence_length(short_timeout);
  const int64_t long_timeout = 500 * kMicrosecondsPerMillisecond;
  endpointer.set_speech_input_complete_silence_length(long_timeout);
  endpointer.StartSession();

  EndpointerFrameProcessor frame_processor(&endpointer);
  RunEndpointerEventsTest(&frame_processor, sample_rate);

  endpointer.EndSession();
}

// Run the same endpointer event sequence at a 48 kHz sample rate.
TEST(EndpointerTest, HighSampleRate) {
  const int sample_rate = 48000;

  Endpointer endpointer(sample_rate);
  const int64_t kMicrosecondsPerMillisecond = 1000;
  const int64_t short_timeout = 300 * kMicrosecondsPerMillisecond;
  endpointer.set_speech_input_possibly_complete_silence_length(short_timeout);
  const int64_t long_timeout = 500 * kMicrosecondsPerMillisecond;
  endpointer.set_speech_input_complete_silence_length(long_timeout);
  endpointer.StartSession();

  EndpointerFrameProcessor frame_processor(&endpointer);
  RunEndpointerEventsTest(&frame_processor, sample_rate);

  endpointer.EndSession();
}
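
// A minimal additional check, sketched here as an illustration rather than
// taken from the original suite (the test name is illustrative). Assuming
// all-zero input never crosses the energy decision threshold, the endpointer
// should stay in EP_PRE_SPEECH while only silence is fed in.
TEST(EndpointerTest, SilenceStaysInPreSpeech) {
  const int sample_rate = 8000;
  const int frame_size = sample_rate / kFrameRate;
  base::FixedArray<int16_t> samples(frame_size);
  for (int i = 0; i < frame_size; ++i) {
    samples[i] = 0;
  }

  Endpointer endpointer(sample_rate);
  endpointer.StartSession();

  EndpointerFrameProcessor frame_processor(&endpointer);
  int64_t time = 0;
  for (int frame_count = 0; frame_count < 50; ++frame_count) {
    EpStatus ep_status =
        frame_processor.ProcessFrame(time, samples.data(), frame_size);
    EXPECT_EQ(EP_PRE_SPEECH, ep_status);
    time += static_cast<int64_t>(frame_size * (1e6 / sample_rate));
  }

  endpointer.EndSession();
}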

}  // namespace speech