// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "content/renderer/media/speech_recognition_audio_sink.h"
#include "base/bind.h"
#include "base/strings/utf_string_conversions.h"
#include "content/renderer/media/media_stream_audio_source.h"
#include "content/renderer/media/mock_media_constraint_factory.h"
#include "content/renderer/media/webrtc/webrtc_local_audio_track_adapter.h"
#include "content/renderer/media/webrtc_local_audio_track.h"
#include "media/audio/audio_parameters.h"
#include "media/base/audio_bus.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "third_party/WebKit/public/platform/WebMediaStreamTrack.h"
namespace {
// Supported speech recognition audio parameters.
const int kSpeechRecognitionSampleRate = 16000;
const int kSpeechRecognitionFramesPerBuffer = 1600;
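// Note: 1600 frames at 16000 Hz correspond to 100 ms of audio per buffer,
// which is the chunk size the speech recognition engine expects.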
// Input audio format.
const media::AudioParameters::Format kInputFormat =
media::AudioParameters::AUDIO_PCM_LOW_LATENCY;
const media::ChannelLayout kInputChannelLayout = media::CHANNEL_LAYOUT_MONO;
const int kInputChannels = 1;
const int kInputBitsPerSample = 16;
// Output audio format.
const media::AudioParameters::Format kOutputFormat =
media::AudioParameters::AUDIO_PCM_LOW_LATENCY;
const media::ChannelLayout kOutputChannelLayout = media::CHANNEL_LAYOUT_STEREO;
const int kOutputChannels = 2;
const int kOutputBitsPerSample = 16;
// Mocked out sockets used for Send/Receive.
// Data is written to and read from a shared buffer used as a FIFO, and there
// is no blocking. |OnSendCB| is used to trigger a |Receive| on the other
// socket.
class MockSyncSocket : public base::SyncSocket {
public:
// This allows for two requests in the queue between the |MockSyncSocket|s.
static const int kSharedBufferSize = 8;
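// Each Send() transfers a 4-byte buffer index (sizeof(uint32)), so the FIFO
// holds at most two pending sends before it must be drained by Receive().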
// Buffer to be shared between two |MockSyncSocket|s. Allocated on heap.
struct SharedBuffer {
SharedBuffer() : data(), start(0), length(0) {}
uint8 data[kSharedBufferSize];
size_t start;
size_t length;
};
// Callback used for pairing an A.Send() with B.Receive() without blocking.
typedef base::Callback<void()> OnSendCB;
explicit MockSyncSocket(SharedBuffer* shared_buffer)
: buffer_(shared_buffer),
in_failure_mode_(false) {}
MockSyncSocket(SharedBuffer* shared_buffer, const OnSendCB& on_send_cb)
: buffer_(shared_buffer),
on_send_cb_(on_send_cb),
in_failure_mode_(false) {}
size_t Send(const void* buffer, size_t length) override;
size_t Receive(void* buffer, size_t length) override;
// When |in_failure_mode_| == true, the socket fails to send.
void SetFailureMode(bool in_failure_mode) {
in_failure_mode_ = in_failure_mode;
}
private:
SharedBuffer* buffer_;
const OnSendCB on_send_cb_;
bool in_failure_mode_;
DISALLOW_COPY_AND_ASSIGN(MockSyncSocket);
};
// base::SyncSocket implementation
size_t MockSyncSocket::Send(const void* buffer, size_t length) {
if (in_failure_mode_)
return 0;
const uint8* b = static_cast<const uint8*>(buffer);
for (size_t i = 0; i < length; ++i, ++buffer_->length)
buffer_->data[buffer_->start + buffer_->length] = b[i];
// The callback is null for the receiving-side socket, which never sends.
if (!on_send_cb_.is_null())
on_send_cb_.Run();
return length;
}
size_t MockSyncSocket::Receive(void* buffer, size_t length) {
uint8* b = static_cast<uint8*>(buffer);
// Drain the FIFO from |start|, writing into the destination from index 0.
for (size_t i = 0; i < buffer_->length; ++i, ++buffer_->start)
b[i] = buffer_->data[buffer_->start];
// Since the buffer is used sequentially, we can reset the indices here.
buffer_->start = buffer_->length = 0;
return length;
}
// This fake class is the consumer used to verify behavior of the producer.
// The |Initialize()| method shows what the consumer should be responsible for
// in the production code (minus the mocks).
class FakeSpeechRecognizer {
public:
FakeSpeechRecognizer() : is_responsive_(true) {}
void Initialize(
const blink::WebMediaStreamTrack& track,
const media::AudioParameters& sink_params,
base::SharedMemoryHandle* foreign_memory_handle) {
// Shared memory is allocated, mapped and shared.
const uint32 kSharedMemorySize =
sizeof(media::AudioInputBufferParameters) +
media::AudioBus::CalculateMemorySize(sink_params);
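// The region holds an |AudioInputBufferParameters| header followed by the
// raw audio data for one |AudioBus| with the |sink_params| format.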
shared_memory_.reset(new base::SharedMemory());
ASSERT_TRUE(shared_memory_->CreateAndMapAnonymous(kSharedMemorySize));
memset(shared_memory_->memory(), 0, kSharedMemorySize);
ASSERT_TRUE(shared_memory_->ShareToProcess(base::GetCurrentProcessHandle(),
foreign_memory_handle));
// Wrap the shared memory for the audio bus.
media::AudioInputBuffer* buffer =
static_cast<media::AudioInputBuffer*>(shared_memory_->memory());
audio_track_bus_ = media::AudioBus::WrapMemory(sink_params, buffer->audio);
audio_track_bus_->Zero();
// Initialize the counter used for synchronization.
buffer->params.size = 0U;
// Create a shared buffer for the |MockSyncSocket|s.
shared_buffer_.reset(new MockSyncSocket::SharedBuffer());
// Local socket will receive signals from the producer.
receiving_socket_.reset(new MockSyncSocket(shared_buffer_.get()));
// We automatically trigger a Receive when data is sent over the socket.
sending_socket_ = new MockSyncSocket(
shared_buffer_.get(),
base::Bind(&FakeSpeechRecognizer::EmulateReceiveThreadLoopIteration,
base::Unretained(this)));
// In production code this call pairs the sockets; with the mocks sharing a
// buffer it has no effect.
base::SyncSocket::CreatePair(receiving_socket_.get(), sending_socket_);
}
// Emulates a single iteration of a thread receiving on the socket. This
// would normally be done by a receiving thread's task in the browser process.
void EmulateReceiveThreadLoopIteration() {
if (!is_responsive_)
return;
const int kSize = sizeof(media::AudioInputBufferParameters().size);
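// |params.size| doubles as the synchronization counter: the producer sends
// the latest buffer index over the socket, and the consumer acknowledges the
// buffer by incrementing the counter.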
receiving_socket_->Receive(&(GetAudioInputBuffer()->params.size), kSize);
// Notify the producer that the audio buffer has been consumed.
GetAudioInputBuffer()->params.size++;
}
// Used to simulate unresponsive behavior of the consumer.
void SimulateResponsiveness(bool is_responsive) {
is_responsive_ = is_responsive;
}
media::AudioInputBuffer* GetAudioInputBuffer() const {
return static_cast<media::AudioInputBuffer*>(shared_memory_->memory());
}
MockSyncSocket* sending_socket() { return sending_socket_; }
media::AudioBus* audio_bus() const { return audio_track_bus_.get(); }
private:
bool is_responsive_;
// Shared memory for the audio and synchronization.
scoped_ptr<base::SharedMemory> shared_memory_;
// Fake sockets and their shared buffer.
scoped_ptr<MockSyncSocket::SharedBuffer> shared_buffer_;
scoped_ptr<MockSyncSocket> receiving_socket_;
MockSyncSocket* sending_socket_;
// Audio bus wrapping the shared memory from the renderer.
scoped_ptr<media::AudioBus> audio_track_bus_;
DISALLOW_COPY_AND_ASSIGN(FakeSpeechRecognizer);
};
} // namespace
namespace content {
class SpeechRecognitionAudioSinkTest : public testing::Test {
public:
SpeechRecognitionAudioSinkTest() {}
~SpeechRecognitionAudioSinkTest() {}
// Initializes the producer and consumer with specified audio parameters.
// Returns the minimum number of input audio buffers that must be captured
// before the audio is sent to the consumer.
uint32 Initialize(int input_sample_rate,
int input_frames_per_buffer,
int output_sample_rate,
int output_frames_per_buffer) {
// Audio Environment setup.
source_params_.Reset(kInputFormat,
kInputChannelLayout,
kInputChannels,
input_sample_rate,
kInputBitsPerSample,
input_frames_per_buffer);
sink_params_.Reset(kOutputFormat,
kOutputChannelLayout,
kOutputChannels,
output_sample_rate,
kOutputBitsPerSample,
output_frames_per_buffer);
source_data_.reset(new int16[input_frames_per_buffer * kInputChannels]());
// Prepare the track and audio source.
blink::WebMediaStreamTrack blink_track;
PrepareBlinkTrackOfType(MEDIA_DEVICE_AUDIO_CAPTURE, &blink_track);
// Get the native track from the blink track and initialize.
native_track_ =
static_cast<WebRtcLocalAudioTrack*>(blink_track.extraData());
native_track_->OnSetFormat(source_params_);
// Create and initialize the consumer.
recognizer_.reset(new FakeSpeechRecognizer());
base::SharedMemoryHandle foreign_memory_handle;
recognizer_->Initialize(blink_track, sink_params_, &foreign_memory_handle);
// Create the producer.
scoped_ptr<base::SyncSocket> sending_socket(recognizer_->sending_socket());
speech_audio_sink_.reset(new SpeechRecognitionAudioSink(
blink_track, sink_params_, foreign_memory_handle,
sending_socket.Pass(),
base::Bind(&SpeechRecognitionAudioSinkTest::StoppedCallback,
base::Unretained(this))));
// Return number of buffers needed to trigger resampling and consumption.
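// For example, 441-frame buffers at 44100 Hz feeding 1600-frame buffers at
// 16000 Hz require ceil((1600 * 44100) / (441 * 16000)) = 10 input buffers
// per notification.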
return static_cast<uint32>(std::ceil(
static_cast<double>(output_frames_per_buffer * input_sample_rate) /
(input_frames_per_buffer * output_sample_rate)));
}
// Mock callback expected to be called when the track is stopped.
MOCK_METHOD0(StoppedCallback, void());
protected:
// Prepares a blink track of a given MediaStreamType and attaches the native
// track which can be used to capture audio data and pass it to the producer.
static void PrepareBlinkTrackOfType(
const MediaStreamType device_type,
blink::WebMediaStreamTrack* blink_track) {
StreamDeviceInfo device_info(device_type, "Mock device",
"mock_device_id");
MockMediaConstraintFactory constraint_factory;
const blink::WebMediaConstraints constraints =
constraint_factory.CreateWebMediaConstraints();
scoped_refptr<WebRtcAudioCapturer> capturer(
WebRtcAudioCapturer::CreateCapturer(-1, device_info, constraints, NULL,
NULL));
scoped_refptr<WebRtcLocalAudioTrackAdapter> adapter(
WebRtcLocalAudioTrackAdapter::Create(std::string(), NULL));
scoped_ptr<WebRtcLocalAudioTrack> native_track(
new WebRtcLocalAudioTrack(adapter.get(), capturer, NULL));
blink::WebMediaStreamSource blink_audio_source;
blink_audio_source.initialize(base::UTF8ToUTF16("dummy_source_id"),
blink::WebMediaStreamSource::TypeAudio,
base::UTF8ToUTF16("dummy_source_name"),
false /* remote */, true /* readonly */);
MediaStreamSource::SourceStoppedCallback cb;
blink_audio_source.setExtraData(
new MediaStreamAudioSource(-1, device_info, cb, NULL));
blink_track->initialize(blink::WebString::fromUTF8("dummy_track"),
blink_audio_source);
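// Ownership of the native track is transferred to the blink track.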
blink_track->setExtraData(native_track.release());
}
// Emulates an audio capture device capturing data from the source.
inline void CaptureAudio(const uint32 buffers) {
for (uint32 i = 0; i < buffers; ++i)
native_track()->Capture(source_data(), false);
}
// Used to simulate a problem with sockets.
void SetFailureModeOnForeignSocket(bool in_failure_mode) {
recognizer()->sending_socket()->SetFailureMode(in_failure_mode);
}
// Helper method for verifying captured audio data has been consumed.
inline void AssertConsumedBuffers(const uint32 buffer_index) {
ASSERT_EQ(buffer_index, recognizer()->GetAudioInputBuffer()->params.size);
}
// Helper method for providing audio data to producer and verifying it was
// consumed on the recognizer.
inline void CaptureAudioAndAssertConsumedBuffers(const uint32 buffers,
const uint32 buffer_index) {
CaptureAudio(buffers);
AssertConsumedBuffers(buffer_index);
}
// Helper method to capture and assert consumption at different sample rates
// and audio buffer sizes.
inline void AssertConsumptionForAudioParameters(
const int input_sample_rate,
const int input_frames_per_buffer,
const int output_sample_rate,
const int output_frames_per_buffer,
const uint32 consumptions) {
const uint32 kBuffersPerNotification = Initialize(input_sample_rate,
input_frames_per_buffer,
output_sample_rate,
output_frames_per_buffer);
AssertConsumedBuffers(0U);
for (uint32 i = 1U; i <= consumptions; ++i) {
CaptureAudio(kBuffersPerNotification);
ASSERT_EQ(i, recognizer()->GetAudioInputBuffer()->params.size)
<< "Tested at rates: "
<< "In(" << input_sample_rate << ", " << input_frames_per_buffer
<< ") "
<< "Out(" << output_sample_rate << ", " << output_frames_per_buffer
<< ")";
}
}
int16* source_data() { return source_data_.get(); }
FakeSpeechRecognizer* recognizer() { return recognizer_.get(); }
const media::AudioParameters& sink_params() { return sink_params_; }
WebRtcLocalAudioTrack* native_track() { return native_track_; }
private:
// Producer.
scoped_ptr<SpeechRecognitionAudioSink> speech_audio_sink_;
// Consumer.
scoped_ptr<FakeSpeechRecognizer> recognizer_;
// Audio related members.
scoped_ptr<int16[]> source_data_;
media::AudioParameters source_params_;
media::AudioParameters sink_params_;
WebRtcLocalAudioTrack* native_track_;
DISALLOW_COPY_AND_ASSIGN(SpeechRecognitionAudioSinkTest);
};
// Not all types of tracks are supported. This test checks that this policy
// is implemented correctly.
TEST_F(SpeechRecognitionAudioSinkTest, CheckIsSupportedAudioTrack) {
typedef std::map<MediaStreamType, bool> SupportedTrackPolicy;
// This test must be aligned with the policy of supported tracks.
SupportedTrackPolicy p;
p[MEDIA_NO_SERVICE] = false;
p[MEDIA_DEVICE_AUDIO_CAPTURE] = true; // The only one supported for now.
p[MEDIA_DEVICE_VIDEO_CAPTURE] = false;
p[MEDIA_TAB_AUDIO_CAPTURE] = false;
p[MEDIA_TAB_VIDEO_CAPTURE] = false;
p[MEDIA_DESKTOP_VIDEO_CAPTURE] = false;
p[MEDIA_DESKTOP_AUDIO_CAPTURE] = false;
p[MEDIA_DEVICE_AUDIO_OUTPUT] = false;
// Ensure this test gets updated along with |content::MediaStreamType| enum.
EXPECT_EQ(static_cast<size_t>(NUM_MEDIA_TYPES), p.size());
// Check the entire policy.
for (SupportedTrackPolicy::iterator it = p.begin(); it != p.end(); ++it) {
blink::WebMediaStreamTrack blink_track;
PrepareBlinkTrackOfType(it->first, &blink_track);
ASSERT_EQ(
it->second,
SpeechRecognitionAudioSink::IsSupportedTrack(blink_track));
}
}
// Checks if the producer can support the listed range of input sample rates
// and associated buffer sizes.
TEST_F(SpeechRecognitionAudioSinkTest, RecognizerNotifiedOnSocket) {
const size_t kNumAudioParamTuples = 24;
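// Each tuple is {input sample rate, input frames per buffer}; the two buffer
// sizes per rate correspond to roughly 10 ms and 100 ms of audio.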
const int kAudioParams[kNumAudioParamTuples][2] = {
{8000, 80}, {8000, 800}, {16000, 160}, {16000, 1600},
{24000, 240}, {24000, 2400}, {32000, 320}, {32000, 3200},
{44100, 441}, {44100, 4410}, {48000, 480}, {48000, 4800},
{96000, 960}, {96000, 9600}, {11025, 111}, {11025, 1103},
{22050, 221}, {22050, 2205}, {88200, 882}, {88200, 8820},
{176400, 1764}, {176400, 17640}, {192000, 1920}, {192000, 19200}};
// Check all listed tuples of input sample rates and buffer sizes.
for (size_t i = 0; i < kNumAudioParamTuples; ++i) {
AssertConsumptionForAudioParameters(
kAudioParams[i][0], kAudioParams[i][1],
kSpeechRecognitionSampleRate, kSpeechRecognitionFramesPerBuffer, 3U);
}
}
// Checks that the input data is getting resampled to the target sample rate.
TEST_F(SpeechRecognitionAudioSinkTest, AudioDataIsResampledOnSink) {
EXPECT_GE(kInputChannels, 1);
EXPECT_GE(kOutputChannels, 1);
// Input audio is sampled at 44.1 kHz with data chunks of 10 ms. The desired
// output corresponds to the speech recognition engine's requirements: 16 kHz
// with 100 ms chunks (1600 frames per buffer).
const uint32 kBuffersPerNotification = Initialize(44100, 441, 16000, 1600);
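// With these parameters, |kBuffersPerNotification| == 10 (see Initialize()).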
// Fill audio input frames with 0, 1, 2, 3, ..., 440.
const uint32 kInputFramesPerBuffer = 441;
for (uint32 i = 0; i < kInputFramesPerBuffer; ++i) {
for (int c = 0; c < kInputChannels; ++c)
source_data()[i * kInputChannels + c] = i;
}
// Prepare sink audio bus and data for rendering.
media::AudioBus* sink_bus = recognizer()->audio_bus();
const uint32 kSinkDataLength = 1600 * kOutputChannels;
int16 sink_data[kSinkDataLength] = {0};
// Render the audio data from the recognizer.
sink_bus->ToInterleaved(sink_bus->frames(),
sink_params().bits_per_sample() / 8, sink_data);
// Checking only a fraction of the sink frames.
const uint32 kNumFramesToTest = 12;
// Check all channels are zeroed out before we trigger resampling.
for (uint32 i = 0; i < kNumFramesToTest; ++i) {
for (int c = 0; c < kOutputChannels; ++c)
EXPECT_EQ(0, sink_data[i * kOutputChannels + c]);
}
// Trigger the speech sink to resample the input data.
AssertConsumedBuffers(0U);
CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
// Render the audio data from the recognizer.
sink_bus->ToInterleaved(sink_bus->frames(),
sink_params().bits_per_sample() / 8, sink_data);
// Expected frames of the resampled data, derived from |source_data()|.
const int16 kExpectedData[kNumFramesToTest] = {0, 2, 5, 8, 11, 13,
16, 19, 22, 24, 27, 30};
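// The values track the input read positions i * 44100 / 16000 (~2.756 per
// output frame); the exact samples depend on the resampler's filter.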
// Check all channels have the same resampled data.
for (uint32 i = 0; i < kNumFramesToTest; ++i) {
for (int c = 0; c < kOutputChannels; ++c)
EXPECT_EQ(kExpectedData[i], sink_data[i * kOutputChannels + c]);
}
}
// Checks that the producer does not misbehave when a socket failure occurs.
TEST_F(SpeechRecognitionAudioSinkTest, SyncSocketFailsSendingData) {
const uint32 kBuffersPerNotification = Initialize(44100, 441, 16000, 1600);
// Start with no problems on the socket.
AssertConsumedBuffers(0U);
CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
// A failure occurs (the socket cannot send), so the consumed count must not
// advance.
SetFailureModeOnForeignSocket(true);
CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
}
// A very unlikely scenario in which the peer does not synchronize for a long
// time (e.g. 300 ms), which results in dropping cached buffers and
// restarting. We check that the FIFO does not overflow and that the producer
// is able to resume.
TEST_F(SpeechRecognitionAudioSinkTest, RepeatedSynchronizationLag) {
const uint32 kBuffersPerNotification = Initialize(44100, 441, 16000, 1600);
// Start with no synchronization problems.
AssertConsumedBuffers(0U);
CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
// Consumer gets out of sync.
recognizer()->SimulateResponsiveness(false);
CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
// Consumer recovers.
recognizer()->SimulateResponsiveness(true);
CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 2U);
CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 3U);
CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 4U);
}
// Checks that an OnStoppedCallback is issued when the track is stopped.
TEST_F(SpeechRecognitionAudioSinkTest, OnReadyStateChangedOccurred) {
const uint32 kBuffersPerNotification = Initialize(44100, 441, 16000, 1600);
AssertConsumedBuffers(0U);
CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
EXPECT_CALL(*this, StoppedCallback()).Times(1);
native_track()->Stop();
CaptureAudioAndAssertConsumedBuffers(kBuffersPerNotification, 1U);
}
} // namespace content