blob: f5232627189d87037a4699ba2ad60ed209a15ba9 [file] [log] [blame]
/*
* Copyright (C) 2011 Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of
* its contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "third_party/blink/renderer/platform/audio/sinc_resampler.h"
#include "build/build_config.h"
#include "third_party/blink/renderer/platform/audio/audio_bus.h"
#include "third_party/blink/renderer/platform/wtf/math_extras.h"
#if defined(ARCH_CPU_X86_FAMILY)
#include <emmintrin.h>
#endif
// Input buffer layout, dividing the total buffer into regions (r0 - r5):
//
// |----------------|-----------------------------------------|----------------|
//
//                                   blockSize + kernelSize / 2
//                   <--------------------------------------------------------->
//                                               r0
//
//   kernelSize / 2    kernelSize / 2          kernelSize / 2    kernelSize / 2
// <---------------> <--------------->       <---------------> <--------------->
//         r1                r2                      r3                r4
//
//                                                    blockSize
//                                    <---------------------------------------->
//                                                        r5
// The Algorithm:
//
// 1) Consume input frames into r0 (r1 is zero-initialized).
// 2) Position kernel centered at start of r0 (r2) and generate output frames
// until kernel is centered at start of r4, or we've finished generating
// all the output frames.
// 3) Copy r3 to r1 and r4 to r2.
// 4) Consume input frames into r5 (zero-pad if we run out of input).
// 5) Goto (2) until all of input is consumed.
//
// Note: we're glossing over how the sub-sample handling works with
// virtual_source_index_, etc.
namespace blink {
// Constructs a resampler and precomputes its table of windowed-sinc kernels.
//
// |scale_factor| is the amount by which virtual_source_index_ advances for
// each generated output frame (see Process()); values above 1.0 also lower
// the low-pass cutoff chosen in InitializeKernel().
// |kernel_size| is the number of taps in each kernel. Process() refuses to
// run unless it is even and smaller than block_size_.
// |number_of_kernel_offsets| is the number of sub-sample steps between 0.0
// and 1.0 for which kernels are generated; one additional kernel is stored
// for the offset 1.0 itself.
SincResampler::SincResampler(double scale_factor,
unsigned kernel_size,
unsigned number_of_kernel_offsets)
: scale_factor_(scale_factor),
kernel_size_(kernel_size),
number_of_kernel_offsets_(number_of_kernel_offsets),
// Room for one kernel per offset index in [0, number_of_kernel_offsets_].
// NOTE(review): this and input_buffer_ below read already-initialized
// members, which presumably relies on kernel_size_,
// number_of_kernel_offsets_ and block_size_ being declared before them in
// the class -- confirm against the header.
kernel_storage_(kernel_size_ * (number_of_kernel_offsets_ + 1)),
virtual_source_index_(0),
block_size_(512),
// See input buffer layout above.
input_buffer_(block_size_ + kernel_size_),
source_(nullptr),
source_frames_available_(0),
source_provider_(nullptr),
is_buffer_primed_(false) {
// Fill kernel_storage_ before the first Process() call reads from it.
InitializeKernel();
}
// Populates kernel_storage_ with (number_of_kernel_offsets_ + 1) kernels of
// kernel_size_ taps each. Kernel |k| is a sinc() low-pass filter shifted by
// the sub-sample offset k / number_of_kernel_offsets_ and shaped by a
// Blackman window shifted by the same offset.
void SincResampler::InitializeKernel() {
  // Coefficients of the Blackman window.
  const double alpha = 0.16;
  const double a0 = 0.5 * (1.0 - alpha);
  const double a1 = 0.5;
  const double a2 = 0.5 * alpha;

  // The sinc scale factor is essentially the normalized cutoff frequency of
  // the low-pass filter: it only drops below 1.0 when scale_factor_ exceeds
  // 1.0.
  double sinc_scale_factor = 1.0;
  if (scale_factor_ > 1.0)
    sinc_scale_factor = 1.0 / scale_factor_;

  // An ideal sinc is a brick-wall filter, but windowing it widens the
  // transition band between pass and stop. Pull the cutoff down a little to
  // limit aliasing at the very top of the spectrum.
  // FIXME: this value is empirical and to be more exact should vary depending
  // on kernel_size_.
  sinc_scale_factor *= 0.9;

  const int tap_count = kernel_size_;
  const int center_tap = tap_count / 2;

  // One windowed sinc() kernel per sub-sample offset, covering 0.0 to 1.0
  // inclusive.
  for (unsigned kernel_index = 0; kernel_index <= number_of_kernel_offsets_;
       ++kernel_index) {
    const double subsample_offset =
        static_cast<double>(kernel_index) / number_of_kernel_offsets_;
    const unsigned kernel_base = kernel_index * kernel_size_;

    for (int tap = 0; tap < tap_count; ++tap) {
      // sinc() evaluated at this tap, shifted by the sub-sample offset.
      const double s = sinc_scale_factor * kPiDouble *
                       (tap - center_tap - subsample_offset);
      double sinc;
      if (s == 0.0)
        sinc = 1.0;
      else
        sinc = std::sin(s) / s;
      sinc *= sinc_scale_factor;

      // Blackman window evaluated with the same offset as the sinc().
      const double x = (tap - subsample_offset) / tap_count;
      const double window = a0 - a1 * std::cos(kTwoPiDouble * x) +
                            a2 * std::cos(kTwoPiDouble * 2.0 * x);

      // Store the windowed tap in this kernel's slot.
      kernel_storage_[kernel_base + tap] = sinc * window;
    }
  }
}
void SincResampler::ConsumeSource(float* buffer,
unsigned number_of_source_frames) {
DCHECK(source_provider_);
if (!source_provider_)
return;
// Wrap the provided buffer by an AudioBus for use by the source provider.
scoped_refptr<AudioBus> bus =
AudioBus::Create(1, number_of_source_frames, false);
// FIXME: Find a way to make the following const-correct:
bus->SetChannelMemory(0, buffer, number_of_source_frames);
source_provider_->ProvideInput(bus.get(), number_of_source_frames);
}
namespace {

// An AudioSourceProvider that hands out frames from a caller-supplied float
// array. The array is not copied or owned; each ProvideInput() call advances
// through it, and requests past its end are filled with zeros.
class BufferSourceProvider final : public AudioSourceProvider {
 public:
  BufferSourceProvider(const float* source, uint32_t number_of_source_frames)
      : source_(source), source_frames_available_(number_of_source_frames) {}

  // Writes |frames_to_process| frames into channel 0 of |bus|: as many as are
  // still available from the in-memory buffer, followed by silence.
  void ProvideInput(AudioBus* bus, uint32_t frames_to_process) override {
    DCHECK(source_);
    DCHECK(bus);
    if (!(source_ && bus))
      return;

    float* const out = bus->Channel(0)->MutableData();

    // Never read more than what is left in the source buffer.
    const uint32_t copied =
        std::min(frames_to_process, source_frames_available_);
    memcpy(out, source_, copied * sizeof(float));

    // Whatever could not be served from the source becomes zeros.
    const uint32_t padding = frames_to_process - copied;
    if (padding > 0)
      memset(out + copied, 0, padding * sizeof(float));

    // Step past the frames that were just handed out.
    source_ += copied;
    source_frames_available_ -= copied;
  }

 private:
  const float* source_;
  uint32_t source_frames_available_;
};

}  // namespace
void SincResampler::Process(const float* source,
float* destination,
unsigned number_of_source_frames) {
// Resample an in-memory buffer using an AudioSourceProvider.
BufferSourceProvider source_provider(source, number_of_source_frames);
unsigned number_of_destination_frames =
static_cast<unsigned>(number_of_source_frames / scale_factor_);
unsigned remaining = number_of_destination_frames;
while (remaining) {
unsigned frames_this_time = std::min(remaining, block_size_);
Process(&source_provider, destination, frames_this_time);
destination += frames_this_time;
remaining -= frames_this_time;
}
}
// Produces |frames_to_process| resampled frames into |destination|, pulling
// input from |source_provider| through ConsumeSource() whenever the internal
// input buffer has to be primed or refilled.
//
// Returns without writing anything if |source_provider| is null, if
// block_size_ is not larger than kernel_size_, if input_buffer_ is smaller
// than block_size_ + kernel_size_, or if kernel_size_ is odd.
//
// virtual_source_index_, is_buffer_primed_ and the contents of input_buffer_
// are not reset here, so they carry over from one call to the next.
void SincResampler::Process(AudioSourceProvider* source_provider,
float* destination,
uint32_t frames_to_process) {
// Preconditions for the region arithmetic below (see the buffer layout
// diagram at the top of the file).
bool is_good = source_provider && block_size_ > kernel_size_ &&
input_buffer_.size() >= block_size_ + kernel_size_ &&
!(kernel_size_ % 2);
DCHECK(is_good);
if (!is_good)
return;
// ConsumeSource() reads from this member.
source_provider_ = source_provider;
unsigned number_of_destination_frames = frames_to_process;
// Setup various region pointers in the buffer (see diagram above).
float* r0 = input_buffer_.Data() + kernel_size_ / 2;
float* r1 = input_buffer_.Data();
float* r2 = r0;
float* r3 = r0 + block_size_ - kernel_size_ / 2;
float* r4 = r0 + block_size_;
float* r5 = r0 + kernel_size_ / 2;
// Step (1)
// Prime the input buffer at the start of the input stream.
if (!is_buffer_primed_) {
ConsumeSource(r0, block_size_ + kernel_size_ / 2);
is_buffer_primed_ = true;
}
// Step (2)
// Each pass of the outer loop works through one block of buffered input; the
// inner loop emits one output frame per iteration.
while (number_of_destination_frames) {
while (virtual_source_index_ < block_size_) {
// virtual_source_index_ lies in between two kernel offsets so figure out
// what they are.
int source_index_i = static_cast<int>(virtual_source_index_);
double subsample_remainder = virtual_source_index_ - source_index_i;
double virtual_offset_index =
subsample_remainder * number_of_kernel_offsets_;
int offset_index = static_cast<int>(virtual_offset_index);
// k1 and k2 are the two adjacent precomputed kernels in kernel_storage_.
float* k1 = kernel_storage_.Data() + offset_index * kernel_size_;
float* k2 = k1 + kernel_size_;
// Initialize input pointer based on quantized virtual_source_index_.
float* input_p = r1 + source_index_i;
// We'll compute "convolutions" for the two kernels which straddle
// virtual_source_index_.
float sum1 = 0;
float sum2 = 0;
// Figure out how much to weight each kernel's "convolution".
double kernel_interpolation_factor = virtual_offset_index - offset_index;
// Generate a single output sample.
int n = kernel_size_;
// Multiplies one input frame by the current tap of each kernel, accumulates
// into sum1/sum2, and advances input_p, k1 and k2 by one.
#define CONVOLVE_ONE_SAMPLE() \
do { \
input = *input_p++; \
sum1 += input * *k1; \
sum2 += input * *k2; \
++k1; \
++k2; \
} while (0)
{
float input;
#if defined(ARCH_CPU_X86_FAMILY)
// If the input_p address is not 16-byte aligned, the first several
// frames (at most three) should be processed separately.
while ((reinterpret_cast<uintptr_t>(input_p) & 0x0F) && n) {
CONVOLVE_ONE_SAMPLE();
n--;
}
// Now input_p is aligned, so start applying SSE. end_p marks the end of the
// last whole group of four remaining frames.
float* end_p = input_p + n - n % 4;
__m128 m_input;
__m128 m_k1;
__m128 m_k2;
__m128 mul1;
__m128 mul2;
__m128 sums1 = _mm_setzero_ps();
__m128 sums2 = _mm_setzero_ps();
// The kernel pointers may or may not be 16-byte aligned at this point, so
// pick aligned or unaligned loads for each of them independently.
bool k1_aligned = !(reinterpret_cast<uintptr_t>(k1) & 0x0F);
bool k2_aligned = !(reinterpret_cast<uintptr_t>(k2) & 0x0F);
// LOAD_DATA(l1, l2) loads four input frames with an aligned load and four
// taps from each kernel using _mm_<l1>_ps / _mm_<l2>_ps respectively.
// CONVOLVE_4_SAMPLES() multiplies and accumulates them lane-wise into
// sums1/sums2, then advances all three pointers by four.
// NOTE(review): unlike CONVOLVE_ONE_SAMPLE, neither of these two macros is
// #undef'd after use in the code visible here.
#define LOAD_DATA(l1, l2) \
do { \
m_input = _mm_load_ps(input_p); \
m_k1 = _mm_##l1##_ps(k1); \
m_k2 = _mm_##l2##_ps(k2); \
} while (0)
#define CONVOLVE_4_SAMPLES() \
do { \
mul1 = _mm_mul_ps(m_input, m_k1); \
mul2 = _mm_mul_ps(m_input, m_k2); \
sums1 = _mm_add_ps(sums1, mul1); \
sums2 = _mm_add_ps(sums2, mul2); \
input_p += 4; \
k1 += 4; \
k2 += 4; \
} while (0)
if (k1_aligned && k2_aligned) { // both aligned
while (input_p < end_p) {
LOAD_DATA(load, load);
CONVOLVE_4_SAMPLES();
}
} else if (!k1_aligned && k2_aligned) { // only k2 aligned
while (input_p < end_p) {
LOAD_DATA(loadu, load);
CONVOLVE_4_SAMPLES();
}
} else if (k1_aligned && !k2_aligned) { // only k1 aligned
while (input_p < end_p) {
LOAD_DATA(load, loadu);
CONVOLVE_4_SAMPLES();
}
} else { // both non-aligned
while (input_p < end_p) {
LOAD_DATA(loadu, loadu);
CONVOLVE_4_SAMPLES();
}
}
// Summarize the SSE results to sum1 and sum2 by adding up the four lanes of
// each accumulator.
float* group_sum_p = reinterpret_cast<float*>(&sums1);
sum1 +=
group_sum_p[0] + group_sum_p[1] + group_sum_p[2] + group_sum_p[3];
group_sum_p = reinterpret_cast<float*>(&sums2);
sum2 +=
group_sum_p[0] + group_sum_p[1] + group_sum_p[2] + group_sum_p[3];
// Handle the frames (at most three) left over after the groups of four.
n %= 4;
while (n) {
CONVOLVE_ONE_SAMPLE();
n--;
}
#else
// FIXME: add ARM NEON optimizations for the following. The scalar
// code-path can probably also be optimized better.
// Optimize size 32 and size 64 kernels by unrolling the while loop.
// A 20 - 30% speed improvement was measured in some cases by using this
// approach.
if (n == 32) {
CONVOLVE_ONE_SAMPLE(); // 1
CONVOLVE_ONE_SAMPLE(); // 2
CONVOLVE_ONE_SAMPLE(); // 3
CONVOLVE_ONE_SAMPLE(); // 4
CONVOLVE_ONE_SAMPLE(); // 5
CONVOLVE_ONE_SAMPLE(); // 6
CONVOLVE_ONE_SAMPLE(); // 7
CONVOLVE_ONE_SAMPLE(); // 8
CONVOLVE_ONE_SAMPLE(); // 9
CONVOLVE_ONE_SAMPLE(); // 10
CONVOLVE_ONE_SAMPLE(); // 11
CONVOLVE_ONE_SAMPLE(); // 12
CONVOLVE_ONE_SAMPLE(); // 13
CONVOLVE_ONE_SAMPLE(); // 14
CONVOLVE_ONE_SAMPLE(); // 15
CONVOLVE_ONE_SAMPLE(); // 16
CONVOLVE_ONE_SAMPLE(); // 17
CONVOLVE_ONE_SAMPLE(); // 18
CONVOLVE_ONE_SAMPLE(); // 19
CONVOLVE_ONE_SAMPLE(); // 20
CONVOLVE_ONE_SAMPLE(); // 21
CONVOLVE_ONE_SAMPLE(); // 22
CONVOLVE_ONE_SAMPLE(); // 23
CONVOLVE_ONE_SAMPLE(); // 24
CONVOLVE_ONE_SAMPLE(); // 25
CONVOLVE_ONE_SAMPLE(); // 26
CONVOLVE_ONE_SAMPLE(); // 27
CONVOLVE_ONE_SAMPLE(); // 28
CONVOLVE_ONE_SAMPLE(); // 29
CONVOLVE_ONE_SAMPLE(); // 30
CONVOLVE_ONE_SAMPLE(); // 31
CONVOLVE_ONE_SAMPLE(); // 32
} else if (n == 64) {
CONVOLVE_ONE_SAMPLE(); // 1
CONVOLVE_ONE_SAMPLE(); // 2
CONVOLVE_ONE_SAMPLE(); // 3
CONVOLVE_ONE_SAMPLE(); // 4
CONVOLVE_ONE_SAMPLE(); // 5
CONVOLVE_ONE_SAMPLE(); // 6
CONVOLVE_ONE_SAMPLE(); // 7
CONVOLVE_ONE_SAMPLE(); // 8
CONVOLVE_ONE_SAMPLE(); // 9
CONVOLVE_ONE_SAMPLE(); // 10
CONVOLVE_ONE_SAMPLE(); // 11
CONVOLVE_ONE_SAMPLE(); // 12
CONVOLVE_ONE_SAMPLE(); // 13
CONVOLVE_ONE_SAMPLE(); // 14
CONVOLVE_ONE_SAMPLE(); // 15
CONVOLVE_ONE_SAMPLE(); // 16
CONVOLVE_ONE_SAMPLE(); // 17
CONVOLVE_ONE_SAMPLE(); // 18
CONVOLVE_ONE_SAMPLE(); // 19
CONVOLVE_ONE_SAMPLE(); // 20
CONVOLVE_ONE_SAMPLE(); // 21
CONVOLVE_ONE_SAMPLE(); // 22
CONVOLVE_ONE_SAMPLE(); // 23
CONVOLVE_ONE_SAMPLE(); // 24
CONVOLVE_ONE_SAMPLE(); // 25
CONVOLVE_ONE_SAMPLE(); // 26
CONVOLVE_ONE_SAMPLE(); // 27
CONVOLVE_ONE_SAMPLE(); // 28
CONVOLVE_ONE_SAMPLE(); // 29
CONVOLVE_ONE_SAMPLE(); // 30
CONVOLVE_ONE_SAMPLE(); // 31
CONVOLVE_ONE_SAMPLE(); // 32
CONVOLVE_ONE_SAMPLE(); // 33
CONVOLVE_ONE_SAMPLE(); // 34
CONVOLVE_ONE_SAMPLE(); // 35
CONVOLVE_ONE_SAMPLE(); // 36
CONVOLVE_ONE_SAMPLE(); // 37
CONVOLVE_ONE_SAMPLE(); // 38
CONVOLVE_ONE_SAMPLE(); // 39
CONVOLVE_ONE_SAMPLE(); // 40
CONVOLVE_ONE_SAMPLE(); // 41
CONVOLVE_ONE_SAMPLE(); // 42
CONVOLVE_ONE_SAMPLE(); // 43
CONVOLVE_ONE_SAMPLE(); // 44
CONVOLVE_ONE_SAMPLE(); // 45
CONVOLVE_ONE_SAMPLE(); // 46
CONVOLVE_ONE_SAMPLE(); // 47
CONVOLVE_ONE_SAMPLE(); // 48
CONVOLVE_ONE_SAMPLE(); // 49
CONVOLVE_ONE_SAMPLE(); // 50
CONVOLVE_ONE_SAMPLE(); // 51
CONVOLVE_ONE_SAMPLE(); // 52
CONVOLVE_ONE_SAMPLE(); // 53
CONVOLVE_ONE_SAMPLE(); // 54
CONVOLVE_ONE_SAMPLE(); // 55
CONVOLVE_ONE_SAMPLE(); // 56
CONVOLVE_ONE_SAMPLE(); // 57
CONVOLVE_ONE_SAMPLE(); // 58
CONVOLVE_ONE_SAMPLE(); // 59
CONVOLVE_ONE_SAMPLE(); // 60
CONVOLVE_ONE_SAMPLE(); // 61
CONVOLVE_ONE_SAMPLE(); // 62
CONVOLVE_ONE_SAMPLE(); // 63
CONVOLVE_ONE_SAMPLE(); // 64
} else {
while (n--) {
// Non-optimized using actual while loop.
CONVOLVE_ONE_SAMPLE();
}
}
#endif
}
#undef CONVOLVE_ONE_SAMPLE
// Linearly interpolate the two "convolutions".
double result = (1.0 - kernel_interpolation_factor) * sum1 +
kernel_interpolation_factor * sum2;
*destination++ = result;
// Advance the virtual index.
virtual_source_index_ += scale_factor_;
--number_of_destination_frames;
// Done as soon as the requested number of frames has been written; the
// buffer and virtual_source_index_ are left as they are.
if (!number_of_destination_frames)
return;
}
// Wrap back around to the start.
virtual_source_index_ -= block_size_;
// Step (3) Copy r3 to r1 and r4 to r2.
// This wraps the last input frames back to the start of the buffer.
memcpy(r1, r3, sizeof(float) * (kernel_size_ / 2));
memcpy(r2, r4, sizeof(float) * (kernel_size_ / 2));
// Step (4)
// Refresh the buffer with more input.
ConsumeSource(r5, block_size_);
}
}
} // namespace blink