// blob: 3afce0a68fe0d8267fdf075d2274872cf0488f9f
/*
* Copyright (C) 2012 Intel Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of
* its contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "platform/audio/DirectConvolver.h"
#if OS(MACOSX)
#include <Accelerate/Accelerate.h>
#endif
#include "platform/audio/VectorMath.h"
#include "platform/wtf/CPU.h"
#if (CPU(X86) || CPU(X86_64)) && !OS(MACOSX)
#include <emmintrin.h>
#endif
namespace blink {
using namespace VectorMath;
// Creates a convolver that accepts exactly |input_block_size| frames per
// Process() call. The internal buffer holds two blocks: Process() copies the
// current block into the second half and, when it finishes, moves that block
// into the first half so the next call can read it as history.
DirectConvolver::DirectConvolver(size_t input_block_size)
    : input_block_size_(input_block_size),
      buffer_(2 * input_block_size) {}
#if !OS(MACOSX)
namespace {

// Scalar direct-form convolution for a kernel whose length is known at compile
// time, so the compiler is free to fully unroll the inner loop.
//
// For every output index i in [0, frames_to_process):
//   dest_p[i] = sum over j in [0, kKernelSize) of input_p[i - j] * kernel_p[j]
//
// |input_p| must have at least kKernelSize - 1 readable samples in front of it
// (the history of the previous block). Products are accumulated into a single
// float in increasing j order.
template <size_t kKernelSize>
void DirectConvolveFixedKernelSize(const float* input_p,
                                   const float* kernel_p,
                                   float* dest_p,
                                   size_t frames_to_process) {
  for (size_t i = 0; i < frames_to_process; ++i) {
    // Walk backwards from the newest sample with explicit pointer arithmetic
    // instead of indexing with (i - j), which would wrap around as an
    // unsigned value whenever j > i.
    const float* newest_sample_p = input_p + i;
    float sum = 0;
    for (size_t j = 0; j < kKernelSize; ++j)
      sum += *(newest_sample_p - j) * kernel_p[j];
    dest_p[i] = sum;
  }
}

// Same computation as DirectConvolveFixedKernelSize(), for a kernel length
// that is only known at run time.
void DirectConvolveAnyKernelSize(const float* input_p,
                                 const float* kernel_p,
                                 float* dest_p,
                                 size_t frames_to_process,
                                 size_t kernel_size) {
  for (size_t i = 0; i < frames_to_process; ++i) {
    const float* newest_sample_p = input_p + i;
    float sum = 0;
    for (size_t j = 0; j < kernel_size; ++j)
      sum += *(newest_sample_p - j) * kernel_p[j];
    dest_p[i] = sum;
  }
}

// Scalar fallback convolution. Kernel sizes 32, 64 and 128 are dispatched to
// instantiations with a constant trip count; every other size uses the
// generic loop. The dispatch happens once per block rather than once per
// output sample.
// FIXME: This can be further optimized to avoid pipeline stalls, e.g. by
// maintaining 4 separate sums (note that this changes the summation order).
// FIXME: SSE optimization may be applied here.
void DirectConvolveScalar(const float* input_p,
                          const float* kernel_p,
                          float* dest_p,
                          size_t frames_to_process,
                          size_t kernel_size) {
  switch (kernel_size) {
    case 32:
      DirectConvolveFixedKernelSize<32>(input_p, kernel_p, dest_p,
                                        frames_to_process);
      break;
    case 64:
      DirectConvolveFixedKernelSize<64>(input_p, kernel_p, dest_p,
                                        frames_to_process);
      break;
    case 128:
      DirectConvolveFixedKernelSize<128>(input_p, kernel_p, dest_p,
                                         frames_to_process);
      break;
    default:
      DirectConvolveAnyKernelSize(input_p, kernel_p, dest_p, frames_to_process,
                                  kernel_size);
      break;
  }
}

}  // namespace
#endif  // !OS(MACOSX)

// Convolves one block of |frames_to_process| samples from |source_p| with
// |convolution_kernel| and writes |frames_to_process| output samples to
// |dest_p|.
//
// |frames_to_process| must equal the block size given to the constructor and
// the kernel must not be longer than that block size; if either requirement
// is violated, or any of the involved pointers is null, the call DCHECKs and
// returns without touching |dest_p| or the internal history.
//
// The internal buffer holds the previous block in its first half and the
// current block in its second half, so the convolution can read up to
// kernel_size - 1 samples of history in front of the current block.
void DirectConvolver::Process(AudioFloatArray* convolution_kernel,
                              const float* source_p,
                              float* dest_p,
                              size_t frames_to_process) {
  DCHECK_EQ(frames_to_process, input_block_size_);
  if (frames_to_process != input_block_size_)
    return;

  // Only kernel_size <= input_block_size_ is supported: the history kept in
  // the first half of |buffer_| is exactly one block long.
  const size_t kernel_size = convolution_kernel->size();
  DCHECK_LE(kernel_size, input_block_size_);
  if (kernel_size > input_block_size_)
    return;

  float* kernel_p = convolution_kernel->Data();

  // Sanity check
  bool is_copy_good = kernel_p && source_p && dest_p && buffer_.Data();
  DCHECK(is_copy_good);
  if (!is_copy_good)
    return;

  float* input_p = buffer_.Data() + input_block_size_;

  // Copy samples to 2nd half of input buffer.
  memcpy(input_p, source_p, sizeof(float) * frames_to_process);

#if OS(MACOSX)
#if CPU(X86)
  // NOTE(review): conv() is assumed to be the legacy 32-bit counterpart of
  // vDSP_conv() taking the same arguments - confirm against the SDK in use.
  conv(input_p - kernel_size + 1, 1, kernel_p + kernel_size - 1, -1, dest_p, 1,
       frames_to_process, kernel_size);
#else
  // The kernel is walked backwards (stride -1 from its last element) while
  // the input is walked forwards starting kernel_size - 1 samples before the
  // current block.
  vDSP_conv(input_p - kernel_size + 1, 1, kernel_p + kernel_size - 1, -1,
            dest_p, 1, frames_to_process, kernel_size);
#endif  // CPU(X86)
#else
#if CPU(X86) || CPU(X86_64)
  // Convolution using SSE2. Currently only do this if both |kernel_size| and
  // |frames_to_process| are multiples of 4. If not, use the straightforward
  // scalar loop below.
  if ((kernel_size % 4 == 0) && (frames_to_process % 4 == 0)) {
    // AudioFloatArray's are always aligned on at least a 16-byte boundary.
    // NOTE(review): this scratch array is constructed on every call; consider
    // caching the expanded kernel if construction shows up in profiles.
    AudioFloatArray kernel_buffer(4 * kernel_size);
    __m128* kernel_reversed = reinterpret_cast<__m128*>(kernel_buffer.Data());

    // Reverse the kernel and repeat each value across a vector
    for (size_t i = 0; i < kernel_size; ++i) {
      kernel_reversed[i] = _mm_set1_ps(kernel_p[kernel_size - i - 1]);
    }

    float* input_start_p = input_p - kernel_size + 1;

    // Do convolution with 4 inputs at a time.
    for (size_t i = 0; i < frames_to_process; i += 4) {
      __m128 convolution_sum = _mm_setzero_ps();

      // |kernel_size| is a multiple of 4 so we can unroll the loop by 4,
      // manually.
      for (size_t k = 0; k < kernel_size; k += 4) {
        size_t data_offset = i + k;

        for (size_t m = 0; m < 4; ++m) {
          // Unaligned load: the window start is not 16-byte aligned in
          // general.
          __m128 source_block = _mm_loadu_ps(input_start_p + data_offset + m);
          __m128 product = _mm_mul_ps(kernel_reversed[k + m], source_block);
          convolution_sum = _mm_add_ps(convolution_sum, product);
        }
      }
      _mm_storeu_ps(dest_p + i, convolution_sum);
    }
  } else {
#endif
    DirectConvolveScalar(input_p, kernel_p, dest_p, frames_to_process,
                         kernel_size);
#if CPU(X86) || CPU(X86_64)
  }
#endif
#endif  // OS(MACOSX)

  // Copy 2nd half of input buffer to 1st half.
  memcpy(buffer_.Data(), input_p, sizeof(float) * frames_to_process);
}
void DirectConvolver::Reset() {
buffer_.Zero();
}
} // namespace blink