third_party/WebKit/Source/platform/audio/DirectConvolver.cpp - chromium/src.git - Git at Google

 /*
  * Copyright (C) 2012 Intel Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1.  Redistributions of source code must retain the above copyright
  *     notice, this list of conditions and the following disclaimer.
  * 2.  Redistributions in binary form must reproduce the above copyright
  *     notice, this list of conditions and the following disclaimer in the
  *     documentation and/or other materials provided with the distribution.
  * 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
  *     its contributors may be used to endorse or promote products derived
  *     from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include "platform/audio/DirectConvolver.h"

 #if OS(MACOSX)
 #include <Accelerate/Accelerate.h>
 #endif

 #include "platform/audio/VectorMath.h"
 #include "platform/wtf/CPU.h"

 #if (CPU(X86) || CPU(X86_64)) && !OS(MACOSX)
 #include <emmintrin.h>
 #endif

 namespace blink {

 using namespace VectorMath;

 // Creates a convolver that handles |input_block_size| frames per Process()
 // call. |buffer_| is sized for two such blocks: Process() places the current
 // input in the second half and keeps the preceding block in the first half.
 DirectConvolver::DirectConvolver(size_t input_block_size)
     : input_block_size_(input_block_size),
       buffer_(2 * input_block_size) {}

 // Convolves one block of |frames_to_process| samples read from |source_p|
 // with |convolution_kernel| and writes |frames_to_process| results to
 // |dest_p|.
 //
 // |buffer_| holds two blocks back to back. The current input is copied into
 // the second half; the first half holds the input of the preceding call, so
 // kernel taps that reach before the start of the current block read the tail
 // of that earlier block. At the end of the call the current block is copied
 // into the first half for use by the next call.
 //
 // Returns without writing to |dest_p| (after the corresponding DCHECK) when:
 //   - |frames_to_process| differs from |input_block_size_|,
 //   - the kernel holds more than |input_block_size_| taps, or
 //   - the kernel, source, destination or internal buffer pointer is null.
 void DirectConvolver::Process(AudioFloatArray* convolution_kernel,
                               const float* source_p,
                               float* dest_p,
                               size_t frames_to_process) {
   DCHECK_EQ(frames_to_process, input_block_size_);
   if (frames_to_process != input_block_size_)
     return;

   // Only support kernel_size <= input_block_size_: the history kept in
   // |buffer_| is exactly one block long.
   size_t kernel_size = convolution_kernel->size();
   DCHECK_LE(kernel_size, input_block_size_);
   if (kernel_size > input_block_size_)
     return;

   float* kernel_p = convolution_kernel->Data();

   // Sanity check: every pointer dereferenced below must be non-null.
   bool is_copy_good = kernel_p && source_p && dest_p && buffer_.Data();
   DCHECK(is_copy_good);
   if (!is_copy_good)
     return;

   // |input_p| addresses the second half of |buffer_|.
   float* input_p = buffer_.Data() + input_block_size_;

   // Copy samples to 2nd half of input buffer.
   memcpy(input_p, source_p, sizeof(float) * frames_to_process);

 #if OS(MACOSX)
   // Both calls below pass the signal starting |kernel_size - 1| samples before
   // the current block (stride 1) and the kernel starting at its last tap
   // (stride -1).
 #if CPU(X86)
   // 32-bit x86 Mac build: same operands as the vDSP_conv() call in the #else
   // branch.
   // NOTE(review): |conv| is presumably the legacy (un-prefixed) Accelerate
   // name for vDSP_conv that 32-bit builds see - confirm against the SDK.
   conv(input_p - kernel_size + 1, 1, kernel_p + kernel_size - 1, -1, dest_p, 1,
        frames_to_process, kernel_size);
 #else
   vDSP_conv(input_p - kernel_size + 1, 1, kernel_p + kernel_size - 1, -1,
             dest_p, 1, frames_to_process, kernel_size);
 #endif  // CPU(X86)
 #else
   size_t i = 0;
 #if CPU(X86) || CPU(X86_64)
   // Convolution using SSE2. Currently only do this if both |kernel_size| and
   // |frames_to_process| are multiples of 4. If not, use the straightforward
   // loop below.

   if ((kernel_size % 4 == 0) && (frames_to_process % 4 == 0)) {
     // AudioFloatArray's are always aligned on at least a 16-byte boundary.
     // Note that this scratch array is constructed on every call.
     AudioFloatArray kernel_buffer(4 * kernel_size);
     __m128* kernel_reversed = reinterpret_cast<__m128*>(kernel_buffer.Data());

     // Reverse the kernel and repeat each value across a vector, so that
     // kernel_reversed[t] holds tap (kernel_size - 1 - t) in all four lanes.
     for (i = 0; i < kernel_size; ++i) {
       kernel_reversed[i] = _mm_set1_ps(kernel_p[kernel_size - i - 1]);
     }

     // Oldest input sample that contributes to output sample 0.
     float* input_start_p = input_p - kernel_size + 1;

     // Do convolution with 4 inputs at a time: each pass produces
     // dest_p[i] .. dest_p[i + 3].
     for (i = 0; i < frames_to_process; i += 4) {
       __m128 convolution_sum = _mm_setzero_ps();

       // |kernel_size| is a multiple of 4 so we can unroll the loop by 4,
       // manually.
       for (size_t k = 0; k < kernel_size; k += 4) {
         size_t data_offset = i + k;

         for (size_t m = 0; m < 4; ++m) {
           // Unaligned load of four consecutive input samples, scaled by one
           // broadcast kernel tap and accumulated lane by lane.
           __m128 source_block =
               _mm_loadu_ps(input_start_p + data_offset + m);
           __m128 product = _mm_mul_ps(kernel_reversed[k + m], source_block);
           convolution_sum = _mm_add_ps(convolution_sum, product);
         }
       }
       _mm_storeu_ps(dest_p + i, convolution_sum);
     }
   } else {
 #endif

 // FIXME: The macro can be further optimized to avoid pipeline stalls. One
 // possibility is to maintain 4 separate sums and change the macro to
 // CONVOLVE_FOUR_SAMPLES.
 //
 // Adds the contribution of kernel tap |j| to output sample |i| and advances
 // |j|. For j > i the index reaches back before |input_p|, i.e. into the first
 // half of |buffer_|.
 #define CONVOLVE_ONE_SAMPLE              \
   do {                                   \
     sum += input_p[i - j] * kernel_p[j]; \
     j++;                                 \
   } while (0)

 // Fully unrolled runs of CONVOLVE_ONE_SAMPLE. These expand to the same
 // statement sequence as writing CONVOLVE_ONE_SAMPLE out 8 or 32 times.
 #define CONVOLVE_8_SAMPLES \
   do {                     \
     CONVOLVE_ONE_SAMPLE;   \
     CONVOLVE_ONE_SAMPLE;   \
     CONVOLVE_ONE_SAMPLE;   \
     CONVOLVE_ONE_SAMPLE;   \
     CONVOLVE_ONE_SAMPLE;   \
     CONVOLVE_ONE_SAMPLE;   \
     CONVOLVE_ONE_SAMPLE;   \
     CONVOLVE_ONE_SAMPLE;   \
   } while (0)

 #define CONVOLVE_32_SAMPLES \
   do {                      \
     CONVOLVE_8_SAMPLES;     \
     CONVOLVE_8_SAMPLES;     \
     CONVOLVE_8_SAMPLES;     \
     CONVOLVE_8_SAMPLES;     \
   } while (0)

     while (i < frames_to_process) {
       size_t j = 0;
       float sum = 0;

       // FIXME: SSE optimization may be applied here.
       if (kernel_size == 32) {
         CONVOLVE_32_SAMPLES;  // 1 - 32
       } else if (kernel_size == 64) {
         CONVOLVE_32_SAMPLES;  // 1 - 32
         CONVOLVE_32_SAMPLES;  // 33 - 64
       } else if (kernel_size == 128) {
         CONVOLVE_32_SAMPLES;  // 1 - 32
         CONVOLVE_32_SAMPLES;  // 33 - 64
         CONVOLVE_32_SAMPLES;  // 65 - 96
         CONVOLVE_32_SAMPLES;  // 97 - 128
       } else {
         while (j < kernel_size) {
           // Non-optimized using actual while loop.
           CONVOLVE_ONE_SAMPLE;
         }
       }
       dest_p[i++] = sum;
     }

 #undef CONVOLVE_32_SAMPLES
 #undef CONVOLVE_8_SAMPLES
 #if CPU(X86) || CPU(X86_64)
   }
 #endif
 #endif  // OS(MACOSX)

   // Copy 2nd half of input buffer to 1st half so the next call can read this
   // block as its history.
   memcpy(buffer_.Data(), input_p, sizeof(float) * frames_to_process);
 }

 // Resets the convolver's state by calling Zero() on |buffer_|, the storage in
 // which Process() keeps the previous input block as history for the next
 // call.
 void DirectConvolver::Reset() {
   buffer_.Zero();
 }

 }  // namespace blink
	/*
	* Copyright (C) 2012 Intel Inc. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	*
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of
	* its contributors may be used to endorse or promote products derived
	* from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#include "platform/audio/DirectConvolver.h"

	#if OS(MACOSX)
	#include <Accelerate/Accelerate.h>
	#endif

	#include "platform/audio/VectorMath.h"
	#include "platform/wtf/CPU.h"

	#if (CPU(X86) \|\| CPU(X86_64)) && !OS(MACOSX)
	#include <emmintrin.h>
	#endif

	namespace blink {

	using namespace VectorMath;

	// Creates a convolver that handles |input_block_size| frames per Process()
	// call. |buffer_| is sized for two such blocks: Process() places the current
	// input in the second half and keeps the preceding block in the first half.
	DirectConvolver::DirectConvolver(size_t input_block_size)
	    : input_block_size_(input_block_size),
	      buffer_(2 * input_block_size) {}

	// Convolves one block of |frames_to_process| samples read from |source_p|
	// with |convolution_kernel| and writes |frames_to_process| results to
	// |dest_p|.
	//
	// |buffer_| holds two blocks back to back. The current input is copied into
	// the second half; the first half holds the input of the preceding call, so
	// kernel taps that reach before the start of the current block read the tail
	// of that earlier block. At the end of the call the current block is copied
	// into the first half for use by the next call.
	//
	// Returns without writing to |dest_p| (after the corresponding DCHECK) when:
	//   - |frames_to_process| differs from |input_block_size_|,
	//   - the kernel holds more than |input_block_size_| taps, or
	//   - the kernel, source, destination or internal buffer pointer is null.
	void DirectConvolver::Process(AudioFloatArray* convolution_kernel,
	                              const float* source_p,
	                              float* dest_p,
	                              size_t frames_to_process) {
	  DCHECK_EQ(frames_to_process, input_block_size_);
	  if (frames_to_process != input_block_size_)
	    return;

	  // Only support kernel_size <= input_block_size_: the history kept in
	  // |buffer_| is exactly one block long.
	  size_t kernel_size = convolution_kernel->size();
	  DCHECK_LE(kernel_size, input_block_size_);
	  if (kernel_size > input_block_size_)
	    return;

	  float* kernel_p = convolution_kernel->Data();

	  // Sanity check: every pointer dereferenced below must be non-null.
	  bool is_copy_good = kernel_p && source_p && dest_p && buffer_.Data();
	  DCHECK(is_copy_good);
	  if (!is_copy_good)
	    return;

	  // |input_p| addresses the second half of |buffer_|.
	  float* input_p = buffer_.Data() + input_block_size_;

	  // Copy samples to 2nd half of input buffer.
	  memcpy(input_p, source_p, sizeof(float) * frames_to_process);

	#if OS(MACOSX)
	  // Both calls below pass the signal starting |kernel_size - 1| samples before
	  // the current block (stride 1) and the kernel starting at its last tap
	  // (stride -1).
	#if CPU(X86)
	  // 32-bit x86 Mac build: same operands as the vDSP_conv() call in the #else
	  // branch.
	  // NOTE(review): |conv| is presumably the legacy (un-prefixed) Accelerate
	  // name for vDSP_conv that 32-bit builds see - confirm against the SDK.
	  conv(input_p - kernel_size + 1, 1, kernel_p + kernel_size - 1, -1, dest_p, 1,
	       frames_to_process, kernel_size);
	#else
	  vDSP_conv(input_p - kernel_size + 1, 1, kernel_p + kernel_size - 1, -1,
	            dest_p, 1, frames_to_process, kernel_size);
	#endif  // CPU(X86)
	#else
	  size_t i = 0;
	#if CPU(X86) || CPU(X86_64)
	  // Convolution using SSE2. Currently only do this if both |kernel_size| and
	  // |frames_to_process| are multiples of 4. If not, use the straightforward
	  // loop below.

	  if ((kernel_size % 4 == 0) && (frames_to_process % 4 == 0)) {
	    // AudioFloatArray's are always aligned on at least a 16-byte boundary.
	    // Note that this scratch array is constructed on every call.
	    AudioFloatArray kernel_buffer(4 * kernel_size);
	    __m128* kernel_reversed = reinterpret_cast<__m128*>(kernel_buffer.Data());

	    // Reverse the kernel and repeat each value across a vector, so that
	    // kernel_reversed[t] holds tap (kernel_size - 1 - t) in all four lanes.
	    for (i = 0; i < kernel_size; ++i) {
	      kernel_reversed[i] = _mm_set1_ps(kernel_p[kernel_size - i - 1]);
	    }

	    // Oldest input sample that contributes to output sample 0.
	    float* input_start_p = input_p - kernel_size + 1;

	    // Do convolution with 4 inputs at a time: each pass produces
	    // dest_p[i] .. dest_p[i + 3].
	    for (i = 0; i < frames_to_process; i += 4) {
	      __m128 convolution_sum = _mm_setzero_ps();

	      // |kernel_size| is a multiple of 4 so we can unroll the loop by 4,
	      // manually.
	      for (size_t k = 0; k < kernel_size; k += 4) {
	        size_t data_offset = i + k;

	        for (size_t m = 0; m < 4; ++m) {
	          // Unaligned load of four consecutive input samples, scaled by one
	          // broadcast kernel tap and accumulated lane by lane.
	          __m128 source_block =
	              _mm_loadu_ps(input_start_p + data_offset + m);
	          __m128 product = _mm_mul_ps(kernel_reversed[k + m], source_block);
	          convolution_sum = _mm_add_ps(convolution_sum, product);
	        }
	      }
	      _mm_storeu_ps(dest_p + i, convolution_sum);
	    }
	  } else {
	#endif

	// FIXME: The macro can be further optimized to avoid pipeline stalls. One
	// possibility is to maintain 4 separate sums and change the macro to
	// CONVOLVE_FOUR_SAMPLES.
	//
	// Adds the contribution of kernel tap |j| to output sample |i| and advances
	// |j|. For j > i the index reaches back before |input_p|, i.e. into the first
	// half of |buffer_|.
	#define CONVOLVE_ONE_SAMPLE              \
	  do {                                   \
	    sum += input_p[i - j] * kernel_p[j]; \
	    j++;                                 \
	  } while (0)

	// Fully unrolled runs of CONVOLVE_ONE_SAMPLE. These expand to the same
	// statement sequence as writing CONVOLVE_ONE_SAMPLE out 8 or 32 times.
	#define CONVOLVE_8_SAMPLES \
	  do {                     \
	    CONVOLVE_ONE_SAMPLE;   \
	    CONVOLVE_ONE_SAMPLE;   \
	    CONVOLVE_ONE_SAMPLE;   \
	    CONVOLVE_ONE_SAMPLE;   \
	    CONVOLVE_ONE_SAMPLE;   \
	    CONVOLVE_ONE_SAMPLE;   \
	    CONVOLVE_ONE_SAMPLE;   \
	    CONVOLVE_ONE_SAMPLE;   \
	  } while (0)

	#define CONVOLVE_32_SAMPLES \
	  do {                      \
	    CONVOLVE_8_SAMPLES;     \
	    CONVOLVE_8_SAMPLES;     \
	    CONVOLVE_8_SAMPLES;     \
	    CONVOLVE_8_SAMPLES;     \
	  } while (0)

	    while (i < frames_to_process) {
	      size_t j = 0;
	      float sum = 0;

	      // FIXME: SSE optimization may be applied here.
	      if (kernel_size == 32) {
	        CONVOLVE_32_SAMPLES;  // 1 - 32
	      } else if (kernel_size == 64) {
	        CONVOLVE_32_SAMPLES;  // 1 - 32
	        CONVOLVE_32_SAMPLES;  // 33 - 64
	      } else if (kernel_size == 128) {
	        CONVOLVE_32_SAMPLES;  // 1 - 32
	        CONVOLVE_32_SAMPLES;  // 33 - 64
	        CONVOLVE_32_SAMPLES;  // 65 - 96
	        CONVOLVE_32_SAMPLES;  // 97 - 128
	      } else {
	        while (j < kernel_size) {
	          // Non-optimized using actual while loop.
	          CONVOLVE_ONE_SAMPLE;
	        }
	      }
	      dest_p[i++] = sum;
	    }

	#undef CONVOLVE_32_SAMPLES
	#undef CONVOLVE_8_SAMPLES
	#if CPU(X86) || CPU(X86_64)
	  }
	#endif
	#endif  // OS(MACOSX)

	  // Copy 2nd half of input buffer to 1st half so the next call can read this
	  // block as its history.
	  memcpy(buffer_.Data(), input_p, sizeof(float) * frames_to_process);
	}

	// Resets the convolver's state by calling Zero() on |buffer_|, the storage in
	// which Process() keeps the previous input block as history for the next
	// call.
	void DirectConvolver::Reset() {
	buffer_.Zero();
	}

	} // namespace blink