/*
* Copyright (C) 2010, Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
#include "platform/audio/VectorMath.h"
#include <cmath>
#include "build/build_config.h"
#include "platform/wtf/Assertions.h"
#include "platform/wtf/CPU.h"
#if defined(OS_MACOSX)
#include "platform/audio/mac/VectorMathMac.h"
#elif WTF_CPU_ARM_NEON
#include "platform/audio/cpu/arm/VectorMathNEON.h"
#elif defined(ARCH_CPU_X86_FAMILY)
#include "platform/audio/cpu/x86/VectorMathX86.h"
#else
#include "platform/audio/VectorMathScalar.h"
#endif
#if HAVE_MIPS_MSA_INTRINSICS
#include "platform/cpu/mips/CommonMacrosMSA.h"
#endif
#include <algorithm>
namespace blink {
namespace VectorMath {
namespace {
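// Select the platform-specific implementation. Every Impl namespace exposes
// the same set of kernels (the naming follows Apple's vDSP routines), so the
// wrappers below can forward to whichever backend was compiled in.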
#if defined(OS_MACOSX)
namespace Impl = Mac;
#elif WTF_CPU_ARM_NEON
namespace Impl = NEON;
#elif defined(ARCH_CPU_X86_FAMILY)
namespace Impl = X86;
#else
namespace Impl = Scalar;
#endif
} // namespace
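
// Vector scalar multiply-add:
//   dest_p[k * dest_stride] += *scale * source_p[k * source_stride]
// for k in [0, frames_to_process).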
void Vsma(const float* source_p,
int source_stride,
const float* scale,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
#if HAVE_MIPS_MSA_INTRINSICS
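  // MSA fast path for the common unit-stride case: each iteration handles 32
  // floats as eight 4-lane vectors. The LD_SP8/ST_SP8 macros advance their
  // pointer arguments, so the up-to-31 leftover frames are handled by the
  // Impl call below. Because Vsma both reads and writes the destination, a
  // separate copy (destPCopy) is advanced by the loads while dest_p is
  // advanced by the stores. The other MSA blocks in this file follow the
  // same pattern.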
int n = frames_to_process;
if ((source_stride == 1) && (dest_stride == 1)) {
float* destPCopy = dest_p;
v4f32 vScale;
v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
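    // __msa_fill_w() splats an integer, so the scale's bit pattern is routed
    // through the FloatInt union to replicate the float across all lanes.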
FloatInt scaleVal;
scaleVal.floatVal = *scale;
vScale = (v4f32)__msa_fill_w(scaleVal.intVal);
for (; n >= 32; n -= 32) {
LD_SP8(source_p, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,
vSrc7);
LD_SP8(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6,
vDst7);
VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);
VSMA4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale);
ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
}
}
frames_to_process = n;
#endif
Impl::Vsma(source_p, source_stride, scale, dest_p, dest_stride,
frames_to_process);
}
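
// Vector scalar multiply:
//   dest_p[k * dest_stride] = *scale * source_p[k * source_stride]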
void Vsmul(const float* source_p,
int source_stride,
const float* scale,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
#if HAVE_MIPS_MSA_INTRINSICS
int n = frames_to_process;
if ((source_stride == 1) && (dest_stride == 1)) {
v4f32 vScale;
v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
FloatInt scaleVal;
scaleVal.floatVal = *scale;
vScale = (v4f32)__msa_fill_w(scaleVal.intVal);
for (; n >= 32; n -= 32) {
LD_SP8(source_p, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,
vSrc7);
VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);
VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale);
ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
}
}
frames_to_process = n;
#endif
Impl::Vsmul(source_p, source_stride, scale, dest_p, dest_stride,
frames_to_process);
}
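
// Element-wise vector addition:
//   dest_p[k * dest_stride] =
//       source1p[k * source_stride1] + source2p[k * source_stride2]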
void Vadd(const float* source1p,
int source_stride1,
const float* source2p,
int source_stride2,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
#if HAVE_MIPS_MSA_INTRINSICS
int n = frames_to_process;
if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) {
v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6,
vSrc1P7;
v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6,
vSrc2P7;
v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
for (; n >= 32; n -= 32) {
LD_SP8(source1p, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5,
vSrc1P6, vSrc1P7);
LD_SP8(source2p, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5,
vSrc2P6, vSrc2P7);
ADD4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3,
vSrc2P3, vDst0, vDst1, vDst2, vDst3);
ADD4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7,
vSrc2P7, vDst4, vDst5, vDst6, vDst7);
ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
}
}
frames_to_process = n;
#endif
Impl::Vadd(source1p, source_stride1, source2p, source_stride2, dest_p,
dest_stride, frames_to_process);
}
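
// Element-wise vector multiplication:
//   dest_p[k * dest_stride] =
//       source1p[k * source_stride1] * source2p[k * source_stride2]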
void Vmul(const float* source1p,
int source_stride1,
const float* source2p,
int source_stride2,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
#if HAVE_MIPS_MSA_INTRINSICS
int n = frames_to_process;
if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) {
v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6,
vSrc1P7;
v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6,
vSrc2P7;
v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
for (; n >= 32; n -= 32) {
LD_SP8(source1p, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5,
vSrc1P6, vSrc1P7);
LD_SP8(source2p, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5,
vSrc2P6, vSrc2P7);
MUL4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3,
vSrc2P3, vDst0, vDst1, vDst2, vDst3);
MUL4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7,
vSrc2P7, vDst4, vDst5, vDst6, vDst7);
ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
}
}
frames_to_process = n;
#endif
Impl::Vmul(source1p, source_stride1, source2p, source_stride2, dest_p,
dest_stride, frames_to_process);
}
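
// Complex vector multiplication on split-complex (separate real/imaginary
// array) inputs:
//   real_dest_p[k] = real1p[k] * real2p[k] - imag1p[k] * imag2p[k]
//   imag_dest_p[k] = real1p[k] * imag2p[k] + imag1p[k] * real2p[k]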
void Zvmul(const float* real1p,
const float* imag1p,
const float* real2p,
const float* imag2p,
float* real_dest_p,
float* imag_dest_p,
size_t frames_to_process) {
Impl::Zvmul(real1p, imag1p, real2p, imag2p, real_dest_p, imag_dest_p,
frames_to_process);
}
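
// Sum of squares: *sum_p = sum over k of source_p[k * source_stride]^2.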
void Vsvesq(const float* source_p,
int source_stride,
float* sum_p,
size_t frames_to_process) {
float sum = 0;
Impl::Vsvesq(source_p, source_stride, &sum, frames_to_process);
DCHECK(sum_p);
*sum_p = sum;
}
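
// Maximum magnitude: *max_p = max over k of |source_p[k * source_stride]|.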
void Vmaxmgv(const float* source_p,
int source_stride,
float* max_p,
size_t frames_to_process) {
float max = 0;
#if HAVE_MIPS_MSA_INTRINSICS
int n = frames_to_process;
if (source_stride == 1) {
    v4f32 vMax = {0, 0, 0, 0};
v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
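    // ANDing with 0x7FFFFFFF clears the IEEE-754 sign bit, yielding |x| per
    // lane; the four per-lane maxima are reduced to a scalar after the loop.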
const v16i8 vSignBitMask = (v16i8)__msa_fill_w(0x7FFFFFFF);
for (; n >= 32; n -= 32) {
LD_SP8(source_p, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,
vSrc7);
AND_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vSignBitMask);
VMAX_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vMax);
AND_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vSignBitMask);
VMAX_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vMax);
}
max = std::max(max, vMax[0]);
max = std::max(max, vMax[1]);
max = std::max(max, vMax[2]);
max = std::max(max, vMax[3]);
}
frames_to_process = n;
#endif
Impl::Vmaxmgv(source_p, source_stride, &max, frames_to_process);
DCHECK(max_p);
*max_p = max;
}
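
// Clamp each element to the inclusive range
// [*low_threshold_p, *high_threshold_p]:
//   dest_p[k * dest_stride] = clamp(source_p[k * source_stride])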
void Vclip(const float* source_p,
int source_stride,
const float* low_threshold_p,
const float* high_threshold_p,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
float low_threshold = *low_threshold_p;
float high_threshold = *high_threshold_p;
#if DCHECK_IS_ON()
  // Perform the same DCHECKs that |clampTo| would perform, so the optimized
  // paths below do not have to repeat them. Note that the check must honor
  // the source stride.
  for (size_t i = 0u; i < frames_to_process; ++i)
    DCHECK(!std::isnan(source_p[i * source_stride]));
// This also ensures that thresholds are not NaNs.
DCHECK_LE(low_threshold, high_threshold);
#endif
#if HAVE_MIPS_MSA_INTRINSICS
int n = frames_to_process;
if ((source_stride == 1) && (dest_stride == 1)) {
v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
v4f32 vLowThr, vHighThr;
FloatInt lowThr, highThr;
lowThr.floatVal = low_threshold;
highThr.floatVal = high_threshold;
vLowThr = (v4f32)__msa_fill_w(lowThr.intVal);
vHighThr = (v4f32)__msa_fill_w(highThr.intVal);
for (; n >= 32; n -= 32) {
LD_SP8(source_p, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,
vSrc7);
VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2,
vDst3);
VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6,
vDst7);
ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
}
}
frames_to_process = n;
#endif
Impl::Vclip(source_p, source_stride, &low_threshold, &high_threshold, dest_p,
dest_stride, frames_to_process);
}
} // namespace VectorMath
} // namespace blink