| /* |
| * Copyright (C) 2010, Google Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE |
| * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH |
| * DAMAGE. |
| */ |
| |
| #include "platform/audio/VectorMath.h" |
| |
| #include <stdint.h> |
| #include "platform/wtf/Assertions.h" |
| #include "platform/wtf/CPU.h" |
| #include "platform/wtf/MathExtras.h" |
| |
| #if OS(MACOSX) |
| #include <Accelerate/Accelerate.h> |
| #endif |
| |
| #if CPU(X86) || CPU(X86_64) |
| #include <emmintrin.h> |
| #endif |
| |
| #if CPU(ARM_NEON) |
| #include <arm_neon.h> |
| #endif |
| |
| #if HAVE(MIPS_MSA_INTRINSICS) |
| #include "platform/cpu/mips/CommonMacrosMSA.h" |
| #endif |
| |
| #include <math.h> |
| #include <algorithm> |
| |
| namespace blink { |
| |
| namespace VectorMath { |
| |
| #if OS(MACOSX) |
| // On the Mac we use the highly optimized versions in Accelerate.framework. |
| // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes |
| // <vecLib/vDSP_translate.h>, which defines macros with the same names as |
| // our namespaced functions, so that case must be handled differently. Other |
| // architectures (64-bit, ARM, etc.) do not include this header file. |
| |
| void Vsmul(const float* source_p, |
| int source_stride, |
| const float* scale, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| #if CPU(X86) |
| ::vsmul(source_p, source_stride, scale, dest_p, dest_stride, |
| frames_to_process); |
| #else |
| vDSP_vsmul(source_p, source_stride, scale, dest_p, dest_stride, |
| frames_to_process); |
| #endif |
| } |
| |
| void Vadd(const float* source1p, |
| int source_stride1, |
| const float* source2p, |
| int source_stride2, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| #if CPU(X86) |
| ::vadd(source1p, source_stride1, source2p, source_stride2, dest_p, |
| dest_stride, frames_to_process); |
| #else |
| vDSP_vadd(source1p, source_stride1, source2p, source_stride2, dest_p, |
| dest_stride, frames_to_process); |
| #endif |
| } |
| |
| void Vmul(const float* source1p, |
| int source_stride1, |
| const float* source2p, |
| int source_stride2, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| #if CPU(X86) |
| ::vmul(source1p, source_stride1, source2p, source_stride2, dest_p, |
| dest_stride, frames_to_process); |
| #else |
| vDSP_vmul(source1p, source_stride1, source2p, source_stride2, dest_p, |
| dest_stride, frames_to_process); |
| #endif |
| } |
| |
| void Zvmul(const float* real1p, |
| const float* imag1p, |
| const float* real2p, |
| const float* imag2p, |
| float* real_dest_p, |
| float* imag_dest_p, |
| size_t frames_to_process) { |
| DSPSplitComplex sc1; |
| DSPSplitComplex sc2; |
| DSPSplitComplex dest; |
| sc1.realp = const_cast<float*>(real1p); |
| sc1.imagp = const_cast<float*>(imag1p); |
| sc2.realp = const_cast<float*>(real2p); |
| sc2.imagp = const_cast<float*>(imag2p); |
| dest.realp = real_dest_p; |
| dest.imagp = imag_dest_p; |
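| // The final argument (1) requests normal (non-conjugate) complex |
| // multiplication. |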
| #if CPU(X86) |
| ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, frames_to_process, 1); |
| #else |
| vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, frames_to_process, 1); |
| #endif |
| } |
| |
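| // vDSP_vsma computes D[n] = A[n] * B[0] + C[n]; passing dest_p as both the |
| // addend C and the output D yields dest += source * scale. |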
| void Vsma(const float* source_p, |
| int source_stride, |
| const float* scale, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| vDSP_vsma(source_p, source_stride, scale, dest_p, dest_stride, dest_p, |
| dest_stride, frames_to_process); |
| } |
| |
| void Vmaxmgv(const float* source_p, |
| int source_stride, |
| float* max_p, |
| size_t frames_to_process) { |
| vDSP_maxmgv(source_p, source_stride, max_p, frames_to_process); |
| } |
| |
| void Vsvesq(const float* source_p, |
| int source_stride, |
| float* sum_p, |
| size_t frames_to_process) { |
| vDSP_svesq(const_cast<float*>(source_p), source_stride, sum_p, |
| frames_to_process); |
| } |
| |
| void Vclip(const float* source_p, |
| int source_stride, |
| const float* low_threshold_p, |
| const float* high_threshold_p, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| vDSP_vclip(const_cast<float*>(source_p), source_stride, |
| const_cast<float*>(low_threshold_p), |
| const_cast<float*>(high_threshold_p), dest_p, dest_stride, |
| frames_to_process); |
| } |
| #else |
| |
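| // Vsma: multiply each source sample by the scalar *scale and accumulate |
| // into the destination: dest[k] += *scale * source[k]. |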
| void Vsma(const float* source_p, |
| int source_stride, |
| const float* scale, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| int n = frames_to_process; |
| |
| #if CPU(X86) || CPU(X86_64) |
| if ((source_stride == 1) && (dest_stride == 1)) { |
| float k = *scale; |
| |
| // If the source_p address is not 16-byte aligned, the first several |
| // frames (at most three) should be processed separately. |
| while ((reinterpret_cast<uintptr_t>(source_p) & 0x0F) && n) { |
| *dest_p += k * *source_p; |
| source_p++; |
| dest_p++; |
| n--; |
| } |
| |
| // Now source_p is aligned; use SSE. |
| int tail_frames = n % 4; |
| const float* end_p = dest_p + n - tail_frames; |
| |
| __m128 p_source; |
| __m128 dest; |
| __m128 temp; |
| __m128 m_scale = _mm_set_ps1(k); |
| |
| bool dest_aligned = !(reinterpret_cast<uintptr_t>(dest_p) & 0x0F); |
| |
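| // Expands to the SSE multiply-add loop; only source_p is known to be |
| // aligned here, so the load/store instructions used for dest_p (aligned |
| // load/store vs. unaligned loadu/storeu) are chosen at the call site. |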
| #define SSE2_MULT_ADD(loadInstr, storeInstr) \ |
| while (dest_p < end_p) { \ |
| p_source = _mm_load_ps(source_p); \ |
| temp = _mm_mul_ps(p_source, m_scale); \ |
| dest = _mm_##loadInstr##_ps(dest_p); \ |
| dest = _mm_add_ps(dest, temp); \ |
| _mm_##storeInstr##_ps(dest_p, dest); \ |
| source_p += 4; \ |
| dest_p += 4; \ |
| } |
| |
| if (dest_aligned) |
| SSE2_MULT_ADD(load, store) |
| else |
| SSE2_MULT_ADD(loadu, storeu) |
| |
| n = tail_frames; |
| } |
| #elif CPU(ARM_NEON) |
| if ((source_stride == 1) && (dest_stride == 1)) { |
| int tail_frames = n % 4; |
| const float* end_p = dest_p + n - tail_frames; |
| |
| float32x4_t k = vdupq_n_f32(*scale); |
| while (dest_p < end_p) { |
| float32x4_t source = vld1q_f32(source_p); |
| float32x4_t dest = vld1q_f32(dest_p); |
| |
| dest = vmlaq_f32(dest, source, k); |
| vst1q_f32(dest_p, dest); |
| |
| source_p += 4; |
| dest_p += 4; |
| } |
| n = tail_frames; |
| } |
| #elif HAVE(MIPS_MSA_INTRINSICS) |
| if ((source_stride == 1) && (dest_stride == 1)) { |
| float* destPCopy = dest_p; |
| v4f32 vScale; |
| v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; |
| v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| FloatInt scaleVal; |
| |
| scaleVal.floatVal = *scale; |
| vScale = (v4f32)__msa_fill_w(scaleVal.intVal); |
| |
| for (; n >= 32; n -= 32) { |
| LD_SP8(source_p, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, |
| vSrc7); |
| LD_SP8(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, |
| vDst7); |
| VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); |
| VSMA4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale); |
| ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4); |
| } |
| } |
| #endif |
| while (n) { |
| *dest_p += *source_p * *scale; |
| source_p += source_stride; |
| dest_p += dest_stride; |
| n--; |
| } |
| } |
| |
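| // Vsmul: scale each source sample by the scalar *scale: |
| // dest[k] = *scale * source[k]. |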
| void Vsmul(const float* source_p, |
| int source_stride, |
| const float* scale, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| int n = frames_to_process; |
| |
| #if CPU(X86) || CPU(X86_64) |
| if ((source_stride == 1) && (dest_stride == 1)) { |
| float k = *scale; |
| |
| // If the source_p address is not 16-byte aligned, the first several |
| // frames (at most three) should be processed separately. |
| while ((reinterpret_cast<uintptr_t>(source_p) & 0x0F) && n) { |
| *dest_p = k * *source_p; |
| source_p++; |
| dest_p++; |
| n--; |
| } |
| |
| // Now the source_p address is aligned; apply SSE. |
| int group = n / 4; |
| __m128 m_scale = _mm_set_ps1(k); |
| __m128* p_source; |
| __m128* p_dest; |
| __m128 dest; |
| |
| if (reinterpret_cast<uintptr_t>(dest_p) & 0x0F) { |
| while (group--) { |
| p_source = reinterpret_cast<__m128*>(const_cast<float*>(source_p)); |
| dest = _mm_mul_ps(*p_source, m_scale); |
| _mm_storeu_ps(dest_p, dest); |
| |
| source_p += 4; |
| dest_p += 4; |
| } |
| } else { |
| while (group--) { |
| p_source = reinterpret_cast<__m128*>(const_cast<float*>(source_p)); |
| p_dest = reinterpret_cast<__m128*>(dest_p); |
| *p_dest = _mm_mul_ps(*p_source, m_scale); |
| |
| source_p += 4; |
| dest_p += 4; |
| } |
| } |
| |
| // Handle the remaining frames (fewer than four) without SSE. |
| n %= 4; |
| while (n) { |
| *dest_p = k * *source_p; |
| source_p++; |
| dest_p++; |
| n--; |
| } |
| } else { // If strides are not 1, fall back to the scalar algorithm. |
| #elif CPU(ARM_NEON) |
| if ((source_stride == 1) && (dest_stride == 1)) { |
| float k = *scale; |
| int tail_frames = n % 4; |
| const float* end_p = dest_p + n - tail_frames; |
| |
| while (dest_p < end_p) { |
| float32x4_t source = vld1q_f32(source_p); |
| vst1q_f32(dest_p, vmulq_n_f32(source, k)); |
| |
| source_p += 4; |
| dest_p += 4; |
| } |
| n = tail_frames; |
| } |
| #elif HAVE(MIPS_MSA_INTRINSICS) |
| if ((source_stride == 1) && (dest_stride == 1)) { |
| v4f32 vScale; |
| v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; |
| v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| FloatInt scaleVal; |
| |
| scaleVal.floatVal = *scale; |
| vScale = (v4f32)__msa_fill_w(scaleVal.intVal); |
| |
| for (; n >= 32; n -= 32) { |
| LD_SP8(source_p, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, |
| vSrc7); |
| VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); |
| VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale); |
| ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4); |
| } |
| } |
| #endif |
| float k = *scale; |
| while (n--) { |
| *dest_p = k * *source_p; |
| source_p += source_stride; |
| dest_p += dest_stride; |
| } |
| #if CPU(X86) || CPU(X86_64) |
| } |
| #endif |
| } |
| |
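| // Vadd: element-wise addition: dest[k] = source1[k] + source2[k]. |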
| void Vadd(const float* source1p, |
| int source_stride1, |
| const float* source2p, |
| int source_stride2, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| int n = frames_to_process; |
| |
| #if CPU(X86) || CPU(X86_64) |
| if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) { |
| // If the source1p address is not 16-byte aligned, the first several |
| // frames (at most three) should be processed separately. |
| while ((reinterpret_cast<uintptr_t>(source1p) & 0x0F) && n) { |
| *dest_p = *source1p + *source2p; |
| source1p++; |
| source2p++; |
| dest_p++; |
| n--; |
| } |
| |
| // Now the source1p address is aligned; apply SSE. |
| int group = n / 4; |
| __m128* p_source1; |
| __m128* p_source2; |
| __m128* p_dest; |
| __m128 source2; |
| __m128 dest; |
| |
| bool source2_aligned = !(reinterpret_cast<uintptr_t>(source2p) & 0x0F); |
| bool dest_aligned = !(reinterpret_cast<uintptr_t>(dest_p) & 0x0F); |
| |
| if (source2_aligned && dest_aligned) { // all aligned |
| while (group--) { |
| p_source1 = reinterpret_cast<__m128*>(const_cast<float*>(source1p)); |
| p_source2 = reinterpret_cast<__m128*>(const_cast<float*>(source2p)); |
| p_dest = reinterpret_cast<__m128*>(dest_p); |
| *p_dest = _mm_add_ps(*p_source1, *p_source2); |
| |
| source1p += 4; |
| source2p += 4; |
| dest_p += 4; |
| } |
| |
| } else if (source2_aligned && |
| !dest_aligned) { // source2 aligned but dest not aligned |
| while (group--) { |
| p_source1 = reinterpret_cast<__m128*>(const_cast<float*>(source1p)); |
| p_source2 = reinterpret_cast<__m128*>(const_cast<float*>(source2p)); |
| dest = _mm_add_ps(*p_source1, *p_source2); |
| _mm_storeu_ps(dest_p, dest); |
| |
| source1p += 4; |
| source2p += 4; |
| dest_p += 4; |
| } |
| |
| } else if (!source2_aligned && |
| dest_aligned) { // source2 not aligned but dest aligned |
| while (group--) { |
| p_source1 = reinterpret_cast<__m128*>(const_cast<float*>(source1p)); |
| source2 = _mm_loadu_ps(source2p); |
| p_dest = reinterpret_cast<__m128*>(dest_p); |
| *p_dest = _mm_add_ps(*p_source1, source2); |
| |
| source1p += 4; |
| source2p += 4; |
| dest_p += 4; |
| } |
| } else if (!source2_aligned && |
| !dest_aligned) { // both source2 and dest not aligned |
| while (group--) { |
| p_source1 = reinterpret_cast<__m128*>(const_cast<float*>(source1p)); |
| source2 = _mm_loadu_ps(source2p); |
| dest = _mm_add_ps(*p_source1, source2); |
| _mm_storeu_ps(dest_p, dest); |
| |
| source1p += 4; |
| source2p += 4; |
| dest_p += 4; |
| } |
| } |
| |
| // Handle the remaining frames (fewer than four) without SSE. |
| n %= 4; |
| while (n) { |
| *dest_p = *source1p + *source2p; |
| source1p++; |
| source2p++; |
| dest_p++; |
| n--; |
| } |
| } else { // If strides are not 1, fall back to the scalar algorithm. |
| #elif CPU(ARM_NEON) |
| if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) { |
| int tail_frames = n % 4; |
| const float* end_p = dest_p + n - tail_frames; |
| |
| while (dest_p < end_p) { |
| float32x4_t source1 = vld1q_f32(source1p); |
| float32x4_t source2 = vld1q_f32(source2p); |
| vst1q_f32(dest_p, vaddq_f32(source1, source2)); |
| |
| source1p += 4; |
| source2p += 4; |
| dest_p += 4; |
| } |
| n = tail_frames; |
| } |
| #elif HAVE(MIPS_MSA_INTRINSICS) |
| if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) { |
| v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6, |
| vSrc1P7; |
| v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6, |
| vSrc2P7; |
| v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| |
| for (; n >= 32; n -= 32) { |
| LD_SP8(source1p, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, |
| vSrc1P6, vSrc1P7); |
| LD_SP8(source2p, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, |
| vSrc2P6, vSrc2P7); |
| ADD4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3, |
| vSrc2P3, vDst0, vDst1, vDst2, vDst3); |
| ADD4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7, |
| vSrc2P7, vDst4, vDst5, vDst6, vDst7); |
| ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4); |
| } |
| } |
| #endif |
| while (n--) { |
| *dest_p = *source1p + *source2p; |
| source1p += source_stride1; |
| source2p += source_stride2; |
| dest_p += dest_stride; |
| } |
| #if CPU(X86) || CPU(X86_64) |
| } |
| #endif |
| } |
| |
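| // Vmul: element-wise multiplication: dest[k] = source1[k] * source2[k]. |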
| void Vmul(const float* source1p, |
| int source_stride1, |
| const float* source2p, |
| int source_stride2, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| int n = frames_to_process; |
| |
| #if CPU(X86) || CPU(X86_64) |
| if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) { |
| // If the source1p address is not 16-byte aligned, the first several |
| // frames (at most three) should be processed separately. |
| while ((reinterpret_cast<uintptr_t>(source1p) & 0x0F) && n) { |
| *dest_p = *source1p * *source2p; |
| source1p++; |
| source2p++; |
| dest_p++; |
| n--; |
| } |
| |
| // Now the source1p address is aligned; apply SSE. |
| int tail_frames = n % 4; |
| const float* end_p = dest_p + n - tail_frames; |
| __m128 p_source1; |
| __m128 p_source2; |
| __m128 dest; |
| |
| bool source2_aligned = !(reinterpret_cast<uintptr_t>(source2p) & 0x0F); |
| bool dest_aligned = !(reinterpret_cast<uintptr_t>(dest_p) & 0x0F); |
| |
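| // Expands to the SSE multiply loop; source1p is known to be aligned, while |
| // the load/store variants for source2p and dest_p are chosen at the call |
| // site depending on their alignment. |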
| #define SSE2_MULT(loadInstr, storeInstr) \ |
| while (dest_p < end_p) { \ |
| p_source1 = _mm_load_ps(source1p); \ |
| p_source2 = _mm_##loadInstr##_ps(source2p); \ |
| dest = _mm_mul_ps(p_source1, p_source2); \ |
| _mm_##storeInstr##_ps(dest_p, dest); \ |
| source1p += 4; \ |
| source2p += 4; \ |
| dest_p += 4; \ |
| } |
| |
| if (source2_aligned && dest_aligned) // Both aligned. |
| SSE2_MULT(load, store) |
| else if (source2_aligned && |
| !dest_aligned) // Source2 is aligned but dest not. |
| SSE2_MULT(load, storeu) |
| else if (!source2_aligned && |
| dest_aligned) // Dest is aligned but source2 not. |
| SSE2_MULT(loadu, store) |
| else // Neither aligned. |
| SSE2_MULT(loadu, storeu) |
| |
| n = tail_frames; |
| } |
| #elif CPU(ARM_NEON) |
| if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) { |
| int tail_frames = n % 4; |
| const float* end_p = dest_p + n - tail_frames; |
| |
| while (dest_p < end_p) { |
| float32x4_t source1 = vld1q_f32(source1p); |
| float32x4_t source2 = vld1q_f32(source2p); |
| vst1q_f32(dest_p, vmulq_f32(source1, source2)); |
| |
| source1p += 4; |
| source2p += 4; |
| dest_p += 4; |
| } |
| n = tail_frames; |
| } |
| #elif HAVE(MIPS_MSA_INTRINSICS) |
| if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) { |
| v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6, |
| vSrc1P7; |
| v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6, |
| vSrc2P7; |
| v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| |
| for (; n >= 32; n -= 32) { |
| LD_SP8(source1p, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, |
| vSrc1P6, vSrc1P7); |
| LD_SP8(source2p, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, |
| vSrc2P6, vSrc2P7); |
| MUL4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3, |
| vSrc2P3, vDst0, vDst1, vDst2, vDst3); |
| MUL4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7, |
| vSrc2P7, vDst4, vDst5, vDst6, vDst7); |
| ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4); |
| } |
| } |
| #endif |
| while (n) { |
| *dest_p = *source1p * *source2p; |
| source1p += source_stride1; |
| source2p += source_stride2; |
| dest_p += dest_stride; |
| n--; |
| } |
| } |
| |
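| // Zvmul: element-wise complex multiplication of two split-complex vectors: |
| // (r1 + i1*j) * (r2 + i2*j) = (r1*r2 - i1*i2) + (r1*i2 + i1*r2)*j. |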
| void Zvmul(const float* real1p, |
| const float* imag1p, |
| const float* real2p, |
| const float* imag2p, |
| float* real_dest_p, |
| float* imag_dest_p, |
| size_t frames_to_process) { |
| unsigned i = 0; |
| #if CPU(X86) || CPU(X86_64) |
| // Only use the SSE optimization in the very common case that all addresses |
| // are 16-byte aligned. Otherwise, fall through to the scalar code below. |
| if (!(reinterpret_cast<uintptr_t>(real1p) & 0x0F) && |
| !(reinterpret_cast<uintptr_t>(imag1p) & 0x0F) && |
| !(reinterpret_cast<uintptr_t>(real2p) & 0x0F) && |
| !(reinterpret_cast<uintptr_t>(imag2p) & 0x0F) && |
| !(reinterpret_cast<uintptr_t>(real_dest_p) & 0x0F) && |
| !(reinterpret_cast<uintptr_t>(imag_dest_p) & 0x0F)) { |
| unsigned end_size = frames_to_process - frames_to_process % 4; |
| while (i < end_size) { |
| __m128 real1 = _mm_load_ps(real1p + i); |
| __m128 real2 = _mm_load_ps(real2p + i); |
| __m128 imag1 = _mm_load_ps(imag1p + i); |
| __m128 imag2 = _mm_load_ps(imag2p + i); |
| __m128 real = _mm_mul_ps(real1, real2); |
| real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2)); |
| __m128 imag = _mm_mul_ps(real1, imag2); |
| imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2)); |
| _mm_store_ps(real_dest_p + i, real); |
| _mm_store_ps(imag_dest_p + i, imag); |
| i += 4; |
| } |
| } |
| #elif CPU(ARM_NEON) |
| unsigned end_size = frames_to_process - frames_to_process % 4; |
| while (i < end_size) { |
| float32x4_t real1 = vld1q_f32(real1p + i); |
| float32x4_t real2 = vld1q_f32(real2p + i); |
| float32x4_t imag1 = vld1q_f32(imag1p + i); |
| float32x4_t imag2 = vld1q_f32(imag2p + i); |
| |
| float32x4_t real_result = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2); |
| float32x4_t imag_result = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2); |
| |
| vst1q_f32(real_dest_p + i, real_result); |
| vst1q_f32(imag_dest_p + i, imag_result); |
| |
| i += 4; |
| } |
| #endif |
| for (; i < frames_to_process; ++i) { |
| // Read and compute the results before storing them, in case the |
| // destination aliases one of the sources. |
| float real_result = real1p[i] * real2p[i] - imag1p[i] * imag2p[i]; |
| float imag_result = real1p[i] * imag2p[i] + imag1p[i] * real2p[i]; |
| |
| real_dest_p[i] = real_result; |
| imag_dest_p[i] = imag_result; |
| } |
| } |
| |
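| // Vsvesq: sum of squared values: *sum_p = sum over k of source[k]^2. |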
| void Vsvesq(const float* source_p, |
| int source_stride, |
| float* sum_p, |
| size_t frames_to_process) { |
| int n = frames_to_process; |
| float sum = 0; |
| |
| #if CPU(X86) || CPU(X86_64) |
| if (source_stride == 1) { |
| // If the source_p address is not 16-byte aligned, the first several |
| // frames (at most three) should be processed separately. |
| while ((reinterpret_cast<uintptr_t>(source_p) & 0x0F) && n) { |
| float sample = *source_p; |
| sum += sample * sample; |
| source_p++; |
| n--; |
| } |
| |
| // Now source_p is aligned; use SSE. |
| int tail_frames = n % 4; |
| const float* end_p = source_p + n - tail_frames; |
| __m128 source; |
| __m128 m_sum = _mm_setzero_ps(); |
| |
| while (source_p < end_p) { |
| source = _mm_load_ps(source_p); |
| source = _mm_mul_ps(source, source); |
| m_sum = _mm_add_ps(m_sum, source); |
| source_p += 4; |
| } |
| |
| // Sum the four partial results from the SSE lanes. |
| const float* group_sum_p = reinterpret_cast<float*>(&m_sum); |
| sum += group_sum_p[0] + group_sum_p[1] + group_sum_p[2] + group_sum_p[3]; |
| |
| n = tail_frames; |
| } |
| #elif CPU(ARM_NEON) |
| if (source_stride == 1) { |
| int tail_frames = n % 4; |
| const float* end_p = source_p + n - tail_frames; |
| |
| float32x4_t four_sum = vdupq_n_f32(0); |
| while (source_p < end_p) { |
| float32x4_t source = vld1q_f32(source_p); |
| four_sum = vmlaq_f32(four_sum, source, source); |
| source_p += 4; |
| } |
| float32x2_t two_sum = |
| vadd_f32(vget_low_f32(four_sum), vget_high_f32(four_sum)); |
| |
| float group_sum[2]; |
| vst1_f32(group_sum, two_sum); |
| sum += group_sum[0] + group_sum[1]; |
| |
| n = tail_frames; |
| } |
| #endif |
| |
| while (n--) { |
| float sample = *source_p; |
| sum += sample * sample; |
| source_p += source_stride; |
| } |
| |
| DCHECK(sum_p); |
| *sum_p = sum; |
| } |
| |
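| // Vmaxmgv: maximum magnitude: *max_p = max over k of |source[k]|. |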
| void Vmaxmgv(const float* source_p, |
| int source_stride, |
| float* max_p, |
| size_t frames_to_process) { |
| int n = frames_to_process; |
| float max = 0; |
| |
| #if CPU(X86) || CPU(X86_64) |
| if (source_stride == 1) { |
| // If the source_p address is not 16-byte aligned, the first several |
| // frames (at most three) should be processed separately. |
| while ((reinterpret_cast<uintptr_t>(source_p) & 0x0F) && n) { |
| max = std::max(max, fabsf(*source_p)); |
| source_p++; |
| n--; |
| } |
| |
| // Now source_p is aligned; use SSE. |
| int tail_frames = n % 4; |
| const float* end_p = source_p + n - tail_frames; |
| __m128 source; |
| __m128 m_max = _mm_setzero_ps(); |
| int mask = 0x7FFFFFFF; |
| __m128 m_mask = _mm_set1_ps(*reinterpret_cast<float*>(&mask)); |
| |
| while (source_p < end_p) { |
| source = _mm_load_ps(source_p); |
| // Compute the absolute value by ANDing source with the mask, which |
| // clears the sign bit. |
| source = _mm_and_ps(source, m_mask); |
| m_max = _mm_max_ps(m_max, source); |
| source_p += 4; |
| } |
| |
| // Get max from the SSE results. |
| const float* group_max_p = reinterpret_cast<float*>(&m_max); |
| max = std::max(max, group_max_p[0]); |
| max = std::max(max, group_max_p[1]); |
| max = std::max(max, group_max_p[2]); |
| max = std::max(max, group_max_p[3]); |
| |
| n = tail_frames; |
| } |
| #elif CPU(ARM_NEON) |
| if (source_stride == 1) { |
| int tail_frames = n % 4; |
| const float* end_p = source_p + n - tail_frames; |
| |
| float32x4_t four_max = vdupq_n_f32(0); |
| while (source_p < end_p) { |
| float32x4_t source = vld1q_f32(source_p); |
| four_max = vmaxq_f32(four_max, vabsq_f32(source)); |
| source_p += 4; |
| } |
| float32x2_t two_max = |
| vmax_f32(vget_low_f32(four_max), vget_high_f32(four_max)); |
| |
| float group_max[2]; |
| vst1_f32(group_max, two_max); |
| max = std::max(group_max[0], group_max[1]); |
| |
| n = tail_frames; |
| } |
| #elif HAVE(MIPS_MSA_INTRINSICS) |
| if (source_stride == 1) { |
| v4f32 vMax = { |
| 0, |
| }; |
| v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; |
| const v16i8 vSignBitMask = (v16i8)__msa_fill_w(0x7FFFFFFF); |
| |
| for (; n >= 32; n -= 32) { |
| LD_SP8(source_p, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, |
| vSrc7); |
| AND_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vSignBitMask); |
| VMAX_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vMax); |
| AND_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vSignBitMask); |
| VMAX_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vMax); |
| } |
| |
| max = std::max(max, vMax[0]); |
| max = std::max(max, vMax[1]); |
| max = std::max(max, vMax[2]); |
| max = std::max(max, vMax[3]); |
| } |
| #endif |
| |
| while (n--) { |
| max = std::max(max, fabsf(*source_p)); |
| source_p += source_stride; |
| } |
| |
| DCHECK(max_p); |
| *max_p = max; |
| } |
| |
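| // Vclip: clamp each sample to [*low_threshold_p, *high_threshold_p]: |
| // dest[k] = clamp(source[k], low, high). |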
| void Vclip(const float* source_p, |
| int source_stride, |
| const float* low_threshold_p, |
| const float* high_threshold_p, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| int n = frames_to_process; |
| float low_threshold = *low_threshold_p; |
| float high_threshold = *high_threshold_p; |
| |
| // FIXME: Optimize for SSE2. |
| #if CPU(ARM_NEON) |
| if ((source_stride == 1) && (dest_stride == 1)) { |
| int tail_frames = n % 4; |
| const float* end_p = dest_p + n - tail_frames; |
| |
| float32x4_t low = vdupq_n_f32(low_threshold); |
| float32x4_t high = vdupq_n_f32(high_threshold); |
| while (dest_p < end_p) { |
| float32x4_t source = vld1q_f32(source_p); |
| vst1q_f32(dest_p, vmaxq_f32(vminq_f32(source, high), low)); |
| source_p += 4; |
| dest_p += 4; |
| } |
| n = tail_frames; |
| } |
| #elif HAVE(MIPS_MSA_INTRINSICS) |
| if ((source_stride == 1) && (dest_stride == 1)) { |
| v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; |
| v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| v4f32 vLowThr, vHighThr; |
| FloatInt lowThr, highThr; |
| |
| lowThr.floatVal = low_threshold; |
| highThr.floatVal = high_threshold; |
| vLowThr = (v4f32)__msa_fill_w(lowThr.intVal); |
| vHighThr = (v4f32)__msa_fill_w(highThr.intVal); |
| |
| for (; n >= 32; n -= 32) { |
| LD_SP8(source_p, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, |
| vSrc7); |
| VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, |
| vDst3); |
| VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6, |
| vDst7); |
| ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4); |
| } |
| } |
| #endif |
| while (n--) { |
| *dest_p = clampTo(*source_p, low_threshold, high_threshold); |
| source_p += source_stride; |
| dest_p += dest_stride; |
| } |
| } |
| |
| #endif // OS(MACOSX) |
| |
| } // namespace VectorMath |
| |
| } // namespace blink |