/*
* Copyright (C) 2010, Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
#include "platform/audio/VectorMath.h"
#include <cmath>
#include "build/build_config.h"
#include "platform/wtf/Assertions.h"
#include "platform/wtf/CPU.h"
#if !defined(OS_MACOSX)
#if defined(ARCH_CPU_X86_FAMILY)
#include "platform/audio/cpu/x86/VectorMathX86.h"
#else
#include "platform/audio/VectorMathScalar.h"
#endif
#endif
#if defined(OS_MACOSX)
#include <Accelerate/Accelerate.h>
#endif
#if WTF_CPU_ARM_NEON
#include <arm_neon.h>
#endif
#if HAVE_MIPS_MSA_INTRINSICS
#include "platform/cpu/mips/CommonMacrosMSA.h"
#endif
#include <algorithm>
namespace blink {
namespace VectorMath {
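
// All functions follow vDSP-style conventions: sources and destinations are
// strided float arrays, scalar parameters are passed by pointer, and
// |frames_to_process| counts elements. An illustrative call (with
// hypothetical buffers |input| and |output|) that scales |input| by 0.5:
//
//   float gain = 0.5f;
//   VectorMath::Vsmul(input, 1, &gain, output, 1, frames);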
#if defined(OS_MACOSX)
// On the Mac we use the highly optimized versions in Accelerate.framework.
// In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes
// <vecLib/vDSP_translate.h>, which defines macros with the same names as
// our namespaced functions, so we must handle this case differently. Other
// architectures (64-bit, ARM, etc.) do not include this header file.
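
// Vsmul(): dest = *scale * source, elementwise.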
void Vsmul(const float* source_p,
int source_stride,
const float* scale,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
#if defined(ARCH_CPU_X86)
::vsmul(source_p, source_stride, scale, dest_p, dest_stride,
frames_to_process);
#else
vDSP_vsmul(source_p, source_stride, scale, dest_p, dest_stride,
frames_to_process);
#endif
}
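
// Vadd(): dest = source1 + source2, elementwise.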
void Vadd(const float* source1p,
int source_stride1,
const float* source2p,
int source_stride2,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
#if defined(ARCH_CPU_X86)
::vadd(source1p, source_stride1, source2p, source_stride2, dest_p,
dest_stride, frames_to_process);
#else
vDSP_vadd(source1p, source_stride1, source2p, source_stride2, dest_p,
dest_stride, frames_to_process);
#endif
}
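
// Vmul(): dest = source1 * source2, elementwise.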
void Vmul(const float* source1p,
int source_stride1,
const float* source2p,
int source_stride2,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
#if defined(ARCH_CPU_X86)
::vmul(source1p, source_stride1, source2p, source_stride2, dest_p,
dest_stride, frames_to_process);
#else
vDSP_vmul(source1p, source_stride1, source2p, source_stride2, dest_p,
dest_stride, frames_to_process);
#endif
}
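
// Zvmul(): elementwise complex multiplication of split-complex inputs:
// (a + bi)(c + di) = (ac - bd) + (ad + bc)i.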
void Zvmul(const float* real1p,
const float* imag1p,
const float* real2p,
const float* imag2p,
float* real_dest_p,
float* imag_dest_p,
size_t frames_to_process) {
DSPSplitComplex sc1;
DSPSplitComplex sc2;
DSPSplitComplex dest;
sc1.realp = const_cast<float*>(real1p);
sc1.imagp = const_cast<float*>(imag1p);
sc2.realp = const_cast<float*>(real2p);
sc2.imagp = const_cast<float*>(imag2p);
dest.realp = real_dest_p;
dest.imagp = imag_dest_p;
#if defined(ARCH_CPU_X86)
::zvmul(&sc1, 1, &sc2, 1, &dest, 1, frames_to_process, 1);
#else
vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, frames_to_process, 1);
#endif
}
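
// Vsma(): dest += *scale * source. vDSP_vsma computes D = A*B + C; passing
// dest_p as both C and D makes it accumulate in place.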
void Vsma(const float* source_p,
int source_stride,
const float* scale,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
vDSP_vsma(source_p, source_stride, scale, dest_p, dest_stride, dest_p,
dest_stride, frames_to_process);
}
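
// Vmaxmgv(): *max_p = maximum magnitude (absolute value) over the input.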
void Vmaxmgv(const float* source_p,
int source_stride,
float* max_p,
size_t frames_to_process) {
vDSP_maxmgv(source_p, source_stride, max_p, frames_to_process);
}
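
// Vsvesq(): *sum_p = sum of the squares of the input elements.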
void Vsvesq(const float* source_p,
int source_stride,
float* sum_p,
size_t frames_to_process) {
vDSP_svesq(const_cast<float*>(source_p), source_stride, sum_p,
frames_to_process);
}
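
// Vclip(): clamp each element to [*low_threshold_p, *high_threshold_p].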
void Vclip(const float* source_p,
int source_stride,
const float* low_threshold_p,
const float* high_threshold_p,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
vDSP_vclip(const_cast<float*>(source_p), source_stride,
const_cast<float*>(low_threshold_p),
const_cast<float*>(high_threshold_p), dest_p, dest_stride,
frames_to_process);
}
#else
namespace {
#if defined(ARCH_CPU_X86_FAMILY)
namespace Impl = X86;
#else
namespace Impl = Scalar;
#endif
} // namespace
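
// Each function below uses the same pattern: when the buffers are contiguous
// (stride 1), process as many whole SIMD groups as possible with NEON or MSA,
// then hand the remaining tail frames to the Impl (X86 or Scalar) fallback.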
void Vsma(const float* source_p,
int source_stride,
const float* scale,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
int n = frames_to_process;
#if WTF_CPU_ARM_NEON
if ((source_stride == 1) && (dest_stride == 1)) {
int tail_frames = n % 4;
const float* end_p = dest_p + n - tail_frames;
float32x4_t k = vdupq_n_f32(*scale);
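    // vmlaq_f32(d, s, k) computes d + s * k, so each iteration performs
    // dest += scale * source on four floats at once.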
while (dest_p < end_p) {
float32x4_t source = vld1q_f32(source_p);
float32x4_t dest = vld1q_f32(dest_p);
dest = vmlaq_f32(dest, source, k);
vst1q_f32(dest_p, dest);
source_p += 4;
dest_p += 4;
}
n = tail_frames;
}
#elif HAVE_MIPS_MSA_INTRINSICS
if ((source_stride == 1) && (dest_stride == 1)) {
float* destPCopy = dest_p;
v4f32 vScale;
v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
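    // FloatInt reinterprets the scale's bit pattern as an integer so that
    // __msa_fill_w can splat it across all four lanes.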
FloatInt scaleVal;
scaleVal.floatVal = *scale;
vScale = (v4f32)__msa_fill_w(scaleVal.intVal);
for (; n >= 32; n -= 32) {
LD_SP8(source_p, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,
vSrc7);
LD_SP8(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6,
vDst7);
VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);
VSMA4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale);
ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
}
}
#endif
frames_to_process = n;
#endif
Impl::Vsma(source_p, source_stride, scale, dest_p, dest_stride,
frames_to_process);
}
void Vsmul(const float* source_p,
int source_stride,
const float* scale,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
int n = frames_to_process;
#if WTF_CPU_ARM_NEON
if ((source_stride == 1) && (dest_stride == 1)) {
float k = *scale;
int tail_frames = n % 4;
const float* end_p = dest_p + n - tail_frames;
while (dest_p < end_p) {
float32x4_t source = vld1q_f32(source_p);
vst1q_f32(dest_p, vmulq_n_f32(source, k));
source_p += 4;
dest_p += 4;
}
n = tail_frames;
}
#elif HAVE_MIPS_MSA_INTRINSICS
if ((source_stride == 1) && (dest_stride == 1)) {
v4f32 vScale;
v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
FloatInt scaleVal;
scaleVal.floatVal = *scale;
vScale = (v4f32)__msa_fill_w(scaleVal.intVal);
for (; n >= 32; n -= 32) {
LD_SP8(source_p, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,
vSrc7);
VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);
VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale);
ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
}
}
#endif
frames_to_process = n;
#endif
Impl::Vsmul(source_p, source_stride, scale, dest_p, dest_stride,
frames_to_process);
}
void Vadd(const float* source1p,
int source_stride1,
const float* source2p,
int source_stride2,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
int n = frames_to_process;
#if WTF_CPU_ARM_NEON
if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) {
int tail_frames = n % 4;
const float* end_p = dest_p + n - tail_frames;
while (dest_p < end_p) {
float32x4_t source1 = vld1q_f32(source1p);
float32x4_t source2 = vld1q_f32(source2p);
vst1q_f32(dest_p, vaddq_f32(source1, source2));
source1p += 4;
source2p += 4;
dest_p += 4;
}
n = tail_frames;
}
#elif HAVE_MIPS_MSA_INTRINSICS
if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) {
v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6,
vSrc1P7;
v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6,
vSrc2P7;
v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
for (; n >= 32; n -= 32) {
LD_SP8(source1p, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5,
vSrc1P6, vSrc1P7);
LD_SP8(source2p, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5,
vSrc2P6, vSrc2P7);
ADD4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3,
vSrc2P3, vDst0, vDst1, vDst2, vDst3);
ADD4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7,
vSrc2P7, vDst4, vDst5, vDst6, vDst7);
ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
}
}
#endif
frames_to_process = n;
#endif
Impl::Vadd(source1p, source_stride1, source2p, source_stride2, dest_p,
dest_stride, frames_to_process);
}
void Vmul(const float* source1p,
int source_stride1,
const float* source2p,
int source_stride2,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
int n = frames_to_process;
#if WTF_CPU_ARM_NEON
if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) {
int tail_frames = n % 4;
const float* end_p = dest_p + n - tail_frames;
while (dest_p < end_p) {
float32x4_t source1 = vld1q_f32(source1p);
float32x4_t source2 = vld1q_f32(source2p);
vst1q_f32(dest_p, vmulq_f32(source1, source2));
source1p += 4;
source2p += 4;
dest_p += 4;
}
n = tail_frames;
}
#elif HAVE_MIPS_MSA_INTRINSICS
if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) {
v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6,
vSrc1P7;
v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6,
vSrc2P7;
v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
for (; n >= 32; n -= 32) {
LD_SP8(source1p, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5,
vSrc1P6, vSrc1P7);
LD_SP8(source2p, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5,
vSrc2P6, vSrc2P7);
MUL4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3,
vSrc2P3, vDst0, vDst1, vDst2, vDst3);
MUL4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7,
vSrc2P7, vDst4, vDst5, vDst6, vDst7);
ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
}
}
#endif
frames_to_process = n;
#endif
Impl::Vmul(source1p, source_stride1, source2p, source_stride2, dest_p,
dest_stride, frames_to_process);
}
void Zvmul(const float* real1p,
const float* imag1p,
const float* real2p,
const float* imag2p,
float* real_dest_p,
float* imag_dest_p,
size_t frames_to_process) {
unsigned i = 0;
#if WTF_CPU_ARM_NEON
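  // Process four complex values per iteration using the identity
  // (a + bi)(c + di) = (ac - bd) + (ad + bc)i; vmlsq/vmlaq fuse the
  // multiply with the subtract/add.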
unsigned end_size = frames_to_process - frames_to_process % 4;
while (i < end_size) {
float32x4_t real1 = vld1q_f32(real1p + i);
float32x4_t real2 = vld1q_f32(real2p + i);
float32x4_t imag1 = vld1q_f32(imag1p + i);
float32x4_t imag2 = vld1q_f32(imag2p + i);
float32x4_t real_result = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
float32x4_t imag_result = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);
vst1q_f32(real_dest_p + i, real_result);
vst1q_f32(imag_dest_p + i, imag_result);
i += 4;
}
#endif
Impl::Zvmul(real1p + i, imag1p + i, real2p + i, imag2p + i, real_dest_p + i,
imag_dest_p + i, frames_to_process - i);
}
void Vsvesq(const float* source_p,
int source_stride,
float* sum_p,
size_t frames_to_process) {
float sum = 0;
#if WTF_CPU_ARM_NEON
int n = frames_to_process;
if (source_stride == 1) {
int tail_frames = n % 4;
const float* end_p = source_p + n - tail_frames;
float32x4_t four_sum = vdupq_n_f32(0);
while (source_p < end_p) {
float32x4_t source = vld1q_f32(source_p);
four_sum = vmlaq_f32(four_sum, source, source);
source_p += 4;
}
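    // Horizontal reduction: fold the four lane sums into a single scalar.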
float32x2_t two_sum =
vadd_f32(vget_low_f32(four_sum), vget_high_f32(four_sum));
float group_sum[2];
vst1_f32(group_sum, two_sum);
sum += group_sum[0] + group_sum[1];
n = tail_frames;
}
frames_to_process = n;
#endif
Impl::Vsvesq(source_p, source_stride, &sum, frames_to_process);
DCHECK(sum_p);
*sum_p = sum;
}
void Vmaxmgv(const float* source_p,
int source_stride,
float* max_p,
size_t frames_to_process) {
float max = 0;
#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
int n = frames_to_process;
#if WTF_CPU_ARM_NEON
if (source_stride == 1) {
int tail_frames = n % 4;
const float* end_p = source_p + n - tail_frames;
float32x4_t four_max = vdupq_n_f32(0);
while (source_p < end_p) {
float32x4_t source = vld1q_f32(source_p);
four_max = vmaxq_f32(four_max, vabsq_f32(source));
source_p += 4;
}
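    // Horizontal reduction: fold the four lane maxima into a single scalar.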
float32x2_t two_max =
vmax_f32(vget_low_f32(four_max), vget_high_f32(four_max));
float group_max[2];
vst1_f32(group_max, two_max);
max = std::max(group_max[0], group_max[1]);
n = tail_frames;
}
#elif HAVE_MIPS_MSA_INTRINSICS
if (source_stride == 1) {
v4f32 vMax = {
0,
};
v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
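    // ANDing with 0x7FFFFFFF clears the IEEE 754 sign bit, which computes
    // the absolute value of each lane.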
const v16i8 vSignBitMask = (v16i8)__msa_fill_w(0x7FFFFFFF);
for (; n >= 32; n -= 32) {
LD_SP8(source_p, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,
vSrc7);
AND_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vSignBitMask);
VMAX_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vMax);
AND_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vSignBitMask);
VMAX_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vMax);
}
max = std::max(max, vMax[0]);
max = std::max(max, vMax[1]);
max = std::max(max, vMax[2]);
max = std::max(max, vMax[3]);
}
#endif
frames_to_process = n;
#endif
Impl::Vmaxmgv(source_p, source_stride, &max, frames_to_process);
DCHECK(max_p);
*max_p = max;
}
void Vclip(const float* source_p,
int source_stride,
const float* low_threshold_p,
const float* high_threshold_p,
float* dest_p,
int dest_stride,
size_t frames_to_process) {
float low_threshold = *low_threshold_p;
float high_threshold = *high_threshold_p;
#if DCHECK_IS_ON()
// Do the same DCHECKs that |clampTo| would do so that optimization paths do
// not have to do them.
for (size_t i = 0u; i < frames_to_process; ++i)
DCHECK(!std::isnan(source_p[i]));
// This also ensures that thresholds are not NaNs.
DCHECK_LE(low_threshold, high_threshold);
#endif
#if HAVE_MIPS_MSA_INTRINSICS || WTF_CPU_ARM_NEON
int n = frames_to_process;
#if WTF_CPU_ARM_NEON
if ((source_stride == 1) && (dest_stride == 1)) {
int tail_frames = n % 4;
const float* end_p = dest_p + n - tail_frames;
float32x4_t low = vdupq_n_f32(low_threshold);
float32x4_t high = vdupq_n_f32(high_threshold);
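    // Clamp each lane: take the min against the high threshold, then the
    // max against the low threshold.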
while (dest_p < end_p) {
float32x4_t source = vld1q_f32(source_p);
vst1q_f32(dest_p, vmaxq_f32(vminq_f32(source, high), low));
source_p += 4;
dest_p += 4;
}
n = tail_frames;
}
#elif HAVE_MIPS_MSA_INTRINSICS
if ((source_stride == 1) && (dest_stride == 1)) {
v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
v4f32 vLowThr, vHighThr;
FloatInt lowThr, highThr;
lowThr.floatVal = low_threshold;
highThr.floatVal = high_threshold;
vLowThr = (v4f32)__msa_fill_w(lowThr.intVal);
vHighThr = (v4f32)__msa_fill_w(highThr.intVal);
for (; n >= 32; n -= 32) {
LD_SP8(source_p, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,
vSrc7);
VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2,
vDst3);
VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6,
vDst7);
ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4);
}
}
#endif
frames_to_process = n;
#endif
Impl::Vclip(source_p, source_stride, &low_threshold, &high_threshold, dest_p,
dest_stride, frames_to_process);
}
#endif // defined(OS_MACOSX)
} // namespace VectorMath
} // namespace blink