| /* |
| * Copyright (C) 2010, Google Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE |
| * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH |
| * DAMAGE. |
| */ |
| |
| #include "platform/audio/VectorMath.h" |
| |
| #include <stdint.h> |
| #include "platform/wtf/Assertions.h" |
| #include "platform/wtf/CPU.h" |
| #include "platform/wtf/MathExtras.h" |
| |
| #if OS(MACOSX) |
| #include <Accelerate/Accelerate.h> |
| #endif |
| |
| #if CPU(X86) || CPU(X86_64) |
| #include <emmintrin.h> |
| #endif |
| |
| #if CPU(ARM_NEON) |
| #include <arm_neon.h> |
| #endif |
| |
| #if HAVE(MIPS_MSA_INTRINSICS) |
| #include "platform/cpu/mips/CommonMacrosMSA.h" |
| #endif |
| |
| #include <math.h> |
| #include <algorithm> |
| |
| namespace blink { |
| |
| namespace VectorMath { |
| |
| #if OS(MACOSX) |
| // On the Mac we use the highly optimized versions in Accelerate.framework. |
| // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes |
| // <vecLib/vDSP_translate.h>, which defines macros with the same names as |
| // our namespaced functions, so that case must be handled differently. Other |
| // architectures (64-bit, ARM, etc.) do not include this header file. |
| |
| void Vsmul(const float* source_p, |
| int source_stride, |
| const float* scale, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| #if CPU(X86) |
| ::vsmul(source_p, source_stride, scale, dest_p, dest_stride, |
| frames_to_process); |
| #else |
| vDSP_vsmul(source_p, source_stride, scale, dest_p, dest_stride, |
| frames_to_process); |
| #endif |
| } |
| |
| void Vadd(const float* source1p, |
| int source_stride1, |
| const float* source2p, |
| int source_stride2, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| #if CPU(X86) |
| ::vadd(source1p, source_stride1, source2p, source_stride2, dest_p, |
| dest_stride, frames_to_process); |
| #else |
| vDSP_vadd(source1p, source_stride1, source2p, source_stride2, dest_p, |
| dest_stride, frames_to_process); |
| #endif |
| } |
| |
| void Vmul(const float* source1p, |
| int source_stride1, |
| const float* source2p, |
| int source_stride2, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| #if CPU(X86) |
| ::vmul(source1p, source_stride1, source2p, source_stride2, dest_p, |
| dest_stride, frames_to_process); |
| #else |
| vDSP_vmul(source1p, source_stride1, source2p, source_stride2, dest_p, |
| dest_stride, frames_to_process); |
| #endif |
| } |
| |
| void Zvmul(const float* real1p, |
| const float* imag1p, |
| const float* real2p, |
| const float* imag2p, |
| float* real_dest_p, |
| float* imag_dest_p, |
| size_t frames_to_process) { |
| DSPSplitComplex sc1; |
| DSPSplitComplex sc2; |
| DSPSplitComplex dest; |
| sc1.realp = const_cast<float*>(real1p); |
| sc1.imagp = const_cast<float*>(imag1p); |
| sc2.realp = const_cast<float*>(real2p); |
| sc2.imagp = const_cast<float*>(imag2p); |
| dest.realp = real_dest_p; |
| dest.imagp = imag_dest_p; |
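| // The final argument (1) requests normal (non-conjugate) complex |
| // multiplication. |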
| #if CPU(X86) |
| ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, frames_to_process, 1); |
| #else |
| vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, frames_to_process, 1); |
| #endif |
| } |
| |
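| // vDSP_vsma computes D[n] = A[n] * B[0] + C[n]; passing dest_p as both the |
| // addend C and the output D yields dest += source * scale. |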
| void Vsma(const float* source_p, |
| int source_stride, |
| const float* scale, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| vDSP_vsma(source_p, source_stride, scale, dest_p, dest_stride, dest_p, |
| dest_stride, frames_to_process); |
| } |
| |
| void Vmaxmgv(const float* source_p, |
| int source_stride, |
| float* max_p, |
| size_t frames_to_process) { |
| vDSP_maxmgv(source_p, source_stride, max_p, frames_to_process); |
| } |
| |
| void Vsvesq(const float* source_p, |
| int source_stride, |
| float* sum_p, |
| size_t frames_to_process) { |
| vDSP_svesq(const_cast<float*>(source_p), source_stride, sum_p, |
| frames_to_process); |
| } |
| |
| void Vclip(const float* source_p, |
| int source_stride, |
| const float* low_threshold_p, |
| const float* high_threshold_p, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| vDSP_vclip(const_cast<float*>(source_p), source_stride, |
| const_cast<float*>(low_threshold_p), |
| const_cast<float*>(high_threshold_p), dest_p, dest_stride, |
| frames_to_process); |
| } |
| #else |
| |
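| // Vsma: multiply each source sample by the scalar *scale and accumulate |
| // into the destination: dest[k] += *scale * source[k]. |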
| void Vsma(const float* source_p, |
| int source_stride, |
| const float* scale, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| int n = frames_to_process; |
| |
| #if CPU(X86) || CPU(X86_64) |
| if ((source_stride == 1) && (dest_stride == 1)) { |
| float k = *scale; |
| |
| // If the source_p address is not 16-byte aligned, the first several |
| // frames (at most three) should be processed separately. |
| while ((reinterpret_cast<uintptr_t>(source_p) & 0x0F) && n) { |
| *dest_p += k * *source_p; |
| source_p++; |
| dest_p++; |
| n--; |
| } |
| |
| // Now source_p is aligned; use SSE. |
| int tail_frames = n % 4; |
| const float* end_p = dest_p + n - tail_frames; |
| |
| __m128 p_source; |
| __m128 dest; |
| __m128 temp; |
| __m128 m_scale = _mm_set_ps1(k); |
| |
| bool dest_aligned = !(reinterpret_cast<uintptr_t>(dest_p) & 0x0F); |
| |
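| // Expands to the SSE multiply-add loop; only source_p is known to be |
| // aligned here, so the load/store instructions used for dest_p (aligned |
| // load/store vs. unaligned loadu/storeu) are chosen at the call site. |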
| #define SSE2_MULT_ADD(loadInstr, storeInstr) \ |
| while (dest_p < end_p) { \ |
| p_source = _mm_load_ps(source_p); \ |
| temp = _mm_mul_ps(p_source, m_scale); \ |
| dest = _mm_##loadInstr##_ps(dest_p); \ |
| dest = _mm_add_ps(dest, temp); \ |
| _mm_##storeInstr##_ps(dest_p, dest); \ |
| source_p += 4; \ |
| dest_p += 4; \ |
| } |
| |
| if (dest_aligned) |
| SSE2_MULT_ADD(load, store) |
| else |
| SSE2_MULT_ADD(loadu, storeu) |
| |
| n = tail_frames; |
| } |
| #elif CPU(ARM_NEON) |
| if ((source_stride == 1) && (dest_stride == 1)) { |
| int tail_frames = n % 4; |
| const float* end_p = dest_p + n - tail_frames; |
| |
| float32x4_t k = vdupq_n_f32(*scale); |
| while (dest_p < end_p) { |
| float32x4_t source = vld1q_f32(source_p); |
| float32x4_t dest = vld1q_f32(dest_p); |
| |
| dest = vmlaq_f32(dest, source, k); |
| vst1q_f32(dest_p, dest); |
| |
| source_p += 4; |
| dest_p += 4; |
| } |
| n = tail_frames; |
| } |
| #elif HAVE(MIPS_MSA_INTRINSICS) |
| if ((source_stride == 1) && (dest_stride == 1)) { |
| float* destPCopy = dest_p; |
| v4f32 vScale; |
| v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; |
| v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| FloatInt scaleVal; |
| |
| scaleVal.floatVal = *scale; |
| vScale = (v4f32)__msa_fill_w(scaleVal.intVal); |
| |
| for (; n >= 32; n -= 32) { |
| LD_SP8(source_p, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, |
| vSrc7); |
| LD_SP8(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, |
| vDst7); |
| VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); |
| VSMA4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale); |
| ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4); |
| } |
| } |
| #endif |
| while (n) { |
| *dest_p += *source_p * *scale; |
| source_p += source_stride; |
| dest_p += dest_stride; |
| n--; |
| } |
| } |
| |
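| // Vsmul: scale each source sample by the scalar *scale: |
| // dest[k] = *scale * source[k]. |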
| void Vsmul(const float* source_p, |
| int source_stride, |
| const float* scale, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| int n = frames_to_process; |
| |
| #if CPU(X86) || CPU(X86_64) |
| if ((source_stride == 1) && (dest_stride == 1)) { |
| float k = *scale; |
| |
| // If the source_p address is not 16-byte aligned, the first several |
| // frames (at most three) should be processed separately. |
| while ((reinterpret_cast<uintptr_t>(source_p) & 0x0F) && n) { |
| *dest_p = k * *source_p; |
| source_p++; |
| dest_p++; |
| n--; |
| } |
| |
| // Now the source_p address is aligned; apply SSE. |
| int group = n / 4; |
| __m128 m_scale = _mm_set_ps1(k); |
| __m128* p_source; |
| __m128* p_dest; |
| __m128 dest; |
| |
| if (reinterpret_cast<uintptr_t>(dest_p) & 0x0F) { |
| while (group--) { |
| p_source = reinterpret_cast<__m128*>(const_cast<float*>(source_p)); |
| dest = _mm_mul_ps(*p_source, m_scale); |
| _mm_storeu_ps(dest_p, dest); |
| |
| source_p += 4; |
| dest_p += 4; |
| } |
| } else { |
| while (group--) { |
| p_source = reinterpret_cast<__m128*>(const_cast<float*>(source_p)); |
| p_dest = reinterpret_cast<__m128*>(dest_p); |
| *p_dest = _mm_mul_ps(*p_source, m_scale); |
| |
| source_p += 4; |
| dest_p += 4; |
| } |
| } |
| |
| // Handle the remaining frames (fewer than four) without SSE. |
| n %= 4; |
| while (n) { |
| *dest_p = k * *source_p; |
| source_p++; |
| dest_p++; |
| n--; |
| } |
| } else { // If strides are not 1, fall back to the scalar algorithm. |
| #elif CPU(ARM_NEON) |
| if ((source_stride == 1) && (dest_stride == 1)) { |
| float k = *scale; |
| int tail_frames = n % 4; |
| const float* end_p = dest_p + n - tail_frames; |
| |
| while (dest_p < end_p) { |
| float32x4_t source = vld1q_f32(source_p); |
| vst1q_f32(dest_p, vmulq_n_f32(source, k)); |
| |
| source_p += 4; |
| dest_p += 4; |
| } |
| n = tail_frames; |
| } |
| #elif HAVE(MIPS_MSA_INTRINSICS) |
| if ((source_stride == 1) && (dest_stride == 1)) { |
| v4f32 vScale; |
| v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; |
| v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| FloatInt scaleVal; |
| |
| scaleVal.floatVal = *scale; |
| vScale = (v4f32)__msa_fill_w(scaleVal.intVal); |
| |
| for (; n >= 32; n -= 32) { |
| LD_SP8(source_p, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, |
| vSrc7); |
| VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); |
| VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale); |
| ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4); |
| } |
| } |
| #endif |
| float k = *scale; |
| while (n--) { |
| *dest_p = k * *source_p; |
| source_p += source_stride; |
| dest_p += dest_stride; |
| } |
| #if CPU(X86) || CPU(X86_64) |
| } |
| #endif |
| } |
| |
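| // Vadd: element-wise addition: dest[k] = source1[k] + source2[k]. |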
| void Vadd(const float* source1p, |
| int source_stride1, |
| const float* source2p, |
| int source_stride2, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| int n = frames_to_process; |
| |
| #if CPU(X86) || CPU(X86_64) |
| if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) { |
| // If the source1p address is not 16-byte aligned, the first several |
| // frames (at most three) should be processed separately. |
| while ((reinterpret_cast<uintptr_t>(source1p) & 0x0F) && n) { |
| *dest_p = *source1p + *source2p; |
| source1p++; |
| source2p++; |
| dest_p++; |
| n--; |
| } |
| |
| // Now the source1p address is aligned; apply SSE. |
| int group = n / 4; |
| __m128* p_source1; |
| __m128* p_source2; |
| __m128* p_dest; |
| __m128 source2; |
| __m128 dest; |
| |
| bool source2_aligned = !(reinterpret_cast<uintptr_t>(source2p) & 0x0F); |
| bool dest_aligned = !(reinterpret_cast<uintptr_t>(dest_p) & 0x0F); |
| |
| if (source2_aligned && dest_aligned) { // all aligned |
| while (group--) { |
| p_source1 = reinterpret_cast<__m128*>(const_cast<float*>(source1p)); |
| p_source2 = reinterpret_cast<__m128*>(const_cast<float*>(source2p)); |
| p_dest = reinterpret_cast<__m128*>(dest_p); |
| *p_dest = _mm_add_ps(*p_source1, *p_source2); |
| |
| source1p += 4; |
| source2p += 4; |
| dest_p += 4; |
| } |
| |
| } else if (source2_aligned && |
| !dest_aligned) { // source2 aligned but dest not aligned |
| while (group--) { |
| p_source1 = reinterpret_cast<__m128*>(const_cast<float*>(source1p)); |
| p_source2 = reinterpret_cast<__m128*>(const_cast<float*>(source2p)); |
| dest = _mm_add_ps(*p_source1, *p_source2); |
| _mm_storeu_ps(dest_p, dest); |
| |
| source1p += 4; |
| source2p += 4; |
| dest_p += 4; |
| } |
| |
| } else if (!source2_aligned && |
| dest_aligned) { // source2 not aligned but dest aligned |
| while (group--) { |
| p_source1 = reinterpret_cast<__m128*>(const_cast<float*>(source1p)); |
| source2 = _mm_loadu_ps(source2p); |
| p_dest = reinterpret_cast<__m128*>(dest_p); |
| *p_dest = _mm_add_ps(*p_source1, source2); |
| |
| source1p += 4; |
| source2p += 4; |
| dest_p += 4; |
| } |
| } else if (!source2_aligned && |
| !dest_aligned) { // both source2 and dest not aligned |
| while (group--) { |
| p_source1 = reinterpret_cast<__m128*>(const_cast<float*>(source1p)); |
| source2 = _mm_loadu_ps(source2p); |
| dest = _mm_add_ps(*p_source1, source2); |
| _mm_storeu_ps(dest_p, dest); |
| |
| source1p += 4; |
| source2p += 4; |
| dest_p += 4; |
| } |
| } |
| |
| // Handle the remaining frames (fewer than four) without SSE. |
| n %= 4; |
| while (n) { |
| *dest_p = *source1p + *source2p; |
| source1p++; |
| source2p++; |
| dest_p++; |
| n--; |
| } |
| } else { // If strides are not 1, fall back to the scalar algorithm. |
| #elif CPU(ARM_NEON) |
| if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) { |
| int tail_frames = n % 4; |
| const float* end_p = dest_p + n - tail_frames; |
| |
| while (dest_p < end_p) { |
| float32x4_t source1 = vld1q_f32(source1p); |
| float32x4_t source2 = vld1q_f32(source2p); |
| vst1q_f32(dest_p, vaddq_f32(source1, source2)); |
| |
| source1p += 4; |
| source2p += 4; |
| dest_p += 4; |
| } |
| n = tail_frames; |
| } |
| #elif HAVE(MIPS_MSA_INTRINSICS) |
| if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) { |
| v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6, |
| vSrc1P7; |
| v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6, |
| vSrc2P7; |
| v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| |
| for (; n >= 32; n -= 32) { |
| LD_SP8(source1p, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, |
| vSrc1P6, vSrc1P7); |
| LD_SP8(source2p, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, |
| vSrc2P6, vSrc2P7); |
| ADD4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3, |
| vSrc2P3, vDst0, vDst1, vDst2, vDst3); |
| ADD4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7, |
| vSrc2P7, vDst4, vDst5, vDst6, vDst7); |
| ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4); |
| } |
| } |
| #endif |
| while (n--) { |
| *dest_p = *source1p + *source2p; |
| source1p += source_stride1; |
| source2p += source_stride2; |
| dest_p += dest_stride; |
| } |
| #if CPU(X86) || CPU(X86_64) |
| } |
| #endif |
| } |
| |
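| // Vmul: element-wise multiplication: dest[k] = source1[k] * source2[k]. |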
| void Vmul(const float* source1p, |
| int source_stride1, |
| const float* source2p, |
| int source_stride2, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| int n = frames_to_process; |
| |
| #if CPU(X86) || CPU(X86_64) |
| if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) { |
| // If the source1p address is not 16-byte aligned, the first several |
| // frames (at most three) should be processed separately. |
| while ((reinterpret_cast<uintptr_t>(source1p) & 0x0F) && n) { |
| *dest_p = *source1p * *source2p; |
| source1p++; |
| source2p++; |
| dest_p++; |
| n--; |
| } |
| |
| // Now the source1p address is aligned; apply SSE. |
| int tail_frames = n % 4; |
| const float* end_p = dest_p + n - tail_frames; |
| __m128 p_source1; |
| __m128 p_source2; |
| __m128 dest; |
| |
| bool source2_aligned = !(reinterpret_cast<uintptr_t>(source2p) & 0x0F); |
| bool dest_aligned = !(reinterpret_cast<uintptr_t>(dest_p) & 0x0F); |
| |
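| // Expands to the SSE multiply loop; source1p is known to be aligned, while |
| // the load/store variants for source2p and dest_p are chosen at the call |
| // site depending on their alignment. |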
| #define SSE2_MULT(loadInstr, storeInstr) \ |
| while (dest_p < end_p) { \ |
| p_source1 = _mm_load_ps(source1p); \ |
| p_source2 = _mm_##loadInstr##_ps(source2p); \ |
| dest = _mm_mul_ps(p_source1, p_source2); \ |
| _mm_##storeInstr##_ps(dest_p, dest); \ |
| source1p += 4; \ |
| source2p += 4; \ |
| dest_p += 4; \ |
| } |
| |
| if (source2_aligned && dest_aligned) // Both aligned. |
| SSE2_MULT(load, store) |
| else if (source2_aligned && |
| !dest_aligned) // Source2 is aligned but dest not. |
| SSE2_MULT(load, storeu) |
| else if (!source2_aligned && |
| dest_aligned) // Dest is aligned but source2 not. |
| SSE2_MULT(loadu, store) |
| else // Neither aligned. |
| SSE2_MULT(loadu, storeu) |
| |
| n = tail_frames; |
| } |
| #elif CPU(ARM_NEON) |
| if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) { |
| int tail_frames = n % 4; |
| const float* end_p = dest_p + n - tail_frames; |
| |
| while (dest_p < end_p) { |
| float32x4_t source1 = vld1q_f32(source1p); |
| float32x4_t source2 = vld1q_f32(source2p); |
| vst1q_f32(dest_p, vmulq_f32(source1, source2)); |
| |
| source1p += 4; |
| source2p += 4; |
| dest_p += 4; |
| } |
| n = tail_frames; |
| } |
| #elif HAVE(MIPS_MSA_INTRINSICS) |
| if ((source_stride1 == 1) && (source_stride2 == 1) && (dest_stride == 1)) { |
| v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6, |
| vSrc1P7; |
| v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6, |
| vSrc2P7; |
| v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| |
| for (; n >= 32; n -= 32) { |
| LD_SP8(source1p, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, |
| vSrc1P6, vSrc1P7); |
| LD_SP8(source2p, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, |
| vSrc2P6, vSrc2P7); |
| MUL4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3, |
| vSrc2P3, vDst0, vDst1, vDst2, vDst3); |
| MUL4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7, |
| vSrc2P7, vDst4, vDst5, vDst6, vDst7); |
| ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4); |
| } |
| } |
| #endif |
| while (n) { |
| *dest_p = *source1p * *source2p; |
| source1p += source_stride1; |
| source2p += source_stride2; |
| dest_p += dest_stride; |
| n--; |
| } |
| } |
| |
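| // Zvmul: element-wise complex multiplication of two split-complex vectors: |
| // (r1 + i1*j) * (r2 + i2*j) = (r1*r2 - i1*i2) + (r1*i2 + i1*r2)*j. |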
| void Zvmul(const float* real1p, |
| const float* imag1p, |
| const float* real2p, |
| const float* imag2p, |
| float* real_dest_p, |
| float* imag_dest_p, |
| size_t frames_to_process) { |
| unsigned i = 0; |
| #if CPU(X86) || CPU(X86_64) |
| // Only use the SSE optimization in the very common case that all addresses |
| // are 16-byte aligned. Otherwise, fall through to the scalar code below. |
| if (!(reinterpret_cast<uintptr_t>(real1p) & 0x0F) && |
| !(reinterpret_cast<uintptr_t>(imag1p) & 0x0F) && |
| !(reinterpret_cast<uintptr_t>(real2p) & 0x0F) && |
| !(reinterpret_cast<uintptr_t>(imag2p) & 0x0F) && |
| !(reinterpret_cast<uintptr_t>(real_dest_p) & 0x0F) && |
| !(reinterpret_cast<uintptr_t>(imag_dest_p) & 0x0F)) { |
| unsigned end_size = frames_to_process - frames_to_process % 4; |
| while (i < end_size) { |
| __m128 real1 = _mm_load_ps(real1p + i); |
| __m128 real2 = _mm_load_ps(real2p + i); |
| __m128 imag1 = _mm_load_ps(imag1p + i); |
| __m128 imag2 = _mm_load_ps(imag2p + i); |
| __m128 real = _mm_mul_ps(real1, real2); |
| real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2)); |
| __m128 imag = _mm_mul_ps(real1, imag2); |
| imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2)); |
| _mm_store_ps(real_dest_p + i, real); |
| _mm_store_ps(imag_dest_p + i, imag); |
| i += 4; |
| } |
| } |
| #elif CPU(ARM_NEON) |
| unsigned end_size = frames_to_process - frames_to_process % 4; |
| while (i < end_size) { |
| float32x4_t real1 = vld1q_f32(real1p + i); |
| float32x4_t real2 = vld1q_f32(real2p + i); |
| float32x4_t imag1 = vld1q_f32(imag1p + i); |
| float32x4_t imag2 = vld1q_f32(imag2p + i); |
| |
| float32x4_t real_result = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2); |
| float32x4_t imag_result = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2); |
| |
| vst1q_f32(real_dest_p + i, real_result); |
| vst1q_f32(imag_dest_p + i, imag_result); |
| |
| i += 4; |
| } |
| #endif |
| for (; i < frames_to_process; ++i) { |
| // Read and compute the results before storing them, in case the |
| // destination aliases one of the sources. |
| float real_result = real1p[i] * real2p[i] - imag1p[i] * imag2p[i]; |
| float imag_result = real1p[i] * imag2p[i] + imag1p[i] * real2p[i]; |
| |
| real_dest_p[i] = real_result; |
| imag_dest_p[i] = imag_result; |
| } |
| } |
| |
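| // Vsvesq: sum of squared values: *sum_p = sum over k of source[k]^2. |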
| void Vsvesq(const float* source_p, |
| int source_stride, |
| float* sum_p, |
| size_t frames_to_process) { |
| int n = frames_to_process; |
| float sum = 0; |
| |
| #if CPU(X86) || CPU(X86_64) |
| if (source_stride == 1) { |
| // If the source_p address is not 16-byte aligned, the first several |
| // frames (at most three) should be processed separately. |
| while ((reinterpret_cast<uintptr_t>(source_p) & 0x0F) && n) { |
| float sample = *source_p; |
| sum += sample * sample; |
| source_p++; |
| n--; |
| } |
| |
| // Now source_p is aligned; use SSE. |
| int tail_frames = n % 4; |
| const float* end_p = source_p + n - tail_frames; |
| __m128 source; |
| __m128 m_sum = _mm_setzero_ps(); |
| |
| while (source_p < end_p) { |
| source = _mm_load_ps(source_p); |
| source = _mm_mul_ps(source, source); |
| m_sum = _mm_add_ps(m_sum, source); |
| source_p += 4; |
| } |
| |
| // Sum the four partial results from the SSE lanes. |
| const float* group_sum_p = reinterpret_cast<float*>(&m_sum); |
| sum += group_sum_p[0] + group_sum_p[1] + group_sum_p[2] + group_sum_p[3]; |
| |
| n = tail_frames; |
| } |
| #elif CPU(ARM_NEON) |
| if (source_stride == 1) { |
| int tail_frames = n % 4; |
| const float* end_p = source_p + n - tail_frames; |
| |
| float32x4_t four_sum = vdupq_n_f32(0); |
| while (source_p < end_p) { |
| float32x4_t source = vld1q_f32(source_p); |
| four_sum = vmlaq_f32(four_sum, source, source); |
| source_p += 4; |
| } |
| float32x2_t two_sum = |
| vadd_f32(vget_low_f32(four_sum), vget_high_f32(four_sum)); |
| |
| float group_sum[2]; |
| vst1_f32(group_sum, two_sum); |
| sum += group_sum[0] + group_sum[1]; |
| |
| n = tail_frames; |
| } |
| #endif |
| |
| while (n--) { |
| float sample = *source_p; |
| sum += sample * sample; |
| source_p += source_stride; |
| } |
| |
| DCHECK(sum_p); |
| *sum_p = sum; |
| } |
| |
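| // Vmaxmgv: maximum magnitude: *max_p = max over k of |source[k]|. |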
| void Vmaxmgv(const float* source_p, |
| int source_stride, |
| float* max_p, |
| size_t frames_to_process) { |
| int n = frames_to_process; |
| float max = 0; |
| |
| #if CPU(X86) || CPU(X86_64) |
| if (source_stride == 1) { |
| // If the source_p address is not 16-byte aligned, the first several |
| // frames (at most three) should be processed separately. |
| while ((reinterpret_cast<uintptr_t>(source_p) & 0x0F) && n) { |
| max = std::max(max, fabsf(*source_p)); |
| source_p++; |
| n--; |
| } |
| |
| // Now source_p is aligned; use SSE. |
| int tail_frames = n % 4; |
| const float* end_p = source_p + n - tail_frames; |
| __m128 source; |
| __m128 m_max = _mm_setzero_ps(); |
| int mask = 0x7FFFFFFF; |
| __m128 m_mask = _mm_set1_ps(*reinterpret_cast<float*>(&mask)); |
| |
| while (source_p < end_p) { |
| source = _mm_load_ps(source_p); |
| // Compute the absolute value by ANDing source with the mask, which |
| // clears the sign bit. |
| source = _mm_and_ps(source, m_mask); |
| m_max = _mm_max_ps(m_max, source); |
| source_p += 4; |
| } |
| |
| // Get max from the SSE results. |
| const float* group_max_p = reinterpret_cast<float*>(&m_max); |
| max = std::max(max, group_max_p[0]); |
| max = std::max(max, group_max_p[1]); |
| max = std::max(max, group_max_p[2]); |
| max = std::max(max, group_max_p[3]); |
| |
| n = tail_frames; |
| } |
| #elif CPU(ARM_NEON) |
| if (source_stride == 1) { |
| int tail_frames = n % 4; |
| const float* end_p = source_p + n - tail_frames; |
| |
| float32x4_t four_max = vdupq_n_f32(0); |
| while (source_p < end_p) { |
| float32x4_t source = vld1q_f32(source_p); |
| four_max = vmaxq_f32(four_max, vabsq_f32(source)); |
| source_p += 4; |
| } |
| float32x2_t two_max = |
| vmax_f32(vget_low_f32(four_max), vget_high_f32(four_max)); |
| |
| float group_max[2]; |
| vst1_f32(group_max, two_max); |
| max = std::max(group_max[0], group_max[1]); |
| |
| n = tail_frames; |
| } |
| #elif HAVE(MIPS_MSA_INTRINSICS) |
| if (source_stride == 1) { |
| v4f32 vMax = { |
| 0, |
| }; |
| v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; |
| const v16i8 vSignBitMask = (v16i8)__msa_fill_w(0x7FFFFFFF); |
| |
| for (; n >= 32; n -= 32) { |
| LD_SP8(source_p, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, |
| vSrc7); |
| AND_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vSignBitMask); |
| VMAX_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vMax); |
| AND_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vSignBitMask); |
| VMAX_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vMax); |
| } |
| |
| max = std::max(max, vMax[0]); |
| max = std::max(max, vMax[1]); |
| max = std::max(max, vMax[2]); |
| max = std::max(max, vMax[3]); |
| } |
| #endif |
| |
| while (n--) { |
| max = std::max(max, fabsf(*source_p)); |
| source_p += source_stride; |
| } |
| |
| DCHECK(max_p); |
| *max_p = max; |
| } |
| |
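| // Vclip: clamp each sample to [*low_threshold_p, *high_threshold_p]: |
| // dest[k] = clamp(source[k], low, high). |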
| void Vclip(const float* source_p, |
| int source_stride, |
| const float* low_threshold_p, |
| const float* high_threshold_p, |
| float* dest_p, |
| int dest_stride, |
| size_t frames_to_process) { |
| int n = frames_to_process; |
| float low_threshold = *low_threshold_p; |
| float high_threshold = *high_threshold_p; |
| |
| // FIXME: Optimize for SSE2. |
| #if CPU(ARM_NEON) |
| if ((source_stride == 1) && (dest_stride == 1)) { |
| int tail_frames = n % 4; |
| const float* end_p = dest_p + n - tail_frames; |
| |
| float32x4_t low = vdupq_n_f32(low_threshold); |
| float32x4_t high = vdupq_n_f32(high_threshold); |
| while (dest_p < end_p) { |
| float32x4_t source = vld1q_f32(source_p); |
| vst1q_f32(dest_p, vmaxq_f32(vminq_f32(source, high), low)); |
| source_p += 4; |
| dest_p += 4; |
| } |
| n = tail_frames; |
| } |
| #elif HAVE(MIPS_MSA_INTRINSICS) |
| if ((source_stride == 1) && (dest_stride == 1)) { |
| v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; |
| v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| v4f32 vLowThr, vHighThr; |
| FloatInt lowThr, highThr; |
| |
| lowThr.floatVal = low_threshold; |
| highThr.floatVal = high_threshold; |
| vLowThr = (v4f32)__msa_fill_w(lowThr.intVal); |
| vHighThr = (v4f32)__msa_fill_w(highThr.intVal); |
| |
| for (; n >= 32; n -= 32) { |
| LD_SP8(source_p, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, |
| vSrc7); |
| VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, |
| vDst3); |
| VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6, |
| vDst7); |
| ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, dest_p, 4); |
| } |
| } |
| #endif |
| while (n--) { |
| *dest_p = clampTo(*source_p, low_threshold, high_threshold); |
| source_p += source_stride; |
| dest_p += dest_stride; |
| } |
| } |
| |
| #endif // OS(MACOSX) |
| |
| } // namespace VectorMath |
| |
| } // namespace blink |