DSP: Fix vector type mismatches

This commit fixes vector type mismatches in the MVE function
implementations. GCC treats these mismatches as errors unless the
`-flax-vector-conversions` option is specified.

Note that most of these mismatches were already fixed upstream.

Signed-off-by: Stephanos Ioannidis <root@stephanos.io>
GitOrigin-RevId: 5f86244bad4ad5a590e084f0e72ba7a1416c2edf
Change-Id: Ia0d6d8e93b07b6334a5f5ca90b4017049f145013
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/zephyr/cmsis/+/3627934
Tested-by: CopyBot Service Account <copybot.service@gmail.com>
Reviewed-by: Jack Rosenthal <jrosenth@chromium.org>
Tested-by: Jack Rosenthal <jrosenth@chromium.org>
Commit-Queue: Jack Rosenthal <jrosenth@chromium.org>
diff --git a/CMSIS/DSP/PrivateInclude/arm_vec_fft.h b/CMSIS/DSP/PrivateInclude/arm_vec_fft.h
index 30dcb0e..1006920 100644
--- a/CMSIS/DSP/PrivateInclude/arm_vec_fft.h
+++ b/CMSIS/DSP/PrivateInclude/arm_vec_fft.h
@@ -140,7 +140,7 @@
 {
     uint32_t       *src = (uint32_t *) pSrc;
     int32_t         blkCnt;     /* loop counters */
-    uint32x4_t      bitRevTabOff;
+    uint16x8_t      bitRevTabOff;
     uint16x8_t      one = vdupq_n_u16(1);
     uint32x4_t      bitRevOff1Low, bitRevOff0Low;
     uint32x4_t      bitRevOff1High, bitRevOff0High;
@@ -152,8 +152,8 @@
 
     bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
     bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-    bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-    bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+    bitRevOff0Low = (uint32x4_t)vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+    bitRevOff0High = (uint32x4_t)vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
 
     blkCnt = (bitRevLen / 16);
     while (blkCnt > 0) {
@@ -162,8 +162,8 @@
 
         bitRevOff1Low = vmullbq_int_u16(bitRevTabOff, one);
         bitRevOff1High = vmulltq_int_u16(bitRevTabOff, one);
-        bitRevOff1Low = vshrq_n_u16(bitRevOff1Low, 3);
-        bitRevOff1High = vshrq_n_u16(bitRevOff1High, 3);
+        bitRevOff1Low = (uint32x4_t)vshrq_n_u16((uint16x8_t)bitRevOff1Low, 3);
+        bitRevOff1High = (uint32x4_t)vshrq_n_u16((uint16x8_t)bitRevOff1High, 3);
 
         inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
         inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
@@ -177,8 +177,8 @@
 
         bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
         bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+        bitRevOff0Low = (uint32x4_t)vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+        bitRevOff0High = (uint32x4_t)vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
 
         inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff1Low);
         inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff1High);
@@ -211,8 +211,8 @@
 
         bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
         bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+        bitRevOff0Low = (uint32x4_t)vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+        bitRevOff0High = (uint32x4_t)vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
 
         inLow = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0Low, p);
         inHigh = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0High, p);
@@ -251,13 +251,13 @@
     while (blkCnt > 0) {
         uint64x2_t      vecIn;
 
-        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs0);
+        vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs0);
         idxOffs0 = idxOffs0 + 16;
         vst1q(pDst32, (uint32x4_t) vecIn);
         pDst32 += 4;
         bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
 
-        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs1);
+        vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs1);
         idxOffs1 = idxOffs1 + 16;
         vst1q(pDst32, (uint32x4_t) vecIn);
         pDst32 += 4;
@@ -297,13 +297,13 @@
     while (blkCnt > 0) {
         uint32x4_t      vecIn;
 
-        vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs0);
+        vecIn = (uint32x4_t)vldrwq_gather_offset_s32(pSrc, bitRevOffs0);
         idxOffs0 = idxOffs0 + 32;
         vst1q(pDst16, (uint16x8_t) vecIn);
         pDst16 += 8;
         bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
 
-        vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs1);
+        vecIn = (uint32x4_t)vldrwq_gather_offset_s32(pSrc, bitRevOffs1);
         idxOffs1 = idxOffs1 + 32;
         vst1q(pDst16, (uint16x8_t) vecIn);
         pDst16 += 8;
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
index c77a05c..5ab13dc 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
@@ -102,9 +102,9 @@
         bCoeffs1[7] = a2;
         bCoeffs1[6] = a1;
 
-        bCoeffs2 =
+        bCoeffs2 = (q15x8_t)
             vsetq_lane_s32(vgetq_lane_s32((q31x4_t) bCoeffs0, 3), (q31x4_t) bCoeffs2, 3);
-        bCoeffs3 =
+        bCoeffs3 = (q15x8_t)
             vsetq_lane_s32(vgetq_lane_s32((q31x4_t) bCoeffs1, 3), (q31x4_t) bCoeffs3, 3);
 
 
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f32.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f32.c
index bcca830..05fe9fc 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f32.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f32.c
@@ -178,7 +178,7 @@
 
         int32x4_t vecOffs;
         int w;
-        vecOffs = vidupq_u32((uint32_t)0, 1);
+        vecOffs = (int32x4_t)vidupq_u32((uint32_t)0, 1);
         vecOffs = vmulq_n_s32(vecOffs,n);
 
         for(w=k+1; w<n; w+=4)
@@ -204,7 +204,7 @@
              //pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * (pA[x*n+k] * invA);
 
 
-             vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], vecOffs, p0);
+             vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], (uint32x4_t)vecOffs, p0);
              vecX = vmulq_m_n_f32(vuninitializedq_f32(),vecX,invA,p0);
 
              
@@ -247,7 +247,7 @@
 
              vecA = vldrwq_z_f32(&pA[w*n+x],p0);
              
-             vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], vecOffs, p0);
+             vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], (uint32x4_t)vecOffs, p0);
              vecX = vmulq_m_n_f32(vuninitializedq_f32(),vecX,invA,p0);
 
              vecA = vfmsq_m(vecA, vecW, vecX, p0);
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c b/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c
index 2dded65..f67ce26 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c
@@ -72,9 +72,9 @@
   {
       /* C = (float32_t) A / 32768 */
       /* convert from q15 to float and then store the results in the destination buffer */
-      vecDst = vldrhq_s32(pSrcVec); 
+      vecDst = (q15x8_t)vldrhq_s32(pSrcVec); 
       pSrcVec += 4;
-      vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 15));  
+      vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 15));  
       pDst += 4;
       /*
        * Decrement the blockSize loop counter
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c b/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c
index 258309e..16f9342 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c
@@ -70,9 +70,9 @@
     {
         /* C = (float32_t) A / 32768 */
         /* convert from q7 to float and then store the results in the destination buffer */
-        vecDst = vldrbq_s32(pSrcVec);    
+        vecDst = (q7x16_t)vldrbq_s32(pSrcVec);    
         pSrcVec += 4;
-        vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 7));   
+        vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 7));   
         pDst += 4;
         /*
          * Decrement the blockSize loop counter
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c
index 9d4eb96..121cac1 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c
@@ -184,16 +184,16 @@
         vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
 
         vecTmp0 = vhaddq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);
 
         vecTmp0 = vhsubq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);
 
         vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);
 
         vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);
 
         blkCnt--;
     }
@@ -419,16 +419,16 @@
         vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
 
         vecTmp0 = vhaddq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);
 
         vecTmp0 = vhsubq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);
 
         vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);
 
         vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);
 
         blkCnt--;
     }