Optimize Neon SAD reductions using wider ADDP instruction

Implement AArch64-only paths for each of the Neon SAD reduction
functions, making use of the 128-bit pairwise addition instruction
(ADDP) that is only available on AArch64.

This removes the need to shuffle between the high and low halves of
Neon vectors, resulting in a faster reduction that requires fewer
instructions.

Bug: b/181236880
Change-Id: I1c48580b4aec27222538eeab44e38ecc1f2009dc
diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c
index 06443c6..34c0a7a 100644
--- a/vpx_dsp/arm/sad4d_neon.c
+++ b/vpx_dsp/arm/sad4d_neon.c
@@ -34,7 +34,9 @@
                             uint32_t *const res) {
   int i;
   uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+#if !defined(__aarch64__)
   uint16x4_t a[2];
+#endif
   uint32x4_t r;
 
   assert(!((intptr_t)src_ptr % sizeof(uint32_t)));
@@ -51,9 +53,14 @@
     abs[1] = vabal_u8(abs[1], s, ref23);
   }
 
+#if defined(__aarch64__)
+  abs[0] = vpaddq_u16(abs[0], abs[1]);
+  r = vpaddlq_u16(abs[0]);
+#else
   a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0]));
   a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1]));
   r = vpaddlq_u16(vcombine_u16(a[0], a[1]));
+#endif
   vst1q_u32(res, r);
 }
 
@@ -74,6 +81,12 @@
 // Can handle 512 pixels' sad sum (such as 16x32 or 32x16)
 static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/,
                                           uint32_t *const res) {
+#if defined(__aarch64__)
+  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
+  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
+  const uint16x8_t b0 = vpaddq_u16(a0, a1);
+  const uint32x4_t r = vpaddlq_u16(b0);
+#else
   const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
   const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
   const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
@@ -81,12 +94,21 @@
   const uint16x4_t b0 = vpadd_u16(a0, a1);
   const uint16x4_t b1 = vpadd_u16(a2, a3);
   const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1));
+#endif
   vst1q_u32(res, r);
 }
 
 // Can handle 1024 pixels' sad sum (such as 32x32)
 static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/,
                                            uint32_t *const res) {
+#if defined(__aarch64__)
+  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
+  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
+  const uint32x4_t b0 = vpaddlq_u16(a0);
+  const uint32x4_t b1 = vpaddlq_u16(a1);
+  const uint32x4_t r = vpaddq_u32(b0, b1);
+  vst1q_u32(res, r);
+#else
   const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
   const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
   const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
@@ -96,11 +118,22 @@
   const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0));
   const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1));
   vst1q_u32(res, vcombine_u32(c0, c1));
+#endif
 }
 
 // Can handle 2048 pixels' sad sum (such as 32x64 or 64x32)
 static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/,
                                            uint32_t *const res) {
+#if defined(__aarch64__)
+  const uint32x4_t a0 = vpaddlq_u16(sum[0]);
+  const uint32x4_t a1 = vpaddlq_u16(sum[1]);
+  const uint32x4_t a2 = vpaddlq_u16(sum[2]);
+  const uint32x4_t a3 = vpaddlq_u16(sum[3]);
+  const uint32x4_t b0 = vpaddq_u32(a0, a1);
+  const uint32x4_t b1 = vpaddq_u32(a2, a3);
+  const uint32x4_t r = vpaddq_u32(b0, b1);
+  vst1q_u32(res, r);
+#else
   const uint32x4_t a0 = vpaddlq_u16(sum[0]);
   const uint32x4_t a1 = vpaddlq_u16(sum[1]);
   const uint32x4_t a2 = vpaddlq_u16(sum[2]);
@@ -112,11 +145,30 @@
   const uint32x2_t c0 = vpadd_u32(b0, b1);
   const uint32x2_t c1 = vpadd_u32(b2, b3);
   vst1q_u32(res, vcombine_u32(c0, c1));
+#endif
 }
 
 // Can handle 4096 pixels' sad sum (such as 64x64)
 static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,
                                            uint32_t *const res) {
+#if defined(__aarch64__)
+  const uint32x4_t a0 = vpaddlq_u16(sum[0]);
+  const uint32x4_t a1 = vpaddlq_u16(sum[1]);
+  const uint32x4_t a2 = vpaddlq_u16(sum[2]);
+  const uint32x4_t a3 = vpaddlq_u16(sum[3]);
+  const uint32x4_t a4 = vpaddlq_u16(sum[4]);
+  const uint32x4_t a5 = vpaddlq_u16(sum[5]);
+  const uint32x4_t a6 = vpaddlq_u16(sum[6]);
+  const uint32x4_t a7 = vpaddlq_u16(sum[7]);
+  const uint32x4_t b0 = vaddq_u32(a0, a1);
+  const uint32x4_t b1 = vaddq_u32(a2, a3);
+  const uint32x4_t b2 = vaddq_u32(a4, a5);
+  const uint32x4_t b3 = vaddq_u32(a6, a7);
+  const uint32x4_t c0 = vpaddq_u32(b0, b1);
+  const uint32x4_t c1 = vpaddq_u32(b2, b3);
+  const uint32x4_t r = vpaddq_u32(c0, c1);
+  vst1q_u32(res, r);
+#else
   const uint32x4_t a0 = vpaddlq_u16(sum[0]);
   const uint32x4_t a1 = vpaddlq_u16(sum[1]);
   const uint32x4_t a2 = vpaddlq_u16(sum[2]);
@@ -136,6 +188,7 @@
   const uint32x2_t d0 = vpadd_u32(c0, c1);
   const uint32x2_t d1 = vpadd_u32(c2, c3);
   vst1q_u32(res, vcombine_u32(d0, d1));
+#endif
 }
 
 static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,