Optimize Neon implementation of vpx_int_pro_col

Use widening pairwise addition instructions to halve the number of
additions required.

Change-Id: I0307a3b65e50d2b1ae582938bc5df9c2b21df734

commit: c738e87f27ef8e12dd28b9052f446a5f69abf3c9 [log] [tgz]
author: Jonathan Wright <jonathan.wright@arm.com> Tue May 30 14:22:04 2023
committer: Jonathan Wright <jonathan.wright@arm.com> Wed May 31 13:30:02 2023
tree: c24856549456db876b489192cd3b47b7e43c4272
parent: 99522d307ccef8b53d373beab8c5b6bf997ca4ef [diff]
diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c
index 8c61fc2..2fe65d1 100644
--- a/vpx_dsp/arm/avg_neon.c
+++ b/vpx_dsp/arm/avg_neon.c

@@ -121,17 +121,17 @@
 }
 
 int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
+  uint16x8_t sum;
   int i;
-  uint16x8_t vec_sum = vdupq_n_u16(0);
 
-  for (i = 0; i < width; i += 16) {
-    const uint8x16_t vec_row = vld1q_u8(ref);
-    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
-    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
-    ref += 16;
+  assert(width >= 16 && width % 16 == 0);
+
+  sum = vpaddlq_u8(vld1q_u8(ref));
+  for (i = 16; i < width; i += 16) {
+    sum = vpadalq_u8(sum, vld1q_u8(ref + i));
   }
 
-  return (int16_t)horizontal_add_uint16x8(vec_sum);
+  return (int16_t)horizontal_add_uint16x8(sum);
 }
 
 // ref, src = [0, 510] - max diff = 16-bits
commit	c738e87f27ef8e12dd28b9052f446a5f69abf3c9	[log] [tgz]
author	Jonathan Wright <jonathan.wright@arm.com>	Tue May 30 14:22:04 2023
committer	Jonathan Wright <jonathan.wright@arm.com>	Wed May 31 13:30:02 2023
tree	c24856549456db876b489192cd3b47b7e43c4272
parent	99522d307ccef8b53d373beab8c5b6bf997ca4ef [diff]