Implement vpx_convolve8_avg_vert_neon using SDOT instruction Add an alternative AArch64 implementation of vpx_convolve8_avg_vert_neon for targets that implement the Armv8.4-A SDOT (signed dot product) instruction. The existing MLA-based implementation of vpx_convolve8_avg_vert_neon is retained and used on target CPUs that do not implement the SDOT instruction (or CPUs executing in AArch32 mode). The availability of the SDOT instruction is indicated by the feature macro __ARM_FEATURE_DOTPROD. Bug: b/181236880 Change-Id: I971c626116155e1384bff4c76fd3420312c7a15b

commit: 35bce9389ea875b57b352a0f5f532b96aa47bff6 [log] [tgz]
author: Jonathan Wright <jonathan.wright@arm.com> Sun May 23 12:35:15 2021
committer: James Zern <jzern@google.com> Tue May 25 00:12:19 2021
tree: 492d86b8cd8edb31bc18a9a12490af46b8573088
parent: 10823f54681747b9f64deb3002531c95cc67d17f [diff]
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c
index 25a59a2..06b58c4 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/vpx_dsp/arm/vpx_convolve8_neon.c

@@ -303,6 +303,467 @@
   }
 }
 
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x8_t range_limit = vdup_n_u8(128);
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  int8x16x2_t samples_LUT;
+
+  assert(!((intptr_t)dst & 3));
+  assert(!(dst_stride & 3));
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  src -= 3 * src_stride;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int32x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+    src += 4 * src_stride;
+    t4 = vld1_u8(src);
+    src += src_stride;
+    t5 = vld1_u8(src);
+    src += src_stride;
+    t6 = vld1_u8(src);
+    src += src_stride;
+
+    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+    s7 = vdup_n_s8(0);
+    s8 = vdup_n_s8(0);
+    s9 = vdup_n_s8(0);
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl);
+    transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl);
+    transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl);
+    transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl);
+
+    do {
+      uint8x8_t t7, t8, t9, t10;
+
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+      transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters);
+      d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters);
+      d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters);
+      d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters);
+
+      d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+      d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
+      dst += dst_stride;
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+      s += 4 * src_stride;
+      t4 = vld1_u8(s);
+      s += src_stride;
+      t5 = vld1_u8(s);
+      s += src_stride;
+      t6 = vld1_u8(s);
+      s += src_stride;
+
+      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+      s7 = vdup_n_s8(0);
+      s8 = vdup_n_s8(0);
+      s9 = vdup_n_s8(0);
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi,
+                           tran_concat_tbl);
+
+      do {
+        uint8x8_t t7, t8, t9, t10;
+
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+        transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                     correction, filters);
+        d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                     correction, filters);
+        d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                     correction, filters);
+        d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                     correction, filters);
+        vst1_u8(d + 0 * dst_stride, d0);
+        vst1_u8(d + 1 * dst_stride, d1);
+        vst1_u8(d + 2 * dst_stride, d2);
+        vst1_u8(d + 3 * dst_stride, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height > 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w > 0);
+  }
+}
+
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x8_t range_limit = vdup_n_u8(128);
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  int8x16x2_t samples_LUT;
+
+  assert(!((intptr_t)dst & 3));
+  assert(!(dst_stride & 3));
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  src -= 3 * src_stride;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int32x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23, dd01, dd23;
+
+    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+    src += 4 * src_stride;
+    t4 = vld1_u8(src);
+    src += src_stride;
+    t5 = vld1_u8(src);
+    src += src_stride;
+    t6 = vld1_u8(src);
+    src += src_stride;
+
+    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+    s7 = vdup_n_s8(0);
+    s8 = vdup_n_s8(0);
+    s9 = vdup_n_s8(0);
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl);
+    transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl);
+    transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl);
+    transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl);
+
+    do {
+      uint8x8_t t7, t8, t9, t10;
+
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+      transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters);
+      d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters);
+      d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters);
+      d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters);
+
+      d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+      d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
+      dst += dst_stride;
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+      s += 4 * src_stride;
+      t4 = vld1_u8(s);
+      s += src_stride;
+      t5 = vld1_u8(s);
+      s += src_stride;
+      t6 = vld1_u8(s);
+      s += src_stride;
+
+      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+      s7 = vdup_n_s8(0);
+      s8 = vdup_n_s8(0);
+      s9 = vdup_n_s8(0);
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi,
+                           tran_concat_tbl);
+
+      do {
+        uint8x8_t t7, t8, t9, t10;
+
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+        transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                     correction, filters);
+        d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                     correction, filters);
+        d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                     correction, filters);
+        d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                     correction, filters);
+
+        dd0 = vld1_u8(d + 0 * dst_stride);
+        dd1 = vld1_u8(d + 1 * dst_stride);
+        dd2 = vld1_u8(d + 2 * dst_stride);
+        dd3 = vld1_u8(d + 3 * dst_stride);
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        vst1_u8(d + 0 * dst_stride, d0);
+        vst1_u8(d + 1 * dst_stride, d1);
+        vst1_u8(d + 2 * dst_stride, d2);
+        vst1_u8(d + 3 * dst_stride, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height > 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w > 0);
+  }
+}
+
 #else
 
 static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
@@ -870,236 +1331,6 @@
   }
 }
 
-#endif
-
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
-    (__ARM_FEATURE_DOTPROD == 1)
-
-void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const InterpKernel *filter, int x0_q4,
-                             int x_step_q4, int y0_q4, int y_step_q4, int w,
-                             int h) {
-  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
-  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
-  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
-  const uint8x8_t range_limit = vdup_n_u8(128);
-  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
-  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
-  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-  int8x16x2_t samples_LUT;
-
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
-  assert(y_step_q4 == 16);
-
-  (void)x0_q4;
-  (void)x_step_q4;
-  (void)y_step_q4;
-
-  src -= 3 * src_stride;
-
-  if (w == 4) {
-    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
-    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
-    int32x4_t d0, d1, d2, d3;
-    uint8x8_t d01, d23;
-
-    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-    src += 4 * src_stride;
-    t4 = vld1_u8(src);
-    src += src_stride;
-    t5 = vld1_u8(src);
-    src += src_stride;
-    t6 = vld1_u8(src);
-    src += src_stride;
-
-    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
-    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
-    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
-    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
-    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
-    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
-    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-    s7 = vdup_n_s8(0);
-    s8 = vdup_n_s8(0);
-    s9 = vdup_n_s8(0);
-
-    /* This operation combines a conventional transpose and the sample permute
-     * (see horizontal case) required before computing the dot product.
-     */
-    transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl);
-    transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl);
-    transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl);
-    transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl);
-    transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl);
-    transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl);
-    transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl);
-
-    do {
-      uint8x8_t t7, t8, t9, t10;
-
-      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
-
-      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
-      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
-      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
-      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
-      transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl);
-
-      /* Merge new data into block from previous iteration. */
-      samples_LUT.val[0] = s3456;
-      samples_LUT.val[1] = s78910;
-      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
-      d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters);
-      d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters);
-      d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters);
-      d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters);
-
-      d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
-
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
-      dst += dst_stride;
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
-      dst += dst_stride;
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
-      dst += dst_stride;
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
-      dst += dst_stride;
-
-      /* Prepare block for next iteration - re-using as much as possible. */
-      /* Shuffle everything up four rows. */
-      s0123 = s4567;
-      s1234 = s5678;
-      s2345 = s6789;
-      s3456 = s78910;
-
-      src += 4 * src_stride;
-      h -= 4;
-    } while (h > 0);
-  } else {
-    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
-    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
-        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
-        s6789_hi, s78910_lo, s78910_hi;
-    uint8x8_t d0, d1, d2, d3;
-    const uint8_t *s;
-    uint8_t *d;
-    int height;
-
-    do {
-      height = h;
-      s = src;
-      d = dst;
-
-      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
-      s += 4 * src_stride;
-      t4 = vld1_u8(s);
-      s += src_stride;
-      t5 = vld1_u8(s);
-      s += src_stride;
-      t6 = vld1_u8(s);
-      s += src_stride;
-
-      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
-      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
-      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
-      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
-      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
-      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
-      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-      s7 = vdup_n_s8(0);
-      s8 = vdup_n_s8(0);
-      s9 = vdup_n_s8(0);
-
-      /* This operation combines a conventional transpose and the sample permute
-       * (see horizontal case) required before computing the dot product.
-       */
-      transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi,
-                           tran_concat_tbl);
-
-      do {
-        uint8x8_t t7, t8, t9, t10;
-
-        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
-
-        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
-        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
-        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
-        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
-        transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi,
-                             tran_concat_tbl);
-
-        /* Merge new data into block from previous iteration. */
-        samples_LUT.val[0] = s3456_lo;
-        samples_LUT.val[1] = s78910_lo;
-        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
-        samples_LUT.val[0] = s3456_hi;
-        samples_LUT.val[1] = s78910_hi;
-        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
-        d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
-                                     correction, filters);
-        d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
-                                     correction, filters);
-        d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
-                                     correction, filters);
-        d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
-                                     correction, filters);
-        vst1_u8(d + 0 * dst_stride, d0);
-        vst1_u8(d + 1 * dst_stride, d1);
-        vst1_u8(d + 2 * dst_stride, d2);
-        vst1_u8(d + 3 * dst_stride, d3);
-
-        /* Prepare block for next iteration - re-using as much as possible. */
-        /* Shuffle everything up four rows. */
-        s0123_lo = s4567_lo;
-        s0123_hi = s4567_hi;
-        s1234_lo = s5678_lo;
-        s1234_hi = s5678_hi;
-        s2345_lo = s6789_lo;
-        s2345_hi = s6789_hi;
-        s3456_lo = s78910_lo;
-        s3456_hi = s78910_hi;
-
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-      } while (height > 0);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    } while (w > 0);
-  }
-}
-
-#else
-
 void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *filter, int x0_q4,
@@ -1272,8 +1503,6 @@
   }
 }
 
-#endif
-
 void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
@@ -1464,3 +1693,5 @@
     } while (w != 0);
   }
 }
+
+#endif
commit	35bce9389ea875b57b352a0f5f532b96aa47bff6	[log] [tgz]
author	Jonathan Wright <jonathan.wright@arm.com>	Sun May 23 12:35:15 2021
committer	James Zern <jzern@google.com>	Tue May 25 00:12:19 2021
tree	492d86b8cd8edb31bc18a9a12490af46b8573088
parent	10823f54681747b9f64deb3002531c95cc67d17f [diff]