[NEON] Optimize and homogenize Butterfly DCT functions

Provide a set of commonly used Butterfly DCT functions for use in the
DCT 4x4, 8x8, 16x16 and 32x32 functions. These come in several forms;
the _fast variants use vqrdmulh_s16/vqrdmulh_s32, but are unfortunately
only usable in pass 1 of most DCTs, as they do not provide the
precision required by pass 2.

This gave a performance gain of 5% to 15% in the 16x16 case. For
32x32, the loads were also rearranged; together with the butterfly
optimizations this gave a 10% gain in the 32x32_rd function.

This refactoring was also necessary to allow easier porting of the
highbd 32x32 functions, which follow this patchset.
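
For reference, the _fast variants rely on the cospi constants being in
Q14: a saturating, rounding, doubling high-half multiply by 2*constant
(Q15) reproduces fdct_round_shift(x * constant) without a widening
multiply. A minimal sketch of the idea (illustrative only, not
necessarily the exact helper added in vpx_dsp/arm/fdct_neon.h):

  #include <arm_neon.h>

  // Sketch only: approximates
  //   *add = fdct_round_shift((a + b) * c)
  //   *sub = fdct_round_shift((a - b) * c)
  // where c is a Q14 cospi constant. vqrdmulh computes
  // sat((2 * x * k + (1 << 15)) >> 16), so passing k = 2 * c
  // gives round(x * c >> 14), i.e. fdct_round_shift, but with
  // 16-bit intermediates, hence the reduced pass 2 precision.
  static inline void butterfly_one_coeff_s16_fast_sketch(
      int16x8_t a, int16x8_t b, int16_t c, int16x8_t *add,
      int16x8_t *sub) {
    *add = vqrdmulhq_n_s16(vaddq_s16(a, b), (int16_t)(2 * c));
    *sub = vqrdmulhq_n_s16(vsubq_s16(a, b), (int16_t)(2 * c));
  }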

Change-Id: I6282e640b95a95938faff76c3b2bace3dc298bc3
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c
index a07a160..b8286a8 100644
--- a/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -18,6 +18,8 @@
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
 
 static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in,
                                    int stride) {
@@ -130,12 +132,14 @@
     case ADST_DCT:
       load_buffer_4x4(input, in, stride);
       fadst4x4_neon(in);
-      vpx_fdct4x4_pass1_neon((int16x4_t *)in);
+      // pass1 variant is not accurate enough
+      vpx_fdct4x4_pass2_neon((int16x4_t *)in);
       write_buffer_4x4(output, in);
       break;
     case DCT_ADST:
       load_buffer_4x4(input, in, stride);
-      vpx_fdct4x4_pass1_neon((int16x4_t *)in);
+      // pass1 variant is not accurate enough
+      vpx_fdct4x4_pass2_neon((int16x4_t *)in);
       fadst4x4_neon(in);
       write_buffer_4x4(output, in);
       break;
@@ -488,13 +492,15 @@
     case ADST_DCT:
       load_buffer_8x8(input, in, stride);
       fadst8x8_neon(in);
-      vpx_fdct8x8_pass1_neon(in);
+      // pass1 variant is not accurate enough
+      vpx_fdct8x8_pass2_neon(in);
       right_shift_8x8(in, 1);
       write_buffer_8x8(output, in, 8);
       break;
     case DCT_ADST:
       load_buffer_8x8(input, in, stride);
-      vpx_fdct8x8_pass1_neon(in);
+      // pass1 variant is not accurate enough
+      vpx_fdct8x8_pass2_neon(in);
       fadst8x8_neon(in);
       right_shift_8x8(in, 1);
       write_buffer_8x8(output, in, 8);
@@ -559,7 +565,8 @@
   i[6] = vaddq_s16(in[6], in[9]);
   i[7] = vaddq_s16(in[7], in[8]);
 
-  vpx_fdct8x8_pass1_neon(i);
+  // pass1 variant is not accurate enough
+  vpx_fdct8x8_pass2_neon(i);
   transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]);
 
   // step 2
diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c
index d0c07d4..a458eca 100644
--- a/vpx_dsp/arm/fdct16x16_neon.c
+++ b/vpx_dsp/arm/fdct16x16_neon.c
@@ -37,20 +37,21 @@
   // Left half.
   load_cross(input, stride, temp0);
   scale_input(temp0, temp1);
-  vpx_fdct16x16_body(temp1, temp0);
+  vpx_fdct8x16_body(temp1, temp0);
 
   // Right half.
   load_cross(input + 8, stride, temp1);
   scale_input(temp1, temp2);
-  vpx_fdct16x16_body(temp2, temp1);
+  vpx_fdct8x16_body(temp2, temp1);
 
   // Transpose top left and top right quarters into one contiguous location to
   // process to the top half.
+
   transpose_s16_8x8_new(&temp0[0], &temp2[0]);
   transpose_s16_8x8_new(&temp1[0], &temp2[8]);
   partial_round_shift(temp2);
   cross_input(temp2, temp3);
-  vpx_fdct16x16_body(temp3, temp2);
+  vpx_fdct8x16_body(temp3, temp2);
   transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4],
                     &temp2[5], &temp2[6], &temp2[7]);
   transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12],
@@ -62,11 +63,12 @@
   // Transpose bottom left and bottom right quarters into one contiguous
   // location to process to the bottom half.
   transpose_s16_8x8_new(&temp0[8], &temp1[0]);
+
   transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
                     &temp1[13], &temp1[14], &temp1[15]);
   partial_round_shift(temp1);
   cross_input(temp1, temp0);
-  vpx_fdct16x16_body(temp0, temp1);
+  vpx_fdct8x16_body(temp0, temp1);
   transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4],
                     &temp1[5], &temp1[6], &temp1[7]);
   transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
@@ -86,12 +88,12 @@
   // Left half.
   load_cross(input, stride, temp0);
   highbd_scale_input(temp0, left1, right1);
-  vpx_highbd_fdct16x16_body(left1, right1);
+  vpx_highbd_fdct8x16_body(left1, right1);
 
   // right half.
   load_cross(input + 8, stride, temp0);
   highbd_scale_input(temp0, left2, right2);
-  vpx_highbd_fdct16x16_body(left2, right2);
+  vpx_highbd_fdct8x16_body(left2, right2);
 
   // Transpose top left and top right quarters into one contiguous location to
   // process to the top half.
@@ -103,14 +105,14 @@
 
   highbd_partial_round_shift(left3, right3);
   highbd_cross_input(left3, right3, left1, right1);
-  vpx_highbd_fdct16x16_body(left1, right1);
+  vpx_highbd_fdct8x16_body(left1, right1);
 
   // Transpose bottom left and bottom right quarters into one contiguous
   // location to process to the bottom half.
 
   highbd_partial_round_shift(left4, right4);
   highbd_cross_input(left4, right4, left2, right2);
-  vpx_highbd_fdct16x16_body(left2, right2);
+  vpx_highbd_fdct8x16_body(left2, right2);
 
   transpose_s32_8x8_2(left1, right1, left3, right3);
   transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);
diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h
index d998709..43d820b 100644
--- a/vpx_dsp/arm/fdct16x16_neon.h
+++ b/vpx_dsp/arm/fdct16x16_neon.h
@@ -160,8 +160,8 @@
 }
 
 // Main body of fdct16x16.
-static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
-                               int16x8_t *out /*[16]*/) {
+static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/,
+                              int16x8_t *out /*[16]*/) {
   int16x8_t s[8];
   int16x8_t x[4];
   int16x8_t step[8];
@@ -186,16 +186,17 @@
 
   // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
   // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
-  butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]);
-  // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+  butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+                                          &out[8]);
+  // out[4]  = fdct_round_shift(x3 * cospi_8_64  + x2 * cospi_24_64);
   // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
-  butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]);
+  butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]);
 
   //  Stage 2
   // Re-using source s5/s6
   // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
   // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
-  butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]);
+  butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]);
 
   //  Stage 3
   x[0] = vaddq_s16(s[4], s[5]);
@@ -204,12 +205,12 @@
   x[3] = vaddq_s16(s[7], s[6]);
 
   // Stage 4
-  // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
-  // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
-  butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]);
-  // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 *  cospi_20_64)
-  // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
-  butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]);
+  // out[2]  = fdct_round_shift(x3 * cospi_4_64  + x0 * cospi_28_64)
+  // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
+  butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]);
+  // out[6]  = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
+  // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
+  butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]);
 
   // step 2
   // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
@@ -221,8 +222,8 @@
   // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
   // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
   // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
-  butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]);
-  butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]);
+  butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]);
+  butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]);
 
   // step 3
   s[0] = vaddq_s16(in[8], s[3]);
@@ -235,13 +236,15 @@
   s[7] = vaddq_s16(in[15], s[4]);
 
   // step 4
-  // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
-  // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
-  butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]);
+  // step2[6] = fdct_round_shift(step3[6] * cospi_8_64  + step3[1] *
+  // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1]
+  // * cospi_8_64)
+  butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]);
 
   // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
-  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
-  butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]);
+  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64  - step3[5] *
+  // cospi_24_64)
+  butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]);
 
   // step 5
   step[0] = vaddq_s16(s[0], s[1]);
@@ -254,22 +257,23 @@
   step[7] = vaddq_s16(s[7], s[6]);
 
   // step 6
-  // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
-  // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
-  // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
-  // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64)
-  // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64)
-  // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
-  // cospi_22_64)
-  // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
-  // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
-  butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9],
+  // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
+  // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
+  butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9],
                       &out[7]);
-  butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1],
+  // out[1]  = fdct_round_shift(step1[7] * cospi_2_64  + step1[0] * cospi_30_64)
+  // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
+  butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1],
                       &out[15]);
-  butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13],
+
+  // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
+  // out[3]  = fdct_round_shift(step1[4] * cospi_6_64  - step1[3] * cospi_26_64)
+  butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13],
                       &out[3]);
-  butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5],
+
+  // out[5]  = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
+  // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
+  butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5],
                       &out[11]);
 }
 
@@ -279,36 +283,37 @@
                                       int32x4_t *left /*[16]*/,
                                       int32x4_t *right /* [16] */) {
   left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
-  right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
   left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
-  right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
   left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
-  right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
   left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
-  right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
   left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
-  right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
   left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
-  right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
   left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
-  right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
   left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
-  right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
   left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
-  right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
   left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
-  right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
   left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
-  right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
   left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
-  right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
   left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
-  right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
   left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
-  right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
   left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
-  right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
   left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
+
+  right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
+  right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
+  right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
+  right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
+  right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
+  right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
+  right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
+  right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
+  right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
+  right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
+  right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
+  right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
+  right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
+  right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
+  right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
   right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
 }
 
@@ -357,83 +362,40 @@
                                               int32x4_t *right /* [16] */) {
   const int32x4_t one = vdupq_n_s32(1);
   left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2);
-  right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2);
   left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2);
-  right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2);
   left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2);
-  right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2);
   left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2);
-  right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2);
   left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2);
-  right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2);
   left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2);
-  right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2);
   left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2);
-  right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2);
   left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2);
-  right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2);
   left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2);
-  right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2);
   left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2);
-  right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2);
   left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2);
-  right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2);
   left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2);
-  right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2);
   left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2);
-  right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2);
   left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2);
-  right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2);
   left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2);
-  right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2);
   left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2);
+
+  right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2);
+  right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2);
+  right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2);
+  right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2);
+  right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2);
+  right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2);
+  right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2);
+  right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2);
+  right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2);
+  right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2);
+  right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2);
+  right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2);
+  right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2);
+  right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2);
+  right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2);
   right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2);
 }
 
-static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/,
-                                       int32x4_t *right /*[8]*/,
-                                       int32x4_t *out_left /*[8]*/,
-                                       int32x4_t *out_right /*[8]*/) {
-  int32x4x2_t out[8];
-
-  out[0].val[0] = left[0];
-  out[0].val[1] = right[0];
-  out[1].val[0] = left[1];
-  out[1].val[1] = right[1];
-  out[2].val[0] = left[2];
-  out[2].val[1] = right[2];
-  out[3].val[0] = left[3];
-  out[3].val[1] = right[3];
-  out[4].val[0] = left[4];
-  out[4].val[1] = right[4];
-  out[5].val[0] = left[5];
-  out[5].val[1] = right[5];
-  out[6].val[0] = left[6];
-  out[6].val[1] = right[6];
-  out[7].val[0] = left[7];
-  out[7].val[1] = right[7];
-
-  transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
-                    &out[6], &out[7]);
-
-  out_left[0] = out[0].val[0];
-  out_left[1] = out[1].val[0];
-  out_left[2] = out[2].val[0];
-  out_left[3] = out[3].val[0];
-  out_left[4] = out[4].val[0];
-  out_left[5] = out[5].val[0];
-  out_left[6] = out[6].val[0];
-  out_left[7] = out[7].val[0];
-  out_right[0] = out[0].val[1];
-  out_right[1] = out[1].val[1];
-  out_right[2] = out[2].val[1];
-  out_right[3] = out[3].val[1];
-  out_right[4] = out[4].val[1];
-  out_right[5] = out[5].val[1];
-  out_right[6] = out[6].val[1];
-  out_right[7] = out[7].val[1];
-}
-
 // Store 16 32x4 vectors, assuming stride == 16.
 static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) {
   vst1q_s32(a, b[0]);
@@ -469,9 +431,9 @@
   vst1q_s32(a, b[15]);
 }
 
-// Main body of fdct16x16.
-static void vpx_highbd_fdct16x16_body(int32x4_t *left /*[16]*/,
-                                      int32x4_t *right /* [16] */) {
+// Main body of fdct8x16 column
+static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/,
+                                     int32x4_t *right /* [16] */) {
   int32x4_t sl[8];
   int32x4_t sr[8];
   int32x4_t xl[4];
@@ -531,22 +493,21 @@
 
   // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
   // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
-  highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[8]);
-  highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0],
-                                 &right[8]);
-  // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+  butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+                               &left[0], &right[0], &left[8], &right[8]);
+
+  // out[4]  = fdct_round_shift(x3 * cospi_8_64  + x2 * cospi_24_64);
   // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
-  highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64,
-                                 &left[4], &left[12]);
-  highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64,
-                                 &right[4], &right[12]);
+  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+                                     cospi_24_64, &left[4], &right[4],
+                                     &left[12], &right[12]);
 
   //  Stage 2
   // Re-using source s5/s6
   // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
   // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
-  highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &sl[6], &sl[5]);
-  highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &sr[6], &sr[5]);
+  butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6],
+                               &sr[6], &sl[5], &sr[5]);
 
   //  Stage 3
   xl[0] = vaddq_s32(sl[4], sl[5]);
@@ -559,18 +520,16 @@
   xr[3] = vaddq_s32(sr[7], sr[6]);
 
   // Stage 4
-  // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
-  // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
-  highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64,
-                                 &left[2], &left[14]);
-  highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64,
-                                 &right[2], &right[14]);
-  // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 *  cospi_20_64)
-  // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
-  highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64,
-                                 &left[10], &left[6]);
-  highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64,
-                                 &right[10], &right[6]);
+  // out[2]  = fdct_round_shift(x3 * cospi_4_64  + x0 * cospi_28_64)
+  // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+                                     cospi_28_64, &left[2], &right[2],
+                                     &left[14], &right[14]);
+  // out[6]  = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
+  // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+                                     cospi_12_64, &left[10], &right[10],
+                                     &left[6], &right[6]);
 
   // step 2
   // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
@@ -582,10 +541,10 @@
   // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
   // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
   // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
-  highbd_butterfly_one_coeff_s32(inl[5], inl[2], cospi_16_64, &sl[5], &sl[2]);
-  highbd_butterfly_one_coeff_s32(inr[5], inr[2], cospi_16_64, &sr[5], &sr[2]);
-  highbd_butterfly_one_coeff_s32(inl[4], inl[3], cospi_16_64, &sl[4], &sl[3]);
-  highbd_butterfly_one_coeff_s32(inr[4], inr[3], cospi_16_64, &sr[4], &sr[3]);
+  butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64,
+                               &sl[5], &sr[5], &sl[2], &sr[2]);
+  butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64,
+                               &sl[4], &sr[4], &sl[3], &sr[3]);
 
   // step 3
   sl[0] = vaddq_s32(inl[0], sl[3]);
@@ -606,19 +565,18 @@
   sr[7] = vaddq_s32(inr[7], sr[4]);
 
   // step 4
-  // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
-  // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
-  highbd_butterfly_two_coeff_s32(sl[6], sl[1], cospi_8_64, cospi_24_64, &sl[6],
-                                 &sl[1]);
-  highbd_butterfly_two_coeff_s32(sr[6], sr[1], cospi_8_64, cospi_24_64, &sr[6],
-                                 &sr[1]);
-
+  // step2[6] = fdct_round_shift(step3[6] * cospi_8_64  + step3[1] *
+  // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1]
+  // * cospi_8_64)
+  butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64,
+                                     cospi_24_64, &sl[6], &sr[6], &sl[1],
+                                     &sr[1]);
   // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
-  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
-  highbd_butterfly_two_coeff_s32(xl[0], xl[3], cospi_24_64, cospi_8_64, &sl[2],
-                                 &sl[5]);
-  highbd_butterfly_two_coeff_s32(xr[0], xr[3], cospi_24_64, cospi_8_64, &sr[2],
-                                 &sr[5]);
+  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64  - step3[5] *
+  // cospi_24_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64,
+                                     cospi_8_64, &sl[2], &sr[2], &sl[5],
+                                     &sr[5]);
 
   // step 5
   stepl[0] = vaddq_s32(sl[0], sl[1]);
@@ -639,31 +597,26 @@
   stepr[7] = vaddq_s32(sr[7], sr[6]);
 
   // step 6
-  // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
-  // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
-  // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
-  // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
-  // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
-  // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
-  // cospi_22_64) out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] *
-  // cospi_26_64) out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] *
-  // cospi_6_64)
-  highbd_butterfly_two_coeff_s32(stepl[7], stepl[0], cospi_2_64, cospi_30_64,
-                                 &left[1], &left[15]);
-  highbd_butterfly_two_coeff_s32(stepr[7], stepr[0], cospi_2_64, cospi_30_64,
-                                 &right[1], &right[15]);
-  highbd_butterfly_two_coeff_s32(stepl[6], stepl[1], cospi_18_64, cospi_14_64,
-                                 &left[9], &left[7]);
-  highbd_butterfly_two_coeff_s32(stepr[6], stepr[1], cospi_18_64, cospi_14_64,
-                                 &right[9], &right[7]);
-  highbd_butterfly_two_coeff_s32(stepl[5], stepl[2], cospi_10_64, cospi_22_64,
-                                 &left[5], &left[11]);
-  highbd_butterfly_two_coeff_s32(stepr[5], stepr[2], cospi_10_64, cospi_22_64,
-                                 &right[5], &right[11]);
-  highbd_butterfly_two_coeff_s32(stepl[4], stepl[3], cospi_26_64, cospi_6_64,
-                                 &left[13], &left[3]);
-  highbd_butterfly_two_coeff_s32(stepr[4], stepr[3], cospi_26_64, cospi_6_64,
-                                 &right[13], &right[3]);
+  // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
+  // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
+  butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1],
+                                     cospi_18_64, cospi_14_64, &left[9],
+                                     &right[9], &left[7], &right[7]);
+  // out[1]  = fdct_round_shift(step1[7] * cospi_2_64  + step1[0] * cospi_30_64)
+  // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
+  butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0],
+                                     cospi_2_64, cospi_30_64, &left[1],
+                                     &right[1], &left[15], &right[15]);
+  // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
+  // out[3]  = fdct_round_shift(step1[4] * cospi_6_64  - step1[3] * cospi_26_64)
+  butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3],
+                                     cospi_26_64, cospi_6_64, &left[13],
+                                     &right[13], &left[3], &right[3]);
+  // out[5]  = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
+  // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
+  butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2],
+                                     cospi_10_64, cospi_22_64, &left[5],
+                                     &right[5], &left[11], &right[11]);
 }
 
 #endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c
index 51d81bd..e2bf167 100644
--- a/vpx_dsp/arm/fdct32x32_neon.c
+++ b/vpx_dsp/arm/fdct32x32_neon.c
@@ -16,6 +16,7 @@
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct32x32_neon.h"
 
 // Most gcc 4.9 distributions outside of Android do not generate correct code
 // for this function.
@@ -33,1123 +34,6 @@
 
 #else
 
-#define LOAD_INCREMENT(src, stride, dest, index) \
-  do {                                           \
-    dest[index] = vld1q_s16(src);                \
-    src += stride;                               \
-  } while (0)
-
-#define ADD_S16(src, index0, index1, dest, index3)      \
-  do {                                                  \
-    dest[index3] = vaddq_s16(src[index0], src[index1]); \
-  } while (0)
-
-#define ADD_SHIFT_S16(src, index0, index1)                             \
-  do {                                                                 \
-    src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \
-  } while (0)
-
-// Load, cross, and multiply by 4. Load the first 8 and last 8, then the
-// middle
-// 16. Doing sets of 16 at a time. Maybe sets of 8 would be better?
-static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
-  const int16_t *a_end = a + 24 * stride;
-  int16x8_t c[8];
-
-  LOAD_INCREMENT(a, stride, b, 0);
-  LOAD_INCREMENT(a, stride, b, 1);
-  LOAD_INCREMENT(a, stride, b, 2);
-  LOAD_INCREMENT(a, stride, b, 3);
-  LOAD_INCREMENT(a, stride, b, 4);
-  LOAD_INCREMENT(a, stride, b, 5);
-  LOAD_INCREMENT(a, stride, b, 6);
-  LOAD_INCREMENT(a, stride, b, 7);
-
-  LOAD_INCREMENT(a_end, stride, b, 24);
-  LOAD_INCREMENT(a_end, stride, b, 25);
-  LOAD_INCREMENT(a_end, stride, b, 26);
-  LOAD_INCREMENT(a_end, stride, b, 27);
-  LOAD_INCREMENT(a_end, stride, b, 28);
-  LOAD_INCREMENT(a_end, stride, b, 29);
-  LOAD_INCREMENT(a_end, stride, b, 30);
-  LOAD_INCREMENT(a_end, stride, b, 31);
-
-  ADD_S16(b, 0, 31, c, 0);
-  ADD_S16(b, 1, 30, c, 1);
-  ADD_S16(b, 2, 29, c, 2);
-  ADD_S16(b, 3, 28, c, 3);
-  ADD_S16(b, 4, 27, c, 4);
-  ADD_S16(b, 5, 26, c, 5);
-  ADD_S16(b, 6, 25, c, 6);
-  ADD_S16(b, 7, 24, c, 7);
-
-  ADD_SHIFT_S16(b, 7, 24);
-  ADD_SHIFT_S16(b, 6, 25);
-  ADD_SHIFT_S16(b, 5, 26);
-  ADD_SHIFT_S16(b, 4, 27);
-  ADD_SHIFT_S16(b, 3, 28);
-  ADD_SHIFT_S16(b, 2, 29);
-  ADD_SHIFT_S16(b, 1, 30);
-  ADD_SHIFT_S16(b, 0, 31);
-
-  b[0] = vshlq_n_s16(c[0], 2);
-  b[1] = vshlq_n_s16(c[1], 2);
-  b[2] = vshlq_n_s16(c[2], 2);
-  b[3] = vshlq_n_s16(c[3], 2);
-  b[4] = vshlq_n_s16(c[4], 2);
-  b[5] = vshlq_n_s16(c[5], 2);
-  b[6] = vshlq_n_s16(c[6], 2);
-  b[7] = vshlq_n_s16(c[7], 2);
-
-  LOAD_INCREMENT(a, stride, b, 8);
-  LOAD_INCREMENT(a, stride, b, 9);
-  LOAD_INCREMENT(a, stride, b, 10);
-  LOAD_INCREMENT(a, stride, b, 11);
-  LOAD_INCREMENT(a, stride, b, 12);
-  LOAD_INCREMENT(a, stride, b, 13);
-  LOAD_INCREMENT(a, stride, b, 14);
-  LOAD_INCREMENT(a, stride, b, 15);
-  LOAD_INCREMENT(a, stride, b, 16);
-  LOAD_INCREMENT(a, stride, b, 17);
-  LOAD_INCREMENT(a, stride, b, 18);
-  LOAD_INCREMENT(a, stride, b, 19);
-  LOAD_INCREMENT(a, stride, b, 20);
-  LOAD_INCREMENT(a, stride, b, 21);
-  LOAD_INCREMENT(a, stride, b, 22);
-  LOAD_INCREMENT(a, stride, b, 23);
-
-  ADD_S16(b, 8, 23, c, 0);
-  ADD_S16(b, 9, 22, c, 1);
-  ADD_S16(b, 10, 21, c, 2);
-  ADD_S16(b, 11, 20, c, 3);
-  ADD_S16(b, 12, 19, c, 4);
-  ADD_S16(b, 13, 18, c, 5);
-  ADD_S16(b, 14, 17, c, 6);
-  ADD_S16(b, 15, 16, c, 7);
-
-  ADD_SHIFT_S16(b, 15, 16);
-  ADD_SHIFT_S16(b, 14, 17);
-  ADD_SHIFT_S16(b, 13, 18);
-  ADD_SHIFT_S16(b, 12, 19);
-  ADD_SHIFT_S16(b, 11, 20);
-  ADD_SHIFT_S16(b, 10, 21);
-  ADD_SHIFT_S16(b, 9, 22);
-  ADD_SHIFT_S16(b, 8, 23);
-
-  b[8] = vshlq_n_s16(c[0], 2);
-  b[9] = vshlq_n_s16(c[1], 2);
-  b[10] = vshlq_n_s16(c[2], 2);
-  b[11] = vshlq_n_s16(c[3], 2);
-  b[12] = vshlq_n_s16(c[4], 2);
-  b[13] = vshlq_n_s16(c[5], 2);
-  b[14] = vshlq_n_s16(c[6], 2);
-  b[15] = vshlq_n_s16(c[7], 2);
-}
-
-#undef LOAD_INCREMENT
-#undef ADD_S16
-#undef ADD_SHIFT_S16
-
-#define STORE_S16(src, index, dest)           \
-  do {                                        \
-    store_s16q_to_tran_low(dest, src[index]); \
-    dest += 8;                                \
-  } while (0)
-
-// Store 32 16x8 values, assuming stride == 32.
-// Slight twist: store horizontally in blocks of 8.
-static INLINE void store(tran_low_t *a, const int16x8_t *b) {
-  STORE_S16(b, 0, a);
-  STORE_S16(b, 8, a);
-  STORE_S16(b, 16, a);
-  STORE_S16(b, 24, a);
-  STORE_S16(b, 1, a);
-  STORE_S16(b, 9, a);
-  STORE_S16(b, 17, a);
-  STORE_S16(b, 25, a);
-  STORE_S16(b, 2, a);
-  STORE_S16(b, 10, a);
-  STORE_S16(b, 18, a);
-  STORE_S16(b, 26, a);
-  STORE_S16(b, 3, a);
-  STORE_S16(b, 11, a);
-  STORE_S16(b, 19, a);
-  STORE_S16(b, 27, a);
-  STORE_S16(b, 4, a);
-  STORE_S16(b, 12, a);
-  STORE_S16(b, 20, a);
-  STORE_S16(b, 28, a);
-  STORE_S16(b, 5, a);
-  STORE_S16(b, 13, a);
-  STORE_S16(b, 21, a);
-  STORE_S16(b, 29, a);
-  STORE_S16(b, 6, a);
-  STORE_S16(b, 14, a);
-  STORE_S16(b, 22, a);
-  STORE_S16(b, 30, a);
-  STORE_S16(b, 7, a);
-  STORE_S16(b, 15, a);
-  STORE_S16(b, 23, a);
-  STORE_S16(b, 31, a);
-}
-
-#undef STORE_S16
-
-static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
-  int16x8_t a[32];
-  int16x8_t b[32];
-
-  // Stage 1: Done as part of the load.
-
-  // Stage 2.
-  // Mini cross. X the first 16 values and the middle 8 of the second half.
-  a[0] = vaddq_s16(in[0], in[15]);
-  a[1] = vaddq_s16(in[1], in[14]);
-  a[2] = vaddq_s16(in[2], in[13]);
-  a[3] = vaddq_s16(in[3], in[12]);
-  a[4] = vaddq_s16(in[4], in[11]);
-  a[5] = vaddq_s16(in[5], in[10]);
-  a[6] = vaddq_s16(in[6], in[9]);
-  a[7] = vaddq_s16(in[7], in[8]);
-
-  a[8] = vsubq_s16(in[7], in[8]);
-  a[9] = vsubq_s16(in[6], in[9]);
-  a[10] = vsubq_s16(in[5], in[10]);
-  a[11] = vsubq_s16(in[4], in[11]);
-  a[12] = vsubq_s16(in[3], in[12]);
-  a[13] = vsubq_s16(in[2], in[13]);
-  a[14] = vsubq_s16(in[1], in[14]);
-  a[15] = vsubq_s16(in[0], in[15]);
-
-  a[16] = in[16];
-  a[17] = in[17];
-  a[18] = in[18];
-  a[19] = in[19];
-
-  butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]);
-  butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]);
-  butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]);
-  butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]);
-
-  a[28] = in[28];
-  a[29] = in[29];
-  a[30] = in[30];
-  a[31] = in[31];
-
-  // Stage 3.
-  b[0] = vaddq_s16(a[0], a[7]);
-  b[1] = vaddq_s16(a[1], a[6]);
-  b[2] = vaddq_s16(a[2], a[5]);
-  b[3] = vaddq_s16(a[3], a[4]);
-
-  b[4] = vsubq_s16(a[3], a[4]);
-  b[5] = vsubq_s16(a[2], a[5]);
-  b[6] = vsubq_s16(a[1], a[6]);
-  b[7] = vsubq_s16(a[0], a[7]);
-
-  b[8] = a[8];
-  b[9] = a[9];
-
-  butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]);
-  butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]);
-
-  b[14] = a[14];
-  b[15] = a[15];
-
-  b[16] = vaddq_s16(in[16], a[23]);
-  b[17] = vaddq_s16(in[17], a[22]);
-  b[18] = vaddq_s16(in[18], a[21]);
-  b[19] = vaddq_s16(in[19], a[20]);
-
-  b[20] = vsubq_s16(in[19], a[20]);
-  b[21] = vsubq_s16(in[18], a[21]);
-  b[22] = vsubq_s16(in[17], a[22]);
-  b[23] = vsubq_s16(in[16], a[23]);
-
-  b[24] = vsubq_s16(in[31], a[24]);
-  b[25] = vsubq_s16(in[30], a[25]);
-  b[26] = vsubq_s16(in[29], a[26]);
-  b[27] = vsubq_s16(in[28], a[27]);
-
-  b[28] = vaddq_s16(in[28], a[27]);
-  b[29] = vaddq_s16(in[29], a[26]);
-  b[30] = vaddq_s16(in[30], a[25]);
-  b[31] = vaddq_s16(in[31], a[24]);
-
-  // Stage 4.
-  a[0] = vaddq_s16(b[0], b[3]);
-  a[1] = vaddq_s16(b[1], b[2]);
-  a[2] = vsubq_s16(b[1], b[2]);
-  a[3] = vsubq_s16(b[0], b[3]);
-
-  a[4] = b[4];
-
-  butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]);
-
-  a[7] = b[7];
-
-  a[8] = vaddq_s16(b[8], b[11]);
-  a[9] = vaddq_s16(b[9], b[10]);
-  a[10] = vsubq_s16(b[9], b[10]);
-  a[11] = vsubq_s16(b[8], b[11]);
-  a[12] = vsubq_s16(b[15], b[12]);
-  a[13] = vsubq_s16(b[14], b[13]);
-  a[14] = vaddq_s16(b[14], b[13]);
-  a[15] = vaddq_s16(b[15], b[12]);
-
-  a[16] = b[16];
-  a[17] = b[17];
-
-  butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]);
-  butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]);
-  butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]);
-  butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]);
-
-  a[22] = b[22];
-  a[23] = b[23];
-  a[24] = b[24];
-  a[25] = b[25];
-
-  a[30] = b[30];
-  a[31] = b[31];
-
-  // Stage 5.
-  butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]);
-  butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]);
-
-  b[4] = vaddq_s16(a[4], a[5]);
-  b[5] = vsubq_s16(a[4], a[5]);
-  b[6] = vsubq_s16(a[7], a[6]);
-  b[7] = vaddq_s16(a[7], a[6]);
-
-  b[8] = a[8];
-
-  butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]);
-  butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]);
-
-  b[11] = a[11];
-  b[12] = a[12];
-
-  b[15] = a[15];
-
-  b[16] = vaddq_s16(a[19], a[16]);
-  b[17] = vaddq_s16(a[18], a[17]);
-  b[18] = vsubq_s16(a[17], a[18]);
-  b[19] = vsubq_s16(a[16], a[19]);
-  b[20] = vsubq_s16(a[23], a[20]);
-  b[21] = vsubq_s16(a[22], a[21]);
-  b[22] = vaddq_s16(a[21], a[22]);
-  b[23] = vaddq_s16(a[20], a[23]);
-  b[24] = vaddq_s16(a[27], a[24]);
-  b[25] = vaddq_s16(a[26], a[25]);
-  b[26] = vsubq_s16(a[25], a[26]);
-  b[27] = vsubq_s16(a[24], a[27]);
-  b[28] = vsubq_s16(a[31], a[28]);
-  b[29] = vsubq_s16(a[30], a[29]);
-  b[30] = vaddq_s16(a[29], a[30]);
-  b[31] = vaddq_s16(a[28], a[31]);
-
-  // Stage 6.
-  a[0] = b[0];
-  a[1] = b[1];
-  a[2] = b[2];
-  a[3] = b[3];
-
-  butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]);
-  butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]);
-
-  a[8] = vaddq_s16(b[8], b[9]);
-  a[9] = vsubq_s16(b[8], b[9]);
-  a[10] = vsubq_s16(b[11], b[10]);
-  a[11] = vaddq_s16(b[11], b[10]);
-  a[12] = vaddq_s16(b[12], b[13]);
-  a[13] = vsubq_s16(b[12], b[13]);
-  a[14] = vsubq_s16(b[15], b[14]);
-  a[15] = vaddq_s16(b[15], b[14]);
-
-  a[16] = b[16];
-  a[19] = b[19];
-  a[20] = b[20];
-  a[23] = b[23];
-  a[24] = b[24];
-  a[27] = b[27];
-  a[28] = b[28];
-  a[31] = b[31];
-
-  butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]);
-  butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]);
-
-  butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]);
-  butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]);
-
-  // Stage 7.
-  b[0] = a[0];
-  b[1] = a[1];
-  b[2] = a[2];
-  b[3] = a[3];
-  b[4] = a[4];
-  b[5] = a[5];
-  b[6] = a[6];
-  b[7] = a[7];
-
-  butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]);
-  butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]);
-  butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]);
-  butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]);
-
-  b[16] = vaddq_s16(a[16], a[17]);
-  b[17] = vsubq_s16(a[16], a[17]);
-  b[18] = vsubq_s16(a[19], a[18]);
-  b[19] = vaddq_s16(a[19], a[18]);
-  b[20] = vaddq_s16(a[20], a[21]);
-  b[21] = vsubq_s16(a[20], a[21]);
-  b[22] = vsubq_s16(a[23], a[22]);
-  b[23] = vaddq_s16(a[23], a[22]);
-  b[24] = vaddq_s16(a[24], a[25]);
-  b[25] = vsubq_s16(a[24], a[25]);
-  b[26] = vsubq_s16(a[27], a[26]);
-  b[27] = vaddq_s16(a[27], a[26]);
-  b[28] = vaddq_s16(a[28], a[29]);
-  b[29] = vsubq_s16(a[28], a[29]);
-  b[30] = vsubq_s16(a[31], a[30]);
-  b[31] = vaddq_s16(a[31], a[30]);
-
-  // Final stage.
-  // Also compute partial rounding shift:
-  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-  out[0] = sub_round_shift(b[0]);
-  out[16] = sub_round_shift(b[1]);
-  out[8] = sub_round_shift(b[2]);
-  out[24] = sub_round_shift(b[3]);
-  out[4] = sub_round_shift(b[4]);
-  out[20] = sub_round_shift(b[5]);
-  out[12] = sub_round_shift(b[6]);
-  out[28] = sub_round_shift(b[7]);
-  out[2] = sub_round_shift(b[8]);
-  out[18] = sub_round_shift(b[9]);
-  out[10] = sub_round_shift(b[10]);
-  out[26] = sub_round_shift(b[11]);
-  out[6] = sub_round_shift(b[12]);
-  out[22] = sub_round_shift(b[13]);
-  out[14] = sub_round_shift(b[14]);
-  out[30] = sub_round_shift(b[15]);
-
-  butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]);
-  out[1] = sub_round_shift(a[1]);
-  out[31] = sub_round_shift(a[31]);
-
-  butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]);
-  out[17] = sub_round_shift(a[17]);
-  out[15] = sub_round_shift(a[15]);
-
-  butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]);
-  out[9] = sub_round_shift(a[9]);
-  out[23] = sub_round_shift(a[23]);
-
-  butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]);
-  out[25] = sub_round_shift(a[25]);
-  out[7] = sub_round_shift(a[7]);
-
-  butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]);
-  out[5] = sub_round_shift(a[5]);
-  out[27] = sub_round_shift(a[27]);
-
-  butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]);
-  out[21] = sub_round_shift(a[21]);
-  out[11] = sub_round_shift(a[11]);
-
-  butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]);
-  out[13] = sub_round_shift(a[13]);
-  out[19] = sub_round_shift(a[19]);
-
-  butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]);
-  out[29] = sub_round_shift(a[29]);
-  out[3] = sub_round_shift(a[3]);
-}
-
-#define PASS_THROUGH(src, dst, element)    \
-  do {                                     \
-    dst##_lo[element] = src##_lo[element]; \
-    dst##_hi[element] = src##_hi[element]; \
-  } while (0)
-
-#define ADD_S16_S32(a, left_index, right_index, b, b_index)                   \
-  do {                                                                        \
-    b##_lo[b_index] =                                                         \
-        vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
-    b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]),                 \
-                                vget_high_s16(a[right_index]));               \
-  } while (0)
-
-#define SUB_S16_S32(a, left_index, right_index, b, b_index)                   \
-  do {                                                                        \
-    b##_lo[b_index] =                                                         \
-        vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
-    b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]),                 \
-                                vget_high_s16(a[right_index]));               \
-  } while (0)
-
-#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index)                     \
-  do {                                                                       \
-    c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index]));  \
-    c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
-  } while (0)
-
-#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
-  do {                                                                     \
-    temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index]));           \
-    temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index]));          \
-    c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]);   \
-    c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]);   \
-  } while (0)
-
-#define ADD_S32(a, left_index, right_index, b, b_index)                   \
-  do {                                                                    \
-    b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
-    b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
-  } while (0)
-
-#define SUB_S32(a, left_index, right_index, b, b_index)                   \
-  do {                                                                    \
-    b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
-    b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
-  } while (0)
-
-#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b,   \
-                              add_index, sub_index)                      \
-  do {                                                                   \
-    butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
-                                &b##_lo[add_index], &b##_hi[add_index],  \
-                                &b##_lo[sub_index], &b##_hi[sub_index]); \
-  } while (0)
-
-#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
-                          sub_index)                                          \
-  do {                                                                        \
-    butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index],           \
-                            a##_lo[right_index], a##_hi[right_index],         \
-                            constant, &b##_lo[add_index], &b##_hi[add_index], \
-                            &b##_lo[sub_index], &b##_hi[sub_index]);          \
-  } while (0)
-
-#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant,           \
-                          right_constant, b, add_index, sub_index)             \
-  do {                                                                         \
-    butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index],            \
-                            a##_lo[right_index], a##_hi[right_index],          \
-                            left_constant, right_constant, &b##_lo[add_index], \
-                            &b##_hi[add_index], &b##_lo[sub_index],            \
-                            &b##_hi[sub_index]);                               \
-  } while (0)
-
-static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
-  int16x8_t a[32];
-  int16x8_t b[32];
-  int32x4_t c_lo[32];
-  int32x4_t c_hi[32];
-  int32x4_t d_lo[32];
-  int32x4_t d_hi[32];
-
-  // Stage 1. Done as part of the load for the first pass.
-  a[0] = vaddq_s16(in[0], in[31]);
-  a[1] = vaddq_s16(in[1], in[30]);
-  a[2] = vaddq_s16(in[2], in[29]);
-  a[3] = vaddq_s16(in[3], in[28]);
-  a[4] = vaddq_s16(in[4], in[27]);
-  a[5] = vaddq_s16(in[5], in[26]);
-  a[6] = vaddq_s16(in[6], in[25]);
-  a[7] = vaddq_s16(in[7], in[24]);
-  a[8] = vaddq_s16(in[8], in[23]);
-  a[9] = vaddq_s16(in[9], in[22]);
-  a[10] = vaddq_s16(in[10], in[21]);
-  a[11] = vaddq_s16(in[11], in[20]);
-  a[12] = vaddq_s16(in[12], in[19]);
-  a[13] = vaddq_s16(in[13], in[18]);
-  a[14] = vaddq_s16(in[14], in[17]);
-  a[15] = vaddq_s16(in[15], in[16]);
-  a[16] = vsubq_s16(in[15], in[16]);
-  a[17] = vsubq_s16(in[14], in[17]);
-  a[18] = vsubq_s16(in[13], in[18]);
-  a[19] = vsubq_s16(in[12], in[19]);
-  a[20] = vsubq_s16(in[11], in[20]);
-  a[21] = vsubq_s16(in[10], in[21]);
-  a[22] = vsubq_s16(in[9], in[22]);
-  a[23] = vsubq_s16(in[8], in[23]);
-  a[24] = vsubq_s16(in[7], in[24]);
-  a[25] = vsubq_s16(in[6], in[25]);
-  a[26] = vsubq_s16(in[5], in[26]);
-  a[27] = vsubq_s16(in[4], in[27]);
-  a[28] = vsubq_s16(in[3], in[28]);
-  a[29] = vsubq_s16(in[2], in[29]);
-  a[30] = vsubq_s16(in[1], in[30]);
-  a[31] = vsubq_s16(in[0], in[31]);
-
-  // Stage 2.
-  b[0] = vaddq_s16(a[0], a[15]);
-  b[1] = vaddq_s16(a[1], a[14]);
-  b[2] = vaddq_s16(a[2], a[13]);
-  b[3] = vaddq_s16(a[3], a[12]);
-  b[4] = vaddq_s16(a[4], a[11]);
-  b[5] = vaddq_s16(a[5], a[10]);
-  b[6] = vaddq_s16(a[6], a[9]);
-  b[7] = vaddq_s16(a[7], a[8]);
-
-  b[8] = vsubq_s16(a[7], a[8]);
-  b[9] = vsubq_s16(a[6], a[9]);
-  b[10] = vsubq_s16(a[5], a[10]);
-  b[11] = vsubq_s16(a[4], a[11]);
-  b[12] = vsubq_s16(a[3], a[12]);
-  b[13] = vsubq_s16(a[2], a[13]);
-  b[14] = vsubq_s16(a[1], a[14]);
-  b[15] = vsubq_s16(a[0], a[15]);
-
-  b[16] = a[16];
-  b[17] = a[17];
-  b[18] = a[18];
-  b[19] = a[19];
-
-  butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
-  butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
-  butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
-  butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
-
-  b[28] = a[28];
-  b[29] = a[29];
-  b[30] = a[30];
-  b[31] = a[31];
-
-  // Stage 3. With extreme values for input this calculation rolls over int16_t.
-  // The sources for b[0] get added multiple times and, through testing, have
-  // been shown to overflow starting here.
-  ADD_S16_S32(b, 0, 7, c, 0);
-  ADD_S16_S32(b, 1, 6, c, 1);
-  ADD_S16_S32(b, 2, 5, c, 2);
-  ADD_S16_S32(b, 3, 4, c, 3);
-  SUB_S16_S32(b, 3, 4, c, 4);
-  SUB_S16_S32(b, 2, 5, c, 5);
-  SUB_S16_S32(b, 1, 6, c, 6);
-  SUB_S16_S32(b, 0, 7, c, 7);
-
-  a[8] = b[8];
-  a[9] = b[9];
-
-  BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
-  BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
-
-  a[14] = b[14];
-  a[15] = b[15];
-
-  ADD_S16_S32(b, 16, 23, c, 16);
-  ADD_S16_S32(b, 17, 22, c, 17);
-  ADD_S16_S32(b, 18, 21, c, 18);
-  ADD_S16_S32(b, 19, 20, c, 19);
-  SUB_S16_S32(b, 19, 20, c, 20);
-  SUB_S16_S32(b, 18, 21, c, 21);
-  SUB_S16_S32(b, 17, 22, c, 22);
-  SUB_S16_S32(b, 16, 23, c, 23);
-  SUB_S16_S32(b, 31, 24, c, 24);
-  SUB_S16_S32(b, 30, 25, c, 25);
-  SUB_S16_S32(b, 29, 26, c, 26);
-  SUB_S16_S32(b, 28, 27, c, 27);
-  ADD_S16_S32(b, 28, 27, c, 28);
-  ADD_S16_S32(b, 29, 26, c, 29);
-  ADD_S16_S32(b, 30, 25, c, 30);
-  ADD_S16_S32(b, 31, 24, c, 31);
-
-  // Stage 4.
-  ADD_S32(c, 0, 3, d, 0);
-  ADD_S32(c, 1, 2, d, 1);
-  SUB_S32(c, 1, 2, d, 2);
-  SUB_S32(c, 0, 3, d, 3);
-
-  PASS_THROUGH(c, d, 4);
-
-  BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
-
-  PASS_THROUGH(c, d, 7);
-
-  ADDW_S16_S32(c, 11, a, 8, d, 8);
-  ADDW_S16_S32(c, 10, a, 9, d, 9);
-  SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
-  SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
-  SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
-  SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
-  ADDW_S16_S32(c, 13, b, 14, d, 14);
-  ADDW_S16_S32(c, 12, b, 15, d, 15);
-
-  PASS_THROUGH(c, d, 16);
-  PASS_THROUGH(c, d, 17);
-
-  BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18);
-  BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19);
-  BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20);
-  BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21);
-
-  PASS_THROUGH(c, d, 22);
-  PASS_THROUGH(c, d, 23);
-  PASS_THROUGH(c, d, 24);
-  PASS_THROUGH(c, d, 25);
-
-  PASS_THROUGH(c, d, 30);
-  PASS_THROUGH(c, d, 31);
-
-  // Stage 5.
-  BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
-  BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3);
-
-  ADD_S32(d, 4, 5, c, 4);
-  SUB_S32(d, 4, 5, c, 5);
-  SUB_S32(d, 7, 6, c, 6);
-  ADD_S32(d, 7, 6, c, 7);
-
-  PASS_THROUGH(d, c, 8);
-
-  BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9);
-  BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10);
-
-  PASS_THROUGH(d, c, 11);
-  PASS_THROUGH(d, c, 12);
-  PASS_THROUGH(d, c, 15);
-
-  ADD_S32(d, 16, 19, c, 16);
-  ADD_S32(d, 17, 18, c, 17);
-  SUB_S32(d, 17, 18, c, 18);
-  SUB_S32(d, 16, 19, c, 19);
-  SUB_S32(d, 23, 20, c, 20);
-  SUB_S32(d, 22, 21, c, 21);
-  ADD_S32(d, 22, 21, c, 22);
-  ADD_S32(d, 23, 20, c, 23);
-  ADD_S32(d, 24, 27, c, 24);
-  ADD_S32(d, 25, 26, c, 25);
-  SUB_S32(d, 25, 26, c, 26);
-  SUB_S32(d, 24, 27, c, 27);
-  SUB_S32(d, 31, 28, c, 28);
-  SUB_S32(d, 30, 29, c, 29);
-  ADD_S32(d, 30, 29, c, 30);
-  ADD_S32(d, 31, 28, c, 31);
-
-  // Stage 6.
-  PASS_THROUGH(c, d, 0);
-  PASS_THROUGH(c, d, 1);
-  PASS_THROUGH(c, d, 2);
-  PASS_THROUGH(c, d, 3);
-
-  BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7);
-  BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6);
-
-  ADD_S32(c, 8, 9, d, 8);
-  SUB_S32(c, 8, 9, d, 9);
-  SUB_S32(c, 11, 10, d, 10);
-  ADD_S32(c, 11, 10, d, 11);
-  ADD_S32(c, 12, 13, d, 12);
-  SUB_S32(c, 12, 13, d, 13);
-  SUB_S32(c, 15, 14, d, 14);
-  ADD_S32(c, 15, 14, d, 15);
-
-  PASS_THROUGH(c, d, 16);
-  PASS_THROUGH(c, d, 19);
-  PASS_THROUGH(c, d, 20);
-  PASS_THROUGH(c, d, 23);
-  PASS_THROUGH(c, d, 24);
-  PASS_THROUGH(c, d, 27);
-  PASS_THROUGH(c, d, 28);
-  PASS_THROUGH(c, d, 31);
-
-  BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17);
-  BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18);
-  BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21);
-  BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22);
-
-  // Stage 7.
-  PASS_THROUGH(d, c, 0);
-  PASS_THROUGH(d, c, 1);
-  PASS_THROUGH(d, c, 2);
-  PASS_THROUGH(d, c, 3);
-  PASS_THROUGH(d, c, 4);
-  PASS_THROUGH(d, c, 5);
-  PASS_THROUGH(d, c, 6);
-  PASS_THROUGH(d, c, 7);
-
-  BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15);
-  BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14);
-  BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13);
-  BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12);
-
-  ADD_S32(d, 16, 17, c, 16);
-  SUB_S32(d, 16, 17, c, 17);
-  SUB_S32(d, 19, 18, c, 18);
-  ADD_S32(d, 19, 18, c, 19);
-  ADD_S32(d, 20, 21, c, 20);
-  SUB_S32(d, 20, 21, c, 21);
-  SUB_S32(d, 23, 22, c, 22);
-  ADD_S32(d, 23, 22, c, 23);
-  ADD_S32(d, 24, 25, c, 24);
-  SUB_S32(d, 24, 25, c, 25);
-  SUB_S32(d, 27, 26, c, 26);
-  ADD_S32(d, 27, 26, c, 27);
-  ADD_S32(d, 28, 29, c, 28);
-  SUB_S32(d, 28, 29, c, 29);
-  SUB_S32(d, 31, 30, c, 30);
-  ADD_S32(d, 31, 30, c, 31);
-
-  // Final stage.
-  // Roll rounding into this function so we can pass back int16x8.
-
-  out[0] = add_round_shift_s32(c_lo[0], c_hi[0]);
-  out[16] = add_round_shift_s32(c_lo[1], c_hi[1]);
-
-  out[8] = add_round_shift_s32(c_lo[2], c_hi[2]);
-  out[24] = add_round_shift_s32(c_lo[3], c_hi[3]);
-  out[4] = add_round_shift_s32(c_lo[4], c_hi[4]);
-  out[20] = add_round_shift_s32(c_lo[5], c_hi[5]);
-  out[12] = add_round_shift_s32(c_lo[6], c_hi[6]);
-
-  out[28] = add_round_shift_s32(c_lo[7], c_hi[7]);
-  out[2] = add_round_shift_s32(c_lo[8], c_hi[8]);
-  out[18] = add_round_shift_s32(c_lo[9], c_hi[9]);
-  out[10] = add_round_shift_s32(c_lo[10], c_hi[10]);
-
-  out[26] = add_round_shift_s32(c_lo[11], c_hi[11]);
-  out[6] = add_round_shift_s32(c_lo[12], c_hi[12]);
-  out[22] = add_round_shift_s32(c_lo[13], c_hi[13]);
-  out[14] = add_round_shift_s32(c_lo[14], c_hi[14]);
-  out[30] = add_round_shift_s32(c_lo[15], c_hi[15]);
-
-  BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31);
-  out[1] = add_round_shift_s32(d_lo[1], d_hi[1]);
-  out[31] = add_round_shift_s32(d_lo[31], d_hi[31]);
-
-  BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15);
-  out[17] = add_round_shift_s32(d_lo[17], d_hi[17]);
-  out[15] = add_round_shift_s32(d_lo[15], d_hi[15]);
-
-  BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23);
-  out[9] = add_round_shift_s32(d_lo[9], d_hi[9]);
-  out[23] = add_round_shift_s32(d_lo[23], d_hi[23]);
-
-  BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7);
-  out[25] = add_round_shift_s32(d_lo[25], d_hi[25]);
-  out[7] = add_round_shift_s32(d_lo[7], d_hi[7]);
-
-  BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27);
-  out[5] = add_round_shift_s32(d_lo[5], d_hi[5]);
-  out[27] = add_round_shift_s32(d_lo[27], d_hi[27]);
-
-  BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11);
-  out[21] = add_round_shift_s32(d_lo[21], d_hi[21]);
-  out[11] = add_round_shift_s32(d_lo[11], d_hi[11]);
-
-  BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19);
-  out[13] = add_round_shift_s32(d_lo[13], d_hi[13]);
-  out[19] = add_round_shift_s32(d_lo[19], d_hi[19]);
-
-  BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3);
-  out[29] = add_round_shift_s32(d_lo[29], d_hi[29]);
-  out[3] = add_round_shift_s32(d_lo[3], d_hi[3]);
-}
-
-static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) {
-  int16x8_t a[32];
-  int16x8_t b[32];
-
-  // Stage 1. Done as part of the load for the first pass.
-  a[0] = vaddq_s16(in[0], in[31]);
-  a[1] = vaddq_s16(in[1], in[30]);
-  a[2] = vaddq_s16(in[2], in[29]);
-  a[3] = vaddq_s16(in[3], in[28]);
-  a[4] = vaddq_s16(in[4], in[27]);
-  a[5] = vaddq_s16(in[5], in[26]);
-  a[6] = vaddq_s16(in[6], in[25]);
-  a[7] = vaddq_s16(in[7], in[24]);
-  a[8] = vaddq_s16(in[8], in[23]);
-  a[9] = vaddq_s16(in[9], in[22]);
-  a[10] = vaddq_s16(in[10], in[21]);
-  a[11] = vaddq_s16(in[11], in[20]);
-  a[12] = vaddq_s16(in[12], in[19]);
-  a[13] = vaddq_s16(in[13], in[18]);
-  a[14] = vaddq_s16(in[14], in[17]);
-  a[15] = vaddq_s16(in[15], in[16]);
-  a[16] = vsubq_s16(in[15], in[16]);
-  a[17] = vsubq_s16(in[14], in[17]);
-  a[18] = vsubq_s16(in[13], in[18]);
-  a[19] = vsubq_s16(in[12], in[19]);
-  a[20] = vsubq_s16(in[11], in[20]);
-  a[21] = vsubq_s16(in[10], in[21]);
-  a[22] = vsubq_s16(in[9], in[22]);
-  a[23] = vsubq_s16(in[8], in[23]);
-  a[24] = vsubq_s16(in[7], in[24]);
-  a[25] = vsubq_s16(in[6], in[25]);
-  a[26] = vsubq_s16(in[5], in[26]);
-  a[27] = vsubq_s16(in[4], in[27]);
-  a[28] = vsubq_s16(in[3], in[28]);
-  a[29] = vsubq_s16(in[2], in[29]);
-  a[30] = vsubq_s16(in[1], in[30]);
-  a[31] = vsubq_s16(in[0], in[31]);
-
-  // Stage 2.
-  // For the "rd" version, all the values are rounded down after stage 2 to keep
-  // the values in 16 bits.
-  b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
-  b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
-  b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
-  b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
-  b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
-  b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
-  b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
-  b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
-
-  b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
-  b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
-  b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
-  b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
-  b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
-  b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
-  b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
-  b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
-
-  b[16] = add_round_shift_s16(a[16]);
-  b[17] = add_round_shift_s16(a[17]);
-  b[18] = add_round_shift_s16(a[18]);
-  b[19] = add_round_shift_s16(a[19]);
-
-  butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
-  butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
-  butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
-  butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
-  b[20] = add_round_shift_s16(b[20]);
-  b[21] = add_round_shift_s16(b[21]);
-  b[22] = add_round_shift_s16(b[22]);
-  b[23] = add_round_shift_s16(b[23]);
-  b[24] = add_round_shift_s16(b[24]);
-  b[25] = add_round_shift_s16(b[25]);
-  b[26] = add_round_shift_s16(b[26]);
-  b[27] = add_round_shift_s16(b[27]);
-
-  b[28] = add_round_shift_s16(a[28]);
-  b[29] = add_round_shift_s16(a[29]);
-  b[30] = add_round_shift_s16(a[30]);
-  b[31] = add_round_shift_s16(a[31]);
-
-  // Stage 3.
-  a[0] = vaddq_s16(b[0], b[7]);
-  a[1] = vaddq_s16(b[1], b[6]);
-  a[2] = vaddq_s16(b[2], b[5]);
-  a[3] = vaddq_s16(b[3], b[4]);
-
-  a[4] = vsubq_s16(b[3], b[4]);
-  a[5] = vsubq_s16(b[2], b[5]);
-  a[6] = vsubq_s16(b[1], b[6]);
-  a[7] = vsubq_s16(b[0], b[7]);
-
-  a[8] = b[8];
-  a[9] = b[9];
-
-  butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]);
-  butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]);
-
-  a[14] = b[14];
-  a[15] = b[15];
-
-  a[16] = vaddq_s16(b[16], b[23]);
-  a[17] = vaddq_s16(b[17], b[22]);
-  a[18] = vaddq_s16(b[18], b[21]);
-  a[19] = vaddq_s16(b[19], b[20]);
-
-  a[20] = vsubq_s16(b[19], b[20]);
-  a[21] = vsubq_s16(b[18], b[21]);
-  a[22] = vsubq_s16(b[17], b[22]);
-  a[23] = vsubq_s16(b[16], b[23]);
-
-  a[24] = vsubq_s16(b[31], b[24]);
-  a[25] = vsubq_s16(b[30], b[25]);
-  a[26] = vsubq_s16(b[29], b[26]);
-  a[27] = vsubq_s16(b[28], b[27]);
-
-  a[28] = vaddq_s16(b[28], b[27]);
-  a[29] = vaddq_s16(b[29], b[26]);
-  a[30] = vaddq_s16(b[30], b[25]);
-  a[31] = vaddq_s16(b[31], b[24]);
-
-  // Stage 4.
-  b[0] = vaddq_s16(a[0], a[3]);
-  b[1] = vaddq_s16(a[1], a[2]);
-  b[2] = vsubq_s16(a[1], a[2]);
-  b[3] = vsubq_s16(a[0], a[3]);
-
-  b[4] = a[4];
-
-  butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]);
-
-  b[7] = a[7];
-
-  b[8] = vaddq_s16(a[8], a[11]);
-  b[9] = vaddq_s16(a[9], a[10]);
-  b[10] = vsubq_s16(a[9], a[10]);
-  b[11] = vsubq_s16(a[8], a[11]);
-  b[12] = vsubq_s16(a[15], a[12]);
-  b[13] = vsubq_s16(a[14], a[13]);
-  b[14] = vaddq_s16(a[14], a[13]);
-  b[15] = vaddq_s16(a[15], a[12]);
-
-  b[16] = a[16];
-  b[17] = a[17];
-
-  butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]);
-  butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]);
-  butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]);
-  butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]);
-
-  b[22] = a[22];
-  b[23] = a[23];
-  b[24] = a[24];
-  b[25] = a[25];
-
-  b[30] = a[30];
-  b[31] = a[31];
-
-  // Stage 5.
-  butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]);
-  butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]);
-
-  a[4] = vaddq_s16(b[4], b[5]);
-  a[5] = vsubq_s16(b[4], b[5]);
-  a[6] = vsubq_s16(b[7], b[6]);
-  a[7] = vaddq_s16(b[7], b[6]);
-
-  a[8] = b[8];
-
-  butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]);
-  butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]);
-
-  a[11] = b[11];
-  a[12] = b[12];
-
-  a[15] = b[15];
-
-  a[16] = vaddq_s16(b[19], b[16]);
-  a[17] = vaddq_s16(b[18], b[17]);
-  a[18] = vsubq_s16(b[17], b[18]);
-  a[19] = vsubq_s16(b[16], b[19]);
-  a[20] = vsubq_s16(b[23], b[20]);
-  a[21] = vsubq_s16(b[22], b[21]);
-  a[22] = vaddq_s16(b[21], b[22]);
-  a[23] = vaddq_s16(b[20], b[23]);
-  a[24] = vaddq_s16(b[27], b[24]);
-  a[25] = vaddq_s16(b[26], b[25]);
-  a[26] = vsubq_s16(b[25], b[26]);
-  a[27] = vsubq_s16(b[24], b[27]);
-  a[28] = vsubq_s16(b[31], b[28]);
-  a[29] = vsubq_s16(b[30], b[29]);
-  a[30] = vaddq_s16(b[29], b[30]);
-  a[31] = vaddq_s16(b[28], b[31]);
-
-  // Stage 6.
-  b[0] = a[0];
-  b[1] = a[1];
-  b[2] = a[2];
-  b[3] = a[3];
-
-  butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]);
-  butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]);
-
-  b[8] = vaddq_s16(a[8], a[9]);
-  b[9] = vsubq_s16(a[8], a[9]);
-  b[10] = vsubq_s16(a[11], a[10]);
-  b[11] = vaddq_s16(a[11], a[10]);
-  b[12] = vaddq_s16(a[12], a[13]);
-  b[13] = vsubq_s16(a[12], a[13]);
-  b[14] = vsubq_s16(a[15], a[14]);
-  b[15] = vaddq_s16(a[15], a[14]);
-
-  b[16] = a[16];
-  b[19] = a[19];
-  b[20] = a[20];
-  b[23] = a[23];
-  b[24] = a[24];
-  b[27] = a[27];
-  b[28] = a[28];
-  b[31] = a[31];
-
-  butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]);
-  butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]);
-
-  butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]);
-  butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]);
-
-  // Stage 7.
-  a[0] = b[0];
-  a[1] = b[1];
-  a[2] = b[2];
-  a[3] = b[3];
-  a[4] = b[4];
-  a[5] = b[5];
-  a[6] = b[6];
-  a[7] = b[7];
-
-  butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]);
-  butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]);
-  butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]);
-  butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]);
-
-  a[16] = vaddq_s16(b[16], b[17]);
-  a[17] = vsubq_s16(b[16], b[17]);
-  a[18] = vsubq_s16(b[19], b[18]);
-  a[19] = vaddq_s16(b[19], b[18]);
-  a[20] = vaddq_s16(b[20], b[21]);
-  a[21] = vsubq_s16(b[20], b[21]);
-  a[22] = vsubq_s16(b[23], b[22]);
-  a[23] = vaddq_s16(b[23], b[22]);
-  a[24] = vaddq_s16(b[24], b[25]);
-  a[25] = vsubq_s16(b[24], b[25]);
-  a[26] = vsubq_s16(b[27], b[26]);
-  a[27] = vaddq_s16(b[27], b[26]);
-  a[28] = vaddq_s16(b[28], b[29]);
-  a[29] = vsubq_s16(b[28], b[29]);
-  a[30] = vsubq_s16(b[31], b[30]);
-  a[31] = vaddq_s16(b[31], b[30]);
-
-  // Final stage.
-  out[0] = a[0];
-  out[16] = a[1];
-  out[8] = a[2];
-  out[24] = a[3];
-  out[4] = a[4];
-  out[20] = a[5];
-  out[12] = a[6];
-  out[28] = a[7];
-  out[2] = a[8];
-  out[18] = a[9];
-  out[10] = a[10];
-  out[26] = a[11];
-  out[6] = a[12];
-  out[22] = a[13];
-  out[14] = a[14];
-  out[30] = a[15];
-
-  butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]);
-  butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17],
-                      &out[15]);
-  butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]);
-  butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]);
-  butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]);
-  butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21],
-                      &out[11]);
-  butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13],
-                      &out[19]);
-  butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]);
-}
-
-#undef PASS_THROUGH
-#undef ADD_S16_S32
-#undef SUB_S16_S32
-#undef ADDW_S16_S32
-#undef SUBW_S16_S32
-#undef ADD_S32
-#undef SUB_S32
-#undef BUTTERFLY_ONE_S16_S32
-#undef BUTTERFLY_ONE_S32
-#undef BUTTERFLY_TWO_S32
-
 void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
   int16x8_t temp0[32];
   int16x8_t temp1[32];
@@ -1159,17 +43,21 @@
   int16x8_t temp5[32];
 
   // Process in 8x32 columns.
-  load(input, stride, temp0);
-  dct_body_first_pass(temp0, temp1);
+  load_cross(input, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp1);
 
-  load(input + 8, stride, temp0);
-  dct_body_first_pass(temp0, temp2);
+  load_cross(input + 8, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp2);
 
-  load(input + 16, stride, temp0);
-  dct_body_first_pass(temp0, temp3);
+  load_cross(input + 16, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp3);
 
-  load(input + 24, stride, temp0);
-  dct_body_first_pass(temp0, temp4);
+  load_cross(input + 24, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp4);
 
   // Generate the top row by munging the first set of 8 from each one together.
   transpose_s16_8x8_new(&temp1[0], &temp0[0]);
@@ -1254,17 +142,21 @@
   int16x8_t temp5[32];
 
   // Process in 8x32 columns.
-  load(input, stride, temp0);
-  dct_body_first_pass(temp0, temp1);
+  load_cross(input, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp1);
 
-  load(input + 8, stride, temp0);
-  dct_body_first_pass(temp0, temp2);
+  load_cross(input + 8, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp2);
 
-  load(input + 16, stride, temp0);
-  dct_body_first_pass(temp0, temp3);
+  load_cross(input + 16, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp3);
 
-  load(input + 24, stride, temp0);
-  dct_body_first_pass(temp0, temp4);
+  load_cross(input + 24, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp4);
 
   // Generate the top row by munging the first set of 8 from each one together.
   transpose_s16_8x8_new(&temp1[0], &temp0[0]);
diff --git a/vpx_dsp/arm/fdct32x32_neon.h b/vpx_dsp/arm/fdct32x32_neon.h
new file mode 100644
index 0000000..dd64791
--- /dev/null
+++ b/vpx_dsp/arm/fdct32x32_neon.h
@@ -0,0 +1,1105 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+
+// Load and cross the input rows: the first 8 rows against the last 8, then
+// the middle 16 (rows 8-15 against rows 16-23).
+static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) {
+  b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+  b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+  b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+  b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+  b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+  b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+  b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+  b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+
+  b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+  b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+  b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+  b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+  b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+  b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+  b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+  b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+
+  b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+  b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+  b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+  b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+  b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+  b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+  b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+  b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+
+  b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+  b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+  b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+  b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+  b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+  b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+  b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+  b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+}
+
+#define STORE_S16(src, index, dest)           \
+  do {                                        \
+    store_s16q_to_tran_low(dest, src[index]); \
+    dest += 8;                                \
+  } while (0)
+
+// Store 32 16x8 values, assuming stride == 32.
+// Slight twist: store horizontally in blocks of 8.
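+// Concretely, output row i is assembled from b[i], b[i + 8], b[i + 16] and
+// b[i + 24], i.e. the four 8-wide column blocks of that row.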
+static INLINE void store(tran_low_t *a, const int16x8_t *b) {
+  STORE_S16(b, 0, a);
+  STORE_S16(b, 8, a);
+  STORE_S16(b, 16, a);
+  STORE_S16(b, 24, a);
+  STORE_S16(b, 1, a);
+  STORE_S16(b, 9, a);
+  STORE_S16(b, 17, a);
+  STORE_S16(b, 25, a);
+  STORE_S16(b, 2, a);
+  STORE_S16(b, 10, a);
+  STORE_S16(b, 18, a);
+  STORE_S16(b, 26, a);
+  STORE_S16(b, 3, a);
+  STORE_S16(b, 11, a);
+  STORE_S16(b, 19, a);
+  STORE_S16(b, 27, a);
+  STORE_S16(b, 4, a);
+  STORE_S16(b, 12, a);
+  STORE_S16(b, 20, a);
+  STORE_S16(b, 28, a);
+  STORE_S16(b, 5, a);
+  STORE_S16(b, 13, a);
+  STORE_S16(b, 21, a);
+  STORE_S16(b, 29, a);
+  STORE_S16(b, 6, a);
+  STORE_S16(b, 14, a);
+  STORE_S16(b, 22, a);
+  STORE_S16(b, 30, a);
+  STORE_S16(b, 7, a);
+  STORE_S16(b, 15, a);
+  STORE_S16(b, 23, a);
+  STORE_S16(b, 31, a);
+}
+
+#undef STORE_S16
+
+static INLINE void scale_input(const int16x8_t *in /*32*/,
+                               int16x8_t *out /*32*/) {
+  out[0] = vshlq_n_s16(in[0], 2);
+  out[1] = vshlq_n_s16(in[1], 2);
+  out[2] = vshlq_n_s16(in[2], 2);
+  out[3] = vshlq_n_s16(in[3], 2);
+  out[4] = vshlq_n_s16(in[4], 2);
+  out[5] = vshlq_n_s16(in[5], 2);
+  out[6] = vshlq_n_s16(in[6], 2);
+  out[7] = vshlq_n_s16(in[7], 2);
+
+  out[8] = vshlq_n_s16(in[8], 2);
+  out[9] = vshlq_n_s16(in[9], 2);
+  out[10] = vshlq_n_s16(in[10], 2);
+  out[11] = vshlq_n_s16(in[11], 2);
+  out[12] = vshlq_n_s16(in[12], 2);
+  out[13] = vshlq_n_s16(in[13], 2);
+  out[14] = vshlq_n_s16(in[14], 2);
+  out[15] = vshlq_n_s16(in[15], 2);
+
+  out[16] = vshlq_n_s16(in[16], 2);
+  out[17] = vshlq_n_s16(in[17], 2);
+  out[18] = vshlq_n_s16(in[18], 2);
+  out[19] = vshlq_n_s16(in[19], 2);
+  out[20] = vshlq_n_s16(in[20], 2);
+  out[21] = vshlq_n_s16(in[21], 2);
+  out[22] = vshlq_n_s16(in[22], 2);
+  out[23] = vshlq_n_s16(in[23], 2);
+
+  out[24] = vshlq_n_s16(in[24], 2);
+  out[25] = vshlq_n_s16(in[25], 2);
+  out[26] = vshlq_n_s16(in[26], 2);
+  out[27] = vshlq_n_s16(in[27], 2);
+  out[28] = vshlq_n_s16(in[28], 2);
+  out[29] = vshlq_n_s16(in[29], 2);
+  out[30] = vshlq_n_s16(in[30], 2);
+  out[31] = vshlq_n_s16(in[31], 2);
+}
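+
+// In the 32x32 first pass (see vpx_fdct32x32_neon), load_cross performs
+// stage 1 while loading and scale_input then applies the << 2 pre-scale
+// before dct_body_first_pass is run on each 8-column slice.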
+
+static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
+  int16x8_t a[32];
+  int16x8_t b[32];
+
+  // Stage 1: Done as part of the load.
+
+  // Stage 2.
+  // Mini cross. Cross the first 16 values; butterfly the middle 8 of the
+  // second half by cospi_16_64.
+  a[0] = vaddq_s16(in[0], in[15]);
+  a[1] = vaddq_s16(in[1], in[14]);
+  a[2] = vaddq_s16(in[2], in[13]);
+  a[3] = vaddq_s16(in[3], in[12]);
+  a[4] = vaddq_s16(in[4], in[11]);
+  a[5] = vaddq_s16(in[5], in[10]);
+  a[6] = vaddq_s16(in[6], in[9]);
+  a[7] = vaddq_s16(in[7], in[8]);
+
+  a[8] = vsubq_s16(in[7], in[8]);
+  a[9] = vsubq_s16(in[6], in[9]);
+  a[10] = vsubq_s16(in[5], in[10]);
+  a[11] = vsubq_s16(in[4], in[11]);
+  a[12] = vsubq_s16(in[3], in[12]);
+  a[13] = vsubq_s16(in[2], in[13]);
+  a[14] = vsubq_s16(in[1], in[14]);
+  a[15] = vsubq_s16(in[0], in[15]);
+
+  a[16] = in[16];
+  a[17] = in[17];
+  a[18] = in[18];
+  a[19] = in[19];
+
+  butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27],
+                                     &a[20]);
+  butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26],
+                                     &a[21]);
+  butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25],
+                                     &a[22]);
+  butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24],
+                                     &a[23]);
+
+  a[28] = in[28];
+  a[29] = in[29];
+  a[30] = in[30];
+  a[31] = in[31];
+
+  // Stage 3.
+  b[0] = vaddq_s16(a[0], a[7]);
+  b[1] = vaddq_s16(a[1], a[6]);
+  b[2] = vaddq_s16(a[2], a[5]);
+  b[3] = vaddq_s16(a[3], a[4]);
+
+  b[4] = vsubq_s16(a[3], a[4]);
+  b[5] = vsubq_s16(a[2], a[5]);
+  b[6] = vsubq_s16(a[1], a[6]);
+  b[7] = vsubq_s16(a[0], a[7]);
+
+  b[8] = a[8];
+  b[9] = a[9];
+
+  butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]);
+  butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]);
+
+  b[14] = a[14];
+  b[15] = a[15];
+
+  b[16] = vaddq_s16(in[16], a[23]);
+  b[17] = vaddq_s16(in[17], a[22]);
+  b[18] = vaddq_s16(in[18], a[21]);
+  b[19] = vaddq_s16(in[19], a[20]);
+
+  b[20] = vsubq_s16(in[19], a[20]);
+  b[21] = vsubq_s16(in[18], a[21]);
+  b[22] = vsubq_s16(in[17], a[22]);
+  b[23] = vsubq_s16(in[16], a[23]);
+
+  b[24] = vsubq_s16(in[31], a[24]);
+  b[25] = vsubq_s16(in[30], a[25]);
+  b[26] = vsubq_s16(in[29], a[26]);
+  b[27] = vsubq_s16(in[28], a[27]);
+
+  b[28] = vaddq_s16(in[28], a[27]);
+  b[29] = vaddq_s16(in[29], a[26]);
+  b[30] = vaddq_s16(in[30], a[25]);
+  b[31] = vaddq_s16(in[31], a[24]);
+
+  // Stage 4.
+  a[0] = vaddq_s16(b[0], b[3]);
+  a[1] = vaddq_s16(b[1], b[2]);
+  a[2] = vsubq_s16(b[1], b[2]);
+  a[3] = vsubq_s16(b[0], b[3]);
+
+  a[4] = b[4];
+
+  butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]);
+
+  a[7] = b[7];
+
+  a[8] = vaddq_s16(b[8], b[11]);
+  a[9] = vaddq_s16(b[9], b[10]);
+  a[10] = vsubq_s16(b[9], b[10]);
+  a[11] = vsubq_s16(b[8], b[11]);
+  a[12] = vsubq_s16(b[15], b[12]);
+  a[13] = vsubq_s16(b[14], b[13]);
+  a[14] = vaddq_s16(b[14], b[13]);
+  a[15] = vaddq_s16(b[15], b[12]);
+
+  a[16] = b[16];
+  a[17] = b[17];
+
+  butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]);
+  butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]);
+  butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]);
+  butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]);
+
+  a[22] = b[22];
+  a[23] = b[23];
+  a[24] = b[24];
+  a[25] = b[25];
+
+  a[30] = b[30];
+  a[31] = b[31];
+
+  // Stage 5.
+  butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]);
+  butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]);
+
+  b[4] = vaddq_s16(a[4], a[5]);
+  b[5] = vsubq_s16(a[4], a[5]);
+  b[6] = vsubq_s16(a[7], a[6]);
+  b[7] = vaddq_s16(a[7], a[6]);
+
+  b[8] = a[8];
+
+  butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]);
+  butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]);
+
+  b[11] = a[11];
+  b[12] = a[12];
+
+  b[15] = a[15];
+
+  b[16] = vaddq_s16(a[19], a[16]);
+  b[17] = vaddq_s16(a[18], a[17]);
+  b[18] = vsubq_s16(a[17], a[18]);
+  b[19] = vsubq_s16(a[16], a[19]);
+  b[20] = vsubq_s16(a[23], a[20]);
+  b[21] = vsubq_s16(a[22], a[21]);
+  b[22] = vaddq_s16(a[21], a[22]);
+  b[23] = vaddq_s16(a[20], a[23]);
+  b[24] = vaddq_s16(a[27], a[24]);
+  b[25] = vaddq_s16(a[26], a[25]);
+  b[26] = vsubq_s16(a[25], a[26]);
+  b[27] = vsubq_s16(a[24], a[27]);
+  b[28] = vsubq_s16(a[31], a[28]);
+  b[29] = vsubq_s16(a[30], a[29]);
+  b[30] = vaddq_s16(a[29], a[30]);
+  b[31] = vaddq_s16(a[28], a[31]);
+
+  // Stage 6.
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+
+  butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]);
+  butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]);
+
+  a[8] = vaddq_s16(b[8], b[9]);
+  a[9] = vsubq_s16(b[8], b[9]);
+  a[10] = vsubq_s16(b[11], b[10]);
+  a[11] = vaddq_s16(b[11], b[10]);
+  a[12] = vaddq_s16(b[12], b[13]);
+  a[13] = vsubq_s16(b[12], b[13]);
+  a[14] = vsubq_s16(b[15], b[14]);
+  a[15] = vaddq_s16(b[15], b[14]);
+
+  a[16] = b[16];
+  a[19] = b[19];
+  a[20] = b[20];
+  a[23] = b[23];
+  a[24] = b[24];
+  a[27] = b[27];
+  a[28] = b[28];
+  a[31] = b[31];
+
+  butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]);
+  butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]);
+
+  butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]);
+  butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]);
+
+  // Stage 7.
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+  b[4] = a[4];
+  b[5] = a[5];
+  b[6] = a[6];
+  b[7] = a[7];
+
+  butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]);
+  butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]);
+  butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]);
+  butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]);
+
+  b[16] = vaddq_s16(a[16], a[17]);
+  b[17] = vsubq_s16(a[16], a[17]);
+  b[18] = vsubq_s16(a[19], a[18]);
+  b[19] = vaddq_s16(a[19], a[18]);
+  b[20] = vaddq_s16(a[20], a[21]);
+  b[21] = vsubq_s16(a[20], a[21]);
+  b[22] = vsubq_s16(a[23], a[22]);
+  b[23] = vaddq_s16(a[23], a[22]);
+  b[24] = vaddq_s16(a[24], a[25]);
+  b[25] = vsubq_s16(a[24], a[25]);
+  b[26] = vsubq_s16(a[27], a[26]);
+  b[27] = vaddq_s16(a[27], a[26]);
+  b[28] = vaddq_s16(a[28], a[29]);
+  b[29] = vsubq_s16(a[28], a[29]);
+  b[30] = vsubq_s16(a[31], a[30]);
+  b[31] = vaddq_s16(a[31], a[30]);
+
+  // Final stage.
+  // Also compute partial rounding shift:
+  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
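+  // One plausible NEON form of that formula (a sketch only; the actual
+  // sub_round_shift_s16 helper may differ): with mask = vcgtq_s16(x, 0)
+  // giving -1 for positive lanes,
+  //   (x + 1 - mask) >> 2 == (x + 1 + (x > 0)) >> 2.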
+  out[0] = sub_round_shift_s16(b[0]);
+  out[16] = sub_round_shift_s16(b[1]);
+  out[8] = sub_round_shift_s16(b[2]);
+  out[24] = sub_round_shift_s16(b[3]);
+  out[4] = sub_round_shift_s16(b[4]);
+  out[20] = sub_round_shift_s16(b[5]);
+  out[12] = sub_round_shift_s16(b[6]);
+  out[28] = sub_round_shift_s16(b[7]);
+  out[2] = sub_round_shift_s16(b[8]);
+  out[18] = sub_round_shift_s16(b[9]);
+  out[10] = sub_round_shift_s16(b[10]);
+  out[26] = sub_round_shift_s16(b[11]);
+  out[6] = sub_round_shift_s16(b[12]);
+  out[22] = sub_round_shift_s16(b[13]);
+  out[14] = sub_round_shift_s16(b[14]);
+  out[30] = sub_round_shift_s16(b[15]);
+
+  butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]);
+  out[1] = sub_round_shift_s16(a[1]);
+  out[31] = sub_round_shift_s16(a[31]);
+
+  butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]);
+  out[17] = sub_round_shift_s16(a[17]);
+  out[15] = sub_round_shift_s16(a[15]);
+
+  butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]);
+  out[9] = sub_round_shift_s16(a[9]);
+  out[23] = sub_round_shift_s16(a[23]);
+
+  butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]);
+  out[25] = sub_round_shift_s16(a[25]);
+  out[7] = sub_round_shift_s16(a[7]);
+
+  butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]);
+  out[5] = sub_round_shift_s16(a[5]);
+  out[27] = sub_round_shift_s16(a[27]);
+
+  butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]);
+  out[21] = sub_round_shift_s16(a[21]);
+  out[11] = sub_round_shift_s16(a[11]);
+
+  butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]);
+  out[13] = sub_round_shift_s16(a[13]);
+  out[19] = sub_round_shift_s16(a[19]);
+
+  butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]);
+  out[29] = sub_round_shift_s16(a[29]);
+  out[3] = sub_round_shift_s16(a[3]);
+}
+
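+// The helper macros below operate on rows widened to 32 bits and split in
+// half: for an argument name n they address n_lo[] (lanes 0-3) and n_hi[]
+// (lanes 4-7), both arrays of int32x4_t.
+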
+#define PASS_THROUGH(src, dst, element)    \
+  do {                                     \
+    dst##_lo[element] = src##_lo[element]; \
+    dst##_hi[element] = src##_hi[element]; \
+  } while (0)
+
+#define ADD_S16_S32(a, left_index, right_index, b, b_index)                   \
+  do {                                                                        \
+    b##_lo[b_index] =                                                         \
+        vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+    b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]),                 \
+                                vget_high_s16(a[right_index]));               \
+  } while (0)
+
+#define SUB_S16_S32(a, left_index, right_index, b, b_index)                   \
+  do {                                                                        \
+    b##_lo[b_index] =                                                         \
+        vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+    b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]),                 \
+                                vget_high_s16(a[right_index]));               \
+  } while (0)
+
+#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index)                     \
+  do {                                                                       \
+    c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index]));  \
+    c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
+  } while (0)
+
+#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
+  do {                                                                     \
+    temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index]));           \
+    temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index]));          \
+    c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]);   \
+    c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]);   \
+  } while (0)
+
+#define ADD_S32(a, left_index, right_index, b, b_index)                   \
+  do {                                                                    \
+    b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
+    b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
+  } while (0)
+
+#define SUB_S32(a, left_index, right_index, b, b_index)                   \
+  do {                                                                    \
+    b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
+    b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
+  } while (0)
+
+#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b,   \
+                              add_index, sub_index)                      \
+  do {                                                                   \
+    butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
+                                &b##_lo[add_index], &b##_hi[add_index],  \
+                                &b##_lo[sub_index], &b##_hi[sub_index]); \
+  } while (0)
+
+#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index,  \
+                          sub_index)                                           \
+  do {                                                                         \
+    butterfly_one_coeff_s32_fast(                                              \
+        a##_lo[left_index], a##_hi[left_index], a##_lo[right_index],           \
+        a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \
+        &b##_lo[sub_index], &b##_hi[sub_index]);                               \
+  } while (0)
+
+#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant,           \
+                          right_constant, b, add_index, sub_index)             \
+  do {                                                                         \
+    butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index],            \
+                            a##_lo[right_index], a##_hi[right_index],          \
+                            left_constant, right_constant, &b##_lo[add_index], \
+                            &b##_hi[add_index], &b##_lo[sub_index],            \
+                            &b##_hi[sub_index]);                               \
+  } while (0)
+
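+// For reference, ADD_S16_S32(b, 0, 7, c, 0) expands to:
+//   c_lo[0] = vaddl_s16(vget_low_s16(b[0]), vget_low_s16(b[7]));
+//   c_hi[0] = vaddl_s16(vget_high_s16(b[0]), vget_high_s16(b[7]));
+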
+static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
+  int16x8_t a[32];
+  int16x8_t b[32];
+  int32x4_t c_lo[32];
+  int32x4_t c_hi[32];
+  int32x4_t d_lo[32];
+  int32x4_t d_hi[32];
+
+  // Stage 1. (In the first pass this was folded into the load; here it is
+  // done explicitly.)
+  a[0] = vaddq_s16(in[0], in[31]);
+  a[1] = vaddq_s16(in[1], in[30]);
+  a[2] = vaddq_s16(in[2], in[29]);
+  a[3] = vaddq_s16(in[3], in[28]);
+  a[4] = vaddq_s16(in[4], in[27]);
+  a[5] = vaddq_s16(in[5], in[26]);
+  a[6] = vaddq_s16(in[6], in[25]);
+  a[7] = vaddq_s16(in[7], in[24]);
+  a[8] = vaddq_s16(in[8], in[23]);
+  a[9] = vaddq_s16(in[9], in[22]);
+  a[10] = vaddq_s16(in[10], in[21]);
+  a[11] = vaddq_s16(in[11], in[20]);
+  a[12] = vaddq_s16(in[12], in[19]);
+  a[13] = vaddq_s16(in[13], in[18]);
+  a[14] = vaddq_s16(in[14], in[17]);
+  a[15] = vaddq_s16(in[15], in[16]);
+  a[16] = vsubq_s16(in[15], in[16]);
+  a[17] = vsubq_s16(in[14], in[17]);
+  a[18] = vsubq_s16(in[13], in[18]);
+  a[19] = vsubq_s16(in[12], in[19]);
+  a[20] = vsubq_s16(in[11], in[20]);
+  a[21] = vsubq_s16(in[10], in[21]);
+  a[22] = vsubq_s16(in[9], in[22]);
+  a[23] = vsubq_s16(in[8], in[23]);
+  a[24] = vsubq_s16(in[7], in[24]);
+  a[25] = vsubq_s16(in[6], in[25]);
+  a[26] = vsubq_s16(in[5], in[26]);
+  a[27] = vsubq_s16(in[4], in[27]);
+  a[28] = vsubq_s16(in[3], in[28]);
+  a[29] = vsubq_s16(in[2], in[29]);
+  a[30] = vsubq_s16(in[1], in[30]);
+  a[31] = vsubq_s16(in[0], in[31]);
+
+  // Stage 2.
+  b[0] = vaddq_s16(a[0], a[15]);
+  b[1] = vaddq_s16(a[1], a[14]);
+  b[2] = vaddq_s16(a[2], a[13]);
+  b[3] = vaddq_s16(a[3], a[12]);
+  b[4] = vaddq_s16(a[4], a[11]);
+  b[5] = vaddq_s16(a[5], a[10]);
+  b[6] = vaddq_s16(a[6], a[9]);
+  b[7] = vaddq_s16(a[7], a[8]);
+
+  b[8] = vsubq_s16(a[7], a[8]);
+  b[9] = vsubq_s16(a[6], a[9]);
+  b[10] = vsubq_s16(a[5], a[10]);
+  b[11] = vsubq_s16(a[4], a[11]);
+  b[12] = vsubq_s16(a[3], a[12]);
+  b[13] = vsubq_s16(a[2], a[13]);
+  b[14] = vsubq_s16(a[1], a[14]);
+  b[15] = vsubq_s16(a[0], a[15]);
+
+  b[16] = a[16];
+  b[17] = a[17];
+  b[18] = a[18];
+  b[19] = a[19];
+
+  butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+  butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+  butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+  butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+
+  b[28] = a[28];
+  b[29] = a[29];
+  b[30] = a[30];
+  b[31] = a[31];
+
+  // Stage 3. With extreme input values this calculation overflows int16_t:
+  // the sources of b[0] are added several times and, through testing, have
+  // been shown to overflow starting here, so widen to 32 bits from here on.
+  ADD_S16_S32(b, 0, 7, c, 0);
+  ADD_S16_S32(b, 1, 6, c, 1);
+  ADD_S16_S32(b, 2, 5, c, 2);
+  ADD_S16_S32(b, 3, 4, c, 3);
+  SUB_S16_S32(b, 3, 4, c, 4);
+  SUB_S16_S32(b, 2, 5, c, 5);
+  SUB_S16_S32(b, 1, 6, c, 6);
+  SUB_S16_S32(b, 0, 7, c, 7);
+
+  a[8] = b[8];
+  a[9] = b[9];
+
+  BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
+  BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
+
+  a[14] = b[14];
+  a[15] = b[15];
+
+  ADD_S16_S32(b, 16, 23, c, 16);
+  ADD_S16_S32(b, 17, 22, c, 17);
+  ADD_S16_S32(b, 18, 21, c, 18);
+  ADD_S16_S32(b, 19, 20, c, 19);
+  SUB_S16_S32(b, 19, 20, c, 20);
+  SUB_S16_S32(b, 18, 21, c, 21);
+  SUB_S16_S32(b, 17, 22, c, 22);
+  SUB_S16_S32(b, 16, 23, c, 23);
+  SUB_S16_S32(b, 31, 24, c, 24);
+  SUB_S16_S32(b, 30, 25, c, 25);
+  SUB_S16_S32(b, 29, 26, c, 26);
+  SUB_S16_S32(b, 28, 27, c, 27);
+  ADD_S16_S32(b, 28, 27, c, 28);
+  ADD_S16_S32(b, 29, 26, c, 29);
+  ADD_S16_S32(b, 30, 25, c, 30);
+  ADD_S16_S32(b, 31, 24, c, 31);
+
+  // Stage 4.
+  ADD_S32(c, 0, 3, d, 0);
+  ADD_S32(c, 1, 2, d, 1);
+  SUB_S32(c, 1, 2, d, 2);
+  SUB_S32(c, 0, 3, d, 3);
+
+  PASS_THROUGH(c, d, 4);
+
+  BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
+
+  PASS_THROUGH(c, d, 7);
+
+  ADDW_S16_S32(c, 11, a, 8, d, 8);
+  ADDW_S16_S32(c, 10, a, 9, d, 9);
+  SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
+  SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
+  SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
+  SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
+  ADDW_S16_S32(c, 13, b, 14, d, 14);
+  ADDW_S16_S32(c, 12, b, 15, d, 15);
+
+  PASS_THROUGH(c, d, 16);
+  PASS_THROUGH(c, d, 17);
+
+  BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18);
+  BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19);
+  BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20);
+  BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21);
+
+  PASS_THROUGH(c, d, 22);
+  PASS_THROUGH(c, d, 23);
+  PASS_THROUGH(c, d, 24);
+  PASS_THROUGH(c, d, 25);
+
+  PASS_THROUGH(c, d, 30);
+  PASS_THROUGH(c, d, 31);
+
+  // Stage 5.
+  BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
+  BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3);
+
+  ADD_S32(d, 4, 5, c, 4);
+  SUB_S32(d, 4, 5, c, 5);
+  SUB_S32(d, 7, 6, c, 6);
+  ADD_S32(d, 7, 6, c, 7);
+
+  PASS_THROUGH(d, c, 8);
+
+  BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9);
+  BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10);
+
+  PASS_THROUGH(d, c, 11);
+  PASS_THROUGH(d, c, 12);
+  PASS_THROUGH(d, c, 15);
+
+  ADD_S32(d, 16, 19, c, 16);
+  ADD_S32(d, 17, 18, c, 17);
+  SUB_S32(d, 17, 18, c, 18);
+  SUB_S32(d, 16, 19, c, 19);
+  SUB_S32(d, 23, 20, c, 20);
+  SUB_S32(d, 22, 21, c, 21);
+  ADD_S32(d, 22, 21, c, 22);
+  ADD_S32(d, 23, 20, c, 23);
+  ADD_S32(d, 24, 27, c, 24);
+  ADD_S32(d, 25, 26, c, 25);
+  SUB_S32(d, 25, 26, c, 26);
+  SUB_S32(d, 24, 27, c, 27);
+  SUB_S32(d, 31, 28, c, 28);
+  SUB_S32(d, 30, 29, c, 29);
+  ADD_S32(d, 30, 29, c, 30);
+  ADD_S32(d, 31, 28, c, 31);
+
+  // Stage 6.
+  PASS_THROUGH(c, d, 0);
+  PASS_THROUGH(c, d, 1);
+  PASS_THROUGH(c, d, 2);
+  PASS_THROUGH(c, d, 3);
+
+  BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7);
+  BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6);
+
+  ADD_S32(c, 8, 9, d, 8);
+  SUB_S32(c, 8, 9, d, 9);
+  SUB_S32(c, 11, 10, d, 10);
+  ADD_S32(c, 11, 10, d, 11);
+  ADD_S32(c, 12, 13, d, 12);
+  SUB_S32(c, 12, 13, d, 13);
+  SUB_S32(c, 15, 14, d, 14);
+  ADD_S32(c, 15, 14, d, 15);
+
+  PASS_THROUGH(c, d, 16);
+  PASS_THROUGH(c, d, 19);
+  PASS_THROUGH(c, d, 20);
+  PASS_THROUGH(c, d, 23);
+  PASS_THROUGH(c, d, 24);
+  PASS_THROUGH(c, d, 27);
+  PASS_THROUGH(c, d, 28);
+  PASS_THROUGH(c, d, 31);
+
+  BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17);
+  BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18);
+  BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21);
+  BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22);
+
+  // Stage 7.
+  PASS_THROUGH(d, c, 0);
+  PASS_THROUGH(d, c, 1);
+  PASS_THROUGH(d, c, 2);
+  PASS_THROUGH(d, c, 3);
+  PASS_THROUGH(d, c, 4);
+  PASS_THROUGH(d, c, 5);
+  PASS_THROUGH(d, c, 6);
+  PASS_THROUGH(d, c, 7);
+
+  BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15);
+  BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14);
+  BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13);
+  BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12);
+
+  ADD_S32(d, 16, 17, c, 16);
+  SUB_S32(d, 16, 17, c, 17);
+  SUB_S32(d, 19, 18, c, 18);
+  ADD_S32(d, 19, 18, c, 19);
+  ADD_S32(d, 20, 21, c, 20);
+  SUB_S32(d, 20, 21, c, 21);
+  SUB_S32(d, 23, 22, c, 22);
+  ADD_S32(d, 23, 22, c, 23);
+  ADD_S32(d, 24, 25, c, 24);
+  SUB_S32(d, 24, 25, c, 25);
+  SUB_S32(d, 27, 26, c, 26);
+  ADD_S32(d, 27, 26, c, 27);
+  ADD_S32(d, 28, 29, c, 28);
+  SUB_S32(d, 28, 29, c, 29);
+  SUB_S32(d, 31, 30, c, 30);
+  ADD_S32(d, 31, 30, c, 31);
+
+  // Final stage.
+  // Roll rounding into this function so we can pass back int16x8.
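+  // (add_round_shift_s32_narrow is expected to round-shift each lo/hi pair
+  // and narrow it back to a single int16x8_t.)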
+
+  out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]);
+  out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]);
+
+  out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]);
+  out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]);
+  out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]);
+  out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]);
+  out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]);
+
+  out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]);
+  out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]);
+  out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]);
+  out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]);
+
+  out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]);
+  out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]);
+  out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]);
+  out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]);
+  out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]);
+
+  BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31);
+  out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]);
+  out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]);
+
+  BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15);
+  out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]);
+  out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]);
+
+  BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23);
+  out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]);
+  out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]);
+
+  BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7);
+  out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]);
+  out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]);
+
+  BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27);
+  out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]);
+  out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]);
+
+  BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11);
+  out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]);
+  out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]);
+
+  BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19);
+  out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]);
+  out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]);
+
+  BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3);
+  out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]);
+  out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]);
+}
+
+static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) {
+  int16x8_t a[32];
+  int16x8_t b[32];
+
+  // Stage 1. (In the first pass this was folded into the load; here it is
+  // done explicitly.)
+  a[0] = vaddq_s16(in[0], in[31]);
+  a[1] = vaddq_s16(in[1], in[30]);
+  a[2] = vaddq_s16(in[2], in[29]);
+  a[3] = vaddq_s16(in[3], in[28]);
+  a[4] = vaddq_s16(in[4], in[27]);
+  a[5] = vaddq_s16(in[5], in[26]);
+  a[6] = vaddq_s16(in[6], in[25]);
+  a[7] = vaddq_s16(in[7], in[24]);
+  a[8] = vaddq_s16(in[8], in[23]);
+  a[9] = vaddq_s16(in[9], in[22]);
+  a[10] = vaddq_s16(in[10], in[21]);
+  a[11] = vaddq_s16(in[11], in[20]);
+  a[12] = vaddq_s16(in[12], in[19]);
+  a[13] = vaddq_s16(in[13], in[18]);
+  a[14] = vaddq_s16(in[14], in[17]);
+  a[15] = vaddq_s16(in[15], in[16]);
+  a[16] = vsubq_s16(in[15], in[16]);
+  a[17] = vsubq_s16(in[14], in[17]);
+  a[18] = vsubq_s16(in[13], in[18]);
+  a[19] = vsubq_s16(in[12], in[19]);
+  a[20] = vsubq_s16(in[11], in[20]);
+  a[21] = vsubq_s16(in[10], in[21]);
+  a[22] = vsubq_s16(in[9], in[22]);
+  a[23] = vsubq_s16(in[8], in[23]);
+  a[24] = vsubq_s16(in[7], in[24]);
+  a[25] = vsubq_s16(in[6], in[25]);
+  a[26] = vsubq_s16(in[5], in[26]);
+  a[27] = vsubq_s16(in[4], in[27]);
+  a[28] = vsubq_s16(in[3], in[28]);
+  a[29] = vsubq_s16(in[2], in[29]);
+  a[30] = vsubq_s16(in[1], in[30]);
+  a[31] = vsubq_s16(in[0], in[31]);
+
+  // Stage 2.
+  // For the "rd" version, all values are rounded down after stage 2 to keep
+  // them within 16 bits.
+  b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
+  b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
+  b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
+  b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
+  b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
+  b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
+  b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
+  b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
+
+  b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
+  b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
+  b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
+  b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
+  b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
+  b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
+  b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
+  b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
+
+  b[16] = add_round_shift_s16(a[16]);
+  b[17] = add_round_shift_s16(a[17]);
+  b[18] = add_round_shift_s16(a[18]);
+  b[19] = add_round_shift_s16(a[19]);
+
+  butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+  butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+  butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+  butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+  b[20] = add_round_shift_s16(b[20]);
+  b[21] = add_round_shift_s16(b[21]);
+  b[22] = add_round_shift_s16(b[22]);
+  b[23] = add_round_shift_s16(b[23]);
+  b[24] = add_round_shift_s16(b[24]);
+  b[25] = add_round_shift_s16(b[25]);
+  b[26] = add_round_shift_s16(b[26]);
+  b[27] = add_round_shift_s16(b[27]);
+
+  b[28] = add_round_shift_s16(a[28]);
+  b[29] = add_round_shift_s16(a[29]);
+  b[30] = add_round_shift_s16(a[30]);
+  b[31] = add_round_shift_s16(a[31]);
+
+  // Stage 3.
+  a[0] = vaddq_s16(b[0], b[7]);
+  a[1] = vaddq_s16(b[1], b[6]);
+  a[2] = vaddq_s16(b[2], b[5]);
+  a[3] = vaddq_s16(b[3], b[4]);
+
+  a[4] = vsubq_s16(b[3], b[4]);
+  a[5] = vsubq_s16(b[2], b[5]);
+  a[6] = vsubq_s16(b[1], b[6]);
+  a[7] = vsubq_s16(b[0], b[7]);
+
+  a[8] = b[8];
+  a[9] = b[9];
+
+  butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]);
+  butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]);
+
+  a[14] = b[14];
+  a[15] = b[15];
+
+  a[16] = vaddq_s16(b[16], b[23]);
+  a[17] = vaddq_s16(b[17], b[22]);
+  a[18] = vaddq_s16(b[18], b[21]);
+  a[19] = vaddq_s16(b[19], b[20]);
+
+  a[20] = vsubq_s16(b[19], b[20]);
+  a[21] = vsubq_s16(b[18], b[21]);
+  a[22] = vsubq_s16(b[17], b[22]);
+  a[23] = vsubq_s16(b[16], b[23]);
+
+  a[24] = vsubq_s16(b[31], b[24]);
+  a[25] = vsubq_s16(b[30], b[25]);
+  a[26] = vsubq_s16(b[29], b[26]);
+  a[27] = vsubq_s16(b[28], b[27]);
+
+  a[28] = vaddq_s16(b[28], b[27]);
+  a[29] = vaddq_s16(b[29], b[26]);
+  a[30] = vaddq_s16(b[30], b[25]);
+  a[31] = vaddq_s16(b[31], b[24]);
+
+  // Stage 4.
+  b[0] = vaddq_s16(a[0], a[3]);
+  b[1] = vaddq_s16(a[1], a[2]);
+  b[2] = vsubq_s16(a[1], a[2]);
+  b[3] = vsubq_s16(a[0], a[3]);
+
+  b[4] = a[4];
+
+  butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]);
+
+  b[7] = a[7];
+
+  b[8] = vaddq_s16(a[8], a[11]);
+  b[9] = vaddq_s16(a[9], a[10]);
+  b[10] = vsubq_s16(a[9], a[10]);
+  b[11] = vsubq_s16(a[8], a[11]);
+  b[12] = vsubq_s16(a[15], a[12]);
+  b[13] = vsubq_s16(a[14], a[13]);
+  b[14] = vaddq_s16(a[14], a[13]);
+  b[15] = vaddq_s16(a[15], a[12]);
+
+  b[16] = a[16];
+  b[17] = a[17];
+
+  butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]);
+  butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]);
+  butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]);
+  butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]);
+
+  b[22] = a[22];
+  b[23] = a[23];
+  b[24] = a[24];
+  b[25] = a[25];
+
+  b[30] = a[30];
+  b[31] = a[31];
+
+  // Stage 5.
+  butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]);
+  butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]);
+
+  a[4] = vaddq_s16(b[4], b[5]);
+  a[5] = vsubq_s16(b[4], b[5]);
+  a[6] = vsubq_s16(b[7], b[6]);
+  a[7] = vaddq_s16(b[7], b[6]);
+
+  a[8] = b[8];
+
+  butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]);
+  butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]);
+
+  a[11] = b[11];
+  a[12] = b[12];
+
+  a[15] = b[15];
+
+  a[16] = vaddq_s16(b[19], b[16]);
+  a[17] = vaddq_s16(b[18], b[17]);
+  a[18] = vsubq_s16(b[17], b[18]);
+  a[19] = vsubq_s16(b[16], b[19]);
+  a[20] = vsubq_s16(b[23], b[20]);
+  a[21] = vsubq_s16(b[22], b[21]);
+  a[22] = vaddq_s16(b[21], b[22]);
+  a[23] = vaddq_s16(b[20], b[23]);
+  a[24] = vaddq_s16(b[27], b[24]);
+  a[25] = vaddq_s16(b[26], b[25]);
+  a[26] = vsubq_s16(b[25], b[26]);
+  a[27] = vsubq_s16(b[24], b[27]);
+  a[28] = vsubq_s16(b[31], b[28]);
+  a[29] = vsubq_s16(b[30], b[29]);
+  a[30] = vaddq_s16(b[29], b[30]);
+  a[31] = vaddq_s16(b[28], b[31]);
+
+  // Stage 6.
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+
+  butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]);
+  butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]);
+
+  b[8] = vaddq_s16(a[8], a[9]);
+  b[9] = vsubq_s16(a[8], a[9]);
+  b[10] = vsubq_s16(a[11], a[10]);
+  b[11] = vaddq_s16(a[11], a[10]);
+  b[12] = vaddq_s16(a[12], a[13]);
+  b[13] = vsubq_s16(a[12], a[13]);
+  b[14] = vsubq_s16(a[15], a[14]);
+  b[15] = vaddq_s16(a[15], a[14]);
+
+  b[16] = a[16];
+  b[19] = a[19];
+  b[20] = a[20];
+  b[23] = a[23];
+  b[24] = a[24];
+  b[27] = a[27];
+  b[28] = a[28];
+  b[31] = a[31];
+
+  butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]);
+  butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]);
+
+  butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]);
+  butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]);
+
+  // Stage 7.
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+
+  butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]);
+  butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]);
+  butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]);
+  butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]);
+
+  a[16] = vaddq_s16(b[16], b[17]);
+  a[17] = vsubq_s16(b[16], b[17]);
+  a[18] = vsubq_s16(b[19], b[18]);
+  a[19] = vaddq_s16(b[19], b[18]);
+  a[20] = vaddq_s16(b[20], b[21]);
+  a[21] = vsubq_s16(b[20], b[21]);
+  a[22] = vsubq_s16(b[23], b[22]);
+  a[23] = vaddq_s16(b[23], b[22]);
+  a[24] = vaddq_s16(b[24], b[25]);
+  a[25] = vsubq_s16(b[24], b[25]);
+  a[26] = vsubq_s16(b[27], b[26]);
+  a[27] = vaddq_s16(b[27], b[26]);
+  a[28] = vaddq_s16(b[28], b[29]);
+  a[29] = vsubq_s16(b[28], b[29]);
+  a[30] = vsubq_s16(b[31], b[30]);
+  a[31] = vaddq_s16(b[31], b[30]);
+
+  // Final stage.
+  out[0] = a[0];
+  out[16] = a[1];
+  out[8] = a[2];
+  out[24] = a[3];
+  out[4] = a[4];
+  out[20] = a[5];
+  out[12] = a[6];
+  out[28] = a[7];
+  out[2] = a[8];
+  out[18] = a[9];
+  out[10] = a[10];
+  out[26] = a[11];
+  out[6] = a[12];
+  out[22] = a[13];
+  out[14] = a[14];
+  out[30] = a[15];
+
+  butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]);
+  butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17],
+                      &out[15]);
+  butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]);
+  butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]);
+  butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]);
+  butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21],
+                      &out[11]);
+  butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13],
+                      &out[19]);
+  butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]);
+}
+
+#undef PASS_THROUGH
+#undef ADD_S16_S32
+#undef SUB_S16_S32
+#undef ADDW_S16_S32
+#undef SUBW_S16_S32
+#undef ADD_S32
+#undef SUB_S32
+#undef BUTTERFLY_ONE_S16_S32
+#undef BUTTERFLY_ONE_S32
+#undef BUTTERFLY_TWO_S32
+
+#endif  // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
diff --git a/vpx_dsp/arm/fdct4x4_neon.c b/vpx_dsp/arm/fdct4x4_neon.c
index 11df729..3b9196f 100644
--- a/vpx_dsp/arm/fdct4x4_neon.c
+++ b/vpx_dsp/arm/fdct4x4_neon.c
@@ -18,10 +18,10 @@
 #include "vpx_dsp/arm/fdct_neon.h"
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
 
 void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
                       int stride) {
-  int i;
   // input[M * stride] * 16
   int16x4_t in[4];
   in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
@@ -34,9 +34,8 @@
     const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
     in[0] = vadd_s16(in[0], one);
   }
-  for (i = 0; i < 2; ++i) {
-    vpx_fdct4x4_pass1_neon(in);
-  }
+  vpx_fdct4x4_pass1_neon(in);
+  vpx_fdct4x4_pass2_neon(in);
   {
     // Not quite a rounding shift. Only add 1 despite shifting by 2.
     const int16x8_t one = vdupq_n_s16(1);
@@ -53,7 +52,6 @@
 
 void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
                              int stride) {
-  int i;
   static const int32x4_t const_1000 = { 1, 0, 0, 0 };
   const int32x4_t const_one = vdupq_n_s32(1);
 
@@ -69,9 +67,8 @@
     in[0] = vaddq_s32(in[0], const_1000);
   }
 
-  for (i = 0; i < 2; ++i) {
-    vpx_highbd_fdct4x4_pass1_neon(in);
-  }
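+  // There is no separate highbd pass2 helper; the same pass routine is run
+  // for both passes.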
+  vpx_highbd_fdct4x4_pass1_neon(in);
+  vpx_highbd_fdct4x4_pass1_neon(in);
   {
     // Not quite a rounding shift. Only add 1 despite shifting by 2.
     in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2);
diff --git a/vpx_dsp/arm/fdct4x4_neon.h b/vpx_dsp/arm/fdct4x4_neon.h
new file mode 100644
index 0000000..de3db97
--- /dev/null
+++ b/vpx_dsp/arm/fdct4x4_neon.h
@@ -0,0 +1,105 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
+  int16x4_t out[4];
+
+  const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+  const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+  // in_0 +/- in_3, in_1 +/- in_2
+  const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+  const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+  // step_0 +/- step_1, step_2 +/- step_3
+  const int16x4_t s_0 = vget_low_s16(s_01);
+  const int16x4_t s_1 = vget_high_s16(s_01);
+  const int16x4_t s_2 = vget_high_s16(s_32);
+  const int16x4_t s_3 = vget_low_s16(s_32);
+
+  // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
+  butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+  // s_3 * cospi_8_64 + s_2 * cospi_24_64
+  // s_3 * cospi_24_64 - s_2 * cospi_8_64
+  butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+  transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+}
+
+static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) {
+  int16x4_t out[4];
+
+  const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+  const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+  // in_0 +/- in_3, in_1 +/- in_2
+  const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+  const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+  // step_0 +/- step_1, step_2 +/- step_3
+  const int16x4_t s_0 = vget_low_s16(s_01);
+  const int16x4_t s_1 = vget_high_s16(s_01);
+  const int16x4_t s_2 = vget_high_s16(s_32);
+  const int16x4_t s_3 = vget_low_s16(s_32);
+
+  // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
+  butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0],
+                                               &out[2]);
+
+  // s_3 * cospi_8_64 + s_2 * cospi_24_64
+  // s_3 * cospi_24_64 - s_2 * cospi_8_64
+  butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+  transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+}
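+
+// Typical usage (see vpx_fdct4x4_neon): run pass1 then pass2 on the same 4x4
+// block; each pass transposes its output, so the two passes together produce
+// the full 2-D transform.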
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
+  int32x4_t out[4];
+  // in_0 +/- in_3, in_1 +/- in_2
+  const int32x4_t s_0 = vaddq_s32(in[0], in[3]);
+  const int32x4_t s_1 = vaddq_s32(in[1], in[2]);
+  const int32x4_t s_2 = vsubq_s32(in[1], in[2]);
+  const int32x4_t s_3 = vsubq_s32(in[0], in[3]);
+
+  butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+  // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64
+  // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64
+  butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64,
+                                          &out[1], &out[3]);
+
+  transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]);
+
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
diff --git a/vpx_dsp/arm/fdct8x8_neon.c b/vpx_dsp/arm/fdct8x8_neon.c
index 3fb15cc..75ee6f2 100644
--- a/vpx_dsp/arm/fdct8x8_neon.c
+++ b/vpx_dsp/arm/fdct8x8_neon.c
@@ -17,10 +17,10 @@
 #include "vpx_dsp/arm/idct_neon.h"
 #include "vpx_dsp/arm/fdct_neon.h"
 #include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
 
 void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
                       int stride) {
-  int i;
   // stage 1
   int16x8_t in[8];
   in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
@@ -31,9 +31,9 @@
   in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
   in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
   in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
-  for (i = 0; i < 2; ++i) {
-    vpx_fdct8x8_pass1_neon(in);
-  }  // for
+
+  vpx_fdct8x8_pass1_neon(in);
+  vpx_fdct8x8_pass2_neon(in);
   {
     // from vpx_dct_sse2.c
     // Post-condition (division by two)
@@ -71,8 +71,6 @@
 
 void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
                              int stride) {
-  int i;
-
   // input[M * stride] * 16
   int32x4_t left[8], right[8];
   int16x8_t in[8];
@@ -102,26 +100,25 @@
   right[6] = vshll_n_s16(vget_high_s16(in[6]), 2);
   right[7] = vshll_n_s16(vget_high_s16(in[7]), 2);
 
-  for (i = 0; i < 2; ++i) {
-    vpx_highbd_fdct8x8_pass1_neon(left, right);
-  }
+  vpx_highbd_fdct8x8_pass1_neon(left, right);
+  vpx_highbd_fdct8x8_pass2_neon(left, right);
   {
-    left[0] = highbd_add_round_shift_s32(left[0]);
-    left[1] = highbd_add_round_shift_s32(left[1]);
-    left[2] = highbd_add_round_shift_s32(left[2]);
-    left[3] = highbd_add_round_shift_s32(left[3]);
-    left[4] = highbd_add_round_shift_s32(left[4]);
-    left[5] = highbd_add_round_shift_s32(left[5]);
-    left[6] = highbd_add_round_shift_s32(left[6]);
-    left[7] = highbd_add_round_shift_s32(left[7]);
-    right[0] = highbd_add_round_shift_s32(right[0]);
-    right[1] = highbd_add_round_shift_s32(right[1]);
-    right[2] = highbd_add_round_shift_s32(right[2]);
-    right[3] = highbd_add_round_shift_s32(right[3]);
-    right[4] = highbd_add_round_shift_s32(right[4]);
-    right[5] = highbd_add_round_shift_s32(right[5]);
-    right[6] = highbd_add_round_shift_s32(right[6]);
-    right[7] = highbd_add_round_shift_s32(right[7]);
+    left[0] = add_round_shift_half_s32(left[0]);
+    left[1] = add_round_shift_half_s32(left[1]);
+    left[2] = add_round_shift_half_s32(left[2]);
+    left[3] = add_round_shift_half_s32(left[3]);
+    left[4] = add_round_shift_half_s32(left[4]);
+    left[5] = add_round_shift_half_s32(left[5]);
+    left[6] = add_round_shift_half_s32(left[6]);
+    left[7] = add_round_shift_half_s32(left[7]);
+    right[0] = add_round_shift_half_s32(right[0]);
+    right[1] = add_round_shift_half_s32(right[1]);
+    right[2] = add_round_shift_half_s32(right[2]);
+    right[3] = add_round_shift_half_s32(right[3]);
+    right[4] = add_round_shift_half_s32(right[4]);
+    right[5] = add_round_shift_half_s32(right[5]);
+    right[6] = add_round_shift_half_s32(right[6]);
+    right[7] = add_round_shift_half_s32(right[7]);
 
     // store results
     vst1q_s32(final_output, left[0]);
diff --git a/vpx_dsp/arm/fdct8x8_neon.h b/vpx_dsp/arm/fdct8x8_neon.h
new file mode 100644
index 0000000..d8fa600
--- /dev/null
+++ b/vpx_dsp/arm/fdct8x8_neon.h
@@ -0,0 +1,381 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
+                                                      int16x8_t *out) {
+  int16x8_t s[8], x[4], t[2];
+
+  s[0] = vaddq_s16(in[0], in[7]);
+  s[1] = vaddq_s16(in[1], in[6]);
+  s[2] = vaddq_s16(in[2], in[5]);
+  s[3] = vaddq_s16(in[3], in[4]);
+  s[4] = vsubq_s16(in[3], in[4]);
+  s[5] = vsubq_s16(in[2], in[5]);
+  s[6] = vsubq_s16(in[1], in[6]);
+  s[7] = vsubq_s16(in[0], in[7]);
+  // fdct4(step, step);
+  x[0] = vaddq_s16(s[0], s[3]);
+  x[1] = vaddq_s16(s[1], s[2]);
+  x[2] = vsubq_s16(s[1], s[2]);
+  x[3] = vsubq_s16(s[0], s[3]);
+
+  // fdct4(step, step);
+  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]);
+  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+  butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  // t1 = (s6 + s5) * cospi_16_64;
+  butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]);
+
+  // Stage 3
+  x[0] = vaddq_s16(s[4], t[0]);
+  x[1] = vsubq_s16(s[4], t[0]);
+  x[2] = vsubq_s16(s[7], t[1]);
+  x[3] = vaddq_s16(s[7], t[1]);
+
+  // Stage 4
+  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in,
+                                                      int16x8_t *out) {
+  int16x8_t s[8], x[4], t[2];
+
+  s[0] = vaddq_s16(in[0], in[7]);
+  s[1] = vaddq_s16(in[1], in[6]);
+  s[2] = vaddq_s16(in[2], in[5]);
+  s[3] = vaddq_s16(in[3], in[4]);
+  s[4] = vsubq_s16(in[3], in[4]);
+  s[5] = vsubq_s16(in[2], in[5]);
+  s[6] = vsubq_s16(in[1], in[6]);
+  s[7] = vsubq_s16(in[0], in[7]);
+  // fdct4(step, step);
+  x[0] = vaddq_s16(s[0], s[3]);
+  x[1] = vaddq_s16(s[1], s[2]);
+  x[2] = vsubq_s16(s[1], s[2]);
+  x[3] = vsubq_s16(s[0], s[3]);
+
+  // fdct4(step, step);
+  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+                                          &out[4]);
+  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+  butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  // t1 = (s6 + s5) * cospi_16_64;
+  butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1],
+                                          &t[0]);
+
+  // Stage 3
+  x[0] = vaddq_s16(s[4], t[0]);
+  x[1] = vsubq_s16(s[4], t[0]);
+  x[2] = vsubq_s16(s[7], t[1]);
+  x[3] = vaddq_s16(s[7], t[1]);
+
+  // Stage 4
+  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
+  int16x8_t out[8];
+  vpx_fdct8x8_pass1_notranspose_neon(in, out);
+  // transpose 8x8
+  transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+  in[4] = out[4];
+  in[5] = out[5];
+  in[6] = out[6];
+  in[7] = out[7];
+}
+
+static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) {
+  int16x8_t out[8];
+  vpx_fdct8x8_pass2_notranspose_neon(in, out);
+  // transpose 8x8
+  transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+  in[4] = out[4];
+  in[5] = out[5];
+  in[6] = out[6];
+  in[7] = out[7];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
+                                                             int32x4_t *right) {
+  int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+  sl[0] = vaddq_s32(left[0], left[7]);
+  sl[1] = vaddq_s32(left[1], left[6]);
+  sl[2] = vaddq_s32(left[2], left[5]);
+  sl[3] = vaddq_s32(left[3], left[4]);
+  sl[4] = vsubq_s32(left[3], left[4]);
+  sl[5] = vsubq_s32(left[2], left[5]);
+  sl[6] = vsubq_s32(left[1], left[6]);
+  sl[7] = vsubq_s32(left[0], left[7]);
+  sr[0] = vaddq_s32(right[0], right[7]);
+  sr[1] = vaddq_s32(right[1], right[6]);
+  sr[2] = vaddq_s32(right[2], right[5]);
+  sr[3] = vaddq_s32(right[3], right[4]);
+  sr[4] = vsubq_s32(right[3], right[4]);
+  sr[5] = vsubq_s32(right[2], right[5]);
+  sr[6] = vsubq_s32(right[1], right[6]);
+  sr[7] = vsubq_s32(right[0], right[7]);
+
+  // fdct4(step, step);
+  // x0 = s0 + s3;
+  xl[0] = vaddq_s32(sl[0], sl[3]);
+  xr[0] = vaddq_s32(sr[0], sr[3]);
+  // x1 = s1 + s2;
+  xl[1] = vaddq_s32(sl[1], sl[2]);
+  xr[1] = vaddq_s32(sr[1], sr[2]);
+  // x2 = s1 - s2;
+  xl[2] = vsubq_s32(sl[1], sl[2]);
+  xr[2] = vsubq_s32(sr[1], sr[2]);
+  // x3 = s0 - s3;
+  xl[3] = vsubq_s32(sl[0], sl[3]);
+  xr[3] = vsubq_s32(sr[0], sr[3]);
+
+  // fdct4(step, step);
+  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+                               &left[0], &right[0], &left[4], &right[4]);
+  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+  butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64,
+                          &left[2], &right[2], &left[6], &right[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  // t1 = (s6 + s5) * cospi_16_64;
+  butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+                               &tr[1], &tl[0], &tr[0]);
+
+  // Stage 3
+  xl[0] = vaddq_s32(sl[4], tl[0]);
+  xr[0] = vaddq_s32(sr[4], tr[0]);
+  xl[1] = vsubq_s32(sl[4], tl[0]);
+  xr[1] = vsubq_s32(sr[4], tr[0]);
+  xl[2] = vsubq_s32(sl[7], tl[1]);
+  xr[2] = vsubq_s32(sr[7], tr[1]);
+  xl[3] = vaddq_s32(sl[7], tl[1]);
+  xr[3] = vaddq_s32(sr[7], tr[1]);
+
+  // Stage 4
+  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64,
+                          &left[1], &right[1], &left[7], &right[7]);
+
+  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64,
+                          &left[5], &right[5], &left[3], &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left,
+                                                             int32x4_t *right) {
+  int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+  sl[0] = vaddq_s32(left[0], left[7]);
+  sl[1] = vaddq_s32(left[1], left[6]);
+  sl[2] = vaddq_s32(left[2], left[5]);
+  sl[3] = vaddq_s32(left[3], left[4]);
+  sl[4] = vsubq_s32(left[3], left[4]);
+  sl[5] = vsubq_s32(left[2], left[5]);
+  sl[6] = vsubq_s32(left[1], left[6]);
+  sl[7] = vsubq_s32(left[0], left[7]);
+  sr[0] = vaddq_s32(right[0], right[7]);
+  sr[1] = vaddq_s32(right[1], right[6]);
+  sr[2] = vaddq_s32(right[2], right[5]);
+  sr[3] = vaddq_s32(right[3], right[4]);
+  sr[4] = vsubq_s32(right[3], right[4]);
+  sr[5] = vsubq_s32(right[2], right[5]);
+  sr[6] = vsubq_s32(right[1], right[6]);
+  sr[7] = vsubq_s32(right[0], right[7]);
+
+  // fdct4(step, step);
+  // x0 = s0 + s3;
+  xl[0] = vaddq_s32(sl[0], sl[3]);
+  xr[0] = vaddq_s32(sr[0], sr[3]);
+  // x1 = s1 + s2;
+  xl[1] = vaddq_s32(sl[1], sl[2]);
+  xr[1] = vaddq_s32(sr[1], sr[2]);
+  // x2 = s1 - s2;
+  xl[2] = vsubq_s32(sl[1], sl[2]);
+  xr[2] = vsubq_s32(sr[1], sr[2]);
+  // x3 = s0 - s3;
+  xl[3] = vsubq_s32(sl[0], sl[3]);
+  xr[3] = vsubq_s32(sr[0], sr[3]);
+
+  // fdct4(step, step);
+  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+                               &left[0], &right[0], &left[4], &right[4]);
+  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+                                     cospi_24_64, &left[2], &right[2], &left[6],
+                                     &right[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  // t1 = (s6 + s5) * cospi_16_64;
+  butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+                               &tr[1], &tl[0], &tr[0]);
+
+  // Stage 3
+  xl[0] = vaddq_s32(sl[4], tl[0]);
+  xr[0] = vaddq_s32(sr[4], tr[0]);
+  xl[1] = vsubq_s32(sl[4], tl[0]);
+  xr[1] = vsubq_s32(sr[4], tr[0]);
+  xl[2] = vsubq_s32(sl[7], tl[1]);
+  xr[2] = vsubq_s32(sr[7], tr[1]);
+  xl[3] = vaddq_s32(sl[7], tl[1]);
+  xr[3] = vaddq_s32(sr[7], tr[1]);
+
+  // Stage 4
+  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+                                     cospi_28_64, &left[1], &right[1], &left[7],
+                                     &right[7]);
+
+  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+                                     cospi_12_64, &left[5], &right[5], &left[3],
+                                     &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
+                                                 int32x4_t *right) {
+  int32x4x2_t out[8];
+  vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
+
+  out[0].val[0] = left[0];
+  out[0].val[1] = right[0];
+  out[1].val[0] = left[1];
+  out[1].val[1] = right[1];
+  out[2].val[0] = left[2];
+  out[2].val[1] = right[2];
+  out[3].val[0] = left[3];
+  out[3].val[1] = right[3];
+  out[4].val[0] = left[4];
+  out[4].val[1] = right[4];
+  out[5].val[0] = left[5];
+  out[5].val[1] = right[5];
+  out[6].val[0] = left[6];
+  out[6].val[1] = right[6];
+  out[7].val[0] = left[7];
+  out[7].val[1] = right[7];
+
+  transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+
+  left[0] = out[0].val[0];
+  right[0] = out[0].val[1];
+  left[1] = out[1].val[0];
+  right[1] = out[1].val[1];
+  left[2] = out[2].val[0];
+  right[2] = out[2].val[1];
+  left[3] = out[3].val[0];
+  right[3] = out[3].val[1];
+  left[4] = out[4].val[0];
+  right[4] = out[4].val[1];
+  left[5] = out[5].val[0];
+  right[5] = out[5].val[1];
+  left[6] = out[6].val[0];
+  right[6] = out[6].val[1];
+  left[7] = out[7].val[0];
+  right[7] = out[7].val[1];
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left,
+                                                 int32x4_t *right) {
+  int32x4x2_t out[8];
+  vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right);
+
+  out[0].val[0] = left[0];
+  out[0].val[1] = right[0];
+  out[1].val[0] = left[1];
+  out[1].val[1] = right[1];
+  out[2].val[0] = left[2];
+  out[2].val[1] = right[2];
+  out[3].val[0] = left[3];
+  out[3].val[1] = right[3];
+  out[4].val[0] = left[4];
+  out[4].val[1] = right[4];
+  out[5].val[0] = left[5];
+  out[5].val[1] = right[5];
+  out[6].val[0] = left[6];
+  out[6].val[1] = right[6];
+  out[7].val[0] = left[7];
+  out[7].val[1] = right[7];
+
+  transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+
+  left[0] = out[0].val[0];
+  right[0] = out[0].val[1];
+  left[1] = out[1].val[0];
+  right[1] = out[1].val[1];
+  left[2] = out[2].val[0];
+  right[2] = out[2].val[1];
+  left[3] = out[3].val[0];
+  right[3] = out[3].val[1];
+  left[4] = out[4].val[0];
+  right[4] = out[4].val[1];
+  left[5] = out[5].val[0];
+  right[5] = out[5].val[1];
+  left[6] = out[6].val[0];
+  right[6] = out[6].val[1];
+  left[7] = out[7].val[0];
+  right[7] = out[7].val[1];
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h
index ce66906..1ea948b 100644
--- a/vpx_dsp/arm/fdct_neon.h
+++ b/vpx_dsp/arm/fdct_neon.h
@@ -14,56 +14,94 @@
 #include <arm_neon.h>
 
 // fdct_round_shift((a +/- b) * c)
-static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
-                                       const tran_high_t constant,
-                                       int16x8_t *add, int16x8_t *sub) {
-  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
-  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
-  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
-  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
-  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
-  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
-  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
-  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
-  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
-  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
-  *add = vcombine_s16(rounded0, rounded1);
-  *sub = vcombine_s16(rounded2, rounded3);
+// Variant that performs fast vqrdmulh_s16 operation on half vector
+// can be slightly less accurate, adequate for pass1
+static INLINE void butterfly_one_coeff_s16_fast_half(const int16x4_t a,
+                                                     const int16x4_t b,
+                                                     const tran_coef_t constant,
+                                                     int16x4_t *add,
+                                                     int16x4_t *sub) {
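+  // vqrdmulh_s16(x, k) computes (2 * x * k + (1 << 15)) >> 16, so with
+  // k = 2 * constant (constant is Q14) this evaluates
+  // fdct_round_shift(x * constant) entirely in 16-bit vectors.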
+  int16x4_t c = vdup_n_s16(2 * constant);
+  *add = vqrdmulh_s16(vadd_s16(a, b), c);
+  *sub = vqrdmulh_s16(vsub_s16(a, b), c);
 }
 
-// fdct_round_shift(a * c0 +/- b * c1)
-static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
-                                       const tran_coef_t constant0,
-                                       const tran_coef_t constant1,
-                                       int16x8_t *add, int16x8_t *sub) {
-  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0);
-  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0);
-  const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1);
-  const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1);
-  const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0);
-  const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0);
-  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1);
-  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1);
-  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
-  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
-  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
-  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
-  *add = vcombine_s16(rounded0, rounded1);
-  *sub = vcombine_s16(rounded2, rounded3);
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulh_s16 operation on full vector
+// can be slightly less accurate, adequate for pass1
+static INLINE void butterfly_one_coeff_s16_fast(const int16x8_t a,
+                                                const int16x8_t b,
+                                                const tran_coef_t constant,
+                                                int16x8_t *add,
+                                                int16x8_t *sub) {
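+  // Same doubling trick as butterfly_one_coeff_s16_fast_half() above.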
+  int16x8_t c = vdupq_n_s16(2 * constant);
+  *add = vqrdmulhq_s16(vaddq_s16(a, b), c);
+  *sub = vqrdmulhq_s16(vsubq_s16(a, b), c);
 }
 
-// Add 2 if positive, 1 if negative, and shift by 2.
-// In practice, subtract the sign bit, then shift with rounding.
-static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
-  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
-  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
-  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
-  return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values, high/low
+static INLINE void butterfly_one_coeff_s16_s32_fast(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+    int32x4_t *sub_hi) {
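+  // vqrdmulhq_s32(x, k) computes (2 * x * k + (1 << 31)) >> 32, so with
+  // k = constant << 17 (constant is Q14) this evaluates
+  // fdct_round_shift(x * constant) on the widened 32-bit sums.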
+  int32x4_t c = vdupq_n_s32(constant << 17);
+  const int16x4_t a_lo = vget_low_s16(a);
+  const int16x4_t a_hi = vget_high_s16(a);
+  const int16x4_t b_lo = vget_low_s16(b);
+  const int16x4_t b_hi = vget_high_s16(b);
+  *add_lo = vqrdmulhq_s32(vaddl_s16(a_lo, b_lo), c);
+  *add_hi = vqrdmulhq_s32(vaddl_s16(a_hi, b_hi), c);
+  *sub_lo = vqrdmulhq_s32(vsubl_s16(a_lo, b_lo), c);
+  *sub_hi = vqrdmulhq_s32(vsubl_s16(a_hi, b_hi), c);
 }
 
-// Like butterfly_one_coeff, but don't narrow results.
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int16x8_t *add, int16x8_t *sub) {
+  int32x4_t add_lo, add_hi, sub_lo, sub_hi;
+  butterfly_one_coeff_s16_s32_fast(a, b, constant, &add_lo, &add_hi, &sub_lo,
+                                   &sub_hi);
+  *add = vcombine_s16(vmovn_s32(add_lo), vmovn_s32(add_hi));
+  *sub = vcombine_s16(vmovn_s32(sub_lo), vmovn_s32(sub_hi));
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_half(
+    const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+    int32x4_t *add, int32x4_t *sub) {
+  int32x4_t c = vdupq_n_s32(constant << 17);
+  *add = vqrdmulhq_s32(vaddl_s16(a, b), c);
+  *sub = vqrdmulhq_s32(vsubl_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow_half(
+    const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+    int16x4_t *add, int16x4_t *sub) {
+  int32x4_t add32, sub32;
+  butterfly_one_coeff_s16_s32_fast_half(a, b, constant, &add32, &sub32);
+  *add = vmovn_s32(add32);
+  *sub = vmovn_s32(sub32);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original Variant that performs normal implementation on full vector
+// fully accurate does 32-bit processing, takes 16-bit values
 static INLINE void butterfly_one_coeff_s16_s32(
-    const int16x8_t a, const int16x8_t b, const tran_high_t constant,
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
     int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
     int32x4_t *sub_hi) {
   const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
@@ -78,37 +116,182 @@
   *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
 }
 
-// Like butterfly_one_coeff, but with s32.
-static INLINE void butterfly_one_coeff_s32(
-    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
-    const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo,
-    int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
-  const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant);
-  const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant);
-  const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant);
-  const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant);
-  const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant);
-  const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant);
-  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
-  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
-  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
-  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+// fdct_round_shift((a +/- b) * c)
+// Original Variant that performs normal implementation on full vector
+// fully accurate does 32-bit processing, takes 16-bit values
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_narrow(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int16x8_t *add, int16x8_t *sub) {
+  int32x4_t add32_lo, add32_hi, sub32_lo, sub32_hi;
+  butterfly_one_coeff_s16_s32(a, b, constant, &add32_lo, &add32_hi, &sub32_lo,
+                              &sub32_hi);
+  *add = vcombine_s16(vmovn_s32(add32_lo), vmovn_s32(add32_hi));
+  *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi));
 }
 
-// Like butterfly_two_coeff, but with s32.
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes and returns 32-bit values
+static INLINE void butterfly_one_coeff_s32_fast_half(const int32x4_t a,
+                                                     const int32x4_t b,
+                                                     const tran_coef_t constant,
+                                                     int32x4_t *add,
+                                                     int32x4_t *sub) {
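+  // Same Q14 -> Q31 coefficient scaling as butterfly_one_coeff_s16_s32_fast,
+  // applied directly to 32-bit inputs.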
+  const int32x4_t c = vdupq_n_s32(constant << 17);
+  *add = vqrdmulhq_s32(vaddq_s32(a, b), c);
+  *sub = vqrdmulhq_s32(vsubq_s32(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values,
+// high/low
+static INLINE void butterfly_one_coeff_s32_fast(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+    int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t c = vdupq_n_s32(constant << 17);
+  *add_lo = vqrdmulhq_s32(vaddq_s32(a_lo, b_lo), c);
+  *add_hi = vqrdmulhq_s32(vaddq_s32(a_hi, b_hi), c);
+  *sub_lo = vqrdmulhq_s32(vsubq_s32(a_lo, b_lo), c);
+  *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on half vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_s64_narrow_half(
+    const int32x4_t a, const int32x4_t b, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add, int32x4_t *sub) {
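+  // The 32x32-bit products are accumulated in 64 bits; vrshrn_n_s64 then
+  // applies fdct_round_shift and narrows the results back to 32 bits.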
+  const int32x2_t a_lo = vget_low_s32(a);
+  const int32x2_t a_hi = vget_high_s32(a);
+  const int32x2_t b_lo = vget_low_s32(b);
+  const int32x2_t b_hi = vget_high_s32(b);
+
+  const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, constant1);
+  const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, constant1);
+  const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, constant2);
+  const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, constant2);
+
+  const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, constant2);
+  const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, constant2);
+  const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, constant1);
+  const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, constant1);
+
+  *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
+                      vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
+  *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
+                      vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on full vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_s64_narrow(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  // ac1/ac2 hold the following values:
+  // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1,
+  //      vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1
+  // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2,
+  //      vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2
+  int64x2_t ac1[4];
+  int64x2_t ac2[4];
+  int64x2_t sum[4];
+  int64x2_t diff[4];
+
+  ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1);
+  ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1);
+  ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1);
+  ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1);
+  ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2);
+  ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2);
+  ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2);
+  ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2);
+
+  sum[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2);
+  sum[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2);
+  sum[2] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2);
+  sum[3] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2);
+  *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS),
+                         vrshrn_n_s64(sum[1], DCT_CONST_BITS));
+  *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS),
+                         vrshrn_n_s64(sum[3], DCT_CONST_BITS));
+
+  diff[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1);
+  diff[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1);
+  diff[2] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1);
+  diff[3] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1);
+  *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS),
+                         vrshrn_n_s64(diff[1], DCT_CONST_BITS));
+  *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS),
+                         vrshrn_n_s64(diff[3], DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on half vector
+// more accurate does 32-bit processing, takes and returns 16-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_half(const int16x4_t a,
+                                            const int16x4_t b,
+                                            const tran_coef_t constant1,
+                                            const tran_coef_t constant2,
+                                            int16x4_t *add, int16x4_t *sub) {
+  const int32x4_t a1 = vmull_n_s16(a, constant1);
+  const int32x4_t a2 = vmull_n_s16(a, constant2);
+  const int32x4_t sum = vmlal_n_s16(a1, b, constant2);
+  const int32x4_t diff = vmlsl_n_s16(a2, b, constant1);
+  *add = vqrshrn_n_s32(sum, DCT_CONST_BITS);
+  *sub = vqrshrn_n_s32(diff, DCT_CONST_BITS);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 16-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+                                       const tran_coef_t constant1,
+                                       const tran_coef_t constant2,
+                                       int16x8_t *add, int16x8_t *sub) {
+  const int32x4_t a1 = vmull_n_s16(vget_low_s16(a), constant1);
+  const int32x4_t a2 = vmull_n_s16(vget_high_s16(a), constant1);
+  const int32x4_t a3 = vmull_n_s16(vget_low_s16(a), constant2);
+  const int32x4_t a4 = vmull_n_s16(vget_high_s16(a), constant2);
+  const int32x4_t sum0 = vmlal_n_s16(a1, vget_low_s16(b), constant2);
+  const int32x4_t sum1 = vmlal_n_s16(a2, vget_high_s16(b), constant2);
+  const int32x4_t diff0 = vmlsl_n_s16(a3, vget_low_s16(b), constant1);
+  const int32x4_t diff1 = vmlsl_n_s16(a4, vget_high_s16(b), constant1);
+  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+  *add = vcombine_s16(rounded0, rounded1);
+  *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values
+// returns 32-bit rounded results
 static INLINE void butterfly_two_coeff_s32(
     const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
-    const int32x4_t b_hi, const int32_t constant0, const int32_t constant1,
-    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
-    int32x4_t *sub_hi) {
-  const int32x4_t a0 = vmulq_n_s32(a_lo, constant0);
-  const int32x4_t a1 = vmulq_n_s32(a_hi, constant0);
-  const int32x4_t a2 = vmulq_n_s32(a_lo, constant1);
-  const int32x4_t a3 = vmulq_n_s32(a_hi, constant1);
-  const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0);
-  const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0);
-  const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1);
-  const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1);
+    const int32x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+  const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+  const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+  const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+  const int32x4_t sum0 = vmlaq_n_s32(a1, b_lo, constant2);
+  const int32x4_t sum1 = vmlaq_n_s32(a2, b_hi, constant2);
+  const int32x4_t diff0 = vmlsq_n_s32(a3, b_lo, constant1);
+  const int32x4_t diff1 = vmlsq_n_s32(a4, b_hi, constant1);
   *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
   *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
   *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
@@ -126,9 +309,10 @@
 }
 
 // Add 1 if positive, 2 if negative, and shift by 2.
-// In practice, add 1, then add the sign bit, then shift without rounding.
-static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo,
-                                            const int32x4_t a_hi) {
+// In practice, add 1, then add the sign bit, then shift without rounding,
+// and return narrowed results.
+static INLINE int16x8_t add_round_shift_s32_narrow(const int32x4_t a_lo,
+                                                   const int32x4_t a_hi) {
   const int32x4_t one = vdupq_n_s32(1);
   const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
   const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
@@ -143,419 +327,32 @@
   return vcombine_s16(b_lo, b_hi);
 }
 
-static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
-  const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
-  const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
-
-  // in_0 +/- in_3, in_1 +/- in_2
-  const int16x8_t s_01 = vaddq_s16(input_01, input_32);
-  const int16x8_t s_32 = vsubq_s16(input_01, input_32);
-
-  // step_0 +/- step_1, step_2 +/- step_3
-  const int16x4_t s_0 = vget_low_s16(s_01);
-  const int16x4_t s_1 = vget_high_s16(s_01);
-  const int16x4_t s_2 = vget_high_s16(s_32);
-  const int16x4_t s_3 = vget_low_s16(s_32);
-
-  // (s_0 +/- s_1) * cospi_16_64
-  // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
-  const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
-  const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
-  const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
-  const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
-
-  // fdct_round_shift
-  int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
-  int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
-
-  // s_3 * cospi_8_64 + s_2 * cospi_24_64
-  // s_3 * cospi_24_64 - s_2 * cospi_8_64
-  const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
-  const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
-
-  const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
-  const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
-
-  // fdct_round_shift
-  int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
-  int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
-
-  transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
-
-  in[0] = out_0;
-  in[1] = out_1;
-  in[2] = out_2;
-  in[3] = out_3;
+// Add 1 if negative, and shift by 1.
+// In practice, add the sign bit, then shift without rounding.
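+// For example, a = 3 gives (3 + 0) >> 1 = 1 and a = -3 gives
+// (-3 + 1) >> 1 = -1.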
+static INLINE int32x4_t add_round_shift_half_s32(const int32x4_t a) {
+  const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+  const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+  const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+  return vshrq_n_s32(vaddq_s32(a, a_sign_s32), 1);
 }
 
-static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
-                                                      int16x8_t *out) {
-  const int16x8_t v_s0 = vaddq_s16(in[0], in[7]);
-  const int16x8_t v_s1 = vaddq_s16(in[1], in[6]);
-  const int16x8_t v_s2 = vaddq_s16(in[2], in[5]);
-  const int16x8_t v_s3 = vaddq_s16(in[3], in[4]);
-  const int16x8_t v_s4 = vsubq_s16(in[3], in[4]);
-  const int16x8_t v_s5 = vsubq_s16(in[2], in[5]);
-  const int16x8_t v_s6 = vsubq_s16(in[1], in[6]);
-  const int16x8_t v_s7 = vsubq_s16(in[0], in[7]);
-  // fdct4(step, step);
-  int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
-  int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
-  int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
-  int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
-  // fdct4(step, step);
-  int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-  int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-  int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-  int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-  int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
-  int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
-  int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
-  int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
-  v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
-  v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
-  v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
-  v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
-  v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
-  v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
-  v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
-  v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
-  {
-    const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-    const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-    const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-    const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-    const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-    const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-    const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-    const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-    out[0] = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
-    out[2] = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
-    out[4] = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
-    out[6] = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
-  }
-  // Stage 2
-  v_x0 = vsubq_s16(v_s6, v_s5);
-  v_x1 = vaddq_s16(v_s6, v_s5);
-  v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
-  v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
-  v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
-  v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
-  {
-    const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-    const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-    const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-    const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-    const int16x8_t ab = vcombine_s16(a, b);
-    const int16x8_t cd = vcombine_s16(c, d);
-    // Stage 3
-    v_x0 = vaddq_s16(v_s4, ab);
-    v_x1 = vsubq_s16(v_s4, ab);
-    v_x2 = vsubq_s16(v_s7, cd);
-    v_x3 = vaddq_s16(v_s7, cd);
-  }
-  // Stage 4
-  v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
-  v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
-  v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
-  v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
-  v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
-  v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
-  v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
-  v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
-  v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
-  v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
-  v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
-  v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
-  v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
-  v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
-  v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
-  v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
-  {
-    const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-    const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-    const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-    const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-    const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-    const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-    const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-    const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-    out[1] = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
-    out[3] = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
-    out[5] = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
-    out[7] = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
-  }
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_s32(const int32x4_t a) {
+  const int32x4_t one = vdupq_n_s32(1);
+  const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+  const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+  const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+  return vshrq_n_s32(vaddq_s32(vaddq_s32(a, a_sign_s32), one), 2);
 }
 
-static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
-  int16x8_t out[8];
-  vpx_fdct8x8_pass1_notranspose_neon(in, out);
-  // transpose 8x8
-  // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
-  // columns.
-  {
-    // 00 01 02 03 40 41 42 43
-    // 10 11 12 13 50 51 52 53
-    // 20 21 22 23 60 61 62 63
-    // 30 31 32 33 70 71 72 73
-    // 04 05 06 07 44 45 46 47
-    // 14 15 16 17 54 55 56 57
-    // 24 25 26 27 64 65 66 67
-    // 34 35 36 37 74 75 76 77
-    const int32x4x2_t r02_s32 =
-        vtrnq_s32(vreinterpretq_s32_s16(out[0]), vreinterpretq_s32_s16(out[2]));
-    const int32x4x2_t r13_s32 =
-        vtrnq_s32(vreinterpretq_s32_s16(out[1]), vreinterpretq_s32_s16(out[3]));
-    const int32x4x2_t r46_s32 =
-        vtrnq_s32(vreinterpretq_s32_s16(out[4]), vreinterpretq_s32_s16(out[6]));
-    const int32x4x2_t r57_s32 =
-        vtrnq_s32(vreinterpretq_s32_s16(out[5]), vreinterpretq_s32_s16(out[7]));
-    const int16x8x2_t r01_s16 =
-        vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
-                  vreinterpretq_s16_s32(r13_s32.val[0]));
-    const int16x8x2_t r23_s16 =
-        vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
-                  vreinterpretq_s16_s32(r13_s32.val[1]));
-    const int16x8x2_t r45_s16 =
-        vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
-                  vreinterpretq_s16_s32(r57_s32.val[0]));
-    const int16x8x2_t r67_s16 =
-        vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
-                  vreinterpretq_s16_s32(r57_s32.val[1]));
-    in[0] = r01_s16.val[0];
-    in[1] = r01_s16.val[1];
-    in[2] = r23_s16.val[0];
-    in[3] = r23_s16.val[1];
-    in[4] = r45_s16.val[0];
-    in[5] = r45_s16.val[1];
-    in[6] = r67_s16.val[0];
-    in[7] = r67_s16.val[1];
-    // 00 10 20 30 40 50 60 70
-    // 01 11 21 31 41 51 61 71
-    // 02 12 22 32 42 52 62 72
-    // 03 13 23 33 43 53 63 73
-    // 04 14 24 34 44 54 64 74
-    // 05 15 25 35 45 55 65 75
-    // 06 16 26 36 46 56 66 76
-    // 07 17 27 37 47 57 67 77
-  }
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
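+// For example, a = 5 gives (5 + 2) >> 2 = 1 and a = -5 gives
+// (-5 + 1) >> 2 = -1.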
+static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) {
+  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+  return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
 }
 
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE int32x4_t highbd_add_round_shift_s32(int32x4_t x) {
-  const int32x2_t x_lo = vget_low_s32(x);
-  const int32x2_t x_hi = vget_high_s32(x);
-  const int64x2_t x64_lo = vmovl_s32(x_lo);
-  const int64x2_t x64_hi = vmovl_s32(x_hi);
-
-  const int64x2_t sign_lo = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_lo, 63);
-  const int64x2_t sign_hi = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_hi, 63);
-
-  const int64x2_t sum_lo = vaddq_s64(x64_lo, sign_lo);
-  const int64x2_t sum_hi = vaddq_s64(x64_hi, sign_hi);
-  return vcombine_s32(vshrn_n_s64(sum_lo, 1), vshrn_n_s64(sum_hi, 1));
-}
-
-static INLINE void highbd_butterfly_one_coeff_s32(const int32x4_t a,
-                                                  const int32x4_t b,
-                                                  const tran_coef_t c,
-                                                  int32x4_t *add,
-                                                  int32x4_t *sub) {
-  const int32x2_t a_lo = vget_low_s32(a);
-  const int32x2_t a_hi = vget_high_s32(a);
-  const int32x2_t b_lo = vget_low_s32(b);
-  const int32x2_t b_hi = vget_high_s32(b);
-
-  const int64x2_t a64_lo = vmull_n_s32(a_lo, c);
-  const int64x2_t a64_hi = vmull_n_s32(a_hi, c);
-
-  const int64x2_t sum_lo = vmlal_n_s32(a64_lo, b_lo, c);
-  const int64x2_t sum_hi = vmlal_n_s32(a64_hi, b_hi, c);
-  const int64x2_t diff_lo = vmlsl_n_s32(a64_lo, b_lo, c);
-  const int64x2_t diff_hi = vmlsl_n_s32(a64_hi, b_hi, c);
-
-  *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
-                      vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
-  *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
-                      vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
-}
-
-static INLINE void highbd_butterfly_two_coeff_s32(
-    const int32x4_t a, const int32x4_t b, const tran_coef_t c0,
-    const tran_coef_t c1, int32x4_t *add, int32x4_t *sub) {
-  const int32x2_t a_lo = vget_low_s32(a);
-  const int32x2_t a_hi = vget_high_s32(a);
-  const int32x2_t b_lo = vget_low_s32(b);
-  const int32x2_t b_hi = vget_high_s32(b);
-
-  const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, c0);
-  const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, c0);
-  const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, c1);
-  const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, c1);
-
-  const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, c1);
-  const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, c1);
-  const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, c0);
-  const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, c0);
-
-  *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
-                      vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
-  *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
-                      vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
-}
-
-static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
-  int32x4_t out[4];
-  // in_0 +/- in_3, in_1 +/- in_2
-  const int32x4_t s_0 = vaddq_s32(in[0], in[3]);
-  const int32x4_t s_1 = vaddq_s32(in[1], in[2]);
-  const int32x4_t s_2 = vsubq_s32(in[1], in[2]);
-  const int32x4_t s_3 = vsubq_s32(in[0], in[3]);
-
-  highbd_butterfly_one_coeff_s32(s_0, s_1, cospi_16_64, &out[0], &out[2]);
-
-  // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64
-  // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64
-  highbd_butterfly_two_coeff_s32(s_3, s_2, cospi_8_64, cospi_24_64, &out[1],
-                                 &out[3]);
-
-  transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]);
-
-  in[0] = out[0];
-  in[1] = out[1];
-  in[2] = out[2];
-  in[3] = out[3];
-}
-
-static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
-                                                             int32x4_t *right) {
-  int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
-
-  sl[0] = vaddq_s32(left[0], left[7]);
-  sl[1] = vaddq_s32(left[1], left[6]);
-  sl[2] = vaddq_s32(left[2], left[5]);
-  sl[3] = vaddq_s32(left[3], left[4]);
-  sl[4] = vsubq_s32(left[3], left[4]);
-  sl[5] = vsubq_s32(left[2], left[5]);
-  sl[6] = vsubq_s32(left[1], left[6]);
-  sl[7] = vsubq_s32(left[0], left[7]);
-  sr[0] = vaddq_s32(right[0], right[7]);
-  sr[1] = vaddq_s32(right[1], right[6]);
-  sr[2] = vaddq_s32(right[2], right[5]);
-  sr[3] = vaddq_s32(right[3], right[4]);
-  sr[4] = vsubq_s32(right[3], right[4]);
-  sr[5] = vsubq_s32(right[2], right[5]);
-  sr[6] = vsubq_s32(right[1], right[6]);
-  sr[7] = vsubq_s32(right[0], right[7]);
-
-  // fdct4(step, step);
-  // x0 = s0 + s3;
-  xl[0] = vaddq_s32(sl[0], sl[3]);
-  xr[0] = vaddq_s32(sr[0], sr[3]);
-  // x1 = s1 + s2;
-  xl[1] = vaddq_s32(sl[1], sl[2]);
-  xr[1] = vaddq_s32(sr[1], sr[2]);
-  // x2 = s1 - s2;
-  xl[2] = vsubq_s32(sl[1], sl[2]);
-  xr[2] = vsubq_s32(sr[1], sr[2]);
-  // x3 = s0 - s3;
-  xl[3] = vsubq_s32(sl[0], sl[3]);
-  xr[3] = vsubq_s32(sr[0], sr[3]);
-
-  // fdct4(step, step);
-  // t0 = (x0 + x1) * cospi_16_64;
-  // t1 = (x0 - x1) * cospi_16_64;
-  // out[0] = (tran_low_t)fdct_round_shift(t0);
-  // out[4] = (tran_low_t)fdct_round_shift(t1);
-  highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[4]);
-  highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0],
-                                 &right[4]);
-  // t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
-  // t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
-  // out[2] = (tran_low_t)fdct_round_shift(t2);
-  // out[6] = (tran_low_t)fdct_round_shift(t3);
-  highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64,
-                                 &left[2], &left[6]);
-  highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64,
-                                 &right[2], &right[6]);
-
-  // Stage 2
-  // t0 = (s6 - s5) * cospi_16_64;
-  highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &tl[1], &tl[0]);
-  highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &tr[1], &tr[0]);
-
-  // Stage 3
-  xl[0] = vaddq_s32(sl[4], tl[0]);
-  xr[0] = vaddq_s32(sr[4], tr[0]);
-  xl[1] = vsubq_s32(sl[4], tl[0]);
-  xr[1] = vsubq_s32(sr[4], tr[0]);
-  xl[2] = vsubq_s32(sl[7], tl[1]);
-  xr[2] = vsubq_s32(sr[7], tr[1]);
-  xl[3] = vaddq_s32(sl[7], tl[1]);
-  xr[3] = vaddq_s32(sr[7], tr[1]);
-
-  // Stage 4
-  // t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
-  // out[1] = (tran_low_t)fdct_round_shift(t0);
-  // t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
-  // out[7] = (tran_low_t)fdct_round_shift(t3);
-  highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64,
-                                 &left[1], &left[7]);
-  highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64,
-                                 &right[1], &right[7]);
-
-  // t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
-  // out[5] = (tran_low_t)fdct_round_shift(t1);
-  // t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-  // out[3] = (tran_low_t)fdct_round_shift(t2);
-  highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64,
-                                 &left[5], &left[3]);
-  highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64,
-                                 &right[5], &right[3]);
-}
-
-static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
-                                                 int32x4_t *right) {
-  int32x4x2_t out[8];
-  vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
-
-  out[0].val[0] = left[0];
-  out[0].val[1] = right[0];
-  out[1].val[0] = left[1];
-  out[1].val[1] = right[1];
-  out[2].val[0] = left[2];
-  out[2].val[1] = right[2];
-  out[3].val[0] = left[3];
-  out[3].val[1] = right[3];
-  out[4].val[0] = left[4];
-  out[4].val[1] = right[4];
-  out[5].val[0] = left[5];
-  out[5].val[1] = right[5];
-  out[6].val[0] = left[6];
-  out[6].val[1] = right[6];
-  out[7].val[0] = left[7];
-  out[7].val[1] = right[7];
-
-  transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
-                    &out[6], &out[7]);
-
-  left[0] = out[0].val[0];
-  right[0] = out[0].val[1];
-  left[1] = out[1].val[0];
-  right[1] = out[1].val[1];
-  left[2] = out[2].val[0];
-  right[2] = out[2].val[1];
-  left[3] = out[3].val[0];
-  right[3] = out[3].val[1];
-  left[4] = out[4].val[0];
-  right[4] = out[4].val[1];
-  left[5] = out[5].val[0];
-  right[5] = out[5].val[1];
-  left[6] = out[6].val[0];
-  right[6] = out[6].val[1];
-  left[7] = out[7].val[0];
-  right[7] = out[7].val[1];
-}
-
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // VPX_VPX_DSP_ARM_FDCT_NEON_H_
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index bf06d6a..41d44f2 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -821,6 +821,51 @@
   a7->val[1] = c7.val[1];
 }
 
+// Helper transpose function for highbd FDCT variants
+static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/,
+                                       int32x4_t *right /*[8]*/,
+                                       int32x4_t *out_left /*[8]*/,
+                                       int32x4_t *out_right /*[8]*/) {
+  int32x4x2_t out[8];
+
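+  // Pack the left/right halves into int32x4x2_t rows so transpose_s32_8x8()
+  // can be reused, then unpack the transposed rows into the outputs.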
+  out[0].val[0] = left[0];
+  out[0].val[1] = right[0];
+  out[1].val[0] = left[1];
+  out[1].val[1] = right[1];
+  out[2].val[0] = left[2];
+  out[2].val[1] = right[2];
+  out[3].val[0] = left[3];
+  out[3].val[1] = right[3];
+  out[4].val[0] = left[4];
+  out[4].val[1] = right[4];
+  out[5].val[0] = left[5];
+  out[5].val[1] = right[5];
+  out[6].val[0] = left[6];
+  out[6].val[1] = right[6];
+  out[7].val[0] = left[7];
+  out[7].val[1] = right[7];
+
+  transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+
+  out_left[0] = out[0].val[0];
+  out_left[1] = out[1].val[0];
+  out_left[2] = out[2].val[0];
+  out_left[3] = out[3].val[0];
+  out_left[4] = out[4].val[0];
+  out_left[5] = out[5].val[0];
+  out_left[6] = out[6].val[0];
+  out_left[7] = out[7].val[0];
+  out_right[0] = out[0].val[1];
+  out_right[1] = out[1].val[1];
+  out_right[2] = out[2].val[1];
+  out_right[3] = out[3].val[1];
+  out_right[4] = out[4].val[1];
+  out_right[5] = out[5].val[1];
+  out_right[6] = out[6].val[1];
+  out_right[7] = out[7].val[1];
+}
+
 static INLINE void transpose_u8_16x8(
     const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
     const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,