diff --git a/BUILD.gn b/BUILD.gn
index 901ed1e..f9c483e 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -162,6 +162,7 @@
       "simd/arm/common/jdmerge-neon.c",
       "simd/arm/common/jdsample-neon.c",
       "simd/arm/common/jfdctfst-neon.c",
+      "simd/arm/common/jfdctint-neon.c",
       "simd/arm/common/jidctfst-neon.c",
       "simd/arm/common/jidctint-neon.c",
       "simd/arm/common/jidctred-neon.c",
@@ -180,6 +181,7 @@
       "simd/arm/common/jdmerge-neon.c",
       "simd/arm/common/jdsample-neon.c",
       "simd/arm/common/jfdctfst-neon.c",
+      "simd/arm/common/jfdctint-neon.c",
       "simd/arm/common/jidctfst-neon.c",
       "simd/arm/common/jidctint-neon.c",
       "simd/arm/common/jidctred-neon.c",
diff --git a/README.chromium b/README.chromium
index 19db222..fca4ea6 100644
--- a/README.chromium
+++ b/README.chromium
@@ -79,6 +79,7 @@
   - Implement sample conversion using Arm NEON intrinsics
   - Implement quantization using Arm NEON intrinsics
   - Implement fast DCT using Arm NEON intrinsics
+  - Implement accurate DCT using Arm NEON intrinsics
 * Patches to enable running the upstream unit tests through gtest.
   The upstream unit tests are defined here under the section 'TESTS':
   https://github.com/libjpeg-turbo/libjpeg-turbo/blob/master/CMakeLists.txt
diff --git a/simd/arm/arm/jsimd.c b/simd/arm/arm/jsimd.c
index 98907a5..c0d5d90 100644
--- a/simd/arm/arm/jsimd.c
+++ b/simd/arm/arm/jsimd.c
@@ -674,6 +674,17 @@
 GLOBAL(int)
 jsimd_can_fdct_islow(void)
 {
+  init_simd();  /* presumably sets simd_support -- confirm */
+
+  /* The Neon islow FDCT handles only 8x8 blocks of 16-bit DCTELEMs. */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;  /* Neon flag set: report the SIMD islow FDCT as usable */
+
   return 0;
 }
 
@@ -703,6 +714,7 @@
 GLOBAL(void)
 jsimd_fdct_islow(DCTELEM *data)
 {
+  jsimd_fdct_islow_neon(data);  /* unconditional call; no Neon check here */
 }
 
 GLOBAL(void)
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
index d94cfdd..898cf2c 100644
--- a/simd/arm/arm64/jsimd_neon.S
+++ b/simd/arm/arm64/jsimd_neon.S
@@ -39,53 +39,6 @@
 .section .rodata, "a", %progbits
 #endif
 
-/* Constants for jsimd_fdct_islow_neon() */
-
-#define F_0_298   2446  /* FIX(0.298631336) */
-#define F_0_390   3196  /* FIX(0.390180644) */
-#define F_0_541   4433  /* FIX(0.541196100) */
-#define F_0_765   6270  /* FIX(0.765366865) */
-#define F_0_899   7373  /* FIX(0.899976223) */
-#define F_1_175   9633  /* FIX(1.175875602) */
-#define F_1_501  12299  /* FIX(1.501321110) */
-#define F_1_847  15137  /* FIX(1.847759065) */
-#define F_1_961  16069  /* FIX(1.961570560) */
-#define F_2_053  16819  /* FIX(2.053119869) */
-#define F_2_562  20995  /* FIX(2.562915447) */
-#define F_3_072  25172  /* FIX(3.072711026) */
-
-.balign 16
-Ljsimd_fdct_islow_neon_consts:
-  .short F_0_298
-  .short -F_0_390
-  .short F_0_541
-  .short F_0_765
-  .short - F_0_899
-  .short F_1_175
-  .short F_1_501
-  .short - F_1_847
-  .short - F_1_961
-  .short F_2_053
-  .short - F_2_562
-  .short F_3_072
-  .short 0          /* padding */
-  .short 0
-  .short 0
-  .short 0
-
-#undef F_0_298
-#undef F_0_390
-#undef F_0_541
-#undef F_0_765
-#undef F_0_899
-#undef F_1_175
-#undef F_1_501
-#undef F_1_847
-#undef F_1_961
-#undef F_2_053
-#undef F_2_562
-#undef F_3_072
-
 /* Constants for jsimd_huff_encode_one_block_neon() */
 
 .balign 16
@@ -152,340 +105,12 @@
 #endif
 .endm
 
-.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
-    trn1            \t0\().8h, \l0\().8h, \l1\().8h
-    trn1            \t1\().8h, \l2\().8h, \l3\().8h
-    trn1            \t2\().8h, \l4\().8h, \l5\().8h
-    trn1            \t3\().8h, \l6\().8h, \l7\().8h
-    trn2            \l1\().8h, \l0\().8h, \l1\().8h
-    trn2            \l3\().8h, \l2\().8h, \l3\().8h
-    trn2            \l5\().8h, \l4\().8h, \l5\().8h
-    trn2            \l7\().8h, \l6\().8h, \l7\().8h
-
-    trn1            \l4\().4s, \t2\().4s, \t3\().4s
-    trn2            \t3\().4s, \t2\().4s, \t3\().4s
-    trn1            \t2\().4s, \t0\().4s, \t1\().4s
-    trn2            \l2\().4s, \t0\().4s, \t1\().4s
-    trn1            \t0\().4s, \l1\().4s, \l3\().4s
-    trn2            \l3\().4s, \l1\().4s, \l3\().4s
-    trn2            \t1\().4s, \l5\().4s, \l7\().4s
-    trn1            \l5\().4s, \l5\().4s, \l7\().4s
-
-    trn2            \l6\().2d, \l2\().2d, \t3\().2d
-    trn1            \l0\().2d, \t2\().2d, \l4\().2d
-    trn1            \l1\().2d, \t0\().2d, \l5\().2d
-    trn2            \l7\().2d, \l3\().2d, \t1\().2d
-    trn1            \l2\().2d, \l2\().2d, \t3\().2d
-    trn2            \l4\().2d, \t2\().2d, \l4\().2d
-    trn1            \l3\().2d, \l3\().2d, \t1\().2d
-    trn2            \l5\().2d, \t0\().2d, \l5\().2d
-.endm
-
 
 #define CENTERJSAMPLE  128
 
 /*****************************************************************************/
 
 /*
- * jsimd_fdct_islow_neon
- *
- * This file contains a slow-but-accurate integer implementation of the
- * forward DCT (Discrete Cosine Transform). The following code is based
- * directly on the IJG''s original jfdctint.c; see the jfdctint.c for
- * more details.
- *
- * TODO: can be combined with 'jsimd_convsamp_neon' to get
- *       rid of a bunch of VLD1.16 instructions
- */
-
-#define CONST_BITS  13
-#define PASS1_BITS  2
-
-#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
-#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
-
-#define XFIX_P_0_298  v0.h[0]
-#define XFIX_N_0_390  v0.h[1]
-#define XFIX_P_0_541  v0.h[2]
-#define XFIX_P_0_765  v0.h[3]
-#define XFIX_N_0_899  v0.h[4]
-#define XFIX_P_1_175  v0.h[5]
-#define XFIX_P_1_501  v0.h[6]
-#define XFIX_N_1_847  v0.h[7]
-#define XFIX_N_1_961  v1.h[0]
-#define XFIX_P_2_053  v1.h[1]
-#define XFIX_N_2_562  v1.h[2]
-#define XFIX_P_3_072  v1.h[3]
-
-.balign 16
-asm_function jsimd_fdct_islow_neon
-
-    DATA            .req x0
-    TMP             .req x9
-
-    /* Load constants */
-    get_symbol_loc  TMP, Ljsimd_fdct_islow_neon_consts
-    ld1             {v0.8h, v1.8h}, [TMP]
-
-    /* Save NEON registers */
-    sub             sp, sp, #64
-    mov             x10, sp
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
-
-    /* Load all DATA into NEON registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d16     | d17    | v16.8h
-     *   1 | d18     | d19    | v17.8h
-     *   2 | d20     | d21    | v18.8h
-     *   3 | d22     | d23    | v19.8h
-     *   4 | d24     | d25    | v20.8h
-     *   5 | d26     | d27    | v21.8h
-     *   6 | d28     | d29    | v22.8h
-     *   7 | d30     | d31    | v23.8h
-     */
-
-    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
-    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-    sub             DATA, DATA, #64
-
-    /* Transpose */
-    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
-    /* 1-D FDCT */
-    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
-    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
-    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
-    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
-    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
-    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
-    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
-    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
-
-    /* even part */
-
-    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
-    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
-    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
-    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
-
-    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
-    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
-
-    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
-
-    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
-    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
-
-    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
-    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
-    mov             v22.16b, v18.16b
-    mov             v25.16b, v24.16b
-
-    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
-    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
-    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
-    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
-
-    rshrn           v18.4h, v18.4s, #DESCALE_P1
-    rshrn           v22.4h, v22.4s, #DESCALE_P1
-    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
-    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
-
-    /* Odd part */
-
-    add             v8.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
-    add             v9.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
-    add             v10.8h, v28.8h, v30.8h       /* z3 = tmp4 + tmp6; */
-    add             v11.8h, v29.8h, v31.8h       /* z4 = tmp5 + tmp7; */
-    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
-    smull2          v5.4s, v10.8h, XFIX_P_1_175
-    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
-    smlal2          v5.4s, v11.8h, XFIX_P_1_175
-
-    smull2          v24.4s, v28.8h, XFIX_P_0_298
-    smull2          v25.4s, v29.8h, XFIX_P_2_053
-    smull2          v26.4s, v30.8h, XFIX_P_3_072
-    smull2          v27.4s, v31.8h, XFIX_P_1_501
-    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
-    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
-    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
-    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
-
-    smull2          v12.4s, v8.8h, XFIX_N_0_899
-    smull2          v13.4s, v9.8h, XFIX_N_2_562
-    smull2          v14.4s, v10.8h, XFIX_N_1_961
-    smull2          v15.4s, v11.8h, XFIX_N_0_390
-    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
-    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
-    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
-    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
-
-    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
-    add             v14.4s, v14.4s, v5.4s
-    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
-    add             v15.4s, v15.4s, v5.4s
-
-    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
-    add             v24.4s, v24.4s, v12.4s
-    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
-    add             v25.4s, v25.4s, v13.4s
-    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
-    add             v26.4s, v26.4s, v14.4s
-    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
-    add             v27.4s, v27.4s, v15.4s
-
-    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
-    add             v24.4s, v24.4s, v14.4s
-    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
-    add             v25.4s, v25.4s, v15.4s
-    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
-    add             v26.4s, v26.4s, v13.4s
-    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
-    add             v27.4s, v27.4s, v12.4s
-
-    rshrn           v23.4h, v28.4s, #DESCALE_P1
-    rshrn           v21.4h, v29.4s, #DESCALE_P1
-    rshrn           v19.4h, v30.4s, #DESCALE_P1
-    rshrn           v17.4h, v31.4s, #DESCALE_P1
-    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
-    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
-    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
-    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
-
-    /* Transpose */
-    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
-
-    /* 1-D FDCT */
-    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
-    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
-    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
-    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
-    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
-    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
-    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
-    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
-
-    /* even part */
-    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
-    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
-    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
-    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
-
-    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
-    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
-
-    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
-
-    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
-    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
-
-    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
-    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
-    mov             v22.16b, v18.16b
-    mov             v25.16b, v24.16b
-
-    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
-    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
-    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
-    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
-
-    rshrn           v18.4h, v18.4s, #DESCALE_P2
-    rshrn           v22.4h, v22.4s, #DESCALE_P2
-    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
-    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
-
-    /* Odd part */
-    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
-    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
-    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
-    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
-
-    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
-    smull2          v5.4s, v10.8h, XFIX_P_1_175
-    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
-    smlal2          v5.4s, v11.8h, XFIX_P_1_175
-
-    smull2          v24.4s, v28.8h, XFIX_P_0_298
-    smull2          v25.4s, v29.8h, XFIX_P_2_053
-    smull2          v26.4s, v30.8h, XFIX_P_3_072
-    smull2          v27.4s, v31.8h, XFIX_P_1_501
-    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
-    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
-    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
-    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
-
-    smull2          v12.4s, v8.8h, XFIX_N_0_899
-    smull2          v13.4s, v9.8h, XFIX_N_2_562
-    smull2          v14.4s, v10.8h, XFIX_N_1_961
-    smull2          v15.4s, v11.8h, XFIX_N_0_390
-    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
-    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
-    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
-    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
-
-    add             v10.4s, v10.4s, v4.4s
-    add             v14.4s, v14.4s, v5.4s
-    add             v11.4s, v11.4s, v4.4s
-    add             v15.4s, v15.4s, v5.4s
-
-    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
-    add             v24.4s, v24.4s, v12.4s
-    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
-    add             v25.4s, v25.4s, v13.4s
-    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
-    add             v26.4s, v26.4s, v14.4s
-    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
-    add             v27.4s, v27.4s, v15.4s
-
-    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
-    add             v24.4s, v24.4s, v14.4s
-    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
-    add             v25.4s, v25.4s, v15.4s
-    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
-    add             v26.4s, v26.4s, v13.4s
-    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
-    add             v27.4s, v27.4s, v12.4s
-
-    rshrn           v23.4h, v28.4s, #DESCALE_P2
-    rshrn           v21.4h, v29.4s, #DESCALE_P2
-    rshrn           v19.4h, v30.4s, #DESCALE_P2
-    rshrn           v17.4h, v31.4s, #DESCALE_P2
-    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
-    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
-    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
-    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
-
-    /* store results */
-    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
-    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-
-    /* Restore NEON registers */
-    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-
-    br              x30
-
-    .unreq          DATA
-    .unreq          TMP
-
-#undef XFIX_P_0_298
-#undef XFIX_N_0_390
-#undef XFIX_P_0_541
-#undef XFIX_P_0_765
-#undef XFIX_N_0_899
-#undef XFIX_P_1_175
-#undef XFIX_P_1_501
-#undef XFIX_N_1_847
-#undef XFIX_N_1_961
-#undef XFIX_P_2_053
-#undef XFIX_N_2_562
-#undef XFIX_P_3_072
-
-
-/*****************************************************************************/
-
-/*
  * GLOBAL(JOCTET *)
  * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
  *                             JCOEFPTR block, int last_dc_val,
diff --git a/simd/arm/common/jfdctint-neon.c b/simd/arm/common/jfdctint-neon.c
new file mode 100644
index 0000000..55abb1b
--- /dev/null
+++ b/simd/arm/common/jfdctint-neon.c
@@ -0,0 +1,371 @@
+/*
+ * jfdctint-neon.c - accurate DCT (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jconfigint.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/*
+ * 'jsimd_fdct_islow_neon' performs a slow-but-accurate forward DCT (Discrete
+ * Cosine Transform) on one block of samples. It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_fdct_islow'
+ * function, which can be found in jfdctint.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.298631336 =  2446 * 2^-13
+ *    0.390180644 =  3196 * 2^-13
+ *    0.541196100 =  4433 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.175875602 =  9633 * 2^-13
+ *    1.501321110 = 12299 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    1.961570560 = 16069 * 2^-13
+ *    2.053119869 = 16819 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *    3.072711026 = 25172 * 2^-13
+ *
+ * See jfdctint.c for further details of the DCT algorithm. Where possible,
+ * the variable names and comments here in 'jsimd_fdct_islow_neon' match up
+ * with those in 'jpeg_fdct_islow'.
+ */
+
+#define CONST_BITS  13  /* fractional bits of the F_* fixed-point constants */
+#define PASS1_BITS  2   /* extra scaling kept after pass 1, removed in pass 2 */
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)  /* pass 1 rounding shift */
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)  /* pass 2 rounding shift */
+
+#define F_0_298  2446  /* round(0.298631336 * 2^13) */
+#define F_0_390  3196  /* round(0.390180644 * 2^13) */
+#define F_0_541  4433  /* round(0.541196100 * 2^13) */
+#define F_0_765  6270  /* round(0.765366865 * 2^13) */
+#define F_0_899  7373  /* round(0.899976223 * 2^13) */
+#define F_1_175  9633  /* round(1.175875602 * 2^13) */
+#define F_1_501  12299  /* round(1.501321110 * 2^13) */
+#define F_1_847  15137  /* round(1.847759065 * 2^13) */
+#define F_1_961  16069  /* round(1.961570560 * 2^13) */
+#define F_2_053  16819  /* round(2.053119869 * 2^13) */
+#define F_2_562  20995  /* round(2.562915447 * 2^13) */
+#define F_3_072  25172  /* round(3.072711026 * 2^13) */
+
+ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
+  F_0_298, -F_0_390,  F_0_541,  F_0_765,  /* consts.val[0], lanes 0-3 */
+ -F_0_899,  F_1_175,  F_1_501, -F_1_847,  /* consts.val[1], lanes 0-3 */
+ -F_1_961,  F_2_053, -F_2_562,  F_3_072   /* consts.val[2], lanes 0-3 */
+};
+
+void jsimd_fdct_islow_neon(DCTELEM *data)
+{
+  /* Load the 12 DCT constants as three 4-lane vectors (consts.val[0..2]). */
+#if defined(__clang__) || defined(_MSC_VER)
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+  const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Load an 8x8 block of samples into Neon registers. De-interleaving loads */
+  /* are used followed by vuzp to transpose the block such that we have a */
+  /* column of samples per vector - allowing all rows to be processed at */
+  /* once. */
+  int16x8x4_t s_rows_0123 = vld4q_s16(data);
+  int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
+
+  int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
+  int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
+  int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
+  int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
+
+  int16x8_t col0 = cols_04.val[0];
+  int16x8_t col1 = cols_15.val[0];
+  int16x8_t col2 = cols_26.val[0];
+  int16x8_t col3 = cols_37.val[0];
+  int16x8_t col4 = cols_04.val[1];
+  int16x8_t col5 = cols_15.val[1];
+  int16x8_t col6 = cols_26.val[1];
+  int16x8_t col7 = cols_37.val[1];
+
+  /* Pass 1: process rows. */
+  int16x8_t tmp0 = vaddq_s16(col0, col7);
+  int16x8_t tmp7 = vsubq_s16(col0, col7);
+  int16x8_t tmp1 = vaddq_s16(col1, col6);
+  int16x8_t tmp6 = vsubq_s16(col1, col6);
+  int16x8_t tmp2 = vaddq_s16(col2, col5);
+  int16x8_t tmp5 = vsubq_s16(col2, col5);
+  int16x8_t tmp3 = vaddq_s16(col3, col4);
+  int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+  /* Even part. */
+  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
+  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+  col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+  col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+  int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+  int32x4_t z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13),
+                                  consts.val[0], 2);
+  int32x4_t z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13),
+                                  consts.val[0], 2);
+
+  int32x4_t col2_scaled_l = vmlal_lane_s16(z1_l, vget_low_s16(tmp13),
+                                           consts.val[0], 3);
+  int32x4_t col2_scaled_h = vmlal_lane_s16(z1_h, vget_high_s16(tmp13),
+                                           consts.val[0], 3);
+  col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
+                      vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
+
+  int32x4_t col6_scaled_l = vmlal_lane_s16(z1_l, vget_low_s16(tmp12),
+                                           consts.val[1], 3);
+  int32x4_t col6_scaled_h = vmlal_lane_s16(z1_h, vget_high_s16(tmp12),
+                                           consts.val[1], 3);
+  col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
+                      vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
+
+  /* Odd part. */
+  int16x8_t z1 = vaddq_s16(tmp4, tmp7);
+  int16x8_t z2 = vaddq_s16(tmp5, tmp6);
+  int16x8_t z3 = vaddq_s16(tmp4, tmp6);
+  int16x8_t z4 = vaddq_s16(tmp5, tmp7);
+  /* sqrt(2) * c3 */
+  int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+  int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+  /* sqrt(2) * (-c1+c3+c5-c7) */
+  int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+  int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+  /* sqrt(2) * ( c1+c3-c5+c7) */
+  int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+  int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+  /* sqrt(2) * ( c1+c3+c5-c7) */
+  int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+  int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+  /* sqrt(2) * ( c1+c3-c5-c7) */
+  int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+  int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+  /* sqrt(2) * (c7-c3) */
+  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+  /* sqrt(2) * (-c1-c3) */
+  int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+  int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+  /* sqrt(2) * (-c3-c5) */
+  int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+  int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+  /* sqrt(2) * (c5-c3) */
+  int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+  int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+  z3_l = vaddq_s32(z3_l, z5_l);
+  z3_h = vaddq_s32(z3_h, z5_h);
+  z4_l = vaddq_s32(z4_l, z5_l);
+  z4_h = vaddq_s32(z4_h, z5_h);
+
+  tmp4_l = vaddq_s32(tmp4_l, z1_l);
+  tmp4_h = vaddq_s32(tmp4_h, z1_h);
+  tmp4_l = vaddq_s32(tmp4_l, z3_l);
+  tmp4_h = vaddq_s32(tmp4_h, z3_h);
+  col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp4_h, DESCALE_P1));
+
+  tmp5_l = vaddq_s32(tmp5_l, z2_l);
+  tmp5_h = vaddq_s32(tmp5_h, z2_h);
+  tmp5_l = vaddq_s32(tmp5_l, z4_l);
+  tmp5_h = vaddq_s32(tmp5_h, z4_h);
+  col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp5_h, DESCALE_P1));
+
+  tmp6_l = vaddq_s32(tmp6_l, z2_l);
+  tmp6_h = vaddq_s32(tmp6_h, z2_h);
+  tmp6_l = vaddq_s32(tmp6_l, z3_l);
+  tmp6_h = vaddq_s32(tmp6_h, z3_h);
+  col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp6_h, DESCALE_P1));
+
+  tmp7_l = vaddq_s32(tmp7_l, z1_l);
+  tmp7_h = vaddq_s32(tmp7_h, z1_h);
+  tmp7_l = vaddq_s32(tmp7_l, z4_l);
+  tmp7_h = vaddq_s32(tmp7_h, z4_h);
+  col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp7_h, DESCALE_P1));
+
+  /* Transpose to work on columns in pass 2. */
+  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+                                      vreinterpretq_s32_s16(cols_45.val[0]));
+  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+                                      vreinterpretq_s32_s16(cols_45.val[1]));
+  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+                                      vreinterpretq_s32_s16(cols_67.val[0]));
+  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+                                      vreinterpretq_s32_s16(cols_67.val[1]));
+
+  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+  /* Pass 2: process columns. */
+  tmp0 = vaddq_s16(row0, row7);
+  tmp7 = vsubq_s16(row0, row7);
+  tmp1 = vaddq_s16(row1, row6);
+  tmp6 = vsubq_s16(row1, row6);
+  tmp2 = vaddq_s16(row2, row5);
+  tmp5 = vsubq_s16(row2, row5);
+  tmp3 = vaddq_s16(row3, row4);
+  tmp4 = vsubq_s16(row3, row4);
+
+  /* Even part. */
+  tmp10 = vaddq_s16(tmp0, tmp3);
+  tmp13 = vsubq_s16(tmp0, tmp3);
+  tmp11 = vaddq_s16(tmp1, tmp2);
+  tmp12 = vsubq_s16(tmp1, tmp2);
+
+  row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+  row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+  tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+  z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+  z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+  int32x4_t row2_scaled_l = vmlal_lane_s16(z1_l, vget_low_s16(tmp13),
+                                           consts.val[0], 3);
+  int32x4_t row2_scaled_h = vmlal_lane_s16(z1_h, vget_high_s16(tmp13),
+                                           consts.val[0], 3);
+  row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
+                      vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
+
+  int32x4_t row6_scaled_l = vmlal_lane_s16(z1_l, vget_low_s16(tmp12),
+                                           consts.val[1], 3);
+  int32x4_t row6_scaled_h = vmlal_lane_s16(z1_h, vget_high_s16(tmp12),
+                                           consts.val[1], 3);
+  row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
+                      vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
+
+  /* Odd part. */
+  z1 = vaddq_s16(tmp4, tmp7);
+  z2 = vaddq_s16(tmp5, tmp6);
+  z3 = vaddq_s16(tmp4, tmp6);
+  z4 = vaddq_s16(tmp5, tmp7);
+  /* sqrt(2) * c3 */
+  z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+  z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+  /* sqrt(2) * (-c1+c3+c5-c7) */
+  tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+  tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+  /* sqrt(2) * ( c1+c3-c5+c7) */
+  tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+  tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+  /* sqrt(2) * ( c1+c3+c5-c7) */
+  tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+  tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+  /* sqrt(2) * ( c1+c3-c5-c7) */
+  tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+  tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+  /* sqrt(2) * (c7-c3) */
+  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+  /* sqrt(2) * (-c1-c3) */
+  z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+  z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+  /* sqrt(2) * (-c3-c5) */
+  z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+  z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+  /* sqrt(2) * (c5-c3) */
+  z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+  z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+  z3_l = vaddq_s32(z3_l, z5_l);
+  z3_h = vaddq_s32(z3_h, z5_h);
+  z4_l = vaddq_s32(z4_l, z5_l);
+  z4_h = vaddq_s32(z4_h, z5_h);
+
+  tmp4_l = vaddq_s32(tmp4_l, z1_l);
+  tmp4_h = vaddq_s32(tmp4_h, z1_h);
+  tmp4_l = vaddq_s32(tmp4_l, z3_l);
+  tmp4_h = vaddq_s32(tmp4_h, z3_h);
+  row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp4_h, DESCALE_P2));
+
+  tmp5_l = vaddq_s32(tmp5_l, z2_l);
+  tmp5_h = vaddq_s32(tmp5_h, z2_h);
+  tmp5_l = vaddq_s32(tmp5_l, z4_l);
+  tmp5_h = vaddq_s32(tmp5_h, z4_h);
+  row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp5_h, DESCALE_P2));
+
+  tmp6_l = vaddq_s32(tmp6_l, z2_l);
+  tmp6_h = vaddq_s32(tmp6_h, z2_h);
+  tmp6_l = vaddq_s32(tmp6_l, z3_l);
+  tmp6_h = vaddq_s32(tmp6_h, z3_h);
+  row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp6_h, DESCALE_P2));
+
+  tmp7_l = vaddq_s32(tmp7_l, z1_l);
+  tmp7_h = vaddq_s32(tmp7_h, z1_h);
+  tmp7_l = vaddq_s32(tmp7_l, z4_l);
+  tmp7_h = vaddq_s32(tmp7_h, z4_h);
+  row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp7_h, DESCALE_P2));
+
+  vst1q_s16(data + 0 * DCTSIZE, row0);
+  vst1q_s16(data + 1 * DCTSIZE, row1);
+  vst1q_s16(data + 2 * DCTSIZE, row2);
+  vst1q_s16(data + 3 * DCTSIZE, row3);
+  vst1q_s16(data + 4 * DCTSIZE, row4);
+  vst1q_s16(data + 5 * DCTSIZE, row5);
+  vst1q_s16(data + 6 * DCTSIZE, row6);
+  vst1q_s16(data + 7 * DCTSIZE, row7);
+}
