/*
* Copyright (c) 2022 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
#include <arm_neon.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/fdct_neon.h"
// Load & cross the first 8 and last 8, then the middle
static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) {
b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
}
#define STORE_S16(src, index, dest) \
do { \
store_s16q_to_tran_low(dest, src[index]); \
dest += 8; \
} while (0)
// Store 32 16x8 values, assuming stride == 32.
// Slight twist: store horizontally in blocks of 8.
static INLINE void store(tran_low_t *a, const int16x8_t *b) {
STORE_S16(b, 0, a);
STORE_S16(b, 8, a);
STORE_S16(b, 16, a);
STORE_S16(b, 24, a);
STORE_S16(b, 1, a);
STORE_S16(b, 9, a);
STORE_S16(b, 17, a);
STORE_S16(b, 25, a);
STORE_S16(b, 2, a);
STORE_S16(b, 10, a);
STORE_S16(b, 18, a);
STORE_S16(b, 26, a);
STORE_S16(b, 3, a);
STORE_S16(b, 11, a);
STORE_S16(b, 19, a);
STORE_S16(b, 27, a);
STORE_S16(b, 4, a);
STORE_S16(b, 12, a);
STORE_S16(b, 20, a);
STORE_S16(b, 28, a);
STORE_S16(b, 5, a);
STORE_S16(b, 13, a);
STORE_S16(b, 21, a);
STORE_S16(b, 29, a);
STORE_S16(b, 6, a);
STORE_S16(b, 14, a);
STORE_S16(b, 22, a);
STORE_S16(b, 30, a);
STORE_S16(b, 7, a);
STORE_S16(b, 15, a);
STORE_S16(b, 23, a);
STORE_S16(b, 31, a);
}
#undef STORE_S16
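// Pre-scale the input by 4 (<< 2) ahead of the first pass; this matches
// the input[j * stride + i] * 4 scaling used by the scalar 32x32 fdct.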
static INLINE void scale_input(const int16x8_t *in /*32*/,
int16x8_t *out /*32*/) {
out[0] = vshlq_n_s16(in[0], 2);
out[1] = vshlq_n_s16(in[1], 2);
out[2] = vshlq_n_s16(in[2], 2);
out[3] = vshlq_n_s16(in[3], 2);
out[4] = vshlq_n_s16(in[4], 2);
out[5] = vshlq_n_s16(in[5], 2);
out[6] = vshlq_n_s16(in[6], 2);
out[7] = vshlq_n_s16(in[7], 2);
out[8] = vshlq_n_s16(in[8], 2);
out[9] = vshlq_n_s16(in[9], 2);
out[10] = vshlq_n_s16(in[10], 2);
out[11] = vshlq_n_s16(in[11], 2);
out[12] = vshlq_n_s16(in[12], 2);
out[13] = vshlq_n_s16(in[13], 2);
out[14] = vshlq_n_s16(in[14], 2);
out[15] = vshlq_n_s16(in[15], 2);
out[16] = vshlq_n_s16(in[16], 2);
out[17] = vshlq_n_s16(in[17], 2);
out[18] = vshlq_n_s16(in[18], 2);
out[19] = vshlq_n_s16(in[19], 2);
out[20] = vshlq_n_s16(in[20], 2);
out[21] = vshlq_n_s16(in[21], 2);
out[22] = vshlq_n_s16(in[22], 2);
out[23] = vshlq_n_s16(in[23], 2);
out[24] = vshlq_n_s16(in[24], 2);
out[25] = vshlq_n_s16(in[25], 2);
out[26] = vshlq_n_s16(in[26], 2);
out[27] = vshlq_n_s16(in[27], 2);
out[28] = vshlq_n_s16(in[28], 2);
out[29] = vshlq_n_s16(in[29], 2);
out[30] = vshlq_n_s16(in[30], 2);
out[31] = vshlq_n_s16(in[31], 2);
}
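// First-pass 32-point DCT body for one strip of 8 columns. The input rows
// have already been crossed by load_cross() (stage 1); stages 2-7 and the
// output butterflies stay in int16x8_t, with the butterfly helpers
// widening internally where needed.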
static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
int16x8_t a[32];
int16x8_t b[32];
// Stage 1: Done as part of the load.
// Stage 2.
// Mini cross: butterfly the first 16 values and the middle 8 of the second
// half.
a[0] = vaddq_s16(in[0], in[15]);
a[1] = vaddq_s16(in[1], in[14]);
a[2] = vaddq_s16(in[2], in[13]);
a[3] = vaddq_s16(in[3], in[12]);
a[4] = vaddq_s16(in[4], in[11]);
a[5] = vaddq_s16(in[5], in[10]);
a[6] = vaddq_s16(in[6], in[9]);
a[7] = vaddq_s16(in[7], in[8]);
a[8] = vsubq_s16(in[7], in[8]);
a[9] = vsubq_s16(in[6], in[9]);
a[10] = vsubq_s16(in[5], in[10]);
a[11] = vsubq_s16(in[4], in[11]);
a[12] = vsubq_s16(in[3], in[12]);
a[13] = vsubq_s16(in[2], in[13]);
a[14] = vsubq_s16(in[1], in[14]);
a[15] = vsubq_s16(in[0], in[15]);
a[16] = in[16];
a[17] = in[17];
a[18] = in[18];
a[19] = in[19];
butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27],
&a[20]);
butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26],
&a[21]);
butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25],
&a[22]);
butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24],
&a[23]);
a[28] = in[28];
a[29] = in[29];
a[30] = in[30];
a[31] = in[31];
// Stage 3.
b[0] = vaddq_s16(a[0], a[7]);
b[1] = vaddq_s16(a[1], a[6]);
b[2] = vaddq_s16(a[2], a[5]);
b[3] = vaddq_s16(a[3], a[4]);
b[4] = vsubq_s16(a[3], a[4]);
b[5] = vsubq_s16(a[2], a[5]);
b[6] = vsubq_s16(a[1], a[6]);
b[7] = vsubq_s16(a[0], a[7]);
b[8] = a[8];
b[9] = a[9];
butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]);
butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]);
b[14] = a[14];
b[15] = a[15];
b[16] = vaddq_s16(in[16], a[23]);
b[17] = vaddq_s16(in[17], a[22]);
b[18] = vaddq_s16(in[18], a[21]);
b[19] = vaddq_s16(in[19], a[20]);
b[20] = vsubq_s16(in[19], a[20]);
b[21] = vsubq_s16(in[18], a[21]);
b[22] = vsubq_s16(in[17], a[22]);
b[23] = vsubq_s16(in[16], a[23]);
b[24] = vsubq_s16(in[31], a[24]);
b[25] = vsubq_s16(in[30], a[25]);
b[26] = vsubq_s16(in[29], a[26]);
b[27] = vsubq_s16(in[28], a[27]);
b[28] = vaddq_s16(in[28], a[27]);
b[29] = vaddq_s16(in[29], a[26]);
b[30] = vaddq_s16(in[30], a[25]);
b[31] = vaddq_s16(in[31], a[24]);
// Stage 4.
a[0] = vaddq_s16(b[0], b[3]);
a[1] = vaddq_s16(b[1], b[2]);
a[2] = vsubq_s16(b[1], b[2]);
a[3] = vsubq_s16(b[0], b[3]);
a[4] = b[4];
butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]);
a[7] = b[7];
a[8] = vaddq_s16(b[8], b[11]);
a[9] = vaddq_s16(b[9], b[10]);
a[10] = vsubq_s16(b[9], b[10]);
a[11] = vsubq_s16(b[8], b[11]);
a[12] = vsubq_s16(b[15], b[12]);
a[13] = vsubq_s16(b[14], b[13]);
a[14] = vaddq_s16(b[14], b[13]);
a[15] = vaddq_s16(b[15], b[12]);
a[16] = b[16];
a[17] = b[17];
butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]);
butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]);
butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]);
butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]);
a[22] = b[22];
a[23] = b[23];
a[24] = b[24];
a[25] = b[25];
a[30] = b[30];
a[31] = b[31];
// Stage 5.
butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]);
butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]);
b[4] = vaddq_s16(a[4], a[5]);
b[5] = vsubq_s16(a[4], a[5]);
b[6] = vsubq_s16(a[7], a[6]);
b[7] = vaddq_s16(a[7], a[6]);
b[8] = a[8];
butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]);
butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]);
b[11] = a[11];
b[12] = a[12];
b[15] = a[15];
b[16] = vaddq_s16(a[19], a[16]);
b[17] = vaddq_s16(a[18], a[17]);
b[18] = vsubq_s16(a[17], a[18]);
b[19] = vsubq_s16(a[16], a[19]);
b[20] = vsubq_s16(a[23], a[20]);
b[21] = vsubq_s16(a[22], a[21]);
b[22] = vaddq_s16(a[21], a[22]);
b[23] = vaddq_s16(a[20], a[23]);
b[24] = vaddq_s16(a[27], a[24]);
b[25] = vaddq_s16(a[26], a[25]);
b[26] = vsubq_s16(a[25], a[26]);
b[27] = vsubq_s16(a[24], a[27]);
b[28] = vsubq_s16(a[31], a[28]);
b[29] = vsubq_s16(a[30], a[29]);
b[30] = vaddq_s16(a[29], a[30]);
b[31] = vaddq_s16(a[28], a[31]);
// Stage 6.
a[0] = b[0];
a[1] = b[1];
a[2] = b[2];
a[3] = b[3];
butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]);
butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]);
a[8] = vaddq_s16(b[8], b[9]);
a[9] = vsubq_s16(b[8], b[9]);
a[10] = vsubq_s16(b[11], b[10]);
a[11] = vaddq_s16(b[11], b[10]);
a[12] = vaddq_s16(b[12], b[13]);
a[13] = vsubq_s16(b[12], b[13]);
a[14] = vsubq_s16(b[15], b[14]);
a[15] = vaddq_s16(b[15], b[14]);
a[16] = b[16];
a[19] = b[19];
a[20] = b[20];
a[23] = b[23];
a[24] = b[24];
a[27] = b[27];
a[28] = b[28];
a[31] = b[31];
butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]);
butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]);
butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]);
butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]);
// Stage 7.
b[0] = a[0];
b[1] = a[1];
b[2] = a[2];
b[3] = a[3];
b[4] = a[4];
b[5] = a[5];
b[6] = a[6];
b[7] = a[7];
butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]);
butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]);
butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]);
butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]);
b[16] = vaddq_s16(a[16], a[17]);
b[17] = vsubq_s16(a[16], a[17]);
b[18] = vsubq_s16(a[19], a[18]);
b[19] = vaddq_s16(a[19], a[18]);
b[20] = vaddq_s16(a[20], a[21]);
b[21] = vsubq_s16(a[20], a[21]);
b[22] = vsubq_s16(a[23], a[22]);
b[23] = vaddq_s16(a[23], a[22]);
b[24] = vaddq_s16(a[24], a[25]);
b[25] = vsubq_s16(a[24], a[25]);
b[26] = vsubq_s16(a[27], a[26]);
b[27] = vaddq_s16(a[27], a[26]);
b[28] = vaddq_s16(a[28], a[29]);
b[29] = vsubq_s16(a[28], a[29]);
b[30] = vsubq_s16(a[31], a[30]);
b[31] = vaddq_s16(a[31], a[30]);
// Final stage.
// Also compute partial rounding shift:
// output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
out[0] = sub_round_shift_s16(b[0]);
out[16] = sub_round_shift_s16(b[1]);
out[8] = sub_round_shift_s16(b[2]);
out[24] = sub_round_shift_s16(b[3]);
out[4] = sub_round_shift_s16(b[4]);
out[20] = sub_round_shift_s16(b[5]);
out[12] = sub_round_shift_s16(b[6]);
out[28] = sub_round_shift_s16(b[7]);
out[2] = sub_round_shift_s16(b[8]);
out[18] = sub_round_shift_s16(b[9]);
out[10] = sub_round_shift_s16(b[10]);
out[26] = sub_round_shift_s16(b[11]);
out[6] = sub_round_shift_s16(b[12]);
out[22] = sub_round_shift_s16(b[13]);
out[14] = sub_round_shift_s16(b[14]);
out[30] = sub_round_shift_s16(b[15]);
butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]);
out[1] = sub_round_shift_s16(a[1]);
out[31] = sub_round_shift_s16(a[31]);
butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]);
out[17] = sub_round_shift_s16(a[17]);
out[15] = sub_round_shift_s16(a[15]);
butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]);
out[9] = sub_round_shift_s16(a[9]);
out[23] = sub_round_shift_s16(a[23]);
butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]);
out[25] = sub_round_shift_s16(a[25]);
out[7] = sub_round_shift_s16(a[7]);
butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]);
out[5] = sub_round_shift_s16(a[5]);
out[27] = sub_round_shift_s16(a[27]);
butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]);
out[21] = sub_round_shift_s16(a[21]);
out[11] = sub_round_shift_s16(a[11]);
butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]);
out[13] = sub_round_shift_s16(a[13]);
out[19] = sub_round_shift_s16(a[19]);
butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]);
out[29] = sub_round_shift_s16(a[29]);
out[3] = sub_round_shift_s16(a[3]);
}
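// The second pass keeps intermediates in 32 bits. The macros below operate
// on parallel arrays of int32x4_t named <x>_lo and <x>_hi, holding the low
// and high halves of what was a single int16x8_t in the first pass.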
#define PASS_THROUGH(src, dst, element) \
do { \
dst##_lo[element] = src##_lo[element]; \
dst##_hi[element] = src##_hi[element]; \
} while (0)
#define ADD_S16_S32(a, left_index, right_index, b, b_index) \
do { \
b##_lo[b_index] = \
vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \
vget_high_s16(a[right_index])); \
} while (0)
#define SUB_S16_S32(a, left_index, right_index, b, b_index) \
do { \
b##_lo[b_index] = \
vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \
vget_high_s16(a[right_index])); \
} while (0)
#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \
do { \
c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \
c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
} while (0)
#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
do { \
temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \
temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \
c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \
c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \
} while (0)
#define ADD_S32(a, left_index, right_index, b, b_index) \
do { \
b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
} while (0)
#define SUB_S32(a, left_index, right_index, b, b_index) \
do { \
b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
} while (0)
#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \
add_index, sub_index) \
do { \
butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
&b##_lo[add_index], &b##_hi[add_index], \
&b##_lo[sub_index], &b##_hi[sub_index]); \
} while (0)
#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
sub_index) \
do { \
butterfly_one_coeff_s32_fast( \
a##_lo[left_index], a##_hi[left_index], a##_lo[right_index], \
a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \
&b##_lo[sub_index], &b##_hi[sub_index]); \
} while (0)
#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \
right_constant, b, add_index, sub_index) \
do { \
butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
a##_lo[right_index], a##_hi[right_index], \
left_constant, right_constant, &b##_lo[add_index], \
&b##_hi[add_index], &b##_lo[sub_index], \
&b##_hi[sub_index]); \
} while (0)
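// Second-pass 32-point DCT body. Same stage structure as
// dct_body_first_pass, but from stage 3 onwards the values are widened to
// 32 bits because the accumulated sums can exceed the int16_t range.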
static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
int16x8_t a[32];
int16x8_t b[32];
int32x4_t c_lo[32];
int32x4_t c_hi[32];
int32x4_t d_lo[32];
int32x4_t d_hi[32];
// Stage 1. Done as part of the load for the first pass.
a[0] = vaddq_s16(in[0], in[31]);
a[1] = vaddq_s16(in[1], in[30]);
a[2] = vaddq_s16(in[2], in[29]);
a[3] = vaddq_s16(in[3], in[28]);
a[4] = vaddq_s16(in[4], in[27]);
a[5] = vaddq_s16(in[5], in[26]);
a[6] = vaddq_s16(in[6], in[25]);
a[7] = vaddq_s16(in[7], in[24]);
a[8] = vaddq_s16(in[8], in[23]);
a[9] = vaddq_s16(in[9], in[22]);
a[10] = vaddq_s16(in[10], in[21]);
a[11] = vaddq_s16(in[11], in[20]);
a[12] = vaddq_s16(in[12], in[19]);
a[13] = vaddq_s16(in[13], in[18]);
a[14] = vaddq_s16(in[14], in[17]);
a[15] = vaddq_s16(in[15], in[16]);
a[16] = vsubq_s16(in[15], in[16]);
a[17] = vsubq_s16(in[14], in[17]);
a[18] = vsubq_s16(in[13], in[18]);
a[19] = vsubq_s16(in[12], in[19]);
a[20] = vsubq_s16(in[11], in[20]);
a[21] = vsubq_s16(in[10], in[21]);
a[22] = vsubq_s16(in[9], in[22]);
a[23] = vsubq_s16(in[8], in[23]);
a[24] = vsubq_s16(in[7], in[24]);
a[25] = vsubq_s16(in[6], in[25]);
a[26] = vsubq_s16(in[5], in[26]);
a[27] = vsubq_s16(in[4], in[27]);
a[28] = vsubq_s16(in[3], in[28]);
a[29] = vsubq_s16(in[2], in[29]);
a[30] = vsubq_s16(in[1], in[30]);
a[31] = vsubq_s16(in[0], in[31]);
// Stage 2.
b[0] = vaddq_s16(a[0], a[15]);
b[1] = vaddq_s16(a[1], a[14]);
b[2] = vaddq_s16(a[2], a[13]);
b[3] = vaddq_s16(a[3], a[12]);
b[4] = vaddq_s16(a[4], a[11]);
b[5] = vaddq_s16(a[5], a[10]);
b[6] = vaddq_s16(a[6], a[9]);
b[7] = vaddq_s16(a[7], a[8]);
b[8] = vsubq_s16(a[7], a[8]);
b[9] = vsubq_s16(a[6], a[9]);
b[10] = vsubq_s16(a[5], a[10]);
b[11] = vsubq_s16(a[4], a[11]);
b[12] = vsubq_s16(a[3], a[12]);
b[13] = vsubq_s16(a[2], a[13]);
b[14] = vsubq_s16(a[1], a[14]);
b[15] = vsubq_s16(a[0], a[15]);
b[16] = a[16];
b[17] = a[17];
b[18] = a[18];
b[19] = a[19];
butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
b[28] = a[28];
b[29] = a[29];
b[30] = a[30];
b[31] = a[31];
// Stage 3. With extreme values for input this calculation rolls over int16_t.
// The sources for b[0] get added multiple times and, through testing, have
// been shown to overflow starting here.
ADD_S16_S32(b, 0, 7, c, 0);
ADD_S16_S32(b, 1, 6, c, 1);
ADD_S16_S32(b, 2, 5, c, 2);
ADD_S16_S32(b, 3, 4, c, 3);
SUB_S16_S32(b, 3, 4, c, 4);
SUB_S16_S32(b, 2, 5, c, 5);
SUB_S16_S32(b, 1, 6, c, 6);
SUB_S16_S32(b, 0, 7, c, 7);
a[8] = b[8];
a[9] = b[9];
BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
a[14] = b[14];
a[15] = b[15];
ADD_S16_S32(b, 16, 23, c, 16);
ADD_S16_S32(b, 17, 22, c, 17);
ADD_S16_S32(b, 18, 21, c, 18);
ADD_S16_S32(b, 19, 20, c, 19);
SUB_S16_S32(b, 19, 20, c, 20);
SUB_S16_S32(b, 18, 21, c, 21);
SUB_S16_S32(b, 17, 22, c, 22);
SUB_S16_S32(b, 16, 23, c, 23);
SUB_S16_S32(b, 31, 24, c, 24);
SUB_S16_S32(b, 30, 25, c, 25);
SUB_S16_S32(b, 29, 26, c, 26);
SUB_S16_S32(b, 28, 27, c, 27);
ADD_S16_S32(b, 28, 27, c, 28);
ADD_S16_S32(b, 29, 26, c, 29);
ADD_S16_S32(b, 30, 25, c, 30);
ADD_S16_S32(b, 31, 24, c, 31);
// Stage 4.
ADD_S32(c, 0, 3, d, 0);
ADD_S32(c, 1, 2, d, 1);
SUB_S32(c, 1, 2, d, 2);
SUB_S32(c, 0, 3, d, 3);
PASS_THROUGH(c, d, 4);
BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
PASS_THROUGH(c, d, 7);
ADDW_S16_S32(c, 11, a, 8, d, 8);
ADDW_S16_S32(c, 10, a, 9, d, 9);
SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
ADDW_S16_S32(c, 13, b, 14, d, 14);
ADDW_S16_S32(c, 12, b, 15, d, 15);
PASS_THROUGH(c, d, 16);
PASS_THROUGH(c, d, 17);
BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18);
BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19);
BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20);
BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21);
PASS_THROUGH(c, d, 22);
PASS_THROUGH(c, d, 23);
PASS_THROUGH(c, d, 24);
PASS_THROUGH(c, d, 25);
PASS_THROUGH(c, d, 30);
PASS_THROUGH(c, d, 31);
// Stage 5.
BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3);
ADD_S32(d, 4, 5, c, 4);
SUB_S32(d, 4, 5, c, 5);
SUB_S32(d, 7, 6, c, 6);
ADD_S32(d, 7, 6, c, 7);
PASS_THROUGH(d, c, 8);
BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9);
BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10);
PASS_THROUGH(d, c, 11);
PASS_THROUGH(d, c, 12);
PASS_THROUGH(d, c, 15);
ADD_S32(d, 16, 19, c, 16);
ADD_S32(d, 17, 18, c, 17);
SUB_S32(d, 17, 18, c, 18);
SUB_S32(d, 16, 19, c, 19);
SUB_S32(d, 23, 20, c, 20);
SUB_S32(d, 22, 21, c, 21);
ADD_S32(d, 22, 21, c, 22);
ADD_S32(d, 23, 20, c, 23);
ADD_S32(d, 24, 27, c, 24);
ADD_S32(d, 25, 26, c, 25);
SUB_S32(d, 25, 26, c, 26);
SUB_S32(d, 24, 27, c, 27);
SUB_S32(d, 31, 28, c, 28);
SUB_S32(d, 30, 29, c, 29);
ADD_S32(d, 30, 29, c, 30);
ADD_S32(d, 31, 28, c, 31);
// Stage 6.
PASS_THROUGH(c, d, 0);
PASS_THROUGH(c, d, 1);
PASS_THROUGH(c, d, 2);
PASS_THROUGH(c, d, 3);
BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7);
BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6);
ADD_S32(c, 8, 9, d, 8);
SUB_S32(c, 8, 9, d, 9);
SUB_S32(c, 11, 10, d, 10);
ADD_S32(c, 11, 10, d, 11);
ADD_S32(c, 12, 13, d, 12);
SUB_S32(c, 12, 13, d, 13);
SUB_S32(c, 15, 14, d, 14);
ADD_S32(c, 15, 14, d, 15);
PASS_THROUGH(c, d, 16);
PASS_THROUGH(c, d, 19);
PASS_THROUGH(c, d, 20);
PASS_THROUGH(c, d, 23);
PASS_THROUGH(c, d, 24);
PASS_THROUGH(c, d, 27);
PASS_THROUGH(c, d, 28);
PASS_THROUGH(c, d, 31);
BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17);
BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18);
BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21);
BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22);
// Stage 7.
PASS_THROUGH(d, c, 0);
PASS_THROUGH(d, c, 1);
PASS_THROUGH(d, c, 2);
PASS_THROUGH(d, c, 3);
PASS_THROUGH(d, c, 4);
PASS_THROUGH(d, c, 5);
PASS_THROUGH(d, c, 6);
PASS_THROUGH(d, c, 7);
BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15);
BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14);
BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13);
BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12);
ADD_S32(d, 16, 17, c, 16);
SUB_S32(d, 16, 17, c, 17);
SUB_S32(d, 19, 18, c, 18);
ADD_S32(d, 19, 18, c, 19);
ADD_S32(d, 20, 21, c, 20);
SUB_S32(d, 20, 21, c, 21);
SUB_S32(d, 23, 22, c, 22);
ADD_S32(d, 23, 22, c, 23);
ADD_S32(d, 24, 25, c, 24);
SUB_S32(d, 24, 25, c, 25);
SUB_S32(d, 27, 26, c, 26);
ADD_S32(d, 27, 26, c, 27);
ADD_S32(d, 28, 29, c, 28);
SUB_S32(d, 28, 29, c, 29);
SUB_S32(d, 31, 30, c, 30);
ADD_S32(d, 31, 30, c, 31);
// Final stage.
// Roll rounding into this function so we can pass back int16x8.
out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]);
out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]);
out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]);
out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]);
out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]);
out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]);
out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]);
out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]);
out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]);
out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]);
out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]);
out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]);
out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]);
out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]);
out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]);
out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]);
BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31);
out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]);
out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]);
BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15);
out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]);
out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]);
BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23);
out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]);
out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]);
BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7);
out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]);
out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]);
BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27);
out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]);
out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]);
BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11);
out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]);
out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]);
BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19);
out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]);
out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]);
BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3);
out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]);
out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]);
}
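// Reduced-precision ("rd") second-pass body. Instead of widening to 32
// bits, the stage 2 results are rounded and shifted down so the whole pass
// fits in int16_t; see the comment at stage 2 below.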
static INLINE void dct_body_second_pass_rd(const int16x8_t *in,
int16x8_t *out) {
int16x8_t a[32];
int16x8_t b[32];
// Stage 1. Done as part of the load for the first pass.
a[0] = vaddq_s16(in[0], in[31]);
a[1] = vaddq_s16(in[1], in[30]);
a[2] = vaddq_s16(in[2], in[29]);
a[3] = vaddq_s16(in[3], in[28]);
a[4] = vaddq_s16(in[4], in[27]);
a[5] = vaddq_s16(in[5], in[26]);
a[6] = vaddq_s16(in[6], in[25]);
a[7] = vaddq_s16(in[7], in[24]);
a[8] = vaddq_s16(in[8], in[23]);
a[9] = vaddq_s16(in[9], in[22]);
a[10] = vaddq_s16(in[10], in[21]);
a[11] = vaddq_s16(in[11], in[20]);
a[12] = vaddq_s16(in[12], in[19]);
a[13] = vaddq_s16(in[13], in[18]);
a[14] = vaddq_s16(in[14], in[17]);
a[15] = vaddq_s16(in[15], in[16]);
a[16] = vsubq_s16(in[15], in[16]);
a[17] = vsubq_s16(in[14], in[17]);
a[18] = vsubq_s16(in[13], in[18]);
a[19] = vsubq_s16(in[12], in[19]);
a[20] = vsubq_s16(in[11], in[20]);
a[21] = vsubq_s16(in[10], in[21]);
a[22] = vsubq_s16(in[9], in[22]);
a[23] = vsubq_s16(in[8], in[23]);
a[24] = vsubq_s16(in[7], in[24]);
a[25] = vsubq_s16(in[6], in[25]);
a[26] = vsubq_s16(in[5], in[26]);
a[27] = vsubq_s16(in[4], in[27]);
a[28] = vsubq_s16(in[3], in[28]);
a[29] = vsubq_s16(in[2], in[29]);
a[30] = vsubq_s16(in[1], in[30]);
a[31] = vsubq_s16(in[0], in[31]);
// Stage 2.
// For the "rd" version, all the values are rounded down after stage 2 to keep
// the values in 16 bits.
b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
b[16] = add_round_shift_s16(a[16]);
b[17] = add_round_shift_s16(a[17]);
b[18] = add_round_shift_s16(a[18]);
b[19] = add_round_shift_s16(a[19]);
butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
b[20] = add_round_shift_s16(b[20]);
b[21] = add_round_shift_s16(b[21]);
b[22] = add_round_shift_s16(b[22]);
b[23] = add_round_shift_s16(b[23]);
b[24] = add_round_shift_s16(b[24]);
b[25] = add_round_shift_s16(b[25]);
b[26] = add_round_shift_s16(b[26]);
b[27] = add_round_shift_s16(b[27]);
b[28] = add_round_shift_s16(a[28]);
b[29] = add_round_shift_s16(a[29]);
b[30] = add_round_shift_s16(a[30]);
b[31] = add_round_shift_s16(a[31]);
// Stage 3.
a[0] = vaddq_s16(b[0], b[7]);
a[1] = vaddq_s16(b[1], b[6]);
a[2] = vaddq_s16(b[2], b[5]);
a[3] = vaddq_s16(b[3], b[4]);
a[4] = vsubq_s16(b[3], b[4]);
a[5] = vsubq_s16(b[2], b[5]);
a[6] = vsubq_s16(b[1], b[6]);
a[7] = vsubq_s16(b[0], b[7]);
a[8] = b[8];
a[9] = b[9];
butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]);
butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]);
a[14] = b[14];
a[15] = b[15];
a[16] = vaddq_s16(b[16], b[23]);
a[17] = vaddq_s16(b[17], b[22]);
a[18] = vaddq_s16(b[18], b[21]);
a[19] = vaddq_s16(b[19], b[20]);
a[20] = vsubq_s16(b[19], b[20]);
a[21] = vsubq_s16(b[18], b[21]);
a[22] = vsubq_s16(b[17], b[22]);
a[23] = vsubq_s16(b[16], b[23]);
a[24] = vsubq_s16(b[31], b[24]);
a[25] = vsubq_s16(b[30], b[25]);
a[26] = vsubq_s16(b[29], b[26]);
a[27] = vsubq_s16(b[28], b[27]);
a[28] = vaddq_s16(b[28], b[27]);
a[29] = vaddq_s16(b[29], b[26]);
a[30] = vaddq_s16(b[30], b[25]);
a[31] = vaddq_s16(b[31], b[24]);
// Stage 4.
b[0] = vaddq_s16(a[0], a[3]);
b[1] = vaddq_s16(a[1], a[2]);
b[2] = vsubq_s16(a[1], a[2]);
b[3] = vsubq_s16(a[0], a[3]);
b[4] = a[4];
butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]);
b[7] = a[7];
b[8] = vaddq_s16(a[8], a[11]);
b[9] = vaddq_s16(a[9], a[10]);
b[10] = vsubq_s16(a[9], a[10]);
b[11] = vsubq_s16(a[8], a[11]);
b[12] = vsubq_s16(a[15], a[12]);
b[13] = vsubq_s16(a[14], a[13]);
b[14] = vaddq_s16(a[14], a[13]);
b[15] = vaddq_s16(a[15], a[12]);
b[16] = a[16];
b[17] = a[17];
butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]);
butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]);
butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]);
butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]);
b[22] = a[22];
b[23] = a[23];
b[24] = a[24];
b[25] = a[25];
b[30] = a[30];
b[31] = a[31];
// Stage 5.
butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]);
butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]);
a[4] = vaddq_s16(b[4], b[5]);
a[5] = vsubq_s16(b[4], b[5]);
a[6] = vsubq_s16(b[7], b[6]);
a[7] = vaddq_s16(b[7], b[6]);
a[8] = b[8];
butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]);
butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]);
a[11] = b[11];
a[12] = b[12];
a[15] = b[15];
a[16] = vaddq_s16(b[19], b[16]);
a[17] = vaddq_s16(b[18], b[17]);
a[18] = vsubq_s16(b[17], b[18]);
a[19] = vsubq_s16(b[16], b[19]);
a[20] = vsubq_s16(b[23], b[20]);
a[21] = vsubq_s16(b[22], b[21]);
a[22] = vaddq_s16(b[21], b[22]);
a[23] = vaddq_s16(b[20], b[23]);
a[24] = vaddq_s16(b[27], b[24]);
a[25] = vaddq_s16(b[26], b[25]);
a[26] = vsubq_s16(b[25], b[26]);
a[27] = vsubq_s16(b[24], b[27]);
a[28] = vsubq_s16(b[31], b[28]);
a[29] = vsubq_s16(b[30], b[29]);
a[30] = vaddq_s16(b[29], b[30]);
a[31] = vaddq_s16(b[28], b[31]);
// Stage 6.
b[0] = a[0];
b[1] = a[1];
b[2] = a[2];
b[3] = a[3];
butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]);
butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]);
b[8] = vaddq_s16(a[8], a[9]);
b[9] = vsubq_s16(a[8], a[9]);
b[10] = vsubq_s16(a[11], a[10]);
b[11] = vaddq_s16(a[11], a[10]);
b[12] = vaddq_s16(a[12], a[13]);
b[13] = vsubq_s16(a[12], a[13]);
b[14] = vsubq_s16(a[15], a[14]);
b[15] = vaddq_s16(a[15], a[14]);
b[16] = a[16];
b[19] = a[19];
b[20] = a[20];
b[23] = a[23];
b[24] = a[24];
b[27] = a[27];
b[28] = a[28];
b[31] = a[31];
butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]);
butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]);
butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]);
butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]);
// Stage 7.
a[0] = b[0];
a[1] = b[1];
a[2] = b[2];
a[3] = b[3];
a[4] = b[4];
a[5] = b[5];
a[6] = b[6];
a[7] = b[7];
butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]);
butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]);
butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]);
butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]);
a[16] = vaddq_s16(b[16], b[17]);
a[17] = vsubq_s16(b[16], b[17]);
a[18] = vsubq_s16(b[19], b[18]);
a[19] = vaddq_s16(b[19], b[18]);
a[20] = vaddq_s16(b[20], b[21]);
a[21] = vsubq_s16(b[20], b[21]);
a[22] = vsubq_s16(b[23], b[22]);
a[23] = vaddq_s16(b[23], b[22]);
a[24] = vaddq_s16(b[24], b[25]);
a[25] = vsubq_s16(b[24], b[25]);
a[26] = vsubq_s16(b[27], b[26]);
a[27] = vaddq_s16(b[27], b[26]);
a[28] = vaddq_s16(b[28], b[29]);
a[29] = vsubq_s16(b[28], b[29]);
a[30] = vsubq_s16(b[31], b[30]);
a[31] = vaddq_s16(b[31], b[30]);
// Final stage.
out[0] = a[0];
out[16] = a[1];
out[8] = a[2];
out[24] = a[3];
out[4] = a[4];
out[20] = a[5];
out[12] = a[6];
out[28] = a[7];
out[2] = a[8];
out[18] = a[9];
out[10] = a[10];
out[26] = a[11];
out[6] = a[12];
out[22] = a[13];
out[14] = a[14];
out[30] = a[15];
butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]);
butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17],
&out[15]);
butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]);
butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]);
butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]);
butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21],
&out[11]);
butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13],
&out[19]);
butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]);
}
#undef PASS_THROUGH
#undef ADD_S16_S32
#undef SUB_S16_S32
#undef ADDW_S16_S32
#undef SUBW_S16_S32
#undef ADD_S32
#undef SUB_S32
#undef BUTTERFLY_ONE_S16_S32
#undef BUTTERFLY_ONE_S32
#undef BUTTERFLY_TWO_S32
#if CONFIG_VP9_HIGHBITDEPTH
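// High bitdepth path: each 8-wide strip row is held as a pair of int32x4_t
// vectors ("left" = columns 0-3, "right" = columns 4-7) so the whole
// transform can run at 32-bit precision.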
// Store a 32x32 block of 32-bit coefficients, assuming stride == 32. Each
// row is written as eight int32x4_t vectors drawn from the four left/right
// strip pairs.
static INLINE void store32x32_s32(
tran_low_t *a, const int32x4_t *l1 /*[32]*/, const int32x4_t *r1 /*[32]*/,
const int32x4_t *l2 /*[32]*/, const int32x4_t *r2 /*[32]*/,
const int32x4_t *l3 /*[32]*/, const int32x4_t *r3 /*[32]*/,
const int32x4_t *l4 /*[32]*/, const int32x4_t *r4 /*[32]*/) {
int i;
for (i = 0; i < 32; i++) {
vst1q_s32(a, l1[i]);
vst1q_s32(a + 4, r1[i]);
vst1q_s32(a + 8, l2[i]);
vst1q_s32(a + 12, r2[i]);
vst1q_s32(a + 16, l3[i]);
vst1q_s32(a + 20, r3[i]);
vst1q_s32(a + 24, l4[i]);
vst1q_s32(a + 28, r4[i]);
a += 32;
}
}
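// Widen the 16-bit input to 32 bits and pre-scale by 4 (<< 2), the same
// scaling as scale_input() above, splitting each row into left/right
// halves.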
static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/,
int32x4_t *left /*[32]*/,
int32x4_t *right /* [32] */) {
left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
left[16] = vshll_n_s16(vget_low_s16(a[16]), 2);
left[17] = vshll_n_s16(vget_low_s16(a[17]), 2);
left[18] = vshll_n_s16(vget_low_s16(a[18]), 2);
left[19] = vshll_n_s16(vget_low_s16(a[19]), 2);
left[20] = vshll_n_s16(vget_low_s16(a[20]), 2);
left[21] = vshll_n_s16(vget_low_s16(a[21]), 2);
left[22] = vshll_n_s16(vget_low_s16(a[22]), 2);
left[23] = vshll_n_s16(vget_low_s16(a[23]), 2);
left[24] = vshll_n_s16(vget_low_s16(a[24]), 2);
left[25] = vshll_n_s16(vget_low_s16(a[25]), 2);
left[26] = vshll_n_s16(vget_low_s16(a[26]), 2);
left[27] = vshll_n_s16(vget_low_s16(a[27]), 2);
left[28] = vshll_n_s16(vget_low_s16(a[28]), 2);
left[29] = vshll_n_s16(vget_low_s16(a[29]), 2);
left[30] = vshll_n_s16(vget_low_s16(a[30]), 2);
left[31] = vshll_n_s16(vget_low_s16(a[31]), 2);
right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
right[16] = vshll_n_s16(vget_high_s16(a[16]), 2);
right[17] = vshll_n_s16(vget_high_s16(a[17]), 2);
right[18] = vshll_n_s16(vget_high_s16(a[18]), 2);
right[19] = vshll_n_s16(vget_high_s16(a[19]), 2);
right[20] = vshll_n_s16(vget_high_s16(a[20]), 2);
right[21] = vshll_n_s16(vget_high_s16(a[21]), 2);
right[22] = vshll_n_s16(vget_high_s16(a[22]), 2);
right[23] = vshll_n_s16(vget_high_s16(a[23]), 2);
right[24] = vshll_n_s16(vget_high_s16(a[24]), 2);
right[25] = vshll_n_s16(vget_high_s16(a[25]), 2);
right[26] = vshll_n_s16(vget_high_s16(a[26]), 2);
right[27] = vshll_n_s16(vget_high_s16(a[27]), 2);
right[28] = vshll_n_s16(vget_high_s16(a[28]), 2);
right[29] = vshll_n_s16(vget_high_s16(a[29]), 2);
right[30] = vshll_n_s16(vget_high_s16(a[30]), 2);
right[31] = vshll_n_s16(vget_high_s16(a[31]), 2);
}
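// Cross the input for the high bitdepth transform:
// b[i] = a[i] + a[31 - i] for i < 16 and b[i] = a[31 - i] - a[i] otherwise,
// applied to the left and right half-rows separately.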
static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/,
int32x4_t *a_right /*[32]*/,
int32x4_t *b_left /*[32]*/,
int32x4_t *b_right /*[32]*/) {
// Stage 1. Done as part of the load for the first pass.
b_left[0] = vaddq_s32(a_left[0], a_left[31]);
b_left[1] = vaddq_s32(a_left[1], a_left[30]);
b_left[2] = vaddq_s32(a_left[2], a_left[29]);
b_left[3] = vaddq_s32(a_left[3], a_left[28]);
b_left[4] = vaddq_s32(a_left[4], a_left[27]);
b_left[5] = vaddq_s32(a_left[5], a_left[26]);
b_left[6] = vaddq_s32(a_left[6], a_left[25]);
b_left[7] = vaddq_s32(a_left[7], a_left[24]);
b_left[8] = vaddq_s32(a_left[8], a_left[23]);
b_left[9] = vaddq_s32(a_left[9], a_left[22]);
b_left[10] = vaddq_s32(a_left[10], a_left[21]);
b_left[11] = vaddq_s32(a_left[11], a_left[20]);
b_left[12] = vaddq_s32(a_left[12], a_left[19]);
b_left[13] = vaddq_s32(a_left[13], a_left[18]);
b_left[14] = vaddq_s32(a_left[14], a_left[17]);
b_left[15] = vaddq_s32(a_left[15], a_left[16]);
b_right[0] = vaddq_s32(a_right[0], a_right[31]);
b_right[1] = vaddq_s32(a_right[1], a_right[30]);
b_right[2] = vaddq_s32(a_right[2], a_right[29]);
b_right[3] = vaddq_s32(a_right[3], a_right[28]);
b_right[4] = vaddq_s32(a_right[4], a_right[27]);
b_right[5] = vaddq_s32(a_right[5], a_right[26]);
b_right[6] = vaddq_s32(a_right[6], a_right[25]);
b_right[7] = vaddq_s32(a_right[7], a_right[24]);
b_right[8] = vaddq_s32(a_right[8], a_right[23]);
b_right[9] = vaddq_s32(a_right[9], a_right[22]);
b_right[10] = vaddq_s32(a_right[10], a_right[21]);
b_right[11] = vaddq_s32(a_right[11], a_right[20]);
b_right[12] = vaddq_s32(a_right[12], a_right[19]);
b_right[13] = vaddq_s32(a_right[13], a_right[18]);
b_right[14] = vaddq_s32(a_right[14], a_right[17]);
b_right[15] = vaddq_s32(a_right[15], a_right[16]);
b_left[16] = vsubq_s32(a_left[15], a_left[16]);
b_left[17] = vsubq_s32(a_left[14], a_left[17]);
b_left[18] = vsubq_s32(a_left[13], a_left[18]);
b_left[19] = vsubq_s32(a_left[12], a_left[19]);
b_left[20] = vsubq_s32(a_left[11], a_left[20]);
b_left[21] = vsubq_s32(a_left[10], a_left[21]);
b_left[22] = vsubq_s32(a_left[9], a_left[22]);
b_left[23] = vsubq_s32(a_left[8], a_left[23]);
b_left[24] = vsubq_s32(a_left[7], a_left[24]);
b_left[25] = vsubq_s32(a_left[6], a_left[25]);
b_left[26] = vsubq_s32(a_left[5], a_left[26]);
b_left[27] = vsubq_s32(a_left[4], a_left[27]);
b_left[28] = vsubq_s32(a_left[3], a_left[28]);
b_left[29] = vsubq_s32(a_left[2], a_left[29]);
b_left[30] = vsubq_s32(a_left[1], a_left[30]);
b_left[31] = vsubq_s32(a_left[0], a_left[31]);
b_right[16] = vsubq_s32(a_right[15], a_right[16]);
b_right[17] = vsubq_s32(a_right[14], a_right[17]);
b_right[18] = vsubq_s32(a_right[13], a_right[18]);
b_right[19] = vsubq_s32(a_right[12], a_right[19]);
b_right[20] = vsubq_s32(a_right[11], a_right[20]);
b_right[21] = vsubq_s32(a_right[10], a_right[21]);
b_right[22] = vsubq_s32(a_right[9], a_right[22]);
b_right[23] = vsubq_s32(a_right[8], a_right[23]);
b_right[24] = vsubq_s32(a_right[7], a_right[24]);
b_right[25] = vsubq_s32(a_right[6], a_right[25]);
b_right[26] = vsubq_s32(a_right[5], a_right[26]);
b_right[27] = vsubq_s32(a_right[4], a_right[27]);
b_right[28] = vsubq_s32(a_right[3], a_right[28]);
b_right[29] = vsubq_s32(a_right[2], a_right[29]);
b_right[30] = vsubq_s32(a_right[1], a_right[30]);
b_right[31] = vsubq_s32(a_right[0], a_right[31]);
}
static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/,
int32x4_t *right /* [32] */) {
// Also compute partial rounding shift:
// output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
left[0] = add_round_shift_s32(left[0]);
left[1] = add_round_shift_s32(left[1]);
left[2] = add_round_shift_s32(left[2]);
left[3] = add_round_shift_s32(left[3]);
left[4] = add_round_shift_s32(left[4]);
left[5] = add_round_shift_s32(left[5]);
left[6] = add_round_shift_s32(left[6]);
left[7] = add_round_shift_s32(left[7]);
left[8] = add_round_shift_s32(left[8]);
left[9] = add_round_shift_s32(left[9]);
left[10] = add_round_shift_s32(left[10]);
left[11] = add_round_shift_s32(left[11]);
left[12] = add_round_shift_s32(left[12]);
left[13] = add_round_shift_s32(left[13]);
left[14] = add_round_shift_s32(left[14]);
left[15] = add_round_shift_s32(left[15]);
left[16] = add_round_shift_s32(left[16]);
left[17] = add_round_shift_s32(left[17]);
left[18] = add_round_shift_s32(left[18]);
left[19] = add_round_shift_s32(left[19]);
left[20] = add_round_shift_s32(left[20]);
left[21] = add_round_shift_s32(left[21]);
left[22] = add_round_shift_s32(left[22]);
left[23] = add_round_shift_s32(left[23]);
left[24] = add_round_shift_s32(left[24]);
left[25] = add_round_shift_s32(left[25]);
left[26] = add_round_shift_s32(left[26]);
left[27] = add_round_shift_s32(left[27]);
left[28] = add_round_shift_s32(left[28]);
left[29] = add_round_shift_s32(left[29]);
left[30] = add_round_shift_s32(left[30]);
left[31] = add_round_shift_s32(left[31]);
right[0] = add_round_shift_s32(right[0]);
right[1] = add_round_shift_s32(right[1]);
right[2] = add_round_shift_s32(right[2]);
right[3] = add_round_shift_s32(right[3]);
right[4] = add_round_shift_s32(right[4]);
right[5] = add_round_shift_s32(right[5]);
right[6] = add_round_shift_s32(right[6]);
right[7] = add_round_shift_s32(right[7]);
right[8] = add_round_shift_s32(right[8]);
right[9] = add_round_shift_s32(right[9]);
right[10] = add_round_shift_s32(right[10]);
right[11] = add_round_shift_s32(right[11]);
right[12] = add_round_shift_s32(right[12]);
right[13] = add_round_shift_s32(right[13]);
right[14] = add_round_shift_s32(right[14]);
right[15] = add_round_shift_s32(right[15]);
right[16] = add_round_shift_s32(right[16]);
right[17] = add_round_shift_s32(right[17]);
right[18] = add_round_shift_s32(right[18]);
right[19] = add_round_shift_s32(right[19]);
right[20] = add_round_shift_s32(right[20]);
right[21] = add_round_shift_s32(right[21]);
right[22] = add_round_shift_s32(right[22]);
right[23] = add_round_shift_s32(right[23]);
right[24] = add_round_shift_s32(right[24]);
right[25] = add_round_shift_s32(right[25]);
right[26] = add_round_shift_s32(right[26]);
right[27] = add_round_shift_s32(right[27]);
right[28] = add_round_shift_s32(right[28]);
right[29] = add_round_shift_s32(right[29]);
right[30] = add_round_shift_s32(right[30]);
right[31] = add_round_shift_s32(right[31]);
}
static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/,
int32x4_t *right /* [32] */) {
// Also compute partial rounding shift:
// output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
left[0] = sub_round_shift_s32(left[0]);
left[1] = sub_round_shift_s32(left[1]);
left[2] = sub_round_shift_s32(left[2]);
left[3] = sub_round_shift_s32(left[3]);
left[4] = sub_round_shift_s32(left[4]);
left[5] = sub_round_shift_s32(left[5]);
left[6] = sub_round_shift_s32(left[6]);
left[7] = sub_round_shift_s32(left[7]);
left[8] = sub_round_shift_s32(left[8]);
left[9] = sub_round_shift_s32(left[9]);
left[10] = sub_round_shift_s32(left[10]);
left[11] = sub_round_shift_s32(left[11]);
left[12] = sub_round_shift_s32(left[12]);
left[13] = sub_round_shift_s32(left[13]);
left[14] = sub_round_shift_s32(left[14]);
left[15] = sub_round_shift_s32(left[15]);
left[16] = sub_round_shift_s32(left[16]);
left[17] = sub_round_shift_s32(left[17]);
left[18] = sub_round_shift_s32(left[18]);
left[19] = sub_round_shift_s32(left[19]);
left[20] = sub_round_shift_s32(left[20]);
left[21] = sub_round_shift_s32(left[21]);
left[22] = sub_round_shift_s32(left[22]);
left[23] = sub_round_shift_s32(left[23]);
left[24] = sub_round_shift_s32(left[24]);
left[25] = sub_round_shift_s32(left[25]);
left[26] = sub_round_shift_s32(left[26]);
left[27] = sub_round_shift_s32(left[27]);
left[28] = sub_round_shift_s32(left[28]);
left[29] = sub_round_shift_s32(left[29]);
left[30] = sub_round_shift_s32(left[30]);
left[31] = sub_round_shift_s32(left[31]);
right[0] = sub_round_shift_s32(right[0]);
right[1] = sub_round_shift_s32(right[1]);
right[2] = sub_round_shift_s32(right[2]);
right[3] = sub_round_shift_s32(right[3]);
right[4] = sub_round_shift_s32(right[4]);
right[5] = sub_round_shift_s32(right[5]);
right[6] = sub_round_shift_s32(right[6]);
right[7] = sub_round_shift_s32(right[7]);
right[8] = sub_round_shift_s32(right[8]);
right[9] = sub_round_shift_s32(right[9]);
right[10] = sub_round_shift_s32(right[10]);
right[11] = sub_round_shift_s32(right[11]);
right[12] = sub_round_shift_s32(right[12]);
right[13] = sub_round_shift_s32(right[13]);
right[14] = sub_round_shift_s32(right[14]);
right[15] = sub_round_shift_s32(right[15]);
right[16] = sub_round_shift_s32(right[16]);
right[17] = sub_round_shift_s32(right[17]);
right[18] = sub_round_shift_s32(right[18]);
right[19] = sub_round_shift_s32(right[19]);
right[20] = sub_round_shift_s32(right[20]);
right[21] = sub_round_shift_s32(right[21]);
right[22] = sub_round_shift_s32(right[22]);
right[23] = sub_round_shift_s32(right[23]);
right[24] = sub_round_shift_s32(right[24]);
right[25] = sub_round_shift_s32(right[25]);
right[26] = sub_round_shift_s32(right[26]);
right[27] = sub_round_shift_s32(right[27]);
right[28] = sub_round_shift_s32(right[28]);
right[29] = sub_round_shift_s32(right[29]);
right[30] = sub_round_shift_s32(right[30]);
right[31] = sub_round_shift_s32(right[31]);
}
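// First-pass 32-point DCT body for one 8-column strip of the high bitdepth
// transform, processing the left (columns 0-3) and right (columns 4-7)
// halves in parallel. Mirrors dct_body_first_pass but stays in 32 bits
// throughout.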
static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/,
int32x4_t *right /*32*/) {
int32x4_t al[32], ar[32];
int32x4_t bl[32], br[32];
// Stage 1: Done as part of the load.
// Stage 2.
// Mini cross: butterfly the first 16 values and the middle 8 of the second
// half.
al[0] = vaddq_s32(left[0], left[15]);
ar[0] = vaddq_s32(right[0], right[15]);
al[1] = vaddq_s32(left[1], left[14]);
ar[1] = vaddq_s32(right[1], right[14]);
al[2] = vaddq_s32(left[2], left[13]);
ar[2] = vaddq_s32(right[2], right[13]);
al[3] = vaddq_s32(left[3], left[12]);
ar[3] = vaddq_s32(right[3], right[12]);
al[4] = vaddq_s32(left[4], left[11]);
ar[4] = vaddq_s32(right[4], right[11]);
al[5] = vaddq_s32(left[5], left[10]);
ar[5] = vaddq_s32(right[5], right[10]);
al[6] = vaddq_s32(left[6], left[9]);
ar[6] = vaddq_s32(right[6], right[9]);
al[7] = vaddq_s32(left[7], left[8]);
ar[7] = vaddq_s32(right[7], right[8]);
al[8] = vsubq_s32(left[7], left[8]);
ar[8] = vsubq_s32(right[7], right[8]);
al[9] = vsubq_s32(left[6], left[9]);
ar[9] = vsubq_s32(right[6], right[9]);
al[10] = vsubq_s32(left[5], left[10]);
ar[10] = vsubq_s32(right[5], right[10]);
al[11] = vsubq_s32(left[4], left[11]);
ar[11] = vsubq_s32(right[4], right[11]);
al[12] = vsubq_s32(left[3], left[12]);
ar[12] = vsubq_s32(right[3], right[12]);
al[13] = vsubq_s32(left[2], left[13]);
ar[13] = vsubq_s32(right[2], right[13]);
al[14] = vsubq_s32(left[1], left[14]);
ar[14] = vsubq_s32(right[1], right[14]);
al[15] = vsubq_s32(left[0], left[15]);
ar[15] = vsubq_s32(right[0], right[15]);
al[16] = left[16];
ar[16] = right[16];
al[17] = left[17];
ar[17] = right[17];
al[18] = left[18];
ar[18] = right[18];
al[19] = left[19];
ar[19] = right[19];
butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
al[28] = left[28];
ar[28] = right[28];
al[29] = left[29];
ar[29] = right[29];
al[30] = left[30];
ar[30] = right[30];
al[31] = left[31];
ar[31] = right[31];
// Stage 3.
bl[0] = vaddq_s32(al[0], al[7]);
br[0] = vaddq_s32(ar[0], ar[7]);
bl[1] = vaddq_s32(al[1], al[6]);
br[1] = vaddq_s32(ar[1], ar[6]);
bl[2] = vaddq_s32(al[2], al[5]);
br[2] = vaddq_s32(ar[2], ar[5]);
bl[3] = vaddq_s32(al[3], al[4]);
br[3] = vaddq_s32(ar[3], ar[4]);
bl[4] = vsubq_s32(al[3], al[4]);
br[4] = vsubq_s32(ar[3], ar[4]);
bl[5] = vsubq_s32(al[2], al[5]);
br[5] = vsubq_s32(ar[2], ar[5]);
bl[6] = vsubq_s32(al[1], al[6]);
br[6] = vsubq_s32(ar[1], ar[6]);
bl[7] = vsubq_s32(al[0], al[7]);
br[7] = vsubq_s32(ar[0], ar[7]);
bl[8] = al[8];
br[8] = ar[8];
bl[9] = al[9];
br[9] = ar[9];
butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
&bl[13], &br[13], &bl[10], &br[10]);
butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
&bl[12], &br[12], &bl[11], &br[11]);
bl[14] = al[14];
br[14] = ar[14];
bl[15] = al[15];
br[15] = ar[15];
bl[16] = vaddq_s32(left[16], al[23]);
br[16] = vaddq_s32(right[16], ar[23]);
bl[17] = vaddq_s32(left[17], al[22]);
br[17] = vaddq_s32(right[17], ar[22]);
bl[18] = vaddq_s32(left[18], al[21]);
br[18] = vaddq_s32(right[18], ar[21]);
bl[19] = vaddq_s32(left[19], al[20]);
br[19] = vaddq_s32(right[19], ar[20]);
bl[20] = vsubq_s32(left[19], al[20]);
br[20] = vsubq_s32(right[19], ar[20]);
bl[21] = vsubq_s32(left[18], al[21]);
br[21] = vsubq_s32(right[18], ar[21]);
bl[22] = vsubq_s32(left[17], al[22]);
br[22] = vsubq_s32(right[17], ar[22]);
bl[23] = vsubq_s32(left[16], al[23]);
br[23] = vsubq_s32(right[16], ar[23]);
bl[24] = vsubq_s32(left[31], al[24]);
br[24] = vsubq_s32(right[31], ar[24]);
bl[25] = vsubq_s32(left[30], al[25]);
br[25] = vsubq_s32(right[30], ar[25]);
bl[26] = vsubq_s32(left[29], al[26]);
br[26] = vsubq_s32(right[29], ar[26]);
bl[27] = vsubq_s32(left[28], al[27]);
br[27] = vsubq_s32(right[28], ar[27]);
bl[28] = vaddq_s32(left[28], al[27]);
br[28] = vaddq_s32(right[28], ar[27]);
bl[29] = vaddq_s32(left[29], al[26]);
br[29] = vaddq_s32(right[29], ar[26]);
bl[30] = vaddq_s32(left[30], al[25]);
br[30] = vaddq_s32(right[30], ar[25]);
bl[31] = vaddq_s32(left[31], al[24]);
br[31] = vaddq_s32(right[31], ar[24]);
// Stage 4.
al[0] = vaddq_s32(bl[0], bl[3]);
ar[0] = vaddq_s32(br[0], br[3]);
al[1] = vaddq_s32(bl[1], bl[2]);
ar[1] = vaddq_s32(br[1], br[2]);
al[2] = vsubq_s32(bl[1], bl[2]);
ar[2] = vsubq_s32(br[1], br[2]);
al[3] = vsubq_s32(bl[0], bl[3]);
ar[3] = vsubq_s32(br[0], br[3]);
al[4] = bl[4];
ar[4] = br[4];
butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
&ar[6], &al[5], &ar[5]);
al[7] = bl[7];
ar[7] = br[7];
al[8] = vaddq_s32(bl[8], bl[11]);
ar[8] = vaddq_s32(br[8], br[11]);
al[9] = vaddq_s32(bl[9], bl[10]);
ar[9] = vaddq_s32(br[9], br[10]);
al[10] = vsubq_s32(bl[9], bl[10]);
ar[10] = vsubq_s32(br[9], br[10]);
al[11] = vsubq_s32(bl[8], bl[11]);
ar[11] = vsubq_s32(br[8], br[11]);
al[12] = vsubq_s32(bl[15], bl[12]);
ar[12] = vsubq_s32(br[15], br[12]);
al[13] = vsubq_s32(bl[14], bl[13]);
ar[13] = vsubq_s32(br[14], br[13]);
al[14] = vaddq_s32(bl[14], bl[13]);
ar[14] = vaddq_s32(br[14], br[13]);
al[15] = vaddq_s32(bl[15], bl[12]);
ar[15] = vaddq_s32(br[15], br[12]);
al[16] = bl[16];
ar[16] = br[16];
al[17] = bl[17];
ar[17] = br[17];
butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
cospi_24_64, &al[29], &ar[29], &al[18],
&ar[18]);
butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
cospi_24_64, &al[28], &ar[28], &al[19],
&ar[19]);
butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
cospi_24_64, -cospi_8_64, &al[27], &ar[27],
&al[20], &ar[20]);
butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
cospi_24_64, -cospi_8_64, &al[26], &ar[26],
&al[21], &ar[21]);
al[22] = bl[22];
ar[22] = br[22];
al[23] = bl[23];
ar[23] = br[23];
al[24] = bl[24];
ar[24] = br[24];
al[25] = bl[25];
ar[25] = br[25];
al[30] = bl[30];
ar[30] = br[30];
al[31] = bl[31];
ar[31] = br[31];
// Stage 5.
butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
&br[0], &bl[1], &br[1]);
butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
cospi_24_64, &bl[2], &br[2], &bl[3],
&br[3]);
bl[4] = vaddq_s32(al[4], al[5]);
br[4] = vaddq_s32(ar[4], ar[5]);
bl[5] = vsubq_s32(al[4], al[5]);
br[5] = vsubq_s32(ar[4], ar[5]);
bl[6] = vsubq_s32(al[7], al[6]);
br[6] = vsubq_s32(ar[7], ar[6]);
bl[7] = vaddq_s32(al[7], al[6]);
br[7] = vaddq_s32(ar[7], ar[6]);
bl[8] = al[8];
br[8] = ar[8];
butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
cospi_24_64, &bl[14], &br[14], &bl[9],
&br[9]);
butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
cospi_24_64, -cospi_8_64, &bl[13], &br[13],
&bl[10], &br[10]);
bl[11] = al[11];
br[11] = ar[11];
bl[12] = al[12];
br[12] = ar[12];
bl[15] = al[15];
br[15] = ar[15];
bl[16] = vaddq_s32(al[19], al[16]);
br[16] = vaddq_s32(ar[19], ar[16]);
bl[17] = vaddq_s32(al[18], al[17]);
br[17] = vaddq_s32(ar[18], ar[17]);
bl[18] = vsubq_s32(al[17], al[18]);
br[18] = vsubq_s32(ar[17], ar[18]);
bl[19] = vsubq_s32(al[16], al[19]);
br[19] = vsubq_s32(ar[16], ar[19]);
bl[20] = vsubq_s32(al[23], al[20]);
br[20] = vsubq_s32(ar[23], ar[20]);
bl[21] = vsubq_s32(al[22], al[21]);
br[21] = vsubq_s32(ar[22], ar[21]);
bl[22] = vaddq_s32(al[21], al[22]);
br[22] = vaddq_s32(ar[21], ar[22]);
bl[23] = vaddq_s32(al[20], al[23]);
br[23] = vaddq_s32(ar[20], ar[23]);
bl[24] = vaddq_s32(al[27], al[24]);
br[24] = vaddq_s32(ar[27], ar[24]);
bl[25] = vaddq_s32(al[26], al[25]);
br[25] = vaddq_s32(ar[26], ar[25]);
bl[26] = vsubq_s32(al[25], al[26]);
br[26] = vsubq_s32(ar[25], ar[26]);
bl[27] = vsubq_s32(al[24], al[27]);
br[27] = vsubq_s32(ar[24], ar[27]);
bl[28] = vsubq_s32(al[31], al[28]);
br[28] = vsubq_s32(ar[31], ar[28]);
bl[29] = vsubq_s32(al[30], al[29]);
br[29] = vsubq_s32(ar[30], ar[29]);
bl[30] = vaddq_s32(al[29], al[30]);
br[30] = vaddq_s32(ar[29], ar[30]);
bl[31] = vaddq_s32(al[28], al[31]);
br[31] = vaddq_s32(ar[28], ar[31]);
// Stage 6.
al[0] = bl[0];
ar[0] = br[0];
al[1] = bl[1];
ar[1] = br[1];
al[2] = bl[2];
ar[2] = br[2];
al[3] = bl[3];
ar[3] = br[3];
butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
cospi_28_64, &al[4], &ar[4], &al[7],
&ar[7]);
butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
cospi_12_64, &al[5], &ar[5], &al[6],
&ar[6]);
al[8] = vaddq_s32(bl[8], bl[9]);
ar[8] = vaddq_s32(br[8], br[9]);
al[9] = vsubq_s32(bl[8], bl[9]);
ar[9] = vsubq_s32(br[8], br[9]);
al[10] = vsubq_s32(bl[11], bl[10]);
ar[10] = vsubq_s32(br[11], br[10]);
al[11] = vaddq_s32(bl[11], bl[10]);
ar[11] = vaddq_s32(br[11], br[10]);
al[12] = vaddq_s32(bl[12], bl[13]);
ar[12] = vaddq_s32(br[12], br[13]);
al[13] = vsubq_s32(bl[12], bl[13]);
ar[13] = vsubq_s32(br[12], br[13]);
al[14] = vsubq_s32(bl[15], bl[14]);
ar[14] = vsubq_s32(br[15], br[14]);
al[15] = vaddq_s32(bl[15], bl[14]);
ar[15] = vaddq_s32(br[15], br[14]);
al[16] = bl[16];
ar[16] = br[16];
al[19] = bl[19];
ar[19] = br[19];
al[20] = bl[20];
ar[20] = br[20];
al[23] = bl[23];
ar[23] = br[23];
al[24] = bl[24];
ar[24] = br[24];
al[27] = bl[27];
ar[27] = br[27];
al[28] = bl[28];
ar[28] = br[28];
al[31] = bl[31];
ar[31] = br[31];
butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
cospi_28_64, &al[30], &ar[30], &al[17],
&ar[17]);
butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
cospi_28_64, -cospi_4_64, &al[29], &ar[29],
&al[18], &ar[18]);
butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
cospi_20_64, cospi_12_64, &al[26], &ar[26],
&al[21], &ar[21]);
butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
cospi_12_64, -cospi_20_64, &al[25],
&ar[25], &al[22], &ar[22]);
// Stage 7.
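// Rows 0-7 pass through, rows 8-15 get their last two-coefficient butterflies
// (cospi_2_64/cospi_30_64 through cospi_26_64/cospi_6_64), and rows 16-31 are
// cross-added in neighbouring pairs ahead of the final butterflies.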
bl[0] = al[0];
br[0] = ar[0];
bl[1] = al[1];
br[1] = ar[1];
bl[2] = al[2];
br[2] = ar[2];
bl[3] = al[3];
br[3] = ar[3];
bl[4] = al[4];
br[4] = ar[4];
bl[5] = al[5];
br[5] = ar[5];
bl[6] = al[6];
br[6] = ar[6];
bl[7] = al[7];
br[7] = ar[7];
butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
cospi_30_64, &bl[8], &br[8], &bl[15],
&br[15]);
butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
cospi_14_64, &bl[9], &br[9], &bl[14],
&br[14]);
butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
cospi_10_64, cospi_22_64, &bl[10], &br[10],
&bl[13], &br[13]);
butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
cospi_26_64, cospi_6_64, &bl[11], &br[11],
&bl[12], &br[12]);
bl[16] = vaddq_s32(al[16], al[17]);
br[16] = vaddq_s32(ar[16], ar[17]);
bl[17] = vsubq_s32(al[16], al[17]);
br[17] = vsubq_s32(ar[16], ar[17]);
bl[18] = vsubq_s32(al[19], al[18]);
br[18] = vsubq_s32(ar[19], ar[18]);
bl[19] = vaddq_s32(al[19], al[18]);
br[19] = vaddq_s32(ar[19], ar[18]);
bl[20] = vaddq_s32(al[20], al[21]);
br[20] = vaddq_s32(ar[20], ar[21]);
bl[21] = vsubq_s32(al[20], al[21]);
br[21] = vsubq_s32(ar[20], ar[21]);
bl[22] = vsubq_s32(al[23], al[22]);
br[22] = vsubq_s32(ar[23], ar[22]);
bl[23] = vaddq_s32(al[23], al[22]);
br[23] = vaddq_s32(ar[23], ar[22]);
bl[24] = vaddq_s32(al[24], al[25]);
br[24] = vaddq_s32(ar[24], ar[25]);
bl[25] = vsubq_s32(al[24], al[25]);
br[25] = vsubq_s32(ar[24], ar[25]);
bl[26] = vsubq_s32(al[27], al[26]);
br[26] = vsubq_s32(ar[27], ar[26]);
bl[27] = vaddq_s32(al[27], al[26]);
br[27] = vaddq_s32(ar[27], ar[26]);
bl[28] = vaddq_s32(al[28], al[29]);
br[28] = vaddq_s32(ar[28], ar[29]);
bl[29] = vsubq_s32(al[28], al[29]);
br[29] = vsubq_s32(ar[28], ar[29]);
bl[30] = vsubq_s32(al[31], al[30]);
br[30] = vsubq_s32(ar[31], ar[30]);
bl[31] = vaddq_s32(al[31], al[30]);
br[31] = vaddq_s32(ar[31], ar[30]);
// Final stage.
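// Rows 0-15 already hold the even-numbered outputs and are scattered to rows
// 0, 16, 8, 24, ... of the destination; rows 16-31 go through one last set of
// two-coefficient butterflies to produce the odd-numbered outputs.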
left[0] = bl[0];
right[0] = br[0];
left[16] = bl[1];
right[16] = br[1];
left[8] = bl[2];
right[8] = br[2];
left[24] = bl[3];
right[24] = br[3];
left[4] = bl[4];
right[4] = br[4];
left[20] = bl[5];
right[20] = br[5];
left[12] = bl[6];
right[12] = br[6];
left[28] = bl[7];
right[28] = br[7];
left[2] = bl[8];
right[2] = br[8];
left[18] = bl[9];
right[18] = br[9];
left[10] = bl[10];
right[10] = br[10];
left[26] = bl[11];
right[26] = br[11];
left[6] = bl[12];
right[6] = br[12];
left[22] = bl[13];
right[22] = br[13];
left[14] = bl[14];
right[14] = br[14];
left[30] = bl[15];
right[30] = br[15];
butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
cospi_31_64, &al[1], &ar[1], &al[31],
&ar[31]);
left[1] = al[1];
right[1] = ar[1];
left[31] = al[31];
right[31] = ar[31];
butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
cospi_17_64, cospi_15_64, &al[17], &ar[17],
&al[15], &ar[15]);
left[17] = al[17];
right[17] = ar[17];
left[15] = al[15];
right[15] = ar[15];
butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
cospi_23_64, &al[9], &ar[9], &al[23],
&ar[23]);
left[9] = al[9];
right[9] = ar[9];
left[23] = al[23];
right[23] = ar[23];
butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
cospi_25_64, cospi_7_64, &al[25], &ar[25],
&al[7], &ar[7]);
left[25] = al[25];
right[25] = ar[25];
left[7] = al[7];
right[7] = ar[7];
butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
cospi_27_64, &al[5], &ar[5], &al[27],
&ar[27]);
left[5] = al[5];
right[5] = ar[5];
left[27] = al[27];
right[27] = ar[27];
butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
cospi_21_64, cospi_11_64, &al[21], &ar[21],
&al[11], &ar[11]);
left[21] = al[21];
right[21] = ar[21];
left[11] = al[11];
right[11] = ar[11];
butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
cospi_13_64, cospi_19_64, &al[13], &ar[13],
&al[19], &ar[19]);
left[13] = al[13];
right[13] = ar[13];
left[19] = al[19];
right[19] = ar[19];
butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
cospi_29_64, cospi_3_64, &al[29], &ar[29],
&al[3], &ar[3]);
left[29] = al[29];
right[29] = ar[29];
left[3] = al[3];
right[3] = ar[3];
}

static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/,
int32x4_t *right /*32*/) {
int32x4_t al[32], ar[32];
int32x4_t bl[32], br[32];
// Stage 1: Done as part of the load.
// Stage 2.
// Mini cross: cross the first 16 values and the middle 8 of the second half.
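// A scalar sketch of the cross below, using hypothetical in[]/out[] names for
// the 32 rows held in left/right:
//   out[i]     = in[i]     + in[15 - i]   for i = 0..7
//   out[8 + i] = in[7 - i] - in[8 + i]    for i = 0..7
// Rows 16-19 and 28-31 pass through; rows 20-27 are paired as (27, 20),
// (26, 21), (25, 22), (24, 23) through butterfly_one_coeff_s32_fast() with
// cospi_16_64.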
al[0] = vaddq_s32(left[0], left[15]);
ar[0] = vaddq_s32(right[0], right[15]);
al[1] = vaddq_s32(left[1], left[14]);
ar[1] = vaddq_s32(right[1], right[14]);
al[2] = vaddq_s32(left[2], left[13]);
ar[2] = vaddq_s32(right[2], right[13]);
al[3] = vaddq_s32(left[3], left[12]);
ar[3] = vaddq_s32(right[3], right[12]);
al[4] = vaddq_s32(left[4], left[11]);
ar[4] = vaddq_s32(right[4], right[11]);
al[5] = vaddq_s32(left[5], left[10]);
ar[5] = vaddq_s32(right[5], right[10]);
al[6] = vaddq_s32(left[6], left[9]);
ar[6] = vaddq_s32(right[6], right[9]);
al[7] = vaddq_s32(left[7], left[8]);
ar[7] = vaddq_s32(right[7], right[8]);
al[8] = vsubq_s32(left[7], left[8]);
ar[8] = vsubq_s32(right[7], right[8]);
al[9] = vsubq_s32(left[6], left[9]);
ar[9] = vsubq_s32(right[6], right[9]);
al[10] = vsubq_s32(left[5], left[10]);
ar[10] = vsubq_s32(right[5], right[10]);
al[11] = vsubq_s32(left[4], left[11]);
ar[11] = vsubq_s32(right[4], right[11]);
al[12] = vsubq_s32(left[3], left[12]);
ar[12] = vsubq_s32(right[3], right[12]);
al[13] = vsubq_s32(left[2], left[13]);
ar[13] = vsubq_s32(right[2], right[13]);
al[14] = vsubq_s32(left[1], left[14]);
ar[14] = vsubq_s32(right[1], right[14]);
al[15] = vsubq_s32(left[0], left[15]);
ar[15] = vsubq_s32(right[0], right[15]);
al[16] = left[16];
ar[16] = right[16];
al[17] = left[17];
ar[17] = right[17];
al[18] = left[18];
ar[18] = right[18];
al[19] = left[19];
ar[19] = right[19];
butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
al[28] = left[28];
ar[28] = right[28];
al[29] = left[29];
ar[29] = right[29];
al[30] = left[30];
ar[30] = right[30];
al[31] = left[31];
ar[31] = right[31];
// Stage 3.
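// Cross rows 0-7, combine the (13, 10) and (12, 11) pairs with cospi_16_64,
// and cross the (16-19, 20-23) and (24-27, 28-31) groups. Rows 16-19 and
// 28-31 were not touched by stage 2, so left/right are still the live copies
// for them here (hence the direct left[]/right[] reads).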
bl[0] = vaddq_s32(al[0], al[7]);
br[0] = vaddq_s32(ar[0], ar[7]);
bl[1] = vaddq_s32(al[1], al[6]);
br[1] = vaddq_s32(ar[1], ar[6]);
bl[2] = vaddq_s32(al[2], al[5]);
br[2] = vaddq_s32(ar[2], ar[5]);
bl[3] = vaddq_s32(al[3], al[4]);
br[3] = vaddq_s32(ar[3], ar[4]);
bl[4] = vsubq_s32(al[3], al[4]);
br[4] = vsubq_s32(ar[3], ar[4]);
bl[5] = vsubq_s32(al[2], al[5]);
br[5] = vsubq_s32(ar[2], ar[5]);
bl[6] = vsubq_s32(al[1], al[6]);
br[6] = vsubq_s32(ar[1], ar[6]);
bl[7] = vsubq_s32(al[0], al[7]);
br[7] = vsubq_s32(ar[0], ar[7]);
bl[8] = al[8];
br[8] = ar[8];
bl[9] = al[9];
br[9] = ar[9];
butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
&bl[13], &br[13], &bl[10], &br[10]);
butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
&bl[12], &br[12], &bl[11], &br[11]);
bl[14] = al[14];
br[14] = ar[14];
bl[15] = al[15];
br[15] = ar[15];
bl[16] = vaddq_s32(left[16], al[23]);
br[16] = vaddq_s32(right[16], ar[23]);
bl[17] = vaddq_s32(left[17], al[22]);
br[17] = vaddq_s32(right[17], ar[22]);
bl[18] = vaddq_s32(left[18], al[21]);
br[18] = vaddq_s32(right[18], ar[21]);
bl[19] = vaddq_s32(left[19], al[20]);
br[19] = vaddq_s32(right[19], ar[20]);
bl[20] = vsubq_s32(left[19], al[20]);
br[20] = vsubq_s32(right[19], ar[20]);
bl[21] = vsubq_s32(left[18], al[21]);
br[21] = vsubq_s32(right[18], ar[21]);
bl[22] = vsubq_s32(left[17], al[22]);
br[22] = vsubq_s32(right[17], ar[22]);
bl[23] = vsubq_s32(left[16], al[23]);
br[23] = vsubq_s32(right[16], ar[23]);
bl[24] = vsubq_s32(left[31], al[24]);
br[24] = vsubq_s32(right[31], ar[24]);
bl[25] = vsubq_s32(left[30], al[25]);
br[25] = vsubq_s32(right[30], ar[25]);
bl[26] = vsubq_s32(left[29], al[26]);
br[26] = vsubq_s32(right[29], ar[26]);
bl[27] = vsubq_s32(left[28], al[27]);
br[27] = vsubq_s32(right[28], ar[27]);
bl[28] = vaddq_s32(left[28], al[27]);
br[28] = vaddq_s32(right[28], ar[27]);
bl[29] = vaddq_s32(left[29], al[26]);
br[29] = vaddq_s32(right[29], ar[26]);
bl[30] = vaddq_s32(left[30], al[25]);
br[30] = vaddq_s32(right[30], ar[25]);
bl[31] = vaddq_s32(left[31], al[24]);
br[31] = vaddq_s32(right[31], ar[24]);
// Stage 4.
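// Cross rows 0-3, combine the (6, 5) pair with cospi_16_64, cross rows 8-15
// in two groups of four, and apply the cospi_8_64/cospi_24_64 butterflies to
// the 18/29, 19/28, 20/27 and 21/26 pairs; the remaining rows pass through.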
al[0] = vaddq_s32(bl[0], bl[3]);
ar[0] = vaddq_s32(br[0], br[3]);
al[1] = vaddq_s32(bl[1], bl[2]);
ar[1] = vaddq_s32(br[1], br[2]);
al[2] = vsubq_s32(bl[1], bl[2]);
ar[2] = vsubq_s32(br[1], br[2]);
al[3] = vsubq_s32(bl[0], bl[3]);
ar[3] = vsubq_s32(br[0], br[3]);
al[4] = bl[4];
ar[4] = br[4];
butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
&ar[6], &al[5], &ar[5]);
al[7] = bl[7];
ar[7] = br[7];
al[8] = vaddq_s32(bl[8], bl[11]);
ar[8] = vaddq_s32(br[8], br[11]);
al[9] = vaddq_s32(bl[9], bl[10]);
ar[9] = vaddq_s32(br[9], br[10]);
al[10] = vsubq_s32(bl[9], bl[10]);
ar[10] = vsubq_s32(br[9], br[10]);
al[11] = vsubq_s32(bl[8], bl[11]);
ar[11] = vsubq_s32(br[8], br[11]);
al[12] = vsubq_s32(bl[15], bl[12]);
ar[12] = vsubq_s32(br[15], br[12]);
al[13] = vsubq_s32(bl[14], bl[13]);
ar[13] = vsubq_s32(br[14], br[13]);
al[14] = vaddq_s32(bl[14], bl[13]);
ar[14] = vaddq_s32(br[14], br[13]);
al[15] = vaddq_s32(bl[15], bl[12]);
ar[15] = vaddq_s32(br[15], br[12]);
al[16] = bl[16];
ar[16] = br[16];
al[17] = bl[17];
ar[17] = br[17];
butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
cospi_24_64, &al[29], &ar[29], &al[18],
&ar[18]);
butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
cospi_24_64, &al[28], &ar[28], &al[19],
&ar[19]);
butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
cospi_24_64, -cospi_8_64, &al[27], &ar[27],
&al[20], &ar[20]);
butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
cospi_24_64, -cospi_8_64, &al[26], &ar[26],
&al[21], &ar[21]);
al[22] = bl[22];
ar[22] = br[22];
al[23] = bl[23];
ar[23] = br[23];
al[24] = bl[24];
ar[24] = br[24];
al[25] = bl[25];
ar[25] = br[25];
al[30] = bl[30];
ar[30] = br[30];
al[31] = bl[31];
ar[31] = br[31];
// Stage 5.
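// Same dataflow as stage 5 of the preceding function: last butterflies for
// rows 0-3, cross-adds for rows 4-7, the cospi_8_64/cospi_24_64 pair for
// rows 9/14 and 10/13, and group-of-four cross-adds for rows 16-31.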
butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
&br[0], &bl[1], &br[1]);
butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
cospi_24_64, &bl[2], &br[2], &bl[3],
&br[3]);
bl[4] = vaddq_s32(al[4], al[5]);
br[4] = vaddq_s32(ar[4], ar[5]);
bl[5] = vsubq_s32(al[4], al[5]);
br[5] = vsubq_s32(ar[4], ar[5]);
bl[6] = vsubq_s32(al[7], al[6]);
br[6] = vsubq_s32(ar[7], ar[6]);
bl[7] = vaddq_s32(al[7], al[6]);
br[7] = vaddq_s32(ar[7], ar[6]);
bl[8] = al[8];
br[8] = ar[8];
butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
cospi_24_64, &bl[14], &br[14], &bl[9],
&br[9]);
butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
cospi_24_64, -cospi_8_64, &bl[13], &br[13],
&bl[10], &br[10]);
bl[11] = al[11];
br[11] = ar[11];
bl[12] = al[12];
br[12] = ar[12];
bl[15] = al[15];
br[15] = ar[15];
bl[16] = vaddq_s32(al[19], al[16]);
br[16] = vaddq_s32(ar[19], ar[16]);
bl[17] = vaddq_s32(al[18], al[17]);
br[17] = vaddq_s32(ar[18], ar[17]);
bl[18] = vsubq_s32(al[17], al[18]);
br[18] = vsubq_s32(ar[17], ar[18]);
bl[19] = vsubq_s32(al[16], al[19]);
br[19] = vsubq_s32(ar[16], ar[19]);
bl[20] = vsubq_s32(al[23], al[20]);
br[20] = vsubq_s32(ar[23], ar[20]);
bl[21] = vsubq_s32(al[22], al[21]);
br[21] = vsubq_s32(ar[22], ar[21]);
bl[22] = vaddq_s32(al[21], al[22]);
br[22] = vaddq_s32(ar[21], ar[22]);
bl[23] = vaddq_s32(al[20], al[23]);
br[23] = vaddq_s32(ar[20], ar[23]);
bl[24] = vaddq_s32(al[27], al[24]);
br[24] = vaddq_s32(ar[27], ar[24]);
bl[25] = vaddq_s32(al[26], al[25]);
br[25] = vaddq_s32(ar[26], ar[25]);
bl[26] = vsubq_s32(al[25], al[26]);
br[26] = vsubq_s32(ar[25], ar[26]);
bl[27] = vsubq_s32(al[24], al[27]);
br[27] = vsubq_s32(ar[24], ar[27]);
bl[28] = vsubq_s32(al[31], al[28]);
br[28] = vsubq_s32(ar[31], ar[28]);
bl[29] = vsubq_s32(al[30], al[29]);
br[29] = vsubq_s32(ar[30], ar[29]);
bl[30] = vaddq_s32(al[29], al[30]);
br[30] = vaddq_s32(ar[29], ar[30]);
bl[31] = vaddq_s32(al[28], al[31]);
br[31] = vaddq_s32(ar[28], ar[31]);
// Stage 6.
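// Last butterflies for rows 4-7, pairwise cross-adds for rows 8-15, and the
// cospi_4_64/cospi_28_64 and cospi_20_64/cospi_12_64 pairs for rows 17/30,
// 18/29, 21/26 and 22/25.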
al[0] = bl[0];
ar[0] = br[0];
al[1] = bl[1];
ar[1] = br[1];
al[2] = bl[2];
ar[2] = br[2];
al[3] = bl[3];
ar[3] = br[3];
butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
cospi_28_64, &al[4], &ar[4], &al[7],
&ar[7]);
butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
cospi_12_64, &al[5], &ar[5], &al[6],
&ar[6]);
al[8] = vaddq_s32(bl[8], bl[9]);
ar[8] = vaddq_s32(br[8], br[9]);
al[9] = vsubq_s32(bl[8], bl[9]);
ar[9] = vsubq_s32(br[8], br[9]);
al[10] = vsubq_s32(bl[11], bl[10]);
ar[10] = vsubq_s32(br[11], br[10]);
al[11] = vaddq_s32(bl[11], bl[10]);
ar[11] = vaddq_s32(br[11], br[10]);
al[12] = vaddq_s32(bl[12], bl[13]);
ar[12] = vaddq_s32(br[12], br[13]);
al[13] = vsubq_s32(bl[12], bl[13]);
ar[13] = vsubq_s32(br[12], br[13]);
al[14] = vsubq_s32(bl[15], bl[14]);
ar[14] = vsubq_s32(br[15], br[14]);
al[15] = vaddq_s32(bl[15], bl[14]);
ar[15] = vaddq_s32(br[15], br[14]);
al[16] = bl[16];
ar[16] = br[16];
al[19] = bl[19];
ar[19] = br[19];
al[20] = bl[20];
ar[20] = br[20];
al[23] = bl[23];
ar[23] = br[23];
al[24] = bl[24];
ar[24] = br[24];
al[27] = bl[27];
ar[27] = br[27];
al[28] = bl[28];
ar[28] = br[28];
al[31] = bl[31];
ar[31] = br[31];
butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
cospi_28_64, &al[30], &ar[30], &al[17],
&ar[17]);
butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
cospi_28_64, -cospi_4_64, &al[29], &ar[29],
&al[18], &ar[18]);
butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
cospi_20_64, cospi_12_64, &al[26], &ar[26],
&al[21], &ar[21]);
butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
cospi_12_64, -cospi_20_64, &al[25],
&ar[25], &al[22], &ar[22]);
// Stage 7.
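// Last butterflies for rows 8-15 (cospi_2_64/cospi_30_64 through
// cospi_26_64/cospi_6_64); rows 16-31 are cross-added in neighbouring pairs.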
bl[0] = al[0];
br[0] = ar[0];
bl[1] = al[1];
br[1] = ar[1];
bl[2] = al[2];
br[2] = ar[2];
bl[3] = al[3];
br[3] = ar[3];
bl[4] = al[4];
br[4] = ar[4];
bl[5] = al[5];
br[5] = ar[5];
bl[6] = al[6];
br[6] = ar[6];
bl[7] = al[7];
br[7] = ar[7];
butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
cospi_30_64, &bl[8], &br[8], &bl[15],
&br[15]);
butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
cospi_14_64, &bl[9], &br[9], &bl[14],
&br[14]);
butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
cospi_10_64, cospi_22_64, &bl[10], &br[10],
&bl[13], &br[13]);
butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
cospi_26_64, cospi_6_64, &bl[11], &br[11],
&bl[12], &br[12]);
bl[16] = vaddq_s32(al[16], al[17]);
br[16] = vaddq_s32(ar[16], ar[17]);
bl[17] = vsubq_s32(al[16], al[17]);
br[17] = vsubq_s32(ar[16], ar[17]);
bl[18] = vsubq_s32(al[19], al[18]);
br[18] = vsubq_s32(ar[19], ar[18]);
bl[19] = vaddq_s32(al[19], al[18]);
br[19] = vaddq_s32(ar[19], ar[18]);
bl[20] = vaddq_s32(al[20], al[21]);
br[20] = vaddq_s32(ar[20], ar[21]);
bl[21] = vsubq_s32(al[20], al[21]);
br[21] = vsubq_s32(ar[20], ar[21]);
bl[22] = vsubq_s32(al[23], al[22]);
br[22] = vsubq_s32(ar[23], ar[22]);
bl[23] = vaddq_s32(al[23], al[22]);
br[23] = vaddq_s32(ar[23], ar[22]);
bl[24] = vaddq_s32(al[24], al[25]);
br[24] = vaddq_s32(ar[24], ar[25]);
bl[25] = vsubq_s32(al[24], al[25]);
br[25] = vsubq_s32(ar[24], ar[25]);
bl[26] = vsubq_s32(al[27], al[26]);
br[26] = vsubq_s32(ar[27], ar[26]);
bl[27] = vaddq_s32(al[27], al[26]);
br[27] = vaddq_s32(ar[27], ar[26]);
bl[28] = vaddq_s32(al[28], al[29]);
br[28] = vaddq_s32(ar[28], ar[29]);
bl[29] = vsubq_s32(al[28], al[29]);
br[29] = vsubq_s32(ar[28], ar[29]);
bl[30] = vsubq_s32(al[31], al[30]);
br[30] = vsubq_s32(ar[31], ar[30]);
bl[31] = vaddq_s32(al[31], al[30]);
br[31] = vaddq_s32(ar[31], ar[30]);
// Final stage.
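// Even-numbered outputs come straight from rows 0-15; odd-numbered outputs
// come from the final butterflies on rows 16-31.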
left[0] = bl[0];
right[0] = br[0];
left[16] = bl[1];
right[16] = br[1];
left[8] = bl[2];
right[8] = br[2];
left[24] = bl[3];
right[24] = br[3];
left[4] = bl[4];
right[4] = br[4];
left[20] = bl[5];
right[20] = br[5];
left[12] = bl[6];
right[12] = br[6];
left[28] = bl[7];
right[28] = br[7];
left[2] = bl[8];
right[2] = br[8];
left[18] = bl[9];
right[18] = br[9];
left[10] = bl[10];
right[10] = br[10];
left[26] = bl[11];
right[26] = br[11];
left[6] = bl[12];
right[6] = br[12];
left[22] = bl[13];
right[22] = br[13];
left[14] = bl[14];
right[14] = br[14];
left[30] = bl[15];
right[30] = br[15];
butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
cospi_31_64, &al[1], &ar[1], &al[31],
&ar[31]);
left[1] = al[1];
right[1] = ar[1];
left[31] = al[31];
right[31] = ar[31];
butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
cospi_17_64, cospi_15_64, &al[17], &ar[17],
&al[15], &ar[15]);
left[17] = al[17];
right[17] = ar[17];
left[15] = al[15];
right[15] = ar[15];
butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
cospi_23_64, &al[9], &ar[9], &al[23],
&ar[23]);
left[9] = al[9];
right[9] = ar[9];
left[23] = al[23];
right[23] = ar[23];
butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
cospi_25_64, cospi_7_64, &al[25], &ar[25],
&al[7], &ar[7]);
left[25] = al[25];
right[25] = ar[25];
left[7] = al[7];
right[7] = ar[7];
butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
cospi_27_64, &al[5], &ar[5], &al[27],
&ar[27]);
left[5] = al[5];
right[5] = ar[5];
left[27] = al[27];
right[27] = ar[27];
butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
cospi_21_64, cospi_11_64, &al[21], &ar[21],
&al[11], &ar[11]);
left[21] = al[21];
right[21] = ar[21];
left[11] = al[11];
right[11] = ar[11];
butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
cospi_13_64, cospi_19_64, &al[13], &ar[13],
&al[19], &ar[19]);
left[13] = al[13];
right[13] = ar[13];
left[19] = al[19];
right[19] = ar[19];
butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
cospi_29_64, cospi_3_64, &al[29], &ar[29],
&al[3], &ar[3]);
left[29] = al[29];
right[29] = ar[29];
left[3] = al[3];
right[3] = ar[3];
}

static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/,
int32x4_t *right /*32*/) {
int32x4_t al[32], ar[32];
int32x4_t bl[32], br[32];
// Stage 1: Done as part of the load.
// Stage 2.
// For the "rd" version, all the values are rounded down after stage 2 to keep
// the values in 16 bits.
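// After that extra rounding the remaining stages mirror
// highbd_dct8x32_body_second_pass(), with butterfly_two_coeff_s32() standing
// in for butterfly_two_coeff_s32_s64_narrow(), presumably because the reduced
// range lets the products stay within 32 bits.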
al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15]));
ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15]));
al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14]));
ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14]));
al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13]));
ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13]));
al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12]));
ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12]));
al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11]));
ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11]));
al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10]));
ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10]));
al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9]));
ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9]));
al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8]));
ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8]));
al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8]));
ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8]));
al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9]));
ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9]));
al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10]));
ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10]));
al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11]));
ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11]));
al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12]));
ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12]));
al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13]));
ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13]));
al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14]));
ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14]));
al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15]));
ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15]));
al[16] = add_round_shift_s32(left[16]);
ar[16] = add_round_shift_s32(right[16]);
al[17] = add_round_shift_s32(left[17]);
ar[17] = add_round_shift_s32(right[17]);
al[18] = add_round_shift_s32(left[18]);
ar[18] = add_round_shift_s32(right[18]);
al[19] = add_round_shift_s32(left[19]);
ar[19] = add_round_shift_s32(right[19]);
butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
al[20] = add_round_shift_s32(al[20]);
ar[20] = add_round_shift_s32(ar[20]);
al[21] = add_round_shift_s32(al[21]);
ar[21] = add_round_shift_s32(ar[21]);
al[22] = add_round_shift_s32(al[22]);
ar[22] = add_round_shift_s32(ar[22]);
al[23] = add_round_shift_s32(al[23]);
ar[23] = add_round_shift_s32(ar[23]);
al[24] = add_round_shift_s32(al[24]);
ar[24] = add_round_shift_s32(ar[24]);
al[25] = add_round_shift_s32(al[25]);
ar[25] = add_round_shift_s32(ar[25]);
al[26] = add_round_shift_s32(al[26]);
ar[26] = add_round_shift_s32(ar[26]);
al[27] = add_round_shift_s32(al[27]);
ar[27] = add_round_shift_s32(ar[27]);
al[28] = add_round_shift_s32(left[28]);
ar[28] = add_round_shift_s32(right[28]);
al[29] = add_round_shift_s32(left[29]);
ar[29] = add_round_shift_s32(right[29]);
al[30] = add_round_shift_s32(left[30]);
ar[30] = add_round_shift_s32(right[30]);
al[31] = add_round_shift_s32(left[31]);
ar[31] = add_round_shift_s32(right[31]);
// Stage 3.
bl[0] = vaddq_s32(al[0], al[7]);
br[0] = vaddq_s32(ar[0], ar[7]);
bl[1] = vaddq_s32(al[1], al[6]);
br[1] = vaddq_s32(ar[1], ar[6]);
bl[2] = vaddq_s32(al[2], al[5]);
br[2] = vaddq_s32(ar[2], ar[5]);
bl[3] = vaddq_s32(al[3], al[4]);
br[3] = vaddq_s32(ar[3], ar[4]);
bl[4] = vsubq_s32(al[3], al[4]);
br[4] = vsubq_s32(ar[3], ar[4]);
bl[5] = vsubq_s32(al[2], al[5]);
br[5] = vsubq_s32(ar[2], ar[5]);
bl[6] = vsubq_s32(al[1], al[6]);
br[6] = vsubq_s32(ar[1], ar[6]);
bl[7] = vsubq_s32(al[0], al[7]);
br[7] = vsubq_s32(ar[0], ar[7]);
bl[8] = al[8];
br[8] = ar[8];
bl[9] = al[9];
br[9] = ar[9];
butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
&bl[13], &br[13], &bl[10], &br[10]);
butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
&bl[12], &br[12], &bl[11], &br[11]);
bl[14] = al[14];
br[14] = ar[14];
bl[15] = al[15];
br[15] = ar[15];
bl[16] = vaddq_s32(al[16], al[23]);
br[16] = vaddq_s32(ar[16], ar[23]);
bl[17] = vaddq_s32(al[17], al[22]);
br[17] = vaddq_s32(ar[17], ar[22]);
bl[18] = vaddq_s32(al[18], al[21]);
br[18] = vaddq_s32(ar[18], ar[21]);
bl[19] = vaddq_s32(al[19], al[20]);
br[19] = vaddq_s32(ar[19], ar[20]);
bl[20] = vsubq_s32(al[19], al[20]);
br[20] = vsubq_s32(ar[19], ar[20]);
bl[21] = vsubq_s32(al[18], al[21]);
br[21] = vsubq_s32(ar[18], ar[21]);
bl[22] = vsubq_s32(al[17], al[22]);
br[22] = vsubq_s32(ar[17], ar[22]);
bl[23] = vsubq_s32(al[16], al[23]);
br[23] = vsubq_s32(ar[16], ar[23]);
bl[24] = vsubq_s32(al[31], al[24]);
br[24] = vsubq_s32(ar[31], ar[24]);
bl[25] = vsubq_s32(al[30], al[25]);
br[25] = vsubq_s32(ar[30], ar[25]);
bl[26] = vsubq_s32(al[29], al[26]);
br[26] = vsubq_s32(ar[29], ar[26]);
bl[27] = vsubq_s32(al[28], al[27]);
br[27] = vsubq_s32(ar[28], ar[27]);
bl[28] = vaddq_s32(al[28], al[27]);
br[28] = vaddq_s32(ar[28], ar[27]);
bl[29] = vaddq_s32(al[29], al[26]);
br[29] = vaddq_s32(ar[29], ar[26]);
bl[30] = vaddq_s32(al[30], al[25]);
br[30] = vaddq_s32(ar[30], ar[25]);
bl[31] = vaddq_s32(al[31], al[24]);
br[31] = vaddq_s32(ar[31], ar[24]);
// Stage 4.
al[0] = vaddq_s32(bl[0], bl[3]);
ar[0] = vaddq_s32(br[0], br[3]);
al[1] = vaddq_s32(bl[1], bl[2]);
ar[1] = vaddq_s32(br[1], br[2]);
al[2] = vsubq_s32(bl[1], bl[2]);
ar[2] = vsubq_s32(br[1], br[2]);
al[3] = vsubq_s32(bl[0], bl[3]);
ar[3] = vsubq_s32(br[0], br[3]);
al[4] = bl[4];
ar[4] = br[4];
butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
&ar[6], &al[5], &ar[5]);
al[7] = bl[7];
ar[7] = br[7];
al[8] = vaddq_s32(bl[8], bl[11]);
ar[8] = vaddq_s32(br[8], br[11]);
al[9] = vaddq_s32(bl[9], bl[10]);
ar[9] = vaddq_s32(br[9], br[10]);
al[10] = vsubq_s32(bl[9], bl[10]);
ar[10] = vsubq_s32(br[9], br[10]);
al[11] = vsubq_s32(bl[8], bl[11]);
ar[11] = vsubq_s32(br[8], br[11]);
al[12] = vsubq_s32(bl[15], bl[12]);
ar[12] = vsubq_s32(br[15], br[12]);
al[13] = vsubq_s32(bl[14], bl[13]);
ar[13] = vsubq_s32(br[14], br[13]);
al[14] = vaddq_s32(bl[14], bl[13]);
ar[14] = vaddq_s32(br[14], br[13]);
al[15] = vaddq_s32(bl[15], bl[12]);
ar[15] = vaddq_s32(br[15], br[12]);
al[16] = bl[16];
ar[16] = br[16];
al[17] = bl[17];
ar[17] = br[17];
butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64,
cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]);
butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64,
cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]);
butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64,
-cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]);
butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64,
-cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]);
al[22] = bl[22];
ar[22] = br[22];
al[23] = bl[23];
ar[23] = br[23];
al[24] = bl[24];
ar[24] = br[24];
al[25] = bl[25];
ar[25] = br[25];
al[30] = bl[30];
ar[30] = br[30];
al[31] = bl[31];
ar[31] = br[31];
// Stage 5.
butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
&br[0], &bl[1], &br[1]);
butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64,
&bl[2], &br[2], &bl[3], &br[3]);
bl[4] = vaddq_s32(al[4], al[5]);
br[4] = vaddq_s32(ar[4], ar[5]);
bl[5] = vsubq_s32(al[4], al[5]);
br[5] = vsubq_s32(ar[4], ar[5]);
bl[6] = vsubq_s32(al[7], al[6]);
br[6] = vsubq_s32(ar[7], ar[6]);
bl[7] = vaddq_s32(al[7], al[6]);
br[7] = vaddq_s32(ar[7], ar[6]);
bl[8] = al[8];
br[8] = ar[8];
butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64,
&bl[14], &br[14], &bl[9], &br[9]);
butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64,
-cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]);
bl[11] = al[11];
br[11] = ar[11];
bl[12] = al[12];
br[12] = ar[12];
bl[15] = al[15];
br[15] = ar[15];
bl[16] = vaddq_s32(al[19], al[16]);
br[16] = vaddq_s32(ar[19], ar[16]);
bl[17] = vaddq_s32(al[18], al[17]);
br[17] = vaddq_s32(ar[18], ar[17]);
bl[18] = vsubq_s32(al[17], al[18]);
br[18] = vsubq_s32(ar[17], ar[18]);
bl[19] = vsubq_s32(al[16], al[19]);
br[19] = vsubq_s32(ar[16], ar[19]);
bl[20] = vsubq_s32(al[23], al[20]);
br[20] = vsubq_s32(ar[23], ar[20]);
bl[21] = vsubq_s32(al[22], al[21]);
br[21] = vsubq_s32(ar[22], ar[21]);
bl[22] = vaddq_s32(al[21], al[22]);
br[22] = vaddq_s32(ar[21], ar[22]);
bl[23] = vaddq_s32(al[20], al[23]);
br[23] = vaddq_s32(ar[20], ar[23]);
bl[24] = vaddq_s32(al[27], al[24]);
br[24] = vaddq_s32(ar[27], ar[24]);
bl[25] = vaddq_s32(al[26], al[25]);
br[25] = vaddq_s32(ar[26], ar[25]);
bl[26] = vsubq_s32(al[25], al[26]);
br[26] = vsubq_s32(ar[25], ar[26]);
bl[27] = vsubq_s32(al[24], al[27]);
br[27] = vsubq_s32(ar[24], ar[27]);
bl[28] = vsubq_s32(al[31], al[28]);
br[28] = vsubq_s32(ar[31], ar[28]);
bl[29] = vsubq_s32(al[30], al[29]);
br[29] = vsubq_s32(ar[30], ar[29]);
bl[30] = vaddq_s32(al[29], al[30]);
br[30] = vaddq_s32(ar[29], ar[30]);
bl[31] = vaddq_s32(al[28], al[31]);
br[31] = vaddq_s32(ar[28], ar[31]);
// Stage 6.
al[0] = bl[0];
ar[0] = br[0];
al[1] = bl[1];
ar[1] = br[1];
al[2] = bl[2];
ar[2] = br[2];
al[3] = bl[3];
ar[3] = br[3];
butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64,
&al[4], &ar[4], &al[7], &ar[7]);
butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64,
&al[5], &ar[5], &al[6], &ar[6]);
al[8] = vaddq_s32(bl[8], bl[9]);
ar[8] = vaddq_s32(br[8], br[9]);
al[9] = vsubq_s32(bl[8], bl[9]);
ar[9] = vsubq_s32(br[8], br[9]);
al[10] = vsubq_s32(bl[11], bl[10]);
ar[10] = vsubq_s32(br[11], br[10]);
al[11] = vaddq_s32(bl[11], bl[10]);
ar[11] = vaddq_s32(br[11], br[10]);
al[12] = vaddq_s32(bl[12], bl[13]);
ar[12] = vaddq_s32(br[12], br[13]);
al[13] = vsubq_s32(bl[12], bl[13]);
ar[13] = vsubq_s32(br[12], br[13]);
al[14] = vsubq_s32(bl[15], bl[14]);
ar[14] = vsubq_s32(br[15], br[14]);
al[15] = vaddq_s32(bl[15], bl[14]);
ar[15] = vaddq_s32(br[15], br[14]);
al[16] = bl[16];
ar[16] = br[16];
al[19] = bl[19];
ar[19] = br[19];
al[20] = bl[20];
ar[20] = br[20];
al[23] = bl[23];
ar[23] = br[23];
al[24] = bl[24];
ar[24] = br[24];
al[27] = bl[27];
ar[27] = br[27];
al[28] = bl[28];
ar[28] = br[28];
al[31] = bl[31];
ar[31] = br[31];
butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64,
cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]);
butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64,
-cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]);
butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64,
cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]);
butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64,
-cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]);
// Stage 7.
bl[0] = al[0];
br[0] = ar[0];
bl[1] = al[1];
br[1] = ar[1];
bl[2] = al[2];
br[2] = ar[2];
bl[3] = al[3];
br[3] = ar[3];
bl[4] = al[4];
br[4] = ar[4];
bl[5] = al[5];
br[5] = ar[5];
bl[6] = al[6];
br[6] = ar[6];
bl[7] = al[7];
br[7] = ar[7];
butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64,
&bl[8], &br[8], &bl[15], &br[15]);
butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64,
cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]);
butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64,
cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]);
butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64,
cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]);
bl[16] = vaddq_s32(al[16], al[17]);
br[16] = vaddq_s32(ar[16], ar[17]);
bl[17] = vsubq_s32(al[16], al[17]);
br[17] = vsubq_s32(ar[16], ar[17]);
bl[18] = vsubq_s32(al[19], al[18]);
br[18] = vsubq_s32(ar[19], ar[18]);
bl[19] = vaddq_s32(al[19], al[18]);
br[19] = vaddq_s32(ar[19], ar[18]);
bl[20] = vaddq_s32(al[20], al[21]);
br[20] = vaddq_s32(ar[20], ar[21]);
bl[21] = vsubq_s32(al[20], al[21]);
br[21] = vsubq_s32(ar[20], ar[21]);
bl[22] = vsubq_s32(al[23], al[22]);
br[22] = vsubq_s32(ar[23], ar[22]);
bl[23] = vaddq_s32(al[23], al[22]);
br[23] = vaddq_s32(ar[23], ar[22]);
bl[24] = vaddq_s32(al[24], al[25]);
br[24] = vaddq_s32(ar[24], ar[25]);
bl[25] = vsubq_s32(al[24], al[25]);
br[25] = vsubq_s32(ar[24], ar[25]);
bl[26] = vsubq_s32(al[27], al[26]);
br[26] = vsubq_s32(ar[27], ar[26]);
bl[27] = vaddq_s32(al[27], al[26]);
br[27] = vaddq_s32(ar[27], ar[26]);
bl[28] = vaddq_s32(al[28], al[29]);
br[28] = vaddq_s32(ar[28], ar[29]);
bl[29] = vsubq_s32(al[28], al[29]);
br[29] = vsubq_s32(ar[28], ar[29]);
bl[30] = vsubq_s32(al[31], al[30]);
br[30] = vsubq_s32(ar[31], ar[30]);
bl[31] = vaddq_s32(al[31], al[30]);
br[31] = vaddq_s32(ar[31], ar[30]);
// Final stage.
left[0] = bl[0];
right[0] = br[0];
left[16] = bl[1];
right[16] = br[1];
left[8] = bl[2];
right[8] = br[2];
left[24] = bl[3];
right[24] = br[3];
left[4] = bl[4];
right[4] = br[4];
left[20] = bl[5];
right[20] = br[5];
left[12] = bl[6];
right[12] = br[6];
left[28] = bl[7];
right[28] = br[7];
left[2] = bl[8];
right[2] = br[8];
left[18] = bl[9];
right[18] = br[9];
left[10] = bl[10];
right[10] = br[10];
left[26] = bl[11];
right[26] = br[11];
left[6] = bl[12];
right[6] = br[12];
left[22] = bl[13];
right[22] = br[13];
left[14] = bl[14];
right[14] = br[14];
left[30] = bl[15];
right[30] = br[15];
butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64,
cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]);
left[1] = al[1];
right[1] = ar[1];
left[31] = al[31];
right[31] = ar[31];
butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64,
cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]);
left[17] = al[17];
right[17] = ar[17];
left[15] = al[15];
right[15] = ar[15];
butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64,
cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]);
left[9] = al[9];
right[9] = ar[9];
left[23] = al[23];
right[23] = ar[23];
butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64,
cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]);
left[25] = al[25];
right[25] = ar[25];
left[7] = al[7];
right[7] = ar[7];
butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64,
cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]);
left[5] = al[5];
right[5] = ar[5];
left[27] = al[27];
right[27] = ar[27];
butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64,
cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]);
left[21] = al[21];
right[21] = ar[21];
left[11] = al[11];
right[11] = ar[11];
butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64,
cospi_19_64, &al[13], &ar[13], &al[19], &ar[19]);
left[13] = al[13];
right[13] = ar[13];
left[19] = al[19];
right[19] = ar[19];
butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64,
cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]);
left[29] = al[29];
right[29] = ar[29];
left[3] = al[3];
right[3] = ar[3];
}

#endif // CONFIG_VP9_HIGHBITDEPTH

#endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_