| /* |
| * Copyright (c) 2022 The WebM project authors. All Rights Reserved. |
| * |
| * Use of this source code is governed by a BSD-style license |
| * that can be found in the LICENSE file in the root of the source |
| * tree. An additional intellectual property rights grant can be found |
| * in the file PATENTS. All contributing project authors may |
| * be found in the AUTHORS file in the root of the source tree. |
| */ |
| |
| #include "./vpx_dsp_rtcd.h" |
| #include "vpx_dsp/loongarch/fwd_txfm_lsx.h" |
| |
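/* In-register transpose of 16-bit data: the low four halfwords of
 * _out0.._out3 receive the transpose of the low 4x4 of _in0.._in3, and the
 * high halfwords receive the transpose of the high 4x4. */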
| #define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ |
| do { \ |
| __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \ |
| \ |
| DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \ |
| DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3); \ |
| _t0 = __lsx_vilvl_h(_s1, _s0); \ |
| _t1 = __lsx_vilvh_h(_s1, _s0); \ |
| _t2 = __lsx_vilvl_h(_s3, _s2); \ |
| _t3 = __lsx_vilvh_h(_s3, _s2); \ |
| DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \ |
| DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \ |
| } while (0) |
| |
| #if !CONFIG_VP9_HIGHBITDEPTH |
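/* Column pass of the 16x16 forward DCT for one 8-column slice of the input.
 * Rows are loaded, scaled by 4, and split into (row i + row 15-i) sums for
 * the even half and differences for the odd half.  Results are stored to
 * tmp_ptr at a spacing of 64 bytes, i.e. every second 16-element row of the
 * intermediate buffer, with the odd-indexed outputs interleaved one row
 * after the even-indexed ones. */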
| void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, |
| int32_t src_stride) { |
| __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
| __m128i in0, in1, in2, in3, in4, in5, in6, in7; |
| __m128i in8, in9, in10, in11, in12, in13, in14, in15; |
| __m128i stp21, stp22, stp23, stp24, stp25, stp26, stp30; |
| __m128i stp31, stp32, stp33, stp34, stp35, stp36, stp37; |
| __m128i vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; |
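  /* coeff, coeff1 and coeff2 pack the 16-bit cosine constants (cospi_*_64
   * from vpx_dsp/txfm_common.h), one per halfword lane; e.g. the low lane of
   * coeff is 0x2d41 = 11585 = cospi_16_64 and the next lane, 0xd2bf, is its
   * negation.  coeff2 holds negated copies of four of the coeff1 constants. */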
| __m128i coeff = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df }; |
| __m128i coeff1 = { 0x289a317906463fb1, 0x12943d3f1e2b3871 }; |
| __m128i coeff2 = { 0xed6cd766c78fc04f, 0x0 }; |
| |
| int32_t src_stride2 = src_stride << 1; |
| int32_t src_stride4 = src_stride2 << 1; |
| int32_t src_stride6 = src_stride4 + src_stride2; |
| int32_t src_stride8 = src_stride4 << 1; |
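  /* src_stride is in int16_t elements, so src_stride2/4/6/8 serve double
   * duty: as byte offsets to __lsx_vldx they span 1/2/3/4 rows, while as
   * element increments to input_tmp they advance 2/4/6/8 rows. */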
| int16_t *input_tmp = (int16_t *)input; |
| in0 = __lsx_vld(input_tmp, 0); |
| DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, |
| input_tmp, src_stride6, input_tmp, src_stride8, in1, in2, in3, in4); |
| input_tmp += src_stride4; |
| DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, |
| input_tmp, src_stride6, input_tmp, src_stride8, in5, in6, in7, in8); |
| input_tmp += src_stride4; |
| DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, |
| input_tmp, src_stride6, input_tmp, src_stride8, in9, in10, in11, |
| in12); |
| input_tmp += src_stride4; |
| DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in13, |
| in14); |
| input_tmp += src_stride2; |
| in15 = __lsx_vldx(input_tmp, src_stride2); |
| |
| DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); |
| DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); |
| DUP4_ARG2(__lsx_vslli_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10, |
| in11); |
| DUP4_ARG2(__lsx_vslli_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14, |
| in15); |
| DUP4_ARG2(__lsx_vadd_h, in0, in15, in1, in14, in2, in13, in3, in12, tmp0, |
| tmp1, tmp2, tmp3); |
| DUP4_ARG2(__lsx_vadd_h, in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, |
| tmp6, tmp7); |
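  /* Even half: the even-indexed 16-point outputs are the 8-point DCT of the
   * eight sums. */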
| FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, |
| tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); |
| __lsx_vst(tmp0, tmp_ptr, 0); |
| __lsx_vst(tmp1, tmp_ptr, 64); |
| __lsx_vst(tmp2, tmp_ptr, 128); |
| __lsx_vst(tmp3, tmp_ptr, 192); |
| __lsx_vst(tmp4, tmp_ptr, 256); |
| __lsx_vst(tmp5, tmp_ptr, 320); |
| __lsx_vst(tmp6, tmp_ptr, 384); |
| __lsx_vst(tmp7, tmp_ptr, 448); |
| DUP4_ARG2(__lsx_vsub_h, in0, in15, in1, in14, in2, in13, in3, in12, in15, |
| in14, in13, in12); |
| DUP4_ARG2(__lsx_vsub_h, in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, |
| in9, in8); |
| |
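  /* Advance one 16-element row so the odd-indexed outputs computed below
   * interleave with the even-indexed rows stored above. */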
| tmp_ptr += 16; |
| |
  /* stp1 */
| DUP2_ARG2(__lsx_vilvh_h, in10, in13, in11, in12, vec2, vec4); |
| DUP2_ARG2(__lsx_vilvl_h, in10, in13, in11, in12, vec3, vec5); |
| |
| cnst4 = __lsx_vreplvei_h(coeff, 0); |
| DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4, stp25); |
| |
| cnst5 = __lsx_vreplvei_h(coeff, 1); |
| cnst5 = __lsx_vpackev_h(cnst5, cnst4); |
| DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5, stp22); |
| DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4, stp24); |
| DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5, stp23); |
| |
| /* stp2 */ |
| LSX_BUTTERFLY_4_H(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33); |
| LSX_BUTTERFLY_4_H(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34); |
| DUP2_ARG2(__lsx_vilvh_h, stp36, stp31, stp35, stp32, vec2, vec4); |
| DUP2_ARG2(__lsx_vilvl_h, stp36, stp31, stp35, stp32, vec3, vec5); |
| DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 3, cnst0, cnst1); |
| cnst0 = __lsx_vpackev_h(cnst0, cnst1); |
| DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0, stp26); |
| |
| cnst0 = __lsx_vreplvei_h(coeff, 4); |
| cnst1 = __lsx_vpackev_h(cnst1, cnst0); |
| DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1, stp21); |
| |
| LSX_BUTTERFLY_4_H(stp30, stp37, stp26, stp21, in8, in15, in14, in9); |
| vec1 = __lsx_vilvl_h(in15, in8); |
| vec0 = __lsx_vilvh_h(in15, in8); |
| |
| DUP2_ARG2(__lsx_vreplvei_h, coeff1, 0, coeff1, 1, cnst0, cnst1); |
| cnst0 = __lsx_vpackev_h(cnst0, cnst1); |
| |
| DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); |
| __lsx_vst(in8, tmp_ptr, 0); |
| |
| cnst0 = __lsx_vreplvei_h(coeff2, 0); |
| cnst0 = __lsx_vpackev_h(cnst1, cnst0); |
| DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); |
| __lsx_vst(in8, tmp_ptr, 448); |
| |
| vec1 = __lsx_vilvl_h(in14, in9); |
| vec0 = __lsx_vilvh_h(in14, in9); |
| DUP2_ARG2(__lsx_vreplvei_h, coeff1, 2, coeff1, 3, cnst0, cnst1); |
| cnst1 = __lsx_vpackev_h(cnst1, cnst0); |
| |
| DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8); |
| __lsx_vst(in8, tmp_ptr, 256); |
| |
| cnst1 = __lsx_vreplvei_h(coeff2, 2); |
| cnst0 = __lsx_vpackev_h(cnst0, cnst1); |
| DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); |
| __lsx_vst(in8, tmp_ptr, 192); |
| |
| DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 5, cnst0, cnst1); |
| cnst1 = __lsx_vpackev_h(cnst1, cnst0); |
| DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp25); |
| |
| cnst1 = __lsx_vreplvei_h(coeff, 3); |
| cnst1 = __lsx_vpackev_h(cnst0, cnst1); |
| DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp22); |
| |
| /* stp4 */ |
| DUP2_ARG2(__lsx_vadd_h, stp34, stp25, stp33, stp22, in13, in10); |
| |
| vec1 = __lsx_vilvl_h(in13, in10); |
| vec0 = __lsx_vilvh_h(in13, in10); |
| DUP2_ARG2(__lsx_vreplvei_h, coeff1, 4, coeff1, 5, cnst0, cnst1); |
| cnst0 = __lsx_vpackev_h(cnst0, cnst1); |
| DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); |
| __lsx_vst(in8, tmp_ptr, 128); |
| |
| cnst0 = __lsx_vreplvei_h(coeff2, 1); |
| cnst0 = __lsx_vpackev_h(cnst1, cnst0); |
| DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); |
| __lsx_vst(in8, tmp_ptr, 320); |
| |
| DUP2_ARG2(__lsx_vsub_h, stp34, stp25, stp33, stp22, in12, in11); |
| vec1 = __lsx_vilvl_h(in12, in11); |
| vec0 = __lsx_vilvh_h(in12, in11); |
| DUP2_ARG2(__lsx_vreplvei_h, coeff1, 6, coeff1, 7, cnst0, cnst1); |
| cnst1 = __lsx_vpackev_h(cnst1, cnst0); |
| |
| DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8); |
| __lsx_vst(in8, tmp_ptr, 384); |
| |
| cnst1 = __lsx_vreplvei_h(coeff2, 3); |
| cnst0 = __lsx_vpackev_h(cnst0, cnst1); |
| DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); |
| __lsx_vst(in8, tmp_ptr, 64); |
| } |
| |
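/* Row pass of the 16x16 forward DCT for eight rows of the intermediate
 * buffer.  The 16x8 tile is transposed so that each vector holds one column,
 * rounded with (x + 1) >> 2, split by a 16-element butterfly into sums and
 * differences, transformed with FDCT8x16_EVEN / FDCT8x16_ODD, transposed
 * back, and stored as eight 16-element rows of the final output. */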
| void fdct16x8_1d_row(int16_t *input, int16_t *output) { |
| __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
| __m128i in0, in1, in2, in3, in4, in5, in6, in7; |
| __m128i in8, in9, in10, in11, in12, in13, in14, in15; |
| int16_t *input_tmp = input; |
| |
| DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in0, in1, in2, |
| in3); |
| DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in4, in5, |
| in6, in7); |
| DUP4_ARG2(__lsx_vld, input_tmp, 16, input_tmp, 48, input_tmp, 80, input_tmp, |
| 112, in8, in9, in10, in11); |
| DUP4_ARG2(__lsx_vld, input_tmp, 144, input_tmp, 176, input_tmp, 208, |
| input_tmp, 240, in12, in13, in14, in15); |
| |
| LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, |
| in4, in5, in6, in7); |
| LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, |
| in10, in11, in12, in13, in14, in15); |
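  /* Round the column-pass results: (x + 1) >> 2. */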
| DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); |
| DUP4_ARG2(__lsx_vaddi_hu, in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); |
| DUP4_ARG2(__lsx_vaddi_hu, in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, |
| in11); |
| DUP4_ARG2(__lsx_vaddi_hu, in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, |
| in14, in15); |
| |
| DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); |
| DUP4_ARG2(__lsx_vsrai_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); |
| DUP4_ARG2(__lsx_vsrai_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10, |
| in11); |
| DUP4_ARG2(__lsx_vsrai_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14, |
| in15); |
| LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, |
| in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, |
| tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, |
| in15); |
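  /* Spill the difference half back into the input buffer; it is reloaded
   * once the even half has been transformed. */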
| __lsx_vst(in8, input, 0); |
| __lsx_vst(in9, input, 32); |
| __lsx_vst(in10, input, 64); |
| __lsx_vst(in11, input, 96); |
| __lsx_vst(in12, input, 128); |
| __lsx_vst(in13, input, 160); |
| __lsx_vst(in14, input, 192); |
| __lsx_vst(in15, input, 224); |
| |
| FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, |
| tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); |
| DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in8, in9, |
| in10, in11); |
| DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in12, |
| in13, in14, in15); |
| FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, |
| in4, in5, in6, in7); |
| LSX_TRANSPOSE8x8_H(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, |
| tmp1, in1, tmp2, in2, tmp3, in3); |
| __lsx_vst(tmp0, output, 0); |
| __lsx_vst(in0, output, 32); |
| __lsx_vst(tmp1, output, 64); |
| __lsx_vst(in1, output, 96); |
| __lsx_vst(tmp2, output, 128); |
| __lsx_vst(in2, output, 160); |
| __lsx_vst(tmp3, output, 192); |
| __lsx_vst(in3, output, 224); |
| |
| LSX_TRANSPOSE8x8_H(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, |
| tmp5, in5, tmp6, in6, tmp7, in7); |
| __lsx_vst(tmp4, output, 16); |
| __lsx_vst(in4, output, 48); |
| __lsx_vst(tmp5, output, 80); |
| __lsx_vst(in5, output, 112); |
| __lsx_vst(tmp6, output, 144); |
| __lsx_vst(in6, output, 176); |
| __lsx_vst(tmp7, output, 208); |
| __lsx_vst(in7, output, 240); |
| } |
| |
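/* 4x4 forward DCT: inputs are scaled by 16, the DC input is bumped by one
 * when non-zero, two 1-D VP9_FDCT4 passes (each followed by a transpose) are
 * applied, and the result is rounded with (x + 1) >> 2 before being stored
 * as 16 contiguous coefficients. */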
| void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output, |
| int32_t src_stride) { |
| __m128i in0, in1, in2, in3; |
| |
| int32_t src_stride2 = src_stride << 1; |
| int32_t src_stride4 = src_stride2 << 1; |
| int32_t src_stride6 = src_stride4 + src_stride2; |
| |
| in0 = __lsx_vld(input, 0); |
| DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2); |
| in3 = __lsx_vldx(input, src_stride6); |
| |
| /* fdct4 pre-process */ |
| { |
| __m128i vec, mask; |
| __m128i zero = __lsx_vldi(0); |
| |
| mask = __lsx_vinsgr2vr_b(zero, 1, 0); |
| DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2, |
| in3); |
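    /* in0[0] += (in0[0] != 0): vector form of the scalar reference's
     * non-zero DC adjustment. */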
| vec = __lsx_vseqi_h(in0, 0); |
| vec = __lsx_vxori_b(vec, 255); |
| vec = __lsx_vand_v(mask, vec); |
| in0 = __lsx_vadd_h(in0, vec); |
| } |
| |
| VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); |
| LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); |
| VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); |
| LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); |
| DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); |
| DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); |
| DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2); |
| __lsx_vst(in0, output, 0); |
| __lsx_vst(in2, output, 16); |
| } |
| |
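/* 8x8 forward DCT: eight rows are loaded at src_stride, scaled by 4, passed
 * through two 1-D VP9_FDCT8 stages (each followed by a transpose), given a
 * final rounding shift, and stored as eight contiguous rows. */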
| void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output, |
| int32_t src_stride) { |
| __m128i in0, in1, in2, in3, in4, in5, in6, in7; |
| int32_t src_stride2 = src_stride << 1; |
| int32_t src_stride4 = src_stride2 << 1; |
| int32_t src_stride6 = src_stride4 + src_stride2; |
| int16_t *input_tmp = (int16_t *)input; |
| |
| in0 = __lsx_vld(input_tmp, 0); |
| DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1, |
| in2); |
| in3 = __lsx_vldx(input_tmp, src_stride6); |
| input_tmp += src_stride4; |
| in4 = __lsx_vld(input_tmp, 0); |
| DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5, |
| in6); |
| in7 = __lsx_vldx(input_tmp, src_stride6); |
| |
| DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); |
| DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); |
| |
| VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, |
| in5, in6, in7); |
| LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, |
| in4, in5, in6, in7); |
| VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, |
| in5, in6, in7); |
| LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, |
| in4, in5, in6, in7); |
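  /* Final scaling of the 8x8 coefficients by 1/2 (rounding is handled by
   * SRLI_AVE_S_4V_H; see fwd_txfm_lsx.h). */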
| SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); |
| |
| __lsx_vst(in0, output, 0); |
| __lsx_vst(in1, output, 16); |
| __lsx_vst(in2, output, 32); |
| __lsx_vst(in3, output, 48); |
| __lsx_vst(in4, output, 64); |
| __lsx_vst(in5, output, 80); |
| __lsx_vst(in6, output, 96); |
| __lsx_vst(in7, output, 112); |
| } |
| |
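/* 16x16 forward DCT: the column pass handles the input in two 8-column
 * slices, writing a 16x16 intermediate buffer, and the row pass handles the
 * buffer in two 8-row slices of 128 int16_t each, writing the output. */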
| void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output, |
| int32_t src_stride) { |
| int32_t i; |
| DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]); |
| |
| /* column transform */ |
| for (i = 0; i < 2; ++i) { |
| fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride); |
| } |
| |
| /* row transform */ |
| for (i = 0; i < 2; ++i) { |
| fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i))); |
| } |
| } |
| #endif // !CONFIG_VP9_HIGHBITDEPTH |