Merge "ANS experiment: Use ANS everywhere." into nextgenv2
diff --git a/test/test.mk b/test/test.mk
index b173ec3..7c3f101 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -174,6 +174,8 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += subtract_test.cc
 
+LIBVPX_TEST_SRCS-$(HAVE_SSE4_1) += vp10_fwd_txfm2d_sse4_test.cc
+
 ifeq ($(CONFIG_EXT_INTER),yes)
 LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc
 LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc
diff --git a/test/vp10_fwd_txfm1d_test.cc b/test/vp10_fwd_txfm1d_test.cc
index bcbc617..2d09e0d 100644
--- a/test/vp10_fwd_txfm1d_test.cc
+++ b/test/vp10_fwd_txfm1d_test.cc
@@ -31,7 +31,7 @@
 static int8_t range_bit[12] = {32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
 
 TEST(vp10_fwd_txfm1d, round_shift) {
-  EXPECT_EQ(round_shift(7, 1), 3);
+  EXPECT_EQ(round_shift(7, 1), 4);
   EXPECT_EQ(round_shift(-7, 1), -3);
 
   EXPECT_EQ(round_shift(7, 2), 2);
@@ -46,17 +46,6 @@
   EXPECT_EQ(max_bit, 3);
 }
 
-TEST(vp10_fwd_txfm1d, half_btf) {
-  int32_t max = (1 << 15) - 1;
-  int32_t w0 = max;
-  int32_t in0 = max;
-  int32_t w1 = max;
-  int32_t in1 = max;
-  int32_t result_32 = half_btf(w0, in0, w1, in1, 0);
-  int64_t result_64 = (int64_t)w0 * (int64_t)in0 + (int64_t)w1 * (int64_t)in1;
-  EXPECT_EQ(result_32, result_64);
-}
-
 TEST(vp10_fwd_txfm1d, cospi_arr) {
   for (int i = 0; i < 7; i++) {
     for (int j = 0; j < 64; j++) {
diff --git a/test/vp10_fwd_txfm2d_sse4_test.cc b/test/vp10_fwd_txfm2d_sse4_test.cc
new file mode 100644
index 0000000..d3882cd
--- /dev/null
+++ b/test/vp10_fwd_txfm2d_sse4_test.cc
@@ -0,0 +1,72 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "./vp10_rtcd.h"
+#include "test/acm_random.h"
+#include "test/vp10_txfm_test.h"
+#include "vp10/common/vp10_fwd_txfm2d_cfg.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+#if CONFIG_VP9_HIGHBITDEPTH
+TEST(vp10_fwd_txfm2d_sse4_1, accuracy) {
+  int16_t input[4096] = {0};
+  int32_t output_sse4_1[4096] = {0};
+  int32_t output_c[4096] = {0};
+
+  int txfm_num = 17;
+
+  TXFM_2D_CFG cfg_list[] = {
+      fwd_txfm_2d_cfg_dct_dct_4,    fwd_txfm_2d_cfg_dct_dct_8,
+      fwd_txfm_2d_cfg_dct_dct_16,   fwd_txfm_2d_cfg_dct_dct_32,
+      fwd_txfm_2d_cfg_dct_dct_64,   fwd_txfm_2d_cfg_dct_adst_4,
+      fwd_txfm_2d_cfg_dct_adst_8,   fwd_txfm_2d_cfg_dct_adst_16,
+      fwd_txfm_2d_cfg_dct_adst_32,  fwd_txfm_2d_cfg_adst_dct_4,
+      fwd_txfm_2d_cfg_adst_dct_8,   fwd_txfm_2d_cfg_adst_dct_16,
+      fwd_txfm_2d_cfg_adst_dct_32,  fwd_txfm_2d_cfg_adst_adst_4,
+      fwd_txfm_2d_cfg_adst_adst_8,  fwd_txfm_2d_cfg_adst_adst_16,
+      fwd_txfm_2d_cfg_adst_adst_32,
+  };
+
+  Fwd_Txfm2d_Func txfm2d_func_c_list[] = {
+      vp10_fwd_txfm2d_4x4_c,   vp10_fwd_txfm2d_8x8_c,   vp10_fwd_txfm2d_16x16_c,
+      vp10_fwd_txfm2d_32x32_c, vp10_fwd_txfm2d_64x64_c,
+  };
+
+  Fwd_Txfm2d_Func txfm2d_func_sse4_1_list[] = {
+      vp10_fwd_txfm2d_4x4_sse4_1,   vp10_fwd_txfm2d_8x8_sse4_1,
+      vp10_fwd_txfm2d_16x16_sse4_1, vp10_fwd_txfm2d_32x32_sse4_1,
+      vp10_fwd_txfm2d_64x64_sse4_1,
+  };
+
+  for (int i = 0; i < txfm_num; i++) {
+    TXFM_2D_CFG cfg = cfg_list[i];
+    int txfm_size = cfg.txfm_size;
+    int func_idx = get_max_bit(txfm_size) - 2;
+    Fwd_Txfm2d_Func txfm2d_func_c = txfm2d_func_c_list[func_idx];
+    Fwd_Txfm2d_Func txfm2d_func_sse4_1 = txfm2d_func_sse4_1_list[func_idx];
+
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+    // init input
+    for (int r = 0; r < txfm_size; r++) {
+      for (int c = 0; c < txfm_size; c++) {
+        input[r * txfm_size + c] = rnd.Rand16() % base;
+      }
+    }
+
+    txfm2d_func_c(input, output_c, cfg.txfm_size, &cfg, 10);
+    txfm2d_func_sse4_1(input, output_sse4_1, cfg.txfm_size, &cfg, 10);
+    for (int r = 0; r < txfm_size; r++) {
+      for (int c = 0; c < txfm_size; c++) {
+        EXPECT_EQ(output_c[r * txfm_size + c],
+                  output_sse4_1[r * txfm_size + c]);
+      }
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // anonymous namespace
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index 23fd6a6..8e75522 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -1546,8 +1546,8 @@
   for (r = 0; r < MAX_MIB_SIZE && mi_row + r < cm->mi_rows; r += 4) {
     if (plane->plane_type == 1) {
       for (c = 0; c < (MAX_MIB_SIZE >> 1); c++) {
-        lfm->lfl_uv[r][c] = lfm->lfl_y[r][c << 1];
-        lfm->lfl_uv[r + 2][c] = lfm->lfl_y[r + 2][c << 1];
+        lfm->lfl_uv[r >> 1][c] = lfm->lfl_y[r][c << 1];
+        lfm->lfl_uv[(r + 2) >> 1][c] = lfm->lfl_y[r + 2][c << 1];
       }
     }
 
@@ -1563,18 +1563,18 @@
         highbd_filter_selectively_vert_row2(
             plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
             mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfm->lfl_uv[r][0], (int)cm->bit_depth);
+            &lfm->lfl_uv[r >> 1][0], (int)cm->bit_depth);
       } else {
         filter_selectively_vert_row2(
             plane->subsampling_x, dst->buf, dst->stride,
             mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfm->lfl_uv[r][0]);
+            &lfm->lfl_uv[r >> 1][0]);
       }
 #else
       filter_selectively_vert_row2(
           plane->subsampling_x, dst->buf, dst->stride,
           mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfm->lfl_uv[r][0]);
+          &lfm->lfl_uv[r >> 1][0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
       dst->buf += 16 * dst->stride;
@@ -1615,16 +1615,17 @@
       highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
                                       dst->stride, mask_16x16_r, mask_8x8_r,
                                       mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                                      &lfm->lfl_uv[r][0], (int)cm->bit_depth);
+                                      &lfm->lfl_uv[r >> 1][0],
+                                      (int)cm->bit_depth);
     } else {
       filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
                                mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                               &lfm->lfl_uv[r][0]);
+                               &lfm->lfl_uv[r >> 1][0]);
     }
 #else
     filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
                              mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                             &lfm->lfl_uv[r][0]);
+                             &lfm->lfl_uv[r >> 1][0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     dst->buf += 8 * dst->stride;
diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index 547c202..766ccf7 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c
@@ -1915,8 +1915,8 @@
                              0, 0, plane);
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      uint16_t *src_216 = CONVERT_TO_SHORTPTR(src_2);
-      uint16_t *dst_216 = CONVERT_TO_SHORTPTR(dst_2);
+      uint16_t *src2_16 = CONVERT_TO_SHORTPTR(src_2);
+      uint16_t *dst2_16 = CONVERT_TO_SHORTPTR(dst_2);
       memcpy(src2_16 - ref_stride, dst2_16 - dst_stride,
              sizeof(*src2_16) * (4 << bhl));
     } else
diff --git a/vp10/common/vp10_fwd_txfm2d.c b/vp10/common/vp10_fwd_txfm2d.c
index 32214ae..dd7101a 100644
--- a/vp10/common/vp10_fwd_txfm2d.c
+++ b/vp10/common/vp10_fwd_txfm2d.c
@@ -51,7 +51,7 @@
 
 static inline void fwd_txfm2d_c(const int16_t *input, int32_t *output,
                                 const int stride, const TXFM_2D_CFG *cfg,
-                                int32_t *txfm_buf) {
+                                int32_t *buf) {
   int i, j;
   const int txfm_size = cfg->txfm_size;
   const int8_t *shift = cfg->shift;
@@ -62,11 +62,9 @@
   const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
   const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
 
-  // txfm_buf's length is  txfm_size * txfm_size + 2 * txfm_size
-  // it is used for intermediate data buffering
-  int32_t *temp_in = txfm_buf;
-  int32_t *temp_out = temp_in + txfm_size;
-  int32_t *buf = temp_out + txfm_size;
+  // use output buffer as temp buffer
+  int32_t* temp_in = output;
+  int32_t* temp_out = output + txfm_size;
 
   // Columns
   for (i = 0; i < txfm_size; ++i) {
@@ -81,19 +79,16 @@
 
   // Rows
   for (i = 0; i < txfm_size; ++i) {
-    for (j = 0; j < txfm_size; ++j)
-      temp_in[j] = buf[j + i * txfm_size];
-    txfm_func_row(temp_in, temp_out, cos_bit_row, stage_range_row);
-    round_shift_array(temp_out, txfm_size, -shift[2]);
-    for (j = 0; j < txfm_size; ++j)
-      output[j + i * txfm_size] = (int32_t)temp_out[j];
+    txfm_func_row(buf + i * txfm_size, output + i * txfm_size, cos_bit_row,
+                  stage_range_row);
+    round_shift_array(output + i * txfm_size, txfm_size, -shift[2]);
   }
 }
 
 void vp10_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output,
                          const int stride, const TXFM_2D_CFG *cfg,
                          const int bd) {
-  int txfm_buf[4 * 4 + 4 + 4];
+  int32_t txfm_buf[4 * 4];
   (void)bd;
   fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
 }
@@ -101,7 +96,7 @@
 void vp10_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output,
                          const int stride, const TXFM_2D_CFG *cfg,
                          const int bd) {
-  int txfm_buf[8 * 8 + 8 + 8];
+  int32_t txfm_buf[8 * 8];
   (void)bd;
   fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
 }
@@ -109,7 +104,7 @@
 void vp10_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output,
                            const int stride, const TXFM_2D_CFG *cfg,
                            const int bd) {
-  int txfm_buf[16 * 16 + 16 + 16];
+  int32_t txfm_buf[16 * 16];
   (void)bd;
   fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
 }
@@ -117,7 +112,7 @@
 void vp10_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output,
                            const int stride, const TXFM_2D_CFG *cfg,
                            const int bd) {
-  int txfm_buf[32 * 32 + 32 + 32];
+  int32_t txfm_buf[32 * 32];
   (void)bd;
   fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
 }
@@ -125,7 +120,7 @@
 void vp10_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output,
                            const int stride, const TXFM_2D_CFG *cfg,
                            const int bd) {
-  int txfm_buf[64 * 64 + 64 + 64];
+  int32_t txfm_buf[64 * 64];
   (void)bd;
   fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
 }
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index fd83e80..7b20239 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -615,15 +615,15 @@
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #fwd txfm
   add_proto qw/void vp10_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
-  specialize qw/vp10_fwd_txfm2d_4x4/;
+  specialize qw/vp10_fwd_txfm2d_4x4 sse4_1/;
   add_proto qw/void vp10_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
-  specialize qw/vp10_fwd_txfm2d_8x8/;
+  specialize qw/vp10_fwd_txfm2d_8x8 sse4_1/;
   add_proto qw/void vp10_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
-  specialize qw/vp10_fwd_txfm2d_16x16/;
+  specialize qw/vp10_fwd_txfm2d_16x16 sse4_1/;
   add_proto qw/void vp10_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
-  specialize qw/vp10_fwd_txfm2d_32x32/;
+  specialize qw/vp10_fwd_txfm2d_32x32 sse4_1/;
   add_proto qw/void vp10_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
-  specialize qw/vp10_fwd_txfm2d_64x64/;
+  specialize qw/vp10_fwd_txfm2d_64x64 sse4_1/;
 
   #inv txfm
   add_proto qw/void vp10_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
diff --git a/vp10/common/vp10_txfm.h b/vp10/common/vp10_txfm.h
index ad7b38f..9944bdd 100644
--- a/vp10/common/vp10_txfm.h
+++ b/vp10/common/vp10_txfm.h
@@ -81,23 +81,7 @@
     12785, 11204,  9616,  8022,  6424,  4821,  3216,  1608}};
 
 static INLINE int32_t round_shift(int32_t value, int bit) {
-  // For value >= 0,
-  // there are twe version of rounding
-  // 1) (value + (1 << (bit - 1)) - 1) >> bit
-  // 2) (value + (1 << (bit - 1))) >> bit
-  // boath methods are mild unbiased
-  // however, the first version has slightly advantage because
-  // it rounds number toward zero.
-  // For value < 0, we also choose the version that rounds number
-  // toward zero.
-  if (bit > 0) {
-    if (value >= 0)
-      return (value + (1 << (bit - 1)) - 1) >> bit;
-    else
-      return ((value - (1 << (bit - 1))) >> bit) + 1;
-  } else {
-    return value << (-bit);
-  }
+  return (value + (1 << (bit - 1))) >> bit;
 }
 
 static INLINE void round_shift_array(int32_t *arr, int size, int bit) {
@@ -105,8 +89,14 @@
   if (bit == 0) {
     return;
   } else {
-    for (i = 0; i < size; i++) {
-      arr[i] = round_shift(arr[i], bit);
+    if (bit > 0) {
+      for (i = 0; i < size; i++) {
+        arr[i] = round_shift(arr[i], bit);
+      }
+    } else {
+      for (i = 0; i < size; i++) {
+        arr[i] = arr[i] << (-bit);
+      }
     }
   }
 }
diff --git a/vp10/common/x86/vp10_fwd_txfm1d_sse4.c b/vp10/common/x86/vp10_fwd_txfm1d_sse4.c
new file mode 100644
index 0000000..5ade8bd
--- /dev/null
+++ b/vp10/common/x86/vp10_fwd_txfm1d_sse4.c
@@ -0,0 +1,2594 @@
+#include "vp10/common/x86/vp10_txfm1d_sse4.h"
+
+void vp10_fdct4_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 4;
+  const int num_per_128 = 4;
+  const int32_t* cospi;
+  __m128i buf0[4];
+  __m128i buf1[4];
+  int col_num = txfm_size / num_per_128;
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0;
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+
+    // stage 1
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[3]);
+    buf1[3] = _mm_sub_epi32(buf0[0], buf0[3]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[2]);
+    buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]);
+
+    // stage 2
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
+                        buf0[1], bit);
+    btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+
+    // stage 3
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[2];
+    buf1[2] = buf0[1];
+    buf1[3] = buf0[3];
+
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+  }
+}
+
+void vp10_fdct8_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 8;
+  const int num_per_128 = 4;
+  const int32_t* cospi;
+  __m128i buf0[8];
+  __m128i buf1[8];
+  int col_num = txfm_size / num_per_128;
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0;
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+    buf0[4] = input[4 * col_num + col];
+    buf0[5] = input[5 * col_num + col];
+    buf0[6] = input[6 * col_num + col];
+    buf0[7] = input[7 * col_num + col];
+
+    // stage 1
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
+    buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
+    buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
+    buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
+    buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
+
+    // stage 2
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
+    buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
+    buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
+    buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
+    buf0[4] = buf1[4];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
+                        buf0[6], bit);
+    buf0[7] = buf1[7];
+
+    // stage 3
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
+                        buf1[1], bit);
+    btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
+                        buf1[3], bit);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
+    buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
+    buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
+
+    // stage 4
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+                        bit);
+    btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
+                        buf0[6], bit);
+
+    // stage 5
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[4];
+    buf1[2] = buf0[2];
+    buf1[3] = buf0[6];
+    buf1[4] = buf0[1];
+    buf1[5] = buf0[5];
+    buf1[6] = buf0[3];
+    buf1[7] = buf0[7];
+
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+    output[4 * col_num + col] = buf1[4];
+    output[5 * col_num + col] = buf1[5];
+    output[6 * col_num + col] = buf1[6];
+    output[7 * col_num + col] = buf1[7];
+  }
+}
+
+void vp10_fdct16_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 16;
+  const int num_per_128 = 4;
+  const int32_t* cospi;
+  __m128i buf0[16];
+  __m128i buf1[16];
+  int col_num = txfm_size / num_per_128;
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0;
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+    buf0[4] = input[4 * col_num + col];
+    buf0[5] = input[5 * col_num + col];
+    buf0[6] = input[6 * col_num + col];
+    buf0[7] = input[7 * col_num + col];
+    buf0[8] = input[8 * col_num + col];
+    buf0[9] = input[9 * col_num + col];
+    buf0[10] = input[10 * col_num + col];
+    buf0[11] = input[11 * col_num + col];
+    buf0[12] = input[12 * col_num + col];
+    buf0[13] = input[13 * col_num + col];
+    buf0[14] = input[14 * col_num + col];
+    buf0[15] = input[15 * col_num + col];
+
+    // stage 1
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[15]);
+    buf1[15] = _mm_sub_epi32(buf0[0], buf0[15]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[14]);
+    buf1[14] = _mm_sub_epi32(buf0[1], buf0[14]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[13]);
+    buf1[13] = _mm_sub_epi32(buf0[2], buf0[13]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[12]);
+    buf1[12] = _mm_sub_epi32(buf0[3], buf0[12]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[11]);
+    buf1[11] = _mm_sub_epi32(buf0[4], buf0[11]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[10]);
+    buf1[10] = _mm_sub_epi32(buf0[5], buf0[10]);
+    buf1[6] = _mm_add_epi32(buf0[6], buf0[9]);
+    buf1[9] = _mm_sub_epi32(buf0[6], buf0[9]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[8]);
+    buf1[8] = _mm_sub_epi32(buf0[7], buf0[8]);
+
+    // stage 2
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = _mm_add_epi32(buf1[0], buf1[7]);
+    buf0[7] = _mm_sub_epi32(buf1[0], buf1[7]);
+    buf0[1] = _mm_add_epi32(buf1[1], buf1[6]);
+    buf0[6] = _mm_sub_epi32(buf1[1], buf1[6]);
+    buf0[2] = _mm_add_epi32(buf1[2], buf1[5]);
+    buf0[5] = _mm_sub_epi32(buf1[2], buf1[5]);
+    buf0[3] = _mm_add_epi32(buf1[3], buf1[4]);
+    buf0[4] = _mm_sub_epi32(buf1[3], buf1[4]);
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11],
+                        buf0[12], bit);
+    buf0[14] = buf1[14];
+    buf0[15] = buf1[15];
+
+    // stage 3
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[3]);
+    buf1[3] = _mm_sub_epi32(buf0[0], buf0[3]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[2]);
+    buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]);
+    buf1[4] = buf0[4];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5],
+                        buf1[6], bit);
+    buf1[7] = buf0[7];
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[11]);
+    buf1[11] = _mm_sub_epi32(buf0[8], buf0[11]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[10]);
+    buf1[10] = _mm_sub_epi32(buf0[9], buf0[10]);
+    buf1[12] = _mm_sub_epi32(buf0[15], buf0[12]);
+    buf1[15] = _mm_add_epi32(buf0[15], buf0[12]);
+    buf1[13] = _mm_sub_epi32(buf0[14], buf0[13]);
+    buf1[14] = _mm_add_epi32(buf0[14], buf0[13]);
+
+    // stage 4
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
+                        buf0[1], bit);
+    btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+    buf0[4] = _mm_add_epi32(buf1[4], buf1[5]);
+    buf0[5] = _mm_sub_epi32(buf1[4], buf1[5]);
+    buf0[6] = _mm_sub_epi32(buf1[7], buf1[6]);
+    buf0[7] = _mm_add_epi32(buf1[7], buf1[6]);
+    buf0[8] = buf1[8];
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9],
+                        buf0[14], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
+    buf0[11] = buf1[11];
+    buf0[12] = buf1[12];
+    buf0[15] = buf1[15];
+
+    // stage 5
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[1];
+    buf1[2] = buf0[2];
+    buf1[3] = buf0[3];
+    btf_32_sse4_1_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7],
+                        bit);
+    btf_32_sse4_1_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5],
+                        buf1[6], bit);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[9]);
+    buf1[9] = _mm_sub_epi32(buf0[8], buf0[9]);
+    buf1[10] = _mm_sub_epi32(buf0[11], buf0[10]);
+    buf1[11] = _mm_add_epi32(buf0[11], buf0[10]);
+    buf1[12] = _mm_add_epi32(buf0[12], buf0[13]);
+    buf1[13] = _mm_sub_epi32(buf0[12], buf0[13]);
+    buf1[14] = _mm_sub_epi32(buf0[15], buf0[14]);
+    buf1[15] = _mm_add_epi32(buf0[15], buf0[14]);
+
+    // stage 6
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    buf0[6] = buf1[6];
+    buf0[7] = buf1[7];
+    btf_32_sse4_1_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8],
+                        buf0[15], bit);
+    btf_32_sse4_1_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9],
+                        buf0[14], bit);
+    btf_32_sse4_1_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
+    btf_32_sse4_1_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11],
+                        buf0[12], bit);
+
+    // stage 7
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[8];
+    buf1[2] = buf0[4];
+    buf1[3] = buf0[12];
+    buf1[4] = buf0[2];
+    buf1[5] = buf0[10];
+    buf1[6] = buf0[6];
+    buf1[7] = buf0[14];
+    buf1[8] = buf0[1];
+    buf1[9] = buf0[9];
+    buf1[10] = buf0[5];
+    buf1[11] = buf0[13];
+    buf1[12] = buf0[3];
+    buf1[13] = buf0[11];
+    buf1[14] = buf0[7];
+    buf1[15] = buf0[15];
+
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+    output[4 * col_num + col] = buf1[4];
+    output[5 * col_num + col] = buf1[5];
+    output[6 * col_num + col] = buf1[6];
+    output[7 * col_num + col] = buf1[7];
+    output[8 * col_num + col] = buf1[8];
+    output[9 * col_num + col] = buf1[9];
+    output[10 * col_num + col] = buf1[10];
+    output[11 * col_num + col] = buf1[11];
+    output[12 * col_num + col] = buf1[12];
+    output[13 * col_num + col] = buf1[13];
+    output[14 * col_num + col] = buf1[14];
+    output[15 * col_num + col] = buf1[15];
+  }
+}
+
+void vp10_fdct32_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 32;
+  const int num_per_128 = 4;
+  const int32_t* cospi;
+  __m128i buf0[32];
+  __m128i buf1[32];
+  int col_num = txfm_size / num_per_128;
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0;
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+    buf0[4] = input[4 * col_num + col];
+    buf0[5] = input[5 * col_num + col];
+    buf0[6] = input[6 * col_num + col];
+    buf0[7] = input[7 * col_num + col];
+    buf0[8] = input[8 * col_num + col];
+    buf0[9] = input[9 * col_num + col];
+    buf0[10] = input[10 * col_num + col];
+    buf0[11] = input[11 * col_num + col];
+    buf0[12] = input[12 * col_num + col];
+    buf0[13] = input[13 * col_num + col];
+    buf0[14] = input[14 * col_num + col];
+    buf0[15] = input[15 * col_num + col];
+    buf0[16] = input[16 * col_num + col];
+    buf0[17] = input[17 * col_num + col];
+    buf0[18] = input[18 * col_num + col];
+    buf0[19] = input[19 * col_num + col];
+    buf0[20] = input[20 * col_num + col];
+    buf0[21] = input[21 * col_num + col];
+    buf0[22] = input[22 * col_num + col];
+    buf0[23] = input[23 * col_num + col];
+    buf0[24] = input[24 * col_num + col];
+    buf0[25] = input[25 * col_num + col];
+    buf0[26] = input[26 * col_num + col];
+    buf0[27] = input[27 * col_num + col];
+    buf0[28] = input[28 * col_num + col];
+    buf0[29] = input[29 * col_num + col];
+    buf0[30] = input[30 * col_num + col];
+    buf0[31] = input[31 * col_num + col];
+
+    // stage 1
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[31]);
+    buf1[31] = _mm_sub_epi32(buf0[0], buf0[31]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[30]);
+    buf1[30] = _mm_sub_epi32(buf0[1], buf0[30]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[29]);
+    buf1[29] = _mm_sub_epi32(buf0[2], buf0[29]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[28]);
+    buf1[28] = _mm_sub_epi32(buf0[3], buf0[28]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[27]);
+    buf1[27] = _mm_sub_epi32(buf0[4], buf0[27]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[26]);
+    buf1[26] = _mm_sub_epi32(buf0[5], buf0[26]);
+    buf1[6] = _mm_add_epi32(buf0[6], buf0[25]);
+    buf1[25] = _mm_sub_epi32(buf0[6], buf0[25]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[24]);
+    buf1[24] = _mm_sub_epi32(buf0[7], buf0[24]);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[23]);
+    buf1[23] = _mm_sub_epi32(buf0[8], buf0[23]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[22]);
+    buf1[22] = _mm_sub_epi32(buf0[9], buf0[22]);
+    buf1[10] = _mm_add_epi32(buf0[10], buf0[21]);
+    buf1[21] = _mm_sub_epi32(buf0[10], buf0[21]);
+    buf1[11] = _mm_add_epi32(buf0[11], buf0[20]);
+    buf1[20] = _mm_sub_epi32(buf0[11], buf0[20]);
+    buf1[12] = _mm_add_epi32(buf0[12], buf0[19]);
+    buf1[19] = _mm_sub_epi32(buf0[12], buf0[19]);
+    buf1[13] = _mm_add_epi32(buf0[13], buf0[18]);
+    buf1[18] = _mm_sub_epi32(buf0[13], buf0[18]);
+    buf1[14] = _mm_add_epi32(buf0[14], buf0[17]);
+    buf1[17] = _mm_sub_epi32(buf0[14], buf0[17]);
+    buf1[15] = _mm_add_epi32(buf0[15], buf0[16]);
+    buf1[16] = _mm_sub_epi32(buf0[15], buf0[16]);
+
+    // stage 2
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = _mm_add_epi32(buf1[0], buf1[15]);
+    buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]);
+    buf0[1] = _mm_add_epi32(buf1[1], buf1[14]);
+    buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]);
+    buf0[2] = _mm_add_epi32(buf1[2], buf1[13]);
+    buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]);
+    buf0[3] = _mm_add_epi32(buf1[3], buf1[12]);
+    buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]);
+    buf0[4] = _mm_add_epi32(buf1[4], buf1[11]);
+    buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]);
+    buf0[5] = _mm_add_epi32(buf1[5], buf1[10]);
+    buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]);
+    buf0[6] = _mm_add_epi32(buf1[6], buf1[9]);
+    buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]);
+    buf0[7] = _mm_add_epi32(buf1[7], buf1[8]);
+    buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]);
+    buf0[16] = buf1[16];
+    buf0[17] = buf1[17];
+    buf0[18] = buf1[18];
+    buf0[19] = buf1[19];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+                        buf0[27], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+                        buf0[26], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+                        buf0[25], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+                        buf0[24], bit);
+    buf0[28] = buf1[28];
+    buf0[29] = buf1[29];
+    buf0[30] = buf1[30];
+    buf0[31] = buf1[31];
+
+    // stage 3
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
+    buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
+    buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
+    buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
+    buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
+    buf1[8] = buf0[8];
+    buf1[9] = buf0[9];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+                        buf1[13], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+                        buf1[12], bit);
+    buf1[14] = buf0[14];
+    buf1[15] = buf0[15];
+    buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
+    buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]);
+    buf1[17] = _mm_add_epi32(buf0[17], buf0[22]);
+    buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]);
+    buf1[18] = _mm_add_epi32(buf0[18], buf0[21]);
+    buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]);
+    buf1[19] = _mm_add_epi32(buf0[19], buf0[20]);
+    buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]);
+    buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]);
+    buf1[31] = _mm_add_epi32(buf0[31], buf0[24]);
+    buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]);
+    buf1[30] = _mm_add_epi32(buf0[30], buf0[25]);
+    buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]);
+    buf1[29] = _mm_add_epi32(buf0[29], buf0[26]);
+    buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]);
+    buf1[28] = _mm_add_epi32(buf0[28], buf0[27]);
+
+    // stage 4
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
+    buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
+    buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
+    buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
+    buf0[4] = buf1[4];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
+                        buf0[6], bit);
+    buf0[7] = buf1[7];
+    buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
+    buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
+    buf0[9] = _mm_add_epi32(buf1[9], buf1[10]);
+    buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]);
+    buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]);
+    buf0[15] = _mm_add_epi32(buf1[15], buf1[12]);
+    buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]);
+    buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
+    buf0[16] = buf1[16];
+    buf0[17] = buf1[17];
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+                        buf0[29], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+                        buf0[28], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+                        buf0[27], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+                        buf0[26], bit);
+    buf0[22] = buf1[22];
+    buf0[23] = buf1[23];
+    buf0[24] = buf1[24];
+    buf0[25] = buf1[25];
+    buf0[30] = buf1[30];
+    buf0[31] = buf1[31];
+
+    // stage 5
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
+                        buf1[1], bit);
+    btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
+                        buf1[3], bit);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
+    buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
+    buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
+    buf1[8] = buf0[8];
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
+                        buf1[14], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+                        buf1[13], bit);
+    buf1[11] = buf0[11];
+    buf1[12] = buf0[12];
+    buf1[15] = buf0[15];
+    buf1[16] = _mm_add_epi32(buf0[16], buf0[19]);
+    buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]);
+    buf1[17] = _mm_add_epi32(buf0[17], buf0[18]);
+    buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]);
+    buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]);
+    buf1[23] = _mm_add_epi32(buf0[23], buf0[20]);
+    buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]);
+    buf1[22] = _mm_add_epi32(buf0[22], buf0[21]);
+    buf1[24] = _mm_add_epi32(buf0[24], buf0[27]);
+    buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]);
+    buf1[25] = _mm_add_epi32(buf0[25], buf0[26]);
+    buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]);
+    buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]);
+    buf1[31] = _mm_add_epi32(buf0[31], buf0[28]);
+    buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]);
+    buf1[30] = _mm_add_epi32(buf0[30], buf0[29]);
+
+    // stage 6
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+                        bit);
+    btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
+                        buf0[6], bit);
+    buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
+    buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
+    buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
+    buf0[11] = _mm_add_epi32(buf1[11], buf1[10]);
+    buf0[12] = _mm_add_epi32(buf1[12], buf1[13]);
+    buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]);
+    buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
+    buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
+    buf0[16] = buf1[16];
+    btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+                        buf0[30], bit);
+    btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+                        buf0[29], bit);
+    buf0[19] = buf1[19];
+    buf0[20] = buf1[20];
+    btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+                        buf0[26], bit);
+    btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+                        buf0[25], bit);
+    buf0[23] = buf1[23];
+    buf0[24] = buf1[24];
+    buf0[27] = buf1[27];
+    buf0[28] = buf1[28];
+    buf0[31] = buf1[31];
+
+    // stage 7
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[1];
+    buf1[2] = buf0[2];
+    buf1[3] = buf0[3];
+    buf1[4] = buf0[4];
+    buf1[5] = buf0[5];
+    buf1[6] = buf0[6];
+    buf1[7] = buf0[7];
+    btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8],
+                        buf1[15], bit);
+    btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
+                        buf1[14], bit);
+    btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+                        buf1[13], bit);
+    btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+                        buf1[12], bit);
+    buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
+    buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
+    buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
+    buf1[19] = _mm_add_epi32(buf0[19], buf0[18]);
+    buf1[20] = _mm_add_epi32(buf0[20], buf0[21]);
+    buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]);
+    buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]);
+    buf1[23] = _mm_add_epi32(buf0[23], buf0[22]);
+    buf1[24] = _mm_add_epi32(buf0[24], buf0[25]);
+    buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]);
+    buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]);
+    buf1[27] = _mm_add_epi32(buf0[27], buf0[26]);
+    buf1[28] = _mm_add_epi32(buf0[28], buf0[29]);
+    buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]);
+    buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]);
+    buf1[31] = _mm_add_epi32(buf0[31], buf0[30]);
+
+    // stage 8
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    buf0[6] = buf1[6];
+    buf0[7] = buf1[7];
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    buf0[10] = buf1[10];
+    buf0[11] = buf1[11];
+    buf0[12] = buf1[12];
+    buf0[13] = buf1[13];
+    buf0[14] = buf1[14];
+    buf0[15] = buf1[15];
+    btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
+                        buf0[31], bit);
+    btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+                        buf0[30], bit);
+    btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+                        buf0[29], bit);
+    btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+                        buf0[28], bit);
+    btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+                        buf0[27], bit);
+    btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+                        buf0[26], bit);
+    btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+                        buf0[25], bit);
+    btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
+                        buf0[24], bit);
+
+    // stage 9
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[16];
+    buf1[2] = buf0[8];
+    buf1[3] = buf0[24];
+    buf1[4] = buf0[4];
+    buf1[5] = buf0[20];
+    buf1[6] = buf0[12];
+    buf1[7] = buf0[28];
+    buf1[8] = buf0[2];
+    buf1[9] = buf0[18];
+    buf1[10] = buf0[10];
+    buf1[11] = buf0[26];
+    buf1[12] = buf0[6];
+    buf1[13] = buf0[22];
+    buf1[14] = buf0[14];
+    buf1[15] = buf0[30];
+    buf1[16] = buf0[1];
+    buf1[17] = buf0[17];
+    buf1[18] = buf0[9];
+    buf1[19] = buf0[25];
+    buf1[20] = buf0[5];
+    buf1[21] = buf0[21];
+    buf1[22] = buf0[13];
+    buf1[23] = buf0[29];
+    buf1[24] = buf0[3];
+    buf1[25] = buf0[19];
+    buf1[26] = buf0[11];
+    buf1[27] = buf0[27];
+    buf1[28] = buf0[7];
+    buf1[29] = buf0[23];
+    buf1[30] = buf0[15];
+    buf1[31] = buf0[31];
+
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+    output[4 * col_num + col] = buf1[4];
+    output[5 * col_num + col] = buf1[5];
+    output[6 * col_num + col] = buf1[6];
+    output[7 * col_num + col] = buf1[7];
+    output[8 * col_num + col] = buf1[8];
+    output[9 * col_num + col] = buf1[9];
+    output[10 * col_num + col] = buf1[10];
+    output[11 * col_num + col] = buf1[11];
+    output[12 * col_num + col] = buf1[12];
+    output[13 * col_num + col] = buf1[13];
+    output[14 * col_num + col] = buf1[14];
+    output[15 * col_num + col] = buf1[15];
+    output[16 * col_num + col] = buf1[16];
+    output[17 * col_num + col] = buf1[17];
+    output[18 * col_num + col] = buf1[18];
+    output[19 * col_num + col] = buf1[19];
+    output[20 * col_num + col] = buf1[20];
+    output[21 * col_num + col] = buf1[21];
+    output[22 * col_num + col] = buf1[22];
+    output[23 * col_num + col] = buf1[23];
+    output[24 * col_num + col] = buf1[24];
+    output[25 * col_num + col] = buf1[25];
+    output[26 * col_num + col] = buf1[26];
+    output[27 * col_num + col] = buf1[27];
+    output[28 * col_num + col] = buf1[28];
+    output[29 * col_num + col] = buf1[29];
+    output[30 * col_num + col] = buf1[30];
+    output[31 * col_num + col] = buf1[31];
+  }
+}
+
+// Forward 4-point ADST (1-D column transform), SSE4.1 version.
+// Each __m128i packs 4 int32 lanes, so one pass over `col` transforms 4
+// independent columns at once.  `cos_bit[stage]` gives the per-stage rounding
+// bit and `cospi_arr[bit - cos_bit_min]` the matching quantized cosine table.
+// `stage_range` is accepted only for signature parity with the other 1-D
+// transform kernels and is ignored here.
+void vp10_fadst4_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 4;
+  const int num_per_128 = 4;  // int32 lanes per 128-bit register
+  const int32_t* cospi;
+  __m128i buf0[4];
+  __m128i buf1[4];
+  int col_num = txfm_size / num_per_128;  // 128-bit column groups per row
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0: load one group of 4 columns.
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+
+    // stage 1: input permutation for the ADST network.
+    // NOTE(review): bit/cospi are refreshed here but unused in this
+    // permutation-only stage — harmless dead work.
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[3];
+    buf1[1] = buf0[0];
+    buf1[2] = buf0[1];
+    buf1[3] = buf0[2];
+
+    // stage 2: paired butterflies with (cospi[8], cospi[56]) and
+    // (cospi[40], cospi[24]) weights; btf_32_sse4_1_type0 presumably applies
+    // a 2x2 rotation with rounding by `bit` — its definition is not shown
+    // here, confirm against the helper.
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
+                        bit);
+    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+
+    // stage 3: add/sub butterflies combining the two pairs.
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+    buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+    buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+
+    // stage 4: pass lanes 0-1 through; rotate lanes 2-3 by cospi[32].
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+
+    // stage 5: output permutation; _mm_sub_epi32(0, x) negates x, giving the
+    // alternating sign pattern of the ADST output.
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
+    buf1[2] = buf0[3];
+    buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
+
+    // Store this column group back in row order.
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+  }
+}
+
+// Forward 8-point ADST (1-D column transform), SSE4.1 version.
+// Each __m128i packs 4 int32 lanes, so col_num = 8/4 = 2 register columns are
+// processed per row.  `cos_bit[stage]` selects the rounding precision of each
+// stage and `cospi_arr[bit - cos_bit_min]` the matching cosine table.
+// `stage_range` is unused (kept for interface parity with the C kernels).
+void vp10_fadst8_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 8;
+  const int num_per_128 = 4;  // int32 lanes per 128-bit register
+  const int32_t* cospi;
+  __m128i buf0[8];
+  __m128i buf1[8];
+  int col_num = txfm_size / num_per_128;  // 128-bit column groups per row
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0: load one group of 4 columns.
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+    buf0[4] = input[4 * col_num + col];
+    buf0[5] = input[5 * col_num + col];
+    buf0[6] = input[6 * col_num + col];
+    buf0[7] = input[7 * col_num + col];
+
+    // stage 1: input permutation for the ADST network.
+    // NOTE(review): bit/cospi are loaded here but unused in this
+    // permutation-only stage — harmless dead work.
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[7];
+    buf1[1] = buf0[0];
+    buf1[2] = buf0[5];
+    buf1[3] = buf0[2];
+    buf1[4] = buf0[3];
+    buf1[5] = buf0[4];
+    buf1[6] = buf0[1];
+    buf1[7] = buf0[6];
+
+    // stage 2: four weighted butterflies; cospi index pairs sum to 64
+    // (4/60, 20/44, 36/28, 52/12) as in the scalar ADST8.
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[0], buf1[1], buf0[0], buf0[1],
+                        bit);
+    btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+    btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[4], buf1[5], buf0[4],
+                        buf0[5], bit);
+    btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+
+    // stage 3: add/sub butterflies across the two halves (i, i+4).
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
+    buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
+    buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
+    buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
+    buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
+
+    // stage 4: pass 0-3 through; rotate (4,5) and (6,7) with +/-(16,48)
+    // weights.
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
+                        buf0[5], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+
+    // stage 5: add/sub butterflies within each half (i, i+2).
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+    buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+    buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
+    buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
+    buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
+
+    // stage 6: final cospi[32] rotations on lanes (2,3) and (6,7).
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+
+    // stage 7: output permutation; _mm_sub_epi32(0, x) negates x, producing
+    // the alternating-sign ADST output order.
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
+    buf1[2] = buf0[6];
+    buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
+    buf1[4] = buf0[3];
+    buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
+    buf1[6] = buf0[5];
+    buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
+
+    // Store this column group back in row order.
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+    output[4 * col_num + col] = buf1[4];
+    output[5 * col_num + col] = buf1[5];
+    output[6 * col_num + col] = buf1[6];
+    output[7 * col_num + col] = buf1[7];
+  }
+}
+
+// Forward 16-point ADST (1-D column transform), SSE4.1 version.
+// Each __m128i packs 4 int32 lanes, so col_num = 16/4 = 4 register columns
+// are processed per row.  `cos_bit[stage]` selects the per-stage rounding
+// precision and `cospi_arr[bit - cos_bit_min]` the matching cosine table.
+// `stage_range` is unused (kept for interface parity with the C kernels).
+void vp10_fadst16_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 16;
+  const int num_per_128 = 4;  // int32 lanes per 128-bit register
+  const int32_t* cospi;
+  __m128i buf0[16];
+  __m128i buf1[16];
+  int col_num = txfm_size / num_per_128;  // 128-bit column groups per row
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0: load one group of 4 columns.
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+    buf0[4] = input[4 * col_num + col];
+    buf0[5] = input[5 * col_num + col];
+    buf0[6] = input[6 * col_num + col];
+    buf0[7] = input[7 * col_num + col];
+    buf0[8] = input[8 * col_num + col];
+    buf0[9] = input[9 * col_num + col];
+    buf0[10] = input[10 * col_num + col];
+    buf0[11] = input[11 * col_num + col];
+    buf0[12] = input[12 * col_num + col];
+    buf0[13] = input[13 * col_num + col];
+    buf0[14] = input[14 * col_num + col];
+    buf0[15] = input[15 * col_num + col];
+
+    // stage 1: input permutation for the ADST network.
+    // NOTE(review): bit/cospi are loaded here but unused in this
+    // permutation-only stage — harmless dead work.
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[15];
+    buf1[1] = buf0[0];
+    buf1[2] = buf0[13];
+    buf1[3] = buf0[2];
+    buf1[4] = buf0[11];
+    buf1[5] = buf0[4];
+    buf1[6] = buf0[9];
+    buf1[7] = buf0[6];
+    buf1[8] = buf0[7];
+    buf1[9] = buf0[8];
+    buf1[10] = buf0[5];
+    buf1[11] = buf0[10];
+    buf1[12] = buf0[3];
+    buf1[13] = buf0[12];
+    buf1[14] = buf0[1];
+    buf1[15] = buf0[14];
+
+    // stage 2: eight weighted butterflies; cospi index pairs sum to 64
+    // (2/62, 10/54, ..., 58/6) as in the scalar ADST16.
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[2], cospi[62], buf1[0], buf1[1], buf0[0], buf0[1],
+                        bit);
+    btf_32_sse4_1_type0(cospi[10], cospi[54], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+    btf_32_sse4_1_type0(cospi[18], cospi[46], buf1[4], buf1[5], buf0[4],
+                        buf0[5], bit);
+    btf_32_sse4_1_type0(cospi[26], cospi[38], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+    btf_32_sse4_1_type0(cospi[34], cospi[30], buf1[8], buf1[9], buf0[8],
+                        buf0[9], bit);
+    btf_32_sse4_1_type0(cospi[42], cospi[22], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
+    btf_32_sse4_1_type0(cospi[50], cospi[14], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(cospi[58], cospi[6], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+
+    // stage 3: add/sub butterflies across the two halves (i, i+8).
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[8]);
+    buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[9]);
+    buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[10]);
+    buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[11]);
+    buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[12]);
+    buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[13]);
+    buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]);
+    buf1[6] = _mm_add_epi32(buf0[6], buf0[14]);
+    buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[15]);
+    buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]);
+
+    // stage 4: pass 0-7 through; rotate 8-15 with +/-(8,56) and +/-(40,24)
+    // weights.
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    buf0[6] = buf1[6];
+    buf0[7] = buf1[7];
+    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
+                        bit);
+    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
+    btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+
+    // stage 5: add/sub butterflies within each half (i, i+4).
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
+    buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
+    buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
+    buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
+    buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[12]);
+    buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[13]);
+    buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]);
+    buf1[10] = _mm_add_epi32(buf0[10], buf0[14]);
+    buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]);
+    buf1[11] = _mm_add_epi32(buf0[11], buf0[15]);
+    buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]);
+
+    // stage 6: pass-throughs plus +/-(16,48) rotations in each quarter.
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
+                        buf0[5], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    buf0[10] = buf1[10];
+    buf0[11] = buf1[11];
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+
+    // stage 7: add/sub butterflies within each quarter (i, i+2).
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+    buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+    buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
+    buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
+    buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[10]);
+    buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[11]);
+    buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]);
+    buf1[12] = _mm_add_epi32(buf0[12], buf0[14]);
+    buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]);
+    buf1[13] = _mm_add_epi32(buf0[13], buf0[15]);
+    buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]);
+
+    // stage 8: final cospi[32] rotations on every odd lane pair.
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
+    buf0[12] = buf1[12];
+    buf0[13] = buf1[13];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+
+    // stage 9: output permutation; _mm_sub_epi32(0, x) negates x, producing
+    // the alternating-sign ADST output order.
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]);
+    buf1[2] = buf0[12];
+    buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
+    buf1[4] = buf0[6];
+    buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]);
+    buf1[6] = buf0[10];
+    buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
+    buf1[8] = buf0[3];
+    buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]);
+    buf1[10] = buf0[15];
+    buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
+    buf1[12] = buf0[5];
+    buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]);
+    buf1[14] = buf0[9];
+    buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
+
+    // Store this column group back in row order.
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+    output[4 * col_num + col] = buf1[4];
+    output[5 * col_num + col] = buf1[5];
+    output[6 * col_num + col] = buf1[6];
+    output[7 * col_num + col] = buf1[7];
+    output[8 * col_num + col] = buf1[8];
+    output[9 * col_num + col] = buf1[9];
+    output[10 * col_num + col] = buf1[10];
+    output[11 * col_num + col] = buf1[11];
+    output[12 * col_num + col] = buf1[12];
+    output[13 * col_num + col] = buf1[13];
+    output[14 * col_num + col] = buf1[14];
+    output[15 * col_num + col] = buf1[15];
+  }
+}
+
+void vp10_fadst32_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 32;
+  const int num_per_128 = 4;
+  const int32_t* cospi;
+  __m128i buf0[32];
+  __m128i buf1[32];
+  int col_num = txfm_size / num_per_128;
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0;
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+    buf0[4] = input[4 * col_num + col];
+    buf0[5] = input[5 * col_num + col];
+    buf0[6] = input[6 * col_num + col];
+    buf0[7] = input[7 * col_num + col];
+    buf0[8] = input[8 * col_num + col];
+    buf0[9] = input[9 * col_num + col];
+    buf0[10] = input[10 * col_num + col];
+    buf0[11] = input[11 * col_num + col];
+    buf0[12] = input[12 * col_num + col];
+    buf0[13] = input[13 * col_num + col];
+    buf0[14] = input[14 * col_num + col];
+    buf0[15] = input[15 * col_num + col];
+    buf0[16] = input[16 * col_num + col];
+    buf0[17] = input[17 * col_num + col];
+    buf0[18] = input[18 * col_num + col];
+    buf0[19] = input[19 * col_num + col];
+    buf0[20] = input[20 * col_num + col];
+    buf0[21] = input[21 * col_num + col];
+    buf0[22] = input[22 * col_num + col];
+    buf0[23] = input[23 * col_num + col];
+    buf0[24] = input[24 * col_num + col];
+    buf0[25] = input[25 * col_num + col];
+    buf0[26] = input[26 * col_num + col];
+    buf0[27] = input[27 * col_num + col];
+    buf0[28] = input[28 * col_num + col];
+    buf0[29] = input[29 * col_num + col];
+    buf0[30] = input[30 * col_num + col];
+    buf0[31] = input[31 * col_num + col];
+
+    // stage 1
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[31];
+    buf1[1] = buf0[0];
+    buf1[2] = buf0[29];
+    buf1[3] = buf0[2];
+    buf1[4] = buf0[27];
+    buf1[5] = buf0[4];
+    buf1[6] = buf0[25];
+    buf1[7] = buf0[6];
+    buf1[8] = buf0[23];
+    buf1[9] = buf0[8];
+    buf1[10] = buf0[21];
+    buf1[11] = buf0[10];
+    buf1[12] = buf0[19];
+    buf1[13] = buf0[12];
+    buf1[14] = buf0[17];
+    buf1[15] = buf0[14];
+    buf1[16] = buf0[15];
+    buf1[17] = buf0[16];
+    buf1[18] = buf0[13];
+    buf1[19] = buf0[18];
+    buf1[20] = buf0[11];
+    buf1[21] = buf0[20];
+    buf1[22] = buf0[9];
+    buf1[23] = buf0[22];
+    buf1[24] = buf0[7];
+    buf1[25] = buf0[24];
+    buf1[26] = buf0[5];
+    buf1[27] = buf0[26];
+    buf1[28] = buf0[3];
+    buf1[29] = buf0[28];
+    buf1[30] = buf0[1];
+    buf1[31] = buf0[30];
+
+    // stage 2
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1],
+                        bit);
+    btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3],
+                        bit);
+    btf_32_sse4_1_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5],
+                        bit);
+    btf_32_sse4_1_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+    btf_32_sse4_1_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8],
+                        buf0[9], bit);
+    btf_32_sse4_1_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
+    btf_32_sse4_1_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+    btf_32_sse4_1_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16],
+                        buf0[17], bit);
+    btf_32_sse4_1_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18],
+                        buf0[19], bit);
+    btf_32_sse4_1_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20],
+                        buf0[21], bit);
+    btf_32_sse4_1_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22],
+                        buf0[23], bit);
+    btf_32_sse4_1_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24],
+                        buf0[25], bit);
+    btf_32_sse4_1_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26],
+                        buf0[27], bit);
+    btf_32_sse4_1_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28],
+                        buf0[29], bit);
+    btf_32_sse4_1_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30],
+                        buf0[31], bit);
+
+    // stage 3
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[16]);
+    buf1[16] = _mm_sub_epi32(buf0[0], buf0[16]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[17]);
+    buf1[17] = _mm_sub_epi32(buf0[1], buf0[17]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[18]);
+    buf1[18] = _mm_sub_epi32(buf0[2], buf0[18]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[19]);
+    buf1[19] = _mm_sub_epi32(buf0[3], buf0[19]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[20]);
+    buf1[20] = _mm_sub_epi32(buf0[4], buf0[20]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[21]);
+    buf1[21] = _mm_sub_epi32(buf0[5], buf0[21]);
+    buf1[6] = _mm_add_epi32(buf0[6], buf0[22]);
+    buf1[22] = _mm_sub_epi32(buf0[6], buf0[22]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[23]);
+    buf1[23] = _mm_sub_epi32(buf0[7], buf0[23]);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[24]);
+    buf1[24] = _mm_sub_epi32(buf0[8], buf0[24]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[25]);
+    buf1[25] = _mm_sub_epi32(buf0[9], buf0[25]);
+    buf1[10] = _mm_add_epi32(buf0[10], buf0[26]);
+    buf1[26] = _mm_sub_epi32(buf0[10], buf0[26]);
+    buf1[11] = _mm_add_epi32(buf0[11], buf0[27]);
+    buf1[27] = _mm_sub_epi32(buf0[11], buf0[27]);
+    buf1[12] = _mm_add_epi32(buf0[12], buf0[28]);
+    buf1[28] = _mm_sub_epi32(buf0[12], buf0[28]);
+    buf1[13] = _mm_add_epi32(buf0[13], buf0[29]);
+    buf1[29] = _mm_sub_epi32(buf0[13], buf0[29]);
+    buf1[14] = _mm_add_epi32(buf0[14], buf0[30]);
+    buf1[30] = _mm_sub_epi32(buf0[14], buf0[30]);
+    buf1[15] = _mm_add_epi32(buf0[15], buf0[31]);
+    buf1[31] = _mm_sub_epi32(buf0[15], buf0[31]);
+
+    // stage 4
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    buf0[6] = buf1[6];
+    buf0[7] = buf1[7];
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    buf0[10] = buf1[10];
+    buf0[11] = buf1[11];
+    buf0[12] = buf1[12];
+    buf0[13] = buf1[13];
+    buf0[14] = buf1[14];
+    buf0[15] = buf1[15];
+    btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16],
+                        buf0[17], bit);
+    btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18],
+                        buf0[19], bit);
+    btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20],
+                        buf0[21], bit);
+    btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22],
+                        buf0[23], bit);
+    btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24],
+                        buf0[25], bit);
+    btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26],
+                        buf0[27], bit);
+    btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28],
+                        buf0[29], bit);
+    btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30],
+                        buf0[31], bit);
+
+    // stage 5
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[8]);
+    buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[9]);
+    buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[10]);
+    buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[11]);
+    buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[12]);
+    buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[13]);
+    buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]);
+    buf1[6] = _mm_add_epi32(buf0[6], buf0[14]);
+    buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[15]);
+    buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]);
+    buf1[16] = _mm_add_epi32(buf0[16], buf0[24]);
+    buf1[24] = _mm_sub_epi32(buf0[16], buf0[24]);
+    buf1[17] = _mm_add_epi32(buf0[17], buf0[25]);
+    buf1[25] = _mm_sub_epi32(buf0[17], buf0[25]);
+    buf1[18] = _mm_add_epi32(buf0[18], buf0[26]);
+    buf1[26] = _mm_sub_epi32(buf0[18], buf0[26]);
+    buf1[19] = _mm_add_epi32(buf0[19], buf0[27]);
+    buf1[27] = _mm_sub_epi32(buf0[19], buf0[27]);
+    buf1[20] = _mm_add_epi32(buf0[20], buf0[28]);
+    buf1[28] = _mm_sub_epi32(buf0[20], buf0[28]);
+    buf1[21] = _mm_add_epi32(buf0[21], buf0[29]);
+    buf1[29] = _mm_sub_epi32(buf0[21], buf0[29]);
+    buf1[22] = _mm_add_epi32(buf0[22], buf0[30]);
+    buf1[30] = _mm_sub_epi32(buf0[22], buf0[30]);
+    buf1[23] = _mm_add_epi32(buf0[23], buf0[31]);
+    buf1[31] = _mm_sub_epi32(buf0[23], buf0[31]);
+
+    // stage 6
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    buf0[6] = buf1[6];
+    buf0[7] = buf1[7];
+    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
+                        bit);
+    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
+    btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+    buf0[16] = buf1[16];
+    buf0[17] = buf1[17];
+    buf0[18] = buf1[18];
+    buf0[19] = buf1[19];
+    buf0[20] = buf1[20];
+    buf0[21] = buf1[21];
+    buf0[22] = buf1[22];
+    buf0[23] = buf1[23];
+    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24],
+                        buf0[25], bit);
+    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26],
+                        buf0[27], bit);
+    btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28],
+                        buf0[29], bit);
+    btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30],
+                        buf0[31], bit);
+
+    // stage 7
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
+    buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
+    buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
+    buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
+    buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[12]);
+    buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[13]);
+    buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]);
+    buf1[10] = _mm_add_epi32(buf0[10], buf0[14]);
+    buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]);
+    buf1[11] = _mm_add_epi32(buf0[11], buf0[15]);
+    buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]);
+    buf1[16] = _mm_add_epi32(buf0[16], buf0[20]);
+    buf1[20] = _mm_sub_epi32(buf0[16], buf0[20]);
+    buf1[17] = _mm_add_epi32(buf0[17], buf0[21]);
+    buf1[21] = _mm_sub_epi32(buf0[17], buf0[21]);
+    buf1[18] = _mm_add_epi32(buf0[18], buf0[22]);
+    buf1[22] = _mm_sub_epi32(buf0[18], buf0[22]);
+    buf1[19] = _mm_add_epi32(buf0[19], buf0[23]);
+    buf1[23] = _mm_sub_epi32(buf0[19], buf0[23]);
+    buf1[24] = _mm_add_epi32(buf0[24], buf0[28]);
+    buf1[28] = _mm_sub_epi32(buf0[24], buf0[28]);
+    buf1[25] = _mm_add_epi32(buf0[25], buf0[29]);
+    buf1[29] = _mm_sub_epi32(buf0[25], buf0[29]);
+    buf1[26] = _mm_add_epi32(buf0[26], buf0[30]);
+    buf1[30] = _mm_sub_epi32(buf0[26], buf0[30]);
+    buf1[27] = _mm_add_epi32(buf0[27], buf0[31]);
+    buf1[31] = _mm_sub_epi32(buf0[27], buf0[31]);
+
+    // stage 8
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
+                        buf0[5], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    buf0[10] = buf1[10];
+    buf0[11] = buf1[11];
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+    buf0[16] = buf1[16];
+    buf0[17] = buf1[17];
+    buf0[18] = buf1[18];
+    buf0[19] = buf1[19];
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20],
+                        buf0[21], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22],
+                        buf0[23], bit);
+    buf0[24] = buf1[24];
+    buf0[25] = buf1[25];
+    buf0[26] = buf1[26];
+    buf0[27] = buf1[27];
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28],
+                        buf0[29], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30],
+                        buf0[31], bit);
+
+    // stage 9
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+    buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+    buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
+    buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
+    buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[10]);
+    buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[11]);
+    buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]);
+    buf1[12] = _mm_add_epi32(buf0[12], buf0[14]);
+    buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]);
+    buf1[13] = _mm_add_epi32(buf0[13], buf0[15]);
+    buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]);
+    buf1[16] = _mm_add_epi32(buf0[16], buf0[18]);
+    buf1[18] = _mm_sub_epi32(buf0[16], buf0[18]);
+    buf1[17] = _mm_add_epi32(buf0[17], buf0[19]);
+    buf1[19] = _mm_sub_epi32(buf0[17], buf0[19]);
+    buf1[20] = _mm_add_epi32(buf0[20], buf0[22]);
+    buf1[22] = _mm_sub_epi32(buf0[20], buf0[22]);
+    buf1[21] = _mm_add_epi32(buf0[21], buf0[23]);
+    buf1[23] = _mm_sub_epi32(buf0[21], buf0[23]);
+    buf1[24] = _mm_add_epi32(buf0[24], buf0[26]);
+    buf1[26] = _mm_sub_epi32(buf0[24], buf0[26]);
+    buf1[25] = _mm_add_epi32(buf0[25], buf0[27]);
+    buf1[27] = _mm_sub_epi32(buf0[25], buf0[27]);
+    buf1[28] = _mm_add_epi32(buf0[28], buf0[30]);
+    buf1[30] = _mm_sub_epi32(buf0[28], buf0[30]);
+    buf1[29] = _mm_add_epi32(buf0[29], buf0[31]);
+    buf1[31] = _mm_sub_epi32(buf0[29], buf0[31]);
+
+    // stage 10
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
+    buf0[12] = buf1[12];
+    buf0[13] = buf1[13];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+    buf0[16] = buf1[16];
+    buf0[17] = buf1[17];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18],
+                        buf0[19], bit);
+    buf0[20] = buf1[20];
+    buf0[21] = buf1[21];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22],
+                        buf0[23], bit);
+    buf0[24] = buf1[24];
+    buf0[25] = buf1[25];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26],
+                        buf0[27], bit);
+    buf0[28] = buf1[28];
+    buf0[29] = buf1[29];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30],
+                        buf0[31], bit);
+
+    // stage 11
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[16]);
+    buf1[2] = buf0[24];
+    buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]);
+    buf1[4] = buf0[12];
+    buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[28]);
+    buf1[6] = buf0[20];
+    buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
+    buf1[8] = buf0[6];
+    buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[22]);
+    buf1[10] = buf0[30];
+    buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]);
+    buf1[12] = buf0[10];
+    buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[26]);
+    buf1[14] = buf0[18];
+    buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
+    buf1[16] = buf0[3];
+    buf1[17] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[19]);
+    buf1[18] = buf0[27];
+    buf1[19] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]);
+    buf1[20] = buf0[15];
+    buf1[21] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[31]);
+    buf1[22] = buf0[23];
+    buf1[23] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
+    buf1[24] = buf0[5];
+    buf1[25] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[21]);
+    buf1[26] = buf0[29];
+    buf1[27] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]);
+    buf1[28] = buf0[9];
+    buf1[29] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[25]);
+    buf1[30] = buf0[17];
+    buf1[31] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
+
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+    output[4 * col_num + col] = buf1[4];
+    output[5 * col_num + col] = buf1[5];
+    output[6 * col_num + col] = buf1[6];
+    output[7 * col_num + col] = buf1[7];
+    output[8 * col_num + col] = buf1[8];
+    output[9 * col_num + col] = buf1[9];
+    output[10 * col_num + col] = buf1[10];
+    output[11 * col_num + col] = buf1[11];
+    output[12 * col_num + col] = buf1[12];
+    output[13 * col_num + col] = buf1[13];
+    output[14 * col_num + col] = buf1[14];
+    output[15 * col_num + col] = buf1[15];
+    output[16 * col_num + col] = buf1[16];
+    output[17 * col_num + col] = buf1[17];
+    output[18 * col_num + col] = buf1[18];
+    output[19 * col_num + col] = buf1[19];
+    output[20 * col_num + col] = buf1[20];
+    output[21 * col_num + col] = buf1[21];
+    output[22 * col_num + col] = buf1[22];
+    output[23 * col_num + col] = buf1[23];
+    output[24 * col_num + col] = buf1[24];
+    output[25 * col_num + col] = buf1[25];
+    output[26 * col_num + col] = buf1[26];
+    output[27 * col_num + col] = buf1[27];
+    output[28 * col_num + col] = buf1[28];
+    output[29 * col_num + col] = buf1[29];
+    output[30 * col_num + col] = buf1[30];
+    output[31 * col_num + col] = buf1[31];
+  }
+}
+
+void vp10_fdct64_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 64;
+  const int num_per_128 = 4;
+  const int32_t* cospi;
+  __m128i buf0[64];
+  __m128i buf1[64];
+  int col_num = txfm_size / num_per_128;
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0;
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+    buf0[4] = input[4 * col_num + col];
+    buf0[5] = input[5 * col_num + col];
+    buf0[6] = input[6 * col_num + col];
+    buf0[7] = input[7 * col_num + col];
+    buf0[8] = input[8 * col_num + col];
+    buf0[9] = input[9 * col_num + col];
+    buf0[10] = input[10 * col_num + col];
+    buf0[11] = input[11 * col_num + col];
+    buf0[12] = input[12 * col_num + col];
+    buf0[13] = input[13 * col_num + col];
+    buf0[14] = input[14 * col_num + col];
+    buf0[15] = input[15 * col_num + col];
+    buf0[16] = input[16 * col_num + col];
+    buf0[17] = input[17 * col_num + col];
+    buf0[18] = input[18 * col_num + col];
+    buf0[19] = input[19 * col_num + col];
+    buf0[20] = input[20 * col_num + col];
+    buf0[21] = input[21 * col_num + col];
+    buf0[22] = input[22 * col_num + col];
+    buf0[23] = input[23 * col_num + col];
+    buf0[24] = input[24 * col_num + col];
+    buf0[25] = input[25 * col_num + col];
+    buf0[26] = input[26 * col_num + col];
+    buf0[27] = input[27 * col_num + col];
+    buf0[28] = input[28 * col_num + col];
+    buf0[29] = input[29 * col_num + col];
+    buf0[30] = input[30 * col_num + col];
+    buf0[31] = input[31 * col_num + col];
+    buf0[32] = input[32 * col_num + col];
+    buf0[33] = input[33 * col_num + col];
+    buf0[34] = input[34 * col_num + col];
+    buf0[35] = input[35 * col_num + col];
+    buf0[36] = input[36 * col_num + col];
+    buf0[37] = input[37 * col_num + col];
+    buf0[38] = input[38 * col_num + col];
+    buf0[39] = input[39 * col_num + col];
+    buf0[40] = input[40 * col_num + col];
+    buf0[41] = input[41 * col_num + col];
+    buf0[42] = input[42 * col_num + col];
+    buf0[43] = input[43 * col_num + col];
+    buf0[44] = input[44 * col_num + col];
+    buf0[45] = input[45 * col_num + col];
+    buf0[46] = input[46 * col_num + col];
+    buf0[47] = input[47 * col_num + col];
+    buf0[48] = input[48 * col_num + col];
+    buf0[49] = input[49 * col_num + col];
+    buf0[50] = input[50 * col_num + col];
+    buf0[51] = input[51 * col_num + col];
+    buf0[52] = input[52 * col_num + col];
+    buf0[53] = input[53 * col_num + col];
+    buf0[54] = input[54 * col_num + col];
+    buf0[55] = input[55 * col_num + col];
+    buf0[56] = input[56 * col_num + col];
+    buf0[57] = input[57 * col_num + col];
+    buf0[58] = input[58 * col_num + col];
+    buf0[59] = input[59 * col_num + col];
+    buf0[60] = input[60 * col_num + col];
+    buf0[61] = input[61 * col_num + col];
+    buf0[62] = input[62 * col_num + col];
+    buf0[63] = input[63 * col_num + col];
+
+    // stage 1
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[63]);
+    buf1[63] = _mm_sub_epi32(buf0[0], buf0[63]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[62]);
+    buf1[62] = _mm_sub_epi32(buf0[1], buf0[62]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[61]);
+    buf1[61] = _mm_sub_epi32(buf0[2], buf0[61]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[60]);
+    buf1[60] = _mm_sub_epi32(buf0[3], buf0[60]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[59]);
+    buf1[59] = _mm_sub_epi32(buf0[4], buf0[59]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[58]);
+    buf1[58] = _mm_sub_epi32(buf0[5], buf0[58]);
+    buf1[6] = _mm_add_epi32(buf0[6], buf0[57]);
+    buf1[57] = _mm_sub_epi32(buf0[6], buf0[57]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[56]);
+    buf1[56] = _mm_sub_epi32(buf0[7], buf0[56]);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[55]);
+    buf1[55] = _mm_sub_epi32(buf0[8], buf0[55]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[54]);
+    buf1[54] = _mm_sub_epi32(buf0[9], buf0[54]);
+    buf1[10] = _mm_add_epi32(buf0[10], buf0[53]);
+    buf1[53] = _mm_sub_epi32(buf0[10], buf0[53]);
+    buf1[11] = _mm_add_epi32(buf0[11], buf0[52]);
+    buf1[52] = _mm_sub_epi32(buf0[11], buf0[52]);
+    buf1[12] = _mm_add_epi32(buf0[12], buf0[51]);
+    buf1[51] = _mm_sub_epi32(buf0[12], buf0[51]);
+    buf1[13] = _mm_add_epi32(buf0[13], buf0[50]);
+    buf1[50] = _mm_sub_epi32(buf0[13], buf0[50]);
+    buf1[14] = _mm_add_epi32(buf0[14], buf0[49]);
+    buf1[49] = _mm_sub_epi32(buf0[14], buf0[49]);
+    buf1[15] = _mm_add_epi32(buf0[15], buf0[48]);
+    buf1[48] = _mm_sub_epi32(buf0[15], buf0[48]);
+    buf1[16] = _mm_add_epi32(buf0[16], buf0[47]);
+    buf1[47] = _mm_sub_epi32(buf0[16], buf0[47]);
+    buf1[17] = _mm_add_epi32(buf0[17], buf0[46]);
+    buf1[46] = _mm_sub_epi32(buf0[17], buf0[46]);
+    buf1[18] = _mm_add_epi32(buf0[18], buf0[45]);
+    buf1[45] = _mm_sub_epi32(buf0[18], buf0[45]);
+    buf1[19] = _mm_add_epi32(buf0[19], buf0[44]);
+    buf1[44] = _mm_sub_epi32(buf0[19], buf0[44]);
+    buf1[20] = _mm_add_epi32(buf0[20], buf0[43]);
+    buf1[43] = _mm_sub_epi32(buf0[20], buf0[43]);
+    buf1[21] = _mm_add_epi32(buf0[21], buf0[42]);
+    buf1[42] = _mm_sub_epi32(buf0[21], buf0[42]);
+    buf1[22] = _mm_add_epi32(buf0[22], buf0[41]);
+    buf1[41] = _mm_sub_epi32(buf0[22], buf0[41]);
+    buf1[23] = _mm_add_epi32(buf0[23], buf0[40]);
+    buf1[40] = _mm_sub_epi32(buf0[23], buf0[40]);
+    buf1[24] = _mm_add_epi32(buf0[24], buf0[39]);
+    buf1[39] = _mm_sub_epi32(buf0[24], buf0[39]);
+    buf1[25] = _mm_add_epi32(buf0[25], buf0[38]);
+    buf1[38] = _mm_sub_epi32(buf0[25], buf0[38]);
+    buf1[26] = _mm_add_epi32(buf0[26], buf0[37]);
+    buf1[37] = _mm_sub_epi32(buf0[26], buf0[37]);
+    buf1[27] = _mm_add_epi32(buf0[27], buf0[36]);
+    buf1[36] = _mm_sub_epi32(buf0[27], buf0[36]);
+    buf1[28] = _mm_add_epi32(buf0[28], buf0[35]);
+    buf1[35] = _mm_sub_epi32(buf0[28], buf0[35]);
+    buf1[29] = _mm_add_epi32(buf0[29], buf0[34]);
+    buf1[34] = _mm_sub_epi32(buf0[29], buf0[34]);
+    buf1[30] = _mm_add_epi32(buf0[30], buf0[33]);
+    buf1[33] = _mm_sub_epi32(buf0[30], buf0[33]);
+    buf1[31] = _mm_add_epi32(buf0[31], buf0[32]);
+    buf1[32] = _mm_sub_epi32(buf0[31], buf0[32]);
+
+    // stage 2
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = _mm_add_epi32(buf1[0], buf1[31]);
+    buf0[31] = _mm_sub_epi32(buf1[0], buf1[31]);
+    buf0[1] = _mm_add_epi32(buf1[1], buf1[30]);
+    buf0[30] = _mm_sub_epi32(buf1[1], buf1[30]);
+    buf0[2] = _mm_add_epi32(buf1[2], buf1[29]);
+    buf0[29] = _mm_sub_epi32(buf1[2], buf1[29]);
+    buf0[3] = _mm_add_epi32(buf1[3], buf1[28]);
+    buf0[28] = _mm_sub_epi32(buf1[3], buf1[28]);
+    buf0[4] = _mm_add_epi32(buf1[4], buf1[27]);
+    buf0[27] = _mm_sub_epi32(buf1[4], buf1[27]);
+    buf0[5] = _mm_add_epi32(buf1[5], buf1[26]);
+    buf0[26] = _mm_sub_epi32(buf1[5], buf1[26]);
+    buf0[6] = _mm_add_epi32(buf1[6], buf1[25]);
+    buf0[25] = _mm_sub_epi32(buf1[6], buf1[25]);
+    buf0[7] = _mm_add_epi32(buf1[7], buf1[24]);
+    buf0[24] = _mm_sub_epi32(buf1[7], buf1[24]);
+    buf0[8] = _mm_add_epi32(buf1[8], buf1[23]);
+    buf0[23] = _mm_sub_epi32(buf1[8], buf1[23]);
+    buf0[9] = _mm_add_epi32(buf1[9], buf1[22]);
+    buf0[22] = _mm_sub_epi32(buf1[9], buf1[22]);
+    buf0[10] = _mm_add_epi32(buf1[10], buf1[21]);
+    buf0[21] = _mm_sub_epi32(buf1[10], buf1[21]);
+    buf0[11] = _mm_add_epi32(buf1[11], buf1[20]);
+    buf0[20] = _mm_sub_epi32(buf1[11], buf1[20]);
+    buf0[12] = _mm_add_epi32(buf1[12], buf1[19]);
+    buf0[19] = _mm_sub_epi32(buf1[12], buf1[19]);
+    buf0[13] = _mm_add_epi32(buf1[13], buf1[18]);
+    buf0[18] = _mm_sub_epi32(buf1[13], buf1[18]);
+    buf0[14] = _mm_add_epi32(buf1[14], buf1[17]);
+    buf0[17] = _mm_sub_epi32(buf1[14], buf1[17]);
+    buf0[15] = _mm_add_epi32(buf1[15], buf1[16]);
+    buf0[16] = _mm_sub_epi32(buf1[15], buf1[16]);
+    buf0[32] = buf1[32];
+    buf0[33] = buf1[33];
+    buf0[34] = buf1[34];
+    buf0[35] = buf1[35];
+    buf0[36] = buf1[36];
+    buf0[37] = buf1[37];
+    buf0[38] = buf1[38];
+    buf0[39] = buf1[39];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[40], buf1[55], buf0[40],
+                        buf0[55], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[41], buf1[54], buf0[41],
+                        buf0[54], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[42], buf1[53], buf0[42],
+                        buf0[53], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[43], buf1[52], buf0[43],
+                        buf0[52], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[44], buf1[51], buf0[44],
+                        buf0[51], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[45], buf1[50], buf0[45],
+                        buf0[50], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[46], buf1[49], buf0[46],
+                        buf0[49], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[47], buf1[48], buf0[47],
+                        buf0[48], bit);
+    buf0[56] = buf1[56];
+    buf0[57] = buf1[57];
+    buf0[58] = buf1[58];
+    buf0[59] = buf1[59];
+    buf0[60] = buf1[60];
+    buf0[61] = buf1[61];
+    buf0[62] = buf1[62];
+    buf0[63] = buf1[63];
+
+    // stage 3
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[15]);
+    buf1[15] = _mm_sub_epi32(buf0[0], buf0[15]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[14]);
+    buf1[14] = _mm_sub_epi32(buf0[1], buf0[14]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[13]);
+    buf1[13] = _mm_sub_epi32(buf0[2], buf0[13]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[12]);
+    buf1[12] = _mm_sub_epi32(buf0[3], buf0[12]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[11]);
+    buf1[11] = _mm_sub_epi32(buf0[4], buf0[11]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[10]);
+    buf1[10] = _mm_sub_epi32(buf0[5], buf0[10]);
+    buf1[6] = _mm_add_epi32(buf0[6], buf0[9]);
+    buf1[9] = _mm_sub_epi32(buf0[6], buf0[9]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[8]);
+    buf1[8] = _mm_sub_epi32(buf0[7], buf0[8]);
+    buf1[16] = buf0[16];
+    buf1[17] = buf0[17];
+    buf1[18] = buf0[18];
+    buf1[19] = buf0[19];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[20], buf0[27], buf1[20],
+                        buf1[27], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[21], buf0[26], buf1[21],
+                        buf1[26], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[22], buf0[25], buf1[22],
+                        buf1[25], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[23], buf0[24], buf1[23],
+                        buf1[24], bit);
+    buf1[28] = buf0[28];
+    buf1[29] = buf0[29];
+    buf1[30] = buf0[30];
+    buf1[31] = buf0[31];
+    buf1[32] = _mm_add_epi32(buf0[32], buf0[47]);
+    buf1[47] = _mm_sub_epi32(buf0[32], buf0[47]);
+    buf1[33] = _mm_add_epi32(buf0[33], buf0[46]);
+    buf1[46] = _mm_sub_epi32(buf0[33], buf0[46]);
+    buf1[34] = _mm_add_epi32(buf0[34], buf0[45]);
+    buf1[45] = _mm_sub_epi32(buf0[34], buf0[45]);
+    buf1[35] = _mm_add_epi32(buf0[35], buf0[44]);
+    buf1[44] = _mm_sub_epi32(buf0[35], buf0[44]);
+    buf1[36] = _mm_add_epi32(buf0[36], buf0[43]);
+    buf1[43] = _mm_sub_epi32(buf0[36], buf0[43]);
+    buf1[37] = _mm_add_epi32(buf0[37], buf0[42]);
+    buf1[42] = _mm_sub_epi32(buf0[37], buf0[42]);
+    buf1[38] = _mm_add_epi32(buf0[38], buf0[41]);
+    buf1[41] = _mm_sub_epi32(buf0[38], buf0[41]);
+    buf1[39] = _mm_add_epi32(buf0[39], buf0[40]);
+    buf1[40] = _mm_sub_epi32(buf0[39], buf0[40]);
+    buf1[48] = _mm_sub_epi32(buf0[63], buf0[48]);
+    buf1[63] = _mm_add_epi32(buf0[63], buf0[48]);
+    buf1[49] = _mm_sub_epi32(buf0[62], buf0[49]);
+    buf1[62] = _mm_add_epi32(buf0[62], buf0[49]);
+    buf1[50] = _mm_sub_epi32(buf0[61], buf0[50]);
+    buf1[61] = _mm_add_epi32(buf0[61], buf0[50]);
+    buf1[51] = _mm_sub_epi32(buf0[60], buf0[51]);
+    buf1[60] = _mm_add_epi32(buf0[60], buf0[51]);
+    buf1[52] = _mm_sub_epi32(buf0[59], buf0[52]);
+    buf1[59] = _mm_add_epi32(buf0[59], buf0[52]);
+    buf1[53] = _mm_sub_epi32(buf0[58], buf0[53]);
+    buf1[58] = _mm_add_epi32(buf0[58], buf0[53]);
+    buf1[54] = _mm_sub_epi32(buf0[57], buf0[54]);
+    buf1[57] = _mm_add_epi32(buf0[57], buf0[54]);
+    buf1[55] = _mm_sub_epi32(buf0[56], buf0[55]);
+    buf1[56] = _mm_add_epi32(buf0[56], buf0[55]);
+
+    // stage 4
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = _mm_add_epi32(buf1[0], buf1[7]);
+    buf0[7] = _mm_sub_epi32(buf1[0], buf1[7]);
+    buf0[1] = _mm_add_epi32(buf1[1], buf1[6]);
+    buf0[6] = _mm_sub_epi32(buf1[1], buf1[6]);
+    buf0[2] = _mm_add_epi32(buf1[2], buf1[5]);
+    buf0[5] = _mm_sub_epi32(buf1[2], buf1[5]);
+    buf0[3] = _mm_add_epi32(buf1[3], buf1[4]);
+    buf0[4] = _mm_sub_epi32(buf1[3], buf1[4]);
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11],
+                        buf0[12], bit);
+    buf0[14] = buf1[14];
+    buf0[15] = buf1[15];
+    buf0[16] = _mm_add_epi32(buf1[16], buf1[23]);
+    buf0[23] = _mm_sub_epi32(buf1[16], buf1[23]);
+    buf0[17] = _mm_add_epi32(buf1[17], buf1[22]);
+    buf0[22] = _mm_sub_epi32(buf1[17], buf1[22]);
+    buf0[18] = _mm_add_epi32(buf1[18], buf1[21]);
+    buf0[21] = _mm_sub_epi32(buf1[18], buf1[21]);
+    buf0[19] = _mm_add_epi32(buf1[19], buf1[20]);
+    buf0[20] = _mm_sub_epi32(buf1[19], buf1[20]);
+    buf0[24] = _mm_sub_epi32(buf1[31], buf1[24]);
+    buf0[31] = _mm_add_epi32(buf1[31], buf1[24]);
+    buf0[25] = _mm_sub_epi32(buf1[30], buf1[25]);
+    buf0[30] = _mm_add_epi32(buf1[30], buf1[25]);
+    buf0[26] = _mm_sub_epi32(buf1[29], buf1[26]);
+    buf0[29] = _mm_add_epi32(buf1[29], buf1[26]);
+    buf0[27] = _mm_sub_epi32(buf1[28], buf1[27]);
+    buf0[28] = _mm_add_epi32(buf1[28], buf1[27]);
+    buf0[32] = buf1[32];
+    buf0[33] = buf1[33];
+    buf0[34] = buf1[34];
+    buf0[35] = buf1[35];
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[36], buf1[59], buf0[36],
+                        buf0[59], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[37], buf1[58], buf0[37],
+                        buf0[58], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[38], buf1[57], buf0[38],
+                        buf0[57], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[39], buf1[56], buf0[39],
+                        buf0[56], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[40], buf1[55], buf0[40],
+                        buf0[55], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[41], buf1[54], buf0[41],
+                        buf0[54], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[42], buf1[53], buf0[42],
+                        buf0[53], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[43], buf1[52], buf0[43],
+                        buf0[52], bit);
+    buf0[44] = buf1[44];
+    buf0[45] = buf1[45];
+    buf0[46] = buf1[46];
+    buf0[47] = buf1[47];
+    buf0[48] = buf1[48];
+    buf0[49] = buf1[49];
+    buf0[50] = buf1[50];
+    buf0[51] = buf1[51];
+    buf0[60] = buf1[60];
+    buf0[61] = buf1[61];
+    buf0[62] = buf1[62];
+    buf0[63] = buf1[63];
+
+    // stage 5
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[3]);
+    buf1[3] = _mm_sub_epi32(buf0[0], buf0[3]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[2]);
+    buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]);
+    buf1[4] = buf0[4];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5],
+                        buf1[6], bit);
+    buf1[7] = buf0[7];
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[11]);
+    buf1[11] = _mm_sub_epi32(buf0[8], buf0[11]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[10]);
+    buf1[10] = _mm_sub_epi32(buf0[9], buf0[10]);
+    buf1[12] = _mm_sub_epi32(buf0[15], buf0[12]);
+    buf1[15] = _mm_add_epi32(buf0[15], buf0[12]);
+    buf1[13] = _mm_sub_epi32(buf0[14], buf0[13]);
+    buf1[14] = _mm_add_epi32(buf0[14], buf0[13]);
+    buf1[16] = buf0[16];
+    buf1[17] = buf0[17];
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[18], buf0[29], buf1[18],
+                        buf1[29], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[19], buf0[28], buf1[19],
+                        buf1[28], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[20], buf0[27], buf1[20],
+                        buf1[27], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[21], buf0[26], buf1[21],
+                        buf1[26], bit);
+    buf1[22] = buf0[22];
+    buf1[23] = buf0[23];
+    buf1[24] = buf0[24];
+    buf1[25] = buf0[25];
+    buf1[30] = buf0[30];
+    buf1[31] = buf0[31];
+    buf1[32] = _mm_add_epi32(buf0[32], buf0[39]);
+    buf1[39] = _mm_sub_epi32(buf0[32], buf0[39]);
+    buf1[33] = _mm_add_epi32(buf0[33], buf0[38]);
+    buf1[38] = _mm_sub_epi32(buf0[33], buf0[38]);
+    buf1[34] = _mm_add_epi32(buf0[34], buf0[37]);
+    buf1[37] = _mm_sub_epi32(buf0[34], buf0[37]);
+    buf1[35] = _mm_add_epi32(buf0[35], buf0[36]);
+    buf1[36] = _mm_sub_epi32(buf0[35], buf0[36]);
+    buf1[40] = _mm_sub_epi32(buf0[47], buf0[40]);
+    buf1[47] = _mm_add_epi32(buf0[47], buf0[40]);
+    buf1[41] = _mm_sub_epi32(buf0[46], buf0[41]);
+    buf1[46] = _mm_add_epi32(buf0[46], buf0[41]);
+    buf1[42] = _mm_sub_epi32(buf0[45], buf0[42]);
+    buf1[45] = _mm_add_epi32(buf0[45], buf0[42]);
+    buf1[43] = _mm_sub_epi32(buf0[44], buf0[43]);
+    buf1[44] = _mm_add_epi32(buf0[44], buf0[43]);
+    buf1[48] = _mm_add_epi32(buf0[48], buf0[55]);
+    buf1[55] = _mm_sub_epi32(buf0[48], buf0[55]);
+    buf1[49] = _mm_add_epi32(buf0[49], buf0[54]);
+    buf1[54] = _mm_sub_epi32(buf0[49], buf0[54]);
+    buf1[50] = _mm_add_epi32(buf0[50], buf0[53]);
+    buf1[53] = _mm_sub_epi32(buf0[50], buf0[53]);
+    buf1[51] = _mm_add_epi32(buf0[51], buf0[52]);
+    buf1[52] = _mm_sub_epi32(buf0[51], buf0[52]);
+    buf1[56] = _mm_sub_epi32(buf0[63], buf0[56]);
+    buf1[63] = _mm_add_epi32(buf0[63], buf0[56]);
+    buf1[57] = _mm_sub_epi32(buf0[62], buf0[57]);
+    buf1[62] = _mm_add_epi32(buf0[62], buf0[57]);
+    buf1[58] = _mm_sub_epi32(buf0[61], buf0[58]);
+    buf1[61] = _mm_add_epi32(buf0[61], buf0[58]);
+    buf1[59] = _mm_sub_epi32(buf0[60], buf0[59]);
+    buf1[60] = _mm_add_epi32(buf0[60], buf0[59]);
+
+    // stage 6
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
+                        buf0[1], bit);
+    btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+    buf0[4] = _mm_add_epi32(buf1[4], buf1[5]);
+    buf0[5] = _mm_sub_epi32(buf1[4], buf1[5]);
+    buf0[6] = _mm_sub_epi32(buf1[7], buf1[6]);
+    buf0[7] = _mm_add_epi32(buf1[7], buf1[6]);
+    buf0[8] = buf1[8];
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9],
+                        buf0[14], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
+    buf0[11] = buf1[11];
+    buf0[12] = buf1[12];
+    buf0[15] = buf1[15];
+    buf0[16] = _mm_add_epi32(buf1[16], buf1[19]);
+    buf0[19] = _mm_sub_epi32(buf1[16], buf1[19]);
+    buf0[17] = _mm_add_epi32(buf1[17], buf1[18]);
+    buf0[18] = _mm_sub_epi32(buf1[17], buf1[18]);
+    buf0[20] = _mm_sub_epi32(buf1[23], buf1[20]);
+    buf0[23] = _mm_add_epi32(buf1[23], buf1[20]);
+    buf0[21] = _mm_sub_epi32(buf1[22], buf1[21]);
+    buf0[22] = _mm_add_epi32(buf1[22], buf1[21]);
+    buf0[24] = _mm_add_epi32(buf1[24], buf1[27]);
+    buf0[27] = _mm_sub_epi32(buf1[24], buf1[27]);
+    buf0[25] = _mm_add_epi32(buf1[25], buf1[26]);
+    buf0[26] = _mm_sub_epi32(buf1[25], buf1[26]);
+    buf0[28] = _mm_sub_epi32(buf1[31], buf1[28]);
+    buf0[31] = _mm_add_epi32(buf1[31], buf1[28]);
+    buf0[29] = _mm_sub_epi32(buf1[30], buf1[29]);
+    buf0[30] = _mm_add_epi32(buf1[30], buf1[29]);
+    buf0[32] = buf1[32];
+    buf0[33] = buf1[33];
+    btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[34], buf1[61], buf0[34],
+                        buf0[61], bit);
+    btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[35], buf1[60], buf0[35],
+                        buf0[60], bit);
+    btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[36], buf1[59], buf0[36],
+                        buf0[59], bit);
+    btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[37], buf1[58], buf0[37],
+                        buf0[58], bit);
+    buf0[38] = buf1[38];
+    buf0[39] = buf1[39];
+    buf0[40] = buf1[40];
+    buf0[41] = buf1[41];
+    btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[42], buf1[53], buf0[42],
+                        buf0[53], bit);
+    btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[43], buf1[52], buf0[43],
+                        buf0[52], bit);
+    btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[44], buf1[51], buf0[44],
+                        buf0[51], bit);
+    btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[45], buf1[50], buf0[45],
+                        buf0[50], bit);
+    buf0[46] = buf1[46];
+    buf0[47] = buf1[47];
+    buf0[48] = buf1[48];
+    buf0[49] = buf1[49];
+    buf0[54] = buf1[54];
+    buf0[55] = buf1[55];
+    buf0[56] = buf1[56];
+    buf0[57] = buf1[57];
+    buf0[62] = buf1[62];
+    buf0[63] = buf1[63];
+
+    // stage 7
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[1];
+    buf1[2] = buf0[2];
+    buf1[3] = buf0[3];
+    btf_32_sse4_1_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7],
+                        bit);
+    btf_32_sse4_1_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5],
+                        buf1[6], bit);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[9]);
+    buf1[9] = _mm_sub_epi32(buf0[8], buf0[9]);
+    buf1[10] = _mm_sub_epi32(buf0[11], buf0[10]);
+    buf1[11] = _mm_add_epi32(buf0[11], buf0[10]);
+    buf1[12] = _mm_add_epi32(buf0[12], buf0[13]);
+    buf1[13] = _mm_sub_epi32(buf0[12], buf0[13]);
+    buf1[14] = _mm_sub_epi32(buf0[15], buf0[14]);
+    buf1[15] = _mm_add_epi32(buf0[15], buf0[14]);
+    buf1[16] = buf0[16];
+    btf_32_sse4_1_type0(-cospi[8], cospi[56], buf0[17], buf0[30], buf1[17],
+                        buf1[30], bit);
+    btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf0[18], buf0[29], buf1[18],
+                        buf1[29], bit);
+    buf1[19] = buf0[19];
+    buf1[20] = buf0[20];
+    btf_32_sse4_1_type0(-cospi[40], cospi[24], buf0[21], buf0[26], buf1[21],
+                        buf1[26], bit);
+    btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf0[22], buf0[25], buf1[22],
+                        buf1[25], bit);
+    buf1[23] = buf0[23];
+    buf1[24] = buf0[24];
+    buf1[27] = buf0[27];
+    buf1[28] = buf0[28];
+    buf1[31] = buf0[31];
+    buf1[32] = _mm_add_epi32(buf0[32], buf0[35]);
+    buf1[35] = _mm_sub_epi32(buf0[32], buf0[35]);
+    buf1[33] = _mm_add_epi32(buf0[33], buf0[34]);
+    buf1[34] = _mm_sub_epi32(buf0[33], buf0[34]);
+    buf1[36] = _mm_sub_epi32(buf0[39], buf0[36]);
+    buf1[39] = _mm_add_epi32(buf0[39], buf0[36]);
+    buf1[37] = _mm_sub_epi32(buf0[38], buf0[37]);
+    buf1[38] = _mm_add_epi32(buf0[38], buf0[37]);
+    buf1[40] = _mm_add_epi32(buf0[40], buf0[43]);
+    buf1[43] = _mm_sub_epi32(buf0[40], buf0[43]);
+    buf1[41] = _mm_add_epi32(buf0[41], buf0[42]);
+    buf1[42] = _mm_sub_epi32(buf0[41], buf0[42]);
+    buf1[44] = _mm_sub_epi32(buf0[47], buf0[44]);
+    buf1[47] = _mm_add_epi32(buf0[47], buf0[44]);
+    buf1[45] = _mm_sub_epi32(buf0[46], buf0[45]);
+    buf1[46] = _mm_add_epi32(buf0[46], buf0[45]);
+    buf1[48] = _mm_add_epi32(buf0[48], buf0[51]);
+    buf1[51] = _mm_sub_epi32(buf0[48], buf0[51]);
+    buf1[49] = _mm_add_epi32(buf0[49], buf0[50]);
+    buf1[50] = _mm_sub_epi32(buf0[49], buf0[50]);
+    buf1[52] = _mm_sub_epi32(buf0[55], buf0[52]);
+    buf1[55] = _mm_add_epi32(buf0[55], buf0[52]);
+    buf1[53] = _mm_sub_epi32(buf0[54], buf0[53]);
+    buf1[54] = _mm_add_epi32(buf0[54], buf0[53]);
+    buf1[56] = _mm_add_epi32(buf0[56], buf0[59]);
+    buf1[59] = _mm_sub_epi32(buf0[56], buf0[59]);
+    buf1[57] = _mm_add_epi32(buf0[57], buf0[58]);
+    buf1[58] = _mm_sub_epi32(buf0[57], buf0[58]);
+    buf1[60] = _mm_sub_epi32(buf0[63], buf0[60]);
+    buf1[63] = _mm_add_epi32(buf0[63], buf0[60]);
+    buf1[61] = _mm_sub_epi32(buf0[62], buf0[61]);
+    buf1[62] = _mm_add_epi32(buf0[62], buf0[61]);
+
+    // stage 8
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    buf0[6] = buf1[6];
+    buf0[7] = buf1[7];
+    btf_32_sse4_1_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8],
+                        buf0[15], bit);
+    btf_32_sse4_1_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9],
+                        buf0[14], bit);
+    btf_32_sse4_1_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
+    btf_32_sse4_1_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11],
+                        buf0[12], bit);
+    buf0[16] = _mm_add_epi32(buf1[16], buf1[17]);
+    buf0[17] = _mm_sub_epi32(buf1[16], buf1[17]);
+    buf0[18] = _mm_sub_epi32(buf1[19], buf1[18]);
+    buf0[19] = _mm_add_epi32(buf1[19], buf1[18]);
+    buf0[20] = _mm_add_epi32(buf1[20], buf1[21]);
+    buf0[21] = _mm_sub_epi32(buf1[20], buf1[21]);
+    buf0[22] = _mm_sub_epi32(buf1[23], buf1[22]);
+    buf0[23] = _mm_add_epi32(buf1[23], buf1[22]);
+    buf0[24] = _mm_add_epi32(buf1[24], buf1[25]);
+    buf0[25] = _mm_sub_epi32(buf1[24], buf1[25]);
+    buf0[26] = _mm_sub_epi32(buf1[27], buf1[26]);
+    buf0[27] = _mm_add_epi32(buf1[27], buf1[26]);
+    buf0[28] = _mm_add_epi32(buf1[28], buf1[29]);
+    buf0[29] = _mm_sub_epi32(buf1[28], buf1[29]);
+    buf0[30] = _mm_sub_epi32(buf1[31], buf1[30]);
+    buf0[31] = _mm_add_epi32(buf1[31], buf1[30]);
+    buf0[32] = buf1[32];
+    btf_32_sse4_1_type0(-cospi[4], cospi[60], buf1[33], buf1[62], buf0[33],
+                        buf0[62], bit);
+    btf_32_sse4_1_type0(-cospi[60], -cospi[4], buf1[34], buf1[61], buf0[34],
+                        buf0[61], bit);
+    buf0[35] = buf1[35];
+    buf0[36] = buf1[36];
+    btf_32_sse4_1_type0(-cospi[36], cospi[28], buf1[37], buf1[58], buf0[37],
+                        buf0[58], bit);
+    btf_32_sse4_1_type0(-cospi[28], -cospi[36], buf1[38], buf1[57], buf0[38],
+                        buf0[57], bit);
+    buf0[39] = buf1[39];
+    buf0[40] = buf1[40];
+    btf_32_sse4_1_type0(-cospi[20], cospi[44], buf1[41], buf1[54], buf0[41],
+                        buf0[54], bit);
+    btf_32_sse4_1_type0(-cospi[44], -cospi[20], buf1[42], buf1[53], buf0[42],
+                        buf0[53], bit);
+    buf0[43] = buf1[43];
+    buf0[44] = buf1[44];
+    btf_32_sse4_1_type0(-cospi[52], cospi[12], buf1[45], buf1[50], buf0[45],
+                        buf0[50], bit);
+    btf_32_sse4_1_type0(-cospi[12], -cospi[52], buf1[46], buf1[49], buf0[46],
+                        buf0[49], bit);
+    buf0[47] = buf1[47];
+    buf0[48] = buf1[48];
+    buf0[51] = buf1[51];
+    buf0[52] = buf1[52];
+    buf0[55] = buf1[55];
+    buf0[56] = buf1[56];
+    buf0[59] = buf1[59];
+    buf0[60] = buf1[60];
+    buf0[63] = buf1[63];
+
+    // stage 9
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[1];
+    buf1[2] = buf0[2];
+    buf1[3] = buf0[3];
+    buf1[4] = buf0[4];
+    buf1[5] = buf0[5];
+    buf1[6] = buf0[6];
+    buf1[7] = buf0[7];
+    buf1[8] = buf0[8];
+    buf1[9] = buf0[9];
+    buf1[10] = buf0[10];
+    buf1[11] = buf0[11];
+    buf1[12] = buf0[12];
+    buf1[13] = buf0[13];
+    buf1[14] = buf0[14];
+    buf1[15] = buf0[15];
+    btf_32_sse4_1_type1(cospi[62], cospi[2], buf0[16], buf0[31], buf1[16],
+                        buf1[31], bit);
+    btf_32_sse4_1_type1(cospi[30], cospi[34], buf0[17], buf0[30], buf1[17],
+                        buf1[30], bit);
+    btf_32_sse4_1_type1(cospi[46], cospi[18], buf0[18], buf0[29], buf1[18],
+                        buf1[29], bit);
+    btf_32_sse4_1_type1(cospi[14], cospi[50], buf0[19], buf0[28], buf1[19],
+                        buf1[28], bit);
+    btf_32_sse4_1_type1(cospi[54], cospi[10], buf0[20], buf0[27], buf1[20],
+                        buf1[27], bit);
+    btf_32_sse4_1_type1(cospi[22], cospi[42], buf0[21], buf0[26], buf1[21],
+                        buf1[26], bit);
+    btf_32_sse4_1_type1(cospi[38], cospi[26], buf0[22], buf0[25], buf1[22],
+                        buf1[25], bit);
+    btf_32_sse4_1_type1(cospi[6], cospi[58], buf0[23], buf0[24], buf1[23],
+                        buf1[24], bit);
+    buf1[32] = _mm_add_epi32(buf0[32], buf0[33]);
+    buf1[33] = _mm_sub_epi32(buf0[32], buf0[33]);
+    buf1[34] = _mm_sub_epi32(buf0[35], buf0[34]);
+    buf1[35] = _mm_add_epi32(buf0[35], buf0[34]);
+    buf1[36] = _mm_add_epi32(buf0[36], buf0[37]);
+    buf1[37] = _mm_sub_epi32(buf0[36], buf0[37]);
+    buf1[38] = _mm_sub_epi32(buf0[39], buf0[38]);
+    buf1[39] = _mm_add_epi32(buf0[39], buf0[38]);
+    buf1[40] = _mm_add_epi32(buf0[40], buf0[41]);
+    buf1[41] = _mm_sub_epi32(buf0[40], buf0[41]);
+    buf1[42] = _mm_sub_epi32(buf0[43], buf0[42]);
+    buf1[43] = _mm_add_epi32(buf0[43], buf0[42]);
+    buf1[44] = _mm_add_epi32(buf0[44], buf0[45]);
+    buf1[45] = _mm_sub_epi32(buf0[44], buf0[45]);
+    buf1[46] = _mm_sub_epi32(buf0[47], buf0[46]);
+    buf1[47] = _mm_add_epi32(buf0[47], buf0[46]);
+    buf1[48] = _mm_add_epi32(buf0[48], buf0[49]);
+    buf1[49] = _mm_sub_epi32(buf0[48], buf0[49]);
+    buf1[50] = _mm_sub_epi32(buf0[51], buf0[50]);
+    buf1[51] = _mm_add_epi32(buf0[51], buf0[50]);
+    buf1[52] = _mm_add_epi32(buf0[52], buf0[53]);
+    buf1[53] = _mm_sub_epi32(buf0[52], buf0[53]);
+    buf1[54] = _mm_sub_epi32(buf0[55], buf0[54]);
+    buf1[55] = _mm_add_epi32(buf0[55], buf0[54]);
+    buf1[56] = _mm_add_epi32(buf0[56], buf0[57]);
+    buf1[57] = _mm_sub_epi32(buf0[56], buf0[57]);
+    buf1[58] = _mm_sub_epi32(buf0[59], buf0[58]);
+    buf1[59] = _mm_add_epi32(buf0[59], buf0[58]);
+    buf1[60] = _mm_add_epi32(buf0[60], buf0[61]);
+    buf1[61] = _mm_sub_epi32(buf0[60], buf0[61]);
+    buf1[62] = _mm_sub_epi32(buf0[63], buf0[62]);
+    buf1[63] = _mm_add_epi32(buf0[63], buf0[62]);
+
+    // stage 10
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    buf0[6] = buf1[6];
+    buf0[7] = buf1[7];
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    buf0[10] = buf1[10];
+    buf0[11] = buf1[11];
+    buf0[12] = buf1[12];
+    buf0[13] = buf1[13];
+    buf0[14] = buf1[14];
+    buf0[15] = buf1[15];
+    buf0[16] = buf1[16];
+    buf0[17] = buf1[17];
+    buf0[18] = buf1[18];
+    buf0[19] = buf1[19];
+    buf0[20] = buf1[20];
+    buf0[21] = buf1[21];
+    buf0[22] = buf1[22];
+    buf0[23] = buf1[23];
+    buf0[24] = buf1[24];
+    buf0[25] = buf1[25];
+    buf0[26] = buf1[26];
+    buf0[27] = buf1[27];
+    buf0[28] = buf1[28];
+    buf0[29] = buf1[29];
+    buf0[30] = buf1[30];
+    buf0[31] = buf1[31];
+    btf_32_sse4_1_type1(cospi[63], cospi[1], buf1[32], buf1[63], buf0[32],
+                        buf0[63], bit);
+    btf_32_sse4_1_type1(cospi[31], cospi[33], buf1[33], buf1[62], buf0[33],
+                        buf0[62], bit);
+    btf_32_sse4_1_type1(cospi[47], cospi[17], buf1[34], buf1[61], buf0[34],
+                        buf0[61], bit);
+    btf_32_sse4_1_type1(cospi[15], cospi[49], buf1[35], buf1[60], buf0[35],
+                        buf0[60], bit);
+    btf_32_sse4_1_type1(cospi[55], cospi[9], buf1[36], buf1[59], buf0[36],
+                        buf0[59], bit);
+    btf_32_sse4_1_type1(cospi[23], cospi[41], buf1[37], buf1[58], buf0[37],
+                        buf0[58], bit);
+    btf_32_sse4_1_type1(cospi[39], cospi[25], buf1[38], buf1[57], buf0[38],
+                        buf0[57], bit);
+    btf_32_sse4_1_type1(cospi[7], cospi[57], buf1[39], buf1[56], buf0[39],
+                        buf0[56], bit);
+    btf_32_sse4_1_type1(cospi[59], cospi[5], buf1[40], buf1[55], buf0[40],
+                        buf0[55], bit);
+    btf_32_sse4_1_type1(cospi[27], cospi[37], buf1[41], buf1[54], buf0[41],
+                        buf0[54], bit);
+    btf_32_sse4_1_type1(cospi[43], cospi[21], buf1[42], buf1[53], buf0[42],
+                        buf0[53], bit);
+    btf_32_sse4_1_type1(cospi[11], cospi[53], buf1[43], buf1[52], buf0[43],
+                        buf0[52], bit);
+    btf_32_sse4_1_type1(cospi[51], cospi[13], buf1[44], buf1[51], buf0[44],
+                        buf0[51], bit);
+    btf_32_sse4_1_type1(cospi[19], cospi[45], buf1[45], buf1[50], buf0[45],
+                        buf0[50], bit);
+    btf_32_sse4_1_type1(cospi[35], cospi[29], buf1[46], buf1[49], buf0[46],
+                        buf0[49], bit);
+    btf_32_sse4_1_type1(cospi[3], cospi[61], buf1[47], buf1[48], buf0[47],
+                        buf0[48], bit);
+
+    // stage 11
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[32];
+    buf1[2] = buf0[16];
+    buf1[3] = buf0[48];
+    buf1[4] = buf0[8];
+    buf1[5] = buf0[40];
+    buf1[6] = buf0[24];
+    buf1[7] = buf0[56];
+    buf1[8] = buf0[4];
+    buf1[9] = buf0[36];
+    buf1[10] = buf0[20];
+    buf1[11] = buf0[52];
+    buf1[12] = buf0[12];
+    buf1[13] = buf0[44];
+    buf1[14] = buf0[28];
+    buf1[15] = buf0[60];
+    buf1[16] = buf0[2];
+    buf1[17] = buf0[34];
+    buf1[18] = buf0[18];
+    buf1[19] = buf0[50];
+    buf1[20] = buf0[10];
+    buf1[21] = buf0[42];
+    buf1[22] = buf0[26];
+    buf1[23] = buf0[58];
+    buf1[24] = buf0[6];
+    buf1[25] = buf0[38];
+    buf1[26] = buf0[22];
+    buf1[27] = buf0[54];
+    buf1[28] = buf0[14];
+    buf1[29] = buf0[46];
+    buf1[30] = buf0[30];
+    buf1[31] = buf0[62];
+    buf1[32] = buf0[1];
+    buf1[33] = buf0[33];
+    buf1[34] = buf0[17];
+    buf1[35] = buf0[49];
+    buf1[36] = buf0[9];
+    buf1[37] = buf0[41];
+    buf1[38] = buf0[25];
+    buf1[39] = buf0[57];
+    buf1[40] = buf0[5];
+    buf1[41] = buf0[37];
+    buf1[42] = buf0[21];
+    buf1[43] = buf0[53];
+    buf1[44] = buf0[13];
+    buf1[45] = buf0[45];
+    buf1[46] = buf0[29];
+    buf1[47] = buf0[61];
+    buf1[48] = buf0[3];
+    buf1[49] = buf0[35];
+    buf1[50] = buf0[19];
+    buf1[51] = buf0[51];
+    buf1[52] = buf0[11];
+    buf1[53] = buf0[43];
+    buf1[54] = buf0[27];
+    buf1[55] = buf0[59];
+    buf1[56] = buf0[7];
+    buf1[57] = buf0[39];
+    buf1[58] = buf0[23];
+    buf1[59] = buf0[55];
+    buf1[60] = buf0[15];
+    buf1[61] = buf0[47];
+    buf1[62] = buf0[31];
+    buf1[63] = buf0[63];
+
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+    output[4 * col_num + col] = buf1[4];
+    output[5 * col_num + col] = buf1[5];
+    output[6 * col_num + col] = buf1[6];
+    output[7 * col_num + col] = buf1[7];
+    output[8 * col_num + col] = buf1[8];
+    output[9 * col_num + col] = buf1[9];
+    output[10 * col_num + col] = buf1[10];
+    output[11 * col_num + col] = buf1[11];
+    output[12 * col_num + col] = buf1[12];
+    output[13 * col_num + col] = buf1[13];
+    output[14 * col_num + col] = buf1[14];
+    output[15 * col_num + col] = buf1[15];
+    output[16 * col_num + col] = buf1[16];
+    output[17 * col_num + col] = buf1[17];
+    output[18 * col_num + col] = buf1[18];
+    output[19 * col_num + col] = buf1[19];
+    output[20 * col_num + col] = buf1[20];
+    output[21 * col_num + col] = buf1[21];
+    output[22 * col_num + col] = buf1[22];
+    output[23 * col_num + col] = buf1[23];
+    output[24 * col_num + col] = buf1[24];
+    output[25 * col_num + col] = buf1[25];
+    output[26 * col_num + col] = buf1[26];
+    output[27 * col_num + col] = buf1[27];
+    output[28 * col_num + col] = buf1[28];
+    output[29 * col_num + col] = buf1[29];
+    output[30 * col_num + col] = buf1[30];
+    output[31 * col_num + col] = buf1[31];
+    output[32 * col_num + col] = buf1[32];
+    output[33 * col_num + col] = buf1[33];
+    output[34 * col_num + col] = buf1[34];
+    output[35 * col_num + col] = buf1[35];
+    output[36 * col_num + col] = buf1[36];
+    output[37 * col_num + col] = buf1[37];
+    output[38 * col_num + col] = buf1[38];
+    output[39 * col_num + col] = buf1[39];
+    output[40 * col_num + col] = buf1[40];
+    output[41 * col_num + col] = buf1[41];
+    output[42 * col_num + col] = buf1[42];
+    output[43 * col_num + col] = buf1[43];
+    output[44 * col_num + col] = buf1[44];
+    output[45 * col_num + col] = buf1[45];
+    output[46 * col_num + col] = buf1[46];
+    output[47 * col_num + col] = buf1[47];
+    output[48 * col_num + col] = buf1[48];
+    output[49 * col_num + col] = buf1[49];
+    output[50 * col_num + col] = buf1[50];
+    output[51 * col_num + col] = buf1[51];
+    output[52 * col_num + col] = buf1[52];
+    output[53 * col_num + col] = buf1[53];
+    output[54 * col_num + col] = buf1[54];
+    output[55 * col_num + col] = buf1[55];
+    output[56 * col_num + col] = buf1[56];
+    output[57 * col_num + col] = buf1[57];
+    output[58 * col_num + col] = buf1[58];
+    output[59 * col_num + col] = buf1[59];
+    output[60 * col_num + col] = buf1[60];
+    output[61 * col_num + col] = buf1[61];
+    output[62 * col_num + col] = buf1[62];
+    output[63 * col_num + col] = buf1[63];
+  }
+}
diff --git a/vp10/common/x86/vp10_fwd_txfm2d_sse4.c b/vp10/common/x86/vp10_fwd_txfm2d_sse4.c
new file mode 100644
index 0000000..6664bd5
--- /dev/null
+++ b/vp10/common/x86/vp10_fwd_txfm2d_sse4.c
@@ -0,0 +1,117 @@
+#include "vp10/common/x86/vp10_txfm1d_sse4.h"
+
static inline void int16_array_with_stride_to_int32_array_without_stride(
    const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
  // Widen a txfm1d_size x txfm1d_size block of int16 samples (rows spaced
  // |stride| apart) into a densely packed int32 array (rows spaced
  // |txfm1d_size| apart).
  int row;
  for (row = 0; row < txfm1d_size; row++) {
    const int16_t *src = input + row * stride;
    int32_t *dst = output + row * txfm1d_size;
    int col;
    for (col = 0; col < txfm1d_size; col++) {
      dst[col] = (int32_t)src[col];
    }
  }
}
+
+typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output,
+                             const int8_t *cos_bit, const int8_t *stage_range);
+
+static inline TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+  switch (txfm_type) {
+    case TXFM_TYPE_DCT4:
+      return vp10_fdct4_new_sse4_1;
+      break;
+    case TXFM_TYPE_DCT8:
+      return vp10_fdct8_new_sse4_1;
+      break;
+    case TXFM_TYPE_DCT16:
+      return vp10_fdct16_new_sse4_1;
+      break;
+    case TXFM_TYPE_DCT32:
+      return vp10_fdct32_new_sse4_1;
+      break;
+    case TXFM_TYPE_DCT64:
+      return vp10_fdct64_new_sse4_1;
+      break;
+    case TXFM_TYPE_ADST4:
+      return vp10_fadst4_new_sse4_1;
+      break;
+    case TXFM_TYPE_ADST8:
+      return vp10_fadst8_new_sse4_1;
+      break;
+    case TXFM_TYPE_ADST16:
+      return vp10_fadst16_new_sse4_1;
+      break;
+    case TXFM_TYPE_ADST32:
+      return vp10_fadst32_new_sse4_1;
+      break;
+    default:
+      assert(0);
+  }
+  return NULL;
+}
+
+// Generic 2-D forward transform pipeline shared by all block sizes:
+//   widen input -> shift -> column 1-D txfm -> shift -> transpose ->
+//   row 1-D txfm -> shift -> transpose back.
+// |txfm_buf| must hold txfm_size * txfm_size int32 values. |output| doubles
+// as scratch between stages, so it is clobbered before the final result is
+// written.
+// NOTE(review): casting int32_t* to __m128i* assumes |txfm_buf| and |output|
+// are 16-byte aligned; the callers below declare plain int32_t arrays, which
+// only guarantees 4-byte alignment -- confirm alignment or use unaligned
+// loads/stores.
+static inline void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
+                                   const int stride, const TXFM_2D_CFG *cfg,
+                                   int32_t *txfm_buf) {
+  const int txfm_size = cfg->txfm_size;
+  const int8_t *shift = cfg->shift;
+  const int8_t *stage_range_col = cfg->stage_range_col;
+  const int8_t *stage_range_row = cfg->stage_range_row;
+  const int8_t *cos_bit_col = cfg->cos_bit_col;
+  const int8_t *cos_bit_row = cfg->cos_bit_row;
+  const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+  const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+  __m128i *buf_128 = (__m128i *)txfm_buf;
+  __m128i *out_128 = (__m128i *)output;
+  int num_per_128 = 4;  // 32-bit lanes per __m128i
+  int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+
+  // Widen the strided int16 input into the contiguous int32 work buffer.
+  int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
+                                                        txfm_size);
+  // cfg->shift[] values are negated here: the macro right-shifts (rounded)
+  // for a positive argument and left-shifts for a non-positive one.
+  round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
+  txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+  round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+  transpose_32(txfm_size, out_128, buf_128);
+  txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
+  round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
+  transpose_32(txfm_size, buf_128, out_128);  // final result lands in output
+}
+
+// 4x4 forward 2-D transform, SSE4.1. |bd| (bit depth) is accepted for
+// signature compatibility but unused here.
+void vp10_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output,
+                              const int stride, const TXFM_2D_CFG *cfg,
+                              const int bd) {
+  int32_t txfm_buf[16];  // 4 * 4 intermediate coefficients
+  (void)bd;
+  fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
+}
+
+// 8x8 forward 2-D transform, SSE4.1. |bd| (bit depth) is accepted for
+// signature compatibility but unused here.
+void vp10_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output,
+                              const int stride, const TXFM_2D_CFG *cfg,
+                              const int bd) {
+  int32_t txfm_buf[64];  // 8 * 8 intermediate coefficients
+  (void)bd;
+  fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
+}
+
+// 16x16 forward 2-D transform, SSE4.1. |bd| (bit depth) is accepted for
+// signature compatibility but unused here.
+void vp10_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output,
+                                const int stride, const TXFM_2D_CFG *cfg,
+                                const int bd) {
+  int32_t txfm_buf[256];  // 16 * 16 intermediate coefficients
+  (void)bd;
+  fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
+}
+
+// 32x32 forward 2-D transform, SSE4.1. |bd| (bit depth) is accepted for
+// signature compatibility but unused here.
+void vp10_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
+                                const int stride, const TXFM_2D_CFG *cfg,
+                                const int bd) {
+  int32_t txfm_buf[1024];  // 32 * 32 intermediate coefficients
+  (void)bd;
+  fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
+}
+
+// 64x64 forward 2-D transform, SSE4.1. |bd| (bit depth) is accepted for
+// signature compatibility but unused here.
+void vp10_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+                                const int stride, const TXFM_2D_CFG *cfg,
+                                const int bd) {
+  int32_t txfm_buf[4096];  // 64 * 64 intermediate coefficients
+  (void)bd;
+  fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
+}
diff --git a/vp10/common/x86/vp10_txfm1d_sse4.h b/vp10/common/x86/vp10_txfm1d_sse4.h
new file mode 100644
index 0000000..803b86d
--- /dev/null
+++ b/vp10/common/x86/vp10_txfm1d_sse4.h
@@ -0,0 +1,145 @@
+#ifndef VP10_TXMF1D_SSE2_H_
+#define VP10_TXMF1D_SSE2_H_
+
+#include <smmintrin.h>
+#include "vp10/common/vp10_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_fdct4_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fdct8_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fdct16_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fdct32_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fdct64_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+
+void vp10_fadst4_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fadst8_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fadst16_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fadst32_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range);
+
+void vp10_idct4_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_idct8_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_idct16_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_idct32_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_idct64_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+
+void vp10_iadst4_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_iadst8_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_iadst16_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_iadst32_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range);
+
+// Transpose one 4x4 tile of 32-bit lanes. The tile's four rows live at
+// input[0], input[stride], input[2*stride], input[3*stride]; the transposed
+// tile is written to the same row positions in |output|.
+static INLINE void transpose_32_4x4(int stride, const __m128i* input,
+                                    __m128i* output) {
+  // First pass: interleave rows 0/2 and rows 1/3, e.g.
+  // temp0 = (a0, c0, a1, c1) for rows a = row 0, c = row 2.
+  __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
+  __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
+  __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
+  __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
+
+  // Second pass: interleave the intermediates to finish the transpose,
+  // e.g. output row 0 = (a0, b0, c0, d0).
+  output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+  output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+  output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+  output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+// The txfm_size x txfm_size block is viewed as a grid of 4x4 tiles; each
+// tile is 4 vertically adjacent __m128i (one vector per tile row). Each
+// transpose_32_4x4 call transposes a tile internally AND writes it to the
+// mirrored grid position -- note the swapped roles of r and c in the output
+// index -- so one pass over the grid yields the full transpose.
+static INLINE void transpose_32(int txfm_size, const __m128i* input,
+                                __m128i* output) {
+  const int num_per_128 = 4;  // 32-bit lanes per __m128i
+  const int row_size = txfm_size;
+  const int col_size = txfm_size / num_per_128;  // vectors per row
+  int r, c;
+
+  // transpose each 4x4 tile and place it at the transposed grid position
+  for (r = 0; r < row_size; r += 4) {
+    for (c = 0; c < col_size; c++) {
+      transpose_32_4x4(col_size, &input[r * col_size + c],
+                       &output[c * 4 * col_size + r / 4]);
+    }
+  }
+}
+
+// Rounded arithmetic right shift of each 32-bit lane:
+//   (vec + (1 << (bit - 1))) >> bit
+// Requires bit >= 1: bit == 0 would evaluate 1 << -1, which is undefined
+// behavior. NOTE(review): the btf_32_* macros below feed cos_bit values in
+// here directly -- presumably those are always >= 1; confirm against the
+// cos_bit tables.
+// Implemented as a GCC/Clang statement-expression (non-standard extension)
+// so the macro can "return" the shifted vector.
+#define round_shift_32_sse4_1(vec, bit)     \
+  ({                                        \
+    __m128i tmp, round;                     \
+    round = _mm_set1_epi32(1 << (bit - 1)); \
+    tmp = _mm_add_epi32(vec, round);        \
+    _mm_srai_epi32(tmp, bit);               \
+  })
+
+// Shift an array of |size| __m128i vectors from |input| to |output|:
+// bit > 0 applies the rounded right shift above; bit <= 0 applies a plain
+// left shift by -bit (callers pass negated cfg->shift[] values, so a
+// configured up-shift arrives here as a non-positive |bit|).
+#define round_shift_array_32_sse4_1(input, output, size, bit) \
+  ({                                                          \
+    if (bit > 0) {                                            \
+      int i;                                                  \
+      for (i = 0; i < size; i++) {                            \
+        output[i] = round_shift_32_sse4_1(input[i], bit);     \
+      }                                                       \
+    } else {                                                  \
+      int i;                                                  \
+      for (i = 0; i < size; i++) {                            \
+        output[i] = _mm_slli_epi32(input[i], -bit);           \
+      }                                                       \
+    }                                                         \
+  })
+
+// Butterfly rotations used by the 1-D transforms. Both variants compute,
+// with rounding by round_shift_32_sse4_1(., bit):
+//   type0: out0 = in0*w0 + in1*w1,  out1 = in0*w1 - in1*w0
+//   type1: out0 = in0*w0 + in1*w1,  out1 = in1*w0 - in0*w1
+// (type1 is type0 with the sign of out1 flipped).
+// NOTE(review): _mm_mullo_epi32 keeps only the low 32 bits of each product,
+// so every w*in product must fit in int32 -- presumably guaranteed by the
+// stage_range/cos_bit configuration; confirm.
+// out0 and out1 may not alias in0/in1: out0 is written before in0/in1 are
+// read again for out1.
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
+  ({                                                           \
+    __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;          \
+    ww0 = _mm_set1_epi32(w0);                                  \
+    ww1 = _mm_set1_epi32(w1);                                  \
+    in0_w0 = _mm_mullo_epi32(in0, ww0);                        \
+    in1_w1 = _mm_mullo_epi32(in1, ww1);                        \
+    out0 = _mm_add_epi32(in0_w0, in1_w1);                      \
+    out0 = round_shift_32_sse4_1(out0, bit);                   \
+    in0_w1 = _mm_mullo_epi32(in0, ww1);                        \
+    in1_w0 = _mm_mullo_epi32(in1, ww0);                        \
+    out1 = _mm_sub_epi32(in0_w1, in1_w0);                      \
+    out1 = round_shift_32_sse4_1(out1, bit);                   \
+  })
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
+  ({                                                           \
+    __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;          \
+    ww0 = _mm_set1_epi32(w0);                                  \
+    ww1 = _mm_set1_epi32(w1);                                  \
+    in0_w0 = _mm_mullo_epi32(in0, ww0);                        \
+    in1_w1 = _mm_mullo_epi32(in1, ww1);                        \
+    out0 = _mm_add_epi32(in0_w0, in1_w1);                      \
+    out0 = round_shift_32_sse4_1(out0, bit);                   \
+    in0_w1 = _mm_mullo_epi32(in0, ww1);                        \
+    in1_w0 = _mm_mullo_epi32(in1, ww0);                        \
+    out1 = _mm_sub_epi32(in1_w0, in0_w1);                      \
+    out1 = round_shift_32_sse4_1(out1, bit);                   \
+  })
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // VP10_TXMF1D_SSE2_H_
diff --git a/vp10/vp10_common.mk b/vp10/vp10_common.mk
index 4e89e5e..40699a3 100644
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk
@@ -110,6 +110,9 @@
 VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm_sse2.c
 VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_dct32x32_impl_sse2.h
 VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm_impl_sse2.h
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_txfm1d_sse4.h
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_fwd_txfm1d_sse4.c
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_fwd_txfm2d_sse4.c
 
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht4x4_add_neon.c