Merge "Corpus Wide VBR test implementation."
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 954975c..9f6f795 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -603,6 +603,29 @@
          UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
 }
 
+TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->hv8_[1](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                  width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve8_avg_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
 TEST_P(ConvolveTest, Copy) {
   uint8_t *const in = input();
   uint8_t *const out = output();
@@ -1177,8 +1200,8 @@
 #else   // !CONFIG_VP9_HIGHBITDEPTH
 const ConvolveFunctions convolve8_avx2(
     vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2,
-    vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_avx2,
-    vpx_convolve8_avg_vert_ssse3, vpx_convolve8_avx2, vpx_convolve8_avg_ssse3,
+    vpx_convolve8_avg_horiz_avx2, vpx_convolve8_vert_avx2,
+    vpx_convolve8_avg_vert_avx2, vpx_convolve8_avx2, vpx_convolve8_avg_avx2,
     vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
     vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
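
The DISABLED_8Tap_Avg_Speed test added above times hv8_[1], the compound ("avg") 2-D convolution, under the same conditions as the existing 8Tap_Speed test. As a rough scalar sketch of what the avg variants compute on top of the plain 8-tap filter (assuming the usual vpx_dsp FILTER_BITS == 7 rounding; clip_u8, filter8 and filter8_avg are illustrative names, not libvpx symbols):

  #include <stdint.h>

  /* Clamp an intermediate sum to the 8-bit pixel range. */
  static uint8_t clip_u8(int v) {
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }

  /* One 8-tap filtered pixel: FILTER_BITS == 7, so round with +64, shift by 7. */
  static uint8_t filter8(const uint8_t *src, const int16_t *kernel) {
    int k, sum = 0;
    for (k = 0; k < 8; ++k) sum += src[k] * kernel[k];
    return clip_u8((sum + 64) >> 7);
  }

  /* The avg variants additionally average the filtered result with the pixel
   * already in dst, rounding to nearest; this is the extra work being timed. */
  static uint8_t filter8_avg(const uint8_t *src, const int16_t *kernel,
                             uint8_t dst) {
    return (uint8_t)((dst + filter8(src, kernel) + 1) >> 1);
  }
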
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 3229b69..7b0d628 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -649,7 +649,7 @@
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
         << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
+    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30)
         << " The datarate for the file is greater than target by too much!";
   }
 }
@@ -676,7 +676,7 @@
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
         << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
+    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30)
         << " The datarate for the file is greater than target by too much!";
   }
 }
@@ -706,7 +706,7 @@
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
         << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
+    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30)
         << " The datarate for the file is greater than target by too much!";
   }
 }
diff --git a/test/quantize_test.cc b/test/quantize_test.cc
index 69da899..40bb264 100644
--- a/test/quantize_test.cc
+++ b/test/quantize_test.cc
@@ -200,4 +200,12 @@
         make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c),
         make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c)));
 #endif  // HAVE_MSA
+
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(
+    MMI, QuantizeTest,
+    ::testing::Values(
+        make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c),
+        make_tuple(&vp8_regular_quantize_b_mmi, &vp8_regular_quantize_b_c)));
+#endif  // HAVE_MMI
 }  // namespace
diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc
index 9f69ae1..b7697d8 100644
--- a/test/vp8_fdct4x4_test.cc
+++ b/test/vp8_fdct4x4_test.cc
@@ -199,4 +199,8 @@
 INSTANTIATE_TEST_CASE_P(MSA, FdctTest,
                         ::testing::Values(vp8_short_fdct4x4_msa));
 #endif  // HAVE_MSA
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(MMI, FdctTest,
+                        ::testing::Values(vp8_short_fdct4x4_mmi));
+#endif  // HAVE_MMI
 }  // namespace
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 7a04ef1..be2ac00 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -176,22 +176,22 @@
 # Forward DCT
 #
 add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 sse2 neon msa/;
+specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi/;
 
 add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 sse2 neon msa/;
+specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi/;
 
 add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_walsh4x4 sse2 neon msa/;
+specialize qw/vp8_short_walsh4x4 sse2 neon msa mmi/;
 
 #
 # Quantizer
 #
 add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa/;
+specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi/;
 
 add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa/;
+specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa mmi/;
 
 #
 # Block subtraction
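
With mmi added to these specialize lists, rtcd.pl emits prototypes for the new kernels and points the generic names at them when the target is configured with MMI support. Roughly, the generated vp8_rtcd.h for such a build takes the shape sketched below (illustrative only, not the generator's verbatim output; the exact mapping depends on the configure flags and other enabled extensions):

  /* Sketch of the generated mapping, not verbatim rtcd.pl output. */
  void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
  void vp8_short_fdct4x4_mmi(short *input, short *output, int pitch);
  #define vp8_short_fdct4x4 vp8_short_fdct4x4_mmi

  void vp8_fast_quantize_b_c(struct block *, struct blockd *);
  void vp8_fast_quantize_b_mmi(struct block *, struct blockd *);
  #define vp8_fast_quantize_b vp8_fast_quantize_b_mmi
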
diff --git a/vp8/encoder/mips/mmi/dct_mmi.c b/vp8/encoder/mips/mmi/dct_mmi.c
new file mode 100644
index 0000000..7e45a12
--- /dev/null
+++ b/vp8/encoder/mips/mmi/dct_mmi.c
@@ -0,0 +1,426 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+/* clang-format off */
+#define TRANSPOSE_4H                                         \
+  MMI_LI(%[tmp0], 0x93)                                      \
+  "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]         \n\t" \
+  "mtc1       %[tmp0],    %[ftmp10]                    \n\t" \
+  "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp0]         \n\t" \
+  "punpcklhw  %[ftmp9],   %[ftmp2],   %[ftmp0]         \n\t" \
+  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
+  "or         %[ftmp5],   %[ftmp5],   %[ftmp9]         \n\t" \
+  "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp0]         \n\t" \
+  "punpckhhw  %[ftmp9],   %[ftmp2],   %[ftmp0]         \n\t" \
+  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
+  "or         %[ftmp6],   %[ftmp6],   %[ftmp9]         \n\t" \
+  "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp0]         \n\t" \
+  "punpcklhw  %[ftmp9],   %[ftmp4],   %[ftmp0]         \n\t" \
+  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
+  "or         %[ftmp7],   %[ftmp7],   %[ftmp9]         \n\t" \
+  "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp0]         \n\t" \
+  "punpckhhw  %[ftmp9],   %[ftmp4],   %[ftmp0]         \n\t" \
+  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
+  "or         %[ftmp8],   %[ftmp8],   %[ftmp9]         \n\t" \
+  "punpcklwd  %[ftmp1],   %[ftmp5],   %[ftmp7]         \n\t" \
+  "punpckhwd  %[ftmp2],   %[ftmp5],   %[ftmp7]         \n\t" \
+  "punpcklwd  %[ftmp3],   %[ftmp6],   %[ftmp8]         \n\t" \
+  "punpckhwd  %[ftmp4],   %[ftmp6],   %[ftmp8]         \n\t"
+/* clang-format on */
+
+void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
+  int pitch_half = pitch / 2;
+  uint64_t tmp[1];
+
+#if _MIPS_SIM == _ABIO32
+  register double ftmp0 asm("$f0");
+  register double ftmp1 asm("$f2");
+  register double ftmp2 asm("$f4");
+  register double ftmp3 asm("$f6");
+  register double ftmp4 asm("$f8");
+  register double ftmp5 asm("$f10");
+  register double ftmp6 asm("$f12");
+  register double ftmp7 asm("$f14");
+  register double ftmp8 asm("$f16");
+  register double ftmp9 asm("$f18");
+  register double ftmp10 asm("$f20");
+  register double ftmp11 asm("$f22");
+  register double ftmp12 asm("$f24");
+#else
+  register double ftmp0 asm("$f0");
+  register double ftmp1 asm("$f1");
+  register double ftmp2 asm("$f2");
+  register double ftmp3 asm("$f3");
+  register double ftmp4 asm("$f4");
+  register double ftmp5 asm("$f5");
+  register double ftmp6 asm("$f6");
+  register double ftmp7 asm("$f7");
+  register double ftmp8 asm("$f8");
+  register double ftmp9 asm("$f9");
+  register double ftmp10 asm("$f10");
+  register double ftmp11 asm("$f11");
+  register double ftmp12 asm("$f12");
+#endif  // _MIPS_SIM == _ABIO32
+
+  DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL };
+
+  DECLARE_ALIGNED(16, int, a[4]);
+  DECLARE_ALIGNED(16, int, b[4]);
+  DECLARE_ALIGNED(16, int, c[4]);
+  DECLARE_ALIGNED(16, int, d[4]);
+
+  // stage1
+  a[0] = (input[0] + input[3]) * 8;
+  a[1] = (input[0 + pitch_half] + input[3 + pitch_half]) * 8;
+  a[2] = (input[0 + 2 * pitch_half] + input[3 + 2 * pitch_half]) * 8;
+  a[3] = (input[0 + 3 * pitch_half] + input[3 + 3 * pitch_half]) * 8;
+
+  b[0] = (input[1] + input[2]) * 8;
+  b[1] = (input[1 + pitch_half] + input[2 + pitch_half]) * 8;
+  b[2] = (input[1 + 2 * pitch_half] + input[2 + 2 * pitch_half]) * 8;
+  b[3] = (input[1 + 3 * pitch_half] + input[2 + 3 * pitch_half]) * 8;
+
+  c[0] = (input[1] - input[2]) * 8;
+  c[1] = (input[1 + pitch_half] - input[2 + pitch_half]) * 8;
+  c[2] = (input[1 + 2 * pitch_half] - input[2 + 2 * pitch_half]) * 8;
+  c[3] = (input[1 + 3 * pitch_half] - input[2 + 3 * pitch_half]) * 8;
+
+  d[0] = (input[0] - input[3]) * 8;
+  d[1] = (input[0 + pitch_half] - input[3 + pitch_half]) * 8;
+  d[2] = (input[0 + 2 * pitch_half] - input[3 + 2 * pitch_half]) * 8;
+  d[3] = (input[0 + 3 * pitch_half] - input[3 + 3 * pitch_half]) * 8;
+
+  __asm__ volatile (
+    "gslqc1     %[ftmp2],   %[ftmp1],       0x00(%[a])      \n\t"
+    "gslqc1     %[ftmp4],   %[ftmp3],       0x00(%[b])      \n\t"
+    "gslqc1     %[ftmp6],   %[ftmp5],       0x00(%[c])      \n\t"
+    "gslqc1     %[ftmp8],   %[ftmp7],       0x00(%[d])      \n\t"
+
+    "paddw      %[ftmp9],   %[ftmp1],       %[ftmp3]        \n\t"
+    "paddw      %[ftmp10],  %[ftmp2],       %[ftmp4]        \n\t"
+    "psubw      %[ftmp11],  %[ftmp1],       %[ftmp3]        \n\t"
+    "psubw      %[ftmp12],  %[ftmp2],       %[ftmp4]        \n\t"
+    "packsswh   %[ftmp1],   %[ftmp9],       %[ftmp10]       \n\t"
+    "packsswh   %[ftmp3],   %[ftmp11],      %[ftmp12]       \n\t"
+    "packsswh   %[ftmp2],   %[ftmp5],       %[ftmp6]        \n\t"
+    "packsswh   %[ftmp4],   %[ftmp7],       %[ftmp8]        \n\t"
+    MMI_LI(%[tmp0], 0x0c)
+    "mov.d      %[ftmp7],   %[ftmp2]                        \n\t"
+    "mov.d      %[ftmp8],   %[ftmp4]                        \n\t"
+    "mtc1       %[tmp0],    %[ftmp11]                       \n\t"
+
+    "ldc1       %[ftmp12],  %[ff_pw_14500]                  \n\t"
+    "punpcklhw  %[ftmp9],   %[ftmp7],       %[ftmp8]        \n\t"
+    "pmaddhw    %[ftmp5],   %[ftmp9],       %[ff_ph_op1]    \n\t"
+    "punpckhhw  %[ftmp9],   %[ftmp7],       %[ftmp8]        \n\t"
+    "pmaddhw    %[ftmp6],   %[ftmp9],       %[ff_ph_op1]    \n\t"
+    "paddw      %[ftmp5],   %[ftmp5],       %[ftmp12]       \n\t"
+    "paddw      %[ftmp6],   %[ftmp6],       %[ftmp12]       \n\t"
+    "psraw      %[ftmp5],   %[ftmp5],       %[ftmp11]       \n\t"
+    "psraw      %[ftmp6],   %[ftmp6],       %[ftmp11]       \n\t"
+    "packsswh   %[ftmp2],   %[ftmp5],       %[ftmp6]        \n\t"
+
+    "ldc1       %[ftmp12],  %[ff_pw_7500]                   \n\t"
+    "punpcklhw  %[ftmp9],   %[ftmp8],       %[ftmp7]        \n\t"
+    "pmaddhw    %[ftmp5],   %[ftmp9],       %[ff_ph_op3]    \n\t"
+    "punpckhhw  %[ftmp9],   %[ftmp8],       %[ftmp7]        \n\t"
+    "pmaddhw    %[ftmp6],   %[ftmp9],       %[ff_ph_op3]    \n\t"
+    "paddw      %[ftmp5],   %[ftmp5],       %[ftmp12]       \n\t"
+    "paddw      %[ftmp6],   %[ftmp6],       %[ftmp12]       \n\t"
+    "psraw      %[ftmp5],   %[ftmp5],       %[ftmp11]       \n\t"
+    "psraw      %[ftmp6],   %[ftmp6],       %[ftmp11]       \n\t"
+    "packsswh   %[ftmp4],   %[ftmp5],       %[ftmp6]        \n\t"
+    TRANSPOSE_4H
+
+    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
+    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp4]        \n\t"
+    "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]        \n\t"
+    "psubh      %[ftmp7],   %[ftmp2],       %[ftmp3]        \n\t"
+    "psubh      %[ftmp8],   %[ftmp1],       %[ftmp4]        \n\t"
+
+    "pcmpeqh    %[ftmp0],   %[ftmp8],       %[ftmp0]        \n\t"
+    "ldc1       %[ftmp9],   %[ff_ph_01]                     \n\t"
+    "paddh      %[ftmp0],   %[ftmp0],       %[ftmp9]        \n\t"
+
+    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]        \n\t"
+    "psubh      %[ftmp2],   %[ftmp5],       %[ftmp6]        \n\t"
+    "ldc1       %[ftmp9],   %[ff_ph_07]                     \n\t"
+    MMI_LI(%[tmp0], 0x04)
+    "paddh      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
+    "paddh      %[ftmp2],   %[ftmp2],       %[ftmp9]        \n\t"
+    "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+    "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
+    "psrah      %[ftmp2],   %[ftmp2],       %[ftmp9]        \n\t"
+
+    MMI_LI(%[tmp0], 0x10)
+    "ldc1       %[ftmp12],  %[ff_pw_12000]                  \n\t"
+    "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+
+    "punpcklhw  %[ftmp5],   %[ftmp7],       %[ftmp8]        \n\t"
+    "pmaddhw    %[ftmp10],  %[ftmp5],       %[ff_ph_op1]    \n\t"
+    "punpckhhw  %[ftmp5],   %[ftmp7],       %[ftmp8]        \n\t"
+    "pmaddhw    %[ftmp11],  %[ftmp5],       %[ff_ph_op1]    \n\t"
+    "paddw      %[ftmp10],  %[ftmp10],      %[ftmp12]       \n\t"
+    "paddw      %[ftmp11],  %[ftmp11],      %[ftmp12]       \n\t"
+    "psraw      %[ftmp10],  %[ftmp10],      %[ftmp9]        \n\t"
+    "psraw      %[ftmp11],  %[ftmp11],      %[ftmp9]        \n\t"
+    "packsswh   %[ftmp3],   %[ftmp10],      %[ftmp11]       \n\t"
+    "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]        \n\t"
+
+    "ldc1       %[ftmp12],  %[ff_pw_51000]                  \n\t"
+    "punpcklhw  %[ftmp5],   %[ftmp8],       %[ftmp7]        \n\t"
+    "pmaddhw    %[ftmp10],  %[ftmp5],       %[ff_ph_op3]    \n\t"
+    "punpckhhw  %[ftmp5],   %[ftmp8],       %[ftmp7]        \n\t"
+    "pmaddhw    %[ftmp11],  %[ftmp5],       %[ff_ph_op3]    \n\t"
+    "paddw      %[ftmp10],  %[ftmp10],      %[ftmp12]       \n\t"
+    "paddw      %[ftmp11],  %[ftmp11],      %[ftmp12]       \n\t"
+    "psraw      %[ftmp10],  %[ftmp10],      %[ftmp9]        \n\t"
+    "psraw      %[ftmp11],  %[ftmp11],      %[ftmp9]        \n\t"
+    "packsswh   %[ftmp4],   %[ftmp10],      %[ftmp11]       \n\t"
+
+    : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2),
+      [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5),
+      [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8),
+      [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11),
+      [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0])
+    : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07), [a] "r"(a),
+      [b] "r"(b), [c] "r"(c), [d] "r"(d), [ff_ph_op1] "f"(ff_ph_op1),
+      [ff_ph_op3] "f"(ff_ph_op3), [ff_pw_14500] "m"(ff_pw_14500),
+      [ff_pw_7500] "m"(ff_pw_7500), [ff_pw_12000] "m"(ff_pw_12000),
+      [ff_pw_51000] "m"(ff_pw_51000)
+  );
+
+  __asm__ volatile(
+      "gssdlc1    %[ftmp1],   0x07(%[output])                 \n\t"
+      "gssdrc1    %[ftmp1],   0x00(%[output])                 \n\t"
+      "gssdlc1    %[ftmp3],   0x0f(%[output])                 \n\t"
+      "gssdrc1    %[ftmp3],   0x08(%[output])                 \n\t"
+      "gssdlc1    %[ftmp2],   0x17(%[output])                 \n\t"
+      "gssdrc1    %[ftmp2],   0x10(%[output])                 \n\t"
+      "gssdlc1    %[ftmp4],   0x1f(%[output])                 \n\t"
+      "gssdrc1    %[ftmp4],   0x18(%[output])                 \n\t"
+      :
+      : [ftmp1] "f"(ftmp1), [ftmp2] "f"(ftmp2), [ftmp3] "f"(ftmp3),
+        [ftmp4] "f"(ftmp4), [output] "r"(output)
+      : "memory");
+}
+
+void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
+  vp8_short_fdct4x4_mmi(input, output, pitch);
+  vp8_short_fdct4x4_mmi(input + 4, output + 16, pitch);
+}
+
+void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
+  double ftmp[13];
+  uint32_t tmp[1];
+  DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL };
+
+  __asm__ volatile (
+    MMI_LI(%[tmp0], 0x02)
+    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+
+    "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
+    "gsldrc1    %[ftmp1],   0x00(%[ip])                         \n\t"
+    MMI_ADDU(%[ip], %[ip], %[pitch])
+    "gsldlc1    %[ftmp2],   0x07(%[ip])                         \n\t"
+    "gsldrc1    %[ftmp2],   0x00(%[ip])                         \n\t"
+    MMI_ADDU(%[ip], %[ip], %[pitch])
+    "gsldlc1    %[ftmp3],   0x07(%[ip])                         \n\t"
+    "gsldrc1    %[ftmp3],   0x00(%[ip])                         \n\t"
+    MMI_ADDU(%[ip], %[ip], %[pitch])
+    "gsldlc1    %[ftmp4],   0x07(%[ip])                         \n\t"
+    "gsldrc1    %[ftmp4],   0x00(%[ip])                         \n\t"
+    TRANSPOSE_4H
+
+    "psllh      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
+    "psllh      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
+    "psllh      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
+    "psllh      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
+    // a
+    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
+    // d
+    "paddh      %[ftmp6],   %[ftmp2],       %[ftmp4]            \n\t"
+    // c
+    "psubh      %[ftmp7],   %[ftmp2],       %[ftmp4]            \n\t"
+    // b
+    "psubh      %[ftmp8],   %[ftmp1],       %[ftmp3]            \n\t"
+
+    // a + d
+    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"
+    // b + c
+    "paddh      %[ftmp2],   %[ftmp8],       %[ftmp7]            \n\t"
+    // b - c
+    "psubh      %[ftmp3],   %[ftmp8],       %[ftmp7]            \n\t"
+    // a - d
+    "psubh      %[ftmp4],   %[ftmp5],       %[ftmp6]            \n\t"
+
+    "pcmpeqh    %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+    "paddh      %[ftmp6],   %[ftmp6],       %[ff_ph_01]           \n\t"
+    "paddh      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+    TRANSPOSE_4H
+
+    // op[2], op[0]
+    "pmaddhw    %[ftmp5],   %[ftmp1],       %[ff_pw_01]         \n\t"
+    // op[3], op[1]
+    "pmaddhw    %[ftmp1],   %[ftmp1],       %[ff_pw_mask]       \n\t"
+
+    // op[6], op[4]
+    "pmaddhw    %[ftmp6],   %[ftmp2],       %[ff_pw_01]         \n\t"
+    // op[7], op[5]
+    "pmaddhw    %[ftmp2],   %[ftmp2],       %[ff_pw_mask]       \n\t"
+
+    // op[10], op[8]
+    "pmaddhw    %[ftmp7],   %[ftmp3],       %[ff_pw_01]         \n\t"
+    // op[11], op[9]
+    "pmaddhw    %[ftmp3],   %[ftmp3],       %[ff_pw_mask]       \n\t"
+
+    // op[14], op[12]
+    "pmaddhw    %[ftmp8],   %[ftmp4],       %[ff_pw_01]         \n\t"
+    // op[15], op[13]
+    "pmaddhw    %[ftmp4],   %[ftmp4],       %[ff_pw_mask]       \n\t"
+
+    // a1, a3
+    "paddw      %[ftmp9],   %[ftmp5],       %[ftmp7]            \n\t"
+    // d1, d3
+    "paddw      %[ftmp10],  %[ftmp6],       %[ftmp8]            \n\t"
+    // c1, c3
+    "psubw      %[ftmp11],  %[ftmp6],       %[ftmp8]            \n\t"
+    // b1, b3
+    "psubw      %[ftmp12],  %[ftmp5],       %[ftmp7]            \n\t"
+
+    // a1 + d1, a3 + d3
+    "paddw      %[ftmp5],   %[ftmp9],       %[ftmp10]           \n\t"
+    // b1 + c1, b3 + c3
+    "paddw      %[ftmp6],   %[ftmp12],      %[ftmp11]           \n\t"
+    // b1 - c1, b3 - c3
+    "psubw      %[ftmp7],   %[ftmp12],      %[ftmp11]           \n\t"
+    // a1 - d1, a3 - d3
+    "psubw      %[ftmp8],   %[ftmp9],       %[ftmp10]           \n\t"
+
+    // a2, a4
+    "paddw      %[ftmp9],   %[ftmp1],       %[ftmp3]            \n\t"
+    // d2, d4
+    "paddw      %[ftmp10],  %[ftmp2],       %[ftmp4]            \n\t"
+    // c2, c4
+    "psubw      %[ftmp11],  %[ftmp2],       %[ftmp4]            \n\t"
+    // b2, b4
+    "psubw      %[ftmp12],  %[ftmp1],       %[ftmp3]            \n\t"
+
+    // a2 + d2, a4 + d4
+    "paddw      %[ftmp1],   %[ftmp9],       %[ftmp10]           \n\t"
+    // b2 + c2, b4 + c4
+    "paddw      %[ftmp2],   %[ftmp12],      %[ftmp11]           \n\t"
+    // b2 - c2, b4 - c4
+    "psubw      %[ftmp3],   %[ftmp12],      %[ftmp11]           \n\t"
+    // a2 - d2, a4 - d4
+    "psubw      %[ftmp4],   %[ftmp9],       %[ftmp10]           \n\t"
+
+    MMI_LI(%[tmp0], 0x03)
+    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+
+    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp1]            \n\t"
+    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "paddw      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
+    "paddw      %[ftmp1],   %[ftmp1],       %[ff_pw_03]         \n\t"
+    "psraw      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
+
+    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp2]            \n\t"
+    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "paddw      %[ftmp2],   %[ftmp2],       %[ftmp9]            \n\t"
+    "paddw      %[ftmp2],   %[ftmp2],       %[ff_pw_03]         \n\t"
+    "psraw      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
+
+    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp3]            \n\t"
+    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "paddw      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
+    "paddw      %[ftmp3],   %[ftmp3],       %[ff_pw_03]         \n\t"
+    "psraw      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
+
+    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp4]            \n\t"
+    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "paddw      %[ftmp4],   %[ftmp4],       %[ftmp9]            \n\t"
+    "paddw      %[ftmp4],   %[ftmp4],       %[ff_pw_03]         \n\t"
+    "psraw      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
+
+    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp5]            \n\t"
+    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "paddw      %[ftmp5],   %[ftmp5],       %[ftmp9]            \n\t"
+    "paddw      %[ftmp5],   %[ftmp5],       %[ff_pw_03]         \n\t"
+    "psraw      %[ftmp5],   %[ftmp5],       %[ftmp11]           \n\t"
+
+    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp6]            \n\t"
+    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "paddw      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+    "paddw      %[ftmp6],   %[ftmp6],       %[ff_pw_03]         \n\t"
+    "psraw      %[ftmp6],   %[ftmp6],       %[ftmp11]           \n\t"
+
+    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp7]            \n\t"
+    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "paddw      %[ftmp7],   %[ftmp7],       %[ftmp9]            \n\t"
+    "paddw      %[ftmp7],   %[ftmp7],       %[ff_pw_03]         \n\t"
+    "psraw      %[ftmp7],   %[ftmp7],       %[ftmp11]           \n\t"
+
+    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp8]            \n\t"
+    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "paddw      %[ftmp8],   %[ftmp8],       %[ftmp9]            \n\t"
+    "paddw      %[ftmp8],   %[ftmp8],       %[ff_pw_03]         \n\t"
+    "psraw      %[ftmp8],   %[ftmp8],       %[ftmp11]           \n\t"
+
+    "packsswh   %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+    "packsswh   %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
+    "packsswh   %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+    "packsswh   %[ftmp4],   %[ftmp4],       %[ftmp8]            \n\t"
+
+    MMI_LI(%[tmp0], 0x72)
+    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    "pshufh     %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
+    "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
+    "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
+    "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
+
+    "gssdlc1    %[ftmp1],   0x07(%[op])                         \n\t"
+    "gssdrc1    %[ftmp1],   0x00(%[op])                         \n\t"
+    "gssdlc1    %[ftmp2],   0x0f(%[op])                         \n\t"
+    "gssdrc1    %[ftmp2],   0x08(%[op])                         \n\t"
+    "gssdlc1    %[ftmp3],   0x17(%[op])                         \n\t"
+    "gssdrc1    %[ftmp3],   0x10(%[op])                         \n\t"
+    "gssdlc1    %[ftmp4],   0x1f(%[op])                         \n\t"
+    "gssdrc1    %[ftmp4],   0x18(%[op])                         \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+      [ftmp12]"=&f"(ftmp[12]),
+      [tmp0]"=&r"(tmp[0]),
+      [ip]"+&r"(input)
+    : [op]"r"(output),
+      [ff_pw_01]"f"(ff_pw_01),          [pitch]"r"((mips_reg)pitch),
+      [ff_pw_03]"f"(ff_pw_03),          [ff_pw_mask]"f"(ff_pw_mask),
+      [ff_ph_01]"f"(ff_ph_01)
+    : "memory"
+  );
+}
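
Reading the constants back out of the code above (the 2217/5352 multipliers packed into ff_ph_op1/ff_ph_op3, the 14500/7500 and 12000/51000 rounders, and the 12-, 4- and 16-bit shifts), the row and column passes implement the usual VP8 4x4 forward DCT. A scalar sketch of the same arithmetic, for reference when reviewing the vector code:

  #include <stdint.h>

  /* Scalar sketch of the transform vectorized above; pitch is in bytes,
   * matching the pitch_half = pitch / 2 step used by the MMI version. */
  static void fdct4x4_scalar_sketch(const int16_t *input, int16_t *output,
                                    int pitch) {
    const int stride = pitch / 2;
    int i;
    for (i = 0; i < 4; ++i) { /* row pass */
      const int16_t *ip = input + i * stride;
      int16_t *op = output + i * 4;
      const int a1 = (ip[0] + ip[3]) * 8;
      const int b1 = (ip[1] + ip[2]) * 8;
      const int c1 = (ip[1] - ip[2]) * 8;
      const int d1 = (ip[0] - ip[3]) * 8;
      op[0] = (int16_t)(a1 + b1);
      op[2] = (int16_t)(a1 - b1);
      op[1] = (int16_t)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
      op[3] = (int16_t)((d1 * 2217 - c1 * 5352 + 7500) >> 12);
    }
    for (i = 0; i < 4; ++i) { /* column pass, in place on the row results */
      int16_t *op = output + i;
      const int a1 = op[0] + op[12];
      const int b1 = op[4] + op[8];
      const int c1 = op[4] - op[8];
      const int d1 = op[0] - op[12];
      op[0] = (int16_t)((a1 + b1 + 7) >> 4);
      op[8] = (int16_t)((a1 - b1 + 7) >> 4);
      op[4] = (int16_t)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
      op[12] = (int16_t)((d1 * 2217 - c1 * 5352 + 51000) >> 16);
    }
  }
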
diff --git a/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
new file mode 100644
index 0000000..22b12bb
--- /dev/null
+++ b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
@@ -0,0 +1,237 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/quant_common.h"
+
+#define REGULAR_SELECT_EOB(i, rc)                                        \
+  z = coeff_ptr[rc];                                                     \
+  sz = (z >> 31);                                                        \
+  x = (z ^ sz) - sz;                                                     \
+  if (x >= (zbin_ptr[rc] + *(zbin_boost_ptr++) + zbin_oq_value)) {       \
+    x += round_ptr[rc];                                                  \
+    y = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16; \
+    x = (y ^ sz) - sz;                                                   \
+    qcoeff_ptr[rc] = x;                                                  \
+    dqcoeff_ptr[rc] = x * dequant_ptr[rc];                               \
+    if (y) {                                                             \
+      eob = i;                                                           \
+      zbin_boost_ptr = b->zrun_zbin_boost;                               \
+    }                                                                    \
+  }
+
+void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
+  const int16_t *coeff_ptr = b->coeff;
+  const int16_t *round_ptr = b->round;
+  const int16_t *quant_ptr = b->quant_fast;
+  int16_t *qcoeff_ptr = d->qcoeff;
+  int16_t *dqcoeff_ptr = d->dqcoeff;
+  const int16_t *dequant_ptr = d->dequant;
+  const int16_t *inv_zig_zag = vp8_default_inv_zig_zag;
+
+  double ftmp[13];
+  uint64_t tmp[1];
+  DECLARE_ALIGNED(8, const uint64_t, ones) = { 0xffffffffffffffffULL };
+  int eob = 0;
+
+  __asm__ volatile(
+      // loop 0 ~ 7
+      "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
+      "gsldlc1    %[ftmp1],   0x07(%[coeff_ptr])              \n\t"
+      "gsldrc1    %[ftmp1],   0x00(%[coeff_ptr])              \n\t"
+      "li         %[tmp0],    0x0f                            \n\t"
+      "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+      "gsldlc1    %[ftmp2],   0x0f(%[coeff_ptr])              \n\t"
+      "gsldrc1    %[ftmp2],   0x08(%[coeff_ptr])              \n\t"
+
+      "psrah      %[ftmp3],   %[ftmp1],       %[ftmp9]        \n\t"
+      "xor        %[ftmp1],   %[ftmp3],       %[ftmp1]        \n\t"
+      "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
+      "psrah      %[ftmp4],   %[ftmp2],       %[ftmp9]        \n\t"
+      "xor        %[ftmp2],   %[ftmp4],       %[ftmp2]        \n\t"
+      "psubh      %[ftmp2],   %[ftmp2],       %[ftmp4]        \n\t"
+
+      "gsldlc1    %[ftmp5],   0x07(%[round_ptr])              \n\t"
+      "gsldrc1    %[ftmp5],   0x00(%[round_ptr])              \n\t"
+      "gsldlc1    %[ftmp6],   0x0f(%[round_ptr])              \n\t"
+      "gsldrc1    %[ftmp6],   0x08(%[round_ptr])              \n\t"
+      "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]        \n\t"
+      "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]        \n\t"
+      "gsldlc1    %[ftmp7],   0x07(%[quant_ptr])              \n\t"
+      "gsldrc1    %[ftmp7],   0x00(%[quant_ptr])              \n\t"
+      "gsldlc1    %[ftmp8],   0x0f(%[quant_ptr])              \n\t"
+      "gsldrc1    %[ftmp8],   0x08(%[quant_ptr])              \n\t"
+      "pmulhuh    %[ftmp5],   %[ftmp5],       %[ftmp7]        \n\t"
+      "pmulhuh    %[ftmp6],   %[ftmp6],       %[ftmp8]        \n\t"
+
+      "xor        %[ftmp7],   %[ftmp5],       %[ftmp3]        \n\t"
+      "xor        %[ftmp8],   %[ftmp6],       %[ftmp4]        \n\t"
+      "psubh      %[ftmp7],   %[ftmp7],       %[ftmp3]        \n\t"
+      "psubh      %[ftmp8],   %[ftmp8],       %[ftmp4]        \n\t"
+      "gssdlc1    %[ftmp7],   0x07(%[qcoeff_ptr])             \n\t"
+      "gssdrc1    %[ftmp7],   0x00(%[qcoeff_ptr])             \n\t"
+      "gssdlc1    %[ftmp8],   0x0f(%[qcoeff_ptr])             \n\t"
+      "gssdrc1    %[ftmp8],   0x08(%[qcoeff_ptr])             \n\t"
+
+      "gsldlc1    %[ftmp1],   0x07(%[inv_zig_zag])            \n\t"
+      "gsldrc1    %[ftmp1],   0x00(%[inv_zig_zag])            \n\t"
+      "gsldlc1    %[ftmp2],   0x0f(%[inv_zig_zag])            \n\t"
+      "gsldrc1    %[ftmp2],   0x08(%[inv_zig_zag])            \n\t"
+      "pcmpeqh    %[ftmp5],   %[ftmp5],       %[ftmp0]        \n\t"
+      "pcmpeqh    %[ftmp6],   %[ftmp6],       %[ftmp0]        \n\t"
+      "xor        %[ftmp5],   %[ftmp5],       %[ones]         \n\t"
+      "xor        %[ftmp6],   %[ftmp6],       %[ones]         \n\t"
+      "and        %[ftmp5],   %[ftmp5],       %[ftmp1]        \n\t"
+      "and        %[ftmp6],   %[ftmp6],       %[ftmp2]        \n\t"
+      "pmaxsh     %[ftmp10],  %[ftmp5],       %[ftmp6]        \n\t"
+
+      "gsldlc1    %[ftmp5],   0x07(%[dequant_ptr])            \n\t"
+      "gsldrc1    %[ftmp5],   0x00(%[dequant_ptr])            \n\t"
+      "gsldlc1    %[ftmp6],   0x0f(%[dequant_ptr])            \n\t"
+      "gsldrc1    %[ftmp6],   0x08(%[dequant_ptr])            \n\t"
+      "pmullh     %[ftmp5],   %[ftmp5],       %[ftmp7]        \n\t"
+      "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp8]        \n\t"
+      "gssdlc1    %[ftmp5],   0x07(%[dqcoeff_ptr])            \n\t"
+      "gssdrc1    %[ftmp5],   0x00(%[dqcoeff_ptr])            \n\t"
+      "gssdlc1    %[ftmp6],   0x0f(%[dqcoeff_ptr])            \n\t"
+      "gssdrc1    %[ftmp6],   0x08(%[dqcoeff_ptr])            \n\t"
+
+      // loop 8 ~ 15
+      "gsldlc1    %[ftmp1],   0x17(%[coeff_ptr])              \n\t"
+      "gsldrc1    %[ftmp1],   0x10(%[coeff_ptr])              \n\t"
+      "gsldlc1    %[ftmp2],   0x1f(%[coeff_ptr])              \n\t"
+      "gsldrc1    %[ftmp2],   0x18(%[coeff_ptr])              \n\t"
+
+      "psrah      %[ftmp3],   %[ftmp1],       %[ftmp9]        \n\t"
+      "xor        %[ftmp1],   %[ftmp3],       %[ftmp1]        \n\t"
+      "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
+      "psrah      %[ftmp4],   %[ftmp2],       %[ftmp9]        \n\t"
+      "xor        %[ftmp2],   %[ftmp4],       %[ftmp2]        \n\t"
+      "psubh      %[ftmp2],   %[ftmp2],       %[ftmp4]        \n\t"
+
+      "gsldlc1    %[ftmp5],   0x17(%[round_ptr])              \n\t"
+      "gsldrc1    %[ftmp5],   0x10(%[round_ptr])              \n\t"
+      "gsldlc1    %[ftmp6],   0x1f(%[round_ptr])              \n\t"
+      "gsldrc1    %[ftmp6],   0x18(%[round_ptr])              \n\t"
+      "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]        \n\t"
+      "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]        \n\t"
+      "gsldlc1    %[ftmp7],   0x17(%[quant_ptr])              \n\t"
+      "gsldrc1    %[ftmp7],   0x10(%[quant_ptr])              \n\t"
+      "gsldlc1    %[ftmp8],   0x1f(%[quant_ptr])              \n\t"
+      "gsldrc1    %[ftmp8],   0x18(%[quant_ptr])              \n\t"
+      "pmulhuh    %[ftmp5],   %[ftmp5],       %[ftmp7]        \n\t"
+      "pmulhuh    %[ftmp6],   %[ftmp6],       %[ftmp8]        \n\t"
+
+      "xor        %[ftmp7],   %[ftmp5],       %[ftmp3]        \n\t"
+      "xor        %[ftmp8],   %[ftmp6],       %[ftmp4]        \n\t"
+      "psubh      %[ftmp7],   %[ftmp7],       %[ftmp3]        \n\t"
+      "psubh      %[ftmp8],   %[ftmp8],       %[ftmp4]        \n\t"
+      "gssdlc1    %[ftmp7],   0x17(%[qcoeff_ptr])             \n\t"
+      "gssdrc1    %[ftmp7],   0x10(%[qcoeff_ptr])             \n\t"
+      "gssdlc1    %[ftmp8],   0x1f(%[qcoeff_ptr])             \n\t"
+      "gssdrc1    %[ftmp8],   0x18(%[qcoeff_ptr])             \n\t"
+
+      "gsldlc1    %[ftmp1],   0x17(%[inv_zig_zag])            \n\t"
+      "gsldrc1    %[ftmp1],   0x10(%[inv_zig_zag])            \n\t"
+      "gsldlc1    %[ftmp2],   0x1f(%[inv_zig_zag])            \n\t"
+      "gsldrc1    %[ftmp2],   0x18(%[inv_zig_zag])            \n\t"
+      "pcmpeqh    %[ftmp5],   %[ftmp5],       %[ftmp0]        \n\t"
+      "pcmpeqh    %[ftmp6],   %[ftmp6],       %[ftmp0]        \n\t"
+      "xor        %[ftmp5],   %[ftmp5],       %[ones]         \n\t"
+      "xor        %[ftmp6],   %[ftmp6],       %[ones]         \n\t"
+      "and        %[ftmp5],   %[ftmp5],       %[ftmp1]        \n\t"
+      "and        %[ftmp6],   %[ftmp6],       %[ftmp2]        \n\t"
+      "pmaxsh     %[ftmp11],  %[ftmp5],       %[ftmp6]        \n\t"
+
+      "gsldlc1    %[ftmp5],   0x17(%[dequant_ptr])            \n\t"
+      "gsldrc1    %[ftmp5],   0x10(%[dequant_ptr])            \n\t"
+      "gsldlc1    %[ftmp6],   0x1f(%[dequant_ptr])            \n\t"
+      "gsldrc1    %[ftmp6],   0x18(%[dequant_ptr])            \n\t"
+      "pmullh     %[ftmp5],   %[ftmp5],       %[ftmp7]        \n\t"
+      "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp8]        \n\t"
+      "gssdlc1    %[ftmp5],   0x17(%[dqcoeff_ptr])            \n\t"
+      "gssdrc1    %[ftmp5],   0x10(%[dqcoeff_ptr])            \n\t"
+      "gssdlc1    %[ftmp6],   0x1f(%[dqcoeff_ptr])            \n\t"
+      "gssdrc1    %[ftmp6],   0x18(%[dqcoeff_ptr])            \n\t"
+
+      "li         %[tmp0],    0x10                            \n\t"
+      "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+
+      "pmaxsh     %[ftmp10],  %[ftmp10],       %[ftmp11]      \n\t"
+      "psrlw      %[ftmp11],  %[ftmp10],       %[ftmp9]       \n\t"
+      "pmaxsh     %[ftmp10],  %[ftmp10],       %[ftmp11]      \n\t"
+      "li         %[tmp0],    0xaa                            \n\t"
+      "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+      "pshufh     %[ftmp11],  %[ftmp10],       %[ftmp9]       \n\t"
+      "pmaxsh     %[ftmp10],  %[ftmp10],       %[ftmp11]      \n\t"
+      "li         %[tmp0],    0xffff                          \n\t"
+      "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+      "and        %[ftmp10],  %[ftmp10],       %[ftmp9]       \n\t"
+      "gssdlc1    %[ftmp10],  0x07(%[eob])                    \n\t"
+      "gssdrc1    %[ftmp10],  0x00(%[eob])                    \n\t"
+      : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+        [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+        [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
+        [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
+        [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+      : [coeff_ptr] "r"((mips_reg)coeff_ptr),
+        [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr),
+        [dequant_ptr] "r"((mips_reg)dequant_ptr),
+        [round_ptr] "r"((mips_reg)round_ptr),
+        [quant_ptr] "r"((mips_reg)quant_ptr),
+        [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr),
+        [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob),
+        [ones] "f"(ones)
+      : "memory");
+
+  *d->eob = eob;
+}
+
+void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
+  int eob;
+  int x, y, z, sz;
+  const int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
+  const int16_t *coeff_ptr = b->coeff;
+  const int16_t *zbin_ptr = b->zbin;
+  const int16_t *round_ptr = b->round;
+  const int16_t *quant_ptr = b->quant;
+  const int16_t *quant_shift_ptr = b->quant_shift;
+  int16_t *qcoeff_ptr = d->qcoeff;
+  int16_t *dqcoeff_ptr = d->dqcoeff;
+  const int16_t *dequant_ptr = d->dequant;
+  const int16_t zbin_oq_value = b->zbin_extra;
+
+  memset(qcoeff_ptr, 0, 32);
+  memset(dqcoeff_ptr, 0, 32);
+
+  eob = -1;
+
+  REGULAR_SELECT_EOB(0, 0);
+  REGULAR_SELECT_EOB(1, 1);
+  REGULAR_SELECT_EOB(2, 4);
+  REGULAR_SELECT_EOB(3, 8);
+  REGULAR_SELECT_EOB(4, 5);
+  REGULAR_SELECT_EOB(5, 2);
+  REGULAR_SELECT_EOB(6, 3);
+  REGULAR_SELECT_EOB(7, 6);
+  REGULAR_SELECT_EOB(8, 9);
+  REGULAR_SELECT_EOB(9, 12);
+  REGULAR_SELECT_EOB(10, 13);
+  REGULAR_SELECT_EOB(11, 10);
+  REGULAR_SELECT_EOB(12, 7);
+  REGULAR_SELECT_EOB(13, 11);
+  REGULAR_SELECT_EOB(14, 14);
+  REGULAR_SELECT_EOB(15, 15);
+
+  *d->eob = (char)(eob + 1);
+}
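
A scalar sketch of the fast path above: take the absolute value, add the rounding term, multiply by the Q16 quantizer, restore the sign, dequantize, and derive the end-of-block as a max over the (1-based) inverse zig-zag positions of the nonzero levels, which is what the pmaxsh sequence implements. Names and layout here are illustrative, not the library's reference implementation:

  #include <stdint.h>

  /* Scalar sketch of vp8_fast_quantize_b_mmi's data flow (illustrative). */
  static int fast_quantize_sketch(const int16_t coeff[16],
                                  const int16_t round[16],
                                  const int16_t quant[16], /* Q16 factors */
                                  const int16_t dequant[16],
                                  const int16_t inv_zig_zag[16], /* 1-based */
                                  int16_t qcoeff[16], int16_t dqcoeff[16]) {
    int i, eob = 0;
    for (i = 0; i < 16; ++i) {
      const int z = coeff[i];
      const int sz = z >> 31;       /* 0 for positive, -1 for negative */
      const int x = (z ^ sz) - sz;  /* abs(z) */
      const int y = ((x + round[i]) * quant[i]) >> 16;
      const int q = (y ^ sz) - sz;  /* re-apply the sign */
      qcoeff[i] = (int16_t)q;
      dqcoeff[i] = (int16_t)(q * dequant[i]);
      /* eob = highest zig-zag position holding a nonzero level. */
      if (y && inv_zig_zag[i] > eob) eob = inv_zig_zag[i];
    }
    return eob; /* already "last nonzero + 1" if the table is 1-based */
  }

The regular quantizer above stays scalar (REGULAR_SELECT_EOB) because the running zero-bin boost resets whenever a nonzero coefficient is emitted, so each decision depends on the previous hit and does not vectorize as cleanly.
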
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 9111a22..0dac016 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -110,6 +110,9 @@
 VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c
 VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
 
+VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/vp8_quantize_mmi.c
+VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/dct_mmi.c
+
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/denoising_msa.c
 endif
diff --git a/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
index ff4de3e..e46f789 100644
--- a/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
+++ b/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <arm_neon.h>
+
 #include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
@@ -21,11 +23,11 @@
 // exceed the right and bottom boundaries of the destination frame. We rely on
 // the following frame extension function to fix these rows and columns.
 
-static INLINE void scale_plane_2_to_1_phase_0_neon(const uint8_t *src,
-                                                   const int src_stride,
-                                                   uint8_t *dst,
-                                                   const int dst_stride,
-                                                   const int w, const int h) {
+static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src,
+                                              const int src_stride,
+                                              uint8_t *dst,
+                                              const int dst_stride, const int w,
+                                              const int h) {
   const int max_width = (w + 15) & ~15;
   int y = h;
 
@@ -45,11 +47,11 @@
   } while (--y);
 }
 
-static INLINE void scale_plane_4_to_1_phase_0_neon(const uint8_t *src,
-                                                   const int src_stride,
-                                                   uint8_t *dst,
-                                                   const int dst_stride,
-                                                   const int w, const int h) {
+static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src,
+                                              const int src_stride,
+                                              uint8_t *dst,
+                                              const int dst_stride, const int w,
+                                              const int h) {
   const int max_width = (w + 15) & ~15;
   int y = h;
 
@@ -69,7 +71,7 @@
   } while (--y);
 }
 
-static INLINE void scale_plane_bilinear_phase_non_0_kernel(
+static INLINE void scale_plane_bilinear_kernel(
     const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2,
     const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1,
     uint8_t *const dst) {
@@ -95,7 +97,7 @@
   vst1q_u8(dst, d);
 }
 
-static INLINE void scale_plane_2_to_1_bilinear_phase_non_0_neon(
+static INLINE void scale_plane_2_to_1_bilinear(
     const uint8_t *const src, const int src_stride, uint8_t *dst,
     const int dst_stride, const int w, const int h, const int16_t c0,
     const int16_t c1) {
@@ -117,8 +119,8 @@
       // 100 102 104 106 108 10A 10C 10E  110 112 114 116 118 11A 11C 11E
       // 101 103 105 107 109 10B 10D 10F  111 113 115 117 119 11B 11D 11F
       const uint8x16x2_t s1 = vld2q_u8(src1);
-      scale_plane_bilinear_phase_non_0_kernel(s0.val[0], s0.val[1], s1.val[0],
-                                              s1.val[1], coef0, coef1, dst);
+      scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1],
+                                  coef0, coef1, dst);
       src0 += 32;
       src1 += 32;
       dst += 16;
@@ -130,7 +132,7 @@
   } while (--y);
 }
 
-static INLINE void scale_plane_4_to_1_bilinear_phase_non_0_neon(
+static INLINE void scale_plane_4_to_1_bilinear(
     const uint8_t *const src, const int src_stride, uint8_t *dst,
     const int dst_stride, const int w, const int h, const int16_t c0,
     const int16_t c1) {
@@ -157,8 +159,8 @@
       // 102 106 10A 10E 112 116 11A 11E  122 126 12A 12E 132 136 13A 13E (*)
       // 103 107 10B 10F 113 117 11B 11F  123 127 12B 12F 133 137 13B 13F (*)
       const uint8x16x4_t s1 = vld4q_u8(src1);
-      scale_plane_bilinear_phase_non_0_kernel(s0.val[0], s0.val[1], s1.val[0],
-                                              s1.val[1], coef0, coef1, dst);
+      scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1],
+                                  coef0, coef1, dst);
       src0 += 64;
       src1 += 64;
       dst += 16;
@@ -178,12 +180,11 @@
   return vrshrn_n_u16(h1, 7);
 }
 
-static void scale_plane_2_to_1_general_neon(const uint8_t *src,
-                                            const int src_stride, uint8_t *dst,
-                                            const int dst_stride, const int w,
-                                            const int h,
-                                            const int16_t *const coef,
-                                            uint8_t *const temp_buffer) {
+static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
   const int width_hor = (w + 3) & ~3;
   const int width_ver = (w + 7) & ~7;
   const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
@@ -268,10 +269,10 @@
                   &s[12], &s[13]);
       t += 8 * width_hor;
 
-      d[0] = scale_filter_8(&s[0], filters);
-      d[1] = scale_filter_8(&s[2], filters);
-      d[2] = scale_filter_8(&s[4], filters);
-      d[3] = scale_filter_8(&s[6], filters);
+      d[0] = scale_filter_8(&s[0], filters);  // 00 01 02 03 04 05 06 07
+      d[1] = scale_filter_8(&s[2], filters);  // 10 11 12 13 14 15 16 17
+      d[2] = scale_filter_8(&s[4], filters);  // 20 21 22 23 24 25 26 27
+      d[3] = scale_filter_8(&s[6], filters);  // 30 31 32 33 34 35 36 37
       vst1_u8(dst + 0 * dst_stride, d[0]);
       vst1_u8(dst + 1 * dst_stride, d[1]);
       vst1_u8(dst + 2 * dst_stride, d[2]);
@@ -295,12 +296,11 @@
   } while (x);
 }
 
-static void scale_plane_4_to_1_general_neon(const uint8_t *src,
-                                            const int src_stride, uint8_t *dst,
-                                            const int dst_stride, const int w,
-                                            const int h,
-                                            const int16_t *const coef,
-                                            uint8_t *const temp_buffer) {
+static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
   const int width_hor = (w + 1) & ~1;
   const int width_ver = (w + 7) & ~7;
   const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
@@ -379,8 +379,8 @@
                   &s[10], &s[11]);
       t += 8 * width_hor;
 
-      d[0] = scale_filter_8(&s[0], filters);
-      d[1] = scale_filter_8(&s[4], filters);
+      d[0] = scale_filter_8(&s[0], filters);  // 00 01 02 03 04 05 06 07
+      d[1] = scale_filter_8(&s[4], filters);  // 10 11 12 13 14 15 16 17
       vst1_u8(dst + 0 * dst_stride, d[0]);
       vst1_u8(dst + 1 * dst_stride, d[1]);
 
@@ -419,12 +419,11 @@
 // decided by phase_scaler, and are always less than 1 pixel below the last row
 // of the original image.
 
-static void scale_plane_4_to_3_bilinear_neon(const uint8_t *src,
-                                             const int src_stride, uint8_t *dst,
-                                             const int dst_stride, const int w,
-                                             const int h,
-                                             const int phase_scaler,
-                                             uint8_t *const temp_buffer) {
+static void scale_plane_4_to_3_bilinear(const uint8_t *src,
+                                        const int src_stride, uint8_t *dst,
+                                        const int dst_stride, const int w,
+                                        const int h, const int phase_scaler,
+                                        uint8_t *const temp_buffer) {
   static const int step_q4 = 16 * 4 / 3;
   const int width_hor = (w + 5) - ((w + 5) % 6);
   const int stride_hor = width_hor + 2;  // store 2 extra pixels
@@ -561,13 +560,12 @@
   } while (x);
 }
 
-static void scale_plane_4_to_3_general_neon(const uint8_t *src,
-                                            const int src_stride, uint8_t *dst,
-                                            const int dst_stride, const int w,
-                                            const int h,
-                                            const InterpKernel *const coef,
-                                            const int phase_scaler,
-                                            uint8_t *const temp_buffer) {
+static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const InterpKernel *const coef,
+                                       const int phase_scaler,
+                                       uint8_t *const temp_buffer) {
   static const int step_q4 = 16 * 4 / 3;
   const int width_hor = (w + 5) - ((w + 5) % 6);
   const int stride_hor = width_hor + 2;  // store 2 extra pixels
@@ -723,41 +721,35 @@
     // 2 to 1
     scaled = 1;
     if (phase_scaler == 0) {
-      scale_plane_2_to_1_phase_0_neon(src->y_buffer, src->y_stride,
-                                      dst->y_buffer, dst->y_stride, dst_w,
-                                      dst_h);
-      scale_plane_2_to_1_phase_0_neon(src->u_buffer, src->uv_stride,
-                                      dst->u_buffer, dst->uv_stride, dst_uv_w,
-                                      dst_uv_h);
-      scale_plane_2_to_1_phase_0_neon(src->v_buffer, src->uv_stride,
-                                      dst->v_buffer, dst->uv_stride, dst_uv_w,
-                                      dst_uv_h);
+      scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+                                 dst->y_stride, dst_w, dst_h);
+      scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+      scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
     } else if (filter_type == BILINEAR) {
       const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
       const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
-      scale_plane_2_to_1_bilinear_phase_non_0_neon(src->y_buffer, src->y_stride,
-                                                   dst->y_buffer, dst->y_stride,
-                                                   dst_w, dst_h, c0, c1);
-      scale_plane_2_to_1_bilinear_phase_non_0_neon(
-          src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
-          dst_uv_w, dst_uv_h, c0, c1);
-      scale_plane_2_to_1_bilinear_phase_non_0_neon(
-          src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
-          dst_uv_w, dst_uv_h, c0, c1);
+      scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+                                  dst->y_stride, dst_w, dst_h, c0, c1);
+      scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
+      scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
     } else {
       const int buffer_stride = (dst_w + 3) & ~3;
       const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
       uint8_t *const temp_buffer =
           (uint8_t *)malloc(buffer_stride * buffer_height);
       if (temp_buffer) {
-        scale_plane_2_to_1_general_neon(
+        scale_plane_2_to_1_general(
             src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
             dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
-        scale_plane_2_to_1_general_neon(
+        scale_plane_2_to_1_general(
             src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
             dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
             temp_buffer);
-        scale_plane_2_to_1_general_neon(
+        scale_plane_2_to_1_general(
             src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
             dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
             temp_buffer);
@@ -770,41 +762,35 @@
     // 4 to 1
     scaled = 1;
     if (phase_scaler == 0) {
-      scale_plane_4_to_1_phase_0_neon(src->y_buffer, src->y_stride,
-                                      dst->y_buffer, dst->y_stride, dst_w,
-                                      dst_h);
-      scale_plane_4_to_1_phase_0_neon(src->u_buffer, src->uv_stride,
-                                      dst->u_buffer, dst->uv_stride, dst_uv_w,
-                                      dst_uv_h);
-      scale_plane_4_to_1_phase_0_neon(src->v_buffer, src->uv_stride,
-                                      dst->v_buffer, dst->uv_stride, dst_uv_w,
-                                      dst_uv_h);
+      scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+                                 dst->y_stride, dst_w, dst_h);
+      scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+      scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
     } else if (filter_type == BILINEAR) {
       const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
       const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
-      scale_plane_4_to_1_bilinear_phase_non_0_neon(src->y_buffer, src->y_stride,
-                                                   dst->y_buffer, dst->y_stride,
-                                                   dst_w, dst_h, c0, c1);
-      scale_plane_4_to_1_bilinear_phase_non_0_neon(
-          src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
-          dst_uv_w, dst_uv_h, c0, c1);
-      scale_plane_4_to_1_bilinear_phase_non_0_neon(
-          src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
-          dst_uv_w, dst_uv_h, c0, c1);
+      scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+                                  dst->y_stride, dst_w, dst_h, c0, c1);
+      scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
+      scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
     } else {
       const int buffer_stride = (dst_w + 1) & ~1;
       const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
       uint8_t *const temp_buffer =
           (uint8_t *)malloc(buffer_stride * buffer_height);
       if (temp_buffer) {
-        scale_plane_4_to_1_general_neon(
+        scale_plane_4_to_1_general(
             src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
             dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
-        scale_plane_4_to_1_general_neon(
+        scale_plane_4_to_1_general(
             src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
             dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
             temp_buffer);
-        scale_plane_4_to_1_general_neon(
+        scale_plane_4_to_1_general(
             src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
             dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
             temp_buffer);
@@ -822,27 +808,27 @@
     if (temp_buffer) {
       scaled = 1;
       if (filter_type == BILINEAR) {
-        scale_plane_4_to_3_bilinear_neon(src->y_buffer, src->y_stride,
-                                         dst->y_buffer, dst->y_stride, dst_w,
-                                         dst_h, phase_scaler, temp_buffer);
-        scale_plane_4_to_3_bilinear_neon(
-            src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
-            dst_uv_w, dst_uv_h, phase_scaler, temp_buffer);
-        scale_plane_4_to_3_bilinear_neon(
-            src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
-            dst_uv_w, dst_uv_h, phase_scaler, temp_buffer);
+        scale_plane_4_to_3_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+                                    dst->y_stride, dst_w, dst_h, phase_scaler,
+                                    temp_buffer);
+        scale_plane_4_to_3_bilinear(src->u_buffer, src->uv_stride,
+                                    dst->u_buffer, dst->uv_stride, dst_uv_w,
+                                    dst_uv_h, phase_scaler, temp_buffer);
+        scale_plane_4_to_3_bilinear(src->v_buffer, src->uv_stride,
+                                    dst->v_buffer, dst->uv_stride, dst_uv_w,
+                                    dst_uv_h, phase_scaler, temp_buffer);
       } else {
-        scale_plane_4_to_3_general_neon(
+        scale_plane_4_to_3_general(
             src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
             dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer);
-        scale_plane_4_to_3_general_neon(
-            src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
-            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type], phase_scaler,
-            temp_buffer);
-        scale_plane_4_to_3_general_neon(
-            src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
-            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type], phase_scaler,
-            temp_buffer);
+        scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                   dst->uv_stride, dst_uv_w, dst_uv_h,
+                                   vp9_filter_kernels[filter_type],
+                                   phase_scaler, temp_buffer);
+        scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                   dst->uv_stride, dst_uv_w, dst_uv_h,
+                                   vp9_filter_kernels[filter_type],
+                                   phase_scaler, temp_buffer);
       }
       free(temp_buffer);
     }
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index dee17ad..aa298ac 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -3489,7 +3489,7 @@
 static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x,
                                      RD_COST *rd_cost, BLOCK_SIZE bsize,
                                      PICK_MODE_CONTEXT *ctx) {
-  if (bsize < BLOCK_16X16)
+  if (!cpi->sf.nonrd_keyframe && bsize < BLOCK_16X16)
     vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
   else
     vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index e81d03b..8d75684 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -2278,11 +2278,14 @@
     int start_frame = 0;
     int frames_to_buffer = 1;
     int frame = 0;
+    int scene_cut_force_key_frame = 0;
     uint64_t avg_sad_current = 0;
     uint32_t min_thresh = 4000;
     float thresh = 8.0f;
+    uint32_t thresh_key = 140000;
+    if (cpi->oxcf.speed <= 5) thresh_key = 240000;
     if (cpi->oxcf.rc_mode == VPX_VBR) {
-      min_thresh = 70000;
+      min_thresh = 65000;
       thresh = 2.1f;
     }
     if (cpi->oxcf.lag_in_frames > 0) {
@@ -2308,6 +2311,8 @@
         rc->high_source_sad = 1;
       else
         rc->high_source_sad = 0;
+      if (rc->high_source_sad && avg_sad_current > thresh_key)
+        scene_cut_force_key_frame = 1;
       // Update recursive average for current frame.
       if (avg_sad_current > 0)
         rc->avg_source_sad[0] =
@@ -2368,6 +2373,8 @@
             rc->high_source_sad = 1;
           else
             rc->high_source_sad = 0;
+          if (rc->high_source_sad && avg_sad > thresh_key)
+            scene_cut_force_key_frame = 1;
           if (avg_sad > 0 || cpi->oxcf.rc_mode == VPX_CBR)
             rc->avg_source_sad[0] = (3 * rc->avg_source_sad[0] + avg_sad) >> 2;
         } else {
@@ -2399,6 +2406,7 @@
         cpi->ext_refresh_frame_flags_pending == 0) {
       int target;
       cpi->refresh_golden_frame = 1;
+      if (scene_cut_force_key_frame) cm->frame_type = KEY_FRAME;
       rc->source_alt_ref_pending = 0;
       if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf)
         rc->source_alt_ref_pending = 1;
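
Taken together, the rate-control hunks above force a key frame when the one-pass scene-change detector flags a frame as high-motion and its average source SAD also exceeds a much larger threshold: 240000 at speed <= 5, 140000 otherwise (with the VBR noise floor lowered from 70000 to 65000). A condensed sketch of the added decision, assuming high_source_sad has already been computed by the existing SAD comparison that sits outside these hunks:

    #include <stdint.h>

    /* Illustrative condensation of the added key-frame forcing logic. */
    static int scene_cut_forces_key_frame(const int high_source_sad,
                                          const uint64_t avg_sad_current,
                                          const int speed) {
      const uint32_t thresh_key = (speed <= 5) ? 240000 : 140000;
      return high_source_sad && avg_sad_current > thresh_key;
    }
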
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 17b5f2b..e5499d6 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -370,6 +370,7 @@
   sf->use_simple_block_yrd = 0;
   sf->adapt_partition_source_sad = 0;
   sf->use_altref_onepass = 0;
+  sf->nonrd_keyframe = 0;
 
   if (speed >= 1) {
     sf->allow_txfm_domain_distortion = 1;
@@ -602,6 +603,7 @@
   if (speed >= 8) {
     sf->adaptive_rd_thresh = 4;
     sf->skip_encode_sb = 1;
+    sf->nonrd_keyframe = 1;
     if (!cpi->use_svc) cpi->max_copied_frame = 4;
     if (cpi->row_mt && cpi->oxcf.max_threads > 1)
       sf->adaptive_rd_thresh_row_mt = 1;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 517369d..9e5bf9a 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -499,6 +499,9 @@
 
   // Enable use of alt-refs in 1 pass VBR.
   int use_altref_onepass;
+
+  // Always use nonrd_pick_intra for all block sizes on keyframes.
+  int nonrd_keyframe;
 } SPEED_FEATURES;
 
 struct VP9_COMP;
diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index 0d7bd1d..81e5b42 100644
--- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -15,25 +15,31 @@
 #include "./vpx_scale_rtcd.h"
 #include "vpx_dsp/x86/convolve_ssse3.h"
 #include "vpx_dsp/x86/mem_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
 #include "vpx_scale/yv12config.h"
 
+static INLINE __m128i scale_plane_2_to_1_phase_0_kernel(
+    const uint8_t *const src, const __m128i *const mask) {
+  const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0]));
+  const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16]));
+  const __m128i a_and = _mm_and_si128(a, *mask);
+  const __m128i b_and = _mm_and_si128(b, *mask);
+  return _mm_packus_epi16(a_and, b_and);
+}
+
 static void scale_plane_2_to_1_phase_0(const uint8_t *src,
                                        const ptrdiff_t src_stride, uint8_t *dst,
                                        const ptrdiff_t dst_stride,
                                        const int dst_w, const int dst_h) {
-  const __m128i mask = _mm_set1_epi16(0x00FF);
   const int max_width = (dst_w + 15) & ~15;
+  const __m128i mask = _mm_set1_epi16(0x00FF);
   int y = dst_h;
 
   do {
     int x = max_width;
     do {
-      const __m128i a = _mm_loadu_si128((const __m128i *)(src + 0));
-      const __m128i b = _mm_loadu_si128((const __m128i *)(src + 16));
-      const __m128i a_and = _mm_and_si128(a, mask);
-      const __m128i b_and = _mm_and_si128(b, mask);
-      const __m128i c = _mm_packus_epi16(a_and, b_and);
-      _mm_storeu_si128((__m128i *)dst, c);
+      const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask);
+      _mm_storeu_si128((__m128i *)dst, d);
       src += 32;
       dst += 16;
       x -= 16;
@@ -43,6 +49,395 @@
   } while (--y);
 }
 
+static void scale_plane_4_to_1_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int dst_w, const int dst_h) {
+  const int max_width = (dst_w + 15) & ~15;
+  const __m128i mask = _mm_set1_epi32(0x000000FF);
+  int y = dst_h;
+
+  do {
+    int x = max_width;
+    do {
+      const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask);
+      const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask);
+      const __m128i d2 = _mm_packus_epi16(d0, d1);
+      _mm_storeu_si128((__m128i *)dst, d2);
+      src += 64;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 4 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s,
+                                                  const __m128i c0c1) {
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1);
+  const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1);
+  // add the rounding offset and shift right by 7 bits in each 16-bit lane
+  const __m128i t2 = _mm_adds_epi16(t0, k_64);
+  const __m128i t3 = _mm_adds_epi16(t1, k_64);
+  const __m128i t4 = _mm_srai_epi16(t2, 7);
+  const __m128i t5 = _mm_srai_epi16(t3, 7);
+  return _mm_packus_epi16(t4, t5);
+}
+
+static void scale_plane_2_to_1_bilinear(const uint8_t *src,
+                                        const ptrdiff_t src_stride,
+                                        uint8_t *dst,
+                                        const ptrdiff_t dst_stride,
+                                        const int dst_w, const int dst_h,
+                                        const __m128i c0c1) {
+  const int max_width = (dst_w + 15) & ~15;
+  int y = dst_h;
+
+  do {
+    int x = max_width;
+    do {
+      __m128i s[2], d[2];
+
+      // Horizontal
+      // Even rows
+      s[0] = _mm_loadu_si128((const __m128i *)(src + 0));
+      s[1] = _mm_loadu_si128((const __m128i *)(src + 16));
+      d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+      // Odd rows
+      s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+      s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+      d[1] = scale_plane_bilinear_kernel(s, c0c1);
+
+      // Vertical
+      s[0] = _mm_unpacklo_epi8(d[0], d[1]);
+      s[1] = _mm_unpackhi_epi8(d[0], d[1]);
+      d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+      _mm_storeu_si128((__m128i *)dst, d[0]);
+      src += 32;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 2 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static void scale_plane_4_to_1_bilinear(const uint8_t *src,
+                                        const ptrdiff_t src_stride,
+                                        uint8_t *dst,
+                                        const ptrdiff_t dst_stride,
+                                        const int dst_w, const int dst_h,
+                                        const __m128i c0c1) {
+  const int max_width = (dst_w + 15) & ~15;
+  int y = dst_h;
+
+  do {
+    int x = max_width;
+    do {
+      __m128i s[8], d[8];
+
+      // Note: Using _mm_packus_epi32() in SSE4.1 could be faster.
+      //       Shuffle instructions are avoided here because they are slow on
+      //       some x86 CPUs.
+
+      // Horizontal
+      // 000 001 xx xx 004 005 xx xx  008 009 xx xx 00C 00D xx xx
+      // 010 011 xx xx 014 015 xx xx  018 019 xx xx 01C 01D xx xx
+      // 020 021 xx xx 024 025 xx xx  028 029 xx xx 02C 02D xx xx
+      // 030 031 xx xx 034 035 xx xx  038 039 xx xx 03C 03D xx xx
+      // 100 101 xx xx 104 105 xx xx  108 109 xx xx 10C 10D xx xx
+      // 110 111 xx xx 114 115 xx xx  118 119 xx xx 11C 11D xx xx
+      // 120 121 xx xx 124 125 xx xx  128 129 xx xx 12C 12D xx xx
+      // 130 131 xx xx 134 135 xx xx  138 139 xx xx 13C 13D xx xx
+      s[0] = _mm_loadu_si128((const __m128i *)(&src[0]));
+      s[1] = _mm_loadu_si128((const __m128i *)(&src[16]));
+      s[2] = _mm_loadu_si128((const __m128i *)(&src[32]));
+      s[3] = _mm_loadu_si128((const __m128i *)(&src[48]));
+      s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+      s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+      s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32));
+      s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48));
+
+      // 000 001 100 101 xx xx xx xx  004 005 104 105 xx xx xx xx
+      // 008 009 108 109 xx xx xx xx  00C 00D 10C 10D xx xx xx xx
+      // 010 011 110 111 xx xx xx xx  014 015 114 115 xx xx xx xx
+      // 018 019 118 119 xx xx xx xx  01C 01D 11C 11D xx xx xx xx
+      // 020 021 120 121 xx xx xx xx  024 025 124 125 xx xx xx xx
+      // 028 029 128 129 xx xx xx xx  02C 02D 12C 12D xx xx xx xx
+      // 030 031 130 131 xx xx xx xx  034 035 134 135 xx xx xx xx
+      // 038 039 138 139 xx xx xx xx  03C 03D 13C 13D xx xx xx xx
+      d[0] = _mm_unpacklo_epi16(s[0], s[4]);
+      d[1] = _mm_unpackhi_epi16(s[0], s[4]);
+      d[2] = _mm_unpacklo_epi16(s[1], s[5]);
+      d[3] = _mm_unpackhi_epi16(s[1], s[5]);
+      d[4] = _mm_unpacklo_epi16(s[2], s[6]);
+      d[5] = _mm_unpackhi_epi16(s[2], s[6]);
+      d[6] = _mm_unpacklo_epi16(s[3], s[7]);
+      d[7] = _mm_unpackhi_epi16(s[3], s[7]);
+
+      // 000 001 100 101 008 009 108 109  xx xx xx xx xx xx xx xx
+      // 004 005 104 105 00C 00D 10C 10D  xx xx xx xx xx xx xx xx
+      // 010 011 110 111 018 019 118 119  xx xx xx xx xx xx xx xx
+      // 014 015 114 115 01C 01D 11C 11D  xx xx xx xx xx xx xx xx
+      // 020 021 120 121 028 029 128 129  xx xx xx xx xx xx xx xx
+      // 024 025 124 125 02C 02D 12C 12D  xx xx xx xx xx xx xx xx
+      // 030 031 130 131 038 039 138 139  xx xx xx xx xx xx xx xx
+      // 034 035 134 135 03C 03D 13C 13D  xx xx xx xx xx xx xx xx
+      s[0] = _mm_unpacklo_epi32(d[0], d[1]);
+      s[1] = _mm_unpackhi_epi32(d[0], d[1]);
+      s[2] = _mm_unpacklo_epi32(d[2], d[3]);
+      s[3] = _mm_unpackhi_epi32(d[2], d[3]);
+      s[4] = _mm_unpacklo_epi32(d[4], d[5]);
+      s[5] = _mm_unpackhi_epi32(d[4], d[5]);
+      s[6] = _mm_unpacklo_epi32(d[6], d[7]);
+      s[7] = _mm_unpackhi_epi32(d[6], d[7]);
+
+      // 000 001 100 101 004 005 104 105  008 009 108 109 00C 00D 10C 10D
+      // 010 011 110 111 014 015 114 115  018 019 118 119 01C 01D 11C 11D
+      // 020 021 120 121 024 025 124 125  028 029 128 129 02C 02D 12C 12D
+      // 030 031 130 131 034 035 134 135  038 039 138 139 03C 03D 13C 13D
+      d[0] = _mm_unpacklo_epi32(s[0], s[1]);
+      d[1] = _mm_unpacklo_epi32(s[2], s[3]);
+      d[2] = _mm_unpacklo_epi32(s[4], s[5]);
+      d[3] = _mm_unpacklo_epi32(s[6], s[7]);
+
+      d[0] = scale_plane_bilinear_kernel(&d[0], c0c1);
+      d[1] = scale_plane_bilinear_kernel(&d[2], c0c1);
+
+      // Vertical
+      d[0] = scale_plane_bilinear_kernel(d, c0c1);
+
+      _mm_storeu_si128((__m128i *)dst, d[0]);
+      src += 64;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 4 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int width_hor = (w + 3) & ~3;
+  const int width_ver = (w + 7) & ~7;
+  const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+  const int height_ver = (h + 3) & ~3;
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  __m128i s[11], d[4];
+  __m128i f[4];
+
+  assert(w && h);
+
+  shuffle_filter_ssse3(coef, f);
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;
+
+  // horizontal 4x8
+  do {
+    load_8bit_8x8(src + 2, src_stride, s);
+    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
+    transpose_16bit_4x8(s, s);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_8bit_8x8(src, src_stride, &s[3]);
+      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
+      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
+      // 0C 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
+      transpose_16bit_4x8(&s[3], &s[3]);
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
+      d[1] = convolve8_8_ssse3(&s[1], f);  // 01 11 21 31 41 51 61 71
+      d[2] = convolve8_8_ssse3(&s[2], f);  // 02 12 22 32 42 52 62 72
+      d[3] = convolve8_8_ssse3(&s[3], f);  // 03 13 23 33 43 53 63 73
+
+      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
+      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
+      d[0] = _mm_packus_epi16(d[0], d[2]);
+      d[1] = _mm_packus_epi16(d[1], d[3]);
+      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
+      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
+      d[2] = _mm_unpacklo_epi16(d[0], d[1]);
+      d[3] = _mm_unpackhi_epi16(d[0], d[1]);
+      // 00 10 01 11 02 12 03 13  20 30 21 31 22 32 23 33
+      // 40 50 41 51 42 52 43 53  60 70 61 71 62 72 63 73
+      d[0] = _mm_unpacklo_epi32(d[2], d[3]);
+      d[1] = _mm_unpackhi_epi32(d[2], d[3]);
+      store_8bit_8x4_from_16x2(d, t, 2 * width_hor);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+
+      t += 8;
+      x -= 4;
+    } while (x);
+    src += 8 * src_stride - 2 * width_hor;
+    t += 6 * width_hor;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x4
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+    s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor));
+    t += 6 * width_hor;
+    y = height_ver;
+
+    do {
+      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
+      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
+      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 D7
+      loadu_8bit_16x4(t, 2 * width_hor, &s[3]);
+      t += 8 * width_hor;
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
+      d[1] = convolve8_8_ssse3(&s[1], f);  // 10 11 12 13 14 15 16 17
+      d[2] = convolve8_8_ssse3(&s[2], f);  // 20 21 22 23 24 25 26 27
+      d[3] = convolve8_8_ssse3(&s[3], f);  // 30 31 32 33 34 35 36 37
+
+      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
+      d[0] = _mm_packus_epi16(d[0], d[1]);
+      d[1] = _mm_packus_epi16(d[2], d[3]);
+      store_8bit_8x4_from_16x2(d, dst, dst_stride);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+
+      dst += 4 * dst_stride;
+      y -= 4;
+    } while (y);
+    t -= width_hor * (2 * height_ver + 6);
+    t += 16;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int width_hor = (w + 1) & ~1;
+  const int width_ver = (w + 7) & ~7;
+  const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+  const int height_ver = (h + 1) & ~1;
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  __m128i s[11], d[4];
+  __m128i f[4];
+
+  assert(w && h);
+
+  shuffle_filter_ssse3(coef, f);
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;
+
+  // horizontal 2x8
+  do {
+    load_8bit_8x8(src + 4, src_stride, s);
+    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75 (overlapped)
+    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
+    transpose_16bit_4x8(s, s);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_8bit_8x8(src, src_stride, &s[2]);
+      // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
+      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
+      transpose_16bit_4x8(&s[2], &s[2]);
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
+      d[1] = convolve8_8_ssse3(&s[2], f);  // 01 11 21 31 41 51 61 71
+
+      // 00 10 20 30 40 50 60 70  xx xx xx xx xx xx xx xx
+      // 01 11 21 31 41 51 61 71  xx xx xx xx xx xx xx xx
+      d[0] = _mm_packus_epi16(d[0], d[0]);
+      d[1] = _mm_packus_epi16(d[1], d[1]);
+      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
+      d[0] = _mm_unpacklo_epi16(d[0], d[1]);
+      store_8bit_4x4_sse2(d[0], t, 2 * width_hor);
+
+      s[0] = s[4];
+      s[1] = s[5];
+
+      t += 4;
+      x -= 2;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor;
+    t += 6 * width_hor;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x2
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+    t += 4 * width_hor;
+    y = height_ver;
+
+    do {
+      // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
+      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
+      loadu_8bit_16x4(t, 2 * width_hor, &s[2]);
+      t += 8 * width_hor;
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
+      d[1] = convolve8_8_ssse3(&s[2], f);  // 10 11 12 13 14 15 16 17
+
+      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+      d[0] = _mm_packus_epi16(d[0], d[1]);
+      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+
+      s[0] = s[4];
+      s[1] = s[5];
+
+      dst += 2 * dst_stride;
+      y -= 2;
+    } while (y);
+    t -= width_hor * (4 * height_ver + 4);
+    t += 16;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
 static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s,
                                                   const __m128i *const f) {
   __m128i ss[4], temp;
@@ -163,19 +558,100 @@
   const int src_h = src->y_crop_height;
   const int dst_w = dst->y_crop_width;
   const int dst_h = dst->y_crop_height;
+  const int dst_uv_w = dst_w / 2;
+  const int dst_uv_h = dst_h / 2;
   int scaled = 0;
 
-  if (dst_w * 2 == src_w && dst_h * 2 == src_h && phase_scaler == 0) {
+  // phase_scaler is usually 0 or 8.
+  assert(phase_scaler >= 0 && phase_scaler < 16);
+
+  if (dst_w * 2 == src_w && dst_h * 2 == src_h) {
     // 2 to 1
-    const int dst_uv_w = dst_w / 2;
-    const int dst_uv_h = dst_h / 2;
     scaled = 1;
-    scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
-                               dst->y_stride, dst_w, dst_h);
-    scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
-                               dst->uv_stride, dst_uv_w, dst_uv_h);
-    scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
-                               dst->uv_stride, dst_uv_w, dst_uv_h);
+
+    if (phase_scaler == 0) {
+      scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+                                 dst->y_stride, dst_w, dst_h);
+      scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+      scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+    } else if (filter_type == BILINEAR) {
+      const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
+      const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
+      const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8));  // c0 and c1 >= 0
+      scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+                                  dst->y_stride, dst_w, dst_h, c0c1);
+      scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+      scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+    } else {
+      const int buffer_stride = (dst_w + 3) & ~3;
+      const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+      uint8_t *const temp_buffer =
+          (uint8_t *)malloc(buffer_stride * buffer_height);
+      if (temp_buffer) {
+        scale_plane_2_to_1_general(
+            src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+            dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
+        scale_plane_2_to_1_general(
+            src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+            temp_buffer);
+        scale_plane_2_to_1_general(
+            src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+            temp_buffer);
+        free(temp_buffer);
+      } else {
+        scaled = 0;
+      }
+    }
+  } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
+    // 4 to 1
+    scaled = 1;
+    if (phase_scaler == 0) {
+      scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+                                 dst->y_stride, dst_w, dst_h);
+      scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+      scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+    } else if (filter_type == BILINEAR) {
+      const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
+      const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
+      const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8));  // c0 and c1 >= 0
+      scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+                                  dst->y_stride, dst_w, dst_h, c0c1);
+      scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+      scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+    } else {
+      const int buffer_stride = (dst_w + 1) & ~1;
+      const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+      // When dst_w is 1 or 2, we need extra padding to avoid a heap read overflow.
+      const int extra_padding = 16;
+      uint8_t *const temp_buffer =
+          (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding);
+      if (temp_buffer) {
+        scale_plane_4_to_1_general(
+            src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+            dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
+        scale_plane_4_to_1_general(
+            src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+            temp_buffer);
+        scale_plane_4_to_1_general(
+            src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+            temp_buffer);
+        free(temp_buffer);
+      } else {
+        scaled = 0;
+      }
+    }
   } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) {
     // 1 to 2
     uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7));
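
The bilinear paths above pack the two taps as c0 | (c1 << 8) because _mm_maddubs_epi16 multiplies each unsigned source byte by the corresponding signed coefficient byte and sums adjacent pairs; with the source pixels interleaved into (p0, p1) pairs this yields c0*p0 + c1*p1 in every 16-bit lane. The taps come from vp9_filter_kernels[BILINEAR][phase_scaler][3..4] and sum to 128, so adding 64 and shifting right by 7 rounds the result back to the 8-bit range. A scalar sketch of one output pixel (the helper name is illustrative):

    #include <stdint.h>

    /* Two-tap bilinear kernel: c0 + c1 == 128, so (+64, >> 7) is
     * round-to-nearest back into 0..255. */
    static uint8_t bilinear_two_tap_ref(const uint8_t p0, const uint8_t p1,
                                        const int c0, const int c1) {
      int v = (c0 * p0 + c1 * p1 + 64) >> 7;
      if (v > 255) v = 255; /* mirrors the saturation of _mm_packus_epi16 */
      return (uint8_t)v;
    }
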
diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c
index ebd3d7f..af7c529 100644
--- a/vpx/src/vpx_image.c
+++ b/vpx/src/vpx_image.c
@@ -112,10 +112,10 @@
   if (!img_data) {
     uint64_t alloc_size;
     /* Calculate storage sizes given the chroma subsampling */
-    align = xcs ? (1 << xcs) - 1 : 1;
-    w = (d_w + align - 1) & ~(align - 1);
-    align = ycs ? (1 << ycs) - 1 : 1;
-    h = (d_h + align - 1) & ~(align - 1);
+    align = (1 << xcs) - 1;
+    w = (d_w + align) & ~align;
+    align = (1 << ycs) - 1;
+    h = (d_h + align) & ~align;
 
     s = (fmt & VPX_IMG_FMT_PLANAR) ? w : bps * w / 8;
     s = (s + stride_align - 1) & ~(stride_align - 1);
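
The old expression above effectively cancelled itself out for the common 4:2:0 case: with xcs == 1, align became 1 and the mask ~(align - 1) cleared nothing, so odd widths and heights were never padded. The replacement is the standard round-up-to-a-power-of-two idiom, which also degenerates to a no-op when the subsampling shift is 0, making the old xcs ? ... : 1 special case unnecessary. As a sketch (the helper name is illustrative):

    /* Round x up to a multiple of (1 << cs). */
    static unsigned int align_up_pow2(const unsigned int x, const int cs) {
      const unsigned int mask = (1u << cs) - 1;
      return (x + mask) & ~mask;
    }

    /* e.g. a 17x11 4:2:0 image (xcs == ycs == 1) is padded to 18x12 so both
     * chroma planes get integer dimensions. */
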
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 5cb17e1..474f505 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -364,13 +364,13 @@
 specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx/;
 
 add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx/;
 
 add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/;
 
 add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx/;
 
 add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/vpx_scaled_2d ssse3 neon/;
diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h
index 24e298d..f9f0a48 100644
--- a/vpx_dsp/x86/mem_sse2.h
+++ b/vpx_dsp/x86/mem_sse2.h
@@ -55,16 +55,22 @@
   d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride));
 }
 
-static INLINE void loadu_8bit_16x8(const uint8_t *const s,
+static INLINE void loadu_8bit_16x4(const uint8_t *const s,
                                    const ptrdiff_t stride, __m128i *const d) {
   d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride));
   d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride));
   d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride));
   d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride));
-  d[4] = _mm_loadu_si128((const __m128i *)(s + 4 * stride));
-  d[5] = _mm_loadu_si128((const __m128i *)(s + 5 * stride));
-  d[6] = _mm_loadu_si128((const __m128i *)(s + 6 * stride));
-  d[7] = _mm_loadu_si128((const __m128i *)(s + 7 * stride));
+}
+
+static INLINE void loadu_8bit_16x8(const uint8_t *const s,
+                                   const ptrdiff_t stride, __m128i *const d) {
+  loadu_8bit_16x4(s + 0 * stride, stride, &d[0]);
+  loadu_8bit_16x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
+  _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
 }
 
 static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
@@ -75,6 +81,26 @@
   *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]);
 }
 
+static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
+                                       const ptrdiff_t stride) {
+  __m128i ss[4];
+
+  ss[0] = s;
+  ss[1] = _mm_srli_si128(s, 4);
+  ss[2] = _mm_srli_si128(s, 8);
+  ss[3] = _mm_srli_si128(s, 12);
+  store_8bit_4x4(ss, d, stride);
+}
+
+static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s,
+                                            uint8_t *const d,
+                                            const ptrdiff_t stride) {
+  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+  _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]);
+  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]);
+  _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]);
+}
+
 static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
                                   const ptrdiff_t stride) {
   _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
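
Note that _mm_storeh_epi64 is not an SSE2 intrinsic; it is the small helper defined above, which stores the upper 8 bytes of a register via _mm_storeh_pi the same way _mm_storel_epi64 stores the lower 8. That pairing is what lets store_8bit_8x4_from_16x2 write an 8x4 block from two registers that each hold two packed rows. A minimal usage sketch (the function name, dst, and stride are assumptions):

    /* Write one register holding two packed 8-byte rows to two consecutive
     * destination rows. */
    static void store_two_8byte_rows(const __m128i two_rows,
                                     uint8_t *const dst,
                                     const ptrdiff_t stride) {
      _mm_storel_epi64((__m128i *)(dst + 0 * stride), two_rows); /* bytes 0-7  */
      _mm_storeh_epi64((__m128i *)(dst + 1 * stride), two_rows); /* bytes 8-15 */
    }
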
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 6eafe9a..4e851b5 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -59,10 +59,11 @@
 #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
 #endif  // __clang__
 
-static void vpx_filter_block1d16_h8_avx2(
+static INLINE void vpx_filter_block1d16_h8_X_avx2(
     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter,
+    const int avg) {
+  __m128i filtersReg, outReg1, outReg2;
   __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
   __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
   __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
@@ -185,13 +186,21 @@
 
     src_ptr += src_stride;
 
+    // average if necessary
+    outReg1 = _mm256_castsi256_si128(srcRegFilt32b1_1);
+    outReg2 = _mm256_extractf128_si256(srcRegFilt32b1_1, 1);
+    if (avg) {
+      outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+      outReg2 = _mm_avg_epu8(
+          outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch)));
+    }
+
     // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr,
-                    _mm256_castsi256_si128(srcRegFilt32b1_1));
+    _mm_store_si128((__m128i *)output_ptr, outReg1);
 
     // save the next 16 bytes
-    _mm_store_si128((__m128i *)(output_ptr + output_pitch),
-                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+    _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2);
+
     output_ptr += dst_stride;
   }
 
@@ -280,17 +289,37 @@
     // shrink each 16-bit value to 8 bits; the first lane contains the first
     // convolve result and the second lane contains the second convolve
     // result
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+    outReg1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+    // average if necessary
+    if (avg) {
+      outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+    }
 
     // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
+    _mm_store_si128((__m128i *)output_ptr, outReg1);
   }
 }
 
-static void vpx_filter_block1d16_v8_avx2(
+static void vpx_filter_block1d16_h8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr,
+    ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) {
+  vpx_filter_block1d16_h8_X_avx2(src_ptr, src_stride, output_ptr, dst_stride,
+                                 output_height, filter, 0);
+}
+
+static void vpx_filter_block1d16_h8_avg_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr,
+    ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) {
+  vpx_filter_block1d16_h8_X_avx2(src_ptr, src_stride, output_ptr, dst_stride,
+                                 output_height, filter, 1);
+}
+
+static INLINE void vpx_filter_block1d16_v8_X_avx2(
     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter,
+    const int avg) {
+  __m128i filtersReg, outReg1, outReg2;
   __m256i addFilterReg64;
   __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
   __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
@@ -435,12 +464,20 @@
 
     src_ptr += src_stride;
 
+    // average if necessary
+    outReg1 = _mm256_castsi256_si128(srcReg32b1);
+    outReg2 = _mm256_extractf128_si256(srcReg32b1, 1);
+    if (avg) {
+      outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+      outReg2 = _mm_avg_epu8(
+          outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch)));
+    }
+
     // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1));
+    _mm_store_si128((__m128i *)output_ptr, outReg1);
 
     // save the next 16 bytes
-    _mm_store_si128((__m128i *)(output_ptr + out_pitch),
-                    _mm256_extractf128_si256(srcReg32b1, 1));
+    _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2);
 
     output_ptr += dst_stride;
 
@@ -515,13 +552,33 @@
     // shrink each 16-bit value to 8 bits; the first lane contains the first
     // convolve result and the second lane contains the second convolve
     // result
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+    outReg1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+
+    // average if necessary
+    if (avg) {
+      outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+    }
 
     // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
+    _mm_store_si128((__m128i *)output_ptr, outReg1);
   }
 }
 
+static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *filter) {
+  vpx_filter_block1d16_v8_X_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+                                 height, filter, 0);
+}
+
+static void vpx_filter_block1d16_v8_avg_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *filter) {
+  vpx_filter_block1d16_v8_X_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+                                 height, filter, 1);
+}
+
 #if HAVE_AVX2 && HAVE_SSSE3
 filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
 #if ARCH_X86_64
@@ -539,6 +596,14 @@
 #define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
 #endif  // ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
+#define vpx_filter_block1d8_v8_avg_avx2 vpx_filter_block1d8_v8_avg_ssse3
+#define vpx_filter_block1d8_h8_avg_avx2 vpx_filter_block1d8_h8_avg_ssse3
+#define vpx_filter_block1d4_v8_avg_avx2 vpx_filter_block1d4_v8_avg_ssse3
+#define vpx_filter_block1d4_h8_avg_avx2 vpx_filter_block1d4_h8_avg_ssse3
 filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
 filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
 filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
@@ -552,6 +617,18 @@
 #define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
 #define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
 #define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
+filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
+#define vpx_filter_block1d16_v2_avg_avx2 vpx_filter_block1d16_v2_avg_ssse3
+#define vpx_filter_block1d16_h2_avg_avx2 vpx_filter_block1d16_h2_avg_ssse3
+#define vpx_filter_block1d8_v2_avg_avx2 vpx_filter_block1d8_v2_avg_ssse3
+#define vpx_filter_block1d8_h2_avg_avx2 vpx_filter_block1d8_h2_avg_ssse3
+#define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3
+#define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3
 // void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
 //                                uint8_t *dst, ptrdiff_t dst_stride,
 //                                const InterpKernel *filter, int x0_q4,
@@ -562,13 +639,31 @@
 //                               const InterpKernel *filter, int x0_q4,
 //                               int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                               int w, int h);
+// void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                                    uint8_t *dst, ptrdiff_t dst_stride,
+//                                    const InterpKernel *filter, int x0_q4,
+//                                    int32_t x_step_q4, int y0_q4,
+//                                    int y_step_q4, int w, int h);
+// void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                                   uint8_t *dst, ptrdiff_t dst_stride,
+//                                   const InterpKernel *filter, int x0_q4,
+//                                   int32_t x_step_q4, int y0_q4,
+//                                   int y_step_q4, int w, int h);
 FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
 FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2);
 
 // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
 //                          uint8_t *dst, ptrdiff_t dst_stride,
 //                          const InterpKernel *filter, int x0_q4,
 //                          int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                          int w, int h);
+// void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                              uint8_t *dst, ptrdiff_t dst_stride,
+//                              const InterpKernel *filter, int x0_q4,
+//                              int32_t x_step_q4, int y0_q4, int y_step_q4,
+//                              int w, int h);
 FUN_CONV_2D(, avx2);
+FUN_CONV_2D(avg_, avx2);
 #endif  // HAVE_AVX2 && HAVE_SSSE3
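
All of the new _avg_avx2 paths differ from the existing AVX2 convolves only in the final store: the filtered result is rounded-averaged with the pixel already present in the destination (_mm_avg_epu8), which is how the averaging/compound prediction variants blend onto an existing prediction. A scalar sketch of that last step (illustrative helper, not part of the patch):

    #include <stdint.h>

    /* Scalar equivalent of the _mm_avg_epu8 step: average the convolve output
     * with the existing destination pixel, rounding up on ties. */
    static uint8_t avg_with_dst(const uint8_t convolved,
                                const uint8_t dst_pixel) {
      return (uint8_t)((convolved + dst_pixel + 1) >> 1);
    }
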