Merge changes Icec98e6f,I63614e65,I25ea05f4

* changes:
  Add full_pixel_exhaustive_new
  Add sse cost in vp9_full_pixel_diamond_new
  Use motion field for mv inconsistency in mv search
diff --git a/test/blockiness_test.cc b/test/blockiness_test.cc
index 38b4b58..128b503 100644
--- a/test/blockiness_test.cc
+++ b/test/blockiness_test.cc
@@ -26,10 +26,7 @@
 #include "test/util.h"
 
 #include "vpx_mem/vpx_mem.h"
-
-extern "C" double vp9_get_blockiness(const unsigned char *img1, int img1_pitch,
-                                     const unsigned char *img2, int img2_pitch,
-                                     int width, int height);
+#include "vp9/encoder/vp9_blockiness.h"
 
 using libvpx_test::ACMRandom;
 
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index 39c5e79..89b1cd8 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -130,6 +130,12 @@
   RunTest(left_col, above_data, dst, ref_dst);
 }
 
+// Instantiate a token test to avoid -Wuninitialized warnings when none of the
+// other tests are enabled.
+INSTANTIATE_TEST_CASE_P(
+    C, VP9IntraPredTest,
+    ::testing::Values(IntraPredParam(&vpx_d45_predictor_4x4_c,
+                                     &vpx_d45_predictor_4x4_c, 4, 8)));
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, VP9IntraPredTest,
diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c
index e12f65a..48a1972 100644
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -8,28 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+#include "vp8/common/arm/loopfilter_arm.h"
 #include "vp8/common/loopfilter.h"
 #include "vp8/common/onyxc_int.h"
 
-typedef void loopfilter_y_neon(unsigned char *src, int pitch,
-                               unsigned char blimit, unsigned char limit,
-                               unsigned char thresh);
-typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
-                                unsigned char blimit, unsigned char limit,
-                                unsigned char thresh, unsigned char *v);
-
-extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
-extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
-extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
-extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
-
-extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
-extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
-extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
-extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
-
 /* NEON loopfilter functions */
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr,
diff --git a/vp8/common/arm/loopfilter_arm.h b/vp8/common/arm/loopfilter_arm.h
new file mode 100644
index 0000000..6cf660d
--- /dev/null
+++ b/vp8/common/arm/loopfilter_arm.h
@@ -0,0 +1,31 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_
+#define VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_
+
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+                               unsigned char blimit, unsigned char limit,
+                               unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+                                unsigned char blimit, unsigned char limit,
+                                unsigned char thresh, unsigned char *v);
+
+loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
+loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
+loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
+loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+
+loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
+loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
+loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
+
+#endif  // VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_
diff --git a/vp8/common/arm/neon/bilinearpredict_neon.c b/vp8/common/arm/neon/bilinearpredict_neon.c
index 8520ab5..590956d 100644
--- a/vp8/common/arm/neon/bilinearpredict_neon.c
+++ b/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -10,7 +10,9 @@
 
 #include <arm_neon.h>
 #include <string.h>
+
 #include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 #include "vpx_dsp/arm/mem_neon.h"
 
 static const uint8_t bifilter4_coeff[8][2] = { { 128, 0 }, { 112, 16 },
diff --git a/vp8/common/arm/neon/copymem_neon.c b/vp8/common/arm/neon/copymem_neon.c
index c1d293b..c89b47d 100644
--- a/vp8/common/arm/neon/copymem_neon.c
+++ b/vp8/common/arm/neon/copymem_neon.c
@@ -10,6 +10,8 @@
 
 #include <arm_neon.h>
 
+#include "./vp8_rtcd.h"
+
 void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride,
                           unsigned char *dst, int dst_stride) {
   uint8x8_t vtmp;
diff --git a/vp8/common/arm/neon/dequantizeb_neon.c b/vp8/common/arm/neon/dequantizeb_neon.c
index 6edff3c..791aaea 100644
--- a/vp8/common/arm/neon/dequantizeb_neon.c
+++ b/vp8/common/arm/neon/dequantizeb_neon.c
@@ -10,6 +10,7 @@
 
 #include <arm_neon.h>
 
+#include "./vp8_rtcd.h"
 #include "vp8/common/blockd.h"
 
 void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
diff --git a/vp8/common/arm/neon/idct_blk_neon.c b/vp8/common/arm/neon/idct_blk_neon.c
index 3d02e13..5c26ce6 100644
--- a/vp8/common/arm/neon/idct_blk_neon.c
+++ b/vp8/common/arm/neon/idct_blk_neon.c
@@ -8,15 +8,226 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include <arm_neon.h>
 
-/* place these declarations here because we don't want to maintain them
- * outside of this scope
- */
-void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *dst,
-                               int stride);
-void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *dst, int stride);
+#include "./vp8_rtcd.h"
+
+static void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst,
+                                   int stride) {
+  unsigned char *dst0;
+  int i, a0, a1;
+  int16x8x2_t q2Add;
+  int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0);
+  uint8x8_t d2u8, d4u8;
+  uint16x8_t q1u16, q2u16;
+
+  a0 = ((q[0] * dq) + 4) >> 3;
+  a1 = ((q[16] * dq) + 4) >> 3;
+  q[0] = q[16] = 0;
+  q2Add.val[0] = vdupq_n_s16((int16_t)a0);
+  q2Add.val[1] = vdupq_n_s16((int16_t)a1);
+
+  for (i = 0; i < 2; i++, dst += 4) {
+    dst0 = dst;
+    d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
+    dst0 += stride;
+    d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
+    dst0 += stride;
+    d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
+    dst0 += stride;
+    d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
+
+    q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+                     vreinterpret_u8_s32(d2s32));
+    q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+                     vreinterpret_u8_s32(d4s32));
+
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+    d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+
+    d2s32 = vreinterpret_s32_u8(d2u8);
+    d4s32 = vreinterpret_s32_u8(d4u8);
+
+    dst0 = dst;
+    vst1_lane_s32((int32_t *)dst0, d2s32, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d2s32, 1);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d4s32, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d4s32, 1);
+  }
+}
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2 = 17734;
+// because the lowest bit in 0x8a8c is 0, we can pre-shift this
+
+static void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq,
+                                      unsigned char *dst, int stride) {
+  unsigned char *dst0, *dst1;
+  int32x2_t d28, d29, d30, d31;
+  int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+  int16x8_t qEmpty = vdupq_n_s16(0);
+  int32x4x2_t q2tmp0, q2tmp1;
+  int16x8x2_t q2tmp2, q2tmp3;
+  int16x4_t dLow0, dLow1, dHigh0, dHigh1;
+
+  d28 = d29 = d30 = d31 = vdup_n_s32(0);
+
+  // load dq
+  q0 = vld1q_s16(dq);
+  dq += 8;
+  q1 = vld1q_s16(dq);
+
+  // load q
+  q2 = vld1q_s16(q);
+  vst1q_s16(q, qEmpty);
+  q += 8;
+  q3 = vld1q_s16(q);
+  vst1q_s16(q, qEmpty);
+  q += 8;
+  q4 = vld1q_s16(q);
+  vst1q_s16(q, qEmpty);
+  q += 8;
+  q5 = vld1q_s16(q);
+  vst1q_s16(q, qEmpty);
+
+  // load src from dst
+  dst0 = dst;
+  dst1 = dst + 4;
+  d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
+  dst0 += stride;
+  d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
+  dst1 += stride;
+  d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
+  dst0 += stride;
+  d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
+  dst1 += stride;
+
+  d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
+  dst0 += stride;
+  d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
+  dst1 += stride;
+  d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
+  d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
+
+  q2 = vmulq_s16(q2, q0);
+  q3 = vmulq_s16(q3, q1);
+  q4 = vmulq_s16(q4, q0);
+  q5 = vmulq_s16(q5, q1);
+
+  // vswp
+  dLow0 = vget_low_s16(q2);
+  dHigh0 = vget_high_s16(q2);
+  dLow1 = vget_low_s16(q4);
+  dHigh1 = vget_high_s16(q4);
+  q2 = vcombine_s16(dLow0, dLow1);
+  q4 = vcombine_s16(dHigh0, dHigh1);
+
+  dLow0 = vget_low_s16(q3);
+  dHigh0 = vget_high_s16(q3);
+  dLow1 = vget_low_s16(q5);
+  dHigh1 = vget_high_s16(q5);
+  q3 = vcombine_s16(dLow0, dLow1);
+  q5 = vcombine_s16(dHigh0, dHigh1);
+
+  q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
+  q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
+  q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
+  q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
+
+  q10 = vqaddq_s16(q2, q3);
+  q11 = vqsubq_s16(q2, q3);
+
+  q8 = vshrq_n_s16(q8, 1);
+  q9 = vshrq_n_s16(q9, 1);
+
+  q4 = vqaddq_s16(q4, q8);
+  q5 = vqaddq_s16(q5, q9);
+
+  q2 = vqsubq_s16(q6, q5);
+  q3 = vqaddq_s16(q7, q4);
+
+  q4 = vqaddq_s16(q10, q3);
+  q5 = vqaddq_s16(q11, q2);
+  q6 = vqsubq_s16(q11, q2);
+  q7 = vqsubq_s16(q10, q3);
+
+  q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+  q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+  q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+                     vreinterpretq_s16_s32(q2tmp1.val[0]));
+  q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+                     vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+  // loop 2
+  q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
+  q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
+  q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
+  q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
+
+  q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+  q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+
+  q10 = vshrq_n_s16(q10, 1);
+  q11 = vshrq_n_s16(q11, 1);
+
+  q10 = vqaddq_s16(q2tmp2.val[1], q10);
+  q11 = vqaddq_s16(q2tmp3.val[1], q11);
+
+  q8 = vqsubq_s16(q8, q11);
+  q9 = vqaddq_s16(q9, q10);
+
+  q4 = vqaddq_s16(q2, q9);
+  q5 = vqaddq_s16(q3, q8);
+  q6 = vqsubq_s16(q3, q8);
+  q7 = vqsubq_s16(q2, q9);
+
+  q4 = vrshrq_n_s16(q4, 3);
+  q5 = vrshrq_n_s16(q5, 3);
+  q6 = vrshrq_n_s16(q6, 3);
+  q7 = vrshrq_n_s16(q7, 3);
+
+  q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+  q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+  q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+                     vreinterpretq_s16_s32(q2tmp1.val[0]));
+  q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+                     vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+  q4 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28)));
+  q5 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29)));
+  q6 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30)));
+  q7 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31)));
+
+  d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
+  d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
+  d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
+  d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
+
+  dst0 = dst;
+  dst1 = dst + 4;
+  vst1_lane_s32((int32_t *)dst0, d28, 0);
+  dst0 += stride;
+  vst1_lane_s32((int32_t *)dst1, d28, 1);
+  dst1 += stride;
+  vst1_lane_s32((int32_t *)dst0, d29, 0);
+  dst0 += stride;
+  vst1_lane_s32((int32_t *)dst1, d29, 1);
+  dst1 += stride;
+
+  vst1_lane_s32((int32_t *)dst0, d30, 0);
+  dst0 += stride;
+  vst1_lane_s32((int32_t *)dst1, d30, 1);
+  dst1 += stride;
+  vst1_lane_s32((int32_t *)dst0, d31, 0);
+  vst1_lane_s32((int32_t *)dst1, d31, 1);
+}
 
 void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst,
                                        int stride, char *eobs) {
diff --git a/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
deleted file mode 100644
index c83102a..0000000
--- a/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst,
-                            int stride) {
-  unsigned char *dst0;
-  int i, a0, a1;
-  int16x8x2_t q2Add;
-  int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0);
-  uint8x8_t d2u8, d4u8;
-  uint16x8_t q1u16, q2u16;
-
-  a0 = ((q[0] * dq) + 4) >> 3;
-  a1 = ((q[16] * dq) + 4) >> 3;
-  q[0] = q[16] = 0;
-  q2Add.val[0] = vdupq_n_s16((int16_t)a0);
-  q2Add.val[1] = vdupq_n_s16((int16_t)a1);
-
-  for (i = 0; i < 2; i++, dst += 4) {
-    dst0 = dst;
-    d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
-    dst0 += stride;
-    d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
-    dst0 += stride;
-    d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
-    dst0 += stride;
-    d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
-
-    q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
-                     vreinterpret_u8_s32(d2s32));
-    q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
-                     vreinterpret_u8_s32(d4s32));
-
-    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
-    d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
-
-    d2s32 = vreinterpret_s32_u8(d2u8);
-    d4s32 = vreinterpret_s32_u8(d4u8);
-
-    dst0 = dst;
-    vst1_lane_s32((int32_t *)dst0, d2s32, 0);
-    dst0 += stride;
-    vst1_lane_s32((int32_t *)dst0, d2s32, 1);
-    dst0 += stride;
-    vst1_lane_s32((int32_t *)dst0, d4s32, 0);
-    dst0 += stride;
-    vst1_lane_s32((int32_t *)dst0, d4s32, 1);
-  }
-  return;
-}
diff --git a/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
deleted file mode 100644
index f30671c..0000000
--- a/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-static const int16_t cospi8sqrt2minus1 = 20091;
-static const int16_t sinpi8sqrt2 = 17734;
-// because the lowest bit in 0x8a8c is 0, we can pre-shift this
-
-void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq, unsigned char *dst,
-                               int stride) {
-  unsigned char *dst0, *dst1;
-  int32x2_t d28, d29, d30, d31;
-  int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
-  int16x8_t qEmpty = vdupq_n_s16(0);
-  int32x4x2_t q2tmp0, q2tmp1;
-  int16x8x2_t q2tmp2, q2tmp3;
-  int16x4_t dLow0, dLow1, dHigh0, dHigh1;
-
-  d28 = d29 = d30 = d31 = vdup_n_s32(0);
-
-  // load dq
-  q0 = vld1q_s16(dq);
-  dq += 8;
-  q1 = vld1q_s16(dq);
-
-  // load q
-  q2 = vld1q_s16(q);
-  vst1q_s16(q, qEmpty);
-  q += 8;
-  q3 = vld1q_s16(q);
-  vst1q_s16(q, qEmpty);
-  q += 8;
-  q4 = vld1q_s16(q);
-  vst1q_s16(q, qEmpty);
-  q += 8;
-  q5 = vld1q_s16(q);
-  vst1q_s16(q, qEmpty);
-
-  // load src from dst
-  dst0 = dst;
-  dst1 = dst + 4;
-  d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
-  dst0 += stride;
-  d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
-  dst1 += stride;
-  d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
-  dst0 += stride;
-  d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
-  dst1 += stride;
-
-  d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
-  dst0 += stride;
-  d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
-  dst1 += stride;
-  d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
-  d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
-
-  q2 = vmulq_s16(q2, q0);
-  q3 = vmulq_s16(q3, q1);
-  q4 = vmulq_s16(q4, q0);
-  q5 = vmulq_s16(q5, q1);
-
-  // vswp
-  dLow0 = vget_low_s16(q2);
-  dHigh0 = vget_high_s16(q2);
-  dLow1 = vget_low_s16(q4);
-  dHigh1 = vget_high_s16(q4);
-  q2 = vcombine_s16(dLow0, dLow1);
-  q4 = vcombine_s16(dHigh0, dHigh1);
-
-  dLow0 = vget_low_s16(q3);
-  dHigh0 = vget_high_s16(q3);
-  dLow1 = vget_low_s16(q5);
-  dHigh1 = vget_high_s16(q5);
-  q3 = vcombine_s16(dLow0, dLow1);
-  q5 = vcombine_s16(dHigh0, dHigh1);
-
-  q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
-  q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
-  q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
-  q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
-
-  q10 = vqaddq_s16(q2, q3);
-  q11 = vqsubq_s16(q2, q3);
-
-  q8 = vshrq_n_s16(q8, 1);
-  q9 = vshrq_n_s16(q9, 1);
-
-  q4 = vqaddq_s16(q4, q8);
-  q5 = vqaddq_s16(q5, q9);
-
-  q2 = vqsubq_s16(q6, q5);
-  q3 = vqaddq_s16(q7, q4);
-
-  q4 = vqaddq_s16(q10, q3);
-  q5 = vqaddq_s16(q11, q2);
-  q6 = vqsubq_s16(q11, q2);
-  q7 = vqsubq_s16(q10, q3);
-
-  q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
-  q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
-  q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
-                     vreinterpretq_s16_s32(q2tmp1.val[0]));
-  q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
-                     vreinterpretq_s16_s32(q2tmp1.val[1]));
-
-  // loop 2
-  q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
-  q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
-  q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
-  q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
-
-  q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
-  q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
-
-  q10 = vshrq_n_s16(q10, 1);
-  q11 = vshrq_n_s16(q11, 1);
-
-  q10 = vqaddq_s16(q2tmp2.val[1], q10);
-  q11 = vqaddq_s16(q2tmp3.val[1], q11);
-
-  q8 = vqsubq_s16(q8, q11);
-  q9 = vqaddq_s16(q9, q10);
-
-  q4 = vqaddq_s16(q2, q9);
-  q5 = vqaddq_s16(q3, q8);
-  q6 = vqsubq_s16(q3, q8);
-  q7 = vqsubq_s16(q2, q9);
-
-  q4 = vrshrq_n_s16(q4, 3);
-  q5 = vrshrq_n_s16(q5, 3);
-  q6 = vrshrq_n_s16(q6, 3);
-  q7 = vrshrq_n_s16(q7, 3);
-
-  q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
-  q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
-  q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
-                     vreinterpretq_s16_s32(q2tmp1.val[0]));
-  q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
-                     vreinterpretq_s16_s32(q2tmp1.val[1]));
-
-  q4 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28)));
-  q5 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29)));
-  q6 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30)));
-  q7 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31)));
-
-  d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
-  d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
-  d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
-  d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
-
-  dst0 = dst;
-  dst1 = dst + 4;
-  vst1_lane_s32((int32_t *)dst0, d28, 0);
-  dst0 += stride;
-  vst1_lane_s32((int32_t *)dst1, d28, 1);
-  dst1 += stride;
-  vst1_lane_s32((int32_t *)dst0, d29, 0);
-  dst0 += stride;
-  vst1_lane_s32((int32_t *)dst1, d29, 1);
-  dst1 += stride;
-
-  vst1_lane_s32((int32_t *)dst0, d30, 0);
-  dst0 += stride;
-  vst1_lane_s32((int32_t *)dst1, d30, 1);
-  dst1 += stride;
-  vst1_lane_s32((int32_t *)dst0, d31, 0);
-  vst1_lane_s32((int32_t *)dst1, d31, 1);
-  return;
-}
diff --git a/vp8/common/arm/neon/iwalsh_neon.c b/vp8/common/arm/neon/iwalsh_neon.c
index 6c4bcc1..91600bf 100644
--- a/vp8/common/arm/neon/iwalsh_neon.c
+++ b/vp8/common/arm/neon/iwalsh_neon.c
@@ -10,6 +10,8 @@
 
 #include <arm_neon.h>
 
+#include "./vp8_rtcd.h"
+
 void vp8_short_inv_walsh4x4_neon(int16_t *input, int16_t *mb_dqcoeff) {
   int16x8_t q0s16, q1s16, q2s16, q3s16;
   int16x4_t d4s16, d5s16, d6s16, d7s16;
diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
index a168219..df983b2 100644
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
@@ -9,7 +9,9 @@
  */
 
 #include <arm_neon.h>
+
 #include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 
 static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
     unsigned char *s, int p, const unsigned char *blimit) {
diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
index 80a222d..fbc83ae 100644
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -9,7 +9,9 @@
  */
 
 #include <arm_neon.h>
+
 #include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 #include "vpx_ports/arm.h"
 
 #ifdef VPX_INCOMPATIBLE_GCC
diff --git a/vp8/common/arm/neon/mbloopfilter_neon.c b/vp8/common/arm/neon/mbloopfilter_neon.c
index 65eec30..fafaf2d 100644
--- a/vp8/common/arm/neon/mbloopfilter_neon.c
+++ b/vp8/common/arm/neon/mbloopfilter_neon.c
@@ -9,7 +9,9 @@
  */
 
 #include <arm_neon.h>
+
 #include "./vpx_config.h"
+#include "vp8/common/arm/loopfilter_arm.h"
 
 static INLINE void vp8_mbloop_filter_neon(uint8x16_t qblimit,  // mblimit
                                           uint8x16_t qlimit,   // limit
diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c
index aa2567d..48e86d3 100644
--- a/vp8/common/arm/neon/sixtappredict_neon.c
+++ b/vp8/common/arm/neon/sixtappredict_neon.c
@@ -11,6 +11,7 @@
 #include <arm_neon.h>
 #include <string.h>
 #include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_ports/mem.h"
 
diff --git a/vp8/common/arm/neon/vp8_loopfilter_neon.c b/vp8/common/arm/neon/vp8_loopfilter_neon.c
index d728673..ebc004a 100644
--- a/vp8/common/arm/neon/vp8_loopfilter_neon.c
+++ b/vp8/common/arm/neon/vp8_loopfilter_neon.c
@@ -9,7 +9,9 @@
  */
 
 #include <arm_neon.h>
+
 #include "./vpx_config.h"
+#include "vp8/common/arm/loopfilter_arm.h"
 #include "vpx_ports/arm.h"
 
 static INLINE void vp8_loop_filter_neon(uint8x16_t qblimit,  // flimit
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 1ff2e5c..2ed19c4 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -151,124 +151,6 @@
 }
 #endif  // CONFIG_POSTPROC
 
-/* Blend the macro block with a solid colored square.  Leave the
- * edges unblended to give distinction to macro blocks in areas
- * filled with the same color block.
- */
-void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v,
-                          int y_1, int u_1, int v_1, int alpha, int stride) {
-  int i, j;
-  int y1_const = y_1 * ((1 << 16) - alpha);
-  int u1_const = u_1 * ((1 << 16) - alpha);
-  int v1_const = v_1 * ((1 << 16) - alpha);
-
-  y += 2 * stride + 2;
-  for (i = 0; i < 12; ++i) {
-    for (j = 0; j < 12; ++j) {
-      y[j] = (y[j] * alpha + y1_const) >> 16;
-    }
-    y += stride;
-  }
-
-  stride >>= 1;
-
-  u += stride + 1;
-  v += stride + 1;
-
-  for (i = 0; i < 6; ++i) {
-    for (j = 0; j < 6; ++j) {
-      u[j] = (u[j] * alpha + u1_const) >> 16;
-      v[j] = (v[j] * alpha + v1_const) >> 16;
-    }
-    u += stride;
-    v += stride;
-  }
-}
-
-/* Blend only the edge of the macro block.  Leave center
- * unblended to allow for other visualizations to be layered.
- */
-void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v,
-                          int y_1, int u_1, int v_1, int alpha, int stride) {
-  int i, j;
-  int y1_const = y_1 * ((1 << 16) - alpha);
-  int u1_const = u_1 * ((1 << 16) - alpha);
-  int v1_const = v_1 * ((1 << 16) - alpha);
-
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < 16; ++j) {
-      y[j] = (y[j] * alpha + y1_const) >> 16;
-    }
-    y += stride;
-  }
-
-  for (i = 0; i < 12; ++i) {
-    y[0] = (y[0] * alpha + y1_const) >> 16;
-    y[1] = (y[1] * alpha + y1_const) >> 16;
-    y[14] = (y[14] * alpha + y1_const) >> 16;
-    y[15] = (y[15] * alpha + y1_const) >> 16;
-    y += stride;
-  }
-
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < 16; ++j) {
-      y[j] = (y[j] * alpha + y1_const) >> 16;
-    }
-    y += stride;
-  }
-
-  stride >>= 1;
-
-  for (j = 0; j < 8; ++j) {
-    u[j] = (u[j] * alpha + u1_const) >> 16;
-    v[j] = (v[j] * alpha + v1_const) >> 16;
-  }
-  u += stride;
-  v += stride;
-
-  for (i = 0; i < 6; ++i) {
-    u[0] = (u[0] * alpha + u1_const) >> 16;
-    v[0] = (v[0] * alpha + v1_const) >> 16;
-
-    u[7] = (u[7] * alpha + u1_const) >> 16;
-    v[7] = (v[7] * alpha + v1_const) >> 16;
-
-    u += stride;
-    v += stride;
-  }
-
-  for (j = 0; j < 8; ++j) {
-    u[j] = (u[j] * alpha + u1_const) >> 16;
-    v[j] = (v[j] * alpha + v1_const) >> 16;
-  }
-}
-
-void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v,
-                   int y_1, int u_1, int v_1, int alpha, int stride) {
-  int i, j;
-  int y1_const = y_1 * ((1 << 16) - alpha);
-  int u1_const = u_1 * ((1 << 16) - alpha);
-  int v1_const = v_1 * ((1 << 16) - alpha);
-
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j) {
-      y[j] = (y[j] * alpha + y1_const) >> 16;
-    }
-    y += stride;
-  }
-
-  stride >>= 1;
-
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < 2; ++j) {
-      u[j] = (u[j] * alpha + u1_const) >> 16;
-      v[j] = (v[j] * alpha + v1_const) >> 16;
-    }
-    u += stride;
-    v += stride;
-  }
-}
-
 #if CONFIG_POSTPROC
 int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest,
                         vp8_ppflags_t *ppflags) {
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 235c77e..8452b5e 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -177,10 +177,8 @@
 #
 # Block copy
 #
-if ($opts{arch} =~ /x86/) {
-    add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height";
-    specialize qw/vp8_copy32xn sse2 sse3/;
-}
+add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height";
+specialize qw/vp8_copy32xn sse2 sse3/;
 
 #
 # Forward DCT
diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h
index e342d7c..f2a18f0 100644
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@@ -76,7 +76,7 @@
   }
 
   {
-    const int shift = vp8_norm[range];
+    const unsigned char shift = vp8_norm[(unsigned char)range];
     range <<= shift;
     value <<= shift;
     count -= shift;
diff --git a/vp8/decoder/error_concealment.c b/vp8/decoder/error_concealment.c
index e221414..85982e4 100644
--- a/vp8/decoder/error_concealment.c
+++ b/vp8/decoder/error_concealment.c
@@ -147,8 +147,8 @@
   }
 }
 
-void vp8_calculate_overlaps(MB_OVERLAP *overlap_ul, int mb_rows, int mb_cols,
-                            union b_mode_info *bmi, int b_row, int b_col) {
+static void calculate_overlaps(MB_OVERLAP *overlap_ul, int mb_rows, int mb_cols,
+                               union b_mode_info *bmi, int b_row, int b_col) {
   MB_OVERLAP *mb_overlap;
   int row, col, rel_row, rel_col;
   int new_row, new_col;
@@ -280,9 +280,9 @@
   int sub_col;
   for (sub_row = 0; sub_row < 4; ++sub_row) {
     for (sub_col = 0; sub_col < 4; ++sub_col) {
-      vp8_calculate_overlaps(overlaps, mb_rows, mb_cols,
-                             &(prev_mi->bmi[sub_row * 4 + sub_col]),
-                             4 * mb_row + sub_row, 4 * mb_col + sub_col);
+      calculate_overlaps(overlaps, mb_rows, mb_cols,
+                         &(prev_mi->bmi[sub_row * 4 + sub_col]),
+                         4 * mb_row + sub_row, 4 * mb_col + sub_col);
     }
   }
 }
diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.c b/vp8/encoder/arm/neon/fastquantizeb_neon.c
index d066be1..6fc6080 100644
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.c
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -9,6 +9,8 @@
  */
 
 #include <arm_neon.h>
+
+#include "./vp8_rtcd.h"
 #include "vp8/encoder/block.h"
 
 static const uint16_t inv_zig_zag[16] = { 1, 2, 6,  7,  3,  5,  8,  13,
diff --git a/vp8/encoder/arm/neon/shortfdct_neon.c b/vp8/encoder/arm/neon/shortfdct_neon.c
index 76853e6..99dff6b 100644
--- a/vp8/encoder/arm/neon/shortfdct_neon.c
+++ b/vp8/encoder/arm/neon/shortfdct_neon.c
@@ -10,6 +10,8 @@
 
 #include <arm_neon.h>
 
+#include "./vp8_rtcd.h"
+
 void vp8_short_fdct4x4_neon(int16_t *input, int16_t *output, int pitch) {
   int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
   int16x4_t d16s16, d17s16, d26s16, dEmptys16;
diff --git a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
index 8d6ea4c..02056f2 100644
--- a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
+++ b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
@@ -9,6 +9,8 @@
  */
 
 #include <arm_neon.h>
+
+#include "./vp8_rtcd.h"
 #include "vpx_ports/arm.h"
 
 #ifdef VPX_INCOMPATIBLE_GCC
diff --git a/vp8/common/copy_c.c b/vp8/encoder/copy_c.c
similarity index 100%
rename from vp8/common/copy_c.c
rename to vp8/encoder/copy_c.c
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index 1c3612f..999d6e8 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -1129,6 +1129,7 @@
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
 
+#if HAVE_SSE2 || HAVE_MSA
 int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
                              int_mv *best_mv, int search_param, int sad_per_bit,
                              int *num00, vp8_variance_fn_ptr_t *fn_ptr,
@@ -1277,6 +1278,7 @@
   return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
+#endif  // HAVE_SSE2 || HAVE_MSA
 
 int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
                           int sad_per_bit, int distance,
@@ -1364,6 +1366,7 @@
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
 
+#if HAVE_SSSE3
 int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
                           int sad_per_bit, int distance,
                           vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
@@ -1482,7 +1485,9 @@
   return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
+#endif  // HAVE_SSSE3
 
+#if HAVE_SSE4_1
 int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
                           int sad_per_bit, int distance,
                           vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
@@ -1628,6 +1633,7 @@
   return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
+#endif  // HAVE_SSE4_1
 
 int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                               int_mv *ref_mv, int error_per_bit,
@@ -1707,6 +1713,7 @@
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
 
+#if HAVE_SSE2 || HAVE_MSA
 int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                               int_mv *ref_mv, int error_per_bit,
                               int search_range, vp8_variance_fn_ptr_t *fn_ptr,
@@ -1816,3 +1823,4 @@
   return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
+#endif  // HAVE_SSE2 || HAVE_MSA
diff --git a/vp8/common/x86/copy_sse2.asm b/vp8/encoder/x86/copy_sse2.asm
similarity index 100%
rename from vp8/common/x86/copy_sse2.asm
rename to vp8/encoder/x86/copy_sse2.asm
diff --git a/vp8/common/x86/copy_sse3.asm b/vp8/encoder/x86/copy_sse3.asm
similarity index 100%
rename from vp8/common/x86/copy_sse3.asm
rename to vp8/encoder/x86/copy_sse3.asm
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 9f106a2..3b442b1 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -15,7 +15,6 @@
 VP8_COMMON_SRCS-yes += common/alloccommon.c
 VP8_COMMON_SRCS-yes += common/blockd.c
 VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
-VP8_COMMON_SRCS-yes += common/copy_c.c
 # VP8_COMMON_SRCS-yes += common/debugmodes.c
 VP8_COMMON_SRCS-yes += common/default_coef_probs.h
 VP8_COMMON_SRCS-yes += common/dequantize.c
@@ -80,7 +79,6 @@
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
@@ -88,7 +86,6 @@
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm
 VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
 
 ifeq ($(CONFIG_POSTPROC),yes)
@@ -129,14 +126,13 @@
 
 # common (neon intrinsics)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/loopfilter_arm.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/loopfilter_arm.h
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/bilinearpredict_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/copymem_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dc_only_idct_add_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequant_idct_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequantizeb_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_blk_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_0_2x_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_full_2x_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/iwalsh_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_loopfilter_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimplehorizontaledge_neon.c
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index ab85edd..3a8f8ea 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -23,6 +23,7 @@
 VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h
 VP8_CX_SRCS-yes += encoder/bitstream.c
 VP8_CX_SRCS-yes += encoder/boolhuff.c
+VP8_CX_SRCS-yes += encoder/copy_c.c
 VP8_CX_SRCS-yes += encoder/dct.c
 VP8_CX_SRCS-yes += encoder/encodeframe.c
 VP8_CX_SRCS-yes += encoder/encodeframe.h
@@ -82,6 +83,8 @@
 VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.h
 endif
 
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/copy_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/copy_sse3.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_quantize_sse2.c
diff --git a/vp9/common/ppc/vp9_idct_vsx.c b/vp9/common/ppc/vp9_idct_vsx.c
index 1b2a93e..e861596 100644
--- a/vp9/common/ppc/vp9_idct_vsx.c
+++ b/vp9/common/ppc/vp9_idct_vsx.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 
+#include "./vp9_rtcd.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/ppc/inv_txfm_vsx.h"
 #include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"
diff --git a/vp9/encoder/vp9_blockiness.c b/vp9/encoder/vp9_blockiness.c
index 9ab57b5..da68a3c 100644
--- a/vp9/encoder/vp9_blockiness.c
+++ b/vp9/encoder/vp9_blockiness.c
@@ -11,6 +11,7 @@
 
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/system_state.h"
+#include "vp9/encoder/vp9_blockiness.h"
 
 static int horizontal_filter(const uint8_t *s) {
   return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
diff --git a/vp9/encoder/vp9_blockiness.h b/vp9/encoder/vp9_blockiness.h
new file mode 100644
index 0000000..e840cb2
--- /dev/null
+++ b/vp9/encoder/vp9_blockiness.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_BLOCKINESS_H_
+#define VPX_VP9_ENCODER_VP9_BLOCKINESS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+double vp9_get_blockiness(const uint8_t *img1, int img1_pitch,
+                          const uint8_t *img2, int img2_pitch, int width,
+                          int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VP9_ENCODER_VP9_BLOCKINESS_H_
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index fb0cc23..bf35b35 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -43,6 +43,9 @@
 #include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 #include "vp9/encoder/vp9_aq_variance.h"
 #include "vp9/encoder/vp9_bitstream.h"
+#if CONFIG_INTERNAL_STATS
+#include "vp9/encoder/vp9_blockiness.h"
+#endif
 #include "vp9/encoder/vp9_context_tree.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
@@ -5162,10 +5165,6 @@
 }
 
 #if CONFIG_INTERNAL_STATS
-extern double vp9_get_blockiness(const uint8_t *img1, int img1_pitch,
-                                 const uint8_t *img2, int img2_pitch, int width,
-                                 int height);
-
 static void adjust_image_stat(double y, double u, double v, double all,
                               ImageStat *s) {
   s->stat[Y] += y;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 03ac934..8f0da48 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -820,6 +820,8 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   TileInfo tile = tile_data->tile_info;
+  const int mb_col_start = ROUND_POWER_OF_TWO(tile.mi_col_start, 1);
+  const int mb_col_end = ROUND_POWER_OF_TWO(tile.mi_col_end, 1);
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
   const PICK_MODE_CONTEXT *ctx = &td->pc_root->none;
@@ -846,9 +848,8 @@
   assert(new_yv12 != NULL);
   assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
 
-  xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) +
-           (tile.mi_col_start >> 1);
-  xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + (tile.mi_col_start >> 1);
+  xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) + mb_col_start;
+  xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + mb_col_start;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][1];
@@ -862,10 +863,9 @@
   uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
 
   // Reset above block coeffs.
-  recon_yoffset =
-      (mb_row * recon_y_stride * 16) + (tile.mi_col_start >> 1) * 16;
-  recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height) +
-                   (tile.mi_col_start >> 1) * uv_mb_height;
+  recon_yoffset = (mb_row * recon_y_stride * 16) + mb_col_start * 16;
+  recon_uvoffset =
+      (mb_row * recon_uv_stride * uv_mb_height) + mb_col_start * uv_mb_height;
 
   // Set up limit values for motion vectors to prevent them extending
   // outside the UMV borders.
@@ -873,8 +873,7 @@
   x->mv_limits.row_max =
       ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16;
 
-  for (mb_col = tile.mi_col_start >> 1, c = 0; mb_col < (tile.mi_col_end >> 1);
-       ++mb_col, c++) {
+  for (mb_col = mb_col_start, c = 0; mb_col < mb_col_end; ++mb_col, c++) {
     int this_error;
     int this_intra_error;
     const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
@@ -920,7 +919,7 @@
     x->skip_encode = 0;
     x->fp_src_pred = 0;
     // Do intra prediction based on source pixels for tile boundaries
-    if ((mb_col == (tile.mi_col_start >> 1)) && mb_col != 0) {
+    if (mb_col == mb_col_start && mb_col != 0) {
       xd->left_mi = &mi_left;
       x->fp_src_pred = 1;
     }
@@ -1310,7 +1309,7 @@
     recon_uvoffset += uv_mb_height;
 
     // Accumulate row level stats to the corresponding tile stats
-    if (cpi->row_mt && mb_col == (tile.mi_col_end >> 1) - 1)
+    if (cpi->row_mt && mb_col == mb_col_end - 1)
       accumulate_fp_mb_row_stat(tile_data, fp_acc_data);
 
     (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, mb_row, c,
diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
index cae524e..8426b94 100644
--- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
@@ -499,14 +499,14 @@
         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
 
         store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
-        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs + 8);
+        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
 
         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
         dequant = _mm_unpackhi_epi64(dequant, dequant);
         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
 
         store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
-        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs + 8);
+        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
       }
 
       {
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index f5a76dc..05981d6 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -75,6 +75,7 @@
 VP9_CX_SRCS-yes += encoder/vp9_resize.c
 VP9_CX_SRCS-yes += encoder/vp9_resize.h
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.c
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.h
 
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
diff --git a/vpx_dsp/arm/fdct_neon.c b/vpx_dsp/arm/fdct_neon.c
index 04646ed..3708cbb 100644
--- a/vpx_dsp/arm/fdct_neon.c
+++ b/vpx_dsp/arm/fdct_neon.c
@@ -11,6 +11,7 @@
 #include <arm_neon.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/arm/idct_neon.h"
diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c
index 8049277..374a262 100644
--- a/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/vpx_dsp/arm/fwd_txfm_neon.c
@@ -11,6 +11,7 @@
 #include <arm_neon.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/arm/idct_neon.h"
diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c
index 1ce66d3..c4a49e3 100644
--- a/vpx_dsp/arm/sad_neon.c
+++ b/vpx_dsp/arm/sad_neon.c
@@ -11,6 +11,7 @@
 #include <arm_neon.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/arm/mem_neon.h"
diff --git a/vpx_dsp/arm/subtract_neon.c b/vpx_dsp/arm/subtract_neon.c
index eef1233..612897e 100644
--- a/vpx_dsp/arm/subtract_neon.c
+++ b/vpx_dsp/arm/subtract_neon.c
@@ -12,6 +12,7 @@
 #include <assert.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/arm/mem_neon.h"
 
diff --git a/vpx_dsp/bitreader.h b/vpx_dsp/bitreader.h
index fbc1003..68e1bd6 100644
--- a/vpx_dsp/bitreader.h
+++ b/vpx_dsp/bitreader.h
@@ -94,7 +94,7 @@
   }
 
   {
-    const int shift = vpx_norm[range];
+    const unsigned char shift = vpx_norm[(unsigned char)range];
     range <<= shift;
     value <<= shift;
     count -= shift;
diff --git a/vpx_dsp/ppc/fdct32x32_vsx.c b/vpx_dsp/ppc/fdct32x32_vsx.c
index 0156683..6110716 100644
--- a/vpx_dsp/ppc/fdct32x32_vsx.c
+++ b/vpx_dsp/ppc/fdct32x32_vsx.c
@@ -11,9 +11,9 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 
-#include "vpx_dsp/ppc/types_vsx.h"
-#include "vpx_dsp/ppc/txfm_common_vsx.h"
 #include "vpx_dsp/ppc/transpose_vsx.h"
+#include "vpx_dsp/ppc/txfm_common_vsx.h"
+#include "vpx_dsp/ppc/types_vsx.h"
 
 // Returns ((a +/- b) * cospi16 + (2 << 13)) >> 14.
 static INLINE void single_butterfly(int16x8_t a, int16x8_t b, int16x8_t *add,
@@ -223,7 +223,7 @@
   return vec_sra(vec_add(vec_add(a, vec_ones_s16), sign), vec_dct_scale_log2);
 }
 
-void vpx_fdct32_vsx(const int16x8_t *in, int16x8_t *out, int pass) {
+static void fdct32_vsx(const int16x8_t *in, int16x8_t *out, int pass) {
   int16x8_t temp0[32];  // Hold stages: 1, 4, 7
   int16x8_t temp1[32];  // Hold stages: 2, 5
   int16x8_t temp2[32];  // Hold stages: 3, 6
@@ -478,16 +478,16 @@
 
   // Process in 8x32 columns.
   load(input, stride, temp0);
-  vpx_fdct32_vsx(temp0, temp1, 0);
+  fdct32_vsx(temp0, temp1, 0);
 
   load(input + 8, stride, temp0);
-  vpx_fdct32_vsx(temp0, temp2, 0);
+  fdct32_vsx(temp0, temp2, 0);
 
   load(input + 16, stride, temp0);
-  vpx_fdct32_vsx(temp0, temp3, 0);
+  fdct32_vsx(temp0, temp3, 0);
 
   load(input + 24, stride, temp0);
-  vpx_fdct32_vsx(temp0, temp4, 0);
+  fdct32_vsx(temp0, temp4, 0);
 
   // Generate the top row by munging the first set of 8 from each one
   // together.
@@ -496,7 +496,7 @@
   transpose_8x8(&temp3[0], &temp0[16]);
   transpose_8x8(&temp4[0], &temp0[24]);
 
-  vpx_fdct32_vsx(temp0, temp5, 1);
+  fdct32_vsx(temp0, temp5, 1);
 
   transpose_8x8(&temp5[0], &temp6[0]);
   transpose_8x8(&temp5[8], &temp6[8]);
@@ -511,7 +511,7 @@
   transpose_8x8(&temp3[8], &temp0[16]);
   transpose_8x8(&temp4[8], &temp0[24]);
 
-  vpx_fdct32_vsx(temp0, temp5, 1);
+  fdct32_vsx(temp0, temp5, 1);
 
   transpose_8x8(&temp5[0], &temp6[0]);
   transpose_8x8(&temp5[8], &temp6[8]);
@@ -526,7 +526,7 @@
   transpose_8x8(&temp3[16], &temp0[16]);
   transpose_8x8(&temp4[16], &temp0[24]);
 
-  vpx_fdct32_vsx(temp0, temp5, 1);
+  fdct32_vsx(temp0, temp5, 1);
 
   transpose_8x8(&temp5[0], &temp6[0]);
   transpose_8x8(&temp5[8], &temp6[8]);
@@ -541,7 +541,7 @@
   transpose_8x8(&temp3[24], &temp0[16]);
   transpose_8x8(&temp4[24], &temp0[24]);
 
-  vpx_fdct32_vsx(temp0, temp5, 1);
+  fdct32_vsx(temp0, temp5, 1);
 
   transpose_8x8(&temp5[0], &temp6[0]);
   transpose_8x8(&temp5[8], &temp6[8]);
diff --git a/vpx_dsp/ppc/subtract_vsx.c b/vpx_dsp/ppc/subtract_vsx.c
index 51a2415..76ad302 100644
--- a/vpx_dsp/ppc/subtract_vsx.c
+++ b/vpx_dsp/ppc/subtract_vsx.c
@@ -11,6 +11,7 @@
 #include <assert.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/ppc/types_vsx.h"
 
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 83cd9e8..37d1de0 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -46,6 +46,7 @@
 filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
 #endif
 
+#if ARCH_X86_64
 void vpx_filter_block1d4_h8_intrin_ssse3(
     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -197,6 +198,7 @@
     output_ptr += out_pitch;
   }
 }
+#endif  // ARCH_X86_64
 
 static void vpx_filter_block1d16_h4_ssse3(const uint8_t *src_ptr,
                                           ptrdiff_t src_stride,
diff --git a/vpx_ports/emms_mmx.c b/vpx_ports/emms_mmx.c
index 1b28809..f1036b9 100644
--- a/vpx_ports/emms_mmx.c
+++ b/vpx_ports/emms_mmx.c
@@ -10,4 +10,6 @@
 
 #include <mmintrin.h>
 
+#include "vpx_ports/system_state.h"
+
 void vpx_clear_system_state() { _mm_empty(); }