Merge "Use longer test clips in y4m_test"
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 02cc7ff..cef602e 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -416,8 +416,14 @@
     for (int i = 0; i < kOutputBufferSize; ++i) {
       if (IsIndexInBorder(i)) {
         output_[i] = 255;
+#if CONFIG_VP9_HIGHBITDEPTH
+        output16_[i] = mask_;
+#endif
       } else {
         output_[i] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+        output16_[i] = 0;
+#endif
       }
     }
 
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc
index b0ea61f..438eeb3 100644
--- a/test/external_frame_buffer_test.cc
+++ b/test/external_frame_buffer_test.cc
@@ -114,9 +114,9 @@
     return 0;
   }
 
-  // Checks that the ximage data is contained within the external frame buffer
-  // private data passed back in the ximage.
-  void CheckXImageFrameBuffer(const vpx_image_t *img) {
+  // Checks that the vpx_image_t data is contained within the external frame
+  // buffer private data passed back in the vpx_image_t.
+  void CheckImageFrameBuffer(const vpx_image_t *img) {
     if (img->fb_priv != NULL) {
       const struct ExternalFrameBuffer *const ext_fb =
           reinterpret_cast<ExternalFrameBuffer *>(img->fb_priv);
@@ -342,7 +342,7 @@
 
     // Get decompressed data
     while ((img = dec_iter.Next()) != NULL) {
-      fb_list_.CheckXImageFrameBuffer(img);
+      fb_list_.CheckImageFrameBuffer(img);
     }
   }
 
diff --git a/test/test.mk b/test/test.mk
index 3cf3202..2b76361 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -171,6 +171,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_scale_test.cc
 ifneq ($(CONFIG_REALTIME_ONLY),yes)
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += temporal_filter_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc
 endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc
diff --git a/test/yuv_temporal_filter_test.cc b/test/yuv_temporal_filter_test.cc
new file mode 100644
index 0000000..7333547
--- /dev/null
+++ b/test/yuv_temporal_filter_test.cc
@@ -0,0 +1,479 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "test/acm_random.h"
+#include "test/buffer.h"
+#include "test/register_state_check.h"
+#include "vpx_ports/vpx_timer.h"
+
+namespace {
+
+using ::libvpx_test::ACMRandom;
+using ::libvpx_test::Buffer;
+
+typedef void (*YUVTemporalFilterFunc)(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32,
+    uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator,
+    uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+
+static INLINE int get_filter_weight(unsigned int row, unsigned int col,
+                                    unsigned int block_height,
+                                    unsigned int block_width,
+                                    const int *const blk_fw, int use_32x32) {
+  if (use_32x32) {
+    return blk_fw[0];
+  }
+
+  return blk_fw[2 * (row >= block_height / 2) + (col >= block_width / 2)];
+}
+
+static INLINE int mod_index(int sum_dist, int index, int rounding, int strength,
+                            int filter_weight) {
+  int mod = (sum_dist * 3) / index;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = VPXMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+void reference_filter(
+    const Buffer<uint8_t> &y_src, const Buffer<uint8_t> &y_pre,
+    const Buffer<uint8_t> &u_src, const Buffer<uint8_t> &v_src,
+    const Buffer<uint8_t> &u_pre, const Buffer<uint8_t> &v_pre,
+    unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+    int strength, const int *const blk_fw, int use_32x32,
+    Buffer<uint32_t> *y_accumulator, Buffer<uint16_t> *y_count,
+    Buffer<uint32_t> *u_accumulator, Buffer<uint16_t> *u_count,
+    Buffer<uint32_t> *v_accumulator, Buffer<uint16_t> *v_count) {
+  // blk_fw means block_filter_weight
+  // Set up buffer to store squared_diffs
+  Buffer<int> y_dif = Buffer<int>(block_width, block_height, 0);
+  const int uv_block_width = block_width >> ss_x;
+  const int uv_block_height = block_height >> ss_y;
+  Buffer<int> u_dif = Buffer<int>(uv_block_width, uv_block_height, 0);
+  Buffer<int> v_dif = Buffer<int>(uv_block_width, uv_block_height, 0);
+  ASSERT_TRUE(y_dif.Init());
+  ASSERT_TRUE(u_dif.Init());
+  ASSERT_TRUE(v_dif.Init());
+  y_dif.Set(0);
+  u_dif.Set(0);
+  v_dif.Set(0);
+
+  // How many bits do we want to round off
+  ASSERT_GE(strength, 0);
+  ASSERT_LE(strength, 6);
+  int rounding = 0;
+  if (strength > 0) {
+    rounding = 1 << (strength - 1);
+  }
+
+  // Check that the buffers are valid
+  ASSERT_TRUE(y_src.TopLeftPixel() != NULL);
+  ASSERT_TRUE(y_pre.TopLeftPixel() != NULL);
+  ASSERT_TRUE(y_dif.TopLeftPixel() != NULL);
+  ASSERT_TRUE(u_src.TopLeftPixel() != NULL);
+  ASSERT_TRUE(u_pre.TopLeftPixel() != NULL);
+  ASSERT_TRUE(u_dif.TopLeftPixel() != NULL);
+  ASSERT_TRUE(v_src.TopLeftPixel() != NULL);
+  ASSERT_TRUE(v_pre.TopLeftPixel() != NULL);
+  ASSERT_TRUE(v_dif.TopLeftPixel() != NULL);
+
+  // Get the square diffs
+  for (int row = 0; row < (int)block_height; row++) {
+    for (int col = 0; col < (int)block_width; col++) {
+      int diff = y_src.TopLeftPixel()[row * y_src.stride() + col] -
+                 y_pre.TopLeftPixel()[row * y_pre.stride() + col];
+      y_dif.TopLeftPixel()[row * y_dif.stride() + col] = diff * diff;
+    }
+  }
+
+  for (int row = 0; row < uv_block_height; row++) {
+    for (int col = 0; col < uv_block_width; col++) {
+      int u_diff = u_src.TopLeftPixel()[row * u_src.stride() + col] -
+                   u_pre.TopLeftPixel()[row * u_pre.stride() + col];
+      int v_diff = v_src.TopLeftPixel()[row * v_src.stride() + col] -
+                   v_pre.TopLeftPixel()[row * v_pre.stride() + col];
+      u_dif.TopLeftPixel()[row * u_dif.stride() + col] = u_diff * u_diff;
+      v_dif.TopLeftPixel()[row * v_dif.stride() + col] = v_diff * v_diff;
+    }
+  }
+
+  // Apply the filter
+  for (int row = 0; row < (int)block_height; row++) {
+    for (int col = 0; col < (int)block_width; col++) {
+      const int uv_r = row >> ss_y;
+      const int uv_c = col >> ss_x;
+      int filter_weight = get_filter_weight(row, col, block_height, block_width,
+                                            blk_fw, use_32x32);
+
+      // First we get the modifier for the current y pixel
+      const int y_pixel = y_pre.TopLeftPixel()[row * y_pre.stride() + col];
+      int y_num_used = 0;
+      int y_mod = 0;
+
+      // Sum the neighboring 3x3 y pixels
+      for (int row_step = -1; row_step <= 1; row_step++) {
+        for (int col_step = -1; col_step <= 1; col_step++) {
+          const int sub_row = row + row_step;
+          const int sub_col = col + col_step;
+
+          if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 &&
+              sub_col < (int)block_width) {
+            y_mod += y_dif.TopLeftPixel()[sub_row * y_dif.stride() + sub_col];
+            y_num_used++;
+          }
+        }
+      }
+
+      ASSERT_GE(y_num_used, 0);
+
+      // Sum the corresponding uv pixels to the current y modifier
+      // Note we are rounding down instead of rounding to the nearest pixel.
+      y_mod += u_dif.TopLeftPixel()[uv_r * uv_block_width + uv_c];
+      y_mod += v_dif.TopLeftPixel()[uv_r * uv_block_width + uv_c];
+
+      y_num_used += 2;
+
+      // Set the modifier
+      y_mod = mod_index(y_mod, y_num_used, rounding, strength, filter_weight);
+
+      // Accumulate the result
+      y_count->TopLeftPixel()[row * y_count->stride() + col] += y_mod;
+      y_accumulator->TopLeftPixel()[row * y_accumulator->stride() + col] +=
+          y_mod * y_pixel;
+
+      // Get the modifier for chroma components
+      if (!(row & ss_y) && !(col & ss_x)) {
+        const int u_pixel = u_pre.TopLeftPixel()[uv_r * u_pre.stride() + uv_c];
+        const int v_pixel = v_pre.TopLeftPixel()[uv_r * v_pre.stride() + uv_c];
+
+        int uv_num_used = 0;
+        int u_mod = 0, v_mod = 0;
+
+        // Sum the neighboring 3x3 chroma pixels to the chroma modifier
+        for (int row_step = -1; row_step <= 1; row_step++) {
+          for (int col_step = -1; col_step <= 1; col_step++) {
+            const int sub_row = uv_r + row_step;
+            const int sub_col = uv_c + col_step;
+
+            if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 &&
+                sub_col < uv_block_width) {
+              u_mod += u_dif.TopLeftPixel()[sub_row * uv_block_width + sub_col];
+              v_mod += v_dif.TopLeftPixel()[sub_row * uv_block_width + sub_col];
+              uv_num_used++;
+            }
+          }
+        }
+
+        assert(uv_num_used > 0);
+
+        // Sum all the luma pixels associated with the current chroma pixel
+        for (int row_step = 0; row_step < 1 + ss_y; row_step++) {
+          for (int col_step = 0; col_step < 1 + ss_x; col_step++) {
+            const int sub_row = (uv_r << ss_y) + row_step;
+            const int sub_col = (uv_c << ss_x) + col_step;
+            const int y_diff =
+                y_dif.TopLeftPixel()[sub_row * y_dif.stride() + sub_col];
+
+            u_mod += y_diff;
+            v_mod += y_diff;
+            uv_num_used++;
+          }
+        }
+
+        // Set the modifier
+        u_mod =
+            mod_index(u_mod, uv_num_used, rounding, strength, filter_weight);
+        v_mod =
+            mod_index(v_mod, uv_num_used, rounding, strength, filter_weight);
+
+        // Accumulate the result
+        u_count->TopLeftPixel()[uv_r * u_count->stride() + uv_c] += u_mod;
+        u_accumulator->TopLeftPixel()[uv_r * u_accumulator->stride() + uv_c] +=
+            u_mod * u_pixel;
+        v_count->TopLeftPixel()[uv_r * v_count->stride() + uv_c] += v_mod;
+        v_accumulator->TopLeftPixel()[uv_r * v_accumulator->stride() + uv_c] +=
+            v_mod * v_pixel;
+      }
+    }
+  }
+}
+
+class YUVTemporalFilterTest
+    : public ::testing::TestWithParam<YUVTemporalFilterFunc> {
+ public:
+  virtual void SetUp() {
+    filter_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  YUVTemporalFilterFunc filter_func_;
+  ACMRandom rnd_;
+};
+
+TEST_P(YUVTemporalFilterTest, USE_32X32) {
+  const int width = 32, height = 32;
+  Buffer<uint8_t> y_src = Buffer<uint8_t>(width, height, 8);
+  Buffer<uint8_t> y_pre = Buffer<uint8_t>(width, height, 0);
+  Buffer<uint16_t> y_count_ref = Buffer<uint16_t>(width, height, 0);
+  Buffer<uint32_t> y_accum_ref = Buffer<uint32_t>(width, height, 0);
+  Buffer<uint16_t> y_count_tst = Buffer<uint16_t>(width, height, 0);
+  Buffer<uint32_t> y_accum_tst = Buffer<uint32_t>(width, height, 0);
+  ASSERT_TRUE(y_src.Init());
+  ASSERT_TRUE(y_pre.Init());
+  ASSERT_TRUE(y_count_ref.Init());
+  ASSERT_TRUE(y_accum_ref.Init());
+  ASSERT_TRUE(y_count_tst.Init());
+  ASSERT_TRUE(y_accum_tst.Init());
+
+  const int use_32x32 = 1;
+
+  for (int ss_x = 0; ss_x <= 1; ss_x++) {
+    for (int ss_y = 0; ss_y <= 1; ss_y++) {
+      for (int filter_strength = 0; filter_strength <= 6;
+           filter_strength += 2) {
+        for (int filter_weight = 0; filter_weight <= 2; filter_weight++) {
+          const int uv_width = width >> ss_x, uv_height = height >> ss_y;
+          Buffer<uint8_t> u_src = Buffer<uint8_t>(uv_width, uv_height, 8);
+          Buffer<uint8_t> u_pre = Buffer<uint8_t>(uv_width, uv_height, 0);
+          Buffer<uint16_t> u_count_ref =
+              Buffer<uint16_t>(uv_width, uv_height, 0);
+          Buffer<uint32_t> u_accum_ref =
+              Buffer<uint32_t>(uv_width, uv_height, 0);
+          Buffer<uint16_t> u_count_tst =
+              Buffer<uint16_t>(uv_width, uv_height, 0);
+          Buffer<uint32_t> u_accum_tst =
+              Buffer<uint32_t>(uv_width, uv_height, 0);
+          ASSERT_TRUE(u_src.Init());
+          ASSERT_TRUE(u_pre.Init());
+          ASSERT_TRUE(u_count_ref.Init());
+          ASSERT_TRUE(u_accum_ref.Init());
+          ASSERT_TRUE(u_count_tst.Init());
+          ASSERT_TRUE(u_accum_tst.Init());
+          Buffer<uint8_t> v_src = Buffer<uint8_t>(uv_width, uv_height, 8);
+          Buffer<uint8_t> v_pre = Buffer<uint8_t>(uv_width, uv_height, 0);
+          Buffer<uint16_t> v_count_ref =
+              Buffer<uint16_t>(uv_width, uv_height, 0);
+          Buffer<uint32_t> v_accum_ref =
+              Buffer<uint32_t>(uv_width, uv_height, 0);
+          Buffer<uint16_t> v_count_tst =
+              Buffer<uint16_t>(uv_width, uv_height, 0);
+          Buffer<uint32_t> v_accum_tst =
+              Buffer<uint32_t>(uv_width, uv_height, 0);
+          ASSERT_TRUE(v_src.Init());
+          ASSERT_TRUE(v_pre.Init());
+          ASSERT_TRUE(v_count_ref.Init());
+          ASSERT_TRUE(v_accum_ref.Init());
+          ASSERT_TRUE(v_count_tst.Init());
+          ASSERT_TRUE(v_accum_tst.Init());
+
+          // The difference between the buffers must be small to pass the
+          // threshold to apply the filter.
+          y_src.Set(&rnd_, 0, 7);
+          y_pre.Set(&rnd_, 0, 7);
+          u_src.Set(&rnd_, 0, 7);
+          u_pre.Set(&rnd_, 0, 7);
+          v_src.Set(&rnd_, 0, 7);
+          v_pre.Set(&rnd_, 0, 7);
+
+          y_accum_ref.Set(rnd_.Rand8());
+          y_accum_tst.CopyFrom(y_accum_ref);
+          y_count_ref.Set(rnd_.Rand8());
+          y_count_tst.CopyFrom(y_count_ref);
+          u_accum_ref.Set(rnd_.Rand8());
+          u_accum_tst.CopyFrom(u_accum_ref);
+          u_count_ref.Set(rnd_.Rand8());
+          u_count_tst.CopyFrom(u_count_ref);
+          v_accum_ref.Set(rnd_.Rand8());
+          v_accum_tst.CopyFrom(v_accum_ref);
+          v_count_ref.Set(rnd_.Rand8());
+          v_count_tst.CopyFrom(v_count_ref);
+
+          reference_filter(y_src, y_pre, u_src, v_src, u_pre, v_pre, width,
+                           height, ss_x, ss_y, filter_strength, &filter_weight,
+                           use_32x32, &y_accum_ref, &y_count_ref, &u_accum_ref,
+                           &u_count_ref, &v_accum_ref, &v_count_ref);
+          ASM_REGISTER_STATE_CHECK(filter_func_(
+              y_src.TopLeftPixel(), y_src.stride(), y_pre.TopLeftPixel(),
+              y_pre.stride(), u_src.TopLeftPixel(), v_src.TopLeftPixel(),
+              u_src.stride(), u_pre.TopLeftPixel(), v_pre.TopLeftPixel(),
+              u_pre.stride(), width, height, ss_x, ss_y, filter_strength,
+              &filter_weight, use_32x32, y_accum_tst.TopLeftPixel(),
+              y_count_tst.TopLeftPixel(), u_accum_tst.TopLeftPixel(),
+              u_count_tst.TopLeftPixel(), v_accum_tst.TopLeftPixel(),
+              v_count_tst.TopLeftPixel()));
+
+          EXPECT_TRUE(y_accum_tst.CheckValues(y_accum_ref));
+          EXPECT_TRUE(y_count_tst.CheckValues(y_count_ref));
+          EXPECT_TRUE(u_accum_tst.CheckValues(u_accum_ref));
+          EXPECT_TRUE(u_count_tst.CheckValues(u_count_ref));
+          EXPECT_TRUE(v_accum_tst.CheckValues(v_accum_ref));
+          EXPECT_TRUE(v_count_tst.CheckValues(v_count_ref));
+
+          if (HasFailure()) {
+            printf("SS_X: %d, SS_Y: %d, Weight: %d, Strength: %d\n", ss_x, ss_y,
+                   filter_weight, filter_strength);
+            y_accum_tst.PrintDifference(y_accum_ref);
+            y_count_tst.PrintDifference(y_count_ref);
+            u_accum_tst.PrintDifference(u_accum_ref);
+            u_count_tst.PrintDifference(u_count_ref);
+            v_accum_tst.PrintDifference(v_accum_ref);
+            v_count_tst.PrintDifference(v_count_ref);
+            return;
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST_P(YUVTemporalFilterTest, USE_16X16) {
+  const int width = 32, height = 32;
+  Buffer<uint8_t> y_src = Buffer<uint8_t>(width, height, 8);
+  Buffer<uint8_t> y_pre = Buffer<uint8_t>(width, height, 0);
+  Buffer<uint16_t> y_count_ref = Buffer<uint16_t>(width, height, 0);
+  Buffer<uint32_t> y_accum_ref = Buffer<uint32_t>(width, height, 0);
+  Buffer<uint16_t> y_count_tst = Buffer<uint16_t>(width, height, 0);
+  Buffer<uint32_t> y_accum_tst = Buffer<uint32_t>(width, height, 0);
+  ASSERT_TRUE(y_src.Init());
+  ASSERT_TRUE(y_pre.Init());
+  ASSERT_TRUE(y_count_ref.Init());
+  ASSERT_TRUE(y_accum_ref.Init());
+  ASSERT_TRUE(y_count_tst.Init());
+  ASSERT_TRUE(y_accum_tst.Init());
+
+  const int use_32x32 = 0;
+
+  for (int ss_x = 0; ss_x <= 1; ss_x++) {
+    for (int ss_y = 0; ss_y <= 1; ss_y++) {
+      for (int filter_idx = 0; filter_idx < 3 * 3 * 3 * 3; filter_idx++) {
+        // Set up the filter
+        int filter_weight[4];
+        int filter_idx_cp = filter_idx;
+        for (int idx = 0; idx < 4; idx++) {
+          filter_weight[idx] = filter_idx_cp % 3;
+          filter_idx_cp /= 3;
+        }
+
+        // Test each parameter
+        for (int filter_strength = 0; filter_strength <= 6;
+             filter_strength += 2) {
+          const int uv_width = width >> ss_x, uv_height = height >> ss_y;
+          Buffer<uint8_t> u_src = Buffer<uint8_t>(uv_width, uv_height, 8);
+          Buffer<uint8_t> u_pre = Buffer<uint8_t>(uv_width, uv_height, 0);
+          Buffer<uint16_t> u_count_ref =
+              Buffer<uint16_t>(uv_width, uv_height, 0);
+          Buffer<uint32_t> u_accum_ref =
+              Buffer<uint32_t>(uv_width, uv_height, 0);
+          Buffer<uint16_t> u_count_tst =
+              Buffer<uint16_t>(uv_width, uv_height, 0);
+          Buffer<uint32_t> u_accum_tst =
+              Buffer<uint32_t>(uv_width, uv_height, 0);
+          ASSERT_TRUE(u_src.Init());
+          ASSERT_TRUE(u_pre.Init());
+          ASSERT_TRUE(u_count_ref.Init());
+          ASSERT_TRUE(u_accum_ref.Init());
+          ASSERT_TRUE(u_count_tst.Init());
+          ASSERT_TRUE(u_accum_tst.Init());
+          Buffer<uint8_t> v_src = Buffer<uint8_t>(uv_width, uv_height, 8);
+          Buffer<uint8_t> v_pre = Buffer<uint8_t>(uv_width, uv_height, 0);
+          Buffer<uint16_t> v_count_ref =
+              Buffer<uint16_t>(uv_width, uv_height, 0);
+          Buffer<uint32_t> v_accum_ref =
+              Buffer<uint32_t>(uv_width, uv_height, 0);
+          Buffer<uint16_t> v_count_tst =
+              Buffer<uint16_t>(uv_width, uv_height, 0);
+          Buffer<uint32_t> v_accum_tst =
+              Buffer<uint32_t>(uv_width, uv_height, 0);
+          ASSERT_TRUE(v_src.Init());
+          ASSERT_TRUE(v_pre.Init());
+          ASSERT_TRUE(v_count_ref.Init());
+          ASSERT_TRUE(v_accum_ref.Init());
+          ASSERT_TRUE(v_count_tst.Init());
+          ASSERT_TRUE(v_accum_tst.Init());
+
+          // The difference between the buffers must be small to pass the
+          // threshold to apply the filter.
+          y_src.Set(&rnd_, 0, 7);
+          y_pre.Set(&rnd_, 0, 7);
+          u_src.Set(&rnd_, 0, 7);
+          u_pre.Set(&rnd_, 0, 7);
+          v_src.Set(&rnd_, 0, 7);
+          v_pre.Set(&rnd_, 0, 7);
+
+          y_accum_ref.Set(rnd_.Rand8());
+          y_accum_tst.CopyFrom(y_accum_ref);
+          y_count_ref.Set(rnd_.Rand8());
+          y_count_tst.CopyFrom(y_count_ref);
+          u_accum_ref.Set(rnd_.Rand8());
+          u_accum_tst.CopyFrom(u_accum_ref);
+          u_count_ref.Set(rnd_.Rand8());
+          u_count_tst.CopyFrom(u_count_ref);
+          v_accum_ref.Set(rnd_.Rand8());
+          v_accum_tst.CopyFrom(v_accum_ref);
+          v_count_ref.Set(rnd_.Rand8());
+          v_count_tst.CopyFrom(v_count_ref);
+
+          reference_filter(y_src, y_pre, u_src, v_src, u_pre, v_pre, width,
+                           height, ss_x, ss_y, filter_strength, filter_weight,
+                           use_32x32, &y_accum_ref, &y_count_ref, &u_accum_ref,
+                           &u_count_ref, &v_accum_ref, &v_count_ref);
+          ASM_REGISTER_STATE_CHECK(filter_func_(
+              y_src.TopLeftPixel(), y_src.stride(), y_pre.TopLeftPixel(),
+              y_pre.stride(), u_src.TopLeftPixel(), v_src.TopLeftPixel(),
+              u_src.stride(), u_pre.TopLeftPixel(), v_pre.TopLeftPixel(),
+              u_pre.stride(), width, height, ss_x, ss_y, filter_strength,
+              filter_weight, use_32x32, y_accum_tst.TopLeftPixel(),
+              y_count_tst.TopLeftPixel(), u_accum_tst.TopLeftPixel(),
+              u_count_tst.TopLeftPixel(), v_accum_tst.TopLeftPixel(),
+              v_count_tst.TopLeftPixel()));
+
+          EXPECT_TRUE(y_accum_tst.CheckValues(y_accum_ref));
+          EXPECT_TRUE(y_count_tst.CheckValues(y_count_ref));
+          EXPECT_TRUE(u_accum_tst.CheckValues(u_accum_ref));
+          EXPECT_TRUE(u_count_tst.CheckValues(u_count_ref));
+          EXPECT_TRUE(v_accum_tst.CheckValues(v_accum_ref));
+          EXPECT_TRUE(v_count_tst.CheckValues(v_count_ref));
+
+          if (HasFailure()) {
+            printf("SS_X: %d, SS_Y: %d, Weight Idx: %d, Strength: %d\n", ss_x,
+                   ss_y, filter_idx, filter_strength);
+            y_accum_tst.PrintDifference(y_accum_ref);
+            y_count_tst.PrintDifference(y_count_ref);
+            u_accum_tst.PrintDifference(u_accum_ref);
+            u_count_tst.PrintDifference(u_count_ref);
+            v_accum_tst.PrintDifference(v_accum_ref);
+            v_count_tst.PrintDifference(v_count_ref);
+            return;
+          }
+        }
+      }
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, YUVTemporalFilterTest,
+                        ::testing::Values(&vp9_apply_temporal_filter));
+}  // namespace
diff --git a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c
index 057d2e9..219ff63 100644
--- a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c
+++ b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c
@@ -150,8 +150,9 @@
   return out;
 }
 
-void vpx_highbd_iadst16_neon(const int32_t *input, int32_t *output,
-                             uint16_t *dest, const int stride, const int bd) {
+static void highbd_iadst16_neon(const int32_t *input, int32_t *output,
+                                uint16_t *dest, const int stride,
+                                const int bd) {
   const int32x4_t c_1_31_5_27 =
       create_s32x4_neon(cospi_1_64, cospi_31_64, cospi_5_64, cospi_27_64);
   const int32x4_t c_9_23_13_19 =
@@ -424,11 +425,11 @@
     static const highbd_iht_2d IHT_16[] = {
       { vpx_highbd_idct16x16_256_add_half1d,
         vpx_highbd_idct16x16_256_add_half1d },  // DCT_DCT  = 0
-      { vpx_highbd_iadst16_neon,
+      { highbd_iadst16_neon,
         vpx_highbd_idct16x16_256_add_half1d },  // ADST_DCT = 1
       { vpx_highbd_idct16x16_256_add_half1d,
-        vpx_highbd_iadst16_neon },                          // DCT_ADST = 2
-      { vpx_highbd_iadst16_neon, vpx_highbd_iadst16_neon }  // ADST_ADST = 3
+        highbd_iadst16_neon },                      // DCT_ADST = 2
+      { highbd_iadst16_neon, highbd_iadst16_neon }  // ADST_ADST = 3
     };
     const highbd_iht_2d ht = IHT_16[tx_type];
     int32_t row_output[16 * 16];
diff --git a/vp9/common/mips/msa/vp9_idct16x16_msa.c b/vp9/common/mips/msa/vp9_idct16x16_msa.c
index 3e35301..c031322 100644
--- a/vp9/common/mips/msa/vp9_idct16x16_msa.c
+++ b/vp9/common/mips/msa/vp9_idct16x16_msa.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_enums.h"
 #include "vpx_dsp/mips/inv_txfm_msa.h"
 
diff --git a/vp9/common/mips/msa/vp9_idct4x4_msa.c b/vp9/common/mips/msa/vp9_idct4x4_msa.c
index 786fbdb..aaccd5c 100644
--- a/vp9/common/mips/msa/vp9_idct4x4_msa.c
+++ b/vp9/common/mips/msa/vp9_idct4x4_msa.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_enums.h"
 #include "vpx_dsp/mips/inv_txfm_msa.h"
 
diff --git a/vp9/common/mips/msa/vp9_idct8x8_msa.c b/vp9/common/mips/msa/vp9_idct8x8_msa.c
index e416677..76d15ff 100644
--- a/vp9/common/mips/msa/vp9_idct8x8_msa.c
+++ b/vp9/common/mips/msa/vp9_idct8x8_msa.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_enums.h"
 #include "vpx_dsp/mips/inv_txfm_msa.h"
 
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 8bb68cf..3102b08 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -186,6 +186,8 @@
 if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
 add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count";
 specialize qw/vp9_temporal_filter_apply sse4_1/;
+
+add_proto qw/void vp9_apply_temporal_filter/, "const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count";
 }
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
diff --git a/vp9/encoder/mips/msa/vp9_error_msa.c b/vp9/encoder/mips/msa/vp9_error_msa.c
index 188d04d..61786d8 100644
--- a/vp9/encoder/mips/msa/vp9_error_msa.c
+++ b/vp9/encoder/mips/msa/vp9_error_msa.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vpx_dsp/mips/macros_msa.h"
 
@@ -79,6 +80,7 @@
     return err;                                                              \
   }
 
+#if !CONFIG_VP9_HIGHBITDEPTH
 BLOCK_ERROR_BLOCKSIZE_MSA(16);
 BLOCK_ERROR_BLOCKSIZE_MSA(64);
 BLOCK_ERROR_BLOCKSIZE_MSA(256);
@@ -103,3 +105,4 @@
 
   return err;
 }
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c b/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
index 0831e59..efbbe83 100644
--- a/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
+++ b/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_enums.h"
 #include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
 #include "vpx_dsp/mips/fwd_txfm_msa.h"
diff --git a/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c b/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
index fa36f09..9c5cc12 100644
--- a/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
+++ b/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_enums.h"
 #include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
 
diff --git a/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c b/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
index 604db85..26d81aa 100644
--- a/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
+++ b/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_enums.h"
 #include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
 
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 18adfeb..cb9ea2d 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -337,8 +337,7 @@
   if (bsize == BLOCK_32X32) {
     return 3;
   }
-  printf("ERROR: non-square block size\n");
-  assert(0);
+  assert(0 && "ERROR: non-square block size");
   return -1;
 }
 
@@ -355,8 +354,7 @@
   if (square_block_idx == 3) {
     return BLOCK_32X32;
   }
-  printf("ERROR: invalid square_block_idx\n");
-  assert(0);
+  assert(0 && "ERROR: invalid square_block_idx");
   return BLOCK_INVALID;
 }
 
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 05f3f28..534b15a 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1646,7 +1646,7 @@
 
 // Exhuastive motion search around a given centre position with a given
 // step size.
-static int exhuastive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv,
+static int exhaustive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv,
                                   int range, int step, int sad_per_bit,
                                   const vp9_variance_fn_ptr_t *fn_ptr,
                                   const MV *center_mv) {
@@ -1760,7 +1760,7 @@
   return best_cost;
 }
 
-static double exhuastive_mesh_search_new(const MACROBLOCK *x, MV *best_mv,
+static double exhaustive_mesh_search_new(const MACROBLOCK *x, MV *best_mv,
                                          int range, int step,
                                          const vp9_variance_fn_ptr_t *fn_ptr,
                                          const MV *center_mv, double lambda,
@@ -1883,7 +1883,7 @@
 
   // initial search
   bestsme =
-      exhuastive_mesh_search_new(x, &temp_mv, range, interval, fn_ptr, &temp_mv,
+      exhaustive_mesh_search_new(x, &temp_mv, range, interval, fn_ptr, &temp_mv,
                                  lambda, nb_full_mvs, full_mv_num);
 
   if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
@@ -1891,7 +1891,7 @@
     // till we reach a step size of 1. Then break out.
     for (i = 1; i < MAX_MESH_STEP; ++i) {
       // First pass with coarser step and longer range
-      bestsme = exhuastive_mesh_search_new(
+      bestsme = exhaustive_mesh_search_new(
           x, &temp_mv, sf->mesh_patterns[i].range,
           sf->mesh_patterns[i].interval, fn_ptr, &temp_mv, lambda, nb_full_mvs,
           full_mv_num);
@@ -2581,7 +2581,7 @@
   interval = VPXMAX(interval, range / baseline_interval_divisor);
 
   // initial search
-  bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval,
+  bestsme = exhaustive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval,
                                    sadpb, fn_ptr, &temp_mv);
 
   if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
@@ -2589,7 +2589,7 @@
     // till we reach a step size of 1. Then break out.
     for (i = 1; i < MAX_MESH_STEP; ++i) {
       // First pass with coarser step and longer range
-      bestsme = exhuastive_mesh_search(
+      bestsme = exhaustive_mesh_search(
           x, &f_ref_mv, &temp_mv, sf->mesh_patterns[i].range,
           sf->mesh_patterns[i].interval, sadpb, fn_ptr, &temp_mv);
 
@@ -2868,10 +2868,10 @@
   if (method == NSTEP) {
     if (sf->exhaustive_searches_thresh < INT_MAX &&
         !cpi->rc.is_src_frame_alt_ref) {
-      const int64_t exhuastive_thr =
+      const int64_t exhaustive_thr =
           sf->exhaustive_searches_thresh >>
           (8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]));
-      if (var > exhuastive_thr) run_exhaustive_search = 1;
+      if (var > exhaustive_thr) run_exhaustive_search = 1;
     }
   } else if (method == MESH) {
     run_exhaustive_search = 1;
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index ab880ff..6bef887 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -167,8 +167,7 @@
     case BLOCK_64X64: square_bsize = BLOCK_32X32; break;
     default:
       square_bsize = BLOCK_INVALID;
-      printf("ERROR: invlid block size %d\n", bsize);
-      assert(0);
+      assert(0 && "ERROR: invalid block size");
       break;
   }
   return square_bsize;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 5ad68e2..9df2eb3 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -247,6 +247,9 @@
   return target;
 }
 
+// TODO(marpan/jianj): bits_off_target and buffer_level are used in the same
+// way for CBR mode, for the buffering updates below. Look into removing one
+// of these (i.e., bits_off_target).
 // Update the buffer level before encoding with the per-frame-bandwidth,
 static void update_buffer_level_preencode(VP9_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
@@ -1800,6 +1803,8 @@
     }
   }
 
+  if (cpi->use_svc) vp9_svc_adjust_avg_frame_qindex(cpi);
+
   // Keep record of last boosted (KF/KF/ARF) Q value.
   // If the current frame is coded at a lower Q then we also update it.
   // If all mbs in this group are skipped only update if the Q value is
@@ -1922,8 +1927,10 @@
   // increasing buffer levels/overflow for certain layers even though whole
   // superframe is dropped, we cap buffer level if its already stable.
   if (cpi->use_svc && cpi->svc.framedrop_mode != LAYER_DROP &&
-      cpi->rc.buffer_level > cpi->rc.optimal_buffer_level)
+      cpi->rc.buffer_level > cpi->rc.optimal_buffer_level) {
     cpi->rc.buffer_level = cpi->rc.optimal_buffer_level;
+    cpi->rc.bits_off_target = cpi->rc.optimal_buffer_level;
+  }
 }
 
 static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 3223f71..35155c7 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -1227,3 +1227,25 @@
       cpi->svc.timebase_fac * cpi->svc.duration[cpi->svc.spatial_layer_id];
   vp9_new_framerate(cpi, 10000000.0 / this_duration);
 }
+
+void vp9_svc_adjust_avg_frame_qindex(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
+  RATE_CONTROL *const rc = &cpi->rc;
+  // On key frames in CBR mode: reset the avg_frame_qindex for base layer
+  // (to level closer to worst_quality) if the overshoot is significant.
+  // Reset it for all temporal layers on base spatial layer.
+  if (cm->frame_type == KEY_FRAME && cpi->oxcf.rc_mode == VPX_CBR &&
+      rc->projected_frame_size > 3 * rc->avg_frame_bandwidth) {
+    int tl;
+    rc->avg_frame_qindex[INTER_FRAME] =
+        VPXMAX(rc->avg_frame_qindex[INTER_FRAME],
+               (cm->base_qindex + rc->worst_quality) >> 1);
+    for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+      const int layer = LAYER_IDS_TO_IDX(0, tl, svc->number_temporal_layers);
+      LAYER_CONTEXT *lc = &svc->layer_context[layer];
+      RATE_CONTROL *lrc = &lc->rc;
+      lrc->avg_frame_qindex[INTER_FRAME] = rc->avg_frame_qindex[INTER_FRAME];
+    }
+  }
+}
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index c256446..34795d8 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -262,6 +262,7 @@
 
 void vp9_svc_adjust_frame_rate(struct VP9_COMP *const cpi);
 
+void vp9_svc_adjust_avg_frame_qindex(struct VP9_COMP *const cpi);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index cd340c3..23943bb 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -198,8 +198,8 @@
 
 static INLINE int get_filter_weight(unsigned int i, unsigned int j,
                                     unsigned int block_height,
-                                    unsigned int block_width, int *blk_fw,
-                                    int use_32x32) {
+                                    unsigned int block_width,
+                                    const int *const blk_fw, int use_32x32) {
   int filter_weight = 0;
 
   if (use_32x32)
@@ -220,12 +220,12 @@
   return filter_weight;
 }
 
-static void apply_temporal_filter(
+void vp9_apply_temporal_filter_c(
     const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred,
     int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1,
     int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred,
     int uv_buf_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, int *blk_fw, int use_32x32,
+    int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32,
     uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator,
     uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count) {
   unsigned int i, j, k, m;
@@ -767,7 +767,7 @@
               count + (BLK_PELS << 1));
         } else {
           // Apply the filter (YUV)
-          apply_temporal_filter(
+          vp9_apply_temporal_filter_c(
               f->y_buffer + mb_y_offset, f->y_stride, predictor, BW,
               f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset,
               f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1),
@@ -778,7 +778,7 @@
         }
 #else
         // Apply the filter (YUV)
-        apply_temporal_filter(
+        vp9_apply_temporal_filter_c(
             f->y_buffer + mb_y_offset, f->y_stride, predictor, BW,
             f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset,
             f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1),
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 8d38c98..b2d57dc 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -1015,6 +1015,9 @@
                   vpx_svc_spatial_layer_sync_t *)
 #define VPX_CTRL_VP9E_SET_SVC_SPATIAL_LAYER_SYNC
 
+VPX_CTRL_USE_TYPE(VP9E_SET_POSTENCODE_DROP, unsigned int)
+#define VPX_CTRL_VP9E_SET_POSTENCODE_DROP
+
 /*!\endcond */
 /*! @} - end defgroup vp8_encoder */
 #ifdef __cplusplus
diff --git a/vpx/vpx_frame_buffer.h b/vpx/vpx_frame_buffer.h
index 2813ca6..fc83200 100644
--- a/vpx/vpx_frame_buffer.h
+++ b/vpx/vpx_frame_buffer.h
@@ -52,9 +52,9 @@
  * data. The callback is triggered when the decoder needs a frame buffer to
  * decode a compressed image into. This function may be called more than once
  * for every call to vpx_codec_decode. The application may set fb->priv to
- * some data which will be passed back in the ximage and the release function
- * call. |fb| is guaranteed to not be NULL. On success the callback must
- * return 0. Any failure the callback must return a value less than 0.
+ * some data which will be passed back in the vpx_image_t and the release
+ * function call. |fb| is guaranteed to not be NULL. On success the callback
+ * must return 0. Any failure the callback must return a value less than 0.
  *
  * \param[in] priv         Callback's private data
  * \param[in] min_size     Size in bytes needed by the buffer
diff --git a/vpx_dsp/mips/add_noise_msa.c b/vpx_dsp/mips/add_noise_msa.c
index 43d2c11..9754141 100644
--- a/vpx_dsp/mips/add_noise_msa.c
+++ b/vpx_dsp/mips/add_noise_msa.c
@@ -9,7 +9,9 @@
  */
 
 #include <stdlib.h>
-#include "./macros_msa.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
 
 void vpx_plane_add_noise_msa(uint8_t *start_ptr, const int8_t *noise,
                              int blackclamp, int whiteclamp, int width,
diff --git a/vpx_dsp/mips/avg_msa.c b/vpx_dsp/mips/avg_msa.c
index d0ac7b8..3fd18de 100644
--- a/vpx_dsp/mips/avg_msa.c
+++ b/vpx_dsp/mips/avg_msa.c
@@ -9,6 +9,7 @@
  */
 #include <stdlib.h>
 
+#include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/macros_msa.h"
 
@@ -56,6 +57,7 @@
   return sum_out;
 }
 
+#if !CONFIG_VP9_HIGHBITDEPTH
 void vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride,
                           int16_t *dst) {
   v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
@@ -391,6 +393,7 @@
 
   return satd;
 }
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
 
 void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref,
                          const int ref_stride, const int height) {
diff --git a/vpx_dsp/mips/deblock_msa.c b/vpx_dsp/mips/deblock_msa.c
index 1707d32..4e93ff5 100644
--- a/vpx_dsp/mips/deblock_msa.c
+++ b/vpx_dsp/mips/deblock_msa.c
@@ -10,7 +10,8 @@
 
 #include <stdlib.h>
 
-#include "./macros_msa.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
 
 extern const int16_t vpx_rv[];
 
diff --git a/vpx_dsp/mips/fwd_dct32x32_msa.c b/vpx_dsp/mips/fwd_dct32x32_msa.c
index 06fdc95..36583e2 100644
--- a/vpx_dsp/mips/fwd_dct32x32_msa.c
+++ b/vpx_dsp/mips/fwd_dct32x32_msa.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/fwd_txfm_msa.h"
 
 static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
diff --git a/vpx_dsp/mips/idct16x16_msa.c b/vpx_dsp/mips/idct16x16_msa.c
index 2a211c5..7ca61a2 100644
--- a/vpx_dsp/mips/idct16x16_msa.c
+++ b/vpx_dsp/mips/idct16x16_msa.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/inv_txfm_msa.h"
 
 void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
diff --git a/vpx_dsp/mips/idct32x32_msa.c b/vpx_dsp/mips/idct32x32_msa.c
index 2ea6136..0539481 100644
--- a/vpx_dsp/mips/idct32x32_msa.c
+++ b/vpx_dsp/mips/idct32x32_msa.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/inv_txfm_msa.h"
 
 static void idct32x8_row_transpose_store(const int16_t *input,
diff --git a/vpx_dsp/mips/idct4x4_msa.c b/vpx_dsp/mips/idct4x4_msa.c
index 0a85742..56ffec3 100644
--- a/vpx_dsp/mips/idct4x4_msa.c
+++ b/vpx_dsp/mips/idct4x4_msa.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/inv_txfm_msa.h"
 
 void vpx_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
diff --git a/vpx_dsp/mips/idct8x8_msa.c b/vpx_dsp/mips/idct8x8_msa.c
index 7f77d20..a383ff2 100644
--- a/vpx_dsp/mips/idct8x8_msa.c
+++ b/vpx_dsp/mips/idct8x8_msa.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/inv_txfm_msa.h"
 
 void vpx_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst,
diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h
index 8398ec3..b75d4d7 100644
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -16,11 +16,17 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 
+// TODO(chiyotsai@google.com): Refactor the code here. Currently this is pretty
+// hacky and awful to read. Note that there is a filter_x[3] == 128 check in
+// HIGHBD_FUN_CONV_2D to avoid seg fault due to the fact that the c function
+// assumes the filter is always 8 tap.
 typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
                                 uint32_t output_height, const int16_t *filter);
 
-#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt)         \
+// TODO(chiyotsai@google.com): Remove the is_avg argument to the MACROS once we
+// have 4-tap vert avg filter.
+#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, is_avg) \
   void vpx_convolve8_##name##_##opt(                                         \
       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
       ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,           \
@@ -33,6 +39,7 @@
     assert(filter_row[3] != 128);                                            \
     assert(step_q4 == 16);                                                   \
     if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {     \
+      const int num_taps = 8;                                                \
       while (w >= 16) {                                                      \
         vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
                                                  dst_stride, h, filter_row); \
@@ -47,7 +54,9 @@
         vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
                                                 dst_stride, h, filter_row);  \
       }                                                                      \
+      (void)num_taps;                                                        \
     } else if (filter_row[2] | filter_row[5]) {                              \
+      const int num_taps = is_avg ? 8 : 4;                                   \
       while (w >= 16) {                                                      \
         vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
                                                  dst_stride, h, filter_row); \
@@ -62,25 +71,28 @@
         vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
                                                 dst_stride, h, filter_row);  \
       }                                                                      \
+      (void)num_taps;                                                        \
     } else {                                                                 \
+      const int num_taps = 2;                                                \
       while (w >= 16) {                                                      \
-        vpx_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst,       \
+        vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \
                                                  dst_stride, h, filter_row); \
         src += 16;                                                           \
         dst += 16;                                                           \
         w -= 16;                                                             \
       }                                                                      \
       if (w == 8) {                                                          \
-        vpx_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst,        \
+        vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst,  \
                                                 dst_stride, h, filter_row);  \
       } else if (w == 4) {                                                   \
-        vpx_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst,        \
+        vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst,  \
                                                 dst_stride, h, filter_row);  \
       }                                                                      \
+      (void)num_taps;                                                        \
     }                                                                        \
   }
 
-#define FUN_CONV_2D(avg, opt)                                                  \
+#define FUN_CONV_2D(avg, opt, is_avg)                                          \
   void vpx_convolve8_##avg##opt(                                               \
       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                  \
       ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,             \
@@ -94,7 +106,7 @@
     assert(h <= 64);                                                           \
     assert(x_step_q4 == 16);                                                   \
     assert(y_step_q4 == 16);                                                   \
-    if (filter_x[0] | filter_x[1] | filter_x[2]) {                             \
+    if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) {               \
       DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]);                           \
       vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64,  \
                                 filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \
@@ -102,6 +114,15 @@
       vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride,    \
                                       filter, x0_q4, x_step_q4, y0_q4,         \
                                       y_step_q4, w, h);                        \
+    } else if (filter_x[2] | filter_x[5]) {                                    \
+      const int num_taps = is_avg ? 8 : 4;                                     \
+      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]);                           \
+      vpx_convolve8_horiz_##opt(                                               \
+          src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64,       \
+          filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1);    \
+      vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64,    \
+                                      dst, dst_stride, filter, x0_q4,          \
+                                      x_step_q4, y0_q4, y_step_q4, w, h);      \
     } else {                                                                   \
       DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]);                           \
       vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4,    \
@@ -121,89 +142,96 @@
                                        unsigned int output_height,
                                        const int16_t *filter, int bd);
 
-#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt)  \
-  void vpx_highbd_convolve8_##name##_##opt(                                \
-      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,            \
-      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,         \
-      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {     \
-    const int16_t *filter_row = filter[offset];                            \
-    if (step_q4 == 16 && filter_row[3] != 128) {                           \
-      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
-        while (w >= 16) {                                                  \
-          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                 \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
-          src += 16;                                                       \
-          dst += 16;                                                       \
-          w -= 16;                                                         \
-        }                                                                  \
-        while (w >= 8) {                                                   \
-          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                  \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
-          src += 8;                                                        \
-          dst += 8;                                                        \
-          w -= 8;                                                          \
-        }                                                                  \
-        while (w >= 4) {                                                   \
-          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                  \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
-          src += 4;                                                        \
-          dst += 4;                                                        \
-          w -= 4;                                                          \
-        }                                                                  \
-      } else if (filter_row[2] | filter_row[5]) {                          \
-        while (w >= 16) {                                                  \
-          vpx_highbd_filter_block1d16_##dir##4_##avg##opt(                 \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
-          src += 16;                                                       \
-          dst += 16;                                                       \
-          w -= 16;                                                         \
-        }                                                                  \
-        while (w >= 8) {                                                   \
-          vpx_highbd_filter_block1d8_##dir##4_##avg##opt(                  \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
-          src += 8;                                                        \
-          dst += 8;                                                        \
-          w -= 8;                                                          \
-        }                                                                  \
-        while (w >= 4) {                                                   \
-          vpx_highbd_filter_block1d4_##dir##4_##avg##opt(                  \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
-          src += 4;                                                        \
-          dst += 4;                                                        \
-          w -= 4;                                                          \
-        }                                                                  \
-      } else {                                                             \
-        while (w >= 16) {                                                  \
-          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                 \
-              src, src_stride, dst, dst_stride, h, filter_row, bd);        \
-          src += 16;                                                       \
-          dst += 16;                                                       \
-          w -= 16;                                                         \
-        }                                                                  \
-        while (w >= 8) {                                                   \
-          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                  \
-              src, src_stride, dst, dst_stride, h, filter_row, bd);        \
-          src += 8;                                                        \
-          dst += 8;                                                        \
-          w -= 8;                                                          \
-        }                                                                  \
-        while (w >= 4) {                                                   \
-          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                  \
-              src, src_stride, dst, dst_stride, h, filter_row, bd);        \
-          src += 4;                                                        \
-          dst += 4;                                                        \
-          w -= 4;                                                          \
-        }                                                                  \
-      }                                                                    \
-    }                                                                      \
-    if (w) {                                                               \
-      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,    \
-                                      filter, x0_q4, x_step_q4, y0_q4,     \
-                                      y_step_q4, w, h, bd);                \
-    }                                                                      \
+#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt,     \
+                         is_avg)                                              \
+  void vpx_highbd_convolve8_##name##_##opt(                                   \
+      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,               \
+      ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4,     \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {        \
+    const int16_t *filter_row = filter_kernel[offset];                        \
+    if (step_q4 == 16 && filter_row[3] != 128) {                              \
+      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {    \
+        const int num_taps = 8;                                               \
+        while (w >= 16) {                                                     \
+          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                    \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 16;                                                          \
+          dst += 16;                                                          \
+          w -= 16;                                                            \
+        }                                                                     \
+        while (w >= 8) {                                                      \
+          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                     \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 8;                                                           \
+          dst += 8;                                                           \
+          w -= 8;                                                             \
+        }                                                                     \
+        while (w >= 4) {                                                      \
+          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                     \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 4;                                                           \
+          dst += 4;                                                           \
+          w -= 4;                                                             \
+        }                                                                     \
+        (void)num_taps;                                                       \
+      } else if (filter_row[2] | filter_row[5]) {                             \
+        const int num_taps = is_avg ? 8 : 4;                                  \
+        while (w >= 16) {                                                     \
+          vpx_highbd_filter_block1d16_##dir##4_##avg##opt(                    \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 16;                                                          \
+          dst += 16;                                                          \
+          w -= 16;                                                            \
+        }                                                                     \
+        while (w >= 8) {                                                      \
+          vpx_highbd_filter_block1d8_##dir##4_##avg##opt(                     \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 8;                                                           \
+          dst += 8;                                                           \
+          w -= 8;                                                             \
+        }                                                                     \
+        while (w >= 4) {                                                      \
+          vpx_highbd_filter_block1d4_##dir##4_##avg##opt(                     \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 4;                                                           \
+          dst += 4;                                                           \
+          w -= 4;                                                             \
+        }                                                                     \
+        (void)num_taps;                                                       \
+      } else {                                                                \
+        const int num_taps = 2;                                               \
+        while (w >= 16) {                                                     \
+          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                    \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 16;                                                          \
+          dst += 16;                                                          \
+          w -= 16;                                                            \
+        }                                                                     \
+        while (w >= 8) {                                                      \
+          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                     \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 8;                                                           \
+          dst += 8;                                                           \
+          w -= 8;                                                             \
+        }                                                                     \
+        while (w >= 4) {                                                      \
+          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                     \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 4;                                                           \
+          dst += 4;                                                           \
+          w -= 4;                                                             \
+        }                                                                     \
+        (void)num_taps;                                                       \
+      }                                                                       \
+    }                                                                         \
+    if (w) {                                                                  \
+      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,       \
+                                      filter_kernel, x0_q4, x_step_q4, y0_q4, \
+                                      y_step_q4, w, h, bd);                   \
+    }                                                                         \
   }
 
-#define HIGH_FUN_CONV_2D(avg, opt)                                             \
+#define HIGH_FUN_CONV_2D(avg, opt, is_avg)                                     \
   void vpx_highbd_convolve8_##avg##opt(                                        \
       const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,                \
       ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,             \
@@ -212,7 +240,8 @@
     assert(w <= 64);                                                           \
     assert(h <= 64);                                                           \
     if (x_step_q4 == 16 && y_step_q4 == 16) {                                  \
-      if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) {   \
+      if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) ||           \
+          filter_x[3] == 128) {                                                \
         DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]);                        \
         vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride,     \
                                          fdata2, 64, filter, x0_q4, x_step_q4, \
@@ -220,6 +249,16 @@
         vpx_highbd_convolve8_##avg##vert_##opt(                                \
             fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4,       \
             y0_q4, y_step_q4, w, h, bd);                                       \
+      } else if (filter_x[2] | filter_x[5]) {                                  \
+        const int num_taps = is_avg ? 8 : 4;                                   \
+        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]);                        \
+        vpx_highbd_convolve8_horiz_##opt(                                      \
+            src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64,     \
+            filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1,   \
+            bd);                                                               \
+        vpx_highbd_convolve8_##avg##vert_##opt(                                \
+            fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter,     \
+            x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);                     \
       } else {                                                                 \
         DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]);                        \
         vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter,  \
@@ -235,6 +274,6 @@
                                     bd);                                       \
     }                                                                          \
   }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // VPX_VPX_DSP_X86_CONVOLVE_H_
diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c
index aef067e..3209625 100644
--- a/vpx_dsp/x86/highbd_convolve_avx2.c
+++ b/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -1089,22 +1089,19 @@
 
   // Repeat for the last row if needed
   if (h > 0) {
-    src_reg = _mm256_loadu_si256((const __m256i *)src_ptr);
-    // Reorder into 2 1 1 2
-    src_reg = _mm256_permute4x64_epi64(src_reg, 0x94);
-
+    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
     src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
     src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
 
     res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                    &kernel_reg_23, &kernel_reg_45);
 
-    res_reg = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
 
     res_reg = _mm256_packus_epi32(res_reg, res_reg);
-    res_reg = _mm256_permute4x64_epi64(res_reg, 0x8);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
 
-    _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
+    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
   }
 }
 
@@ -1279,10 +1276,6 @@
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
@@ -1368,10 +1361,6 @@
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
@@ -1476,9 +1465,10 @@
 #define vpx_highbd_filter_block1d4_h4_avg_avx2 \
   vpx_highbd_filter_block1d4_h8_avg_avx2
 
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
-HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
-HIGH_FUN_CONV_2D(, avx2);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), , avx2, 0);
+HIGH_FUN_CONV_2D(, avx2, 0);
 
 // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
 highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
@@ -1497,9 +1487,9 @@
 #define vpx_highbd_filter_block1d4_v2_avg_avx2 \
   vpx_highbd_filter_block1d4_v2_avg_sse2
 
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
-HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
-                 avx2);
-HIGH_FUN_CONV_2D(avg_, avx2);
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
+HIGH_FUN_CONV_2D(avg_, avx2, 1);
 
 #undef HIGHBD_FUNC
diff --git a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
index e40fe69..e0e8b8f 100644
--- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
+++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -133,10 +133,6 @@
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -345,10 +341,6 @@
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -531,10 +523,6 @@
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -713,10 +701,6 @@
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the source, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
@@ -896,10 +880,6 @@
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the source, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
@@ -1060,10 +1040,12 @@
 //                                  const InterpKernel *filter, int x0_q4,
 //                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                                  int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2);
-FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2);
-FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, ,
+            sse2, 0);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+            src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1);
 
 // void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                         uint8_t *dst, ptrdiff_t dst_stride,
@@ -1075,8 +1057,8 @@
 //                             const InterpKernel *filter, int x0_q4,
 //                             int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                             int w, int h);
-FUN_CONV_2D(, sse2);
-FUN_CONV_2D(avg_, sse2);
+FUN_CONV_2D(, sse2, 0);
+FUN_CONV_2D(avg_, sse2, 1);
 
 #if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
 // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
@@ -1157,11 +1139,12 @@
 //                                         const int16_t *filter_y,
 //                                         int y_step_q4,
 //                                         int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2);
-HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2);
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2);
-HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
-                 sse2);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), , sse2, 0);
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1);
 
 // void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                                uint8_t *dst, ptrdiff_t dst_stride,
@@ -1173,6 +1156,6 @@
 //                                    const InterpKernel *filter, int x0_q4,
 //                                    int32_t x_step_q4, int y0_q4,
 //                                    int y_step_q4, int w, int h, int bd);
-HIGH_FUN_CONV_2D(, sse2);
-HIGH_FUN_CONV_2D(avg_, sse2);
+HIGH_FUN_CONV_2D(, sse2, 0);
+HIGH_FUN_CONV_2D(avg_, sse2, 1);
 #endif  // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index ccedfe2..d381a7a 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -464,10 +464,6 @@
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -665,10 +661,6 @@
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
@@ -839,10 +831,6 @@
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
@@ -981,10 +969,12 @@
 //                                   const InterpKernel *filter, int x0_q4,
 //                                   int32_t x_step_q4, int y0_q4,
 //                                   int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
-FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
-FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
+            avx2, 0);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+            src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
 
 // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
 //                          uint8_t *dst, ptrdiff_t dst_stride,
@@ -996,6 +986,6 @@
 //                              const InterpKernel *filter, int x0_q4,
 //                              int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                              int w, int h);
-FUN_CONV_2D(, avx2);
-FUN_CONV_2D(avg_, avx2);
+FUN_CONV_2D(, avx2, 0);
+FUN_CONV_2D(avg_, avx2, 1);
 #endif  // HAVE_AX2 && HAVE_SSSE3
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 37d1de0..63049c9 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -310,10 +310,6 @@
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -483,10 +479,6 @@
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -627,10 +619,6 @@
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -743,10 +731,12 @@
 //                                   const InterpKernel *filter, int x0_q4,
 //                                   int32_t x_step_q4, int y0_q4,
 //                                   int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3);
-FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3);
-FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, ssse3);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
+            ssse3, 0);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+            src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1);
 
 static void filter_horiz_w8_ssse3(const uint8_t *const src,
                                   const ptrdiff_t src_stride,
@@ -1093,5 +1083,5 @@
 //                              const InterpKernel *filter, int x0_q4,
 //                              int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                              int w, int h);
-FUN_CONV_2D(, ssse3);
-FUN_CONV_2D(avg_, ssse3);
+FUN_CONV_2D(, ssse3, 0);
+FUN_CONV_2D(avg_, ssse3, 1);
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index 08679a4..eee291c 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -228,6 +228,7 @@
       // Allocation to hold larger frame, or first allocation.
       vpx_free(ybf->buffer_alloc);
       ybf->buffer_alloc = NULL;
+      ybf->buffer_alloc_sz = 0;
 
       ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, (size_t)frame_size);
       if (!ybf->buffer_alloc) return -1;