Merge "No vpx_img_alloc for y4m input in example encoders."

commit: ce4336c2ab60d185b431345987b2188511760e54 [log] [tgz]
author: Jerome Jiang <jianj@google.com> Wed Feb 06 00:57:28 2019
committer: Gerrit Code Review <noreply-gerritcodereview@google.com> Wed Feb 06 00:57:28 2019
tree: eece9df8a685d0c9560c07e7ee7e8b33683df0fc
parent: e05cea787872dc2f9ef163a62ca9021955730f71 [diff]
parent: a4525dccec0cbd23b507cb58ce6d8b24a3dd4559 [diff]
diff --git a/.mailmap b/.mailmap
index 29af510..7c26790 100644
--- a/.mailmap
+++ b/.mailmap

@@ -4,9 +4,12 @@
 Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
 Alpha Lam <hclam@google.com> <hclam@chromium.org>
 Chris Cunningham <chcunningham@chromium.org>
+Chi Yo Tsai <chiyotsai@google.com>
 Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com>
 Deb Mukherjee <debargha@google.com>
+Elliott Karpilovsky <elliottk@google.com>
 Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
+Fyodor Kyslov <kyslov@google.com>
 Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
 Hangyu Kuang <hkuang@google.com>
 Hui Su <huisu@google.com>
@@ -20,6 +23,7 @@
 Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
 Marco Paniconi <marpan@google.com>
 Marco Paniconi <marpan@google.com> <marpan@chromium.org>
+Martin Storsjö <martin@martin.st>
 Pascal Massimino <pascal.massimino@gmail.com>
 Paul Wilkins <paulwilkins@google.com>
 Peter Boström <pbos@chromium.org> <pbos@google.com>
@@ -28,6 +32,7 @@
 Ralph Giles <giles@xiph.org> <giles@entropywave.com>
 Ralph Giles <giles@xiph.org> <giles@mozilla.com>
 Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
+Sai Deng <sdeng@google.com>
 Sami Pietilä <samipietila@google.com>
 Shiyou Yin <yinshiyou-hf@loongson.cn>
 Tamar Levy <tamar.levy@intel.com>
@@ -40,3 +45,4 @@
 Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com>
 Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
 Yaowu Xu <yaowu@google.com> <Yaowu Xu>
+xiwei gu <guxiwei-hf@loongson.cn>

diff --git a/AUTHORS b/AUTHORS
index 04c2872..3f7a86d 100644
--- a/AUTHORS
+++ b/AUTHORS

@@ -26,6 +26,7 @@
 changjun.yang <changjun.yang@intel.com>
 Charles 'Buck' Krasic <ckrasic@google.com>
 Cheng Chen <chengchen@google.com>
+Chi Yo Tsai <chiyotsai@google.com>
 chm <chm@rock-chips.com>
 Chris Cunningham <chcunningham@chromium.org>
 Christian Duvivier <cduvivier@google.com>
@@ -38,11 +39,13 @@
 Dragan Mrdjan <dmrdjan@mips.com>
 Ed Baker <edward.baker@intel.com>
 Ehsan Akhgari <ehsan.akhgari@gmail.com>
+Elliott Karpilovsky <elliottk@google.com>
 Erik Niemeyer <erik.a.niemeyer@intel.com>
 Fabio Pedretti <fabio.ped@libero.it>
 Frank Galligan <fgalligan@google.com>
 Fredrik Söderquist <fs@opera.com>
 Fritz Koenig <frkoenig@google.com>
+Fyodor Kyslov <kyslov@google.com>
 Gabriel Marin <gmx@chromium.org>
 Gaute Strokkenes <gaute.strokkenes@broadcom.com>
 Geza Lore <gezalore@gmail.com>
@@ -55,6 +58,7 @@
 Hangyu Kuang <hkuang@google.com>
 Hanno Böck <hanno@hboeck.de>
 Han Shen <shenhan@google.com>
+Harish Mahendrakar <harish.mahendrakar@ittiam.com>
 Henrik Lundin <hlundin@google.com>
 Hui Su <huisu@google.com>
 Ivan Krasin <krasin@chromium.org>
@@ -81,6 +85,7 @@
 John Koleszar <jkoleszar@google.com>
 Johnny Klonaris <google@jawknee.com>
 John Stark <jhnstrk@gmail.com>
+Jon Kunkee <jkunkee@microsoft.com>
 Joshua Bleecher Snyder <josh@treelinelabs.com>
 Joshua Litt <joshualitt@google.com>
 Julia Robson <juliamrobson@gmail.com>
@@ -91,15 +96,18 @@
 Kyle Siefring <kylesiefring@gmail.com>
 Lawrence Velázquez <larryv@macports.org>
 Linfeng Zhang <linfengz@google.com>
+Liu Peng <pengliu.mail@gmail.com>
 Lou Quillio <louquillio@google.com>
 Luca Barbato <lu_zero@gentoo.org>
+Luc Trudeau <luc@trud.ca>
 Makoto Kato <makoto.kt@gmail.com>
 Mans Rullgard <mans@mansr.com>
 Marco Paniconi <marpan@google.com>
 Mark Mentovai <mark@chromium.org>
 Martin Ettl <ettl.martin78@googlemail.com>
-Martin Storsjo <martin@martin.st>
+Martin Storsjö <martin@martin.st>
 Matthew Heaney <matthewjheaney@chromium.org>
+Matthias Räncker <theonetruecamper@gmx.de>
 Michael Kohler <michaelkohler@live.com>
 Mike Frysinger <vapier@chromium.org>
 Mike Hommey <mhommey@mozilla.com>
@@ -107,10 +115,12 @@
 Min Chen <chenm003@gmail.com>
 Minghai Shang <minghai@google.com>
 Min Ye <yeemmi@google.com>
+Mirko Bonadei <mbonadei@google.com>
 Moriyoshi Koizumi <mozo@mozo.jp>
 Morton Jonuschat <yabawock@gmail.com>
 Nathan E. Egge <negge@mozilla.com>
 Nico Weber <thakis@chromium.org>
+Niveditha Rau <niveditha.rau@gmail.com>
 Parag Salasakar <img.mips1@gmail.com>
 Pascal Massimino <pascal.massimino@gmail.com>
 Patrik Westin <patrik.westin@gmail.com>
@@ -129,9 +139,12 @@
 Rahul Chaudhry <rahulchaudhry@google.com>
 Ralph Giles <giles@xiph.org>
 Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
+Raphael Kubo da Costa <raphael.kubo.da.costa@intel.com>
+Ritu Baldwa <ritu.baldwa@ittiam.com>
 Rob Bradford <rob@linux.intel.com>
 Ronald S. Bultje <rsbultje@gmail.com>
 Rui Ueyama <ruiu@google.com>
+Sai Deng <sdeng@google.com>
 Sami Pietilä <samipietila@google.com>
 Sarah Parker <sarahparker@google.com>
 Sasi Inguva <isasi@google.com>
@@ -139,12 +152,15 @@
 Scott LaVarnway <slavarnway@google.com>
 Sean McGovern <gseanmcg@gmail.com>
 Sergey Kolomenkin <kolomenkin@gmail.com>
+Sergey Silkin <ssilkin@google.com>
 Sergey Ulanov <sergeyu@chromium.org>
 Shimon Doodkin <helpmepro1@gmail.com>
 Shiyou Yin <yinshiyou-hf@loongson.cn>
+Shubham Tandle <shubham.tandle@ittiam.com>
 Shunyao Li <shunyaoli@google.com>
 Stefan Holmer <holmer@google.com>
 Suman Sunkara <sunkaras@google.com>
+Supradeep T R <supradeep.tr@ittiam.com>
 Sylvestre Ledru <sylvestre@mozilla.com>
 Taekhyun Kim <takim@nvidia.com>
 Takanori MATSUURA <t.matsuu@gmail.com>
@@ -157,8 +173,11 @@
 Tom Finegan <tomfinegan@google.com>
 Tristan Matthews <le.businessman@gmail.com>
 Urvang Joshi <urvang@google.com>
+Venkatarama NG. Avadhani <venkatarama.avadhani@ittiam.com>
 Vignesh Venkatasubramanian <vigneshv@google.com>
 Vlad Tsyrklevich <vtsyrklevich@chromium.org>
+Wan-Teh Chang <wtc@google.com>
+xiwei gu <guxiwei-hf@loongson.cn>
 Yaowu Xu <yaowu@google.com>
 Yi Luo <luoyi@google.com>
 Yongzhe Wang <yongzhe@google.com>

diff --git a/CHANGELOG b/CHANGELOG
index 52089df..3bdf8ac 100644
--- a/CHANGELOG
+++ b/CHANGELOG

@@ -1,3 +1,44 @@
+2019-01-31 v1.8.0 "Northern Shoveler Duck"
+  This release focused on encoding performance for realtime and VOD use cases.
+
+  - Upgrading:
+    This adds and improves several vp9 controls. Most are related to SVC:
+      VP9E_SET_SVC_FRAME_DROP_LAYER:
+        - Frame dropping in SVC.
+      VP9E_SET_SVC_INTER_LAYER_PRED:
+        - Inter-layer prediction in SVC.
+      VP9E_SET_SVC_GF_TEMPORAL_REF:
+        - Enable long term temporal reference in SVC.
+      VP9E_SET_SVC_REF_FRAME_CONFIG/VP9E_GET_SVC_REF_FRAME_CONFIG:
+        - Extend and improve this control for better flexibility in setting SVC
+          pattern dynamically.
+      VP9E_SET_POSTENCODE_DROP:
+        - Allow for post-encode frame dropping (applies to non-SVC too).
+      VP9E_SET_SVC_SPATIAL_LAYER_SYNC:
+        - Enable spatial layer sync frames.
+      VP9E_SET_SVC_LAYER_ID:
+        - Extend API to specify the temporal id for each spatial layer.
+      VP9E_SET_ROI_MAP:
+        - Extend Region of Interest functionality to VP9.
+
+  - Enhancements:
+    2 pass vp9 encoding has improved substantially. When using --auto-alt-ref=6,
+    we see gains of approximately 8% for VBR and 10% for CQ. When using
+    --auto-alt-ref=1, the gains are approximately 4% for VBR and 5% for CQ.
+
+    For real-time encoding, speed 7 has improved by ~5-10%. Encodes targeted at
+    screen sharing have improved when the content changes significantly (slide
+    sharing) or scrolls. There is a new speed 9 setting for mobile devices which
+    is about 10-20% faster than speed 8.
+
+  - Bug fixes:
+    VP9 denoiser issue.
+    VP9 partition issue for 1080p.
+    VP9 rate control improvements.
+    Postprocessing Multi Frame Quality Enhancement (MFQE) issue.
+    VP8 multithread decoder issues.
+    A variety of fuzzing issues.
+
 2018-01-04 v1.7.0 "Mandarin Duck"
   This release focused on high bit depth performance (10/12 bit) and vp9
   encoding improvements.

diff --git a/README b/README
index 318846f..61bee3e 100644
--- a/README
+++ b/README

@@ -1,4 +1,4 @@
-README - 24 January 2018
+README - 31 January 2019
 
 Welcome to the WebM VP8/VP9 Codec SDK!
 
@@ -63,12 +63,14 @@
     arm64-android-gcc
     arm64-darwin-gcc
     arm64-linux-gcc
+    arm64-win64-gcc
     arm64-win64-vs15
     armv7-android-gcc
     armv7-darwin-gcc
     armv7-linux-rvct
     armv7-linux-gcc
     armv7-none-rvct
+    armv7-win32-gcc
     armv7-win32-vs14
     armv7-win32-vs15
     armv7s-darwin-gcc
@@ -89,6 +91,7 @@
     x86-darwin14-gcc
     x86-darwin15-gcc
     x86-darwin16-gcc
+    x86-darwin17-gcc
     x86-iphonesimulator-gcc
     x86-linux-gcc
     x86-linux-icc
@@ -106,6 +109,7 @@
     x86_64-darwin14-gcc
     x86_64-darwin15-gcc
     x86_64-darwin16-gcc
+    x86_64-darwin17-gcc
     x86_64-iphonesimulator-gcc
     x86_64-linux-gcc
     x86_64-linux-icc

diff --git a/configure b/configure
index c8f1d35..e91b57a 100755
--- a/configure
+++ b/configure

@@ -625,6 +625,9 @@
         if enabled mips || [ -z "${INLINE}" ]; then
           enabled extra_warnings || check_add_cflags -Wno-unused-function
         fi
+        # Enforce c89 for c files. Don't be too strict about it though. Allow
+        # gnu extensions like "//" for comments.
+        check_cflags -std=gnu89 && add_cflags_only -std=gnu89
         # Avoid this warning for third_party C++ sources. Some reorganization
         # would be needed to apply this only to test/*.cc.
         check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32

diff --git a/examples/vpx_dec_fuzzer.cc b/examples/vpx_dec_fuzzer.cc
index b74b47c..e3b0d2e 100644
--- a/examples/vpx_dec_fuzzer.cc
+++ b/examples/vpx_dec_fuzzer.cc

@@ -33,7 +33,8 @@
  * Out of memory errors when running generated fuzzer binary
    $../libvpx/configure --disable-unit-tests --size-limit=12288x12288 \
    --extra-cflags="-DVPX_MAX_ALLOCABLE_MEMORY=1073741824" \
-   --disable-webm-io --enable-debug
+   --disable-webm-io --enable-debug --disable-vp8-encoder \
+   --disable-vp9-encoder --disable-examples
 
  * Build libvpx
    $make -j32
@@ -42,7 +43,7 @@
    $ $CXX $CXXFLAGS -std=c++11 -DDECODER=vp9 \
    -fsanitize=fuzzer -I../libvpx -I. -Wl,--start-group \
    ../libvpx/examples/vpx_dec_fuzzer.cc -o ./vpx_dec_fuzzer_vp9 \
-   ./libvpx.a ./tools_common.c.o -Wl,--end-group
+   ./libvpx.a -Wl,--end-group
 
  * DECODER should be defined as vp9 or vp8 to enable vp9/vp8
  *
@@ -66,13 +67,15 @@
 #include <stdlib.h>
 #include <memory>
 
-#include "./tools_common.h"
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
 #include "vpx_ports/mem_ops.h"
 
-#define VPX_TOSTRING(str) #str
-#define VPX_STRINGIFY(str) VPX_TOSTRING(str)
+#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */
+#define IVF_FILE_HDR_SZ 32
+
+#define VPXD_INTERFACE(name) VPXD_INTERFACE_(name)
+#define VPXD_INTERFACE_(name) vpx_codec_##name##_dx()
 
 static void CloseFile(FILE *file) { fclose(file); }
 
@@ -131,16 +134,12 @@
   if (fread(header, 1, IVF_FILE_HDR_SZ, file.get()) != IVF_FILE_HDR_SZ) {
     return 0;
   }
-  const VpxInterface *decoder = get_vpx_decoder_by_name(VPX_STRINGIFY(DECODER));
-  if (decoder == nullptr) {
-    return 0;
-  }
 
   vpx_codec_ctx_t codec;
   // Set thread count in the range [1, 64].
   const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1;
   vpx_codec_dec_cfg_t cfg = { threads, 0, 0 };
-  if (vpx_codec_dec_init(&codec, decoder->codec_interface(), &cfg, 0)) {
+  if (vpx_codec_dec_init(&codec, VPXD_INTERFACE(DECODER), &cfg, 0)) {
     return 0;
   }
 

diff --git a/libs.mk b/libs.mk
index 7ec8c87..d0c4d64 100644
--- a/libs.mk
+++ b/libs.mk

@@ -233,7 +233,7 @@
 LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
 $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
 
-SO_VERSION_MAJOR := 5
+SO_VERSION_MAJOR := 6
 SO_VERSION_MINOR := 0
 SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))

diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
deleted file mode 100644
index d14a482..0000000
--- a/test/temporal_filter_test.cc
+++ /dev/null

@@ -1,280 +0,0 @@
-/*
- *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <limits>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-#include "./vp9_rtcd.h"
-#include "test/acm_random.h"
-#include "test/buffer.h"
-#include "test/register_state_check.h"
-#include "vpx_ports/vpx_timer.h"
-
-namespace {
-
-using ::libvpx_test::ACMRandom;
-using ::libvpx_test::Buffer;
-
-typedef void (*TemporalFilterFunc)(const uint8_t *a, unsigned int stride,
-                                   const uint8_t *b, unsigned int w,
-                                   unsigned int h, int filter_strength,
-                                   int filter_weight, unsigned int *accumulator,
-                                   uint16_t *count);
-
-// Calculate the difference between 'a' and 'b', sum in blocks of 9, and apply
-// filter based on strength and weight. Store the resulting filter amount in
-// 'count' and apply it to 'b' and store it in 'accumulator'.
-void reference_filter(const Buffer<uint8_t> &a, const Buffer<uint8_t> &b, int w,
-                      int h, int filter_strength, int filter_weight,
-                      Buffer<unsigned int> *accumulator,
-                      Buffer<uint16_t> *count) {
-  Buffer<int> diff_sq = Buffer<int>(w, h, 0);
-  ASSERT_TRUE(diff_sq.Init());
-  diff_sq.Set(0);
-
-  int rounding = 0;
-  if (filter_strength > 0) {
-    rounding = 1 << (filter_strength - 1);
-  }
-
-  ASSERT_TRUE(a.TopLeftPixel() != NULL);
-  ASSERT_TRUE(b.TopLeftPixel() != NULL);
-  ASSERT_TRUE(diff_sq.TopLeftPixel() != NULL);
-  // Calculate all the differences. Avoids re-calculating a bunch of extra
-  // values.
-  for (int height = 0; height < h; ++height) {
-    for (int width = 0; width < w; ++width) {
-      int diff = a.TopLeftPixel()[height * a.stride() + width] -
-                 b.TopLeftPixel()[height * b.stride() + width];
-      diff_sq.TopLeftPixel()[height * diff_sq.stride() + width] = diff * diff;
-    }
-  }
-
-  // For any given point, sum the neighboring values and calculate the
-  // modifier.
-  for (int height = 0; height < h; ++height) {
-    for (int width = 0; width < w; ++width) {
-      // Determine how many values are being summed.
-      int summed_values = 9;
-
-      if (height == 0 || height == (h - 1)) {
-        summed_values -= 3;
-      }
-
-      if (width == 0 || width == (w - 1)) {
-        if (summed_values == 6) {  // corner
-          summed_values -= 2;
-        } else {
-          summed_values -= 3;
-        }
-      }
-
-      // Sum the diff_sq of the surrounding values.
-      int sum = 0;
-      for (int idy = -1; idy <= 1; ++idy) {
-        for (int idx = -1; idx <= 1; ++idx) {
-          const int y = height + idy;
-          const int x = width + idx;
-
-          // If inside the border.
-          if (y >= 0 && y < h && x >= 0 && x < w) {
-            sum += diff_sq.TopLeftPixel()[y * diff_sq.stride() + x];
-          }
-        }
-      }
-
-      sum *= 3;
-      sum /= summed_values;
-      sum += rounding;
-      sum >>= filter_strength;
-
-      // Clamp the value and invert it.
-      if (sum > 16) sum = 16;
-      sum = 16 - sum;
-
-      sum *= filter_weight;
-
-      count->TopLeftPixel()[height * count->stride() + width] += sum;
-      accumulator->TopLeftPixel()[height * accumulator->stride() + width] +=
-          sum * b.TopLeftPixel()[height * b.stride() + width];
-    }
-  }
-}
-
-class TemporalFilterTest : public ::testing::TestWithParam<TemporalFilterFunc> {
- public:
-  virtual void SetUp() {
-    filter_func_ = GetParam();
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-  }
-
- protected:
-  TemporalFilterFunc filter_func_;
-  ACMRandom rnd_;
-};
-
-TEST_P(TemporalFilterTest, SizeCombinations) {
-  // Depending on subsampling this function may be called with values of 8 or 16
-  // for width and height, in any combination.
-  Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
-  ASSERT_TRUE(a.Init());
-
-  const int filter_weight = 2;
-  const int filter_strength = 6;
-
-  for (int width = 8; width <= 16; width += 8) {
-    for (int height = 8; height <= 16; height += 8) {
-      // The second buffer must not have any border.
-      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
-      ASSERT_TRUE(b.Init());
-      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_ref.Init());
-      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_chk.Init());
-      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_ref.Init());
-      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_chk.Init());
-
-      // The difference between the buffers must be small to pass the threshold
-      // to apply the filter.
-      a.Set(&rnd_, 0, 7);
-      b.Set(&rnd_, 0, 7);
-
-      accum_ref.Set(rnd_.Rand8());
-      accum_chk.CopyFrom(accum_ref);
-      count_ref.Set(rnd_.Rand8());
-      count_chk.CopyFrom(count_ref);
-      reference_filter(a, b, width, height, filter_strength, filter_weight,
-                       &accum_ref, &count_ref);
-      ASM_REGISTER_STATE_CHECK(
-          filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
-                       height, filter_strength, filter_weight,
-                       accum_chk.TopLeftPixel(), count_chk.TopLeftPixel()));
-      EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
-      EXPECT_TRUE(count_chk.CheckValues(count_ref));
-      if (HasFailure()) {
-        printf("Width: %d Height: %d\n", width, height);
-        count_chk.PrintDifference(count_ref);
-        accum_chk.PrintDifference(accum_ref);
-        return;
-      }
-    }
-  }
-}
-
-TEST_P(TemporalFilterTest, CompareReferenceRandom) {
-  for (int width = 8; width <= 16; width += 8) {
-    for (int height = 8; height <= 16; height += 8) {
-      Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
-      ASSERT_TRUE(a.Init());
-      // The second buffer must not have any border.
-      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
-      ASSERT_TRUE(b.Init());
-      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_ref.Init());
-      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_chk.Init());
-      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_ref.Init());
-      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_chk.Init());
-
-      for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
-        for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
-          for (int repeat = 0; repeat < 100; ++repeat) {
-            if (repeat < 50) {
-              a.Set(&rnd_, 0, 7);
-              b.Set(&rnd_, 0, 7);
-            } else {
-              // Check large (but close) values as well.
-              a.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,
-                    std::numeric_limits<uint8_t>::max());
-              b.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,
-                    std::numeric_limits<uint8_t>::max());
-            }
-
-            accum_ref.Set(rnd_.Rand8());
-            accum_chk.CopyFrom(accum_ref);
-            count_ref.Set(rnd_.Rand8());
-            count_chk.CopyFrom(count_ref);
-            reference_filter(a, b, width, height, filter_strength,
-                             filter_weight, &accum_ref, &count_ref);
-            ASM_REGISTER_STATE_CHECK(filter_func_(
-                a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width, height,
-                filter_strength, filter_weight, accum_chk.TopLeftPixel(),
-                count_chk.TopLeftPixel()));
-            EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
-            EXPECT_TRUE(count_chk.CheckValues(count_ref));
-            if (HasFailure()) {
-              printf("Weight: %d Strength: %d\n", filter_weight,
-                     filter_strength);
-              count_chk.PrintDifference(count_ref);
-              accum_chk.PrintDifference(accum_ref);
-              return;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST_P(TemporalFilterTest, DISABLED_Speed) {
-  Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
-  ASSERT_TRUE(a.Init());
-
-  const int filter_weight = 2;
-  const int filter_strength = 6;
-
-  for (int width = 8; width <= 16; width += 8) {
-    for (int height = 8; height <= 16; height += 8) {
-      // The second buffer must not have any border.
-      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
-      ASSERT_TRUE(b.Init());
-      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_ref.Init());
-      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_chk.Init());
-      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_ref.Init());
-      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_chk.Init());
-
-      a.Set(&rnd_, 0, 7);
-      b.Set(&rnd_, 0, 7);
-
-      accum_chk.Set(0);
-      count_chk.Set(0);
-
-      vpx_usec_timer timer;
-      vpx_usec_timer_start(&timer);
-      for (int i = 0; i < 10000; ++i) {
-        filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
-                     height, filter_strength, filter_weight,
-                     accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
-      }
-      vpx_usec_timer_mark(&timer);
-      const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
-      printf("Temporal filter %dx%d time: %5d us\n", width, height,
-             elapsed_time);
-    }
-  }
-}
-
-INSTANTIATE_TEST_CASE_P(C, TemporalFilterTest,
-                        ::testing::Values(&vp9_temporal_filter_apply_c));
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1, TemporalFilterTest,
-                        ::testing::Values(&vp9_temporal_filter_apply_sse4_1));
-#endif  // HAVE_SSE4_1
-}  // namespace

diff --git a/test/test.mk b/test/test.mk
index 2b76361..61eb606 100644
--- a/test/test.mk
+++ b/test/test.mk

@@ -170,7 +170,6 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_scale_test.cc
 ifneq ($(CONFIG_REALTIME_ONLY),yes)
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += temporal_filter_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc
 endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc

diff --git a/test/yuv_temporal_filter_test.cc b/test/yuv_temporal_filter_test.cc
index 8d68e4a..9fb170b 100644
--- a/test/yuv_temporal_filter_test.cc
+++ b/test/yuv_temporal_filter_test.cc

@@ -30,6 +30,18 @@
     uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator,
     uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
 
+struct TemporalFilterWithBd {
+  TemporalFilterWithBd(YUVTemporalFilterFunc func, int bitdepth)
+      : temporal_filter(func), bd(bitdepth) {}
+
+  YUVTemporalFilterFunc temporal_filter;
+  int bd;
+};
+
+std::ostream &operator<<(std::ostream &os, const TemporalFilterWithBd &tf) {
+  return os << "Bitdepth: " << tf.bd;
+}
+
 int GetFilterWeight(unsigned int row, unsigned int col,
                     unsigned int block_height, unsigned int block_width,
                     const int *const blk_fw, int use_32x32) {
@@ -40,8 +52,24 @@
   return blk_fw[2 * (row >= block_height / 2) + (col >= block_width / 2)];
 }
 
+template <typename PixelType>
 int GetModIndex(int sum_dist, int index, int rounding, int strength,
                 int filter_weight) {
+  int mod = sum_dist * 3 / index;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = VPXMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+template <>
+int GetModIndex<uint8_t>(int sum_dist, int index, int rounding, int strength,
+                         int filter_weight) {
   unsigned int index_mult[14] = {
     0, 0, 0, 0, 49152, 39322, 32768, 28087, 24576, 21846, 19661, 17874, 0, 15124
   };
@@ -61,22 +89,33 @@
   return mod;
 }
 
+template <typename PixelType>
 void ApplyReferenceFilter(
-    const Buffer<uint8_t> &y_src, const Buffer<uint8_t> &y_pre,
-    const Buffer<uint8_t> &u_src, const Buffer<uint8_t> &v_src,
-    const Buffer<uint8_t> &u_pre, const Buffer<uint8_t> &v_pre,
+    const Buffer<PixelType> &y_src, const Buffer<PixelType> &y_pre,
+    const Buffer<PixelType> &u_src, const Buffer<PixelType> &v_src,
+    const Buffer<PixelType> &u_pre, const Buffer<PixelType> &v_pre,
     unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
     int strength, const int *const blk_fw, int use_32x32,
-    Buffer<uint32_t> *y_accumulator, Buffer<uint16_t> *y_count,
-    Buffer<uint32_t> *u_accumulator, Buffer<uint16_t> *u_count,
-    Buffer<uint32_t> *v_accumulator, Buffer<uint16_t> *v_count) {
-  // blk_fw means block_filter_weight
-  // Set up buffer to store squared_diffs
+    Buffer<uint32_t> *y_accumulator, Buffer<uint16_t> *y_counter,
+    Buffer<uint32_t> *u_accumulator, Buffer<uint16_t> *u_counter,
+    Buffer<uint32_t> *v_accumulator, Buffer<uint16_t> *v_counter) {
+  const PixelType *y_src_ptr = y_src.TopLeftPixel();
+  const PixelType *y_pre_ptr = y_pre.TopLeftPixel();
+  const PixelType *u_src_ptr = u_src.TopLeftPixel();
+  const PixelType *u_pre_ptr = u_pre.TopLeftPixel();
+  const PixelType *v_src_ptr = v_src.TopLeftPixel();
+  const PixelType *v_pre_ptr = v_pre.TopLeftPixel();
+
+  const int uv_block_width = block_width >> ss_x,
+            uv_block_height = block_height >> ss_y;
+  const int y_src_stride = y_src.stride(), y_pre_stride = y_pre.stride();
+  const int uv_src_stride = u_src.stride(), uv_pre_stride = u_pre.stride();
+  const int y_diff_stride = block_width, uv_diff_stride = uv_block_width;
+
   Buffer<int> y_dif = Buffer<int>(block_width, block_height, 0);
-  const int uv_block_width = block_width >> ss_x;
-  const int uv_block_height = block_height >> ss_y;
   Buffer<int> u_dif = Buffer<int>(uv_block_width, uv_block_height, 0);
   Buffer<int> v_dif = Buffer<int>(uv_block_width, uv_block_height, 0);
+
   ASSERT_TRUE(y_dif.Init());
   ASSERT_TRUE(u_dif.Init());
   ASSERT_TRUE(v_dif.Init());
@@ -84,55 +123,56 @@
   u_dif.Set(0);
   v_dif.Set(0);
 
-  // How many bits do we want to round
-  ASSERT_GE(strength, 0);
-  ASSERT_LE(strength, 6);
-  int rounding = 0;
-  if (strength > 0) {
-    rounding = 1 << (strength - 1);
-  }
+  int *y_diff_ptr = y_dif.TopLeftPixel();
+  int *u_diff_ptr = u_dif.TopLeftPixel();
+  int *v_diff_ptr = v_dif.TopLeftPixel();
 
-  // Check that the buffers are valid
-  ASSERT_TRUE(y_src.TopLeftPixel() != NULL);
-  ASSERT_TRUE(y_pre.TopLeftPixel() != NULL);
-  ASSERT_TRUE(y_dif.TopLeftPixel() != NULL);
-  ASSERT_TRUE(u_src.TopLeftPixel() != NULL);
-  ASSERT_TRUE(u_pre.TopLeftPixel() != NULL);
-  ASSERT_TRUE(u_dif.TopLeftPixel() != NULL);
-  ASSERT_TRUE(v_src.TopLeftPixel() != NULL);
-  ASSERT_TRUE(v_pre.TopLeftPixel() != NULL);
-  ASSERT_TRUE(v_dif.TopLeftPixel() != NULL);
+  uint32_t *y_accum = y_accumulator->TopLeftPixel();
+  uint32_t *u_accum = u_accumulator->TopLeftPixel();
+  uint32_t *v_accum = v_accumulator->TopLeftPixel();
+  uint16_t *y_count = y_counter->TopLeftPixel();
+  uint16_t *u_count = u_counter->TopLeftPixel();
+  uint16_t *v_count = v_counter->TopLeftPixel();
+
+  const int y_accum_stride = y_accumulator->stride();
+  const int u_accum_stride = u_accumulator->stride();
+  const int v_accum_stride = v_accumulator->stride();
+  const int y_count_stride = y_counter->stride();
+  const int u_count_stride = u_counter->stride();
+  const int v_count_stride = v_counter->stride();
+
+  const int rounding = (1 << strength) >> 1;
 
   // Get the square diffs
-  for (int row = 0; row < static_cast<int>(block_height); row++) {
-    for (int col = 0; col < static_cast<int>(block_width); col++) {
-      const int diff = y_src.TopLeftPixel()[row * y_src.stride() + col] -
-                       y_pre.TopLeftPixel()[row * y_pre.stride() + col];
-      y_dif.TopLeftPixel()[row * y_dif.stride() + col] = diff * diff;
+  for (int row = 0; row < (int)block_height; row++) {
+    for (int col = 0; col < (int)block_width; col++) {
+      const int diff = y_src_ptr[row * y_src_stride + col] -
+                       y_pre_ptr[row * y_pre_stride + col];
+      y_diff_ptr[row * y_diff_stride + col] = diff * diff;
     }
   }
 
-  for (int row = 0; row < uv_block_height; row++) {
-    for (int col = 0; col < uv_block_width; col++) {
-      const int u_diff = u_src.TopLeftPixel()[row * u_src.stride() + col] -
-                         u_pre.TopLeftPixel()[row * u_pre.stride() + col];
-      const int v_diff = v_src.TopLeftPixel()[row * v_src.stride() + col] -
-                         v_pre.TopLeftPixel()[row * v_pre.stride() + col];
-      u_dif.TopLeftPixel()[row * u_dif.stride() + col] = u_diff * u_diff;
-      v_dif.TopLeftPixel()[row * v_dif.stride() + col] = v_diff * v_diff;
+  for (int row = 0; row < (int)uv_block_height; row++) {
+    for (int col = 0; col < (int)uv_block_width; col++) {
+      const int u_diff = u_src_ptr[row * uv_src_stride + col] -
+                         u_pre_ptr[row * uv_pre_stride + col];
+      const int v_diff = v_src_ptr[row * uv_src_stride + col] -
+                         v_pre_ptr[row * uv_pre_stride + col];
+      u_diff_ptr[row * uv_diff_stride + col] = u_diff * u_diff;
+      v_diff_ptr[row * uv_diff_stride + col] = v_diff * v_diff;
     }
   }
 
-  // Apply the filter
-  for (int row = 0; row < static_cast<int>(block_height); row++) {
-    for (int col = 0; col < static_cast<int>(block_width); col++) {
-      const int uv_r = row >> ss_y;
-      const int uv_c = col >> ss_x;
+  // Apply the filter to luma
+  for (int row = 0; row < (int)block_height; row++) {
+    for (int col = 0; col < (int)block_width; col++) {
+      const int uv_row = row >> ss_y;
+      const int uv_col = col >> ss_x;
       const int filter_weight = GetFilterWeight(row, col, block_height,
                                                 block_width, blk_fw, use_32x32);
 
       // First we get the modifier for the current y pixel
-      const int y_pixel = y_pre.TopLeftPixel()[row * y_pre.stride() + col];
+      const int y_pixel = y_pre_ptr[row * y_pre_stride + col];
       int y_num_used = 0;
       int y_mod = 0;
 
@@ -142,116 +182,316 @@
           const int sub_row = row + row_step;
           const int sub_col = col + col_step;
 
-          if (sub_row >= 0 && sub_row < static_cast<int>(block_height) &&
-              sub_col >= 0 && sub_col < static_cast<int>(block_width)) {
-            y_mod += y_dif.TopLeftPixel()[sub_row * y_dif.stride() + sub_col];
+          if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 &&
+              sub_col < (int)block_width) {
+            y_mod += y_diff_ptr[sub_row * y_diff_stride + sub_col];
             y_num_used++;
           }
         }
       }
 
-      ASSERT_GE(y_num_used, 0);
-
       // Sum the corresponding uv pixels to the current y modifier
       // Note we are rounding down instead of rounding to the nearest pixel.
-      y_mod += u_dif.TopLeftPixel()[uv_r * uv_block_width + uv_c];
-      y_mod += v_dif.TopLeftPixel()[uv_r * uv_block_width + uv_c];
+      y_mod += u_diff_ptr[uv_row * uv_diff_stride + uv_col];
+      y_mod += v_diff_ptr[uv_row * uv_diff_stride + uv_col];
 
       y_num_used += 2;
 
       // Set the modifier
-      y_mod = GetModIndex(y_mod, y_num_used, rounding, strength, filter_weight);
+      y_mod = GetModIndex<PixelType>(y_mod, y_num_used, rounding, strength,
+                                     filter_weight);
 
       // Accumulate the result
-      y_count->TopLeftPixel()[row * y_count->stride() + col] += y_mod;
-      y_accumulator->TopLeftPixel()[row * y_accumulator->stride() + col] +=
-          y_mod * y_pixel;
+      y_count[row * y_count_stride + col] += y_mod;
+      y_accum[row * y_accum_stride + col] += y_mod * y_pixel;
+    }
+  }
 
-      // Get the modifier for chroma components
-      if (!(row & ss_y) && !(col & ss_x)) {
-        const int u_pixel = u_pre.TopLeftPixel()[uv_r * u_pre.stride() + uv_c];
-        const int v_pixel = v_pre.TopLeftPixel()[uv_r * v_pre.stride() + uv_c];
+  // Apply the filter to chroma
+  for (int uv_row = 0; uv_row < (int)uv_block_height; uv_row++) {
+    for (int uv_col = 0; uv_col < (int)uv_block_width; uv_col++) {
+      const int y_row = uv_row << ss_y;
+      const int y_col = uv_col << ss_x;
+      const int filter_weight = GetFilterWeight(
+          uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32);
 
-        int uv_num_used = 0;
-        int u_mod = 0, v_mod = 0;
+      const int u_pixel = u_pre_ptr[uv_row * uv_pre_stride + uv_col];
+      const int v_pixel = v_pre_ptr[uv_row * uv_pre_stride + uv_col];
 
-        // Sum the neighboring 3x3 chromal pixels to the chroma modifier
-        for (int row_step = -1; row_step <= 1; row_step++) {
-          for (int col_step = -1; col_step <= 1; col_step++) {
-            const int sub_row = uv_r + row_step;
-            const int sub_col = uv_c + col_step;
+      int uv_num_used = 0;
+      int u_mod = 0, v_mod = 0;
 
-            if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 &&
-                sub_col < uv_block_width) {
-              u_mod += u_dif.TopLeftPixel()[sub_row * uv_block_width + sub_col];
-              v_mod += v_dif.TopLeftPixel()[sub_row * uv_block_width + sub_col];
-              uv_num_used++;
-            }
-          }
-        }
+      // Sum the neighboring 3x3 chroma pixels to the chroma modifier
+      for (int row_step = -1; row_step <= 1; row_step++) {
+        for (int col_step = -1; col_step <= 1; col_step++) {
+          const int sub_row = uv_row + row_step;
+          const int sub_col = uv_col + col_step;
 
-        ASSERT_GT(uv_num_used, 0);
-
-        // Sum all the luma pixels associated with the current luma pixel
-        for (int row_step = 0; row_step < 1 + ss_y; row_step++) {
-          for (int col_step = 0; col_step < 1 + ss_x; col_step++) {
-            const int sub_row = (uv_r << ss_y) + row_step;
-            const int sub_col = (uv_c << ss_x) + col_step;
-            const int y_diff =
-                y_dif.TopLeftPixel()[sub_row * y_dif.stride() + sub_col];
-
-            u_mod += y_diff;
-            v_mod += y_diff;
+          if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 &&
+              sub_col < uv_block_width) {
+            u_mod += u_diff_ptr[sub_row * uv_diff_stride + sub_col];
+            v_mod += v_diff_ptr[sub_row * uv_diff_stride + sub_col];
             uv_num_used++;
           }
         }
-
-        // Set the modifier
-        u_mod =
-            GetModIndex(u_mod, uv_num_used, rounding, strength, filter_weight);
-        v_mod =
-            GetModIndex(v_mod, uv_num_used, rounding, strength, filter_weight);
-
-        // Accumulate the result
-        u_count->TopLeftPixel()[uv_r * u_count->stride() + uv_c] += u_mod;
-        u_accumulator->TopLeftPixel()[uv_r * u_accumulator->stride() + uv_c] +=
-            u_mod * u_pixel;
-        v_count->TopLeftPixel()[uv_r * u_count->stride() + uv_c] += v_mod;
-        v_accumulator->TopLeftPixel()[uv_r * v_accumulator->stride() + uv_c] +=
-            v_mod * v_pixel;
       }
+
+      // Sum all the luma pixels associated with the current chroma pixel
+      for (int row_step = 0; row_step < 1 + ss_y; row_step++) {
+        for (int col_step = 0; col_step < 1 + ss_x; col_step++) {
+          const int sub_row = y_row + row_step;
+          const int sub_col = y_col + col_step;
+          const int y_diff = y_diff_ptr[sub_row * y_diff_stride + sub_col];
+
+          u_mod += y_diff;
+          v_mod += y_diff;
+          uv_num_used++;
+        }
+      }
+
+      // Set the modifier
+      u_mod = GetModIndex<PixelType>(u_mod, uv_num_used, rounding, strength,
+                                     filter_weight);
+      v_mod = GetModIndex<PixelType>(v_mod, uv_num_used, rounding, strength,
+                                     filter_weight);
+
+      // Accumulate the result
+      u_count[uv_row * u_count_stride + uv_col] += u_mod;
+      u_accum[uv_row * u_accum_stride + uv_col] += u_mod * u_pixel;
+      v_count[uv_row * v_count_stride + uv_col] += v_mod;
+      v_accum[uv_row * v_accum_stride + uv_col] += v_mod * v_pixel;
     }
   }
 }
 
 class YUVTemporalFilterTest
-    : public ::testing::TestWithParam<YUVTemporalFilterFunc> {
+    : public ::testing::TestWithParam<TemporalFilterWithBd> {
  public:
   virtual void SetUp() {
-    filter_func_ = GetParam();
+    filter_func_ = GetParam().temporal_filter;
+    bd_ = GetParam().bd;
+    use_highbd_ = (bd_ != 8);
+
     rnd_.Reset(ACMRandom::DeterministicSeed());
+    saturate_test_ = 0;
+    num_repeats_ = 10;
+
+    ASSERT_TRUE(bd_ == 8 || bd_ == 10 || bd_ == 12);
   }
 
  protected:
+  template <typename PixelType>
+  void CompareTestWithParam(int width, int height, int ss_x, int ss_y,
+                            int filter_strength, int use_32x32,
+                            const int *filter_weight);
+  template <typename PixelType>
+  void RunTestFilterWithParam(int width, int height, int ss_x, int ss_y,
+                              int filter_strength, int use_32x32,
+                              const int *filter_weight);
   YUVTemporalFilterFunc filter_func_;
   ACMRandom rnd_;
+  int saturate_test_;
+  int num_repeats_;
+  int use_highbd_;
+  int bd_;
 };
 
-TEST_P(YUVTemporalFilterTest, Use32x32) {
-  const int width = 32, height = 32;
-  Buffer<uint8_t> y_src = Buffer<uint8_t>(width, height, 8);
-  Buffer<uint8_t> y_pre = Buffer<uint8_t>(width, height, 0);
+template <typename PixelType>
+void YUVTemporalFilterTest::CompareTestWithParam(int width, int height,
+                                                 int ss_x, int ss_y,
+                                                 int filter_strength,
+                                                 int use_32x32,
+                                                 const int *filter_weight) {
+  const int uv_width = width >> ss_x, uv_height = height >> ss_y;
+
+  Buffer<PixelType> y_src = Buffer<PixelType>(width, height, 0);
+  Buffer<PixelType> y_pre = Buffer<PixelType>(width, height, 0);
   Buffer<uint16_t> y_count_ref = Buffer<uint16_t>(width, height, 0);
   Buffer<uint32_t> y_accum_ref = Buffer<uint32_t>(width, height, 0);
   Buffer<uint16_t> y_count_tst = Buffer<uint16_t>(width, height, 0);
   Buffer<uint32_t> y_accum_tst = Buffer<uint32_t>(width, height, 0);
+
+  Buffer<PixelType> u_src = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<PixelType> u_pre = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<uint16_t> u_count_ref = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> u_accum_ref = Buffer<uint32_t>(uv_width, uv_height, 0);
+  Buffer<uint16_t> u_count_tst = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> u_accum_tst = Buffer<uint32_t>(uv_width, uv_height, 0);
+
+  Buffer<PixelType> v_src = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<PixelType> v_pre = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<uint16_t> v_count_ref = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> v_accum_ref = Buffer<uint32_t>(uv_width, uv_height, 0);
+  Buffer<uint16_t> v_count_tst = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> v_accum_tst = Buffer<uint32_t>(uv_width, uv_height, 0);
+
   ASSERT_TRUE(y_src.Init());
   ASSERT_TRUE(y_pre.Init());
   ASSERT_TRUE(y_count_ref.Init());
   ASSERT_TRUE(y_accum_ref.Init());
   ASSERT_TRUE(y_count_tst.Init());
   ASSERT_TRUE(y_accum_tst.Init());
+  ASSERT_TRUE(u_src.Init());
+  ASSERT_TRUE(u_pre.Init());
+  ASSERT_TRUE(u_count_ref.Init());
+  ASSERT_TRUE(u_accum_ref.Init());
+  ASSERT_TRUE(u_count_tst.Init());
+  ASSERT_TRUE(u_accum_tst.Init());
 
+  ASSERT_TRUE(v_src.Init());
+  ASSERT_TRUE(v_pre.Init());
+  ASSERT_TRUE(v_count_ref.Init());
+  ASSERT_TRUE(v_accum_ref.Init());
+  ASSERT_TRUE(v_count_tst.Init());
+  ASSERT_TRUE(v_accum_tst.Init());
+
+  y_accum_ref.Set(0);
+  y_accum_tst.Set(0);
+  y_count_ref.Set(0);
+  y_count_tst.Set(0);
+  u_accum_ref.Set(0);
+  u_accum_tst.Set(0);
+  u_count_ref.Set(0);
+  u_count_tst.Set(0);
+  v_accum_ref.Set(0);
+  v_accum_tst.Set(0);
+  v_count_ref.Set(0);
+  v_count_tst.Set(0);
+
+  for (int repeats = 0; repeats < num_repeats_; repeats++) {
+    if (saturate_test_) {
+      const int max_val = (1 << bd_) - 1;
+      y_src.Set(max_val);
+      y_pre.Set(0);
+      u_src.Set(max_val);
+      u_pre.Set(0);
+      v_src.Set(max_val);
+      v_pre.Set(0);
+    } else {
+      y_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+      y_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+      u_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+      u_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+      v_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+      v_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+    }
+
+    ApplyReferenceFilter<PixelType>(
+        y_src, y_pre, u_src, v_src, u_pre, v_pre, width, height, ss_x, ss_y,
+        filter_strength, filter_weight, use_32x32, &y_accum_ref, &y_count_ref,
+        &u_accum_ref, &u_count_ref, &v_accum_ref, &v_count_ref);
+
+    ASM_REGISTER_STATE_CHECK(filter_func_(
+        reinterpret_cast<const uint8_t *>(y_src.TopLeftPixel()), y_src.stride(),
+        reinterpret_cast<const uint8_t *>(y_pre.TopLeftPixel()), y_pre.stride(),
+        reinterpret_cast<const uint8_t *>(u_src.TopLeftPixel()),
+        reinterpret_cast<const uint8_t *>(v_src.TopLeftPixel()), u_src.stride(),
+        reinterpret_cast<const uint8_t *>(u_pre.TopLeftPixel()),
+        reinterpret_cast<const uint8_t *>(v_pre.TopLeftPixel()), u_pre.stride(),
+        width, height, ss_x, ss_y, filter_strength, filter_weight, use_32x32,
+        y_accum_tst.TopLeftPixel(), y_count_tst.TopLeftPixel(),
+        u_accum_tst.TopLeftPixel(), u_count_tst.TopLeftPixel(),
+        v_accum_tst.TopLeftPixel(), v_count_tst.TopLeftPixel()));
+
+    EXPECT_TRUE(y_accum_tst.CheckValues(y_accum_ref));
+    EXPECT_TRUE(y_count_tst.CheckValues(y_count_ref));
+    EXPECT_TRUE(u_accum_tst.CheckValues(u_accum_ref));
+    EXPECT_TRUE(u_count_tst.CheckValues(u_count_ref));
+    EXPECT_TRUE(v_accum_tst.CheckValues(v_accum_ref));
+    EXPECT_TRUE(v_count_tst.CheckValues(v_count_ref));
+
+    if (HasFailure()) {
+      if (use_32x32) {
+        printf("SS_X: %d, SS_Y: %d, Strength: %d, Weight: %d\n", ss_x, ss_y,
+               filter_strength, *filter_weight);
+      } else {
+        printf("SS_X: %d, SS_Y: %d, Strength: %d, Weights: %d,%d,%d,%d\n", ss_x,
+               ss_y, filter_strength, filter_weight[0], filter_weight[1],
+               filter_weight[2], filter_weight[3]);
+      }
+      y_accum_tst.PrintDifference(y_accum_ref);
+      y_count_tst.PrintDifference(y_count_ref);
+      u_accum_tst.PrintDifference(u_accum_ref);
+      u_count_tst.PrintDifference(u_count_ref);
+      v_accum_tst.PrintDifference(v_accum_ref);
+      v_count_tst.PrintDifference(v_count_ref);
+
+      return;
+    }
+  }
+}
+
+template <typename PixelType>
+void YUVTemporalFilterTest::RunTestFilterWithParam(int width, int height,
+                                                   int ss_x, int ss_y,
+                                                   int filter_strength,
+                                                   int use_32x32,
+                                                   const int *filter_weight) {
+  const int uv_width = width >> ss_x, uv_height = height >> ss_y;
+
+  Buffer<PixelType> y_src = Buffer<PixelType>(width, height, 0);
+  Buffer<PixelType> y_pre = Buffer<PixelType>(width, height, 0);
+  Buffer<uint16_t> y_count = Buffer<uint16_t>(width, height, 0);
+  Buffer<uint32_t> y_accum = Buffer<uint32_t>(width, height, 0);
+
+  Buffer<PixelType> u_src = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<PixelType> u_pre = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<uint16_t> u_count = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> u_accum = Buffer<uint32_t>(uv_width, uv_height, 0);
+
+  Buffer<PixelType> v_src = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<PixelType> v_pre = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<uint16_t> v_count = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> v_accum = Buffer<uint32_t>(uv_width, uv_height, 0);
+
+  ASSERT_TRUE(y_src.Init());
+  ASSERT_TRUE(y_pre.Init());
+  ASSERT_TRUE(y_count.Init());
+  ASSERT_TRUE(y_accum.Init());
+
+  ASSERT_TRUE(u_src.Init());
+  ASSERT_TRUE(u_pre.Init());
+  ASSERT_TRUE(u_count.Init());
+  ASSERT_TRUE(u_accum.Init());
+
+  ASSERT_TRUE(v_src.Init());
+  ASSERT_TRUE(v_pre.Init());
+  ASSERT_TRUE(v_count.Init());
+  ASSERT_TRUE(v_accum.Init());
+
+  y_accum.Set(0);
+  y_count.Set(0);
+
+  u_accum.Set(0);
+  u_count.Set(0);
+
+  v_accum.Set(0);
+  v_count.Set(0);
+
+  y_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+  y_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+  u_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+  u_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+  v_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+  v_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+
+  for (int repeats = 0; repeats < num_repeats_; repeats++) {
+    ASM_REGISTER_STATE_CHECK(filter_func_(
+        reinterpret_cast<const uint8_t *>(y_src.TopLeftPixel()), y_src.stride(),
+        reinterpret_cast<const uint8_t *>(y_pre.TopLeftPixel()), y_pre.stride(),
+        reinterpret_cast<const uint8_t *>(u_src.TopLeftPixel()),
+        reinterpret_cast<const uint8_t *>(v_src.TopLeftPixel()), u_src.stride(),
+        reinterpret_cast<const uint8_t *>(u_pre.TopLeftPixel()),
+        reinterpret_cast<const uint8_t *>(v_pre.TopLeftPixel()), u_pre.stride(),
+        width, height, ss_x, ss_y, filter_strength, filter_weight, use_32x32,
+        y_accum.TopLeftPixel(), y_count.TopLeftPixel(), u_accum.TopLeftPixel(),
+        u_count.TopLeftPixel(), v_accum.TopLeftPixel(),
+        v_count.TopLeftPixel()));
+  }
+}
+
+TEST_P(YUVTemporalFilterTest, Use32x32) {
+  const int width = 32, height = 32;
   const int use_32x32 = 1;
 
   for (int ss_x = 0; ss_x <= 1; ss_x++) {
@@ -259,95 +499,17 @@
       for (int filter_strength = 0; filter_strength <= 6;
            filter_strength += 2) {
         for (int filter_weight = 0; filter_weight <= 2; filter_weight++) {
-          const int uv_width = width >> ss_x, uv_height = height >> ss_y;
-          Buffer<uint8_t> u_src = Buffer<uint8_t>(uv_width, uv_height, 8);
-          Buffer<uint8_t> u_pre = Buffer<uint8_t>(uv_width, uv_height, 0);
-          Buffer<uint16_t> u_count_ref =
-              Buffer<uint16_t>(uv_width, uv_height, 0);
-          Buffer<uint32_t> u_accum_ref =
-              Buffer<uint32_t>(uv_width, uv_height, 0);
-          Buffer<uint16_t> u_count_tst =
-              Buffer<uint16_t>(uv_width, uv_height, 0);
-          Buffer<uint32_t> u_accum_tst =
-              Buffer<uint32_t>(uv_width, uv_height, 0);
-          ASSERT_TRUE(u_src.Init());
-          ASSERT_TRUE(u_pre.Init());
-          ASSERT_TRUE(u_count_ref.Init());
-          ASSERT_TRUE(u_accum_ref.Init());
-          ASSERT_TRUE(u_count_tst.Init());
-          ASSERT_TRUE(u_accum_tst.Init());
-          Buffer<uint8_t> v_src = Buffer<uint8_t>(uv_width, uv_height, 8);
-          Buffer<uint8_t> v_pre = Buffer<uint8_t>(uv_width, uv_height, 0);
-          Buffer<uint16_t> v_count_ref =
-              Buffer<uint16_t>(uv_width, uv_height, 0);
-          Buffer<uint32_t> v_accum_ref =
-              Buffer<uint32_t>(uv_width, uv_height, 0);
-          Buffer<uint16_t> v_count_tst =
-              Buffer<uint16_t>(uv_width, uv_height, 0);
-          Buffer<uint32_t> v_accum_tst =
-              Buffer<uint32_t>(uv_width, uv_height, 0);
-          ASSERT_TRUE(v_src.Init());
-          ASSERT_TRUE(v_pre.Init());
-          ASSERT_TRUE(v_count_ref.Init());
-          ASSERT_TRUE(v_accum_ref.Init());
-          ASSERT_TRUE(v_count_tst.Init());
-          ASSERT_TRUE(v_accum_tst.Init());
-
-          // The difference between the buffers must be small to pass the
-          // threshold to apply the filter.
-          y_src.Set(&rnd_, 0, 7);
-          y_pre.Set(&rnd_, 0, 7);
-          u_src.Set(&rnd_, 0, 7);
-          u_pre.Set(&rnd_, 0, 7);
-          v_src.Set(&rnd_, 0, 7);
-          v_pre.Set(&rnd_, 0, 7);
-
-          y_accum_ref.Set(rnd_.Rand8());
-          y_accum_tst.CopyFrom(y_accum_ref);
-          y_count_ref.Set(rnd_.Rand8());
-          y_count_tst.CopyFrom(y_count_ref);
-          u_accum_ref.Set(rnd_.Rand8());
-          u_accum_tst.CopyFrom(u_accum_ref);
-          u_count_ref.Set(rnd_.Rand8());
-          u_count_tst.CopyFrom(u_count_ref);
-          v_accum_ref.Set(rnd_.Rand8());
-          v_accum_tst.CopyFrom(v_accum_ref);
-          v_count_ref.Set(rnd_.Rand8());
-          v_count_tst.CopyFrom(v_count_ref);
-
-          ApplyReferenceFilter(y_src, y_pre, u_src, v_src, u_pre, v_pre, width,
-                               height, ss_x, ss_y, filter_strength,
-                               &filter_weight, use_32x32, &y_accum_ref,
-                               &y_count_ref, &u_accum_ref, &u_count_ref,
-                               &v_accum_ref, &v_count_ref);
-          ASM_REGISTER_STATE_CHECK(filter_func_(
-              y_src.TopLeftPixel(), y_src.stride(), y_pre.TopLeftPixel(),
-              y_pre.stride(), u_src.TopLeftPixel(), v_src.TopLeftPixel(),
-              u_src.stride(), u_pre.TopLeftPixel(), v_pre.TopLeftPixel(),
-              u_pre.stride(), width, height, ss_x, ss_y, filter_strength,
-              &filter_weight, use_32x32, y_accum_tst.TopLeftPixel(),
-              y_count_tst.TopLeftPixel(), u_accum_tst.TopLeftPixel(),
-              u_count_tst.TopLeftPixel(), v_accum_tst.TopLeftPixel(),
-              v_count_tst.TopLeftPixel()));
-
-          EXPECT_TRUE(y_accum_tst.CheckValues(y_accum_ref));
-          EXPECT_TRUE(y_count_tst.CheckValues(y_count_ref));
-          EXPECT_TRUE(u_accum_tst.CheckValues(u_accum_ref));
-          EXPECT_TRUE(u_count_tst.CheckValues(u_count_ref));
-          EXPECT_TRUE(v_accum_tst.CheckValues(v_accum_ref));
-          EXPECT_TRUE(v_count_tst.CheckValues(v_count_ref));
-
-          if (HasFailure()) {
-            printf("SS_X: %d, SS_Y: %d, Weight: %d, Strength: %d\n", ss_x, ss_y,
-                   filter_weight, filter_strength);
-            y_accum_tst.PrintDifference(y_accum_ref);
-            y_count_tst.PrintDifference(y_count_ref);
-            u_accum_tst.PrintDifference(u_accum_ref);
-            u_count_tst.PrintDifference(u_count_ref);
-            v_accum_tst.PrintDifference(v_accum_ref);
-            v_count_tst.PrintDifference(v_count_ref);
-            return;
+          if (use_highbd_) {
+            const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+            CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                           adjusted_strength, use_32x32,
+                                           &filter_weight);
+          } else {
+            CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                          filter_strength, use_32x32,
+                                          &filter_weight);
           }
+          ASSERT_FALSE(HasFailure());
         }
       }
     }
@@ -356,19 +518,6 @@
 
 TEST_P(YUVTemporalFilterTest, Use16x16) {
   const int width = 32, height = 32;
-  Buffer<uint8_t> y_src = Buffer<uint8_t>(width, height, 8);
-  Buffer<uint8_t> y_pre = Buffer<uint8_t>(width, height, 0);
-  Buffer<uint16_t> y_count_ref = Buffer<uint16_t>(width, height, 0);
-  Buffer<uint32_t> y_accum_ref = Buffer<uint32_t>(width, height, 0);
-  Buffer<uint16_t> y_count_tst = Buffer<uint16_t>(width, height, 0);
-  Buffer<uint32_t> y_accum_tst = Buffer<uint32_t>(width, height, 0);
-  ASSERT_TRUE(y_src.Init());
-  ASSERT_TRUE(y_pre.Init());
-  ASSERT_TRUE(y_count_ref.Init());
-  ASSERT_TRUE(y_accum_ref.Init());
-  ASSERT_TRUE(y_count_tst.Init());
-  ASSERT_TRUE(y_accum_tst.Init());
-
   const int use_32x32 = 0;
 
   for (int ss_x = 0; ss_x <= 1; ss_x++) {
@@ -385,95 +534,18 @@
         // Test each parameter
         for (int filter_strength = 0; filter_strength <= 6;
              filter_strength += 2) {
-          const int uv_width = width >> ss_x, uv_height = height >> ss_y;
-          Buffer<uint8_t> u_src = Buffer<uint8_t>(uv_width, uv_height, 8);
-          Buffer<uint8_t> u_pre = Buffer<uint8_t>(uv_width, uv_height, 0);
-          Buffer<uint16_t> u_count_ref =
-              Buffer<uint16_t>(uv_width, uv_height, 0);
-          Buffer<uint32_t> u_accum_ref =
-              Buffer<uint32_t>(uv_width, uv_height, 0);
-          Buffer<uint16_t> u_count_tst =
-              Buffer<uint16_t>(uv_width, uv_height, 0);
-          Buffer<uint32_t> u_accum_tst =
-              Buffer<uint32_t>(uv_width, uv_height, 0);
-          ASSERT_TRUE(u_src.Init());
-          ASSERT_TRUE(u_pre.Init());
-          ASSERT_TRUE(u_count_ref.Init());
-          ASSERT_TRUE(u_accum_ref.Init());
-          ASSERT_TRUE(u_count_tst.Init());
-          ASSERT_TRUE(u_accum_tst.Init());
-          Buffer<uint8_t> v_src = Buffer<uint8_t>(uv_width, uv_height, 8);
-          Buffer<uint8_t> v_pre = Buffer<uint8_t>(uv_width, uv_height, 0);
-          Buffer<uint16_t> v_count_ref =
-              Buffer<uint16_t>(uv_width, uv_height, 0);
-          Buffer<uint32_t> v_accum_ref =
-              Buffer<uint32_t>(uv_width, uv_height, 0);
-          Buffer<uint16_t> v_count_tst =
-              Buffer<uint16_t>(uv_width, uv_height, 0);
-          Buffer<uint32_t> v_accum_tst =
-              Buffer<uint32_t>(uv_width, uv_height, 0);
-          ASSERT_TRUE(v_src.Init());
-          ASSERT_TRUE(v_pre.Init());
-          ASSERT_TRUE(v_count_ref.Init());
-          ASSERT_TRUE(v_accum_ref.Init());
-          ASSERT_TRUE(v_count_tst.Init());
-          ASSERT_TRUE(v_accum_tst.Init());
-
-          // The difference between the buffers must be small to pass the
-          // threshold to apply the filter.
-          y_src.Set(&rnd_, 0, 7);
-          y_pre.Set(&rnd_, 0, 7);
-          u_src.Set(&rnd_, 0, 7);
-          u_pre.Set(&rnd_, 0, 7);
-          v_src.Set(&rnd_, 0, 7);
-          v_pre.Set(&rnd_, 0, 7);
-
-          y_accum_ref.Set(rnd_.Rand8());
-          y_accum_tst.CopyFrom(y_accum_ref);
-          y_count_ref.Set(rnd_.Rand8());
-          y_count_tst.CopyFrom(y_count_ref);
-          u_accum_ref.Set(rnd_.Rand8());
-          u_accum_tst.CopyFrom(u_accum_ref);
-          u_count_ref.Set(rnd_.Rand8());
-          u_count_tst.CopyFrom(u_count_ref);
-          v_accum_ref.Set(rnd_.Rand8());
-          v_accum_tst.CopyFrom(v_accum_ref);
-          v_count_ref.Set(rnd_.Rand8());
-          v_count_tst.CopyFrom(v_count_ref);
-
-          ApplyReferenceFilter(y_src, y_pre, u_src, v_src, u_pre, v_pre, width,
-                               height, ss_x, ss_y, filter_strength,
-                               filter_weight, use_32x32, &y_accum_ref,
-                               &y_count_ref, &u_accum_ref, &u_count_ref,
-                               &v_accum_ref, &v_count_ref);
-          ASM_REGISTER_STATE_CHECK(filter_func_(
-              y_src.TopLeftPixel(), y_src.stride(), y_pre.TopLeftPixel(),
-              y_pre.stride(), u_src.TopLeftPixel(), v_src.TopLeftPixel(),
-              u_src.stride(), u_pre.TopLeftPixel(), v_pre.TopLeftPixel(),
-              u_pre.stride(), width, height, ss_x, ss_y, filter_strength,
-              filter_weight, use_32x32, y_accum_tst.TopLeftPixel(),
-              y_count_tst.TopLeftPixel(), u_accum_tst.TopLeftPixel(),
-              u_count_tst.TopLeftPixel(), v_accum_tst.TopLeftPixel(),
-              v_count_tst.TopLeftPixel()));
-
-          EXPECT_TRUE(y_accum_tst.CheckValues(y_accum_ref));
-          EXPECT_TRUE(y_count_tst.CheckValues(y_count_ref));
-          EXPECT_TRUE(u_accum_tst.CheckValues(u_accum_ref));
-          EXPECT_TRUE(u_count_tst.CheckValues(u_count_ref));
-          EXPECT_TRUE(v_accum_tst.CheckValues(v_accum_ref));
-          EXPECT_TRUE(v_count_tst.CheckValues(v_count_ref));
-
-          if (HasFailure()) {
-            printf("SS_X: %d, SS_Y: %d, Weight Idx: %d, Strength: %d\n", ss_x,
-                   ss_y, filter_idx, filter_strength);
-            y_accum_tst.PrintDifference(y_accum_ref);
-            y_count_tst.PrintDifference(y_count_ref);
-            u_accum_tst.PrintDifference(u_accum_ref);
-            u_count_tst.PrintDifference(u_count_ref);
-            v_accum_tst.PrintDifference(v_accum_ref);
-            v_count_tst.PrintDifference(v_count_ref);
-            return;
+          if (use_highbd_) {
+            const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+            CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                           adjusted_strength, use_32x32,
+                                           filter_weight);
+          } else {
+            CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                          filter_strength, use_32x32,
+                                          filter_weight);
           }
+
+          ASSERT_FALSE(HasFailure());
         }
       }
     }
@@ -483,115 +555,25 @@
 TEST_P(YUVTemporalFilterTest, SaturationTest) {
   const int width = 32, height = 32;
   const int use_32x32 = 1;
-
-  Buffer<uint8_t> y_src = Buffer<uint8_t>(width, height, 8);
-  Buffer<uint8_t> y_pre = Buffer<uint8_t>(width, height, 0);
-  Buffer<uint16_t> y_count_ref = Buffer<uint16_t>(width, height, 0);
-  Buffer<uint32_t> y_accum_ref = Buffer<uint32_t>(width, height, 0);
-  Buffer<uint16_t> y_count_tst = Buffer<uint16_t>(width, height, 0);
-  Buffer<uint32_t> y_accum_tst = Buffer<uint32_t>(width, height, 0);
-  ASSERT_TRUE(y_src.Init());
-  ASSERT_TRUE(y_pre.Init());
-  ASSERT_TRUE(y_count_ref.Init());
-  ASSERT_TRUE(y_accum_ref.Init());
-  ASSERT_TRUE(y_count_tst.Init());
-  ASSERT_TRUE(y_accum_tst.Init());
+  const int filter_weight = 1;
+  saturate_test_ = 1;
 
   for (int ss_x = 0; ss_x <= 1; ss_x++) {
     for (int ss_y = 0; ss_y <= 1; ss_y++) {
       for (int filter_strength = 0; filter_strength <= 6;
            filter_strength += 2) {
-        for (int filter_weight = 0; filter_weight <= 2; filter_weight++) {
-          const int uv_width = width >> ss_x, uv_height = height >> ss_y;
-          Buffer<uint8_t> u_src = Buffer<uint8_t>(uv_width, uv_height, 8);
-          Buffer<uint8_t> u_pre = Buffer<uint8_t>(uv_width, uv_height, 0);
-          Buffer<uint16_t> u_count_ref =
-              Buffer<uint16_t>(uv_width, uv_height, 0);
-          Buffer<uint32_t> u_accum_ref =
-              Buffer<uint32_t>(uv_width, uv_height, 0);
-          Buffer<uint16_t> u_count_tst =
-              Buffer<uint16_t>(uv_width, uv_height, 0);
-          Buffer<uint32_t> u_accum_tst =
-              Buffer<uint32_t>(uv_width, uv_height, 0);
-          ASSERT_TRUE(u_src.Init());
-          ASSERT_TRUE(u_pre.Init());
-          ASSERT_TRUE(u_count_ref.Init());
-          ASSERT_TRUE(u_accum_ref.Init());
-          ASSERT_TRUE(u_count_tst.Init());
-          ASSERT_TRUE(u_accum_tst.Init());
-          Buffer<uint8_t> v_src = Buffer<uint8_t>(uv_width, uv_height, 8);
-          Buffer<uint8_t> v_pre = Buffer<uint8_t>(uv_width, uv_height, 0);
-          Buffer<uint16_t> v_count_ref =
-              Buffer<uint16_t>(uv_width, uv_height, 0);
-          Buffer<uint32_t> v_accum_ref =
-              Buffer<uint32_t>(uv_width, uv_height, 0);
-          Buffer<uint16_t> v_count_tst =
-              Buffer<uint16_t>(uv_width, uv_height, 0);
-          Buffer<uint32_t> v_accum_tst =
-              Buffer<uint32_t>(uv_width, uv_height, 0);
-          ASSERT_TRUE(v_src.Init());
-          ASSERT_TRUE(v_pre.Init());
-          ASSERT_TRUE(v_count_ref.Init());
-          ASSERT_TRUE(v_accum_ref.Init());
-          ASSERT_TRUE(v_count_tst.Init());
-          ASSERT_TRUE(v_accum_tst.Init());
-
-          // The difference between the buffers must be small to pass the
-          // threshold to apply the filter.
-          y_src.Set(255);
-          y_pre.Set(0);
-          u_src.Set(255);
-          u_pre.Set(0);
-          v_src.Set(255);
-          v_pre.Set(0);
-
-          y_accum_ref.Set(rnd_.Rand8());
-          y_accum_tst.CopyFrom(y_accum_ref);
-          y_count_ref.Set(rnd_.Rand8());
-          y_count_tst.CopyFrom(y_count_ref);
-          u_accum_ref.Set(rnd_.Rand8());
-          u_accum_tst.CopyFrom(u_accum_ref);
-          u_count_ref.Set(rnd_.Rand8());
-          u_count_tst.CopyFrom(u_count_ref);
-          v_accum_ref.Set(rnd_.Rand8());
-          v_accum_tst.CopyFrom(v_accum_ref);
-          v_count_ref.Set(rnd_.Rand8());
-          v_count_tst.CopyFrom(v_count_ref);
-
-          ApplyReferenceFilter(y_src, y_pre, u_src, v_src, u_pre, v_pre, width,
-                               height, ss_x, ss_y, filter_strength,
-                               &filter_weight, use_32x32, &y_accum_ref,
-                               &y_count_ref, &u_accum_ref, &u_count_ref,
-                               &v_accum_ref, &v_count_ref);
-          ASM_REGISTER_STATE_CHECK(filter_func_(
-              y_src.TopLeftPixel(), y_src.stride(), y_pre.TopLeftPixel(),
-              y_pre.stride(), u_src.TopLeftPixel(), v_src.TopLeftPixel(),
-              u_src.stride(), u_pre.TopLeftPixel(), v_pre.TopLeftPixel(),
-              u_pre.stride(), width, height, ss_x, ss_y, filter_strength,
-              &filter_weight, use_32x32, y_accum_tst.TopLeftPixel(),
-              y_count_tst.TopLeftPixel(), u_accum_tst.TopLeftPixel(),
-              u_count_tst.TopLeftPixel(), v_accum_tst.TopLeftPixel(),
-              v_count_tst.TopLeftPixel()));
-
-          EXPECT_TRUE(y_accum_tst.CheckValues(y_accum_ref));
-          EXPECT_TRUE(y_count_tst.CheckValues(y_count_ref));
-          EXPECT_TRUE(u_accum_tst.CheckValues(u_accum_ref));
-          EXPECT_TRUE(u_count_tst.CheckValues(u_count_ref));
-          EXPECT_TRUE(v_accum_tst.CheckValues(v_accum_ref));
-          EXPECT_TRUE(v_count_tst.CheckValues(v_count_ref));
-
-          if (HasFailure()) {
-            printf("SS_X: %d, SS_Y: %d, Weight: %d, Strength: %d\n", ss_x, ss_y,
-                   filter_weight, filter_strength);
-            y_accum_tst.PrintDifference(y_accum_ref);
-            y_count_tst.PrintDifference(y_count_ref);
-            u_accum_tst.PrintDifference(u_accum_ref);
-            u_count_tst.PrintDifference(u_count_ref);
-            v_accum_tst.PrintDifference(v_accum_ref);
-            v_count_tst.PrintDifference(v_count_ref);
-            return;
-          }
+        if (use_highbd_) {
+          const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+          CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                         adjusted_strength, use_32x32,
+                                         &filter_weight);
+        } else {
+          CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                        filter_strength, use_32x32,
+                                        &filter_weight);
         }
+
+        ASSERT_FALSE(HasFailure());
       }
     }
   }
@@ -599,14 +581,7 @@
 
 TEST_P(YUVTemporalFilterTest, DISABLED_Speed) {
   const int width = 32, height = 32;
-  Buffer<uint8_t> y_src = Buffer<uint8_t>(width, height, 8);
-  Buffer<uint8_t> y_pre = Buffer<uint8_t>(width, height, 0);
-  Buffer<uint16_t> y_count = Buffer<uint16_t>(width, height, 0);
-  Buffer<uint32_t> y_accum = Buffer<uint32_t>(width, height, 0);
-  ASSERT_TRUE(y_src.Init());
-  ASSERT_TRUE(y_pre.Init());
-  ASSERT_TRUE(y_count.Init());
-  ASSERT_TRUE(y_accum.Init());
+  num_repeats_ = 1000;
 
   for (int use_32x32 = 0; use_32x32 <= 1; use_32x32++) {
     const int num_filter_weights = use_32x32 ? 3 : 3 * 3 * 3 * 3;
@@ -625,50 +600,17 @@
           // Test each parameter
           for (int filter_strength = 0; filter_strength <= 6;
                filter_strength += 2) {
-            const int uv_width = width >> ss_x, uv_height = height >> ss_y;
-            Buffer<uint8_t> u_src = Buffer<uint8_t>(uv_width, uv_height, 8);
-            Buffer<uint8_t> u_pre = Buffer<uint8_t>(uv_width, uv_height, 0);
-            Buffer<uint16_t> u_count = Buffer<uint16_t>(uv_width, uv_height, 0);
-            Buffer<uint32_t> u_accum = Buffer<uint32_t>(uv_width, uv_height, 0);
-            ASSERT_TRUE(u_src.Init());
-            ASSERT_TRUE(u_pre.Init());
-            ASSERT_TRUE(u_count.Init());
-            ASSERT_TRUE(u_accum.Init());
-            Buffer<uint8_t> v_src = Buffer<uint8_t>(uv_width, uv_height, 8);
-            Buffer<uint8_t> v_pre = Buffer<uint8_t>(uv_width, uv_height, 0);
-            Buffer<uint16_t> v_count = Buffer<uint16_t>(uv_width, uv_height, 0);
-            Buffer<uint32_t> v_accum = Buffer<uint32_t>(uv_width, uv_height, 0);
-            ASSERT_TRUE(v_src.Init());
-            ASSERT_TRUE(v_pre.Init());
-            ASSERT_TRUE(v_count.Init());
-            ASSERT_TRUE(v_accum.Init());
-
-            y_src.Set(&rnd_, 0, 7);
-            y_pre.Set(&rnd_, 0, 7);
-            u_src.Set(&rnd_, 0, 7);
-            u_pre.Set(&rnd_, 0, 7);
-            v_src.Set(&rnd_, 0, 7);
-            v_pre.Set(&rnd_, 0, 7);
-
-            y_accum.Set(0);
-            y_count.Set(0);
-            u_accum.Set(0);
-            u_count.Set(0);
-            v_accum.Set(0);
-            v_count.Set(0);
-
             vpx_usec_timer timer;
             vpx_usec_timer_start(&timer);
-            for (int num_calls = 0; num_calls < 1000; num_calls++) {
-              filter_func_(
-                  y_src.TopLeftPixel(), y_src.stride(), y_pre.TopLeftPixel(),
-                  y_pre.stride(), u_src.TopLeftPixel(), v_src.TopLeftPixel(),
-                  u_src.stride(), u_pre.TopLeftPixel(), v_pre.TopLeftPixel(),
-                  u_pre.stride(), width, height, ss_x, ss_y, filter_strength,
-                  filter_weight, use_32x32, y_accum.TopLeftPixel(),
-                  y_count.TopLeftPixel(), u_accum.TopLeftPixel(),
-                  u_count.TopLeftPixel(), v_accum.TopLeftPixel(),
-                  v_count.TopLeftPixel());
+
+            if (use_highbd_) {
+              RunTestFilterWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                               filter_strength, use_32x32,
+                                               filter_weight);
+            } else {
+              RunTestFilterWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                              filter_strength, use_32x32,
+                                              filter_weight);
             }
 
             vpx_usec_timer_mark(&timer);
@@ -676,9 +618,9 @@
                 static_cast<int>(vpx_usec_timer_elapsed(&timer));
 
             printf(
-                "Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: %d, Strength: "
-                "%d, Time: %5d\n",
-                use_32x32, ss_x, ss_y, filter_idx, filter_strength,
+                "Bitdepth: %d, Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: "
+                "%d, Strength: %d, Time: %5d\n",
+                bd_, use_32x32, ss_x, ss_y, filter_idx, filter_strength,
                 elapsed_time);
           }
         }
@@ -687,11 +629,46 @@
   }
 }
 
-INSTANTIATE_TEST_CASE_P(C, YUVTemporalFilterTest,
-                        ::testing::Values(&vp9_apply_temporal_filter_c));
+#if CONFIG_VP9_HIGHBITDEPTH
+#define WRAP_HIGHBD_FUNC(func, bd)                                            \
+  void wrap_##func##_##bd(                                                    \
+      const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,           \
+      int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,           \
+      int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,          \
+      int uv_pre_stride, unsigned int block_width, unsigned int block_height, \
+      int ss_x, int ss_y, int strength, const int *const blk_fw,              \
+      int use_32x32, uint32_t *y_accumulator, uint16_t *y_count,              \
+      uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator,    \
+      uint16_t *v_count) {                                                    \
+    func(reinterpret_cast<const uint16_t *>(y_src), y_src_stride,             \
+         reinterpret_cast<const uint16_t *>(y_pre), y_pre_stride,             \
+         reinterpret_cast<const uint16_t *>(u_src),                           \
+         reinterpret_cast<const uint16_t *>(v_src), uv_src_stride,            \
+         reinterpret_cast<const uint16_t *>(u_pre),                           \
+         reinterpret_cast<const uint16_t *>(v_pre), uv_pre_stride,            \
+         block_width, block_height, ss_x, ss_y, strength, blk_fw, use_32x32,  \
+         y_accumulator, y_count, u_accumulator, u_count, v_accumulator,       \
+         v_count);                                                            \
+  }
+
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 10);
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 12);
+
+INSTANTIATE_TEST_CASE_P(
+    C, YUVTemporalFilterTest,
+    ::testing::Values(
+        TemporalFilterWithBd(&vp9_apply_temporal_filter_c, 8),
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_10, 10),
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_12, 12)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, YUVTemporalFilterTest,
+    ::testing::Values(TemporalFilterWithBd(&vp9_apply_temporal_filter_c, 8)));
 
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_CASE_P(SSE4_1, YUVTemporalFilterTest,
-                        ::testing::Values(&vp9_apply_temporal_filter_sse4_1));
+                        ::testing::Values(TemporalFilterWithBd(
+                            &vp9_apply_temporal_filter_sse4_1, 8)));
 #endif  // HAVE_SSE4_1
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace

diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index adc2502..9aca0f2 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c

@@ -3950,6 +3950,7 @@
 
     if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
       if (vp8_drop_encodedframe_overshoot(cpi, Q)) return;
+      cpi->last_pred_err_mb = (int)(cpi->mb.prediction_error / cpi->common.MBs);
     }
 
     cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi);

diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 603de8b..5189d43 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h

@@ -510,6 +510,7 @@
 
   int force_maxqp;
   int frames_since_last_drop_overshoot;
+  int last_pred_err_mb;
 
   // GF update for 1 pass cbr.
   int gf_update_onepass_cbr;

diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index ce07a6f..d7badeb 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c

@@ -1484,7 +1484,8 @@
     if (cpi->drop_frames_allowed && pred_err_mb > (thresh_pred_err_mb << 4))
       thresh_rate = thresh_rate >> 3;
     if ((Q < thresh_qp && cpi->projected_frame_size > thresh_rate &&
-         pred_err_mb > thresh_pred_err_mb) ||
+         pred_err_mb > thresh_pred_err_mb &&
+         pred_err_mb > 2 * cpi->last_pred_err_mb) ||
         force_drop_overshoot) {
       unsigned int i;
       double new_correction_factor;

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 7e5e3c9..00c4414 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -183,14 +183,19 @@
 add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
 specialize qw/vp9_diamond_search_sad avx/;
 
+#
+# Apply temporal filter
+#
 if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
-add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count";
-specialize qw/vp9_temporal_filter_apply sse4_1/;
-
 add_proto qw/void vp9_apply_temporal_filter/, "const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count";
 specialize qw/vp9_apply_temporal_filter sse4_1/;
+
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vp9_highbd_apply_temporal_filter/, "const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count";
+  }
 }
 
+
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 
   # ENCODEMB INVOKE

diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index c3bca34..41072d5 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c

@@ -2768,6 +2768,7 @@
     int num_sbs = 1;
     const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows);
     const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2;
+    const int num_jobs = sb_rows << cm->log2_tile_cols;
 
     if (pbi->row_mt_worker_data == NULL) {
       CHECK_MEM_ERROR(cm, pbi->row_mt_worker_data,
@@ -2784,10 +2785,11 @@
       num_sbs = sb_cols * sb_rows;
     }
 
-    if (num_sbs > pbi->row_mt_worker_data->num_sbs) {
+    if (num_sbs > pbi->row_mt_worker_data->num_sbs ||
+        num_jobs > pbi->row_mt_worker_data->num_jobs) {
       vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data);
       vp9_dec_alloc_row_mt_mem(pbi->row_mt_worker_data, cm, num_sbs,
-                               pbi->max_threads, sb_rows << cm->log2_tile_cols);
+                               pbi->max_threads, num_jobs);
     }
     vp9_jobq_alloc(pbi);
   }

diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index a2a7424..ef8cd46 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c

@@ -479,7 +479,8 @@
   double weight_segment_target = 0;
   double weight_segment = 0;
   int thresh_low_motion = (cm->width < 720) ? 55 : 20;
-  int qp_thresh = VPXMIN(20, rc->best_quality << 1);
+  int qp_thresh = VPXMIN((cpi->oxcf.content == VP9E_CONTENT_SCREEN) ? 35 : 20,
+                         rc->best_quality << 1);
   cr->apply_cyclic_refresh = 1;
   if (frame_is_intra_only(cm) || cpi->svc.temporal_layer_id > 0 ||
       is_lossless_requested(&cpi->oxcf) ||

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 236567f..ee06b43 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -3481,9 +3481,9 @@
 }
 
 // Use a neural net model to prune partition-none and partition-split search.
-// The model uses prediction residue variance and quantization step size as
-// input features.
-#define FEATURES 6
+// Features used: QP; spatial block size contexts; variance of prediction
+// residue after simple_motion_search.
+#define FEATURES 12
 static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi,
                                           MACROBLOCK *const x,
                                           PC_TREE *const pc_tree,
@@ -3502,28 +3502,27 @@
   uint8_t *const pred_buf = pred_buffer;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   const int speed = cpi->oxcf.speed;
-  int i;
   float thresh = 0.0f;
 
   switch (bsize) {
     case BLOCK_64X64:
-      nn_config = &vp9_var_rd_part_nnconfig_64;
-      thresh = speed > 0 ? 3.5f : 3.0f;
+      nn_config = &vp9_part_split_nnconfig_64;
+      thresh = speed > 0 ? 2.8f : 3.0f;
       break;
     case BLOCK_32X32:
-      nn_config = &vp9_var_rd_part_nnconfig_32;
+      nn_config = &vp9_part_split_nnconfig_32;
       thresh = speed > 0 ? 3.5f : 3.0f;
       break;
     case BLOCK_16X16:
-      nn_config = &vp9_var_rd_part_nnconfig_16;
-      thresh = speed > 0 ? 3.5f : 4.0f;
+      nn_config = &vp9_part_split_nnconfig_16;
+      thresh = speed > 0 ? 3.8f : 4.0f;
       break;
     case BLOCK_8X8:
-      nn_config = &vp9_var_rd_part_nnconfig_8;
+      nn_config = &vp9_part_split_nnconfig_8;
       if (cm->width >= 720 && cm->height >= 720)
         thresh = speed > 0 ? 2.5f : 2.0f;
       else
-        thresh = speed > 0 ? 3.5f : 2.0f;
+        thresh = speed > 0 ? 3.8f : 2.0f;
       break;
     default: assert(0 && "Unexpected block size."); return;
   }
@@ -3542,6 +3541,7 @@
       ref_mv.row = ref_mv.col = 0;
     else
       ref_mv = pc_tree->mv;
+    vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
     simple_motion_search(cpi, x, bsize, mi_row, mi_col, ref_mv, ref, pred_buf);
     pc_tree->mv = x->e_mbd.mi[0]->mv[0].as_mv;
   }
@@ -3560,8 +3560,8 @@
     float score;
 
     // Generate model input features.
-    features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f);
-    vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
+    features[feature_idx++] = logf((float)dc_q + 1.0f);
+
     // Get the variance of the residue as input features.
     {
       const int bs = 4 * num_4x4_blocks_wide_lookup[bsize];
@@ -3575,7 +3575,19 @@
       const unsigned int var =
           cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
       const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+      const MACROBLOCKD *const xd = &x->e_mbd;
+      const int has_above = !!xd->above_mi;
+      const int has_left = !!xd->left_mi;
+      const BLOCK_SIZE above_bsize = has_above ? xd->above_mi->sb_type : bsize;
+      const BLOCK_SIZE left_bsize = has_left ? xd->left_mi->sb_type : bsize;
+      int i;
 
+      features[feature_idx++] = (float)has_above;
+      features[feature_idx++] = (float)b_width_log2_lookup[above_bsize];
+      features[feature_idx++] = (float)b_height_log2_lookup[above_bsize];
+      features[feature_idx++] = (float)has_left;
+      features[feature_idx++] = (float)b_width_log2_lookup[left_bsize];
+      features[feature_idx++] = (float)b_height_log2_lookup[left_bsize];
       features[feature_idx++] = logf((float)var + 1.0f);
       for (i = 0; i < 4; ++i) {
         const int x_idx = (i & 1) * bs / 2;
@@ -3604,7 +3616,6 @@
   }
 }
 #undef FEATURES
-#undef LABELS
 
 static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
                             int mi_col, int orig_rdmult) {

diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index b8c86ea..362077a 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c

@@ -6146,7 +6146,7 @@
     // TODO(angiebird): Consider subpixel when computing the sse.
     cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv),
                           pre.stride, &sse);
-    return (double)sse;
+    return (double)(sse << VP9_DIST_SCALE_LOG2);
   } else {
     assert(0);
     return 0;

diff --git a/vp9/encoder/vp9_partition_models.h b/vp9/encoder/vp9_partition_models.h
index 904d214..76e3b5d 100644
--- a/vp9/encoder/vp9_partition_models.h
+++ b/vp9/encoder/vp9_partition_models.h

@@ -966,175 +966,209 @@
 #undef FEATURES
 #endif  // CONFIG_ML_VAR_PARTITION
 
-#define FEATURES 6
+#define FEATURES 12
 #define LABELS 1
-static const float vp9_var_rd_part_nn_weights_64_layer0[FEATURES * 8] = {
-  -0.100129f, 0.128867f,  -1.375086f, -2.268096f, -1.470368f, -2.296274f,
-  0.034445f,  -0.062993f, -2.151904f, 0.523215f,  1.611269f,  1.530051f,
-  0.418182f,  -1.330239f, 0.828388f,  0.386546f,  -0.026188f, -0.055459f,
-  -0.474437f, 0.861295f,  -2.208743f, -0.652991f, -2.985873f, -1.728956f,
-  0.388052f,  -0.420720f, 2.015495f,  1.280342f,  3.040914f,  1.760749f,
-  -0.009062f, 0.009623f,  1.579270f,  -2.012891f, 1.629662f,  -1.796016f,
-  -0.279782f, -0.288359f, 1.875618f,  1.639855f,  0.903020f,  0.906438f,
-  0.553394f,  -1.621589f, 0.185063f,  0.605207f,  -0.133560f, 0.588689f,
+#define NODES 8
+static const float vp9_part_split_nn_weights_64_layer0[FEATURES * NODES] = {
+  -0.609728f, -0.409099f, -0.472449f, 0.183769f,  -0.457740f, 0.081089f,
+  0.171003f,  0.578696f,  -0.019043f, -0.856142f, 0.557369f,  -1.779424f,
+  -0.274044f, -0.320632f, -0.392531f, -0.359462f, -0.404106f, -0.288357f,
+  0.200620f,  0.038013f,  -0.430093f, 0.235083f,  -0.487442f, 0.424814f,
+  -0.232758f, -0.442943f, 0.229397f,  -0.540301f, -0.648421f, -0.649747f,
+  -0.171638f, 0.603824f,  0.468497f,  -0.421580f, 0.178840f,  -0.533838f,
+  -0.029471f, -0.076296f, 0.197426f,  -0.187908f, -0.003950f, -0.065740f,
+  0.085165f,  -0.039674f, -5.640702f, 1.909538f,  -1.434604f, 3.294606f,
+  -0.788812f, 0.196864f,  0.057012f,  -0.019757f, 0.336233f,  0.075378f,
+  0.081503f,  0.491864f,  -1.899470f, -1.764173f, -1.888137f, -1.762343f,
+  0.845542f,  0.202285f,  0.381948f,  -0.150996f, 0.556893f,  -0.305354f,
+  0.561482f,  -0.021974f, -0.703117f, 0.268638f,  -0.665736f, 1.191005f,
+  -0.081568f, -0.115653f, 0.272029f,  -0.140074f, 0.072683f,  0.092651f,
+  -0.472287f, -0.055790f, -0.434425f, 0.352055f,  0.048246f,  0.372865f,
+  0.111499f,  -0.338304f, 0.739133f,  0.156519f,  -0.594644f, 0.137295f,
+  0.613350f,  -0.165102f, -1.003731f, 0.043070f,  -0.887896f, -0.174202f,
 };
 
-static const float vp9_var_rd_part_nn_bias_64_layer0[8] = {
-  0.659717f, 0.120912f, 0.329894f, -1.586385f,
-  1.715839f, 0.085754f, 2.038774f, 0.268119f,
+static const float vp9_part_split_nn_bias_64_layer0[NODES] = {
+  1.182714f,  0.000000f,  0.902019f,  0.953115f,
+  -1.372486f, -1.288740f, -0.155144f, -3.041362f,
 };
 
-static const float vp9_var_rd_part_nn_weights_64_layer1[8 * LABELS] = {
-  -3.445586f, 2.375620f, 1.236970f, 0.804030f,
-  -2.448384f, 2.827254f, 2.291478f, 0.790252f,
+static const float vp9_part_split_nn_weights_64_layer1[NODES * LABELS] = {
+  0.841214f,  0.456016f,  0.869270f, 1.692999f,
+  -1.700494f, -0.911761f, 0.030111f, -1.447548f,
 };
 
-static const float vp9_var_rd_part_nn_bias_64_layer1[LABELS] = {
-  -1.16608453f,
+static const float vp9_part_split_nn_bias_64_layer1[LABELS] = {
+  1.17782545f,
 };
 
-static const NN_CONFIG vp9_var_rd_part_nnconfig_64 = {
+static const NN_CONFIG vp9_part_split_nnconfig_64 = {
   FEATURES,  // num_inputs
   LABELS,    // num_outputs
   1,         // num_hidden_layers
   {
-      8,
+      NODES,
   },  // num_hidden_nodes
   {
-      vp9_var_rd_part_nn_weights_64_layer0,
-      vp9_var_rd_part_nn_weights_64_layer1,
+      vp9_part_split_nn_weights_64_layer0,
+      vp9_part_split_nn_weights_64_layer1,
   },
   {
-      vp9_var_rd_part_nn_bias_64_layer0,
-      vp9_var_rd_part_nn_bias_64_layer1,
+      vp9_part_split_nn_bias_64_layer0,
+      vp9_part_split_nn_bias_64_layer1,
   },
 };
 
-static const float vp9_var_rd_part_nn_weights_32_layer0[FEATURES * 8] = {
-  0.022420f,  -0.032201f, 1.228065f,  -2.767655f, 1.928743f,  0.566863f,
-  0.459229f,  0.422048f,  0.833395f,  0.822960f,  -0.232227f, 0.586895f,
-  0.442856f,  -0.018564f, 0.227672f,  -1.291306f, 0.119428f,  -0.776563f,
-  -0.042947f, 0.183129f,  0.592231f,  1.174859f,  -0.503868f, 0.270102f,
-  -0.330537f, -0.036340f, 1.144630f,  1.783710f,  1.216929f,  2.038085f,
-  0.373782f,  -0.430258f, 1.957002f,  1.383908f,  2.012261f,  1.585693f,
-  -0.394399f, -0.337523f, -0.238335f, 0.007819f,  -0.368294f, 0.437875f,
-  -0.318923f, -0.242000f, 2.276263f,  1.501432f,  0.645706f,  0.344774f,
+static const float vp9_part_split_nn_weights_32_layer0[FEATURES * NODES] = {
+  -0.105488f, -0.218662f, 0.010980f,  -0.226979f, 0.028076f,  0.743430f,
+  0.789266f,  0.031907f,  -1.464200f, 0.222336f,  -1.068493f, -0.052712f,
+  -0.176181f, -0.102654f, -0.973932f, -0.182637f, -0.198000f, 0.335977f,
+  0.271346f,  0.133005f,  1.674203f,  0.689567f,  0.657133f,  0.283524f,
+  0.115529f,  0.738327f,  0.317184f,  -0.179736f, 0.403691f,  0.679350f,
+  0.048925f,  0.271338f,  -1.538921f, -0.900737f, -1.377845f, 0.084245f,
+  0.803122f,  -0.107806f, 0.103045f,  -0.023335f, -0.098116f, -0.127809f,
+  0.037665f,  -0.523225f, 1.622185f,  1.903999f,  1.358889f,  1.680785f,
+  0.027743f,  0.117906f,  -0.158810f, 0.057775f,  0.168257f,  0.062414f,
+  0.086228f,  -0.087381f, -3.066082f, 3.021855f,  -4.092155f, 2.550104f,
+  -0.230022f, -0.207445f, -0.000347f, 0.034042f,  0.097057f,  0.220088f,
+  -0.228841f, -0.029405f, -1.507174f, -1.455184f, 2.624904f,  2.643355f,
+  0.319912f,  0.585531f,  -1.018225f, -0.699606f, 1.026490f,  0.169952f,
+  -0.093579f, -0.142352f, -0.107256f, 0.059598f,  0.043190f,  0.507543f,
+  -0.138617f, 0.030197f,  0.059574f,  -0.634051f, -0.586724f, -0.148020f,
+  -0.334380f, 0.459547f,  1.620600f,  0.496850f,  0.639480f,  -0.465715f,
 };
 
-static const float vp9_var_rd_part_nn_bias_32_layer0[8] = {
-  -0.023846f, -1.348117f, 1.365007f,  -1.644164f,
-  0.062992f,  1.257980f,  -0.098642f, 1.388472f,
+static const float vp9_part_split_nn_bias_32_layer0[NODES] = {
+  -1.125885f, 0.753197f, -0.825808f, 0.004839f,
+  0.583920f,  0.718062f, 0.976741f,  0.796188f,
 };
 
-static const float vp9_var_rd_part_nn_weights_32_layer1[8 * LABELS] = {
-  3.016729f, 0.622684f,  -1.021302f, 1.490383f,
-  1.702046f, -2.964618f, 0.689045f,  1.711754f,
+static const float vp9_part_split_nn_weights_32_layer1[NODES * LABELS] = {
+  -0.458745f, 0.724624f, -0.479720f, -2.199872f,
+  1.162661f,  1.194153f, -0.716896f, 0.824080f,
 };
 
-static const float vp9_var_rd_part_nn_bias_32_layer1[LABELS] = {
-  -1.28798676f,
+static const float vp9_part_split_nn_bias_32_layer1[LABELS] = {
+  0.71644074f,
 };
 
-static const NN_CONFIG vp9_var_rd_part_nnconfig_32 = {
+static const NN_CONFIG vp9_part_split_nnconfig_32 = {
   FEATURES,  // num_inputs
   LABELS,    // num_outputs
   1,         // num_hidden_layers
   {
-      8,
+      NODES,
   },  // num_hidden_nodes
   {
-      vp9_var_rd_part_nn_weights_32_layer0,
-      vp9_var_rd_part_nn_weights_32_layer1,
+      vp9_part_split_nn_weights_32_layer0,
+      vp9_part_split_nn_weights_32_layer1,
   },
   {
-      vp9_var_rd_part_nn_bias_32_layer0,
-      vp9_var_rd_part_nn_bias_32_layer1,
+      vp9_part_split_nn_bias_32_layer0,
+      vp9_part_split_nn_bias_32_layer1,
   },
 };
 
-static const float vp9_var_rd_part_nn_weights_16_layer0[FEATURES * 8] = {
-  -0.726813f, -0.026748f, 1.376946f,  1.467961f,  1.961810f,  1.690412f,
-  0.596484f,  -0.261486f, -0.310905f, -0.366311f, -1.300086f, -0.534336f,
-  0.040520f,  -0.032391f, -1.194214f, 2.438063f,  -3.915334f, 1.997270f,
-  0.673696f,  -0.676393f, 1.654886f,  1.553838f,  1.129691f,  1.360201f,
-  0.255001f,  0.336442f,  -0.487759f, -0.634555f, 0.479170f,  -0.110475f,
-  -0.661852f, -0.158872f, -0.350243f, -0.303957f, -0.045018f, 0.586151f,
-  -0.262463f, 0.228079f,  -1.688776f, -1.594502f, -2.261078f, -1.802535f,
-  0.034748f,  -0.028476f, 2.713258f,  0.212446f,  -1.529202f, -2.560178f,
+static const float vp9_part_split_nn_weights_16_layer0[FEATURES * NODES] = {
+  -0.003629f, -0.046852f, 0.220428f,  -0.033042f, 0.049365f,  0.112818f,
+  -0.306149f, -0.005872f, 1.066947f,  -2.290226f, 2.159505f,  -0.618714f,
+  -0.213294f, 0.451372f,  -0.199459f, 0.223730f,  -0.321709f, 0.063364f,
+  0.148704f,  -0.293371f, 0.077225f,  -0.421947f, -0.515543f, -0.240975f,
+  -0.418516f, 1.036523f,  -0.009165f, 0.032484f,  1.086549f,  0.220322f,
+  -0.247585f, -0.221232f, -0.225050f, 0.993051f,  0.285907f,  1.308846f,
+  0.707456f,  0.335152f,  0.234556f,  0.264590f,  -0.078033f, 0.542226f,
+  0.057777f,  0.163471f,  0.039245f,  -0.725960f, 0.963780f,  -0.972001f,
+  0.252237f,  -0.192745f, -0.836571f, -0.460539f, -0.528713f, -0.160198f,
+  -0.621108f, 0.486405f,  -0.221923f, 1.519426f,  -0.857871f, 0.411595f,
+  0.947188f,  0.203339f,  0.174526f,  0.016382f,  0.256879f,  0.049818f,
+  0.057836f,  -0.659096f, 0.459894f,  0.174695f,  0.379359f,  0.062530f,
+  -0.210201f, -0.355788f, -0.208432f, -0.401723f, -0.115373f, 0.191336f,
+  -0.109342f, 0.002455f,  -0.078746f, -0.391871f, 0.149892f,  -0.239615f,
+  -0.520709f, 0.118568f,  -0.437975f, 0.118116f,  -0.565426f, -0.206446f,
+  0.113407f,  0.558894f,  0.534627f,  1.154350f,  -0.116833f, 1.723311f,
 };
 
-static const float vp9_var_rd_part_nn_bias_16_layer0[8] = {
-  0.495983f,  1.858545f, 0.162974f, 1.992247f,
-  -2.698863f, 0.110020f, 0.550830f, 0.420941f,
+static const float vp9_part_split_nn_bias_16_layer0[NODES] = {
+  0.013109f,  -0.034341f, 0.679845f,  -0.035781f,
+  -0.104183f, 0.098055f,  -0.041130f, 0.160107f,
 };
 
-static const float vp9_var_rd_part_nn_weights_16_layer1[8 * LABELS] = {
-  1.768409f, -1.394240f, 1.076846f,  -1.762808f,
-  1.517405f, 0.535195f,  -0.426827f, 1.002272f,
+static const float vp9_part_split_nn_weights_16_layer1[NODES * LABELS] = {
+  1.499564f, -0.403259f, 1.366532f, -0.469868f,
+  0.482227f, -2.076697f, 0.527691f, 0.540495f,
 };
 
-static const float vp9_var_rd_part_nn_bias_16_layer1[LABELS] = {
-  -1.65894794f,
+static const float vp9_part_split_nn_bias_16_layer1[LABELS] = {
+  0.01134653f,
 };
 
-static const NN_CONFIG vp9_var_rd_part_nnconfig_16 = {
+static const NN_CONFIG vp9_part_split_nnconfig_16 = {
   FEATURES,  // num_inputs
   LABELS,    // num_outputs
   1,         // num_hidden_layers
   {
-      8,
+      NODES,
   },  // num_hidden_nodes
   {
-      vp9_var_rd_part_nn_weights_16_layer0,
-      vp9_var_rd_part_nn_weights_16_layer1,
+      vp9_part_split_nn_weights_16_layer0,
+      vp9_part_split_nn_weights_16_layer1,
   },
   {
-      vp9_var_rd_part_nn_bias_16_layer0,
-      vp9_var_rd_part_nn_bias_16_layer1,
+      vp9_part_split_nn_bias_16_layer0,
+      vp9_part_split_nn_bias_16_layer1,
   },
 };
 
-static const float vp9_var_rd_part_nn_weights_8_layer0[FEATURES * 8] = {
-  -0.804900f, -1.214983f, 0.840202f, 0.686566f,  0.155804f,  0.025542f,
-  -1.244635f, -0.368403f, 0.364150f, 1.081073f,  0.552387f,  0.452715f,
-  0.652968f,  -0.293058f, 0.048967f, 0.021240f,  -0.662981f, 0.424700f,
-  0.008293f,  -0.013088f, 0.747007f, -1.453907f, -1.498226f, 1.593252f,
-  -0.239557f, -0.143766f, 0.064311f, 1.320998f,  -0.477411f, 0.026374f,
-  0.730884f,  -0.675124f, 0.965521f, 0.863658f,  0.809186f,  0.812280f,
-  0.513131f,  0.185102f,  0.211354f, 0.793666f,  0.121714f,  -0.015383f,
-  -0.650980f, -0.046581f, 0.911141f, 0.806319f,  0.974773f,  0.815893f,
+static const float vp9_part_split_nn_weights_8_layer0[FEATURES * NODES] = {
+  -0.668875f, -0.159078f, -0.062663f, -0.483785f, -0.146814f, -0.608975f,
+  -0.589145f, 0.203704f,  -0.051007f, -0.113769f, -0.477511f, -0.122603f,
+  -1.329890f, 1.403386f,  0.199636f,  -0.161139f, 2.182090f,  -0.014307f,
+  0.015755f,  -0.208468f, 0.884353f,  0.815920f,  0.632464f,  0.838225f,
+  1.369483f,  -0.029068f, 0.570213f,  -0.573546f, 0.029617f,  0.562054f,
+  -0.653093f, -0.211910f, -0.661013f, -0.384418f, -0.574038f, -0.510069f,
+  0.173047f,  -0.274231f, -1.044008f, -0.422040f, -0.810296f, 0.144069f,
+  -0.406704f, 0.411230f,  -0.144023f, 0.745651f,  -0.595091f, 0.111787f,
+  0.840651f,  0.030123f,  -0.242155f, 0.101486f,  -0.017889f, -0.254467f,
+  -0.285407f, -0.076675f, -0.549542f, -0.013544f, -0.686566f, -0.755150f,
+  1.623949f,  -0.286369f, 0.170976f,  0.016442f,  -0.598353f, -0.038540f,
+  0.202597f,  -0.933582f, 0.599510f,  0.362273f,  0.577722f,  0.477603f,
+  0.767097f,  0.431532f,  0.457034f,  0.223279f,  0.381349f,  0.033777f,
+  0.423923f,  -0.664762f, 0.385662f,  0.075744f,  0.182681f,  0.024118f,
+  0.319408f,  -0.528864f, 0.976537f,  -0.305971f, -0.189380f, -0.241689f,
+  -1.318092f, 0.088647f,  -0.109030f, -0.945654f, 1.082797f,  0.184564f,
 };
 
-static const float vp9_var_rd_part_nn_bias_8_layer0[8] = {
-  0.176134f, 0.651308f, 2.007761f,  0.068812f,
-  1.061517f, 1.487161f, -2.308147f, 1.099828f,
+static const float vp9_part_split_nn_bias_8_layer0[NODES] = {
+  -0.237472f, 2.051396f,  0.297062f, -0.730194f,
+  0.060472f,  -0.565959f, 0.560869f, -0.395448f,
 };
 
-static const float vp9_var_rd_part_nn_weights_8_layer1[8 * LABELS] = {
-  0.683032f, 1.326393f,  -1.661539f, 1.438920f,
-  1.118023f, -2.237380f, 1.518468f,  2.010416f,
+static const float vp9_part_split_nn_weights_8_layer1[NODES * LABELS] = {
+  0.568121f,  1.575915f,  -0.544309f, 0.751595f,
+  -0.117911f, -1.340730f, -0.739671f, 0.661216f,
 };
 
-static const float vp9_var_rd_part_nn_bias_8_layer1[LABELS] = {
-  -1.65423989f,
+static const float vp9_part_split_nn_bias_8_layer1[LABELS] = {
+  -0.63375306f,
 };
 
-static const NN_CONFIG vp9_var_rd_part_nnconfig_8 = {
+static const NN_CONFIG vp9_part_split_nnconfig_8 = {
   FEATURES,  // num_inputs
   LABELS,    // num_outputs
   1,         // num_hidden_layers
   {
-      8,
+      NODES,
   },  // num_hidden_nodes
   {
-      vp9_var_rd_part_nn_weights_8_layer0,
-      vp9_var_rd_part_nn_weights_8_layer1,
+      vp9_part_split_nn_weights_8_layer0,
+      vp9_part_split_nn_weights_8_layer1,
   },
   {
-      vp9_var_rd_part_nn_bias_8_layer0,
-      vp9_var_rd_part_nn_bias_8_layer1,
+      vp9_part_split_nn_bias_8_layer0,
+      vp9_part_split_nn_bias_8_layer1,
   },
 };
+#undef NODES
 #undef FEATURES
 #undef LABELS
 

diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 9df2eb3..e342250 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c

@@ -2271,7 +2271,7 @@
   RATE_CONTROL *const rc = &cpi->rc;
   int target;
   if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
-      rc->frames_to_key == 0) {
+      (cpi->oxcf.auto_key && rc->frames_to_key == 0)) {
     cm->frame_type = KEY_FRAME;
     rc->frames_to_key = cpi->oxcf.key_freq;
     rc->kf_boost = DEFAULT_KF_BOOST;

diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h
index fa85f21..062ca32 100644
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h

@@ -42,6 +42,9 @@
 #define RD_THRESH_MAX_FACT 64
 #define RD_THRESH_INC 1
 
+#define VP9_DIST_SCALE_LOG2 4
+#define VP9_DIST_SCALE (1 << VP9_DIST_SCALE_LOG2)
+
 // This enumerator type needs to be kept aligned with the mode order in
 // const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
 typedef enum {

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index c73b0ed..6f07269 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -273,9 +273,9 @@
   }
 
   *skip_txfm_sb = skip_flag;
-  *skip_sse_sb = total_sse << 4;
+  *skip_sse_sb = total_sse << VP9_DIST_SCALE_LOG2;
   *out_rate_sum = (int)rate_sum;
-  *out_dist_sum = dist_sum << 4;
+  *out_dist_sum = dist_sum << VP9_DIST_SCALE_LOG2;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH

diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index ee5f0e5..0b636b8 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c

@@ -205,28 +205,44 @@
   return mod;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int highbd_mod_index(int sum_dist, int index, int rounding,
+                                   int strength, int filter_weight) {
+  int mod = sum_dist * 3 / index;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = VPXMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static INLINE int get_filter_weight(unsigned int i, unsigned int j,
                                     unsigned int block_height,
                                     unsigned int block_width,
                                     const int *const blk_fw, int use_32x32) {
-  int filter_weight = 0;
-
-  if (use_32x32)
-    // blk_fw[0] ~ blk_fw[3] are the same.
+  // blk_fw[0] ~ blk_fw[3] are the same.
+  if (use_32x32) {
     return blk_fw[0];
+  }
 
   if (i < block_height / 2) {
-    if (j < block_width / 2)
-      filter_weight = blk_fw[0];
-    else
-      filter_weight = blk_fw[1];
-  } else {
-    if (j < block_width / 2)
-      filter_weight = blk_fw[2];
-    else
-      filter_weight = blk_fw[3];
+    if (j < block_width / 2) {
+      return blk_fw[0];
+    }
+
+    return blk_fw[1];
   }
-  return filter_weight;
+
+  if (j < block_width / 2) {
+    return blk_fw[2];
+  }
+
+  return blk_fw[3];
 }
 
 void vp9_apply_temporal_filter_c(
@@ -280,7 +296,7 @@
   for (i = 0, k = 0, m = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++) {
       const int pixel_value = y_pred[i * y_buf_stride + j];
-      int filter_weight =
+      const int filter_weight =
           get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
 
       // non-local mean approach
@@ -370,133 +386,152 @@
   }
 }
 
-// TODO(any): This function is not used anymore. Should be removed.
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride,
-                                 const uint8_t *frame2,
-                                 unsigned int block_width,
-                                 unsigned int block_height, int strength,
-                                 int filter_weight, uint32_t *accumulator,
-                                 uint16_t *count) {
-  unsigned int i, j, k;
-  int modifier;
-  int byte = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_apply_temporal_filter_c(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32,
+    uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
+    uint32_t *v_accum, uint16_t *v_count) {
+  const int uv_block_width = block_width >> ss_x;
+  const int uv_block_height = block_height >> ss_y;
+  const int y_diff_stride = BW;
+  const int uv_diff_stride = BW;
+
+  DECLARE_ALIGNED(16, uint32_t, y_diff_sse[BLK_PELS]);
+  DECLARE_ALIGNED(16, uint32_t, u_diff_sse[BLK_PELS]);
+  DECLARE_ALIGNED(16, uint32_t, v_diff_sse[BLK_PELS]);
+
   const int rounding = (1 << strength) >> 1;
 
-  assert(strength >= 0);
-  assert(strength <= 6);
+  // Loop variables
+  int row, col;
+  int uv_row, uv_col;
+  int row_step, col_step;
 
-  assert(filter_weight >= 0);
-  assert(filter_weight <= 2);
+  memset(y_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
+  memset(u_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
+  memset(v_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
 
-  for (i = 0, k = 0; i < block_height; i++) {
-    for (j = 0; j < block_width; j++, k++) {
-      int pixel_value = *frame2;
-
-      // non-local mean approach
-      int diff_sse[9] = { 0 };
-      int idx, idy, index = 0;
-
-      for (idy = -1; idy <= 1; ++idy) {
-        for (idx = -1; idx <= 1; ++idx) {
-          int row = (int)i + idy;
-          int col = (int)j + idx;
-
-          if (row >= 0 && row < (int)block_height && col >= 0 &&
-              col < (int)block_width) {
-            int diff = frame1[byte + idy * (int)stride + idx] -
-                       frame2[idy * (int)block_width + idx];
-            diff_sse[index] = diff * diff;
-            ++index;
-          }
-        }
-      }
-
-      assert(index > 0);
-
-      modifier = 0;
-      for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx];
-
-      modifier *= 3;
-      modifier /= index;
-
-      ++frame2;
-
-      modifier += rounding;
-      modifier >>= strength;
-
-      if (modifier > 16) modifier = 16;
-
-      modifier = 16 - modifier;
-      modifier *= filter_weight;
-
-      count[k] += modifier;
-      accumulator[k] += modifier * pixel_value;
-
-      byte++;
-    }
-
-    byte += stride - block_width;
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_temporal_filter_apply_c(
-    const uint8_t *frame1_8, unsigned int stride, const uint8_t *frame2_8,
-    unsigned int block_width, unsigned int block_height, int strength,
-    int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count) {
-  const uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);
-  const uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);
-  unsigned int i, j, k;
-  int modifier;
-  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
-
-  int diff_sse[BLK_PELS] = { 0 };
-  int this_idx = 0;
-
-  for (i = 0; i < block_height; i++) {
-    for (j = 0; j < block_width; j++) {
+  // Get the square diffs
+  for (row = 0; row < (int)block_height; row++) {
+    for (col = 0; col < (int)block_width; col++) {
       const int diff =
-          frame1[i * (int)stride + j] - frame2[i * (int)block_width + j];
-      diff_sse[this_idx++] = diff * diff;
+          y_src[row * y_src_stride + col] - y_pre[row * y_pre_stride + col];
+      y_diff_sse[row * y_diff_stride + col] = diff * diff;
     }
   }
 
-  modifier = 0;
-  for (i = 0, k = 0; i < block_height; i++) {
-    for (j = 0; j < block_width; j++, k++) {
-      int pixel_value = frame2[i * (int)block_width + j];
-      int filter_weight =
-          get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
+  for (row = 0; row < (int)uv_block_height; row++) {
+    for (col = 0; col < (int)uv_block_width; col++) {
+      const int u_diff =
+          u_src[row * uv_src_stride + col] - u_pre[row * uv_pre_stride + col];
+      const int v_diff =
+          v_src[row * uv_src_stride + col] - v_pre[row * uv_pre_stride + col];
+      u_diff_sse[row * uv_diff_stride + col] = u_diff * u_diff;
+      v_diff_sse[row * uv_diff_stride + col] = v_diff * v_diff;
+    }
+  }
 
-      int idx, idy, index = 0;
+  // Apply the filter to luma
+  for (row = 0; row < (int)block_height; row++) {
+    for (col = 0; col < (int)block_width; col++) {
+      const int uv_row = row >> ss_y;
+      const int uv_col = col >> ss_x;
+      const int filter_weight = get_filter_weight(
+          row, col, block_height, block_width, blk_fw, use_32x32);
 
-      for (idy = -1; idy <= 1; ++idy) {
-        for (idx = -1; idx <= 1; ++idx) {
-          int row = (int)i + idy;
-          int col = (int)j + idx;
+      // First we get the modifier for the current y pixel
+      const int y_pixel = y_pre[row * y_pre_stride + col];
+      int y_num_used = 0;
+      int y_mod = 0;
 
-          if (row >= 0 && row < (int)block_height && col >= 0 &&
-              col < (int)block_width) {
-            modifier += diff_sse[row * (int)block_width + col];
-            ++index;
+      // Sum the neighboring 3x3 y pixels
+      for (row_step = -1; row_step <= 1; row_step++) {
+        for (col_step = -1; col_step <= 1; col_step++) {
+          const int sub_row = row + row_step;
+          const int sub_col = col + col_step;
+
+          if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 &&
+              sub_col < (int)block_width) {
+            y_mod += y_diff_sse[sub_row * y_diff_stride + sub_col];
+            y_num_used++;
           }
         }
       }
-      assert(index > 0);
 
-      modifier *= 3;
-      modifier /= index;
+      // Sum the corresponding uv pixels to the current y modifier
+      // Note we are rounding down instead of rounding to the nearest pixel.
+      y_mod += u_diff_sse[uv_row * uv_diff_stride + uv_col];
+      y_mod += v_diff_sse[uv_row * uv_diff_stride + uv_col];
 
-      modifier += rounding;
-      modifier >>= strength;
+      y_num_used += 2;
 
-      if (modifier > 16) modifier = 16;
+      // Set the modifier
+      y_mod = highbd_mod_index(y_mod, y_num_used, rounding, strength,
+                               filter_weight);
 
-      modifier = 16 - modifier;
-      modifier *= filter_weight;
+      // Accumulate the result
+      y_count[row * block_width + col] += y_mod;
+      y_accum[row * block_width + col] += y_mod * y_pixel;
+    }
+  }
 
-      count[k] += modifier;
-      accumulator[k] += modifier * pixel_value;
+  // Apply the filter to chroma
+  for (uv_row = 0; uv_row < (int)uv_block_height; uv_row++) {
+    for (uv_col = 0; uv_col < (int)uv_block_width; uv_col++) {
+      const int y_row = uv_row << ss_y;
+      const int y_col = uv_col << ss_x;
+      const int filter_weight = get_filter_weight(
+          uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32);
+
+      const int u_pixel = u_pre[uv_row * uv_pre_stride + uv_col];
+      const int v_pixel = v_pre[uv_row * uv_pre_stride + uv_col];
+
+      int uv_num_used = 0;
+      int u_mod = 0, v_mod = 0;
+
+      // Sum the neighboring 3x3 chroma pixels to the chroma modifier
+      for (row_step = -1; row_step <= 1; row_step++) {
+        for (col_step = -1; col_step <= 1; col_step++) {
+          const int sub_row = uv_row + row_step;
+          const int sub_col = uv_col + col_step;
+
+          if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 &&
+              sub_col < uv_block_width) {
+            u_mod += u_diff_sse[sub_row * uv_diff_stride + sub_col];
+            v_mod += v_diff_sse[sub_row * uv_diff_stride + sub_col];
+            uv_num_used++;
+          }
+        }
+      }
+
+      // Sum all the luma pixels associated with the current chroma pixel
+      for (row_step = 0; row_step < 1 + ss_y; row_step++) {
+        for (col_step = 0; col_step < 1 + ss_x; col_step++) {
+          const int sub_row = y_row + row_step;
+          const int sub_col = y_col + col_step;
+          const int y_diff = y_diff_sse[sub_row * y_diff_stride + sub_col];
+
+          u_mod += y_diff;
+          v_mod += y_diff;
+          uv_num_used++;
+        }
+      }
+
+      // Set the modifier
+      u_mod = highbd_mod_index(u_mod, uv_num_used, rounding, strength,
+                               filter_weight);
+      v_mod = highbd_mod_index(v_mod, uv_num_used, rounding, strength,
+                               filter_weight);
+
+      // Accumulate the result
+      u_count[uv_row * uv_block_width + uv_col] += u_mod;
+      u_accum[uv_row * uv_block_width + uv_col] += u_mod * u_pixel;
+      v_count[uv_row * uv_block_width + uv_col] += v_mod;
+      v_accum[uv_row * uv_block_width + uv_col] += v_mod * v_pixel;
     }
   }
 }
@@ -752,7 +787,7 @@
         }
       }
 
-      if (blk_fw[0] || blk_fw[1] || blk_fw[2] || blk_fw[3]) {
+      if (blk_fw[0] | blk_fw[1] | blk_fw[2] | blk_fw[3]) {
         // Construct the predictors
         temporal_filter_predictors_mb_c(
             mbd, frames[frame]->y_buffer + mb_y_offset,
@@ -765,18 +800,17 @@
         if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
           int adj_strength = strength + 2 * (mbd->bd - 8);
           // Apply the filter (YUV)
-          vp9_highbd_temporal_filter_apply(
-              f->y_buffer + mb_y_offset, f->y_stride, predictor, BW, BH,
-              adj_strength, blk_fw, use_32x32, accumulator, count);
-          vp9_highbd_temporal_filter_apply(
-              f->u_buffer + mb_uv_offset, f->uv_stride, predictor + BLK_PELS,
-              mb_uv_width, mb_uv_height, adj_strength, blk_fw, use_32x32,
-              accumulator + BLK_PELS, count + BLK_PELS);
-          vp9_highbd_temporal_filter_apply(
-              f->v_buffer + mb_uv_offset, f->uv_stride,
-              predictor + (BLK_PELS << 1), mb_uv_width, mb_uv_height,
-              adj_strength, blk_fw, use_32x32, accumulator + (BLK_PELS << 1),
-              count + (BLK_PELS << 1));
+          vp9_highbd_apply_temporal_filter(
+              CONVERT_TO_SHORTPTR(f->y_buffer + mb_y_offset), f->y_stride,
+              CONVERT_TO_SHORTPTR(predictor), BW,
+              CONVERT_TO_SHORTPTR(f->u_buffer + mb_uv_offset),
+              CONVERT_TO_SHORTPTR(f->v_buffer + mb_uv_offset), f->uv_stride,
+              CONVERT_TO_SHORTPTR(predictor + BLK_PELS),
+              CONVERT_TO_SHORTPTR(predictor + (BLK_PELS << 1)), mb_uv_width, BW,
+              BH, mbd->plane[1].subsampling_x, mbd->plane[1].subsampling_y,
+              adj_strength, blk_fw, use_32x32, accumulator, count,
+              accumulator + BLK_PELS, count + BLK_PELS,
+              accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1));
         } else {
           // Apply the filter (YUV)
           vp9_apply_temporal_filter(

diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c
index b560e22..a97c96d 100644
--- a/vp9/encoder/x86/temporal_filter_sse4.c
+++ b/vp9/encoder/x86/temporal_filter_sse4.c

@@ -18,71 +18,6 @@
 #include "vp9/encoder/vp9_temporal_filter.h"
 #include "vp9/encoder/x86/temporal_filter_constants.h"
 
-// Load values from 'a' and 'b'. Compute the difference squared and sum
-// neighboring values such that:
-// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2
-// Values to the left and right of the row are set to 0.
-// The values are returned in sum_0 and sum_1 as *unsigned* 16 bit values.
-static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {
-  const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a);
-  const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b);
-
-  const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8);
-  const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8);
-
-  const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16);
-  const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16);
-
-  // Shift all the values one place to the left/right so we can efficiently sum
-  // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1].
-  const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2);
-  const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2);
-
-  // It becomes necessary to treat the values as unsigned at this point. The
-  // 255^2 fits in uint16_t but not int16_t. Use saturating adds from this point
-  // forward since the filter is only applied to smooth small pixel changes.
-  // Once the value has saturated to uint16_t it is well outside the useful
-  // range.
-  __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left);
-  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
-
-  *sum = sum_u16;
-}
-
-static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
-                   __m128i *sum_1) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a);
-  const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b);
-
-  const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8);
-  const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero);
-  const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8);
-  const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero);
-
-  const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16);
-  const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16);
-  const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16);
-  const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16);
-
-  __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2);
-  // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8].
-  __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2);
-
-  __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left);
-  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
-
-  *sum_0 = sum_u16;
-
-  shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14);
-  shift_right = _mm_srli_si128(diff_sq_1_u16, 2);
-
-  sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left);
-  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
-
-  *sum_1 = sum_u16;
-}
-
 // Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
 // difference squared, and store as unsigned 16-bit integer to dst.
 static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
@@ -312,148 +247,6 @@
   get_sum_8(y_dist + 8, sum_second);
 }
 
-void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
-                                      const uint8_t *b, unsigned int width,
-                                      unsigned int height, int strength,
-                                      int weight, uint32_t *accumulator,
-                                      uint16_t *count) {
-  unsigned int h;
-  const int rounding = (1 << strength) >> 1;
-
-  assert(strength >= 0);
-  assert(strength <= 6);
-
-  assert(weight >= 0);
-  assert(weight <= 2);
-
-  assert(width == 8 || width == 16);
-
-  if (width == 8) {
-    __m128i sum_row_a, sum_row_b, sum_row_c;
-    __m128i mul_constants = _mm_setr_epi16(
-        NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
-        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
-        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
-
-    sum_8(a, b, &sum_row_a);
-    sum_8(a + stride, b + width, &sum_row_b);
-    sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b);
-    sum_row_c =
-        average_8(sum_row_c, &mul_constants, strength, rounding, weight);
-    accumulate_and_store_8(sum_row_c, b, count, accumulator);
-
-    a += stride + stride;
-    b += width;
-    count += width;
-    accumulator += width;
-
-    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
-                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
-                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
-                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
-
-    for (h = 0; h < height - 2; ++h) {
-      sum_8(a, b + width, &sum_row_c);
-      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
-      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c);
-      sum_row_a =
-          average_8(sum_row_a, &mul_constants, strength, rounding, weight);
-      accumulate_and_store_8(sum_row_a, b, count, accumulator);
-
-      a += stride;
-      b += width;
-      count += width;
-      accumulator += width;
-
-      sum_row_a = sum_row_b;
-      sum_row_b = sum_row_c;
-    }
-
-    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
-                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
-                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
-                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
-    sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
-    sum_row_a =
-        average_8(sum_row_a, &mul_constants, strength, rounding, weight);
-    accumulate_and_store_8(sum_row_a, b, count, accumulator);
-
-  } else {  // width == 16
-    __m128i sum_row_a_0, sum_row_a_1;
-    __m128i sum_row_b_0, sum_row_b_1;
-    __m128i sum_row_c_0, sum_row_c_1;
-    __m128i mul_constants_0 = _mm_setr_epi16(
-                NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
-                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
-                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6),
-            mul_constants_1 = _mm_setr_epi16(
-                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
-                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
-                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
-
-    sum_16(a, b, &sum_row_a_0, &sum_row_a_1);
-    sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1);
-
-    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
-    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
-
-    average_16(&sum_row_c_0, &sum_row_c_1, &mul_constants_0, &mul_constants_1,
-               strength, rounding, weight);
-    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
-
-    a += stride + stride;
-    b += width;
-    count += width;
-    accumulator += width;
-
-    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
-                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
-                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
-                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9);
-    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
-                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
-                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
-                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
-    for (h = 0; h < height - 2; ++h) {
-      sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1);
-
-      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
-      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0);
-      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
-      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1);
-
-      average_16(&sum_row_a_0, &sum_row_a_1, &mul_constants_0, &mul_constants_1,
-                 strength, rounding, weight);
-      accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator);
-
-      a += stride;
-      b += width;
-      count += width;
-      accumulator += width;
-
-      sum_row_a_0 = sum_row_b_0;
-      sum_row_a_1 = sum_row_b_1;
-      sum_row_b_0 = sum_row_c_0;
-      sum_row_b_1 = sum_row_c_1;
-    }
-
-    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
-                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
-                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
-                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6);
-    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
-                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
-                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
-                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
-    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
-    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
-
-    average_16(&sum_row_c_0, &sum_row_c_1, &mul_constants_0, &mul_constants_1,
-               strength, rounding, weight);
-    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
-  }
-}
-
 // Read in a row of chroma values corresponds to a row of 16 luma values.
 static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
                                            const uint16_t *v_dist,
@@ -461,7 +254,7 @@
                                            __m128i *v_first,
                                            __m128i *v_second) {
   if (!ss_x) {
-    // If there is no chroma subsampling in the horizaontal direction, then we
+    // If there is no chroma subsampling in the horizontal direction, then we
     // need to load 16 entries from chroma.
     read_dist_16(u_dist, u_first, u_second);
     read_dist_16(v_dist, v_first, v_second);
@@ -481,8 +274,8 @@
   }
 }
 
-// Horizonta add unsigned 16-bit ints in src and store them as signed 32-bit int
-// in dst.
+// Horizontal add unsigned 16-bit ints in src and store them as signed 32-bit
+// int in dst.
 static INLINE void hadd_epu16(__m128i *src, __m128i *dst) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i shift_right = _mm_srli_si128(*src, 2);
@@ -530,7 +323,7 @@
 
 // Apply temporal filter to the luma components. This performs temporal
 // filtering on a luma block of 16 X block_height. Use blk_fw as an array of
-// size 4for the weights for each of the 4 subblocks if blk_fw is not NULL,
+// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL,
 // else use top_weight for top half, and bottom weight for bottom half.
 static void vp9_apply_temporal_filter_luma_16(
     const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
@@ -557,6 +350,9 @@
   __m128i sum_row_first;
   __m128i sum_row_second;
 
+  // Loop variables
+  unsigned int h;
+
   assert(strength >= 0);
   assert(strength <= 6);
 
@@ -615,7 +411,7 @@
   mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
   mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
 
-  for (unsigned int h = 1; h < block_height - 1; ++h) {
+  for (h = 1; h < block_height - 1; ++h) {
     // Move the weight to bottom half
     if (!use_whole_blk && h == block_height / 2) {
       if (blk_fw) {
@@ -847,6 +643,9 @@
 
   __m128i u_sum_row, v_sum_row;
 
+  // Loop variable
+  unsigned int h;
+
   (void)uv_block_width;
 
   // First row
@@ -897,7 +696,7 @@
   // Then all the rows except the last one
   mul = _mm_loadu_si128((const __m128i *)neighbors[1]);
 
-  for (unsigned int h = 1; h < uv_block_height - 1; ++h) {
+  for (h = 1; h < uv_block_height - 1; ++h) {
     // Move the weight pointer to the bottom half of the blocks
     if (h == uv_block_height / 2) {
       if (blk_fw) {
@@ -1143,6 +942,9 @@
   const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
   const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
 
+  // Loop variables
+  unsigned int row, blk_col;
+
   assert(block_width <= BW && "block width too large");
   assert(block_height <= BH && "block height too large");
   assert(block_width % 16 == 0 && "block width must be multiple of 16");
@@ -1160,8 +962,8 @@
       "subblock filter weight must be less than 2");
 
   // Precompute the difference sqaured
-  for (unsigned int row = 0; row < block_height; row++) {
-    for (unsigned int blk_col = 0; blk_col < block_width; blk_col += 16) {
+  for (row = 0; row < block_height; row++) {
+    for (blk_col = 0; blk_col < block_width; blk_col += 16) {
       store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
                     y_dist_ptr + blk_col);
     }
@@ -1170,8 +972,8 @@
     y_dist_ptr += DIST_STRIDE;
   }
 
-  for (unsigned int row = 0; row < chroma_height; row++) {
-    for (unsigned int blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+  for (row = 0; row < chroma_height; row++) {
+    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
       store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
                    u_dist_ptr + blk_col);
       store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,

diff --git a/vpx_dsp/ppc/fdct32x32_vsx.c b/vpx_dsp/ppc/fdct32x32_vsx.c
index 6110716..328b0e3 100644
--- a/vpx_dsp/ppc/fdct32x32_vsx.c
+++ b/vpx_dsp/ppc/fdct32x32_vsx.c

@@ -227,10 +227,11 @@
   int16x8_t temp0[32];  // Hold stages: 1, 4, 7
   int16x8_t temp1[32];  // Hold stages: 2, 5
   int16x8_t temp2[32];  // Hold stages: 3, 6
+  int i;
 
   // Stage 1
   // Unrolling this loops actually slows down Power9 benchmarks
-  for (int i = 0; i < 16; i++) {
+  for (i = 0; i < 16; i++) {
     temp0[i] = vec_add(in[i], in[31 - i]);
     // pass through to stage 3.
     temp1[i + 16] = vec_sub(in[15 - i], in[i + 16]);
@@ -238,7 +239,7 @@
 
   // Stage 2
   // Unrolling this loops actually slows down Power9 benchmarks
-  for (int i = 0; i < 8; i++) {
+  for (i = 0; i < 8; i++) {
     temp1[i] = vec_add(temp0[i], temp0[15 - i]);
     temp1[i + 8] = vec_sub(temp0[7 - i], temp0[i + 8]);
   }
@@ -461,7 +462,7 @@
                    &out[3]);
 
   if (pass == 0) {
-    for (int i = 0; i < 32; i++) {
+    for (i = 0; i < 32; i++) {
       out[i] = sub_round_shift(out[i]);
     }
   }
commit	ce4336c2ab60d185b431345987b2188511760e54	[log] [tgz]
author	Jerome Jiang <jianj@google.com>	Wed Feb 06 00:57:28 2019
committer	Gerrit Code Review <noreply-gerritcodereview@google.com>	Wed Feb 06 00:57:28 2019
tree	eece9df8a685d0c9560c07e7ee7e8b33683df0fc
parent	e05cea787872dc2f9ef163a62ca9021955730f71 [diff]
parent	a4525dccec0cbd23b507cb58ce6d8b24a3dd4559 [diff]