Merge "examples.mk,vpxdec: rm libwebm muxer dependency" into main
diff --git a/README b/README
index e360df0..9fa50038 100644
--- a/README
+++ b/README
@@ -64,9 +64,15 @@
     arm64-android-gcc
     arm64-darwin-gcc
     arm64-darwin20-gcc
+    arm64-darwin21-gcc
+    arm64-darwin22-gcc
     arm64-linux-gcc
     arm64-win64-gcc
     arm64-win64-vs15
+    arm64-win64-vs16
+    arm64-win64-vs16-clangcl
+    arm64-win64-vs17
+    arm64-win64-vs17-clangcl
     armv7-android-gcc
     armv7-darwin-gcc
     armv7-linux-rvct
@@ -77,6 +83,8 @@
     armv7-win32-vs15
     armv7s-darwin-gcc
     armv8-linux-gcc
+    loongarch32-linux-gcc
+    loongarch64-linux-gcc
     mips32-linux-gcc
     mips64-linux-gcc
     ppc64le-linux-gcc
@@ -117,6 +125,8 @@
     x86_64-darwin18-gcc
     x86_64-darwin19-gcc
     x86_64-darwin20-gcc
+    x86_64-darwin21-gcc
+    x86_64-darwin22-gcc
     x86_64-iphonesimulator-gcc
     x86_64-linux-gcc
     x86_64-linux-icc
diff --git a/build/make/Makefile b/build/make/Makefile
index 5c38c18..65ac229 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -304,6 +304,19 @@
 	$(qexec)$$(AR) $$(ARFLAGS) $$@ $$^
 endef
 
+# Don't use -Wl,-z,defs with Clang's sanitizers.
+#
+# Clang's AddressSanitizer documentation says "When linking shared libraries,
+# the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link
+# errors (don't use it with AddressSanitizer)." See
+# https://clang.llvm.org/docs/AddressSanitizer.html#usage.
+NO_UNDEFINED := -Wl,-z,defs
+ifeq ($(findstring clang,$(CC)),clang)
+    ifneq ($(filter -fsanitize=%,$(LDFLAGS)),)
+        NO_UNDEFINED :=
+    endif
+endif
+
 define so_template
 # Not using a pattern rule here because we don't want to generate empty
 # archives when they are listed as a dependency in files not responsible
@@ -313,7 +326,8 @@
 $(1):
 	$(if $(quiet),@echo "    [LD] $$@")
 	$(qexec)$$(LD) -shared $$(LDFLAGS) \
-            -Wl,--no-undefined -Wl,-soname,$$(SONAME) \
+            $(NO_UNDEFINED) \
+            -Wl,-soname,$$(SONAME) \
             -Wl,--version-script,$$(EXPORTS_FILE) -o $$@ \
             $$(filter %.o,$$^) $$(extralibs)
 endef
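
(Context for the hunk above: the comment explains why -Wl,-z,defs is dropped for sanitizer builds. A minimal C illustration, assuming an AddressSanitizer shared-library build; not part of the change:)

    /* sanitized.c -- sketch; build with:
     *   clang -fsanitize=address -fPIC -shared sanitized.c -o libsan.so -Wl,-z,defs
     * ASan instruments the load below with calls into its runtime
     * (e.g. __asan_report_load4). Clang leaves that runtime out of
     * shared objects, so -Wl,-z,defs rejects the link with undefined
     * references even though the code is fine; without the flag the
     * symbols resolve when an executable loads the library. */
    int first_element(const int *p) { return p[0]; } /* instrumented load */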
diff --git a/build/make/configure.sh b/build/make/configure.sh
index ec9af5e..6fd67f1 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -521,6 +521,7 @@
 EXE_SFX   = ${EXE_SFX}
 VCPROJ_SFX = ${VCPROJ_SFX}
 RTCD_OPTIONS = ${RTCD_OPTIONS}
+LIBWEBM_CXXFLAGS = ${LIBWEBM_CXXFLAGS}
 LIBYUV_CXXFLAGS = ${LIBYUV_CXXFLAGS}
 EOF
 
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index 58bb66b..1e1db05 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -141,7 +141,17 @@
     case "$opt" in
         --help|-h) show_help
         ;;
-        --target=*) target="${optval}"
+        --target=*)
+            target="${optval}"
+            platform_toolset=$(echo ${target} | awk 'BEGIN{FS="-"}{print $4}')
+            case "$platform_toolset" in
+                clangcl) platform_toolset="ClangCl"
+                ;;
+                "")
+                ;;
+                *) die Unrecognized Visual Studio Platform Toolset in $opt
+                ;;
+            esac
         ;;
         --out=*) outfile="$optval"
         ;;
@@ -259,6 +269,10 @@
     ;;
     arm64*)
         platforms[0]="ARM64"
+        # As of Visual Studio 2022 17.5.5, clang-cl does not support ARM64EC.
+        if [ "$vs_ver" -ge 17 -a "$platform_toolset" != "ClangCl" ]; then
+            platforms[1]="ARM64EC"
+        fi
         asm_Debug_cmdline="armasm64 -nologo -oldit "%(FullPath)""
         asm_Release_cmdline="armasm64 -nologo -oldit "%(FullPath)""
     ;;
@@ -335,17 +349,21 @@
             else
                 tag_content ConfigurationType StaticLibrary
             fi
-            if [ "$vs_ver" = "14" ]; then
-                tag_content PlatformToolset v140
-            fi
-            if [ "$vs_ver" = "15" ]; then
-                tag_content PlatformToolset v141
-            fi
-            if [ "$vs_ver" = "16" ]; then
-                tag_content PlatformToolset v142
-            fi
-            if [ "$vs_ver" = "17" ]; then
-                tag_content PlatformToolset v143
+            if [ -n "$platform_toolset" ]; then
+                tag_content PlatformToolset "$platform_toolset"
+            else
+                if [ "$vs_ver" = "14" ]; then
+                    tag_content PlatformToolset v140
+                fi
+                if [ "$vs_ver" = "15" ]; then
+                    tag_content PlatformToolset v141
+                fi
+                if [ "$vs_ver" = "16" ]; then
+                    tag_content PlatformToolset v142
+                fi
+                if [ "$vs_ver" = "17" ]; then
+                    tag_content PlatformToolset v143
+                fi
             fi
             tag_content CharacterSet Unicode
             if [ "$config" = "Release" ]; then
diff --git a/configure b/configure
index 2070772..b73436b 100755
--- a/configure
+++ b/configure
@@ -106,7 +106,9 @@
 all_platforms="${all_platforms} arm64-win64-gcc"
 all_platforms="${all_platforms} arm64-win64-vs15"
 all_platforms="${all_platforms} arm64-win64-vs16"
+all_platforms="${all_platforms} arm64-win64-vs16-clangcl"
 all_platforms="${all_platforms} arm64-win64-vs17"
+all_platforms="${all_platforms} arm64-win64-vs17-clangcl"
 all_platforms="${all_platforms} armv7-android-gcc"   #neon Cortex-A8
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
@@ -647,6 +649,7 @@
         check_add_cflags -Wimplicit-function-declaration
         check_add_cflags -Wmissing-declarations
         check_add_cflags -Wmissing-prototypes
+        check_add_cflags -Wshadow
         check_add_cflags -Wuninitialized
         check_add_cflags -Wunreachable-code-loop-increment
         check_add_cflags -Wunused
@@ -677,13 +680,16 @@
         check_add_cxxflags -Wc++17-extensions
         check_add_cxxflags -Wc++20-extensions
 
-        # disable some warnings specific to libyuv.
+        # disable some warnings specific to libyuv / libwebm.
         check_cxxflags -Wno-missing-declarations \
           && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations"
         check_cxxflags -Wno-missing-prototypes \
           && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes"
         check_cxxflags -Wno-pass-failed \
           && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-pass-failed"
+        check_cxxflags -Wno-shadow \
+          && LIBWEBM_CXXFLAGS="${LIBWEBM_CXXFLAGS} -Wno-shadow" \
+          && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-shadow"
         check_cxxflags -Wno-unused-parameter \
           && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter"
     fi
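
(For reference, a minimal case that the newly enabled -Wshadow reports - and the kind of pattern in the bundled libyuv/libwebm sources that motivates the -Wno-shadow carve-out; illustrative only:)

    int sum_to(int n) {
      int total = 0;
      for (int i = 0; i < n; ++i) {
        int total = i; /* shadows the outer `total`; -Wshadow warns here */
        (void)total;
      }
      return total;
    }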
diff --git a/examples.mk b/examples.mk
index 9f83230..22726a3 100644
--- a/examples.mk
+++ b/examples.mk
@@ -57,6 +57,7 @@
 # Add compile flags and include path for libwebm sources.
 ifeq ($(CONFIG_WEBM_IO),yes)
   CXXFLAGS     += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
+  $(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS)
   INC_PATH-yes += $(SRC_PATH_BARE)/third_party/libwebm
 endif
 
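(The added line is a target-specific variable assignment: only objects compiled from third_party/libwebm/*.cc receive the relaxed LIBWEBM_CXXFLAGS set up in configure above, while the project's own C++ keeps the stricter flag set. test/test.mk below repeats the same pattern for the test build.)
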
diff --git a/libs.mk b/libs.mk
index 1411fee..f6f6cc9 100644
--- a/libs.mk
+++ b/libs.mk
@@ -178,6 +178,7 @@
 INSTALL-LIBS-yes += include/vpx/vpx_integer.h
 INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h
 INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h
+INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_tpl.h
 ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
 ifeq ($(CONFIG_MSVS),yes)
 INSTALL-LIBS-yes                  += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB).lib)
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc
index f747c35..d8fabd5 100644
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -81,11 +81,11 @@
         // Only the reference buffer may have a stride not equal to width.
         Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
         ASSERT_TRUE(ref.Init());
-        Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
+        Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
         ASSERT_TRUE(pred.Init());
-        Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16);
+        Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 32);
         ASSERT_TRUE(avg_ref.Init());
-        Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16);
+        Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 32);
         ASSERT_TRUE(avg_chk.Init());
         const int bitdepth_mask = (1 << bitdepth) - 1;
         for (int h = 0; h < height; ++h) {
@@ -121,11 +121,11 @@
   const int height = 32;
   Buffer<Pixel> ref = Buffer<Pixel>(width, height, 8);
   ASSERT_TRUE(ref.Init());
-  Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
+  Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
   ASSERT_TRUE(pred.Init());
-  Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16);
+  Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 32);
   ASSERT_TRUE(avg_ref.Init());
-  Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16);
+  Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 32);
   ASSERT_TRUE(avg_chk.Init());
 
   for (int i = 0; i < 500; ++i) {
@@ -167,9 +167,9 @@
         const int height = 1 << height_pow;
         Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
         ASSERT_TRUE(ref.Init());
-        Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
+        Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
         ASSERT_TRUE(pred.Init());
-        Buffer<Pixel> avg = Buffer<Pixel>(width, height, 0, 16);
+        Buffer<Pixel> avg = Buffer<Pixel>(width, height, 0, 32);
         ASSERT_TRUE(avg.Init());
         const int bitdepth_mask = (1 << bitdepth) - 1;
         for (int h = 0; h < height; ++h) {
@@ -217,6 +217,11 @@
                          ::testing::Values(&vpx_comp_avg_pred_sse2));
 #endif  // HAVE_SSE2
 
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AvgPredTestLBD,
+                         ::testing::Values(&vpx_comp_avg_pred_avx2));
+#endif  // HAVE_AVX2
+
 #if HAVE_NEON
 INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTestLBD,
                          ::testing::Values(&vpx_comp_avg_pred_neon));
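
(Background for this file: the buffer alignment is raised from 16 to 32 bytes alongside the new AVX2 instantiation, since vpx_comp_avg_pred_avx2 operates on 256-bit vectors. A sketch of the failure mode being avoided, assuming a kernel that uses aligned 256-bit loads:)

    #include <immintrin.h>

    /* _mm256_load_si256 requires a 32-byte-aligned address; on a buffer
     * that is only 16-byte aligned it faults at runtime. The unaligned
     * form (_mm256_loadu_si256) would work but can cost extra cycles. */
    static __m256i load_pred_row(const unsigned char *pred /* 32B aligned */) {
      return _mm256_load_si256((const __m256i *)pred);
    }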
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index e435ed8..e8a044a 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -11,6 +11,7 @@
 #include <climits>
 #include <cstring>
 #include <initializer_list>
+#include <new>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/codec_factory.h"
@@ -20,7 +21,7 @@
 
 #include "./vpx_config.h"
 #include "vpx/vp8cx.h"
-#include "vpx/vpx_encoder.h"
+#include "vpx/vpx_tpl.h"
 
 namespace {
 
@@ -368,7 +369,7 @@
     : public ::libvpx_test::EncoderTest,
       public ::testing::TestWithParam<const libvpx_test::CodecFactory *> {
  public:
-  EncodeApiGetTplStatsTest() : EncoderTest(GetParam()) {}
+  EncodeApiGetTplStatsTest() : EncoderTest(GetParam()), test_io_(false) {}
   ~EncodeApiGetTplStatsTest() override {}
 
  protected:
@@ -384,36 +385,82 @@
     }
   }
 
-  vpx_codec_err_t AllocateTplList(VpxTplFrameStats **data) {
-    // Allocate MAX_ARF_GOP_SIZE * sizeof(VpxTplFrameStats) that will be filled
-    // by VP9E_GET_TPL_STATS
-    *data =
+  vpx_codec_err_t AllocateTplList(VpxTplGopStats *data) {
+    // Allocate MAX_ARF_GOP_SIZE (50) * sizeof(VpxTplFrameStats) that will be
+    // filled by VP9E_GET_TPL_STATS.
+    // MAX_ARF_GOP_SIZE is used here because the test doesn't know the size of
+    // each GOP before getting TPL stats from the encoder.
+    data->size = 50;
+    data->frame_stats_list =
         static_cast<VpxTplFrameStats *>(calloc(50, sizeof(VpxTplFrameStats)));
-    if (*data == nullptr) return VPX_CODEC_MEM_ERROR;
+    if (data->frame_stats_list == nullptr) return VPX_CODEC_MEM_ERROR;
     return VPX_CODEC_OK;
   }
 
+  void CompareTplGopStats(const VpxTplGopStats &ref_gop_stats,
+                          const VpxTplGopStats &test_gop_stats) {
+    ASSERT_EQ(ref_gop_stats.size, test_gop_stats.size);
+    for (int frame = 0; frame < ref_gop_stats.size; frame++) {
+      const VpxTplFrameStats &ref_frame_stats =
+          ref_gop_stats.frame_stats_list[frame];
+      const VpxTplFrameStats &test_frame_stats =
+          test_gop_stats.frame_stats_list[frame];
+      ASSERT_EQ(ref_frame_stats.num_blocks, test_frame_stats.num_blocks);
+      ASSERT_EQ(ref_frame_stats.frame_width, test_frame_stats.frame_width);
+      ASSERT_EQ(ref_frame_stats.frame_height, test_frame_stats.frame_height);
+      for (int block = 0; block < ref_frame_stats.num_blocks; block++) {
+        const VpxTplBlockStats &ref_block_stats =
+            ref_frame_stats.block_stats_list[block];
+        const VpxTplBlockStats &test_block_stats =
+            test_frame_stats.block_stats_list[block];
+        ASSERT_EQ(ref_block_stats.inter_cost, test_block_stats.inter_cost);
+        ASSERT_EQ(ref_block_stats.intra_cost, test_block_stats.intra_cost);
+        ASSERT_EQ(ref_block_stats.mv_c, test_block_stats.mv_c);
+        ASSERT_EQ(ref_block_stats.mv_r, test_block_stats.mv_r);
+        ASSERT_EQ(ref_block_stats.recrf_dist, test_block_stats.recrf_dist);
+        ASSERT_EQ(ref_block_stats.recrf_rate, test_block_stats.recrf_rate);
+        ASSERT_EQ(ref_block_stats.ref_frame_index,
+                  test_block_stats.ref_frame_index);
+      }
+    }
+  }
+
   void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override {
     ::libvpx_test::CxDataIterator iter = encoder->GetCxData();
     while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
       switch (pkt->kind) {
         case VPX_CODEC_CX_FRAME_PKT: {
-          VpxTplFrameStats *tpl_stats = NULL;
+          VpxTplGopStats tpl_stats;
           EXPECT_EQ(AllocateTplList(&tpl_stats), VPX_CODEC_OK);
-          encoder->Control(VP9E_GET_TPL_STATS, tpl_stats);
+          encoder->Control(VP9E_GET_TPL_STATS, &tpl_stats);
           bool stats_not_all_zero = false;
-          for (unsigned int i = 0; i < cfg_.g_lag_in_frames; i++) {
-            if (tpl_stats[i].frame_width != 0) {
-              ASSERT_EQ(tpl_stats[i].frame_width, width_);
-              ASSERT_EQ(tpl_stats[i].frame_height, height_);
-              ASSERT_GT(tpl_stats[i].num_blocks, 0);
-              ASSERT_NE(tpl_stats[i].block_stats_list, nullptr);
+          for (int i = 0; i < tpl_stats.size; i++) {
+            VpxTplFrameStats *frame_stats_list = tpl_stats.frame_stats_list;
+            if (frame_stats_list[i].frame_width != 0) {
+              ASSERT_EQ(frame_stats_list[i].frame_width, width_);
+              ASSERT_EQ(frame_stats_list[i].frame_height, height_);
+              ASSERT_GT(frame_stats_list[i].num_blocks, 0);
+              ASSERT_NE(frame_stats_list[i].block_stats_list, nullptr);
               stats_not_all_zero = true;
             }
           }
           ASSERT_TRUE(stats_not_all_zero);
-          // Free the memory right away now as this is only a test.
-          free(tpl_stats);
+          if (test_io_ && tpl_stats.size > 0) {
+            libvpx_test::TempOutFile *temp_out_file =
+                new (std::nothrow) libvpx_test::TempOutFile("w+");
+            ASSERT_NE(temp_out_file, nullptr);
+            ASSERT_NE(temp_out_file->file(), nullptr);
+            vpx_write_tpl_gop_stats(temp_out_file->file(), &tpl_stats);
+            rewind(temp_out_file->file());
+            VpxTplGopStats gop_stats_io;
+            ASSERT_EQ(
+                vpx_read_tpl_gop_stats(temp_out_file->file(), &gop_stats_io),
+                VPX_CODEC_OK);
+            CompareTplGopStats(gop_stats_io, tpl_stats);
+            vpx_free_tpl_gop_stats(&gop_stats_io);
+            delete temp_out_file;
+          }
+          free(tpl_stats.frame_stats_list);
           break;
         }
         default: break;
@@ -423,6 +470,7 @@
 
   int width_;
   int height_;
+  bool test_io_;
 };
 
 TEST_P(EncodeApiGetTplStatsTest, GetTplStats) {
@@ -430,7 +478,17 @@
   width_ = 352;
   height_ = 288;
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", width_,
-                                       height_, 30, 1, 0, 150);
+                                       height_, 30, 1, 0, 50);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(EncodeApiGetTplStatsTest, GetTplStatsIO) {
+  cfg_.g_lag_in_frames = 25;
+  width_ = 352;
+  height_ = 288;
+  test_io_ = true;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", width_,
+                                       height_, 30, 1, 0, 50);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
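(The new GetTplStatsIO test drives the TPL serialization entry points declared in vpx/vpx_tpl.h. A condensed C sketch of the round trip it performs, error handling elided:)

    #include <stdio.h>
    #include "vpx/vpx_tpl.h"

    static void tpl_roundtrip(VpxTplGopStats *stats, FILE *f) {
      VpxTplGopStats copy;
      vpx_write_tpl_gop_stats(f, stats); /* serialize one GOP's stats */
      rewind(f);
      vpx_read_tpl_gop_stats(f, &copy);  /* allocates copy.frame_stats_list */
      /* ... field-by-field comparison, as CompareTplGopStats() above ... */
      vpx_free_tpl_gop_stats(&copy);     /* release what the read allocated */
    }
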
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index a5cd830..165fcfa 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -19,7 +19,7 @@
 #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
 #include "vpx/vp8cx.h"
 #endif
-#include "vpx/vpx_encoder.h"
+#include "vpx/vpx_tpl.h"
 
 namespace libvpx_test {
 
@@ -154,7 +154,7 @@
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
   }
 
-  void Control(int ctrl_id, VpxTplFrameStats *arg) {
+  void Control(int ctrl_id, VpxTplGopStats *arg) {
     const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
   }
diff --git a/test/test.mk b/test/test.mk
index bbcdd0c..b64e89b 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -85,6 +85,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.h
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += webm_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_skip_loopfilter_test.cc
+$(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS)
 endif
 
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += decode_api_test.cc
diff --git a/test/video_source.h b/test/video_source.h
index a10ff6f..5ed99d0 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -64,7 +64,7 @@
   return fopen(path_to_source.c_str(), "rb");
 }
 
-static FILE *GetTempOutFile(std::string *file_name) {
+static FILE *GetTempOutFile(std::string *file_name, const char *io_mode) {
   file_name->clear();
 #if defined(_WIN32)
   char fname[MAX_PATH];
@@ -73,7 +73,7 @@
     // Assume for now that the filename generated is unique per process
     if (GetTempFileNameA(tmppath, "lvx", 0, fname)) {
       file_name->assign(fname);
-      return fopen(fname, "wb+");
+      return fopen(fname, io_mode);
     }
   }
   return nullptr;
@@ -94,13 +94,16 @@
   const int fd = mkstemp(temp_file_name.get());
   if (fd == -1) return nullptr;
   *file_name = temp_file_name.get();
-  return fdopen(fd, "wb+");
+  return fdopen(fd, io_mode);
 #endif
 }
 
 class TempOutFile {
  public:
-  TempOutFile() { file_ = GetTempOutFile(&file_name_); }
+  TempOutFile() { file_ = GetTempOutFile(&file_name_, "wb+"); }
+  TempOutFile(const char *io_mode) {
+    file_ = GetTempOutFile(&file_name_, io_mode);
+  }
   ~TempOutFile() {
     CloseFile();
     if (!file_name_.empty()) {
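
(The extra constructor lets a test choose the stdio mode: the TPL test above opens its scratch file as TempOutFile("w+") so it can rewind and re-read what it wrote, while the default constructor preserves the original "wb+" behavior for existing callers.)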
diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h
index 7cb3c98..cc85b9a 100644
--- a/vp8/common/mips/msa/vp8_macros_msa.h
+++ b/vp8/common/mips/msa/vp8_macros_msa.h
@@ -40,160 +40,160 @@
 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
 
 #if (__mips_isa_rev >= 6)
-#define LW(psrc)                                     \
-  ({                                                 \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
-    uint32_t val_m;                                  \
-                                                     \
-    asm volatile("lw  %[val_m],  %[psrc_m]  \n\t"    \
-                                                     \
-                 : [val_m] "=r"(val_m)               \
-                 : [psrc_m] "m"(*psrc_m));           \
-                                                     \
-    val_m;                                           \
+#define LW(psrc)                                        \
+  ({                                                    \
+    const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \
+    uint32_t lw_val_m;                                  \
+                                                        \
+    asm volatile("lw  %[lw_val_m],  %[lw_psrc_m]  \n\t" \
+                                                        \
+                 : [lw_val_m] "=r"(lw_val_m)            \
+                 : [lw_psrc_m] "m"(*lw_psrc_m));        \
+                                                        \
+    lw_val_m;                                           \
   })
 
 #if (__mips == 64)
-#define LD(psrc)                                     \
-  ({                                                 \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
-    uint64_t val_m = 0;                              \
-                                                     \
-    asm volatile("ld  %[val_m],  %[psrc_m]  \n\t"    \
-                                                     \
-                 : [val_m] "=r"(val_m)               \
-                 : [psrc_m] "m"(*psrc_m));           \
-                                                     \
-    val_m;                                           \
+#define LD(psrc)                                        \
+  ({                                                    \
+    const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \
+    uint64_t ld_val_m = 0;                              \
+                                                        \
+    asm volatile("ld  %[ld_val_m],  %[ld_psrc_m]  \n\t" \
+                                                        \
+                 : [ld_val_m] "=r"(ld_val_m)            \
+                 : [ld_psrc_m] "m"(*ld_psrc_m));        \
+                                                        \
+    ld_val_m;                                           \
   })
 #else  // !(__mips == 64)
-#define LD(psrc)                                            \
-  ({                                                        \
-    const uint8_t *psrc_ld = (const uint8_t *)(psrc);       \
-    uint32_t val0_m, val1_m;                                \
-    uint64_t val_m = 0;                                     \
-                                                            \
-    val0_m = LW(psrc_ld);                                   \
-    val1_m = LW(psrc_ld + 4);                               \
-                                                            \
-    val_m = (uint64_t)(val1_m);                             \
-    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
-    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           \
-                                                            \
-    val_m;                                                  \
+#define LD(psrc)                                                  \
+  ({                                                              \
+    const uint8_t *ld_psrc_m = (const uint8_t *)(psrc);           \
+    uint32_t ld_val0_m, ld_val1_m;                                \
+    uint64_t ld_val_m = 0;                                        \
+                                                                  \
+    ld_val0_m = LW(ld_psrc_m);                                    \
+    ld_val1_m = LW(ld_psrc_m + 4);                                \
+                                                                  \
+    ld_val_m = (uint64_t)(ld_val1_m);                             \
+    ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \
+    ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m);        \
+                                                                  \
+    ld_val_m;                                                     \
   })
 #endif  // (__mips == 64)
 
-#define SH(val, pdst)                             \
-  {                                               \
-    uint8_t *pdst_m = (uint8_t *)(pdst);          \
-    const uint16_t val_m = (val);                 \
-                                                  \
-    asm volatile("sh  %[val_m],  %[pdst_m]  \n\t" \
-                                                  \
-                 : [pdst_m] "=m"(*pdst_m)         \
-                 : [val_m] "r"(val_m));           \
+#define SH(val, pdst)                                   \
+  {                                                     \
+    uint8_t *sh_pdst_m = (uint8_t *)(pdst);             \
+    const uint16_t sh_val_m = (val);                    \
+                                                        \
+    asm volatile("sh  %[sh_val_m],  %[sh_pdst_m]  \n\t" \
+                                                        \
+                 : [sh_pdst_m] "=m"(*sh_pdst_m)         \
+                 : [sh_val_m] "r"(sh_val_m));           \
   }
 
-#define SW(val, pdst)                             \
-  {                                               \
-    uint8_t *pdst_m = (uint8_t *)(pdst);          \
-    const uint32_t val_m = (val);                 \
-                                                  \
-    asm volatile("sw  %[val_m],  %[pdst_m]  \n\t" \
-                                                  \
-                 : [pdst_m] "=m"(*pdst_m)         \
-                 : [val_m] "r"(val_m));           \
+#define SW(val, pdst)                                   \
+  {                                                     \
+    uint8_t *sw_pdst_m = (uint8_t *)(pdst);             \
+    const uint32_t sw_val_m = (val);                    \
+                                                        \
+    asm volatile("sw  %[sw_val_m],  %[sw_pdst_m]  \n\t" \
+                                                        \
+                 : [sw_pdst_m] "=m"(*sw_pdst_m)         \
+                 : [sw_val_m] "r"(sw_val_m));           \
   }
 
-#define SD(val, pdst)                             \
-  {                                               \
-    uint8_t *pdst_m = (uint8_t *)(pdst);          \
-    const uint64_t val_m = (val);                 \
-                                                  \
-    asm volatile("sd  %[val_m],  %[pdst_m]  \n\t" \
-                                                  \
-                 : [pdst_m] "=m"(*pdst_m)         \
-                 : [val_m] "r"(val_m));           \
+#define SD(val, pdst)                                   \
+  {                                                     \
+    uint8_t *sd_pdst_m = (uint8_t *)(pdst);             \
+    const uint64_t sd_val_m = (val);                    \
+                                                        \
+    asm volatile("sd  %[sd_val_m],  %[sd_pdst_m]  \n\t" \
+                                                        \
+                 : [sd_pdst_m] "=m"(*sd_pdst_m)         \
+                 : [sd_val_m] "r"(sd_val_m));           \
   }
 #else  // !(__mips_isa_rev >= 6)
-#define LW(psrc)                                     \
-  ({                                                 \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
-    uint32_t val_m;                                  \
-                                                     \
-    asm volatile(                                    \
-        "lwr %[val_m], 0(%[psrc_m]) \n\t"            \
-        "lwl %[val_m], 3(%[psrc_m]) \n\t"            \
-        : [val_m] "=&r"(val_m)                       \
-        : [psrc_m] "r"(psrc_m));                     \
-                                                     \
-    val_m;                                           \
+#define LW(psrc)                                        \
+  ({                                                    \
+    const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \
+    uint32_t lw_val_m;                                  \
+                                                        \
+    asm volatile(                                       \
+        "lwr %[lw_val_m], 0(%[lw_psrc_m]) \n\t"         \
+        "lwl %[lw_val_m], 3(%[lw_psrc_m]) \n\t"         \
+        : [lw_val_m] "=&r"(lw_val_m)                    \
+        : [lw_psrc_m] "r"(lw_psrc_m));                  \
+                                                        \
+    lw_val_m;                                           \
   })
 
 #if (__mips == 64)
-#define LD(psrc)                                     \
-  ({                                                 \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
-    uint64_t val_m = 0;                              \
-                                                     \
-    asm volatile(                                    \
-        "ldr %[val_m], 0(%[psrc_m]) \n\t"            \
-        "ldl %[val_m], 7(%[psrc_m]) \n\t"            \
-        : [val_m] "=&r"(val_m)                       \
-        : [psrc_m] "r"(psrc_m));                     \
-                                                     \
-    val_m;                                           \
+#define LD(psrc)                                        \
+  ({                                                    \
+    const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \
+    uint64_t ld_val_m = 0;                              \
+                                                        \
+    asm volatile(                                       \
+        "ldr %[ld_val_m], 0(%[ld_psrc_m]) \n\t"         \
+        "ldl %[ld_val_m], 7(%[ld_psrc_m]) \n\t"         \
+        : [ld_val_m] "=&r"(ld_val_m)                    \
+        : [ld_psrc_m] "r"(ld_psrc_m));                  \
+                                                        \
+    ld_val_m;                                           \
   })
 #else  // !(__mips == 64)
-#define LD(psrc)                                            \
-  ({                                                        \
-    const uint8_t *psrc_m1 = (const uint8_t *)(psrc);       \
-    uint32_t val0_m, val1_m;                                \
-    uint64_t val_m = 0;                                     \
-                                                            \
-    val0_m = LW(psrc_m1);                                   \
-    val1_m = LW(psrc_m1 + 4);                               \
-                                                            \
-    val_m = (uint64_t)(val1_m);                             \
-    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
-    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           \
-                                                            \
-    val_m;                                                  \
+#define LD(psrc)                                                  \
+  ({                                                              \
+    const uint8_t *ld_psrc_m1 = (const uint8_t *)(psrc);          \
+    uint32_t ld_val0_m, ld_val1_m;                                \
+    uint64_t ld_val_m = 0;                                        \
+                                                                  \
+    ld_val0_m = LW(ld_psrc_m1);                                   \
+    ld_val1_m = LW(ld_psrc_m1 + 4);                               \
+                                                                  \
+    ld_val_m = (uint64_t)(ld_val1_m);                             \
+    ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \
+    ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m);        \
+                                                                  \
+    ld_val_m;                                                     \
   })
 #endif  // (__mips == 64)
-#define SH(val, pdst)                              \
-  {                                                \
-    uint8_t *pdst_m = (uint8_t *)(pdst);           \
-    const uint16_t val_m = (val);                  \
-                                                   \
-    asm volatile("ush  %[val_m],  %[pdst_m]  \n\t" \
-                                                   \
-                 : [pdst_m] "=m"(*pdst_m)          \
-                 : [val_m] "r"(val_m));            \
+#define SH(val, pdst)                                    \
+  {                                                      \
+    uint8_t *sh_pdst_m = (uint8_t *)(pdst);              \
+    const uint16_t sh_val_m = (val);                     \
+                                                         \
+    asm volatile("ush  %[sh_val_m],  %[sh_pdst_m]  \n\t" \
+                                                         \
+                 : [sh_pdst_m] "=m"(*sh_pdst_m)          \
+                 : [sh_val_m] "r"(sh_val_m));            \
   }
 
-#define SW(val, pdst)                              \
-  {                                                \
-    uint8_t *pdst_m = (uint8_t *)(pdst);           \
-    const uint32_t val_m = (val);                  \
-                                                   \
-    asm volatile("usw  %[val_m],  %[pdst_m]  \n\t" \
-                                                   \
-                 : [pdst_m] "=m"(*pdst_m)          \
-                 : [val_m] "r"(val_m));            \
+#define SW(val, pdst)                                    \
+  {                                                      \
+    uint8_t *sw_pdst_m = (uint8_t *)(pdst);              \
+    const uint32_t sw_val_m = (val);                     \
+                                                         \
+    asm volatile("usw  %[sw_val_m],  %[sw_pdst_m]  \n\t" \
+                                                         \
+                 : [sw_pdst_m] "=m"(*sw_pdst_m)          \
+                 : [sw_val_m] "r"(sw_val_m));            \
   }
 
-#define SD(val, pdst)                                        \
-  {                                                          \
-    uint8_t *pdst_m1 = (uint8_t *)(pdst);                    \
-    uint32_t val0_m, val1_m;                                 \
-                                                             \
-    val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
-    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
-                                                             \
-    SW(val0_m, pdst_m1);                                     \
-    SW(val1_m, pdst_m1 + 4);                                 \
+#define SD(val, pdst)                                           \
+  {                                                             \
+    uint8_t *sd_pdst_m1 = (uint8_t *)(pdst);                    \
+    uint32_t sd_val0_m, sd_val1_m;                              \
+                                                                \
+    sd_val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
+    sd_val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+                                                                \
+    SW(sd_val0_m, sd_pdst_m1);                                  \
+    SW(sd_val1_m, sd_pdst_m1 + 4);                              \
   }
 #endif  // (__mips_isa_rev >= 6)
 
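(This file's wholesale renaming is mechanical fallout of -Wshadow: LD() expands LW() inside its own GNU statement expression, so both nesting levels previously declared val_m/psrc_m and the inner pair shadowed the outer. A stripped-down illustration with hypothetical macros of the same shape:)

    /* Inner `v` shadows outer `v` whenever the macros nest, and
     * -Wshadow flags every expansion site; per-macro prefixes
     * (lw_, ld_, ...) keep the names disjoint. */
    #define INNER(p) ({ int v = *(p); v; })
    #define OUTER(p) ({ int v = INNER(p) + INNER((p) + 1); v; })
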
diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
index a6bedc4..56500a8 100644
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -135,27 +135,6 @@
 int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf);
 int vp8_remove_decoder_instances(struct frame_buffers *fb);
 
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval, expr)                                         \
-  do {                                                                      \
-    assert(pbi->common.error.setjmp);                                       \
-    (lval) = (expr);                                                        \
-    if (!(lval))                                                            \
-      vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,           \
-                         "Failed to allocate " #lval " at %s:%d", __FILE__, \
-                         __LINE__);                                         \
-  } while (0)
-#else
-#define CHECK_MEM_ERROR(lval, expr)                               \
-  do {                                                            \
-    assert(pbi->common.error.setjmp);                             \
-    (lval) = (expr);                                              \
-    if (!(lval))                                                  \
-      vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \
-                         "Failed to allocate " #lval);            \
-  } while (0)
-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
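
(The per-module CHECK_MEM_ERROR definitions - keyed to `pbi` here, `cpi` in the vp8 encoder, `cm` in vp9 - are removed in favor of a single shared macro that takes the error struct explicitly, which is what every call site below now passes. Presumably the shared definition sits in a common header and looks roughly like this sketch, reconstructed from the deleted variants rather than copied from the change:)

    #define CHECK_MEM_ERROR(error, lval, expr)                    \
      do {                                                        \
        assert((error)->setjmp);                                  \
        (lval) = (expr);                                          \
        if (!(lval))                                              \
          vpx_internal_error(error, VPX_CODEC_MEM_ERROR,          \
                             "Failed to allocate " #lval);        \
      } while (0)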
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index 490f62d..9ea6a4f 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -30,11 +30,13 @@
 #include "error_concealment.h"
 #endif
 
-#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n)))
-#define CALLOC_ARRAY_ALIGNED(p, n, algn)                            \
-  do {                                                              \
-    CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n))); \
-    memset((p), 0, (n) * sizeof(*(p)));                             \
+#define CALLOC_ARRAY(p, n) \
+  CHECK_MEM_ERROR(&pbi->common.error, (p), vpx_calloc(sizeof(*(p)), (n)))
+#define CALLOC_ARRAY_ALIGNED(p, n, algn)                       \
+  do {                                                         \
+    CHECK_MEM_ERROR(&pbi->common.error, (p),                   \
+                    vpx_memalign((algn), sizeof(*(p)) * (n))); \
+    memset((p), 0, (n) * sizeof(*(p)));                        \
   } while (0)
 
 static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd,
@@ -754,7 +756,7 @@
     uv_width = width >> 1;
 
     /* Allocate a vpx_atomic_int for each mb row. */
-    CHECK_MEM_ERROR(pbi->mt_current_mb_col,
+    CHECK_MEM_ERROR(&pc->error, pbi->mt_current_mb_col,
                     vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows));
     for (i = 0; i < pc->mb_rows; ++i)
       vpx_atomic_init(&pbi->mt_current_mb_col[i], 0);
@@ -762,7 +764,7 @@
     /* Allocate memory for above_row buffers. */
     CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
     for (i = 0; i < pc->mb_rows; ++i) {
-      CHECK_MEM_ERROR(pbi->mt_yabove_row[i],
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_yabove_row[i],
                       vpx_memalign(16, sizeof(unsigned char) *
                                            (width + (VP8BORDERINPIXELS << 1))));
       vp8_zero_array(pbi->mt_yabove_row[i], width + (VP8BORDERINPIXELS << 1));
@@ -770,7 +772,7 @@
 
     CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
     for (i = 0; i < pc->mb_rows; ++i) {
-      CHECK_MEM_ERROR(pbi->mt_uabove_row[i],
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_uabove_row[i],
                       vpx_memalign(16, sizeof(unsigned char) *
                                            (uv_width + VP8BORDERINPIXELS)));
       vp8_zero_array(pbi->mt_uabove_row[i], uv_width + VP8BORDERINPIXELS);
@@ -778,7 +780,7 @@
 
     CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
     for (i = 0; i < pc->mb_rows; ++i) {
-      CHECK_MEM_ERROR(pbi->mt_vabove_row[i],
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_vabove_row[i],
                       vpx_memalign(16, sizeof(unsigned char) *
                                            (uv_width + VP8BORDERINPIXELS)));
       vp8_zero_array(pbi->mt_vabove_row[i], uv_width + VP8BORDERINPIXELS);
@@ -787,17 +789,17 @@
     /* Allocate memory for left_col buffers. */
     CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);
     for (i = 0; i < pc->mb_rows; ++i)
-      CHECK_MEM_ERROR(pbi->mt_yleft_col[i],
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_yleft_col[i],
                       vpx_calloc(sizeof(unsigned char) * 16, 1));
 
     CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows);
     for (i = 0; i < pc->mb_rows; ++i)
-      CHECK_MEM_ERROR(pbi->mt_uleft_col[i],
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_uleft_col[i],
                       vpx_calloc(sizeof(unsigned char) * 8, 1));
 
     CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows);
     for (i = 0; i < pc->mb_rows; ++i)
-      CHECK_MEM_ERROR(pbi->mt_vleft_col[i],
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_vleft_col[i],
                       vpx_calloc(sizeof(unsigned char) * 8, 1));
   }
 }
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 6201075..dc29945 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -123,7 +123,7 @@
     unsigned int tmp;
 
     /* Create a list to sort to */
-    CHECK_MEM_ERROR(sortlist,
+    CHECK_MEM_ERROR(&cpi->common.error, sortlist,
                     vpx_calloc(sizeof(unsigned int), cpi->common.MBs));
 
     /* Copy map to sort list */
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index cb35f4f..2583cb0 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -510,16 +510,16 @@
 
     if (th_count == 0) return 0;
 
-    CHECK_MEM_ERROR(cpi->h_encoding_thread,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread,
                     vpx_malloc(sizeof(pthread_t) * th_count));
-    CHECK_MEM_ERROR(cpi->h_event_start_encoding,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_start_encoding,
                     vpx_malloc(sizeof(sem_t) * th_count));
-    CHECK_MEM_ERROR(cpi->h_event_end_encoding,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_end_encoding,
                     vpx_malloc(sizeof(sem_t) * th_count));
-    CHECK_MEM_ERROR(cpi->mb_row_ei,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->mb_row_ei,
                     vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
     memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
-    CHECK_MEM_ERROR(cpi->en_thread_data,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->en_thread_data,
                     vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
 
     vpx_atomic_store_release(&cpi->b_multi_threaded, 1);
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index a780048..8941329 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1169,7 +1169,8 @@
 #else
     unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
 #endif
-    CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->tok,
+                    vpx_calloc(tokens, sizeof(*cpi->tok)));
   }
 
   /* Data used for real time vc mode to see if gf needs refreshing */
@@ -1178,37 +1179,39 @@
   /* Structures used to monitor GF usage */
   vpx_free(cpi->gf_active_flags);
   CHECK_MEM_ERROR(
-      cpi->gf_active_flags,
+      &cpi->common.error, cpi->gf_active_flags,
       vpx_calloc(sizeof(*cpi->gf_active_flags), cm->mb_rows * cm->mb_cols));
   cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
 
   vpx_free(cpi->mb_activity_map);
   CHECK_MEM_ERROR(
-      cpi->mb_activity_map,
+      &cpi->common.error, cpi->mb_activity_map,
       vpx_calloc(sizeof(*cpi->mb_activity_map), cm->mb_rows * cm->mb_cols));
 
   /* allocate memory for storing last frame's MVs for MV prediction. */
   vpx_free(cpi->lfmv);
-  CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2),
-                                        sizeof(*cpi->lfmv)));
+  CHECK_MEM_ERROR(
+      &cpi->common.error, cpi->lfmv,
+      vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lfmv)));
   vpx_free(cpi->lf_ref_frame_sign_bias);
-  CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias,
+  CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame_sign_bias,
                   vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2),
                              sizeof(*cpi->lf_ref_frame_sign_bias)));
   vpx_free(cpi->lf_ref_frame);
-  CHECK_MEM_ERROR(cpi->lf_ref_frame,
+  CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame,
                   vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2),
                              sizeof(*cpi->lf_ref_frame)));
 
   /* Create the encoder segmentation map and set all entries to 0 */
   vpx_free(cpi->segmentation_map);
   CHECK_MEM_ERROR(
-      cpi->segmentation_map,
+      &cpi->common.error, cpi->segmentation_map,
       vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->segmentation_map)));
   cpi->cyclic_refresh_mode_index = 0;
   vpx_free(cpi->active_map);
-  CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cm->mb_rows * cm->mb_cols,
-                                              sizeof(*cpi->active_map)));
+  CHECK_MEM_ERROR(
+      &cpi->common.error, cpi->active_map,
+      vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->active_map)));
   memset(cpi->active_map, 1, (cm->mb_rows * cm->mb_cols));
 
 #if CONFIG_MULTITHREAD
@@ -1226,7 +1229,7 @@
     int i;
 
     vpx_free(cpi->mt_current_mb_col);
-    CHECK_MEM_ERROR(cpi->mt_current_mb_col,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->mt_current_mb_col,
                     vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
     for (i = 0; i < cm->mb_rows; ++i)
       vpx_atomic_init(&cpi->mt_current_mb_col[i], 0);
@@ -1235,7 +1238,8 @@
 #endif
 
   vpx_free(cpi->tplist);
-  CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows));
+  CHECK_MEM_ERROR(&cpi->common.error, cpi->tplist,
+                  vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows));
 
 #if CONFIG_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0) {
@@ -1773,8 +1777,9 @@
 
   cpi->common.error.setjmp = 1;
 
-  CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site),
-                                         (MAX_MVSEARCH_STEPS * 8) + 1));
+  CHECK_MEM_ERROR(
+      &cpi->common.error, cpi->mb.ss,
+      vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
 
   vp8_create_common(&cpi->common);
 
@@ -1879,18 +1884,19 @@
   }
 
   if (cpi->cyclic_refresh_mode_enabled) {
-    CHECK_MEM_ERROR(cpi->cyclic_refresh_map,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->cyclic_refresh_map,
                     vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
   } else {
     cpi->cyclic_refresh_map = (signed char *)NULL;
   }
 
-  CHECK_MEM_ERROR(cpi->skin_map, vpx_calloc(cm->mb_rows * cm->mb_cols,
-                                            sizeof(cpi->skin_map[0])));
+  CHECK_MEM_ERROR(
+      &cpi->common.error, cpi->skin_map,
+      vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(cpi->skin_map[0])));
 
-  CHECK_MEM_ERROR(cpi->consec_zero_last,
+  CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last,
                   vpx_calloc(cm->mb_rows * cm->mb_cols, 1));
-  CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias,
+  CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last_mvbias,
                   vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
 
   /*Initialize the feed-forward activity masking.*/
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 46a1791..bde5c2f 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -731,26 +731,6 @@
 
 void vp8_set_speed_features(VP8_COMP *cpi);
 
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval, expr)                                         \
-  do {                                                                      \
-    assert(cpi->common.error.setjmp);                                       \
-    (lval) = (expr);                                                        \
-    if (!(lval))                                                            \
-      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,           \
-                         "Failed to allocate " #lval " at %s:%d", __FILE__, \
-                         __LINE__);                                         \
-  } while (0)
-#else
-#define CHECK_MEM_ERROR(lval, expr)                               \
-  do {                                                            \
-    assert(cpi->common.error.setjmp);                             \
-    (lval) = (expr);                                              \
-    if (!(lval))                                                  \
-      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \
-                         "Failed to allocate " #lval);            \
-  } while (0)
-#endif
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index a9d1f80..0821eef 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -911,12 +911,6 @@
     }
   }
 
-  if (setjmp(ctx->cpi->common.error.jmp)) {
-    ctx->cpi->common.error.setjmp = 0;
-    vpx_clear_system_state();
-    return VPX_CODEC_CORRUPT_FRAME;
-  }
-
   /* Initialize the encoder instance on the first frame*/
   if (!res && ctx->cpi) {
     unsigned int lib_flags;
@@ -927,6 +921,13 @@
     unsigned char *cx_data_end;
     int comp_data_state = 0;
 
+    if (setjmp(ctx->cpi->common.error.jmp)) {
+      ctx->cpi->common.error.setjmp = 0;
+      vpx_clear_system_state();
+      return VPX_CODEC_CORRUPT_FRAME;
+    }
+    ctx->cpi->common.error.setjmp = 1;
+
     /* Set up internal flags */
     if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) {
       ((VP8_COMP *)ctx->cpi)->b_calculate_psnr = 1;
@@ -962,8 +963,6 @@
     cx_data_end = ctx->cx_data + cx_data_sz;
     lib_flags = 0;
 
-    ctx->cpi->common.error.setjmp = 1;
-
     while (cx_data_sz >= ctx->cx_data_sz / 2) {
       comp_data_state = vp8_get_compressed_data(
           ctx->cpi, &lib_flags, &size, cx_data, cx_data_end, &dst_time_stamp,
@@ -1059,6 +1058,7 @@
         }
       }
     }
+    ctx->cpi->common.error.setjmp = 0;
   }
 
   return res;
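
(The reordering in this file tightens the setjmp error protocol: the jump buffer is armed - error.setjmp = 1 - only around code that can reach vpx_internal_error(), and disarmed both on the error path and on normal return, so a stale buffer can never be jumped through after this frame unwinds. The skeleton:)

    if (setjmp(ctx->cpi->common.error.jmp)) {
      /* longjmp() from vpx_internal_error() lands here */
      ctx->cpi->common.error.setjmp = 0; /* disarm before bailing out */
      vpx_clear_system_state();
      return VPX_CODEC_CORRUPT_FRAME;
    }
    ctx->cpi->common.error.setjmp = 1;   /* armed: errors may longjmp */
    /* ... per-frame encode work ... */
    ctx->cpi->common.error.setjmp = 0;   /* disarm on the normal path */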
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 55a77ba..fdc0b35 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -310,6 +310,7 @@
     VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
     VP8_COMMON *const pc = &pbi->common;
     if (setjmp(pbi->common.error.jmp)) {
+      pbi->common.error.setjmp = 0;
       vp8_remove_decoder_instances(fb);
       vp8_zero(fb->pbi);
       vpx_clear_system_state();
@@ -494,6 +495,7 @@
 
     /* get ready for the next series of fragments */
     ctx->fragments.count = 0;
+    pbi->common.error.setjmp = 0;
   }
 
   return res;
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 8d2bed3..d63bad9 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -46,27 +46,6 @@
   return num_values > 0 ? get_msb(num_values) + 1 : 0;
 }
 
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(cm, lval, expr)                                     \
-  do {                                                                      \
-    assert(&(cm)->error.setjmp);                                            \
-    (lval) = (expr);                                                        \
-    if (!(lval))                                                            \
-      vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR,                 \
-                         "Failed to allocate " #lval " at %s:%d", __FILE__, \
-                         __LINE__);                                         \
-  } while (0)
-#else
-#define CHECK_MEM_ERROR(cm, lval, expr)                     \
-  do {                                                      \
-    assert(&(cm)->error.setjmp);                            \
-    (lval) = (expr);                                        \
-    if (!(lval))                                            \
-      vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
-                         "Failed to allocate " #lval);      \
-  } while (0)
-#endif
-
 #define VP9_SYNC_CODE_0 0x49
 #define VP9_SYNC_CODE_1 0x83
 #define VP9_SYNC_CODE_2 0x42
diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c
index ad44781..1c6ecc0 100644
--- a/vp9/common/vp9_thread_common.c
+++ b/vp9/common/vp9_thread_common.c
@@ -283,7 +283,7 @@
   {
     int i;
 
-    CHECK_MEM_ERROR(cm, lf_sync->mutex,
+    CHECK_MEM_ERROR(&cm->error, lf_sync->mutex,
                     vpx_malloc(sizeof(*lf_sync->mutex) * rows));
     if (lf_sync->mutex) {
       for (i = 0; i < rows; ++i) {
@@ -291,7 +291,7 @@
       }
     }
 
-    CHECK_MEM_ERROR(cm, lf_sync->cond,
+    CHECK_MEM_ERROR(&cm->error, lf_sync->cond,
                     vpx_malloc(sizeof(*lf_sync->cond) * rows));
     if (lf_sync->cond) {
       for (i = 0; i < rows; ++i) {
@@ -299,11 +299,11 @@
       }
     }
 
-    CHECK_MEM_ERROR(cm, lf_sync->lf_mutex,
+    CHECK_MEM_ERROR(&cm->error, lf_sync->lf_mutex,
                     vpx_malloc(sizeof(*lf_sync->lf_mutex)));
     pthread_mutex_init(lf_sync->lf_mutex, NULL);
 
-    CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex,
+    CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_mutex,
                     vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows));
     if (lf_sync->recon_done_mutex) {
       for (i = 0; i < rows; ++i) {
@@ -311,7 +311,7 @@
       }
     }
 
-    CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond,
+    CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_cond,
                     vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows));
     if (lf_sync->recon_done_cond) {
       for (i = 0; i < rows; ++i) {
@@ -321,15 +321,15 @@
   }
 #endif  // CONFIG_MULTITHREAD
 
-  CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+  CHECK_MEM_ERROR(&cm->error, lf_sync->lfdata,
                   vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
   lf_sync->num_workers = num_workers;
   lf_sync->num_active_workers = lf_sync->num_workers;
 
-  CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
+  CHECK_MEM_ERROR(&cm->error, lf_sync->cur_sb_col,
                   vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
 
-  CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done,
+  CHECK_MEM_ERROR(&cm->error, lf_sync->num_tiles_done,
                   vpx_malloc(sizeof(*lf_sync->num_tiles_done) *
                                  mi_cols_aligned_to_sb(cm->mi_rows) >>
                              MI_BLOCK_SIZE_LOG2));
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 6eae41f..6ec1d9f 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -1469,7 +1469,7 @@
   vpx_free(cm->cur_frame->mvs);
   cm->cur_frame->mi_rows = cm->mi_rows;
   cm->cur_frame->mi_cols = cm->mi_cols;
-  CHECK_MEM_ERROR(cm, cm->cur_frame->mvs,
+  CHECK_MEM_ERROR(&cm->error, cm->cur_frame->mvs,
                   (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
                                        sizeof(*cm->cur_frame->mvs)));
 }
@@ -1776,7 +1776,8 @@
 
   if (jobq_size > row_mt_worker_data->jobq_size) {
     vpx_free(row_mt_worker_data->jobq_buf);
-    CHECK_MEM_ERROR(cm, row_mt_worker_data->jobq_buf, vpx_calloc(1, jobq_size));
+    CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->jobq_buf,
+                    vpx_calloc(1, jobq_size));
     vp9_jobq_init(&row_mt_worker_data->jobq, row_mt_worker_data->jobq_buf,
                   jobq_size);
     row_mt_worker_data->jobq_size = jobq_size;
@@ -1923,7 +1924,7 @@
       const int is_last_row = sb_rows - 1 == cur_sb_row;
       int mi_col_start, mi_col_end;
       if (!tile_data_recon)
-        CHECK_MEM_ERROR(cm, tile_data_recon,
+        CHECK_MEM_ERROR(&cm->error, tile_data_recon,
                         vpx_memalign(32, sizeof(TileWorkerData)));
 
       tile_data_recon->xd = pbi->mb;
@@ -2025,7 +2026,7 @@
 
   if (cm->lf.filter_level && !cm->skip_loop_filter &&
       pbi->lf_worker.data1 == NULL) {
-    CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
+    CHECK_MEM_ERROR(&cm->error, pbi->lf_worker.data1,
                     vpx_memalign(32, sizeof(LFWorkerData)));
     pbi->lf_worker.hook = vp9_loop_filter_worker;
     if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) {
@@ -2192,8 +2193,6 @@
 
   volatile int mi_row = 0;
   volatile int n = tile_data->buf_start;
-  tile_data->error_info.setjmp = 1;
-
   if (setjmp(tile_data->error_info.jmp)) {
     tile_data->error_info.setjmp = 0;
     tile_data->xd.corrupted = 1;
@@ -2206,6 +2205,7 @@
     }
     return 0;
   }
+  tile_data->error_info.setjmp = 1;
 
   tile_data->xd.corrupted = 0;
 
@@ -2285,7 +2285,7 @@
 
   if (pbi->num_tile_workers == 0) {
     const int num_threads = pbi->max_threads;
-    CHECK_MEM_ERROR(cm, pbi->tile_workers,
+    CHECK_MEM_ERROR(&cm->error, pbi->tile_workers,
                     vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
     for (n = 0; n < num_threads; ++n) {
       VPxWorker *const worker = &pbi->tile_workers[n];
@@ -2824,7 +2824,7 @@
     const int num_jobs = sb_rows << cm->log2_tile_cols;
 
     if (pbi->row_mt_worker_data == NULL) {
-      CHECK_MEM_ERROR(cm, pbi->row_mt_worker_data,
+      CHECK_MEM_ERROR(&cm->error, pbi->row_mt_worker_data,
                       vpx_calloc(1, sizeof(*pbi->row_mt_worker_data)));
 #if CONFIG_MULTITHREAD
       pthread_mutex_init(&pbi->row_mt_worker_data->recon_done_mutex, NULL);
@@ -3006,7 +3006,8 @@
     // platforms without DECLARE_ALIGNED().
     assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
     vpx_free(pbi->tile_worker_data);
-    CHECK_MEM_ERROR(cm, pbi->tile_worker_data, vpx_memalign(32, twd_size));
+    CHECK_MEM_ERROR(&cm->error, pbi->tile_worker_data,
+                    vpx_memalign(32, twd_size));
     pbi->total_tiles = tile_rows * tile_cols;
   }
 
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 92cd91f..5a7e9f9 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -66,7 +66,7 @@
   {
     int i;
     CHECK_MEM_ERROR(
-        cm, row_mt_worker_data->recon_sync_mutex,
+        &cm->error, row_mt_worker_data->recon_sync_mutex,
         vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_mutex) * num_jobs));
     if (row_mt_worker_data->recon_sync_mutex) {
       for (i = 0; i < num_jobs; ++i) {
@@ -75,7 +75,7 @@
     }
 
     CHECK_MEM_ERROR(
-        cm, row_mt_worker_data->recon_sync_cond,
+        &cm->error, row_mt_worker_data->recon_sync_cond,
         vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_cond) * num_jobs));
     if (row_mt_worker_data->recon_sync_cond) {
       for (i = 0; i < num_jobs; ++i) {
@@ -86,24 +86,24 @@
 #endif
   row_mt_worker_data->num_sbs = num_sbs;
   for (plane = 0; plane < 3; ++plane) {
-    CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane],
+    CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->dqcoeff[plane],
                     vpx_memalign(32, dqcoeff_size));
     memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size);
-    CHECK_MEM_ERROR(cm, row_mt_worker_data->eob[plane],
+    CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->eob[plane],
                     vpx_calloc(num_sbs << EOBS_PER_SB_LOG2,
                                sizeof(*row_mt_worker_data->eob[plane])));
   }
-  CHECK_MEM_ERROR(cm, row_mt_worker_data->partition,
+  CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->partition,
                   vpx_calloc(num_sbs * PARTITIONS_PER_SB,
                              sizeof(*row_mt_worker_data->partition)));
-  CHECK_MEM_ERROR(cm, row_mt_worker_data->recon_map,
+  CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->recon_map,
                   vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map)));
 
   // allocate memory for thread_data
   if (row_mt_worker_data->thread_data == NULL) {
     const size_t thread_size =
         max_threads * sizeof(*row_mt_worker_data->thread_data);
-    CHECK_MEM_ERROR(cm, row_mt_worker_data->thread_data,
+    CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->thread_data,
                     vpx_memalign(32, thread_size));
   }
 }
@@ -181,9 +181,10 @@
 
   cm->error.setjmp = 1;
 
-  CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(&cm->error, cm->fc,
+                  (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
   CHECK_MEM_ERROR(
-      cm, cm->frame_contexts,
+      &cm->error, cm->frame_contexts,
       (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts)));
 
   pbi->need_resync = 1;
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 17c123a..ca56d14 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -967,13 +967,13 @@
   int i;
   const size_t worker_data_size =
       cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data);
-  CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data,
+  CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data,
                   vpx_memalign(16, worker_data_size));
   memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size);
   for (i = 1; i < cpi->num_workers; ++i) {
     cpi->vp9_bitstream_worker_data[i].dest_size =
         cpi->oxcf.width * cpi->oxcf.height;
-    CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data[i].dest,
+    CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data[i].dest,
                     vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size));
   }
 }
diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c
index b74b902..42073f7 100644
--- a/vp9/encoder/vp9_context_tree.c
+++ b/vp9/encoder/vp9_context_tree.c
@@ -25,16 +25,17 @@
   int i, k;
   ctx->num_4x4_blk = num_blk;
 
-  CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, vpx_calloc(num_blk, sizeof(uint8_t)));
+  CHECK_MEM_ERROR(&cm->error, ctx->zcoeff_blk,
+                  vpx_calloc(num_blk, sizeof(uint8_t)));
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     for (k = 0; k < 3; ++k) {
-      CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
+      CHECK_MEM_ERROR(&cm->error, ctx->coeff[i][k],
                       vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
-      CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
+      CHECK_MEM_ERROR(&cm->error, ctx->qcoeff[i][k],
                       vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
-      CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
+      CHECK_MEM_ERROR(&cm->error, ctx->dqcoeff[i][k],
                       vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
-      CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
+      CHECK_MEM_ERROR(&cm->error, ctx->eobs[i][k],
                       vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
       ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
       ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
@@ -100,10 +101,10 @@
   int nodes;
 
   vpx_free(td->leaf_tree);
-  CHECK_MEM_ERROR(cm, td->leaf_tree,
+  CHECK_MEM_ERROR(&cm->error, td->leaf_tree,
                   vpx_calloc(leaf_nodes, sizeof(*td->leaf_tree)));
   vpx_free(td->pc_tree);
-  CHECK_MEM_ERROR(cm, td->pc_tree,
+  CHECK_MEM_ERROR(&cm->error, td->pc_tree,
                   vpx_calloc(tree_nodes, sizeof(*td->pc_tree)));
 
   this_pc = &td->pc_tree[0];
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 77d7239..baea8eb 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -634,11 +634,11 @@
   denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES;
   init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES;
   denoiser->num_layers = num_layers;
-  CHECK_MEM_ERROR(cm, denoiser->running_avg_y,
+  CHECK_MEM_ERROR(&cm->error, denoiser->running_avg_y,
                   vpx_calloc(denoiser->num_ref_frames * num_layers,
                              sizeof(denoiser->running_avg_y[0])));
   CHECK_MEM_ERROR(
-      cm, denoiser->mc_running_avg_y,
+      &cm->error, denoiser->mc_running_avg_y,
       vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0])));
 
   for (layer = 0; layer < num_layers; ++layer) {
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 3a04239..a979ae1 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1545,7 +1545,7 @@
   }
 
   if (low_res && threshold_4x4avg < INT64_MAX)
-    CHECK_MEM_ERROR(cm, vt2, vpx_calloc(16, sizeof(*vt2)));
+    CHECK_MEM_ERROR(&cm->error, vt2, vpx_calloc(16, sizeof(*vt2)));
   // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
   // for splits.
   for (i = 0; i < 4; i++) {
@@ -5783,7 +5783,7 @@
     if (cm->last_width != cm->width || cm->last_height != cm->height) {
       if (cpi->source_diff_var) vpx_free(cpi->source_diff_var);
 
-      CHECK_MEM_ERROR(cm, cpi->source_diff_var,
+      CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var,
                       vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var)));
     }
 
@@ -5823,7 +5823,7 @@
   if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
     if (cpi->tile_data != NULL) vpx_free(cpi->tile_data);
     CHECK_MEM_ERROR(
-        cm, cpi->tile_data,
+        &cm->error, cpi->tile_data,
         vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
     cpi->allocated_tiles = tile_cols * tile_rows;
 
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index f76eec2..9d5c003 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -681,9 +681,10 @@
   return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level;
 }
 
-int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
-                    unsigned int cols, int delta_q[8], int delta_lf[8],
-                    int skip[8], int ref_frame[8]) {
+vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map,
+                                unsigned int rows, unsigned int cols,
+                                int delta_q[8], int delta_lf[8], int skip[8],
+                                int ref_frame[8]) {
   VP9_COMMON *cm = &cpi->common;
   vpx_roi_map_t *roi = &cpi->roi;
   const int range = 63;
@@ -694,13 +695,13 @@
 
   // Check number of rows and columns match
   if (frame_rows != (int)rows || frame_cols != (int)cols) {
-    return -1;
+    return VPX_CODEC_INVALID_PARAM;
   }
 
   if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) ||
       !check_seg_range(ref_frame, ref_frame_range) ||
       !check_seg_range(skip, skip_range))
-    return -1;
+    return VPX_CODEC_INVALID_PARAM;
 
   // Also disable segmentation if no deltas are specified.
   if (!map ||
@@ -714,14 +715,15 @@
         ref_frame[6] == -1 && ref_frame[7] == -1))) {
     vp9_disable_segmentation(&cm->seg);
     cpi->roi.enabled = 0;
-    return 0;
+    return VPX_CODEC_OK;
   }
 
   if (roi->roi_map) {
     vpx_free(roi->roi_map);
     roi->roi_map = NULL;
   }
-  CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols));
+  roi->roi_map = vpx_malloc(rows * cols);
+  if (!roi->roi_map) return VPX_CODEC_MEM_ERROR;
 
   // Copy to ROI structure in the compressor.
   memcpy(roi->roi_map, map, rows * cols);
@@ -733,7 +735,7 @@
   roi->rows = rows;
   roi->cols = cols;
 
-  return 0;
+  return VPX_CODEC_OK;
 }
 
 int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
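
Note: with vp9_set_roi_map returning vpx_codec_err_t, callers can forward the
codec error unchanged instead of translating an int status. A minimal caller
sketch (this is exactly what the vp9_cx_iface.c hunk below collapses to):

    const vpx_codec_err_t err =
        vp9_set_roi_map(cpi, roi->roi_map, roi->rows, roi->cols, roi->delta_q,
                        roi->delta_lf, roi->skip, roi->ref_frame);
    if (err != VPX_CODEC_OK) return err;  /* INVALID_PARAM or MEM_ERROR */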
@@ -1374,7 +1376,7 @@
   VP9_COMMON *cm = &cpi->common;
   int mi_size = cm->mi_cols * cm->mi_rows;
 
-  CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base,
+  CHECK_MEM_ERROR(&cm->error, cpi->mbmi_ext_base,
                   vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)));
 }
 
@@ -1393,14 +1395,14 @@
 
   {
     unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
-    CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
+    CHECK_MEM_ERROR(&cm->error, cpi->tile_tok[0][0],
                     vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
   }
 
   sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
   vpx_free(cpi->tplist[0][0]);
   CHECK_MEM_ERROR(
-      cm, cpi->tplist[0][0],
+      &cm->error, cpi->tplist[0][0],
       vpx_calloc(sb_rows * 4 * (1 << 6), sizeof(*cpi->tplist[0][0])));
 
   vp9_setup_pc_tree(&cpi->common, &cpi->td);
@@ -1996,48 +1998,48 @@
 
   // Create the encoder segmentation map and set all entries to 0
   vpx_free(cpi->segmentation_map);
-  CHECK_MEM_ERROR(cm, cpi->segmentation_map,
+  CHECK_MEM_ERROR(&cm->error, cpi->segmentation_map,
                   vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
 
   // Create a map used for cyclic background refresh.
   if (cpi->cyclic_refresh) vp9_cyclic_refresh_free(cpi->cyclic_refresh);
-  CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+  CHECK_MEM_ERROR(&cm->error, cpi->cyclic_refresh,
                   vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
 
   // Create a map used to mark inactive areas.
   vpx_free(cpi->active_map.map);
-  CHECK_MEM_ERROR(cm, cpi->active_map.map,
+  CHECK_MEM_ERROR(&cm->error, cpi->active_map.map,
                   vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
 
   // And a place holder structure is the coding context
   // for use if we want to save and restore it
   vpx_free(cpi->coding_context.last_frame_seg_map_copy);
-  CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
+  CHECK_MEM_ERROR(&cm->error, cpi->coding_context.last_frame_seg_map_copy,
                   vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
 }
 
 static void alloc_copy_partition_data(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   if (cpi->prev_partition == NULL) {
-    CHECK_MEM_ERROR(cm, cpi->prev_partition,
+    CHECK_MEM_ERROR(&cm->error, cpi->prev_partition,
                     (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows,
                                              sizeof(*cpi->prev_partition)));
   }
   if (cpi->prev_segment_id == NULL) {
     CHECK_MEM_ERROR(
-        cm, cpi->prev_segment_id,
+        &cm->error, cpi->prev_segment_id,
         (int8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
                              sizeof(*cpi->prev_segment_id)));
   }
   if (cpi->prev_variance_low == NULL) {
-    CHECK_MEM_ERROR(cm, cpi->prev_variance_low,
+    CHECK_MEM_ERROR(&cm->error, cpi->prev_variance_low,
                     (uint8_t *)vpx_calloc(
                         (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * 25,
                         sizeof(*cpi->prev_variance_low)));
   }
   if (cpi->copied_frame_cnt == NULL) {
     CHECK_MEM_ERROR(
-        cm, cpi->copied_frame_cnt,
+        &cm->error, cpi->copied_frame_cnt,
         (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
                               sizeof(*cpi->copied_frame_cnt)));
   }
@@ -2370,9 +2372,10 @@
   cm->free_mi = vp9_enc_free_mi;
   cm->setup_mi = vp9_enc_setup_mi;
 
-  CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(&cm->error, cm->fc,
+                  (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
   CHECK_MEM_ERROR(
-      cm, cm->frame_contexts,
+      &cm->error, cm->frame_contexts,
       (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts)));
 
   cpi->compute_frame_low_motion_onepass = 1;
@@ -2399,38 +2402,38 @@
   realloc_segmentation_maps(cpi);
 
   CHECK_MEM_ERROR(
-      cm, cpi->skin_map,
+      &cm->error, cpi->skin_map,
       vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0])));
 
 #if !CONFIG_REALTIME_ONLY
-  CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create());
+  CHECK_MEM_ERROR(&cm->error, cpi->alt_ref_aq, vp9_alt_ref_aq_create());
 #endif
 
   CHECK_MEM_ERROR(
-      cm, cpi->consec_zero_mv,
+      &cm->error, cpi->consec_zero_mv,
       vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv)));
 
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[0],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[1],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[0],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[1],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[0],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[1],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[0],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[1],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
 
   for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]));
        i++) {
     CHECK_MEM_ERROR(
-        cm, cpi->mbgraph_stats[i].mb_stats,
+        &cm->error, cpi->mbgraph_stats[i].mb_stats,
         vpx_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
   }
 
@@ -2474,7 +2477,7 @@
   }
 
   if (cpi->b_calculate_consistency) {
-    CHECK_MEM_ERROR(cm, cpi->ssim_vars,
+    CHECK_MEM_ERROR(&cm->error, cpi->ssim_vars,
                     vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols,
                                sizeof(*cpi->ssim_vars) * 4));
     cpi->worst_consistency = 100.0;
@@ -2559,7 +2562,7 @@
           vpx_free(lc->rc_twopass_stats_in.buf);
 
           lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz;
-          CHECK_MEM_ERROR(cm, lc->rc_twopass_stats_in.buf,
+          CHECK_MEM_ERROR(&cm->error, lc->rc_twopass_stats_in.buf,
                           vpx_malloc(lc->rc_twopass_stats_in.sz));
           lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf;
           lc->twopass.stats_in = lc->twopass.stats_in_start;
@@ -2614,7 +2617,7 @@
     const int h = num_8x8_blocks_high_lookup[bsize];
     const int num_cols = (cm->mi_cols + w - 1) / w;
     const int num_rows = (cm->mi_rows + h - 1) / h;
-    CHECK_MEM_ERROR(cm, cpi->mi_ssim_rdmult_scaling_factors,
+    CHECK_MEM_ERROR(&cm->error, cpi->mi_ssim_rdmult_scaling_factors,
                     vpx_calloc(num_rows * num_cols,
                                sizeof(*cpi->mi_ssim_rdmult_scaling_factors)));
   }
@@ -2625,11 +2628,10 @@
 #endif  // CONFIG_NON_GREEDY_MV
   for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) {
     cpi->tpl_stats[i].tpl_stats_ptr = NULL;
-    cpi->tpl_frame_stats[i].block_stats_list = NULL;
   }
 
   // Allocate memory to store variances for a frame.
-  CHECK_MEM_ERROR(cm, cpi->source_diff_var,
+  CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var,
                   vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var)));
   cpi->source_var_thresh = 0;
   cpi->frames_till_next_var_check = 0;
@@ -3752,7 +3754,7 @@
       case 6: l = 150; break;
     }
     if (!cpi->common.postproc_state.limits) {
-      CHECK_MEM_ERROR(cm, cpi->common.postproc_state.limits,
+      CHECK_MEM_ERROR(&cm->error, cpi->common.postproc_state.limits,
                       vpx_calloc(cpi->un_scaled_source->y_width,
                                  sizeof(*cpi->common.postproc_state.limits)));
     }
@@ -4096,7 +4098,7 @@
       svc->spatial_layer_id == svc->number_spatial_layers - 2) {
     if (svc->prev_partition_svc == NULL) {
       CHECK_MEM_ERROR(
-          cm, svc->prev_partition_svc,
+          &cm->error, svc->prev_partition_svc,
           (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows,
                                    sizeof(*svc->prev_partition_svc)));
     }
@@ -4448,10 +4450,13 @@
   const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth;
 
 #if CONFIG_RATE_CTRL
-  const FRAME_UPDATE_TYPE update_type =
-      cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
-  const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type);
-  RATE_QSTEP_MODEL *rq_model = &cpi->rq_model[frame_type];
+  RATE_QSTEP_MODEL *rq_model;
+  {
+    const FRAME_UPDATE_TYPE update_type =
+        cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
+    const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type);
+    rq_model = &cpi->rq_model[frame_type];
+  }
   init_rq_history(rq_history);
 #endif  // CONFIG_RATE_CTRL
 
@@ -5295,7 +5300,7 @@
   cpi->mb_wiener_variance = NULL;
 
   CHECK_MEM_ERROR(
-      cm, cpi->mb_wiener_variance,
+      &cm->error, cpi->mb_wiener_variance,
       vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->mb_wiener_variance)));
   cpi->mb_wiener_var_rows = cm->mb_rows;
   cpi->mb_wiener_var_cols = cm->mb_cols;
@@ -6542,7 +6547,7 @@
     pthread_mutex_init(&cpi->kmeans_mutex, NULL);
 #endif
     CHECK_MEM_ERROR(
-        cm, cpi->kmeans_data_arr,
+        &cm->error, cpi->kmeans_data_arr,
         vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->kmeans_data_arr)));
     cpi->kmeans_data_stride = mi_cols;
     cpi->kmeans_data_arr_alloc = 1;
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index cca1617..2e0c4db 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -18,6 +18,7 @@
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vpx_ext_ratectrl.h"
 #include "vpx/vp8cx.h"
+#include "vpx/vpx_tpl.h"
 #if CONFIG_INTERNAL_STATS
 #include "vpx_dsp/ssim.h"
 #endif
@@ -745,7 +746,7 @@
   BLOCK_SIZE tpl_bsize;
   TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE];
   // Used to store TPL stats before propagation
-  VpxTplFrameStats tpl_frame_stats[MAX_ARF_GOP_SIZE];
+  VpxTplGopStats tpl_gop_stats;
   YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES];
   EncFrameBuf enc_frame_buf[REF_FRAMES];
 #if CONFIG_MULTITHREAD
@@ -1060,7 +1061,7 @@
   VP9_COMMON *const cm = &cpi->common;
   const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width);
   const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height);
-  CHECK_MEM_ERROR(cm, cpi->partition_info,
+  CHECK_MEM_ERROR(&cm->error, cpi->partition_info,
                   (PARTITION_INFO *)vpx_calloc(unit_width * unit_height,
                                                sizeof(PARTITION_INFO)));
   memset(cpi->partition_info, 0,
@@ -1088,7 +1089,7 @@
   VP9_COMMON *const cm = &cpi->common;
   const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width);
   const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height);
-  CHECK_MEM_ERROR(cm, cpi->motion_vector_info,
+  CHECK_MEM_ERROR(&cm->error, cpi->motion_vector_info,
                   (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height,
                                                    sizeof(MOTION_VECTOR_INFO)));
   memset(cpi->motion_vector_info, 0,
@@ -1107,7 +1108,7 @@
 static INLINE void tpl_stats_info_init(struct VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   CHECK_MEM_ERROR(
-      cm, cpi->tpl_stats_info,
+      &cm->error, cpi->tpl_stats_info,
       (TplDepStats *)vpx_calloc(MAX_LAG_BUFFERS, sizeof(TplDepStats)));
   memset(cpi->tpl_stats_info, 0, MAX_LAG_BUFFERS * sizeof(TplDepStats));
 }
@@ -1126,7 +1127,7 @@
   VP9_COMMON *const cm = &cpi->common;
   const int unit_width = get_num_unit_16x16(cpi->frame_info.frame_width);
   const int unit_height = get_num_unit_16x16(cpi->frame_info.frame_height);
-  CHECK_MEM_ERROR(cm, cpi->fp_motion_vector_info,
+  CHECK_MEM_ERROR(&cm->error, cpi->fp_motion_vector_info,
                   (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height,
                                                    sizeof(MOTION_VECTOR_INFO)));
 }
@@ -1457,9 +1458,10 @@
 
 VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec);
 
-int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
-                    unsigned int cols, int delta_q[8], int delta_lf[8],
-                    int skip[8], int ref_frame[8]);
+vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map,
+                                unsigned int rows, unsigned int cols,
+                                int delta_q[8], int delta_lf[8], int skip[8],
+                                int ref_frame[8]);
 
 void vp9_new_framerate(VP9_COMP *cpi, double framerate);
 
@@ -1474,7 +1476,7 @@
   if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows ||
       new_fb_ptr->mi_cols < cm->mi_cols) {
     vpx_free(new_fb_ptr->mvs);
-    CHECK_MEM_ERROR(cm, new_fb_ptr->mvs,
+    CHECK_MEM_ERROR(&cm->error, new_fb_ptr->mvs,
                     (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
                                          sizeof(*new_fb_ptr->mvs)));
     new_fb_ptr->mi_rows = cm->mi_rows;
diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c
index 453fe2e..fadd233 100644
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -94,10 +94,10 @@
   vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
   vp9_encode_free_mt_data(cpi);
 
-  CHECK_MEM_ERROR(cm, cpi->workers,
+  CHECK_MEM_ERROR(&cm->error, cpi->workers,
                   vpx_malloc(num_workers * sizeof(*cpi->workers)));
 
-  CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+  CHECK_MEM_ERROR(&cm->error, cpi->tile_thr_data,
                   vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
 
   for (i = 0; i < num_workers; i++) {
@@ -111,7 +111,7 @@
       thread_data->cpi = cpi;
 
       // Allocate thread data.
-      CHECK_MEM_ERROR(cm, thread_data->td,
+      CHECK_MEM_ERROR(&cm->error, thread_data->td,
                       vpx_memalign(32, sizeof(*thread_data->td)));
       vp9_zero(*thread_data->td);
 
@@ -121,7 +121,7 @@
       vp9_setup_pc_tree(cm, thread_data->td);
 
       // Allocate frame counters in thread data.
-      CHECK_MEM_ERROR(cm, thread_data->td->counts,
+      CHECK_MEM_ERROR(&cm->error, thread_data->td->counts,
                       vpx_calloc(1, sizeof(*thread_data->td->counts)));
 
       // Create threads
@@ -292,7 +292,7 @@
   {
     int i;
 
-    CHECK_MEM_ERROR(cm, row_mt_sync->mutex,
+    CHECK_MEM_ERROR(&cm->error, row_mt_sync->mutex,
                     vpx_malloc(sizeof(*row_mt_sync->mutex) * rows));
     if (row_mt_sync->mutex) {
       for (i = 0; i < rows; ++i) {
@@ -300,7 +300,7 @@
       }
     }
 
-    CHECK_MEM_ERROR(cm, row_mt_sync->cond,
+    CHECK_MEM_ERROR(&cm->error, row_mt_sync->cond,
                     vpx_malloc(sizeof(*row_mt_sync->cond) * rows));
     if (row_mt_sync->cond) {
       for (i = 0; i < rows; ++i) {
@@ -310,7 +310,7 @@
   }
 #endif  // CONFIG_MULTITHREAD
 
-  CHECK_MEM_ERROR(cm, row_mt_sync->cur_col,
+  CHECK_MEM_ERROR(&cm->error, row_mt_sync->cur_col,
                   vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows));
 
   // Set up nsync.
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 71d8775..8fdd976 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -1422,7 +1422,7 @@
 
   if (cpi->row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL)
     CHECK_MEM_ERROR(
-        cm, cpi->twopass.fp_mb_float_stats,
+        &cm->error, cpi->twopass.fp_mb_float_stats,
         vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1));
 
   {
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 9487fc5..fafc673 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -288,7 +288,7 @@
   int *arf_not_zz;
 
   CHECK_MEM_ERROR(
-      cm, arf_not_zz,
+      &cm->error, arf_not_zz,
       vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
 
   // We are not interested in results beyond the alt ref itself.
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 64e9ef0..0ea0f85 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -297,7 +297,7 @@
       besterr =
           vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1);
     } else {
-      DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+      DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
       vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
       besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
     }
@@ -312,7 +312,7 @@
   uint32_t besterr;
   (void)xd;
   if (second_pred != NULL) {
-    DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+    DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
     vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
     besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
   } else {
@@ -635,7 +635,7 @@
     vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h,
                               0, kernel, MV_PRECISION_Q3, 0, 0);
     if (second_pred != NULL) {
-      DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+      DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
       vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w);
       besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse);
     } else {
@@ -654,7 +654,7 @@
   vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h,
                             0, kernel, MV_PRECISION_Q3, 0, 0);
   if (second_pred != NULL) {
-    DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+    DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
     vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w);
     besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse);
   } else {
diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c
index 45659f2..0843cd97 100644
--- a/vp9/encoder/vp9_multi_thread.c
+++ b/vp9/encoder/vp9_multi_thread.c
@@ -59,7 +59,7 @@
   int i;
 
   CHECK_MEM_ERROR(
-      cm, this_tile->row_base_thresh_freq_fact,
+      &cm->error, this_tile->row_base_thresh_freq_fact,
       (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES,
                         sizeof(*(this_tile->row_base_thresh_freq_fact))));
   for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++)
@@ -85,7 +85,7 @@
   multi_thread_ctxt->allocated_tile_rows = tile_rows;
   multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col;
 
-  CHECK_MEM_ERROR(cm, multi_thread_ctxt->job_queue,
+  CHECK_MEM_ERROR(&cm->error, multi_thread_ctxt->job_queue,
                   (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue)));
 
 #if CONFIG_MULTITHREAD
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f051c62..464705a 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1937,10 +1937,10 @@
 
 // Prediction buffer from second frame.
 #if CONFIG_VP9_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]);
+  DECLARE_ALIGNED(32, uint16_t, second_pred_alloc_16[64 * 64]);
   uint8_t *second_pred;
 #else
-  DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]);
+  DECLARE_ALIGNED(32, uint8_t, second_pred[64 * 64]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Check number of iterations do not exceed the max
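
Note: the comp_pred/second_pred buffers in vp9_mcomp.c above and here move
from 16- to 32-byte alignment. A plausible motivation (not stated in the
patch) is that vpx_comp_avg_pred gained 256-bit SIMD paths that prefer
32-byte-aligned accesses; DECLARE_ALIGNED(32, ...) guarantees the stack
buffer meets that:

    /* Illustration: a 32-byte-aligned averaging buffer. */
    DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
    vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);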
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 60720e3..48c21c5 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -676,7 +676,7 @@
       if (cpi->content_state_sb_fd == NULL &&
           (!cpi->use_svc ||
            svc->spatial_layer_id == svc->number_spatial_layers - 1)) {
-        CHECK_MEM_ERROR(cm, cpi->content_state_sb_fd,
+        CHECK_MEM_ERROR(&cm->error, cpi->content_state_sb_fd,
                         (uint8_t *)vpx_calloc(
                             (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
                             sizeof(uint8_t)));
@@ -832,13 +832,13 @@
     }
     if (cpi->count_arf_frame_usage == NULL) {
       CHECK_MEM_ERROR(
-          cm, cpi->count_arf_frame_usage,
+          &cm->error, cpi->count_arf_frame_usage,
           (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
                                 sizeof(*cpi->count_arf_frame_usage)));
     }
     if (cpi->count_lastgolden_frame_usage == NULL)
       CHECK_MEM_ERROR(
-          cm, cpi->count_lastgolden_frame_usage,
+          &cm->error, cpi->count_lastgolden_frame_usage,
           (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
                                 sizeof(*cpi->count_lastgolden_frame_usage)));
   }
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index f08d668..e472127 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -163,17 +163,17 @@
         lc->actual_num_seg1_blocks = 0;
         lc->actual_num_seg2_blocks = 0;
         lc->counter_encode_maxq_scene_change = 0;
-        CHECK_MEM_ERROR(cm, lc->map,
+        CHECK_MEM_ERROR(&cm->error, lc->map,
                         vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map)));
         memset(lc->map, 0, mi_rows * mi_cols);
         last_coded_q_map_size =
             mi_rows * mi_cols * sizeof(*lc->last_coded_q_map);
-        CHECK_MEM_ERROR(cm, lc->last_coded_q_map,
+        CHECK_MEM_ERROR(&cm->error, lc->last_coded_q_map,
                         vpx_malloc(last_coded_q_map_size));
         assert(MAXQ <= 255);
         memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
         consec_zero_mv_size = mi_rows * mi_cols * sizeof(*lc->consec_zero_mv);
-        CHECK_MEM_ERROR(cm, lc->consec_zero_mv,
+        CHECK_MEM_ERROR(&cm->error, lc->consec_zero_mv,
                         vpx_malloc(consec_zero_mv_size));
         memset(lc->consec_zero_mv, 0, consec_zero_mv_size);
       }
diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c
index ed771dc..9f4bafd 100644
--- a/vp9/encoder/vp9_tpl_model.c
+++ b/vp9/encoder/vp9_tpl_model.c
@@ -154,17 +154,43 @@
   int frame_idx;
   for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {
     TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
-    VpxTplFrameStats *tpl_frame_stats = &cpi->tpl_frame_stats[frame_idx];
     memset(tpl_frame->tpl_stats_ptr, 0,
            tpl_frame->height * tpl_frame->width *
                sizeof(*tpl_frame->tpl_stats_ptr));
-    memset(tpl_frame_stats->block_stats_list, 0,
-           tpl_frame->height * tpl_frame->width *
-               sizeof(*tpl_frame_stats->block_stats_list));
     tpl_frame->is_valid = 0;
   }
 }
 
+static void free_tpl_frame_stats_list(VpxTplGopStats *tpl_gop_stats) {
+  int frame_idx;
+  for (frame_idx = 0; frame_idx < tpl_gop_stats->size; ++frame_idx) {
+    vpx_free(tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list);
+  }
+  vpx_free(tpl_gop_stats->frame_stats_list);
+}
+
+static void init_tpl_stats_before_propagation(
+    struct vpx_internal_error_info *error_info, VpxTplGopStats *tpl_gop_stats,
+    TplDepFrame *tpl_stats, int tpl_gop_frames) {
+  int frame_idx;
+  free_tpl_frame_stats_list(tpl_gop_stats);
+  CHECK_MEM_ERROR(
+      error_info, tpl_gop_stats->frame_stats_list,
+      vpx_calloc(tpl_gop_frames, sizeof(*tpl_gop_stats->frame_stats_list)));
+  tpl_gop_stats->size = tpl_gop_frames;
+  for (frame_idx = 0; frame_idx < tpl_gop_frames; ++frame_idx) {
+    const int mi_rows = tpl_stats[frame_idx].height;
+    const int mi_cols = tpl_stats[frame_idx].width;
+    CHECK_MEM_ERROR(
+        error_info, tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list,
+        vpx_calloc(
+            mi_rows * mi_cols,
+            sizeof(
+                *tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list)));
+    tpl_gop_stats->frame_stats_list[frame_idx].num_blocks = mi_rows * mi_cols;
+  }
+}
+
 #if CONFIG_NON_GREEDY_MV
 static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td,
                                          MotionField *motion_field,
@@ -1106,7 +1132,7 @@
                               int frame_idx, BLOCK_SIZE bsize) {
   TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
   VpxTplFrameStats *tpl_frame_stats_before_propagation =
-      &cpi->tpl_frame_stats[frame_idx];
+      &cpi->tpl_gop_stats.frame_stats_list[frame_idx];
   YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame;
   YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL };
 
@@ -1320,7 +1346,7 @@
 
   vpx_free(cpi->select_mv_arr);
   CHECK_MEM_ERROR(
-      cm, cpi->select_mv_arr,
+      &cm->error, cpi->select_mv_arr,
       vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr)));
 #endif
 
@@ -1335,26 +1361,20 @@
     for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
       vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
       CHECK_MEM_ERROR(
-          cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx],
+          &cm->error, cpi->tpl_stats[frame].mv_mode_arr[rf_idx],
           vpx_calloc(mi_rows * mi_cols * 4,
                      sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx])));
       vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
       CHECK_MEM_ERROR(
-          cm, cpi->tpl_stats[frame].rd_diff_arr[rf_idx],
+          &cm->error, cpi->tpl_stats[frame].rd_diff_arr[rf_idx],
           vpx_calloc(mi_rows * mi_cols * 4,
                      sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx])));
     }
 #endif
     vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
-    CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr,
+    CHECK_MEM_ERROR(&cm->error, cpi->tpl_stats[frame].tpl_stats_ptr,
                     vpx_calloc(mi_rows * mi_cols,
                                sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr)));
-    vpx_free(cpi->tpl_frame_stats[frame].block_stats_list);
-    CHECK_MEM_ERROR(
-        cm, cpi->tpl_frame_stats[frame].block_stats_list,
-        vpx_calloc(mi_rows * mi_cols,
-                   sizeof(*cpi->tpl_frame_stats[frame].block_stats_list)));
-    cpi->tpl_frame_stats[frame].num_blocks = mi_rows * mi_cols;
     cpi->tpl_stats[frame].is_valid = 0;
     cpi->tpl_stats[frame].width = mi_cols;
     cpi->tpl_stats[frame].height = mi_rows;
@@ -1385,8 +1405,8 @@
 #endif
     vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
     cpi->tpl_stats[frame].is_valid = 0;
-    vpx_free(cpi->tpl_frame_stats[frame].block_stats_list);
   }
+  free_tpl_frame_stats_list(&cpi->tpl_gop_stats);
 }
 
 #if CONFIG_RATE_CTRL
@@ -1442,6 +1462,9 @@
 
   init_tpl_stats(cpi);
 
+  init_tpl_stats_before_propagation(&cpi->common.error, &cpi->tpl_gop_stats,
+                                    cpi->tpl_stats, tpl_group_frames);
+
   // Backward propagation from tpl_group_frames to 1.
   for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) {
     if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue;
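
Note: init_tpl_stats_before_propagation() and free_tpl_frame_stats_list()
give the per-GOP stats a clear lifetime: the init frees any previous GOP's
lists before reallocating (safe on the zeroed struct, where size is 0 and
frame_stats_list is NULL), and teardown frees the final GOP. A compressed
sketch of that ownership cycle:

    /* Once per GOP: reallocate frame_stats_list plus one block_stats_list
     * per frame, sized from the matching TplDepFrame dimensions. */
    init_tpl_stats_before_propagation(&cpi->common.error, &cpi->tpl_gop_stats,
                                      cpi->tpl_stats, tpl_group_frames);
    /* ...the per-frame TPL passes fill frame_stats_list... */
    free_tpl_frame_stats_list(&cpi->tpl_gop_stats);  /* encoder teardown */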
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index f067efd..409069b 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -29,6 +29,8 @@
 #include "vp9/vp9_cx_iface.h"
 #include "vp9/vp9_iface_common.h"
 
+#include "vpx/vpx_tpl.h"
+
 typedef struct vp9_extracfg {
   int cpu_used;  // available cpu percentage in 1/16
   unsigned int enable_auto_alt_ref;
@@ -815,6 +817,7 @@
     assert(codec_err != VPX_CODEC_OK);
     return codec_err;
   }
+  ctx->cpi->common.error.setjmp = 1;
 
   ctx->cfg = *cfg;
   set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
@@ -1633,13 +1636,9 @@
 
   if (data) {
     vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
-
-    if (!vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
-                         roi->delta_q, roi->delta_lf, roi->skip,
-                         roi->ref_frame)) {
-      return VPX_CODEC_OK;
-    }
-    return VPX_CODEC_INVALID_PARAM;
+    return vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
+                           roi->delta_q, roi->delta_lf, roi->skip,
+                           roi->ref_frame);
   }
   return VPX_CODEC_INVALID_PARAM;
 }
@@ -1793,16 +1792,16 @@
 static vpx_codec_err_t ctrl_get_tpl_stats(vpx_codec_alg_priv_t *ctx,
                                           va_list args) {
   VP9_COMP *const cpi = ctx->cpi;
-  VpxTplFrameStats *data = va_arg(args, VpxTplFrameStats *);
+  VpxTplGopStats *data = va_arg(args, VpxTplGopStats *);
+  VpxTplFrameStats *frame_stats_list = cpi->tpl_gop_stats.frame_stats_list;
   int i;
   if (data == NULL) {
     return VPX_CODEC_INVALID_PARAM;
   }
-  for (i = 0; i < MAX_ARF_GOP_SIZE; i++) {
-    data[i].frame_width = cpi->tpl_frame_stats[i].frame_width;
-    data[i].frame_height = cpi->tpl_frame_stats[i].frame_height;
-    data[i].num_blocks = cpi->tpl_frame_stats[i].num_blocks;
-    data[i].block_stats_list = cpi->tpl_frame_stats[i].block_stats_list;
+  data->size = cpi->tpl_gop_stats.size;
+
+  for (i = 0; i < data->size; i++) {
+    data->frame_stats_list[i] = frame_stats_list[i];
   }
 
   return VPX_CODEC_OK;
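
Note: the reworked control copies one VpxTplFrameStats per frame into
caller-provided storage, so data->frame_stats_list must already point at an
array of at least cpi->tpl_gop_stats.size entries, and the copied
block_stats_list pointers stay encoder-owned. Caller-side sketch, assuming
the VP9E_GET_TPL_STATS control id and an initialized encoder `codec`:

    VpxTplGopStats gop_stats;
    /* MAX_ARF_GOP_SIZE bounds the GOP size used elsewhere in this patch. */
    gop_stats.frame_stats_list = (VpxTplFrameStats *)malloc(
        MAX_ARF_GOP_SIZE * sizeof(*gop_stats.frame_stats_list));
    if (gop_stats.frame_stats_list != NULL &&
        vpx_codec_control(&codec, VP9E_GET_TPL_STATS, &gop_stats) ==
            VPX_CODEC_OK) {
      /* gop_stats.size frames are valid; do not free the borrowed
       * block_stats_list pointers. */
    }
    free(gop_stats.frame_stats_list);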
diff --git a/vpx/exports_com b/vpx/exports_com
index 2ab0509..f0b46aa 100644
--- a/vpx/exports_com
+++ b/vpx/exports_com
@@ -14,3 +14,6 @@
 text vpx_img_free
 text vpx_img_set_rect
 text vpx_img_wrap
+text vpx_free_tpl_gop_stats
+text vpx_read_tpl_gop_stats
+text vpx_write_tpl_gop_stats
diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index 670fe38..aae3218 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h
@@ -48,6 +48,8 @@
 #include "../vpx_encoder.h"
 #include <stdarg.h>
 
+#include "vpx_config.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -427,6 +429,27 @@
   jmp_buf jmp;
 };
 
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(error, lval, expr)                                  \
+  do {                                                                      \
+    assert((error)->setjmp);                                                \
+    (lval) = (expr);                                                        \
+    if (!(lval))                                                            \
+      vpx_internal_error(error, VPX_CODEC_MEM_ERROR,                        \
+                         "Failed to allocate " #lval " at %s:%d", __FILE__, \
+                         __LINE__);                                         \
+  } while (0)
+#else
+#define CHECK_MEM_ERROR(error, lval, expr)             \
+  do {                                                 \
+    assert((error)->setjmp);                           \
+    (lval) = (expr);                                   \
+    if (!(lval))                                       \
+      vpx_internal_error(error, VPX_CODEC_MEM_ERROR,   \
+                         "Failed to allocate " #lval); \
+  } while (0)
+#endif
+
 #define CLANG_ANALYZER_NORETURN
 #if defined(__has_feature)
 #if __has_feature(attribute_analyzer_noreturn)
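
Note: both macro variants now assert error->setjmp, i.e. an allocation
failure may only be reported once a longjmp target is armed — which is why
the vp9_cx_iface.c hunk above sets common.error.setjmp = 1 right after
encoder creation. Minimal sketch of the contract:

    struct vpx_internal_error_info *const error = &cm->error;
    if (!setjmp(error->jmp)) {
      error->setjmp = 1;  /* required before using CHECK_MEM_ERROR */
      CHECK_MEM_ERROR(error, cm->fc,
                      (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
      error->setjmp = 0;
    } else {
      /* vpx_internal_error() longjmps here with VPX_CODEC_MEM_ERROR. */
    }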
diff --git a/vpx/src/vpx_tpl.c b/vpx/src/vpx_tpl.c
new file mode 100644
index 0000000..9cdb4a0
--- /dev/null
+++ b/vpx/src/vpx_tpl.c
@@ -0,0 +1,107 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_tpl.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define CHECK_FPRINTF_ERROR(expr) \
+  do {                            \
+    if ((expr) < 0) {             \
+      return VPX_CODEC_ERROR;     \
+    }                             \
+  } while (0)
+
+#define CHECK_FSCANF_ERROR(expr, expected_value) \
+  do {                                           \
+    if ((expr) != (expected_value)) {            \
+      return VPX_CODEC_ERROR;                    \
+    }                                            \
+  } while (0)
+
+vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file,
+                                        const VpxTplGopStats *tpl_gop_stats) {
+  int i;
+  if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM;
+  CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d\n", tpl_gop_stats->size));
+
+  for (i = 0; i < tpl_gop_stats->size; i++) {
+    VpxTplFrameStats frame_stats = tpl_gop_stats->frame_stats_list[i];
+    const int num_blocks = frame_stats.num_blocks;
+    int block;
+    CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d %d %d\n", frame_stats.frame_width,
+                                frame_stats.frame_height, num_blocks));
+    for (block = 0; block < num_blocks; block++) {
+      VpxTplBlockStats block_stats = frame_stats.block_stats_list[block];
+      CHECK_FPRINTF_ERROR(
+          fprintf(tpl_file,
+                  "%" PRId64 " %" PRId64 " %" PRId16 " %" PRId16 " %" PRId64
+                  " %" PRId64 " %d\n",
+                  block_stats.inter_cost, block_stats.intra_cost,
+                  block_stats.mv_c, block_stats.mv_r, block_stats.recrf_dist,
+                  block_stats.recrf_rate, block_stats.ref_frame_index));
+    }
+  }
+
+  return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file,
+                                       VpxTplGopStats *tpl_gop_stats) {
+  int i, frame_list_size;
+  if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM;
+  CHECK_FSCANF_ERROR(fscanf(tpl_file, "%d\n", &frame_list_size), 1);
+  tpl_gop_stats->size = frame_list_size;
+  tpl_gop_stats->frame_stats_list = (VpxTplFrameStats *)vpx_calloc(
+      frame_list_size, sizeof(tpl_gop_stats->frame_stats_list[0]));
+  if (tpl_gop_stats->frame_stats_list == NULL) {
+    return VPX_CODEC_MEM_ERROR;
+  }
+  for (i = 0; i < frame_list_size; i++) {
+    VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i];
+    int num_blocks, width, height, block;
+    CHECK_FSCANF_ERROR(
+        fscanf(tpl_file, "%d %d %d\n", &width, &height, &num_blocks), 3);
+    frame_stats->num_blocks = num_blocks;
+    frame_stats->frame_width = width;
+    frame_stats->frame_height = height;
+    frame_stats->block_stats_list = (VpxTplBlockStats *)vpx_calloc(
+        num_blocks, sizeof(frame_stats->block_stats_list[0]));
+    if (frame_stats->block_stats_list == NULL) {
+      vpx_free_tpl_gop_stats(tpl_gop_stats);
+      return VPX_CODEC_MEM_ERROR;
+    }
+    for (block = 0; block < num_blocks; block++) {
+      VpxTplBlockStats *block_stats = &frame_stats->block_stats_list[block];
+      CHECK_FSCANF_ERROR(
+          fscanf(tpl_file,
+                 "%" SCNd64 " %" SCNd64 " %" SCNd16 " %" SCNd16 " %" SCNd64
+                 " %" SCNd64 " %d\n",
+                 &block_stats->inter_cost, &block_stats->intra_cost,
+                 &block_stats->mv_c, &block_stats->mv_r,
+                 &block_stats->recrf_dist, &block_stats->recrf_rate,
+                 &block_stats->ref_frame_index),
+          7);
+    }
+  }
+
+  return VPX_CODEC_OK;
+}
+
+void vpx_free_tpl_gop_stats(VpxTplGopStats *data) {
+  int frame;
+  if (data == NULL) return;
+  for (frame = 0; frame < data->size; frame++) {
+    vpx_free(data->frame_stats_list[frame].block_stats_list);
+  }
+  vpx_free(data->frame_stats_list);
+}
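
Note: the text format written here is self-describing (GOP size, then per
frame: width, height, block count, then one line per block), so a write can
be read back verbatim. A hedged round-trip sketch — the file name is
illustrative, and gop_stats is a populated VpxTplGopStats (e.g. from the
encoder control above):

    FILE *f = fopen("tpl_stats.txt", "w");
    if (f != NULL) {
      if (vpx_write_tpl_gop_stats(f, &gop_stats) != VPX_CODEC_OK) {
        /* VPX_CODEC_INVALID_PARAM or VPX_CODEC_ERROR (I/O failure) */
      }
      fclose(f);
    }

    VpxTplGopStats read_back = { 0, NULL };
    f = fopen("tpl_stats.txt", "r");
    if (f != NULL) {
      if (vpx_read_tpl_gop_stats(f, &read_back) == VPX_CODEC_OK) {
        /* read_back owns its allocations; release with the free helper. */
        vpx_free_tpl_gop_stats(&read_back);
      }
      fclose(f);
    }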
diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk
index de86579..25c815e 100644
--- a/vpx/vpx_codec.mk
+++ b/vpx/vpx_codec.mk
@@ -27,6 +27,7 @@
 API_DOC_SRCS-yes += vpx_ext_ratectrl.h
 API_DOC_SRCS-yes += vpx_frame_buffer.h
 API_DOC_SRCS-yes += vpx_image.h
+API_DOC_SRCS-yes += vpx_tpl.h
 
 API_SRCS-yes += src/vpx_decoder.c
 API_SRCS-yes += vpx_decoder.h
@@ -36,9 +37,11 @@
 API_SRCS-yes += internal/vpx_ratectrl_rtc.h
 API_SRCS-yes += src/vpx_codec.c
 API_SRCS-yes += src/vpx_image.c
+API_SRCS-yes += src/vpx_tpl.c
 API_SRCS-yes += vpx_codec.h
 API_SRCS-yes += vpx_codec.mk
 API_SRCS-yes += vpx_frame_buffer.h
 API_SRCS-yes += vpx_image.h
 API_SRCS-yes += vpx_integer.h
 API_SRCS-yes += vpx_ext_ratectrl.h
+API_SRCS-yes += vpx_tpl.h
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 2de8089..c45d1a2 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -31,6 +31,7 @@
 
 #include "./vpx_codec.h"
 #include "./vpx_ext_ratectrl.h"
+#include "./vpx_tpl.h"
 
 /*! Temporal Scalability: Maximum length of the sequence defining frame
  * layer membership
@@ -57,9 +58,9 @@
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures
  */
-#define VPX_ENCODER_ABI_VERSION \
-  (16 + VPX_CODEC_ABI_VERSION + \
-   VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_ENCODER_ABI_VERSION                                \
+  (16 + VPX_CODEC_ABI_VERSION + VPX_EXT_RATECTRL_ABI_VERSION + \
+   VPX_TPL_ABI_VERSION) /**<\hideinitializer*/
 
 /*! \brief Encoder capabilities bitfield
  *
@@ -252,25 +253,6 @@
   VPX_KF_DISABLED = 0 /**< Encoder does not place keyframes. */
 };
 
-/*!\brief Temporal dependency model stats for each block before propagation */
-typedef struct VpxTplBlockStats {
-  int64_t intra_cost;  /**< Intra cost */
-  int64_t inter_cost;  /**< Inter cost */
-  int16_t mv_r;        /**< Motion vector row */
-  int16_t mv_c;        /**< Motion vector col */
-  int64_t recrf_rate;  /**< Rate from reconstructed ref frame */
-  int64_t recrf_dist;  /**< Distortion from reconstructed ref frame */
-  int ref_frame_index; /**< Ref frame index */
-} VpxTplBlockStats;
-
-/*!\brief Temporal dependency model stats for each frame before propagation */
-typedef struct VpxTplFrameStats {
-  int frame_width;  /**< Frame width */
-  int frame_height; /**< Frame height */
-  int num_blocks;   /**< Number of blocks. Size of block_stats_list */
-  VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */
-} VpxTplFrameStats;
-
 /*!\brief Encoded Frame Flags
  *
  * This type indicates a bitfield to be passed to vpx_codec_encode(), defining
diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h
new file mode 100644
index 0000000..50aec49
--- /dev/null
+++ b/vpx/vpx_tpl.h
@@ -0,0 +1,99 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*!\file
+ * \brief Describes the TPL stats descriptor and associated operations
+ *
+ */
+#ifndef VPX_VPX_VPX_TPL_H_
+#define VPX_VPX_VPX_TPL_H_
+
+#include <stdio.h>
+
+#include "./vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped.  Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ */
+#define VPX_TPL_ABI_VERSION (1) /**<\hideinitializer*/
+
+/*!\brief Temporal dependency model stats for each block before propagation */
+typedef struct VpxTplBlockStats {
+  int64_t intra_cost;  /**< Intra cost */
+  int64_t inter_cost;  /**< Inter cost */
+  int16_t mv_r;        /**< Motion vector row */
+  int16_t mv_c;        /**< Motion vector col */
+  int64_t recrf_rate;  /**< Rate from reconstructed ref frame */
+  int64_t recrf_dist;  /**< Distortion from reconstructed ref frame */
+  int ref_frame_index; /**< Ref frame index */
+} VpxTplBlockStats;
+
+/*!\brief Temporal dependency model stats for each frame before propagation */
+typedef struct VpxTplFrameStats {
+  int frame_width;  /**< Frame width */
+  int frame_height; /**< Frame height */
+  int num_blocks;   /**< Number of blocks. Size of block_stats_list */
+  VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */
+} VpxTplFrameStats;
+
+/*!\brief Temporal dependency model stats for each GOP before propagation */
+typedef struct VpxTplGopStats {
+  int size; /**< GOP size, also the size of frame_stats_list. */
+  VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */
+} VpxTplGopStats;
+
+/*!\brief Write VpxTplGopStats to file
+ *
+ * Accepts an opened file handle and writes \p tpl_gop_stats.
+ *
+ * \param[in]    tpl_file       A FILE pointer that's already been opened.
+ * \param[in]    tpl_gop_stats  VpxTplGopStats that contains TPL stats for the
+ *                              whole GOP.
+ *
+ * \return VPX_CODEC_OK if TPL stats are successfully written.
+ */
+vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file,
+                                        const VpxTplGopStats *tpl_gop_stats);
+
+/*!\brief Read VpxTplGopStats from file
+ *
+ * Accepts an opened file handle and reads TPL stats and stores them into
+ * \p tpl_gop_stats. Allocates memory for TPL stats.
+ *
+ * \param[in]     tpl_file       A FILE pointer that's already been opened.
+ * \param[out]    tpl_gop_stats  VpxTplGopStats that contains TPL stats for the
+ *                               whole GOP.
+ *
+ * \return VPX_CODEC_OK if TPL stats are successfully read from file.
+ */
+vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file,
+                                       VpxTplGopStats *tpl_gop_stats);
+
+/*!\brief Free the memory allocated for VpxTplGopStats
+ *
+ * \param[in]    tpl_gop_stats  VpxTplGopStats that contains TPL stats for the
+ *                              whole GOP.
+ */
+void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_VPX_TPL_H_
diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h
index 1a20da7..586bfb8 100644
--- a/vpx_dsp/arm/mem_neon.h
+++ b/vpx_dsp/arm/mem_neon.h
@@ -263,6 +263,16 @@
   vst1_lane_u32((uint32_t *)buf, a_u32, 1);
 }
 
+static INLINE void store_u8_8x3(uint8_t *s, const ptrdiff_t p,
+                                const uint8x8_t s0, const uint8x8_t s1,
+                                const uint8x8_t s2) {
+  vst1_u8(s, s0);
+  s += p;
+  vst1_u8(s, s1);
+  s += p;
+  vst1_u8(s, s2);
+}
+
 static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                                uint8x8_t *const s0, uint8x8_t *const s1,
                                uint8x8_t *const s2, uint8x8_t *const s3) {
@@ -287,6 +297,16 @@
   vst1_u8(s, s3);
 }
 
+static INLINE void load_u8_16x3(const uint8_t *s, const ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2) {
+  *s0 = vld1q_u8(s);
+  s += p;
+  *s1 = vld1q_u8(s);
+  s += p;
+  *s2 = vld1q_u8(s);
+}
+
 static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p,
                                 uint8x16_t *const s0, uint8x16_t *const s1,
                                 uint8x16_t *const s2, uint8x16_t *const s3) {
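
Note: the new 3-row helpers back the h % 4 == 3 tail that the 2d horizontal
convolve below must handle — its intermediate buffer is
h + SUBPEL_TAPS - 1 = h + 7 rows tall, and h is a multiple of 4. A tiny
illustration of the pattern they enable (the real code filters between the
load and the store):

    /* Copy the low 8 bytes of the final three 16-byte rows. Requires 16
     * readable bytes per source row, as in the convolve kernels. */
    static INLINE void copy_last3_rows(const uint8_t *src,
                                       ptrdiff_t src_stride, uint8_t *dst,
                                       ptrdiff_t dst_stride) {
      uint8x16_t s0, s1, s2;
      load_u8_16x3(src, src_stride, &s0, &s1, &s2);
      store_u8_8x3(dst, dst_stride, vget_low_u8(s0), vget_low_u8(s1),
                   vget_low_u8(s2));
    }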
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c
index b312cc7..505d067 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -17,6 +17,7 @@
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
 
 // Note:
@@ -56,17 +57,18 @@
 
 #if defined(__ARM_FEATURE_MATMUL_INT8)
 
-void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const InterpKernel *filter, int x0_q4,
-                              int x_step_q4, int y0_q4, int y_step_q4, int w,
-                              int h) {
+void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h) {
   const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
   uint8x16_t s0, s1, s2, s3;
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(x_step_q4 == 16);
+  assert(h % 4 == 3);
 
   (void)x_step_q4;
   (void)y0_q4;
@@ -75,22 +77,19 @@
   src -= 3;
 
   if (w == 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    do {
-      int32x4_t t0, t1, t2, t3;
-      int16x8_t t01, t23;
-      uint8x8_t d01, d23;
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
 
+    do {
       load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
 
-      t0 = convolve8_4_usdot(s0, filters, permute_tbl);
-      t1 = convolve8_4_usdot(s1, filters, permute_tbl);
-      t2 = convolve8_4_usdot(s2, filters, permute_tbl);
-      t3 = convolve8_4_usdot(s3, filters, permute_tbl);
-      t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
-      t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
-      d01 = vqrshrun_n_s16(t01, 7);
-      d23 = vqrshrun_n_s16(t23, 7);
+      d0 = convolve8_4_usdot(s0, filters, perm_tbl);
+      d1 = convolve8_4_usdot(s1, filters, perm_tbl);
+      d2 = convolve8_4_usdot(s2, filters, perm_tbl);
+      d3 = convolve8_4_usdot(s3, filters, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
       store_u8(dst + 0 * dst_stride, dst_stride, d01);
       store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -98,9 +97,22 @@
       src += 4 * src_stride;
       dst += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h > 3);
+
+    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+     * further details on possible values of block height. */
+    load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+    d0 = convolve8_4_usdot(s0, filters, perm_tbl);
+    d1 = convolve8_4_usdot(s1, filters, perm_tbl);
+    d2 = convolve8_4_usdot(s2, filters, perm_tbl);
+    d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+    d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);
+
+    store_u8(dst + 0 * dst_stride, dst_stride, d01);
+    store_u8_4x1(dst + 2 * dst_stride, d23);
   } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
     const uint8_t *s;
     uint8_t *d;
     int width;
@@ -113,10 +125,10 @@
       do {
         load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
 
-        d0 = convolve8_8_usdot(s0, filters, permute_tbl);
-        d1 = convolve8_8_usdot(s1, filters, permute_tbl);
-        d2 = convolve8_8_usdot(s2, filters, permute_tbl);
-        d3 = convolve8_8_usdot(s3, filters, permute_tbl);
+        d0 = convolve8_8_usdot(s0, filters, perm_tbl);
+        d1 = convolve8_8_usdot(s1, filters, perm_tbl);
+        d2 = convolve8_8_usdot(s2, filters, perm_tbl);
+        d3 = convolve8_8_usdot(s3, filters, perm_tbl);
 
         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
@@ -127,7 +139,98 @@
       src += 4 * src_stride;
       dst += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h > 3);
+
+    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+     * further details on possible values of block height. */
+    width = w;
+    s = src;
+    d = dst;
+    do {
+      load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+      d0 = convolve8_8_usdot(s0, filters, perm_tbl);
+      d1 = convolve8_8_usdot(s1, filters, perm_tbl);
+      d2 = convolve8_8_usdot(s2, filters, perm_tbl);
+
+      store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width > 0);
+  }
+}
+
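Note on the new helper above: vpx_convolve8_2d_horiz_neon serves the
horizontal pass of a 2-D (horizontal-then-vertical) convolution. The
vertical 8-tap pass needs SUBPEL_TAPS - 1 = 7 extra rows, so the
intermediate block is h + 7 rows tall; output block heights are always
multiples of 4, which makes the intermediate height congruent to 3 mod 4
-- hence the assert(h % 4 == 3) and the dedicated three-row tail after
the main four-row loop. A minimal scalar sketch of that height
arithmetic, assuming SUBPEL_TAPS == 8 as defined in
vpx_dsp/vpx_filter.h (filter work elided):

    #include <assert.h>

    #define SUBPEL_TAPS 8 /* as in vpx_dsp/vpx_filter.h */

    static void sketch_2d_heights(int h) {
      const int intermediate_height = h + SUBPEL_TAPS - 1;
      int y = intermediate_height;
      assert(h % 4 == 0);                   /* output heights are x4 */
      assert(intermediate_height % 4 == 3); /* hence the 3-row tail */
      while (y > 3) y -= 4;                 /* main loop: 4 rows/iter */
      assert(y == 3);                       /* tail processes 3 rows */
    }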
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
+                              int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  uint8x16_t s0, s1, s2, s3;
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  src -= 3;
+
+  if (w == 4) {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    do {
+      int16x4_t t0, t1, t2, t3;
+      uint8x8_t d01, d23;
+
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve8_4_usdot(s0, filters, perm_tbl);
+      t1 = convolve8_4_usdot(s1, filters, perm_tbl);
+      t2 = convolve8_4_usdot(s2, filters, perm_tbl);
+      t3 = convolve8_4_usdot(s3, filters, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve8_8_usdot(s0, filters, perm_tbl);
+        d1 = convolve8_8_usdot(s1, filters, perm_tbl);
+        d2 = convolve8_8_usdot(s2, filters, perm_tbl);
+        d3 = convolve8_8_usdot(s3, filters, perm_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
   }
 }
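A note on the perm_tbl loads in the two kernels above: for an 8-tap
horizontal filter, the four adjacent outputs at x .. x+3 read the
overlapping windows src[x .. x+7] through src[x+3 .. x+10], all of
which fit in one 16-byte load. dot_prod_permute_tbl drives TBL lookups
that expand that single load into the per-output windows consumed by
the USDOT helpers. A scalar model of the windowing (names illustrative,
not the mem_neon.h API):

    #include <stdint.h>

    /* Expand one 16-byte block into the four overlapping 8-sample
     * windows needed for four adjacent horizontal outputs. */
    static void four_windows(const uint8_t s[16], uint8_t win[4][8]) {
      int i, k;
      for (i = 0; i < 4; ++i) {
        for (k = 0; k < 8; ++k) {
          win[i][k] = s[i + k]; /* window i starts one sample later */
        }
      }
    }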
 
@@ -139,8 +242,8 @@
   const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
   uint8x16_t s0, s1, s2, s3;
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(x_step_q4 == 16);
 
   (void)x_step_q4;
@@ -150,24 +253,19 @@
   src -= 3;
 
   if (w == 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
     do {
-      int32x4_t t0, t1, t2, t3;
-      int16x8_t t01, t23;
+      int16x4_t t0, t1, t2, t3;
       uint8x8_t d01, d23, dd01, dd23;
-      dd01 = vdup_n_u8(0);
-      dd23 = vdup_n_u8(0);
 
       load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
 
-      t0 = convolve8_4_usdot(s0, filters, permute_tbl);
-      t1 = convolve8_4_usdot(s1, filters, permute_tbl);
-      t2 = convolve8_4_usdot(s2, filters, permute_tbl);
-      t3 = convolve8_4_usdot(s3, filters, permute_tbl);
-      t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
-      t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
-      d01 = vqrshrun_n_s16(t01, 7);
-      d23 = vqrshrun_n_s16(t23, 7);
+      t0 = convolve8_4_usdot(s0, filters, perm_tbl);
+      t1 = convolve8_4_usdot(s1, filters, perm_tbl);
+      t2 = convolve8_4_usdot(s2, filters, perm_tbl);
+      t3 = convolve8_4_usdot(s3, filters, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
 
       dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
       dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
@@ -181,9 +279,9 @@
       src += 4 * src_stride;
       dst += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
     const uint8_t *s;
     uint8_t *d;
     int width;
@@ -196,10 +294,10 @@
       do {
         load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
 
-        d0 = convolve8_8_usdot(s0, filters, permute_tbl);
-        d1 = convolve8_8_usdot(s1, filters, permute_tbl);
-        d2 = convolve8_8_usdot(s2, filters, permute_tbl);
-        d3 = convolve8_8_usdot(s3, filters, permute_tbl);
+        d0 = convolve8_8_usdot(s0, filters, perm_tbl);
+        d1 = convolve8_8_usdot(s1, filters, perm_tbl);
+        d2 = convolve8_8_usdot(s2, filters, perm_tbl);
+        d3 = convolve8_8_usdot(s3, filters, perm_tbl);
 
         load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
 
@@ -213,11 +311,11 @@
         s += 8;
         d += 8;
         width -= 8;
-      } while (width > 0);
+      } while (width != 0);
       src += 4 * src_stride;
       dst += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   }
 }
 
@@ -275,8 +373,8 @@
   uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
   uint8x16x2_t samples_LUT;
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(y_step_q4 == 16);
 
   (void)x0_q4;
@@ -288,7 +386,7 @@
   if (w == 4) {
     const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
     uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
-    int32x4_t d0, d1, d2, d3;
+    int16x4_t d0, d1, d2, d3;
     uint8x8_t d01, d23;
 
     load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
@@ -325,8 +423,8 @@
       d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
       d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
       d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
-      d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
       store_u8(dst + 0 * dst_stride, dst_stride, d01);
       store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -341,7 +439,7 @@
       src += 4 * src_stride;
       dst += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   } else {
     const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
     uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
@@ -426,11 +524,11 @@
         s += 4 * src_stride;
         d += 4 * dst_stride;
         height -= 4;
-      } while (height > 0);
+      } while (height != 0);
       src += 8;
       dst += 8;
       w -= 8;
-    } while (w > 0);
+    } while (w != 0);
   }
 }
 
@@ -444,8 +542,8 @@
   uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
   uint8x16x2_t samples_LUT;
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(y_step_q4 == 16);
 
   (void)x0_q4;
@@ -457,7 +555,7 @@
   if (w == 4) {
     const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
     uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
-    int32x4_t d0, d1, d2, d3;
+    int16x4_t d0, d1, d2, d3;
     uint8x8_t d01, d23, dd01, dd23;
 
     load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
@@ -494,8 +592,8 @@
       d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
       d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
       d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
-      d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
       dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
       dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
@@ -516,7 +614,7 @@
       src += 4 * src_stride;
       dst += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   } else {
     const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
     uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
@@ -608,30 +706,31 @@
         s += 4 * src_stride;
         d += 4 * dst_stride;
         height -= 4;
-      } while (height > 0);
+      } while (height != 0);
       src += 8;
       dst += 8;
       w -= 8;
-    } while (w > 0);
+    } while (w != 0);
   }
 }
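The _avg_ variants in this section blend the filtered result with the
existing destination using a rounding halving add (vrhadd_u8, visible
in later hunks), replacing the earlier lane-based load/average/store
sequences. Scalar equivalent of one byte of that blend -- the same
rounding as ROUND_POWER_OF_TWO(p + q, 1):

    #include <stdint.h>

    /* Scalar model of one vrhadd_u8 lane: (p + q + 1) >> 1. */
    static uint8_t avg_round(uint8_t p, uint8_t q) {
      return (uint8_t)(((int)p + (int)q + 1) >> 1);
    }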
 
 #else  // !defined(__ARM_FEATURE_MATMUL_INT8)
 
-void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const InterpKernel *filter, int x0_q4,
-                              int x_step_q4, int y0_q4, int y_step_q4, int w,
-                              int h) {
+void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h) {
   const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
   const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
   const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
   const uint8x16_t range_limit = vdupq_n_u8(128);
   uint8x16_t s0, s1, s2, s3;
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(x_step_q4 == 16);
+  assert(h % 4 == 3);
 
   (void)x_step_q4;
   (void)y0_q4;
@@ -640,22 +739,19 @@
   src -= 3;
 
   if (w == 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    do {
-      int32x4_t t0, t1, t2, t3;
-      int16x8_t t01, t23;
-      uint8x8_t d01, d23;
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
 
+    do {
       load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
 
-      t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl);
-      t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl);
-      t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl);
-      t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl);
-      t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
-      t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
-      d01 = vqrshrun_n_s16(t01, 7);
-      d23 = vqrshrun_n_s16(t23, 7);
+      d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
+      d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
+      d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
+      d3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
       store_u8(dst + 0 * dst_stride, dst_stride, d01);
       store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -663,9 +759,22 @@
       src += 4 * src_stride;
       dst += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h > 3);
+
+    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+     * further details on possible values of block height. */
+    load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+    d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
+    d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
+    d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
+    d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+    d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);
+
+    store_u8(dst + 0 * dst_stride, dst_stride, d01);
+    store_u8_4x1(dst + 2 * dst_stride, d23);
   } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
     const uint8_t *s;
     uint8_t *d;
     int width;
@@ -678,25 +787,115 @@
       do {
         load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
 
-        d0 =
-            convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl);
-        d1 =
-            convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl);
-        d2 =
-            convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl);
-        d3 =
-            convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl);
+        d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
+        d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
+        d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
+        d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);
 
         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s += 8;
         d += 8;
         width -= 8;
-      } while (width > 0);
+      } while (width != 0);
       src += 4 * src_stride;
       dst += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h > 3);
+
+    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+     * further details on possible values of block height. */
+    width = w;
+    s = src;
+    d = dst;
+    do {
+      load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+      d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
+      d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
+      d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
+
+      store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+  }
+}
+
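Why this fallback path carries correction and range_limit: SDOT
multiplies int8 by int8, but pixels are uint8. Subtracting range_limit
(128) recentres samples into [-128, 127], and the precomputed
correction = 128 * sum(filter taps) restores the true accumulator,
because sum(s[k]*f[k]) = sum((s[k]-128)*f[k]) + 128*sum(f[k]). A scalar
check of that identity (hypothetical names, not part of the file):

    #include <stdint.h>

    static int32_t convolve8_ref(const uint8_t *s, const int16_t *f) {
      int32_t sum = 0;
      int k;
      for (k = 0; k < 8; ++k) sum += s[k] * f[k];
      return sum;
    }

    /* Equals convolve8_ref() for any input: the 128 offsets cancel. */
    static int32_t convolve8_recentred(const uint8_t *s, const int16_t *f) {
      int32_t correction = 0, sum = 0;
      int k;
      for (k = 0; k < 8; ++k) correction += 128 * f[k];
      for (k = 0; k < 8; ++k) sum += (int8_t)(s[k] - 128) * f[k];
      return sum + correction;
    }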
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
+                              int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  uint8x16_t s0, s1, s2, s3;
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  src -= 3;
+
+  if (w == 4) {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    do {
+      int16x4_t t0, t1, t2, t3;
+      uint8x8_t d01, d23;
+
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
+      t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
+      t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
+      t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
+        d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
+        d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
+        d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
   }
 }
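The replacement of the literal 7 with FILTER_BITS throughout (the
vpx_dsp/vpx_filter.h header newly included at the top of this file
defines FILTER_BITS as 7) only names the shift; behaviour is unchanged.
vqrshrun_n_s16(x, FILTER_BITS) rounds, shifts and saturates to uint8 in
one step. Scalar model of that narrowing:

    #include <stdint.h>

    #define FILTER_BITS 7 /* as in vpx_dsp/vpx_filter.h */

    /* Scalar model of one vqrshrun_n_s16(sum, FILTER_BITS) lane:
     * rounding shift with saturation to [0, 255]. */
    static uint8_t round_shift_sat_u8(int32_t sum) {
      int32_t v = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
      if (v < 0) v = 0;
      if (v > 255) v = 255;
      return (uint8_t)v;
    }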
 
@@ -711,8 +910,8 @@
   const uint8x16_t range_limit = vdupq_n_u8(128);
   uint8x16_t s0, s1, s2, s3;
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(x_step_q4 == 16);
 
   (void)x_step_q4;
@@ -722,24 +921,19 @@
   src -= 3;
 
   if (w == 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
     do {
-      int32x4_t t0, t1, t2, t3;
-      int16x8_t t01, t23;
+      int16x4_t t0, t1, t2, t3;
       uint8x8_t d01, d23, dd01, dd23;
-      dd01 = vdup_n_u8(0);
-      dd23 = vdup_n_u8(0);
 
       load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
 
-      t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl);
-      t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl);
-      t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl);
-      t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl);
-      t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
-      t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
-      d01 = vqrshrun_n_s16(t01, 7);
-      d23 = vqrshrun_n_s16(t23, 7);
+      t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
+      t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
+      t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
+      t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
 
       dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
       dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
@@ -753,9 +947,9 @@
       src += 4 * src_stride;
       dst += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
     const uint8_t *s;
     uint8_t *d;
     int width;
@@ -768,14 +962,10 @@
       do {
         load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
 
-        d0 =
-            convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl);
-        d1 =
-            convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl);
-        d2 =
-            convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl);
-        d3 =
-            convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl);
+        d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
+        d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
+        d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
+        d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);
 
         load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
 
@@ -789,11 +979,11 @@
         s += 8;
         d += 8;
         width -= 8;
-      } while (width > 0);
+      } while (width != 0);
       src += 4 * src_stride;
       dst += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   }
 }
 
@@ -854,8 +1044,8 @@
   int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
   int8x16x2_t samples_LUT;
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(y_step_q4 == 16);
 
   (void)x0_q4;
@@ -867,7 +1057,7 @@
   if (w == 4) {
     const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
     int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
-    int32x4_t d0, d1, d2, d3;
+    int16x4_t d0, d1, d2, d3;
     uint8x8_t d01, d23;
 
     load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
@@ -919,8 +1109,8 @@
       d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
       d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
       d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
-      d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
       store_u8(dst + 0 * dst_stride, dst_stride, d01);
       store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -935,7 +1125,7 @@
       src += 4 * src_stride;
       dst += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   } else {
     const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
     int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
@@ -1035,11 +1225,11 @@
         s += 4 * src_stride;
         d += 4 * dst_stride;
         height -= 4;
-      } while (height > 0);
+      } while (height != 0);
       src += 8;
       dst += 8;
       w -= 8;
-    } while (w > 0);
+    } while (w != 0);
   }
 }
 
@@ -1057,8 +1247,8 @@
   int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
   int8x16x2_t samples_LUT;
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(y_step_q4 == 16);
 
   (void)x0_q4;
@@ -1070,7 +1260,7 @@
   if (w == 4) {
     const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
     int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
-    int32x4_t d0, d1, d2, d3;
+    int16x4_t d0, d1, d2, d3;
     uint8x8_t d01, d23, dd01, dd23;
 
     load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
@@ -1122,8 +1312,8 @@
       d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
       d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
       d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
-      d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
       dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
       dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
@@ -1144,7 +1334,7 @@
       src += 4 * src_stride;
       dst += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   } else {
     const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
     int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
@@ -1251,11 +1441,11 @@
         s += 4 * src_stride;
         d += 4 * dst_stride;
         height -= 4;
-      } while (height > 0);
+      } while (height != 0);
       src += 8;
       dst += 8;
       w -= 8;
-    } while (w > 0);
+    } while (w != 0);
   }
 }
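On the samples_LUT pattern in these vertical kernels: sample rows are
held in transposed 16-byte blocks of four rows, and each iteration
derives the intermediate windows (s4567, s5678, s6789) from the
previous block plus the newly loaded one with vqtbl2q lookups instead
of re-transposing. A scalar model of that splice, assuming the
four-rows-per-16-byte-block layout used with the merge tables in this
file:

    #include <stdint.h>

    /* Build the three intermediate 16-byte windows from an old block
     * (rows r .. r+3) and a new block (rows r+4 .. r+7), sliding by
     * whole 4-byte rows. */
    static void merge_windows(const uint8_t old4[16], const uint8_t new4[16],
                              uint8_t out[3][16]) {
      int shift, i;
      for (shift = 1; shift <= 3; ++shift) {
        for (i = 0; i < 16; ++i) {
          const int j = i + 4 * shift;
          out[shift - 1][i] = (j < 16) ? old4[j] : new4[j - 16];
        }
      }
    }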
 
@@ -1273,8 +1463,8 @@
   const int16x8_t filters = vld1q_s16(filter[x0_q4]);
   uint8x8_t t0, t1, t2, t3;
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(x_step_q4 == 16);
 
   (void)x_step_q4;
@@ -1286,25 +1476,22 @@
   if (h == 4) {
     uint8x8_t d01, d23;
     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-    int16x8_t tt0, tt1, tt2, tt3;
 
     __builtin_prefetch(src + 0 * src_stride);
     __builtin_prefetch(src + 1 * src_stride);
     __builtin_prefetch(src + 2 * src_stride);
     __builtin_prefetch(src + 3 * src_stride);
+
     load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
     transpose_u8_8x4(&t0, &t1, &t2, &t3);
-    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-    s0 = vget_low_s16(tt0);
-    s1 = vget_low_s16(tt1);
-    s2 = vget_low_s16(tt2);
-    s3 = vget_low_s16(tt3);
-    s4 = vget_high_s16(tt0);
-    s5 = vget_high_s16(tt1);
-    s6 = vget_high_s16(tt2);
+    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
     __builtin_prefetch(dst + 0 * dst_stride);
     __builtin_prefetch(dst + 1 * dst_stride);
     __builtin_prefetch(dst + 2 * dst_stride);
@@ -1314,32 +1501,22 @@
     do {
       load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
       transpose_u8_8x4(&t0, &t1, &t2, &t3);
-      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s7 = vget_low_s16(tt0);
-      s8 = vget_low_s16(tt1);
-      s9 = vget_low_s16(tt2);
-      s10 = vget_low_s16(tt3);
+      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
 
       d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
       d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
       d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
       d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
       transpose_u8_4x4(&d01, &d23);
 
-      vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),
-                    vreinterpret_u32_u8(d01), 0);
-      vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),
-                    vreinterpret_u32_u8(d23), 0);
-      vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),
-                    vreinterpret_u32_u8(d01), 1);
-      vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),
-                    vreinterpret_u32_u8(d23), 1);
+      store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01);
+      store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23);
 
       s0 = s4;
       s1 = s5;
@@ -1355,7 +1532,7 @@
   } else {
     int width;
     const uint8_t *s;
-    uint8x8_t t4, t5, t6, t7;
+    uint8x8_t t4, t5, t6, t7, d04, d15, d26, d37;
     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
 
     if (w == 4) {
@@ -1395,32 +1572,24 @@
         __builtin_prefetch(src + 5 * src_stride);
         __builtin_prefetch(src + 6 * src_stride);
         __builtin_prefetch(src + 7 * src_stride);
-        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+        d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+        d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+        d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+        d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
 
-        transpose_u8_8x4(&t0, &t1, &t2, &t3);
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1);
-        dst += dst_stride;
+        transpose_u8_8x4(&d04, &d15, &d26, &d37);
+
+        store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04);
+        store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15);
+        store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
+        store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);
+
+        dst += 8 * dst_stride;
         h -= 8;
       } while (h > 0);
     } else {
       uint8_t *d;
+      uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
       int16x8_t s11, s12, s13, s14;
 
       do {
@@ -1466,17 +1635,18 @@
           s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
           s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
 
-          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-          t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
-          t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
-          t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
-          t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
+          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+          d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+          d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+          d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+          d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
 
-          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-          store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
+          transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+          store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
 
           s0 = s8;
           s1 = s9;
@@ -1505,8 +1675,8 @@
   const int16x8_t filters = vld1q_s16(filter[x0_q4]);
   uint8x8_t t0, t1, t2, t3;
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(x_step_q4 == 16);
 
   (void)x_step_q4;
@@ -1516,10 +1686,8 @@
   src -= 3;
 
   if (h == 4) {
-    uint8x8_t d01, d23;
+    uint8x8_t d01, d23, dd01, dd23;
     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-    int16x8_t tt0, tt1, tt2, tt3;
-    uint32x4_t d0123 = vdupq_n_u32(0);
 
     __builtin_prefetch(src + 0 * src_stride);
     __builtin_prefetch(src + 1 * src_stride);
@@ -1527,17 +1695,14 @@
     __builtin_prefetch(src + 3 * src_stride);
     load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
     transpose_u8_8x4(&t0, &t1, &t2, &t3);
-    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-    s0 = vget_low_s16(tt0);
-    s1 = vget_low_s16(tt1);
-    s2 = vget_low_s16(tt2);
-    s3 = vget_low_s16(tt3);
-    s4 = vget_high_s16(tt0);
-    s5 = vget_high_s16(tt1);
-    s6 = vget_high_s16(tt2);
+    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
     __builtin_prefetch(dst + 0 * dst_stride);
     __builtin_prefetch(dst + 1 * dst_stride);
     __builtin_prefetch(dst + 2 * dst_stride);
@@ -1547,35 +1712,28 @@
     do {
       load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
       transpose_u8_8x4(&t0, &t1, &t2, &t3);
-      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s7 = vget_low_s16(tt0);
-      s8 = vget_low_s16(tt1);
-      s9 = vget_low_s16(tt2);
-      s10 = vget_low_s16(tt3);
+      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
 
       d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
       d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
       d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
       d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
       transpose_u8_4x4(&d01, &d23);
 
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
-      d0123 = vreinterpretq_u32_u8(
-          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));
+      dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride);
+      dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride);
 
-      vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
-      vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
-      vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
-      vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01);
+      store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23);
 
       s0 = s4;
       s1 = s5;
@@ -1595,8 +1753,8 @@
     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
 
     if (w == 4) {
-      uint32x4_t d0415 = vdupq_n_u32(0);
-      uint32x4_t d2637 = vdupq_n_u32(0);
+      uint8x8_t d04, d15, d26, d37, dd04, dd15, dd26, dd37;
+
       do {
         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -1633,48 +1791,35 @@
         __builtin_prefetch(src + 5 * src_stride);
         __builtin_prefetch(src + 6 * src_stride);
         __builtin_prefetch(src + 7 * src_stride);
-        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+        d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+        d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+        d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+        d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
 
-        transpose_u8_8x4(&t0, &t1, &t2, &t3);
+        transpose_u8_8x4(&d04, &d15, &d26, &d37);
 
-        d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0);
-        d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2);
-        d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0);
-        d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2);
-        d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1);
-        d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3);
-        d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1);
-        d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3);
-        d0415 = vreinterpretq_u32_u8(
-            vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1)));
-        d2637 = vreinterpretq_u32_u8(
-            vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3)));
+        dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride);
+        dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride);
+        dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride);
+        dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride);
 
-        vst1q_lane_u32((uint32_t *)dst, d0415, 0);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d0415, 2);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d2637, 0);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d2637, 2);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d0415, 1);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d0415, 3);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d2637, 1);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d2637, 3);
-        dst += dst_stride;
+        d04 = vrhadd_u8(d04, dd04);
+        d15 = vrhadd_u8(d15, dd15);
+        d26 = vrhadd_u8(d26, dd26);
+        d37 = vrhadd_u8(d37, dd37);
+
+        store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04);
+        store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15);
+        store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
+        store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);
+
+        dst += 8 * dst_stride;
         h -= 8;
-      } while (h > 0);
+      } while (h != 0);
     } else {
       uint8_t *d;
+      uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
       int16x8_t s11, s12, s13, s14;
-      uint8x16_t d01, d23, d45, d67;
 
       do {
         __builtin_prefetch(src + 0 * src_stride);
@@ -1719,33 +1864,27 @@
           s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
           s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
 
-          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-          t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
-          t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
-          t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
-          t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
+          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+          d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+          d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+          d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+          d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
 
-          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+          transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
 
-          d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
-                            vld1_u8(d + 1 * dst_stride));
-          d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
-                            vld1_u8(d + 3 * dst_stride));
-          d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride),
-                            vld1_u8(d + 5 * dst_stride));
-          d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride),
-                            vld1_u8(d + 7 * dst_stride));
-          d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1));
-          d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3));
-          d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5));
-          d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7));
+          d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride));
+          d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride));
+          d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride));
+          d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride));
+          d4 = vrhadd_u8(d4, vld1_u8(d + 4 * dst_stride));
+          d5 = vrhadd_u8(d5, vld1_u8(d + 5 * dst_stride));
+          d6 = vrhadd_u8(d6, vld1_u8(d + 6 * dst_stride));
+          d7 = vrhadd_u8(d7, vld1_u8(d + 7 * dst_stride));
 
-          store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01),
-                       vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45),
-                       vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67));
+          store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
 
           s0 = s8;
           s1 = s9;
@@ -1761,7 +1900,7 @@
         src += 8 * src_stride;
         dst += 8 * dst_stride;
         h -= 8;
-      } while (h > 0);
+      } while (h != 0);
     }
   }
 }
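Beyond brevity, replacing the vst1_lane_u32 / vld1q_lane_u32 sequences
with load_u8/store_u8 matters for correctness: the old form cast
uint8_t * to uint32_t *, which assumes 4-byte alignment (asserted here,
but fragile) and breaks strict aliasing. The real helpers live in
vpx_dsp/arm/mem_neon.h; a minimal stand-in showing the idea:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>
    #include <arm_neon.h>

    /* Store the two 4-byte halves of an 8-byte vector to two rows
     * without casting the byte pointer to uint32_t *. */
    static void store_u8_4x2(uint8_t *dst, ptrdiff_t stride, uint8x8_t v) {
      const uint32_t lo = vget_lane_u32(vreinterpret_u32_u8(v), 0);
      const uint32_t hi = vget_lane_u32(vreinterpret_u32_u8(v), 1);
      memcpy(dst, &lo, 4);          /* row 0 */
      memcpy(dst + stride, &hi, 4); /* row 1 */
    }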
@@ -1773,8 +1912,8 @@
                              int h) {
   const int16x8_t filters = vld1q_s16(filter[y0_q4]);
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(y_step_q4 == 16);
 
   (void)x0_q4;
@@ -1784,33 +1923,26 @@
   src -= 3 * src_stride;
 
   if (w == 4) {
-    uint8x8_t d01, d23;
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23;
     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
 
-    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
+    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+    src += 7 * src_stride;
 
     do {
-      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
+      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
 
       __builtin_prefetch(dst + 0 * dst_stride);
       __builtin_prefetch(dst + 1 * dst_stride);
@@ -1820,21 +1952,16 @@
       __builtin_prefetch(src + 1 * src_stride);
       __builtin_prefetch(src + 2 * src_stride);
       __builtin_prefetch(src + 3 * src_stride);
+
       d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
       d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
       d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
       d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
-      dst += dst_stride;
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
-      dst += dst_stride;
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
-      dst += dst_stride;
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
-      dst += dst_stride;
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
 
       s0 = s4;
       s1 = s5;
@@ -1843,13 +1970,15 @@
       s4 = s8;
       s5 = s9;
       s6 = s10;
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
       h -= 4;
     } while (h != 0);
   } else {
     int height;
     const uint8_t *s;
     uint8_t *d;
-    uint8x8_t t0, t1, t2, t3;
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
 
     do {
@@ -1860,33 +1989,26 @@
       __builtin_prefetch(src + 4 * src_stride);
       __builtin_prefetch(src + 5 * src_stride);
       __builtin_prefetch(src + 6 * src_stride);
-      s = src;
-      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
+
+      load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      s = src + 7 * src_stride;
       d = dst;
       height = h;
 
       do {
-        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
+        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
 
         __builtin_prefetch(d + 0 * dst_stride);
         __builtin_prefetch(d + 1 * dst_stride);
@@ -1896,19 +2018,13 @@
         __builtin_prefetch(s + 1 * src_stride);
         __builtin_prefetch(s + 2 * src_stride);
         __builtin_prefetch(s + 3 * src_stride);
-        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
 
-        vst1_u8(d, t0);
-        d += dst_stride;
-        vst1_u8(d, t1);
-        d += dst_stride;
-        vst1_u8(d, t2);
-        d += dst_stride;
-        vst1_u8(d, t3);
-        d += dst_stride;
+        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s0 = s4;
         s1 = s5;
@@ -1917,6 +2033,8 @@
         s4 = s8;
         s5 = s9;
         s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
         height -= 4;
       } while (height != 0);
       src += 8;
@@ -1933,8 +2051,8 @@
                                  int h) {
   const int16x8_t filters = vld1q_s16(filter[y0_q4]);
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(y_step_q4 == 16);
 
   (void)x0_q4;
@@ -1944,34 +2062,26 @@
   src -= 3 * src_stride;
 
   if (w == 4) {
-    uint8x8_t d01, d23;
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23, dd01, dd23;
     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-    uint32x4_t d0123 = vdupq_n_u32(0);
 
-    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
+    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+    src += 7 * src_stride;
 
     do {
-      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
+      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
 
       __builtin_prefetch(dst + 0 * dst_stride);
       __builtin_prefetch(dst + 1 * dst_stride);
@@ -1981,29 +2091,22 @@
       __builtin_prefetch(src + 1 * src_stride);
       __builtin_prefetch(src + 2 * src_stride);
       __builtin_prefetch(src + 3 * src_stride);
+
       d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
       d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
       d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
       d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
+      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
 
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
-      d0123 = vreinterpretq_u32_u8(
-          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
 
-      vst1q_lane_u32((uint32_t *)dst, d0123, 0);
-      dst += dst_stride;
-      vst1q_lane_u32((uint32_t *)dst, d0123, 1);
-      dst += dst_stride;
-      vst1q_lane_u32((uint32_t *)dst, d0123, 2);
-      dst += dst_stride;
-      vst1q_lane_u32((uint32_t *)dst, d0123, 3);
-      dst += dst_stride;
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
 
       s0 = s4;
       s1 = s5;
@@ -2012,14 +2115,15 @@
       s4 = s8;
       s5 = s9;
       s6 = s10;
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
       h -= 4;
     } while (h != 0);
   } else {
     int height;
     const uint8_t *s;
     uint8_t *d;
-    uint8x8_t t0, t1, t2, t3;
-    uint8x16_t d01, d23, dd01, dd23;
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
 
     do {
@@ -2030,33 +2134,26 @@
       __builtin_prefetch(src + 4 * src_stride);
       __builtin_prefetch(src + 5 * src_stride);
       __builtin_prefetch(src + 6 * src_stride);
-      s = src;
-      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
+
+      load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      s = src + 7 * src_stride;
       d = dst;
       height = h;
 
       do {
-        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
+        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
 
         __builtin_prefetch(d + 0 * dst_stride);
         __builtin_prefetch(d + 1 * dst_stride);
@@ -2066,28 +2163,18 @@
         __builtin_prefetch(s + 1 * src_stride);
         __builtin_prefetch(s + 2 * src_stride);
         __builtin_prefetch(s + 3 * src_stride);
-        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
 
-        d01 = vcombine_u8(t0, t1);
-        d23 = vcombine_u8(t2, t3);
-        dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
-                           vld1_u8(d + 1 * dst_stride));
-        dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
-                           vld1_u8(d + 3 * dst_stride));
-        dd01 = vrhaddq_u8(dd01, d01);
-        dd23 = vrhaddq_u8(dd23, d23);
+        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
 
-        vst1_u8(d, vget_low_u8(dd01));
-        d += dst_stride;
-        vst1_u8(d, vget_high_u8(dd01));
-        d += dst_stride;
-        vst1_u8(d, vget_low_u8(dd23));
-        d += dst_stride;
-        vst1_u8(d, vget_high_u8(dd23));
-        d += dst_stride;
+        d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride));
+        d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride));
+        d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride));
+        d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride));
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s0 = s4;
         s1 = s5;
@@ -2097,6 +2184,8 @@
         s5 = s9;
         s6 = s10;
         height -= 4;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
       } while (height != 0);
       src += 8;
       dst += 8;
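
For reference, the rounding behaviour the rewritten averaging loops rely on
can be modelled in scalar C. This is a sketch, not code from the patch:
vqrshrun_n_s16(x, FILTER_BITS) rounds, shifts by FILTER_BITS (7) and
saturates to u8, and vrhadd_u8 is a rounding halving add against the
existing dst pixel.

    /* Scalar model of one vpx_convolve8_avg_vert output pixel (sketch). */
    static uint8_t clamp8(int v) {
      return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    static uint8_t avg_vert_pixel(const uint8_t *s, int stride,
                                  const int16_t *filter, uint8_t dst) {
      int k, sum = 0;
      for (k = 0; k < 8; ++k) sum += filter[k] * s[k * stride];
      {
        /* vqrshrun_n_s16(sum, FILTER_BITS), with FILTER_BITS == 7. */
        const uint8_t d = clamp8((sum + 64) >> 7);
        /* vrhadd_u8(d, dst). */
        return (uint8_t)((d + dst + 1) >> 1);
      }
    }
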
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h
index 07cf824..2f78583 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon.h
+++ b/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -15,10 +15,20 @@
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_filter.h"
+
+#if VPX_ARCH_AARCH64 && \
+    (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
+void vpx_convolve8_2d_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h);
+#endif
 
 #if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
 
-static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
+static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
                                                  const int8x16_t samples_hi,
                                                  const int32x4_t correction,
                                                  const int8x8_t filters) {
@@ -29,11 +39,11 @@
   sum = vdotq_lane_s32(correction, samples_lo, filters, 0);
   sum = vdotq_lane_s32(sum, samples_hi, filters, 1);
 
-  /* Narrowing and packing is performed by the caller. */
-  return sum;
+  /* Further narrowing and packing is performed by the caller. */
+  return vqmovn_s32(sum);
 }
 
-static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples,
+static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples,
                                          const int8x8_t filters,
                                          const int32x4_t correction,
                                          const uint8x16_t range_limit,
@@ -54,8 +64,8 @@
   sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
   sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
 
-  /* Narrowing and packing is performed by the caller. */
-  return sum;
+  /* Further narrowing and packing is performed by the caller. */
+  return vqmovn_s32(sum);
 }
 
 static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
@@ -78,7 +88,7 @@
 
   /* Narrow and re-pack. */
   sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, 7);
+  return vqrshrun_n_s16(sum, FILTER_BITS);
 }
 
 static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
@@ -111,14 +121,14 @@
 
   /* Narrow and re-pack. */
   sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, 7);
+  return vqrshrun_n_s16(sum, FILTER_BITS);
 }
 
 #endif  // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
 
 #if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
 
-static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
+static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
                                                   const uint8x16_t samples_hi,
                                                   const int8x8_t filters) {
   /* Sample permutation is performed by the caller. */
@@ -127,11 +137,11 @@
   sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
   sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
 
-  /* Narrowing and packing is performed by the caller. */
-  return sum;
+  /* Further narrowing and packing is performed by the caller. */
+  return vqmovn_s32(sum);
 }
 
-static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples,
+static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples,
                                           const int8x8_t filters,
                                           const uint8x16x2_t permute_tbl) {
   uint8x16_t permuted_samples[2];
@@ -147,8 +157,8 @@
   sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
   sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
 
-  /* Narrowing and packing is performed by the caller. */
-  return sum;
+  /* Further narrowing and packing is performed by the caller. */
+  return vqmovn_s32(sum);
 }
 
 static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
@@ -169,7 +179,7 @@
 
   /* Narrow and re-pack. */
   sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, 7);
+  return vqrshrun_n_s16(sum, FILTER_BITS);
 }
 
 static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
@@ -196,7 +206,7 @@
 
   /* Narrow and re-pack. */
   sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, 7);
+  return vqrshrun_n_s16(sum, FILTER_BITS);
 }
 
 #endif  // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
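
The reason the DOTPROD helpers above carry a correction and range_limit
while the I8MM helpers do not: sdot multiplies signed i8 by signed i8, so
the unsigned pixels are first biased down by 128, and the constant
128 * (f0 + ... + f7) is folded back in as the correction; usdot multiplies
unsigned i8 by signed i8 directly, so its accumulator simply starts at zero.
A scalar sketch of the identity:

    #include <stdint.h>

    /* sum((p[k] - 128) * f[k]) + 128 * sum(f[k]) == sum(p[k] * f[k]) */
    static int32_t sdot_model(const uint8_t *p, const int8_t *f) {
      int32_t sum = 0, correction = 0;
      int k;
      for (k = 0; k < 8; ++k) {
        sum += (int8_t)(p[k] - 128) * f[k]; /* biased, signed samples */
        correction += 128 * f[k];
      }
      return sum + correction;
    }
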
@@ -238,7 +248,7 @@
   sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
   sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3));
   sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0));
-  return vqrshrun_n_s16(sum, 7);
+  return vqrshrun_n_s16(sum, FILTER_BITS);
 }
 
 static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
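
With the 4-wide helpers now returning int16x4_t (the 32-to-16-bit narrow via
vqmovn_s32 moved inside), a caller pairs two results and applies the single
rounding 16-to-8-bit narrow itself. A minimal sketch of that call-side
pattern, with illustrative names:

    #include <arm_neon.h>

    #define FILTER_BITS 7 /* as in vpx_dsp/vpx_filter.h */

    /* Pair two 4-wide partial convolve results into one uint8x8_t. */
    static inline uint8x8_t pack_two_4wide(int16x4_t d0, int16x4_t d1) {
      return vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
    }
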
diff --git a/vpx_dsp/arm/vpx_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c
index 830f317..f7db3e6 100644
--- a/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_neon.c
@@ -14,6 +14,57 @@
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
 
+#if VPX_ARCH_AARCH64 && \
+    (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                        ptrdiff_t dst_stride, const InterpKernel *filter,
+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                        int w, int h) {
+  /* Given our constraints (w <= 64, h <= 64, taps == 8), we can reduce the
+   * maximum buffer size to 64 * (64 + 7). */
+  uint8_t temp[64 * 71];
+
+  /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
+  const int intermediate_height = h + 7;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  /* Filter starting 3 lines back. */
+  vpx_convolve8_2d_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
+                              x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+                              intermediate_height);
+
+  /* Step into the temp buffer 3 lines to get the actual frame data. */
+  vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
+                          x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h) {
+  uint8_t temp[64 * 71];
+  const int intermediate_height = h + 7;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  vpx_convolve8_2d_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
+                              x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+                              intermediate_height);
+
+  vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+#else  // !(VPX_ARCH_AARCH64 &&
+       //   (defined(__ARM_FEATURE_DOTPROD) ||
+       //    defined(__ARM_FEATURE_MATMUL_INT8)))
+
 void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const InterpKernel *filter,
                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
@@ -63,3 +114,7 @@
   vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
                               x_step_q4, y0_q4, y_step_q4, w, h);
 }
+
+#endif  // #if VPX_ARCH_AARCH64 &&
+        //     (defined(__ARM_FEATURE_DOTPROD) ||
+        //      defined(__ARM_FEATURE_MATMUL_INT8))
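
The temp-buffer geometry used in both wrappers above follows from the 8-tap
filters: the vertical pass needs 3 rows above and 4 rows below each output
row, hence intermediate_height = h + 7, a horizontal pass starting at
src - 3 * src_stride, and a vertical pass reading from temp + 3 * w. A
compile-time sketch of the size bound:

    #define SUBPEL_TAPS 8 /* as in vpx_dsp/vpx_filter.h */

    /* Largest block is 64x64: 64 * (64 + SUBPEL_TAPS - 1) == 4544 bytes,
     * which exactly fits the uint8_t temp[64 * 71] declared above. */
    typedef char temp_fits[64 * (64 + SUBPEL_TAPS - 1) <= 64 * 71 ? 1 : -1];
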
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index 619d7aa..2a4c81d 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -40,7 +40,7 @@
   unsigned int vpx_sad##m##x##n##_avg_c(                                      \
       const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
       int ref_stride, const uint8_t *second_pred) {                           \
-    DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]);                           \
+    DECLARE_ALIGNED(32, uint8_t, comp_pred[m * n]);                           \
     vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride);   \
     return sad(src_ptr, src_stride, comp_pred, m, m, n);                      \
   }                                                                           \
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index ce1e838..a6793ef 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -156,7 +156,7 @@
       const uint8_t *second_pred) {                                          \
     uint16_t fdata3[(H + 1) * W];                                            \
     uint8_t temp2[H * W];                                                    \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                              \
+    DECLARE_ALIGNED(32, uint8_t, temp3[H * W]);                              \
                                                                              \
     var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \
                                       W, bilinear_filters[x_offset]);        \
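
Both alignment bumps above (sad.c and variance.c) exist because these
buffers are handed to vpx_comp_avg_pred, whose new AVX2 kernel stores with
the aligned _mm256_store_si256, which faults on addresses that are not
32-byte aligned. DECLARE_ALIGNED in vpx_ports/mem.h maps to the compiler's
alignment attribute; a rough sketch of the idea, with a hypothetical macro
name:

    #include <stdint.h>

    #if defined(_MSC_VER)
    #define ALIGNED_SKETCH(n, typ, val) __declspec(align(n)) typ val
    #else
    #define ALIGNED_SKETCH(n, typ, val) typ val __attribute__((aligned(n)))
    #endif

    /* 32-byte aligned scratch, safe for 256-bit aligned loads/stores. */
    ALIGNED_SKETCH(32, uint8_t, comp_pred_scratch[64 * 64]);
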
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 67d3fb0..04969f3 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -424,6 +424,7 @@
 DSP_SRCS-$(HAVE_MMI)    += mips/variance_mmi.c
 
 DSP_SRCS-$(HAVE_SSE2)   += x86/avg_pred_sse2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/avg_pred_avx2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c  # Contains SSE2 and SSSE3
 DSP_SRCS-$(HAVE_AVX2)   += x86/variance_avx2.c
 DSP_SRCS-$(HAVE_VSX)    += ppc/variance_vsx.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index cae4ca81..f20f4e0 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1321,7 +1321,7 @@
   specialize qw/vpx_get4x4sse_cs neon msa vsx/;
 
 add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
-  specialize qw/vpx_comp_avg_pred neon sse2 vsx lsx/;
+  specialize qw/vpx_comp_avg_pred neon sse2 avx2 vsx lsx/;
 
 #
 # Subpixel Variance
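
The AVX2 kernel added below must match the C reference bit-exactly. The
reference computes a per-byte rounding average, (a + b + 1) >> 1, which is
precisely what _mm256_avg_epu8 computes; a scalar sketch of that behaviour:

    #include <stdint.h>

    /* Sketch of the scalar behaviour vpx_comp_avg_pred_avx2 reproduces. */
    static void comp_avg_pred_scalar(uint8_t *comp_pred, const uint8_t *pred,
                                     int width, int height, const uint8_t *ref,
                                     int ref_stride) {
      int i, j;
      for (i = 0; i < height; ++i) {
        for (j = 0; j < width; ++j)
          comp_pred[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1);
        comp_pred += width;
        pred += width;
        ref += ref_stride;
      }
    }
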
diff --git a/vpx_dsp/x86/avg_pred_avx2.c b/vpx_dsp/x86/avg_pred_avx2.c
new file mode 100644
index 0000000..f435799
--- /dev/null
+++ b/vpx_dsp/x86/avg_pred_avx2.c
@@ -0,0 +1,111 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
+                            int height, const uint8_t *ref, int ref_stride) {
+  int row = 0;
+  // comp_pred and pred must be 32-byte aligned.
+  assert(((intptr_t)comp_pred % 32) == 0);
+  assert(((intptr_t)pred % 32) == 0);
+
+  if (width == 8) {
+    assert(height % 4 == 0);
+    do {
+      const __m256i p = _mm256_load_si256((const __m256i *)pred);
+      const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref);
+      const __m128i r_1 =
+          _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride));
+
+      const __m128i r1 = _mm_castps_si128(_mm_loadh_pi(
+          _mm_castsi128_ps(r_0), (const __m64 *)(ref + ref_stride)));
+      const __m128i r2 = _mm_castps_si128(_mm_loadh_pi(
+          _mm_castsi128_ps(r_1), (const __m64 *)(ref + 3 * ref_stride)));
+
+      const __m256i ref_0123 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(r1), r2, 1);
+      const __m256i avg = _mm256_avg_epu8(p, ref_0123);
+
+      _mm256_store_si256((__m256i *)comp_pred, avg);
+
+      row += 4;
+      pred += 32;
+      comp_pred += 32;
+      ref += 4 * ref_stride;
+    } while (row < height);
+  } else if (width == 16) {
+    assert(height % 4 == 0);
+    do {
+      const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred);
+      const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32));
+      const __m256i tmp0 =
+          _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)ref));
+      const __m256i ref_0 = _mm256_inserti128_si256(
+          tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1);
+      const __m256i tmp1 = _mm256_castsi128_si256(
+          _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride)));
+      const __m256i ref_1 = _mm256_inserti128_si256(
+          tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1);
+      const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+      const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+      _mm256_store_si256((__m256i *)comp_pred, average_0);
+      _mm256_store_si256((__m256i *)(comp_pred + 32), average_1);
+
+      row += 4;
+      pred += 64;
+      comp_pred += 64;
+      ref += 4 * ref_stride;
+    } while (row < height);
+  } else if (width == 32) {
+    assert(height % 2 == 0);
+    do {
+      const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred);
+      const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32));
+      const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)ref);
+      const __m256i ref_1 =
+          _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+      const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+      const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+      _mm256_store_si256((__m256i *)comp_pred, average_0);
+      _mm256_store_si256((__m256i *)(comp_pred + 32), average_1);
+
+      row += 2;
+      pred += 64;
+      comp_pred += 64;
+      ref += 2 * ref_stride;
+    } while (row < height);
+  } else if (width % 64 == 0) {
+    do {
+      int x;
+      for (x = 0; x < width; x += 64) {
+        const __m256i pred_0 = _mm256_load_si256((const __m256i *)(pred + x));
+        const __m256i pred_1 =
+            _mm256_load_si256((const __m256i *)(pred + x + 32));
+        const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x));
+        const __m256i ref_1 =
+            _mm256_loadu_si256((const __m256i *)(ref + x + 32));
+        const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+        const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+        _mm256_store_si256((__m256i *)(comp_pred + x), average_0);
+        _mm256_store_si256((__m256i *)(comp_pred + x + 32), average_1);
+      }
+      row++;
+      pred += width;
+      comp_pred += width;
+      ref += ref_stride;
+    } while (row < height);
+  } else {
+    vpx_comp_avg_pred_sse2(comp_pred, pred, width, height, ref, ref_stride);
+  }
+}
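
The next diff drops the file-local xx_loadu2_mi128(hi, lo) helper in favor
of the shared mm256_loadu2_si128 and mm256_store2_si128, swapping the call
sites to the (lo, hi) argument order. A sketch of the behaviour those
helpers are expected to have, with signatures inferred from the rewritten
call sites rather than quoted from a header:

    #include <immintrin.h>

    /* Low 128-bit lane from *lo, high lane from *hi. */
    static inline __m256i loadu2_si128_sketch(const void *lo, const void *hi) {
      const __m256i a =
          _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo));
      return _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)hi),
                                     1);
    }

    /* Low lane to *lo, high lane to *hi (aligned stores). */
    static inline void store2_si128_sketch(__m128i *lo, __m128i *hi,
                                           const __m256i *a) {
      _mm_store_si128(lo, _mm256_castsi256_si128(*a));
      _mm_store_si128(hi, _mm256_extractf128_si256(*a, 1));
    }
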
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 2498bba..526c283 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -46,7 +46,7 @@
 };
 
 #define CALC_CONVOLVE8_HORZ_ROW                                               \
-  srcReg = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3);             \
+  srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch);          \
   s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]);                               \
   s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]);                               \
   s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]);                               \
@@ -60,16 +60,6 @@
                    _mm256_extractf128_si256(s1[0], 1));                       \
   output_ptr += output_pitch;
 
-// 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0
-static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
-  // 0 0 0 0 0 0 0 0 | 0 0 0 0 lo3 lo2 lo1 lo0
-  __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
-
-  // 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0
-  a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
-  return a;
-}
-
 static INLINE void vpx_filter_block1d16_h8_x_avx2(
     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter,
@@ -93,12 +83,7 @@
     __m256i srcReg;
 
     // load the 2 strides of source
-    srcReg =
-        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
-    srcReg = _mm256_inserti128_si256(
-        srcReg,
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
-        1);
+    srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr + src_pixels_per_line - 3);
 
     // filter the source buffer
     s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
@@ -109,12 +94,7 @@
 
     // reading 2 strides of the next 16 bytes
     // (part of it was being read by earlier read)
-    srcReg =
-        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
-    srcReg = _mm256_inserti128_si256(
-        srcReg,
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
-        1);
+    srcReg = mm256_loadu2_si128(src_ptr + 5, src_ptr + src_pixels_per_line + 5);
 
     // filter the source buffer
     s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
@@ -129,60 +109,37 @@
 
     src_ptr += src_stride;
 
-    // average if necessary
-    outReg1 = _mm256_castsi256_si128(outReg32b1);
-    outReg2 = _mm256_extractf128_si256(outReg32b1, 1);
     if (avg) {
-      outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
-      outReg2 = _mm_avg_epu8(
-          outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch)));
+      const __m256i outReg = mm256_loadu2_si128(
+          (__m128i *)output_ptr, (__m128i *)(output_ptr + output_pitch));
+      outReg32b1 = _mm256_avg_epu8(outReg32b1, outReg);
     }
-
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, outReg1);
-
-    // save the next 16 bits
-    _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2);
-
+    mm256_store2_si128((__m128i *)output_ptr,
+                       (__m128i *)(output_ptr + output_pitch), &outReg32b1);
     output_ptr += dst_stride;
   }
 
   // if the number of strides is odd.
   // process only 16 bytes
   if (i > 0) {
-    __m128i srcReg;
-
-    // load the first 16 bytes of the last row
-    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+    const __m128i srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+    const __m128i srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+    const __m256i srcReg =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(srcReg1), srcReg2, 1);
 
     // filter the source buffer
-    s[0] = _mm256_castsi128_si256(
-        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
-    s[1] = _mm256_castsi128_si256(
-        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
-    s[2] = _mm256_castsi128_si256(
-        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
-    s[3] = _mm256_castsi128_si256(
-        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
-    outReg1 = convolve8_8_avx2(s, f);
+    s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
+    s[1] = _mm256_shuffle_epi8(srcReg, filt[1]);
+    s[2] = _mm256_shuffle_epi8(srcReg, filt[2]);
+    s[3] = _mm256_shuffle_epi8(srcReg, filt[3]);
 
-    // reading the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+        // The low and high 128-bit lanes contain the first and second
+        // convolve results, respectively.
+    outReg32b1 = convolve8_16_avx2(s, f);
+    outReg1 = _mm256_castsi256_si128(outReg32b1);
+    outReg2 = _mm256_extractf128_si256(outReg32b1, 1);
 
-    // filter the source buffer
-    s[0] = _mm256_castsi128_si256(
-        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
-    s[1] = _mm256_castsi128_si256(
-        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
-    s[2] = _mm256_castsi128_si256(
-        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
-    s[3] = _mm256_castsi128_si256(
-        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
-    outReg2 = convolve8_8_avx2(s, f);
-
-    // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
-    // contain the first and second convolve result respectively
+    // shrink each 16-bit value to 8 bits
     outReg1 = _mm_packus_epi16(outReg1, outReg2);
 
     // average if necessary
@@ -266,7 +223,6 @@
     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
     ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter,
     const int avg) {
-  __m128i outReg1, outReg2;
   __m256i srcRegHead1;
   unsigned int i;
   ptrdiff_t src_stride, dst_stride;
@@ -345,19 +301,14 @@
     src_ptr += src_stride;
 
     // average if necessary
-    outReg1 = _mm256_castsi256_si128(s1[0]);
-    outReg2 = _mm256_extractf128_si256(s1[0], 1);
     if (avg) {
-      outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
-      outReg2 = _mm_avg_epu8(
-          outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch)));
+      const __m256i outReg = mm256_loadu2_si128(
+          (__m128i *)output_ptr, (__m128i *)(output_ptr + out_pitch));
+      s1[0] = _mm256_avg_epu8(s1[0], outReg);
     }
 
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, outReg1);
-
-    // save the next 16 bits
-    _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2);
+    mm256_store2_si128((__m128i *)output_ptr,
+                       (__m128i *)(output_ptr + out_pitch), s1);
 
     output_ptr += dst_stride;
 
@@ -1094,7 +1045,7 @@
       // load the 2 strides of source
       // r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07
       // r06 r05 r04 r03 r02 r01 r00
-      srcReg32b1 = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3);
+      srcReg32b1 = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch);
 
       // filter the source buffer
       // r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06
@@ -1188,8 +1139,7 @@
     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
     ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
   __m256i f[4], ss[4];
-  __m256i r[8];
-  __m128i r1[10];
+  __m256i r[9], rr[2];
   __m128i s[11];
 
   unsigned int y = output_height;
@@ -1210,48 +1160,35 @@
   s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
   s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
 
-  // R1-0 xxxx .. . . x| r13 r12 r11 r10 r03 r02 r01 r00
-  r1[0] = _mm_unpacklo_epi32(s[0], s[1]);
+  r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[2], 1);
+  r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[3], 1);
+  r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[4], 1);
+  r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[5], 1);
+  r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[6], 1);
 
-  // R2-1 xxxx .. . . x| r23 r22 r21 r20 r13 r12 r11 r10
-  r1[1] = _mm_unpacklo_epi32(s[1], s[2]);
+  // r37.....r24..r33..r31 r30 r23 r22 r21 r20|r17....r14 r07..r05 r04 r13 r12
+  // r11 r10 r03 r02 r01 r00
+  rr[0] = _mm256_unpacklo_epi32(r[0], r[1]);
 
-  // R3-2 xxxx .. . . x| r33 r32 r31 r30 r23 r22 r21 r20
-  r1[2] = _mm_unpacklo_epi32(s[2], s[3]);
-
-  // R4-3 xxxx .. . . x| r43 r42 r41 r40 r33 r32 r31 r30
-  r1[3] = _mm_unpacklo_epi32(s[3], s[4]);
-
-  // R5-4 xxxx .. . . x| r53 r52 r51 r50 r43 r42 r41 r40
-  r1[4] = _mm_unpacklo_epi32(s[4], s[5]);
-
-  // R6-5 xxxx .. . . x| r63 r62 r61 r60 r53 r52 r51 r50
-  r1[5] = _mm_unpacklo_epi32(s[5], s[6]);
-
-  // 00000000 r33 r32 r31 r30|r23 r22 r21 r20||00000000|r13 r12 r11 r10|r03 r02
-  // r01 r00
-  r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[0]), r1[2], 1);
-
-  // 00000000 r43 r42 r41 r40|r33 r32 r31 r30||00000000|r23 r22 r21 r20|r13 r12
-  // r11 r10
-  r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[1]), r1[3], 1);
-
-  // 00000000 r53 r52 r51 r50|r43 r42 r41 r40||00000000|r33 r32 r31 r30|r23 r22
-  // r21 r20
-  r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[2]), r1[4], 1);
-
-  // 00000000 r63 r62 r61 r60|r53 r52 r51 r50||00000000|r43 r42 r41 r40|r33 r32
-  // r31 r30
-  r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[3]), r1[5], 1);
+  // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22
+  // r21 r20 r13 r12 r11 r10
+  rr[1] = _mm256_unpacklo_epi32(r[1], r[2]);
 
   // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10
   // r00|
-  ss[0] = _mm256_unpacklo_epi8(r[0], r[1]);
+  ss[0] = _mm256_unpacklo_epi8(rr[0], rr[1]);
+
+  // r57.....r44..r53..r51 r50 r43 r42 r41 r40|r37....r34 r27..r25 r24 r33 r32
+  // r31 r30 r23 r22 r21 r20
+  rr[0] = _mm256_unpacklo_epi32(r[2], r[3]);
+
+  // r67.....r54..r63..r61 r60 r53 r52 r51 r50|r47....r44 r37..r35 r34 r43 r42
+  // r41 r40 r33 r32 r31 r30
+  rr[1] = _mm256_unpacklo_epi32(r[3], r[4]);
 
   // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30
   // r20|
-  ss[1] = _mm256_unpacklo_epi8(r[2], r[3]);
-
+  ss[1] = _mm256_unpacklo_epi8(rr[0], rr[1]);
   // Process 4 rows at a time
   while (y >= 4) {
     s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
@@ -1259,41 +1196,17 @@
     s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch));
     s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch));
 
-    // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60
-    r1[6] = _mm_unpacklo_epi32(s[6], s[7]);
+    r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[7], 1);
+    r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[8], 1);
+    rr[0] = _mm256_unpacklo_epi32(r[4], r[5]);
+    rr[1] = _mm256_unpacklo_epi32(r[5], r[6]);
+    ss[2] = _mm256_unpacklo_epi8(rr[0], rr[1]);
 
-    // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70
-    r1[7] = _mm_unpacklo_epi32(s[7], s[8]);
-
-    // R9-8 xxxx .. . . x| r93 r92 r91 r90 r83 r82 r81 r80
-    r1[8] = _mm_unpacklo_epi32(s[8], s[9]);
-
-    // R10-9 xxxx .. . . x| r10-3 r10-2 r10-1 r10-0 r93 r92 r91 r90
-    r1[9] = _mm_unpacklo_epi32(s[9], s[10]);
-
-    // 00000000 r73 r72 r71 r70|r63 r62 r61 r60||00000000|r53 r52 r51 r50|r43
-    // r42 r41 r40
-    r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[4]), r1[6], 1);
-
-    // 00000000 r83 r82 r81 r80|r73 r72 r71 r70||00000000|r63 r62 r61 r60|r53
-    // r52 r51 r50
-    r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[5]), r1[7], 1);
-
-    // 00000000 r93 r92 r91 r90|r83 r82 r81 r80||00000000|r73 r72 r71 r70|r63
-    // r62 r61 r60
-    r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[6]), r1[8], 1);
-
-    // 00000000 r10-3 r10-2 r10-1 r10-0|r93 r92 r91 r90||00000000|r83 r82 r81
-    // r80|r73 r72 r71 r70
-    r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[7]), r1[9], 1);
-
-    // r83 r73....r80 r70|r73 r63....r70 r60||r63 r53....r60 r50|r53 r43....r50
-    // r40|
-    ss[2] = _mm256_unpacklo_epi8(r[4], r[5]);
-
-    // r10-3 r10-3....r10-0 r10-0|r93 r83....r90 r80||r83 r73....r80 r70|r73
-    // r63....r70 r60|
-    ss[3] = _mm256_unpacklo_epi8(r[6], r[7]);
+    r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[9], 1);
+    r[8] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[8]), s[10], 1);
+    rr[0] = _mm256_unpacklo_epi32(r[6], r[7]);
+    rr[1] = _mm256_unpacklo_epi32(r[7], r[8]);
+    ss[3] = _mm256_unpacklo_epi8(rr[0], rr[1]);
 
     ss[0] = convolve8_16_avx2(ss, f);
 
@@ -1315,17 +1228,17 @@
     ss[1] = ss[3];
 
     s[6] = s[10];
+    s[5] = s[9];
 
-    r1[4] = r1[8];
-    r1[5] = r1[9];
-
+    r[4] = r[8];
     y -= 4;
   }
 
   // Process 2 rows
   if (y == 2) {
-    __m128i ss1[4], f1[4];
+    __m128i ss1[4], f1[4], r1[4];
 
+    s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
     s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
     s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
 
@@ -1334,11 +1247,14 @@
     f1[2] = _mm256_castsi256_si128(f[2]);
     f1[3] = _mm256_castsi256_si128(f[3]);
 
+    r1[0] = _mm_unpacklo_epi32(s[4], s[5]);
+    r1[1] = _mm_unpacklo_epi32(s[5], s[6]);
+
     // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60
-    r1[6] = _mm_unpacklo_epi32(s[6], s[7]);
+    r1[2] = _mm_unpacklo_epi32(s[6], s[7]);
 
     // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70
-    r1[7] = _mm_unpacklo_epi32(s[7], s[8]);
+    r1[3] = _mm_unpacklo_epi32(s[7], s[8]);
 
     // r23 r13....r20 r10|r13 r03....r10 r00
     ss1[0] = _mm256_castsi256_si128(ss[0]);
@@ -1347,10 +1263,10 @@
     ss1[1] = _mm256_castsi256_si128(ss[1]);
 
     // r63 r53....r60 r50|r53 r43....r50 r40
-    ss1[2] = _mm_unpacklo_epi8(r1[4], r1[5]);
+    ss1[2] = _mm_unpacklo_epi8(r1[0], r1[1]);
 
     // r83 r73....r80 r70|r73 r63....r70 r60
-    ss1[3] = _mm_unpacklo_epi8(r1[6], r1[7]);
+    ss1[3] = _mm_unpacklo_epi8(r1[2], r1[3]);
 
     ss1[0] = convolve8_8_ssse3(ss1, f1);