Merge "VPX: Improve HBD vpx_hadamard_32x32_sse2()"
diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl
index 91609da..0bf4816 100755
--- a/build/make/ads2gas.pl
+++ b/build/make/ads2gas.pl
@@ -33,8 +33,8 @@
 print "@ This file was created from a .asm file\n";
 print "@  using the ads2gas.pl script.\n";
 print "\t.equ DO1STROUNDING, 0\n";
+print "\t.syntax unified\n";
 if ($thumb) {
-    print "\t.syntax unified\n";
     print "\t.thumb\n";
 }
 
diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl
index e1ae7b4..806fdd8 100755
--- a/build/make/ads2gas_apple.pl
+++ b/build/make/ads2gas_apple.pl
@@ -23,6 +23,7 @@
 print "\t.set WIDE_REFERENCE, 0\n";
 print "\t.set ARCHITECTURE, 5\n";
 print "\t.set DO1STROUNDING, 0\n";
+print "\t.syntax unified\n";
 
 my %register_aliases;
 my %macro_aliases;
diff --git a/test/decode_api_test.cc b/test/decode_api_test.cc
index 4167cf3..d4b67cc 100644
--- a/test/decode_api_test.cc
+++ b/test/decode_api_test.cc
@@ -138,8 +138,30 @@
   EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec));
 }
 
-TEST(DecodeAPI, Vp9PeekSI) {
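+// Calls vpx_codec_decode() and vpx_codec_peek_stream_info() with data_sz
+// bytes. peek_size is the minimum number of bytes needed to parse the
+// header; shorter inputs are expected to return VPX_CODEC_UNSUP_BITSTREAM.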
+void TestPeekInfo(const uint8_t *const data, uint32_t data_sz,
+                  uint32_t peek_size) {
   const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo;
+  // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get
+  // to decoder_peek_si_internal on frames of size < 8.
+  if (data_sz >= 8) {
+    vpx_codec_ctx_t dec;
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, NULL, 0));
+    EXPECT_EQ((data_sz < peek_size) ? VPX_CODEC_UNSUP_BITSTREAM
+                                    : VPX_CODEC_CORRUPT_FRAME,
+              vpx_codec_decode(&dec, data, data_sz, NULL, 0));
+    vpx_codec_iter_t iter = NULL;
+    EXPECT_EQ(NULL, vpx_codec_get_frame(&dec, &iter));
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec));
+  }
+
+  // Verify behavior of vpx_codec_peek_stream_info.
+  vpx_codec_stream_info_t si;
+  si.sz = sizeof(si);
+  EXPECT_EQ((data_sz < peek_size) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_OK,
+            vpx_codec_peek_stream_info(codec, data, data_sz, &si));
+}
+
+TEST(DecodeAPI, Vp9PeekStreamInfo) {
   // The first 9 bytes are valid and the rest of the bytes are made up. Until
   // size 10, this should return VPX_CODEC_UNSUP_BITSTREAM and after that it
   // should return VPX_CODEC_CORRUPT_FRAME.
@@ -150,24 +172,18 @@
   };
 
   for (uint32_t data_sz = 1; data_sz <= 32; ++data_sz) {
-    // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get
-    // to decoder_peek_si_internal on frames of size < 8.
-    if (data_sz >= 8) {
-      vpx_codec_ctx_t dec;
-      EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, NULL, 0));
-      EXPECT_EQ(
-          (data_sz < 10) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_CORRUPT_FRAME,
-          vpx_codec_decode(&dec, data, data_sz, NULL, 0));
-      vpx_codec_iter_t iter = NULL;
-      EXPECT_EQ(NULL, vpx_codec_get_frame(&dec, &iter));
-      EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec));
-    }
+    TestPeekInfo(data, data_sz, 10);
+  }
+}
 
-    // Verify behavior of vpx_codec_peek_stream_info.
-    vpx_codec_stream_info_t si;
-    si.sz = sizeof(si);
-    EXPECT_EQ((data_sz < 10) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_OK,
-              vpx_codec_peek_stream_info(codec, data, data_sz, &si));
+TEST(DecodeAPI, Vp9PeekStreamInfoTruncated) {
+  // This profile 1 header requires 10.25 bytes; ensure
+  // vpx_codec_peek_stream_info() doesn't read past the end of the buffer.
+  const uint8_t profile1_data[10] = { 0xa4, 0xe9, 0x30, 0x68, 0x53,
+                                      0xe9, 0x30, 0x68, 0x53, 0x04 };
+
+  for (uint32_t data_sz = 1; data_sz <= 10; ++data_sz) {
+    TestPeekInfo(profile1_data, data_sz, 11);
   }
 }
 #endif  // CONFIG_VP9_DECODER
diff --git a/test/decode_corrupted.cc b/test/decode_corrupted.cc
new file mode 100644
index 0000000..b44c378
--- /dev/null
+++ b/test/decode_corrupted.cc
@@ -0,0 +1,101 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/i420_video_source.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+class DecodeCorruptedFrameTest
+    : public ::libvpx_test::EncoderTest,
+      public ::testing::TestWithParam<
+          ::testing::tuple<const libvpx_test::CodecFactory *> > {
+ public:
+  DecodeCorruptedFrameTest() : EncoderTest(GET_PARAM(0)) {}
+
+ protected:
+  virtual ~DecodeCorruptedFrameTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+
+    // Set small key frame distance such that we insert more key frames.
+    cfg_.kf_max_dist = 3;
+    dec_cfg_.threads = 1;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, 7);
+  }
+
+  virtual void MismatchHook(const vpx_image_t * /*img1*/,
+                            const vpx_image_t * /*img2*/) {}
+
+  virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
+      const vpx_codec_cx_pkt_t *pkt) {
+    // Check the packet kind before touching the frame union member.
+    if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt;
+    // Don't edit the frame packet on a key frame.
+    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) return pkt;
+
+    memcpy(&modified_pkt_, pkt, sizeof(*pkt));
+
+    // Halve the size so the frame appears corrupted to the decoder.
+    modified_pkt_.data.frame.sz = modified_pkt_.data.frame.sz / 2;
+
+    return &modified_pkt_;
+  }
+
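+  // A corrupted frame may fail to decode, but it must never produce
+  // VPX_CODEC_MEM_ERROR; that is the only result that fails the test.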
+  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                                  const libvpx_test::VideoSource & /*video*/,
+                                  libvpx_test::Decoder *decoder) {
+    EXPECT_NE(res_dec, VPX_CODEC_MEM_ERROR) << decoder->DecodeError();
+    return VPX_CODEC_MEM_ERROR != res_dec;
+  }
+
+  vpx_codec_cx_pkt_t modified_pkt_;
+};
+
+TEST_P(DecodeCorruptedFrameTest, DecodeCorruptedFrame) {
+  cfg_.rc_target_bitrate = 200;
+  cfg_.g_error_resilient = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+#if CONFIG_VP9
+INSTANTIATE_TEST_CASE_P(
+    VP9, DecodeCorruptedFrameTest,
+    ::testing::Values(
+        static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)));
+#endif  // CONFIG_VP9
+
+#if CONFIG_VP8
+INSTANTIATE_TEST_CASE_P(
+    VP8, DecodeCorruptedFrameTest,
+    ::testing::Values(
+        static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP8)));
+#endif  // CONFIG_VP8
+
+}  // namespace
diff --git a/test/stress.sh b/test/stress.sh
index ced9d3f..fdec764 100755
--- a/test/stress.sh
+++ b/test/stress.sh
@@ -144,6 +144,19 @@
   fi
 }
 
+vp8_stress_test_token_partitions() {
+  local vp8_max_jobs=${STRESS_VP8_DECODE_MAX_JOBS:-40}
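+  # --token-parts=n encodes the stream with 2^n token partitions; decode
+  # each stream with 2, 4 and 8 threads.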
+  if [ "$(vp8_decode_available)" = "yes" -a \
+       "$(vp8_encode_available)" = "yes" ]; then
+    for threads in 2 4 8; do
+      for token_partitions in 1 2 3; do
+        stress vp8 "${VP8}" "${vp8_max_jobs}" ${threads} \
+          "--token-parts=$token_partitions"
+      done
+    done
+  fi
+}
+
 vp9_stress() {
   local vp9_max_jobs=${STRESS_VP9_DECODE_MAX_JOBS:-25}
 
@@ -154,16 +167,17 @@
 }
 
 vp9_stress_test() {
-  for threads in 4 8 100; do
+  for threads in 4 8 64; do
     vp9_stress "$threads" "--row-mt=0"
   done
 }
 
 vp9_stress_test_row_mt() {
-  for threads in 4 8 100; do
+  for threads in 4 8 64; do
     vp9_stress "$threads" "--row-mt=1"
   done
 }
 
 run_tests stress_verify_environment \
-  "vp8_stress_test vp9_stress_test vp9_stress_test_row_mt"
+  "vp8_stress_test vp8_stress_test_token_parititions
+   vp9_stress_test vp9_stress_test_row_mt"
diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc
index 7d6bacc..c72df50 100644
--- a/test/svc_datarate_test.cc
+++ b/test/svc_datarate_test.cc
@@ -207,6 +207,8 @@
     }
 
     if (dynamic_drop_layer_) {
+      // TODO(jian): Disable AQ Mode for this test for now.
+      encoder->Control(VP9E_SET_AQ_MODE, 0);
       if (video->frame() == 0) {
         // Change layer bitrates to set top layers to 0. This will trigger skip
         // encoding/dropping of top two spatial layers.
@@ -838,7 +840,7 @@
   layer_framedrop_ = GET_PARAM(2);
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.75,
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.71,
                           1.45);
 #if CONFIG_VP9_DECODER
   // The non-reference frames are expected to be mismatched frames as the
@@ -885,7 +887,7 @@
   layer_framedrop_ = GET_PARAM(2);
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.73,
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58,
                           1.2);
 #if CONFIG_VP9_DECODER
   // The non-reference frames are expected to be mismatched frames as the
@@ -1137,7 +1139,8 @@
   ResetModel();
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+  // TODO(jianj): webm:1554
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.76,
                           1.15);
 #if CONFIG_VP9_DECODER
   // The non-reference frames are expected to be mismatched frames as the
diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc
index c90c875..3ee7fb3 100644
--- a/test/svc_end_to_end_test.cc
+++ b/test/svc_end_to_end_test.cc
@@ -46,7 +46,10 @@
     SetConfig(num_temporal_layer);
     cfg_.ss_number_layers = num_spatial_layer;
     cfg_.ts_number_layers = num_temporal_layer;
-    if (num_spatial_layer == 2) {
+    if (num_spatial_layer == 1) {
+      svc_params_.scaling_factor_num[0] = 288;
+      svc_params_.scaling_factor_den[0] = 288;
+    } else if (num_spatial_layer == 2) {
       svc_params_.scaling_factor_num[0] = 144;
       svc_params_.scaling_factor_den[0] = 288;
       svc_params_.scaling_factor_num[1] = 288;
diff --git a/test/svc_test.cc b/test/svc_test.cc
index 6d08dfa..f7e6db4 100644
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -26,6 +26,7 @@
     encoder->Control(VP9E_SET_SVC, 1);
     encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
     encoder->Control(VP8E_SET_CPUUSED, speed_setting_);
+    encoder->Control(VP9E_SET_AQ_MODE, 3);
     encoder->Control(VP9E_SET_TILE_COLUMNS, 0);
     encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300);
     encoder->Control(VP9E_SET_TILE_COLUMNS, get_msb(cfg_.g_threads));
diff --git a/test/test.mk b/test/test.mk
index 2efdf76..0166a80 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -53,6 +53,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_end_to_end_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += decode_corrupted.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ethread_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_motion_vector_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += level_test.cc
diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc
index f2b832c..9a11d38 100644
--- a/test/vp9_datarate_test.cc
+++ b/test/vp9_datarate_test.cc
@@ -34,6 +34,7 @@
     tot_frame_number_ = 0;
     first_drop_ = 0;
     num_drops_ = 0;
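+    // Default to cyclic refresh AQ (aq-mode 3); tests may override this.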
+    aq_mode_ = 3;
     // Denoiser is off by default.
     denoiser_on_ = 0;
     // For testing up to 3 layers.
@@ -110,7 +111,10 @@
 
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
+    }
 
     if (denoiser_offon_test_) {
       ASSERT_GT(denoiser_offon_period_, 0)
@@ -128,6 +132,7 @@
 
     if (use_roi_) {
       encoder->Control(VP9E_SET_ROI_MAP, &roi_);
+      encoder->Control(VP9E_SET_AQ_MODE, 0);
     }
 
     if (cfg_.ts_number_layers > 1) {
@@ -206,6 +211,7 @@
   int64_t bits_in_buffer_model_;
   vpx_codec_pts_t first_drop_;
   int num_drops_;
+  int aq_mode_;
   int denoiser_on_;
   int denoiser_offon_test_;
   int denoiser_offon_period_;
@@ -516,8 +522,6 @@
 
   cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
 
-  if (deadline_ == VPX_DL_REALTIME) cfg_.g_error_resilient = 1;
-
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 200);
   const int bitrates[4] = { 200, 400, 600, 800 };
@@ -527,6 +531,11 @@
   // 60-40 bitrate allocation for 2 temporal layers.
   cfg_.layer_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
   cfg_.layer_target_bitrate[1] = cfg_.rc_target_bitrate;
+  aq_mode_ = 0;
+  if (deadline_ == VPX_DL_REALTIME) {
+    aq_mode_ = 3;
+    cfg_.g_error_resilient = 1;
+  }
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
     ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85)
@@ -570,6 +579,11 @@
   cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
   cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
   cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+  aq_mode_ = 0;
+  if (deadline_ == VPX_DL_REALTIME) {
+    aq_mode_ = 3;
+    cfg_.g_error_resilient = 1;
+  }
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
     // TODO(yaowu): Work out more stable rc control strategy and
@@ -619,6 +633,11 @@
   cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
   cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
   cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+  aq_mode_ = 0;
+  if (deadline_ == VPX_DL_REALTIME) {
+    aq_mode_ = 3;
+    cfg_.g_error_resilient = 1;
+  }
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
     ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85)
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index aadc8dc..db17f8d 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -400,16 +400,25 @@
       xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
       xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
 
-      xd->pre.y_buffer =
-          ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset;
-      xd->pre.u_buffer =
-          ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset;
-      xd->pre.v_buffer =
-          ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset;
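+      /* Only set up prediction pointers from an intact reference frame; a
+       * corrupted reference aborts decoding below. */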
+      if (!ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame]) {
+        xd->pre.y_buffer =
+            ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] +
+            recon_yoffset;
+        xd->pre.u_buffer =
+            ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] +
+            recon_uvoffset;
+        xd->pre.v_buffer =
+            ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] +
+            recon_uvoffset;
+      }
 
       /* propagate errors from reference frames */
       xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
 
+      if (xd->corrupted)
+        vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                           "Corrupted reference frame buffer");
+
       mt_decode_macroblock(pbi, xd, 0);
 
       xd->left_available = 1;
diff --git a/vp9/common/vp9_common_data.c b/vp9/common/vp9_common_data.c
index 4a10833..809d731 100644
--- a/vp9/common/vp9_common_data.c
+++ b/vp9/common/vp9_common_data.c
@@ -28,7 +28,7 @@
 const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 2, 1, 2,
                                                           4, 2, 4, 8, 4, 8 };
 
-// VPXMIN(3, VPXMIN(b_width_log2(bsize), b_height_log2(bsize)))
+// VPXMIN(3, VPXMIN(b_width_log2_lookup[bsize], b_height_log2_lookup[bsize]))
 const uint8_t size_group_lookup[BLOCK_SIZES] = { 0, 0, 0, 1, 1, 1, 2,
                                                  2, 2, 3, 3, 3, 3 };
 
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 42ff493..2d6dbe8 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -1532,9 +1532,9 @@
 
 // sorts in descending order
 static int compare_tile_buffers(const void *a, const void *b) {
-  const TileBuffer *const buf1 = (const TileBuffer *)a;
-  const TileBuffer *const buf2 = (const TileBuffer *)b;
-  return (int)((int64_t)buf2->size - buf1->size);
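+  // Return -1/0/1 rather than casting the size difference, which could
+  // overflow int for large tiles.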
+  const TileBuffer *const buf_a = (const TileBuffer *)a;
+  const TileBuffer *const buf_b = (const TileBuffer *)b;
+  return (buf_a->size < buf_b->size) - (buf_a->size > buf_b->size);
 }
 
 static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data,
@@ -1730,6 +1730,21 @@
   }
 }
 
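+// A keyframe that forces a resync invalidates all previously decoded frames,
+// so drop every reference and return the buffers to the pool.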
+static INLINE void flush_all_fb_on_key(VP9_COMMON *cm) {
+  if (cm->frame_type == KEY_FRAME && cm->current_video_frame > 0) {
+    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+    BufferPool *const pool = cm->buffer_pool;
+    int i;
+    for (i = 0; i < FRAME_BUFFERS; ++i) {
+      frame_bufs[i].ref_count = 0;
+      if (!frame_bufs[i].released) {
+        pool->release_fb_cb(pool->cb_priv, &frame_bufs[i].raw_frame_buffer);
+        frame_bufs[i].released = 1;
+      }
+    }
+  }
+}
+
 static size_t read_uncompressed_header(VP9Decoder *pbi,
                                        struct vpx_read_bit_buffer *rb) {
   VP9_COMMON *const cm = &pbi->common;
@@ -1794,6 +1809,7 @@
     setup_frame_size(cm, rb);
     if (pbi->need_resync) {
       memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+      flush_all_fb_on_key(cm);
       pbi->need_resync = 0;
     }
   } else {
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index f291a0d..aecc565 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -507,13 +507,12 @@
                    num8x8bl;
   if (weight_segment_target < 7 * weight_segment / 8)
     weight_segment = weight_segment_target;
-  // For screen-content: don't include target for the weight segment, since
-  // all for all flat areas the segment is reset, so its more accurate to
-  // just use the previous actual number of seg blocks for the weight.
+  // For screen-content: don't include target for the weight segment,
+  // since for all flat areas the segment is reset, so it's more accurate
+  // to just use the previous actual number of seg blocks for the weight.
   if (cpi->oxcf.content == VP9E_CONTENT_SCREEN)
     weight_segment =
-        (double)((cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) >>
-                 1) /
+        (double)(cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) /
         num8x8bl;
   cr->weight_segment = weight_segment;
 }
@@ -524,15 +523,19 @@
   const RATE_CONTROL *const rc = &cpi->rc;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   struct segmentation *const seg = &cm->seg;
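+  // A detected scene change is handled like a keyframe below: segmentation
+  // is disabled and the last-coded Q map is reset.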
+  int scene_change_detected =
+      cpi->rc.high_source_sad ||
+      (cpi->use_svc && cpi->svc.high_source_sad_superframe);
   if (cm->current_video_frame == 0) cr->low_content_avg = 0.0;
  // Reset if resolution change has occurred.
   if (cpi->resize_pending != 0) vp9_cyclic_refresh_reset_resize(cpi);
-  if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation)) {
+  if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation) ||
+      scene_change_detected) {
     // Set segmentation map to 0 and disable.
     unsigned char *const seg_map = cpi->segmentation_map;
     memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
     vp9_disable_segmentation(&cm->seg);
-    if (cm->frame_type == KEY_FRAME) {
+    if (cm->frame_type == KEY_FRAME || scene_change_detected) {
       memset(cr->last_coded_q_map, MAXQ,
              cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
       cr->sb_index = 0;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index f3a4ae7..a172af3 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -3978,6 +3978,7 @@
   // the starting point of motion search in the following partition type check.
   if (do_split || must_split) {
     subsize = get_subsize(bsize, PARTITION_SPLIT);
+    if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
     if (bsize == BLOCK_8X8) {
       i = 4;
       if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
@@ -4005,9 +4006,9 @@
         if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
           continue;
 
-        if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
-
         pc_tree->split[i]->index = i;
+        if (cpi->sf.prune_ref_frame_for_rect_partitions)
+          pc_tree->split[i]->none.rate = INT_MAX;
         rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
                           mi_col + x_idx, subsize, &this_rdc,
                           // A must split test here increases the number of sub
@@ -4086,21 +4087,27 @@
   // PARTITION_HORZ
   if (partition_horz_allowed &&
       (do_rect || vp9_active_h_edge(cpi, mi_row, mi_step))) {
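+    // Charge the partition signaling cost up front so the mode search can be
+    // pruned against the tighter rd budget.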
+    const int part_mode_rate = cpi->partition_cost[pl][PARTITION_HORZ];
+    const int64_t part_mode_rdcost =
+        RDCOST(partition_mul, x->rddiv, part_mode_rate, 0);
     subsize = get_subsize(bsize, PARTITION_HORZ);
     if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
         partition_none_allowed)
       pc_tree->horizontal[0].pred_interp_filter = pred_interp_filter;
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
-                     &pc_tree->horizontal[0], best_rdc.rdcost);
+                     &pc_tree->horizontal[0],
+                     best_rdc.rdcost - part_mode_rdcost);
+    if (sum_rdc.rdcost < INT64_MAX) {
+      sum_rdc.rdcost += part_mode_rdcost;
+      sum_rdc.rate += part_mode_rate;
+    }
 
     if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows &&
         bsize > BLOCK_8X8) {
       PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
       update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
       encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
-
-      if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
       if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
           partition_none_allowed)
         pc_tree->horizontal[1].pred_interp_filter = pred_interp_filter;
@@ -4117,17 +4124,12 @@
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rdcost += RDCOST(partition_mul, x->rddiv,
-                               cpi->partition_cost[pl][PARTITION_HORZ], 0);
-      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
-      if (sum_rdc.rdcost < best_rdc.rdcost) {
-        best_rdc = sum_rdc;
-        pc_tree->partitioning = PARTITION_HORZ;
+      best_rdc = sum_rdc;
+      pc_tree->partitioning = PARTITION_HORZ;
 
-        if ((cpi->sf.less_rectangular_check) &&
-            (bsize > cpi->sf.use_square_only_threshold))
-          do_rect = 0;
-      }
+      if ((cpi->sf.less_rectangular_check) &&
+          (bsize > cpi->sf.use_square_only_threshold))
+        do_rect = 0;
     }
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
   }
@@ -4135,21 +4137,26 @@
   // PARTITION_VERT
   if (partition_vert_allowed &&
       (do_rect || vp9_active_v_edge(cpi, mi_col, mi_step))) {
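+    // Same up-front accounting of the partition signaling cost as for
+    // PARTITION_HORZ above.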
+    const int part_mode_rate = cpi->partition_cost[pl][PARTITION_VERT];
+    const int64_t part_mode_rdcost =
+        RDCOST(partition_mul, x->rddiv, part_mode_rate, 0);
     subsize = get_subsize(bsize, PARTITION_VERT);
-
     if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
         partition_none_allowed)
       pc_tree->vertical[0].pred_interp_filter = pred_interp_filter;
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
-                     &pc_tree->vertical[0], best_rdc.rdcost);
+                     &pc_tree->vertical[0], best_rdc.rdcost - part_mode_rdcost);
+    if (sum_rdc.rdcost < INT64_MAX) {
+      sum_rdc.rdcost += part_mode_rdcost;
+      sum_rdc.rate += part_mode_rate;
+    }
+
     if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols &&
         bsize > BLOCK_8X8) {
       update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
       encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize,
                         &pc_tree->vertical[0]);
-
-      if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
       if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
           partition_none_allowed)
         pc_tree->vertical[1].pred_interp_filter = pred_interp_filter;
@@ -4166,13 +4173,8 @@
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rdcost += RDCOST(partition_mul, x->rddiv,
-                               cpi->partition_cost[pl][PARTITION_VERT], 0);
-      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
-      if (sum_rdc.rdcost < best_rdc.rdcost) {
-        best_rdc = sum_rdc;
-        pc_tree->partitioning = PARTITION_VERT;
-      }
+      best_rdc = sum_rdc;
+      pc_tree->partitioning = PARTITION_VERT;
     }
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
   }
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index de5519d..3db11fc 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -3985,6 +3985,7 @@
   // (need to check encoding time cost for doing this for speed 8).
   cpi->rc.high_source_sad = 0;
   cpi->rc.hybrid_intra_scene_change = 0;
+  cpi->rc.re_encode_maxq_scene_change = 0;
   if (cm->show_frame && cpi->oxcf.mode == REALTIME &&
       (cpi->oxcf.rc_mode == VPX_VBR ||
        cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
@@ -4051,6 +4052,19 @@
     vp9_svc_assert_constraints_pattern(cpi);
   }
 
+  if (!cpi->sf.re_encode_overshoot_rt &&
+      cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+      (cpi->rc.high_source_sad ||
+       (cpi->use_svc && cpi->svc.high_source_sad_superframe))) {
+    // Check if this high_source_sad (scene/slide change) frame should be
+    // encoded at high/max QP, and if so, set the q and adjust some rate
+    // control parameters.
+    if (vp9_encodedframe_overshoot(cpi, -1, &q)) {
+      vp9_set_quantizer(cm, q);
+      vp9_set_variance_partition_thresholds(cpi, q, 0);
+    }
+  }
+
   // Variance adaptive and in frame q adjustment experiments are mutually
   // exclusive.
   if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
@@ -4470,12 +4484,6 @@
     vp9_encode_frame(cpi);
     vpx_clear_system_state();
     restore_coding_context(cpi);
-    vp9_pack_bitstream(cpi, dest, size);
-
-    vp9_encode_frame(cpi);
-    vpx_clear_system_state();
-
-    restore_coding_context(cpi);
   }
 }
 
@@ -5501,13 +5509,41 @@
 
 void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture,
                      const GF_GROUP *gf_group, int *tpl_group_frames) {
-  int frame_idx, i;
+  VP9_COMMON *cm = &cpi->common;
+  int frame_idx = 0;
+  int i;
   int gld_index = -1;
   int alt_index = -1;
   int lst_index = -1;
   int extend_frame_count = 0;
   int pframe_qindex = cpi->tpl_stats[2].base_qindex;
 
+  RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs;
+  int recon_frame_index[REFS_PER_FRAME + 1] = { -1, -1, -1, -1 };
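+  // Reserve REFS_PER_FRAME + 1 free frame buffers to hold the tpl model's
+  // reconstructed frames.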
+
+  for (i = 0; i < FRAME_BUFFERS && frame_idx < REFS_PER_FRAME + 1; ++i) {
+    if (frame_bufs[i].ref_count == 0) {
+      alloc_frame_mvs(cm, i);
+      if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height,
+                                   cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                   cm->use_highbitdepth,
+#endif
+                                   VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                                   NULL, NULL, NULL))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate frame buffer");
+
+      recon_frame_index[frame_idx] = i;
+      ++frame_idx;
+    }
+  }
+
+  for (i = 0; i < REFS_PER_FRAME + 1; ++i) {
+    assert(recon_frame_index[i] >= 0);
+    cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf;
+  }
+
   *tpl_group_frames = 0;
 
   // Initialize Golden reference frame.
@@ -5583,7 +5619,7 @@
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
-  const SEARCH_METHODS search_method = HEX;
+  const SEARCH_METHODS search_method = NSTEP;
   int step_param;
   int sadpb = x->sadperbit16;
   uint32_t bestsme = UINT_MAX;
@@ -5628,7 +5664,6 @@
 
 int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
                      int ref_pos_col, int block, BLOCK_SIZE bsize) {
-  int overlap_area;
   int width = 0, height = 0;
   int bw = 4 << b_width_log2_lookup[bsize];
   int bh = 4 << b_height_log2_lookup[bsize];
@@ -5653,7 +5688,7 @@
     default: assert(0);
   }
 
-  return overlap_area = width * height;
+  return width * height;
 }
 
 int round_floor(int ref_pos, int bsize_pix) {
@@ -5667,27 +5702,29 @@
 }
 
 void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col,
-                     BLOCK_SIZE bsize, int stride, int64_t intra_cost,
-                     int64_t inter_cost, int ref_frame_idx, int_mv mv) {
+                     BLOCK_SIZE bsize, int stride,
+                     const TplDepStats *src_stats) {
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   int idx, idy;
 
-  intra_cost = intra_cost / (mi_height * mi_width);
-  inter_cost = inter_cost / (mi_height * mi_width);
+  int64_t intra_cost = src_stats->intra_cost / (mi_height * mi_width);
+  int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width);
+
+  TplDepStats *tpl_ptr;
 
   intra_cost = VPXMAX(1, intra_cost);
   inter_cost = VPXMAX(1, inter_cost);
 
   for (idy = 0; idy < mi_height; ++idy) {
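+    // Compute the row base pointer once and step it across the columns
+    // instead of recomputing the full index for every block.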
+    tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col];
     for (idx = 0; idx < mi_width; ++idx) {
-      TplDepStats *tpl_ptr =
-          &tpl_stats[(mi_row + idy) * stride + (mi_col + idx)];
       tpl_ptr->intra_cost = intra_cost;
       tpl_ptr->inter_cost = inter_cost;
       tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
-      tpl_ptr->ref_frame_index = ref_frame_idx;
-      tpl_ptr->mv.as_int = mv.as_int;
+      tpl_ptr->ref_frame_index = src_stats->ref_frame_index;
+      tpl_ptr->mv.as_int = src_stats->mv.as_int;
+      ++tpl_ptr;
     }
   }
 }
@@ -5796,6 +5833,136 @@
   }
 }
 
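+// Estimates the best intra cost and best motion compensated (inter) cost for
+// one block and records the winning modes and costs in |tpl_stats|.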
+void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+                     struct scale_factors *sf, GF_PICTURE *gf_picture,
+                     int frame_idx, int16_t *src_diff, tran_low_t *coeff,
+                     tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row,
+                     int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                     YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor,
+                     int64_t *recon_error, int64_t *sse,
+                     TplDepStats *tpl_stats) {
+  VP9_COMMON *cm = &cpi->common;
+  ThreadData *td = &cpi->td;
+
+  const int bw = 4 << b_width_log2_lookup[bsize];
+  const int bh = 4 << b_height_log2_lookup[bsize];
+  const int pix_num = bw * bh;
+  int best_rf_idx = -1;
+  int_mv best_mv;
+  int64_t best_inter_cost = INT64_MAX;
+  int64_t inter_cost;
+  int rf_idx;
+  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
+
+  int64_t best_intra_cost = INT64_MAX;
+  int64_t intra_cost;
+  PREDICTION_MODE mode;
+  int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+  MODE_INFO mi_above, mi_left;
+
+  memset(tpl_stats, 0, sizeof(*tpl_stats));
+
+  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+  xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
+  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+  xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8;
+  xd->above_mi = (mi_row > 0) ? &mi_above : NULL;
+  xd->left_mi = (mi_col > 0) ? &mi_left : NULL;
+
+  // Intra prediction search
+  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+    uint8_t *src, *dst;
+    int src_stride, dst_stride;
+
+    src = xd->cur_buf->y_buffer + mb_y_offset;
+    src_stride = xd->cur_buf->y_stride;
+
+    dst = &predictor[0];
+    dst_stride = bw;
+
+    xd->mi[0]->sb_type = bsize;
+    xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+
+    vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src,
+                            src_stride, dst, dst_stride, 0, 0, 0);
+
+    vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride);
+
+    wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+
+    intra_cost = vpx_satd(coeff, pix_num);
+
+    if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
+  }
+
+  // Motion compensated prediction
+  best_mv.as_int = 0;
+
+  (void)mb_y_offset;
+  // Motion estimation column boundary
+  x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
+  x->mv_limits.col_max =
+      ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND);
+
+  for (rf_idx = 0; rf_idx < 3; ++rf_idx) {
+    int_mv mv;
+    if (ref_frame[rf_idx] == NULL) continue;
+
+    motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset,
+                                  ref_frame[rf_idx]->y_buffer + mb_y_offset,
+                                  xd->cur_buf->y_stride, &mv.as_mv, bsize);
+
+    // TODO(jingning): Not yet support high bit-depth in the next three
+    // steps.
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_highbd_build_inter_predictor(
+          CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset),
+          ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw,
+          &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE,
+          mi_row * MI_SIZE, xd->bd);
+      vpx_highbd_subtract_block(
+          bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
+          xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
+    } else {
+      vp9_build_inter_predictor(
+          ref_frame[rf_idx]->y_buffer + mb_y_offset,
+          ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh,
+          0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE);
+      vpx_subtract_block(bh, bw, src_diff, bw,
+                         xd->cur_buf->y_buffer + mb_y_offset,
+                         xd->cur_buf->y_stride, &predictor[0], bw);
+    }
+#else
+    vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset,
+                              ref_frame[rf_idx]->y_stride, &predictor[0], bw,
+                              &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3,
+                              mi_col * MI_SIZE, mi_row * MI_SIZE);
+    vpx_subtract_block(bh, bw, src_diff, bw,
+                       xd->cur_buf->y_buffer + mb_y_offset,
+                       xd->cur_buf->y_stride, &predictor[0], bw);
+#endif
+    wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+
+    inter_cost = vpx_satd(coeff, pix_num);
+
+    if (inter_cost < best_inter_cost) {
+      best_rf_idx = rf_idx;
+      best_inter_cost = inter_cost;
+      best_mv.as_int = mv.as_int;
+      get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error,
+                         sse);
+    }
+  }
+  best_intra_cost = VPXMAX(best_intra_cost, 1);
+  best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost);
+  tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2;
+  tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2;
+  tpl_stats->mc_dep_cost = tpl_stats->intra_cost + tpl_stats->mc_flow;
+  tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
+  tpl_stats->mv.as_int = best_mv.as_int;
+}
+
 void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx) {
   TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
   YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame;
@@ -5808,7 +5975,6 @@
   MACROBLOCK *x = &td->mb;
   MACROBLOCKD *xd = &x->e_mbd;
   int mi_row, mi_col;
-  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
 
 #if CONFIG_VP9_HIGHBITDEPTH
   DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]);
@@ -5822,15 +5988,10 @@
   DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]);
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
 
-  MODE_INFO mi_above, mi_left;
-
   const BLOCK_SIZE bsize = BLOCK_32X32;
   const TX_SIZE tx_size = max_txsize_lookup[bsize];
-  const int bw = 4 << b_width_log2_lookup[bsize];
-  const int bh = 4 << b_height_log2_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  const int pix_num = bw * bh;
   int64_t recon_error, sse;
 
   // Setup scaling factor
@@ -5859,6 +6020,7 @@
 
   xd->mi = cm->mi_grid_visible;
   xd->mi[0] = cm->mi;
+  xd->cur_buf = this_frame;
 
   // Get rd multiplier set up.
   rdmult =
@@ -5878,126 +6040,14 @@
     x->mv_limits.row_max =
         (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND);
     for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
-      int mb_y_offset =
-          mi_row * MI_SIZE * this_frame->y_stride + mi_col * MI_SIZE;
-      int best_rf_idx = -1;
-      int_mv best_mv;
-      int64_t best_inter_cost = INT64_MAX;
-      int64_t inter_cost;
-      int rf_idx;
-
-      int64_t best_intra_cost = INT64_MAX;
-      int64_t intra_cost;
-      PREDICTION_MODE mode;
-
-      // Intra prediction search
-      for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
-        uint8_t *src, *dst;
-        int src_stride, dst_stride;
-
-        xd->cur_buf = this_frame;
-
-        src = this_frame->y_buffer + mb_y_offset;
-        src_stride = this_frame->y_stride;
-
-        dst = &predictor[0];
-        dst_stride = bw;
-
-        xd->mi[0]->sb_type = bsize;
-        xd->mi[0]->ref_frame[0] = INTRA_FRAME;
-        xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
-        xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
-        xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
-        xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8;
-        xd->above_mi = (mi_row > 0) ? &mi_above : NULL;
-        xd->left_mi = (mi_col > 0) ? &mi_left : NULL;
-
-        vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode,
-                                src, src_stride, dst, dst_stride, 0, 0, 0);
-
-        vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
-                           dst_stride);
-
-        wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-
-        intra_cost = vpx_satd(coeff, pix_num);
-
-        if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
-      }
-
-      // Motion compensated prediction
-      best_mv.as_int = 0;
-
-      (void)mb_y_offset;
-      // Motion estimation column boundary
-      x->mv_limits.col_min =
-          -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
-      x->mv_limits.col_max =
-          ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND);
-
-      for (rf_idx = 0; rf_idx < 3; ++rf_idx) {
-        int_mv mv;
-        if (ref_frame[rf_idx] == NULL) continue;
-
-        motion_compensated_prediction(cpi, td,
-                                      this_frame->y_buffer + mb_y_offset,
-                                      ref_frame[rf_idx]->y_buffer + mb_y_offset,
-                                      this_frame->y_stride, &mv.as_mv, bsize);
-
-        // TODO(jingning): Not yet support high bit-depth in the next three
-        // steps.
-#if CONFIG_VP9_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          vp9_highbd_build_inter_predictor(
-              CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset),
-              ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]),
-              bw, &mv.as_mv, &sf, bw, bh, 0, kernel, MV_PRECISION_Q3,
-              mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd);
-          vpx_highbd_subtract_block(
-              bh, bw, src_diff, bw, this_frame->y_buffer + mb_y_offset,
-              this_frame->y_stride, &predictor[0], bw, xd->bd);
-        } else {
-          vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset,
-                                    ref_frame[rf_idx]->y_stride, &predictor[0],
-                                    bw, &mv.as_mv, &sf, bw, bh, 0, kernel,
-                                    MV_PRECISION_Q3, mi_col * MI_SIZE,
-                                    mi_row * MI_SIZE);
-          vpx_subtract_block(bh, bw, src_diff, bw,
-                             this_frame->y_buffer + mb_y_offset,
-                             this_frame->y_stride, &predictor[0], bw);
-        }
-#else
-        vp9_build_inter_predictor(
-            ref_frame[rf_idx]->y_buffer + mb_y_offset,
-            ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, &sf, bw,
-            bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE);
-        vpx_subtract_block(bh, bw, src_diff, bw,
-                           this_frame->y_buffer + mb_y_offset,
-                           this_frame->y_stride, &predictor[0], bw);
-#endif
-        wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-
-        inter_cost = vpx_satd(coeff, pix_num);
-
-        if (inter_cost < best_inter_cost) {
-          best_rf_idx = rf_idx;
-          best_inter_cost = inter_cost;
-          best_mv.as_int = mv.as_int;
-          get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, TX_32X32,
-                             &recon_error, &sse);
-        }
-      }
+      TplDepStats tpl_stats;
+      mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, src_diff, coeff,
+                      qcoeff, dqcoeff, mi_row, mi_col, bsize, tx_size,
+                      ref_frame, predictor, &recon_error, &sse, &tpl_stats);
 
       // Motion flow dependency dispenser.
-      best_intra_cost = VPXMAX(best_intra_cost, 1);
-      best_inter_cost = VPXMIN(best_inter_cost, best_intra_cost);
-
-      best_intra_cost <<= TPL_DEP_COST_SCALE_LOG2;
-      best_inter_cost <<= TPL_DEP_COST_SCALE_LOG2;
-
       tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
-                      tpl_frame->stride, best_intra_cost, best_inter_cost,
-                      gf_picture[frame_idx].ref_frame[best_rf_idx], best_mv);
+                      tpl_frame->stride, &tpl_stats);
 
       tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col,
                        bsize);
@@ -6005,21 +6055,12 @@
   }
 }
 
-void setup_tpl_stats(VP9_COMP *cpi) {
+static void setup_tpl_stats(VP9_COMP *cpi) {
   GF_PICTURE gf_picture[MAX_LAG_BUFFERS];
   const GF_GROUP *gf_group = &cpi->twopass.gf_group;
   int tpl_group_frames = 0;
   int frame_idx;
 
-  // TODO(jingning): Make the model support high bit-depth route.
-#if CONFIG_VP9_HIGHBITDEPTH
-  (void)gf_picture;
-  (void)gf_group;
-  (void)tpl_group_frames;
-  (void)frame_idx;
-  return;
-#endif
-
   init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames);
 
   init_tpl_stats(cpi);
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 7cacc32..038413a 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -501,6 +501,7 @@
   YV12_BUFFER_CONFIG *raw_source_frame;
 
   TplDepFrame tpl_stats[MAX_LAG_BUFFERS];
+  YV12_BUFFER_CONFIG *tpl_recon_frames[REFS_PER_FRAME + 1];
 
   TileDataEnc *tile_data;
   int allocated_tiles;  // Keep track of memory allocated for tiles.
@@ -533,7 +534,6 @@
   YV12_BUFFER_CONFIG last_frame_uf;
 
   TOKENEXTRA *tile_tok[4][1 << 6];
-  uint32_t tok_count[4][1 << 6];
   TOKENLIST *tplist[4][1 << 6];
 
   // Ambient reconstruction err target for force key frames
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index ba72c0b..12c147e 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -2186,6 +2186,8 @@
   const SEARCH_METHODS method = (SEARCH_METHODS)search_method;
   vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
   int var = 0;
+  int run_exhaustive_search = 0;
+
   if (cost_list) {
     cost_list[0] = INT_MAX;
     cost_list[1] = INT_MAX;
@@ -2215,36 +2217,39 @@
       var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
                           fn_ptr, 1, ref_mv, tmp_mv);
       break;
-    default:
-      assert(method == NSTEP);
+    case NSTEP:
+    case MESH:
       var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
                                MAX_MVSEARCH_STEPS - 1 - step_param, 1,
                                cost_list, fn_ptr, ref_mv, tmp_mv);
-
-      // Should we allow a follow on exhaustive search?
-      if ((sf->exhaustive_searches_thresh < INT_MAX) &&
-          !cpi->rc.is_src_frame_alt_ref) {
-        int64_t exhuastive_thr = sf->exhaustive_searches_thresh;
-        exhuastive_thr >>=
-            8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
-
-        // Threshold variance for an exhaustive full search.
-        if (var > exhuastive_thr) {
-          int var_ex;
-          MV tmp_mv_ex;
-          var_ex = full_pixel_exhaustive(cpi, x, tmp_mv, error_per_bit,
-                                         cost_list, fn_ptr, ref_mv, &tmp_mv_ex);
-
-          if (var_ex < var) {
-            var = var_ex;
-            *tmp_mv = tmp_mv_ex;
-          }
-        }
-      }
       break;
+    default: assert(0 && "Unknown search method");
   }
 
-  if (method != NSTEP && rd && var < var_max)
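+  // NSTEP falls back to an exhaustive search only when the diamond search
+  // result is poor enough; MESH always follows up with one.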
+  if (method == NSTEP) {
+    if (sf->exhaustive_searches_thresh < INT_MAX &&
+        !cpi->rc.is_src_frame_alt_ref) {
+      const int64_t exhaustive_thr =
+          sf->exhaustive_searches_thresh >>
+          (8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]));
+      if (var > exhaustive_thr) run_exhaustive_search = 1;
+    }
+  } else if (method == MESH) {
+    run_exhaustive_search = 1;
+  }
+
+  if (run_exhaustive_search) {
+    int var_ex;
+    MV tmp_mv_ex;
+    var_ex = full_pixel_exhaustive(cpi, x, tmp_mv, error_per_bit, cost_list,
+                                   fn_ptr, ref_mv, &tmp_mv_ex);
+    if (var_ex < var) {
+      var = var_ex;
+      *tmp_mv = tmp_mv_ex;
+    }
+  }
+
+  if (method != NSTEP && method != MESH && rd && var < var_max)
     var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1);
 
   return var;
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index b4787fe..3d1f10a 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -107,6 +107,9 @@
 
 struct VP9_COMP;
 
+// "mvp_full" is the MV search starting point;
+// "ref_mv" is the context reference MV;
+// "tmp_mv" is the searched best MV.
 int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                           MV *mvp_full, int step_param, int search_method,
                           int error_per_bit, int *cost_list, const MV *ref_mv,
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 8248d12..04c7b3c 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -41,6 +41,17 @@
   int in_use;
 } PRED_BUFFER;
 
+typedef struct {
+  PRED_BUFFER *best_pred;
+  PREDICTION_MODE best_mode;
+  TX_SIZE best_tx_size;
+  TX_SIZE best_intra_tx_size;
+  MV_REFERENCE_FRAME best_ref_frame;
+  MV_REFERENCE_FRAME best_second_ref_frame;
+  uint8_t best_mode_skip_txfm;
+  INTERP_FILTER best_pred_filter;
+} BEST_PICKMODE;
+
 static const int pos_shift_16x16[4][4] = {
   { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 }
 };
@@ -334,6 +345,35 @@
   return 1;
 }
 
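+// Chooses the transform size for a block from its prediction variance and
+// sse; shared by model_rd_for_sb_y_large() and model_rd_for_sb_y().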
+static TX_SIZE calculate_tx_size(VP9_COMP *const cpi, BLOCK_SIZE bsize,
+                                 MACROBLOCKD *const xd, unsigned int var,
+                                 unsigned int sse, int64_t ac_thr) {
+  TX_SIZE tx_size;
+  if (cpi->common.tx_mode == TX_MODE_SELECT) {
+    if (sse > (var << 2))
+      tx_size = VPXMIN(max_txsize_lookup[bsize],
+                       tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    else
+      tx_size = TX_8X8;
+
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id))
+      tx_size = TX_8X8;
+    else if (tx_size > TX_16X16)
+      tx_size = TX_16X16;
+
+    // For screen content, force 4X4 tx_size over 8X8 for large variance.
+    if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && tx_size == TX_8X8 &&
+        bsize <= BLOCK_16X16 && var > (ac_thr << 6))
+      tx_size = TX_4X4;
+  } else {
+    tx_size = VPXMIN(max_txsize_lookup[bsize],
+                     tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+  }
+
+  return tx_size;
+}
+
 static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
                                     MACROBLOCK *x, MACROBLOCKD *xd,
                                     int *out_rate_sum, int64_t *out_dist_sum,
@@ -394,24 +434,10 @@
                           cpi->common.height, abs(sum) >> (bw + bh));
 #endif
 
-  if (cpi->common.tx_mode == TX_MODE_SELECT) {
-    if (sse > (var << 2))
-      tx_size = VPXMIN(max_txsize_lookup[bsize],
-                       tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
-    else
-      tx_size = TX_8X8;
-
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
-        cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id))
-      tx_size = TX_8X8;
-    else if (tx_size > TX_16X16)
-      tx_size = TX_16X16;
-  } else {
-    tx_size = VPXMIN(max_txsize_lookup[bsize],
-                     tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
-  }
-
-  assert(tx_size >= TX_8X8);
+  tx_size = calculate_tx_size(cpi, bsize, xd, var, sse, ac_thr);
+  // The code below for setting the skip flag assumes a transform size of at
+  // least 8x8, so force this lower limit on the transform.
+  if (tx_size < TX_8X8) tx_size = TX_8X8;
   xd->mi[0]->tx_size = tx_size;
 
   if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && x->zero_temp_sad_source &&
@@ -575,24 +601,7 @@
   *var_y = var;
   *sse_y = sse;
 
-  if (cpi->common.tx_mode == TX_MODE_SELECT) {
-    if (sse > (var << 2))
-      xd->mi[0]->tx_size =
-          VPXMIN(max_txsize_lookup[bsize],
-                 tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
-    else
-      xd->mi[0]->tx_size = TX_8X8;
-
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
-        cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id))
-      xd->mi[0]->tx_size = TX_8X8;
-    else if (xd->mi[0]->tx_size > TX_16X16)
-      xd->mi[0]->tx_size = TX_16X16;
-  } else {
-    xd->mi[0]->tx_size =
-        VPXMIN(max_txsize_lookup[bsize],
-               tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
-  }
+  xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, xd, var, sse, ac_thr);
 
   // Evaluate if the partition block is a skippable block in Y plane.
   {
@@ -1304,18 +1313,16 @@
     VP9_PICKMODE_CTX_DEN *ctx_den, int64_t zero_last_cost_orig,
     int ref_frame_cost[MAX_REF_FRAMES],
     int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int reuse_inter_pred,
-    TX_SIZE best_tx_size, PREDICTION_MODE best_mode,
-    MV_REFERENCE_FRAME best_ref_frame, INTERP_FILTER best_pred_filter,
-    uint8_t best_mode_skip_txfm) {
+    BEST_PICKMODE *bp) {
   ctx_den->zero_last_cost_orig = zero_last_cost_orig;
   ctx_den->ref_frame_cost = ref_frame_cost;
   ctx_den->frame_mv = frame_mv;
   ctx_den->reuse_inter_pred = reuse_inter_pred;
-  ctx_den->best_tx_size = best_tx_size;
-  ctx_den->best_mode = best_mode;
-  ctx_den->best_ref_frame = best_ref_frame;
-  ctx_den->best_pred_filter = best_pred_filter;
-  ctx_den->best_mode_skip_txfm = best_mode_skip_txfm;
+  ctx_den->best_tx_size = bp->best_tx_size;
+  ctx_den->best_mode = bp->best_mode;
+  ctx_den->best_ref_frame = bp->best_ref_frame;
+  ctx_den->best_pred_filter = bp->best_pred_filter;
+  ctx_den->best_mode_skip_txfm = bp->best_mode_skip_txfm;
 }
 
 static void recheck_zeromv_after_denoising(
@@ -1429,6 +1436,179 @@
   return force_skip_low_temp_var;
 }
 
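+// Evaluates each interpolation filter from EIGHTTAP to EIGHTTAP_SMOOTH for
+// the current inter prediction and keeps the one with the lowest rd cost.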
+static void search_filter_ref(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
+                              int mi_row, int mi_col, PRED_BUFFER *tmp,
+                              BLOCK_SIZE bsize, int reuse_inter_pred,
+                              PRED_BUFFER **this_mode_pred, unsigned int *var_y,
+                              unsigned int *sse_y) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
+
+  int pf_rate[3] = { 0 };
+  int64_t pf_dist[3] = { 0 };
+  int curr_rate[3] = { 0 };
+  unsigned int pf_var[3] = { 0 };
+  unsigned int pf_sse[3] = { 0 };
+  TX_SIZE pf_tx_size[3] = { 0 };
+  int64_t best_cost = INT64_MAX;
+  INTERP_FILTER best_filter = SWITCHABLE, filter;
+  PRED_BUFFER *current_pred = *this_mode_pred;
+  uint8_t skip_txfm = SKIP_TXFM_NONE;
+
+  for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) {
+    int64_t cost;
+    mi->interp_filter = filter;
+    vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+    model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter],
+                      &pf_var[filter], &pf_sse[filter]);
+    curr_rate[filter] = pf_rate[filter];
+    pf_rate[filter] += vp9_get_switchable_rate(cpi, xd);
+    cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]);
+    pf_tx_size[filter] = mi->tx_size;
+    if (cost < best_cost) {
+      best_filter = filter;
+      best_cost = cost;
+      skip_txfm = x->skip_txfm[0];
+
+      if (reuse_inter_pred) {
+        if (*this_mode_pred != current_pred) {
+          free_pred_buffer(*this_mode_pred);
+          *this_mode_pred = current_pred;
+        }
+        current_pred = &tmp[get_pred_buffer(tmp, 3)];
+        pd->dst.buf = current_pred->data;
+        pd->dst.stride = bw;
+      }
+    }
+  }
+
+  if (reuse_inter_pred && *this_mode_pred != current_pred)
+    free_pred_buffer(current_pred);
+
+  mi->interp_filter = best_filter;
+  mi->tx_size = pf_tx_size[best_filter];
+  this_rdc->rate = curr_rate[best_filter];
+  this_rdc->dist = pf_dist[best_filter];
+  *var_y = pf_var[best_filter];
+  *sse_y = pf_sse[best_filter];
+  x->skip_txfm[0] = skip_txfm;
+  if (reuse_inter_pred) {
+    pd->dst.buf = (*this_mode_pred)->data;
+    pd->dst.stride = (*this_mode_pred)->stride;
+  }
+}
+
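+// Runs the NEWMV motion search for |ref_frame|. Returns 0 on success, or -1
+// when the search is skipped or fails and NEWMV should not be evaluated.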
+static int search_new_mv(VP9_COMP *cpi, MACROBLOCK *x,
+                         int_mv frame_mv[][MAX_REF_FRAMES],
+                         MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref,
+                         BLOCK_SIZE bsize, int mi_row, int mi_col,
+                         int best_pred_sad, int *rate_mv,
+                         unsigned int best_sse_sofar, RD_COST *best_rdc) {
+  SVC *const svc = &cpi->svc;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  SPEED_FEATURES *const sf = &cpi->sf;
+
+  if (ref_frame > LAST_FRAME && gf_temporal_ref &&
+      cpi->oxcf.rc_mode == VPX_CBR) {
+    int tmp_sad;
+    uint32_t dis;
+    int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+
+    if (bsize < BLOCK_16X16) return -1;
+
+    tmp_sad = vp9_int_pro_motion_estimation(
+        cpi, x, bsize, mi_row, mi_col,
+        &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv);
+
+    if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1;
+    if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) return -1;
+
+    frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int;
+    *rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
+                               &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv,
+                               x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+    frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
+    frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
+
+    cpi->find_fractional_mv_step(
+        x, &frame_mv[NEWMV][ref_frame].as_mv,
+        &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv,
+        cpi->common.allow_high_precision_mv, x->errorperbit,
+        &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+        cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
+        x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0, 0);
+  } else if (svc->use_base_mv && svc->spatial_layer_id) {
+    if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) {
+      const int pre_stride = xd->plane[0].pre[0].stride;
+      unsigned int base_mv_sse = UINT_MAX;
+      int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4;
+      const uint8_t *const pre_buf =
+          xd->plane[0].pre[0].buf +
+          (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride +
+          (frame_mv[NEWMV][ref_frame].as_mv.col >> 3);
+      cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+                            pre_buf, pre_stride, &base_mv_sse);
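+      // base_mv_sse measures how well the base-layer motion vector predicts
+      // the current block; it gates the NEWMV refinement below.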
+
+      // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16X16,
+      // for SVC encoding.
+      if (cpi->use_svc && svc->use_base_mv && bsize < BLOCK_16X16 &&
+          frame_mv[NEWMV][ref_frame].as_mv.row == 0 &&
+          frame_mv[NEWMV][ref_frame].as_mv.col == 0)
+        return -1;
+
+      // Exit NEWMV search if base_mv_sse is large.
+      if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale))
+        return -1;
+      if (base_mv_sse < (best_sse_sofar << 1)) {
+        // Base layer mv is good.
+        // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since
+        // (0, 0) mode is already tested.
+        unsigned int base_mv_sse_normalized =
+            base_mv_sse >>
+            (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+        if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar &&
+            base_mv_sse_normalized < 400 &&
+            frame_mv[NEWMV][ref_frame].as_mv.row == 0 &&
+            frame_mv[NEWMV][ref_frame].as_mv.col == 0)
+          return -1;
+        if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                    &frame_mv[NEWMV][ref_frame], rate_mv,
+                                    best_rdc->rdcost, 1)) {
+          return -1;
+        }
+      } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                         &frame_mv[NEWMV][ref_frame], rate_mv,
+                                         best_rdc->rdcost, 0)) {
+        return -1;
+      }
+    } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                       &frame_mv[NEWMV][ref_frame], rate_mv,
+                                       best_rdc->rdcost, 0)) {
+      return -1;
+    }
+  } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                     &frame_mv[NEWMV][ref_frame], rate_mv,
+                                     best_rdc->rdcost, 0)) {
+    return -1;
+  }
+
+  return 0;
+}
+
+static INLINE void init_best_pickmode(BEST_PICKMODE *bp) {
+  bp->best_mode = ZEROMV;
+  bp->best_ref_frame = LAST_FRAME;
+  bp->best_tx_size = TX_SIZES;
+  bp->best_intra_tx_size = TX_SIZES;
+  bp->best_pred_filter = EIGHTTAP;
+  bp->best_mode_skip_txfm = SKIP_TXFM_NONE;
+  bp->best_second_ref_frame = NONE;
+  bp->best_pred = NULL;
+}
+
 void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
                          int mi_row, int mi_col, RD_COST *rd_cost,
                          BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
@@ -1438,18 +1618,17 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mi = xd->mi[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
-  PREDICTION_MODE best_mode = ZEROMV;
-  MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
+
+  BEST_PICKMODE best_pickmode;
+
+  MV_REFERENCE_FRAME ref_frame;
   MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame;
-  TX_SIZE best_tx_size = TX_SIZES;
-  INTERP_FILTER best_pred_filter = EIGHTTAP;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
   RD_COST this_rdc, best_rdc;
-  uint8_t skip_txfm = SKIP_TXFM_NONE, best_mode_skip_txfm = SKIP_TXFM_NONE;
   // var_y and sse_y are saved to be used in skip checking
   unsigned int var_y = UINT_MAX;
   unsigned int sse_y = UINT_MAX;
@@ -1485,7 +1664,6 @@
   DECLARE_ALIGNED(16, uint16_t, pred_buf_16[3 * 64 * 64]);
 #endif
   struct buf_2d orig_dst = pd->dst;
-  PRED_BUFFER *best_pred = NULL;
   PRED_BUFFER *this_mode_pred = NULL;
   const int pixels_in_block = bh * bw;
   int reuse_inter_pred = cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready;
@@ -1508,7 +1686,6 @@
   int denoise_svc_pickmode = 1;
 #endif
   INTERP_FILTER filter_gf_svc = EIGHTTAP;
-  MV_REFERENCE_FRAME best_second_ref_frame = NONE;
   MV_REFERENCE_FRAME inter_layer_ref = GOLDEN_FRAME;
   const struct segmentation *const seg = &cm->seg;
   int comp_modes = 0;
@@ -1521,9 +1698,12 @@
   int scene_change_detected =
       cpi->rc.high_source_sad ||
       (cpi->use_svc && cpi->svc.high_source_sad_superframe);
+
+  init_best_pickmode(&best_pickmode);
+
   x->source_variance = UINT_MAX;
   if (cpi->sf.default_interp_filter == BILINEAR) {
-    best_pred_filter = BILINEAR;
+    best_pickmode.best_pred_filter = BILINEAR;
     filter_gf_svc = BILINEAR;
   }
   if (cpi->use_svc && svc->spatial_layer_id > 0) {
@@ -1615,7 +1795,7 @@
           vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
 
     if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && mi->segment_id > 0 &&
-        x->source_variance == 0) {
+        x->zero_temp_sad_source && x->source_variance == 0) {
       mi->segment_id = 0;
       vp9_init_plane_quantizers(cpi, x);
     }
@@ -1827,7 +2007,9 @@
     if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue;
 
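+    // For screen content on the base spatial layer, skip flat blocks even at
+    // zero motion unless the source block is also temporally static
+    // (zero_temp_sad_source).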
     if (sf->short_circuit_flat_blocks && x->source_variance == 0 &&
-        frame_mv[this_mode][ref_frame].as_int != 0) {
+        (frame_mv[this_mode][ref_frame].as_int != 0 ||
+         (cpi->oxcf.content == VP9E_CONTENT_SCREEN && !svc->spatial_layer_id &&
+          !x->zero_temp_sad_source))) {
       continue;
     }
 
@@ -1922,8 +2104,9 @@
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
 
     mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
-    mode_rd_thresh = best_mode_skip_txfm ? rd_threshes[mode_index] << 1
-                                         : rd_threshes[mode_index];
+    mode_rd_thresh = best_pickmode.best_mode_skip_txfm
+                         ? rd_threshes[mode_index] << 1
+                         : rd_threshes[mode_index];
 
     // Increase mode_rd_thresh value for GOLDEN_FRAME for improved encoding
     // speed with little/no subjective quality loss.
@@ -1940,91 +2123,10 @@
       if (frame_mv[this_mode][ref_frame].as_int != 0) continue;
 
     if (this_mode == NEWMV && !force_mv_inter_layer) {
-      if (ref_frame > LAST_FRAME && gf_temporal_ref &&
-          cpi->oxcf.rc_mode == VPX_CBR) {
-        int tmp_sad;
-        uint32_t dis;
-        int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX };
-
-        if (bsize < BLOCK_16X16) continue;
-
-        tmp_sad = vp9_int_pro_motion_estimation(
-            cpi, x, bsize, mi_row, mi_col,
-            &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv);
-
-        if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) continue;
-        if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad)
-          continue;
-
-        frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int;
-        rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
-                                  &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv,
-                                  x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-        frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
-        frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
-
-        cpi->find_fractional_mv_step(
-            x, &frame_mv[NEWMV][ref_frame].as_mv,
-            &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv,
-            cpi->common.allow_high_precision_mv, x->errorperbit,
-            &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
-            cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
-            x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0,
-            0);
-      } else if (svc->use_base_mv && svc->spatial_layer_id) {
-        if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) {
-          const int pre_stride = xd->plane[0].pre[0].stride;
-          unsigned int base_mv_sse = UINT_MAX;
-          int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4;
-          const uint8_t *const pre_buf =
-              xd->plane[0].pre[0].buf +
-              (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride +
-              (frame_mv[NEWMV][ref_frame].as_mv.col >> 3);
-          cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride,
-                                pre_buf, pre_stride, &base_mv_sse);
-
-          // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16,
-          // for SVC encoding.
-          if (cpi->use_svc && svc->use_base_mv && bsize < BLOCK_16X16 &&
-              frame_mv[NEWMV][ref_frame].as_mv.row == 0 &&
-              frame_mv[NEWMV][ref_frame].as_mv.col == 0)
-            continue;
-
-          // Exit NEWMV search if base_mv_sse is large.
-          if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale))
-            continue;
-          if (base_mv_sse < (best_sse_sofar << 1)) {
-            // Base layer mv is good.
-            // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since
-            // (0, 0) mode is already tested.
-            unsigned int base_mv_sse_normalized =
-                base_mv_sse >>
-                (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
-            if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar &&
-                base_mv_sse_normalized < 400 &&
-                frame_mv[NEWMV][ref_frame].as_mv.row == 0 &&
-                frame_mv[NEWMV][ref_frame].as_mv.col == 0)
-              continue;
-            if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                        &frame_mv[NEWMV][ref_frame], &rate_mv,
-                                        best_rdc.rdcost, 1)) {
-              continue;
-            }
-          } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                             &frame_mv[NEWMV][ref_frame],
-                                             &rate_mv, best_rdc.rdcost, 0)) {
-            continue;
-          }
-        } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                           &frame_mv[NEWMV][ref_frame],
-                                           &rate_mv, best_rdc.rdcost, 0)) {
-          continue;
-        }
-      } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                         &frame_mv[NEWMV][ref_frame], &rate_mv,
-                                         best_rdc.rdcost, 0)) {
+      if (search_new_mv(cpi, x, frame_mv, ref_frame, gf_temporal_ref, bsize,
+                        mi_row, mi_col, best_pred_sad, &rate_mv, best_sse_sofar,
+                        &best_rdc))
         continue;
-      }
     }
 
     // TODO(jianj): Skipping the testing of (duplicate) non-zero motion vector
@@ -2085,58 +2187,9 @@
          (ref_frame == GOLDEN_FRAME && !force_mv_inter_layer &&
           (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) &&
         (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) {
-      int pf_rate[3];
-      int64_t pf_dist[3];
-      int curr_rate[3];
-      unsigned int pf_var[3];
-      unsigned int pf_sse[3];
-      TX_SIZE pf_tx_size[3];
-      int64_t best_cost = INT64_MAX;
-      INTERP_FILTER best_filter = SWITCHABLE, filter;
-      PRED_BUFFER *current_pred = this_mode_pred;
       rd_computed = 1;
-
-      for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) {
-        int64_t cost;
-        mi->interp_filter = filter;
-        vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
-        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter],
-                          &pf_var[filter], &pf_sse[filter]);
-        curr_rate[filter] = pf_rate[filter];
-        pf_rate[filter] += vp9_get_switchable_rate(cpi, xd);
-        cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]);
-        pf_tx_size[filter] = mi->tx_size;
-        if (cost < best_cost) {
-          best_filter = filter;
-          best_cost = cost;
-          skip_txfm = x->skip_txfm[0];
-
-          if (reuse_inter_pred) {
-            if (this_mode_pred != current_pred) {
-              free_pred_buffer(this_mode_pred);
-              this_mode_pred = current_pred;
-            }
-            current_pred = &tmp[get_pred_buffer(tmp, 3)];
-            pd->dst.buf = current_pred->data;
-            pd->dst.stride = bw;
-          }
-        }
-      }
-
-      if (reuse_inter_pred && this_mode_pred != current_pred)
-        free_pred_buffer(current_pred);
-
-      mi->interp_filter = best_filter;
-      mi->tx_size = pf_tx_size[best_filter];
-      this_rdc.rate = curr_rate[best_filter];
-      this_rdc.dist = pf_dist[best_filter];
-      var_y = pf_var[best_filter];
-      sse_y = pf_sse[best_filter];
-      x->skip_txfm[0] = skip_txfm;
-      if (reuse_inter_pred) {
-        pd->dst.buf = this_mode_pred->data;
-        pd->dst.stride = this_mode_pred->stride;
-      }
+      search_filter_ref(cpi, x, &this_rdc, mi_row, mi_col, tmp, bsize,
+                        reuse_inter_pred, &this_mode_pred, &var_y, &sse_y);
     } else {
       // For low motion content use x->sb_is_skin in addition to VeryHighSad
       // for setting large_block.
@@ -2242,7 +2295,7 @@
 
     // Skip checking: test to see if this block can be reconstructed by
     // prediction only.
-    if (cpi->allow_encode_breakout && !xd->lossless) {
+    if (cpi->allow_encode_breakout && !xd->lossless && !scene_change_detected) {
       encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode,
                            var_y, sse_y, yv12_mb, &this_rdc.rate,
                            &this_rdc.dist, flag_preduv_computed);
@@ -2269,17 +2322,17 @@
 
     if (this_rdc.rdcost < best_rdc.rdcost || x->skip) {
       best_rdc = this_rdc;
-      best_mode = this_mode;
-      best_pred_filter = mi->interp_filter;
-      best_tx_size = mi->tx_size;
-      best_ref_frame = ref_frame;
-      best_mode_skip_txfm = x->skip_txfm[0];
       best_early_term = this_early_term;
-      best_second_ref_frame = second_ref_frame;
+      best_pickmode.best_mode = this_mode;
+      best_pickmode.best_pred_filter = mi->interp_filter;
+      best_pickmode.best_tx_size = mi->tx_size;
+      best_pickmode.best_ref_frame = ref_frame;
+      best_pickmode.best_mode_skip_txfm = x->skip_txfm[0];
+      best_pickmode.best_second_ref_frame = second_ref_frame;
 
       if (reuse_inter_pred) {
-        free_pred_buffer(best_pred);
-        best_pred = this_mode_pred;
+        free_pred_buffer(best_pickmode.best_pred);
+        best_pickmode.best_pred = this_mode_pred;
       }
     } else {
       if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
@@ -2289,20 +2342,21 @@
 
     // If early termination flag is 1 and at least 2 modes are checked,
     // the mode search is terminated.
-    if (best_early_term && idx > 0) {
+    if (best_early_term && idx > 0 && !scene_change_detected) {
       x->skip = 1;
       break;
     }
   }
 
-  mi->mode = best_mode;
-  mi->interp_filter = best_pred_filter;
-  mi->tx_size = best_tx_size;
-  mi->ref_frame[0] = best_ref_frame;
-  mi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
+  mi->mode = best_pickmode.best_mode;
+  mi->interp_filter = best_pickmode.best_pred_filter;
+  mi->tx_size = best_pickmode.best_tx_size;
+  mi->ref_frame[0] = best_pickmode.best_ref_frame;
+  mi->mv[0].as_int =
+      frame_mv[best_pickmode.best_mode][best_pickmode.best_ref_frame].as_int;
   xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int;
-  x->skip_txfm[0] = best_mode_skip_txfm;
-  mi->ref_frame[1] = best_second_ref_frame;
+  x->skip_txfm[0] = best_pickmode.best_mode_skip_txfm;
+  mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
 
   // For a spatial enhancement layer: perform intra prediction only if the base
   // layer is chosen as the reference. Always perform intra prediction if
@@ -2314,7 +2368,7 @@
         svc->layer_context[svc->temporal_layer_id].is_key_frame ||
         !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) ||
         (!svc->layer_context[svc->temporal_layer_id].is_key_frame &&
-         svc_force_zero_mode[best_ref_frame - 1]);
+         svc_force_zero_mode[best_pickmode.best_ref_frame - 1]);
     inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh;
   }
   if ((cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR &&
@@ -2339,7 +2393,7 @@
        !x->lowvar_highsumdiff)) {
     struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
     int i;
-    TX_SIZE best_intra_tx_size = TX_SIZES;
+    PRED_BUFFER *const best_pred = best_pickmode.best_pred;
     TX_SIZE intra_tx_size =
         VPXMIN(max_txsize_lookup[bsize],
                tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
@@ -2364,7 +2418,7 @@
                           this_mode_pred->data, this_mode_pred->stride, NULL, 0,
                           0, 0, 0, bw, bh);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-        best_pred = this_mode_pred;
+        best_pickmode.best_pred = this_mode_pred;
       }
     }
     pd->dst = orig_dst;
@@ -2424,36 +2478,37 @@
 
       if (this_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = this_rdc;
-        best_mode = this_mode;
-        best_intra_tx_size = mi->tx_size;
-        best_ref_frame = INTRA_FRAME;
-        best_second_ref_frame = NONE;
+        best_pickmode.best_mode = this_mode;
+        best_pickmode.best_intra_tx_size = mi->tx_size;
+        best_pickmode.best_ref_frame = INTRA_FRAME;
+        best_pickmode.best_second_ref_frame = NONE;
         mi->uv_mode = this_mode;
         mi->mv[0].as_int = INVALID_MV;
         mi->mv[1].as_int = INVALID_MV;
-        best_mode_skip_txfm = x->skip_txfm[0];
+        best_pickmode.best_mode_skip_txfm = x->skip_txfm[0];
       }
     }
 
     // Reset mb_mode_info to the best inter mode.
-    if (best_ref_frame != INTRA_FRAME) {
-      mi->tx_size = best_tx_size;
+    if (best_pickmode.best_ref_frame != INTRA_FRAME) {
+      mi->tx_size = best_pickmode.best_tx_size;
     } else {
-      mi->tx_size = best_intra_tx_size;
+      mi->tx_size = best_pickmode.best_intra_tx_size;
     }
   }
 
   pd->dst = orig_dst;
-  mi->mode = best_mode;
-  mi->ref_frame[0] = best_ref_frame;
-  mi->ref_frame[1] = best_second_ref_frame;
-  x->skip_txfm[0] = best_mode_skip_txfm;
+  mi->mode = best_pickmode.best_mode;
+  mi->ref_frame[0] = best_pickmode.best_ref_frame;
+  mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
+  x->skip_txfm[0] = best_pickmode.best_mode_skip_txfm;
 
   if (!is_inter_block(mi)) {
     mi->interp_filter = SWITCHABLE_FILTERS;
   }
 
-  if (reuse_inter_pred && best_pred != NULL) {
+  if (reuse_inter_pred && best_pickmode.best_pred != NULL) {
+    PRED_BUFFER *const best_pred = best_pickmode.best_pred;
     if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {
 #if CONFIG_VP9_HIGHBITDEPTH
       if (cm->use_highbitdepth)
@@ -2482,26 +2537,26 @@
     // Remove this condition when the issue is resolved.
     if (x->sb_pickmode_part) ctx->sb_skip_denoising = 1;
     vp9_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_frame_cost,
-                                frame_mv, reuse_inter_pred, best_tx_size,
-                                best_mode, best_ref_frame, best_pred_filter,
-                                best_mode_skip_txfm);
+                                frame_mv, reuse_inter_pred, &best_pickmode);
     vp9_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision,
                          gf_temporal_ref);
     recheck_zeromv_after_denoising(cpi, mi, x, xd, decision, &ctx_den, yv12_mb,
                                    &best_rdc, bsize, mi_row, mi_col);
-    best_ref_frame = ctx_den.best_ref_frame;
+    best_pickmode.best_ref_frame = ctx_den.best_ref_frame;
   }
 #endif
 
-  if (best_ref_frame == ALTREF_FRAME || best_second_ref_frame == ALTREF_FRAME)
+  if (best_pickmode.best_ref_frame == ALTREF_FRAME ||
+      best_pickmode.best_second_ref_frame == ALTREF_FRAME)
     x->arf_frame_usage++;
-  else if (best_ref_frame != INTRA_FRAME)
+  else if (best_pickmode.best_ref_frame != INTRA_FRAME)
     x->lastgolden_frame_usage++;
 
   if (cpi->sf.adaptive_rd_thresh) {
-    THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mi->mode)];
+    THR_MODES best_mode_idx =
+        mode_idx[best_pickmode.best_ref_frame][mode_offset(mi->mode)];
 
-    if (best_ref_frame == INTRA_FRAME) {
+    if (best_pickmode.best_ref_frame == INTRA_FRAME) {
       // Only consider the modes that are included in the intra_mode_list.
       int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE);
       int i;
@@ -2521,7 +2576,7 @@
     } else {
       for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
         PREDICTION_MODE this_mode;
-        if (best_ref_frame != ref_frame) continue;
+        if (best_pickmode.best_ref_frame != ref_frame) continue;
         for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
           if (cpi->sf.adaptive_rd_thresh_row_mt)
             update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance,
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 22ae726..1ec6965 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -364,6 +364,7 @@
   rc->reset_high_source_sad = 0;
   rc->high_source_sad_lagindex = -1;
   rc->hybrid_intra_scene_change = 0;
+  rc->re_encode_maxq_scene_change = 0;
   rc->alt_ref_gf_group = 0;
   rc->last_frame_is_src_altref = 0;
   rc->fac_active_worst_inter = 150;
@@ -1329,6 +1330,14 @@
     }
   }
 
+  // For normal frames, do not allow an active minq lower than the q used for
+  // the last boosted frame.
+  if (!frame_is_intra_only(cm) &&
+      (!(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) ||
+       rc->is_src_frame_alt_ref)) {
+    active_best_quality = VPXMAX(active_best_quality, rc->last_boosted_qindex);
+  }
+
 #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
   vpx_clear_system_state();
   // Static forced key frames Q restrictions dealt with elsewhere.
@@ -2631,8 +2640,9 @@
     int frames_to_buffer = 1;
     int frame = 0;
     int scene_cut_force_key_frame = 0;
+    int num_zero_temp_sad = 0;
     uint64_t avg_sad_current = 0;
-    uint32_t min_thresh = 4000;
+    uint32_t min_thresh = 10000;
     float thresh = 8.0f;
     uint32_t thresh_key = 140000;
     if (cpi->oxcf.speed <= 5) thresh_key = 240000;
@@ -2694,6 +2704,7 @@
           last_src_y = frames[frame + 1]->y_buffer;
           last_src_ystride = frames[frame + 1]->y_stride;
         }
+        num_zero_temp_sad = 0;
         for (sbi_row = 0; sbi_row < sb_rows; ++sbi_row) {
           for (sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
             // Checker-board pattern, ignore boundary.
@@ -2705,6 +2716,7 @@
                                                last_src_ystride);
               avg_sad += tmp_sad;
               num_samples++;
+              if (tmp_sad == 0) num_zero_temp_sad++;
             }
             src_y += 64;
             last_src_y += 64;
@@ -2721,7 +2733,8 @@
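+          // Only flag a scene change when fewer than 75% of the sampled
+          // superblocks have zero temporal SAD; a large average SAD with
+          // mostly-static superblocks likely comes from a small moving region
+          // rather than a whole-scene change.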
           if (avg_sad >
                   VPXMAX(min_thresh,
                          (unsigned int)(rc->avg_source_sad[0] * thresh)) &&
-              rc->frames_since_key > 1)
+              rc->frames_since_key > 1 &&
+              num_zero_temp_sad < 3 * (num_samples >> 2))
             rc->high_source_sad = 1;
           else
             rc->high_source_sad = 0;
@@ -2794,15 +2807,22 @@
 
 // Test if the encoded frame will significantly overshoot the target bitrate,
 // and if so, set the QP, reset/adjust some rate control parameters, and
 // return 1.
+// frame_size = -1 means the frame has not been encoded yet.
 int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  SPEED_FEATURES *const sf = &cpi->sf;
   int thresh_qp = 7 * (rc->worst_quality >> 3);
   int thresh_rate = rc->avg_frame_bandwidth << 3;
   // Lower rate threshold for video.
   if (cpi->oxcf.content != VP9E_CONTENT_SCREEN)
     thresh_rate = rc->avg_frame_bandwidth << 2;
-  if (cm->base_qindex < thresh_qp && frame_size > thresh_rate) {
+  // If this decision is not based on an encoded frame size but only on
+  // scene/slide change detection (i.e., re_encode_overshoot_rt = 0), adjust
+  // the qp threshold and skip the (frame_size > thresh_rate) condition.
+  if (!sf->re_encode_overshoot_rt) thresh_qp = 3 * (rc->worst_quality >> 2);
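+  // 3/4 of worst_quality: a stricter bound than the default 7/8 above.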
+  if ((!sf->re_encode_overshoot_rt || frame_size > thresh_rate) &&
+      cm->base_qindex < thresh_qp) {
     double rate_correction_factor =
         cpi->rc.rate_correction_factors[INTER_NORMAL];
     const int target_size = cpi->rc.avg_frame_bandwidth;
@@ -2813,11 +2833,13 @@
     // Force a re-encode, and for now use max-QP.
     *q = cpi->rc.worst_quality;
     cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0;
+    cpi->rc.re_encode_maxq_scene_change = 1;
     // If the frame_size is much larger than the threshold (big content change)
     // and the encoded frame used a lot of Intra modes, then force hybrid_intra
     // encoding for the re-encode on this scene change. hybrid_intra will
     // use rd-based intra mode selection for small blocks.
-    if (frame_size > (thresh_rate << 1) && cpi->svc.spatial_layer_id == 0) {
+    if (sf->re_encode_overshoot_rt && frame_size > (thresh_rate << 1) &&
+        cpi->svc.spatial_layer_id == 0) {
       MODE_INFO **mi = cm->mi_grid_visible;
       int sum_intra_usage = 0;
       int mi_row, mi_col;
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 373c6a3..cf37117 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -190,6 +190,7 @@
   int high_source_sad;
   int count_last_scene_change;
   int hybrid_intra_scene_change;
+  int re_encode_maxq_scene_change;
   int avg_frame_low_motion;
   int af_ratio_onepass_vbr;
   int force_qpmin;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index ad4b48a..4005f85 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -783,7 +783,7 @@
 static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                              int64_t *distortion, int *skippable, int64_t *sse,
                              int64_t ref_best_rd, int plane, BLOCK_SIZE bsize,
-                             TX_SIZE tx_size, int use_fast_coef_casting) {
+                             TX_SIZE tx_size, int use_fast_coef_costing) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   struct rdcost_block_args args;
@@ -791,7 +791,7 @@
   args.cpi = cpi;
   args.x = x;
   args.best_rd = ref_best_rd;
-  args.use_fast_coef_costing = use_fast_coef_casting;
+  args.use_fast_coef_costing = use_fast_coef_costing;
   args.skippable = 1;
 
   if (plane == 0) xd->mi[0]->tx_size = tx_size;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 75a8de2..3471390 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -201,17 +201,18 @@
 
   if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
     sf->exhaustive_searches_thresh = (1 << 22);
-    for (i = 0; i < MAX_MESH_STEP; ++i) {
-      int mesh_density_level = 0;
-      sf->mesh_patterns[i].range =
-          good_quality_mesh_patterns[mesh_density_level][i].range;
-      sf->mesh_patterns[i].interval =
-          good_quality_mesh_patterns[mesh_density_level][i].interval;
-    }
   } else {
     sf->exhaustive_searches_thresh = INT_MAX;
   }
 
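+  // Initialize the mesh patterns for all content types: exhaustive mesh
+  // search is still gated by exhaustive_searches_thresh, but the MESH search
+  // method (e.g., used by the temporal filter) needs valid patterns.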
+  for (i = 0; i < MAX_MESH_STEP; ++i) {
+    const int mesh_density_level = 0;
+    sf->mesh_patterns[i].range =
+        good_quality_mesh_patterns[mesh_density_level][i].range;
+    sf->mesh_patterns[i].interval =
+        good_quality_mesh_patterns[mesh_density_level][i].interval;
+  }
+
   if (speed >= 1) {
     sf->enable_tpl_model = 0;
     sf->prune_ref_frame_for_rect_partitions = 0;
@@ -580,6 +581,7 @@
   }
 
   if (speed >= 6) {
+    sf->re_encode_overshoot_rt = 0;
     if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) {
       sf->use_altref_onepass = 1;
       sf->use_compound_nonrd_pickmode = 1;
@@ -846,7 +848,8 @@
   sf->quant_opt_thresh = 99.0;
   sf->allow_acl = 1;
 #if CONFIG_VP9_HIGHBITDEPTH
-  sf->enable_tpl_model = 0;
+  // TODO(jingning): Make the model support the high bit-depth route.
+  sf->enable_tpl_model = !cm->use_highbitdepth;
 #else
   sf->enable_tpl_model = 1;
 #endif
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index fd4973f..406b407 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -57,7 +57,8 @@
   BIGDIA = 3,
   SQUARE = 4,
   FAST_HEX = 5,
-  FAST_DIAMOND = 6
+  FAST_DIAMOND = 6,
+  MESH = 7
 } SEARCH_METHODS;
 
 typedef enum {
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 4db3e6f..99a03ed 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -230,7 +230,7 @@
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
-  const SEARCH_METHODS search_method = HEX;
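+  // A mesh pattern search is more exhaustive than HEX, trading encode speed
+  // for more accurate motion vectors in the temporal filter.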
+  const SEARCH_METHODS search_method = MESH;
   int step_param;
   int sadpb = x->sadperbit16;
   uint32_t bestsme = UINT_MAX;
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index ea3200f..7f45ab2 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -97,7 +97,7 @@
     const uint8_t *data, unsigned int data_sz, vpx_codec_stream_info_t *si,
     int *is_intra_only, vpx_decrypt_cb decrypt_cb, void *decrypt_state) {
   int intra_only_flag = 0;
-  uint8_t clear_buffer[10];
+  uint8_t clear_buffer[11];
 
   if (data + data_sz <= data) return VPX_CODEC_INVALID_PARAM;
 
@@ -158,6 +158,9 @@
         if (profile > PROFILE_0) {
           if (!parse_bitdepth_colorspace_sampling(profile, &rb))
             return VPX_CODEC_UNSUP_BITSTREAM;
+          // The colorspace info may cause vp9_read_frame_size() to need 11
+          // bytes.
+          if (data_sz < 11) return VPX_CODEC_UNSUP_BITSTREAM;
         }
         rb.bit_offset += REF_FRAMES;  // refresh_frame_flags
         vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
diff --git a/vpx_dsp/arm/loopfilter_8_neon.asm b/vpx_dsp/arm/loopfilter_8_neon.asm
index a042d40..a81a9d1 100644
--- a/vpx_dsp/arm/loopfilter_8_neon.asm
+++ b/vpx_dsp/arm/loopfilter_8_neon.asm
@@ -201,7 +201,7 @@
     str         lr, [sp, #16]              ; thresh1
     add         sp, #4
     pop         {r0-r1, lr}
-    add         r0, r1, lsl #3             ; s + 8 * pitch
+    add         r0, r0, r1, lsl #3         ; s + 8 * pitch
     b           vpx_lpf_vertical_8_neon
     ENDP        ; |vpx_lpf_vertical_8_dual_neon|
 
diff --git a/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm b/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm
new file mode 100644
index 0000000..d8e4bcc
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm
@@ -0,0 +1,438 @@
+;
+;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers*****************************************
+;    r0 => src
+;    r1 => dst
+;    r2 =>  src_stride
+;    r3 =>  dst_stride
+;    r4 => filter_x0
+;    r8 =>  ht
+;    r10 =>  wd
+
+    EXPORT          |vpx_convolve8_avg_horiz_filter_type1_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA  ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_horiz_filter_type1_neon| PROC
+
+    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
+                                                 ; the arguments
+    vpush           {d8  -  d15}                 ; stack offset by 64
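+    ; the arguments arrive as (src, src_stride, dst, dst_stride, ...), so swap
+    ; r1 and r2 to match the register map above (r1 = dst, r2 = src_stride)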
+    mov             r4,     r1
+    mov             r1,     r2
+    mov             r2,     r4
+
+start_loop_count
+    ldr             r4,     [sp,    #104]   ;loads pi1_coeff
+    ldr             r8,     [sp,    #108]   ;loads x0_q4
+    add             r4,     r4,     r8,     lsl #4 ;r4 = filter[x0_q4]
+    ldr             r8,     [sp,    #128]   ;loads ht
+    ldr             r10,    [sp,    #124]   ;loads wd
+    vld2.8          {d0,    d1},    [r4]    ;coeff = vld1_s8(pi1_coeff)
+    mov             r11,    #1
+    subs            r14,    r8,     #0      ;checks for ht == 0
+    vabs.s8         d2,     d0              ;vabs_s8(coeff)
+    vdup.8          d24,    d2[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0)
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    vdup.8          d25,    d2[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1)
+    add             r4,     r12,    r2      ;pu1_src_tmp2_8 = pu1_src + src_strd
+    vdup.8          d26,    d2[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2)
+    rsb             r9,     r10,    r2,     lsl #1 ;2*src_strd - wd
+    vdup.8          d27,    d2[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3)
+    rsb             r8,     r10,    r3,     lsl #1 ;2*dst_strd - wd
+    vdup.8          d28,    d2[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4)
+    vdup.8          d29,    d2[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5)
+    vdup.8          d30,    d2[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6)
+    vdup.8          d31,    d2[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7)
+    mov             r7,     r1
+    cmp             r10,    #4
+    ble             outer_loop_4
+
+    cmp             r10,    #24
+    moveq           r10,    #16
+    addeq           r8,     #8
+    addeq           r9,     #8
+    cmp             r10,    #16
+    bge             outer_loop_16
+
+    cmp             r10,    #12
+    addeq           r8,     #4
+    addeq           r9,     #4
+    b               outer_loop_8
+
+outer_loop8_residual
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    mov             r1,     r7
+    mov             r14,    #32
+    add             r1,     #16
+    add             r12,    #16
+    mov             r10,    #8
+    add             r8,     #8
+    add             r9,     #8
+
+outer_loop_8
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    subs            r5,     r10,    #0      ;checks wd
+    ble             end_inner_loop_8
+
+inner_loop_8
+    mov             r7,     #0xc000
+    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
+    vdup.16         q4,     r7
+    vld1.u32        {d1},   [r12],  r11
+    vdup.16         q5,     r7
+    vld1.u32        {d2},   [r12],  r11
+    vld1.u32        {d3},   [r12],  r11
+    mov             r7,     #0x4000
+    vld1.u32        {d4},   [r12],  r11
+    vmlsl.u8        q4,     d1,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {d5},   [r12],  r11
+    vmlal.u8        q4,     d3,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d6},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {d7},   [r12],  r11
+    vmlal.u8        q4,     d2,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
+    vmlal.u8        q4,     d4,     d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d13},  [r4],   r11
+    vmlal.u8        q4,     d5,     d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u32        {d14},  [r4],   r11
+    vmlsl.u8        q4,     d6,     d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vld1.u32        {d15},  [r4],   r11
+    vmlsl.u8        q4,     d7,     d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u32        {d16},  [r4],   r11     ;vector load pu1_src + src_strd
+    vdup.16         q11,    r7
+    vmlal.u8        q5,     d15,    d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d17},  [r4],   r11
+    vmlal.u8        q5,     d14,    d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vhadd.s16       q4,     q4,     q11
+    vld1.u32        {d18},  [r4],   r11
+    vmlal.u8        q5,     d16,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d19},  [r4],   r11     ;vector load pu1_src + src_strd
+    vmlal.u8        q5,     d17,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u8         {d6},   [r1]
+    vqrshrun.s16    d20,    q4,     #6      ;right shift and saturating narrow
+                                            ; result 1
+    vmlsl.u8        q5,     d18,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5,     d19,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u8         {d7},   [r6]
+    vrhadd.u8       d20,    d20,    d6
+    vmlsl.u8        q5,     d12,    d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlsl.u8        q5,     d13,    d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vst1.8          {d20},  [r1]!           ;store the result pu1_dst
+    vhadd.s16       q5,     q5,     q11
+    subs            r5,     r5,     #8      ;decrement the wd loop
+    vqrshrun.s16    d8,     q5,     #6      ;right shift and saturating narrow
+                                            ; result 2
+    vrhadd.u8       d8,     d8,     d7
+    vst1.8          {d8},   [r6]!           ;store the result pu1_dst
+    cmp             r5,     #4
+    bgt             inner_loop_8
+
+end_inner_loop_8
+    subs            r14,    r14,    #2      ;decrement the ht loop
+    add             r12,    r12,    r9      ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    add             r1,     r1,     r8      ;increment the dst pointer by
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_8
+
+    ldr             r10,    [sp,    #120]   ;loads wd
+    cmp             r10,    #12
+    beq             outer_loop4_residual
+
+end_loops
+    b               end_func
+
+outer_loop_16
+    str             r0,     [sp,  #-4]!
+    str             r7,     [sp,  #-4]!
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    and             r0,     r12,    #31
+    mov             r7,     #0xc000
+    sub             r5,     r10,    #0      ;checks wd
+    pld             [r4,    r2,     lsl #1]
+    pld             [r12,   r2,     lsl #1]
+    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
+    vdup.16         q4,     r7
+    vld1.u32        {q1},   [r12],  r11
+    vld1.u32        {q2},   [r12],  r11
+    vld1.u32        {q3},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6},   [r12],  r11
+    vmlsl.u8        q4,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q7},   [r12],  r11
+    vmlal.u8        q4,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q8},   [r12],  r11
+    vmlal.u8        q4,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9},   [r12],  r11
+    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlal.u8        q4,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vdup.16         q10,    r7
+    vmlsl.u8        q4,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+
+inner_loop_16
+    vmlsl.u8        q10,    d1,     d24
+    vdup.16         q5,     r7
+    vmlsl.u8        q10,    d3,     d25
+    mov             r7,     #0x4000
+    vdup.16         q11,    r7
+    vmlal.u8        q10,    d5,     d26
+    vld1.u32        {q0},   [r4],   r11     ;vector load pu1_src
+    vhadd.s16       q4,     q4,     q11
+    vld1.u32        {q1},   [r4],   r11
+    vmlal.u8        q10,    d7,     d27
+    add             r12,    #8
+    subs            r5,     r5,     #16
+    vmlal.u8        q10,    d13,    d28
+    vld1.u32        {q2},   [r4],   r11
+    vmlal.u8        q10,    d15,    d29
+    vld1.u32        {q3},   [r4],   r11
+    vqrshrun.s16    d8,     q4,     #6      ;right shift and saturating narrow
+                                            ; result 1
+    vmlsl.u8        q10,    d17,    d30
+    vld1.u32        {q6},   [r4],   r11
+    vmlsl.u8        q10,    d19,    d31
+    vld1.u32        {q7},   [r4],   r11
+    add             r7,     r1,     #8
+    vmlsl.u8        q5,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlsl.u8        q5,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8},   [r4],   r11
+    vhadd.s16       q10,    q10,    q11
+    vld1.u32        {q9},   [r4],   r11
+    vld1.u8         {d0},   [r1]
+    vmlal.u8        q5,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u8         {d2},   [r7]
+    vmlal.u8        q5,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    add             r4,     #8
+    mov             r7,     #0xc000
+    vmlal.u8        q5,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlal.u8        q5,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vqrshrun.s16    d9,     q10,    #6
+    vdup.16         q11,    r7
+    vmlsl.u8        q5,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    mov             r7,     #0x4000
+    vrhadd.u8       d8,     d8,     d0
+    vrhadd.u8       d9,     d9,     d2
+    vmlsl.u8        q11,    d1,     d24
+    vmlsl.u8        q11,    d3,     d25
+    vdup.16         q10,    r7
+    vmlal.u8        q11,    d5,     d26
+    pld             [r12,   r2,     lsl #2]
+    pld             [r4,    r2,     lsl #2]
+    addeq           r12,    r12,    r9      ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    addeq           r4,     r12,    r2      ;pu1_src + src_strd
+    vmlal.u8        q11,    d7,     d27
+    vmlal.u8        q11,    d13,    d28
+    vst1.8          {q4},   [r1]!           ;store the result pu1_dst
+    subeq           r14,    r14,    #2
+    vhadd.s16       q5,     q5,     q10
+    vmlal.u8        q11,    d15,    d29
+    addeq           r1,     r1,     r8
+    vmlsl.u8        q11,    d17,    d30
+    cmp             r14,    #0
+    vmlsl.u8        q11,    d19,    d31
+    vqrshrun.s16    d10,    q5,     #6      ;right shift and saturating narrow
+                                            ; result 2
+    beq             epilog_16
+
+    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
+    mov             r7,     #0xc000
+    cmp             r5,     #0
+    vld1.u32        {q1},   [r12],  r11
+    vhadd.s16       q11,    q11,    q10
+    vld1.u32        {q2},   [r12],  r11
+    vdup.16         q4,     r7
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vdup.16         q10,    r7
+    vld1.u32        {q3},   [r12],  r11
+    add             r7,     r6,     #8
+    moveq           r5,     r10
+    vld1.u8         {d0},   [r6]
+    vmlsl.u8        q4,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u8         {d2},   [r7]
+    vqrshrun.s16    d11,    q11,    #6
+    vmlal.u8        q4,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q6},   [r12],  r11
+    vrhadd.u8       d10,    d10,    d0
+    vld1.u32        {q7},   [r12],  r11
+    vrhadd.u8       d11,    d11,    d2
+    vld1.u32        {q8},   [r12],  r11
+    vmlal.u8        q4,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9},   [r12],  r11
+    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlal.u8        q4,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    mov             r7,     #0xc000
+    vmlsl.u8        q4,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
+    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    addeq           r6,     r1,     r3      ;pu1_dst + dst_strd
+    b               inner_loop_16
+
+epilog_16
+    mov             r7,     #0x4000
+    ldr             r0,     [sp],   #4
+    ldr             r10,    [sp,    #120]
+    vdup.16         q10,    r7
+    vhadd.s16       q11,    q11,    q10
+    vqrshrun.s16    d11,    q11,    #6
+    add             r7,     r6,     #8
+    vld1.u8         {d20},  [r6]
+    vld1.u8         {d21},  [r7]
+    vrhadd.u8       d10,    d10,    d20
+    vrhadd.u8       d11,    d11,    d21
+    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
+    ldr             r7,     [sp],   #4
+    cmp             r10,    #24
+    beq             outer_loop8_residual
+
+end_loops1
+    b               end_func
+
+outer_loop4_residual
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    mov             r1,     r7
+    add             r1,     #8
+    mov             r10,    #4
+    add             r12,    #8
+    mov             r14,    #16
+    add             r8,     #4
+    add             r9,     #4
+
+outer_loop_4
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    subs            r5,     r10,    #0      ;checks wd
+    ble             end_inner_loop_4
+
+inner_loop_4
+    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
+    vld1.u32        {d1},   [r12],  r11
+    vld1.u32        {d2},   [r12],  r11
+    vld1.u32        {d3},   [r12],  r11
+    vld1.u32        {d4},   [r12],  r11
+    vld1.u32        {d5},   [r12],  r11
+    vld1.u32        {d6},   [r12],  r11
+    vld1.u32        {d7},   [r12],  r11
+    sub             r12,    r12,    #4
+    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
+    vld1.u32        {d13},  [r4],   r11
+    vzip.32         d0,     d12             ;vector zip the i iteration and ii
+                                            ; iteration in a single register
+    vld1.u32        {d14},  [r4],   r11
+    vzip.32         d1,     d13
+    vld1.u32        {d15},  [r4],   r11
+    vzip.32         d2,     d14
+    vld1.u32        {d16},  [r4],   r11
+    vzip.32         d3,     d15
+    vld1.u32        {d17},  [r4],   r11
+    vzip.32         d4,     d16
+    vld1.u32        {d18},  [r4],   r11
+    vzip.32         d5,     d17
+    vld1.u32        {d19},  [r4],   r11
+    mov             r7,     #0xc000
+    vdup.16         q4,     r7
+    sub             r4,     r4,     #4
+    vzip.32         d6,     d18
+    vzip.32         d7,     d19
+    vmlsl.u8        q4,     d1,     d25     ;arithmetic operations for ii
+                                            ; iteration at the same time
+    vmlsl.u8        q4,     d0,     d24
+    vmlal.u8        q4,     d2,     d26
+    vmlal.u8        q4,     d3,     d27
+    vmlal.u8        q4,     d4,     d28
+    vmlal.u8        q4,     d5,     d29
+    vmlsl.u8        q4,     d6,     d30
+    vmlsl.u8        q4,     d7,     d31
+    mov             r7,     #0x4000
+    vdup.16         q10,    r7
+    vhadd.s16       q4,     q4,     q10
+    vqrshrun.s16    d8,     q4,     #6
+    vld1.u32        {d10[0]},       [r1]
+    vld1.u32        {d10[1]},       [r6]
+    vrhadd.u8       d8,     d8,     d10
+    vst1.32         {d8[0]},[r1]!           ;store the i iteration result which
+                                            ; is in the lower half of d8
+    vst1.32         {d8[1]},[r6]!           ;store the ii iteration result
+                                            ; which is in the upper half of d8
+    subs            r5,     r5,     #4      ;decrement the wd by 4
+    bgt             inner_loop_4
+
+end_inner_loop_4
+    subs            r14,    r14,    #2      ;decrement the ht loop count by 2
+    add             r12,    r12,    r9      ;increment the input pointer
+                                            ; 2*src_strd-wd
+    add             r1,     r1,     r8      ;increment the output pointer
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_4
+
+end_func
+    vpop            {d8  -  d15}
+    ldmfd           sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+
+    ENDP
+
+    END
diff --git a/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm b/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm
new file mode 100644
index 0000000..7a77747
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm
@@ -0,0 +1,439 @@
+;
+;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+;    r0 => src
+;    r1 => dst
+;    r2 =>  src_stride
+;    r3 =>  dst_stride
+;    r4 => filter_x0
+;    r8 =>  ht
+;    r10 =>  wd
+
+    EXPORT          |vpx_convolve8_avg_horiz_filter_type2_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA  ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_horiz_filter_type2_neon| PROC
+
+    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
+                                                 ; the arguments
+    vpush           {d8  -  d15}                 ; stack offset by 64
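+    ; the arguments arrive as (src, src_stride, dst, dst_stride, ...), so swap
+    ; r1 and r2 to match the register map above (r1 = dst, r2 = src_stride)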
+    mov             r4,     r1
+    mov             r1,     r2
+    mov             r2,     r4
+
+start_loop_count
+    ldr             r4,     [sp,    #104]   ;loads pi1_coeff
+    ldr             r8,     [sp,    #108]   ;loads x0_q4
+    add             r4,     r4,     r8,     lsl #4 ;r4 = filter[x0_q4]
+    ldr             r8,     [sp,    #128]   ;loads ht
+    ldr             r10,    [sp,    #124]   ;loads wd
+    vld2.8          {d0,    d1},    [r4]    ;coeff = vld1_s8(pi1_coeff)
+    mov             r11,    #1
+    subs            r14,    r8,     #0      ;checks for ht == 0
+    vabs.s8         d2,     d0              ;vabs_s8(coeff)
+    vdup.8          d24,    d2[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0)
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    vdup.8          d25,    d2[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1)
+    add             r4,     r12,    r2      ;pu1_src_tmp2_8 = pu1_src + src_strd
+    vdup.8          d26,    d2[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2)
+    rsb             r9,     r10,    r2,     lsl #1 ;2*src_strd - wd
+    vdup.8          d27,    d2[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3)
+    rsb             r8,     r10,    r3,     lsl #1 ;2*dst_strd - wd
+    vdup.8          d28,    d2[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4)
+    vdup.8          d29,    d2[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5)
+    vdup.8          d30,    d2[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6)
+    vdup.8          d31,    d2[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7)
+    mov             r7,     r1
+    cmp             r10,    #4
+    ble             outer_loop_4
+
+    cmp             r10,    #24
+    moveq           r10,    #16
+    addeq           r8,     #8
+    addeq           r9,     #8
+    cmp             r10,    #16
+    bge             outer_loop_16
+
+    cmp             r10,    #12
+    addeq           r8,     #4
+    addeq           r9,     #4
+    b               outer_loop_8
+
+outer_loop8_residual
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    mov             r1,     r7
+    mov             r14,    #32
+    add             r1,     #16
+    add             r12,    #16
+    mov             r10,    #8
+    add             r8,     #8
+    add             r9,     #8
+
+outer_loop_8
+
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    subs            r5,     r10,    #0      ;checks wd
+    ble             end_inner_loop_8
+
+inner_loop_8
+    mov             r7,     #0xc000
+    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
+    vdup.16         q4,     r7
+    vld1.u32        {d1},   [r12],  r11
+    vdup.16         q5,     r7
+    vld1.u32        {d2},   [r12],  r11
+    vld1.u32        {d3},   [r12],  r11
+    mov             r7,     #0x4000
+    vld1.u32        {d4},   [r12],  r11
+    vmlal.u8        q4,     d1,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {d5},   [r12],  r11
+    vmlal.u8        q4,     d3,     d27     ;mul_res = vmlal_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d6},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {d7},   [r12],  r11
+    vmlsl.u8        q4,     d2,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
+    vmlal.u8        q4,     d4,     d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d13},  [r4],   r11
+    vmlsl.u8        q4,     d5,     d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u32        {d14},  [r4],   r11
+    vmlal.u8        q4,     d6,     d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vld1.u32        {d15},  [r4],   r11
+    vmlsl.u8        q4,     d7,     d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u32        {d16},  [r4],   r11     ;vector load pu1_src + src_strd
+    vdup.16         q11,    r7
+    vmlal.u8        q5,     d15,    d27     ;mul_res = vmlal_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d17},  [r4],   r11
+    vmlsl.u8        q5,     d14,    d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vhadd.s16       q4,     q4,     q11
+    vld1.u32        {d18},  [r4],   r11
+    vmlal.u8        q5,     d16,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d19},  [r4],   r11     ;vector load pu1_src + src_strd
+    vmlsl.u8        q5,     d17,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u8         {d6},   [r1]
+    vqrshrun.s16    d20,    q4,     #6      ;right shift and saturating narrow
+                                            ; result 1
+    vmlal.u8        q5,     d18,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5,     d19,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u8         {d7},   [r6]
+    vrhadd.u8       d20,    d20,    d6
+    vmlsl.u8        q5,     d12,    d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlal.u8        q5,     d13,    d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vst1.8          {d20},  [r1]!           ;store the result pu1_dst
+    vhadd.s16       q5,     q5,     q11
+    subs            r5,     r5,     #8      ;decrement the wd loop
+    vqrshrun.s16    d8,     q5,     #6      ;right shift and saturating narrow
+                                            ; result 2
+    vrhadd.u8       d8,     d8,     d7
+    vst1.8          {d8},   [r6]!           ;store the result pu1_dst
+    cmp             r5,     #4
+    bgt             inner_loop_8
+
+end_inner_loop_8
+    subs            r14,    r14,    #2      ;decrement the ht loop
+    add             r12,    r12,    r9      ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    add             r1,     r1,     r8      ;increment the dst pointer by
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_8
+
+    ldr             r10,    [sp,    #120]   ;loads wd
+    cmp             r10,    #12
+    beq             outer_loop4_residual
+
+end_loops
+    b               end_func
+
+outer_loop_16
+    str             r0,     [sp,  #-4]!
+    str             r7,     [sp,  #-4]!
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    and             r0,     r12,    #31
+    mov             r7,     #0xc000
+    sub             r5,     r10,    #0      ;checks wd
+    pld             [r4,    r2,     lsl #1]
+    pld             [r12,   r2,     lsl #1]
+    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
+    vdup.16         q4,     r7
+    vld1.u32        {q1},   [r12],  r11
+    vld1.u32        {q2},   [r12],  r11
+    vld1.u32        {q3},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6},   [r12],  r11
+    vmlal.u8        q4,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q7},   [r12],  r11
+    vmlsl.u8        q4,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q8},   [r12],  r11
+    vmlal.u8        q4,     d6,     d27     ;mul_res = vmlal_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9},   [r12],  r11
+    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlsl.u8        q4,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vdup.16         q10,    r7
+    vmlal.u8        q4,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+
+inner_loop_16
+    vmlsl.u8        q10,    d1,     d24
+    vdup.16         q5,     r7
+    vmlal.u8        q10,    d3,     d25
+    mov             r7,     #0x4000
+    vdup.16         q11,    r7
+    vmlsl.u8        q10,    d5,     d26
+    vld1.u32        {q0},   [r4],   r11     ;vector load pu1_src
+    vhadd.s16       q4,     q4,     q11
+    vld1.u32        {q1},   [r4],   r11
+    vmlal.u8        q10,    d7,     d27
+    add             r12,    #8
+    subs            r5,     r5,     #16
+    vmlal.u8        q10,    d13,    d28
+    vld1.u32        {q2},   [r4],   r11
+    vmlsl.u8        q10,    d15,    d29
+    vld1.u32        {q3},   [r4],   r11
+    vqrshrun.s16    d8,     q4,     #6      ;right shift and saturating narrow
+                                            ; result 1
+    vmlal.u8        q10,    d17,    d30
+    vld1.u32        {q6},   [r4],   r11
+    vmlsl.u8        q10,    d19,    d31
+    vld1.u32        {q7},   [r4],   r11
+    add             r7,     r1,     #8
+    vmlsl.u8        q5,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlal.u8        q5,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8},   [r4],   r11
+    vhadd.s16       q10,    q10,    q11
+    vld1.u32        {q9},   [r4],   r11
+    vld1.u8         {d0},   [r1]
+    vmlsl.u8        q5,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u8         {d2},   [r7]
+    vmlal.u8        q5,     d6,     d27     ;mul_res = vmlal_u8(src[0_3],
+                                            ; coeffabs_3);
+    add             r4,     #8
+    mov             r7,     #0xc000
+    vmlal.u8        q5,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlsl.u8        q5,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vqrshrun.s16    d9,     q10,    #6
+    vdup.16         q11,    r7
+    vmlal.u8        q5,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    mov             r7,     #0x4000
+    vrhadd.u8       d8,     d8,     d0
+    vrhadd.u8       d9,     d9,     d2
+    vmlsl.u8        q11,    d1,     d24
+    vmlal.u8        q11,    d3,     d25
+    vdup.16         q10,    r7
+    vmlsl.u8        q11,    d5,     d26
+    pld             [r12,   r2,     lsl #2]
+    pld             [r4,    r2,     lsl #2]
+    addeq           r12,    r12,    r9      ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    addeq           r4,     r12,    r2      ;pu1_src + src_strd
+    vmlal.u8        q11,    d7,     d27
+    vmlal.u8        q11,    d13,    d28
+    vst1.8          {q4},   [r1]!           ;store the result pu1_dst
+    subeq           r14,    r14,    #2
+    vhadd.s16       q5,     q5,     q10
+    vmlsl.u8        q11,    d15,    d29
+    addeq           r1,     r1,     r8
+    vmlal.u8        q11,    d17,    d30
+    cmp             r14,    #0
+    vmlsl.u8        q11,    d19,    d31
+    vqrshrun.s16    d10,    q5,     #6      ;right shift and saturating narrow
+                                            ; result 2
+    beq             epilog_16
+
+    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
+    mov             r7,     #0xc000
+    cmp             r5,     #0
+    vld1.u32        {q1},   [r12],  r11
+    vhadd.s16       q11,    q11,    q10
+    vld1.u32        {q2},   [r12],  r11
+    vdup.16         q4,     r7
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vdup.16         q10,    r7
+    vld1.u32        {q3},   [r12],  r11
+    add             r7,     r6,     #8
+    moveq           r5,     r10
+    vld1.u8         {d0},   [r6]
+    vmlal.u8        q4,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u8         {d2},   [r7]
+    vqrshrun.s16    d11,    q11,    #6
+    vmlsl.u8        q4,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q6},   [r12],  r11
+    vrhadd.u8       d10,    d10,    d0
+    vld1.u32        {q7},   [r12],  r11
+    vrhadd.u8       d11,    d11,    d2
+    vld1.u32        {q8},   [r12],  r11
+    vmlal.u8        q4,     d6,     d27     ;mul_res = vmlal_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9},   [r12],  r11
+    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlsl.u8        q4,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    mov             r7,     #0xc000
+    vmlal.u8        q4,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
+    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    addeq           r6,     r1,     r3      ;pu1_dst + dst_strd
+    b               inner_loop_16
+
+epilog_16
+    mov             r7,     #0x4000
+    ldr             r0,     [sp],   #4
+    ldr             r10,    [sp,    #120]
+    vdup.16         q10,    r7
+    vhadd.s16       q11,    q11,    q10
+    vqrshrun.s16    d11,    q11,    #6
+    add             r7,     r6,     #8
+    vld1.u8         {d20},  [r6]
+    vld1.u8         {d21},  [r7]
+    vrhadd.u8       d10,    d10,    d20
+    vrhadd.u8       d11,    d11,    d21
+    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
+    ldr             r7,     [sp],   #4
+    cmp             r10,    #24
+    beq             outer_loop8_residual
+
+end_loops1
+    b               end_func
+
+outer_loop4_residual
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    mov             r1,     r7
+    add             r1,     #8
+    mov             r10,    #4
+    add             r12,    #8
+    mov             r14,    #16
+    add             r8,     #4
+    add             r9,     #4
+
+outer_loop_4
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    subs            r5,     r10,    #0      ;checks wd
+    ble             end_inner_loop_4
+
+inner_loop_4
+    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
+    vld1.u32        {d1},   [r12],  r11
+    vld1.u32        {d2},   [r12],  r11
+    vld1.u32        {d3},   [r12],  r11
+    vld1.u32        {d4},   [r12],  r11
+    vld1.u32        {d5},   [r12],  r11
+    vld1.u32        {d6},   [r12],  r11
+    vld1.u32        {d7},   [r12],  r11
+    sub             r12,    r12,    #4
+    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
+    vld1.u32        {d13},  [r4],   r11
+    vzip.32         d0,     d12             ;vector zip the i and ii iteration
+                                            ; rows into a single register
+    vld1.u32        {d14},  [r4],   r11
+    vzip.32         d1,     d13
+    vld1.u32        {d15},  [r4],   r11
+    vzip.32         d2,     d14
+    vld1.u32        {d16},  [r4],   r11
+    vzip.32         d3,     d15
+    vld1.u32        {d17},  [r4],   r11
+    vzip.32         d4,     d16
+    vld1.u32        {d18},  [r4],   r11
+    vzip.32         d5,     d17
+    vld1.u32        {d19},  [r4],   r11
+    mov             r7,     #0xc000
+    vdup.16         q4,     r7
+    sub             r4,     r4,     #4
+    vzip.32         d6,     d18
+    vzip.32         d7,     d19
+    vmlal.u8        q4,     d1,     d25     ;arithmetic operations for both
+                                            ; iterations at the same time
+    vmlsl.u8        q4,     d0,     d24
+    vmlsl.u8        q4,     d2,     d26
+    vmlal.u8        q4,     d3,     d27
+    vmlal.u8        q4,     d4,     d28
+    vmlsl.u8        q4,     d5,     d29
+    vmlal.u8        q4,     d6,     d30
+    vmlsl.u8        q4,     d7,     d31
+    mov             r7,     #0x4000
+    vdup.16         q10,    r7
+    vhadd.s16       q4,     q4,     q10
+    vqrshrun.s16    d8,     q4,     #6
+    vld1.u32        {d10[0]},       [r1]
+    vld1.u32        {d10[1]},       [r6]
+    vrhadd.u8       d8,     d8,     d10
+    vst1.32         {d8[0]},[r1]!           ;store the i iteration result which
+                                            ; is in the lower part of the register
+    vst1.32         {d8[1]},[r6]!           ;store the ii iteration result which
+                                            ; is in the upper part of the register
+    subs            r5,     r5,     #4      ;decrement the wd by 4
+    bgt             inner_loop_4
+
+end_inner_loop_4
+    subs            r14,    r14,    #2      ;decrement the ht loop by 2
+    add             r12,    r12,    r9      ;increment the input pointer
+                                            ; 2*src_strd-wd
+    add             r1,     r1,     r8      ;increment the output pointer
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_4
+
+end_func
+    vpop            {d8  -  d15}
+    ldmfd           sp!,    {r4  -  r12,    r15} ;restore registers and return
+
+    ENDP
+
+    END
diff --git a/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm b/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
deleted file mode 100644
index 1c2ee50..0000000
--- a/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
+++ /dev/null
@@ -1,295 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    ; These functions are only valid when:
-    ; x_step_q4 == 16
-    ; w%4 == 0
-    ; h%4 == 0
-    ; taps == 8
-    ; VP9_FILTER_WEIGHT == 128
-    ; VP9_FILTER_SHIFT == 7
-
-    EXPORT  |vpx_convolve8_avg_horiz_neon|
-    EXPORT  |vpx_convolve8_avg_vert_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Multiply and accumulate by q0
-    MACRO
-    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
-    vmull.s16 $dst, $src0, d0[0]
-    vmlal.s16 $dst, $src1, d0[1]
-    vmlal.s16 $dst, $src2, d0[2]
-    vmlal.s16 $dst, $src3, d0[3]
-    vmlal.s16 $dst, $src4, d1[0]
-    vmlal.s16 $dst, $src5, d1[1]
-    vmlal.s16 $dst, $src6, d1[2]
-    vmlal.s16 $dst, $src7, d1[3]
-    MEND
-
-; r0    const uint8_t *src
-; r1    int src_stride
-; r2    uint8_t *dst
-; r3    int dst_stride
-; sp[]const int16_t *filter
-; sp[]int x0_q4
-; sp[]int x_step_q4 ; unused
-; sp[]int y0_q4
-; sp[]int y_step_q4 ; unused
-; sp[]int w
-; sp[]int h
-
-|vpx_convolve8_avg_horiz_neon| PROC
-    push            {r4-r10, lr}
-
-    sub             r0, r0, #3              ; adjust for taps
-
-    ldrd            r4, r5, [sp, #32]       ; filter, x0_q4
-    add             r4, r5, lsl #4
-    ldrd            r6, r7, [sp, #52]       ; w, h
-
-    vld1.s16        {q0}, [r4]              ; filter
-
-    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
-    add             r8, r8, #4              ; -src_stride * 3 + 4
-
-    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
-    add             r4, r4, #4              ; -dst_stride * 3 + 4
-
-    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
-    sub             r9, r9, #7
-    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
-
-    mov             r10, r6                 ; w loop counter
-
-vpx_convolve8_avg_loop_horiz_v
-    vld1.8          {d24}, [r0], r1
-    vld1.8          {d25}, [r0], r1
-    vld1.8          {d26}, [r0], r1
-    vld1.8          {d27}, [r0], r8
-
-    vtrn.16         q12, q13
-    vtrn.8          d24, d25
-    vtrn.8          d26, d27
-
-    pld             [r0, r1, lsl #2]
-
-    vmovl.u8        q8, d24
-    vmovl.u8        q9, d25
-    vmovl.u8        q10, d26
-    vmovl.u8        q11, d27
-
-    ; save a few instructions in the inner loop
-    vswp            d17, d18
-    vmov            d23, d21
-
-    add             r0, r0, #3
-
-vpx_convolve8_avg_loop_horiz
-    add             r5, r0, #64
-
-    vld1.32         {d28[]}, [r0], r1
-    vld1.32         {d29[]}, [r0], r1
-    vld1.32         {d31[]}, [r0], r1
-    vld1.32         {d30[]}, [r0], r8
-
-    pld             [r5]
-
-    vtrn.16         d28, d31
-    vtrn.16         d29, d30
-    vtrn.8          d28, d29
-    vtrn.8          d31, d30
-
-    pld             [r5, r1]
-
-    ; extract to s16
-    vtrn.32         q14, q15
-    vmovl.u8        q12, d28
-    vmovl.u8        q13, d29
-
-    pld             [r5, r1, lsl #1]
-
-    ; slightly out of order load to match the existing data
-    vld1.u32        {d6[0]}, [r2], r3
-    vld1.u32        {d7[0]}, [r2], r3
-    vld1.u32        {d6[1]}, [r2], r3
-    vld1.u32        {d7[1]}, [r2], r3
-
-    sub             r2, r2, r3, lsl #2      ; reset for store
-
-    ; src[] * filter
-    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
-    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
-    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
-    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
-
-    pld             [r5, -r8]
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    ; transpose
-    vtrn.16         d2, d3
-    vtrn.32         d2, d3
-    vtrn.8          d2, d3
-
-    ; average the new value and the dst value
-    vrhadd.u8       q1, q1, q3
-
-    vst1.u32        {d2[0]}, [r2@32], r3
-    vst1.u32        {d3[0]}, [r2@32], r3
-    vst1.u32        {d2[1]}, [r2@32], r3
-    vst1.u32        {d3[1]}, [r2@32], r4
-
-    vmov            q8,  q9
-    vmov            d20, d23
-    vmov            q11, q12
-    vmov            q9,  q13
-
-    subs            r6, r6, #4              ; w -= 4
-    bgt             vpx_convolve8_avg_loop_horiz
-
-    ; outer loop
-    mov             r6, r10                 ; restore w counter
-    add             r0, r0, r9              ; src += src_stride * 4 - w
-    add             r2, r2, r12             ; dst += dst_stride * 4 - w
-    subs            r7, r7, #4              ; h -= 4
-    bgt vpx_convolve8_avg_loop_horiz_v
-
-    pop             {r4-r10, pc}
-
-    ENDP
-
-|vpx_convolve8_avg_vert_neon| PROC
-    push            {r4-r8, lr}
-
-    ; adjust for taps
-    sub             r0, r0, r1
-    sub             r0, r0, r1, lsl #1
-
-    ldr             r4, [sp, #24]           ; filter
-    ldr             r5, [sp, #36]           ; y0_q4
-    add             r4, r5, lsl #4
-    ldr             r6, [sp, #44]           ; w
-    ldr             lr, [sp, #48]           ; h
-
-    vld1.s16        {q0}, [r4]              ; filter
-
-    lsl             r1, r1, #1
-    lsl             r3, r3, #1
-
-vpx_convolve8_avg_loop_vert_h
-    mov             r4, r0
-    add             r7, r0, r1, asr #1
-    mov             r5, r2
-    add             r8, r2, r3, asr #1
-    mov             r12, lr                 ; h loop counter
-
-    vld1.u32        {d16[0]}, [r4], r1
-    vld1.u32        {d16[1]}, [r7], r1
-    vld1.u32        {d18[0]}, [r4], r1
-    vld1.u32        {d18[1]}, [r7], r1
-    vld1.u32        {d20[0]}, [r4], r1
-    vld1.u32        {d20[1]}, [r7], r1
-    vld1.u32        {d22[0]}, [r4], r1
-
-    vmovl.u8        q8, d16
-    vmovl.u8        q9, d18
-    vmovl.u8        q10, d20
-    vmovl.u8        q11, d22
-
-vpx_convolve8_avg_loop_vert
-    ; always process a 4x4 block at a time
-    vld1.u32        {d24[0]}, [r7], r1
-    vld1.u32        {d26[0]}, [r4], r1
-    vld1.u32        {d26[1]}, [r7], r1
-    vld1.u32        {d24[1]}, [r4], r1
-
-    ; extract to s16
-    vmovl.u8        q12, d24
-    vmovl.u8        q13, d26
-
-    vld1.u32        {d6[0]}, [r5@32], r3
-    vld1.u32        {d6[1]}, [r8@32], r3
-    vld1.u32        {d7[0]}, [r5@32], r3
-    vld1.u32        {d7[1]}, [r8@32], r3
-
-    pld             [r7]
-    pld             [r4]
-
-    ; src[] * filter
-    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
-
-    pld             [r7, r1]
-    pld             [r4, r1]
-
-    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
-
-    pld             [r5]
-    pld             [r8]
-
-    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
-
-    pld             [r5, r3]
-    pld             [r8, r3]
-
-    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    ; average the new value and the dst value
-    vrhadd.u8       q1, q1, q3
-
-    sub             r5, r5, r3, lsl #1      ; reset for store
-    sub             r8, r8, r3, lsl #1
-
-    vst1.u32        {d2[0]}, [r5@32], r3
-    vst1.u32        {d2[1]}, [r8@32], r3
-    vst1.u32        {d3[0]}, [r5@32], r3
-    vst1.u32        {d3[1]}, [r8@32], r3
-
-    vmov            q8, q10
-    vmov            d18, d22
-    vmov            d19, d24
-    vmov            q10, q13
-    vmov            d22, d25
-
-    subs            r12, r12, #4            ; h -= 4
-    bgt             vpx_convolve8_avg_loop_vert
-
-    ; outer loop
-    add             r0, r0, #4
-    add             r2, r2, #4
-    subs            r6, r6, #4              ; w -= 4
-    bgt             vpx_convolve8_avg_loop_vert_h
-
-    pop             {r4-r8, pc}
-
-    ENDP
-    END
diff --git a/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm b/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm
new file mode 100644
index 0000000..d310a83
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm
@@ -0,0 +1,486 @@
+;
+;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+;    r0 => src
+;    r1 => dst
+;    r2 =>  src_stride
+;    r6 =>  dst_stride
+;    r12 => filter_y0
+;    r5 =>  ht
+;    r3 =>  wd
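+;
+;    The source pointer is first moved up by 3 rows (r12 = -3 * src_strd)
+;    so the 8-tap vertical window is centered on the output row, matching
+;    the "adjust for taps" step of the file this replaces.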
+
+    EXPORT          |vpx_convolve8_avg_vert_filter_type1_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA  ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_vert_filter_type1_neon| PROC
+
+    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
+                                                 ; the arguments
+    vpush           {d8  -  d15}                 ; stack offset by 64
+    mov             r4,     r1
+    mov             r1,     r2
+    mov             r2,     r4
+    vmov.i16        q15,    #0x4000
+    mov             r11,    #0xc000
+    ldr             r12,    [sp,    #104]   ;load filter
+    ldr             r6,     [sp,    #116]   ;load y0_q4
+    add             r12,    r12,    r6,     lsl #4 ;r12 = filter[y0_q4]
+    mov             r6,     r3
+    ldr             r5,     [sp,    #124]   ;load wd
+    vld2.8          {d0,    d1},    [r12]   ;load and deinterleave filter taps
+    sub             r12,    r2,     r2,     lsl #2 ;r12 = -3 * src_strd
+    vabs.s8         d0,     d0              ;vabs_s8(coeff)
+    add             r0,     r0,     r12     ;pu1_src -= 3 * src_strd (taps)
+    ldr             r3,     [sp,    #128]   ;load ht
+    subs            r7,     r3,     #0      ;r7 = ht, checks for ht == 0
+    vdup.u8         d22,    d0[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0);
+    cmp             r5,     #8
+    vdup.u8         d23,    d0[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1);
+    vdup.u8         d24,    d0[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2);
+    vdup.u8         d25,    d0[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3);
+    vdup.u8         d26,    d0[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4);
+    vdup.u8         d27,    d0[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5);
+    vdup.u8         d28,    d0[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6);
+    vdup.u8         d29,    d0[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7);
+    blt             core_loop_wd_4          ;jump to the wd == 4 core loop
+    str             r0,     [sp,  #-4]!
+    str             r1,     [sp,  #-4]!
+    bic             r4,     r5,     #7      ;r4 = wd & ~7
+    rsb             r9,     r4,     r6,     lsl #2 ;4 * dst_strd - (wd & ~7)
+    rsb             r8,     r4,     r2,     lsl #2 ;4 * src_strd - (wd & ~7)
+    mov             r3,     r5,     lsr #3  ;divide wd by 8
+    mul             r7,     r3              ;r7 = ht * (wd / 8)
+    sub             r7,     #4              ;peel one 4-row block for the epilog
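+    ; the filter loop is software-pipelined: "prolog" issues the first loads
+    ; and multiplies, "main_loop_8" overlaps stores of one 4-row block with
+    ; loads for the next, and "epilog"/"epilog_end" drain the pipeline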
+
+prolog
+    and             r10,    r0,     #31
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vdup.16         q4,     r11
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    subs            r4,     r4,     #8
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d1,     d23     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_1);
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d2,     d24     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d5,     d27     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d6,     d28     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vdup.16         q5,     r11
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5,     d2,     d23     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_1);
+    addle           r0,     r0,     r8
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    bicle           r4,     r5,     #7      ;r5 ->wd
+    vmlal.u8        q5,     d3,     d24     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    pld             [r3]
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    pld             [r3,    r2]
+    pld             [r3,    r2,     lsl #1]
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    add             r3,     r3,     r2
+    vmlal.u8        q5,     d6,     d27     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    pld             [r3,    r2,     lsl #1]
+    vmlsl.u8        q5,     d7,     d28     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20},  [r1]
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d3,     d23
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d2,     d22
+    vrhadd.u8       d8,     d8,     d20
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d4,     d24
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d5,     d25
+    vmlal.u8        q6,     d6,     d26
+    add             r14,    r1,     r6
+    vmlal.u8        q6,     d7,     d27
+    vmlsl.u8        q6,     d16,    d28
+    vst1.8          {d8},   [r1]!           ;vst1_u8(pu1_dst,sto_res);
+    vmlsl.u8        q6,     d17,    d29
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    addle           r1,     r1,     r9
+    vmlsl.u8        q7,     d4,     d23
+    subs            r7,     r7,     #4
+    vmlsl.u8        q7,     d3,     d22
+    vmlal.u8        q7,     d5,     d24
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d6,     d25
+    vrhadd.u8       d10,    d10,    d20
+    vhadd.s16       q6,     q6,     q15
+    vdup.16         q4,     r11
+    vmlal.u8        q7,     d7,     d26
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d16,    d27
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d17,    d28
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d18,    d29
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12,    q6,     #6
+    blt             epilog_end              ;jumps to epilog_end
+
+    beq             epilog                  ;jumps to epilog
+
+main_loop_8
+    subs            r4,     r4,     #8
+    vmlsl.u8        q4,     d1,     d23     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_1);
+    vld1.u8         {d20},  [r14]
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    addle           r0,     r0,     r8
+    bicle           r4,     r5,     #7      ;r5 ->wd
+    vmlal.u8        q4,     d2,     d24     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vrhadd.u8       d12,    d12,    d20
+    vhadd.s16       q7,     q7,     q15
+    vdup.16         q5,     r11
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d5,     d27     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vmlsl.u8        q4,     d6,     d28     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vst1.8          {d12},  [r14],  r6
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d14,    q7,     #6
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vmlsl.u8        q5,     d2,     d23     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_1);
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    vrhadd.u8       d14,    d14,    d20
+    vmlal.u8        q5,     d3,     d24     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    vst1.8          {d14},  [r14],  r6
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    add             r14,    r1,     #0
+    vmlal.u8        q5,     d6,     d27     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    add             r1,     r1,     #8
+    vmlsl.u8        q5,     d7,     d28     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    addle           r1,     r1,     r9
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vmlsl.u8        q6,     d3,     d23
+    add             r10,    r3,     r2,     lsl #3 ; 10*strd - 8+2
+    vmlsl.u8        q6,     d2,     d22
+    vrhadd.u8       d8,     d8,     d20
+    add             r10,    r10,    r2      ; 11*strd
+    vmlal.u8        q6,     d4,     d24
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d5,     d25
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d6,     d26
+    vst1.8          {d8},   [r14],  r6      ;vst1_u8(pu1_dst,sto_res);
+    pld             [r10]                   ;11+ 0
+    vmlal.u8        q6,     d7,     d27
+    pld             [r10,   r2]             ;11+ 1*strd
+    pld             [r10,   r2,     lsl #1] ;11+ 2*strd
+    vmlsl.u8        q6,     d16,    d28
+    add             r10,    r10,    r2      ;12*strd
+    vmlsl.u8        q6,     d17,    d29
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+
+    pld             [r10,   r2,     lsl #1] ;11+ 3*strd
+    vmlsl.u8        q7,     d4,     d23
+    vmlsl.u8        q7,     d3,     d22
+    vrhadd.u8       d10,    d10,    d20
+    subs            r7,     r7,     #4
+    vmlal.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vhadd.s16       q6,     q6,     q15
+    vdup.16         q4,     r11
+    vmlal.u8        q7,     d7,     d26
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d16,    d27
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d17,    d28
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d18,    d29
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vqrshrun.s16    d12,    q6,     #6
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    bgt             main_loop_8             ;jumps to main_loop_8
+
+epilog
+    vld1.u8         {d20},  [r14]
+    vmlsl.u8        q4,     d1,     d23     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_1);
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    vmlal.u8        q4,     d2,     d24     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vrhadd.u8       d12,    d12,    d20
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vhadd.s16       q7,     q7,     q15
+    vdup.16         q5,     r11
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vmlal.u8        q4,     d5,     d27     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vmlsl.u8        q4,     d6,     d28     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vst1.8          {d12},  [r14],  r6
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d14,    q7,     #6
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5,     d2,     d23     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_1);
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    vrhadd.u8       d14,    d14,    d20
+    vmlal.u8        q5,     d3,     d24     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    vmlal.u8        q5,     d6,     d27     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    vmlsl.u8        q5,     d7,     d28     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    vst1.8          {d14},  [r14],  r6
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20},  [r1]
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d3,     d23
+    vmlsl.u8        q6,     d2,     d22
+    vrhadd.u8       d8,     d8,     d20
+    vmlal.u8        q6,     d4,     d24
+    vmlal.u8        q6,     d5,     d25
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d6,     d26
+    vmlal.u8        q6,     d7,     d27
+    add             r14,    r1,     r6
+    vmlsl.u8        q6,     d16,    d28
+    vst1.8          {d8},   [r1]!           ;vst1_u8(pu1_dst,sto_res);
+    vmlsl.u8        q6,     d17,    d29
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d4,     d23
+    vmlsl.u8        q7,     d3,     d22
+    vrhadd.u8       d10,    d10,    d20
+    vmlal.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vhadd.s16       q6,     q6,     q15
+    vmlal.u8        q7,     d7,     d26
+    vmlal.u8        q7,     d16,    d27
+    vmlsl.u8        q7,     d17,    d28
+    vmlsl.u8        q7,     d18,    d29
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12,    q6,     #6
+
+epilog_end
+    vld1.u8         {d20},  [r14]
+    vrhadd.u8       d12,    d12,    d20
+    vst1.8          {d12},  [r14],  r6
+    vhadd.s16       q7,     q7,     q15
+    vqrshrun.s16    d14,    q7,     #6
+    vld1.u8         {d20},  [r14]
+    vrhadd.u8       d14,    d14,    d20
+    vst1.8          {d14},  [r14],  r6
+
+end_loops
+    tst             r5,     #7              ;wd % 8 == 0?
+    ldr             r1,     [sp],   #4
+    ldr             r0,     [sp],   #4
+    vpopeq          {d8  -  d15}
+    ldmfdeq         sp!,    {r4  -  r12,    r15} ;restore registers and return
+                                                 ; if wd is a multiple of 8
+    mov             r5,     #4
+    add             r0,     r0,     #8
+    add             r1,     r1,     #8
+    mov             r7,     #16
+
+core_loop_wd_4
+    rsb             r9,     r5,     r6,     lsl #2 ;r6->dst_strd    r5    ->wd
+    rsb             r8,     r5,     r2,     lsl #2 ;r2->src_strd
+    vmov.i8         d4,     #0
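+    ; wd == 4 path: two adjacent rows are packed into the two 32-bit lanes
+    ; of each d register, so every pass of the inner loop filters and
+    ; averages a 4x4 block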
+
+outer_loop_wd_4
+    subs            r12,    r5,     #0
+    ble             end_inner_loop_wd_4     ;outer loop jump
+
+inner_loop_wd_4
+    add             r3,     r0,     r2
+    vld1.u32        {d4[1]},[r3],   r2      ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 1);
+    subs            r12,    r12,    #4
+    vdup.u32        d5,     d4[1]           ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                            ; 1);
+    vld1.u32        {d5[1]},[r3],   r2      ;src_tmp2 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp2, 1);
+    vld1.u32        {d4[0]},[r0]            ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 0);
+    vdup.16         q0,     r11
+    vmlsl.u8        q0,     d5,     d23     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+    vdup.u32        d6,     d5[1]           ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                            ; 1);
+    add             r0,     r0,     #4
+    vld1.u32        {d6[1]},[r3],   r2      ;src_tmp3 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8        q0,     d4,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+    vdup.u32        d7,     d6[1]           ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                            ; 1);
+    vld1.u32        {d7[1]},[r3],   r2      ;src_tmp4 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlal.u8        q0,     d6,     d24     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+    vdup.16         q4,     r11
+    vmlsl.u8        q4,     d7,     d23
+    vdup.u32        d4,     d7[1]           ;src_tmp1 = vdup_lane_u32(src_tmp4,
+                                            ; 1);
+    vmull.u8        q1,     d7,     d25     ;mul_res2 =
+                                            ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+    vld1.u32        {d4[1]},[r3],   r2      ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 1);
+    vmlsl.u8        q4,     d6,     d22
+    vmlal.u8        q0,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+    vdup.u32        d5,     d4[1]           ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                            ; 1);
+    vmlal.u8        q4,     d4,     d24
+    vld1.u32        {d5[1]},[r3],   r2      ;src_tmp2 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp2, 1);
+    vmlal.u8        q1,     d5,     d27     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+    vdup.u32        d6,     d5[1]           ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                            ; 1);
+    vmlal.u8        q4,     d5,     d25
+    vld1.u32        {d6[1]},[r3],   r2      ;src_tmp3 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8        q0,     d6,     d28     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+    vdup.u32        d7,     d6[1]           ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                            ; 1);
+    vmlal.u8        q4,     d6,     d26
+    vld1.u32        {d7[1]},[r3],   r2      ;src_tmp4 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q1,     d7,     d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+    vdup.u32        d4,     d7[1]
+    vadd.i16        q0,     q0,     q1      ;mul_res1 = vaddq_u16(mul_res1,
+                                            ; mul_res2);
+    vmlal.u8        q4,     d7,     d27
+    vld1.u32        {d4[1]},[r3],   r2
+    vmlsl.u8        q4,     d4,     d28
+    vdup.u32        d5,     d4[1]
+    vhadd.s16       q0,     q0,     q15
+    vqrshrun.s16    d0,     q0,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u32        {d5[1]},[r3]
+    add             r3,     r1,     r6
+    vld1.u32        {d20[0]},       [r1]
+    vld1.u32        {d20[1]},       [r3]
+    vrhadd.u8       d0,     d0,     d20
+    vst1.32         {d0[0]},[r1]            ;vst1_lane_u32((uint32_t *)pu1_dst,
+                                            ; vreinterpret_u32_u8(sto_res), 0);
+    vmlsl.u8        q4,     d5,     d29
+    vst1.32         {d0[1]},[r3],   r6      ;vst1_lane_u32((uint32_t
+                                            ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+    vhadd.s16       q4,     q4,     q15
+    vqrshrun.s16    d8,     q4,     #6
+    mov             r4,     r3
+    vld1.u32        {d20[0]},       [r4],   r6
+    vld1.u32        {d20[1]},       [r4]
+    vrhadd.u8       d8,     d8,     d20
+    vst1.32         {d8[0]},[r3],   r6
+    add             r1,     r1,     #4
+    vst1.32         {d8[1]},[r3]
+    bgt             inner_loop_wd_4
+
+end_inner_loop_wd_4
+    subs            r7,     r7,     #4
+    add             r1,     r1,     r9
+    add             r0,     r0,     r8
+    bgt             outer_loop_wd_4
+
+    vpop            {d8  -  d15}
+    ldmfd           sp!,    {r4  -  r12,    r15} ;restore registers and return
+
+    ENDP
+
+    END
diff --git a/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm b/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm
new file mode 100644
index 0000000..c5695fb
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm
@@ -0,0 +1,487 @@
+;
+;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+;    r0 => src
+;    r1 => dst
+;    r2 =>  src_stride
+;    r6 =>  dst_stride
+;    r12 => filter_y0
+;    r5 =>  ht
+;    r3 =>  wd
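+;
+;    This file mirrors vpx_convolve8_avg_vert_filter_type1_neon.asm; the
+;    difference is the vmlal/vmlsl sign pattern of the multiply-accumulates,
+;    matching the other sign layout of the 8-tap filter coefficients.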
+
+    EXPORT          |vpx_convolve8_avg_vert_filter_type2_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA  ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_vert_filter_type2_neon| PROC
+
+    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
+                                                 ; the arguments
+    vpush           {d8  -  d15}                 ; stack offset by 64
+    mov             r4,     r1
+    mov             r1,     r2
+    mov             r2,     r4
+    vmov.i16        q15,    #0x4000
+    mov             r11,    #0xc000
+    ldr             r12,    [sp,    #104]   ;load filter
+    ldr             r6,     [sp,    #116]   ;load y0_q4
+    add             r12,    r12,    r6,     lsl #4 ;r12 = filter[y0_q4]
+    mov             r6,     r3
+    ldr             r5,     [sp,    #124]   ;load wd
+    vld2.8          {d0,    d1},    [r12]   ;load and deinterleave filter taps
+    sub             r12,    r2,     r2,     lsl #2 ;r12 = -3 * src_strd
+    vabs.s8         d0,     d0              ;vabs_s8(coeff)
+    add             r0,     r0,     r12     ;pu1_src -= 3 * src_strd (taps)
+    ldr             r3,     [sp,    #128]   ;load ht
+    subs            r7,     r3,     #0      ;r7 = ht, checks for ht == 0
+    vdup.u8         d22,    d0[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0);
+    cmp             r5,     #8
+    vdup.u8         d23,    d0[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1);
+    vdup.u8         d24,    d0[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2);
+    vdup.u8         d25,    d0[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3);
+    vdup.u8         d26,    d0[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4);
+    vdup.u8         d27,    d0[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5);
+    vdup.u8         d28,    d0[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6);
+    vdup.u8         d29,    d0[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7);
+    blt             core_loop_wd_4          ;jump to the wd == 4 core loop
+
+    str             r0,     [sp,  #-4]!
+    str             r1,     [sp,  #-4]!
+    bic             r4,     r5,     #7      ;r4 = wd & ~7
+    rsb             r9,     r4,     r6,     lsl #2 ;4 * dst_strd - (wd & ~7)
+    rsb             r8,     r4,     r2,     lsl #2 ;4 * src_strd - (wd & ~7)
+    mov             r3,     r5,     lsr #3  ;divide wd by 8
+    mul             r7,     r3              ;r7 = ht * (wd / 8)
+    sub             r7,     #4              ;peel one 4-row block for the epilog
+
+prolog
+    and             r10,    r0,     #31
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vdup.16         q4,     r11
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    subs            r4,     r4,     #8
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d1,     d23     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_1);
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vdup.16         q5,     r11
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5,     d2,     d23     ;mul_res2 = vmull_u8(src_tmp3,
+                                            ; coeffabs_1);
+    addle           r0,     r0,     r8
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    bicle           r4,     r5,     #7      ;r5 ->wd
+    vmlsl.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    pld             [r3]
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
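+    ;r11 (the accumulator seed) and q15 (the vhadd operand) are set up at
+    ; function entry, presumably as the same 0xc000 bias / 0x4000 un-bias
+    ; pair the horizontal filters use: vhadd.s16 halves the sum while
+    ; cancelling the bias, and the following vqrshrun.s16 #6 finishes the
+    ; rounded FILTER_BITS (7) shift and narrows to u8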
+    vdup.16         q6,     r11
+    pld             [r3,    r2]
+    pld             [r3,    r2,     lsl #1]
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    add             r3,     r3,     r2
+    vmlsl.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    pld             [r3,    r2,     lsl #1]
+    vmlal.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20},  [r1]
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d3,     d23
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d2,     d22
+    vrhadd.u8       d8,     d8,     d20
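+    ;_avg variant: d20 was loaded from the destination and vrhadd.u8
+    ; rounds to nearest while averaging it with the filtered result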
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d4,     d24
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d5,     d25
+    vmlal.u8        q6,     d6,     d26
+    add             r14,    r1,     r6
+    vmlsl.u8        q6,     d7,     d27
+    vmlal.u8        q6,     d16,    d28
+    vst1.8          {d8},   [r1]!           ;vst1_u8(pu1_dst,sto_res);
+    vmlsl.u8        q6,     d17,    d29
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    addle           r1,     r1,     r9
+    vmlal.u8        q7,     d4,     d23
+    subs            r7,     r7,     #4
+    vmlsl.u8        q7,     d3,     d22
+    vmlsl.u8        q7,     d5,     d24
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d6,     d25
+    vrhadd.u8       d10,    d10,    d20
+    vhadd.s16       q6,     q6,     q15
+    vdup.16         q4,     r11
+    vmlal.u8        q7,     d7,     d26
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d16,    d27
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d17,    d28
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d18,    d29
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12,    q6,     #6
+    blt             epilog_end              ;jumps to epilog_end
+
+    beq             epilog                  ;jumps to epilog
+
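+    ;four q accumulators (q4-q7) are rotated round-robin, each re-seeded
+    ; from r11 (vdup.16) as it is retired, over a sliding window of source
+    ; rows held in d0-d7/d16-d18, so loads, MACs and stores overlap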
+main_loop_8
+    subs            r4,     r4,     #8
+    vmlal.u8        q4,     d1,     d23     ;mul_res1 = vmull_u8(src_tmp2,
+                                            ; coeffabs_1);
+    vld1.u8         {d20},  [r14]
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    addle           r0,     r0,     r8
+    bicle           r4,     r5,     #7      ;r5 ->wd
+    vmlsl.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vrhadd.u8       d12,    d12,    d20
+    vhadd.s16       q7,     q7,     q15
+    vdup.16         q5,     r11
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vmlal.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vst1.8          {d12},  [r14],  r6
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d14,    q7,     #6
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vmlal.u8        q5,     d2,     d23     ;mul_res2 = vmull_u8(src_tmp3,
+                                            ; coeffabs_1);
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    vrhadd.u8       d14,    d14,    d20
+    vmlsl.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    vst1.8          {d14},  [r14],  r6
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    add             r14,    r1,     #0
+    vmlsl.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    add             r1,     r1,     #8
+    vmlal.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    addle           r1,     r1,     r9
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vmlal.u8        q6,     d3,     d23
+    add             r10,    r3,     r2,     lsl #3 ; 10*strd - 8+2
+    vmlsl.u8        q6,     d2,     d22
+    vrhadd.u8       d8,     d8,     d20
+    add             r10,    r10,    r2      ; 11*strd
+    vmlsl.u8        q6,     d4,     d24
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d5,     d25
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d6,     d26
+    vst1.8          {d8},   [r14],  r6      ;vst1_u8(pu1_dst,sto_res);
+    pld             [r10]                   ;11+ 0
+    vmlsl.u8        q6,     d7,     d27
+    pld             [r10,   r2]             ;11+ 1*strd
+    pld             [r10,   r2,     lsl #1] ;11+ 2*strd
+    vmlal.u8        q6,     d16,    d28
+    add             r10,    r10,    r2      ;12*strd
+    vmlsl.u8        q6,     d17,    d29
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    pld             [r10,   r2,     lsl #1] ;11+ 3*strd
+    vmlal.u8        q7,     d4,     d23
+    vmlsl.u8        q7,     d3,     d22
+    vrhadd.u8       d10,    d10,    d20
+    subs            r7,     r7,     #4
+    vmlsl.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vhadd.s16       q6,     q6,     q15
+    vdup.16         q4,     r11
+    vmlal.u8        q7,     d7,     d26
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d16,    d27
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d17,    d28
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d18,    d29
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vqrshrun.s16    d12,    q6,     #6
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    bgt             main_loop_8             ;jumps to main_loop_8
+
+epilog
+    vld1.u8         {d20},  [r14]
+    vmlal.u8        q4,     d1,     d23     ;mul_res1 = vmull_u8(src_tmp2,
+                                            ; coeffabs_1);
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    vmlsl.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vrhadd.u8       d12,    d12,    d20
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vhadd.s16       q7,     q7,     q15
+    vdup.16         q5,     r11
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vmlsl.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vmlal.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vst1.8          {d12},  [r14],  r6
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d14,    q7,     #6
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5,     d2,     d23     ;mul_res2 = vmull_u8(src_tmp3,
+                                            ; coeffabs_1);
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    vrhadd.u8       d14,    d14,    d20
+    vmlsl.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    vmlsl.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    vmlal.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    vst1.8          {d14},  [r14],  r6
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20},  [r1]
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d3,     d23
+    vmlsl.u8        q6,     d2,     d22
+    vrhadd.u8       d8,     d8,     d20
+    vmlsl.u8        q6,     d4,     d24
+    vmlal.u8        q6,     d5,     d25
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d6,     d26
+    vmlsl.u8        q6,     d7,     d27
+    add             r14,    r1,     r6
+    vmlal.u8        q6,     d16,    d28
+    vst1.8          {d8},   [r1]!           ;vst1_u8(pu1_dst,sto_res);
+    vmlsl.u8        q6,     d17,    d29
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d4,     d23
+    vmlsl.u8        q7,     d3,     d22
+    vrhadd.u8       d10,    d10,    d20
+    vmlsl.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vhadd.s16       q6,     q6,     q15
+    vmlal.u8        q7,     d7,     d26
+    vmlsl.u8        q7,     d16,    d27
+    vmlal.u8        q7,     d17,    d28
+    vmlsl.u8        q7,     d18,    d29
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12,    q6,     #6
+
+epilog_end
+    vld1.u8         {d20},  [r14]
+    vrhadd.u8       d12,    d12,    d20
+    vst1.8          {d12},  [r14],  r6
+    vhadd.s16       q7,     q7,     q15
+    vqrshrun.s16    d14,    q7,     #6
+    vld1.u8         {d20},  [r14]
+    vrhadd.u8       d14,    d14,    d20
+    vst1.8          {d14},  [r14],  r6
+
+end_loops
+    tst             r5,     #7
+    ldr             r1,     [sp],   #4
+    ldr             r0,     [sp],   #4
+    vpopeq          {d8  -  d15}
+    ldmfdeq         sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+
+    mov             r5,     #4
+    add             r0,     r0,     #8
+    add             r1,     r1,     #8
+    mov             r7,     #16
+
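+    ;reached when wd < 8 or wd has a 4-wide remainder (end_loops returns
+    ; otherwise): the remaining 4-pixel columns are processed two rows at
+    ; a time, packed into the u32 lanes of one d register via vld1.u32
+    ; lane loads and vdup.u32 lane copies, so a single MAC chain covers
+    ; both rows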
+core_loop_wd_4
+    rsb             r9,     r5,     r6,     lsl #2 ;r6->dst_strd    r5    ->wd
+    rsb             r8,     r5,     r2,     lsl #2 ;r2->src_strd
+    vmov.i8         d4,     #0
+
+outer_loop_wd_4
+    subs            r12,    r5,     #0
+    ble             end_inner_loop_wd_4     ;outer loop jump
+
+inner_loop_wd_4
+    add             r3,     r0,     r2
+    vld1.u32        {d4[1]},[r3],   r2      ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 1);
+    subs            r12,    r12,    #4
+    vdup.u32        d5,     d4[1]           ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                            ; 1);
+    vld1.u32        {d5[1]},[r3],   r2      ;src_tmp2 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp2, 1);
+    vld1.u32        {d4[0]},[r0]            ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 0);
+    vdup.16         q0,     r11
+    vmlal.u8        q0,     d5,     d23     ;mul_res1 =
+                                            ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+    vdup.u32        d6,     d5[1]           ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                            ; 1);
+    add             r0,     r0,     #4
+    vld1.u32        {d6[1]},[r3],   r2      ;src_tmp3 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8        q0,     d4,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+    vdup.u32        d7,     d6[1]           ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                            ; 1);
+    vld1.u32        {d7[1]},[r3],   r2      ;src_tmp4 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q0,     d6,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+    vdup.16         q4,     r11
+    vmlal.u8        q4,     d7,     d23
+    vdup.u32        d4,     d7[1]           ;src_tmp1 = vdup_lane_u32(src_tmp4,
+                                            ; 1);
+    vmull.u8        q1,     d7,     d25     ;mul_res2 =
+                                            ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+    vld1.u32        {d4[1]},[r3],   r2      ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 1);
+    vmlsl.u8        q4,     d6,     d22
+    vmlal.u8        q0,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+    vdup.u32        d5,     d4[1]           ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                            ; 1);
+    vmlsl.u8        q4,     d4,     d24
+    vld1.u32        {d5[1]},[r3],   r2      ;src_tmp2 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp2, 1);
+    vmlsl.u8        q1,     d5,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+    vdup.u32        d6,     d5[1]           ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                            ; 1);
+    vmlal.u8        q4,     d5,     d25
+    vld1.u32        {d6[1]},[r3],   r2      ;src_tmp3 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlal.u8        q0,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+    vdup.u32        d7,     d6[1]           ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                            ; 1);
+    vmlal.u8        q4,     d6,     d26
+    vld1.u32        {d7[1]},[r3],   r2      ;src_tmp4 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q1,     d7,     d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+    vdup.u32        d4,     d7[1]
+    vadd.i16        q0,     q0,     q1      ;mul_res1 = vaddq_u16(mul_res1,
+                                            ; mul_res2);
+    vmlsl.u8        q4,     d7,     d27
+    vld1.u32        {d4[1]},[r3],   r2
+    vmlal.u8        q4,     d4,     d28
+    vdup.u32        d5,     d4[1]
+    vhadd.s16       q0,     q0,     q15
+    vqrshrun.s16    d0,     q0,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u32        {d5[1]},[r3]
+    add             r3,     r1,     r6
+    vld1.u32        {d20[0]},       [r1]
+    vld1.u32        {d20[1]},       [r3]
+    vrhadd.u8       d0,     d0,     d20
+    vst1.32         {d0[0]},[r1]            ;vst1_lane_u32((uint32_t *)pu1_dst,
+                                            ; vreinterpret_u32_u8(sto_res), 0);
+    vmlsl.u8        q4,     d5,     d29
+    vst1.32         {d0[1]},[r3],   r6      ;vst1_lane_u32((uint32_t
+                                            ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+    vhadd.s16       q4,     q4,     q15
+    vqrshrun.s16    d8,     q4,     #6
+    mov             r4,     r3
+    vld1.u32        {d20[0]},       [r4],   r6
+    vld1.u32        {d20[1]},       [r4]
+    vrhadd.u8       d8,     d8,     d20
+    vst1.32         {d8[0]},[r3],   r6
+    add             r1,     r1,     #4
+    vst1.32         {d8[1]},[r3]
+    bgt             inner_loop_wd_4
+
+end_inner_loop_wd_4
+    subs            r7,     r7,     #4
+    add             r1,     r1,     r9
+    add             r0,     r0,     r8
+    bgt             outer_loop_wd_4
+
+    vpop            {d8  -  d15}
+    ldmfd           sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+
+    ENDP
+
+    END
diff --git a/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm b/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm
new file mode 100644
index 0000000..fa1b732
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm
@@ -0,0 +1,415 @@
+;
+;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+;    r0 => src
+;    r1 => dst
+;    r2 =>  src_stride
+;    r3 =>  dst_stride
+;    r4 => filter_x0
+;    r8 =>  ht
+;    r10 =>  wd
+
+    EXPORT          |vpx_convolve8_horiz_filter_type1_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA  ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_horiz_filter_type1_neon| PROC
+
+    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
+                                                 ; the arguments
+    vpush           {d8  -  d15}                 ; stack offset by 64
+    mov             r4,     r1
+    mov             r1,     r2
+    mov             r2,     r4
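+    ;the convolve prototype passes (src, src_stride, dst, dst_stride, ...),
+    ; so the three moves above swap r1/r2 into the layout listed in the
+    ; header comment: r1 -> dst, r2 -> src_stride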
+start_loop_count
+    ldr             r4,     [sp,    #104]   ;loads pi1_coeff
+    ldr             r8,     [sp,    #108]   ;loads x0_q4
+    add             r4,     r4,     r8,     lsl #4 ;r4 = filter[x0_q4]
+    ldr             r8,     [sp,    #128]   ;loads ht
+    ldr             r10,    [sp,    #124]   ;loads wd
+    vld2.8          {d0,    d1},    [r4]    ;coeff = vld1_s8(pi1_coeff)
+    mov             r11,    #1
+    subs            r14,    r8,     #0      ;checks for ht == 0
+    vabs.s8         d2,     d0              ;vabs_s8(coeff)
+    vdup.8          d24,    d2[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0)
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    vdup.8          d25,    d2[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1)
+    add             r4,     r12,    r2      ;pu1_src_tmp2_8 = pu1_src + src_strd
+    vdup.8          d26,    d2[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2)
+    rsb             r9,     r10,    r2,     lsl #1 ;2*src_strd - wd
+    vdup.8          d27,    d2[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3)
+    rsb             r8,     r10,    r3,     lsl #1 ;2*dst_strd - wd
+    vdup.8          d28,    d2[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4)
+    vdup.8          d29,    d2[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5)
+    vdup.8          d30,    d2[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6)
+    vdup.8          d31,    d2[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7)
+    mov             r7,     r1
+    cmp             r10,    #4
+    ble             outer_loop_4
+
+    cmp             r10,    #24
+    moveq           r10,    #16
+    addeq           r8,     #8
+    addeq           r9,     #8
+    cmp             r10,    #16
+    bge             outer_loop_16
+
+    cmp             r10,    #12
+    addeq           r8,     #4
+    addeq           r9,     #4
+    b               outer_loop_8
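+    ;width dispatch: wd <= 4 takes outer_loop_4; wd == 24 is handled as a
+    ; 16-wide pass plus an 8-wide residual pass (outer_loop8_residual);
+    ; wd == 12 as an 8-wide pass plus a 4-wide residual pass
+    ; (outer_loop4_residual); the r8/r9 stride adjustments are patched
+    ; to match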
+
+outer_loop8_residual
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    mov             r1,     r7
+    mov             r14,    #32
+    add             r1,     #16
+    add             r12,    #16
+    mov             r10,    #8
+    add             r8,     #8
+    add             r9,     #8
+
+outer_loop_8
+
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    subs            r5,     r10,    #0      ;checks wd
+    ble             end_inner_loop_8
+
+inner_loop_8
+    mov             r7,     #0xc000
+    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
+    vdup.16         q4,     r7
+    vld1.u32        {d1},   [r12],  r11
+    vdup.16         q5,     r7
+    vld1.u32        {d2},   [r12],  r11
+    vld1.u32        {d3},   [r12],  r11
+    mov             r7,     #0x4000
+    vld1.u32        {d4},   [r12],  r11
+    vmlsl.u8        q4,     d1,     d25     ;mul_res = vmlsl_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {d5},   [r12],  r11
+    vmlal.u8        q4,     d3,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d6},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {d7},   [r12],  r11
+    vmlal.u8        q4,     d2,     d26     ;mul_res = vmlal_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
+    vmlal.u8        q4,     d4,     d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d13},  [r4],   r11
+    vmlal.u8        q4,     d5,     d29     ;mul_res = vmlal_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u32        {d14},  [r4],   r11
+    vmlsl.u8        q4,     d6,     d30     ;mul_res = vmlsl_u8(src[0_6],
+                                            ; coeffabs_6);
+    vld1.u32        {d15},  [r4],   r11
+    vmlsl.u8        q4,     d7,     d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u32        {d16},  [r4],   r11     ;vector load pu1_src + src_strd
+    vdup.16         q11,    r7
+    vmlal.u8        q5,     d15,    d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d17},  [r4],   r11
+    vmlal.u8        q5,     d14,    d26     ;mul_res = vmlal_u8(src[0_2],
+                                            ; coeffabs_2);
+    vhadd.s16       q4,     q4,     q11
+    vld1.u32        {d18},  [r4],   r11
+    vmlal.u8        q5,     d16,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d19},  [r4],   r11     ;vector load pu1_src + src_strd
+    vmlal.u8        q5,     d17,    d29     ;mul_res = vmlal_u8(src[0_5],
+                                            ; coeffabs_5);
+    vmlsl.u8        q5,     d18,    d30     ;mul_res = vmlsl_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5,     d19,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vqrshrun.s16    d20,    q4,     #6      ;right shift and saturating narrow
+                                            ; result 1
+    vmlsl.u8        q5,     d12,    d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlsl.u8        q5,     d13,    d25     ;mul_res = vmlsl_u8(src[0_1],
+                                            ; coeffabs_1);
+    vst1.8          {d20},  [r1]!           ;store the result pu1_dst
+    vhadd.s16       q5,     q5,     q11
+    subs            r5,     r5,     #8      ;decrement the wd loop
+    vqrshrun.s16    d8,     q5,     #6      ;right shift and saturating narrow
+                                            ; result 2
+    vst1.8          {d8},   [r6]!           ;store the result pu1_dst
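+    ;rounding scheme: q4/q5 start at 0xc000 (-16384 as s16) so the widened
+    ; u8 multiply-accumulate sum stays inside the signed 16-bit range;
+    ; vhadd.s16 with q11 = 0x4000 halves the sum while cancelling that
+    ; bias, and vqrshrun.s16 #6 applies the remaining rounded shift and
+    ; narrows, giving the full FILTER_BITS (7) shift overall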
+    cmp             r5,     #4
+    bgt             inner_loop_8
+
+end_inner_loop_8
+    subs            r14,    r14,    #2      ;decrement the ht loop
+    add             r12,    r12,    r9      ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    add             r1,     r1,     r8      ;increment the dst pointer by
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_8
+
+    ldr             r10,    [sp,    #120]   ;loads wd
+    cmp             r10,    #12
+    beq             outer_loop4_residual
+
+end_loops
+    b               end_func
+
+outer_loop_16
+    str             r0,     [sp,  #-4]!
+    str             r7,     [sp,  #-4]!
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    and             r0,     r12,    #31
+    mov             r7,     #0xc000
+    sub             r5,     r10,    #0      ;checks wd
+    pld             [r4,    r2,     lsl #1]
+    pld             [r12,   r2,     lsl #1]
+    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
+    vdup.16         q4,     r7
+    vld1.u32        {q1},   [r12],  r11
+    vld1.u32        {q2},   [r12],  r11
+    vld1.u32        {q3},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6},   [r12],  r11
+    vmlsl.u8        q4,     d2,     d25     ;mul_res = vmlsl_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q7},   [r12],  r11
+    vmlal.u8        q4,     d4,     d26     ;mul_res = vmlal_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q8},   [r12],  r11
+    vmlal.u8        q4,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9},   [r12],  r11
+    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlal.u8        q4,     d14,    d29     ;mul_res = vmlal_u8(src[0_5],
+                                            ; coeffabs_5);
+    vdup.16         q10,    r7
+    vmlsl.u8        q4,     d16,    d30     ;mul_res = vmlsl_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+
+inner_loop_16
+    vmlsl.u8        q10,    d1,     d24
+    vdup.16         q5,     r7
+    vmlsl.u8        q10,    d3,     d25
+    mov             r7,     #0x4000
+    vdup.16         q11,    r7
+    vmlal.u8        q10,    d5,     d26
+    vld1.u32        {q0},   [r4],   r11     ;vector load pu1_src
+    vhadd.s16       q4,     q4,     q11
+    vld1.u32        {q1},   [r4],   r11
+    vmlal.u8        q10,    d7,     d27
+    add             r12,    #8
+    subs            r5,     r5,     #16
+    vmlal.u8        q10,    d13,    d28
+    vld1.u32        {q2},   [r4],   r11
+    vmlal.u8        q10,    d15,    d29
+    vld1.u32        {q3},   [r4],   r11
+    vqrshrun.s16    d8,     q4,     #6      ;right shift and saturating narrow
+                                            ; result 1
+    vmlsl.u8        q10,    d17,    d30
+    vld1.u32        {q6},   [r4],   r11
+    vmlsl.u8        q10,    d19,    d31
+    vld1.u32        {q7},   [r4],   r11
+    vmlsl.u8        q5,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlsl.u8        q5,     d2,     d25     ;mul_res = vmlsl_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8},   [r4],   r11
+    vhadd.s16       q10,    q10,    q11
+    vld1.u32        {q9},   [r4],   r11
+    vmlal.u8        q5,     d4,     d26     ;mul_res = vmlal_u8(src[0_2],
+                                            ; coeffabs_2);
+    vmlal.u8        q5,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    add             r4,     #8
+    mov             r7,     #0xc000
+    vmlal.u8        q5,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlal.u8        q5,     d14,    d29     ;mul_res = vmlal_u8(src[0_5],
+                                            ; coeffabs_5);
+    vqrshrun.s16    d9,     q10,    #6
+    vdup.16         q11,    r7
+    vmlsl.u8        q5,     d16,    d30     ;mul_res = vmlsl_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    mov             r7,     #0x4000
+    vmlsl.u8        q11,    d1,     d24
+    vst1.8          {q4},   [r1]!           ;store the result pu1_dst
+    vmlsl.u8        q11,    d3,     d25
+    vdup.16         q10,    r7
+    vmlal.u8        q11,    d5,     d26
+    pld             [r12,   r2,     lsl #2]
+    pld             [r4,    r2,     lsl #2]
+    addeq           r12,    r12,    r9      ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    addeq           r4,     r12,    r2      ;pu1_src + src_strd
+    vmlal.u8        q11,    d7,     d27
+    addeq           r1,     r1,     r8
+    subeq           r14,    r14,    #2
+    vmlal.u8        q11,    d13,    d28
+    vhadd.s16       q5,     q5,     q10
+    vmlal.u8        q11,    d15,    d29
+    vmlsl.u8        q11,    d17,    d30
+    cmp             r14,    #0
+    vmlsl.u8        q11,    d19,    d31
+    vqrshrun.s16    d10,    q5,     #6      ;right shift and saturating narrow
+                                            ; result 2
+    beq             epilog_16
+
+    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
+    mov             r7,     #0xc000
+    cmp             r5,     #0
+    vld1.u32        {q1},   [r12],  r11
+    vhadd.s16       q11,    q11,    q10
+    vld1.u32        {q2},   [r12],  r11
+    vdup.16         q4,     r7
+    vld1.u32        {q3},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6},   [r12],  r11
+    vld1.u32        {q7},   [r12],  r11
+    vmlsl.u8        q4,     d2,     d25     ;mul_res = vmlsl_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8},   [r12],  r11
+    vmlal.u8        q4,     d4,     d26     ;mul_res = vmlal_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q9},   [r12],  r11
+    vqrshrun.s16    d11,    q11,    #6
+    vmlal.u8        q4,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    moveq           r5,     r10
+    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vdup.16         q10,    r7
+    vmlal.u8        q4,     d14,    d29     ;mul_res = vmlal_u8(src[0_5],
+                                            ; coeffabs_5);
+    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
+    vmlsl.u8        q4,     d16,    d30     ;mul_res = vmlsl_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    addeq           r6,     r1,     r3      ;pu1_dst + dst_strd
+    b               inner_loop_16
+
+epilog_16
+    mov             r7,     #0x4000
+    ldr             r0,     [sp],   #4
+    ldr             r10,    [sp,    #120]
+    vdup.16         q10,    r7
+    vhadd.s16       q11,    q11,    q10
+    vqrshrun.s16    d11,    q11,    #6
+    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
+    ldr             r7,     [sp],   #4
+    cmp             r10,    #24
+    beq             outer_loop8_residual
+
+end_loops1
+    b               end_func
+
+outer_loop4_residual
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    mov             r1,     r7
+    add             r1,     #8
+    mov             r10,    #4
+    add             r12,    #8
+    mov             r14,    #16
+    add             r8,     #4
+    add             r9,     #4
+
+outer_loop_4
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    subs            r5,     r10,    #0      ;checks wd
+    ble             end_inner_loop_4
+
+inner_loop_4
+    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
+    vld1.u32        {d1},   [r12],  r11
+    vld1.u32        {d2},   [r12],  r11
+    vld1.u32        {d3},   [r12],  r11
+    vld1.u32        {d4},   [r12],  r11
+    vld1.u32        {d5},   [r12],  r11
+    vld1.u32        {d6},   [r12],  r11
+    vld1.u32        {d7},   [r12],  r11
+    sub             r12,    r12,    #4
+    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
+    vld1.u32        {d13},  [r4],   r11
+    vzip.32         d0,     d12             ;zip the i and ii iteration rows
+                                            ; into a single register
+    vld1.u32        {d14},  [r4],   r11
+    vzip.32         d1,     d13
+    vld1.u32        {d15},  [r4],   r11
+    vzip.32         d2,     d14
+    vld1.u32        {d16},  [r4],   r11
+    vzip.32         d3,     d15
+    vld1.u32        {d17},  [r4],   r11
+    vzip.32         d4,     d16
+    vld1.u32        {d18},  [r4],   r11
+    vzip.32         d5,     d17
+    vld1.u32        {d19},  [r4],   r11
+    mov             r7,     #0xc000
+    vdup.16         q4,     r7
+    sub             r4,     r4,     #4
+    vzip.32         d6,     d18
+    vzip.32         d7,     d19
+    vmlsl.u8        q4,     d1,     d25     ;arithmetic for the i and ii
+                                            ; iterations at the same time
+    vmlsl.u8        q4,     d0,     d24
+    vmlal.u8        q4,     d2,     d26
+    vmlal.u8        q4,     d3,     d27
+    vmlal.u8        q4,     d4,     d28
+    vmlal.u8        q4,     d5,     d29
+    vmlsl.u8        q4,     d6,     d30
+    vmlsl.u8        q4,     d7,     d31
+    mov             r7,     #0x4000
+    vdup.16         q10,    r7
+    vhadd.s16       q4,     q4,     q10
+    vqrshrun.s16    d8,     q4,     #6
+    vst1.32         {d8[0]},[r1]!           ;store the i iteration result from
+                                            ; the lower half of the register
+    vst1.32         {d8[1]},[r6]!           ;store the ii iteration result from
+                                            ; the upper half of the register
+    subs            r5,     r5,     #4      ;decrement the wd by 4
+    bgt             inner_loop_4
+
+end_inner_loop_4
+    subs            r14,    r14,    #2      ;decrement the ht by 2
+    add             r12,    r12,    r9      ;increment the input pointer
+                                            ; 2*src_strd-wd
+    add             r1,     r1,     r8      ;increment the output pointer
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_4
+
+end_func
+    vpop            {d8  -  d15}
+    ldmfd           sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+
+    ENDP
+
+    END
diff --git a/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm b/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm
new file mode 100644
index 0000000..90b2c8f
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm
@@ -0,0 +1,415 @@
+;
+;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+;    r0 => src
+;    r1 => dst
+;    r2 =>  src_stride
+;    r3 =>  dst_stride
+;    r4 => filter_x0
+;    r8 =>  ht
+;    r10 =>  wd
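+;
+;    this variant differs from vpx_convolve8_horiz_filter_type1_neon.asm
+;    only in the per-tap vmlal/vmlsl sign pattern applied to the absolute
+;    coefficients, presumably matching the second tap-sign layout that
+;    occurs in the subpel filter sets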
+
+    EXPORT          |vpx_convolve8_horiz_filter_type2_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA  ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_horiz_filter_type2_neon| PROC
+
+    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
+                                                 ; the arguments
+    vpush           {d8  -  d15}                 ; stack offset by 64
+    mov             r4,     r1
+    mov             r1,     r2
+    mov             r2,     r4
+
+start_loop_count
+    ldr             r4,     [sp,    #104]   ;loads pi1_coeff
+    ldr             r8,     [sp,    #108]   ;loads x0_q4
+    add             r4,     r4,     r8,     lsl #4 ;r4 = filter[x0_q4]
+    ldr             r8,     [sp,    #128]   ;loads ht
+    ldr             r10,    [sp,    #124]   ;loads wd
+    vld2.8          {d0,    d1},    [r4]    ;coeff = vld1_s8(pi1_coeff)
+    mov             r11,    #1
+    subs            r14,    r8,     #0      ;checks for ht == 0
+    vabs.s8         d2,     d0              ;vabs_s8(coeff)
+    vdup.8          d24,    d2[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0)
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    vdup.8          d25,    d2[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1)
+    add             r4,     r12,    r2      ;pu1_src_tmp2_8 = pu1_src + src_strd
+    vdup.8          d26,    d2[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2)
+    rsb             r9,     r10,    r2,     lsl #1 ;2*src_strd - wd
+    vdup.8          d27,    d2[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3)
+    rsb             r8,     r10,    r3,     lsl #1 ;2*dst_strd - wd
+    vdup.8          d28,    d2[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4)
+    vdup.8          d29,    d2[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5)
+    vdup.8          d30,    d2[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6)
+    vdup.8          d31,    d2[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7)
+    mov             r7,     r1
+    cmp             r10,    #4
+    ble             outer_loop_4
+
+    cmp             r10,    #24
+    moveq           r10,    #16
+    addeq           r8,     #8
+    addeq           r9,     #8
+    cmp             r10,    #16
+    bge             outer_loop_16
+
+    cmp             r10,    #12
+    addeq           r8,     #4
+    addeq           r9,     #4
+    b               outer_loop_8
+
+outer_loop8_residual
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    mov             r1,     r7
+    mov             r14,    #32
+    add             r1,     #16
+    add             r12,    #16
+    mov             r10,    #8
+    add             r8,     #8
+    add             r9,     #8
+
+outer_loop_8
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    subs            r5,     r10,    #0      ;checks wd
+    ble             end_inner_loop_8
+
+inner_loop_8
+    mov             r7,     #0xc000
+    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
+    vdup.16         q4,     r7
+    vld1.u32        {d1},   [r12],  r11
+    vdup.16         q5,     r7
+    vld1.u32        {d2},   [r12],  r11
+    vld1.u32        {d3},   [r12],  r11
+    mov             r7,     #0x4000
+    vld1.u32        {d4},   [r12],  r11
+    vmlal.u8        q4,     d1,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {d5},   [r12],  r11
+    vmlal.u8        q4,     d3,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d6},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {d7},   [r12],  r11
+    vmlsl.u8        q4,     d2,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
+    vmlal.u8        q4,     d4,     d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d13},  [r4],   r11
+    vmlsl.u8        q4,     d5,     d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u32        {d14},  [r4],   r11
+    vmlal.u8        q4,     d6,     d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vld1.u32        {d15},  [r4],   r11
+    vmlsl.u8        q4,     d7,     d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u32        {d16},  [r4],   r11     ;vector load pu1_src + src_strd
+    vdup.16         q11,    r7
+    vmlal.u8        q5,     d15,    d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d17},  [r4],   r11
+    vmlsl.u8        q5,     d14,    d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vhadd.s16       q4,     q4,     q11
+    vld1.u32        {d18},  [r4],   r11
+    vmlal.u8        q5,     d16,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d19},  [r4],   r11     ;vector load pu1_src + src_strd
+    vmlsl.u8        q5,     d17,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vmlal.u8        q5,     d18,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5,     d19,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vqrshrun.s16    d20,    q4,     #6      ;right shift and saturating narrow
+                                            ; result 1
+    vmlsl.u8        q5,     d12,    d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlal.u8        q5,     d13,    d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vst1.8          {d20},  [r1]!           ;store the result pu1_dst
+    vhadd.s16       q5,     q5,     q11
+    subs            r5,     r5,     #8      ;decrement the wd loop
+    vqrshrun.s16    d8,     q5,     #6      ;right shift and saturating narrow
+                                            ; result 2
+    vst1.8          {d8},   [r6]!           ;store the result pu1_dst
+    cmp             r5,     #4
+    bgt             inner_loop_8
+
+end_inner_loop_8
+    subs            r14,    r14,    #2      ;decrement the ht loop
+    add             r12,    r12,    r9      ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    add             r1,     r1,     r8      ;increment the dst pointer by
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_8
+
+    ldr             r10,    [sp,    #120]   ;loads wd
+    cmp             r10,    #12
+    beq             outer_loop4_residual
+
+end_loops
+    b               end_func
+
+outer_loop_16
+    str             r0,     [sp,  #-4]!
+    str             r7,     [sp,  #-4]!
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    and             r0,     r12,    #31
+    mov             r7,     #0xc000
+    sub             r5,     r10,    #0      ;checks wd
+    pld             [r4,    r2,     lsl #1]
+    pld             [r12,   r2,     lsl #1]
+    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
+    vdup.16         q4,     r7
+    vld1.u32        {q1},   [r12],  r11
+    vld1.u32        {q2},   [r12],  r11
+    vld1.u32        {q3},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6},   [r12],  r11
+    vmlal.u8        q4,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q7},   [r12],  r11
+    vmlsl.u8        q4,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q8},   [r12],  r11
+    vmlal.u8        q4,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9},   [r12],  r11
+    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlsl.u8        q4,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vdup.16         q10,    r7
+    vmlal.u8        q4,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+
+inner_loop_16
+    vmlsl.u8        q10,    d1,     d24
+    vdup.16         q5,     r7
+    vmlal.u8        q10,    d3,     d25
+    mov             r7,     #0x4000
+    vdup.16         q11,    r7
+    vmlsl.u8        q10,    d5,     d26
+    vld1.u32        {q0},   [r4],   r11     ;vector load pu1_src
+    vhadd.s16       q4,     q4,     q11
+    vld1.u32        {q1},   [r4],   r11
+    vmlal.u8        q10,    d7,     d27
+    add             r12,    #8
+    subs            r5,     r5,     #16
+    vmlal.u8        q10,    d13,    d28
+    vld1.u32        {q2},   [r4],   r11
+    vmlsl.u8        q10,    d15,    d29
+    vld1.u32        {q3},   [r4],   r11
+    vqrshrun.s16    d8,     q4,     #6      ;right shift and saturating narrow
+                                            ; result 1
+    vmlal.u8        q10,    d17,    d30
+    vld1.u32        {q6},   [r4],   r11
+    vmlsl.u8        q10,    d19,    d31
+    vld1.u32        {q7},   [r4],   r11
+    vmlsl.u8        q5,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlal.u8        q5,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8},   [r4],   r11
+    vhadd.s16       q10,    q10,    q11
+    vld1.u32        {q9},   [r4],   r11
+    vmlsl.u8        q5,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vmlal.u8        q5,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    add             r4,     #8
+    mov             r7,     #0xc000
+    vmlal.u8        q5,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlsl.u8        q5,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vqrshrun.s16    d9,     q10,    #6
+    vdup.16         q11,    r7
+    vmlal.u8        q5,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    mov             r7,     #0x4000
+    vmlsl.u8        q11,    d1,     d24
+    vst1.8          {q4},   [r1]!           ;store the result pu1_dst
+    vmlal.u8        q11,    d3,     d25
+    vdup.16         q10,    r7
+    vmlsl.u8        q11,    d5,     d26
+    pld             [r12,   r2,     lsl #2]
+    pld             [r4,    r2,     lsl #2]
+    addeq           r12,    r12,    r9      ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    addeq           r4,     r12,    r2      ;pu1_src + src_strd
+    vmlal.u8        q11,    d7,     d27
+    addeq           r1,     r1,     r8
+    subeq           r14,    r14,    #2
+    vmlal.u8        q11,    d13,    d28
+    vhadd.s16       q5,     q5,     q10
+    vmlsl.u8        q11,    d15,    d29
+    vmlal.u8        q11,    d17,    d30
+    cmp             r14,    #0
+    vmlsl.u8        q11,    d19,    d31
+    vqrshrun.s16    d10,    q5,     #6      ;right shift and saturating narrow
+                                            ; result 2
+    beq             epilog_16
+
+    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
+    mov             r7,     #0xc000
+    cmp             r5,     #0
+    vld1.u32        {q1},   [r12],  r11
+    vhadd.s16       q11,    q11,    q10
+    vld1.u32        {q2},   [r12],  r11
+    vdup.16         q4,     r7
+    vld1.u32        {q3},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6},   [r12],  r11
+    vld1.u32        {q7},   [r12],  r11
+    vmlal.u8        q4,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8},   [r12],  r11
+    vmlsl.u8        q4,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q9},   [r12],  r11
+    vqrshrun.s16    d11,    q11,    #6
+    vmlal.u8        q4,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    moveq           r5,     r10
+    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vdup.16         q10,    r7
+    vmlsl.u8        q4,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
+    vmlal.u8        q4,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    addeq           r6,     r1,     r3      ;pu1_dst + dst_strd
+    b               inner_loop_16
+
+epilog_16
+    mov             r7,     #0x4000
+    ldr             r0,     [sp],   #4
+    ldr             r10,    [sp,    #120]
+    vdup.16         q10,    r7
+    vhadd.s16       q11,    q11,    q10
+    vqrshrun.s16    d11,    q11,    #6
+    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
+    ldr             r7,     [sp],   #4
+    cmp             r10,    #24
+    beq             outer_loop8_residual
+
+end_loops1
+    b               end_func
+
+outer_loop4_residual
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    mov             r1,     r7
+    add             r1,     #8
+    mov             r10,    #4
+    add             r12,    #8
+    mov             r14,    #16
+    add             r8,     #4
+    add             r9,     #4
+
+outer_loop_4
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    subs            r5,     r10,    #0      ;checks wd
+    ble             end_inner_loop_4
+
+inner_loop_4
+    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
+    vld1.u32        {d1},   [r12],  r11
+    vld1.u32        {d2},   [r12],  r11
+    vld1.u32        {d3},   [r12],  r11
+    vld1.u32        {d4},   [r12],  r11
+    vld1.u32        {d5},   [r12],  r11
+    vld1.u32        {d6},   [r12],  r11
+    vld1.u32        {d7},   [r12],  r11
+    sub             r12,    r12,    #4
+    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
+    vld1.u32        {d13},  [r4],   r11
+    vzip.32         d0,     d12             ;vector zip the i iteration and ii
+                                            ; iteration in a single register
+    vld1.u32        {d14},  [r4],   r11
+    vzip.32         d1,     d13
+    vld1.u32        {d15},  [r4],   r11
+    vzip.32         d2,     d14
+    vld1.u32        {d16},  [r4],   r11
+    vzip.32         d3,     d15
+    vld1.u32        {d17},  [r4],   r11
+    vzip.32         d4,     d16
+    vld1.u32        {d18},  [r4],   r11
+    vzip.32         d5,     d17
+    vld1.u32        {d19},  [r4],   r11
+    mov             r7,     #0xc000
+    vdup.16         q4,     r7
+    sub             r4,     r4,     #4
+    vzip.32         d6,     d18
+    vzip.32         d7,     d19
+    vmlal.u8        q4,     d1,     d25     ;arithmetic operations for ii
+                                            ; iteration in the same time
+    vmlsl.u8        q4,     d0,     d24
+    vmlsl.u8        q4,     d2,     d26
+    vmlal.u8        q4,     d3,     d27
+    vmlal.u8        q4,     d4,     d28
+    vmlsl.u8        q4,     d5,     d29
+    vmlal.u8        q4,     d6,     d30
+    vmlsl.u8        q4,     d7,     d31
+    mov             r7,     #0x4000
+    vdup.16         q10,    r7
+    vhadd.s16       q4,     q4,     q10
+    vqrshrun.s16    d8,     q4,     #6
+    vst1.32         {d8[0]},[r1]!           ;store the i iteration result which
+                                            ; is in upper part of the register
+    vst1.32         {d8[1]},[r6]!           ;store the ii iteration result which
+                                            ; is in lower part of the register
+    subs            r5,     r5,     #4      ;decrement the wd by 4
+    bgt             inner_loop_4
+
+end_inner_loop_4
+    subs            r14,    r14,    #2      ;decrement the ht by 2
+    add             r12,    r12,    r9      ;increment the input pointer
+                                            ; 2*src_strd-wd
+    add             r1,     r1,     r8      ;increment the output pointer
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_4
+
+end_func
+    vpop            {d8  -  d15}
+    ldmfd           sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+
+    ENDP
+
+    END
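These kernels all share one rounding idiom: the 16-bit accumulator is seeded
with 0xc000 (-0x4000 as s16), halved against a 0x4000 vector with vhadd.s16,
and narrowed with vqrshrun.s16 #6. A minimal scalar model of that sequence,
with illustrative names (this is an interpretation of the instruction
pattern, not code from the tree):

    #include <stdint.h>

    /* Scalar model of the vdup(0xc000) / vhadd(0x4000) / vqrshrun #6 idiom.
     * taps[] carries signed coefficients; the assembly applies the signs by
     * picking vmlal (add) or vmlsl (subtract) per tap. */
    static uint8_t round_shift_model(const uint8_t *src, const int16_t *taps) {
      int32_t acc = -0x4000;  /* vdup.16 qN, #0xc000 seeds the accumulator */
      for (int k = 0; k < 8; ++k) acc += src[k] * taps[k]; /* vmlal/vmlsl */
      const int16_t halved = (int16_t)((acc + 0x4000) >> 1); /* vhadd.s16 */
      int32_t out = (halved + 32) >> 6; /* vqrshrun.s16 #6: round and shift */
      if (out < 0) out = 0;   /* ...then saturate to the uint8_t range */
      if (out > 255) out = 255;
      return (uint8_t)out;
    }

The negative bias evidently keeps the running sum inside int16_t for kernels
whose positive taps alone would overflow it, while the halving plus the 6-bit
rounding narrow together realize the usual (sum + 64) >> 7 of
VP9_FILTER_SHIFT == 7.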
diff --git a/vpx_dsp/arm/vpx_convolve8_neon_asm.asm b/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
deleted file mode 100644
index 5eee156..0000000
--- a/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
+++ /dev/null
@@ -1,273 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    ; These functions are only valid when:
-    ; x_step_q4 == 16
-    ; w%4 == 0
-    ; h%4 == 0
-    ; taps == 8
-    ; VP9_FILTER_WEIGHT == 128
-    ; VP9_FILTER_SHIFT == 7
-
-    EXPORT  |vpx_convolve8_horiz_neon|
-    EXPORT  |vpx_convolve8_vert_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Multiply and accumulate by q0
-    MACRO
-    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
-    vmull.s16 $dst, $src0, d0[0]
-    vmlal.s16 $dst, $src1, d0[1]
-    vmlal.s16 $dst, $src2, d0[2]
-    vmlal.s16 $dst, $src3, d0[3]
-    vmlal.s16 $dst, $src4, d1[0]
-    vmlal.s16 $dst, $src5, d1[1]
-    vmlal.s16 $dst, $src6, d1[2]
-    vmlal.s16 $dst, $src7, d1[3]
-    MEND
-
-; r0    const uint8_t *src
-; r1    int src_stride
-; r2    uint8_t *dst
-; r3    int dst_stride
-; sp[]const int16_t *filter
-; sp[]int x0_q4
-; sp[]int x_step_q4 ; unused
-; sp[]int y0_q4
-; sp[]int y_step_q4 ; unused
-; sp[]int w
-; sp[]int h
-
-|vpx_convolve8_horiz_neon| PROC
-    push            {r4-r10, lr}
-
-    sub             r0, r0, #3              ; adjust for taps
-
-    ldrd            r4, r5, [sp, #32]       ; filter, x0_q4
-    add             r4, r5, lsl #4
-    ldrd            r6, r7, [sp, #52]       ; w, h
-
-    vld1.s16        {q0}, [r4]              ; filter
-
-    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
-    add             r8, r8, #4              ; -src_stride * 3 + 4
-
-    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
-    add             r4, r4, #4              ; -dst_stride * 3 + 4
-
-    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
-    sub             r9, r9, #7
-    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
-
-    mov             r10, r6                 ; w loop counter
-
-vpx_convolve8_loop_horiz_v
-    vld1.8          {d24}, [r0], r1
-    vld1.8          {d25}, [r0], r1
-    vld1.8          {d26}, [r0], r1
-    vld1.8          {d27}, [r0], r8
-
-    vtrn.16         q12, q13
-    vtrn.8          d24, d25
-    vtrn.8          d26, d27
-
-    pld             [r0, r1, lsl #2]
-
-    vmovl.u8        q8, d24
-    vmovl.u8        q9, d25
-    vmovl.u8        q10, d26
-    vmovl.u8        q11, d27
-
-    ; save a few instructions in the inner loop
-    vswp            d17, d18
-    vmov            d23, d21
-
-    add             r0, r0, #3
-
-vpx_convolve8_loop_horiz
-    add             r5, r0, #64
-
-    vld1.32         {d28[]}, [r0], r1
-    vld1.32         {d29[]}, [r0], r1
-    vld1.32         {d31[]}, [r0], r1
-    vld1.32         {d30[]}, [r0], r8
-
-    pld             [r5]
-
-    vtrn.16         d28, d31
-    vtrn.16         d29, d30
-    vtrn.8          d28, d29
-    vtrn.8          d31, d30
-
-    pld             [r5, r1]
-
-    ; extract to s16
-    vtrn.32         q14, q15
-    vmovl.u8        q12, d28
-    vmovl.u8        q13, d29
-
-    pld             [r5, r1, lsl #1]
-
-    ; src[] * filter
-    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
-    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
-    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
-    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
-
-    pld             [r5, -r8]
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    ; transpose
-    vtrn.16         d2, d3
-    vtrn.32         d2, d3
-    vtrn.8          d2, d3
-
-    vst1.u32        {d2[0]}, [r2@32], r3
-    vst1.u32        {d3[0]}, [r2@32], r3
-    vst1.u32        {d2[1]}, [r2@32], r3
-    vst1.u32        {d3[1]}, [r2@32], r4
-
-    vmov            q8,  q9
-    vmov            d20, d23
-    vmov            q11, q12
-    vmov            q9,  q13
-
-    subs            r6, r6, #4              ; w -= 4
-    bgt             vpx_convolve8_loop_horiz
-
-    ; outer loop
-    mov             r6, r10                 ; restore w counter
-    add             r0, r0, r9              ; src += src_stride * 4 - w
-    add             r2, r2, r12             ; dst += dst_stride * 4 - w
-    subs            r7, r7, #4              ; h -= 4
-    bgt vpx_convolve8_loop_horiz_v
-
-    pop             {r4-r10, pc}
-
-    ENDP
-
-|vpx_convolve8_vert_neon| PROC
-    push            {r4-r8, lr}
-
-    ; adjust for taps
-    sub             r0, r0, r1
-    sub             r0, r0, r1, lsl #1
-
-    ldr             r4, [sp, #24]           ; filter
-    ldr             r5, [sp, #36]           ; y0_q4
-    add             r4, r5, lsl #4
-    ldr             r6, [sp, #44]           ; w
-    ldr             lr, [sp, #48]           ; h
-
-    vld1.s16        {q0}, [r4]              ; filter
-
-    lsl             r1, r1, #1
-    lsl             r3, r3, #1
-
-vpx_convolve8_loop_vert_h
-    mov             r4, r0
-    add             r7, r0, r1, asr #1
-    mov             r5, r2
-    add             r8, r2, r3, asr #1
-    mov             r12, lr                 ; h loop counter
-
-    vld1.u32        {d16[0]}, [r4], r1
-    vld1.u32        {d16[1]}, [r7], r1
-    vld1.u32        {d18[0]}, [r4], r1
-    vld1.u32        {d18[1]}, [r7], r1
-    vld1.u32        {d20[0]}, [r4], r1
-    vld1.u32        {d20[1]}, [r7], r1
-    vld1.u32        {d22[0]}, [r4], r1
-
-    vmovl.u8        q8, d16
-    vmovl.u8        q9, d18
-    vmovl.u8        q10, d20
-    vmovl.u8        q11, d22
-
-vpx_convolve8_loop_vert
-    ; always process a 4x4 block at a time
-    vld1.u32        {d24[0]}, [r7], r1
-    vld1.u32        {d26[0]}, [r4], r1
-    vld1.u32        {d26[1]}, [r7], r1
-    vld1.u32        {d24[1]}, [r4], r1
-
-    ; extract to s16
-    vmovl.u8        q12, d24
-    vmovl.u8        q13, d26
-
-    pld             [r5]
-    pld             [r8]
-
-    ; src[] * filter
-    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
-
-    pld             [r5, r3]
-    pld             [r8, r3]
-
-    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
-
-    pld             [r7]
-    pld             [r4]
-
-    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
-
-    pld             [r7, r1]
-    pld             [r4, r1]
-
-    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    vst1.u32        {d2[0]}, [r5@32], r3
-    vst1.u32        {d2[1]}, [r8@32], r3
-    vst1.u32        {d3[0]}, [r5@32], r3
-    vst1.u32        {d3[1]}, [r8@32], r3
-
-    vmov            q8, q10
-    vmov            d18, d22
-    vmov            d19, d24
-    vmov            q10, q13
-    vmov            d22, d25
-
-    subs            r12, r12, #4            ; h -= 4
-    bgt             vpx_convolve8_loop_vert
-
-    ; outer loop
-    add             r0, r0, #4
-    add             r2, r2, #4
-    subs            r6, r6, #4              ; w -= 4
-    bgt             vpx_convolve8_loop_vert_h
-
-    pop             {r4-r8, pc}
-
-    ENDP
-    END
diff --git a/vpx_dsp/arm/vpx_convolve8_neon_asm.c b/vpx_dsp/arm/vpx_convolve8_neon_asm.c
new file mode 100644
index 0000000..4470b28
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_neon_asm.c
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_filter.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon_asm.h"
+
+/* Type1 and Type2 functions are called depending on the position of the
+ * negative and positive coefficients in the filter. In type1, the filter kernel
+ * used is sub_pel_filters_8lp, in which only the first two and the last two
+ * coefficients are negative. In type2, the negative coefficients are 0, 2, 5 &
+ * 7.
+ */
+
+#define DEFINE_FILTER(dir)                                                   \
+  void vpx_convolve8_##dir##_neon(                                           \
+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,           \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \
+    if (filter == vp9_filter_kernels[1]) {                                   \
+      vpx_convolve8_##dir##_filter_type1_neon(                               \
+          src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, \
+          y_step_q4, w, h);                                                  \
+    } else {                                                                 \
+      vpx_convolve8_##dir##_filter_type2_neon(                               \
+          src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, \
+          y_step_q4, w, h);                                                  \
+    }                                                                        \
+  }
+
+DEFINE_FILTER(horiz);
+DEFINE_FILTER(avg_horiz);
+DEFINE_FILTER(vert);
+DEFINE_FILTER(avg_vert);
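For reference, each DEFINE_FILTER instantiation expands to a plain dispatch
function; the horiz case, for example, becomes (preprocessor output, sketched
here for readability):

    void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h) {
      if (filter == vp9_filter_kernels[1]) {
        vpx_convolve8_horiz_filter_type1_neon(src, src_stride, dst, dst_stride,
                                              filter, x0_q4, x_step_q4, y0_q4,
                                              y_step_q4, w, h);
      } else {
        vpx_convolve8_horiz_filter_type2_neon(src, src_stride, dst, dst_stride,
                                              filter, x0_q4, x_step_q4, y0_q4,
                                              y_step_q4, w, h);
      }
    }

The type1/type2 entry points themselves are declared in
vpx_convolve8_neon_asm.h below.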
diff --git a/vpx_dsp/arm/vpx_convolve8_neon_asm.h b/vpx_dsp/arm/vpx_convolve8_neon_asm.h
new file mode 100644
index 0000000..e1a9911
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_neon_asm.h
@@ -0,0 +1,29 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_
+#define VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_
+
+#define DECLARE_FILTER(dir, type)                                  \
+  void vpx_convolve8_##dir##_filter_##type##_neon(                 \
+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,      \
+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+DECLARE_FILTER(horiz, type1);
+DECLARE_FILTER(avg_horiz, type1);
+DECLARE_FILTER(horiz, type2);
+DECLARE_FILTER(avg_horiz, type2);
+DECLARE_FILTER(vert, type1);
+DECLARE_FILTER(avg_vert, type1);
+DECLARE_FILTER(vert, type2);
+DECLARE_FILTER(avg_vert, type2);
+
+#endif /* VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_ */
diff --git a/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm b/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm
new file mode 100644
index 0000000..2666d42
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm
@@ -0,0 +1,457 @@
+;
+;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+;    r0 => src
+;    r1 => dst
+;    r2 =>  src_stride
+;    r6 =>  dst_stride
+;    r12 => filter_y0
+;    r5 =>  wd
+;    r3 =>  ht
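+;
+;  Note on the mapping: per the AAPCS the caller passes (src, src_stride,
+;  dst, dst_stride, ...) in r0-r3, so the prologue below swaps r1 and r2
+;  (via r4), moves dst_stride from r3 into r6, and fetches filter, y0_q4,
+;  wd and ht from the stack (offsets #104..#128 after the 104-byte save of
+;  r4-r12, r14 and d8-d15).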
+
+    EXPORT          |vpx_convolve8_vert_filter_type1_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA  ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_vert_filter_type1_neon| PROC
+
+    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
+                                                 ; the arguments
+    vpush           {d8  -  d15}                 ; stack offset by 64
+    mov             r4,     r1
+    mov             r1,     r2
+    mov             r2,     r4
+    vmov.i16        q15,    #0x4000
+    mov             r11,    #0xc000
+    ldr             r12,    [sp,    #104]   ;load filter
+    ldr             r6,     [sp,    #116]   ;load y0_q4
+    add             r12,    r12,    r6,     lsl #4 ;r12 = filter[y0_q4]
+    mov             r6,     r3
+    ldr             r5,     [sp,    #124]   ;load wd
+    vld2.8          {d0,    d1},    [r12]   ;coeff = vld1_s8(pi1_coeff)
+    sub             r12,    r2,     r2,     lsl #2 ;r12 = -3 * src_strd
+    vabs.s8         d0,     d0              ;vabs_s8(coeff)
+    add             r0,     r0,     r12     ;pu1_src -= 3 * src_strd
+    ldr             r3,     [sp,    #128]   ;load ht
+    subs            r7,     r3,     #0      ;r3->ht
+    vdup.u8         d22,    d0[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0);
+    cmp             r5,     #8
+    vdup.u8         d23,    d0[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1);
+    vdup.u8         d24,    d0[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2);
+    vdup.u8         d25,    d0[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3);
+    vdup.u8         d26,    d0[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4);
+    vdup.u8         d27,    d0[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5);
+    vdup.u8         d28,    d0[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6);
+    vdup.u8         d29,    d0[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7);
+    blt             core_loop_wd_4          ;core loop wd 4 jump
+
+    str             r0,     [sp,  #-4]!
+    str             r1,     [sp,  #-4]!
+    bic             r4,     r5,     #7      ;r5 ->wd
+    rsb             r9,     r4,     r6,     lsl #2 ;r6->dst_strd    r5    ->wd
+    rsb             r8,     r4,     r2,     lsl #2 ;r2->src_strd
+    mov             r3,     r5,     lsr #3  ;divide by 8
+    mul             r7,     r3              ;multiply height by width / 8
+    sub             r7,     #4              ;subtract one iteration (4) for
+                                            ; the epilog
+
+prolog
+    and             r10,    r0,     #31
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vdup.16         q4,     r11
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    subs            r4,     r4,     #8
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d1,     d23     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_1);
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d2,     d24     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d5,     d27     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d6,     d28     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vdup.16         q5,     r11
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5,     d2,     d23     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_1);
+    addle           r0,     r0,     r8
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    bicle           r4,     r5,     #7      ;r5 ->wd
+    vmlal.u8        q5,     d3,     d24     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    pld             [r3]
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    pld             [r3,    r2]
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    pld             [r3,    r2,     lsl #1]
+    vmlal.u8        q5,     d6,     d27     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    add             r3,     r3,     r2
+    vmlsl.u8        q5,     d7,     d28     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    pld             [r3,    r2,     lsl #1]
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d3,     d23
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d2,     d22
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d4,     d24
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d5,     d25
+    vmlal.u8        q6,     d6,     d26
+    vmlal.u8        q6,     d7,     d27
+    vmlsl.u8        q6,     d16,    d28
+    vmlsl.u8        q6,     d17,    d29
+    add             r14,    r1,     r6
+    vst1.8          {d8},   [r1]!           ;vst1_u8(pu1_dst,sto_res);
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    addle           r1,     r1,     r9
+    vmlsl.u8        q7,     d4,     d23
+    subs            r7,     r7,     #4
+    vmlsl.u8        q7,     d3,     d22
+    vmlal.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vhadd.s16       q6,     q6,     q15
+    vdup.16         q4,     r11
+    vmlal.u8        q7,     d7,     d26
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d16,    d27
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d17,    d28
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d18,    d29
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12,    q6,     #6
+    blt             epilog_end              ;jumps to epilog_end
+
+    beq             epilog                  ;jumps to epilog
+
+main_loop_8
+    subs            r4,     r4,     #8
+    vmlsl.u8        q4,     d1,     d23     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_1);
+    addle           r0,     r0,     r8
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    bicle           r4,     r5,     #7      ;r5 ->wd
+    vmlal.u8        q4,     d2,     d24     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vhadd.s16       q7,     q7,     q15
+    vdup.16         q5,     r11
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d5,     d27     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vmlsl.u8        q4,     d6,     d28     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vst1.8          {d12},  [r14],  r6
+    vqrshrun.s16    d14,    q7,     #6
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vmlsl.u8        q5,     d2,     d23     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_1);
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    vmlal.u8        q5,     d3,     d24     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    vst1.8          {d14},  [r14],  r6
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    add             r14,    r1,     #0
+    vmlal.u8        q5,     d6,     d27     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    add             r1,     r1,     #8
+    vmlsl.u8        q5,     d7,     d28     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    addle           r1,     r1,     r9
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vmlsl.u8        q6,     d3,     d23
+    add             r10,    r3,     r2,     lsl #3 ; 10*strd - 8+2
+    vmlsl.u8        q6,     d2,     d22
+    add             r10,    r10,    r2      ; 11*strd
+    vmlal.u8        q6,     d4,     d24
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d5,     d25
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d6,     d26
+    vst1.8          {d8},   [r14],  r6      ;vst1_u8(pu1_dst,sto_res);
+    pld             [r10]                   ;11+ 0
+    vmlal.u8        q6,     d7,     d27
+    pld             [r10,   r2]             ;11+ 1*strd
+    vmlsl.u8        q6,     d16,    d28
+    pld             [r10,   r2,     lsl #1] ;11+ 2*strd
+    vmlsl.u8        q6,     d17,    d29
+    add             r10,    r10,    r2      ;12*strd
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    pld             [r10,   r2,     lsl #1] ;11+ 3*strd
+    vmlsl.u8        q7,     d4,     d23
+    vmlsl.u8        q7,     d3,     d22
+    subs            r7,     r7,     #4
+    vmlal.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vhadd.s16       q6,     q6,     q15
+    vdup.16         q4,     r11
+    vmlal.u8        q7,     d7,     d26
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d16,    d27
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d17,    d28
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d18,    d29
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vqrshrun.s16    d12,    q6,     #6
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    bgt             main_loop_8             ;jumps to main_loop_8
+
+epilog
+    vmlsl.u8        q4,     d1,     d23     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_1);
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    vmlal.u8        q4,     d2,     d24     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vhadd.s16       q7,     q7,     q15
+    vdup.16         q5,     r11
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vmlal.u8        q4,     d5,     d27     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vmlsl.u8        q4,     d6,     d28     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vst1.8          {d12},  [r14],  r6
+    vqrshrun.s16    d14,    q7,     #6
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5,     d2,     d23     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_1);
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    vmlal.u8        q5,     d3,     d24     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    vmlal.u8        q5,     d6,     d27     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    vmlsl.u8        q5,     d7,     d28     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    vst1.8          {d14},  [r14],  r6
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d3,     d23
+    vmlsl.u8        q6,     d2,     d22
+    vmlal.u8        q6,     d4,     d24
+    vmlal.u8        q6,     d5,     d25
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d6,     d26
+    vmlal.u8        q6,     d7,     d27
+    vmlsl.u8        q6,     d16,    d28
+    vmlsl.u8        q6,     d17,    d29
+    add             r14,    r1,     r6
+    vst1.8          {d8},   [r1]!           ;vst1_u8(pu1_dst,sto_res);
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d4,     d23
+    vmlsl.u8        q7,     d3,     d22
+    vmlal.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vhadd.s16       q6,     q6,     q15
+    vmlal.u8        q7,     d7,     d26
+    vmlal.u8        q7,     d16,    d27
+    vmlsl.u8        q7,     d17,    d28
+    vmlsl.u8        q7,     d18,    d29
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12,    q6,     #6
+
+epilog_end
+    vst1.8          {d12},  [r14],  r6
+    vhadd.s16       q7,     q7,     q15
+    vqrshrun.s16    d14,    q7,     #6
+    vst1.8          {d14},  [r14],  r6
+
+end_loops
+    tst             r5,     #7
+    ldr             r1,     [sp],   #4
+    ldr             r0,     [sp],   #4
+    vpopeq          {d8  -  d15}
+    ldmfdeq         sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+    mov             r5,     #4
+    add             r0,     r0,     #8
+    add             r1,     r1,     #8
+    mov             r7,     #16
+
+core_loop_wd_4
+    rsb             r9,     r5,     r6,     lsl #2 ;r6->dst_strd    r5    ->wd
+    rsb             r8,     r5,     r2,     lsl #2 ;r2->src_strd
+    vmov.i8         d4,     #0
+
+outer_loop_wd_4
+    subs            r12,    r5,     #0
+    ble             end_inner_loop_wd_4     ;outer loop jump
+
+inner_loop_wd_4
+    add             r3,     r0,     r2
+    vld1.u32        {d4[1]},[r3],   r2      ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 1);
+    subs            r12,    r12,    #4
+    vdup.u32        d5,     d4[1]           ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                            ; 1);
+    vld1.u32        {d5[1]},[r3],   r2      ;src_tmp2 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp2, 1);
+    vld1.u32        {d4[0]},[r0]            ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 0);
+    vdup.16         q0,     r11
+    vmlsl.u8        q0,     d5,     d23     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+
+    vdup.u32        d6,     d5[1]           ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                            ; 1);
+    add             r0,     r0,     #4
+    vld1.u32        {d6[1]},[r3],   r2      ;src_tmp3 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8        q0,     d4,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+    vdup.u32        d7,     d6[1]           ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                            ; 1);
+    vld1.u32        {d7[1]},[r3],   r2      ;src_tmp4 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlal.u8        q0,     d6,     d24     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+    vdup.16         q4,     r11
+    vmlsl.u8        q4,     d7,     d23
+    vdup.u32        d4,     d7[1]           ;src_tmp1 = vdup_lane_u32(src_tmp4,
+                                            ; 1);
+    vmull.u8        q1,     d7,     d25     ;mul_res2 =
+                                            ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+    vld1.u32        {d4[1]},[r3],   r2      ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 1);
+    vmlsl.u8        q4,     d6,     d22
+    vmlal.u8        q0,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+    vdup.u32        d5,     d4[1]           ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                            ; 1);
+    vmlal.u8        q4,     d4,     d24
+    vld1.u32        {d5[1]},[r3],   r2      ;src_tmp2 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp2, 1);
+    vmlal.u8        q1,     d5,     d27     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+    vdup.u32        d6,     d5[1]           ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                            ; 1);
+    vmlal.u8        q4,     d5,     d25
+    vld1.u32        {d6[1]},[r3],   r2      ;src_tmp3 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8        q0,     d6,     d28     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+    vdup.u32        d7,     d6[1]           ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                            ; 1);
+    vmlal.u8        q4,     d6,     d26
+    vld1.u32        {d7[1]},[r3],   r2      ;src_tmp4 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q1,     d7,     d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+    vdup.u32        d4,     d7[1]
+    vadd.i16        q0,     q0,     q1      ;mul_res1 = vaddq_u16(mul_res1,
+                                            ; mul_res2);
+    vmlal.u8        q4,     d7,     d27
+    vld1.u32        {d4[1]},[r3],   r2
+    vmlsl.u8        q4,     d4,     d28
+    vdup.u32        d5,     d4[1]
+    vhadd.s16       q0,     q0,     q15
+    vqrshrun.s16    d0,     q0,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u32        {d5[1]},[r3]
+    add             r3,     r1,     r6
+    vst1.32         {d0[0]},[r1]            ;vst1_lane_u32((uint32_t *)pu1_dst,
+                                            ; vreinterpret_u32_u8(sto_res), 0);
+    vmlsl.u8        q4,     d5,     d29
+    vst1.32         {d0[1]},[r3],   r6      ;vst1_lane_u32((uint32_t
+                                            ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+    vhadd.s16       q4,     q4,     q15
+    vqrshrun.s16    d8,     q4,     #6
+    vst1.32         {d8[0]},[r3],   r6
+    add             r1,     r1,     #4
+    vst1.32         {d8[1]},[r3]
+    bgt             inner_loop_wd_4
+
+end_inner_loop_wd_4
+    subs            r7,     r7,     #4
+    add             r1,     r1,     r9
+    add             r0,     r0,     r8
+    bgt             outer_loop_wd_4
+
+    vpop            {d8  -  d15}
+    ldmfd           sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+
+    ENDP
+
+    END
diff --git a/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm b/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm
new file mode 100644
index 0000000..cb5d6d3
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm
@@ -0,0 +1,455 @@
+;
+;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+;    r0 => src
+;    r1 => dst
+;    r2 =>  src_stride
+;    r6 =>  dst_stride
+;    r12 => filter_y0
+;    r5 =>  wd
+;    r3 =>  ht
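+;
+;  This type2 variant differs from type1 only in the sign pattern of the
+;  multiply-accumulate chain: here the negative taps are 0, 2, 5 and 7
+;  (the vmlsl steps), matching the comment in vpx_convolve8_neon_asm.c,
+;  while type1 subtracts taps 0, 1, 6 and 7.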
+
+    EXPORT          |vpx_convolve8_vert_filter_type2_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA  ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_vert_filter_type2_neon| PROC
+
+    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
+                                                 ; the arguments
+    vpush           {d8  -  d15}                 ; stack offset by 64
+    mov             r4,     r1
+    mov             r1,     r2
+    mov             r2,     r4
+    vmov.i16        q15,    #0x4000
+    mov             r11,    #0xc000
+    ldr             r12,    [sp,    #104]   ;load filter
+    ldr             r6,     [sp,    #116]   ;load y0_q4
+    add             r12,    r12,    r6,     lsl #4 ;r12 = filter[y0_q4]
+    mov             r6,     r3
+    ldr             r5,     [sp,    #124]   ;load wd
+    vld2.8          {d0,    d1},    [r12]   ;coeff = vld1_s8(pi1_coeff)
+    sub             r12,    r2,     r2,     lsl #2 ;r12 = -3 * src_strd
+    vabs.s8         d0,     d0              ;vabs_s8(coeff)
+    add             r0,     r0,     r12     ;pu1_src -= 3 * src_strd
+    ldr             r3,     [sp,    #128]   ;load ht
+    subs            r7,     r3,     #0      ;r3->ht
+    vdup.u8         d22,    d0[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0);
+    cmp             r5,     #8
+    vdup.u8         d23,    d0[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1);
+    vdup.u8         d24,    d0[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2);
+    vdup.u8         d25,    d0[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3);
+    vdup.u8         d26,    d0[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4);
+    vdup.u8         d27,    d0[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5);
+    vdup.u8         d28,    d0[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6);
+    vdup.u8         d29,    d0[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7);
+    blt             core_loop_wd_4          ;core loop wd 4 jump
+
+    str             r0,     [sp,  #-4]!
+    str             r1,     [sp,  #-4]!
+    bic             r4,     r5,     #7      ;r5 ->wd
+    rsb             r9,     r4,     r6,     lsl #2 ;r6->dst_strd    r5    ->wd
+    rsb             r8,     r4,     r2,     lsl #2 ;r2->src_strd
+    mov             r3,     r5,     lsr #3  ;divide by 8
+    mul             r7,     r3              ;multiply height by width / 8
+    sub             r7,     #4              ;subtract one iteration (4) for
+                                            ; the epilog
+
+prolog
+    and             r10,    r0,     #31
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vdup.16         q4,     r11
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    subs            r4,     r4,     #8
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d1,     d23     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_1);
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vdup.16         q5,     r11
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5,     d2,     d23     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_1);
+    addle           r0,     r0,     r8
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    bicle           r4,     r5,     #7      ;r5 ->wd
+    vmlsl.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    pld             [r3]
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    pld             [r3,    r2]
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    pld             [r3,    r2,     lsl #1]
+    vmlsl.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    add             r3,     r3,     r2
+    vmlal.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    pld             [r3,    r2,     lsl #1]
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d3,     d23
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d2,     d22
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d4,     d24
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d5,     d25
+    vmlal.u8        q6,     d6,     d26
+    vmlsl.u8        q6,     d7,     d27
+    vmlal.u8        q6,     d16,    d28
+    vmlsl.u8        q6,     d17,    d29
+    add             r14,    r1,     r6
+    vst1.8          {d8},   [r1]!           ;vst1_u8(pu1_dst,sto_res);
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    addle           r1,     r1,     r9
+    vmlal.u8        q7,     d4,     d23
+    subs            r7,     r7,     #4
+    vmlsl.u8        q7,     d3,     d22
+    vmlsl.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vhadd.s16       q6,     q6,     q15
+    vdup.16         q4,     r11
+    vmlal.u8        q7,     d7,     d26
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d16,    d27
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d17,    d28
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d18,    d29
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12,    q6,     #6
+    blt             epilog_end              ;jumps to epilog_end
+
+    beq             epilog                  ;jumps to epilog
+
+main_loop_8
+    subs            r4,     r4,     #8
+    vmlal.u8        q4,     d1,     d23     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_1);
+    addle           r0,     r0,     r8
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    bicle           r4,     r5,     #7      ;r5 ->wd
+    vmlsl.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vhadd.s16       q7,     q7,     q15
+    vdup.16         q5,     r11
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vmlal.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vst1.8          {d12},  [r14],  r6
+    vqrshrun.s16    d14,    q7,     #6
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vmlal.u8        q5,     d2,     d23     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_1);
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    vmlsl.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    vst1.8          {d14},  [r14],  r6
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    add             r14,    r1,     #0
+    vmlsl.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    add             r1,     r1,     #8
+    vmlal.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    addle           r1,     r1,     r9
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vmlal.u8        q6,     d3,     d23
+    add             r10,    r3,     r2,     lsl #3 ; 10*strd - 8+2
+    vmlsl.u8        q6,     d2,     d22
+    add             r10,    r10,    r2      ; 11*strd
+    vmlsl.u8        q6,     d4,     d24
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d5,     d25
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d6,     d26
+    vst1.8          {d8},   [r14],  r6      ;vst1_u8(pu1_dst,sto_res);
+    pld             [r10]                   ;11+ 0
+    vmlsl.u8        q6,     d7,     d27
+    pld             [r10,   r2]             ;11+ 1*strd
+    vmlal.u8        q6,     d16,    d28
+    pld             [r10,   r2,     lsl #1] ;11+ 2*strd
+    vmlsl.u8        q6,     d17,    d29
+    add             r10,    r10,    r2      ;12*strd
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    pld             [r10,   r2,     lsl #1] ;11+ 3*strd
+    vmlal.u8        q7,     d4,     d23
+    vmlsl.u8        q7,     d3,     d22
+    subs            r7,     r7,     #4
+    vmlsl.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vhadd.s16       q6,     q6,     q15
+    vdup.16         q4,     r11
+    vmlal.u8        q7,     d7,     d26
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d16,    d27
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d17,    d28
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d18,    d29
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vqrshrun.s16    d12,    q6,     #6
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    bgt             main_loop_8             ;jumps to main_loop_8
+
+epilog
+    vmlal.u8        q4,     d1,     d23     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_1);
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    vmlsl.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vhadd.s16       q7,     q7,     q15
+    vdup.16         q5,     r11
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vmlsl.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vmlal.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vst1.8          {d12},  [r14],  r6
+    vqrshrun.s16    d14,    q7,     #6
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5,     d2,     d23     ;mul_res2 = vmull_u8(src_tmp3,
+                                            ; coeffabs_1);
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    vmlsl.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    vmlsl.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    vmlal.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    vst1.8          {d14},  [r14],  r6
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d3,     d23
+    vmlsl.u8        q6,     d2,     d22
+    vmlsl.u8        q6,     d4,     d24
+    vmlal.u8        q6,     d5,     d25
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d6,     d26
+    vmlsl.u8        q6,     d7,     d27
+    vmlal.u8        q6,     d16,    d28
+    vmlsl.u8        q6,     d17,    d29
+    add             r14,    r1,     r6
+    vst1.8          {d8},   [r1]!           ;vst1_u8(pu1_dst,sto_res);
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d4,     d23
+    vmlsl.u8        q7,     d3,     d22
+    vmlsl.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vhadd.s16       q6,     q6,     q15
+    vmlal.u8        q7,     d7,     d26
+    vmlsl.u8        q7,     d16,    d27
+    vmlal.u8        q7,     d17,    d28
+    vmlsl.u8        q7,     d18,    d29
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12,    q6,     #6
+
+epilog_end
+    vst1.8          {d12},  [r14],  r6
+    vhadd.s16       q7,     q7,     q15
+    vqrshrun.s16    d14,    q7,     #6
+    vst1.8          {d14},  [r14],  r6
+
+end_loops
+    tst             r5,     #7
+    ldr             r1,     [sp],   #4
+    ldr             r0,     [sp],   #4
+    vpopeq          {d8  -  d15}
+    ldmfdeq         sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+    mov             r5,     #4
+    add             r0,     r0,     #8
+    add             r1,     r1,     #8
+    mov             r7,     #16
+
+core_loop_wd_4
+    rsb             r9,     r5,     r6,     lsl #2 ;r6->dst_strd, r5->wd
+    rsb             r8,     r5,     r2,     lsl #2 ;r2->src_strd
+    vmov.i8         d4,     #0
+
+outer_loop_wd_4
+    subs            r12,    r5,     #0
+    ble             end_inner_loop_wd_4     ;outer loop jump
+
+inner_loop_wd_4
+    add             r3,     r0,     r2
+    vld1.u32        {d4[1]},[r3],   r2      ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 1);
+    subs            r12,    r12,    #4
+    vdup.u32        d5,     d4[1]           ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                            ; 1);
+    vld1.u32        {d5[1]},[r3],   r2      ;src_tmp2 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp2, 1);
+    vld1.u32        {d4[0]},[r0]            ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 0);
+    vdup.16         q0,     r11
+    vmlal.u8        q0,     d5,     d23     ;mul_res1 =
+                                            ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+    vdup.u32        d6,     d5[1]           ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                            ; 1);
+    add             r0,     r0,     #4
+    vld1.u32        {d6[1]},[r3],   r2      ;src_tmp3 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8        q0,     d4,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+    vdup.u32        d7,     d6[1]           ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                            ; 1);
+    vld1.u32        {d7[1]},[r3],   r2      ;src_tmp4 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q0,     d6,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+    vdup.16         q4,     r11
+    vmlal.u8        q4,     d7,     d23
+    vdup.u32        d4,     d7[1]           ;src_tmp1 = vdup_lane_u32(src_tmp4,
+                                            ; 1);
+    vmull.u8        q1,     d7,     d25     ;mul_res2 =
+                                            ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+    vld1.u32        {d4[1]},[r3],   r2      ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 1);
+    vmlsl.u8        q4,     d6,     d22
+    vmlal.u8        q0,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+    vdup.u32        d5,     d4[1]           ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                            ; 1);
+    vmlsl.u8        q4,     d4,     d24
+    vld1.u32        {d5[1]},[r3],   r2      ;src_tmp2 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp2, 1);
+    vmlsl.u8        q1,     d5,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+    vdup.u32        d6,     d5[1]           ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                            ; 1);
+    vmlal.u8        q4,     d5,     d25
+    vld1.u32        {d6[1]},[r3],   r2      ;src_tmp3 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlal.u8        q0,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+    vdup.u32        d7,     d6[1]           ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                            ; 1);
+    vmlal.u8        q4,     d6,     d26
+    vld1.u32        {d7[1]},[r3],   r2      ;src_tmp4 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q1,     d7,     d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+    vdup.u32        d4,     d7[1]
+    vadd.i16        q0,     q0,     q1      ;mul_res1 = vaddq_u16(mul_res1,
+                                            ; mul_res2);
+    vmlsl.u8        q4,     d7,     d27
+    vld1.u32        {d4[1]},[r3],   r2
+    vmlal.u8        q4,     d4,     d28
+    vdup.u32        d5,     d4[1]
+    vhadd.s16       q0,     q0,     q15
+    vqrshrun.s16    d0,     q0,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u32        {d5[1]},[r3]
+    add             r3,     r1,     r6
+    vst1.32         {d0[0]},[r1]            ;vst1_lane_u32((uint32_t *)pu1_dst,
+                                            ; vreinterpret_u32_u8(sto_res), 0);
+    vmlsl.u8        q4,     d5,     d29
+    vst1.32         {d0[1]},[r3],   r6      ;vst1_lane_u32((uint32_t
+                                            ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+    vhadd.s16       q4,     q4,     q15
+    vqrshrun.s16    d8,     q4,     #6
+    vst1.32         {d8[0]},[r3],   r6
+    add             r1,     r1,     #4
+    vst1.32         {d8[1]},[r3]
+    bgt             inner_loop_wd_4
+
+end_inner_loop_wd_4
+    subs            r7,     r7,     #4
+    add             r1,     r1,     r9
+    add             r0,     r0,     r8
+    bgt             outer_loop_wd_4
+
+    vpop            {d8  -  d15}
+    ldmfd           sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+
+    ENDP
+
+    END
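Note: the kernel above keeps the filter taps as absolute values (d22..d29) and folds their signs into the choice of vmlal.u8 (add) vs. vmlsl.u8 (subtract), then splits the final rounding between vhadd.s16 against the pre-loaded offset and vqrshrun.s16 #6. A minimal scalar model of what one output pixel works out to, assuming vpx's usual 7-bit filter precision (taps sum to 128):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar reference for one output pixel of the 8-tap vertical filter.
     * signed_taps[] carries the signs that the assembly applies through
     * its vmlal/vmlsl selection on |tap| values. */
    static uint8_t convolve8_vert_pixel(const uint8_t *src,
                                        ptrdiff_t src_stride,
                                        const int16_t *signed_taps) {
      int sum = 0, k;
      for (k = 0; k < 8; ++k) sum += signed_taps[k] * src[k * src_stride];
      sum = (sum + 64) >> 7; /* round to nearest; taps sum to 128 */
      if (sum < 0) sum = 0;
      if (sum > 255) sum = 255;
      return (uint8_t)sum;
    }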
diff --git a/vpx_dsp/arm/vpx_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c
index 2bf2d89..830f317 100644
--- a/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_neon.c
@@ -24,7 +24,8 @@
   uint8_t temp[64 * 72];
 
   // Account for the vertical phase needing 3 lines prior and 4 lines post
-  const int intermediate_height = h + 7;
+  // (+ 1 to make it divisible by 4).
+  const int intermediate_height = h + 8;
 
   assert(y_step_q4 == 16);
   assert(x_step_q4 == 16);
@@ -48,7 +49,7 @@
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
   uint8_t temp[64 * 72];
-  const int intermediate_height = h + 7;
+  const int intermediate_height = h + 8;
 
   assert(y_step_q4 == 16);
   assert(x_step_q4 == 16);
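Note: the two-pass path filters horizontally into temp[] first, and the vertical phase then needs 3 rows above and 4 rows below each output row, hence the previous h + 7. The NEON vertical kernel consumes the intermediate buffer 4 rows at a time, so the height is rounded up by one more row: for the largest block, h = 64, that gives 64 + 8 = 72 rows, a multiple of 4 that exactly matches the temp[64 * 72] allocation above.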
diff --git a/vpx_dsp/bitreader_buffer.c b/vpx_dsp/bitreader_buffer.c
index 3e16bfa..f59f1f7 100644
--- a/vpx_dsp/bitreader_buffer.c
+++ b/vpx_dsp/bitreader_buffer.c
@@ -23,7 +23,7 @@
     rb->bit_offset = off + 1;
     return bit;
   } else {
-    rb->error_handler(rb->error_handler_data);
+    if (rb->error_handler != NULL) rb->error_handler(rb->error_handler_data);
     return 0;
   }
 }
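Note: vpx_rb_read_bit() previously called rb->error_handler unconditionally on overrun, so a reader set up without a handler crashed through a NULL pointer; with the guard it simply yields a 0 bit. A minimal sketch of the now-safe usage, assuming the struct layout in vpx_dsp/bitreader_buffer.h:

    #include <stddef.h>
    #include "vpx_dsp/bitreader_buffer.h"

    static int read_first_bit(const uint8_t *data, size_t size) {
      struct vpx_read_bit_buffer rb = { 0 };
      rb.bit_buffer = data;
      rb.bit_buffer_end = data + size;
      rb.error_handler = NULL; /* no handler registered */
      /* For size == 0 this now returns 0 instead of crashing. */
      return vpx_rb_read_bit(&rb);
    }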
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index f16ed6d..4b48f62 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -110,11 +110,20 @@
 DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_convolve_copy_sse2.asm
 DSP_SRCS-$(HAVE_NEON)  += arm/vpx_scaled_convolve8_neon.c
 
+
 ifeq ($(HAVE_NEON_ASM),yes)
 DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM)
-DSP_SRCS-yes += arm/vpx_convolve8_avg_neon_asm$(ASM)
-DSP_SRCS-yes += arm/vpx_convolve8_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_horiz_filter_type2_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_vert_filter_type2_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_horiz_filter_type1_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_vert_filter_type1_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_horiz_filter_type2_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_vert_filter_type2_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_horiz_filter_type1_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_vert_filter_type1_neon$(ASM)
 DSP_SRCS-yes += arm/vpx_convolve_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_neon_asm.c
+DSP_SRCS-yes += arm/vpx_convolve8_neon_asm.h
 DSP_SRCS-yes += arm/vpx_convolve_neon.c
 else
 ifeq ($(HAVE_NEON),yes)
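Note: this replaces the two monolithic convolve8 assembly files with eight kernels specialized by direction (horiz/vert), averaging, and filter "type" (the two hard-coded vmlal/vmlsl sign layouts), plus a small C shim (vpx_convolve8_neon_asm.c/.h) that picks a kernel per call. A hedged sketch of such a dispatch; the is_type1 predicate and wrapper name are illustrative assumptions, not the shim's actual code:

    #include <stddef.h>
    #include <stdint.h>

    typedef int16_t InterpKernel[8];

    /* Two of the eight kernels listed above; signatures assumed to match
     * the standard vpx convolve prototype. */
    void vpx_convolve8_vert_filter_type1_neon(
        const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
        ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,
        int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
    void vpx_convolve8_vert_filter_type2_neon(
        const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
        ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,
        int x_step_q4, int y0_q4, int y_step_q4, int w, int h);

    static void convolve8_vert_dispatch(
        const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
        ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,
        int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {
      /* Assumption: each filter type serves the kernels whose tap-sign
       * pattern matches its hard-coded vmlal/vmlsl layout; the test below
       * is illustrative only. */
      if (filter[y0_q4][2] <= 0)
        vpx_convolve8_vert_filter_type1_neon(src, src_stride, dst,
                                             dst_stride, filter, x0_q4,
                                             x_step_q4, y0_q4, y_step_q4, w,
                                             h);
      else
        vpx_convolve8_vert_filter_type2_neon(src, src_stride, dst,
                                             dst_stride, filter, x0_q4,
                                             x_step_q4, y0_q4, y_step_q4, w,
                                             h);
    }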
diff --git a/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c b/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c
index 2051381..43634ae 100644
--- a/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c
@@ -460,7 +460,8 @@
   const int J = left[1];
   const int K = left[2];
   const int L = left[3];
-  const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5));
+  const __m128i XXXXXABC = _mm_castps_si128(
+      _mm_loadh_pi(_mm_setzero_ps(), (const __m64 *)(above - 1)));
   const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0);
   const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1);
   const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2);
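Note: the old _mm_loadu_si128 at (above - 5) read 10 bytes before the first pixel the predictor actually uses; only the top four 16-bit lanes (the pixels starting at above - 1) survive, since the low lanes are overwritten by the _mm_insert_epi16 calls that follow. Loading just 8 bytes into the high half avoids the out-of-bounds read. The technique in isolation:

    #include <emmintrin.h> /* SSE2; _mm_loadh_pi itself is SSE */

    /* Load exactly 8 bytes from p into the high half of an XMM register,
     * zeroing the low half -- never touching memory below p, unlike a
     * 16-byte unaligned load that starts 8 bytes earlier. */
    static __m128i load_8_bytes_high(const void *p) {
      return _mm_castps_si128(
          _mm_loadh_pi(_mm_setzero_ps(), (const __m64 *)p));
    }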
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index 220b8be..db1db37 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -142,6 +142,10 @@
                              int border, int byte_alignment,
                              vpx_codec_frame_buffer_t *fb,
                              vpx_get_frame_buffer_cb_fn_t cb, void *cb_priv) {
+#if CONFIG_SIZE_LIMIT
+  if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1;
+#endif
+
   if (ybf) {
     const int vp9_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
     const int aligned_width = (width + 7) & ~7;
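Note: this extends the size-limit check to the external frame-buffer path of vpx_realloc_frame_buffer (the variant taking a vpx_get_frame_buffer_cb_fn_t callback), which previously bypassed it. DECODE_WIDTH_LIMIT and DECODE_HEIGHT_LIMIT are the macros defined when the build is configured with a decoder size limit (e.g. ./configure --size-limit=16384x16384); oversized dimensions now make the allocation fail before any callback is invoked.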
diff --git a/vpxdec.c b/vpxdec.c
index 26c4752..522eda1 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -99,7 +99,7 @@
 static const arg_def_t framestatsarg =
     ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)");
 static const arg_def_t rowmtarg =
-    ARG_DEF(NULL, "row-mt", 1, "Enable multi-threading to run row-wise");
+    ARG_DEF(NULL, "row-mt", 1, "Enable multi-threading to run row-wise in VP9");
 
 static const arg_def_t *all_args[] = {
   &help,           &codecarg,      &use_yv12,         &use_i420,
@@ -758,7 +758,8 @@
       goto fail;
     }
   }
-  if (vpx_codec_control(&decoder, VP9D_SET_ROW_MT, enable_row_mt)) {
+  if (interface->fourcc == VP9_FOURCC &&
+      vpx_codec_control(&decoder, VP9D_SET_ROW_MT, enable_row_mt)) {
     fprintf(stderr, "Failed to set decoder in row multi-thread mode: %s\n",
             vpx_codec_error(&decoder));
     goto fail;
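Note: VP9D_SET_ROW_MT is a VP9-only control, so issuing it unconditionally made vpxdec bail out with the "row multi-thread" error for non-VP9 streams even though --row-mt is meaningless there. Guarding on the VP9 fourcc (and amending the --row-mt help text to match) lets an illustrative VP8 invocation such as

    vpxdec --row-mt=1 -o out.y4m input.ivf

decode normally, with the option simply ignored.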
diff --git a/vpxenc.c b/vpxenc.c
index 144e60c..3d8d959 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -351,7 +351,8 @@
     ARG_DEF(NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)");
 static const arg_def_t arnr_strength =
     ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)");
-static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1, "AltRef type");
+static const arg_def_t arnr_type =
+    ARG_DEF(NULL, "arnr-type", 1, "AltRef filter type (1..3)");
 static const struct arg_enum_list tuning_enum[] = {
   { "psnr", VP8_TUNE_PSNR }, { "ssim", VP8_TUNE_SSIM }, { NULL, 0 }
 };
@@ -1614,14 +1615,14 @@
             vpx_img_alloc(NULL, VPX_IMG_FMT_I42016, cfg->g_w, cfg->g_h, 16);
       }
       I420Scale_16(
-          (uint16 *)img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y] / 2,
-          (uint16 *)img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U] / 2,
-          (uint16 *)img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V] / 2,
-          img->d_w, img->d_h, (uint16 *)stream->img->planes[VPX_PLANE_Y],
+          (uint16_t *)img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y] / 2,
+          (uint16_t *)img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U] / 2,
+          (uint16_t *)img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V] / 2,
+          img->d_w, img->d_h, (uint16_t *)stream->img->planes[VPX_PLANE_Y],
           stream->img->stride[VPX_PLANE_Y] / 2,
-          (uint16 *)stream->img->planes[VPX_PLANE_U],
+          (uint16_t *)stream->img->planes[VPX_PLANE_U],
           stream->img->stride[VPX_PLANE_U] / 2,
-          (uint16 *)stream->img->planes[VPX_PLANE_V],
+          (uint16_t *)stream->img->planes[VPX_PLANE_V],
           stream->img->stride[VPX_PLANE_V] / 2, stream->img->d_w,
           stream->img->d_h, kFilterBox);
       img = stream->img;