Merge changes I1af88144,I9eaf9563,I58c1bc0f,I8d173add

* changes:
  Remove mv_dist and mv_cost from TplDepStats
  Remove inter_cost_arr and recon_error_arr
  Remove RE_COMPUTE_MV_INCONSISTENCY
  Remove unused mv_[dist/cost]_sum
diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl
index 51248f7..848872f 100755
--- a/build/make/ads2gas_apple.pl
+++ b/build/make/ads2gas_apple.pl
@@ -20,8 +20,6 @@
 
 print "@ This file was created from a .asm file\n";
 print "@  using the ads2gas_apple.pl script.\n\n";
-print "\t.set WIDE_REFERENCE, 0\n";
-print "\t.set ARCHITECTURE, 5\n";
 print "\t.syntax unified\n";
 
 my %register_aliases;
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 00214c9..ce3ba55 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1016,9 +1016,6 @@
               EXE_SFX=.exe
               enable_feature thumb
               ;;
-            *)
-              check_add_asflags --defsym ARCHITECTURE=${arch_int}
-              ;;
           esac
 
           if enabled thumb; then
@@ -1090,7 +1087,6 @@
           fi
           arch_int=${tgt_isa##armv}
           arch_int=${arch_int%%te}
-          check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\""
           enabled debug && add_asflags -g
           add_cflags --gnu
           add_cflags --enum_is_int
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index f3e585d..5a97371 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -32,9 +32,10 @@
 namespace {
 
 const int kThreads = 0;
-const int kFileName = 1;
+const int kMtMode = 1;
+const int kFileName = 2;
 
-typedef std::tuple<int, const char *> DecodeParam;
+typedef std::tuple<int, int, const char *> DecodeParam;
 
 class TestVectorTest : public ::libvpx_test::DecoderTest,
                        public ::libvpx_test::CodecTestWithParam<DecodeParam> {
@@ -57,6 +58,25 @@
         << "Md5 file open failed. Filename: " << md5_file_name_;
   }
 
+#if CONFIG_VP9_DECODER
+  virtual void PreDecodeFrameHook(
+      const libvpx_test::CompressedVideoSource &video,
+      libvpx_test::Decoder *decoder) {
+    if (video.frame_number() == 0 && mt_mode_ >= 0) {
+      if (mt_mode_ == 1) {
+        decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 1);
+        decoder->Control(VP9D_SET_ROW_MT, 0);
+      } else if (mt_mode_ == 2) {
+        decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 0);
+        decoder->Control(VP9D_SET_ROW_MT, 1);
+      } else {
+        decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 0);
+        decoder->Control(VP9D_SET_ROW_MT, 0);
+      }
+    }
+  }
+#endif
+
   virtual void DecompressedFrameHook(const vpx_image_t &img,
                                      const unsigned int frame_number) {
     ASSERT_TRUE(md5_file_ != NULL);
@@ -80,6 +100,7 @@
 #if CONFIG_VP9_DECODER
   std::set<std::string> resize_clips_;
 #endif
+  int mt_mode_;
 
  private:
   FILE *md5_file_;
@@ -97,9 +118,10 @@
   char str[256];
 
   cfg.threads = std::get<kThreads>(input);
-
-  snprintf(str, sizeof(str) / sizeof(str[0]) - 1, "file: %s threads: %d",
-           filename.c_str(), cfg.threads);
+  mt_mode_ = std::get<kMtMode>(input);
+  snprintf(str, sizeof(str) / sizeof(str[0]) - 1,
+           "file: %s threads: %d MT mode: %d", filename.c_str(), cfg.threads,
+           mt_mode_);
   SCOPED_TRACE(str);
 
   // Open compressed video file.
@@ -134,7 +156,8 @@
 VP8_INSTANTIATE_TEST_CASE(
     TestVectorTest,
     ::testing::Combine(
-        ::testing::Values(1),  // Single thread.
+        ::testing::Values(1),   // Single thread.
+        ::testing::Values(-1),  // LPF opt and Row MT is not applicable
         ::testing::ValuesIn(libvpx_test::kVP8TestVectors,
                             libvpx_test::kVP8TestVectors +
                                 libvpx_test::kNumVP8TestVectors)));
@@ -147,6 +170,7 @@
             static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP8)),
         ::testing::Combine(
             ::testing::Range(2, 9),  // With 2 ~ 8 threads.
+            ::testing::Values(-1),   // LPF opt and Row MT is not applicable
             ::testing::ValuesIn(libvpx_test::kVP8TestVectors,
                                 libvpx_test::kVP8TestVectors +
                                     libvpx_test::kNumVP8TestVectors))));
@@ -157,7 +181,8 @@
 VP9_INSTANTIATE_TEST_CASE(
     TestVectorTest,
     ::testing::Combine(
-        ::testing::Values(1),  // Single thread.
+        ::testing::Values(1),   // Single thread.
+        ::testing::Values(-1),  // LPF opt and Row MT is not applicable
         ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
                             libvpx_test::kVP9TestVectors +
                                 libvpx_test::kNumVP9TestVectors)));
@@ -169,6 +194,10 @@
             static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
         ::testing::Combine(
             ::testing::Range(2, 9),  // With 2 ~ 8 threads.
+            ::testing::Range(0, 3),  // With multi threads modes 0 ~ 2
+                                     // 0: LPF opt and Row MT disabled
+                                     // 1: LPF opt enabled
+                                     // 2: Row MT enabled
             ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
                                 libvpx_test::kVP9TestVectors +
                                     libvpx_test::kNumVP9TestVectors))));
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index ccfb813..d80472c 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -1056,10 +1056,28 @@
     predict_recon_intra(xd, mi, twd, parse_intra_block_row_mt);
   } else {
     if (!mi->skip) {
-      const int eobtotal =
-          predict_recon_inter(xd, mi, twd, parse_inter_block_row_mt);
+      tran_low_t *dqcoeff[MAX_MB_PLANE];
+      int *eob[MAX_MB_PLANE];
+      int plane;
+      int eobtotal;
+      // Based on eobtotal and bsize, this may be mi->skip may be set to true
+      // In that case dqcoeff and eob need to be backed up and restored as
+      // recon_block will not increment these pointers for skip cases
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        const struct macroblockd_plane *const pd = &xd->plane[plane];
+        dqcoeff[plane] = pd->dqcoeff;
+        eob[plane] = pd->eob;
+      }
+      eobtotal = predict_recon_inter(xd, mi, twd, parse_inter_block_row_mt);
 
-      if (bsize >= BLOCK_8X8 && eobtotal == 0) mi->skip = 1;  // skip loopfilter
+      if (bsize >= BLOCK_8X8 && eobtotal == 0) {
+        mi->skip = 1;  // skip loopfilter
+        for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+          struct macroblockd_plane *pd = &xd->plane[plane];
+          pd->dqcoeff = dqcoeff[plane];
+          pd->eob = eob[plane];
+        }
+      }
     }
   }
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 1eac502..da600fb 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -1356,7 +1356,12 @@
   write_uncompressed_header(cpi, &wb);
 
   // Skip the rest coding process if use show existing frame.
-  if (cpi->common.show_existing_frame) return;
+  if (cpi->common.show_existing_frame) {
+    uncompressed_hdr_size = vpx_wb_bytes_written(&wb);
+    data += uncompressed_hdr_size;
+    *size = data - dest;
+    return;
+  }
 
   saved_wb = wb;
   vpx_wb_write_literal(&wb, 0, 16);  // don't know in advance first part. size
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 8d35ac3..a5d59b6 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -270,6 +270,7 @@
   const int index = row * num_cols + col;
 
   assert(cpi->oxcf.tuning == VP8_TUNE_SSIM);
+  vpx_clear_system_state();
   return cpi->mi_ssim_rdmult_scaling_factors[index];
 }
 
@@ -312,6 +313,7 @@
     const double ssim_factor =
         get_ssim_rdmult_scaling_factor(cpi, mi_row, mi_col);
     x->rdmult = (int)(ssim_factor * x->rdmult);
+    vpx_clear_system_state();
   }
 
   // required by vp9_append_sub8x8_mvs_for_idx() and vp9_find_best_ref_mvs()
@@ -1963,6 +1965,7 @@
     const double ssim_factor =
         get_ssim_rdmult_scaling_factor(cpi, mi_row, mi_col);
     x->rdmult = (int)(ssim_factor * x->rdmult);
+    vpx_clear_system_state();
   }
 }
 
@@ -2208,6 +2211,7 @@
       const double ssim_factor =
           get_ssim_rdmult_scaling_factor(cpi, mi_row, mi_col);
       x->rdmult = (int)(ssim_factor * x->rdmult);
+      vpx_clear_system_state();
     }
   }
 
@@ -3823,6 +3827,7 @@
     const double ssim_factor =
         get_ssim_rdmult_scaling_factor(cpi, mi_row, mi_col);
     partition_mul = (int)(ssim_factor * partition_mul);
+    vpx_clear_system_state();
   }
 
   (void)*tp_orig;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index efa7f3c..34c30c8 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -3748,6 +3748,7 @@
           : 0;
 
   if (cm->show_existing_frame) {
+    cpi->rc.this_frame_target = 0;
     if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi);
     return 1;
   }
@@ -4071,6 +4072,7 @@
 #endif
 
   if (cm->show_existing_frame) {
+    rc->this_frame_target = 0;
     if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi);
     return;
   }
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index ae84dd5..5fdca8d 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -2887,7 +2887,9 @@
 
 #define FRAMES_TO_CHECK_DECAY 8
 #define MIN_KF_TOT_BOOST 300
-#define KF_BOOST_SCAN_MAX_FRAMES 32
+#define DEFAULT_SCAN_FRAMES_FOR_KF_BOOST 32
+#define MAX_SCAN_FRAMES_FOR_KF_BOOST 48
+#define MIN_SCAN_FRAMES_FOR_KF_BOOST 32
 #define KF_ABS_ZOOM_THRESH 6.0
 
 #ifdef AGGRESSIVE_VBR
@@ -2911,6 +2913,12 @@
   int kf_bits = 0;
   double decay_accumulator = 1.0;
   double zero_motion_accumulator = 1.0;
+  double zero_motion_sum = 0.0;
+  double zero_motion_avg;
+  double motion_compensable_sum = 0.0;
+  double motion_compensable_avg;
+  int num_frames = 0;
+  int kf_boost_scan_frames = DEFAULT_SCAN_FRAMES_FOR_KF_BOOST;
   double boost_score = 0.0;
   double kf_mod_err = 0.0;
   double kf_raw_err = 0.0;
@@ -3063,13 +3071,34 @@
   // how many bits to spend on it.
   boost_score = 0.0;
 
+  for (i = 0; i < VPXMIN(MAX_SCAN_FRAMES_FOR_KF_BOOST, (rc->frames_to_key - 1));
+       ++i) {
+    if (EOF == input_stats(twopass, &next_frame)) break;
+
+    zero_motion_sum += next_frame.pcnt_inter - next_frame.pcnt_motion;
+    motion_compensable_sum +=
+        1 - (double)next_frame.coded_error / next_frame.intra_error;
+    num_frames++;
+  }
+
+  if (num_frames >= MIN_SCAN_FRAMES_FOR_KF_BOOST) {
+    zero_motion_avg = zero_motion_sum / num_frames;
+    motion_compensable_avg = motion_compensable_sum / num_frames;
+    kf_boost_scan_frames = (int)(VPXMAX(64 * zero_motion_avg - 16,
+                                        160 * motion_compensable_avg - 112));
+    kf_boost_scan_frames =
+        VPXMAX(VPXMIN(kf_boost_scan_frames, MAX_SCAN_FRAMES_FOR_KF_BOOST),
+               MIN_SCAN_FRAMES_FOR_KF_BOOST);
+  }
+  reset_fpf_position(twopass, start_position);
+
   for (i = 0; i < (rc->frames_to_key - 1); ++i) {
     if (EOF == input_stats(twopass, &next_frame)) break;
 
     // The zero motion test here insures that if we mark a kf group as static
     // it is static throughout not just the first KF_BOOST_SCAN_MAX_FRAMES.
     // It also allows for a larger boost on long static groups.
-    if ((i <= KF_BOOST_SCAN_MAX_FRAMES) || (zero_motion_accumulator >= 0.99)) {
+    if ((i <= kf_boost_scan_frames) || (zero_motion_accumulator >= 0.99)) {
       double frame_boost;
       double zm_factor;