Reduce call num of exhaustive search

The encoding time difference between non_greedy_mv and baseline
is reduced from 51% to 13%

However, there is also a performance impact.

non_greedy_mv performance:
Before this CL
lowres 0.395% midres 0.716% hdres 0.533%
After this CL
lowres 0.242% midres 0.429% hdres 0.305%

Change-Id: I047d6509df504b264981c0b903c0cc955f45b273
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 08d6d4d..868cd43 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -5937,9 +5937,8 @@
   (void)sadpb;
   nb_full_mv_num = vp9_prepare_nb_full_mvs(&cpi->tpl_stats[frame_idx], mi_row,
                                            mi_col, rf_idx, bsize, nb_full_mvs);
-  vp9_full_pixel_diamond_new(cpi, x, &best_ref_mv1_full, step_param, lambda, 1,
-                             &cpi->fn_ptr[bsize], nb_full_mvs, nb_full_mv_num,
-                             mv);
+  vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param,
+                             lambda, 1, nb_full_mvs, nb_full_mv_num, mv);
 #else
   (void)frame_idx;
   (void)mi_row;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 0e44bb4..b6e3090 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -2437,16 +2437,24 @@
   return best_sad;
 }
 
+static int get_exhaustive_threshold(int exhaustive_searches_thresh,
+                                    BLOCK_SIZE bsize) {
+  return exhaustive_searches_thresh >>
+         (8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]));
+}
+
 #if CONFIG_NON_GREEDY_MV
 // Runs sequence of diamond searches in smaller steps for RD.
 /* do_refine: If last step (1-away) of n-step search doesn't pick the center
               point as the best match, we will do a final 1-away diamond
               refining search  */
-int vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
-                               int step_param, int lambda, int do_refine,
-                               const vp9_variance_fn_ptr_t *fn_ptr,
+int vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x,
+                               BLOCK_SIZE bsize, MV *mvp_full, int step_param,
+                               int lambda, int do_refine,
                                const int_mv *nb_full_mvs, int full_mv_num,
                                MV *best_mv) {
+  const vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
+  const SPEED_FEATURES *const sf = &cpi->sf;
   int n, num00 = 0;
   int thissme;
   int bestsme;
@@ -2495,9 +2503,16 @@
     }
   }
 
-  full_pixel_exhaustive_new(cpi, x, best_mv, fn_ptr, best_mv, lambda,
-                            nb_full_mvs, full_mv_num);
-  bestsme = vp9_get_mvpred_var(x, best_mv, &center_mv, fn_ptr, 0);
+  if (sf->exhaustive_searches_thresh < INT_MAX &&
+      !cpi->rc.is_src_frame_alt_ref) {
+    const int64_t exhaustive_thr =
+        get_exhaustive_threshold(sf->exhaustive_searches_thresh, bsize);
+    if (bestsme > exhaustive_thr) {
+      full_pixel_exhaustive_new(cpi, x, best_mv, fn_ptr, best_mv, lambda,
+                                nb_full_mvs, full_mv_num);
+      bestsme = vp9_get_mvpred_var(x, best_mv, &center_mv, fn_ptr, 0);
+    }
+  }
   return bestsme;
 }
 #endif  // CONFIG_NON_GREEDY_MV
@@ -2886,9 +2901,10 @@
     if (sf->exhaustive_searches_thresh < INT_MAX &&
         !cpi->rc.is_src_frame_alt_ref) {
       const int64_t exhaustive_thr =
-          sf->exhaustive_searches_thresh >>
-          (8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]));
-      if (var > exhaustive_thr) run_exhaustive_search = 1;
+          get_exhaustive_threshold(sf->exhaustive_searches_thresh, bsize);
+      if (var > exhaustive_thr) {
+        run_exhaustive_search = 1;
+      }
     }
   } else if (method == MESH) {
     run_exhaustive_search = 1;
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 424ca62..6f46041 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -136,9 +136,8 @@
                                     const int_mv *nb_full_mvs, int full_mv_num);
 
 int vp9_full_pixel_diamond_new(const struct VP9_COMP *cpi, MACROBLOCK *x,
-                               MV *mvp_full, int step_param, int lambda,
-                               int do_refine,
-                               const vp9_variance_fn_ptr_t *fn_ptr,
+                               BLOCK_SIZE bsize, MV *mvp_full, int step_param,
+                               int lambda, int do_refine,
                                const int_mv *nb_full_mvs, int full_mv_num,
                                MV *best_mv);
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 65b9435..e243e03 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2579,9 +2579,9 @@
   mvp_full.row >>= 3;
 
 #if CONFIG_NON_GREEDY_MV
-  bestsme = vp9_full_pixel_diamond_new(cpi, x, &mvp_full, step_param, lambda, 1,
-                                       &cpi->fn_ptr[bsize], nb_full_mvs,
-                                       nb_full_mv_num, &tmp_mv->as_mv);
+  bestsme = vp9_full_pixel_diamond_new(cpi, x, bsize, &mvp_full, step_param,
+                                       lambda, 1, nb_full_mvs, nb_full_mv_num,
+                                       &tmp_mv->as_mv);
 #else   // CONFIG_NON_GREEDY_MV
   bestsme = vp9_full_pixel_search(
       cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb,
@@ -2617,9 +2617,9 @@
       mvp_full.row >>= 3;
 #if CONFIG_NON_GREEDY_MV
       this_me = vp9_full_pixel_diamond_new(
-          cpi, x, &mvp_full, VPXMAX(step_param, MAX_MVSEARCH_STEPS - step),
-          lambda, 1, &cpi->fn_ptr[bsize], nb_full_mvs, nb_full_mv_num,
-          &this_mv);
+          cpi, x, bsize, &mvp_full,
+          VPXMAX(step_param, MAX_MVSEARCH_STEPS - step), lambda, 1, nb_full_mvs,
+          nb_full_mv_num, &this_mv);
 #else   // CONFIG_NON_GREEDY_MV
       this_me = vp9_full_pixel_search(
           cpi, x, bsize, &mvp_full,