Merge changes Icec98e6f,I63614e65,I25ea05f4

* changes:
  Add full_pixel_exhaustive_new
  Add sse cost in vp9_full_pixel_diamond_new
  Use motion field for mv inconsistency in mv search
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 5a6717a..05f3f28 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1732,6 +1732,9 @@
   return best_sad;
 }
 
+#define MIN_RANGE 7
+#define MAX_RANGE 256
+#define MIN_INTERVAL 1
 #if CONFIG_NON_GREEDY_MV
 double vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_mvs,
                                 int mv_num) {
@@ -1757,6 +1760,152 @@
   return best_cost;
 }
 
+static double exhuastive_mesh_search_new(const MACROBLOCK *x, MV *best_mv,
+                                         int range, int step,
+                                         const vp9_variance_fn_ptr_t *fn_ptr,
+                                         const MV *center_mv, double lambda,
+                                         const int_mv *nb_full_mvs,
+                                         int full_mv_num) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  MV fcenter_mv = { center_mv->row, center_mv->col };
+  double best_sad;
+  int r, c, i;
+  int start_col, end_col, start_row, end_row;
+  int col_step = (step > 1) ? step : 4;
+
+  assert(step >= 1);
+
+  clamp_mv(&fcenter_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+           x->mv_limits.row_min, x->mv_limits.row_max);
+  *best_mv = fcenter_mv;
+  best_sad =
+      fn_ptr->sdf(what->buf, what->stride,
+                  get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) +
+      lambda * vp9_nb_mvs_inconsistency(&fcenter_mv, nb_full_mvs, full_mv_num);
+  start_row = VPXMAX(-range, x->mv_limits.row_min - fcenter_mv.row);
+  start_col = VPXMAX(-range, x->mv_limits.col_min - fcenter_mv.col);
+  end_row = VPXMIN(range, x->mv_limits.row_max - fcenter_mv.row);
+  end_col = VPXMIN(range, x->mv_limits.col_max - fcenter_mv.col);
+
+  for (r = start_row; r <= end_row; r += step) {
+    for (c = start_col; c <= end_col; c += col_step) {
+      // Step > 1 means we are not checking every location in this pass.
+      if (step > 1) {
+        const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c };
+        double sad =
+            fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
+                        in_what->stride);
+        if (sad < best_sad) {
+          sad +=
+              lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
+          if (sad < best_sad) {
+            best_sad = sad;
+            *best_mv = mv;
+          }
+        }
+      } else {
+        // 4 sads in a single call if we are checking every location
+        if (c + 3 <= end_col) {
+          unsigned int sads[4];
+          const uint8_t *addrs[4];
+          for (i = 0; i < 4; ++i) {
+            const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+            addrs[i] = get_buf_from_mv(in_what, &mv);
+          }
+          fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads);
+
+          for (i = 0; i < 4; ++i) {
+            if (sads[i] < best_sad) {
+              const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+              const double sad =
+                  sads[i] + lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs,
+                                                              full_mv_num);
+              if (sad < best_sad) {
+                best_sad = sad;
+                *best_mv = mv;
+              }
+            }
+          }
+        } else {
+          for (i = 0; i < end_col - c; ++i) {
+            const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+            double sad =
+                fn_ptr->sdf(what->buf, what->stride,
+                            get_buf_from_mv(in_what, &mv), in_what->stride);
+            if (sad < best_sad) {
+              sad += lambda *
+                     vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
+              if (sad < best_sad) {
+                best_sad = sad;
+                *best_mv = mv;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return best_sad;
+}
+
+static double full_pixel_exhaustive_new(const VP9_COMP *cpi, MACROBLOCK *x,
+                                        MV *centre_mv_full,
+                                        const vp9_variance_fn_ptr_t *fn_ptr,
+                                        MV *dst_mv, double lambda,
+                                        const int_mv *nb_full_mvs,
+                                        int full_mv_num) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MV temp_mv = { centre_mv_full->row, centre_mv_full->col };
+  double bestsme;
+  int i;
+  int interval = sf->mesh_patterns[0].interval;
+  int range = sf->mesh_patterns[0].range;
+  int baseline_interval_divisor;
+  const MV dummy_mv = { 0, 0 };
+
+  // Trap illegal values for interval and range for this function.
+  if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) ||
+      (interval > range)) {
+    printf("ERROR: invalid range\n");
+    assert(0);
+  }
+
+  baseline_interval_divisor = range / interval;
+
+  // Check size of proposed first range against magnitude of the centre
+  // value used as a starting point.
+  range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4);
+  range = VPXMIN(range, MAX_RANGE);
+  interval = VPXMAX(interval, range / baseline_interval_divisor);
+
+  // initial search
+  bestsme =
+      exhuastive_mesh_search_new(x, &temp_mv, range, interval, fn_ptr, &temp_mv,
+                                 lambda, nb_full_mvs, full_mv_num);
+
+  if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
+    // Progressive searches with range and step size decreasing each time
+    // till we reach a step size of 1. Then break out.
+    for (i = 1; i < MAX_MESH_STEP; ++i) {
+      // First pass with coarser step and longer range
+      bestsme = exhuastive_mesh_search_new(
+          x, &temp_mv, sf->mesh_patterns[i].range,
+          sf->mesh_patterns[i].interval, fn_ptr, &temp_mv, lambda, nb_full_mvs,
+          full_mv_num);
+
+      if (sf->mesh_patterns[i].interval == 1) break;
+    }
+  }
+
+  bestsme = vp9_get_mvpred_var(x, &temp_mv, &dummy_mv, fn_ptr, 0);
+  *dst_mv = temp_mv;
+
+  return bestsme;
+}
+
 double vp9_diamond_search_sad_new(const MACROBLOCK *x,
                                   const search_site_config *cfg,
                                   const MV *init_full_mv, MV *best_full_mv,
@@ -2279,11 +2428,14 @@
   double thissme;
   double bestsme;
   const int further_steps = MAX_MVSEARCH_STEPS - 1 - step_param;
+  const MV center_mv = { 0, 0 };
   vpx_clear_system_state();
   bestsme = vp9_diamond_search_sad_new(
       x, &cpi->ss_cfg, mvp_full, best_mv, best_mv_dist, best_mv_cost,
       step_param, lambda, &n, fn_ptr, nb_full_mvs, full_mv_num);
 
+  bestsme = vp9_get_mvpred_var(x, best_mv, &center_mv, fn_ptr, 0);
+
   // If there won't be more n-step search, check to see if refining search is
   // needed.
   if (n > further_steps) do_refine = 0;
@@ -2299,6 +2451,7 @@
       thissme = vp9_diamond_search_sad_new(
           x, &cpi->ss_cfg, mvp_full, &temp_mv, &mv_dist, &mv_cost,
           step_param + n, lambda, &num00, fn_ptr, nb_full_mvs, full_mv_num);
+      thissme = vp9_get_mvpred_var(x, &temp_mv, &center_mv, fn_ptr, 0);
       // check to see if refining search is needed.
       if (num00 > further_steps - n) do_refine = 0;
 
@@ -2320,6 +2473,7 @@
     thissme = vp9_refining_search_sad_new(x, &temp_mv, &mv_dist, &mv_cost,
                                           lambda, search_range, fn_ptr,
                                           nb_full_mvs, full_mv_num);
+    thissme = vp9_get_mvpred_var(x, &temp_mv, &center_mv, fn_ptr, 0);
     if (thissme < bestsme) {
       bestsme = thissme;
       *best_mv = temp_mv;
@@ -2327,6 +2481,9 @@
       *best_mv_cost = mv_cost;
     }
   }
+
+  bestsme = full_pixel_exhaustive_new(cpi, x, best_mv, fn_ptr, best_mv, lambda,
+                                      nb_full_mvs, full_mv_num);
   return bestsme;
 }
 #endif  // CONFIG_NON_GREEDY_MV
@@ -2395,9 +2552,6 @@
   return bestsme;
 }
 
-#define MIN_RANGE 7
-#define MAX_RANGE 256
-#define MIN_INTERVAL 1
 // Runs an limited range exhaustive mesh search using a pattern set
 // according to the encode speed profile.
 static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x,
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index ab69afd..ab880ff 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -148,6 +148,31 @@
 void vp9_prepare_nb_full_mvs(const struct TplDepFrame *tpl_frame, int mi_row,
                              int mi_col, int rf_idx, BLOCK_SIZE bsize,
                              int_mv *nb_full_mvs);
+
+static INLINE BLOCK_SIZE get_square_block_size(BLOCK_SIZE bsize) {
+  BLOCK_SIZE square_bsize;
+  switch (bsize) {
+    case BLOCK_4X4:
+    case BLOCK_4X8:
+    case BLOCK_8X4: square_bsize = BLOCK_4X4; break;
+    case BLOCK_8X8:
+    case BLOCK_8X16:
+    case BLOCK_16X8: square_bsize = BLOCK_8X8; break;
+    case BLOCK_16X16:
+    case BLOCK_16X32:
+    case BLOCK_32X16: square_bsize = BLOCK_16X16; break;
+    case BLOCK_32X32:
+    case BLOCK_32X64:
+    case BLOCK_64X32:
+    case BLOCK_64X64: square_bsize = BLOCK_32X32; break;
+    default:
+      square_bsize = BLOCK_INVALID;
+      printf("ERROR: invlid block size %d\n", bsize);
+      assert(0);
+      break;
+  }
+  return square_bsize;
+}
 #endif  // CONFIG_NON_GREEDY_MV
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index debe88f..c1a079f 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2316,59 +2316,20 @@
 }
 
 #if CONFIG_NON_GREEDY_MV
-#define MAX_PREV_NB_FULL_MV_NUM 8
-static int find_prev_nb_full_mvs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                                 int ref_frame, BLOCK_SIZE bsize, int mi_row,
-                                 int mi_col, int_mv *nb_full_mvs) {
-  int i;
-  const TileInfo *tile = &xd->tile;
-  int full_mv_num = 0;
-  assert(bsize >= BLOCK_8X8);
-  for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
-    const POSITION *mv_ref = &mv_ref_blocks[bsize][i];
-    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
-      const MODE_INFO *nb_mi =
-          xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
-      if (nb_mi->sb_type >= BLOCK_8X8) {
-        if (nb_mi->ref_frame[0] == ref_frame) {
-          nb_full_mvs[full_mv_num].as_mv = get_full_mv(&nb_mi->mv[0].as_mv);
-          ++full_mv_num;
-          if (full_mv_num >= MAX_PREV_NB_FULL_MV_NUM) {
-            return full_mv_num;
-          }
-        } else if (nb_mi->ref_frame[1] == ref_frame) {
-          nb_full_mvs[full_mv_num].as_mv = get_full_mv(&nb_mi->mv[1].as_mv);
-          ++full_mv_num;
-          if (full_mv_num >= MAX_PREV_NB_FULL_MV_NUM) {
-            return full_mv_num;
-          }
-        }
-      } else {
-        int j;
-        for (j = 0; j < 4; ++j) {
-          // TODO(angiebird): avoid using duplicated mvs
-          if (nb_mi->ref_frame[0] == ref_frame) {
-            nb_full_mvs[full_mv_num].as_mv =
-                get_full_mv(&nb_mi->bmi[j].as_mv[0].as_mv);
-            ++full_mv_num;
-            if (full_mv_num >= MAX_PREV_NB_FULL_MV_NUM) {
-              return full_mv_num;
-            }
-          } else if (nb_mi->ref_frame[1] == ref_frame) {
-            nb_full_mvs[full_mv_num].as_mv =
-                get_full_mv(&nb_mi->bmi[j].as_mv[1].as_mv);
-            ++full_mv_num;
-            if (full_mv_num >= MAX_PREV_NB_FULL_MV_NUM) {
-              return full_mv_num;
-            }
-          }
-        }
-      }
-    }
+static int ref_frame_to_gf_rf_idx(int ref_frame) {
+  if (ref_frame == GOLDEN_FRAME) {
+    return 0;
   }
-  return full_mv_num;
+  if (ref_frame == LAST_FRAME) {
+    return 1;
+  }
+  if (ref_frame == ALTREF_FRAME) {
+    return 2;
+  }
+  assert(0);
+  return -1;
 }
-#endif  // CONFIG_NON_GREEDY_MV
+#endif
 
 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                                  int mi_row, int mi_col, int_mv *tmp_mv,
@@ -2395,10 +2356,13 @@
   double mv_cost = 0;
   double lambda = (pw * ph) / 4;
   double bestsme;
-  int_mv nb_full_mvs[MAX_PREV_NB_FULL_MV_NUM];
-
-  const int nb_full_mv_num =
-      find_prev_nb_full_mvs(cm, xd, ref, bsize, mi_row, mi_col, nb_full_mvs);
+  int_mv nb_full_mvs[NB_MVS_NUM];
+  const int nb_full_mv_num = NB_MVS_NUM;
+  int gf_group_idx = cpi->twopass.gf_group.index;
+  int gf_rf_idx = ref_frame_to_gf_rf_idx(ref);
+  BLOCK_SIZE square_bsize = get_square_block_size(bsize);
+  vp9_prepare_nb_full_mvs(&cpi->tpl_stats[gf_group_idx], mi_row, mi_col,
+                          gf_rf_idx, square_bsize, nb_full_mvs);
 #else   // CONFIG_NON_GREEDY_MV
   int bestsme = INT_MAX;
   int sadpb = x->sadperbit16;