vp9-rtc: tx_size selection for intra mode in nonrd

In nonrd_pickmode for intra modes: add tx_size selection
based on Y prediction signal for the bsize.

The tx selection is done in model_rd, same as inter-modes.

Existing code for intra mode was first setting a tx_size based
only on the bsize, and then in some cases in block_yrd
(during the loop over bsize in units of tx_size) the tx_size
may be set again if model_rd is called in block_yrd.

This CL separates out the tx_size setting (based on Y channel
prediction via model_rd), and then block_yrd is called once
for whole bsize. This allows for better tuning of the tx
selection for intra modes in future change.

Adjust threshold in svc datarate test.

Negligible/neutral change in psnr/ssim metrics
for speed 7 and 8, 1 layer and SVC mode.

Change-Id: I33bc8447afdc3785482e13aac5c3636e13c59644
diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc
index bb9b075..6936b56 100644
--- a/test/svc_datarate_test.cc
+++ b/test/svc_datarate_test.cc
@@ -937,7 +937,7 @@
   layer_framedrop_ = GET_PARAM(2);
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.65,
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.64,
                           1.45);
 #if CONFIG_VP9_DECODER
   // The non-reference frames are expected to be mismatched frames as the
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index fede917..67c42f0 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -349,6 +349,7 @@
 static TX_SIZE calculate_tx_size(VP9_COMP *const cpi, BLOCK_SIZE bsize,
                                  MACROBLOCKD *const xd, unsigned int var,
                                  unsigned int sse, int64_t ac_thr) {
+  // TODO(marpan): Tune selection for intra-modes, screen content, etc.
   TX_SIZE tx_size;
   if (cpi->common.tx_mode == TX_MODE_SELECT) {
     if (sse > (var << 2))
@@ -356,13 +357,11 @@
                        tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
     else
       tx_size = TX_8X8;
-
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && var > 0 &&
         cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id))
       tx_size = TX_8X8;
-    else if (tx_size > TX_16X16)
+    else if (tx_size > TX_16X16 && var > 0)
       tx_size = TX_16X16;
-
     // For screen-content force 4X4 tx_size over 8X8, for large variance.
     if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && tx_size == TX_8X8 &&
         bsize <= BLOCK_16X16 && var > (ac_thr << 6))
@@ -371,10 +370,52 @@
     tx_size = VPXMIN(max_txsize_lookup[bsize],
                      tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
   }
-
   return tx_size;
 }
 
+static void compute_intra_yprediction(PREDICTION_MODE mode, BLOCK_SIZE bsize,
+                                      MACROBLOCK *x, MACROBLOCKD *xd) {
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  struct macroblock_plane *const p = &x->plane[0];
+  uint8_t *const src_buf_base = p->src.buf;
+  uint8_t *const dst_buf_base = pd->dst.buf;
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
+  const TX_SIZE tx_size = max_txsize_lookup[bsize];
+  const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+  int row, col;
+  // If mb_to_right_edge is < 0 we are in a situation in which
+  // the current block size extends into the UMV and we won't
+  // visit the sub blocks that are wholly within the UMV.
+  const int max_blocks_wide =
+      num_4x4_w + (xd->mb_to_right_edge >= 0
+                       ? 0
+                       : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+  const int max_blocks_high =
+      num_4x4_h + (xd->mb_to_bottom_edge >= 0
+                       ? 0
+                       : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+  // Keep track of the row and column of the blocks we use so that we know
+  // if we are in the unrestricted motion border.
+  for (row = 0; row < max_blocks_high; row += (1 << tx_size)) {
+    // Skip visiting the sub blocks that are wholly within the UMV.
+    for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) {
+      p->src.buf = &src_buf_base[4 * (row * src_stride + col)];
+      pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];
+      vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode,
+                              x->skip_encode ? p->src.buf : pd->dst.buf,
+                              x->skip_encode ? src_stride : dst_stride,
+                              pd->dst.buf, dst_stride, col, row, 0);
+    }
+  }
+  p->src.buf = src_buf_base;
+  pd->dst.buf = dst_buf_base;
+}
+
 static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
                                     MACROBLOCK *x, MACROBLOCKD *xd,
                                     int *out_rate_sum, int64_t *out_dist_sum,
@@ -1016,7 +1057,6 @@
 
   if (plane == 0) {
     int64_t this_sse = INT64_MAX;
-    // TODO(jingning): This needs further refactoring.
     block_yrd(cpi, x, &this_rdc, &args->skippable, &this_sse, bsize_tx,
               VPXMIN(tx_size, TX_16X16), 0);
   } else {
@@ -2291,7 +2331,6 @@
       this_sse = (int64_t)sse_y;
       block_yrd(cpi, x, &this_rdc, &is_skippable, &this_sse, bsize,
                 VPXMIN(mi->tx_size, TX_16X16), rd_computed);
-
       x->skip_txfm[0] = is_skippable;
       if (is_skippable) {
         this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
@@ -2466,13 +2505,12 @@
        bsize <= cpi->sf.max_intra_bsize && !x->skip_low_source_sad &&
        !x->lowvar_highsumdiff)) {
     struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
+    int64_t this_sse = INT64_MAX;
     int i;
     PRED_BUFFER *const best_pred = best_pickmode.best_pred;
     TX_SIZE intra_tx_size =
         VPXMIN(max_txsize_lookup[bsize],
                tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
-    if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && intra_tx_size > TX_16X16)
-      intra_tx_size = TX_16X16;
 
     if (reuse_inter_pred && best_pred != NULL) {
       if (best_pred->data == orig_dst.buf) {
@@ -2533,8 +2571,13 @@
       args.skippable = 1;
       args.rdc = &this_rdc;
       mi->tx_size = intra_tx_size;
-      vp9_foreach_transformed_block_in_plane(xd, bsize, 0, estimate_block_intra,
-                                             &args);
+
+      compute_intra_yprediction(this_mode, bsize, x, xd);
+      model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+                        &var_y, &sse_y);
+      block_yrd(cpi, x, &this_rdc, &args.skippable, &this_sse, bsize,
+                VPXMIN(mi->tx_size, TX_16X16), 1);
+
       // Check skip cost here since skippable is not set for for uv, this
       // mirrors the behavior used by inter
       if (args.skippable) {