Merge "Loopfilter Multi-Thread Optimization"
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 8f6c5cd..c4b3922 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -789,7 +789,7 @@
}
}
-const int kNumFilterBanks = 4;
+const int kNumFilterBanks = 5;
const int kNumFilters = 16;
TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index fc833bc..e89f71e 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -1474,7 +1474,7 @@
// QP threshold: only allow dropping if we are not close to qp_max.
int thresh_qp = 3 * cpi->worst_quality >> 2;
// Rate threshold, in bytes.
- int thresh_rate = 2 * (cpi->av_per_frame_bandwidth >> 3);
+ int thresh_rate = 3 * (cpi->av_per_frame_bandwidth >> 3);
// Threshold for the average (over all macroblocks) of the pixel-sum
// residual error over 16x16 block.
int thresh_pred_err_mb = (200 << 4);
diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c
index 6c43af8..cadae6f 100644
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -63,6 +63,20 @@
{ 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 1, 38, 64, 32, -1, -3 }
};
-const InterpKernel *vp9_filter_kernels[4] = {
- sub_pel_filters_8, sub_pel_filters_8lp, sub_pel_filters_8s, bilinear_filters
+// 4-tap sub-pixel interpolation kernels, one row per sub-pel shift
+// (SUBPEL_SHIFTS rows). Each row has eight entries so it can be used wherever
+// an InterpKernel is expected, but only entries 2..5 are ever non-zero.
+// The coefficients of every row sum to 128; row 0 is the identity kernel
+// (128 at index 3), and rows 9..15 mirror rows 7..1.
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_4[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -3, 125, 8, -2, 0, 0 },
+ { 0, 0, -6, 120, 18, -4, 0, 0 }, { 0, 0, -8, 115, 27, -6, 0, 0 },
+ { 0, 0, -10, 108, 37, -7, 0, 0 }, { 0, 0, -11, 101, 47, -9, 0, 0 },
+ { 0, 0, -11, 93, 56, -10, 0, 0 }, { 0, 0, -12, 85, 66, -11, 0, 0 },
+ { 0, 0, -11, 75, 75, -11, 0, 0 }, { 0, 0, -11, 66, 85, -12, 0, 0 },
+ { 0, 0, -10, 56, 93, -11, 0, 0 }, { 0, 0, -9, 47, 101, -11, 0, 0 },
+ { 0, 0, -7, 37, 108, -10, 0, 0 }, { 0, 0, -6, 27, 115, -8, 0, 0 },
+ { 0, 0, -4, 18, 120, -6, 0, 0 }, { 0, 0, -2, 8, 125, -3, 0, 0 }
+};
+
+// Kernel tables in filter-index order: per vp9_filter.h, BILINEAR (3) selects
+// bilinear_filters and FOURTAP (4) selects sub_pel_filters_4. The array size
+// has to stay in sync with the extern declaration in vp9_filter.h.
+const InterpKernel *vp9_filter_kernels[5] = {
+ sub_pel_filters_8, sub_pel_filters_8lp, sub_pel_filters_8s, bilinear_filters,
+ sub_pel_filters_4
};
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index b379665..0382c88 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -25,6 +25,7 @@
#define EIGHTTAP_SHARP 2
#define SWITCHABLE_FILTERS 3 /* Number of switchable filters */
#define BILINEAR 3
+#define FOURTAP 4
// The codec can operate in four possible inter prediction filter mode:
// 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
@@ -32,7 +33,7 @@
typedef uint8_t INTERP_FILTER;
-extern const InterpKernel *vp9_filter_kernels[4];
+extern const InterpKernel *vp9_filter_kernels[5];
#ifdef __cplusplus
} // extern "C"
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 0613058..563fdbb 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -211,6 +211,8 @@
#if CONFIG_ML_VAR_PARTITION
DECLARE_ALIGNED(16, uint8_t, est_pred[64 * 64]);
#endif // CONFIG_ML_VAR_PARTITION
+
+ struct scale_factors *me_sf;
};
#ifdef __cplusplus
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index ad30951..72dc137 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -3405,30 +3405,29 @@
MODE_INFO *mi = xd->mi[0];
const NN_CONFIG *nn_config = NULL;
DECLARE_ALIGNED(16, uint8_t, pred_buf[64 * 64]);
+ const int speed = cpi->oxcf.speed;
int i;
- float thresh_low = -1.0f;
- float thresh_high = 0.0f;
+ float thresh = 0.0f;
switch (bsize) {
case BLOCK_64X64:
nn_config = &vp9_var_rd_part_nnconfig_64;
- thresh_low = -3.0f;
- thresh_high = 3.0f;
+ thresh = speed > 0 ? 3.5f : 3.0f;
break;
case BLOCK_32X32:
nn_config = &vp9_var_rd_part_nnconfig_32;
- thresh_low = -3.0;
- thresh_high = 3.0f;
+ thresh = speed > 0 ? 3.5f : 3.0f;
break;
case BLOCK_16X16:
nn_config = &vp9_var_rd_part_nnconfig_16;
- thresh_low = -4.0;
- thresh_high = 4.0f;
+ thresh = speed > 0 ? 3.5f : 4.0f;
break;
case BLOCK_8X8:
nn_config = &vp9_var_rd_part_nnconfig_8;
- thresh_low = -2.0;
- thresh_high = 2.0f;
+ if (cm->width >= 720 && cm->height >= 720)
+ thresh = speed > 0 ? 2.5f : 2.0f;
+ else
+ thresh = speed > 0 ? 3.5f : 2.0f;
break;
default: assert(0 && "Unexpected block size."); return;
}
@@ -3520,8 +3519,8 @@
// partition is better than the non-split partition. So if the score is
// high enough, we skip the none-split partition search; if the score is
// low enough, we skip the split partition search.
- if (score > thresh_high) *none = 0;
- if (score < thresh_low) *split = 0;
+ if (score > thresh) *none = 0;
+ if (score < -thresh) *split = 0;
}
}
#undef FEATURES
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index edb4cb2..4a55e08 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2449,6 +2449,17 @@
vp9_loop_filter_init(cm);
+ // Set up the unit scaling factor used during motion search.
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height,
+ cm->width, cm->height,
+ cm->use_highbitdepth);
+#else
+ vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height,
+ cm->width, cm->height);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ cpi->td.mb.me_sf = &cpi->me_sf;
+
cm->error.setjmp = 0;
return cpi;
@@ -3217,8 +3228,8 @@
if (cpi->oxcf.pass == 0 && !cpi->use_svc) {
// Check for release of scaled reference.
buf_idx = cpi->scaled_ref_idx[ref_frame - 1];
- buf = (buf_idx != INVALID_IDX) ? &pool->frame_bufs[buf_idx] : NULL;
- if (buf != NULL) {
+ if (buf_idx != INVALID_IDX) {
+ buf = &pool->frame_bufs[buf_idx];
--buf->ref_count;
cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
}
@@ -3249,22 +3260,21 @@
refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
const int idx = cpi->scaled_ref_idx[i - 1];
- RefCntBuffer *const buf =
- idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL;
- const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i);
- if (buf != NULL &&
- (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width &&
- buf->buf.y_crop_height == ref->y_crop_height))) {
- --buf->ref_count;
- cpi->scaled_ref_idx[i - 1] = INVALID_IDX;
+ if (idx != INVALID_IDX) {
+ RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx];
+ const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i);
+ if (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width &&
+ buf->buf.y_crop_height == ref->y_crop_height)) {
+ --buf->ref_count;
+ cpi->scaled_ref_idx[i - 1] = INVALID_IDX;
+ }
}
}
} else {
for (i = 0; i < MAX_REF_FRAMES; ++i) {
const int idx = cpi->scaled_ref_idx[i];
- RefCntBuffer *const buf =
- idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL;
- if (buf != NULL) {
+ if (idx != INVALID_IDX) {
+ RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx];
--buf->ref_count;
cpi->scaled_ref_idx[i] = INVALID_IDX;
}
@@ -4910,6 +4920,8 @@
cm->new_fb_idx = INVALID_IDX;
for (i = 0; i < REF_FRAMES; ++i) {
cm->ref_frame_map[i] = INVALID_IDX;
+ }
+ for (i = 0; i < FRAME_BUFFERS; ++i) {
pool->frame_bufs[i].ref_count = 0;
}
}
@@ -5451,7 +5463,7 @@
#if CONFIG_NON_GREEDY_MV
static void prepare_nb_full_mvs(const TplDepFrame *tpl_frame, int mi_row,
- int mi_col, int_mv *nb_full_mvs) {
+ int mi_col, int rf_idx, int_mv *nb_full_mvs) {
const int dirs[NB_MVS_NUM][2] = { { -1, 0 }, { 0, -1 }, { 1, 0 }, { 0, 1 } };
int i;
for (i = 0; i < NB_MVS_NUM; ++i) {
@@ -5462,9 +5474,9 @@
const TplDepStats *tpl_ptr =
&tpl_frame
->tpl_stats_ptr[(mi_row + r) * tpl_frame->stride + mi_col + c];
- if (tpl_ptr->ready) {
- nb_full_mvs[i].as_mv.row = tpl_ptr->mv.as_mv.row >> 3;
- nb_full_mvs[i].as_mv.col = tpl_ptr->mv.as_mv.col >> 3;
+ if (tpl_ptr->ready[rf_idx]) {
+ nb_full_mvs[i].as_mv.row = tpl_ptr->mv_arr[rf_idx].as_mv.row >> 3;
+ nb_full_mvs[i].as_mv.col = tpl_ptr->mv_arr[rf_idx].as_mv.col >> 3;
} else {
nb_full_mvs[i].as_int = INVALID_MV;
}
@@ -5503,7 +5515,7 @@
#if CONFIG_NON_GREEDY_MV
// lambda is used to adjust the importance of motion vector consitency.
// TODO(angiebird): Figure out lambda's proper value.
- double lambda = 10000;
+ double lambda = cpi->tpl_stats[frame_idx].lambda;
int_mv nb_full_mvs[NB_MVS_NUM];
#endif
@@ -5527,7 +5539,8 @@
#if CONFIG_NON_GREEDY_MV
(void)search_method;
(void)sadpb;
- prepare_nb_full_mvs(&cpi->tpl_stats[frame_idx], mi_row, mi_col, nb_full_mvs);
+ prepare_nb_full_mvs(&cpi->tpl_stats[frame_idx], mi_row, mi_col, rf_idx,
+ nb_full_mvs);
vp9_full_pixel_diamond_new(cpi, x, &best_ref_mv1_full, step_param, lambda,
MAX_MVSEARCH_STEPS - 1 - step_param, 1,
&cpi->fn_ptr[bsize], nb_full_mvs, tpl_stats,
@@ -5544,12 +5557,13 @@
/* restore UMV window */
x->mv_limits = tmp_mv_limits;
+ // TODO(yunqing): may use higher tap interp filter than 2 taps.
// Ignore mv costing by sending NULL pointer instead of cost array
bestsme = cpi->find_fractional_mv_step(
x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
&cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level,
- cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0,
- 0);
+ cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
+ USE_2_TAPS);
return bestsme;
}
@@ -5614,6 +5628,7 @@
#if CONFIG_NON_GREEDY_MV
int rf_idx;
for (rf_idx = 0; rf_idx < 3; ++rf_idx) {
+ tpl_ptr->ready[rf_idx] = src_stats->ready[rf_idx];
tpl_ptr->mv_dist[rf_idx] = src_stats->mv_dist[rf_idx];
tpl_ptr->mv_cost[rf_idx] = src_stats->mv_cost[rf_idx];
tpl_ptr->inter_cost_arr[rf_idx] = src_stats->inter_cost;
@@ -5622,7 +5637,6 @@
tpl_ptr->mv_arr[rf_idx].as_int = src_stats->mv_arr[rf_idx].as_int;
}
tpl_ptr->feature_score = src_stats->feature_score;
- tpl_ptr->ready = 1;
#endif
tpl_ptr->intra_cost = intra_cost;
tpl_ptr->inter_cost = inter_cost;
@@ -5763,6 +5777,16 @@
}
#endif
+// Fills x->mv_limits with the motion-vector search window for the block at
+// (mi_row, mi_col). The position is scaled by MI_SIZE, and the window reaches
+// (17 - 2 * VP9_INTERP_EXTEND) past the first and last MI row/column of the
+// frame described by cm->mi_rows and cm->mi_cols.
+static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row,
+                          int mi_col) {
+  // Extra reach permitted beyond the outermost MI position on every side.
+  const int border = 17 - 2 * VP9_INTERP_EXTEND;
+  // Number of MI rows/columns between this block and the last one.
+  const int rows_after = cm->mi_rows - 1 - mi_row;
+  const int cols_after = cm->mi_cols - 1 - mi_col;
+
+  x->mv_limits.row_min = -(mi_row * MI_SIZE + border);
+  x->mv_limits.row_max = rows_after * MI_SIZE + border;
+  x->mv_limits.col_min = -(mi_col * MI_SIZE + border);
+  x->mv_limits.col_max = cols_after * MI_SIZE + border;
+}
+
void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
struct scale_factors *sf, GF_PICTURE *gf_picture,
int frame_idx, int16_t *src_diff, tran_low_t *coeff,
@@ -5828,24 +5852,19 @@
// Motion compensated prediction
best_mv.as_int = 0;
- (void)mb_y_offset;
- // Motion estimation column boundary
- x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
- x->mv_limits.col_max =
- ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND);
-
-#if CONFIG_NON_GREEDY_MV
- tpl_stats->feature_score = get_feature_score(
- xd->cur_buf->y_buffer + mb_y_offset, xd->cur_buf->y_stride, bw, bh);
-#endif
+ set_mv_limits(cm, x, mi_row, mi_col);
for (rf_idx = 0; rf_idx < 3; ++rf_idx) {
int_mv mv;
if (ref_frame[rf_idx] == NULL) {
#if CONFIG_NON_GREEDY_MV
- tpl_stats->inter_cost_arr[rf_idx] = -1;
+ tpl_stats->ready[rf_idx] = 0;
#endif
continue;
+ } else {
+#if CONFIG_NON_GREEDY_MV
+ tpl_stats->ready[rf_idx] = 1;
+#endif
}
#if CONFIG_NON_GREEDY_MV
@@ -5954,6 +5973,9 @@
const int mi_height = num_8x8_blocks_high_lookup[bsize];
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
int64_t recon_error, sse;
+#if CONFIG_NON_GREEDY_MV
+ int rf_idx;
+#endif
// Setup scaling factor
#if CONFIG_VP9_HIGHBITDEPTH
@@ -5995,16 +6017,38 @@
cm->base_qindex = tpl_frame->base_qindex;
vp9_frame_init_quantizer(cpi);
+#if CONFIG_NON_GREEDY_MV
+ tpl_frame->lambda = 250;
+
for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
- // Motion estimation row boundary
- x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
- x->mv_limits.row_max =
- (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND);
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+ const int mb_y_offset =
+ mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+ const int bw = 4 << b_width_log2_lookup[bsize];
+ const int bh = 4 << b_height_log2_lookup[bsize];
+ TplDepStats *tpl_stats =
+ &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
+ tpl_stats->feature_score = get_feature_score(
+ xd->cur_buf->y_buffer + mb_y_offset, xd->cur_buf->y_stride, bw, bh);
+ }
+ }
+
+ for (rf_idx = 0; rf_idx < 3; ++rf_idx) {
+ tpl_frame->mv_dist_sum[rf_idx] = 0;
+ tpl_frame->mv_cost_sum[rf_idx] = 0;
+ }
+#endif
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
TplDepStats tpl_stats;
mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, src_diff, coeff,
qcoeff, dqcoeff, mi_row, mi_col, bsize, tx_size,
ref_frame, predictor, &recon_error, &sse, &tpl_stats);
+#if CONFIG_NON_GREEDY_MV
+ tpl_stats.feature_score =
+ tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]
+ .feature_score;
+#endif
// Motion flow dependency dispenser.
tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
@@ -6012,6 +6056,16 @@
tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col,
bsize);
+#if CONFIG_NON_GREEDY_MV
+ {
+ TplDepStats *this_tpl_stats =
+ &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
+ for (rf_idx = 0; rf_idx < 3; ++rf_idx) {
+ tpl_frame->mv_dist_sum[rf_idx] += this_tpl_stats->mv_dist[rf_idx];
+ tpl_frame->mv_cost_sum[rf_idx] += this_tpl_stats->mv_cost[rf_idx];
+ }
+ }
+#endif
}
}
}
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 75f177f..a9f7daf 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -291,7 +291,7 @@
int_mv mv;
#if CONFIG_NON_GREEDY_MV
- int ready;
+ int ready[3];
double mv_dist[3];
double mv_cost[3];
int64_t inter_cost_arr[3];
@@ -311,6 +311,11 @@
int mi_rows;
int mi_cols;
int base_qindex;
+#if CONFIG_NON_GREEDY_MV
+ double lambda;
+ double mv_dist_sum[3];
+ double mv_cost_sum[3];
+#endif
} TplDepFrame;
#define TPL_DEP_COST_SCALE_LOG2 4
@@ -600,6 +605,7 @@
ActiveMap active_map;
fractional_mv_step_fp *find_fractional_mv_step;
+ struct scale_factors me_sf;
vp9_diamond_search_fn_t diamond_search_sad;
vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
uint64_t time_receive_data;
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index b5f21ea..9bd0a9e 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -43,12 +43,6 @@
#define INVALID_ROW -1
-// Length of the bi-predictive frame group (BFG)
-// NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain
-// number of bi-predictive frames.
-#define BFG_INTERVAL 2
-#define MAX_EXT_ARFS 2
-#define MIN_EXT_ARF_INTERVAL 4
#define MAX_ARF_LAYERS 6
typedef struct {
@@ -200,7 +194,6 @@
struct TileDataEnc;
void vp9_init_first_pass(struct VP9_COMP *cpi);
-void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi);
void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source);
void vp9_end_first_pass(struct VP9_COMP *cpi);
@@ -219,17 +212,6 @@
void calculate_coded_size(struct VP9_COMP *cpi, int *scaled_frame_width,
int *scaled_frame_height);
-static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
- assert(MAX_EXT_ARFS > 0);
- if (arf_pending) {
- if (interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1))
- return MAX_EXT_ARFS;
- else if (interval >= MIN_EXT_ARF_INTERVAL * MAX_EXT_ARFS)
- return MAX_EXT_ARFS - 1;
- }
- return 0;
-}
-
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 2ec048b..831c79c 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -57,11 +57,12 @@
{
uint32_t distortion;
uint32_t sse;
+ // TODO(yunqing): may use higher tap interp filter than 2 taps if needed.
cpi->find_fractional_mv_step(
x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
&v_fn_ptr, 0, mv_sf->subpel_search_level,
cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0,
- 0);
+ 0, USE_2_TAPS);
}
xd->mi[0]->mode = NEWMV;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 995c54f..0f9051b 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -367,14 +367,12 @@
*ir = (int)divide_and_round(x1 * b, y1);
}
-uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv,
- const MV *ref_mv, int allow_hp,
- int error_per_bit,
- const vp9_variance_fn_ptr_t *vfp,
- int forced_stop, int iters_per_step,
- int *cost_list, int *mvjcost, int *mvcost[2],
- uint32_t *distortion, uint32_t *sse1,
- const uint8_t *second_pred, int w, int h) {
+uint32_t vp9_skip_sub_pixel_tree(
+ const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
+ int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
+ int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
+ uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
+ int h, int use_accurate_subpel_search) {
SETUP_SUBPEL_SEARCH;
besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
src_stride, y, y_stride, second_pred, w, h,
@@ -397,6 +395,7 @@
(void)sse;
(void)thismse;
(void)cost_list;
+ (void)use_accurate_subpel_search;
return besterr;
}
@@ -406,7 +405,7 @@
int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
- int h) {
+ int h, int use_accurate_subpel_search) {
SETUP_SUBPEL_SEARCH;
besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
src_stride, y, y_stride, second_pred, w, h,
@@ -418,6 +417,7 @@
(void)allow_hp;
(void)forced_stop;
(void)hstep;
+ (void)use_accurate_subpel_search;
if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
@@ -471,8 +471,10 @@
int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
- int h) {
+ int h, int use_accurate_subpel_search) {
SETUP_SUBPEL_SEARCH;
+ (void)use_accurate_subpel_search;
+
besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
src_stride, y, y_stride, second_pred, w, h,
offset, mvjcost, mvcost, sse1, distortion);
@@ -531,8 +533,10 @@
int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
- int h) {
+ int h, int use_accurate_subpel_search) {
SETUP_SUBPEL_SEARCH;
+ (void)use_accurate_subpel_search;
+
besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
src_stride, y, y_stride, second_pred, w, h,
offset, mvjcost, mvcost, sse1, distortion);
@@ -617,12 +621,119 @@
};
/* clang-format on */
+// Computes the prediction error of candidate motion vector |this_mv| using a
+// caller-selected interpolation |kernel|.
+//
+// The w x h predictor is built from |pre_address| (stride |y_stride|) into a
+// local buffer with stride w. If |second_pred| is non-NULL the predictor is
+// first averaged with it (compound prediction). The result is then compared
+// against the source block at |src_address| (stride |src_stride|) through
+// vfp->vf, which also writes the SSE to |*sse|.
+//
+// |sf| must have x_step_q4 == y_step_q4 == 16 and w, h must be non-zero (both
+// asserted). The local buffers hold 64 * 64 samples, so this presumably relies
+// on w, h <= 64 -- TODO confirm against callers.
+//
+// Returns the value produced by vfp->vf.
+// NOTE(review): the return type is int, yet the high-bitdepth path returns
+// UINT_MAX and both paths pass an unsigned vfp->vf result through int; this
+// looks like it only works if callers store the result back into an unsigned
+// variable -- confirm.
+static int accurate_sub_pel_search(
+ const MACROBLOCKD *xd, const MV *this_mv, const struct scale_factors *sf,
+ const InterpKernel *kernel, const vp9_variance_fn_ptr_t *vfp,
+ const uint8_t *const src_address, const int src_stride,
+ const uint8_t *const pre_address, int y_stride, const uint8_t *second_pred,
+ int w, int h, uint32_t *sse) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ uint64_t besterr;
+ assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16);
+ assert(w != 0 && h != 0);
+ // Frames flagged YV12_FLAG_HIGHBITDEPTH use 16-bit sample buffers.
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, pred16[64 * 64]);
+ vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(pre_address), y_stride,
+ pred16, w, this_mv, sf, w, h, 0, kernel,
+ MV_PRECISION_Q3, 0, 0, xd->bd);
+ if (second_pred != NULL) {
+ // Compound case: average the new predictor with the second predictor.
+ DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
+ vpx_highbd_comp_avg_pred(comp_pred16, CONVERT_TO_SHORTPTR(second_pred), w,
+ h, pred16, w);
+ besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src_address,
+ src_stride, sse);
+ } else {
+ besterr =
+ vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src_address, src_stride, sse);
+ }
+ } else {
+ // 8-bit frame inside a high-bitdepth build: same steps with uint8_t buffers.
+ DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]);
+ vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h,
+ 0, kernel, MV_PRECISION_Q3, 0, 0);
+ if (second_pred != NULL) {
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w);
+ besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse);
+ } else {
+ besterr = vfp->vf(pred, w, src_address, src_stride, sse);
+ }
+ }
+ // Saturate before narrowing the 64-bit accumulator to the return type.
+ if (besterr >= UINT_MAX) return UINT_MAX;
+ return (int)besterr;
+#else
+ int besterr;
+ DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]);
+ assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16);
+ assert(w != 0 && h != 0);
+ // xd is only read in the high-bitdepth build.
+ (void)xd;
+
+ vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h,
+ 0, kernel, MV_PRECISION_Q3, 0, 0);
+ if (second_pred != NULL) {
+ // Compound case: average the new predictor with the second predictor.
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w);
+ besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse);
+ } else {
+ besterr = vfp->vf(pred, w, src_address, src_stride, sse);
+ }
+ return besterr;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+// TODO(yunqing): this part can be further refactored.
+//
+// CHECK_BETTER1(v, r, c) evaluates the candidate at row r / column c through
+// accurate_sub_pel_search() and adds the motion-vector cost relative to
+// (rr, rc). It expands to a bare if/else statement, not a do { } while (0)
+// block, and it evaluates r and c more than once, so arguments must be free
+// of side effects.
+//
+// Names captured from the expanding scope: minc, maxc, minr, maxr, rr, rc,
+// xd, x, kernel, vfp, z, src_stride, y, y_stride, second_pred, w, h, sse,
+// thismse, mvjcost, mvcost, error_per_bit, besterr, br, bc, distortion, sse1.
+//
+// On return v holds the candidate's total cost, or INT_MAX when (r, c) lies
+// outside [minr, maxr] x [minc, maxc]. If the cost beats besterr, then
+// besterr, br, bc, *distortion and *sse1 are updated.
+#if CONFIG_VP9_HIGHBITDEPTH
+/* checks if (r, c) has better score than previous best */
+/* High-bitdepth variant: the cost is summed in int64_t, and v is set to
+ * INT_MAX (leaving the best candidate untouched) when the sum reaches
+ * INT_MAX. */
+#define CHECK_BETTER1(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ int64_t tmpmse; \
+ const MV mv = { r, c }; \
+ const MV ref_mv = { rr, rc }; \
+ thismse = \
+ accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \
+ y, y_stride, second_pred, w, h, &sse); \
+ tmpmse = thismse; \
+ tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \
+ if (tmpmse >= INT_MAX) { \
+ v = INT_MAX; \
+ } else if ((v = (uint32_t)tmpmse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+#else
+/* checks if (r, c) has better score than previous best */
+/* Low-bitdepth variant: the cost is summed directly into v, with no
+ * saturation step. */
+#define CHECK_BETTER1(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ const MV mv = { r, c }; \
+ const MV ref_mv = { rr, rc }; \
+ thismse = \
+ accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \
+ y, y_stride, second_pred, w, h, &sse); \
+ if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \
+ thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+#endif
+
uint32_t vp9_find_best_sub_pixel_tree(
const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
- int h) {
+ int h, int use_accurate_subpel_search) {
const uint8_t *const z = x->plane[0].src.buf;
const uint8_t *const src_address = z;
const int src_stride = x->plane[0].src.stride;
@@ -650,6 +761,14 @@
int kr, kc;
MvLimits subpel_mv_limits;
+ // TODO(yunqing): need to add 4-tap filter optimization to speed up the
+ // encoder.
+ const InterpKernel *kernel = (use_accurate_subpel_search > 0)
+ ? ((use_accurate_subpel_search == USE_4_TAPS)
+ ? vp9_filter_kernels[FOURTAP]
+ : vp9_filter_kernels[EIGHTTAP])
+ : vp9_filter_kernels[BILINEAR];
+
vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv);
minc = subpel_mv_limits.col_min;
maxc = subpel_mv_limits.col_max;
@@ -674,16 +793,25 @@
tr = br + search_step[idx].row;
tc = bc + search_step[idx].col;
if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
- const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
MV this_mv;
this_mv.row = tr;
this_mv.col = tc;
- if (second_pred == NULL)
- thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address,
- src_stride, &sse);
- else
- thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
- src_address, src_stride, &sse, second_pred);
+
+ if (use_accurate_subpel_search) {
+ thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp,
+ src_address, src_stride, y,
+ y_stride, second_pred, w, h, &sse);
+ } else {
+ const uint8_t *const pre_address =
+ y + (tr >> 3) * y_stride + (tc >> 3);
+ if (second_pred == NULL)
+ thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
+ src_address, src_stride, &sse);
+ else
+ thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+ src_address, src_stride, &sse, second_pred);
+ }
+
cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
mvcost, error_per_bit);
@@ -705,14 +833,21 @@
tc = bc + kc;
tr = br + kr;
if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
- const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
MV this_mv = { tr, tc };
- if (second_pred == NULL)
- thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address,
- src_stride, &sse);
- else
- thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), src_address,
- src_stride, &sse, second_pred);
+ if (use_accurate_subpel_search) {
+ thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp,
+ src_address, src_stride, y, y_stride,
+ second_pred, w, h, &sse);
+ } else {
+ const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+ if (second_pred == NULL)
+ thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address,
+ src_stride, &sse);
+ else
+ thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+ src_address, src_stride, &sse, second_pred);
+ }
+
cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
error_per_bit);
@@ -743,20 +878,36 @@
if (tr == br && tc != bc) {
kc = bc - tc;
if (iters_per_step == 1) {
- CHECK_BETTER(second, br0, bc0 + kc);
+ if (use_accurate_subpel_search) {
+ CHECK_BETTER1(second, br0, bc0 + kc);
+ } else {
+ CHECK_BETTER(second, br0, bc0 + kc);
+ }
}
} else if (tr != br && tc == bc) {
kr = br - tr;
if (iters_per_step == 1) {
- CHECK_BETTER(second, br0 + kr, bc0);
+ if (use_accurate_subpel_search) {
+ CHECK_BETTER1(second, br0 + kr, bc0);
+ } else {
+ CHECK_BETTER(second, br0 + kr, bc0);
+ }
}
}
if (iters_per_step > 1) {
- CHECK_BETTER(second, br0 + kr, bc0);
- CHECK_BETTER(second, br0, bc0 + kc);
- if (br0 != br || bc0 != bc) {
- CHECK_BETTER(second, br0 + kr, bc0 + kc);
+ if (use_accurate_subpel_search) {
+ CHECK_BETTER1(second, br0 + kr, bc0);
+ CHECK_BETTER1(second, br0, bc0 + kc);
+ if (br0 != br || bc0 != bc) {
+ CHECK_BETTER1(second, br0 + kr, bc0 + kc);
+ }
+ } else {
+ CHECK_BETTER(second, br0 + kr, bc0);
+ CHECK_BETTER(second, br0, bc0 + kc);
+ if (br0 != br || bc0 != bc) {
+ CHECK_BETTER(second, br0 + kr, bc0 + kc);
+ }
}
}
}
@@ -781,6 +932,7 @@
}
#undef CHECK_BETTER
+#undef CHECK_BETTER1
static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col,
int range) {
@@ -2587,7 +2739,8 @@
(void)tc; \
(void)sse; \
(void)thismse; \
- (void)cost_list;
+ (void)cost_list; \
+ (void)use_accurate_subpel_search;
// Return the maximum MV.
uint32_t vp9_return_max_sub_pixel_mv(
@@ -2595,7 +2748,7 @@
int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
- int h) {
+ int h, int use_accurate_subpel_search) {
COMMON_MV_TEST;
(void)minr;
@@ -2617,7 +2770,7 @@
int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
- int h) {
+ int h, int use_accurate_subpel_search) {
COMMON_MV_TEST;
(void)maxr;
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index adb02bc..6bd85a1 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -75,7 +75,7 @@
int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
- int h);
+ int h, int use_accurate_subpel_search);
extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 416d437..8dce4cf 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -247,7 +247,8 @@
x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv,
x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop,
cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list),
- x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0);
+ x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0,
+ cpi->sf.use_accurate_subpel_search);
*rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost,
x->mvcost, MV_COST_WEIGHT);
}
@@ -1539,7 +1540,8 @@
cpi->common.allow_high_precision_mv, x->errorperbit,
&cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list),
- x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0, 0);
+ x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0, 0,
+ cpi->sf.use_accurate_subpel_search);
} else if (svc->use_base_mv && svc->spatial_layer_id) {
if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) {
const int pre_stride = xd->plane[0].pre[0].stride;
@@ -2758,7 +2760,8 @@
&cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list),
x->nmvjointcost, x->mvcost, &dummy_dist,
- &x->pred_sse[ref_frame], NULL, 0, 0);
+ &x->pred_sse[ref_frame], NULL, 0, 0,
+ cpi->sf.use_accurate_subpel_search);
xd->mi[0]->bmi[i].as_mv[0].as_mv = tmp_mv;
} else {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 698faa3..9cde479 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1821,7 +1821,7 @@
x, &tmp_mv, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
x->errorperbit, &cpi->fn_ptr[bsize], 0,
cpi->sf.mv.subpel_search_level, NULL, x->nmvjointcost, x->mvcost,
- &dis, &sse, second_pred, pw, ph);
+ &dis, &sse, second_pred, pw, ph, cpi->sf.use_accurate_subpel_search);
}
// Restore the pointer to the first (possibly scaled) prediction buffer.
@@ -1875,6 +1875,8 @@
const BLOCK_SIZE bsize = mi->sb_type;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+ const int pw = num_4x4_blocks_wide << 2;
+ const int ph = num_4x4_blocks_high << 2;
ENTROPY_CONTEXT t_above[2], t_left[2];
int subpelmv = 1, have_ref = 0;
SPEED_FEATURES *const sf = &cpi->sf;
@@ -2011,7 +2013,8 @@
x->errorperbit, &cpi->fn_ptr[bsize], sf->mv.subpel_force_stop,
sf->mv.subpel_search_level, cond_cost_list(cpi, cost_list),
x->nmvjointcost, x->mvcost, &distortion,
- &x->pred_sse[mi->ref_frame[0]], NULL, 0, 0);
+ &x->pred_sse[mi->ref_frame[0]], NULL, pw, ph,
+ cpi->sf.use_accurate_subpel_search);
// save motion search result for use in compound prediction
seg_mvs[i][mi->ref_frame[0]].as_mv = *new_mv;
@@ -2330,6 +2333,8 @@
const int best_predmv_idx = x->mv_best_ref_index[ref];
const YV12_BUFFER_CONFIG *scaled_ref_frame =
vp9_get_scaled_ref_frame(cpi, ref);
+ const int pw = num_4x4_blocks_wide_lookup[bsize] << 2;
+ const int ph = num_4x4_blocks_high_lookup[bsize] << 2;
MV pred_mv[3];
pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv;
@@ -2452,7 +2457,8 @@
x, &tmp_mv->as_mv, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
&cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list),
- x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0);
+ x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, pw, ph,
+ cpi->sf.use_accurate_subpel_search);
}
*rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost,
x->mvcost, MV_COST_WEIGHT);
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 4490923..87b417a 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -242,7 +242,7 @@
if (speed >= 1) {
sf->enable_tpl_model = 0;
- sf->ml_var_partition_pruning = 0;
+ sf->ml_var_partition_pruning = !boosted;
sf->ml_prune_rect_partition_threhold[1] = 200;
sf->ml_prune_rect_partition_threhold[2] = 200;
sf->ml_prune_rect_partition_threhold[3] = 200;
@@ -288,9 +288,11 @@
sf->exhaustive_searches_thresh =
(cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 23)
: INT_MAX;
+ sf->use_accurate_subpel_search = USE_4_TAPS;
}
if (speed >= 2) {
+ sf->ml_var_partition_pruning = 0;
if (oxcf->vbr_corpus_complexity)
sf->recode_loop = ALLOW_RECODE_FIRST;
else
@@ -328,6 +330,8 @@
good_quality_mesh_patterns[mesh_density_level][i].interval;
}
}
+
+ sf->use_accurate_subpel_search = USE_2_TAPS;
}
if (speed >= 3) {
@@ -450,6 +454,7 @@
sf->disable_golden_ref = 0;
sf->enable_tpl_model = 0;
sf->enhanced_full_pixel_motion_search = 0;
+ sf->use_accurate_subpel_search = USE_2_TAPS;
if (speed >= 1) {
sf->allow_txfm_domain_distortion = 1;
@@ -942,6 +947,7 @@
sf->ml_prune_rect_partition_threhold[2] = -1;
sf->ml_prune_rect_partition_threhold[3] = -1;
sf->ml_var_partition_pruning = 0;
+ sf->use_accurate_subpel_search = USE_8_TAPS;
// Some speed-up features even for best quality as minimal impact on quality.
sf->adaptive_rd_thresh = 1;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index a895ed2..0067bb4 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -243,6 +243,12 @@
RE_ENCODE_MAXQ = 2
} OVERSHOOT_DETECTION_CBR_RT;
+// Interpolation filter length for sub-pixel motion search, passed as the
+// use_accurate_subpel_search argument of the fractional_mv_step_fp functions.
+// USE_2_TAPS must stay 0: vp9_find_best_sub_pixel_tree() tests the value for
+// truth to choose between the vfp->svf / vfp->svaf path and
+// accurate_sub_pel_search().
+typedef enum {
+ // Zero: the existing sub-pixel variance functions (vfp->svf / vfp->svaf).
+ USE_2_TAPS = 0,
+ // accurate_sub_pel_search() with vp9_filter_kernels[FOURTAP].
+ USE_4_TAPS,
+ // accurate_sub_pel_search() with vp9_filter_kernels[EIGHTTAP].
+ USE_8_TAPS,
+} SUBPEL_SEARCH_TYPE;
+
typedef struct SPEED_FEATURES {
MV_SPEED_FEATURES mv;
@@ -586,6 +592,10 @@
// Allow for disabling golden reference.
int disable_golden_ref;
+
+ // Selects the interpolation filter tap count (see SUBPEL_SEARCH_TYPE) used by
+ // the sub-pixel motion search, to obtain more accurate search results.
+ SUBPEL_SEARCH_TYPE use_accurate_subpel_search;
} SPEED_FEATURES;
struct VP9_COMP;
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 51668d0..7ac70c8 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -421,12 +421,13 @@
/* restore UMV window */
x->mv_limits = tmp_mv_limits;
+ // TODO(yunqing): may use higher tap interp filter than 2 taps if needed.
// Ignore mv costing by sending NULL pointer instead of cost array
bestsme = cpi->find_fractional_mv_step(
x, ref_mv, &best_ref_mv1, cpi->common.allow_high_precision_mv,
x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_search_level,
- cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0,
- 0);
+ cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
+ USE_2_TAPS);
// Restore input state
x->plane[0].src = src;
diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm
index 88967a3..1e5f49b 100644
--- a/vpx_dsp/x86/subpel_variance_sse2.asm
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -155,7 +155,7 @@
lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx
- lea ecx, [GLOBAL(pw_8)]
+ lea ecx, [GLOBAL(pw_8)]
mov g_pw_8m, ecx
LOAD_IF_USED 0, 1 ; load eax, ecx back