Merge "Call tpl model build at the beginning of a GOP"
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 9cfaa1f..6f61d3b 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -204,6 +204,8 @@
{ 2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf" },
{ 4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf" },
{ 2, "invalid-crbug-629481.webm" },
+ { 3, "invalid-crbug-1558.ivf" },
+ { 4, "invalid-crbug-1562.ivf" },
};
INSTANTIATE_TEST_CASE_P(
diff --git a/test/test-data.mk b/test/test-data.mk
index 83ce7b9..1d42b94 100644
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -787,6 +787,10 @@
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1558.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1558.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1562.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1562.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += crbug-1539.rawfile
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 839aa80..771aeca 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -859,3 +859,7 @@
a0fbbbc5dd50fd452096f4455a58c1a8c9f66697 *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf
a61774cf03fc584bd9f0904fc145253bb8ea6c4c *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf.res
894fae3afee0290546590823974203ab4b8abd95 *crbug-1539.rawfile
+f1026c03efd5da21b381c8eb21f0d64e6d7e4ba3 *invalid-crbug-1558.ivf
+eb198c25f861c3fe2cbd310de11eb96843019345 *invalid-crbug-1558.ivf.res
+c62b005a9fd32c36a1b3f67de6840330f9915e34 *invalid-crbug-1562.ivf
+f0cd8389948ad16085714d96567612136f6a46c5 *invalid-crbug-1562.ivf.res
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index e89f71e..fc833bc 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -1474,7 +1474,7 @@
// QP threshold: only allow dropping if we are not close to qp_max.
int thresh_qp = 3 * cpi->worst_quality >> 2;
// Rate threshold, in bytes.
- int thresh_rate = 3 * (cpi->av_per_frame_bandwidth >> 3);
+ int thresh_rate = 2 * (cpi->av_per_frame_bandwidth >> 3);
// Threshold for the average (over all macroblocks) of the pixel-sum
// residual error over 16x16 block.
int thresh_pred_err_mb = (200 << 4);
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 45d3b0f..c5c63e4 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -259,6 +259,8 @@
PARTITION_CONTEXT *above_seg_context;
ENTROPY_CONTEXT *above_context;
int above_context_alloc_cols;
+
+ int lf_row;
} VP9_COMMON;
static INLINE YV12_BUFFER_CONFIG *get_buf_frame(VP9_COMMON *cm, int index) {
diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c
index d4b0766..36530fa 100644
--- a/vp9/common/vp9_thread_common.c
+++ b/vp9/common/vp9_thread_common.c
@@ -229,6 +229,28 @@
workers, num_workers, lf_sync);
}
+// Prepares the row-synchronization state in lf_sync for one frame of
+// multi-threaded loop filtering. Returns immediately, without touching
+// lf_sync or cm, when frame_filter_level is zero.
+void vp9_lpf_mt_init(VP9LfSync *lf_sync, VP9_COMMON *cm, int frame_filter_level,
+                     int num_workers) {
+  const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+  const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2;
+  int must_realloc;
+
+  if (frame_filter_level == 0) return;
+
+  // Reallocate when sync_range is zero (presumably: nothing allocated yet),
+  // when the superblock-row count differs from the stored one, or when more
+  // workers are requested than lf_sync currently records.
+  must_realloc = lf_sync->sync_range == 0 || lf_sync->rows != sb_rows ||
+                 lf_sync->num_workers < num_workers;
+  if (must_realloc) {
+    vp9_loop_filter_dealloc(lf_sync);
+    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+  }
+
+  // Per-frame reset: cur_sb_col becomes -1 for every SB row, no tile has
+  // reported any row as done, and no corruption has been seen.
+  memset(lf_sync->cur_sb_col, -1, sb_rows * sizeof(*lf_sync->cur_sb_col));
+  memset(lf_sync->num_tiles_done, 0,
+         sb_rows * sizeof(*lf_sync->num_tiles_done));
+  lf_sync->corrupted = 0;
+
+  // Loop filtering starts again from the first mi row.
+  cm->lf_row = 0;
+}
+
// Set up nsync by width.
static INLINE int get_sync_range(int width) {
// nsync numbers are picked by testing. For example, for 4k
@@ -266,6 +288,25 @@
pthread_cond_init(&lf_sync->cond[i], NULL);
}
}
+ pthread_mutex_init(&lf_sync->lf_mutex, NULL);
+
+ CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex,
+ vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows));
+ if (lf_sync->recon_done_mutex) {
+ int i;
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond,
+ vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows));
+ if (lf_sync->recon_done_cond) {
+ int i;
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&lf_sync->recon_done_cond[i], NULL);
+ }
+ }
}
#endif // CONFIG_MULTITHREAD
@@ -276,6 +317,11 @@
CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
+ CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done,
+ vpx_malloc(sizeof(*lf_sync->num_tiles_done) *
+ mi_cols_aligned_to_sb(cm->mi_rows) >>
+ MI_BLOCK_SIZE_LOG2));
+
// Set up nsync.
lf_sync->sync_range = get_sync_range(width);
}
@@ -298,15 +344,126 @@
}
vpx_free(lf_sync->cond);
}
+ if (lf_sync->recon_done_mutex != NULL) {
+ int i;
+ for (i = 0; i < lf_sync->rows; ++i) {
+ pthread_mutex_destroy(&lf_sync->recon_done_mutex[i]);
+ }
+ vpx_free(lf_sync->recon_done_mutex);
+ }
+
+ pthread_mutex_destroy(&lf_sync->lf_mutex);
+ if (lf_sync->recon_done_cond != NULL) {
+ int i;
+ for (i = 0; i < lf_sync->rows; ++i) {
+ pthread_cond_destroy(&lf_sync->recon_done_cond[i]);
+ }
+ vpx_free(lf_sync->recon_done_cond);
+ }
#endif // CONFIG_MULTITHREAD
+
vpx_free(lf_sync->lfdata);
vpx_free(lf_sync->cur_sb_col);
+ vpx_free(lf_sync->num_tiles_done);
// clear the structure as the source of this call may be a resize in which
// case this call will be followed by an _alloc() which may fail.
vp9_zero(*lf_sync);
}
}
+// Claims the next superblock row to loop filter and returns its starting mi
+// row, or -1 when no rows remain. In multi-threaded builds the call also
+// blocks until every tile column has reported the required row as
+// reconstructed, and returns -1 if lf_sync->corrupted has been set.
+static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) {
+  int return_val = -1;
+  const int max_rows = cm->mi_rows;
+
+#if CONFIG_MULTITHREAD
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  int cur_row = 0;
+
+  // cm->lf_row is the shared "next row to hand out" cursor; claim one
+  // superblock row from it under lf_mutex.
+  pthread_mutex_lock(&lf_sync->lf_mutex);
+  if (cm->lf_row < max_rows) {
+    cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2;
+    return_val = cm->lf_row;
+    cm->lf_row += MI_BLOCK_SIZE;
+    if (cm->lf_row < max_rows) {
+      /* If this is not the last row, make sure the next row is also decoded.
+       * This is because the intra predict has to happen before loop filter */
+      cur_row += 1;
+    }
+  }
+  pthread_mutex_unlock(&lf_sync->lf_mutex);
+
+  if (return_val == -1) return return_val;
+
+  pthread_mutex_lock(&lf_sync->recon_done_mutex[cur_row]);
+  /* Re-test the predicate after every wakeup: pthread_cond_wait() is allowed
+   * to return spuriously, and filtering must not begin until all tile
+   * columns have finished this row. */
+  while (lf_sync->num_tiles_done[cur_row] < tile_cols) {
+    pthread_cond_wait(&lf_sync->recon_done_cond[cur_row],
+                      &lf_sync->recon_done_mutex[cur_row]);
+  }
+  pthread_mutex_unlock(&lf_sync->recon_done_mutex[cur_row]);
+
+  // A tile flagged as corrupted cancels the claimed row.
+  pthread_mutex_lock(&lf_sync->lf_mutex);
+  if (lf_sync->corrupted) {
+    return_val = -1;
+  }
+  pthread_mutex_unlock(&lf_sync->lf_mutex);
+#else
+  (void)lf_sync;
+  // Without threads there is nothing to wait for: just advance the cursor.
+  if (cm->lf_row < max_rows) {
+    return_val = cm->lf_row;
+    cm->lf_row += MI_BLOCK_SIZE;
+  }
+#endif  // CONFIG_MULTITHREAD
+
+  return return_val;
+}
+
+// Repeatedly claims a superblock row through get_next_row() and loop filters
+// it with thread_loop_filter_rows(). Stops as soon as get_next_row() returns
+// -1 or a row at or beyond cm->mi_rows.
+void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync) {
+  VP9_COMMON *cm = lf_data->cm;
+
+  for (;;) {
+    const int next_row = get_next_row(cm, lf_sync);
+    if (next_row == -1 || next_row >= cm->mi_rows) break;
+
+    // Filter exactly one superblock row: [next_row, next_row + MI_BLOCK_SIZE).
+    lf_data->start = next_row;
+    lf_data->stop = next_row + MI_BLOCK_SIZE;
+
+    thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                            lf_data->start, lf_data->stop, lf_data->y_only,
+                            lf_sync);
+  }
+}
+
+// Records that one more tile has finished superblock row `row`, and folds
+// `corrupted` into lf_sync->corrupted. Once `num_tiles` tiles have reported
+// the row, threads waiting on that row's condition variable are woken.
+// Without CONFIG_MULTITHREAD this is a no-op.
+void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row,
+                 int corrupted) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *const row_mutex = &lf_sync->recon_done_mutex[row];
+  pthread_cond_t *const row_cond = &lf_sync->recon_done_cond[row];
+
+  // The corruption flag is shared by all rows and guarded by lf_mutex.
+  pthread_mutex_lock(&lf_sync->lf_mutex);
+  lf_sync->corrupted |= corrupted;
+  pthread_mutex_unlock(&lf_sync->lf_mutex);
+
+  pthread_mutex_lock(row_mutex);
+  ++lf_sync->num_tiles_done[row];
+  if (lf_sync->num_tiles_done[row] == num_tiles) {
+    if (!is_last_row) {
+      pthread_cond_signal(row_cond);
+    } else {
+      /* The last 2 rows wait on the last row to be done.
+       * So, we have to broadcast the signal in this case.
+       */
+      pthread_cond_broadcast(row_cond);
+    }
+  }
+  pthread_mutex_unlock(row_mutex);
+#else
+  (void)lf_sync;
+  (void)num_tiles;
+  (void)row;
+  (void)is_last_row;
+  (void)corrupted;
+#endif  // CONFIG_MULTITHREAD
+}
+
// Accumulate frame counts.
void vp9_accumulate_frame_counts(FRAME_COUNTS *accum,
const FRAME_COUNTS *counts, int is_dec) {
diff --git a/vp9/common/vp9_thread_common.h b/vp9/common/vp9_thread_common.h
index f92df5b..b97e9ee 100644
--- a/vp9/common/vp9_thread_common.h
+++ b/vp9/common/vp9_thread_common.h
@@ -37,6 +37,14 @@
// Row-based parallel loopfilter data
LFWorkerData *lfdata;
int num_workers;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t lf_mutex;
+ pthread_mutex_t *recon_done_mutex;
+ pthread_cond_t *recon_done_cond;
+#endif
+ int *num_tiles_done;
+ int corrupted;
} VP9LfSync;
// Allocate memory for loopfilter row synchronization.
@@ -53,6 +61,17 @@
int partial_frame, VPxWorker *workers,
int num_workers, VP9LfSync *lf_sync);
+// Multi-threaded loopfilter initialisations
+void vp9_lpf_mt_init(VP9LfSync *lf_sync, struct VP9Common *cm,
+ int frame_filter_level, int num_workers);
+
+void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync);
+
+void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row,
+ int corrupted);
+
+void vp9_set_last_decoded_row(struct VP9Common *cm, int tile_col, int mi_row);
+
void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum,
const struct FRAME_COUNTS *counts, int is_dec);
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 48c49e2..95e376d 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -1451,6 +1451,25 @@
return vpx_reader_find_end(&tile_data->bit_reader);
}
+// After a decode error, reports every outstanding superblock row as done (with
+// the corrupted flag set) through vp9_set_row(). The first pass covers rows
+// from start_row to the end of the frame; each of the num_tiles_left further
+// passes covers all rows from 0.
+static void set_rows_after_error(VP9LfSync *lf_sync, int start_row, int mi_rows,
+                                 int num_tiles_left, int total_num_tiles) {
+  const int last_sb_row =
+      (mi_cols_aligned_to_sb(mi_rows) >> MI_BLOCK_SIZE_LOG2) - 1;
+  int first_row = start_row;
+
+  do {
+    int row;
+    for (row = first_row; row < mi_rows; row += MI_BLOCK_SIZE) {
+      const int sb_row = row >> MI_BLOCK_SIZE_LOG2;
+      vp9_set_row(lf_sync, total_num_tiles, sb_row, last_sb_row == sb_row,
+                  /*corrupted=*/1);
+    }
+    /* If there are multiple tiles, the second tile should start marking row
+     * progress from row 0.
+     */
+    first_row = 0;
+  } while (num_tiles_left--);
+}
+
// On entry 'tile_data->data_end' points to the end of the input frame, on exit
// it is updated to reflect the bitreader position of the final tile column if
// present in the tile buffer group or NULL otherwise.
@@ -1461,6 +1480,12 @@
TileInfo *volatile tile = &tile_data->xd.tile;
const int final_col = (1 << pbi->common.log2_tile_cols) - 1;
const uint8_t *volatile bit_reader_end = NULL;
+ VP9_COMMON *cm = &pbi->common;
+
+ LFWorkerData *lf_data = tile_data->lf_data;
+ VP9LfSync *lf_sync = tile_data->lf_sync;
+
+ volatile int mi_row = 0;
volatile int n = tile_data->buf_start;
tile_data->error_info.setjmp = 1;
@@ -1468,14 +1493,26 @@
tile_data->error_info.setjmp = 0;
tile_data->xd.corrupted = 1;
tile_data->data_end = NULL;
+ if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) {
+ const int num_tiles_left = tile_data->buf_end - n;
+ const int mi_row_start = mi_row;
+ set_rows_after_error(lf_sync, mi_row_start, cm->mi_rows, num_tiles_left,
+ 1 << cm->log2_tile_cols);
+ }
return 0;
}
tile_data->xd.corrupted = 0;
do {
- int mi_row, mi_col;
+ int mi_col;
const TileBuffer *const buf = pbi->tile_buffers + n;
+
+ /* Initialize to 0 is safe since we do not deal with streams that have
+ * more than one row of tiles. (So tile->mi_row_start will be 0)
+ */
+ assert(cm->log2_tile_rows == 0);
+ mi_row = 0;
vp9_zero(tile_data->dqcoeff);
vp9_tile_init(tile, &pbi->common, 0, buf->col);
setup_token_decoder(buf->data, tile_data->data_end, buf->size,
@@ -1493,6 +1530,14 @@
mi_col += MI_BLOCK_SIZE) {
decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4);
}
+ if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) {
+ const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+ const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2);
+ const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2);
+ vp9_set_row(lf_sync, 1 << cm->log2_tile_cols,
+ mi_row >> MI_BLOCK_SIZE_LOG2, is_last_row,
+ tile_data->xd.corrupted);
+ }
}
if (buf->col == final_col) {
@@ -1500,6 +1545,21 @@
}
} while (!tile_data->xd.corrupted && ++n <= tile_data->buf_end);
+ if (pbi->lpf_mt_opt && n < tile_data->buf_end && cm->lf.filter_level &&
+ !cm->skip_loop_filter) {
+ /* This was not incremented in the tile loop, so increment before tiles left
+ * calculation
+ */
+ ++n;
+ set_rows_after_error(lf_sync, 0, cm->mi_rows, tile_data->buf_end - n,
+ 1 << cm->log2_tile_cols);
+ }
+
+ if (pbi->lpf_mt_opt && !tile_data->xd.corrupted && cm->lf.filter_level &&
+ !cm->skip_loop_filter) {
+ vp9_loopfilter_rows(lf_data, lf_sync);
+ }
+
tile_data->data_end = bit_reader_end;
return !tile_data->xd.corrupted;
}
@@ -1516,6 +1576,8 @@
VP9_COMMON *const cm = &pbi->common;
const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
const uint8_t *bit_reader_end = NULL;
+ VP9LfSync *lf_row_sync = &pbi->lf_row_sync;
+ YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
@@ -1542,12 +1604,26 @@
}
}
+ // Initialize LPF
+ if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) {
+ vp9_lpf_mt_init(lf_row_sync, cm, cm->lf.filter_level,
+ pbi->num_tile_workers);
+ }
+
// Reset tile decoding hook
for (n = 0; n < num_workers; ++n) {
VPxWorker *const worker = &pbi->tile_workers[n];
TileWorkerData *const tile_data =
&pbi->tile_worker_data[n + pbi->total_tiles];
winterface->sync(worker);
+
+ if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) {
+ tile_data->lf_sync = lf_row_sync;
+ tile_data->lf_data = &tile_data->lf_sync->lfdata[n];
+ vp9_loop_filter_data_reset(tile_data->lf_data, new_fb, cm, pbi->mb.plane);
+ tile_data->lf_data->y_only = 0;
+ }
+
tile_data->xd = pbi->mb;
tile_data->xd.counts =
cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts;
@@ -2069,17 +2145,19 @@
if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) {
// Multi-threaded tile decoder
*p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
- if (!xd->corrupted) {
- if (!cm->skip_loop_filter) {
- // If multiple threads are used to decode tiles, then we use those
- // threads to do parallel loopfiltering.
- vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level,
- 0, 0, pbi->tile_workers, pbi->num_tile_workers,
- &pbi->lf_row_sync);
+ if (!pbi->lpf_mt_opt) {
+ if (!xd->corrupted) {
+ if (!cm->skip_loop_filter) {
+ // If multiple threads are used to decode tiles, then we use those
+ // threads to do parallel loopfiltering.
+ vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane,
+ cm->lf.filter_level, 0, 0, pbi->tile_workers,
+ pbi->num_tile_workers, &pbi->lf_row_sync);
+ }
+ } else {
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Decode failed. Frame data is corrupted.");
}
- } else {
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Decode failed. Frame data is corrupted.");
}
} else {
*p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 1c48896..425c896 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -37,6 +37,8 @@
int buf_start, buf_end; // pbi->tile_buffers to decode, inclusive
vpx_reader bit_reader;
FRAME_COUNTS counts;
+ LFWorkerData *lf_data;
+ VP9LfSync *lf_sync;
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
/* dqcoeff are shared by all the planes. So planes must be decoded serially */
DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
@@ -74,6 +76,7 @@
int hold_ref_buf; // hold the reference buffer.
int row_mt;
+ int lpf_mt_opt;
} VP9Decoder;
int vp9_receive_compressed_data(struct VP9Decoder *pbi, size_t size,
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 8e35a8b..7ebd2a8 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -5608,42 +5608,21 @@
}
void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col,
- BLOCK_SIZE bsize, int stride,
- const TplDepStats *src_stats) {
+ BLOCK_SIZE bsize, int stride) {
const int mi_height = num_8x8_blocks_high_lookup[bsize];
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col];
int idx, idy;
- int64_t intra_cost = src_stats->intra_cost / (mi_height * mi_width);
- int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width);
-
- TplDepStats *tpl_ptr;
-
- intra_cost = VPXMAX(1, intra_cost);
- inter_cost = VPXMAX(1, inter_cost);
-
for (idy = 0; idy < mi_height; ++idy) {
- tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col];
for (idx = 0; idx < mi_width; ++idx) {
-#if CONFIG_NON_GREEDY_MV
- int rf_idx;
- for (rf_idx = 0; rf_idx < 3; ++rf_idx) {
- tpl_ptr->ready[rf_idx] = src_stats->ready[rf_idx];
- tpl_ptr->mv_dist[rf_idx] = src_stats->mv_dist[rf_idx];
- tpl_ptr->mv_cost[rf_idx] = src_stats->mv_cost[rf_idx];
- tpl_ptr->inter_cost_arr[rf_idx] = src_stats->inter_cost;
- tpl_ptr->recon_error_arr[rf_idx] = src_stats->recon_error_arr[rf_idx];
- tpl_ptr->sse_arr[rf_idx] = src_stats->sse_arr[rf_idx];
- tpl_ptr->mv_arr[rf_idx].as_int = src_stats->mv_arr[rf_idx].as_int;
- }
- tpl_ptr->feature_score = src_stats->feature_score;
-#endif
- tpl_ptr->intra_cost = intra_cost;
- tpl_ptr->inter_cost = inter_cost;
+ TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx];
+ const int64_t mc_flow = tpl_ptr->mc_flow;
+ const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost;
+ *tpl_ptr = *src_stats;
+ tpl_ptr->mc_flow = mc_flow;
+ tpl_ptr->mc_ref_cost = mc_ref_cost;
tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
- tpl_ptr->ref_frame_index = src_stats->ref_frame_index;
- tpl_ptr->mv.as_int = src_stats->mv.as_int;
- ++tpl_ptr;
}
}
}
@@ -5789,12 +5768,11 @@
void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
struct scale_factors *sf, GF_PICTURE *gf_picture,
- int frame_idx, int16_t *src_diff, tran_low_t *coeff,
- tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row,
- int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ int frame_idx, TplDepFrame *tpl_frame, int16_t *src_diff,
+ tran_low_t *coeff, tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ int mi_row, int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor,
- int64_t *recon_error, int64_t *sse,
- TplDepStats *tpl_stats) {
+ int64_t *recon_error, int64_t *sse) {
VP9_COMMON *cm = &cpi->common;
ThreadData *td = &cpi->td;
@@ -5813,8 +5791,10 @@
PREDICTION_MODE mode;
int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
MODE_INFO mi_above, mi_left;
-
- memset(tpl_stats, 0, sizeof(*tpl_stats));
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ TplDepStats *tpl_stats =
+ &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
@@ -5936,9 +5916,10 @@
}
best_intra_cost = VPXMAX(best_intra_cost, 1);
best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost);
- tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2;
- tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2;
- tpl_stats->mc_dep_cost = tpl_stats->intra_cost + tpl_stats->mc_flow;
+ tpl_stats->inter_cost = VPXMAX(
+ 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
+ tpl_stats->intra_cost = VPXMAX(
+ 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
tpl_stats->mv.as_int = best_mv.as_int;
}
@@ -6040,19 +6021,12 @@
#endif
for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
- TplDepStats tpl_stats;
- mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, src_diff, coeff,
- qcoeff, dqcoeff, mi_row, mi_col, bsize, tx_size,
- ref_frame, predictor, &recon_error, &sse, &tpl_stats);
-#if CONFIG_NON_GREEDY_MV
- tpl_stats.feature_score =
- tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]
- .feature_score;
-#endif
-
+ mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame,
+ src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize,
+ tx_size, ref_frame, predictor, &recon_error, &sse);
// Motion flow dependency dispenser.
tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
- tpl_frame->stride, &tpl_stats);
+ tpl_frame->stride);
tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col,
bsize);
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index fdff877..6a4cb9a 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -270,6 +270,9 @@
RANGE_CHECK(ctx, row_mt, 0, 1);
ctx->pbi->row_mt = ctx->row_mt;
+ RANGE_CHECK(ctx, lpf_opt, 0, 1);
+ ctx->pbi->lpf_mt_opt = ctx->lpf_opt;
+
// If postprocessing was enabled by the application and a
// configuration has not been provided, default it.
if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC))
@@ -658,6 +661,13 @@
return VPX_CODEC_OK;
}
+// Control handler registered for VP9D_SET_LOOP_FILTER_OPT: reads one int from
+// args and stores it in ctx->lpf_opt. Always returns VPX_CODEC_OK.
+static vpx_codec_err_t ctrl_enable_lpf_opt(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  const int requested = va_arg(args, int);
+  ctx->lpf_opt = requested;
+  return VPX_CODEC_OK;
+}
+
static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{ VP8_COPY_REFERENCE, ctrl_copy_reference },
@@ -670,6 +680,7 @@
{ VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter },
{ VP9_DECODE_SVC_SPATIAL_LAYER, ctrl_set_spatial_layer_svc },
{ VP9D_SET_ROW_MT, ctrl_set_row_mt },
+ { VP9D_SET_LOOP_FILTER_OPT, ctrl_enable_lpf_opt },
// Getters
{ VPXD_GET_LAST_QUANTIZER, ctrl_get_quantizer },
diff --git a/vp9/vp9_dx_iface.h b/vp9/vp9_dx_iface.h
index a1c3352..f60688c 100644
--- a/vp9/vp9_dx_iface.h
+++ b/vp9/vp9_dx_iface.h
@@ -46,6 +46,7 @@
int svc_decoding;
int svc_spatial_layer;
int row_mt;
+ int lpf_opt;
};
#endif // VPX_VP9_VP9_DX_IFACE_H_
diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index fd60301..c31afc1 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -132,6 +132,16 @@
*/
VP9D_SET_ROW_MT,
+ /*!\brief Codec control function to set loopfilter optimization.
+ *
+ * 0 : off, Loop filter is done after all tiles have been decoded
+ * 1 : on, Loop filter is done immediately after decode without
+ * waiting for all threads to sync.
+ *
+ * Supported in codecs: VP9
+ */
+ VP9D_SET_LOOP_FILTER_OPT,
+
VP8_DECODER_CTRL_ID_MAX
};
@@ -191,6 +201,8 @@
VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int)
#define VPX_CTRL_VP9_DECODE_SET_ROW_MT
VPX_CTRL_USE_TYPE(VP9D_SET_ROW_MT, int)
+#define VPX_CTRL_VP9_SET_LOOP_FILTER_OPT
+VPX_CTRL_USE_TYPE(VP9D_SET_LOOP_FILTER_OPT, int)
/*!\endcond */
/*! @} - end defgroup vp8_decoder */
diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm
index 88967a3..1e5f49b 100644
--- a/vpx_dsp/x86/subpel_variance_sse2.asm
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -155,7 +155,7 @@
lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx
- lea ecx, [GLOBAL(pw_8)]
+ lea ecx, [GLOBAL(pw_8)]
mov g_pw_8m, ecx
LOAD_IF_USED 0, 1 ; load eax, ecx back
diff --git a/vpx_util/vpx_thread.h b/vpx_util/vpx_thread.h
index 43a9780..19a8bfe 100644
--- a/vpx_util/vpx_thread.h
+++ b/vpx_util/vpx_thread.h
@@ -159,6 +159,23 @@
return 0;
}
+// Wakes every thread currently waiting on `condition`.
+// Returns 0 on success and non-zero if any Win32 call failed, matching the
+// POSIX pthread_cond_broadcast() convention.
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+  int ok = 1;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+  WakeAllConditionVariable(condition);
+#else
+  // Each successful zero-timeout wait on waiting_sem_ accounts for one thread
+  // in pthread_cond_wait; keep going until no waiter is left.
+  while (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
+    // a thread is waiting in pthread_cond_wait: allow it to be notified
+    // (SetEvent returns a BOOL, so normalize it before folding into ok)
+    ok &= (SetEvent(condition->signal_event_) != 0);
+    // wait until the event is consumed so the signaler cannot consume
+    // the event via its own pthread_cond_wait.
+    // WAIT_OBJECT_0 is the success result of the wait.
+    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) ==
+           WAIT_OBJECT_0);
+  }
+#endif
+  return !ok;
+}
+
static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
int ok = 1;
#ifdef USE_WINDOWS_CONDITION_VARIABLE
diff --git a/vpxdec.c b/vpxdec.c
index 522eda1..eaa28bd 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -100,19 +100,39 @@
ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)");
static const arg_def_t rowmtarg =
ARG_DEF(NULL, "row-mt", 1, "Enable multi-threading to run row-wise in VP9");
+static const arg_def_t lpfoptarg =
+ ARG_DEF(NULL, "lpf-opt", 1,
+ "Do loopfilter without waiting for all threads to sync.");
-static const arg_def_t *all_args[] = {
- &help, &codecarg, &use_yv12, &use_i420,
- &flipuvarg, &rawvideo, &noblitarg, &progressarg,
- &limitarg, &skiparg, &postprocarg, &summaryarg,
- &outputfile, &threadsarg, &frameparallelarg, &verbosearg,
- &scalearg, &fb_arg, &md5arg, &error_concealment,
- &continuearg,
+static const arg_def_t *all_args[] = { &help,
+ &codecarg,
+ &use_yv12,
+ &use_i420,
+ &flipuvarg,
+ &rawvideo,
+ &noblitarg,
+ &progressarg,
+ &limitarg,
+ &skiparg,
+ &postprocarg,
+ &summaryarg,
+ &outputfile,
+ &threadsarg,
+ &frameparallelarg,
+ &verbosearg,
+ &scalearg,
+ &fb_arg,
+ &md5arg,
+ &error_concealment,
+ &continuearg,
#if CONFIG_VP9_HIGHBITDEPTH
- &outbitdeptharg,
+ &outbitdeptharg,
#endif
- &svcdecodingarg, &framestatsarg, &rowmtarg, NULL
-};
+ &svcdecodingarg,
+ &framestatsarg,
+ &rowmtarg,
+ &lpfoptarg,
+ NULL };
#if CONFIG_VP8_DECODER
static const arg_def_t addnoise_level =
@@ -509,6 +529,7 @@
int ec_enabled = 0;
int keep_going = 0;
int enable_row_mt = 0;
+ int enable_lpf_opt = 0;
const VpxInterface *interface = NULL;
const VpxInterface *fourcc_interface = NULL;
uint64_t dx_time = 0;
@@ -633,6 +654,8 @@
}
} else if (arg_match(&arg, &rowmtarg, argi)) {
enable_row_mt = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &lpfoptarg, argi)) {
+ enable_lpf_opt = arg_parse_uint(&arg);
}
#if CONFIG_VP8_DECODER
else if (arg_match(&arg, &addnoise_level, argi)) {
@@ -764,6 +787,12 @@
vpx_codec_error(&decoder));
goto fail;
}
+ if (interface->fourcc == VP9_FOURCC &&
+ vpx_codec_control(&decoder, VP9D_SET_LOOP_FILTER_OPT, enable_lpf_opt)) {
+ fprintf(stderr, "Failed to set decoder in optimized loopfilter mode: %s\n",
+ vpx_codec_error(&decoder));
+ goto fail;
+ }
if (!quiet) fprintf(stderr, "%s\n", decoder.name);
#if CONFIG_VP8_DECODER