ref: 01900edc40e30891fd00b4066209a686894dd1c5
parent: fe2fd37bb2121cbbb95d826089d88357e70c87a8
parent: 01483677e567dcb58077540449de7999ec05ffcb
author: James Zern <[email protected]>
date: Fri Oct 24 07:43:51 EDT 2014
Merge changes I8a9c9019,Ic7b2faa3,I44d42a50,I3f3a3924,I10747b32,I31b49c9e * changes: add vp9_loop_filter_data_reset; move LFWorkerData allocation to VP9LfSync; vp9_loop_filter_frame_mt: remove pbi dependency; vp9_loop_filter_frame_mt: pass planes directly; vp9_loop_filter_frame_mt: pass VP9LfSync directly; vp9: store TileWorkerData allocations separately
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -1625,6 +1625,17 @@
y_only);
}
+void vp9_loop_filter_data_reset(  // Points *lf_data at a frame buffer and codec context, and clears its row range and y_only flag.
+ LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+ struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]) {
+ lf_data->frame_buffer = frame_buffer;  // stores the pointer only; the buffer itself is not copied
+ lf_data->cm = cm;
+ lf_data->start = 0;  // start/stop are zeroed here; vp9_loop_filter_frame_mt assigns the real range right after calling this
+ lf_data->stop = 0;
+ lf_data->y_only = 0;  // default of 0; vp9_loop_filter_frame_mt overwrites it with its own y_only argument
+ vpx_memcpy(lf_data->planes, planes, sizeof(lf_data->planes));  // copies sizeof(lf_data->planes) bytes from the caller's plane array into lf_data
+}
+
int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
(void)unused;
vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -124,10 +124,11 @@
int start;
int stop;
int y_only;
-
- struct VP9LfSyncData *lf_sync;
- int num_lf_workers;
} LFWorkerData;
+
+void vp9_loop_filter_data_reset(
+ LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+ struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]);
// Operates on the rows described by 'lf_data'.
int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -902,11 +902,8 @@
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
// Be sure to sync as we might be resuming after a failed frame decode.
winterface->sync(&pbi->lf_worker);
- lf_data->frame_buffer = get_frame_new_buffer(cm);
- lf_data->cm = cm;
- vp9_copy(lf_data->planes, pbi->mb.plane);
- lf_data->stop = 0;
- lf_data->y_only = 0;
+ vp9_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm,
+ pbi->mb.plane);
vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
}
@@ -1065,14 +1062,19 @@
// use num_threads - 1 workers.
CHECK_MEM_ERROR(cm, pbi->tile_workers,
vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
+ // Ensure tile data offsets will be properly aligned. This may fail on
+ // platforms without DECLARE_ALIGNED().
+ assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
+ CHECK_MEM_ERROR(cm, pbi->tile_worker_data,
+ vpx_memalign(32, num_threads *
+ sizeof(*pbi->tile_worker_data)));
+ CHECK_MEM_ERROR(cm, pbi->tile_worker_info,
+ vpx_malloc(num_threads * sizeof(*pbi->tile_worker_info)));
for (i = 0; i < num_threads; ++i) {
VP9Worker *const worker = &pbi->tile_workers[i];
++pbi->num_tile_workers;
winterface->init(worker);
- CHECK_MEM_ERROR(cm, worker->data1,
- vpx_memalign(32, sizeof(TileWorkerData)));
- CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
if (i < num_threads - 1 && !winterface->reset(worker)) {
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Tile decoder thread creation failed");
@@ -1082,8 +1084,11 @@
// Reset tile decoding hook
for (n = 0; n < num_workers; ++n) {
- winterface->sync(&pbi->tile_workers[n]);
- pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook;
+ VP9Worker *const worker = &pbi->tile_workers[n];
+ winterface->sync(worker);
+ worker->hook = (VP9WorkerHook)tile_worker_hook;
+ worker->data1 = &pbi->tile_worker_data[n];
+ worker->data2 = &pbi->tile_worker_info[n];
}
// Note: this memset assumes above_context[0], [1] and [2]
@@ -1555,7 +1560,9 @@
if (!xd->corrupted) {
// If multiple threads are used to decode tiles, then we use those threads
// to do parallel loopfiltering.
- vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
+ vp9_loop_filter_frame_mt(&pbi->lf_row_sync, new_fb, pbi->mb.plane, cm,
+ pbi->tile_workers, pbi->num_tile_workers,
+ cm->lf.filter_level, 0);
}
} else {
*p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -106,9 +106,9 @@
for (i = 0; i < pbi->num_tile_workers; ++i) {
VP9Worker *const worker = &pbi->tile_workers[i];
vp9_get_worker_interface()->end(worker);
- vpx_free(worker->data1);
- vpx_free(worker->data2);
}
+ vpx_free(pbi->tile_worker_data);
+ vpx_free(pbi->tile_worker_info);
vpx_free(pbi->tile_workers);
if (pbi->num_tile_workers > 0) {
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -46,6 +46,8 @@
VP9Worker lf_worker;
VP9Worker *tile_workers;
+ TileWorkerData *tile_worker_data;
+ TileInfo *tile_worker_info;
int num_tile_workers;
TileData *tile_data;
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -92,12 +92,12 @@
VP9_COMMON *const cm,
struct macroblockd_plane planes[MAX_MB_PLANE],
int start, int stop, int y_only,
- VP9LfSync *const lf_sync, int num_lf_workers) {
+ VP9LfSync *const lf_sync) {
const int num_planes = y_only ? 1 : MAX_MB_PLANE;
int r, c; // SB row and col
const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
- for (r = start; r < stop; r += num_lf_workers) {
+ for (r = start; r < stop; r += lf_sync->num_workers) {
const int mi_row = r << MI_BLOCK_SIZE_LOG2;
MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;
@@ -121,35 +121,35 @@
}
// Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(TileWorkerData *const tile_data,
- void *unused) {
- LFWorkerData *const lf_data = &tile_data->lfdata;
- (void)unused;
+static int loop_filter_row_worker(VP9LfSync *const lf_sync,  // vp9_loop_filter_frame_mt installs this as worker->hook and sets worker->data1 = lf_sync
+ LFWorkerData *const lf_data) {  // ...and worker->data2 = &lf_sync->lfdata[i]; presumably the worker calls hook(data1, data2) - confirm
loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
- lf_data->start, lf_data->stop, lf_data->y_only,
- lf_data->lf_sync, lf_data->num_lf_workers);
+ lf_data->start, lf_data->stop, lf_data->y_only, lf_sync);  // NOTE(review): loop_filter_rows_mt now steps rows by lf_sync->num_workers, but vp9_loop_filter_frame_mt only reallocates when num_workers grows; if a later frame launches fewer workers the stride may exceed the launched count and skip rows - confirm
return 1;
}
// VP9 decoder: Implement multi-threaded loopfilter that uses the tile
// threads.
-void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
- VP9Decoder *pbi, VP9_COMMON *cm,
+void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
+ YV12_BUFFER_CONFIG *frame,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
+ VP9_COMMON *cm,
+ VP9Worker *workers, int nworkers,
int frame_filter_level,
int y_only) {
- VP9LfSync *const lf_sync = &pbi->lf_row_sync;
const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
// Number of superblock rows and cols
const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
const int tile_cols = 1 << cm->log2_tile_cols;
- const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
+ const int num_workers = MIN(nworkers, tile_cols);
int i;
if (!frame_filter_level) return;
- if (!lf_sync->sync_range || cm->last_height != cm->height) {
+ if (!lf_sync->sync_range || cm->last_height != cm->height ||
+ num_workers > lf_sync->num_workers) {
vp9_loop_filter_dealloc(lf_sync);
- vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width);
+ vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
}
vp9_loop_filter_frame_init(cm, frame_filter_level);
@@ -158,33 +158,27 @@
vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
// Set up loopfilter thread data.
- // The decoder is using num_workers instead of pbi->num_tile_workers
- // because it has been observed that using more threads on the
- // loopfilter, than there are tile columns in the frame will hurt
- // performance on Android. This is because the system will only
- // schedule the tile decode workers on cores equal to the number
- // of tile columns. Then if the decoder tries to use more threads for the
- // loopfilter, it will hurt performance because of contention. If the
- // multithreading code changes in the future then the number of workers
- // used by the loopfilter should be revisited.
+ // The decoder is capping num_workers because it has been observed that using
+ // more threads on the loopfilter than there are cores will hurt performance
+ // on Android. This is because the system will only schedule the tile decode
+ // workers on cores equal to the number of tile columns. Then if the decoder
+ // tries to use more threads for the loopfilter, it will hurt performance
+ // because of contention. If the multithreading code changes in the future
+ // then the number of workers used by the loopfilter should be revisited.
for (i = 0; i < num_workers; ++i) {
- VP9Worker *const worker = &pbi->tile_workers[i];
- TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
- LFWorkerData *const lf_data = &tile_data->lfdata;
+ VP9Worker *const worker = &workers[i];
+ LFWorkerData *const lf_data = &lf_sync->lfdata[i];
worker->hook = (VP9WorkerHook)loop_filter_row_worker;
+ worker->data1 = lf_sync;
+ worker->data2 = lf_data;
// Loopfilter data
- lf_data->frame_buffer = frame;
- lf_data->cm = cm;
- vp9_copy(lf_data->planes, pbi->mb.plane);
+ vp9_loop_filter_data_reset(lf_data, frame, cm, planes);
lf_data->start = i;
lf_data->stop = sb_rows;
- lf_data->y_only = y_only; // always do all planes in decoder
+ lf_data->y_only = y_only;
- lf_data->lf_sync = lf_sync;
- lf_data->num_lf_workers = num_workers;
-
// Start loopfiltering
if (i == num_workers - 1) {
winterface->execute(worker);
@@ -195,7 +189,7 @@
// Wait till all rows are finished
for (i = 0; i < num_workers; ++i) {
- winterface->sync(&pbi->tile_workers[i]);
+ winterface->sync(&workers[i]);
}
}
@@ -215,7 +209,7 @@
// Allocate memory for lf row synchronization
void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
- int width) {
+ int width, int num_workers) {
lf_sync->rows = rows;
#if CONFIG_MULTITHREAD
{
@@ -239,6 +233,10 @@
}
#endif // CONFIG_MULTITHREAD
+ CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+ vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
+ lf_sync->num_workers = num_workers;
+
CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
@@ -265,6 +263,7 @@
vpx_free(lf_sync->cond_);
}
#endif // CONFIG_MULTITHREAD
+ vpx_free(lf_sync->lfdata);
vpx_free(lf_sync->cur_sb_col);
// clear the structure as the source of this call may be a resize in which
// case this call will be followed by an _alloc() which may fail.
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/decoder/vp9_dthread.h
@@ -22,9 +22,6 @@
struct VP9Common *cm;
vp9_reader bit_reader;
DECLARE_ALIGNED(16, struct macroblockd, xd);
-
- // Row-based parallel loopfilter data
- LFWorkerData lfdata;
} TileWorkerData;
// Loopfilter row synchronization
@@ -39,19 +36,25 @@
// determined by testing. Currently, it is chosen to be a power-of-2 number.
int sync_range;
int rows;
+
+ // Row-based parallel loopfilter data
+ LFWorkerData *lfdata;
+ int num_workers;
} VP9LfSync;
// Allocate memory for loopfilter row synchronization.
void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
- int width);
+ int width, int num_workers);
// Deallocate loopfilter synchronization related mutex and data.
void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
// Multi-threaded loopfilter that uses the tile threads.
-void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
- struct VP9Decoder *pbi,
+void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
+ YV12_BUFFER_CONFIG *frame,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
struct VP9Common *cm,
+ VP9Worker *workers, int num_workers,
int frame_filter_level,
int y_only);