shithub: libvpx

--- a/vp9/common/vp9_loopfilter.c

+++ b/vp9/common/vp9_loopfilter.c

@@ -1625,6 +1625,17 @@

                        y_only);

+void vp9_loop_filter_data_reset(

+    LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,

+    struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]) {

+  lf_data->frame_buffer = frame_buffer;

+  lf_data->cm = cm;

+  lf_data->start = 0;

+  lf_data->stop = 0;

+  lf_data->y_only = 0;

+  vpx_memcpy(lf_data->planes, planes, sizeof(lf_data->planes));

+}

 int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {

   (void)unused;

   vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,

--- a/vp9/common/vp9_loopfilter.h

+++ b/vp9/common/vp9_loopfilter.h

@@ -124,10 +124,11 @@

   int start;

   int stop;

   int y_only;

-  struct VP9LfSyncData *lf_sync;

-  int num_lf_workers;

 } LFWorkerData;

+void vp9_loop_filter_data_reset(

+    LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,

+    struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]);

 // Operates on the rows described by 'lf_data'.

 int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused);

--- a/vp9/decoder/vp9_decodeframe.c

+++ b/vp9/decoder/vp9_decodeframe.c

@@ -902,11 +902,8 @@

     LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;

     // Be sure to sync as we might be resuming after a failed frame decode.

     winterface->sync(&pbi->lf_worker);

-    lf_data->frame_buffer = get_frame_new_buffer(cm);

-    lf_data->cm = cm;

-    vp9_copy(lf_data->planes, pbi->mb.plane);

-    lf_data->stop = 0;

-    lf_data->y_only = 0;

+    vp9_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm,

+                               pbi->mb.plane);

     vp9_loop_filter_frame_init(cm, cm->lf.filter_level);

@@ -1065,14 +1062,19 @@

     // use num_threads - 1 workers.

     CHECK_MEM_ERROR(cm, pbi->tile_workers,

                     vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));

+    // Ensure tile data offsets will be properly aligned. This may fail on

+    // platforms without DECLARE_ALIGNED().

+    assert((sizeof(*pbi->tile_worker_data) % 16) == 0);

+    CHECK_MEM_ERROR(cm, pbi->tile_worker_data,

+                    vpx_memalign(32, num_threads *

+                                 sizeof(*pbi->tile_worker_data)));

+    CHECK_MEM_ERROR(cm, pbi->tile_worker_info,

+                    vpx_malloc(num_threads * sizeof(*pbi->tile_worker_info)));

     for (i = 0; i < num_threads; ++i) {

       VP9Worker *const worker = &pbi->tile_workers[i];

       ++pbi->num_tile_workers;

       winterface->init(worker);

-      CHECK_MEM_ERROR(cm, worker->data1,

-                      vpx_memalign(32, sizeof(TileWorkerData)));

-      CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));

       if (i < num_threads - 1 && !winterface->reset(worker)) {

         vpx_internal_error(&cm->error, VPX_CODEC_ERROR,

                            "Tile decoder thread creation failed");

@@ -1082,8 +1084,11 @@

   // Reset tile decoding hook

   for (n = 0; n < num_workers; ++n) {

-    winterface->sync(&pbi->tile_workers[n]);

-    pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook;

+    VP9Worker *const worker = &pbi->tile_workers[n];

+    winterface->sync(worker);

+    worker->hook = (VP9WorkerHook)tile_worker_hook;

+    worker->data1 = &pbi->tile_worker_data[n];

+    worker->data2 = &pbi->tile_worker_info[n];

   // Note: this memset assumes above_context[0], [1] and [2]

@@ -1555,7 +1560,9 @@

     if (!xd->corrupted) {

       // If multiple threads are used to decode tiles, then we use those threads

       // to do parallel loopfiltering.

-      vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);

+      vp9_loop_filter_frame_mt(&pbi->lf_row_sync, new_fb, pbi->mb.plane, cm,

+                               pbi->tile_workers, pbi->num_tile_workers,

+                               cm->lf.filter_level, 0);

   } else {

     *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);

--- a/vp9/decoder/vp9_decoder.c

+++ b/vp9/decoder/vp9_decoder.c

@@ -106,9 +106,9 @@

   for (i = 0; i < pbi->num_tile_workers; ++i) {

     VP9Worker *const worker = &pbi->tile_workers[i];

     vp9_get_worker_interface()->end(worker);

-    vpx_free(worker->data1);

-    vpx_free(worker->data2);

+  vpx_free(pbi->tile_worker_data);

+  vpx_free(pbi->tile_worker_info);

   vpx_free(pbi->tile_workers);

   if (pbi->num_tile_workers > 0) {

--- a/vp9/decoder/vp9_decoder.h

+++ b/vp9/decoder/vp9_decoder.h

@@ -46,6 +46,8 @@

   VP9Worker lf_worker;

   VP9Worker *tile_workers;

+  TileWorkerData *tile_worker_data;

+  TileInfo *tile_worker_info;

   int num_tile_workers;

   TileData *tile_data;

--- a/vp9/decoder/vp9_dthread.c

+++ b/vp9/decoder/vp9_dthread.c

@@ -92,12 +92,12 @@

                                 VP9_COMMON *const cm,

                                 struct macroblockd_plane planes[MAX_MB_PLANE],

                                 int start, int stop, int y_only,

-                                VP9LfSync *const lf_sync, int num_lf_workers) {

+                                VP9LfSync *const lf_sync) {

   const int num_planes = y_only ? 1 : MAX_MB_PLANE;

   int r, c;  // SB row and col

   const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;

-  for (r = start; r < stop; r += num_lf_workers) {

+  for (r = start; r < stop; r += lf_sync->num_workers) {

     const int mi_row = r << MI_BLOCK_SIZE_LOG2;

     MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;

@@ -121,35 +121,35 @@

 // Row-based multi-threaded loopfilter hook

-static int loop_filter_row_worker(TileWorkerData *const tile_data,

-                                  void *unused) {

-  LFWorkerData *const lf_data = &tile_data->lfdata;

-  (void)unused;

+static int loop_filter_row_worker(VP9LfSync *const lf_sync,

+                                  LFWorkerData *const lf_data) {

   loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,

-                      lf_data->start, lf_data->stop, lf_data->y_only,

-                      lf_data->lf_sync, lf_data->num_lf_workers);

+                      lf_data->start, lf_data->stop, lf_data->y_only, lf_sync);

   return 1;

 // VP9 decoder: Implement multi-threaded loopfilter that uses the tile

 // threads.

-void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,

-                              VP9Decoder *pbi, VP9_COMMON *cm,

+void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,

+                              YV12_BUFFER_CONFIG *frame,

+                              struct macroblockd_plane planes[MAX_MB_PLANE],

+                              VP9_COMMON *cm,

+                              VP9Worker *workers, int nworkers,

                               int frame_filter_level,

                               int y_only) {

-  VP9LfSync *const lf_sync = &pbi->lf_row_sync;

   const VP9WorkerInterface *const winterface = vp9_get_worker_interface();

   // Number of superblock rows and cols

   const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;

   const int tile_cols = 1 << cm->log2_tile_cols;

-  const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);

+  const int num_workers = MIN(nworkers, tile_cols);

   int i;

   if (!frame_filter_level) return;

-  if (!lf_sync->sync_range || cm->last_height != cm->height) {

+  if (!lf_sync->sync_range || cm->last_height != cm->height ||

+      num_workers > lf_sync->num_workers) {

     vp9_loop_filter_dealloc(lf_sync);

-    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width);

+    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);

   vp9_loop_filter_frame_init(cm, frame_filter_level);

@@ -158,33 +158,27 @@

   vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);

   // Set up loopfilter thread data.

-  // The decoder is using num_workers instead of pbi->num_tile_workers

-  // because it has been observed that using more threads on the

-  // loopfilter, than there are tile columns in the frame will hurt

-  // performance on Android. This is because the system will only

-  // schedule the tile decode workers on cores equal to the number

-  // of tile columns. Then if the decoder tries to use more threads for the

-  // loopfilter, it will hurt performance because of contention. If the

-  // multithreading code changes in the future then the number of workers

-  // used by the loopfilter should be revisited.

+  // The decoder caps num_workers at the number of tile columns because it has

+  // been observed that using more loopfilter threads than there are tile

+  // columns hurts performance on Android. The system only schedules the tile

+  // decode workers on a number of cores equal to the number of tile columns,

+  // so any additional loopfilter threads just contend for those cores. If

+  // the multithreading code changes in the future then the number of workers

+  // used by the loopfilter should be revisited.

   for (i = 0; i < num_workers; ++i) {

-    VP9Worker *const worker = &pbi->tile_workers[i];

-    TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;

-    LFWorkerData *const lf_data = &tile_data->lfdata;

+    VP9Worker *const worker = &workers[i];

+    LFWorkerData *const lf_data = &lf_sync->lfdata[i];

     worker->hook = (VP9WorkerHook)loop_filter_row_worker;

+    worker->data1 = lf_sync;

+    worker->data2 = lf_data;

     // Loopfilter data

-    lf_data->frame_buffer = frame;

-    lf_data->cm = cm;

-    vp9_copy(lf_data->planes, pbi->mb.plane);

+    vp9_loop_filter_data_reset(lf_data, frame, cm, planes);

     lf_data->start = i;

     lf_data->stop = sb_rows;

-    lf_data->y_only = y_only;   // always do all planes in decoder

+    lf_data->y_only = y_only;

-    lf_data->lf_sync = lf_sync;

-    lf_data->num_lf_workers = num_workers;

     // Start loopfiltering

     if (i == num_workers - 1) {

       winterface->execute(worker);

@@ -195,7 +189,7 @@

   // Wait till all rows are finished

   for (i = 0; i < num_workers; ++i) {

-    winterface->sync(&pbi->tile_workers[i]);

+    winterface->sync(&workers[i]);

@@ -215,7 +209,7 @@

 // Allocate memory for lf row synchronization

 void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,

-                           int width) {

+                           int width, int num_workers) {

   lf_sync->rows = rows;

 #if CONFIG_MULTITHREAD

@@ -239,6 +233,10 @@

 #endif  // CONFIG_MULTITHREAD

+  CHECK_MEM_ERROR(cm, lf_sync->lfdata,

+                  vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));

+  lf_sync->num_workers = num_workers;

   CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,

                   vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));

@@ -265,6 +263,7 @@

       vpx_free(lf_sync->cond_);

 #endif  // CONFIG_MULTITHREAD

+    vpx_free(lf_sync->lfdata);

     vpx_free(lf_sync->cur_sb_col);

     // clear the structure as the source of this call may be a resize in which

     // case this call will be followed by an _alloc() which may fail.

--- a/vp9/decoder/vp9_dthread.h

+++ b/vp9/decoder/vp9_dthread.h

@@ -22,9 +22,6 @@

   struct VP9Common *cm;

   vp9_reader bit_reader;

   DECLARE_ALIGNED(16, struct macroblockd, xd);

-  // Row-based parallel loopfilter data

-  LFWorkerData lfdata;

 } TileWorkerData;

 // Loopfilter row synchronization

@@ -39,19 +36,25 @@

   // determined by testing. Currently, it is chosen to be a power-of-2 number.

   int sync_range;

   int rows;

+  // Row-based parallel loopfilter data

+  LFWorkerData *lfdata;

+  int num_workers;

 } VP9LfSync;

 // Allocate memory for loopfilter row synchronization.

 void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,

-                           int width);

+                           int width, int num_workers);

 // Deallocate loopfilter synchronization related mutex and data.

 void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);

 // Multi-threaded loopfilter that uses the tile threads.

-void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,

-                              struct VP9Decoder *pbi,

+void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,

+                              YV12_BUFFER_CONFIG *frame,

+                              struct macroblockd_plane planes[MAX_MB_PLANE],

                               struct VP9Common *cm,

+                              VP9Worker *workers, int num_workers,

                               int frame_filter_level,

                               int y_only);