shithub: libvpx

--- a/vp9/common/vp9_loopfilter.c

+++ b/vp9/common/vp9_loopfilter.c

@@ -1149,10 +1149,10 @@

 #endif  // CONFIG_VP9_HIGHBITDEPTH

-static void filter_block_plane_non420(VP9_COMMON *cm,

-                                      struct macroblockd_plane *plane,

-                                      MODE_INFO *mi_8x8,

-                                      int mi_row, int mi_col) {

+void vp9_filter_block_plane_non420(VP9_COMMON *cm,

+                                   struct macroblockd_plane *plane,

+                                   MODE_INFO *mi_8x8,

+                                   int mi_row, int mi_col) {

   const int ss_x = plane->subsampling_x;

   const int ss_y = plane->subsampling_y;

   const int row_step = 1 << ss_y;

@@ -1598,8 +1598,8 @@

         if (use_420)

           vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);

         else

-          filter_block_plane_non420(cm, &planes[plane], mi + mi_col,

-                                    mi_row, mi_col);

+          vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,

+                                        mi_row, mi_col);

--- a/vp9/common/vp9_loopfilter.h

+++ b/vp9/common/vp9_loopfilter.h

@@ -97,6 +97,11 @@

                             int mi_row,

                             LOOP_FILTER_MASK *lfm);

+void vp9_filter_block_plane_non420(struct VP9Common *cm,

+                                   struct macroblockd_plane *plane,

+                                   MODE_INFO *mi_8x8,

+                                   int mi_row, int mi_col);

 void vp9_loop_filter_init(struct VP9Common *cm);

 // Update the loop filter for the current frame.

--- /dev/null

+++ b/vp9/common/vp9_loopfilter_thread.c

@@ -1,0 +1,291 @@

+/*

+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vpx_config.h"

+#include "vpx_mem/vpx_mem.h"

+#include "vp9/common/vp9_loopfilter_thread.h"

+#include "vp9/common/vp9_reconinter.h"

+#if CONFIG_MULTITHREAD

+// Acquires |mutex|. Polls with pthread_mutex_trylock() up to kMaxTryLocks

+// times and, if every attempt fails, falls back to a blocking

+// pthread_mutex_lock().

+static INLINE void mutex_lock(pthread_mutex_t *const mutex) {

+  const int kMaxTryLocks = 4000;

+  int locked = 0;

+  int i;

+  for (i = 0; i < kMaxTryLocks; ++i) {

+    // pthread_mutex_trylock() returns 0 when the lock was acquired.

+    if (!pthread_mutex_trylock(mutex)) {

+      locked = 1;

+      break;

+    }

+  }

+  // All polling attempts failed: block until the mutex becomes available.

+  if (!locked)

+    pthread_mutex_lock(mutex);

+}

+#endif  // CONFIG_MULTITHREAD

+// Blocks the caller until superblock row r - 1 has been filtered far enough

+// for column c of row r to proceed. The check is made only when r is nonzero

+// and c is a multiple of sync_range (the mask test relies on sync_range being

+// a power of two). The caller then waits on cond_[r - 1] for as long as

+// c > cur_sb_col[r - 1] - sync_range. Without CONFIG_MULTITHREAD this is a

+// no-op.

+// NOTE(review): cur_sb_col[r - 1] is advanced only by sync_write() for row

+// r - 1; if that row is never filtered this wait does not return -- confirm

+// callers guarantee it.

+static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {

+#if CONFIG_MULTITHREAD

+  const int nsync = lf_sync->sync_range;

+  if (r && !(c & (nsync - 1))) {

+    pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1];

+    mutex_lock(mutex);

+    // Re-test the predicate after every wakeup, under the row's mutex.

+    while (c > lf_sync->cur_sb_col[r - 1] - nsync) {

+      pthread_cond_wait(&lf_sync->cond_[r - 1], mutex);

+    }

+    pthread_mutex_unlock(mutex);

+  }

+#else

+  (void)lf_sync;

+  (void)r;

+  (void)c;

+#endif  // CONFIG_MULTITHREAD

+}

+// Publishes the progress of superblock row r after column c has been

+// filtered. For every column except the last one (c < sb_cols - 1) the value

+// stored is c, and it is stored only when c is a multiple of sync_range. For

+// the last column the value stored is sb_cols + sync_range, which makes the

+// wait condition in sync_read() false for every column of the next row. The

+// store and the pthread_cond_signal() on cond_[r] happen under mutex_[r].

+// Without CONFIG_MULTITHREAD this is a no-op.

+static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,

+                              const int sb_cols) {

+#if CONFIG_MULTITHREAD

+  const int nsync = lf_sync->sync_range;

+  int cur;

+  // Only signal when there are enough filtered SB for next row to run.

+  int sig = 1;

+  if (c < sb_cols - 1) {

+    cur = c;

+    if (c % nsync)

+      sig = 0;

+  } else {

+    // Last column of the row: mark the whole row as done.

+    cur = sb_cols + nsync;

+  }

+  if (sig) {

+    mutex_lock(&lf_sync->mutex_[r]);

+    lf_sync->cur_sb_col[r] = cur;

+    pthread_cond_signal(&lf_sync->cond_[r]);

+    pthread_mutex_unlock(&lf_sync->mutex_[r]);

+  }

+#else

+  (void)lf_sync;

+  (void)r;

+  (void)c;

+  (void)sb_cols;

+#endif  // CONFIG_MULTITHREAD

+}

+// Implement row loopfiltering for each thread.

+// Starting at mode-info row |start|, the worker handles one superblock row,

+// then skips ahead by num_workers * MI_BLOCK_SIZE mode-info rows, until

+// |stop| is reached. For each superblock column it waits on the row above

+// (sync_read), sets up the destination planes and the filter mask, filters

+// num_planes planes (1 when y_only is set, MAX_MB_PLANE otherwise) and then

+// publishes its progress (sync_write).

+// NOTE(review): this path always calls vp9_filter_block_plane(); the

+// single-threaded caller in vp9_loopfilter.c switches to

+// vp9_filter_block_plane_non420() when use_420 is false -- confirm that

+// non-4:2:0 frames are never routed here.

+static INLINE

+void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer,

+                             VP9_COMMON *const cm,

+                             struct macroblockd_plane planes[MAX_MB_PLANE],

+                             int start, int stop, int y_only,

+                             VP9LfSync *const lf_sync) {

+  const int num_planes = y_only ? 1 : MAX_MB_PLANE;

+  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;

+  int mi_row, mi_col;

+  for (mi_row = start; mi_row < stop;

+       mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {

+    MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;

+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {

+      // Superblock row/column indices used by the synchronization helpers.

+      const int r = mi_row >> MI_BLOCK_SIZE_LOG2;

+      const int c = mi_col >> MI_BLOCK_SIZE_LOG2;

+      LOOP_FILTER_MASK lfm;

+      int plane;

+      sync_read(lf_sync, r, c);

+      vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);

+      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);

+      for (plane = 0; plane < num_planes; ++plane) {

+        vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);

+      }

+      sync_write(lf_sync, r, c, sb_cols);

+    }

+  }

+}

+// Row-based multi-threaded loopfilter hook

+// Installed as a VP9WorkerHook by loop_filter_rows_mt(), with |lf_sync| as

+// data1 and |lf_data| as data2. Unpacks |lf_data| and runs

+// thread_loop_filter_rows() over the rows assigned to this worker. Always

+// returns 1.

+static int loop_filter_row_worker(VP9LfSync *const lf_sync,

+                                  LFWorkerData *const lf_data) {

+  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,

+                          lf_data->start, lf_data->stop, lf_data->y_only,

+                          lf_sync);

+  return 1;

+}

+// Loop-filters mode-info rows [start, stop) of |frame| using up to |nworkers|

+// workers (capped at the number of tile columns) and returns once all of

+// them have finished. Worker i starts at row start + i * MI_BLOCK_SIZE; the

+// last worker is run with execute(), the others with launch().

+// |lf_sync| is (re)allocated when it has never been set up, when the frame

+// height changed, or when more workers are needed than it was sized for.

+// |start| is expected to be a multiple of MI_BLOCK_SIZE (the visible caller

+// passes 0 or a value rounded down to a multiple of 8).

+static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,

+                                VP9_COMMON *cm,

+                                struct macroblockd_plane planes[MAX_MB_PLANE],

+                                int start, int stop, int y_only,

+                                VP9Worker *workers, int nworkers,

+                                VP9LfSync *lf_sync) {

+  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();

+  // Number of superblock rows and cols

+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;

+  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;

+  // Superblock row that contains |start|.

+  const int start_sb_row = start >> MI_BLOCK_SIZE_LOG2;

+  // Decoder may allocate more threads than number of tiles based on user's

+  // input.

+  const int tile_cols = 1 << cm->log2_tile_cols;

+  const int num_workers = MIN(nworkers, tile_cols);

+  int i;

+  if (!lf_sync->sync_range || cm->last_height != cm->height ||

+      num_workers > lf_sync->num_workers) {

+    vp9_loop_filter_dealloc(lf_sync);

+    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);

+  }

+  // Initialize cur_sb_col to -1 for all SB rows.

+  vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);

+  // When filtering does not begin at the top of the frame (partial_frame),

+  // no worker ever filters the row above |start|, so sync_read() on the first

+  // filtered row would wait on it forever. Mark that row as already complete,

+  // using the same value sync_write() stores after a row's last column.

+  if (start_sb_row > 0 && start_sb_row <= sb_rows) {

+    lf_sync->cur_sb_col[start_sb_row - 1] = sb_cols + lf_sync->sync_range;

+  }

+  // Set up loopfilter thread data.

+  // The decoder is capping num_workers because it has been observed that using

+  // more threads on the loopfilter than there are cores will hurt performance

+  // on Android. This is because the system will only schedule the tile decode

+  // workers on cores equal to the number of tile columns. Then if the decoder

+  // tries to use more threads for the loopfilter, it will hurt performance

+  // because of contention. If the multithreading code changes in the future

+  // then the number of workers used by the loopfilter should be revisited.

+  for (i = 0; i < num_workers; ++i) {

+    VP9Worker *const worker = &workers[i];

+    LFWorkerData *const lf_data = &lf_sync->lfdata[i];

+    worker->hook = (VP9WorkerHook)loop_filter_row_worker;

+    worker->data1 = lf_sync;

+    worker->data2 = lf_data;

+    // Loopfilter data

+    vp9_loop_filter_data_reset(lf_data, frame, cm, planes);

+    lf_data->start = start + i * MI_BLOCK_SIZE;

+    lf_data->stop = stop;

+    lf_data->y_only = y_only;

+    // Start loopfiltering

+    if (i == num_workers - 1) {

+      winterface->execute(worker);

+    } else {

+      winterface->launch(worker);

+    }

+  }

+  // Wait till all rows are finished

+  for (i = 0; i < num_workers; ++i) {

+    winterface->sync(&workers[i]);

+  }

+}

+// Multi-threaded loop filter entry point. Returns immediately when

+// |frame_filter_level| is 0. Otherwise selects the mode-info rows to filter,

+// initializes the per-frame filter state with vp9_loop_filter_frame_init()

+// and hands the row range to loop_filter_rows_mt().

+// By default all cm->mi_rows rows are filtered. When |partial_frame| is set

+// and the frame has more than 8 mode-info rows, filtering starts at

+// mi_rows / 2 rounded down to a multiple of 8 and covers

+// MAX(mi_rows / 8, 8) rows.

+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,

+                              VP9_COMMON *cm,

+                              struct macroblockd_plane planes[MAX_MB_PLANE],

+                              int frame_filter_level,

+                              int y_only, int partial_frame,

+                              VP9Worker *workers, int num_workers,

+                              VP9LfSync *lf_sync) {

+  int start_mi_row, end_mi_row, mi_rows_to_filter;

+  if (!frame_filter_level) return;

+  start_mi_row = 0;

+  mi_rows_to_filter = cm->mi_rows;

+  if (partial_frame && cm->mi_rows > 8) {

+    start_mi_row = cm->mi_rows >> 1;

+    // Clear the low three bits: round down to a multiple of 8.

+    start_mi_row &= 0xfffffff8;

+    mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);

+  }

+  end_mi_row = start_mi_row + mi_rows_to_filter;

+  vp9_loop_filter_frame_init(cm, frame_filter_level);

+  loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row,

+                      y_only, workers, num_workers, lf_sync);

+}

+// Maps the frame width to the sync range (nsync) used for row

+// synchronization: 1 below 640, 2 up to 1280, 4 up to 4096, 8 beyond that.

+static INLINE int get_sync_range(int width) {

+  // The values were picked by testing; for 4k video, for example, 4 gave the

+  // best performance.

+  if (width < 640) return 1;

+  if (width <= 1280) return 2;

+  if (width <= 4096) return 4;

+  return 8;

+}

+// Allocate memory for lf row synchronization

+// Sets up |lf_sync| for |rows| superblock rows and |num_workers| workers:

+//  - with CONFIG_MULTITHREAD, one mutex and one condition variable per row,

+//    each initialized with default attributes;

+//  - one LFWorkerData per worker;

+//  - one cur_sb_col entry per row.

+// Also records |rows| and |num_workers| and derives sync_range from |width|.

+// Every allocation goes through CHECK_MEM_ERROR(cm, ...); what happens on

+// failure is defined by that macro, which is not part of this file.

+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,

+                           int width, int num_workers) {

+  // vp9_loop_filter_dealloc() uses this count when destroying the per-row

+  // mutexes and condition variables.

+  lf_sync->rows = rows;

+#if CONFIG_MULTITHREAD

+  {

+    int i;

+    CHECK_MEM_ERROR(cm, lf_sync->mutex_,

+                    vpx_malloc(sizeof(*lf_sync->mutex_) * rows));

+    if (lf_sync->mutex_) {

+      for (i = 0; i < rows; ++i) {

+        pthread_mutex_init(&lf_sync->mutex_[i], NULL);

+      }

+    }

+    CHECK_MEM_ERROR(cm, lf_sync->cond_,

+                    vpx_malloc(sizeof(*lf_sync->cond_) * rows));

+    if (lf_sync->cond_) {

+      for (i = 0; i < rows; ++i) {

+        pthread_cond_init(&lf_sync->cond_[i], NULL);

+      }

+    }

+  }

+#endif  // CONFIG_MULTITHREAD

+  CHECK_MEM_ERROR(cm, lf_sync->lfdata,

+                  vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));

+  lf_sync->num_workers = num_workers;

+  CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,

+                  vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));

+  // Set up nsync.

+  lf_sync->sync_range = get_sync_range(width);

+}

+// Deallocate lf synchronization related mutex and data

+// Does nothing when |lf_sync| is NULL. Otherwise, with CONFIG_MULTITHREAD,

+// destroys and frees the lf_sync->rows mutexes and condition variables (each

+// array is skipped if its pointer is NULL), frees lfdata and cur_sb_col, and

+// finally zeroes the whole structure.

+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {

+  if (lf_sync != NULL) {

+#if CONFIG_MULTITHREAD

+    int i;

+    if (lf_sync->mutex_ != NULL) {

+      for (i = 0; i < lf_sync->rows; ++i) {

+        pthread_mutex_destroy(&lf_sync->mutex_[i]);

+      }

+      vpx_free(lf_sync->mutex_);

+    }

+    if (lf_sync->cond_ != NULL) {

+      for (i = 0; i < lf_sync->rows; ++i) {

+        pthread_cond_destroy(&lf_sync->cond_[i]);

+      }

+      vpx_free(lf_sync->cond_);

+    }

+#endif  // CONFIG_MULTITHREAD

+    vpx_free(lf_sync->lfdata);

+    vpx_free(lf_sync->cur_sb_col);

+    // clear the structure as the source of this call may be a resize in which

+    // case this call will be followed by an _alloc() which may fail.

+    vp9_zero(*lf_sync);

+  }

+}

--- /dev/null

+++ b/vp9/common/vp9_loopfilter_thread.h

@@ -1,0 +1,53 @@

+/*

+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VP9_COMMON_VP9_LOOPFILTER_THREAD_H_

+#define VP9_COMMON_VP9_LOOPFILTER_THREAD_H_

+#include "./vpx_config.h"

+#include "vp9/common/vp9_loopfilter.h"

+#include "vp9/common/vp9_thread.h"

+struct VP9Common;

+// Loopfilter row synchronization

+// State shared by the loop filter workers; set up by vp9_loop_filter_alloc()

+// and torn down by vp9_loop_filter_dealloc().

+typedef struct VP9LfSyncData {

+#if CONFIG_MULTITHREAD

+  // One mutex / condition variable per superblock row (|rows| entries each).

+  pthread_mutex_t *mutex_;

+  pthread_cond_t *cond_;

+#endif

+  // Per-row progress marker (|rows| entries): the superblock column index most

+  // recently published for each row by the worker filtering it.

+  int *cur_sb_col;

+  // The optimal sync_range for different resolution and platform should be

+  // determined by testing. Currently, it is chosen to be a power-of-2 number.

+  int sync_range;

+  // Number of superblock rows the per-row arrays were allocated for.

+  int rows;

+  // Row-based parallel loopfilter data

+  // (|num_workers| entries, one per worker).

+  LFWorkerData *lfdata;

+  int num_workers;

+} VP9LfSync;

+// Allocate memory for loopfilter row synchronization.

+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, struct VP9Common *cm, int rows,

+                           int width, int num_workers);

+// Deallocate loopfilter synchronization related mutex and data.

+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);

+// Multi-threaded loopfilter that uses the tile threads.

+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,

+                              struct VP9Common *cm,

+                              struct macroblockd_plane planes[MAX_MB_PLANE],

+                              int frame_filter_level,

+                              int y_only, int partial_frame,

+                              VP9Worker *workers, int num_workers,

+                              VP9LfSync *lf_sync);

+#endif  // VP9_COMMON_VP9_LOOPFILTER_THREAD_H_

--- a/vp9/decoder/vp9_decodeframe.c

+++ b/vp9/decoder/vp9_decodeframe.c

@@ -36,7 +36,6 @@

 #include "vp9/decoder/vp9_decodemv.h"

 #include "vp9/decoder/vp9_decoder.h"

 #include "vp9/decoder/vp9_dsubexp.h"

-#include "vp9/decoder/vp9_dthread.h"

 #include "vp9/decoder/vp9_read_bit_buffer.h"

 #include "vp9/decoder/vp9_reader.h"

@@ -1591,9 +1590,9 @@

     if (!xd->corrupted) {

       // If multiple threads are used to decode tiles, then we use those threads

       // to do parallel loopfiltering.

-      vp9_loop_filter_frame_mt(&pbi->lf_row_sync, new_fb, pbi->mb.plane, cm,

-                               pbi->tile_workers, pbi->num_tile_workers,

-                               cm->lf.filter_level, 0);

+      vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level,

+                               0, 0, pbi->tile_workers, pbi->num_tile_workers,

+                               &pbi->lf_row_sync);

     } else {

       vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,

                          "Decode failed. Frame data is corrupted.");

--- a/vp9/decoder/vp9_decoder.c

+++ b/vp9/decoder/vp9_decoder.c

@@ -32,7 +32,6 @@

 #include "vp9/decoder/vp9_decodeframe.h"

 #include "vp9/decoder/vp9_decoder.h"

 #include "vp9/decoder/vp9_detokenize.h"

-#include "vp9/decoder/vp9_dthread.h"

 static void initialize_dec(void) {

   static volatile int init_done = 0;

--- a/vp9/decoder/vp9_decoder.h

+++ b/vp9/decoder/vp9_decoder.h

@@ -15,13 +15,12 @@

 #include "vpx/vpx_codec.h"

 #include "vpx_scale/yv12config.h"

+#include "vp9/common/vp9_loopfilter_thread.h"

 #include "vp9/common/vp9_onyxc_int.h"

 #include "vp9/common/vp9_ppflags.h"

 #include "vp9/common/vp9_thread.h"

+#include "vp9/decoder/vp9_reader.h"

-#include "vp9/decoder/vp9_dthread.h"

 #ifdef __cplusplus

 extern "C" {

 #endif

@@ -32,6 +31,13 @@

   vp9_reader bit_reader;

   DECLARE_ALIGNED(16, MACROBLOCKD, xd);

 } TileData;

+typedef struct TileWorkerData {

+  VP9_COMMON *cm;

+  vp9_reader bit_reader;

+  DECLARE_ALIGNED(16, MACROBLOCKD, xd);

+  struct vpx_internal_error_info error_info;

+} TileWorkerData;

 typedef struct VP9Decoder {

   DECLARE_ALIGNED(16, MACROBLOCKD, mb);

--- a/vp9/decoder/vp9_dthread.c

+++ /dev/null

@@ -1,272 +1,0 @@

-/*

- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include "./vpx_config.h"

-#include "vpx_mem/vpx_mem.h"

-#include "vp9/common/vp9_reconinter.h"

-#include "vp9/decoder/vp9_dthread.h"

-#include "vp9/decoder/vp9_decoder.h"

-#if CONFIG_MULTITHREAD

-static INLINE void mutex_lock(pthread_mutex_t *const mutex) {

-  const int kMaxTryLocks = 4000;

-  int locked = 0;

-  int i;

-  for (i = 0; i < kMaxTryLocks; ++i) {

-    if (!pthread_mutex_trylock(mutex)) {

-      locked = 1;

-      break;

-    }

-  }

-  if (!locked)

-    pthread_mutex_lock(mutex);

-}

-#endif  // CONFIG_MULTITHREAD

-static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {

-#if CONFIG_MULTITHREAD

-  const int nsync = lf_sync->sync_range;

-  if (r && !(c & (nsync - 1))) {

-    pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1];

-    mutex_lock(mutex);

-    while (c > lf_sync->cur_sb_col[r - 1] - nsync) {

-      pthread_cond_wait(&lf_sync->cond_[r - 1], mutex);

-    }

-    pthread_mutex_unlock(mutex);

-  }

-#else

-  (void)lf_sync;

-  (void)r;

-  (void)c;

-#endif  // CONFIG_MULTITHREAD

-}

-static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,

-                              const int sb_cols) {

-#if CONFIG_MULTITHREAD

-  const int nsync = lf_sync->sync_range;

-  int cur;

-  // Only signal when there are enough filtered SB for next row to run.

-  int sig = 1;

-  if (c < sb_cols - 1) {

-    cur = c;

-    if (c % nsync)

-      sig = 0;

-  } else {

-    cur = sb_cols + nsync;

-  }

-  if (sig) {

-    mutex_lock(&lf_sync->mutex_[r]);

-    lf_sync->cur_sb_col[r] = cur;

-    pthread_cond_signal(&lf_sync->cond_[r]);

-    pthread_mutex_unlock(&lf_sync->mutex_[r]);

-  }

-#else

-  (void)lf_sync;

-  (void)r;

-  (void)c;

-  (void)sb_cols;

-#endif  // CONFIG_MULTITHREAD

-}

-// Implement row loopfiltering for each thread.

-static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,

-                                VP9_COMMON *const cm,

-                                struct macroblockd_plane planes[MAX_MB_PLANE],

-                                int start, int stop, int y_only,

-                                VP9LfSync *const lf_sync) {

-  const int num_planes = y_only ? 1 : MAX_MB_PLANE;

-  int r, c;  // SB row and col

-  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;

-  for (r = start; r < stop; r += lf_sync->num_workers) {

-    const int mi_row = r << MI_BLOCK_SIZE_LOG2;

-    MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;

-    for (c = 0; c < sb_cols; ++c) {

-      const int mi_col = c << MI_BLOCK_SIZE_LOG2;

-      LOOP_FILTER_MASK lfm;

-      int plane;

-      sync_read(lf_sync, r, c);

-      vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);

-      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);

-      for (plane = 0; plane < num_planes; ++plane) {

-        vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);

-      }

-      sync_write(lf_sync, r, c, sb_cols);

-    }

-  }

-}

-// Row-based multi-threaded loopfilter hook

-static int loop_filter_row_worker(VP9LfSync *const lf_sync,

-                                  LFWorkerData *const lf_data) {

-  loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,

-                      lf_data->start, lf_data->stop, lf_data->y_only, lf_sync);

-  return 1;

-}

-// VP9 decoder: Implement multi-threaded loopfilter that uses the tile

-// threads.

-void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,

-                              YV12_BUFFER_CONFIG *frame,

-                              struct macroblockd_plane planes[MAX_MB_PLANE],

-                              VP9_COMMON *cm,

-                              VP9Worker *workers, int nworkers,

-                              int frame_filter_level,

-                              int y_only) {

-  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();

-  // Number of superblock rows and cols

-  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;

-  const int tile_cols = 1 << cm->log2_tile_cols;

-  const int num_workers = MIN(nworkers, tile_cols);

-  int i;

-  if (!frame_filter_level) return;

-  if (!lf_sync->sync_range || cm->last_height != cm->height ||

-      num_workers > lf_sync->num_workers) {

-    vp9_loop_filter_dealloc(lf_sync);

-    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);

-  }

-  vp9_loop_filter_frame_init(cm, frame_filter_level);

-  // Initialize cur_sb_col to -1 for all SB rows.

-  vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);

-  // Set up loopfilter thread data.

-  // The decoder is capping num_workers because it has been observed that using

-  // more threads on the loopfilter than there are cores will hurt performance

-  // on Android. This is because the system will only schedule the tile decode

-  // workers on cores equal to the number of tile columns. Then if the decoder

-  // tries to use more threads for the loopfilter, it will hurt performance

-  // because of contention. If the multithreading code changes in the future

-  // then the number of workers used by the loopfilter should be revisited.

-  for (i = 0; i < num_workers; ++i) {

-    VP9Worker *const worker = &workers[i];

-    LFWorkerData *const lf_data = &lf_sync->lfdata[i];

-    worker->hook = (VP9WorkerHook)loop_filter_row_worker;

-    worker->data1 = lf_sync;

-    worker->data2 = lf_data;

-    // Loopfilter data

-    vp9_loop_filter_data_reset(lf_data, frame, cm, planes);

-    lf_data->start = i;

-    lf_data->stop = sb_rows;

-    lf_data->y_only = y_only;

-    // Start loopfiltering

-    if (i == num_workers - 1) {

-      winterface->execute(worker);

-    } else {

-      winterface->launch(worker);

-    }

-  }

-  // Wait till all rows are finished

-  for (i = 0; i < num_workers; ++i) {

-    winterface->sync(&workers[i]);

-  }

-}

-// Set up nsync by width.

-static int get_sync_range(int width) {

-  // nsync numbers are picked by testing. For example, for 4k

-  // video, using 4 gives best performance.

-  if (width < 640)

-    return 1;

-  else if (width <= 1280)

-    return 2;

-  else if (width <= 4096)

-    return 4;

-  else

-    return 8;

-}

-// Allocate memory for lf row synchronization

-void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,

-                           int width, int num_workers) {

-  lf_sync->rows = rows;

-#if CONFIG_MULTITHREAD

-  {

-    int i;

-    CHECK_MEM_ERROR(cm, lf_sync->mutex_,

-                    vpx_malloc(sizeof(*lf_sync->mutex_) * rows));

-    if (lf_sync->mutex_) {

-      for (i = 0; i < rows; ++i) {

-        pthread_mutex_init(&lf_sync->mutex_[i], NULL);

-      }

-    }

-    CHECK_MEM_ERROR(cm, lf_sync->cond_,

-                    vpx_malloc(sizeof(*lf_sync->cond_) * rows));

-    if (lf_sync->cond_) {

-      for (i = 0; i < rows; ++i) {

-        pthread_cond_init(&lf_sync->cond_[i], NULL);

-      }

-    }

-  }

-#endif  // CONFIG_MULTITHREAD

-  CHECK_MEM_ERROR(cm, lf_sync->lfdata,

-                  vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));

-  lf_sync->num_workers = num_workers;

-  CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,

-                  vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));

-  // Set up nsync.

-  lf_sync->sync_range = get_sync_range(width);

-}

-// Deallocate lf synchronization related mutex and data

-void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {

-  if (lf_sync != NULL) {

-#if CONFIG_MULTITHREAD

-    int i;

-    if (lf_sync->mutex_ != NULL) {

-      for (i = 0; i < lf_sync->rows; ++i) {

-        pthread_mutex_destroy(&lf_sync->mutex_[i]);

-      }

-      vpx_free(lf_sync->mutex_);

-    }

-    if (lf_sync->cond_ != NULL) {

-      for (i = 0; i < lf_sync->rows; ++i) {

-        pthread_cond_destroy(&lf_sync->cond_[i]);

-      }

-      vpx_free(lf_sync->cond_);

-    }

-#endif  // CONFIG_MULTITHREAD

-    vpx_free(lf_sync->lfdata);

-    vpx_free(lf_sync->cur_sb_col);

-    // clear the structure as the source of this call may be a resize in which

-    // case this call will be followed by an _alloc() which may fail.

-    vp9_zero(*lf_sync);

-  }

-}

--- a/vp9/decoder/vp9_dthread.h

+++ /dev/null

@@ -1,63 +1,0 @@

-/*

- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#ifndef VP9_DECODER_VP9_DTHREAD_H_

-#define VP9_DECODER_VP9_DTHREAD_H_

-#include "./vpx_config.h"

-#include "vp9/common/vp9_thread.h"

-#include "vp9/decoder/vp9_reader.h"

-#include "vpx/internal/vpx_codec_internal.h"

-struct VP9Common;

-struct VP9Decoder;

-typedef struct TileWorkerData {

-  struct VP9Common *cm;

-  vp9_reader bit_reader;

-  DECLARE_ALIGNED(16, struct macroblockd, xd);

-  struct vpx_internal_error_info error_info;

-} TileWorkerData;

-// Loopfilter row synchronization

-typedef struct VP9LfSyncData {

-#if CONFIG_MULTITHREAD

-  pthread_mutex_t *mutex_;

-  pthread_cond_t *cond_;

-#endif

-  // Allocate memory to store the loop-filtered superblock index in each row.

-  int *cur_sb_col;

-  // The optimal sync_range for different resolution and platform should be

-  // determined by testing. Currently, it is chosen to be a power-of-2 number.

-  int sync_range;

-  int rows;

-  // Row-based parallel loopfilter data

-  LFWorkerData *lfdata;

-  int num_workers;

-} VP9LfSync;

-// Allocate memory for loopfilter row synchronization.

-void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,

-                           int width, int num_workers);

-// Deallocate loopfilter synchronization related mutex and data.

-void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);

-// Multi-threaded loopfilter that uses the tile threads.

-void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,

-                              YV12_BUFFER_CONFIG *frame,

-                              struct macroblockd_plane planes[MAX_MB_PLANE],

-                              struct VP9Common *cm,

-                              VP9Worker *workers, int num_workers,

-                              int frame_filter_level,

-                              int y_only);

-#endif  // VP9_DECODER_VP9_DTHREAD_H_

--- a/vp9/encoder/vp9_encoder.c

+++ b/vp9/encoder/vp9_encoder.c

@@ -1786,7 +1786,7 @@

   for (t = 0; t < cpi->num_workers; ++t) {

     VP9Worker *const worker = &cpi->workers[t];

-    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;

+    EncWorkerData *const thread_data = &cpi->tile_thr_data[t];

     // Deallocate allocated threads.

     vp9_get_worker_interface()->end(worker);

@@ -1797,11 +1797,13 @@

       vp9_free_pc_tree(thread_data->td);

       vpx_free(thread_data->td);

-    vpx_free(worker->data1);

+  vpx_free(cpi->tile_thr_data);

   vpx_free(cpi->workers);

+  if (cpi->num_workers > 1)

+    vp9_loop_filter_dealloc(&cpi->lf_row_sync);

   dealloc_compressor_data(cpi);

   for (i = 0; i < sizeof(cpi->mbgraph_stats) /

@@ -2437,7 +2439,13 @@

   if (lf->filter_level > 0) {

-    vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);

+    if (cpi->num_workers > 1)

+      vp9_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,

+                               lf->filter_level, 0, 0,

+                               cpi->workers, cpi->num_workers,

+                               &cpi->lf_row_sync);

+    else

+      vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);

   vp9_extend_frame_inner_borders(cm->frame_to_show);

--- a/vp9/encoder/vp9_encoder.h

+++ b/vp9/encoder/vp9_encoder.h

@@ -19,6 +19,7 @@

 #include "vp9/common/vp9_ppflags.h"

 #include "vp9/common/vp9_entropymode.h"

+#include "vp9/common/vp9_loopfilter_thread.h"

 #include "vp9/common/vp9_onyxc_int.h"

 #include "vp9/common/vp9_thread.h"

@@ -36,6 +37,7 @@

 #include "vp9/encoder/vp9_svc_layercontext.h"

 #include "vp9/encoder/vp9_tokenize.h"

 #include "vp9/encoder/vp9_variance.h"

 #if CONFIG_VP9_TEMPORAL_DENOISING

 #include "vp9/encoder/vp9_denoiser.h"

 #endif

@@ -262,6 +264,8 @@

   PC_TREE *pc_root;

 } ThreadData;

+struct EncWorkerData;

 typedef struct VP9_COMP {

   QUANTS quants;

   ThreadData td;

@@ -447,6 +451,8 @@

   // Multi-threading

   int num_workers;

   VP9Worker *workers;

+  struct EncWorkerData *tile_thr_data;

+  VP9LfSync lf_row_sync;

 } VP9_COMP;

 void vp9_initialize_enc(void);

--- a/vp9/encoder/vp9_ethread.c

+++ b/vp9/encoder/vp9_ethread.c

@@ -167,16 +167,15 @@

     CHECK_MEM_ERROR(cm, cpi->workers,

                     vpx_malloc(num_workers * sizeof(*cpi->workers)));

+    CHECK_MEM_ERROR(cm, cpi->tile_thr_data,

+                    vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data)));

     for (i = 0; i < num_workers; i++) {

       VP9Worker *const worker = &cpi->workers[i];

-      EncWorkerData *thread_data;

+      EncWorkerData *thread_data = &cpi->tile_thr_data[i];

       ++cpi->num_workers;

       winterface->init(worker);

-      CHECK_MEM_ERROR(cm, worker->data1,

-                      (EncWorkerData*)vpx_calloc(1, sizeof(EncWorkerData)));

-      thread_data = (EncWorkerData*)worker->data1;

       if (i < num_workers - 1) {

       thread_data->cpi = cpi;

@@ -205,17 +204,18 @@

         thread_data->td = &cpi->td;

-      // data2 is unused.

-      worker->data2 = NULL;

       winterface->sync(worker);

-      worker->hook = (VP9WorkerHook)enc_worker_hook;

   for (i = 0; i < num_workers; i++) {

     VP9Worker *const worker = &cpi->workers[i];

-    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;

+    EncWorkerData *thread_data;

+    worker->hook = (VP9WorkerHook)enc_worker_hook;

+    worker->data1 = &cpi->tile_thr_data[i];

+    worker->data2 = NULL;

+    thread_data = (EncWorkerData*)worker->data1;

     // Before encoding a frame, copy the thread data from cpi.

     thread_data->td->mb = cpi->td.mb;

--- a/vp9/encoder/vp9_picklpf.c

+++ b/vp9/encoder/vp9_picklpf.c

@@ -39,8 +39,14 @@

   VP9_COMMON *const cm = &cpi->common;

   int64_t filt_err;

-  vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1,

-                        partial_frame);

+  if (cpi->num_workers > 1)

+    vp9_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,

+                             filt_level, 1, partial_frame,

+                             cpi->workers, cpi->num_workers, &cpi->lf_row_sync);

+  else

+    vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,

+                          1, partial_frame);

 #if CONFIG_VP9_HIGHBITDEPTH

   if (cm->use_highbitdepth) {

     filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show);

--- a/vp9/vp9_common.mk

+++ b/vp9/vp9_common.mk

@@ -33,6 +33,7 @@

 VP9_COMMON_SRCS-yes += common/vp9_enums.h

 VP9_COMMON_SRCS-yes += common/vp9_idct.h

 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h

+VP9_COMMON_SRCS-yes += common/vp9_loopfilter_thread.h

 VP9_COMMON_SRCS-yes += common/vp9_mv.h

 VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h

 VP9_COMMON_SRCS-yes += common/vp9_pred_common.h

@@ -56,6 +57,7 @@

 VP9_COMMON_SRCS-yes += common/vp9_tile_common.c

 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c

 VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c

+VP9_COMMON_SRCS-yes += common/vp9_loopfilter_thread.c

 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c

 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h

 VP9_COMMON_SRCS-yes += common/vp9_quant_common.c

--- a/vp9/vp9dx.mk

+++ b/vp9/vp9dx.mk

@@ -21,8 +21,6 @@

 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c

 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h

 VP9_DX_SRCS-yes += decoder/vp9_detokenize.c

-VP9_DX_SRCS-yes += decoder/vp9_dthread.c

-VP9_DX_SRCS-yes += decoder/vp9_dthread.h

 VP9_DX_SRCS-yes += decoder/vp9_reader.h

 VP9_DX_SRCS-yes += decoder/vp9_reader.c

 VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.c