shithub: libvpx

ref: 66de2b710fb65ef51bdc9569f037f6d674e60a2f
parent: 388a807e4985ed3fe9d25c2478f1dad3c935cc51
parent: 771811710499607198e5af2f60da1df65f145c56
author: Scott LaVarnway <[email protected]>
date: Tue Sep 29 17:40:48 EDT 2015

Merge "VP9: move loopfilter build masks to decode loop"

--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -115,6 +115,8 @@
   cm->above_context = NULL;
   vpx_free(cm->above_seg_context);
   cm->above_seg_context = NULL;
+  vpx_free(cm->lf.lfm);
+  cm->lf.lfm = NULL;
 }
 
 int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
@@ -148,6 +150,16 @@
     if (!cm->above_seg_context) goto fail;
     cm->above_context_alloc_cols = cm->mi_cols;
   }
+
+  vpx_free(cm->lf.lfm);
+
+  // Each lfm holds bit masks for all the 8x8 blocks in a 64x64 region.  The
+  // stride and rows are rounded up / truncated to a multiple of 8.
+  cm->lf.lfm_stride = (cm->mi_cols + (MI_BLOCK_SIZE - 1)) >> 3;
+  cm->lf.lfm = (LOOP_FILTER_MASK *)vpx_calloc(
+      ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride,
+      sizeof(*cm->lf.lfm));
+  if (!cm->lf.lfm) goto fail;
 
   return 0;
 
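For scale, the allocation above reserves ceil(mi_rows / 8) * ceil(mi_cols / 8) masks; MI_BLOCK_SIZE is 8 MI units (one 64x64 superblock), so the "+ (MI_BLOCK_SIZE - 1)) >> 3" rounds partial superblocks up. A small sketch of that arithmetic for a hypothetical 1920x1080 frame (240x135 MI units; the numbers are illustrative, not from the patch):

#include <stdio.h>

#define MI_BLOCK_SIZE 8   /* MI units per 64x64 superblock, as in VP9 */

int main(void) {
  const int mi_cols = 240, mi_rows = 135;   /* hypothetical 1920x1080 frame */

  /* Same rounding as vp9_alloc_context_buffers() above. */
  const int lfm_stride = (mi_cols + (MI_BLOCK_SIZE - 1)) >> 3;   /* 30 */
  const int lfm_rows   = (mi_rows + (MI_BLOCK_SIZE - 1)) >> 3;   /* 17 */

  printf("%d x %d = %d LOOP_FILTER_MASK entries\n",
         lfm_rows, lfm_stride, lfm_rows * lfm_stride);           /* 510 */
  return 0;
}
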
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -825,6 +825,120 @@
     *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y;
 }
 
+void vp9_adjust_mask(VP9_COMMON *const cm, const int mi_row,
+                     const int mi_col, LOOP_FILTER_MASK *lfm) {
+  int i;
+
+  // The largest loopfilter we have is 16x16 so we use the 16x16 mask
+  // for 32x32 transforms also.
+  lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
+  lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
+  lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
+  lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32];
+
+  // We do at least 8 tap filter on every 32x32 even if the transform size
+  // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and
+  // remove it from the 4x4.
+  lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border;
+  lfm->left_y[TX_4X4] &= ~left_border;
+  lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border;
+  lfm->above_y[TX_4X4] &= ~above_border;
+  lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv;
+  lfm->left_uv[TX_4X4] &= ~left_border_uv;
+  lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv;
+  lfm->above_uv[TX_4X4] &= ~above_border_uv;
+
+  // We do some special edge handling.
+  if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) {
+    const uint64_t rows = cm->mi_rows - mi_row;
+
+    // Each pixel inside the border gets a 1,
+    const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1);
+    const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1);
+
+    // Remove values completely outside our border.
+    for (i = 0; i < TX_32X32; i++) {
+      lfm->left_y[i] &= mask_y;
+      lfm->above_y[i] &= mask_y;
+      lfm->left_uv[i] &= mask_uv;
+      lfm->above_uv[i] &= mask_uv;
+    }
+    lfm->int_4x4_y &= mask_y;
+    lfm->int_4x4_uv &= mask_uv;
+
+    // We don't apply a wide loop filter on the last uv block row. If set
+    // apply the shorter one instead.
+    if (rows == 1) {
+      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16];
+      lfm->above_uv[TX_16X16] = 0;
+    }
+    if (rows == 5) {
+      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
+      lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
+    }
+  }
+
+  if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) {
+    const uint64_t columns = cm->mi_cols - mi_col;
+
+    // Each pixel inside the border gets a 1, the multiply copies the border
+    // to where we need it.
+    const uint64_t mask_y  = (((1 << columns) - 1)) * 0x0101010101010101ULL;
+    const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
+
+    // Internal edges are not applied on the last column of the image so
+    // we mask 1 more for the internal edges
+    const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111;
+
+    // Remove the bits outside the image edge.
+    for (i = 0; i < TX_32X32; i++) {
+      lfm->left_y[i] &= mask_y;
+      lfm->above_y[i] &= mask_y;
+      lfm->left_uv[i] &= mask_uv;
+      lfm->above_uv[i] &= mask_uv;
+    }
+    lfm->int_4x4_y &= mask_y;
+    lfm->int_4x4_uv &= mask_uv_int;
+
+    // We don't apply a wide loop filter on the last uv column. If set
+    // apply the shorter one instead.
+    if (columns == 1) {
+      lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16];
+      lfm->left_uv[TX_16X16] = 0;
+    }
+    if (columns == 5) {
+      lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc);
+      lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc);
+    }
+  }
+  // We don't apply a loop filter on the first column in the image, mask that
+  // out.
+  if (mi_col == 0) {
+    for (i = 0; i < TX_32X32; i++) {
+      lfm->left_y[i] &= 0xfefefefefefefefeULL;
+      lfm->left_uv[i] &= 0xeeee;
+    }
+  }
+
+  // Assert if we try to apply 2 different loop filters at the same position.
+  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8]));
+  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4]));
+  assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4]));
+  assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16]));
+  assert(!(lfm->left_uv[TX_16X16]&lfm->left_uv[TX_8X8]));
+  assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4]));
+  assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4]));
+  assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16]));
+  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8]));
+  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4]));
+  assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4]));
+  assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16]));
+  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8]));
+  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4]));
+  assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4]));
+  assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16]));
+}
+
 // This function sets up the bit masks for the entire 64x64 region represented
 // by mi_row, mi_col.
 // TODO(JBB): This function only works for yv12.
@@ -855,7 +969,6 @@
   const int shift_8_y[] = {0, 1, 8, 9};
   const int shift_32_uv[] = {0, 2, 8, 10};
   const int shift_16_uv[] = {0, 1, 4, 5};
-  int i;
   const int max_rows = (mi_row + MI_BLOCK_SIZE > cm->mi_rows ?
                         cm->mi_rows - mi_row : MI_BLOCK_SIZE);
   const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ?
@@ -970,114 +1083,8 @@
       }
       break;
   }
-  // The largest loopfilter we have is 16x16 so we use the 16x16 mask
-  // for 32x32 transforms also.
-  lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
-  lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
-  lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
-  lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32];
 
-  // We do at least 8 tap filter on every 32x32 even if the transform size
-  // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and
-  // remove it from the 4x4.
-  lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border;
-  lfm->left_y[TX_4X4] &= ~left_border;
-  lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border;
-  lfm->above_y[TX_4X4] &= ~above_border;
-  lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv;
-  lfm->left_uv[TX_4X4] &= ~left_border_uv;
-  lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv;
-  lfm->above_uv[TX_4X4] &= ~above_border_uv;
-
-  // We do some special edge handling.
-  if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) {
-    const uint64_t rows = cm->mi_rows - mi_row;
-
-    // Each pixel inside the border gets a 1,
-    const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1);
-    const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1);
-
-    // Remove values completely outside our border.
-    for (i = 0; i < TX_32X32; i++) {
-      lfm->left_y[i] &= mask_y;
-      lfm->above_y[i] &= mask_y;
-      lfm->left_uv[i] &= mask_uv;
-      lfm->above_uv[i] &= mask_uv;
-    }
-    lfm->int_4x4_y &= mask_y;
-    lfm->int_4x4_uv &= mask_uv;
-
-    // We don't apply a wide loop filter on the last uv block row. If set
-    // apply the shorter one instead.
-    if (rows == 1) {
-      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16];
-      lfm->above_uv[TX_16X16] = 0;
-    }
-    if (rows == 5) {
-      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
-      lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
-    }
-  }
-
-  if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) {
-    const uint64_t columns = cm->mi_cols - mi_col;
-
-    // Each pixel inside the border gets a 1, the multiply copies the border
-    // to where we need it.
-    const uint64_t mask_y  = (((1 << columns) - 1)) * 0x0101010101010101ULL;
-    const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
-
-    // Internal edges are not applied on the last column of the image so
-    // we mask 1 more for the internal edges
-    const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111;
-
-    // Remove the bits outside the image edge.
-    for (i = 0; i < TX_32X32; i++) {
-      lfm->left_y[i] &= mask_y;
-      lfm->above_y[i] &= mask_y;
-      lfm->left_uv[i] &= mask_uv;
-      lfm->above_uv[i] &= mask_uv;
-    }
-    lfm->int_4x4_y &= mask_y;
-    lfm->int_4x4_uv &= mask_uv_int;
-
-    // We don't apply a wide loop filter on the last uv column. If set
-    // apply the shorter one instead.
-    if (columns == 1) {
-      lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16];
-      lfm->left_uv[TX_16X16] = 0;
-    }
-    if (columns == 5) {
-      lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc);
-      lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc);
-    }
-  }
-  // We don't apply a loop filter on the first column in the image, mask that
-  // out.
-  if (mi_col == 0) {
-    for (i = 0; i < TX_32X32; i++) {
-      lfm->left_y[i] &= 0xfefefefefefefefeULL;
-      lfm->left_uv[i] &= 0xeeee;
-    }
-  }
-
-  // Assert if we try to apply 2 different loop filters at the same position.
-  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8]));
-  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4]));
-  assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4]));
-  assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16]));
-  assert(!(lfm->left_uv[TX_16X16]&lfm->left_uv[TX_8X8]));
-  assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4]));
-  assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4]));
-  assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16]));
-  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8]));
-  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4]));
-  assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4]));
-  assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16]));
-  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8]));
-  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4]));
-  assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4]));
-  assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16]));
+  vp9_adjust_mask(cm, mi_row, mi_col, lfm);
 }
 
 static void filter_selectively_vert(uint8_t *s, int pitch,
@@ -1428,6 +1435,7 @@
   struct buf_2d *const dst = &plane->dst;
   uint8_t *const dst0 = dst->buf;
   int r, c;
+  uint8_t lfl_uv[16];
 
   uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
   uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
@@ -1440,8 +1448,8 @@
   for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) {
     if (plane->plane_type == 1) {
       for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
-        lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
-        lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
+        lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
+        lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
       }
     }
 
@@ -1457,18 +1465,18 @@
         highbd_filter_selectively_vert_row2(
             plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
             mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+            &lfl_uv[r << 1], (int)cm->bit_depth);
       } else {
         filter_selectively_vert_row2(
             plane->subsampling_x, dst->buf, dst->stride,
             mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfm->lfl_uv[r << 1]);
+            &lfl_uv[r << 1]);
       }
 #else
       filter_selectively_vert_row2(
           plane->subsampling_x, dst->buf, dst->stride,
           mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfm->lfl_uv[r << 1]);
+          &lfl_uv[r << 1]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
       dst->buf += 16 * dst->stride;
@@ -1509,16 +1517,16 @@
       highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
                                       dst->stride, mask_16x16_r, mask_8x8_r,
                                       mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                                      &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+                                      &lfl_uv[r << 1], (int)cm->bit_depth);
     } else {
       filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
                                mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                               &lfm->lfl_uv[r << 1]);
+                               &lfl_uv[r << 1]);
     }
 #else
     filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
                              mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                             &lfm->lfl_uv[r << 1]);
+                             &lfl_uv[r << 1]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     dst->buf += 8 * dst->stride;
@@ -1529,13 +1537,11 @@
   }
 }
 
-void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
-                          VP9_COMMON *cm,
-                          struct macroblockd_plane planes[MAX_MB_PLANE],
-                          int start, int stop, int y_only) {
+static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, VP9_COMMON *cm,
+                             struct macroblockd_plane planes[MAX_MB_PLANE],
+                             int start, int stop, int y_only) {
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
   enum lf_path path;
-  LOOP_FILTER_MASK lfm;
   int mi_row, mi_col;
 
   if (y_only)
@@ -1549,24 +1555,24 @@
 
   for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
     MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0);
 
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE, ++lfm) {
       int plane;
 
       vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
       // TODO(JBB): Make setup_mask work for non 420.
-      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
-                     &lfm);
+      vp9_adjust_mask(cm, mi_row, mi_col, lfm);
 
-      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm);
       for (plane = 1; plane < num_planes; ++plane) {
         switch (path) {
           case LF_PATH_420:
-            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, lfm);
             break;
           case LF_PATH_444:
-            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, lfm);
             break;
           case LF_PATH_SLOW:
             vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
@@ -1592,12 +1598,134 @@
     mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8);
   }
   end_mi_row = start_mi_row + mi_rows_to_filter;
+  loop_filter_rows(frame, cm, xd->plane, start_mi_row, end_mi_row, y_only);
+}
+
+// Used by the encoder to build the loopfilter masks.
+void vp9_build_mask_frame(VP9_COMMON *cm, int frame_filter_level,
+                          int partial_frame) {
+  int start_mi_row, end_mi_row, mi_rows_to_filter;
+  int mi_col, mi_row;
+  if (!frame_filter_level) return;
+  start_mi_row = 0;
+  mi_rows_to_filter = cm->mi_rows;
+  if (partial_frame && cm->mi_rows > 8) {
+    start_mi_row = cm->mi_rows >> 1;
+    start_mi_row &= 0xfffffff8;
+    mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8);
+  }
+  end_mi_row = start_mi_row + mi_rows_to_filter;
+
   vp9_loop_filter_frame_init(cm, frame_filter_level);
-  vp9_loop_filter_rows(frame, cm, xd->plane,
-                       start_mi_row, end_mi_row,
-                       y_only);
+
+  for (mi_row = start_mi_row; mi_row < end_mi_row; mi_row += MI_BLOCK_SIZE) {
+    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+      // vp9_setup_mask() zeros lfm
+      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
+                     get_lfm(&cm->lf, mi_row, mi_col));
+    }
+  }
 }
 
+// 8x8 blocks in a superblock.  A "1" represents the first block in a 16x16
+// or greater area.
+static const uint8_t first_block_in_16x16[8][8] = {
+  {1, 0, 1, 0, 1, 0, 1, 0},
+  {0, 0, 0, 0, 0, 0, 0, 0},
+  {1, 0, 1, 0, 1, 0, 1, 0},
+  {0, 0, 0, 0, 0, 0, 0, 0},
+  {1, 0, 1, 0, 1, 0, 1, 0},
+  {0, 0, 0, 0, 0, 0, 0, 0},
+  {1, 0, 1, 0, 1, 0, 1, 0},
+  {0, 0, 0, 0, 0, 0, 0, 0}
+};
+
+// This function sets up the bit masks for a block represented
+// by mi_row, mi_col in a 64x64 region.
+// TODO(SJL): This function only works for yv12.
+void vp9_build_mask(VP9_COMMON *cm, const MB_MODE_INFO *mbmi, int mi_row,
+                    int mi_col, int bw, int bh) {
+  const BLOCK_SIZE block_size = mbmi->sb_type;
+  const TX_SIZE tx_size_y = mbmi->tx_size;
+  const loop_filter_info_n *const lfi_n = &cm->lf_info;
+  const int filter_level = get_filter_level(lfi_n, mbmi);
+  const TX_SIZE tx_size_uv = get_uv_tx_size_impl(tx_size_y, block_size, 1, 1);
+  LOOP_FILTER_MASK *const lfm = get_lfm(&cm->lf, mi_row, mi_col);
+  uint64_t *const left_y = &lfm->left_y[tx_size_y];
+  uint64_t *const above_y = &lfm->above_y[tx_size_y];
+  uint64_t *const int_4x4_y = &lfm->int_4x4_y;
+  uint16_t *const left_uv = &lfm->left_uv[tx_size_uv];
+  uint16_t *const above_uv = &lfm->above_uv[tx_size_uv];
+  uint16_t *const int_4x4_uv = &lfm->int_4x4_uv;
+  const int row_in_sb = (mi_row & 7);
+  const int col_in_sb = (mi_col & 7);
+  const int shift_y = col_in_sb + (row_in_sb << 3);
+  const int shift_uv = (col_in_sb >> 1) + ((row_in_sb >> 1) << 2);
+  const int build_uv = first_block_in_16x16[row_in_sb][col_in_sb];
+
+  if (!filter_level) {
+    return;
+  } else {
+    int index = shift_y;
+    int i;
+    for (i = 0; i < bh; i++) {
+      memset(&lfm->lfl_y[index], filter_level, bw);
+      index += 8;
+    }
+  }
+
+  // These set 1 in the current block size for the block size edges.
+  // For instance if the block size is 32x16, we'll set:
+  //    above =   1111
+  //              0000
+  //    and
+  //    left  =   1000
+  //          =   1000
+  // NOTE : In this example the low bit is left most ( 1000 ) is stored as
+  //        1,  not 8...
+  //
+  // U and V set things on a 16 bit scale.
+  //
+  *above_y |= above_prediction_mask[block_size] << shift_y;
+  *left_y |= left_prediction_mask[block_size] << shift_y;
+
+  if (build_uv) {
+    *above_uv |= above_prediction_mask_uv[block_size] << shift_uv;
+    *left_uv |= left_prediction_mask_uv[block_size] << shift_uv;
+  }
+
+  // If the block has no coefficients and is not intra we skip applying
+  // the loop filter on block edges.
+  if (mbmi->skip && is_inter_block(mbmi))
+    return;
+
+  // Add a mask for the transform size. The transform size mask is set to
+  // be correct for a 64x64 prediction block size. Mask to match the size of
+  // the block we are working on and then shift it into place.
+  *above_y |= (size_mask[block_size] &
+               above_64x64_txform_mask[tx_size_y]) << shift_y;
+  *left_y |= (size_mask[block_size] &
+              left_64x64_txform_mask[tx_size_y]) << shift_y;
+
+  if (build_uv) {
+    *above_uv |= (size_mask_uv[block_size] &
+                  above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
+
+    *left_uv |= (size_mask_uv[block_size] &
+                 left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
+  }
+
+  // Try to determine what to do with the internal 4x4 block boundaries.  These
+  // differ from the 4x4 boundaries on the outside edge of an 8x8 in that the
+  // internal ones can be skipped and don't depend on the prediction block size.
+  if (tx_size_y == TX_4X4)
+    *int_4x4_y |= (size_mask[block_size] & -1ULL) << shift_y;
+
+  if (build_uv && tx_size_uv == TX_4X4)
+    *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
+}
+
 void vp9_loop_filter_data_reset(
     LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
     struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]) {
@@ -1609,9 +1737,17 @@
   memcpy(lf_data->planes, planes, sizeof(lf_data->planes));
 }
 
+void vp9_reset_lfm(VP9_COMMON *cm) {
+  if (cm->lf.filter_level) {
+    memset(cm->lf.lfm, 0,
+           ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride *
+            sizeof(*cm->lf.lfm));
+  }
+}
+
 int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
   (void)unused;
-  vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
-                       lf_data->start, lf_data->stop, lf_data->y_only);
+  loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                   lf_data->start, lf_data->stop, lf_data->y_only);
   return 1;
 }
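
A note on the edge masking in vp9_adjust_mask() above: left_y/above_y carry one bit per 8x8 luma block, eight blocks per superblock row with the low bit leftmost, so multiplying the row pattern ((1 << columns) - 1) by 0x0101010101010101 replicates it into all eight rows, and 0x1111 plays the same role for the 4x4 UV grid. A small sketch checking the trick for a hypothetical right-edge superblock with only 3 MI columns inside the picture:

#include <inttypes.h>
#include <stdio.h>

int main(void) {
  const int columns = 3;   /* hypothetical partial superblock at the right edge */

  /* Same expressions as vp9_adjust_mask(): the multiply copies the
   * single-row border pattern to every row of the bit mask. */
  const uint64_t mask_y  = ((1 << columns) - 1) * 0x0101010101010101ULL;
  const uint16_t mask_uv = (uint16_t)(((1 << ((columns + 1) >> 1)) - 1) * 0x1111);

  printf("mask_y  = 0x%016" PRIx64 "\n", mask_y);    /* 0x0707070707070707 */
  printf("mask_uv = 0x%04x\n", mask_uv);             /* 0x3333 */
  return 0;
}
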
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -35,24 +35,6 @@
   LF_PATH_SLOW,
 };
 
-struct loopfilter {
-  int filter_level;
-
-  int sharpness_level;
-  int last_sharpness_level;
-
-  uint8_t mode_ref_delta_enabled;
-  uint8_t mode_ref_delta_update;
-
-  // 0 = Intra, Last, GF, ARF
-  signed char ref_deltas[MAX_REF_LF_DELTAS];
-  signed char last_ref_deltas[MAX_REF_LF_DELTAS];
-
-  // 0 = ZERO_MV, MV
-  signed char mode_deltas[MAX_MODE_LF_DELTAS];
-  signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
-};
-
 // Need to align this structure so when it is declared and
 // passed it can be loaded into vector registers.
 typedef struct {
@@ -83,9 +65,29 @@
   uint16_t above_uv[TX_SIZES];
   uint16_t int_4x4_uv;
   uint8_t lfl_y[64];
-  uint8_t lfl_uv[16];
 } LOOP_FILTER_MASK;
 
+struct loopfilter {
+  int filter_level;
+
+  int sharpness_level;
+  int last_sharpness_level;
+
+  uint8_t mode_ref_delta_enabled;
+  uint8_t mode_ref_delta_update;
+
+  // 0 = Intra, Last, GF, ARF
+  signed char ref_deltas[MAX_REF_LF_DELTAS];
+  signed char last_ref_deltas[MAX_REF_LF_DELTAS];
+
+  // 0 = ZERO_MV, MV
+  signed char mode_deltas[MAX_MODE_LF_DELTAS];
+  signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
+
+  LOOP_FILTER_MASK *lfm;
+  int lfm_stride;
+};
+
 /* assorted loopfilter functions which get used elsewhere */
 struct VP9Common;
 struct macroblockd;
@@ -116,7 +118,7 @@
 void vp9_loop_filter_init(struct VP9Common *cm);
 
 // Update the loop filter for the current frame.
-// This should be called before vp9_loop_filter_rows(), vp9_loop_filter_frame()
+// This should be called before vp9_loop_filter_frame(), vp9_build_mask_frame()
 // calls this function directly.
 void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl);
 
@@ -126,11 +128,19 @@
                            int filter_level,
                            int y_only, int partial_frame);
 
-// Apply the loop filter to [start, stop) macro block rows in frame_buffer.
-void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
-                          struct VP9Common *cm,
-                          struct macroblockd_plane planes[MAX_MB_PLANE],
-                          int start, int stop, int y_only);
+// Get the superblock lfm for a given mi_row, mi_col.
+static INLINE LOOP_FILTER_MASK *get_lfm(const struct loopfilter *lf,
+                                        const int mi_row, const int mi_col) {
+  return &lf->lfm[(mi_col >> 3) + ((mi_row >> 3) * lf->lfm_stride)];
+}
+
+void vp9_build_mask(struct VP9Common *cm, const MB_MODE_INFO *mbmi, int mi_row,
+                    int mi_col, int bw, int bh);
+void vp9_adjust_mask(struct VP9Common *const cm, const int mi_row,
+                     const int mi_col, LOOP_FILTER_MASK *lfm);
+void vp9_build_mask_frame(struct VP9Common *cm, int frame_filter_level,
+                          int partial_frame);
+void vp9_reset_lfm(struct VP9Common *const cm);
 
 typedef struct LoopFilterWorkerData {
   YV12_BUFFER_CONFIG *frame_buffer;
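
The header diff also reorders declarations: struct loopfilter now embeds LOOP_FILTER_MASK *lfm and lfm_stride, so the LOOP_FILTER_MASK typedef has to appear before the struct that refers to it. A toy illustration of that ordering constraint (the type and field names here are invented, not the VP9 headers):

/* The typedef name must be visible before it is used as a member type. */
typedef struct {
  unsigned long long left_y[4];     /* stand-in field */
} MASK;

struct state {
  MASK *masks;                      /* ok: MASK is already declared */
  int stride;
};

int main(void) {
  struct state s = { 0, 0 };
  (void)s;
  return 0;
}
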
--- a/vp9/common/vp9_thread_common.c
+++ b/vp9/common/vp9_thread_common.c
@@ -109,11 +109,11 @@
   for (mi_row = start; mi_row < stop;
        mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
     MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0);
 
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE, ++lfm) {
       const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
       const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
-      LOOP_FILTER_MASK lfm;
       int plane;
 
       sync_read(lf_sync, r, c);
@@ -120,18 +120,16 @@
 
       vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
-      // TODO(JBB): Make setup_mask work for non 420.
-      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
-                     &lfm);
+      vp9_adjust_mask(cm, mi_row, mi_col, lfm);
 
-      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm);
       for (plane = 1; plane < num_planes; ++plane) {
         switch (path) {
           case LF_PATH_420:
-            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, lfm);
             break;
           case LF_PATH_444:
-            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, lfm);
             break;
           case LF_PATH_SLOW:
             vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -896,6 +896,10 @@
   }
 
   xd->corrupted |= vpx_reader_has_error(r);
+
+  if (cm->lf.filter_level) {
+    vp9_build_mask(cm, mbmi, mi_row, mi_col, bw, bh);
+  }
 }
 
 static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd,
@@ -1468,6 +1472,8 @@
   memset(cm->above_seg_context, 0,
          sizeof(*cm->above_seg_context) * aligned_cols);
 
+  vp9_reset_lfm(cm);
+
   get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
 
   if (pbi->tile_data == NULL ||
@@ -1665,6 +1671,8 @@
          sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
   memset(cm->above_seg_context, 0,
          sizeof(*cm->above_seg_context) * aligned_mi_cols);
+
+  vp9_reset_lfm(cm);
 
   // Load tile data into tile_buffers
   get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2764,6 +2764,7 @@
 static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
   MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
   struct loopfilter *lf = &cm->lf;
+
   if (xd->lossless) {
       lf->filter_level = 0;
   } else {
@@ -2780,6 +2781,8 @@
   }
 
   if (lf->filter_level > 0) {
+    vp9_build_mask_frame(cm, lf->filter_level, 0);
+
     if (cpi->num_workers > 1)
       vp9_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
                                lf->filter_level, 0, 0,
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -40,6 +40,8 @@
   VP9_COMMON *const cm = &cpi->common;
   int64_t filt_err;
 
+  vp9_build_mask_frame(cm, filt_level, partial_frame);
+
   if (cpi->num_workers > 1)
     vp9_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
                              filt_level, 1, partial_frame,