shithub: libvpx

--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c

+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c

@@ -31,3 +31,34 @@

   vp9_mbloop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);

   vp9_mbloop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);

+void vp9_loop_filter_vertical_edge_16_neon(uint8_t *s, int p,

+                                           const uint8_t *blimit0,

+                                           const uint8_t *limit0,

+                                           const uint8_t *thresh0,

+                                           const uint8_t *blimit1,

+                                           const uint8_t *limit1,

+                                           const uint8_t *thresh1) {

+  vp9_loop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1);  // segment at s: parameter set 0

+  vp9_loop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);  // segment 8 * p past s: parameter set 1

+}

+void vp9_mbloop_filter_vertical_edge_16_neon(uint8_t *s, int p,

+                                             const uint8_t *blimit0,

+                                             const uint8_t *limit0,

+                                             const uint8_t *thresh0,

+                                             const uint8_t *blimit1,

+                                             const uint8_t *limit1,

+                                             const uint8_t *thresh1) {

+  vp9_mbloop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1);  // segment at s: parameter set 0

+  vp9_mbloop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1,

+                                       1);  // segment 8 * p past s: parameter set 1

+}

+void vp9_mb_lpf_vertical_edge_w_16_neon(uint8_t *s, int p,

+                                        const uint8_t *blimit,

+                                        const uint8_t *limit,

+                                        const uint8_t *thresh) {

+  vp9_mb_lpf_vertical_edge_w_neon(s, p, blimit, limit, thresh);  // segment at s

+  vp9_mb_lpf_vertical_edge_w_neon(s + 8 * p, p, blimit, limit, thresh);  // segment 8 * p past s, same parameters

+}

--- a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c

+++ b/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c

@@ -306,4 +306,59 @@

+void vp9_loop_filter_horizontal_edge_16_dspr2(uint8_t *s, int p /* pitch */,

+                                              const uint8_t *blimit0,

+                                              const uint8_t *limit0,

+                                              const uint8_t *thresh0,

+                                              const uint8_t *blimit1,

+                                              const uint8_t *limit1,

+                                              const uint8_t *thresh1) {

+  vp9_loop_filter_horizontal_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);  // segment at s: parameter set 0

+  vp9_loop_filter_horizontal_edge_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);  // segment at s + 8: parameter set 1

+}

+void vp9_mbloop_filter_horizontal_edge_16_dspr2(uint8_t *s, int p /* pitch */,

+                                                const uint8_t *blimit0,

+                                                const uint8_t *limit0,

+                                                const uint8_t *thresh0,

+                                                const uint8_t *blimit1,

+                                                const uint8_t *limit1,

+                                                const uint8_t *thresh1) {

+  vp9_mbloop_filter_horizontal_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);  // segment at s: parameter set 0

+  vp9_mbloop_filter_horizontal_edge_dspr2(s + 8, p, blimit1, limit1, thresh1,

+                                          1);  // segment at s + 8: parameter set 1

+}

+void vp9_loop_filter_vertical_edge_16_dspr2(uint8_t *s, int p,

+                                            const uint8_t *blimit0,

+                                            const uint8_t *limit0,

+                                            const uint8_t *thresh0,

+                                            const uint8_t *blimit1,

+                                            const uint8_t *limit1,

+                                            const uint8_t *thresh1) {

+  vp9_loop_filter_vertical_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);  // segment at s: parameter set 0

+  vp9_loop_filter_vertical_edge_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,

+                                      1);  // segment 8 * p past s: parameter set 1

+}

+void vp9_mbloop_filter_vertical_edge_16_dspr2(uint8_t *s, int p,

+                                              const uint8_t *blimit0,

+                                              const uint8_t *limit0,

+                                              const uint8_t *thresh0,

+                                              const uint8_t *blimit1,

+                                              const uint8_t *limit1,

+                                              const uint8_t *thresh1) {

+  vp9_mbloop_filter_vertical_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);  // segment at s: parameter set 0

+  vp9_mbloop_filter_vertical_edge_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,

+                                       1);  // segment 8 * p past s: parameter set 1

+}

+void vp9_mb_lpf_vertical_edge_w_16_dspr2(uint8_t *s, int p,

+                                         const uint8_t *blimit,

+                                         const uint8_t *limit,

+                                         const uint8_t *thresh) {

+  vp9_mb_lpf_vertical_edge_w_dspr2(s, p, blimit, limit, thresh);  // segment at s

+  vp9_mb_lpf_vertical_edge_w_dspr2(s + 8 * p, p, blimit, limit, thresh);  // segment 8 * p past s, same parameters

+}

 #endif  // #if HAVE_DSPR2

--- a/vp9/common/vp9_loopfilter.c

+++ b/vp9/common/vp9_loopfilter.c

@@ -354,12 +354,11 @@

     // TODO(yunqingwang): count in loopfilter functions should be removed.

     if (mask & 1) {

       if ((mask_16x16_0 | mask_16x16_1) & 1) {

+        // TODO(yunqingwang): if (mask_16x16_0 & 1), then (mask_16x16_1 & 1)

+        // is always 1. Same is true for horizontal lf.

         if ((mask_16x16_0 & mask_16x16_1) & 1) {

-          // TODO(yunqingwang): Combine 2 calls as 1 wide filtering.

-          vp9_mb_lpf_vertical_edge_w(s, pitch, lfi0->mblim, lfi0->lim,

+          vp9_mb_lpf_vertical_edge_w_16(s, pitch, lfi0->mblim, lfi0->lim,

                                      lfi0->hev_thr);

-          vp9_mb_lpf_vertical_edge_w(s + 8 *pitch, pitch, lfi1->mblim,

-                                     lfi1->lim, lfi1->hev_thr);

         } else if (mask_16x16_0 & 1) {

           vp9_mb_lpf_vertical_edge_w(s, pitch, lfi0->mblim, lfi0->lim,

                                      lfi0->hev_thr);

@@ -371,11 +370,9 @@

       if ((mask_8x8_0 | mask_8x8_1) & 1) {

         if ((mask_8x8_0 & mask_8x8_1) & 1) {

-          // TODO(yunqingwang): Combine 2 calls as 1 wide filtering.

-          vp9_mbloop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,

-                                          lfi0->hev_thr, 1);

-          vp9_mbloop_filter_vertical_edge(s + 8 *pitch, pitch, lfi1->mblim,

-                                          lfi1->lim, lfi1->hev_thr, 1);

+          vp9_mbloop_filter_vertical_edge_16(s, pitch, lfi0->mblim, lfi0->lim,

+                                          lfi0->hev_thr, lfi1->mblim,

+                                          lfi1->lim, lfi1->hev_thr);

         } else if (mask_8x8_0 & 1) {

           vp9_mbloop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,

                                           lfi0->hev_thr, 1);

@@ -387,11 +384,9 @@

       if ((mask_4x4_0 | mask_4x4_1) & 1) {

         if ((mask_4x4_0 & mask_4x4_1) & 1) {

-          // TODO(yunqingwang): Combine 2 calls as 1 wide filtering.

-          vp9_loop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,

-                                        lfi0->hev_thr, 1);

-          vp9_loop_filter_vertical_edge(s + 8 *pitch, pitch, lfi1->mblim,

-                                        lfi1->lim, lfi1->hev_thr, 1);

+          vp9_loop_filter_vertical_edge_16(s, pitch, lfi0->mblim, lfi0->lim,

+                                        lfi0->hev_thr, lfi1->mblim,

+                                        lfi1->lim, lfi1->hev_thr);

         } else if (mask_4x4_0 & 1) {

           vp9_loop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,

                                         lfi0->hev_thr, 1);

@@ -403,11 +398,9 @@

       if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {

         if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {

-          // TODO(yunqingwang): Combine 2 calls as 1 wide filtering.

-          vp9_loop_filter_vertical_edge(s + 4, pitch, lfi0->mblim, lfi0->lim,

-                                        lfi0->hev_thr, 1);

-          vp9_loop_filter_vertical_edge(s + 8 *pitch + 4, pitch, lfi1->mblim,

-                                        lfi1->lim, lfi1->hev_thr, 1);

+          vp9_loop_filter_vertical_edge_16(s + 4, pitch, lfi0->mblim, lfi0->lim,

+                                        lfi0->hev_thr, lfi1->mblim,

+                                        lfi1->lim, lfi1->hev_thr);

         } else if (mask_4x4_int_0 & 1) {

           vp9_loop_filter_vertical_edge(s + 4, pitch, lfi0->mblim, lfi0->lim,

                                         lfi0->hev_thr, 1);

--- a/vp9/common/vp9_loopfilter_filters.c

+++ b/vp9/common/vp9_loopfilter_filters.c

@@ -169,6 +169,34 @@

+void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch,

+                                        const uint8_t *blimit0,

+                                        const uint8_t *limit0,

+                                        const uint8_t *thresh0,

+                                        const uint8_t *blimit1,

+                                        const uint8_t *limit1,

+                                        const uint8_t *thresh1) {

+  int i, j;

+  const uint8_t *blimit = blimit0;  // first pass uses parameter set 0

+  const uint8_t *limit = limit0;

+  const uint8_t *thresh = thresh0;

+  for (i = 0; i < 2; ++i) {  // two passes of 8 rows each, 16 rows in total

+    for (j = 0; j < 8; ++j) {  // one row per iteration

+      const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];  // 4 bytes before s

+      const uint8_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];  // 4 bytes from s on

+      const int8_t mask = filter_mask(*limit, *blimit,

+                                      p3, p2, p1, p0, q0, q1, q2, q3);

+      const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);

+      filter4(mask, hev, s - 2, s - 1, s, s + 1);  // receives pointers to p1, p0, q0, q1

+      s += pitch;  // advance to the next row

+    }

+    blimit = blimit1;  // second pass uses parameter set 1

+    limit = limit1;

+    thresh = thresh1;

+  }

+}

 static INLINE void filter8(int8_t mask, uint8_t hev, uint8_t flat,

                            uint8_t *op3, uint8_t *op2,

                            uint8_t *op1, uint8_t *op0,

@@ -264,6 +292,36 @@

+void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch,

+                                          const uint8_t *blimit0,

+                                          const uint8_t *limit0,

+                                          const uint8_t *thresh0,

+                                          const uint8_t *blimit1,

+                                          const uint8_t *limit1,

+                                          const uint8_t *thresh1) {

+  int i, j;

+  const uint8_t *blimit = blimit0;  // first pass uses parameter set 0

+  const uint8_t *limit = limit0;

+  const uint8_t *thresh = thresh0;

+  for (i = 0; i < 2; ++i) {  // two passes of 8 rows each, 16 rows in total

+    for (j = 0; j < 8; ++j) {  // one row per iteration

+      const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];  // 4 bytes before s

+      const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];  // 4 bytes from s on

+      const int8_t mask = filter_mask(*limit, *blimit,

+                                      p3, p2, p1, p0, q0, q1, q2, q3);

+      const int8_t hev = hev_mask(thresh[0], p1, p0, q0, q1);  // thresh[0] is the same value as *thresh

+      const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);

+      filter8(mask, hev, flat, s - 4, s - 3, s - 2, s - 1,  // receives pointers to p3..p0

+                               s,     s + 1, s + 2, s + 3);  // and to q0..q3

+      s += pitch;  // advance to the next row

+    }

+    blimit = blimit1;  // second pass uses parameter set 1

+    limit = limit1;

+    thresh = thresh1;

+  }

+}

 static INLINE void filter16(int8_t mask, uint8_t hev,

                             uint8_t flat, uint8_t flat2,

                             uint8_t *op7, uint8_t *op6,

@@ -351,6 +409,29 @@

   int i;

   for (i = 0; i < 8; ++i) {

+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];

+    const uint8_t q0 = s[0], q1 = s[1],  q2 = s[2], q3 = s[3];

+    const int8_t mask = filter_mask(*limit, *blimit,

+                                    p3, p2, p1, p0, q0, q1, q2, q3);

+    const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);

+    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);

+    const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,

+                                    q0, s[4], s[5], s[6], s[7]);

+    filter16(mask, hev, flat, flat2,

+             s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,

+             s,     s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);

+    s += p;

+  }

+}

+void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int p,

+                                     const uint8_t *blimit,

+                                     const uint8_t *limit,

+                                     const uint8_t *thresh) {

+  int i;

+  for (i = 0; i < 16; ++i) {

     const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];

     const uint8_t q0 = s[0], q1 = s[1],  q2 = s[2], q3 = s[3];

     const int8_t mask = filter_mask(*limit, *blimit,

--- a/vp9/common/vp9_rtcd_defs.sh

+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -193,12 +193,21 @@

 prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"

 specialize vp9_mb_lpf_vertical_edge_w sse2 neon dspr2

+prototype void vp9_mb_lpf_vertical_edge_w_16 "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"

+specialize vp9_mb_lpf_vertical_edge_w_16 sse2 neon dspr2

 prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"

 specialize vp9_mbloop_filter_vertical_edge sse2 neon dspr2

+prototype void vp9_mbloop_filter_vertical_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"

+specialize vp9_mbloop_filter_vertical_edge_16 sse2 neon dspr2

 prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"

 specialize vp9_loop_filter_vertical_edge mmx neon dspr2

+prototype void vp9_loop_filter_vertical_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"

+specialize vp9_loop_filter_vertical_edge_16 sse2 neon dspr2

 prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"

 specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon dspr2

@@ -206,13 +215,13 @@

 specialize vp9_mbloop_filter_horizontal_edge sse2 neon dspr2

 prototype void vp9_mbloop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"

-specialize vp9_mbloop_filter_horizontal_edge_16 sse2 neon

+specialize vp9_mbloop_filter_horizontal_edge_16 sse2 neon dspr2

 prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"

 specialize vp9_loop_filter_horizontal_edge mmx neon dspr2

 prototype void vp9_loop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"

-specialize vp9_loop_filter_horizontal_edge_16 sse2 neon

+specialize vp9_loop_filter_horizontal_edge_16 sse2 neon dspr2

 # post proc

--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c

+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c

@@ -8,7 +8,7 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

-#include <emmintrin.h>  /* SSE2 */

+#include <emmintrin.h>  // SSE2

 #include "vp9/common/vp9_loopfilter.h"

 #include "vpx_ports/emmintrin_compat.h"

@@ -99,7 +99,7 @@

     filt = _mm_adds_epi8(filt, work_a);

     filt = _mm_adds_epi8(filt, work_a);

     filt = _mm_adds_epi8(filt, work_a);

-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */

+    // (vp9_filter + 3 * (qs0 - ps0)) & mask

     filt = _mm_and_si128(filt, mask);

     filter1 = _mm_adds_epi8(filt, t4);

@@ -110,11 +110,11 @@

     filter2 = _mm_unpacklo_epi8(zero, filter2);

     filter2 = _mm_srai_epi16(filter2, 0xB);

-    /* Filter1 >> 3 */

+    // Filter1 >> 3

     filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));

     qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

-    /* filt >> 1 */

+    // filt >> 1

     filt = _mm_adds_epi16(filter1, t1);

     filt = _mm_srai_epi16(filt, 1);

     filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),

@@ -473,13 +473,13 @@

     filt = _mm_adds_epi8(filt, work_a);

     filt = _mm_adds_epi8(filt, work_a);

     filt = _mm_adds_epi8(filt, work_a);

-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */

+    // (vp9_filter + 3 * (qs0 - ps0)) & mask

     filt = _mm_and_si128(filt, mask);

     filter1 = _mm_adds_epi8(filt, t4);

     filter2 = _mm_adds_epi8(filt, t3);

-    /* Filter1 >> 3 */

+    // Filter1 >> 3

     work_a = _mm_cmpgt_epi8(zero, filter1);

     filter1 = _mm_srli_epi16(filter1, 3);

     work_a = _mm_and_si128(work_a, te0);

@@ -487,7 +487,7 @@

     filter1 = _mm_or_si128(filter1, work_a);

     qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

-    /* Filter2 >> 3 */

+    // Filter2 >> 3

     work_a = _mm_cmpgt_epi8(zero, filter2);

     filter2 = _mm_srli_epi16(filter2, 3);

     work_a = _mm_and_si128(work_a, te0);

@@ -495,7 +495,7 @@

     filter2 = _mm_or_si128(filter2, work_a);

     ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

-    /* filt >> 1 */

+    // filt >> 1

     filt = _mm_adds_epi8(filter1, t1);

     work_a = _mm_cmpgt_epi8(zero, filt);

     filt = _mm_srli_epi16(filt, 1);

@@ -1014,23 +1014,23 @@

     filt = _mm_adds_epi8(filt, work_a);

     filt = _mm_adds_epi8(filt, work_a);

     filt = _mm_adds_epi8(filt, work_a);

-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */

+    // (vp9_filter + 3 * (qs0 - ps0)) & mask

     filt = _mm_and_si128(filt, mask);

     filter1 = _mm_adds_epi8(filt, t4);

     filter2 = _mm_adds_epi8(filt, t3);

-    /* Filter1 >> 3 */

+    // Filter1 >> 3

     filter1 = _mm_unpacklo_epi8(zero, filter1);

     filter1 = _mm_srai_epi16(filter1, 11);

     filter1 = _mm_packs_epi16(filter1, filter1);

-    /* Filter2 >> 3 */

+    // Filter2 >> 3

     filter2 = _mm_unpacklo_epi8(zero, filter2);

     filter2 = _mm_srai_epi16(filter2, 11);

     filter2 = _mm_packs_epi16(filter2, zero);

-    /* filt >> 1 */

+    // filt >> 1

     filt = _mm_adds_epi8(filter1, t1);

     filt = _mm_unpacklo_epi8(zero, filt);

     filt = _mm_srai_epi16(filt, 9);

@@ -1083,7 +1083,7 @@

-void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p /* pitch */,

+void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p,

                                                const uint8_t *_blimit0,

                                                const uint8_t *_limit0,

                                                const uint8_t *_thresh0,

@@ -1255,13 +1255,13 @@

     filt = _mm_adds_epi8(filt, work_a);

     filt = _mm_adds_epi8(filt, work_a);

     filt = _mm_adds_epi8(filt, work_a);

-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */

+    // (vp9_filter + 3 * (qs0 - ps0)) & mask

     filt = _mm_and_si128(filt, mask);

     filter1 = _mm_adds_epi8(filt, t4);

     filter2 = _mm_adds_epi8(filt, t3);

-    /* Filter1 >> 3 */

+    // Filter1 >> 3

     work_a = _mm_cmpgt_epi8(zero, filter1);

     filter1 = _mm_srli_epi16(filter1, 3);

     work_a = _mm_and_si128(work_a, te0);

@@ -1268,7 +1268,7 @@

     filter1 = _mm_and_si128(filter1, t1f);

     filter1 = _mm_or_si128(filter1, work_a);

-    /* Filter2 >> 3 */

+    // Filter2 >> 3

     work_a = _mm_cmpgt_epi8(zero, filter2);

     filter2 = _mm_srli_epi16(filter2, 3);

     work_a = _mm_and_si128(work_a, te0);

@@ -1275,7 +1275,7 @@

     filter2 = _mm_and_si128(filter2, t1f);

     filter2 = _mm_or_si128(filter2, work_a);

-    /* filt >> 1 */

+    // filt >> 1

     filt = _mm_adds_epi8(filter1, t1);

     work_a = _mm_cmpgt_epi8(zero, filt);

     filt = _mm_srli_epi16(filt, 1);

@@ -1427,13 +1427,13 @@

     filt = _mm_adds_epi8(filt, work_a);

     filt = _mm_adds_epi8(filt, work_a);

     filt = _mm_adds_epi8(filt, work_a);

-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */

+    // (vp9_filter + 3 * (qs0 - ps0)) & mask

     filt = _mm_and_si128(filt, mask);

     filter1 = _mm_adds_epi8(filt, t4);

     filter2 = _mm_adds_epi8(filt, t3);

-    /* Filter1 >> 3 */

+    // Filter1 >> 3

     work_a = _mm_cmpgt_epi8(zero, filter1);

     filter1 = _mm_srli_epi16(filter1, 3);

     work_a = _mm_and_si128(work_a, te0);

@@ -1440,7 +1440,7 @@

     filter1 = _mm_and_si128(filter1, t1f);

     filter1 = _mm_or_si128(filter1, work_a);

-    /* Filter2 >> 3 */

+    // Filter2 >> 3

     work_a = _mm_cmpgt_epi8(zero, filter2);

     filter2 = _mm_srli_epi16(filter2, 3);

     work_a = _mm_and_si128(work_a, te0);

@@ -1447,7 +1447,7 @@

     filter2 = _mm_and_si128(filter2, t1f);

     filter2 = _mm_or_si128(filter2, work_a);

-    /* filt >> 1 */

+    // filt >> 1

     filt = _mm_adds_epi8(filter1, t1);

     work_a = _mm_cmpgt_epi8(zero, filt);

     filt = _mm_srli_epi16(filt, 1);

@@ -1474,7 +1474,7 @@

   __m128i x0, x1, x2, x3, x4, x5, x6, x7;

   __m128i x8, x9, x10, x11, x12, x13, x14, x15;

-  /* Read in 16 lines */

+  // Read in 16 lines

   x0 = _mm_loadl_epi64((__m128i *)in0);

   x8 = _mm_loadl_epi64((__m128i *)in1);

   x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));

@@ -1512,7 +1512,7 @@

   x14 = _mm_unpacklo_epi32(x12, x13);

   x15 = _mm_unpackhi_epi32(x12, x13);

-  /* Store first 4-line result */

+  // Store first 4-line result

   _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));

   _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));

   _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));

@@ -1528,7 +1528,7 @@

   x14 = _mm_unpacklo_epi32(x12, x13);

   x15 = _mm_unpackhi_epi32(x12, x13);

-  /* Store second 4-line result */

+  // Store second 4-line result

   _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));

   _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));

   _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));

@@ -1598,61 +1598,129 @@

   } while (++idx8x8 < num_8x8_to_transpose);

-void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,

-                                          int p,

+void vp9_loop_filter_vertical_edge_16_sse2(uint8_t *s, int p,

+                                           const uint8_t *blimit0,

+                                           const uint8_t *limit0,

+                                           const uint8_t *thresh0,

+                                           const uint8_t *blimit1,

+                                           const uint8_t *limit1,

+                                           const uint8_t *thresh1) {

+  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);

+  unsigned char *src[2];

+  unsigned char *dst[2];

+  // Transpose 8 columns x 16 rows, starting at s - 4, into t_dst (stride 16)

+  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

+  // Filter the transposed data as a horizontal edge at row 4 of t_dst

+  vp9_loop_filter_horizontal_edge_16_sse2(t_dst + 4 * 16, 16, blimit0, limit0,

+                                          thresh0, blimit1, limit1, thresh1);

+  src[0] = t_dst;

+  src[1] = t_dst + 8;

+  dst[0] = s - 4;

+  dst[1] = s - 4 + p * 8;

+  // Transpose the two 8x8 blocks back to s - 4 and s - 4 + p * 8

+  transpose(src, 16, dst, p, 2);

+}

+void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s, int p,

                                           const unsigned char *blimit,

                                           const unsigned char *limit,

                                           const unsigned char *thresh,

                                           int count) {

-  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);

+  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8);

+  unsigned char *src[1];

+  unsigned char *dst[1];

+  (void)count;

+  // Transpose 8x8

+  src[0] = s - 4;

+  dst[0] = t_dst;

+  transpose(src, p, dst, 8, 1);

+  // Loop filtering

+  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 4 * 8, 8, blimit, limit,

+                                         thresh, 1);

+  src[0] = t_dst;

+  dst[0] = s - 4;

+  // Transpose back

+  transpose(src, 8, dst, p, 1);

+}

+void vp9_mbloop_filter_vertical_edge_16_sse2(uint8_t *s, int p,

+                                             const uint8_t *blimit0,

+                                             const uint8_t *limit0,

+                                             const uint8_t *thresh0,

+                                             const uint8_t *blimit1,

+                                             const uint8_t *limit1,

+                                             const uint8_t *thresh1) {

+  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);

   unsigned char *src[2];

   unsigned char *dst[2];

-  (void)count;

-  /* Transpose 16x16 */

-  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);

-  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);

+  // Transpose 8x16

+  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

-  /* Loop filtering */

-  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,

-                                         thresh, 1);

-  src[0] = t_dst + 3 * 16;

-  src[1] = t_dst + 3 * 16 + 8;

+  // Loop filtering

+  vp9_mbloop_filter_horizontal_edge_16_sse2(t_dst + 4 * 16, 16, blimit0, limit0,

+                                            thresh0, blimit1, limit1, thresh1);

+  src[0] = t_dst;

+  src[1] = t_dst + 8;

-  dst[0] = s - 5;

-  dst[1] = s - 5 + p * 8;

+  dst[0] = s - 4;

+  dst[1] = s - 4 + p * 8;

-  /* Transpose 16x8 */

+  // Transpose back

   transpose(src, 16, dst, p, 2);

-void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,

-                                     int p,

+void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, int p,

                                      const unsigned char *blimit,

                                      const unsigned char *limit,

                                      const unsigned char *thresh) {

-  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);

-  unsigned char *src[4];

-  unsigned char *dst[4];

+  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16);

+  unsigned char *src[2];

+  unsigned char *dst[2];

+  src[0] = s - 8;

+  src[1] = s;

   dst[0] = t_dst;

-  dst[1] = t_dst + 8 * 16;

+  dst[1] = t_dst + 8 * 8;

-  src[0] = s - 8;

-  src[1] = s - 8 + 8;

+  // Transpose 16x8

+  transpose(src, p, dst, 8, 2);

-  /* Transpose 16x16 */

-  transpose(src, p, dst, 16, 2);

+  // Loop filtering

+  mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);

-  /* Loop filtering */

-  vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,

-                                    thresh, 1);

   src[0] = t_dst;

-  src[1] = t_dst + 8 * 16;

+  src[1] = t_dst + 8 * 8;

   dst[0] = s - 8;

-  dst[1] = s - 8 + 8;

+  dst[1] = s;

-  transpose(src, 16, dst, p, 2);

+  // Transpose back

+  transpose(src, 8, dst, p, 2);

+}

+void vp9_mb_lpf_vertical_edge_w_16_sse2(unsigned char *s, int p,

+                                        const uint8_t *blimit,

+                                        const uint8_t *limit,

+                                        const uint8_t *thresh) {

+  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);

+  // Transpose 16x16

+  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);

+  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);

+  // Loop filtering

+  mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,

+                                   thresh);

+  // Transpose back

+  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);

+  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);