shithub: libvpx

--- a/test/lpf_8_test.cc

+++ b/test/lpf_8_test.cc

@@ -37,7 +37,7 @@

 #if CONFIG_VP9_HIGHBITDEPTH

 typedef void (*loop_op_t)(uint16_t *s, int p, const uint8_t *blimit,

                           const uint8_t *limit, const uint8_t *thresh,

-                          int count, int bd);

+                          int bd);

 typedef void (*dual_loop_op_t)(uint16_t *s, int p, const uint8_t *blimit0,

                                const uint8_t *limit0, const uint8_t *thresh0,

                                const uint8_t *blimit1, const uint8_t *limit1,

@@ -44,8 +44,7 @@

                                const uint8_t *thresh1, int bd);

 #else

 typedef void (*loop_op_t)(uint8_t *s, int p, const uint8_t *blimit,

-                          const uint8_t *limit, const uint8_t *thresh,

-                          int count);

+                          const uint8_t *limit, const uint8_t *thresh);

 typedef void (*dual_loop_op_t)(uint8_t *s, int p, const uint8_t *blimit0,

                                const uint8_t *limit0, const uint8_t *thresh0,

                                const uint8_t *blimit1, const uint8_t *limit1,

@@ -52,105 +51,9 @@

                                const uint8_t *thresh1);

 #endif  // CONFIG_VP9_HIGHBITDEPTH

-typedef std::tr1::tuple<loop_op_t, loop_op_t, int, int> loop8_param_t;

+typedef std::tr1::tuple<loop_op_t, loop_op_t, int> loop8_param_t;

 typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;

-#if HAVE_SSE2

-#if CONFIG_VP9_HIGHBITDEPTH

-void wrapper_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,

-                              const uint8_t *limit, const uint8_t *thresh,

-                              int count, int bd) {

-  vpx_highbd_lpf_vertical_16_sse2(s, p, blimit, limit, thresh, bd);

-}

-void wrapper_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,

-                           const uint8_t *limit, const uint8_t *thresh,

-                           int count, int bd) {

-  vpx_highbd_lpf_vertical_16_c(s, p, blimit, limit, thresh, bd);

-}

-void wrapper_vertical_16_dual_sse2(uint16_t *s, int p, const uint8_t *blimit,

-                                   const uint8_t *limit, const uint8_t *thresh,

-                                   int count, int bd) {

-  vpx_highbd_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh, bd);

-}

-void wrapper_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit,

-                                const uint8_t *limit, const uint8_t *thresh,

-                                int count, int bd) {

-  vpx_highbd_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh, bd);

-}

-#else

-void wrapper_vertical_16_sse2(uint8_t *s, int p, const uint8_t *blimit,

-                              const uint8_t *limit, const uint8_t *thresh,

-                              int count) {

-  vpx_lpf_vertical_16_sse2(s, p, blimit, limit, thresh);

-}

-void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,

-                           const uint8_t *limit, const uint8_t *thresh,

-                           int count) {

-  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);

-}

-void wrapper_vertical_16_dual_sse2(uint8_t *s, int p, const uint8_t *blimit,

-                                   const uint8_t *limit, const uint8_t *thresh,

-                                   int count) {

-  vpx_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh);

-}

-void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,

-                                const uint8_t *limit, const uint8_t *thresh,

-                                int count) {

-  vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);

-}

-#endif  // CONFIG_VP9_HIGHBITDEPTH

-#endif  // HAVE_SSE2

-#if HAVE_NEON_ASM

-#if CONFIG_VP9_HIGHBITDEPTH

-// No neon high bitdepth functions.

-#else

-void wrapper_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,

-                              const uint8_t *limit, const uint8_t *thresh,

-                              int count) {

-  vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);

-}

-void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,

-                           const uint8_t *limit, const uint8_t *thresh,

-                           int count) {

-  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);

-}

-void wrapper_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,

-                                   const uint8_t *limit, const uint8_t *thresh,

-                                   int count) {

-  vpx_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh);

-}

-void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,

-                                const uint8_t *limit, const uint8_t *thresh,

-                                int count) {

-  vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);

-}

-#endif  // CONFIG_VP9_HIGHBITDEPTH

-#endif  // HAVE_NEON_ASM

-#if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)

-void wrapper_vertical_16_msa(uint8_t *s, int p, const uint8_t *blimit,

-                             const uint8_t *limit, const uint8_t *thresh,

-                             int count) {

-  vpx_lpf_vertical_16_msa(s, p, blimit, limit, thresh);

-}

-void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,

-                           const uint8_t *limit, const uint8_t *thresh,

-                           int count) {

-  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);

-}

-#endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)

 class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {

  public:

   virtual ~Loop8Test6Param() {}

@@ -158,7 +61,6 @@

     loopfilter_op_ = GET_PARAM(0);

     ref_loopfilter_op_ = GET_PARAM(1);

     bit_depth_ = GET_PARAM(2);

-    count_ = GET_PARAM(3);

     mask_ = (1 << bit_depth_) - 1;

@@ -166,7 +68,6 @@

  protected:

   int bit_depth_;

-  int count_;

   int mask_;

   loop_op_t loopfilter_op_;

   loop_op_t ref_loopfilter_op_;

@@ -253,13 +154,13 @@

       ref_s[j] = s[j];

 #if CONFIG_VP9_HIGHBITDEPTH

-    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd);

+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, bd);

     ASM_REGISTER_STATE_CHECK(

-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd));

+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, bd));

 #else

-    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_);

+    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh);

     ASM_REGISTER_STATE_CHECK(

-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_));

+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh));

 #endif  // CONFIG_VP9_HIGHBITDEPTH

     for (int j = 0; j < kNumCoeffs; ++j) {

@@ -325,13 +226,13 @@

       ref_s[j] = s[j];

 #if CONFIG_VP9_HIGHBITDEPTH

-    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd);

+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, bd);

     ASM_REGISTER_STATE_CHECK(

-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd));

+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, bd));

 #else

-    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_);

+    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh);

     ASM_REGISTER_STATE_CHECK(

-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_));

+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh));

 #endif  // CONFIG_VP9_HIGHBITDEPTH

     for (int j = 0; j < kNumCoeffs; ++j) {

       err_count += ref_s[j] != s[j];

@@ -529,6 +430,16 @@

 using std::tr1::make_tuple;

+#if HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH

+INSTANTIATE_TEST_CASE_P(  // tuple: (function under test, reference, bit depth)

+    MMX, Loop8Test6Param,

+    ::testing::Values(

+        make_tuple(&vpx_lpf_horizontal_4_mmx,

+                   &vpx_lpf_horizontal_4_c, 8),

+        make_tuple(&vpx_lpf_vertical_4_mmx,

+                   &vpx_lpf_vertical_4_c, 8)));

+#endif  // HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH

 #if HAVE_SSE2

 #if CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

@@ -535,64 +446,69 @@

     SSE2, Loop8Test6Param,

     ::testing::Values(

         make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,

-                   &vpx_highbd_lpf_horizontal_4_c, 8, 1),

+                   &vpx_highbd_lpf_horizontal_4_c, 8),

         make_tuple(&vpx_highbd_lpf_vertical_4_sse2,

-                   &vpx_highbd_lpf_vertical_4_c, 8, 1),

+                   &vpx_highbd_lpf_vertical_4_c, 8),

         make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,

-                   &vpx_highbd_lpf_horizontal_8_c, 8, 1),

-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,

-                   &vpx_highbd_lpf_horizontal_16_c, 8, 1),

-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,

-                   &vpx_highbd_lpf_horizontal_16_c, 8, 2),

+                   &vpx_highbd_lpf_horizontal_8_c, 8),

+        make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2,

+                   &vpx_highbd_lpf_horizontal_edge_8_c, 8),

+        make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2,

+                   &vpx_highbd_lpf_horizontal_edge_16_c, 8),

         make_tuple(&vpx_highbd_lpf_vertical_8_sse2,

-                   &vpx_highbd_lpf_vertical_8_c, 8, 1),

-        make_tuple(&wrapper_vertical_16_sse2,

-                   &wrapper_vertical_16_c, 8, 1),

+                   &vpx_highbd_lpf_vertical_8_c, 8),

+        make_tuple(&vpx_highbd_lpf_vertical_16_sse2,

+                   &vpx_highbd_lpf_vertical_16_c, 8),

         make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,

-                   &vpx_highbd_lpf_horizontal_4_c, 10, 1),

+                   &vpx_highbd_lpf_horizontal_4_c, 10),

         make_tuple(&vpx_highbd_lpf_vertical_4_sse2,

-                   &vpx_highbd_lpf_vertical_4_c, 10, 1),

+                   &vpx_highbd_lpf_vertical_4_c, 10),

         make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,

-                   &vpx_highbd_lpf_horizontal_8_c, 10, 1),

-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,

-                   &vpx_highbd_lpf_horizontal_16_c, 10, 1),

-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,

-                   &vpx_highbd_lpf_horizontal_16_c, 10, 2),

+                   &vpx_highbd_lpf_horizontal_8_c, 10),

+        make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2,

+                   &vpx_highbd_lpf_horizontal_edge_8_c, 10),

+        make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2,

+                   &vpx_highbd_lpf_horizontal_edge_16_c, 10),

         make_tuple(&vpx_highbd_lpf_vertical_8_sse2,

-                   &vpx_highbd_lpf_vertical_8_c, 10, 1),

-        make_tuple(&wrapper_vertical_16_sse2,

-                   &wrapper_vertical_16_c, 10, 1),

+                   &vpx_highbd_lpf_vertical_8_c, 10),

+        make_tuple(&vpx_highbd_lpf_vertical_16_sse2,

+                   &vpx_highbd_lpf_vertical_16_c, 10),

         make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,

-                   &vpx_highbd_lpf_horizontal_4_c, 12, 1),

+                   &vpx_highbd_lpf_horizontal_4_c, 12),

         make_tuple(&vpx_highbd_lpf_vertical_4_sse2,

-                   &vpx_highbd_lpf_vertical_4_c, 12, 1),

+                   &vpx_highbd_lpf_vertical_4_c, 12),

         make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,

-                   &vpx_highbd_lpf_horizontal_8_c, 12, 1),

-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,

-                   &vpx_highbd_lpf_horizontal_16_c, 12, 1),

-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,

-                   &vpx_highbd_lpf_horizontal_16_c, 12, 2),

+                   &vpx_highbd_lpf_horizontal_8_c, 12),

+        make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2,

+                   &vpx_highbd_lpf_horizontal_edge_8_c, 12),

+        make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2,

+                   &vpx_highbd_lpf_horizontal_edge_16_c, 12),

         make_tuple(&vpx_highbd_lpf_vertical_8_sse2,

-                   &vpx_highbd_lpf_vertical_8_c, 12, 1),

-        make_tuple(&wrapper_vertical_16_sse2,

-                   &wrapper_vertical_16_c, 12, 1),

-        make_tuple(&wrapper_vertical_16_dual_sse2,

-                   &wrapper_vertical_16_dual_c, 8, 1),

-        make_tuple(&wrapper_vertical_16_dual_sse2,

-                   &wrapper_vertical_16_dual_c, 10, 1),

-        make_tuple(&wrapper_vertical_16_dual_sse2,

-                   &wrapper_vertical_16_dual_c, 12, 1)));

+                   &vpx_highbd_lpf_vertical_8_c, 12),

+        make_tuple(&vpx_highbd_lpf_vertical_16_sse2,

+                   &vpx_highbd_lpf_vertical_16_c, 12),

+        make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2,

+                   &vpx_highbd_lpf_vertical_16_dual_c, 8),

+        make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2,

+                   &vpx_highbd_lpf_vertical_16_dual_c, 10),

+        make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2,

+                   &vpx_highbd_lpf_vertical_16_dual_c, 12)));

 #else

 INSTANTIATE_TEST_CASE_P(

     SSE2, Loop8Test6Param,

     ::testing::Values(

-        make_tuple(&vpx_lpf_horizontal_8_sse2, &vpx_lpf_horizontal_8_c, 8, 1),

-        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1),

-        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2),

-        make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8, 1),

-        make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1),

-        make_tuple(&wrapper_vertical_16_dual_sse2,

-                   &wrapper_vertical_16_dual_c, 8, 1)));

+        make_tuple(&vpx_lpf_horizontal_8_sse2,

+                   &vpx_lpf_horizontal_8_c, 8),

+        make_tuple(&vpx_lpf_horizontal_edge_8_sse2,

+                   &vpx_lpf_horizontal_edge_8_c, 8),

+        make_tuple(&vpx_lpf_horizontal_edge_16_sse2,

+                   &vpx_lpf_horizontal_edge_16_c, 8),

+        make_tuple(&vpx_lpf_vertical_8_sse2,

+                   &vpx_lpf_vertical_8_c, 8),

+        make_tuple(&vpx_lpf_vertical_16_sse2,

+                   &vpx_lpf_vertical_16_c, 8),

+        make_tuple(&vpx_lpf_vertical_16_dual_sse2,

+                   &vpx_lpf_vertical_16_dual_c, 8)));

 #endif  // CONFIG_VP9_HIGHBITDEPTH

 #endif

@@ -600,9 +516,10 @@

 INSTANTIATE_TEST_CASE_P(

     AVX2, Loop8Test6Param,

     ::testing::Values(

-        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8, 1),

-        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8,

-                   2)));

+        make_tuple(&vpx_lpf_horizontal_edge_8_avx2,

+                   &vpx_lpf_horizontal_edge_8_c, 8),

+        make_tuple(&vpx_lpf_horizontal_edge_16_avx2,

+                   &vpx_lpf_horizontal_edge_16_c, 8)));

 #endif

 #if HAVE_SSE2

@@ -659,23 +576,23 @@

 #if HAVE_NEON_ASM

 // Using #if inside the macro is unsupported on MSVS but the tests are not

 // currently built for MSVS with ARM and NEON.

-        make_tuple(&vpx_lpf_horizontal_16_neon,

-                   &vpx_lpf_horizontal_16_c, 8, 1),

-        make_tuple(&vpx_lpf_horizontal_16_neon,

-                   &vpx_lpf_horizontal_16_c, 8, 2),

-        make_tuple(&wrapper_vertical_16_neon,

-                   &wrapper_vertical_16_c, 8, 1),

-        make_tuple(&wrapper_vertical_16_dual_neon,

-                   &wrapper_vertical_16_dual_c, 8, 1),

+        make_tuple(&vpx_lpf_horizontal_edge_8_neon,

+                   &vpx_lpf_horizontal_edge_8_c, 8),

+        make_tuple(&vpx_lpf_horizontal_edge_16_neon,

+                   &vpx_lpf_horizontal_edge_16_c, 8),

+        make_tuple(&vpx_lpf_vertical_16_neon,

+                   &vpx_lpf_vertical_16_c, 8),

+        make_tuple(&vpx_lpf_vertical_16_dual_neon,

+                   &vpx_lpf_vertical_16_dual_c, 8),

 #endif  // HAVE_NEON_ASM

         make_tuple(&vpx_lpf_horizontal_8_neon,

-                   &vpx_lpf_horizontal_8_c, 8, 1),

+                   &vpx_lpf_horizontal_8_c, 8),

         make_tuple(&vpx_lpf_vertical_8_neon,

-                   &vpx_lpf_vertical_8_c, 8, 1),

+                   &vpx_lpf_vertical_8_c, 8),

         make_tuple(&vpx_lpf_horizontal_4_neon,

-                   &vpx_lpf_horizontal_4_c, 8, 1),

+                   &vpx_lpf_horizontal_4_c, 8),

         make_tuple(&vpx_lpf_vertical_4_neon,

-                   &vpx_lpf_vertical_4_c, 8, 1)));

+                   &vpx_lpf_vertical_4_c, 8)));

 INSTANTIATE_TEST_CASE_P(

     NEON, Loop8Test9Param,

     ::testing::Values(

@@ -692,15 +609,58 @@

 #endif  // CONFIG_VP9_HIGHBITDEPTH

 #endif  // HAVE_NEON

+#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH

+INSTANTIATE_TEST_CASE_P(  // tuple: (function under test, reference, bit depth)

+    DSPR2, Loop8Test6Param,

+    ::testing::Values(

+        make_tuple(&vpx_lpf_horizontal_4_dspr2,

+                   &vpx_lpf_horizontal_4_c, 8),

+        make_tuple(&vpx_lpf_horizontal_8_dspr2,

+                   &vpx_lpf_horizontal_8_c, 8),

+        make_tuple(&vpx_lpf_horizontal_edge_8,  // NOTE(review): unsuffixed symbol, unlike the *_dspr2 entries

+                   &vpx_lpf_horizontal_edge_8, 8),  // NOTE(review): reference equals op under test, not *_c -- confirm intended

+        make_tuple(&vpx_lpf_horizontal_edge_16,  // NOTE(review): unsuffixed symbol, unlike the *_dspr2 entries

+                   &vpx_lpf_horizontal_edge_16, 8),  // NOTE(review): reference equals op under test, not *_c -- confirm intended

+        make_tuple(&vpx_lpf_vertical_4_dspr2,

+                   &vpx_lpf_vertical_4_c, 8),

+        make_tuple(&vpx_lpf_vertical_8_dspr2,

+                   &vpx_lpf_vertical_8_c, 8),

+        make_tuple(&vpx_lpf_vertical_16_dspr2,

+                   &vpx_lpf_vertical_16_c, 8),

+        make_tuple(&vpx_lpf_vertical_16_dual_dspr2,

+                   &vpx_lpf_vertical_16_dual_c, 8)));

+INSTANTIATE_TEST_CASE_P(  // dual-edge variants: each *_dual_dspr2 paired with its *_dual_c reference

+    DSPR2, Loop8Test9Param,

+    ::testing::Values(

+        make_tuple(&vpx_lpf_horizontal_4_dual_dspr2,

+                   &vpx_lpf_horizontal_4_dual_c, 8),

+        make_tuple(&vpx_lpf_horizontal_8_dual_dspr2,

+                   &vpx_lpf_horizontal_8_dual_c, 8),

+        make_tuple(&vpx_lpf_vertical_4_dual_dspr2,

+                   &vpx_lpf_vertical_4_dual_c, 8),

+        make_tuple(&vpx_lpf_vertical_8_dual_dspr2,

+                   &vpx_lpf_vertical_8_dual_c, 8)));

+#endif  // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH

 #if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)

 INSTANTIATE_TEST_CASE_P(

     MSA, Loop8Test6Param,

     ::testing::Values(

-        make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8, 1),

-        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),

-        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2),

-        make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8, 1),

-        make_tuple(&wrapper_vertical_16_msa, &wrapper_vertical_16_c, 8, 1)));

+        make_tuple(&vpx_lpf_horizontal_4_msa,

+                   &vpx_lpf_horizontal_4_c, 8),

+        make_tuple(&vpx_lpf_horizontal_8_msa,

+                   &vpx_lpf_horizontal_8_c, 8),

+        make_tuple(&vpx_lpf_horizontal_edge_8_msa,

+                   &vpx_lpf_horizontal_edge_8_c, 8),

+        make_tuple(&vpx_lpf_horizontal_edge_16_msa,

+                   &vpx_lpf_horizontal_edge_16_c, 8),

+        make_tuple(&vpx_lpf_vertical_4_msa,

+                   &vpx_lpf_vertical_4_c, 8),

+        make_tuple(&vpx_lpf_vertical_8_msa,

+                   &vpx_lpf_vertical_8_c, 8),

+        make_tuple(&vpx_lpf_vertical_16_msa,

+                   &vpx_lpf_vertical_16_c, 8)));

 INSTANTIATE_TEST_CASE_P(

     MSA, Loop8Test9Param,

--- a/vp10/common/loopfilter.c

+++ b/vp10/common/loopfilter.c

@@ -324,7 +324,6 @@

     const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;

     const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);

-    // TODO(yunqingwang): count in loopfilter functions should be removed.

     if (mask & 1) {

       if ((mask_16x16_0 | mask_16x16_1) & 1) {

         if ((mask_16x16_0 & mask_16x16_1) & 1) {

@@ -345,11 +344,10 @@

                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,

                                   lfi1->hev_thr);

         } else if (mask_8x8_0 & 1) {

-          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,

-                             1);

+          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);

         } else {

           vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,

-                             lfi1->hev_thr, 1);

+                             lfi1->hev_thr);

@@ -359,11 +357,10 @@

                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,

                                   lfi1->hev_thr);

         } else if (mask_4x4_0 & 1) {

-          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,

-                             1);

+          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);

         } else {

           vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,

-                             lfi1->hev_thr, 1);

+                             lfi1->hev_thr);

@@ -374,10 +371,10 @@

                                   lfi1->hev_thr);

         } else if (mask_4x4_int_0 & 1) {

           vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,

-                             lfi0->hev_thr, 1);

+                             lfi0->hev_thr);

         } else {

           vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,

-                             lfi1->hev_thr, 1);

+                             lfi1->hev_thr);

@@ -424,7 +421,6 @@

     const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;

     const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);

-    // TODO(yunqingwang): count in loopfilter functions should be removed.

     if (mask & 1) {

       if ((mask_16x16_0 | mask_16x16_1) & 1) {

         if ((mask_16x16_0 & mask_16x16_1) & 1) {

@@ -446,10 +442,10 @@

                                          lfi1->hev_thr, bd);

         } else if (mask_8x8_0 & 1) {

           vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,

-                                    lfi0->hev_thr, 1, bd);

+                                    lfi0->hev_thr, bd);

         } else {

           vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,

-                                    lfi1->lim, lfi1->hev_thr, 1, bd);

+                                    lfi1->lim, lfi1->hev_thr, bd);

@@ -460,10 +456,10 @@

                                          lfi1->hev_thr, bd);

         } else if (mask_4x4_0 & 1) {

           vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,

-                                    lfi0->hev_thr, 1, bd);

+                                    lfi0->hev_thr, bd);

         } else {

           vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,

-                                    lfi1->lim, lfi1->hev_thr, 1, bd);

+                                    lfi1->lim, lfi1->hev_thr, bd);

@@ -474,10 +470,10 @@

                                          lfi1->hev_thr, bd);

         } else if (mask_4x4_int_0 & 1) {

           vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,

-                                    lfi0->hev_thr, 1, bd);

+                                    lfi0->hev_thr, bd);

         } else {

           vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,

-                                    lfi1->lim, lfi1->hev_thr, 1, bd);

+                                    lfi1->lim, lfi1->hev_thr, bd);

@@ -514,12 +510,12 @@

     if (mask & 1) {

       if (mask_16x16 & 1) {

         if ((mask_16x16 & 3) == 3) {

-          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,

-                                lfi->hev_thr, 2);

+          vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,

+                                     lfi->hev_thr);

           count = 2;

         } else {

-          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,

-                                lfi->hev_thr, 1);

+          vpx_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim,

+                                    lfi->hev_thr);

       } else if (mask_8x8 & 1) {

         if ((mask_8x8 & 3) == 3) {

@@ -537,18 +533,18 @@

           } else {

             if (mask_4x4_int & 1)

               vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                   lfi->hev_thr, 1);

+                                   lfi->hev_thr);

             else if (mask_4x4_int & 2)

               vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,

-                                   lfin->lim, lfin->hev_thr, 1);

+                                   lfin->lim, lfin->hev_thr);

           count = 2;

         } else {

-          vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);

+          vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);

           if (mask_4x4_int & 1)

             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                 lfi->hev_thr, 1);

+                                 lfi->hev_thr);

       } else if (mask_4x4 & 1) {

         if ((mask_4x4 & 3) == 3) {

@@ -565,22 +561,22 @@

           } else {

             if (mask_4x4_int & 1)

               vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                   lfi->hev_thr, 1);

+                                   lfi->hev_thr);

             else if (mask_4x4_int & 2)

               vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,

-                                   lfin->lim, lfin->hev_thr, 1);

+                                   lfin->lim, lfin->hev_thr);

           count = 2;

         } else {

-          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);

+          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);

           if (mask_4x4_int & 1)

             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                 lfi->hev_thr, 1);

+                                 lfi->hev_thr);

       } else if (mask_4x4_int & 1) {

         vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                             lfi->hev_thr, 1);

+                             lfi->hev_thr);

     s += 8 * count;

@@ -611,12 +607,12 @@

     if (mask & 1) {

       if (mask_16x16 & 1) {

         if ((mask_16x16 & 3) == 3) {

-          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,

-                                       lfi->hev_thr, 2, bd);

+          vpx_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,

+                                            lfi->hev_thr, bd);

           count = 2;

         } else {

-          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,

-                                       lfi->hev_thr, 1, bd);

+          vpx_highbd_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim,

+                                           lfi->hev_thr, bd);

       } else if (mask_8x8 & 1) {

         if ((mask_8x8 & 3) == 3) {

@@ -635,20 +631,20 @@

           } else {

             if (mask_4x4_int & 1) {

               vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,

-                                          lfi->lim, lfi->hev_thr, 1, bd);

+                                          lfi->lim, lfi->hev_thr, bd);

             } else if (mask_4x4_int & 2) {

               vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,

-                                          lfin->lim, lfin->hev_thr, 1, bd);

+                                          lfin->lim, lfin->hev_thr, bd);

           count = 2;

         } else {

           vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,

-                                      lfi->hev_thr, 1, bd);

+                                      lfi->hev_thr, bd);

           if (mask_4x4_int & 1) {

             vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,

-                                        lfi->lim, lfi->hev_thr, 1, bd);

+                                        lfi->lim, lfi->hev_thr, bd);

       } else if (mask_4x4 & 1) {

@@ -667,25 +663,25 @@

           } else {

             if (mask_4x4_int & 1) {

               vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,

-                                          lfi->lim, lfi->hev_thr, 1, bd);

+                                          lfi->lim, lfi->hev_thr, bd);

             } else if (mask_4x4_int & 2) {

               vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,

-                                          lfin->lim, lfin->hev_thr, 1, bd);

+                                          lfin->lim, lfin->hev_thr, bd);

           count = 2;

         } else {

           vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,

-                                      lfi->hev_thr, 1, bd);

+                                      lfi->hev_thr, bd);

           if (mask_4x4_int & 1) {

             vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,

-                                        lfi->lim, lfi->hev_thr, 1, bd);

+                                        lfi->lim, lfi->hev_thr, bd);

       } else if (mask_4x4_int & 1) {

         vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                    lfi->hev_thr, 1, bd);

+                                    lfi->hev_thr, bd);

     s += 8 * count;

@@ -1127,13 +1123,13 @@

       if (mask_16x16 & 1) {

         vpx_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);

       } else if (mask_8x8 & 1) {

-        vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);

+        vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);

       } else if (mask_4x4 & 1) {

-        vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);

+        vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);

     if (mask_4x4_int & 1)

-      vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);

+      vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);

     s += 8;

     lfl += 1;

     mask_16x16 >>= 1;

@@ -1163,15 +1159,15 @@

                                    lfi->hev_thr, bd);

       } else if (mask_8x8 & 1) {

         vpx_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim,

-                                  lfi->hev_thr, 1, bd);

+                                  lfi->hev_thr, bd);

       } else if (mask_4x4 & 1) {

         vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,

-                                lfi->hev_thr, 1, bd);

+                                  lfi->hev_thr, bd);

     if (mask_4x4_int & 1)

       vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,

-                                lfi->hev_thr, 1, bd);

+                                lfi->hev_thr, bd);

     s += 8;

     lfl += 1;

     mask_16x16 >>= 1;

--- a/vp9/common/vp9_loopfilter.c

+++ b/vp9/common/vp9_loopfilter.c

@@ -324,7 +324,6 @@

     const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;

     const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);

-    // TODO(yunqingwang): count in loopfilter functions should be removed.

     if (mask & 1) {

       if ((mask_16x16_0 | mask_16x16_1) & 1) {

         if ((mask_16x16_0 & mask_16x16_1) & 1) {

@@ -345,11 +344,10 @@

                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,

                                   lfi1->hev_thr);

         } else if (mask_8x8_0 & 1) {

-          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,

-                             1);

+          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);

         } else {

           vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,

-                             lfi1->hev_thr, 1);

+                             lfi1->hev_thr);

@@ -359,11 +357,10 @@

                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,

                                   lfi1->hev_thr);

         } else if (mask_4x4_0 & 1) {

-          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,

-                             1);

+          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);

         } else {

           vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,

-                             lfi1->hev_thr, 1);

+                             lfi1->hev_thr);

@@ -374,10 +371,10 @@

                                   lfi1->hev_thr);

         } else if (mask_4x4_int_0 & 1) {

           vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,

-                             lfi0->hev_thr, 1);

+                             lfi0->hev_thr);

         } else {

           vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,

-                             lfi1->hev_thr, 1);

+                             lfi1->hev_thr);

@@ -424,7 +421,6 @@

     const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;

     const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);

-    // TODO(yunqingwang): count in loopfilter functions should be removed.

     if (mask & 1) {

       if ((mask_16x16_0 | mask_16x16_1) & 1) {

         if ((mask_16x16_0 & mask_16x16_1) & 1) {

@@ -446,10 +442,10 @@

                                          lfi1->hev_thr, bd);

         } else if (mask_8x8_0 & 1) {

           vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,

-                                    lfi0->hev_thr, 1, bd);

+                                    lfi0->hev_thr, bd);

         } else {

           vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,

-                                    lfi1->lim, lfi1->hev_thr, 1, bd);

+                                    lfi1->lim, lfi1->hev_thr, bd);

@@ -460,10 +456,10 @@

                                          lfi1->hev_thr, bd);

         } else if (mask_4x4_0 & 1) {

           vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,

-                                    lfi0->hev_thr, 1, bd);

+                                    lfi0->hev_thr, bd);

         } else {

           vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,

-                                    lfi1->lim, lfi1->hev_thr, 1, bd);

+                                    lfi1->lim, lfi1->hev_thr, bd);

@@ -474,10 +470,10 @@

                                          lfi1->hev_thr, bd);

         } else if (mask_4x4_int_0 & 1) {

           vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,

-                                    lfi0->hev_thr, 1, bd);

+                                    lfi0->hev_thr, bd);

         } else {

           vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,

-                                    lfi1->lim, lfi1->hev_thr, 1, bd);

+                                    lfi1->lim, lfi1->hev_thr, bd);

@@ -514,12 +510,12 @@

     if (mask & 1) {

       if (mask_16x16 & 1) {

         if ((mask_16x16 & 3) == 3) {

-          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,

-                                lfi->hev_thr, 2);

+          vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,

+                                     lfi->hev_thr);

           count = 2;

         } else {

-          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,

-                                lfi->hev_thr, 1);

+          vpx_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim,

+                                    lfi->hev_thr);

       } else if (mask_8x8 & 1) {

         if ((mask_8x8 & 3) == 3) {

@@ -537,18 +533,18 @@

           } else {

             if (mask_4x4_int & 1)

               vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                   lfi->hev_thr, 1);

+                                   lfi->hev_thr);

             else if (mask_4x4_int & 2)

               vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,

-                                   lfin->lim, lfin->hev_thr, 1);

+                                   lfin->lim, lfin->hev_thr);

           count = 2;

         } else {

-          vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);

+          vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);

           if (mask_4x4_int & 1)

             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                 lfi->hev_thr, 1);

+                                 lfi->hev_thr);

       } else if (mask_4x4 & 1) {

         if ((mask_4x4 & 3) == 3) {

@@ -565,22 +561,22 @@

           } else {

             if (mask_4x4_int & 1)

               vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                   lfi->hev_thr, 1);

+                                   lfi->hev_thr);

             else if (mask_4x4_int & 2)

               vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,

-                                   lfin->lim, lfin->hev_thr, 1);

+                                   lfin->lim, lfin->hev_thr);

           count = 2;

         } else {

-          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);

+          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);

           if (mask_4x4_int & 1)

             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                 lfi->hev_thr, 1);

+                                 lfi->hev_thr);

       } else if (mask_4x4_int & 1) {

         vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                             lfi->hev_thr, 1);

+                             lfi->hev_thr);

     s += 8 * count;

@@ -611,12 +607,12 @@

     if (mask & 1) {

       if (mask_16x16 & 1) {

         if ((mask_16x16 & 3) == 3) {

-          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,

-                                       lfi->hev_thr, 2, bd);

+          vpx_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,

+                                            lfi->hev_thr, bd);

           count = 2;

         } else {

-          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,

-                                       lfi->hev_thr, 1, bd);

+          vpx_highbd_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim,

+                                           lfi->hev_thr, bd);

       } else if (mask_8x8 & 1) {

         if ((mask_8x8 & 3) == 3) {

@@ -635,20 +631,20 @@

           } else {

             if (mask_4x4_int & 1) {

               vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,

-                                          lfi->lim, lfi->hev_thr, 1, bd);

+                                          lfi->lim, lfi->hev_thr, bd);

             } else if (mask_4x4_int & 2) {

               vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,

-                                          lfin->lim, lfin->hev_thr, 1, bd);

+                                          lfin->lim, lfin->hev_thr, bd);

           count = 2;

         } else {

           vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,

-                                      lfi->hev_thr, 1, bd);

+                                      lfi->hev_thr, bd);

           if (mask_4x4_int & 1) {

             vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,

-                                        lfi->lim, lfi->hev_thr, 1, bd);

+                                        lfi->lim, lfi->hev_thr, bd);

       } else if (mask_4x4 & 1) {

@@ -667,25 +663,25 @@

           } else {

             if (mask_4x4_int & 1) {

               vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,

-                                          lfi->lim, lfi->hev_thr, 1, bd);

+                                          lfi->lim, lfi->hev_thr, bd);

             } else if (mask_4x4_int & 2) {

               vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,

-                                          lfin->lim, lfin->hev_thr, 1, bd);

+                                          lfin->lim, lfin->hev_thr, bd);

           count = 2;

         } else {

           vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,

-                                      lfi->hev_thr, 1, bd);

+                                      lfi->hev_thr, bd);

           if (mask_4x4_int & 1) {

             vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,

-                                        lfi->lim, lfi->hev_thr, 1, bd);

+                                        lfi->lim, lfi->hev_thr, bd);

       } else if (mask_4x4_int & 1) {

         vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                    lfi->hev_thr, 1, bd);

+                                    lfi->hev_thr, bd);

     s += 8 * count;

@@ -1102,13 +1098,13 @@

       if (mask_16x16 & 1) {

         vpx_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);

       } else if (mask_8x8 & 1) {

-        vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);

+        vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);

       } else if (mask_4x4 & 1) {

-        vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);

+        vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);

     if (mask_4x4_int & 1)

-      vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);

+      vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);

     s += 8;

     lfl += 1;

     mask_16x16 >>= 1;

@@ -1138,15 +1134,15 @@

                                    lfi->hev_thr, bd);

       } else if (mask_8x8 & 1) {

         vpx_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim,

-                                  lfi->hev_thr, 1, bd);

+                                  lfi->hev_thr, bd);

       } else if (mask_4x4 & 1) {

         vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,

-                                lfi->hev_thr, 1, bd);

+                                  lfi->hev_thr, bd);

     if (mask_4x4_int & 1)

       vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,

-                                lfi->hev_thr, 1, bd);

+                                lfi->hev_thr, bd);

     s += 8;

     lfl += 1;

     mask_16x16 >>= 1;

--- a/vpx_dsp/arm/loopfilter_4_neon.asm

+++ b/vpx_dsp/arm/loopfilter_4_neon.asm

@@ -16,15 +16,12 @@

 ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter

 ; works on 16 iterations at a time.

-; TODO(fgalligan): See about removing the count code as this function is only

-; called with a count of 1.

 ; void vpx_lpf_horizontal_4_neon(uint8_t *s,

 ;                                int p /* pitch */,

 ;                                const uint8_t *blimit,

 ;                                const uint8_t *limit,

-;                                const uint8_t *thresh,

-;                                int count)

+;                                const uint8_t *thresh)

 ; r0    uint8_t *s,

 ; r1    int p, /* pitch */

@@ -31,22 +28,16 @@

 ; r2    const uint8_t *blimit,

 ; r3    const uint8_t *limit,

 ; sp    const uint8_t *thresh,

-; sp+4  int count

 |vpx_lpf_horizontal_4_neon| PROC

     push        {lr}

     vld1.8      {d0[]}, [r2]               ; duplicate *blimit

-    ldr         r12, [sp, #8]              ; load count

     ldr         r2, [sp, #4]               ; load thresh

     add         r1, r1, r1                 ; double pitch

-    cmp         r12, #0

-    beq         end_vpx_lf_h_edge

     vld1.8      {d1[]}, [r3]               ; duplicate *limit

     vld1.8      {d2[]}, [r2]               ; duplicate *thresh

-count_lf_h_loop

     sub         r2, r0, r1, lsl #1         ; move src pointer down by 4 lines

     add         r3, r2, r1, lsr #1         ; set to 3 lines down

@@ -69,25 +60,17 @@

     vst1.u8     {d6}, [r2@64], r1          ; store oq0

     vst1.u8     {d7}, [r3@64], r1          ; store oq1

-    add         r0, r0, #8

-    subs        r12, r12, #1

-    bne         count_lf_h_loop

-end_vpx_lf_h_edge

     pop         {pc}

     ENDP        ; |vpx_lpf_horizontal_4_neon|

 ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter

 ; works on 16 iterations at a time.

-; TODO(fgalligan): See about removing the count code as this function is only

-; called with a count of 1.

 ; void vpx_lpf_vertical_4_neon(uint8_t *s,

 ;                              int p /* pitch */,

 ;                              const uint8_t *blimit,

 ;                              const uint8_t *limit,

-;                              const uint8_t *thresh,

-;                              int count)

+;                              const uint8_t *thresh)

 ; r0    uint8_t *s,

 ; r1    int p, /* pitch */

@@ -94,22 +77,17 @@

 ; r2    const uint8_t *blimit,

 ; r3    const uint8_t *limit,

 ; sp    const uint8_t *thresh,

-; sp+4  int count

 |vpx_lpf_vertical_4_neon| PROC

     push        {lr}

     vld1.8      {d0[]}, [r2]              ; duplicate *blimit

-    ldr         r12, [sp, #8]             ; load count

     vld1.8      {d1[]}, [r3]              ; duplicate *limit

     ldr         r3, [sp, #4]              ; load thresh

     sub         r2, r0, #4                ; move s pointer down by 4 columns

-    cmp         r12, #0

-    beq         end_vpx_lf_v_edge

     vld1.8      {d2[]}, [r3]              ; duplicate *thresh

-count_lf_v_loop

     vld1.u8     {d3}, [r2], r1             ; load s data

     vld1.u8     {d4}, [r2], r1

     vld1.u8     {d5}, [r2], r1

@@ -149,12 +127,6 @@

     vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r0], r1

     vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0]

-    add         r0, r0, r1, lsl #3         ; s += pitch * 8

-    subs        r12, r12, #1

-    subne       r2, r0, #4                 ; move s pointer down by 4 columns

-    bne         count_lf_v_loop

-end_vpx_lf_v_edge

     pop         {pc}

     ENDP        ; |vpx_lpf_vertical_4_neon|

--- a/vpx_dsp/arm/loopfilter_4_neon.c

+++ b/vpx_dsp/arm/loopfilter_4_neon.c

@@ -115,22 +115,18 @@

         int pitch,

         const uint8_t *blimit,

         const uint8_t *limit,

-        const uint8_t *thresh,

-        int count) {

+        const uint8_t *thresh) {

     int i;

     uint8_t *s, *psrc;

     uint8x8_t dblimit, dlimit, dthresh;

     uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;

-    if (count == 0)  // end_vpx_lf_h_edge

-        return;

     dblimit = vld1_u8(blimit);

     dlimit = vld1_u8(limit);

     dthresh = vld1_u8(thresh);

     psrc = src - (pitch << 2);

-    for (i = 0; i < count; i++) {

+    for (i = 0; i < 1; i++) {

         s = psrc + i * 8;

         d3u8 = vld1_u8(s);

@@ -170,8 +166,7 @@

         int pitch,

         const uint8_t *blimit,

         const uint8_t *limit,

-        const uint8_t *thresh,

-        int count) {

+        const uint8_t *thresh) {

     int i, pitch8;

     uint8_t *s;

     uint8x8_t dblimit, dlimit, dthresh;

@@ -181,15 +176,12 @@

     uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;

     uint8x8x4_t d4Result;

-    if (count == 0)  // end_vpx_lf_h_edge

-        return;

     dblimit = vld1_u8(blimit);

     dlimit = vld1_u8(limit);

     dthresh = vld1_u8(thresh);

     pitch8 = pitch * 8;

-    for (i = 0; i < count; i++, src += pitch8) {

+    for (i = 0; i < 1; i++, src += pitch8) {

         s = src - (i + 1) * 4;

         d3u8 = vld1_u8(s);

--- a/vpx_dsp/arm/loopfilter_8_neon.asm

+++ b/vpx_dsp/arm/loopfilter_8_neon.asm

@@ -16,35 +16,26 @@

 ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter

 ; works on 16 iterations at a time.

-; TODO(fgalligan): See about removing the count code as this function is only

-; called with a count of 1.

 ; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p,

 ;                                const uint8_t *blimit,

 ;                                const uint8_t *limit,

-;                                const uint8_t *thresh,

-;                                int count)

+;                                const uint8_t *thresh)

 ; r0    uint8_t *s,

 ; r1    int p, /* pitch */

 ; r2    const uint8_t *blimit,

 ; r3    const uint8_t *limit,

 ; sp    const uint8_t *thresh,

-; sp+4  int count

 |vpx_lpf_horizontal_8_neon| PROC

     push        {r4-r5, lr}

     vld1.8      {d0[]}, [r2]               ; duplicate *blimit

-    ldr         r12, [sp, #16]             ; load count

     ldr         r2, [sp, #12]              ; load thresh

     add         r1, r1, r1                 ; double pitch

-    cmp         r12, #0

-    beq         end_vpx_mblf_h_edge

     vld1.8      {d1[]}, [r3]               ; duplicate *limit

     vld1.8      {d2[]}, [r2]               ; duplicate *thresh

-count_mblf_h_loop

     sub         r3, r0, r1, lsl #1         ; move src pointer down by 4 lines

     add         r2, r3, r1, lsr #1         ; set to 3 lines down

@@ -69,11 +60,6 @@

     vst1.u8     {d4}, [r2@64], r1          ; store oq1

     vst1.u8     {d5}, [r3@64], r1          ; store oq2

-    add         r0, r0, #8

-    subs        r12, r12, #1

-    bne         count_mblf_h_loop

-end_vpx_mblf_h_edge

     pop         {r4-r5, pc}

     ENDP        ; |vpx_lpf_horizontal_8_neon|

@@ -82,8 +68,7 @@

 ;                              int pitch,

 ;                              const uint8_t *blimit,

 ;                              const uint8_t *limit,

-;                              const uint8_t *thresh,

-;                              int count)

+;                              const uint8_t *thresh)

 ; r0    uint8_t *s,

 ; r1    int pitch,

@@ -90,22 +75,17 @@

 ; r2    const uint8_t *blimit,

 ; r3    const uint8_t *limit,

 ; sp    const uint8_t *thresh,

-; sp+4  int count

 |vpx_lpf_vertical_8_neon| PROC

     push        {r4-r5, lr}

     vld1.8      {d0[]}, [r2]              ; duplicate *blimit

-    ldr         r12, [sp, #16]            ; load count

     vld1.8      {d1[]}, [r3]              ; duplicate *limit

     ldr         r3, [sp, #12]             ; load thresh

     sub         r2, r0, #4                ; move s pointer down by 4 columns

-    cmp         r12, #0

-    beq         end_vpx_mblf_v_edge

     vld1.8      {d2[]}, [r3]              ; duplicate *thresh

-count_mblf_v_loop

     vld1.u8     {d3}, [r2], r1             ; load s data

     vld1.u8     {d4}, [r2], r1

     vld1.u8     {d5}, [r2], r1

@@ -156,12 +136,6 @@

     vst2.8      {d4[6], d5[6]}, [r3], r1

     vst2.8      {d4[7], d5[7]}, [r3]

-    add         r0, r0, r1, lsl #3         ; s += pitch * 8

-    subs        r12, r12, #1

-    subne       r2, r0, #4                 ; move s pointer down by 4 columns

-    bne         count_mblf_v_loop

-end_vpx_mblf_v_edge

     pop         {r4-r5, pc}

     ENDP        ; |vpx_lpf_vertical_8_neon|

--- a/vpx_dsp/arm/loopfilter_8_neon.c

+++ b/vpx_dsp/arm/loopfilter_8_neon.c

@@ -268,8 +268,7 @@

         int pitch,

         const uint8_t *blimit,

         const uint8_t *limit,

-        const uint8_t *thresh,

-        int count) {

+        const uint8_t *thresh) {

     int i;

     uint8_t *s, *psrc;

     uint8x8_t dblimit, dlimit, dthresh;

@@ -276,15 +275,12 @@

     uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;

     uint8x8_t d16u8, d17u8, d18u8;

-    if (count == 0)  // end_vpx_mblf_h_edge

-        return;

     dblimit = vld1_u8(blimit);

     dlimit = vld1_u8(limit);

     dthresh = vld1_u8(thresh);

     psrc = src - (pitch << 2);

-    for (i = 0; i < count; i++) {

+    for (i = 0; i < 1; i++) {

         s = psrc + i * 8;

         d3u8  = vld1_u8(s);

@@ -328,8 +324,7 @@

         int pitch,

         const uint8_t *blimit,

         const uint8_t *limit,

-        const uint8_t *thresh,

-        int count) {

+        const uint8_t *thresh) {

     int i;

     uint8_t *s;

     uint8x8_t dblimit, dlimit, dthresh;

@@ -341,14 +336,11 @@

     uint8x8x4_t d4Result;

     uint8x8x2_t d2Result;

-    if (count == 0)

-        return;

     dblimit = vld1_u8(blimit);

     dlimit = vld1_u8(limit);

     dthresh = vld1_u8(thresh);

-    for (i = 0; i < count; i++) {

+    for (i = 0; i < 1; i++) {

         s = src + (i * (pitch << 3)) - 4;

         d3u8 = vld1_u8(s);

--- a/vpx_dsp/arm/loopfilter_mb_neon.asm

+++ b/vpx_dsp/arm/loopfilter_mb_neon.asm

@@ -8,27 +8,28 @@

 ;  be found in the AUTHORS file in the root of the source tree.

-    EXPORT  |vpx_lpf_horizontal_16_neon|

+    EXPORT  |vpx_lpf_horizontal_edge_8_neon|

+    EXPORT  |vpx_lpf_horizontal_edge_16_neon|

     EXPORT  |vpx_lpf_vertical_16_neon|

ARM

     AREA ||.text||, CODE, READONLY, ALIGN=2

-; void vpx_lpf_horizontal_16_neon(uint8_t *s, int p,

-;                                 const uint8_t *blimit,

-;                                 const uint8_t *limit,

-;                                 const uint8_t *thresh

-;                                 int count)

+; void mb_lpf_horizontal_edge(uint8_t *s, int p,

+;                             const uint8_t *blimit,

+;                             const uint8_t *limit,

+;                             const uint8_t *thresh,

+;                             int count)

 ; r0    uint8_t *s,

 ; r1    int p, /* pitch */

 ; r2    const uint8_t *blimit,

 ; r3    const uint8_t *limit,

 ; sp    const uint8_t *thresh,

-|vpx_lpf_horizontal_16_neon| PROC

+; r12   int count

+|mb_lpf_horizontal_edge| PROC

     push        {r4-r8, lr}

     vpush       {d8-d15}

     ldr         r4, [sp, #88]              ; load thresh

-    ldr         r12, [sp, #92]             ; load count

 h_count

     vld1.8      {d16[]}, [r2]              ; load *blimit

@@ -115,7 +116,35 @@

     vpop        {d8-d15}

     pop         {r4-r8, pc}

-    ENDP        ; |vpx_lpf_horizontal_16_neon|

+    ENDP        ; |mb_lpf_horizontal_edge|

+; void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch,

+;                                     const uint8_t *blimit,

+;                                     const uint8_t *limit,

+;                                     const uint8_t *thresh)

+; r0    uint8_t *s,

+; r1    int pitch,

+; r2    const uint8_t *blimit,

+; r3    const uint8_t *limit,

+; sp    const uint8_t *thresh

+|vpx_lpf_horizontal_edge_8_neon| PROC

+    mov r12, #1

+    b mb_lpf_horizontal_edge

+    ENDP        ; |vpx_lpf_horizontal_edge_8_neon|

+; void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch,

+;                                      const uint8_t *blimit,

+;                                      const uint8_t *limit,

+;                                      const uint8_t *thresh)

+; r0    uint8_t *s,

+; r1    int pitch,

+; r2    const uint8_t *blimit,

+; r3    const uint8_t *limit,

+; sp    const uint8_t *thresh

+|vpx_lpf_horizontal_edge_16_neon| PROC

+    mov r12, #2

+    b mb_lpf_horizontal_edge

+    ENDP        ; |vpx_lpf_horizontal_edge_16_neon|

 ; void vpx_lpf_vertical_16_neon(uint8_t *s, int p,

 ;                               const uint8_t *blimit,

--- a/vpx_dsp/arm/loopfilter_neon.c

+++ b/vpx_dsp/arm/loopfilter_neon.c

@@ -21,8 +21,8 @@

                                   const uint8_t *blimit1,

                                   const uint8_t *limit1,

                                   const uint8_t *thresh1) {

-  vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);

-  vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);

+  vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);

+  vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);

 #if HAVE_NEON_ASM

@@ -33,8 +33,8 @@

                                     const uint8_t *blimit1,

                                     const uint8_t *limit1,

                                     const uint8_t *thresh1) {

-  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);

-  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);

+  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);

+  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);

 void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p,

@@ -44,8 +44,8 @@

                                   const uint8_t *blimit1,

                                   const uint8_t *limit1,

                                   const uint8_t *thresh1) {

-  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);

-  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);

+  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);

+  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);

 void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p,

--- a/vpx_dsp/loopfilter.c

+++ b/vpx_dsp/loopfilter.c

@@ -119,12 +119,12 @@

 void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,

                             const uint8_t *blimit, const uint8_t *limit,

-                            const uint8_t *thresh, int count) {

+                            const uint8_t *thresh) {

   int i;

   // loop filter designed to work using chars so that we can make maximum use

   // of 8 bit simd instructions.

-  for (i = 0; i < 8 * count; ++i) {

+  for (i = 0; i < 8; ++i) {

     const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];

     const uint8_t q0 = s[0 * p],  q1 = s[1 * p],  q2 = s[2 * p],  q3 = s[3 * p];

     const int8_t mask = filter_mask(*limit, *blimit,

@@ -138,18 +138,17 @@

                                  const uint8_t *limit0, const uint8_t *thresh0,

                                  const uint8_t *blimit1, const uint8_t *limit1,

                                  const uint8_t *thresh1) {

-  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1);

-  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1);

+  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);

+  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);

 void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,

-                          const uint8_t *limit, const uint8_t *thresh,

-                          int count) {

+                          const uint8_t *limit, const uint8_t *thresh) {

   int i;

   // loop filter designed to work using chars so that we can make maximum use

   // of 8 bit simd instructions.

-  for (i = 0; i < 8 * count; ++i) {

+  for (i = 0; i < 8; ++i) {

     const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];

     const uint8_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];

     const int8_t mask = filter_mask(*limit, *blimit,

@@ -163,9 +162,8 @@

                                const uint8_t *limit0, const uint8_t *thresh0,

                                const uint8_t *blimit1, const uint8_t *limit1,

                                const uint8_t *thresh1) {

-  vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1);

-  vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,

-                                  thresh1, 1);

+  vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);

+  vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);

 static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,

@@ -190,13 +188,12 @@

 void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,

-                            const uint8_t *limit, const uint8_t *thresh,

-                            int count) {

+                            const uint8_t *limit, const uint8_t *thresh) {

   int i;

   // loop filter designed to work using chars so that we can make maximum use

   // of 8 bit simd instructions.

-  for (i = 0; i < 8 * count; ++i) {

+  for (i = 0; i < 8; ++i) {

     const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];

     const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];

@@ -213,16 +210,15 @@

                                  const uint8_t *limit0, const uint8_t *thresh0,

                                  const uint8_t *blimit1, const uint8_t *limit1,

                                  const uint8_t *thresh1) {

-  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1);

-  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1);

+  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);

+  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);

 void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,

-                          const uint8_t *limit, const uint8_t *thresh,

-                          int count) {

+                          const uint8_t *limit, const uint8_t *thresh) {

   int i;

-  for (i = 0; i < 8 * count; ++i) {

+  for (i = 0; i < 8; ++i) {

     const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];

     const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];

     const int8_t mask = filter_mask(*limit, *blimit,

@@ -238,9 +234,8 @@

                                const uint8_t *limit0, const uint8_t *thresh0,

                                const uint8_t *blimit1, const uint8_t *limit1,

                                const uint8_t *thresh1) {

-  vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1);

-  vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,

-                                    thresh1, 1);

+  vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);

+  vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);

 static INLINE void filter16(int8_t mask, uint8_t thresh,

@@ -294,9 +289,9 @@

-void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,

-                             const uint8_t *limit, const uint8_t *thresh,

-                             int count) {

+static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,

+                                     const uint8_t *limit,

+                                     const uint8_t *thresh, int count) {

   int i;

   // loop filter designed to work using chars so that we can make maximum use

@@ -320,6 +315,16 @@

+void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit,

+                                 const uint8_t *limit, const uint8_t *thresh) {

+  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);

+}

+void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,

+                                  const uint8_t *limit, const uint8_t *thresh) {

+  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);

+}

 static void mb_lpf_vertical_edge_w(uint8_t *s, int p,

                                    const uint8_t *blimit,

                                    const uint8_t *limit,

@@ -450,12 +455,12 @@

 void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,

                                    const uint8_t *blimit, const uint8_t *limit,

-                                   const uint8_t *thresh, int count, int bd) {

+                                   const uint8_t *thresh, int bd) {

   int i;

   // loop filter designed to work using chars so that we can make maximum use

   // of 8 bit simd instructions.

-  for (i = 0; i < 8 * count; ++i) {

+  for (i = 0; i < 8; ++i) {

     const uint16_t p3 = s[-4 * p];

     const uint16_t p2 = s[-3 * p];

     const uint16_t p1 = s[-2 * p];

@@ -479,18 +484,18 @@

                                         const uint8_t *limit1,

                                         const uint8_t *thresh1,

                                         int bd) {

-  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1, bd);

-  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);

+  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);

+  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);

 void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,

                                  const uint8_t *limit, const uint8_t *thresh,

-                                 int count, int bd) {

+                                 int bd) {

   int i;

   // loop filter designed to work using chars so that we can make maximum use

   // of 8 bit simd instructions.

-  for (i = 0; i < 8 * count; ++i) {

+  for (i = 0; i < 8; ++i) {

     const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];

     const uint16_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];

     const int8_t mask = highbd_filter_mask(*limit, *blimit,

@@ -508,9 +513,9 @@

                                       const uint8_t *limit1,

                                       const uint8_t *thresh1,

                                       int bd) {

-  vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1, bd);

+  vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);

   vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,

-                              thresh1, 1, bd);

+                              thresh1, bd);

 static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,

@@ -536,12 +541,12 @@

 void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,

                                    const uint8_t *limit, const uint8_t *thresh,

-                                   int count, int bd) {

+                                   int bd) {

   int i;

   // loop filter designed to work using chars so that we can make maximum use

   // of 8 bit simd instructions.

-  for (i = 0; i < 8 * count; ++i) {

+  for (i = 0; i < 8; ++i) {

     const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];

     const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];

@@ -564,16 +569,16 @@

                                         const uint8_t *limit1,

                                         const uint8_t *thresh1,

                                         int bd) {

-  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1, bd);

-  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);

+  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);

+  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);

 void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,

                                  const uint8_t *limit, const uint8_t *thresh,

-                                 int count, int bd) {

+                                 int bd) {

   int i;

-  for (i = 0; i < 8 * count; ++i) {

+  for (i = 0; i < 8; ++i) {

     const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];

     const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];

     const int8_t mask = highbd_filter_mask(*limit, *blimit,

@@ -596,9 +601,9 @@

                                       const uint8_t *limit1,

                                       const uint8_t *thresh1,

                                       int bd) {

-  vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1, bd);

+  vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);

   vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,

-                              thresh1, 1, bd);

+                              thresh1, bd);

 static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,

@@ -664,9 +669,11 @@

-void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,

-                                    const uint8_t *limit, const uint8_t *thresh,

-                                    int count, int bd) {

+static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,

+                                            const uint8_t *blimit,

+                                            const uint8_t *limit,

+                                            const uint8_t *thresh,

+                                            int count, int bd) {

   int i;

   // loop filter designed to work using chars so that we can make maximum use

@@ -696,6 +703,20 @@

                     bd);

     ++s;

+}

+void vpx_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p,

+                                        const uint8_t *blimit,

+                                        const uint8_t *limit,

+                                        const uint8_t *thresh, int bd) {

+  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);

+}

+void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p,

+                                         const uint8_t *blimit,

+                                         const uint8_t *limit,

+                                         const uint8_t *thresh, int bd) {

+  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);

 static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,

--- a/vpx_dsp/mips/loopfilter_16_msa.c

+++ b/vpx_dsp/mips/loopfilter_16_msa.c

@@ -423,11 +423,11 @@

-void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,

-                               const uint8_t *b_limit_ptr,

-                               const uint8_t *limit_ptr,

-                               const uint8_t *thresh_ptr,

-                               int32_t count) {

+static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,

+                                   const uint8_t *b_limit_ptr,

+                                   const uint8_t *limit_ptr,

+                                   const uint8_t *thresh_ptr,

+                                   int32_t count) {

   if (1 == count) {

     uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;

     uint64_t dword0, dword1;

@@ -646,6 +646,20 @@

     vpx_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,

                                    thresh_ptr, count);

+}

+void vpx_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch,

+                                   const uint8_t *b_limit_ptr,

+                                   const uint8_t *limit_ptr,

+                                   const uint8_t *thresh_ptr) {

+  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);

+}

+void vpx_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch,

+                                    const uint8_t *b_limit_ptr,

+                                    const uint8_t *limit_ptr,

+                                    const uint8_t *thresh_ptr) {

+  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);

 static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,

--- a/vpx_dsp/mips/loopfilter_4_msa.c

+++ b/vpx_dsp/mips/loopfilter_4_msa.c

@@ -13,14 +13,11 @@

 void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,

                               const uint8_t *b_limit_ptr,

                               const uint8_t *limit_ptr,

-                              const uint8_t *thresh_ptr,

-                              int32_t count) {

+                              const uint8_t *thresh_ptr) {

   uint64_t p1_d, p0_d, q0_d, q1_d;

   v16u8 mask, hev, flat, thresh, b_limit, limit;

   v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;

-  (void)count;

   /* load vector elements */

   LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

@@ -74,13 +71,10 @@

 void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,

                             const uint8_t *b_limit_ptr,

                             const uint8_t *limit_ptr,

-                            const uint8_t *thresh_ptr,

-                            int32_t count) {

+                            const uint8_t *thresh_ptr) {

   v16u8 mask, hev, flat, limit, thresh, b_limit;

   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

   v8i16 vec0, vec1, vec2, vec3;

-  (void)count;

   LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

--- a/vpx_dsp/mips/loopfilter_8_msa.c

+++ b/vpx_dsp/mips/loopfilter_8_msa.c

@@ -13,8 +13,7 @@

 void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,

                               const uint8_t *b_limit_ptr,

                               const uint8_t *limit_ptr,

-                              const uint8_t *thresh_ptr,

-                              int32_t count) {

+                              const uint8_t *thresh_ptr) {

   uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;

   v16u8 mask, hev, flat, thresh, b_limit, limit;

   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

@@ -23,8 +22,6 @@

   v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;

   v16i8 zero = { 0 };

-  (void)count;

   /* load vector elements */

   LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

@@ -161,8 +158,7 @@

 void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,

                             const uint8_t *b_limit_ptr,

                             const uint8_t *limit_ptr,

-                            const uint8_t *thresh_ptr,

-                            int32_t count) {

+                            const uint8_t *thresh_ptr) {

   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

   v16u8 p1_out, p0_out, q0_out, q1_out;

   v16u8 flat, mask, hev, thresh, b_limit, limit;

@@ -170,8 +166,6 @@

   v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;

   v16u8 zero = { 0 };

   v8i16 vec0, vec1, vec2, vec3, vec4;

-  (void)count;

   /* load vector elements */

   LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

--- a/vpx_dsp/mips/loopfilter_filters_dspr2.c

+++ b/vpx_dsp/mips/loopfilter_filters_dspr2.c

@@ -23,8 +23,7 @@

                                 int pitch,

                                 const uint8_t *blimit,

                                 const uint8_t *limit,

-                                const uint8_t *thresh,

-                                int count) {

+                                const uint8_t *thresh) {

   uint8_t   i;

   uint32_t  mask;

   uint32_t  hev;

@@ -117,8 +116,7 @@

                               int pitch,

                               const uint8_t *blimit,

                               const uint8_t *limit,

-                              const uint8_t *thresh,

-                              int count) {

+                              const uint8_t *thresh) {

   uint8_t   i;

   uint32_t  mask, hev;

   uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;

@@ -313,8 +311,8 @@

                                      const uint8_t *blimit1,

                                      const uint8_t *limit1,

                                      const uint8_t *thresh1) {

-  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);

-  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);

+  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);

+  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);

 void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,

@@ -324,8 +322,8 @@

                                      const uint8_t *blimit1,

                                      const uint8_t *limit1,

                                      const uint8_t *thresh1) {

-  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);

-  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);

+  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);

+  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);

 void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,

@@ -335,8 +333,8 @@

                                    const uint8_t *blimit1,

                                    const uint8_t *limit1,

                                    const uint8_t *thresh1) {

-  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);

-  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);

+  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0);

+  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);

 void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,

@@ -346,9 +344,8 @@

                                    const uint8_t *blimit1,

                                    const uint8_t *limit1,

                                    const uint8_t *thresh1) {

-  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);

-  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,

-                                       1);

+  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0);

+  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);

 void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,

--- a/vpx_dsp/mips/loopfilter_mb_dspr2.c

+++ b/vpx_dsp/mips/loopfilter_mb_dspr2.c

@@ -23,8 +23,7 @@

                                 int pitch,

                                 const uint8_t *blimit,

                                 const uint8_t *limit,

-                                const uint8_t *thresh,

-                                int count) {

+                                const uint8_t *thresh) {

   uint32_t  mask;

   uint32_t  hev, flat;

   uint8_t   i;

@@ -322,8 +321,7 @@

                               int pitch,

                               const uint8_t *blimit,

                               const uint8_t *limit,

-                              const uint8_t *thresh,

-                              int count) {

+                              const uint8_t *thresh) {

   uint8_t   i;

   uint32_t  mask, hev, flat;

   uint8_t   *s1, *s2, *s3, *s4;

--- a/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c

+++ b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c

@@ -19,12 +19,12 @@

 #include "vpx_mem/vpx_mem.h"

 #if HAVE_DSPR2

-void vpx_lpf_horizontal_16_dspr2(unsigned char *s,

-                                 int pitch,

-                                 const uint8_t *blimit,

-                                 const uint8_t *limit,

-                                 const uint8_t *thresh,

-                                 int count) {

+static void mb_lpf_horizontal_edge(unsigned char *s,

+                                   int pitch,

+                                   const uint8_t *blimit,

+                                   const uint8_t *limit,

+                                   const uint8_t *thresh,

+                                   int count) {

   uint32_t  mask;

   uint32_t  hev, flat, flat2;

   uint8_t   i;

@@ -790,5 +790,19 @@

     s = s + 4;

+}

+void vpx_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch,

+                                     const uint8_t *blimit,

+                                     const uint8_t *limit,

+                                     const uint8_t *thresh) {

+  mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1);

+}

+void vpx_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch,

+                                      const uint8_t *blimit,

+                                      const uint8_t *limit,

+                                      const uint8_t *thresh) {

+  mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2);

 #endif  // #if HAVE_DSPR2

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -535,7 +535,7 @@

 specialize qw/vpx_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;

 $vpx_lpf_vertical_16_dual_neon_asm=vpx_lpf_vertical_16_dual_neon;

-add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";

+add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";

 specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/;

 add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";

@@ -542,17 +542,21 @@

 specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;

 $vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;

-add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";

+add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";

 specialize qw/vpx_lpf_vertical_4 mmx neon dspr2 msa/;

 add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";

 specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;

-add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";

-specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/;

-$vpx_lpf_horizontal_16_neon_asm=vpx_lpf_horizontal_16_neon;

+add_proto qw/void vpx_lpf_horizontal_edge_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";

+specialize qw/vpx_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/;

+$vpx_lpf_horizontal_edge_8_neon_asm=vpx_lpf_horizontal_edge_8_neon;

-add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";

+add_proto qw/void vpx_lpf_horizontal_edge_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";

+specialize qw/vpx_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/;

+$vpx_lpf_horizontal_edge_16_neon_asm=vpx_lpf_horizontal_edge_16_neon;

+add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";

 specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;

 add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";

@@ -559,7 +563,7 @@

 specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;

 $vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;

-add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";

+add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";

 specialize qw/vpx_lpf_horizontal_4 mmx neon dspr2 msa/;

 add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";

@@ -572,28 +576,31 @@

   add_proto qw/void vpx_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";

   specialize qw/vpx_highbd_lpf_vertical_16_dual sse2/;

-  add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";

+  add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";

   specialize qw/vpx_highbd_lpf_vertical_8 sse2/;

   add_proto qw/void vpx_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";

   specialize qw/vpx_highbd_lpf_vertical_8_dual sse2/;

-  add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";

+  add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";

   specialize qw/vpx_highbd_lpf_vertical_4 sse2/;

   add_proto qw/void vpx_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";

   specialize qw/vpx_highbd_lpf_vertical_4_dual sse2/;

-  add_proto qw/void vpx_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";

-  specialize qw/vpx_highbd_lpf_horizontal_16 sse2/;

+  add_proto qw/void vpx_highbd_lpf_horizontal_edge_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";

+  specialize qw/vpx_highbd_lpf_horizontal_edge_8 sse2/;

-  add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";

+  add_proto qw/void vpx_highbd_lpf_horizontal_edge_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";

+  specialize qw/vpx_highbd_lpf_horizontal_edge_16 sse2/;

+  add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";

   specialize qw/vpx_highbd_lpf_horizontal_8 sse2/;

   add_proto qw/void vpx_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";

   specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2/;

-  add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";

+  add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";

   specialize qw/vpx_highbd_lpf_horizontal_4 sse2/;

   add_proto qw/void vpx_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";

--- a/vpx_dsp/x86/highbd_loopfilter_sse2.c

+++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c

@@ -51,12 +51,10 @@

 // TODO(debargha, peter): Break up large functions into smaller ones

 // in this file.

-static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,

-                                                   int p,

-                                                   const uint8_t *_blimit,

-                                                   const uint8_t *_limit,

-                                                   const uint8_t *_thresh,

-                                                   int bd) {

+void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,

+                                           const uint8_t *_blimit,

+                                           const uint8_t *_limit,

+                                           const uint8_t *_thresh, int bd) {

   const __m128i zero = _mm_set1_epi16(0);

   const __m128i one = _mm_set1_epi16(1);

   __m128i blimit, limit, thresh;

@@ -496,34 +494,19 @@

   _mm_store_si128((__m128i *)(s - 0 * p), q0);

-static void highbd_mb_lpf_horizontal_edge_w_sse2_16(uint16_t *s,

-                                                    int p,

-                                                    const uint8_t *_blimit,

-                                                    const uint8_t *_limit,

-                                                    const uint8_t *_thresh,

-                                                    int bd) {

-  highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);

-  highbd_mb_lpf_horizontal_edge_w_sse2_8(s + 8, p, _blimit, _limit, _thresh,

-                                         bd);

+void vpx_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p,

+                                            const uint8_t *_blimit,

+                                            const uint8_t *_limit,

+                                            const uint8_t *_thresh, int bd) {

+  vpx_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd);

+  vpx_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd);

-// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.

-void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,

-                                       const uint8_t *_blimit,

-                                       const uint8_t *_limit,

-                                       const uint8_t *_thresh,

-                                       int count, int bd) {

-  if (count == 1)

-    highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);

-  else

-    highbd_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh, bd);

-}

 void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,

                                       const uint8_t *_blimit,

                                       const uint8_t *_limit,

                                       const uint8_t *_thresh,

-                                      int count, int bd) {

+                                      int bd) {

   DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);

   DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);

   DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);

@@ -556,8 +539,6 @@

   __m128i work_a;

   __m128i filter1, filter2;

-  (void)count;

   if (bd == 8) {

     blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);

     limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);

@@ -764,9 +745,8 @@

                                            const uint8_t *_limit1,

                                            const uint8_t *_thresh1,

                                            int bd) {

-  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);

-  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1,

-                                   1, bd);

+  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);

+  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);

 void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,

@@ -773,7 +753,7 @@

                                       const uint8_t *_blimit,

                                       const uint8_t *_limit,

                                       const uint8_t *_thresh,

-                                      int count, int bd) {

+                                      int bd) {

   const __m128i zero = _mm_set1_epi16(0);

   __m128i blimit, limit, thresh;

   __m128i mask, hev, flat;

@@ -813,8 +793,6 @@

   __m128i work_a;

   __m128i filter1, filter2;

-  (void)count;

   if (bd == 8) {

     blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);

     limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);

@@ -944,9 +922,8 @@

                                            const uint8_t *_limit1,

                                            const uint8_t *_thresh1,

                                            int bd) {

-  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);

-  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1,

-                                   bd);

+  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);

+  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);

 static INLINE void highbd_transpose(uint16_t *src[], int in_p,

@@ -1058,11 +1035,10 @@

                                     const uint8_t *blimit,

                                     const uint8_t *limit,

                                     const uint8_t *thresh,

-                                    int count, int bd) {

+                                    int bd) {

   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);

   uint16_t *src[1];

   uint16_t *dst[1];

-  (void)count;

   // Transpose 8x8

   src[0] = s - 4;

@@ -1071,8 +1047,7 @@

   highbd_transpose(src, p, dst, 8, 1);

   // Loop filtering

-  vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,

-                                   bd);

+  vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);

   src[0] = t_dst;

   dst[0] = s - 4;

@@ -1112,11 +1087,10 @@

                                     const uint8_t *blimit,

                                     const uint8_t *limit,

                                     const uint8_t *thresh,

-                                    int count, int bd) {

+                                    int bd) {

   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);

   uint16_t *src[1];

   uint16_t *dst[1];

-  (void)count;

   // Transpose 8x8

   src[0] = s - 4;

@@ -1125,8 +1099,7 @@

   highbd_transpose(src, p, dst, 8, 1);

   // Loop filtering

-  vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,

-                                   bd);

+  vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);

   src[0] = t_dst;

   dst[0] = s - 4;

@@ -1181,8 +1154,8 @@

   highbd_transpose(src, p, dst, 8, 2);

   // Loop filtering

-  highbd_mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit,

-                                         thresh, bd);

+  vpx_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit,

+                                        thresh, bd);

   src[0] = t_dst;

   src[1] = t_dst + 8 * 8;

   dst[0] = s - 8;

@@ -1205,8 +1178,8 @@

   highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);

   //  Loop filtering

-  highbd_mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,

-                                          thresh, bd);

+  vpx_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit,

+                                         thresh, bd);

   //  Transpose back

   highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);

--- a/vpx_dsp/x86/loopfilter_avx2.c

+++ b/vpx_dsp/x86/loopfilter_avx2.c

@@ -13,9 +13,10 @@

 #include "./vpx_dsp_rtcd.h"

 #include "vpx_ports/mem.h"

-static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,

-        const unsigned char *_blimit, const unsigned char *_limit,

-        const unsigned char *_thresh) {

+void vpx_lpf_horizontal_edge_8_avx2(unsigned char *s, int p,

+                                    const unsigned char *_blimit,

+                                    const unsigned char *_limit,

+                                    const unsigned char *_thresh) {

     __m128i mask, hev, flat, flat2;

     const __m128i zero = _mm_set1_epi16(0);

     const __m128i one = _mm_set1_epi8(1);

@@ -400,9 +401,10 @@

   8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128

};

-static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,

-        const unsigned char *_blimit, const unsigned char *_limit,

-        const unsigned char *_thresh) {

+void vpx_lpf_horizontal_edge_16_avx2(unsigned char *s, int p,

+                                     const unsigned char *_blimit,

+                                     const unsigned char *_limit,

+                                     const unsigned char *_thresh) {

     __m128i mask, hev, flat, flat2;

     const __m128i zero = _mm_set1_epi16(0);

     const __m128i one = _mm_set1_epi8(1);

@@ -974,13 +976,4 @@

         q6 = _mm_or_si128(flat2_q6, q6);

         _mm_storeu_si128((__m128i *) (s + 6 * p), q6);

-}

-void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,

-        const unsigned char *_blimit, const unsigned char *_limit,

-        const unsigned char *_thresh, int count) {

-    if (count == 1)

-        mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);

-    else

-        mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);

--- a/vpx_dsp/x86/loopfilter_mmx.asm

+++ b/vpx_dsp/x86/loopfilter_mmx.asm

@@ -18,14 +18,13 @@

 ;    int src_pixel_step,

 ;    const char *blimit,

 ;    const char *limit,

-;    const char *thresh,

-;    int  count

+;    const char *thresh

;)

 global sym(vpx_lpf_horizontal_4_mmx) PRIVATE

 sym(vpx_lpf_horizontal_4_mmx):

     push        rbp

     mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

+    SHADOW_ARGS_TO_STACK 5

     GET_GOT     rbx

     push        rsi

     push        rdi

@@ -39,8 +38,6 @@

         mov         rsi, arg(0) ;src_ptr

         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?

-        movsxd      rcx, dword ptr arg(5) ;count

-.next8_h:

         mov         rdx, arg(3) ;limit

         movq        mm7, [rdx]

         mov         rdi, rsi              ; rdi points to row +1 for indirect addressing

@@ -208,11 +205,6 @@

         pxor        mm7, [GLOBAL(t80)]    ; unoffset

         movq        [rdi], mm7            ; write back

-        add         rsi,8

-        neg         rax

-        dec         rcx

-        jnz         .next8_h

     add rsp, 32

     pop rsp

     ; begin epilog

@@ -230,14 +222,13 @@

 ;    int  src_pixel_step,

 ;    const char *blimit,

 ;    const char *limit,

-;    const char *thresh,

-;    int count

+;    const char *thresh

;)

 global sym(vpx_lpf_vertical_4_mmx) PRIVATE

 sym(vpx_lpf_vertical_4_mmx):

     push        rbp

     mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

+    SHADOW_ARGS_TO_STACK 5

     GET_GOT     rbx

     push        rsi

     push        rdi

@@ -254,8 +245,6 @@

         lea         rsi,        [rsi + rax*4 - 4]

-        movsxd      rcx,        dword ptr arg(5) ;count

-.next8_v:

         mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing

         add         rdi,        rax

@@ -578,10 +567,6 @@

         psrlq       mm5,        32

         movd        [rdi+rax*2+2], mm5

-        lea         rsi,        [rsi+rax*8]

-        dec         rcx

-        jnz         .next8_v

     add rsp, 64

     pop rsp

--- a/vpx_dsp/x86/loopfilter_sse2.c

+++ b/vpx_dsp/x86/loopfilter_sse2.c

@@ -18,11 +18,10 @@

   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));

-static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,

-                                            int p,

-                                            const unsigned char *_blimit,

-                                            const unsigned char *_limit,

-                                            const unsigned char *_thresh) {

+void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,

+                                    const unsigned char *_blimit,

+                                    const unsigned char *_limit,

+                                    const unsigned char *_thresh) {

   const __m128i zero = _mm_set1_epi16(0);

   const __m128i one = _mm_set1_epi8(1);

   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);

@@ -383,11 +382,10 @@

   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);

-static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,

-                                             int p,

-                                             const unsigned char *_blimit,

-                                             const unsigned char *_limit,

-                                             const unsigned char *_thresh) {

+void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,

+                                     const unsigned char *_blimit,

+                                     const unsigned char *_limit,

+                                     const unsigned char *_thresh) {

   const __m128i zero = _mm_set1_epi16(0);

   const __m128i one = _mm_set1_epi8(1);

   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);

@@ -716,21 +714,10 @@

-// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.

-void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,

-                                const unsigned char *_blimit,

-                                const unsigned char *_limit,

-                                const unsigned char *_thresh, int count) {

-  if (count == 1)

-    mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);

-  else

-    mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);

-}

 void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,

                                const unsigned char *_blimit,

                                const unsigned char *_limit,

-                               const unsigned char *_thresh, int count) {

+                               const unsigned char *_thresh) {

   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);

   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);

   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);

@@ -745,8 +732,6 @@

   __m128i p3, p2, p1, p0, q0, q1, q2, q3;

   __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;

-  (void)count;

   q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),

                             _mm_loadl_epi64((__m128i *)(s + 3 * p)));

   q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),

@@ -1492,11 +1477,10 @@

 void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,

                              const unsigned char *blimit,

                              const unsigned char *limit,

-                             const unsigned char *thresh, int count) {

+                             const unsigned char *thresh) {

   DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);

   unsigned char *src[1];

   unsigned char *dst[1];

-  (void)count;

   // Transpose 8x8

   src[0] = s - 4;

@@ -1505,7 +1489,7 @@

   transpose(src, p, dst, 8, 1);

   // Loop filtering

-  vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);

+  vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);

   src[0] = t_dst;

   dst[0] = s - 4;

@@ -1557,7 +1541,7 @@

   transpose(src, p, dst, 8, 2);

   // Loop filtering

-  mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);

+  vpx_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);

   src[0] = t_dst;

   src[1] = t_dst + 8 * 8;

@@ -1578,8 +1562,7 @@

   transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);

   // Loop filtering

-  mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,

-                                   thresh);

+  vpx_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);

   // Transpose back

   transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);