ref: ac7f403cbea3f6ec6f36c0c3e18dce1d0fe1c963
parent: 55c2646666f35bb8841e50cfdafd9c5c695948bc
parent: 6a82f0d7fb9ee908c389e8d55444bbaed3d54e9c
author: Johann <[email protected]>
date: Tue Jul 7 19:57:17 EDT 2015
Merge "Move sub pixel variance to vpx_dsp"
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -21,13 +21,6 @@
#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
-#if CONFIG_VP8_ENCODER
-# include "./vp8_rtcd.h"
-#endif // CONFIG_VP8_ENCODER
-#if CONFIG_VP9_ENCODER
-# include "./vp9_rtcd.h"
-# include "vp9/encoder/vp9_variance.h"
-#endif // CONFIG_VP9_ENCODER
#include "./vpx_dsp_rtcd.h"
namespace {
@@ -39,9 +32,16 @@
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
unsigned int *sse);
+typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse,
+ const uint8_t *second_pred);
typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride);
+typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
+
using ::std::tr1::get;
using ::std::tr1::make_tuple;
using ::std::tr1::tuple;
@@ -166,8 +166,6 @@
(l2w + l2h)));
}
-typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
-
class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
public:
SumOfSquaresTest() : func_(GetParam()) {}
@@ -687,9 +685,8 @@
}
}
-#if CONFIG_VP9_ENCODER
template<>
-void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
+void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
for (int x = 0; x < 8; ++x) {
for (int y = 0; y < 8; ++y) {
if (!use_high_bit_depth_) {
@@ -726,11 +723,12 @@
}
}
}
-#endif // CONFIG_VP9_ENCODER
typedef MseTest<Get4x4SseFunc> VpxSseTest;
typedef MseTest<VarianceMxNFunc> VpxMseTest;
typedef VarianceTest<VarianceMxNFunc> VpxVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> VpxSubpelAvgVarianceTest;
TEST_P(VpxSseTest, Ref_sse) { RefTest_sse(); }
TEST_P(VpxSseTest, Max_sse) { MaxTest_sse(); }
@@ -742,6 +740,9 @@
TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); }
TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
+TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VpxSubpelAvgVarianceTest, Ref) { RefTest(); }
INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
::testing::Values(vpx_get_mb_ss_c));
@@ -773,7 +774,6 @@
const VarianceMxNFunc variance8x4_c = vpx_variance8x4_c;
const VarianceMxNFunc variance4x8_c = vpx_variance4x8_c;
const VarianceMxNFunc variance4x4_c = vpx_variance4x4_c;
-
INSTANTIATE_TEST_CASE_P(
C, VpxVarianceTest,
::testing::Values(make_tuple(6, 6, variance64x64_c, 0),
@@ -790,9 +790,79 @@
make_tuple(2, 3, variance4x8_c, 0),
make_tuple(2, 2, variance4x4_c, 0)));
+const SubpixVarMxNFunc subpel_var64x64_c = vpx_sub_pixel_variance64x64_c;
+const SubpixVarMxNFunc subpel_var64x32_c = vpx_sub_pixel_variance64x32_c;
+const SubpixVarMxNFunc subpel_var32x64_c = vpx_sub_pixel_variance32x64_c;
+const SubpixVarMxNFunc subpel_var32x32_c = vpx_sub_pixel_variance32x32_c;
+const SubpixVarMxNFunc subpel_var32x16_c = vpx_sub_pixel_variance32x16_c;
+const SubpixVarMxNFunc subpel_var16x32_c = vpx_sub_pixel_variance16x32_c;
+const SubpixVarMxNFunc subpel_var16x16_c = vpx_sub_pixel_variance16x16_c;
+const SubpixVarMxNFunc subpel_var16x8_c = vpx_sub_pixel_variance16x8_c;
+const SubpixVarMxNFunc subpel_var8x16_c = vpx_sub_pixel_variance8x16_c;
+const SubpixVarMxNFunc subpel_var8x8_c = vpx_sub_pixel_variance8x8_c;
+const SubpixVarMxNFunc subpel_var8x4_c = vpx_sub_pixel_variance8x4_c;
+const SubpixVarMxNFunc subpel_var4x8_c = vpx_sub_pixel_variance4x8_c;
+const SubpixVarMxNFunc subpel_var4x4_c = vpx_sub_pixel_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+ C, VpxSubpelVarianceTest,
+ ::testing::Values(make_tuple(6, 6, subpel_var64x64_c, 0),
+ make_tuple(6, 5, subpel_var64x32_c, 0),
+ make_tuple(5, 6, subpel_var32x64_c, 0),
+ make_tuple(5, 5, subpel_var32x32_c, 0),
+ make_tuple(5, 4, subpel_var32x16_c, 0),
+ make_tuple(4, 5, subpel_var16x32_c, 0),
+ make_tuple(4, 4, subpel_var16x16_c, 0),
+ make_tuple(4, 3, subpel_var16x8_c, 0),
+ make_tuple(3, 4, subpel_var8x16_c, 0),
+ make_tuple(3, 3, subpel_var8x8_c, 0),
+ make_tuple(3, 2, subpel_var8x4_c, 0),
+ make_tuple(2, 3, subpel_var4x8_c, 0),
+ make_tuple(2, 2, subpel_var4x4_c, 0)));
+
+const SubpixAvgVarMxNFunc subpel_avg_var64x64_c =
+ vpx_sub_pixel_avg_variance64x64_c;
+const SubpixAvgVarMxNFunc subpel_avg_var64x32_c =
+ vpx_sub_pixel_avg_variance64x32_c;
+const SubpixAvgVarMxNFunc subpel_avg_var32x64_c =
+ vpx_sub_pixel_avg_variance32x64_c;
+const SubpixAvgVarMxNFunc subpel_avg_var32x32_c =
+ vpx_sub_pixel_avg_variance32x32_c;
+const SubpixAvgVarMxNFunc subpel_avg_var32x16_c =
+ vpx_sub_pixel_avg_variance32x16_c;
+const SubpixAvgVarMxNFunc subpel_avg_var16x32_c =
+ vpx_sub_pixel_avg_variance16x32_c;
+const SubpixAvgVarMxNFunc subpel_avg_var16x16_c =
+ vpx_sub_pixel_avg_variance16x16_c;
+const SubpixAvgVarMxNFunc subpel_avg_var16x8_c =
+ vpx_sub_pixel_avg_variance16x8_c;
+const SubpixAvgVarMxNFunc subpel_avg_var8x16_c =
+ vpx_sub_pixel_avg_variance8x16_c;
+const SubpixAvgVarMxNFunc subpel_avg_var8x8_c = vpx_sub_pixel_avg_variance8x8_c;
+const SubpixAvgVarMxNFunc subpel_avg_var8x4_c = vpx_sub_pixel_avg_variance8x4_c;
+const SubpixAvgVarMxNFunc subpel_avg_var4x8_c = vpx_sub_pixel_avg_variance4x8_c;
+const SubpixAvgVarMxNFunc subpel_avg_var4x4_c = vpx_sub_pixel_avg_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+ C, VpxSubpelAvgVarianceTest,
+ ::testing::Values(make_tuple(6, 6, subpel_avg_var64x64_c, 0),
+ make_tuple(6, 5, subpel_avg_var64x32_c, 0),
+ make_tuple(5, 6, subpel_avg_var32x64_c, 0),
+ make_tuple(5, 5, subpel_avg_var32x32_c, 0),
+ make_tuple(5, 4, subpel_avg_var32x16_c, 0),
+ make_tuple(4, 5, subpel_avg_var16x32_c, 0),
+ make_tuple(4, 4, subpel_avg_var16x16_c, 0),
+ make_tuple(4, 3, subpel_avg_var16x8_c, 0),
+ make_tuple(3, 4, subpel_avg_var8x16_c, 0),
+ make_tuple(3, 3, subpel_avg_var8x8_c, 0),
+ make_tuple(3, 2, subpel_avg_var8x4_c, 0),
+ make_tuple(2, 3, subpel_avg_var4x8_c, 0),
+ make_tuple(2, 2, subpel_avg_var4x4_c, 0)));
+
#if CONFIG_VP9_HIGHBITDEPTH
typedef MseTest<VarianceMxNFunc> VpxHBDMseTest;
typedef VarianceTest<VarianceMxNFunc> VpxHBDVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxHBDSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc>
+ VpxHBDSubpelAvgVarianceTest;
TEST_P(VpxHBDMseTest, Ref_mse) { RefTest_mse(); }
TEST_P(VpxHBDMseTest, Max_mse) { MaxTest_mse(); }
@@ -800,6 +870,9 @@
TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); }
TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); }
TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
/* TODO(debargha): This test does not support the highbd version
const VarianceMxNFunc highbd_12_mse16x16_c = vpx_highbd_12_mse16x16_c;
@@ -844,7 +917,6 @@
const VarianceMxNFunc highbd_12_variance8x4_c = vpx_highbd_12_variance8x4_c;
const VarianceMxNFunc highbd_12_variance4x8_c = vpx_highbd_12_variance4x8_c;
const VarianceMxNFunc highbd_12_variance4x4_c = vpx_highbd_12_variance4x4_c;
-
const VarianceMxNFunc highbd_10_variance64x64_c = vpx_highbd_10_variance64x64_c;
const VarianceMxNFunc highbd_10_variance64x32_c = vpx_highbd_10_variance64x32_c;
const VarianceMxNFunc highbd_10_variance32x64_c = vpx_highbd_10_variance32x64_c;
@@ -858,7 +930,6 @@
const VarianceMxNFunc highbd_10_variance8x4_c = vpx_highbd_10_variance8x4_c;
const VarianceMxNFunc highbd_10_variance4x8_c = vpx_highbd_10_variance4x8_c;
const VarianceMxNFunc highbd_10_variance4x4_c = vpx_highbd_10_variance4x4_c;
-
const VarianceMxNFunc highbd_8_variance64x64_c = vpx_highbd_8_variance64x64_c;
const VarianceMxNFunc highbd_8_variance64x32_c = vpx_highbd_8_variance64x32_c;
const VarianceMxNFunc highbd_8_variance32x64_c = vpx_highbd_8_variance32x64_c;
@@ -913,6 +984,247 @@
make_tuple(3, 2, highbd_8_variance8x4_c, 8),
make_tuple(2, 3, highbd_8_variance4x8_c, 8),
make_tuple(2, 2, highbd_8_variance4x4_c, 8)));
+
+const SubpixVarMxNFunc highbd_8_subpel_var64x64_c =
+ vpx_highbd_8_sub_pixel_variance64x64_c;
+const SubpixVarMxNFunc highbd_8_subpel_var64x32_c =
+ vpx_highbd_8_sub_pixel_variance64x32_c;
+const SubpixVarMxNFunc highbd_8_subpel_var32x64_c =
+ vpx_highbd_8_sub_pixel_variance32x64_c;
+const SubpixVarMxNFunc highbd_8_subpel_var32x32_c =
+ vpx_highbd_8_sub_pixel_variance32x32_c;
+const SubpixVarMxNFunc highbd_8_subpel_var32x16_c =
+ vpx_highbd_8_sub_pixel_variance32x16_c;
+const SubpixVarMxNFunc highbd_8_subpel_var16x32_c =
+ vpx_highbd_8_sub_pixel_variance16x32_c;
+const SubpixVarMxNFunc highbd_8_subpel_var16x16_c =
+ vpx_highbd_8_sub_pixel_variance16x16_c;
+const SubpixVarMxNFunc highbd_8_subpel_var16x8_c =
+ vpx_highbd_8_sub_pixel_variance16x8_c;
+const SubpixVarMxNFunc highbd_8_subpel_var8x16_c =
+ vpx_highbd_8_sub_pixel_variance8x16_c;
+const SubpixVarMxNFunc highbd_8_subpel_var8x8_c =
+ vpx_highbd_8_sub_pixel_variance8x8_c;
+const SubpixVarMxNFunc highbd_8_subpel_var8x4_c =
+ vpx_highbd_8_sub_pixel_variance8x4_c;
+const SubpixVarMxNFunc highbd_8_subpel_var4x8_c =
+ vpx_highbd_8_sub_pixel_variance4x8_c;
+const SubpixVarMxNFunc highbd_8_subpel_var4x4_c =
+ vpx_highbd_8_sub_pixel_variance4x4_c;
+const SubpixVarMxNFunc highbd_10_subpel_var64x64_c =
+ vpx_highbd_10_sub_pixel_variance64x64_c;
+const SubpixVarMxNFunc highbd_10_subpel_var64x32_c =
+ vpx_highbd_10_sub_pixel_variance64x32_c;
+const SubpixVarMxNFunc highbd_10_subpel_var32x64_c =
+ vpx_highbd_10_sub_pixel_variance32x64_c;
+const SubpixVarMxNFunc highbd_10_subpel_var32x32_c =
+ vpx_highbd_10_sub_pixel_variance32x32_c;
+const SubpixVarMxNFunc highbd_10_subpel_var32x16_c =
+ vpx_highbd_10_sub_pixel_variance32x16_c;
+const SubpixVarMxNFunc highbd_10_subpel_var16x32_c =
+ vpx_highbd_10_sub_pixel_variance16x32_c;
+const SubpixVarMxNFunc highbd_10_subpel_var16x16_c =
+ vpx_highbd_10_sub_pixel_variance16x16_c;
+const SubpixVarMxNFunc highbd_10_subpel_var16x8_c =
+ vpx_highbd_10_sub_pixel_variance16x8_c;
+const SubpixVarMxNFunc highbd_10_subpel_var8x16_c =
+ vpx_highbd_10_sub_pixel_variance8x16_c;
+const SubpixVarMxNFunc highbd_10_subpel_var8x8_c =
+ vpx_highbd_10_sub_pixel_variance8x8_c;
+const SubpixVarMxNFunc highbd_10_subpel_var8x4_c =
+ vpx_highbd_10_sub_pixel_variance8x4_c;
+const SubpixVarMxNFunc highbd_10_subpel_var4x8_c =
+ vpx_highbd_10_sub_pixel_variance4x8_c;
+const SubpixVarMxNFunc highbd_10_subpel_var4x4_c =
+ vpx_highbd_10_sub_pixel_variance4x4_c;
+const SubpixVarMxNFunc highbd_12_subpel_var64x64_c =
+ vpx_highbd_12_sub_pixel_variance64x64_c;
+const SubpixVarMxNFunc highbd_12_subpel_var64x32_c =
+ vpx_highbd_12_sub_pixel_variance64x32_c;
+const SubpixVarMxNFunc highbd_12_subpel_var32x64_c =
+ vpx_highbd_12_sub_pixel_variance32x64_c;
+const SubpixVarMxNFunc highbd_12_subpel_var32x32_c =
+ vpx_highbd_12_sub_pixel_variance32x32_c;
+const SubpixVarMxNFunc highbd_12_subpel_var32x16_c =
+ vpx_highbd_12_sub_pixel_variance32x16_c;
+const SubpixVarMxNFunc highbd_12_subpel_var16x32_c =
+ vpx_highbd_12_sub_pixel_variance16x32_c;
+const SubpixVarMxNFunc highbd_12_subpel_var16x16_c =
+ vpx_highbd_12_sub_pixel_variance16x16_c;
+const SubpixVarMxNFunc highbd_12_subpel_var16x8_c =
+ vpx_highbd_12_sub_pixel_variance16x8_c;
+const SubpixVarMxNFunc highbd_12_subpel_var8x16_c =
+ vpx_highbd_12_sub_pixel_variance8x16_c;
+const SubpixVarMxNFunc highbd_12_subpel_var8x8_c =
+ vpx_highbd_12_sub_pixel_variance8x8_c;
+const SubpixVarMxNFunc highbd_12_subpel_var8x4_c =
+ vpx_highbd_12_sub_pixel_variance8x4_c;
+const SubpixVarMxNFunc highbd_12_subpel_var4x8_c =
+ vpx_highbd_12_sub_pixel_variance4x8_c;
+const SubpixVarMxNFunc highbd_12_subpel_var4x4_c =
+ vpx_highbd_12_sub_pixel_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+ C, VpxHBDSubpelVarianceTest,
+ ::testing::Values(make_tuple(6, 6, highbd_8_subpel_var64x64_c, 8),
+ make_tuple(6, 5, highbd_8_subpel_var64x32_c, 8),
+ make_tuple(5, 6, highbd_8_subpel_var32x64_c, 8),
+ make_tuple(5, 5, highbd_8_subpel_var32x32_c, 8),
+ make_tuple(5, 4, highbd_8_subpel_var32x16_c, 8),
+ make_tuple(4, 5, highbd_8_subpel_var16x32_c, 8),
+ make_tuple(4, 4, highbd_8_subpel_var16x16_c, 8),
+ make_tuple(4, 3, highbd_8_subpel_var16x8_c, 8),
+ make_tuple(3, 4, highbd_8_subpel_var8x16_c, 8),
+ make_tuple(3, 3, highbd_8_subpel_var8x8_c, 8),
+ make_tuple(3, 2, highbd_8_subpel_var8x4_c, 8),
+ make_tuple(2, 3, highbd_8_subpel_var4x8_c, 8),
+ make_tuple(2, 2, highbd_8_subpel_var4x4_c, 8),
+ make_tuple(6, 6, highbd_10_subpel_var64x64_c, 10),
+ make_tuple(6, 5, highbd_10_subpel_var64x32_c, 10),
+ make_tuple(5, 6, highbd_10_subpel_var32x64_c, 10),
+ make_tuple(5, 5, highbd_10_subpel_var32x32_c, 10),
+ make_tuple(5, 4, highbd_10_subpel_var32x16_c, 10),
+ make_tuple(4, 5, highbd_10_subpel_var16x32_c, 10),
+ make_tuple(4, 4, highbd_10_subpel_var16x16_c, 10),
+ make_tuple(4, 3, highbd_10_subpel_var16x8_c, 10),
+ make_tuple(3, 4, highbd_10_subpel_var8x16_c, 10),
+ make_tuple(3, 3, highbd_10_subpel_var8x8_c, 10),
+ make_tuple(3, 2, highbd_10_subpel_var8x4_c, 10),
+ make_tuple(2, 3, highbd_10_subpel_var4x8_c, 10),
+ make_tuple(2, 2, highbd_10_subpel_var4x4_c, 10),
+ make_tuple(6, 6, highbd_12_subpel_var64x64_c, 12),
+ make_tuple(6, 5, highbd_12_subpel_var64x32_c, 12),
+ make_tuple(5, 6, highbd_12_subpel_var32x64_c, 12),
+ make_tuple(5, 5, highbd_12_subpel_var32x32_c, 12),
+ make_tuple(5, 4, highbd_12_subpel_var32x16_c, 12),
+ make_tuple(4, 5, highbd_12_subpel_var16x32_c, 12),
+ make_tuple(4, 4, highbd_12_subpel_var16x16_c, 12),
+ make_tuple(4, 3, highbd_12_subpel_var16x8_c, 12),
+ make_tuple(3, 4, highbd_12_subpel_var8x16_c, 12),
+ make_tuple(3, 3, highbd_12_subpel_var8x8_c, 12),
+ make_tuple(3, 2, highbd_12_subpel_var8x4_c, 12),
+ make_tuple(2, 3, highbd_12_subpel_var4x8_c, 12),
+ make_tuple(2, 2, highbd_12_subpel_var4x4_c, 12)));
+
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var64x64_c =
+ vpx_highbd_8_sub_pixel_avg_variance64x64_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var64x32_c =
+ vpx_highbd_8_sub_pixel_avg_variance64x32_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x64_c =
+ vpx_highbd_8_sub_pixel_avg_variance32x64_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x32_c =
+ vpx_highbd_8_sub_pixel_avg_variance32x32_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x16_c =
+ vpx_highbd_8_sub_pixel_avg_variance32x16_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x32_c =
+ vpx_highbd_8_sub_pixel_avg_variance16x32_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x16_c =
+ vpx_highbd_8_sub_pixel_avg_variance16x16_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x8_c =
+ vpx_highbd_8_sub_pixel_avg_variance16x8_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x16_c =
+ vpx_highbd_8_sub_pixel_avg_variance8x16_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x8_c =
+ vpx_highbd_8_sub_pixel_avg_variance8x8_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x4_c =
+ vpx_highbd_8_sub_pixel_avg_variance8x4_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var4x8_c =
+ vpx_highbd_8_sub_pixel_avg_variance4x8_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var4x4_c =
+ vpx_highbd_8_sub_pixel_avg_variance4x4_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var64x64_c =
+ vpx_highbd_10_sub_pixel_avg_variance64x64_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var64x32_c =
+ vpx_highbd_10_sub_pixel_avg_variance64x32_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x64_c =
+ vpx_highbd_10_sub_pixel_avg_variance32x64_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x32_c =
+ vpx_highbd_10_sub_pixel_avg_variance32x32_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x16_c =
+ vpx_highbd_10_sub_pixel_avg_variance32x16_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x32_c =
+ vpx_highbd_10_sub_pixel_avg_variance16x32_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x16_c =
+ vpx_highbd_10_sub_pixel_avg_variance16x16_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x8_c =
+ vpx_highbd_10_sub_pixel_avg_variance16x8_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x16_c =
+ vpx_highbd_10_sub_pixel_avg_variance8x16_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x8_c =
+ vpx_highbd_10_sub_pixel_avg_variance8x8_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x4_c =
+ vpx_highbd_10_sub_pixel_avg_variance8x4_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var4x8_c =
+ vpx_highbd_10_sub_pixel_avg_variance4x8_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var4x4_c =
+ vpx_highbd_10_sub_pixel_avg_variance4x4_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var64x64_c =
+ vpx_highbd_12_sub_pixel_avg_variance64x64_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var64x32_c =
+ vpx_highbd_12_sub_pixel_avg_variance64x32_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x64_c =
+ vpx_highbd_12_sub_pixel_avg_variance32x64_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x32_c =
+ vpx_highbd_12_sub_pixel_avg_variance32x32_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x16_c =
+ vpx_highbd_12_sub_pixel_avg_variance32x16_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x32_c =
+ vpx_highbd_12_sub_pixel_avg_variance16x32_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x16_c =
+ vpx_highbd_12_sub_pixel_avg_variance16x16_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x8_c =
+ vpx_highbd_12_sub_pixel_avg_variance16x8_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x16_c =
+ vpx_highbd_12_sub_pixel_avg_variance8x16_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x8_c =
+ vpx_highbd_12_sub_pixel_avg_variance8x8_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x4_c =
+ vpx_highbd_12_sub_pixel_avg_variance8x4_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var4x8_c =
+ vpx_highbd_12_sub_pixel_avg_variance4x8_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var4x4_c =
+ vpx_highbd_12_sub_pixel_avg_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+ C, VpxHBDSubpelAvgVarianceTest,
+ ::testing::Values(
+ make_tuple(6, 6, highbd_8_subpel_avg_var64x64_c, 8),
+ make_tuple(6, 5, highbd_8_subpel_avg_var64x32_c, 8),
+ make_tuple(5, 6, highbd_8_subpel_avg_var32x64_c, 8),
+ make_tuple(5, 5, highbd_8_subpel_avg_var32x32_c, 8),
+ make_tuple(5, 4, highbd_8_subpel_avg_var32x16_c, 8),
+ make_tuple(4, 5, highbd_8_subpel_avg_var16x32_c, 8),
+ make_tuple(4, 4, highbd_8_subpel_avg_var16x16_c, 8),
+ make_tuple(4, 3, highbd_8_subpel_avg_var16x8_c, 8),
+ make_tuple(3, 4, highbd_8_subpel_avg_var8x16_c, 8),
+ make_tuple(3, 3, highbd_8_subpel_avg_var8x8_c, 8),
+ make_tuple(3, 2, highbd_8_subpel_avg_var8x4_c, 8),
+ make_tuple(2, 3, highbd_8_subpel_avg_var4x8_c, 8),
+ make_tuple(2, 2, highbd_8_subpel_avg_var4x4_c, 8),
+ make_tuple(6, 6, highbd_10_subpel_avg_var64x64_c, 10),
+ make_tuple(6, 5, highbd_10_subpel_avg_var64x32_c, 10),
+ make_tuple(5, 6, highbd_10_subpel_avg_var32x64_c, 10),
+ make_tuple(5, 5, highbd_10_subpel_avg_var32x32_c, 10),
+ make_tuple(5, 4, highbd_10_subpel_avg_var32x16_c, 10),
+ make_tuple(4, 5, highbd_10_subpel_avg_var16x32_c, 10),
+ make_tuple(4, 4, highbd_10_subpel_avg_var16x16_c, 10),
+ make_tuple(4, 3, highbd_10_subpel_avg_var16x8_c, 10),
+ make_tuple(3, 4, highbd_10_subpel_avg_var8x16_c, 10),
+ make_tuple(3, 3, highbd_10_subpel_avg_var8x8_c, 10),
+ make_tuple(3, 2, highbd_10_subpel_avg_var8x4_c, 10),
+ make_tuple(2, 3, highbd_10_subpel_avg_var4x8_c, 10),
+ make_tuple(2, 2, highbd_10_subpel_avg_var4x4_c, 10),
+ make_tuple(6, 6, highbd_12_subpel_avg_var64x64_c, 12),
+ make_tuple(6, 5, highbd_12_subpel_avg_var64x32_c, 12),
+ make_tuple(5, 6, highbd_12_subpel_avg_var32x64_c, 12),
+ make_tuple(5, 5, highbd_12_subpel_avg_var32x32_c, 12),
+ make_tuple(5, 4, highbd_12_subpel_avg_var32x16_c, 12),
+ make_tuple(4, 5, highbd_12_subpel_avg_var16x32_c, 12),
+ make_tuple(4, 4, highbd_12_subpel_avg_var16x16_c, 12),
+ make_tuple(4, 3, highbd_12_subpel_avg_var16x8_c, 12),
+ make_tuple(3, 4, highbd_12_subpel_avg_var8x16_c, 12),
+ make_tuple(3, 3, highbd_12_subpel_avg_var8x8_c, 12),
+ make_tuple(3, 2, highbd_12_subpel_avg_var8x4_c, 12),
+ make_tuple(2, 3, highbd_12_subpel_avg_var4x8_c, 12),
+ make_tuple(2, 2, highbd_12_subpel_avg_var4x4_c, 12)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#if HAVE_MMX
@@ -935,6 +1247,19 @@
make_tuple(3, 4, variance8x16_mmx, 0),
make_tuple(3, 3, variance8x8_mmx, 0),
make_tuple(2, 2, variance4x4_mmx, 0)));
+
+const SubpixVarMxNFunc subpel_var16x16_mmx = vpx_sub_pixel_variance16x16_mmx;
+const SubpixVarMxNFunc subpel_var16x8_mmx = vpx_sub_pixel_variance16x8_mmx;
+const SubpixVarMxNFunc subpel_var8x16_mmx = vpx_sub_pixel_variance8x16_mmx;
+const SubpixVarMxNFunc subpel_var8x8_mmx = vpx_sub_pixel_variance8x8_mmx;
+const SubpixVarMxNFunc subpel_var4x4_mmx = vpx_sub_pixel_variance4x4_mmx;
+INSTANTIATE_TEST_CASE_P(
+ MMX, VpxSubpelVarianceTest,
+ ::testing::Values(make_tuple(4, 4, subpel_var16x16_mmx, 0),
+ make_tuple(4, 3, subpel_var16x8_mmx, 0),
+ make_tuple(3, 4, subpel_var8x16_mmx, 0),
+ make_tuple(3, 3, subpel_var8x8_mmx, 0),
+ make_tuple(2, 2, subpel_var4x4_mmx, 0)));
#endif // HAVE_MMX
#if HAVE_SSE2
@@ -979,6 +1304,90 @@
make_tuple(3, 2, variance8x4_sse2, 0),
make_tuple(2, 3, variance4x8_sse2, 0),
make_tuple(2, 2, variance4x4_sse2, 0)));
+
+#if CONFIG_USE_X86INC
+const SubpixVarMxNFunc subpel_variance64x64_sse2 =
+ vpx_sub_pixel_variance64x64_sse2;
+const SubpixVarMxNFunc subpel_variance64x32_sse2 =
+ vpx_sub_pixel_variance64x32_sse2;
+const SubpixVarMxNFunc subpel_variance32x64_sse2 =
+ vpx_sub_pixel_variance32x64_sse2;
+const SubpixVarMxNFunc subpel_variance32x32_sse2 =
+ vpx_sub_pixel_variance32x32_sse2;
+const SubpixVarMxNFunc subpel_variance32x16_sse2 =
+ vpx_sub_pixel_variance32x16_sse2;
+const SubpixVarMxNFunc subpel_variance16x32_sse2 =
+ vpx_sub_pixel_variance16x32_sse2;
+const SubpixVarMxNFunc subpel_variance16x16_sse2 =
+ vpx_sub_pixel_variance16x16_sse2;
+const SubpixVarMxNFunc subpel_variance16x8_sse2 =
+ vpx_sub_pixel_variance16x8_sse2;
+const SubpixVarMxNFunc subpel_variance8x16_sse2 =
+ vpx_sub_pixel_variance8x16_sse2;
+const SubpixVarMxNFunc subpel_variance8x8_sse2 = vpx_sub_pixel_variance8x8_sse2;
+const SubpixVarMxNFunc subpel_variance8x4_sse2 = vpx_sub_pixel_variance8x4_sse2;
+const SubpixVarMxNFunc subpel_variance4x8_sse = vpx_sub_pixel_variance4x8_sse;
+const SubpixVarMxNFunc subpel_variance4x4_sse = vpx_sub_pixel_variance4x4_sse;
+INSTANTIATE_TEST_CASE_P(
+ SSE2, VpxSubpelVarianceTest,
+ ::testing::Values(make_tuple(6, 6, subpel_variance64x64_sse2, 0),
+ make_tuple(6, 5, subpel_variance64x32_sse2, 0),
+ make_tuple(5, 6, subpel_variance32x64_sse2, 0),
+ make_tuple(5, 5, subpel_variance32x32_sse2, 0),
+ make_tuple(5, 4, subpel_variance32x16_sse2, 0),
+ make_tuple(4, 5, subpel_variance16x32_sse2, 0),
+ make_tuple(4, 4, subpel_variance16x16_sse2, 0),
+ make_tuple(4, 3, subpel_variance16x8_sse2, 0),
+ make_tuple(3, 4, subpel_variance8x16_sse2, 0),
+ make_tuple(3, 3, subpel_variance8x8_sse2, 0),
+ make_tuple(3, 2, subpel_variance8x4_sse2, 0),
+ make_tuple(2, 3, subpel_variance4x8_sse, 0),
+ make_tuple(2, 2, subpel_variance4x4_sse, 0)));
+
+const SubpixAvgVarMxNFunc subpel_avg_variance64x64_sse2 =
+ vpx_sub_pixel_avg_variance64x64_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance64x32_sse2 =
+ vpx_sub_pixel_avg_variance64x32_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x64_sse2 =
+ vpx_sub_pixel_avg_variance32x64_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x32_sse2 =
+ vpx_sub_pixel_avg_variance32x32_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x16_sse2 =
+ vpx_sub_pixel_avg_variance32x16_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x32_sse2 =
+ vpx_sub_pixel_avg_variance16x32_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x16_sse2 =
+ vpx_sub_pixel_avg_variance16x16_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x8_sse2 =
+ vpx_sub_pixel_avg_variance16x8_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x16_sse2 =
+ vpx_sub_pixel_avg_variance8x16_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x8_sse2 =
+ vpx_sub_pixel_avg_variance8x8_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x4_sse2 =
+ vpx_sub_pixel_avg_variance8x4_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x8_sse =
+ vpx_sub_pixel_avg_variance4x8_sse;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x4_sse =
+ vpx_sub_pixel_avg_variance4x4_sse;
+INSTANTIATE_TEST_CASE_P(
+ SSE2, VpxSubpelAvgVarianceTest,
+ ::testing::Values(
+ make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0),
+ make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0),
+ make_tuple(5, 6, subpel_avg_variance32x64_sse2, 0),
+ make_tuple(5, 5, subpel_avg_variance32x32_sse2, 0),
+ make_tuple(5, 4, subpel_avg_variance32x16_sse2, 0),
+ make_tuple(4, 5, subpel_avg_variance16x32_sse2, 0),
+ make_tuple(4, 4, subpel_avg_variance16x16_sse2, 0),
+ make_tuple(4, 3, subpel_avg_variance16x8_sse2, 0),
+ make_tuple(3, 4, subpel_avg_variance8x16_sse2, 0),
+ make_tuple(3, 3, subpel_avg_variance8x8_sse2, 0),
+ make_tuple(3, 2, subpel_avg_variance8x4_sse2, 0),
+ make_tuple(2, 3, subpel_avg_variance4x8_sse, 0),
+ make_tuple(2, 2, subpel_avg_variance4x4_sse, 0)));
+#endif // CONFIG_USE_X86INC
+
#if CONFIG_VP9_HIGHBITDEPTH
/* TODO(debargha): This test does not support the highbd version
const VarianceMxNFunc highbd_12_mse16x16_sse2 = vpx_highbd_12_mse16x16_sse2;
@@ -1103,795 +1512,304 @@
make_tuple(4, 3, highbd_8_variance16x8_sse2, 8),
make_tuple(3, 4, highbd_8_variance8x16_sse2, 8),
make_tuple(3, 3, highbd_8_variance8x8_sse2, 8)));
-#endif // CONFIG_VP9_HIGHBITDEPTH
-#endif // HAVE_SSE2
-#if CONFIG_VP8_ENCODER
-typedef SubpelVarianceTest<SubpixVarMxNFunc> VP8SubpelVarianceTest;
-
-TEST_P(VP8SubpelVarianceTest, Ref) { RefTest(); }
-TEST_P(VP8SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
-#endif // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-typedef SubpelVarianceTest<SubpixVarMxNFunc> VP9SubpelVarianceTest;
-typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
-
-TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
-TEST_P(VP9SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
-TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); }
-
-#if CONFIG_VP9_HIGHBITDEPTH
-typedef SubpelVarianceTest<SubpixVarMxNFunc> VP9SubpelVarianceHighTest;
-typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t>
- VP9SubpelAvgVarianceHighTest;
-
-TEST_P(VP9SubpelVarianceHighTest, Ref) { RefTest(); }
-TEST_P(VP9SubpelVarianceHighTest, ExtremeRef) { ExtremeRefTest(); }
-TEST_P(VP9SubpelAvgVarianceHighTest, Ref) { RefTest(); }
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-const SubpixVarMxNFunc subpel_variance4x4_c = vp9_sub_pixel_variance4x4_c;
-const SubpixVarMxNFunc subpel_variance4x8_c = vp9_sub_pixel_variance4x8_c;
-const SubpixVarMxNFunc subpel_variance8x4_c = vp9_sub_pixel_variance8x4_c;
-const SubpixVarMxNFunc subpel_variance8x8_c = vp9_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc subpel_variance8x16_c = vp9_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc subpel_variance16x8_c = vp9_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc subpel_variance16x16_c = vp9_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc subpel_variance16x32_c = vp9_sub_pixel_variance16x32_c;
-const SubpixVarMxNFunc subpel_variance32x16_c = vp9_sub_pixel_variance32x16_c;
-const SubpixVarMxNFunc subpel_variance32x32_c = vp9_sub_pixel_variance32x32_c;
-const SubpixVarMxNFunc subpel_variance32x64_c = vp9_sub_pixel_variance32x64_c;
-const SubpixVarMxNFunc subpel_variance64x32_c = vp9_sub_pixel_variance64x32_c;
-const SubpixVarMxNFunc subpel_variance64x64_c = vp9_sub_pixel_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
- C, VP9SubpelVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c, 0),
- make_tuple(2, 3, subpel_variance4x8_c, 0),
- make_tuple(3, 2, subpel_variance8x4_c, 0),
- make_tuple(3, 3, subpel_variance8x8_c, 0),
- make_tuple(3, 4, subpel_variance8x16_c, 0),
- make_tuple(4, 3, subpel_variance16x8_c, 0),
- make_tuple(4, 4, subpel_variance16x16_c, 0),
- make_tuple(4, 5, subpel_variance16x32_c, 0),
- make_tuple(5, 4, subpel_variance32x16_c, 0),
- make_tuple(5, 5, subpel_variance32x32_c, 0),
- make_tuple(5, 6, subpel_variance32x64_c, 0),
- make_tuple(6, 5, subpel_variance64x32_c, 0),
- make_tuple(6, 6, subpel_variance64x64_c, 0)));
-
-#if CONFIG_VP8_ENCODER
-const SubpixVarMxNFunc vp8_subpel_variance16x16_c =
- vp8_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc vp8_subpel_variance16x8_c = vp8_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc vp8_subpel_variance8x16_c = vp8_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc vp8_subpel_variance8x8_c = vp8_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc vp8_subpel_variance4x4_c = vp8_sub_pixel_variance4x4_c;
-INSTANTIATE_TEST_CASE_P(
- C, VP8SubpelVarianceTest,
- ::testing::Values(make_tuple(2, 2, vp8_subpel_variance4x4_c, 0),
- make_tuple(3, 3, vp8_subpel_variance8x8_c, 0),
- make_tuple(3, 4, vp8_subpel_variance8x16_c, 0),
- make_tuple(4, 3, vp8_subpel_variance16x8_c, 0),
- make_tuple(4, 4, vp8_subpel_variance16x16_c, 0)));
-#endif // CONFIG_VP8_ENCODER
-
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c =
- vp9_sub_pixel_avg_variance4x4_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c =
- vp9_sub_pixel_avg_variance4x8_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_c =
- vp9_sub_pixel_avg_variance8x4_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_c =
- vp9_sub_pixel_avg_variance8x8_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_c =
- vp9_sub_pixel_avg_variance8x16_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_c =
- vp9_sub_pixel_avg_variance16x8_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_c =
- vp9_sub_pixel_avg_variance16x16_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_c =
- vp9_sub_pixel_avg_variance16x32_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_c =
- vp9_sub_pixel_avg_variance32x16_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_c =
- vp9_sub_pixel_avg_variance32x32_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_c =
- vp9_sub_pixel_avg_variance32x64_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_c =
- vp9_sub_pixel_avg_variance64x32_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_c =
- vp9_sub_pixel_avg_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
- C, VP9SubpelAvgVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c, 0),
- make_tuple(2, 3, subpel_avg_variance4x8_c, 0),
- make_tuple(3, 2, subpel_avg_variance8x4_c, 0),
- make_tuple(3, 3, subpel_avg_variance8x8_c, 0),
- make_tuple(3, 4, subpel_avg_variance8x16_c, 0),
- make_tuple(4, 3, subpel_avg_variance16x8_c, 0),
- make_tuple(4, 4, subpel_avg_variance16x16_c, 0),
- make_tuple(4, 5, subpel_avg_variance16x32_c, 0),
- make_tuple(5, 4, subpel_avg_variance32x16_c, 0),
- make_tuple(5, 5, subpel_avg_variance32x32_c, 0),
- make_tuple(5, 6, subpel_avg_variance32x64_c, 0),
- make_tuple(6, 5, subpel_avg_variance64x32_c, 0),
- make_tuple(6, 6, subpel_avg_variance64x64_c, 0)));
-#if CONFIG_VP9_HIGHBITDEPTH
-const SubpixVarMxNFunc highbd_10_subpel_variance4x4_c =
- vp9_highbd_10_sub_pixel_variance4x4_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance4x8_c =
- vp9_highbd_10_sub_pixel_variance4x8_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x4_c =
- vp9_highbd_10_sub_pixel_variance8x4_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x8_c =
- vp9_highbd_10_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x16_c =
- vp9_highbd_10_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x8_c =
- vp9_highbd_10_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x16_c =
- vp9_highbd_10_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x32_c =
- vp9_highbd_10_sub_pixel_variance16x32_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x16_c =
- vp9_highbd_10_sub_pixel_variance32x16_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x32_c =
- vp9_highbd_10_sub_pixel_variance32x32_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x64_c =
- vp9_highbd_10_sub_pixel_variance32x64_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance64x32_c =
- vp9_highbd_10_sub_pixel_variance64x32_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance64x64_c =
- vp9_highbd_10_sub_pixel_variance64x64_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance4x4_c =
- vp9_highbd_12_sub_pixel_variance4x4_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance4x8_c =
- vp9_highbd_12_sub_pixel_variance4x8_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x4_c =
- vp9_highbd_12_sub_pixel_variance8x4_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x8_c =
- vp9_highbd_12_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x16_c =
- vp9_highbd_12_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x8_c =
- vp9_highbd_12_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x16_c =
- vp9_highbd_12_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x32_c =
- vp9_highbd_12_sub_pixel_variance16x32_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x16_c =
- vp9_highbd_12_sub_pixel_variance32x16_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x32_c =
- vp9_highbd_12_sub_pixel_variance32x32_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x64_c =
- vp9_highbd_12_sub_pixel_variance32x64_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance64x32_c =
- vp9_highbd_12_sub_pixel_variance64x32_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance64x64_c =
- vp9_highbd_12_sub_pixel_variance64x64_c;
-const SubpixVarMxNFunc highbd_subpel_variance4x4_c =
- vp9_highbd_sub_pixel_variance4x4_c;
-const SubpixVarMxNFunc highbd_subpel_variance4x8_c =
- vp9_highbd_sub_pixel_variance4x8_c;
-const SubpixVarMxNFunc highbd_subpel_variance8x4_c =
- vp9_highbd_sub_pixel_variance8x4_c;
-const SubpixVarMxNFunc highbd_subpel_variance8x8_c =
- vp9_highbd_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc highbd_subpel_variance8x16_c =
- vp9_highbd_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc highbd_subpel_variance16x8_c =
- vp9_highbd_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc highbd_subpel_variance16x16_c =
- vp9_highbd_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc highbd_subpel_variance16x32_c =
- vp9_highbd_sub_pixel_variance16x32_c;
-const SubpixVarMxNFunc highbd_subpel_variance32x16_c =
- vp9_highbd_sub_pixel_variance32x16_c;
-const SubpixVarMxNFunc highbd_subpel_variance32x32_c =
- vp9_highbd_sub_pixel_variance32x32_c;
-const SubpixVarMxNFunc highbd_subpel_variance32x64_c =
- vp9_highbd_sub_pixel_variance32x64_c;
-const SubpixVarMxNFunc highbd_subpel_variance64x32_c =
- vp9_highbd_sub_pixel_variance64x32_c;
-const SubpixVarMxNFunc highbd_subpel_variance64x64_c =
- vp9_highbd_sub_pixel_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
- C, VP9SubpelVarianceHighTest,
- ::testing::Values(make_tuple(2, 2, highbd_10_subpel_variance4x4_c, 10),
- make_tuple(2, 3, highbd_10_subpel_variance4x8_c, 10),
- make_tuple(3, 2, highbd_10_subpel_variance8x4_c, 10),
- make_tuple(3, 3, highbd_10_subpel_variance8x8_c, 10),
- make_tuple(3, 4, highbd_10_subpel_variance8x16_c, 10),
- make_tuple(4, 3, highbd_10_subpel_variance16x8_c, 10),
- make_tuple(4, 4, highbd_10_subpel_variance16x16_c, 10),
- make_tuple(4, 5, highbd_10_subpel_variance16x32_c, 10),
- make_tuple(5, 4, highbd_10_subpel_variance32x16_c, 10),
- make_tuple(5, 5, highbd_10_subpel_variance32x32_c, 10),
- make_tuple(5, 6, highbd_10_subpel_variance32x64_c, 10),
- make_tuple(6, 5, highbd_10_subpel_variance64x32_c, 10),
- make_tuple(6, 6, highbd_10_subpel_variance64x64_c, 10),
- make_tuple(2, 2, highbd_12_subpel_variance4x4_c, 12),
- make_tuple(2, 3, highbd_12_subpel_variance4x8_c, 12),
- make_tuple(3, 2, highbd_12_subpel_variance8x4_c, 12),
- make_tuple(3, 3, highbd_12_subpel_variance8x8_c, 12),
- make_tuple(3, 4, highbd_12_subpel_variance8x16_c, 12),
- make_tuple(4, 3, highbd_12_subpel_variance16x8_c, 12),
- make_tuple(4, 4, highbd_12_subpel_variance16x16_c, 12),
- make_tuple(4, 5, highbd_12_subpel_variance16x32_c, 12),
- make_tuple(5, 4, highbd_12_subpel_variance32x16_c, 12),
- make_tuple(5, 5, highbd_12_subpel_variance32x32_c, 12),
- make_tuple(5, 6, highbd_12_subpel_variance32x64_c, 12),
- make_tuple(6, 5, highbd_12_subpel_variance64x32_c, 12),
- make_tuple(6, 6, highbd_12_subpel_variance64x64_c, 12),
- make_tuple(2, 2, highbd_subpel_variance4x4_c, 8),
- make_tuple(2, 3, highbd_subpel_variance4x8_c, 8),
- make_tuple(3, 2, highbd_subpel_variance8x4_c, 8),
- make_tuple(3, 3, highbd_subpel_variance8x8_c, 8),
- make_tuple(3, 4, highbd_subpel_variance8x16_c, 8),
- make_tuple(4, 3, highbd_subpel_variance16x8_c, 8),
- make_tuple(4, 4, highbd_subpel_variance16x16_c, 8),
- make_tuple(4, 5, highbd_subpel_variance16x32_c, 8),
- make_tuple(5, 4, highbd_subpel_variance32x16_c, 8),
- make_tuple(5, 5, highbd_subpel_variance32x32_c, 8),
- make_tuple(5, 6, highbd_subpel_variance32x64_c, 8),
- make_tuple(6, 5, highbd_subpel_variance64x32_c, 8),
- make_tuple(6, 6, highbd_subpel_variance64x64_c, 8)));
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance4x4_c =
- vp9_highbd_10_sub_pixel_avg_variance4x4_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance4x8_c =
- vp9_highbd_10_sub_pixel_avg_variance4x8_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x4_c =
- vp9_highbd_10_sub_pixel_avg_variance8x4_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x8_c =
- vp9_highbd_10_sub_pixel_avg_variance8x8_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x16_c =
- vp9_highbd_10_sub_pixel_avg_variance8x16_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x8_c =
- vp9_highbd_10_sub_pixel_avg_variance16x8_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x16_c =
- vp9_highbd_10_sub_pixel_avg_variance16x16_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x32_c =
- vp9_highbd_10_sub_pixel_avg_variance16x32_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x16_c =
- vp9_highbd_10_sub_pixel_avg_variance32x16_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x32_c =
- vp9_highbd_10_sub_pixel_avg_variance32x32_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x64_c =
- vp9_highbd_10_sub_pixel_avg_variance32x64_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x32_c =
- vp9_highbd_10_sub_pixel_avg_variance64x32_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x64_c =
- vp9_highbd_10_sub_pixel_avg_variance64x64_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance4x4_c =
- vp9_highbd_12_sub_pixel_avg_variance4x4_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance4x8_c =
- vp9_highbd_12_sub_pixel_avg_variance4x8_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x4_c =
- vp9_highbd_12_sub_pixel_avg_variance8x4_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x8_c =
- vp9_highbd_12_sub_pixel_avg_variance8x8_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x16_c =
- vp9_highbd_12_sub_pixel_avg_variance8x16_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x8_c =
- vp9_highbd_12_sub_pixel_avg_variance16x8_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x16_c =
- vp9_highbd_12_sub_pixel_avg_variance16x16_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x32_c =
- vp9_highbd_12_sub_pixel_avg_variance16x32_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x16_c =
- vp9_highbd_12_sub_pixel_avg_variance32x16_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x32_c =
- vp9_highbd_12_sub_pixel_avg_variance32x32_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x64_c =
- vp9_highbd_12_sub_pixel_avg_variance32x64_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x32_c =
- vp9_highbd_12_sub_pixel_avg_variance64x32_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x64_c =
- vp9_highbd_12_sub_pixel_avg_variance64x64_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance4x4_c =
- vp9_highbd_sub_pixel_avg_variance4x4_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance4x8_c =
- vp9_highbd_sub_pixel_avg_variance4x8_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x4_c =
- vp9_highbd_sub_pixel_avg_variance8x4_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x8_c =
- vp9_highbd_sub_pixel_avg_variance8x8_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x16_c =
- vp9_highbd_sub_pixel_avg_variance8x16_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x8_c =
- vp9_highbd_sub_pixel_avg_variance16x8_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x16_c =
- vp9_highbd_sub_pixel_avg_variance16x16_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x32_c =
- vp9_highbd_sub_pixel_avg_variance16x32_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x16_c =
- vp9_highbd_sub_pixel_avg_variance32x16_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x32_c =
- vp9_highbd_sub_pixel_avg_variance32x32_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x64_c =
- vp9_highbd_sub_pixel_avg_variance32x64_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x32_c =
- vp9_highbd_sub_pixel_avg_variance64x32_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x64_c =
- vp9_highbd_sub_pixel_avg_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
- C, VP9SubpelAvgVarianceHighTest,
- ::testing::Values(
- make_tuple(2, 2, highbd_10_subpel_avg_variance4x4_c, 10),
- make_tuple(2, 3, highbd_10_subpel_avg_variance4x8_c, 10),
- make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_c, 10),
- make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_c, 10),
- make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_c, 10),
- make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_c, 10),
- make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_c, 10),
- make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_c, 10),
- make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_c, 10),
- make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_c, 10),
- make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_c, 10),
- make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_c, 10),
- make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_c, 10),
- make_tuple(2, 2, highbd_12_subpel_avg_variance4x4_c, 12),
- make_tuple(2, 3, highbd_12_subpel_avg_variance4x8_c, 12),
- make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_c, 12),
- make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_c, 12),
- make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_c, 12),
- make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_c, 12),
- make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_c, 12),
- make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_c, 12),
- make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_c, 12),
- make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_c, 12),
- make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_c, 12),
- make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_c, 12),
- make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_c, 12),
- make_tuple(2, 2, highbd_subpel_avg_variance4x4_c, 8),
- make_tuple(2, 3, highbd_subpel_avg_variance4x8_c, 8),
- make_tuple(3, 2, highbd_subpel_avg_variance8x4_c, 8),
- make_tuple(3, 3, highbd_subpel_avg_variance8x8_c, 8),
- make_tuple(3, 4, highbd_subpel_avg_variance8x16_c, 8),
- make_tuple(4, 3, highbd_subpel_avg_variance16x8_c, 8),
- make_tuple(4, 4, highbd_subpel_avg_variance16x16_c, 8),
- make_tuple(4, 5, highbd_subpel_avg_variance16x32_c, 8),
- make_tuple(5, 4, highbd_subpel_avg_variance32x16_c, 8),
- make_tuple(5, 5, highbd_subpel_avg_variance32x32_c, 8),
- make_tuple(5, 6, highbd_subpel_avg_variance32x64_c, 8),
- make_tuple(6, 5, highbd_subpel_avg_variance64x32_c, 8),
- make_tuple(6, 6, highbd_subpel_avg_variance64x64_c, 8)));
-#endif // CONFIG_VP9_HIGHBITDEPTH
-#endif // CONFIG_VP9_ENCODER
-
-#if CONFIG_VP8_ENCODER
-#if HAVE_MMX
-const SubpixVarMxNFunc subpel_variance16x16_mmx =
- vp8_sub_pixel_variance16x16_mmx;
-const SubpixVarMxNFunc subpel_variance16x8_mmx = vp8_sub_pixel_variance16x8_mmx;
-const SubpixVarMxNFunc subpel_variance8x16_mmx = vp8_sub_pixel_variance8x16_mmx;
-const SubpixVarMxNFunc subpel_variance8x8_mmx = vp8_sub_pixel_variance8x8_mmx;
-const SubpixVarMxNFunc subpel_variance4x4_mmx = vp8_sub_pixel_variance4x4_mmx;
-INSTANTIATE_TEST_CASE_P(
- MMX, VP8SubpelVarianceTest,
- ::testing::Values(make_tuple(4, 4, subpel_variance16x16_mmx, 0),
- make_tuple(4, 3, subpel_variance16x8_mmx, 0),
- make_tuple(3, 4, subpel_variance8x16_mmx, 0),
- make_tuple(3, 3, subpel_variance8x8_mmx, 0),
- make_tuple(2, 2, subpel_variance4x4_mmx, 0)));
-#endif // HAVE_MMX
-#endif // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-#if HAVE_SSE2
#if CONFIG_USE_X86INC
-const SubpixVarMxNFunc subpel_variance4x4_sse = vp9_sub_pixel_variance4x4_sse;
-const SubpixVarMxNFunc subpel_variance4x8_sse = vp9_sub_pixel_variance4x8_sse;
-const SubpixVarMxNFunc subpel_variance8x4_sse2 = vp9_sub_pixel_variance8x4_sse2;
-const SubpixVarMxNFunc subpel_variance8x8_sse2 = vp9_sub_pixel_variance8x8_sse2;
-const SubpixVarMxNFunc subpel_variance8x16_sse2 =
- vp9_sub_pixel_variance8x16_sse2;
-const SubpixVarMxNFunc subpel_variance16x8_sse2 =
- vp9_sub_pixel_variance16x8_sse2;
-const SubpixVarMxNFunc subpel_variance16x16_sse2 =
- vp9_sub_pixel_variance16x16_sse2;
-const SubpixVarMxNFunc subpel_variance16x32_sse2 =
- vp9_sub_pixel_variance16x32_sse2;
-const SubpixVarMxNFunc subpel_variance32x16_sse2 =
- vp9_sub_pixel_variance32x16_sse2;
-const SubpixVarMxNFunc subpel_variance32x32_sse2 =
- vp9_sub_pixel_variance32x32_sse2;
-const SubpixVarMxNFunc subpel_variance32x64_sse2 =
- vp9_sub_pixel_variance32x64_sse2;
-const SubpixVarMxNFunc subpel_variance64x32_sse2 =
- vp9_sub_pixel_variance64x32_sse2;
-const SubpixVarMxNFunc subpel_variance64x64_sse2 =
- vp9_sub_pixel_variance64x64_sse2;
-INSTANTIATE_TEST_CASE_P(
- SSE2, VP9SubpelVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse, 0),
- make_tuple(2, 3, subpel_variance4x8_sse, 0),
- make_tuple(3, 2, subpel_variance8x4_sse2, 0),
- make_tuple(3, 3, subpel_variance8x8_sse2, 0),
- make_tuple(3, 4, subpel_variance8x16_sse2, 0),
- make_tuple(4, 3, subpel_variance16x8_sse2, 0),
- make_tuple(4, 4, subpel_variance16x16_sse2, 0),
- make_tuple(4, 5, subpel_variance16x32_sse2, 0),
- make_tuple(5, 4, subpel_variance32x16_sse2, 0),
- make_tuple(5, 5, subpel_variance32x32_sse2, 0),
- make_tuple(5, 6, subpel_variance32x64_sse2, 0),
- make_tuple(6, 5, subpel_variance64x32_sse2, 0),
- make_tuple(6, 6, subpel_variance64x64_sse2, 0)));
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse =
- vp9_sub_pixel_avg_variance4x4_sse;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse =
- vp9_sub_pixel_avg_variance4x8_sse;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_sse2 =
- vp9_sub_pixel_avg_variance8x4_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_sse2 =
- vp9_sub_pixel_avg_variance8x8_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_sse2 =
- vp9_sub_pixel_avg_variance8x16_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_sse2 =
- vp9_sub_pixel_avg_variance16x8_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_sse2 =
- vp9_sub_pixel_avg_variance16x16_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_sse2 =
- vp9_sub_pixel_avg_variance16x32_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_sse2 =
- vp9_sub_pixel_avg_variance32x16_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_sse2 =
- vp9_sub_pixel_avg_variance32x32_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_sse2 =
- vp9_sub_pixel_avg_variance32x64_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_sse2 =
- vp9_sub_pixel_avg_variance64x32_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_sse2 =
- vp9_sub_pixel_avg_variance64x64_sse2;
-INSTANTIATE_TEST_CASE_P(
- SSE2, VP9SubpelAvgVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse, 0),
- make_tuple(2, 3, subpel_avg_variance4x8_sse, 0),
- make_tuple(3, 2, subpel_avg_variance8x4_sse2, 0),
- make_tuple(3, 3, subpel_avg_variance8x8_sse2, 0),
- make_tuple(3, 4, subpel_avg_variance8x16_sse2, 0),
- make_tuple(4, 3, subpel_avg_variance16x8_sse2, 0),
- make_tuple(4, 4, subpel_avg_variance16x16_sse2, 0),
- make_tuple(4, 5, subpel_avg_variance16x32_sse2, 0),
- make_tuple(5, 4, subpel_avg_variance32x16_sse2, 0),
- make_tuple(5, 5, subpel_avg_variance32x32_sse2, 0),
- make_tuple(5, 6, subpel_avg_variance32x64_sse2, 0),
- make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0),
- make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0)));
-#if CONFIG_VP9_HIGHBITDEPTH
-const SubpixVarMxNFunc highbd_subpel_variance8x4_sse2 =
- vp9_highbd_sub_pixel_variance8x4_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance8x8_sse2 =
- vp9_highbd_sub_pixel_variance8x8_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance8x16_sse2 =
- vp9_highbd_sub_pixel_variance8x16_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance16x8_sse2 =
- vp9_highbd_sub_pixel_variance16x8_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance16x16_sse2 =
- vp9_highbd_sub_pixel_variance16x16_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance16x32_sse2 =
- vp9_highbd_sub_pixel_variance16x32_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance32x16_sse2 =
- vp9_highbd_sub_pixel_variance32x16_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance32x32_sse2 =
- vp9_highbd_sub_pixel_variance32x32_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance32x64_sse2 =
- vp9_highbd_sub_pixel_variance32x64_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance64x32_sse2 =
- vp9_highbd_sub_pixel_variance64x32_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance64x64_sse2 =
- vp9_highbd_sub_pixel_variance64x64_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x4_sse2 =
- vp9_highbd_10_sub_pixel_variance8x4_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x8_sse2 =
- vp9_highbd_10_sub_pixel_variance8x8_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x16_sse2 =
- vp9_highbd_10_sub_pixel_variance8x16_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x8_sse2 =
- vp9_highbd_10_sub_pixel_variance16x8_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x16_sse2 =
- vp9_highbd_10_sub_pixel_variance16x16_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x32_sse2 =
- vp9_highbd_10_sub_pixel_variance16x32_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x16_sse2 =
- vp9_highbd_10_sub_pixel_variance32x16_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x32_sse2 =
- vp9_highbd_10_sub_pixel_variance32x32_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x64_sse2 =
- vp9_highbd_10_sub_pixel_variance32x64_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance64x32_sse2 =
- vp9_highbd_10_sub_pixel_variance64x32_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance64x64_sse2 =
- vp9_highbd_10_sub_pixel_variance64x64_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x4_sse2 =
- vp9_highbd_12_sub_pixel_variance8x4_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x8_sse2 =
- vp9_highbd_12_sub_pixel_variance8x8_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x16_sse2 =
- vp9_highbd_12_sub_pixel_variance8x16_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x8_sse2 =
- vp9_highbd_12_sub_pixel_variance16x8_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x16_sse2 =
- vp9_highbd_12_sub_pixel_variance16x16_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x32_sse2 =
- vp9_highbd_12_sub_pixel_variance16x32_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x16_sse2 =
- vp9_highbd_12_sub_pixel_variance32x16_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x32_sse2 =
- vp9_highbd_12_sub_pixel_variance32x32_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x64_sse2 =
- vp9_highbd_12_sub_pixel_variance32x64_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance64x32_sse2 =
- vp9_highbd_12_sub_pixel_variance64x32_sse2;
const SubpixVarMxNFunc highbd_12_subpel_variance64x64_sse2 =
- vp9_highbd_12_sub_pixel_variance64x64_sse2;
+ vpx_highbd_12_sub_pixel_variance64x64_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance64x32_sse2 =
+ vpx_highbd_12_sub_pixel_variance64x32_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance32x64_sse2 =
+ vpx_highbd_12_sub_pixel_variance32x64_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance32x32_sse2 =
+ vpx_highbd_12_sub_pixel_variance32x32_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance32x16_sse2 =
+ vpx_highbd_12_sub_pixel_variance32x16_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance16x32_sse2 =
+ vpx_highbd_12_sub_pixel_variance16x32_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance16x16_sse2 =
+ vpx_highbd_12_sub_pixel_variance16x16_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance16x8_sse2 =
+ vpx_highbd_12_sub_pixel_variance16x8_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance8x16_sse2 =
+ vpx_highbd_12_sub_pixel_variance8x16_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance8x8_sse2 =
+ vpx_highbd_12_sub_pixel_variance8x8_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance8x4_sse2 =
+ vpx_highbd_12_sub_pixel_variance8x4_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance64x64_sse2 =
+ vpx_highbd_10_sub_pixel_variance64x64_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance64x32_sse2 =
+ vpx_highbd_10_sub_pixel_variance64x32_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance32x64_sse2 =
+ vpx_highbd_10_sub_pixel_variance32x64_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance32x32_sse2 =
+ vpx_highbd_10_sub_pixel_variance32x32_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance32x16_sse2 =
+ vpx_highbd_10_sub_pixel_variance32x16_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance16x32_sse2 =
+ vpx_highbd_10_sub_pixel_variance16x32_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance16x16_sse2 =
+ vpx_highbd_10_sub_pixel_variance16x16_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance16x8_sse2 =
+ vpx_highbd_10_sub_pixel_variance16x8_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance8x16_sse2 =
+ vpx_highbd_10_sub_pixel_variance8x16_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance8x8_sse2 =
+ vpx_highbd_10_sub_pixel_variance8x8_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance8x4_sse2 =
+ vpx_highbd_10_sub_pixel_variance8x4_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance64x64_sse2 =
+ vpx_highbd_8_sub_pixel_variance64x64_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance64x32_sse2 =
+ vpx_highbd_8_sub_pixel_variance64x32_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance32x64_sse2 =
+ vpx_highbd_8_sub_pixel_variance32x64_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance32x32_sse2 =
+ vpx_highbd_8_sub_pixel_variance32x32_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance32x16_sse2 =
+ vpx_highbd_8_sub_pixel_variance32x16_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance16x32_sse2 =
+ vpx_highbd_8_sub_pixel_variance16x32_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance16x16_sse2 =
+ vpx_highbd_8_sub_pixel_variance16x16_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance16x8_sse2 =
+ vpx_highbd_8_sub_pixel_variance16x8_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance8x16_sse2 =
+ vpx_highbd_8_sub_pixel_variance8x16_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance8x8_sse2 =
+ vpx_highbd_8_sub_pixel_variance8x8_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance8x4_sse2 =
+ vpx_highbd_8_sub_pixel_variance8x4_sse2;
INSTANTIATE_TEST_CASE_P(
- SSE2, VP9SubpelVarianceHighTest,
- ::testing::Values(make_tuple(3, 2, highbd_10_subpel_variance8x4_sse2, 10),
- make_tuple(3, 3, highbd_10_subpel_variance8x8_sse2, 10),
- make_tuple(3, 4, highbd_10_subpel_variance8x16_sse2, 10),
- make_tuple(4, 3, highbd_10_subpel_variance16x8_sse2, 10),
- make_tuple(4, 4, highbd_10_subpel_variance16x16_sse2, 10),
- make_tuple(4, 5, highbd_10_subpel_variance16x32_sse2, 10),
- make_tuple(5, 4, highbd_10_subpel_variance32x16_sse2, 10),
- make_tuple(5, 5, highbd_10_subpel_variance32x32_sse2, 10),
- make_tuple(5, 6, highbd_10_subpel_variance32x64_sse2, 10),
- make_tuple(6, 5, highbd_10_subpel_variance64x32_sse2, 10),
- make_tuple(6, 6, highbd_10_subpel_variance64x64_sse2, 10),
- make_tuple(3, 2, highbd_12_subpel_variance8x4_sse2, 12),
- make_tuple(3, 3, highbd_12_subpel_variance8x8_sse2, 12),
- make_tuple(3, 4, highbd_12_subpel_variance8x16_sse2, 12),
- make_tuple(4, 3, highbd_12_subpel_variance16x8_sse2, 12),
- make_tuple(4, 4, highbd_12_subpel_variance16x16_sse2, 12),
- make_tuple(4, 5, highbd_12_subpel_variance16x32_sse2, 12),
- make_tuple(5, 4, highbd_12_subpel_variance32x16_sse2, 12),
- make_tuple(5, 5, highbd_12_subpel_variance32x32_sse2, 12),
- make_tuple(5, 6, highbd_12_subpel_variance32x64_sse2, 12),
+ SSE2, VpxHBDSubpelVarianceTest,
+ ::testing::Values(make_tuple(6, 6, highbd_12_subpel_variance64x64_sse2, 12),
make_tuple(6, 5, highbd_12_subpel_variance64x32_sse2, 12),
- make_tuple(6, 6, highbd_12_subpel_variance64x64_sse2, 12),
- make_tuple(3, 2, highbd_subpel_variance8x4_sse2, 8),
- make_tuple(3, 3, highbd_subpel_variance8x8_sse2, 8),
- make_tuple(3, 4, highbd_subpel_variance8x16_sse2, 8),
- make_tuple(4, 3, highbd_subpel_variance16x8_sse2, 8),
- make_tuple(4, 4, highbd_subpel_variance16x16_sse2, 8),
- make_tuple(4, 5, highbd_subpel_variance16x32_sse2, 8),
- make_tuple(5, 4, highbd_subpel_variance32x16_sse2, 8),
- make_tuple(5, 5, highbd_subpel_variance32x32_sse2, 8),
- make_tuple(5, 6, highbd_subpel_variance32x64_sse2, 8),
- make_tuple(6, 5, highbd_subpel_variance64x32_sse2, 8),
- make_tuple(6, 6, highbd_subpel_variance64x64_sse2, 8)));
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x4_sse2 =
- vp9_highbd_sub_pixel_avg_variance8x4_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x8_sse2 =
- vp9_highbd_sub_pixel_avg_variance8x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x16_sse2 =
- vp9_highbd_sub_pixel_avg_variance8x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x8_sse2 =
- vp9_highbd_sub_pixel_avg_variance16x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x16_sse2 =
- vp9_highbd_sub_pixel_avg_variance16x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x32_sse2 =
- vp9_highbd_sub_pixel_avg_variance16x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x16_sse2 =
- vp9_highbd_sub_pixel_avg_variance32x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x32_sse2 =
- vp9_highbd_sub_pixel_avg_variance32x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x64_sse2 =
- vp9_highbd_sub_pixel_avg_variance32x64_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x32_sse2 =
- vp9_highbd_sub_pixel_avg_variance64x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x64_sse2 =
- vp9_highbd_sub_pixel_avg_variance64x64_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x4_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance8x4_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x8_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance8x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x16_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance8x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x8_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance16x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x16_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance16x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x32_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance16x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x16_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance32x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x32_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance32x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x64_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance32x64_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x32_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance64x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x64_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance64x64_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x4_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance8x4_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x8_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance8x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x16_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance8x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x8_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance16x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x16_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance16x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x32_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance16x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x16_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance32x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x32_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance32x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x64_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance32x64_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x32_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance64x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x64_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance64x64_sse2;
+ make_tuple(5, 6, highbd_12_subpel_variance32x64_sse2, 12),
+ make_tuple(5, 5, highbd_12_subpel_variance32x32_sse2, 12),
+ make_tuple(5, 4, highbd_12_subpel_variance32x16_sse2, 12),
+ make_tuple(4, 5, highbd_12_subpel_variance16x32_sse2, 12),
+ make_tuple(4, 4, highbd_12_subpel_variance16x16_sse2, 12),
+ make_tuple(4, 3, highbd_12_subpel_variance16x8_sse2, 12),
+ make_tuple(3, 4, highbd_12_subpel_variance8x16_sse2, 12),
+ make_tuple(3, 3, highbd_12_subpel_variance8x8_sse2, 12),
+ make_tuple(3, 2, highbd_12_subpel_variance8x4_sse2, 12),
+ make_tuple(6, 6, highbd_10_subpel_variance64x64_sse2, 10),
+ make_tuple(6, 5, highbd_10_subpel_variance64x32_sse2, 10),
+ make_tuple(5, 6, highbd_10_subpel_variance32x64_sse2, 10),
+ make_tuple(5, 5, highbd_10_subpel_variance32x32_sse2, 10),
+ make_tuple(5, 4, highbd_10_subpel_variance32x16_sse2, 10),
+ make_tuple(4, 5, highbd_10_subpel_variance16x32_sse2, 10),
+ make_tuple(4, 4, highbd_10_subpel_variance16x16_sse2, 10),
+ make_tuple(4, 3, highbd_10_subpel_variance16x8_sse2, 10),
+ make_tuple(3, 4, highbd_10_subpel_variance8x16_sse2, 10),
+ make_tuple(3, 3, highbd_10_subpel_variance8x8_sse2, 10),
+ make_tuple(3, 2, highbd_10_subpel_variance8x4_sse2, 10),
+ make_tuple(6, 6, highbd_8_subpel_variance64x64_sse2, 8),
+ make_tuple(6, 5, highbd_8_subpel_variance64x32_sse2, 8),
+ make_tuple(5, 6, highbd_8_subpel_variance32x64_sse2, 8),
+ make_tuple(5, 5, highbd_8_subpel_variance32x32_sse2, 8),
+ make_tuple(5, 4, highbd_8_subpel_variance32x16_sse2, 8),
+ make_tuple(4, 5, highbd_8_subpel_variance16x32_sse2, 8),
+ make_tuple(4, 4, highbd_8_subpel_variance16x16_sse2, 8),
+ make_tuple(4, 3, highbd_8_subpel_variance16x8_sse2, 8),
+ make_tuple(3, 4, highbd_8_subpel_variance8x16_sse2, 8),
+ make_tuple(3, 3, highbd_8_subpel_variance8x8_sse2, 8),
+ make_tuple(3, 2, highbd_8_subpel_variance8x4_sse2, 8)));
+
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance64x64_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance64x64_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance64x32_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance64x32_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x64_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance32x64_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x32_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance32x32_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x16_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance32x16_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x32_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance16x32_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x16_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance16x16_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x8_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance16x8_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x16_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance8x16_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x8_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance8x8_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x4_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance8x4_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance64x64_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance64x64_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance64x32_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance64x32_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x64_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance32x64_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x32_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance32x32_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x16_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance32x16_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x32_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance16x32_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x16_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance16x16_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x8_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance16x8_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x16_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance8x16_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x8_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance8x8_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x4_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance8x4_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance64x64_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance64x64_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance64x32_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance64x32_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x64_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance32x64_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x32_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance32x32_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x16_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance32x16_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x32_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance16x32_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x16_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance16x16_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x8_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance16x8_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x16_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance8x16_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x8_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance8x8_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x4_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance8x4_sse2;
INSTANTIATE_TEST_CASE_P(
- SSE2, VP9SubpelAvgVarianceHighTest,
+ SSE2, VpxHBDSubpelAvgVarianceTest,
::testing::Values(
- make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_sse2, 10),
- make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_sse2, 10),
- make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_sse2, 10),
- make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_sse2, 10),
- make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_sse2, 10),
- make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_sse2, 10),
- make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_sse2, 10),
- make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_sse2, 10),
- make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_sse2, 10),
- make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_sse2, 10),
- make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_sse2, 10),
- make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_sse2, 12),
- make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_sse2, 12),
- make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_sse2, 12),
- make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_sse2, 12),
- make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_sse2, 12),
- make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_sse2, 12),
- make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_sse2, 12),
- make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_sse2, 12),
- make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_sse2, 12),
- make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_sse2, 12),
- make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_sse2, 12),
- make_tuple(3, 2, highbd_subpel_avg_variance8x4_sse2, 8),
- make_tuple(3, 3, highbd_subpel_avg_variance8x8_sse2, 8),
- make_tuple(3, 4, highbd_subpel_avg_variance8x16_sse2, 8),
- make_tuple(4, 3, highbd_subpel_avg_variance16x8_sse2, 8),
- make_tuple(4, 4, highbd_subpel_avg_variance16x16_sse2, 8),
- make_tuple(4, 5, highbd_subpel_avg_variance16x32_sse2, 8),
- make_tuple(5, 4, highbd_subpel_avg_variance32x16_sse2, 8),
- make_tuple(5, 5, highbd_subpel_avg_variance32x32_sse2, 8),
- make_tuple(5, 6, highbd_subpel_avg_variance32x64_sse2, 8),
- make_tuple(6, 5, highbd_subpel_avg_variance64x32_sse2, 8),
- make_tuple(6, 6, highbd_subpel_avg_variance64x64_sse2, 8)));
-#endif // CONFIG_VP9_HIGHBITDEPTH
+ make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_sse2, 12),
+ make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_sse2, 12),
+ make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_sse2, 12),
+ make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_sse2, 12),
+ make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_sse2, 12),
+ make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_sse2, 12),
+ make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_sse2, 12),
+ make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_sse2, 12),
+ make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_sse2, 12),
+ make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_sse2, 12),
+ make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_sse2, 12),
+ make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_sse2, 10),
+ make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_sse2, 10),
+ make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_sse2, 10),
+ make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_sse2, 10),
+ make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_sse2, 10),
+ make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_sse2, 10),
+ make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_sse2, 10),
+ make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_sse2, 10),
+ make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_sse2, 10),
+ make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_sse2, 10),
+ make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_sse2, 10),
+ make_tuple(6, 6, highbd_8_subpel_avg_variance64x64_sse2, 8),
+ make_tuple(6, 5, highbd_8_subpel_avg_variance64x32_sse2, 8),
+ make_tuple(5, 6, highbd_8_subpel_avg_variance32x64_sse2, 8),
+ make_tuple(5, 5, highbd_8_subpel_avg_variance32x32_sse2, 8),
+ make_tuple(5, 4, highbd_8_subpel_avg_variance32x16_sse2, 8),
+ make_tuple(4, 5, highbd_8_subpel_avg_variance16x32_sse2, 8),
+ make_tuple(4, 4, highbd_8_subpel_avg_variance16x16_sse2, 8),
+ make_tuple(4, 3, highbd_8_subpel_avg_variance16x8_sse2, 8),
+ make_tuple(3, 4, highbd_8_subpel_avg_variance8x16_sse2, 8),
+ make_tuple(3, 3, highbd_8_subpel_avg_variance8x8_sse2, 8),
+ make_tuple(3, 2, highbd_8_subpel_avg_variance8x4_sse2, 8)));
#endif // CONFIG_USE_X86INC
+#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_SSE2
-#endif // CONFIG_VP9_ENCODER
-#if CONFIG_VP8_ENCODER
-#if HAVE_SSE2
-const SubpixVarMxNFunc vp8_subpel_variance16x16_sse2 =
- vp8_sub_pixel_variance16x16_wmt;
-const SubpixVarMxNFunc vp8_subpel_variance16x8_sse2 =
- vp8_sub_pixel_variance16x8_wmt;
-const SubpixVarMxNFunc vp8_subpel_variance8x16_sse2 =
- vp8_sub_pixel_variance8x16_wmt;
-const SubpixVarMxNFunc vp8_subpel_variance8x8_sse2 =
- vp8_sub_pixel_variance8x8_wmt;
-const SubpixVarMxNFunc vp8_subpel_variance4x4_sse2 =
- vp8_sub_pixel_variance4x4_wmt;
-INSTANTIATE_TEST_CASE_P(
- SSE2, VP8SubpelVarianceTest,
- ::testing::Values(make_tuple(2, 2, vp8_subpel_variance4x4_sse2, 0),
- make_tuple(3, 3, vp8_subpel_variance8x8_sse2, 0),
- make_tuple(3, 4, vp8_subpel_variance8x16_sse2, 0),
- make_tuple(4, 3, vp8_subpel_variance16x8_sse2, 0),
- make_tuple(4, 4, vp8_subpel_variance16x16_sse2, 0)));
-#endif // HAVE_SSE2
-#endif // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
#if HAVE_SSSE3
#if CONFIG_USE_X86INC
-const SubpixVarMxNFunc subpel_variance4x4_ssse3 =
- vp9_sub_pixel_variance4x4_ssse3;
-const SubpixVarMxNFunc subpel_variance4x8_ssse3 =
- vp9_sub_pixel_variance4x8_ssse3;
-const SubpixVarMxNFunc subpel_variance8x4_ssse3 =
- vp9_sub_pixel_variance8x4_ssse3;
-const SubpixVarMxNFunc subpel_variance8x8_ssse3 =
- vp9_sub_pixel_variance8x8_ssse3;
-const SubpixVarMxNFunc subpel_variance8x16_ssse3 =
- vp9_sub_pixel_variance8x16_ssse3;
-const SubpixVarMxNFunc subpel_variance16x8_ssse3 =
- vp9_sub_pixel_variance16x8_ssse3;
-const SubpixVarMxNFunc subpel_variance16x16_ssse3 =
- vp9_sub_pixel_variance16x16_ssse3;
-const SubpixVarMxNFunc subpel_variance16x32_ssse3 =
- vp9_sub_pixel_variance16x32_ssse3;
-const SubpixVarMxNFunc subpel_variance32x16_ssse3 =
- vp9_sub_pixel_variance32x16_ssse3;
-const SubpixVarMxNFunc subpel_variance32x32_ssse3 =
- vp9_sub_pixel_variance32x32_ssse3;
-const SubpixVarMxNFunc subpel_variance32x64_ssse3 =
- vp9_sub_pixel_variance32x64_ssse3;
-const SubpixVarMxNFunc subpel_variance64x32_ssse3 =
- vp9_sub_pixel_variance64x32_ssse3;
const SubpixVarMxNFunc subpel_variance64x64_ssse3 =
- vp9_sub_pixel_variance64x64_ssse3;
+ vpx_sub_pixel_variance64x64_ssse3;
+const SubpixVarMxNFunc subpel_variance64x32_ssse3 =
+ vpx_sub_pixel_variance64x32_ssse3;
+const SubpixVarMxNFunc subpel_variance32x64_ssse3 =
+ vpx_sub_pixel_variance32x64_ssse3;
+const SubpixVarMxNFunc subpel_variance32x32_ssse3 =
+ vpx_sub_pixel_variance32x32_ssse3;
+const SubpixVarMxNFunc subpel_variance32x16_ssse3 =
+ vpx_sub_pixel_variance32x16_ssse3;
+const SubpixVarMxNFunc subpel_variance16x32_ssse3 =
+ vpx_sub_pixel_variance16x32_ssse3;
+const SubpixVarMxNFunc subpel_variance16x16_ssse3 =
+ vpx_sub_pixel_variance16x16_ssse3;
+const SubpixVarMxNFunc subpel_variance16x8_ssse3 =
+ vpx_sub_pixel_variance16x8_ssse3;
+const SubpixVarMxNFunc subpel_variance8x16_ssse3 =
+ vpx_sub_pixel_variance8x16_ssse3;
+const SubpixVarMxNFunc subpel_variance8x8_ssse3 =
+ vpx_sub_pixel_variance8x8_ssse3;
+const SubpixVarMxNFunc subpel_variance8x4_ssse3 =
+ vpx_sub_pixel_variance8x4_ssse3;
+const SubpixVarMxNFunc subpel_variance4x8_ssse3 =
+ vpx_sub_pixel_variance4x8_ssse3;
+const SubpixVarMxNFunc subpel_variance4x4_ssse3 =
+ vpx_sub_pixel_variance4x4_ssse3;
INSTANTIATE_TEST_CASE_P(
- SSSE3, VP9SubpelVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3, 0),
- make_tuple(2, 3, subpel_variance4x8_ssse3, 0),
- make_tuple(3, 2, subpel_variance8x4_ssse3, 0),
- make_tuple(3, 3, subpel_variance8x8_ssse3, 0),
- make_tuple(3, 4, subpel_variance8x16_ssse3, 0),
- make_tuple(4, 3, subpel_variance16x8_ssse3, 0),
- make_tuple(4, 4, subpel_variance16x16_ssse3, 0),
- make_tuple(4, 5, subpel_variance16x32_ssse3, 0),
- make_tuple(5, 4, subpel_variance32x16_ssse3, 0),
- make_tuple(5, 5, subpel_variance32x32_ssse3, 0),
- make_tuple(5, 6, subpel_variance32x64_ssse3, 0),
+ SSSE3, VpxSubpelVarianceTest,
+ ::testing::Values(make_tuple(6, 6, subpel_variance64x64_ssse3, 0),
make_tuple(6, 5, subpel_variance64x32_ssse3, 0),
- make_tuple(6, 6, subpel_variance64x64_ssse3, 0)));
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 =
- vp9_sub_pixel_avg_variance4x4_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 =
- vp9_sub_pixel_avg_variance4x8_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_ssse3 =
- vp9_sub_pixel_avg_variance8x4_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_ssse3 =
- vp9_sub_pixel_avg_variance8x8_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_ssse3 =
- vp9_sub_pixel_avg_variance8x16_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_ssse3 =
- vp9_sub_pixel_avg_variance16x8_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_ssse3 =
- vp9_sub_pixel_avg_variance16x16_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_ssse3 =
- vp9_sub_pixel_avg_variance16x32_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_ssse3 =
- vp9_sub_pixel_avg_variance32x16_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_ssse3 =
- vp9_sub_pixel_avg_variance32x32_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_ssse3 =
- vp9_sub_pixel_avg_variance32x64_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_ssse3 =
- vp9_sub_pixel_avg_variance64x32_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_ssse3 =
- vp9_sub_pixel_avg_variance64x64_ssse3;
+ make_tuple(5, 6, subpel_variance32x64_ssse3, 0),
+ make_tuple(5, 5, subpel_variance32x32_ssse3, 0),
+ make_tuple(5, 4, subpel_variance32x16_ssse3, 0),
+ make_tuple(4, 5, subpel_variance16x32_ssse3, 0),
+ make_tuple(4, 4, subpel_variance16x16_ssse3, 0),
+ make_tuple(4, 3, subpel_variance16x8_ssse3, 0),
+ make_tuple(3, 4, subpel_variance8x16_ssse3, 0),
+ make_tuple(3, 3, subpel_variance8x8_ssse3, 0),
+ make_tuple(3, 2, subpel_variance8x4_ssse3, 0),
+ make_tuple(2, 3, subpel_variance4x8_ssse3, 0),
+ make_tuple(2, 2, subpel_variance4x4_ssse3, 0)));
+
+const SubpixAvgVarMxNFunc subpel_avg_variance64x64_ssse3 =
+ vpx_sub_pixel_avg_variance64x64_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance64x32_ssse3 =
+ vpx_sub_pixel_avg_variance64x32_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x64_ssse3 =
+ vpx_sub_pixel_avg_variance32x64_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x32_ssse3 =
+ vpx_sub_pixel_avg_variance32x32_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x16_ssse3 =
+ vpx_sub_pixel_avg_variance32x16_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x32_ssse3 =
+ vpx_sub_pixel_avg_variance16x32_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x16_ssse3 =
+ vpx_sub_pixel_avg_variance16x16_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x8_ssse3 =
+ vpx_sub_pixel_avg_variance16x8_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x16_ssse3 =
+ vpx_sub_pixel_avg_variance8x16_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x8_ssse3 =
+ vpx_sub_pixel_avg_variance8x8_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x4_ssse3 =
+ vpx_sub_pixel_avg_variance8x4_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x8_ssse3 =
+ vpx_sub_pixel_avg_variance4x8_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x4_ssse3 =
+ vpx_sub_pixel_avg_variance4x4_ssse3;
INSTANTIATE_TEST_CASE_P(
- SSSE3, VP9SubpelAvgVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3, 0),
- make_tuple(2, 3, subpel_avg_variance4x8_ssse3, 0),
- make_tuple(3, 2, subpel_avg_variance8x4_ssse3, 0),
- make_tuple(3, 3, subpel_avg_variance8x8_ssse3, 0),
- make_tuple(3, 4, subpel_avg_variance8x16_ssse3, 0),
- make_tuple(4, 3, subpel_avg_variance16x8_ssse3, 0),
- make_tuple(4, 4, subpel_avg_variance16x16_ssse3, 0),
- make_tuple(4, 5, subpel_avg_variance16x32_ssse3, 0),
- make_tuple(5, 4, subpel_avg_variance32x16_ssse3, 0),
- make_tuple(5, 5, subpel_avg_variance32x32_ssse3, 0),
- make_tuple(5, 6, subpel_avg_variance32x64_ssse3, 0),
+ SSSE3, VpxSubpelAvgVarianceTest,
+ ::testing::Values(make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0),
make_tuple(6, 5, subpel_avg_variance64x32_ssse3, 0),
- make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0)));
+ make_tuple(5, 6, subpel_avg_variance32x64_ssse3, 0),
+ make_tuple(5, 5, subpel_avg_variance32x32_ssse3, 0),
+ make_tuple(5, 4, subpel_avg_variance32x16_ssse3, 0),
+ make_tuple(4, 5, subpel_avg_variance16x32_ssse3, 0),
+ make_tuple(4, 4, subpel_avg_variance16x16_ssse3, 0),
+ make_tuple(4, 3, subpel_avg_variance16x8_ssse3, 0),
+ make_tuple(3, 4, subpel_avg_variance8x16_ssse3, 0),
+ make_tuple(3, 3, subpel_avg_variance8x8_ssse3, 0),
+ make_tuple(3, 2, subpel_avg_variance8x4_ssse3, 0),
+ make_tuple(2, 3, subpel_avg_variance4x8_ssse3, 0),
+ make_tuple(2, 2, subpel_avg_variance4x4_ssse3, 0)));
#endif // CONFIG_USE_X86INC
#endif // HAVE_SSSE3
-#endif // CONFIG_VP9_ENCODER
-#if CONFIG_VP8_ENCODER
-#if HAVE_SSSE3
-const SubpixVarMxNFunc vp8_subpel_variance16x16_ssse3 =
- vp8_sub_pixel_variance16x16_ssse3;
-const SubpixVarMxNFunc vp8_subpel_variance16x8_ssse3 =
- vp8_sub_pixel_variance16x8_ssse3;
-INSTANTIATE_TEST_CASE_P(
- SSSE3, VP8SubpelVarianceTest,
- ::testing::Values(make_tuple(4, 3, vp8_subpel_variance16x8_ssse3, 0),
- make_tuple(4, 4, vp8_subpel_variance16x16_ssse3, 0)));
-#endif // HAVE_SSSE3
-#endif // CONFIG_VP8_ENCODER
-
#if HAVE_AVX2
const VarianceMxNFunc mse16x16_avx2 = vpx_mse16x16_avx2;
INSTANTIATE_TEST_CASE_P(AVX2, VpxMseTest,
@@ -1910,39 +1828,46 @@
make_tuple(5, 4, variance32x16_avx2, 0),
make_tuple(4, 4, variance16x16_avx2, 0)));
-#if CONFIG_VP9_ENCODER
-const SubpixVarMxNFunc subpel_variance32x32_avx2 =
- vp9_sub_pixel_variance32x32_avx2;
const SubpixVarMxNFunc subpel_variance64x64_avx2 =
- vp9_sub_pixel_variance64x64_avx2;
+ vpx_sub_pixel_variance64x64_avx2;
+const SubpixVarMxNFunc subpel_variance32x32_avx2 =
+ vpx_sub_pixel_variance32x32_avx2;
INSTANTIATE_TEST_CASE_P(
- AVX2, VP9SubpelVarianceTest,
- ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2, 0),
- make_tuple(6, 6, subpel_variance64x64_avx2, 0)));
+ AVX2, VpxSubpelVarianceTest,
+ ::testing::Values(make_tuple(6, 6, subpel_variance64x64_avx2, 0),
+ make_tuple(5, 5, subpel_variance32x32_avx2, 0)));
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_avx2 =
- vp9_sub_pixel_avg_variance32x32_avx2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_avx2 =
- vp9_sub_pixel_avg_variance64x64_avx2;
+const SubpixAvgVarMxNFunc subpel_avg_variance64x64_avx2 =
+ vpx_sub_pixel_avg_variance64x64_avx2;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x32_avx2 =
+ vpx_sub_pixel_avg_variance32x32_avx2;
INSTANTIATE_TEST_CASE_P(
- AVX2, VP9SubpelAvgVarianceTest,
- ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0),
- make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0)));
-#endif // CONFIG_VP9_ENCODER
+ AVX2, VpxSubpelAvgVarianceTest,
+ ::testing::Values(make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0),
+ make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0)));
#endif // HAVE_AVX2
-#if CONFIG_VP8_ENCODER
#if HAVE_MEDIA
+const VarianceMxNFunc mse16x16_media = vpx_mse16x16_media;
+INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest,
+ ::testing::Values(make_tuple(4, 4, mse16x16_media)));
+
+const VarianceMxNFunc variance16x16_media = vpx_variance16x16_media;
+const VarianceMxNFunc variance8x8_media = vpx_variance8x8_media;
+INSTANTIATE_TEST_CASE_P(
+ MEDIA, VpxVarianceTest,
+ ::testing::Values(make_tuple(4, 4, variance16x16_media, 0),
+ make_tuple(3, 3, variance8x8_media, 0)));
+
const SubpixVarMxNFunc subpel_variance16x16_media =
- vp8_sub_pixel_variance16x16_armv6;
+ vpx_sub_pixel_variance16x16_media;
const SubpixVarMxNFunc subpel_variance8x8_media =
- vp8_sub_pixel_variance8x8_armv6;
+ vpx_sub_pixel_variance8x8_media;
INSTANTIATE_TEST_CASE_P(
- MEDIA, VP8SubpelVarianceTest,
- ::testing::Values(make_tuple(3, 3, subpel_variance8x8_media, 0),
- make_tuple(4, 4, subpel_variance16x16_media, 0)));
+ MEDIA, VpxSubpelVarianceTest,
+ ::testing::Values(make_tuple(4, 4, subpel_variance16x16_media, 0),
+ make_tuple(3, 3, subpel_variance8x8_media, 0)));
#endif // HAVE_MEDIA
-#endif // CONFIG_VP8_ENCODER
#if HAVE_NEON
const Get4x4SseFunc get4x4sse_cs_neon = vpx_get4x4sse_cs_neon;
@@ -1972,46 +1897,21 @@
make_tuple(3, 4, variance8x16_neon, 0),
make_tuple(3, 3, variance8x8_neon, 0)));
-#if CONFIG_VP8_ENCODER
-#if HAVE_NEON_ASM
-const SubpixVarMxNFunc vp8_subpel_variance16x16_neon =
- vp8_sub_pixel_variance16x16_neon;
-INSTANTIATE_TEST_CASE_P(
- NEON, VP8SubpelVarianceTest,
- ::testing::Values(make_tuple(4, 4, vp8_subpel_variance16x16_neon, 0)));
-#endif // HAVE_NEON_ASM
-#endif // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-const SubpixVarMxNFunc subpel_variance8x8_neon = vp9_sub_pixel_variance8x8_neon;
-const SubpixVarMxNFunc subpel_variance16x16_neon =
- vp9_sub_pixel_variance16x16_neon;
-const SubpixVarMxNFunc subpel_variance32x32_neon =
- vp9_sub_pixel_variance32x32_neon;
const SubpixVarMxNFunc subpel_variance64x64_neon =
- vp9_sub_pixel_variance64x64_neon;
+ vpx_sub_pixel_variance64x64_neon;
+const SubpixVarMxNFunc subpel_variance32x32_neon =
+ vpx_sub_pixel_variance32x32_neon;
+const SubpixVarMxNFunc subpel_variance16x16_neon =
+ vpx_sub_pixel_variance16x16_neon;
+const SubpixVarMxNFunc subpel_variance8x8_neon = vpx_sub_pixel_variance8x8_neon;
INSTANTIATE_TEST_CASE_P(
- NEON, VP9SubpelVarianceTest,
- ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon, 0),
- make_tuple(4, 4, subpel_variance16x16_neon, 0),
+ NEON, VpxSubpelVarianceTest,
+ ::testing::Values(make_tuple(6, 6, subpel_variance64x64_neon, 0),
make_tuple(5, 5, subpel_variance32x32_neon, 0),
- make_tuple(6, 6, subpel_variance64x64_neon, 0)));
-#endif // CONFIG_VP9_ENCODER
+ make_tuple(4, 4, subpel_variance16x16_neon, 0),
+ make_tuple(3, 3, subpel_variance8x8_neon, 0)));
#endif // HAVE_NEON
-#if HAVE_MEDIA
-const VarianceMxNFunc mse16x16_media = vpx_mse16x16_media;
-INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest,
- ::testing::Values(make_tuple(4, 4, mse16x16_media)));
-
-const VarianceMxNFunc variance16x16_media = vpx_variance16x16_media;
-const VarianceMxNFunc variance8x8_media = vpx_variance8x8_media;
-INSTANTIATE_TEST_CASE_P(
- MEDIA, VpxVarianceTest,
- ::testing::Values(make_tuple(4, 4, variance16x16_media, 0),
- make_tuple(3, 3, variance8x8_media, 0)));
-#endif // HAVE_MEDIA
-
#if HAVE_MSA
INSTANTIATE_TEST_CASE_P(MSA, SumOfSquaresTest,
::testing::Values(vpx_get_mb_ss_msa));
@@ -2059,29 +1959,28 @@
make_tuple(2, 3, variance4x8_msa, 0),
make_tuple(2, 2, variance4x4_msa, 0)));
-#if CONFIG_VP9_ENCODER
-const SubpixVarMxNFunc subpel_variance4x4_msa = vp9_sub_pixel_variance4x4_msa;
-const SubpixVarMxNFunc subpel_variance4x8_msa = vp9_sub_pixel_variance4x8_msa;
-const SubpixVarMxNFunc subpel_variance8x4_msa = vp9_sub_pixel_variance8x4_msa;
-const SubpixVarMxNFunc subpel_variance8x8_msa = vp9_sub_pixel_variance8x8_msa;
-const SubpixVarMxNFunc subpel_variance8x16_msa = vp9_sub_pixel_variance8x16_msa;
-const SubpixVarMxNFunc subpel_variance16x8_msa = vp9_sub_pixel_variance16x8_msa;
+const SubpixVarMxNFunc subpel_variance4x4_msa = vpx_sub_pixel_variance4x4_msa;
+const SubpixVarMxNFunc subpel_variance4x8_msa = vpx_sub_pixel_variance4x8_msa;
+const SubpixVarMxNFunc subpel_variance8x4_msa = vpx_sub_pixel_variance8x4_msa;
+const SubpixVarMxNFunc subpel_variance8x8_msa = vpx_sub_pixel_variance8x8_msa;
+const SubpixVarMxNFunc subpel_variance8x16_msa = vpx_sub_pixel_variance8x16_msa;
+const SubpixVarMxNFunc subpel_variance16x8_msa = vpx_sub_pixel_variance16x8_msa;
const SubpixVarMxNFunc subpel_variance16x16_msa =
- vp9_sub_pixel_variance16x16_msa;
+ vpx_sub_pixel_variance16x16_msa;
const SubpixVarMxNFunc subpel_variance16x32_msa =
- vp9_sub_pixel_variance16x32_msa;
+ vpx_sub_pixel_variance16x32_msa;
const SubpixVarMxNFunc subpel_variance32x16_msa =
- vp9_sub_pixel_variance32x16_msa;
+ vpx_sub_pixel_variance32x16_msa;
const SubpixVarMxNFunc subpel_variance32x32_msa =
- vp9_sub_pixel_variance32x32_msa;
+ vpx_sub_pixel_variance32x32_msa;
const SubpixVarMxNFunc subpel_variance32x64_msa =
- vp9_sub_pixel_variance32x64_msa;
+ vpx_sub_pixel_variance32x64_msa;
const SubpixVarMxNFunc subpel_variance64x32_msa =
- vp9_sub_pixel_variance64x32_msa;
+ vpx_sub_pixel_variance64x32_msa;
const SubpixVarMxNFunc subpel_variance64x64_msa =
- vp9_sub_pixel_variance64x64_msa;
+ vpx_sub_pixel_variance64x64_msa;
INSTANTIATE_TEST_CASE_P(
- MSA, VP9SubpelVarianceTest,
+ MSA, VpxSubpelVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_variance4x4_msa, 0),
make_tuple(2, 3, subpel_variance4x8_msa, 0),
make_tuple(3, 2, subpel_variance8x4_msa, 0),
@@ -2095,6 +1994,5 @@
make_tuple(5, 6, subpel_variance32x64_msa, 0),
make_tuple(6, 5, subpel_variance64x32_msa, 0),
make_tuple(6, 6, subpel_variance64x64_msa, 0)));
-#endif // CONFIG_VP9_ENCODER
#endif // HAVE_MSA
} // namespace
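The instantiation tuples above encode the block dimensions as log2 values (6, 6 maps to 64x64, 2, 2 to 4x4), and the trailing integer appears to select the bit depth: 0 for the plain 8-bit tests, 8/10/12 for the high-bit-depth variants. The sketch below (plain C, hypothetical names and accumulator values, not part of the test harness) shows how such a tuple decodes and how the log2 dimensions give the normalization shift that turns an accumulated SSE and sum into a variance, matching the sse - ((sum * sum) >> 8) step in the 16x16 assembly removed below.

/* Illustrative sketch only: decode a make_tuple(log2_w, log2_h, fn, bd)
 * entry and apply the variance normalization implied by the block size. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int log2_w = 6, log2_h = 6;   /* e.g. make_tuple(6, 6, ..., 12) */
  const int w = 1 << log2_w;          /* 64 */
  const int h = 1 << log2_h;          /* 64 */
  /* Hypothetical accumulators for a w*h block of pixel differences. */
  const int64_t sum = 1234;
  const uint64_t sse = 98765;
  const uint64_t var = sse - (uint64_t)((sum * sum) >> (log2_w + log2_h));
  printf("%dx%d block: variance = %llu\n", w, h, (unsigned long long)var);
  return 0;
}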
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ /dev/null
@@ -1,182 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_variance_halfpixvar16x16_h_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp8_variance_halfpixvar16x16_h_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r8, #0 ; initialize sum = 0
- ldr r10, c80808080
- mov r11, #0 ; initialize sse = 0
- mov r12, #16 ; set loop counter to 16 (=block height)
- mov lr, #0 ; constant zero
-loop
- ; 1st 4 pixels
- ldr r4, [r0, #0] ; load 4 src pixels
- ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
- ldr r5, [r2, #0] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
- ; calculate total sum
- adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r4, [r0, #4] ; load 4 src pixels
- ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
- ldr r5, [r2, #4] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r4, [r0, #8] ; load 4 src pixels
- ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
- ldr r5, [r2, #8] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r4, [r0, #12] ; load 4 src pixels
- ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset
- ldr r5, [r2, #12] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- subs r12, r12, #1
-
- bne loop
-
- ; return stuff
- ldr r6, [sp, #40] ; get address of sse
- mul r0, r8, r8 ; sum * sum
- str r11, [r6] ; store sse
- sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
-c80808080
- DCD 0x80808080
-
- END
-
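Expressed as scalar C, the half-pixel horizontal variance that the armv6 routine above implemented averages each source pixel with its right-hand neighbour (rounding with (a + b + 1) >> 1), diffs the result against the reference, accumulates sum and SSE, and returns sse - ((sum * sum) >> 8) for the 16x16 block. The sketch below reconstructs that behaviour from the assembly comments; it is illustrative only, not the library's implementation.

#include <stdint.h>

/* Scalar sketch of the 16x16 half-pixel-horizontal variance above.
 * Like the assembly, it reads one pixel to the right of the block when
 * interpolating the last column. Illustrative only. */
static unsigned int halfpix_h_variance16x16(const uint8_t *src, int src_stride,
                                            const uint8_t *ref, int ref_stride,
                                            unsigned int *sse) {
  int sum = 0;
  unsigned int sse_acc = 0;
  int x, y;
  for (y = 0; y < 16; ++y) {
    for (x = 0; x < 16; ++x) {
      const int half = (src[x] + src[x + 1] + 1) >> 1;  /* (a + b + 1) >> 1 */
      const int diff = half - ref[x];
      sum += diff;
      sse_acc += (unsigned int)(diff * diff);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = sse_acc;
  return sse_acc - (unsigned int)(((int64_t)sum * sum) >> 8);
}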
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ /dev/null
@@ -1,222 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_variance_halfpixvar16x16_hv_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp8_variance_halfpixvar16x16_hv_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r8, #0 ; initialize sum = 0
- ldr r10, c80808080
- mov r11, #0 ; initialize sse = 0
- mov r12, #16 ; set loop counter to 16 (=block height)
- mov lr, #0 ; constant zero
-loop
- add r9, r0, r1 ; pointer to pixels on the next row
- ; 1st 4 pixels
- ldr r4, [r0, #0] ; load source pixels a, row N
- ldr r6, [r0, #1] ; load source pixels b, row N
- ldr r5, [r9, #0] ; load source pixels c, row N+1
- ldr r7, [r9, #1] ; load source pixels d, row N+1
-
- ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
- ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
- mvn r7, r7
- uhsub8 r5, r5, r7
- eor r5, r5, r10
- ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
- mvn r5, r5
- uhsub8 r4, r4, r5
- ldr r5, [r2, #0] ; load 4 ref pixels
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
- ; calculate total sum
- adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r4, [r0, #4] ; load source pixels a, row N
- ldr r6, [r0, #5] ; load source pixels b, row N
- ldr r5, [r9, #4] ; load source pixels c, row N+1
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- ldr r7, [r9, #5] ; load source pixels d, row N+1
-
- ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
- ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
- mvn r7, r7
- uhsub8 r5, r5, r7
- eor r5, r5, r10
- ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
- mvn r5, r5
- uhsub8 r4, r4, r5
- ldr r5, [r2, #4] ; load 4 ref pixels
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r4, [r0, #8] ; load source pixels a, row N
- ldr r6, [r0, #9] ; load source pixels b, row N
- ldr r5, [r9, #8] ; load source pixels c, row N+1
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- ldr r7, [r9, #9] ; load source pixels d, row N+1
-
- ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
- ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
- mvn r7, r7
- uhsub8 r5, r5, r7
- eor r5, r5, r10
- ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
- mvn r5, r5
- uhsub8 r4, r4, r5
- ldr r5, [r2, #8] ; load 4 ref pixels
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r4, [r0, #12] ; load source pixels a, row N
- ldr r6, [r0, #13] ; load source pixels b, row N
- ldr r5, [r9, #12] ; load source pixels c, row N+1
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
- ldr r7, [r9, #13] ; load source pixels d, row N+1
-
- ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
- ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
- mvn r7, r7
- uhsub8 r5, r5, r7
- eor r5, r5, r10
- ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
- mvn r5, r5
- uhsub8 r4, r4, r5
- ldr r5, [r2, #12] ; load 4 ref pixels
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
- subs r12, r12, #1
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- bne loop
-
- ; return stuff
- ldr r6, [sp, #40] ; get address of sse
- mul r0, r8, r8 ; sum * sum
- str r11, [r6] ; store sse
- sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
-c80808080
- DCD 0x80808080
-
- END
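The horizontal-and-vertical routine above chains two of these half-pixel steps per output pixel: interpolate horizontally on row N and on row N+1, then interpolate vertically between the two results, each step rounding with (p + q + 1) >> 1. Reconstructed from the assembly comments, the per-pixel filter is simply:

#include <stdint.h>

/* Per-pixel core of the half-pixel h+v filter above (sketch only). */
static uint8_t halfpix_hv(uint8_t a, uint8_t b,   /* row N   */
                          uint8_t c, uint8_t d) { /* row N+1 */
  const int x = (a + b + 1) >> 1;      /* horizontal, row N   */
  const int y = (c + d + 1) >> 1;      /* horizontal, row N+1 */
  return (uint8_t)((x + y + 1) >> 1);  /* vertical            */
}

Note that this two-step rounding matches the assembly exactly; it can differ by one from a single (a + b + c + d + 2) >> 2 average.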
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ /dev/null
@@ -1,184 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_variance_halfpixvar16x16_v_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp8_variance_halfpixvar16x16_v_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r8, #0 ; initialize sum = 0
- ldr r10, c80808080
- mov r11, #0 ; initialize sse = 0
- mov r12, #16 ; set loop counter to 16 (=block height)
- mov lr, #0 ; constant zero
-loop
- add r9, r0, r1 ; set src pointer to next row
- ; 1st 4 pixels
- ldr r4, [r0, #0] ; load 4 src pixels
- ldr r6, [r9, #0] ; load 4 src pixels from next row
- ldr r5, [r2, #0] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
- ; calculate total sum
- adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r4, [r0, #4] ; load 4 src pixels
- ldr r6, [r9, #4] ; load 4 src pixels from next row
- ldr r5, [r2, #4] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r4, [r0, #8] ; load 4 src pixels
- ldr r6, [r9, #8] ; load 4 src pixels from next row
- ldr r5, [r2, #8] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r4, [r0, #12] ; load 4 src pixels
- ldr r6, [r9, #12] ; load 4 src pixels from next row
- ldr r5, [r2, #12] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
-
- subs r12, r12, #1
-
- bne loop
-
- ; return stuff
- ldr r6, [sp, #40] ; get address of sse
- mul r0, r8, r8 ; sum * sum
- str r11, [r6] ; store sse
- sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
-c80808080
- DCD 0x80808080
-
- END
-
--- a/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
+++ /dev/null
@@ -1,1017 +1,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
-
-static const uint8_t bilinear_taps_coeff[8][2] = {
- {128, 0},
- {112, 16},
- { 96, 32},
- { 80, 48},
- { 64, 64},
- { 48, 80},
- { 32, 96},
- { 16, 112}
-};
-
-unsigned int vp8_sub_pixel_variance16x16_neon_func(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse) {
- int i;
- DECLARE_ALIGNED(16, unsigned char, tmp[528]);
- unsigned char *tmpp;
- unsigned char *tmpp2;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
- uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
- uint8x8_t d19u8, d20u8, d21u8;
- int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64, d2s64, d3s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
- uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
- uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
- uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- tmpp2 = tmp + 272;
- tmpp = tmp;
- if (xoffset == 0) { // secondpass_bfilter16x16_only
- d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
- d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
-
- q11u8 = vld1q_u8(src_ptr);
- src_ptr += src_pixels_per_line;
- for (i = 4; i > 0; i--) {
- q12u8 = vld1q_u8(src_ptr);
- src_ptr += src_pixels_per_line;
- q13u8 = vld1q_u8(src_ptr);
- src_ptr += src_pixels_per_line;
- q14u8 = vld1q_u8(src_ptr);
- src_ptr += src_pixels_per_line;
- q15u8 = vld1q_u8(src_ptr);
- src_ptr += src_pixels_per_line;
-
- __builtin_prefetch(src_ptr);
- __builtin_prefetch(src_ptr + src_pixels_per_line);
- __builtin_prefetch(src_ptr + src_pixels_per_line * 2);
-
- q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
- q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
- q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
- q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
- q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
- q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
- q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
- q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
-
- q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
- q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
- q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
- q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
- q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
- q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
- q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
- q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
-
- d2u8 = vqrshrn_n_u16(q1u16, 7);
- d3u8 = vqrshrn_n_u16(q2u16, 7);
- d4u8 = vqrshrn_n_u16(q3u16, 7);
- d5u8 = vqrshrn_n_u16(q4u16, 7);
- d6u8 = vqrshrn_n_u16(q5u16, 7);
- d7u8 = vqrshrn_n_u16(q6u16, 7);
- d8u8 = vqrshrn_n_u16(q7u16, 7);
- d9u8 = vqrshrn_n_u16(q8u16, 7);
-
- q1u8 = vcombine_u8(d2u8, d3u8);
- q2u8 = vcombine_u8(d4u8, d5u8);
- q3u8 = vcombine_u8(d6u8, d7u8);
- q4u8 = vcombine_u8(d8u8, d9u8);
-
- q11u8 = q15u8;
-
- vst1q_u8((uint8_t *)tmpp2, q1u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q2u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q3u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q4u8);
- tmpp2 += 16;
- }
- } else if (yoffset == 0) { // firstpass_bfilter16x16_only
- d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
- d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);
-
- for (i = 4; i > 0 ; i--) {
- d2u8 = vld1_u8(src_ptr);
- d3u8 = vld1_u8(src_ptr + 8);
- d4u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d5u8 = vld1_u8(src_ptr);
- d6u8 = vld1_u8(src_ptr + 8);
- d7u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d8u8 = vld1_u8(src_ptr);
- d9u8 = vld1_u8(src_ptr + 8);
- d10u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d11u8 = vld1_u8(src_ptr);
- d12u8 = vld1_u8(src_ptr + 8);
- d13u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
-
- __builtin_prefetch(src_ptr);
- __builtin_prefetch(src_ptr + src_pixels_per_line);
- __builtin_prefetch(src_ptr + src_pixels_per_line * 2);
-
- q7u16 = vmull_u8(d2u8, d0u8);
- q8u16 = vmull_u8(d3u8, d0u8);
- q9u16 = vmull_u8(d5u8, d0u8);
- q10u16 = vmull_u8(d6u8, d0u8);
- q11u16 = vmull_u8(d8u8, d0u8);
- q12u16 = vmull_u8(d9u8, d0u8);
- q13u16 = vmull_u8(d11u8, d0u8);
- q14u16 = vmull_u8(d12u8, d0u8);
-
- d2u8 = vext_u8(d2u8, d3u8, 1);
- d5u8 = vext_u8(d5u8, d6u8, 1);
- d8u8 = vext_u8(d8u8, d9u8, 1);
- d11u8 = vext_u8(d11u8, d12u8, 1);
-
- q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
- q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
- q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
- q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
-
- d3u8 = vext_u8(d3u8, d4u8, 1);
- d6u8 = vext_u8(d6u8, d7u8, 1);
- d9u8 = vext_u8(d9u8, d10u8, 1);
- d12u8 = vext_u8(d12u8, d13u8, 1);
-
- q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
- q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
- q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
- q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
-
- d14u8 = vqrshrn_n_u16(q7u16, 7);
- d15u8 = vqrshrn_n_u16(q8u16, 7);
- d16u8 = vqrshrn_n_u16(q9u16, 7);
- d17u8 = vqrshrn_n_u16(q10u16, 7);
- d18u8 = vqrshrn_n_u16(q11u16, 7);
- d19u8 = vqrshrn_n_u16(q12u16, 7);
- d20u8 = vqrshrn_n_u16(q13u16, 7);
- d21u8 = vqrshrn_n_u16(q14u16, 7);
-
- q7u8 = vcombine_u8(d14u8, d15u8);
- q8u8 = vcombine_u8(d16u8, d17u8);
- q9u8 = vcombine_u8(d18u8, d19u8);
- q10u8 = vcombine_u8(d20u8, d21u8);
-
- vst1q_u8((uint8_t *)tmpp2, q7u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q8u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q9u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q10u8);
- tmpp2 += 16;
- }
- } else {
- d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
- d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);
-
- d2u8 = vld1_u8(src_ptr);
- d3u8 = vld1_u8(src_ptr + 8);
- d4u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d5u8 = vld1_u8(src_ptr);
- d6u8 = vld1_u8(src_ptr + 8);
- d7u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d8u8 = vld1_u8(src_ptr);
- d9u8 = vld1_u8(src_ptr + 8);
- d10u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d11u8 = vld1_u8(src_ptr);
- d12u8 = vld1_u8(src_ptr + 8);
- d13u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
-
- // First Pass: output_height lines x output_width columns (17x16)
- for (i = 3; i > 0; i--) {
- q7u16 = vmull_u8(d2u8, d0u8);
- q8u16 = vmull_u8(d3u8, d0u8);
- q9u16 = vmull_u8(d5u8, d0u8);
- q10u16 = vmull_u8(d6u8, d0u8);
- q11u16 = vmull_u8(d8u8, d0u8);
- q12u16 = vmull_u8(d9u8, d0u8);
- q13u16 = vmull_u8(d11u8, d0u8);
- q14u16 = vmull_u8(d12u8, d0u8);
-
- d2u8 = vext_u8(d2u8, d3u8, 1);
- d5u8 = vext_u8(d5u8, d6u8, 1);
- d8u8 = vext_u8(d8u8, d9u8, 1);
- d11u8 = vext_u8(d11u8, d12u8, 1);
-
- q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
- q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
- q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
- q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
-
- d3u8 = vext_u8(d3u8, d4u8, 1);
- d6u8 = vext_u8(d6u8, d7u8, 1);
- d9u8 = vext_u8(d9u8, d10u8, 1);
- d12u8 = vext_u8(d12u8, d13u8, 1);
-
- q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
- q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
- q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
- q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
-
- d14u8 = vqrshrn_n_u16(q7u16, 7);
- d15u8 = vqrshrn_n_u16(q8u16, 7);
- d16u8 = vqrshrn_n_u16(q9u16, 7);
- d17u8 = vqrshrn_n_u16(q10u16, 7);
- d18u8 = vqrshrn_n_u16(q11u16, 7);
- d19u8 = vqrshrn_n_u16(q12u16, 7);
- d20u8 = vqrshrn_n_u16(q13u16, 7);
- d21u8 = vqrshrn_n_u16(q14u16, 7);
-
- d2u8 = vld1_u8(src_ptr);
- d3u8 = vld1_u8(src_ptr + 8);
- d4u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d5u8 = vld1_u8(src_ptr);
- d6u8 = vld1_u8(src_ptr + 8);
- d7u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d8u8 = vld1_u8(src_ptr);
- d9u8 = vld1_u8(src_ptr + 8);
- d10u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d11u8 = vld1_u8(src_ptr);
- d12u8 = vld1_u8(src_ptr + 8);
- d13u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
-
- q7u8 = vcombine_u8(d14u8, d15u8);
- q8u8 = vcombine_u8(d16u8, d17u8);
- q9u8 = vcombine_u8(d18u8, d19u8);
- q10u8 = vcombine_u8(d20u8, d21u8);
-
- vst1q_u8((uint8_t *)tmpp, q7u8);
- tmpp += 16;
- vst1q_u8((uint8_t *)tmpp, q8u8);
- tmpp += 16;
- vst1q_u8((uint8_t *)tmpp, q9u8);
- tmpp += 16;
- vst1q_u8((uint8_t *)tmpp, q10u8);
- tmpp += 16;
- }
-
-    // First-pass filtering for the remaining 5 lines
- d14u8 = vld1_u8(src_ptr);
- d15u8 = vld1_u8(src_ptr + 8);
- d16u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
-
- q9u16 = vmull_u8(d2u8, d0u8);
- q10u16 = vmull_u8(d3u8, d0u8);
- q11u16 = vmull_u8(d5u8, d0u8);
- q12u16 = vmull_u8(d6u8, d0u8);
- q13u16 = vmull_u8(d8u8, d0u8);
- q14u16 = vmull_u8(d9u8, d0u8);
-
- d2u8 = vext_u8(d2u8, d3u8, 1);
- d5u8 = vext_u8(d5u8, d6u8, 1);
- d8u8 = vext_u8(d8u8, d9u8, 1);
-
- q9u16 = vmlal_u8(q9u16, d2u8, d1u8);
- q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
- q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
-
- d3u8 = vext_u8(d3u8, d4u8, 1);
- d6u8 = vext_u8(d6u8, d7u8, 1);
- d9u8 = vext_u8(d9u8, d10u8, 1);
-
- q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
- q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
- q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
-
- q1u16 = vmull_u8(d11u8, d0u8);
- q2u16 = vmull_u8(d12u8, d0u8);
- q3u16 = vmull_u8(d14u8, d0u8);
- q4u16 = vmull_u8(d15u8, d0u8);
-
- d11u8 = vext_u8(d11u8, d12u8, 1);
- d14u8 = vext_u8(d14u8, d15u8, 1);
-
- q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
- q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
-
- d12u8 = vext_u8(d12u8, d13u8, 1);
- d15u8 = vext_u8(d15u8, d16u8, 1);
-
- q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
- q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
-
- d10u8 = vqrshrn_n_u16(q9u16, 7);
- d11u8 = vqrshrn_n_u16(q10u16, 7);
- d12u8 = vqrshrn_n_u16(q11u16, 7);
- d13u8 = vqrshrn_n_u16(q12u16, 7);
- d14u8 = vqrshrn_n_u16(q13u16, 7);
- d15u8 = vqrshrn_n_u16(q14u16, 7);
- d16u8 = vqrshrn_n_u16(q1u16, 7);
- d17u8 = vqrshrn_n_u16(q2u16, 7);
- d18u8 = vqrshrn_n_u16(q3u16, 7);
- d19u8 = vqrshrn_n_u16(q4u16, 7);
-
- q5u8 = vcombine_u8(d10u8, d11u8);
- q6u8 = vcombine_u8(d12u8, d13u8);
- q7u8 = vcombine_u8(d14u8, d15u8);
- q8u8 = vcombine_u8(d16u8, d17u8);
- q9u8 = vcombine_u8(d18u8, d19u8);
-
- vst1q_u8((uint8_t *)tmpp, q5u8);
- tmpp += 16;
- vst1q_u8((uint8_t *)tmpp, q6u8);
- tmpp += 16;
- vst1q_u8((uint8_t *)tmpp, q7u8);
- tmpp += 16;
- vst1q_u8((uint8_t *)tmpp, q8u8);
- tmpp += 16;
- vst1q_u8((uint8_t *)tmpp, q9u8);
-
- // secondpass_filter
- d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
- d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
-
- tmpp = tmp;
- tmpp2 = tmpp + 272;
- q11u8 = vld1q_u8(tmpp);
- tmpp += 16;
- for (i = 4; i > 0; i--) {
- q12u8 = vld1q_u8(tmpp);
- tmpp += 16;
- q13u8 = vld1q_u8(tmpp);
- tmpp += 16;
- q14u8 = vld1q_u8(tmpp);
- tmpp += 16;
- q15u8 = vld1q_u8(tmpp);
- tmpp += 16;
-
- q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
- q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
- q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
- q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
- q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
- q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
- q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
- q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
-
- q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
- q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
- q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
- q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
- q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
- q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
- q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
- q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
-
- d2u8 = vqrshrn_n_u16(q1u16, 7);
- d3u8 = vqrshrn_n_u16(q2u16, 7);
- d4u8 = vqrshrn_n_u16(q3u16, 7);
- d5u8 = vqrshrn_n_u16(q4u16, 7);
- d6u8 = vqrshrn_n_u16(q5u16, 7);
- d7u8 = vqrshrn_n_u16(q6u16, 7);
- d8u8 = vqrshrn_n_u16(q7u16, 7);
- d9u8 = vqrshrn_n_u16(q8u16, 7);
-
- q1u8 = vcombine_u8(d2u8, d3u8);
- q2u8 = vcombine_u8(d4u8, d5u8);
- q3u8 = vcombine_u8(d6u8, d7u8);
- q4u8 = vcombine_u8(d8u8, d9u8);
-
- q11u8 = q15u8;
-
- vst1q_u8((uint8_t *)tmpp2, q1u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q2u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q3u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q4u8);
- tmpp2 += 16;
- }
- }
-
- // sub_pixel_variance16x16_neon
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- tmpp = tmp + 272;
- for (i = 0; i < 8; i++) { // sub_pixel_variance16x16_neon_loop
- q0u8 = vld1q_u8(tmpp);
- tmpp += 16;
- q1u8 = vld1q_u8(tmpp);
- tmpp += 16;
- q2u8 = vld1q_u8(dst_ptr);
- dst_ptr += dst_pixels_per_line;
- q3u8 = vld1q_u8(dst_ptr);
- dst_ptr += dst_pixels_per_line;
-
- d0u8 = vget_low_u8(q0u8);
- d1u8 = vget_high_u8(q0u8);
- d2u8 = vget_low_u8(q1u8);
- d3u8 = vget_high_u8(q1u8);
-
- q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8));
- q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8));
- q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8));
- q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8));
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
- q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
- q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
- d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
- d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
- q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
- q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
- }
-
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
-
- d0s64 = vget_low_s64(q0s64);
- d1s64 = vget_high_s64(q0s64);
- d2s64 = vget_low_s64(q1s64);
- d3s64 = vget_high_s64(q1s64);
- d0s64 = vadd_s64(d0s64, d1s64);
- d1s64 = vadd_s64(d2s64, d3s64);
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
- vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
-}
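Each of the removed 16x16 kernels above ends with the same reduction: the per-lane difference sums are folded into a scalar sum and SSE, and the return value is SSE minus sum squared over the pixel count. Because the block holds 16 * 16 = 256 pixels, that division shows up as the right shift by 8 in vshr_n_u32. A minimal scalar sketch of that final step, with an illustrative name that is not part of the patch:

    #include <stdint.h>

    /* Final reduction used by the removed 16x16 kernels:
     * variance = SSE - sum^2 / (16 * 16), and 16 * 16 == 256 == 1 << 8. */
    static unsigned int variance_from_sums_16x16(int32_t sum, uint32_t sse) {
      return sse - (uint32_t)(((int64_t)sum * sum) >> 8);
    }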
-
-unsigned int vp8_variance_halfpixvar16x16_h_neon(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- int i;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64, d2s64, d3s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8;
- uint8x16_t q7u8, q11u8, q12u8, q13u8, q14u8;
- uint16x8_t q0u16, q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon
- q0u8 = vld1q_u8(src_ptr);
- q1u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
- q2u8 = vld1q_u8(src_ptr);
- q3u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
- q4u8 = vld1q_u8(src_ptr);
- q5u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
- q6u8 = vld1q_u8(src_ptr);
- q7u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
-
- q11u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q12u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q13u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q14u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
-
- q1u8 = vextq_u8(q0u8, q1u8, 1);
- q3u8 = vextq_u8(q2u8, q3u8, 1);
- q5u8 = vextq_u8(q4u8, q5u8, 1);
- q7u8 = vextq_u8(q6u8, q7u8, 1);
-
- q0u8 = vrhaddq_u8(q0u8, q1u8);
- q1u8 = vrhaddq_u8(q2u8, q3u8);
- q2u8 = vrhaddq_u8(q4u8, q5u8);
- q3u8 = vrhaddq_u8(q6u8, q7u8);
-
- d0u8 = vget_low_u8(q0u8);
- d1u8 = vget_high_u8(q0u8);
- d2u8 = vget_low_u8(q1u8);
- d3u8 = vget_high_u8(q1u8);
- d4u8 = vget_low_u8(q2u8);
- d5u8 = vget_high_u8(q2u8);
- d6u8 = vget_low_u8(q3u8);
- d7u8 = vget_high_u8(q3u8);
-
- q4u16 = vsubl_u8(d0u8, vget_low_u8(q11u8));
- q5u16 = vsubl_u8(d1u8, vget_high_u8(q11u8));
- q6u16 = vsubl_u8(d2u8, vget_low_u8(q12u8));
- q7u16 = vsubl_u8(d3u8, vget_high_u8(q12u8));
- q0u16 = vsubl_u8(d4u8, vget_low_u8(q13u8));
- q1u16 = vsubl_u8(d5u8, vget_high_u8(q13u8));
- q2u16 = vsubl_u8(d6u8, vget_low_u8(q14u8));
- q3u16 = vsubl_u8(d7u8, vget_high_u8(q14u8));
-
- d8s16 = vreinterpret_s16_u16(vget_low_u16(q4u16));
- d9s16 = vreinterpret_s16_u16(vget_high_u16(q4u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q4u16));
- q9s32 = vmlal_s16(q9s32, d8s16, d8s16);
- q10s32 = vmlal_s16(q10s32, d9s16, d9s16);
- d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
- d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q5u16));
- q9s32 = vmlal_s16(q9s32, d10s16, d10s16);
- q10s32 = vmlal_s16(q10s32, d11s16, d11s16);
- d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
- d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q6u16));
- q9s32 = vmlal_s16(q9s32, d12s16, d12s16);
- q10s32 = vmlal_s16(q10s32, d13s16, d13s16);
- d14s16 = vreinterpret_s16_u16(vget_low_u16(q7u16));
- d15s16 = vreinterpret_s16_u16(vget_high_u16(q7u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q7u16));
- q9s32 = vmlal_s16(q9s32, d14s16, d14s16);
- q10s32 = vmlal_s16(q10s32, d15s16, d15s16);
- d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
- d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
- q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
- q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
- d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
- d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
- q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
- q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
- d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
- d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
- q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
- q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
- d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
- d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
- q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
- q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
- }
-
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
-
- d0s64 = vget_low_s64(q0s64);
- d1s64 = vget_high_s64(q0s64);
- d2s64 = vget_low_s64(q1s64);
- d3s64 = vget_high_s64(q1s64);
- d0s64 = vadd_s64(d0s64, d1s64);
- d1s64 = vadd_s64(d2s64, d3s64);
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
- vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_v_neon(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- int i;
- uint8x8_t d0u8, d1u8, d4u8, d5u8, d8u8, d9u8, d12u8, d13u8;
- int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
- int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64, d2s64, d3s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q15u8;
- uint16x8_t q0u16, q1u16, q2u16, q3u16, q11u16, q12u16, q13u16, q14u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- q0u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon
- q2u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q4u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q6u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q15u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
-
- q1u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q3u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q5u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q7u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
-
- q0u8 = vrhaddq_u8(q0u8, q2u8);
- q2u8 = vrhaddq_u8(q2u8, q4u8);
- q4u8 = vrhaddq_u8(q4u8, q6u8);
- q6u8 = vrhaddq_u8(q6u8, q15u8);
-
- d0u8 = vget_low_u8(q0u8);
- d1u8 = vget_high_u8(q0u8);
- d4u8 = vget_low_u8(q2u8);
- d5u8 = vget_high_u8(q2u8);
- d8u8 = vget_low_u8(q4u8);
- d9u8 = vget_high_u8(q4u8);
- d12u8 = vget_low_u8(q6u8);
- d13u8 = vget_high_u8(q6u8);
-
- q11u16 = vsubl_u8(d0u8, vget_low_u8(q1u8));
- q12u16 = vsubl_u8(d1u8, vget_high_u8(q1u8));
- q13u16 = vsubl_u8(d4u8, vget_low_u8(q3u8));
- q14u16 = vsubl_u8(d5u8, vget_high_u8(q3u8));
- q0u16 = vsubl_u8(d8u8, vget_low_u8(q5u8));
- q1u16 = vsubl_u8(d9u8, vget_high_u8(q5u8));
- q2u16 = vsubl_u8(d12u8, vget_low_u8(q7u8));
- q3u16 = vsubl_u8(d13u8, vget_high_u8(q7u8));
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
- q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
- q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
- d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
- d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
- q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
- q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
- d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
- d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
- q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
- q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
- d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
- d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
- q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
- q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
- d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
- d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
- q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
- q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
- d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
- d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
- q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
- q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
-
- q0u8 = q15u8;
- }
-
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
-
- d0s64 = vget_low_s64(q0s64);
- d1s64 = vget_high_s64(q0s64);
- d2s64 = vget_low_s64(q1s64);
- d3s64 = vget_high_s64(q1s64);
- d0s64 = vadd_s64(d0s64, d1s64);
- d1s64 = vadd_s64(d2s64, d3s64);
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
- vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_hv_neon(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- int i;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16;
- int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64, d2s64, d3s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
- uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16;
- int32x4_t q13s32, q14s32, q15s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q13s32 = vdupq_n_s32(0);
- q14s32 = vdupq_n_s32(0);
- q15s32 = vdupq_n_s32(0);
-
- q0u8 = vld1q_u8(src_ptr);
- q1u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
- q1u8 = vextq_u8(q0u8, q1u8, 1);
- q0u8 = vrhaddq_u8(q0u8, q1u8);
- for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon
- q2u8 = vld1q_u8(src_ptr);
- q3u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
- q4u8 = vld1q_u8(src_ptr);
- q5u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
- q6u8 = vld1q_u8(src_ptr);
- q7u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
- q8u8 = vld1q_u8(src_ptr);
- q9u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
-
- q3u8 = vextq_u8(q2u8, q3u8, 1);
- q5u8 = vextq_u8(q4u8, q5u8, 1);
- q7u8 = vextq_u8(q6u8, q7u8, 1);
- q9u8 = vextq_u8(q8u8, q9u8, 1);
-
- q1u8 = vrhaddq_u8(q2u8, q3u8);
- q2u8 = vrhaddq_u8(q4u8, q5u8);
- q3u8 = vrhaddq_u8(q6u8, q7u8);
- q4u8 = vrhaddq_u8(q8u8, q9u8);
- q0u8 = vrhaddq_u8(q0u8, q1u8);
- q1u8 = vrhaddq_u8(q1u8, q2u8);
- q2u8 = vrhaddq_u8(q2u8, q3u8);
- q3u8 = vrhaddq_u8(q3u8, q4u8);
-
- q5u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q6u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q7u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q8u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
-
- d0u8 = vget_low_u8(q0u8);
- d1u8 = vget_high_u8(q0u8);
- d2u8 = vget_low_u8(q1u8);
- d3u8 = vget_high_u8(q1u8);
- d4u8 = vget_low_u8(q2u8);
- d5u8 = vget_high_u8(q2u8);
- d6u8 = vget_low_u8(q3u8);
- d7u8 = vget_high_u8(q3u8);
-
- q9u16 = vsubl_u8(d0u8, vget_low_u8(q5u8));
- q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8));
- q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8));
- q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8));
- q0u16 = vsubl_u8(d4u8, vget_low_u8(q7u8));
- q1u16 = vsubl_u8(d5u8, vget_high_u8(q7u8));
- q5u16 = vsubl_u8(d6u8, vget_low_u8(q8u8));
- q6u16 = vsubl_u8(d7u8, vget_high_u8(q8u8));
-
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16));
- q14s32 = vmlal_s16(q14s32, d18s16, d18s16);
- q15s32 = vmlal_s16(q15s32, d19s16, d19s16);
-
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
- q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16));
- q14s32 = vmlal_s16(q14s32, d20s16, d20s16);
- q15s32 = vmlal_s16(q15s32, d21s16, d21s16);
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16));
- q14s32 = vmlal_s16(q14s32, d22s16, d22s16);
- q15s32 = vmlal_s16(q15s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16));
- q14s32 = vmlal_s16(q14s32, d24s16, d24s16);
- q15s32 = vmlal_s16(q15s32, d25s16, d25s16);
-
- d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
- d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
- q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16));
- q14s32 = vmlal_s16(q14s32, d0s16, d0s16);
- q15s32 = vmlal_s16(q15s32, d1s16, d1s16);
-
- d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
- d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
- q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16));
- q14s32 = vmlal_s16(q14s32, d2s16, d2s16);
- q15s32 = vmlal_s16(q15s32, d3s16, d3s16);
-
- d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
- d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
- q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16));
- q14s32 = vmlal_s16(q14s32, d10s16, d10s16);
- q15s32 = vmlal_s16(q15s32, d11s16, d11s16);
-
- d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
- d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
- q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16));
- q14s32 = vmlal_s16(q14s32, d12s16, d12s16);
- q15s32 = vmlal_s16(q15s32, d13s16, d13s16);
-
- q0u8 = q4u8;
- }
-
- q15s32 = vaddq_s32(q14s32, q15s32);
- q0s64 = vpaddlq_s32(q13s32);
- q1s64 = vpaddlq_s32(q15s32);
-
- d0s64 = vget_low_s64(q0s64);
- d1s64 = vget_high_s64(q0s64);
- d2s64 = vget_low_s64(q1s64);
- d3s64 = vget_high_s64(q1s64);
- d0s64 = vadd_s64(d0s64, d1s64);
- d1s64 = vadd_s64(d2s64, d3s64);
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
- vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
-}
-
-#define FILTER_BITS 7
-
-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
- const int32x4_t a = vpaddlq_s16(v_16x8);
- const int64x2_t b = vpaddlq_s32(a);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- return vget_lane_s32(c, 0);
-}
-
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
- const int64x2_t b = vpaddlq_s32(v_32x4);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- return vget_lane_s32(c, 0);
-}
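The two helpers above collapse a vector of partial sums into a single scalar through pairwise widening adds. A scalar equivalent of the 16-bit variant, shown only to make the lane arithmetic explicit (the name is illustrative):

    #include <stdint.h>

    /* Scalar counterpart of horizontal_add_s16x8(): add all eight int16 lanes. */
    static int horizontal_add_s16x8_ref(const int16_t v[8]) {
      int sum = 0;
      for (int i = 0; i < 8; ++i) sum += v[i];
      return sum;
    }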
-
-static void variance_neon_w8(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
- int i, j;
- int16x8_t v_sum = vdupq_n_s16(0);
- int32x4_t v_sse_lo = vdupq_n_s32(0);
- int32x4_t v_sse_hi = vdupq_n_s32(0);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- const uint8x8_t v_a = vld1_u8(&a[j]);
- const uint8x8_t v_b = vld1_u8(&b[j]);
- const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
- const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
- v_sum = vaddq_s16(v_sum, sv_diff);
- v_sse_lo = vmlal_s16(v_sse_lo,
- vget_low_s16(sv_diff),
- vget_low_s16(sv_diff));
- v_sse_hi = vmlal_s16(v_sse_hi,
- vget_high_s16(sv_diff),
- vget_high_s16(sv_diff));
- }
- a += a_stride;
- b += b_stride;
- }
-
- *sum = horizontal_add_s16x8(v_sum);
- *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
-}
-
-static unsigned int variance8x8_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
- return *sse - (((int64_t)sum * sum) / (8 * 8));
-}
-
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *vpx_filter) {
- const uint8x8_t f0 = vmov_n_u8(vpx_filter[0]);
- const uint8x8_t f1 = vmov_n_u8(vpx_filter[1]);
- unsigned int i;
- for (i = 0; i < output_height; ++i) {
- const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
- const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
- const uint16x8_t a = vmull_u8(src_0, f0);
- const uint16x8_t b = vmlal_u8(a, src_1, f1);
- const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
- vst1_u8(&output_ptr[0], out);
- // Next row...
- src_ptr += src_pixels_per_line;
- output_ptr += output_width;
- }
-}
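var_filter_block2d_bil_w8() above applies a 2-tap bilinear filter with round-to-nearest at FILTER_BITS precision; pixel_step selects horizontal filtering (1) or vertical filtering (the stride). A sketch of the per-pixel arithmetic, assuming taps that sum to 1 << FILTER_BITS (128) as the VP8/vpx bilinear tables do (the helper name is illustrative):

    #include <stdint.h>

    /* out = (f0 * p0 + f1 * p1 + 64) >> 7; the rounding matches
     * vrshrn_n_u16(..., FILTER_BITS) in the NEON loop above, and the
     * result fits in 8 bits when f0 + f1 == 128. */
    static uint8_t bilinear_tap_ref(uint8_t p0, uint8_t p1, int f0, int f1) {
      return (uint8_t)((f0 * p0 + f1 * p1 + (1 << 6)) >> 7);
    }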
-
-unsigned int vp8_sub_pixel_variance8x8_neon(
- const unsigned char *src,
- int src_stride,
- int xoffset,
- int yoffset,
- const unsigned char *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[9 * 8]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
- if (xoffset == 0) {
- var_filter_block2d_bil_w8(src, temp2, src_stride, 8, 8,
- 8, bilinear_taps_coeff[yoffset]);
- } else if (yoffset == 0) {
- var_filter_block2d_bil_w8(src, temp2, src_stride, 1,
- 9, 8,
- bilinear_taps_coeff[xoffset]);
- } else {
- var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
- 9, 8,
- bilinear_taps_coeff[xoffset]);
- var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
- 8, bilinear_taps_coeff[yoffset]);
- }
- return variance8x8_neon(temp2, 8, dst, dst_stride, sse);
-}
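On the buffer sizes above: the 9 * 8 scratch arrays cover the two-pass case, where the horizontal pass must produce output_height + 1 = 9 rows so that the vertical pass has a row below each of the 8 output rows. The same arithmetic explains the removed 16x16 NEON kernel earlier in this file: its first pass emits 17 rows of 16 pixels (the "17x16" comment), and 17 * 16 = 272 is exactly the tmp + 272 offset used for its second-pass output area.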
--- a/vp8/common/arm/variance_arm.c
+++ /dev/null
@@ -1,137 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "./vp8_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vp8/common/variance.h"
-#include "vp8/common/filter.h"
-
-// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder
-#if CONFIG_VP8_ENCODER
-
-#if HAVE_MEDIA
-#include "vp8/common/arm/bilinearfilter_arm.h"
-
-unsigned int vp8_sub_pixel_variance8x8_armv6
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- unsigned short first_pass[10*8];
- unsigned char second_pass[8*8];
- const short *HFilter, *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
- src_pixels_per_line,
- 9, 8, HFilter);
- vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
- 8, 8, 8, VFilter);
-
- return vpx_variance8x8_media(second_pass, 8, dst_ptr,
- dst_pixels_per_line, sse);
-}
-
-unsigned int vp8_sub_pixel_variance16x16_armv6
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- unsigned short first_pass[36*16];
- unsigned char second_pass[20*16];
- const short *HFilter, *VFilter;
- unsigned int var;
-
- if (xoffset == 4 && yoffset == 0)
- {
- var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, sse);
- }
- else if (xoffset == 0 && yoffset == 4)
- {
- var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, sse);
- }
- else if (xoffset == 4 && yoffset == 4)
- {
- var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, sse);
- }
- else
- {
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
- src_pixels_per_line,
- 17, 16, HFilter);
- vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
- 16, 16, 16, VFilter);
-
- var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
- dst_pixels_per_line, sse);
- }
- return var;
-}
-
-#endif // HAVE_MEDIA
-
-
-#if HAVE_NEON
-
-extern unsigned int vp8_sub_pixel_variance16x16_neon_func
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-);
-
-unsigned int vp8_sub_pixel_variance16x16_neon
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- if (xoffset == 4 && yoffset == 0)
- return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
- else if (xoffset == 0 && yoffset == 4)
- return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
- else if (xoffset == 4 && yoffset == 4)
- return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
- else
- return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
-}
-
-#endif // HAVE_NEON
-#endif // CONFIG_VP8_ENCODER
--- a/vp8/common/mfqe.c
+++ b/vp8/common/mfqe.c
@@ -20,7 +20,7 @@
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp8/common/postproc.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/yv12config.h"
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -238,47 +238,6 @@
$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
#
-# Sub-pixel Variance
-#
-add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance4x4 mmx sse2/;
-$vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt;
-
-add_proto qw/unsigned int vp8_sub_pixel_variance8x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance8x8 mmx sse2 media/;
-$vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt;
-$vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6;
-
-add_proto qw/unsigned int vp8_sub_pixel_variance8x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance8x16 mmx sse2/;
-$vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt;
-
-add_proto qw/unsigned int vp8_sub_pixel_variance16x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance16x8 mmx sse2 ssse3/;
-$vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt;
-
-add_proto qw/unsigned int vp8_sub_pixel_variance16x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon_asm/;
-$vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt;
-$vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6;
-$vp8_sub_pixel_variance16x16_neon_asm=vp8_sub_pixel_variance16x16_neon;
-
-add_proto qw/unsigned int vp8_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon/;
-$vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt;
-$vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6;
-
-add_proto qw/unsigned int vp8_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon/;
-$vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt;
-$vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6;
-
-add_proto qw/unsigned int vp8_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon/;
-$vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt;
-$vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
-
-#
# Encoder functions below this point.
#
if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {
--- a/vp8/common/variance.h
+++ /dev/null
@@ -1,92 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP8_COMMON_VARIANCE_H_
-#define VP8_COMMON_VARIANCE_H_
-
-#include "vpx_config.h"
-
-#include "vpx/vpx_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef unsigned int(*vpx_sad_fn_t)(
- const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int ref_stride);
-
-typedef void (*vp8_copy32xn_fn_t)(
- const unsigned char *src_ptr,
- int source_stride,
- unsigned char *ref_ptr,
- int ref_stride,
- int n);
-
-typedef void (*vpx_sad_multi_fn_t)(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_array,
- int ref_stride,
- unsigned int *sad_array);
-
-typedef void (*vpx_sad_multi_d_fn_t)
- (
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char * const ref_array[],
- int ref_stride,
- unsigned int *sad_array
- );
-
-typedef unsigned int (*vpx_variance_fn_t)
- (
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned int *sse
- );
-
-typedef unsigned int (*vp8_subpixvariance_fn_t)
- (
- const unsigned char *src_ptr,
- int source_stride,
- int xoffset,
- int yoffset,
- const unsigned char *ref_ptr,
- int Refstride,
- unsigned int *sse
- );
-
-typedef struct variance_vtable
-{
- vpx_sad_fn_t sdf;
- vpx_variance_fn_t vf;
- vp8_subpixvariance_fn_t svf;
- vpx_variance_fn_t svf_halfpix_h;
- vpx_variance_fn_t svf_halfpix_v;
- vpx_variance_fn_t svf_halfpix_hv;
- vpx_sad_multi_fn_t sdx3f;
- vpx_sad_multi_fn_t sdx8f;
- vpx_sad_multi_d_fn_t sdx4df;
-#if ARCH_X86 || ARCH_X86_64
- vp8_copy32xn_fn_t copymem;
-#endif
-} vp8_variance_fn_ptr_t;
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VP8_COMMON_VARIANCE_H_
--- a/vp8/common/variance_c.c
+++ /dev/null
@@ -1,337 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp8_rtcd.h"
-#include "filter.h"
-#include "variance.h"
-
-/* This is a bad idea.
- * ctz = count trailing zeros */
-static int ctz(int a) {
- int b = 0;
- while (a != 1) {
- a >>= 1;
- b++;
- }
- return b;
-}
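For the power-of-two block dimensions used here, ctz(n) equals log2(n), so the shift in variance() below is log2(w) + log2(h) = log2(w * h). For a 16x16 block, ctz(16) + ctz(16) = 4 + 4 = 8, and sum * sum >> 8 is sum^2 / 256, which is the same reduction the removed SIMD kernels perform with an explicit shift by 8.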
-
-static unsigned int variance(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- int w,
- int h,
- unsigned int *sse)
-{
- int i, j;
- int diff, sum;
-
- sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; i++)
- {
- for (j = 0; j < w; j++)
- {
- diff = src_ptr[j] - ref_ptr[j];
- sum += diff;
- *sse += diff * diff;
- }
-
- src_ptr += source_stride;
- ref_ptr += recon_stride;
- }
-
- return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h)))));
-}
-
-/****************************************************************************
- *
- * ROUTINE : filter_block2d_bil_first_pass
- *
- * INPUTS : UINT8 *src_ptr : Pointer to source block.
- * UINT32 src_pixels_per_line : Stride of input block.
- * UINT32 pixel_step : Offset between filter input samples (see notes).
- * UINT32 output_height : Input block height.
- * UINT32 output_width : Input block width.
- * INT32 *vp8_filter : Array of 2 bi-linear filter taps.
- *
- * OUTPUTS : INT32 *output_ptr : Pointer to filtered block.
- *
- * RETURNS : void
- *
- * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
- * either horizontal or vertical direction to produce the
- * filtered output block. Used to implement first-pass
- * of 2-D separable filter.
- *
- * SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
- * Two filter taps should sum to VP8_FILTER_WEIGHT.
- * pixel_step defines whether the filter is applied
- * horizontally (pixel_step=1) or vertically (pixel_step=stride).
- * It defines the offset required to move from one input
- * to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_first_pass
-(
- const unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp8_filter
-)
-{
- unsigned int i, j;
-
- for (i = 0; i < output_height; i++)
- {
- for (j = 0; j < output_width; j++)
- {
- /* Apply bilinear filter */
- output_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
- ((int)src_ptr[pixel_step] * vp8_filter[1]) +
- (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
- src_ptr++;
- }
-
- /* Next row... */
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-/****************************************************************************
- *
- * ROUTINE : filter_block2d_bil_second_pass
- *
- * INPUTS : INT32 *src_ptr : Pointer to source block.
- * UINT32 src_pixels_per_line : Stride of input block.
- * UINT32 pixel_step : Offset between filter input samples (see notes).
- * UINT32 output_height : Input block height.
- * UINT32 output_width : Input block width.
- * INT32 *vp8_filter : Array of 2 bi-linear filter taps.
- *
- * OUTPUTS : UINT16 *output_ptr : Pointer to filtered block.
- *
- * RETURNS : void
- *
- * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
- * either horizontal or vertical direction to produce the
- * filtered output block. Used to implement second-pass
- * of 2-D separable filter.
- *
- * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
- * Two filter taps should sum to VP8_FILTER_WEIGHT.
- * pixel_step defines whether the filter is applied
- * horizontally (pixel_step=1) or vertically (pixel_step=stride).
- * It defines the offset required to move from one input
- * to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_second_pass
-(
- const unsigned short *src_ptr,
- unsigned char *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp8_filter
-)
-{
- unsigned int i, j;
- int Temp;
-
- for (i = 0; i < output_height; i++)
- {
- for (j = 0; j < output_width; j++)
- {
- /* Apply filter */
- Temp = ((int)src_ptr[0] * vp8_filter[0]) +
- ((int)src_ptr[pixel_step] * vp8_filter[1]) +
- (VP8_FILTER_WEIGHT / 2);
- output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
- src_ptr++;
- }
-
- /* Next row... */
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
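A worked example of the tap arithmetic, assuming the standard VP8 bilinear table in filter.h in which the two taps for any offset sum to VP8_FILTER_WEIGHT (128): at the half-pel offset 4 used by the vp8_variance_halfpixvar16x16_* wrappers below, the taps are {64, 64}, so each output is (64*a + 64*b + 64) >> 7, i.e. the rounded average of the two neighbouring pixels. That is why the removed half-pixel NEON and SSE2 kernels could use vrhaddq_u8 and pavgb instead of a general multiply-and-shift.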
-
-
-unsigned int vp8_sub_pixel_variance4x4_c
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- unsigned char temp2[20*16];
- const short *HFilter, *VFilter;
-    unsigned short FData3[5*4]; /* Temp data buffer used in filtering */
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- /* First filter 1d Horizontal */
- var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
-
-    /* Now filter vertically */
- var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);
-
- return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse);
-}
-
-
-unsigned int vp8_sub_pixel_variance8x8_c
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
-    unsigned short FData3[9*8]; /* Temp data buffer used in filtering */
- unsigned char temp2[20*16];
- const short *HFilter, *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
- var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
-
- return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse);
-}
-
-unsigned int vp8_sub_pixel_variance16x16_c
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
-    unsigned short FData3[17*16]; /* Temp data buffer used in filtering */
- unsigned char temp2[20*16];
- const short *HFilter, *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
- var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
-
- return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 0,
- ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 4,
- ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 4,
- ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_sub_pixel_variance16x8_c
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
-    unsigned short FData3[16*9]; /* Temp data buffer used in filtering */
- unsigned char temp2[20*16];
- const short *HFilter, *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
- var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
-
- return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse);
-}
-
-unsigned int vp8_sub_pixel_variance8x16_c
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
-    unsigned short FData3[9*16]; /* Temp data buffer used in filtering */
- unsigned char temp2[20*16];
- const short *HFilter, *VFilter;
-
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
-
- var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
- var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
-
- return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse);
-}
--- a/vp8/common/x86/variance_impl_sse2.asm
+++ /dev/null
@@ -1,972 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift 7
-
-;void vp8_filter_block2d_bil_var_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int xoffset,
-; int yoffset,
-; int *sum,
-; unsigned int *sumsquared;;
-;
-;)
-global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
-sym(vp8_filter_block2d_bil_var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- pxor xmm6, xmm6 ;
- pxor xmm7, xmm7 ;
-
- lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
- movdqa xmm4, XMMWORD PTR [rsi]
-
- lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)]
- movsxd rax, dword ptr arg(5) ; xoffset
-
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je filter_block2d_bil_var_sse2_sp_only
-
- shl rax, 5 ; point to filter coeff with xoffset
- lea rax, [rax + rcx] ; HFilter
-
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip second_pass filter if yoffset=0
- je filter_block2d_bil_var_sse2_fp_only
-
- shl rdx, 5
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
-
- pxor xmm0, xmm0 ;
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm3, QWORD PTR [rsi+1] ;
-
- punpcklbw xmm1, xmm0 ;
- pmullw xmm1, [rax] ;
- punpcklbw xmm3, xmm0
- pmullw xmm3, [rax+16] ;
-
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
- movdqa xmm5, xmm1
-
- movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
- lea rsi, [rsi + rbx]
-%if ABI_IS_32BIT=0
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-filter_block2d_bil_var_sse2_loop:
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm3, QWORD PTR [rsi+1] ;
-
- punpcklbw xmm1, xmm0 ;
- pmullw xmm1, [rax] ;
- punpcklbw xmm3, xmm0 ;
- pmullw xmm3, [rax+16] ;
-
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movdqa xmm3, xmm5 ;
- movdqa xmm5, xmm1 ;
-
- pmullw xmm3, [rdx] ;
- pmullw xmm1, [rdx+16] ;
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movq xmm3, QWORD PTR [rdi] ;
- punpcklbw xmm3, xmm0 ;
-
- psubw xmm1, xmm3 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
-
- lea rsi, [rsi + rbx] ;ref_pixels_per_line
-%if ABI_IS_32BIT
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_var_sse2_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_sp_only:
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
- je filter_block2d_bil_var_sse2_full_pixel
-
- shl rdx, 5
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0 ;
- movq xmm1, QWORD PTR [rsi] ;
- punpcklbw xmm1, xmm0 ;
-
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
- lea rsi, [rsi + rax]
-
-filter_block2d_bil_sp_only_loop:
- movq xmm3, QWORD PTR [rsi] ;
- punpcklbw xmm3, xmm0 ;
- movdqa xmm5, xmm3
-
- pmullw xmm1, [rdx] ;
- pmullw xmm3, [rdx+16] ;
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movq xmm3, QWORD PTR [rdi] ;
- punpcklbw xmm3, xmm0 ;
-
- psubw xmm1, xmm3 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
-
- movdqa xmm1, xmm5 ;
- lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rbx] ;src_pixels_per_line
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_sp_only_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_full_pixel:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
- pxor xmm0, xmm0 ;
-
-filter_block2d_bil_full_pixel_loop:
- movq xmm1, QWORD PTR [rsi] ;
- punpcklbw xmm1, xmm0 ;
-
- movq xmm2, QWORD PTR [rdi] ;
- punpcklbw xmm2, xmm0 ;
-
- psubw xmm1, xmm2 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
-
- lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rbx] ;src_pixels_per_line
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_full_pixel_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_fp_only:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0 ;
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
-
-filter_block2d_bil_fp_only_loop:
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm3, QWORD PTR [rsi+1] ;
-
- punpcklbw xmm1, xmm0 ;
- pmullw xmm1, [rax] ;
- punpcklbw xmm3, xmm0 ;
- pmullw xmm3, [rax+16] ;
-
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movq xmm3, QWORD PTR [rdi] ;
- punpcklbw xmm3, xmm0 ;
-
- psubw xmm1, xmm3 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
- lea rsi, [rsi + rdx]
- lea rdi, [rdi + rbx] ;src_pixels_per_line
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_fp_only_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_variance:
- movdq2q mm6, xmm6 ;
- movdq2q mm7, xmm7 ;
-
- psrldq xmm6, 8
- psrldq xmm7, 8
-
- movdq2q mm2, xmm6
- movdq2q mm3, xmm7
-
- paddw mm6, mm2
- paddd mm7, mm3
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rsi, arg(7) ; sum
- mov rdi, arg(8) ; sumsquared
-
- movd [rsi], mm2 ; xsum
- movd [rdi], mm4 ; xxsum
-
- ; begin epilog
- pop rbx
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_half_horiz_vert_variance8x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
-sym(vp8_half_horiz_vert_variance8x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
- pxor xmm6, xmm6 ; error accumulator
-        pxor            xmm7, xmm7                  ; sse accumulator
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0 ;
-
- movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
- movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
-%else
- add rsi, r8
-%endif
-
-vp8_half_horiz_vert_variance8x_h_1:
-
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm2, QWORD PTR [rsi+1] ;
- pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
-
- pavgb xmm5, xmm1 ; xmm = vertical average of the above
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
-
- psubw xmm5, xmm3 ; xmm5 -= xmm3
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
-
- movdqa xmm5, xmm1 ; save xmm1 for use on the next row
-
-%if ABI_IS_32BIT
- add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
- add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
-%else
- add rsi, r8
- add rdi, r9
-%endif
-
- sub rcx, 1 ;
- jnz vp8_half_horiz_vert_variance8x_h_1 ;
-
- movdq2q mm6, xmm6 ;
- movdq2q mm7, xmm7 ;
-
- psrldq xmm6, 8
- psrldq xmm7, 8
-
- movdq2q mm2, xmm6
- movdq2q mm3, xmm7
-
- paddw mm6, mm2
- paddd mm7, mm3
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rsi, arg(5) ; sum
- mov rdi, arg(6) ; sumsquared
-
- movd [rsi], mm2 ;
- movd [rdi], mm4 ;
-
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_half_horiz_vert_variance16x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
-sym(vp8_half_horiz_vert_variance16x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6 ; error accumulator
-        pxor            xmm7, xmm7                  ; sse accumulator
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
-
- pxor xmm0, xmm0 ;
-
- movdqu xmm5, XMMWORD PTR [rsi]
- movdqu xmm3, XMMWORD PTR [rsi+1]
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
-
- lea rsi, [rsi + rax]
-
-vp8_half_horiz_vert_variance16x_h_1:
- movdqu xmm1, XMMWORD PTR [rsi] ;
- movdqu xmm2, XMMWORD PTR [rsi+1] ;
- pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
-
- pavgb xmm5, xmm1 ; xmm = vertical average of the above
-
- movdqa xmm4, xmm5
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
- punpckhbw xmm4, xmm0
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
- psubw xmm5, xmm3 ; xmm5 -= xmm3
-
- movq xmm3, QWORD PTR [rdi+8]
- punpcklbw xmm3, xmm0
- psubw xmm4, xmm3
-
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm4
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm4, xmm4
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm4
-
- movdqa xmm5, xmm1 ; save xmm1 for use on the next row
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
-
- sub rcx, 1 ;
- jnz vp8_half_horiz_vert_variance16x_h_1 ;
-
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_half_vert_variance8x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
-sym(vp8_half_vert_variance8x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
- pxor xmm6, xmm6 ; error accumulator
-        pxor xmm7, xmm7                     ; sse accumulator
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0 ;
-vp8_half_vert_variance8x_h_1:
- movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
- movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9
-
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
-
- psubw xmm5, xmm3 ; xmm5 -= xmm3
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
- add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
- add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
-%else
- add rsi, r8
- add rdi, r9
-%endif
-
- sub rcx, 1 ;
- jnz vp8_half_vert_variance8x_h_1 ;
-
- movdq2q mm6, xmm6 ;
- movdq2q mm7, xmm7 ;
-
- psrldq xmm6, 8
- psrldq xmm7, 8
-
- movdq2q mm2, xmm6
- movdq2q mm3, xmm7
-
- paddw mm6, mm2
- paddd mm7, mm3
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rsi, arg(5) ; sum
- mov rdi, arg(6) ; sumsquared
-
- movd [rsi], mm2 ;
- movd [rdi], mm4 ;
-
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_half_vert_variance16x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
-sym(vp8_half_vert_variance16x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6 ; error accumulator
-        pxor xmm7, xmm7                     ; sse accumulator
- mov rsi, arg(0) ;ref_ptr
-
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
-
- movdqu xmm5, XMMWORD PTR [rsi]
- lea rsi, [rsi + rax ]
- pxor xmm0, xmm0
-
-vp8_half_vert_variance16x_h_1:
- movdqu xmm3, XMMWORD PTR [rsi]
-
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
- movdqa xmm4, xmm5
- punpcklbw xmm5, xmm0
- punpckhbw xmm4, xmm0
-
- movq xmm2, QWORD PTR [rdi]
- punpcklbw xmm2, xmm0
- psubw xmm5, xmm2
- movq xmm2, QWORD PTR [rdi+8]
- punpcklbw xmm2, xmm0
- psubw xmm4, xmm2
-
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm4
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm4, xmm4
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm4
-
- movdqa xmm5, xmm3
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
-
- sub rcx, 1
- jnz vp8_half_vert_variance16x_h_1
-
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_half_horiz_variance8x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
-sym(vp8_half_horiz_variance8x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
- pxor xmm6, xmm6 ; error accumulator
-        pxor xmm7, xmm7                     ; sse accumulator
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
-
- pxor xmm0, xmm0 ;
-vp8_half_horiz_variance8x_h_1:
- movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
- movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
-
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
-
- psubw xmm5, xmm3 ; xmm5 -= xmm3
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
- add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
- add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
-%else
- add rsi, r8
- add rdi, r9
-%endif
- sub rcx, 1 ;
- jnz vp8_half_horiz_variance8x_h_1 ;
-
- movdq2q mm6, xmm6 ;
- movdq2q mm7, xmm7 ;
-
- psrldq xmm6, 8
- psrldq xmm7, 8
-
- movdq2q mm2, xmm6
- movdq2q mm3, xmm7
-
- paddw mm6, mm2
- paddd mm7, mm3
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rsi, arg(5) ; sum
- mov rdi, arg(6) ; sumsquared
-
- movd [rsi], mm2 ;
- movd [rdi], mm4 ;
-
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_half_horiz_variance16x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
-sym(vp8_half_horiz_variance16x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6 ; error accumulator
-        pxor xmm7, xmm7                     ; sse accumulator
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
-
- pxor xmm0, xmm0 ;
-
-vp8_half_horiz_variance16x_h_1:
- movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
- movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
-
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
- movdqa xmm1, xmm5
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
- punpckhbw xmm1, xmm0
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
- movq xmm2, QWORD PTR [rdi+8]
- punpcklbw xmm2, xmm0
-
- psubw xmm5, xmm3 ; xmm5 -= xmm3
- psubw xmm1, xmm2
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm1
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm1, xmm1
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm1
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
-
- sub rcx, 1 ;
- jnz vp8_half_horiz_variance16x_h_1 ;
-
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-; short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64};
-align 16
-xmm_bi_rd:
- times 8 dw 64
-align 16
-vp8_bilinear_filters_sse2:
- dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
- dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
- dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
- dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
- dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
- dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
- dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
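
The three SSE2 half-pel kernel families removed above (half-horiz, half-vert and half-horiz-vert, in 8- and 16-wide flavours) share one structure: form the half-pel prediction with pavgb, subtract the source row, and accumulate the signed differences and their squares into the sum/sse outputs that the C wrappers later turn into a variance. As a rough scalar model of the horizontal-plus-vertical 8-wide case (the function below is illustrative only, not part of the tree, and it ignores the 16-bit wrap-around of the paddw accumulator):

/* Rough scalar model of vp8_half_horiz_vert_variance8x_h_sse2: each
 * prediction pixel is the rounded average of the horizontal half-pel
 * samples of the current and next reference rows, which is exactly what
 * the two pavgb stages compute. */
static void half_horiz_vert_variance8_model(const unsigned char *ref, int ref_stride,
                                            const unsigned char *src, int src_stride,
                                            unsigned int height,
                                            int *sum, unsigned int *sse) {
  int s = 0;
  unsigned int ss = 0;
  unsigned int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < 8; ++j) {
      const int a = (ref[j] + ref[j + 1] + 1) >> 1;                           /* pavgb, row i   */
      const int b = (ref[ref_stride + j] + ref[ref_stride + j + 1] + 1) >> 1; /* pavgb, row i+1 */
      const int diff = ((a + b + 1) >> 1) - src[j];                           /* vertical pavgb, then subtract source */
      s += diff;
      ss += (unsigned int)(diff * diff);
    }
    ref += ref_stride;
    src += src_stride;
  }
  *sum = s;
  *sse = ss;
}
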
--- a/vp8/common/x86/variance_impl_ssse3.asm
+++ /dev/null
@@ -1,364 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift 7
-
-
-;void vp8_filter_block2d_bil_var_ssse3
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int xoffset,
-; int yoffset,
-; int *sum,
-; unsigned int *sumsquared
-;
-;)
-;Note: The filter coefficient at offset=0 is 128. Since the second register
-;for pmaddubsw is signed bytes, we must calculate the zero offset separately.
-global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
-sym(vp8_filter_block2d_bil_var_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
- movsxd rax, dword ptr arg(5) ; xoffset
-
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je .filter_block2d_bil_var_ssse3_sp_only
-
- shl rax, 4 ; point to filter coeff with xoffset
- lea rax, [rax + rcx] ; HFilter
-
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip second_pass filter if yoffset=0
- je .filter_block2d_bil_var_ssse3_fp_only
-
- shl rdx, 4
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
-
- movdqu xmm0, XMMWORD PTR [rsi]
- movdqu xmm1, XMMWORD PTR [rsi+1]
- movdqa xmm2, xmm0
-
- punpcklbw xmm0, xmm1
- punpckhbw xmm2, xmm1
- pmaddubsw xmm0, [rax]
- pmaddubsw xmm2, [rax]
-
- paddw xmm0, [GLOBAL(xmm_bi_rd)]
- paddw xmm2, [GLOBAL(xmm_bi_rd)]
- psraw xmm0, xmm_filter_shift
- psraw xmm2, xmm_filter_shift
-
- packuswb xmm0, xmm2
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
- lea rsi, [rsi + r8]
-%endif
-
-.filter_block2d_bil_var_ssse3_loop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rsi+1]
- movdqa xmm3, xmm1
-
- punpcklbw xmm1, xmm2
- punpckhbw xmm3, xmm2
- pmaddubsw xmm1, [rax]
- pmaddubsw xmm3, [rax]
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)]
- paddw xmm3, [GLOBAL(xmm_bi_rd)]
- psraw xmm1, xmm_filter_shift
- psraw xmm3, xmm_filter_shift
- packuswb xmm1, xmm3
-
- movdqa xmm2, xmm0
- movdqa xmm0, xmm1
- movdqa xmm3, xmm2
-
- punpcklbw xmm2, xmm1
- punpckhbw xmm3, xmm1
- pmaddubsw xmm2, [rdx]
- pmaddubsw xmm3, [rdx]
-
- paddw xmm2, [GLOBAL(xmm_bi_rd)]
- paddw xmm3, [GLOBAL(xmm_bi_rd)]
- psraw xmm2, xmm_filter_shift
- psraw xmm3, xmm_filter_shift
-
- movq xmm1, QWORD PTR [rdi]
- pxor xmm4, xmm4
- punpcklbw xmm1, xmm4
- movq xmm5, QWORD PTR [rdi+8]
- punpcklbw xmm5, xmm4
-
- psubw xmm2, xmm1
- psubw xmm3, xmm5
- paddw xmm6, xmm2
- paddw xmm6, xmm3
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
- paddd xmm7, xmm2
- paddd xmm7, xmm3
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rsi, [rsi + r8]
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1
- jnz .filter_block2d_bil_var_ssse3_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_sp_only:
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; Both xoffset =0 and yoffset=0
- je .filter_block2d_bil_var_ssse3_full_pixel
-
- shl rdx, 4
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqa xmm0, xmm1
-
-%if ABI_IS_32BIT=0
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
- lea rsi, [rsi + rax]
-
-.filter_block2d_bil_sp_only_loop:
- movdqu xmm3, XMMWORD PTR [rsi]
- movdqa xmm2, xmm1
- movdqa xmm0, xmm3
-
- punpcklbw xmm1, xmm3
- punpckhbw xmm2, xmm3
- pmaddubsw xmm1, [rdx]
- pmaddubsw xmm2, [rdx]
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)]
- paddw xmm2, [GLOBAL(xmm_bi_rd)]
- psraw xmm1, xmm_filter_shift
- psraw xmm2, xmm_filter_shift
-
- movq xmm3, QWORD PTR [rdi]
- pxor xmm4, xmm4
- punpcklbw xmm3, xmm4
- movq xmm5, QWORD PTR [rdi+8]
- punpcklbw xmm5, xmm4
-
- psubw xmm1, xmm3
- psubw xmm2, xmm5
- paddw xmm6, xmm1
- paddw xmm6, xmm2
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm7, xmm1
- paddd xmm7, xmm2
-
- movdqa xmm1, xmm0
- lea rsi, [rsi + rax] ;ref_pixels_per_line
-
-%if ABI_IS_32BIT
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1
- jnz .filter_block2d_bil_sp_only_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_full_pixel:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
- pxor xmm0, xmm0
-
-.filter_block2d_bil_full_pixel_loop:
- movq xmm1, QWORD PTR [rsi]
- punpcklbw xmm1, xmm0
- movq xmm2, QWORD PTR [rsi+8]
- punpcklbw xmm2, xmm0
-
- movq xmm3, QWORD PTR [rdi]
- punpcklbw xmm3, xmm0
- movq xmm4, QWORD PTR [rdi+8]
- punpcklbw xmm4, xmm0
-
- psubw xmm1, xmm3
- psubw xmm2, xmm4
- paddw xmm6, xmm1
- paddw xmm6, xmm2
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm7, xmm1
- paddd xmm7, xmm2
-
- lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rdx] ;src_pixels_per_line
- sub rcx, 1
- jnz .filter_block2d_bil_full_pixel_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_fp_only:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0
-
-%if ABI_IS_32BIT=0
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-.filter_block2d_bil_fp_only_loop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rsi+1]
- movdqa xmm3, xmm1
-
- punpcklbw xmm1, xmm2
- punpckhbw xmm3, xmm2
- pmaddubsw xmm1, [rax]
- pmaddubsw xmm3, [rax]
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)]
- paddw xmm3, [GLOBAL(xmm_bi_rd)]
- psraw xmm1, xmm_filter_shift
- psraw xmm3, xmm_filter_shift
-
- movq xmm2, XMMWORD PTR [rdi]
- pxor xmm4, xmm4
- punpcklbw xmm2, xmm4
- movq xmm5, QWORD PTR [rdi+8]
- punpcklbw xmm5, xmm4
-
- psubw xmm1, xmm2
- psubw xmm3, xmm5
- paddw xmm6, xmm1
- paddw xmm6, xmm3
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd xmm7, xmm1
- paddd xmm7, xmm3
-
- lea rsi, [rsi + rdx]
-%if ABI_IS_32BIT
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1
- jnz .filter_block2d_bil_fp_only_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_variance:
- pxor xmm0, xmm0
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(7) ;[Sum]
- mov rdi, arg(8) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-align 16
-xmm_bi_rd:
- times 8 dw 64
-align 16
-vp8_bilinear_filters_ssse3:
- times 8 db 128, 0
- times 8 db 112, 16
- times 8 db 96, 32
- times 8 db 80, 48
- times 8 db 64, 64
- times 8 db 48, 80
- times 8 db 32, 96
- times 8 db 16, 112
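
The SSSE3 kernel removed above stores each bilinear filter as interleaved byte pairs (128 - 16*offset, 16*offset) so a single pmaddubsw evaluates the two-tap filter, followed by the +64 rounding constant (xmm_bi_rd) and the 7-bit shift (xmm_filter_shift); the offset-0 pair is special-cased because 128 does not fit in a signed byte. A scalar equivalent of one tap, as a sketch (the helper name is mine):

/* Scalar equivalent of one bilinear tap in the SSSE3 kernel: coefficients
 * follow vp8_bilinear_filters_ssse3, i.e. (128 - 16*offset, 16*offset),
 * with +64 rounding and a shift of xmm_filter_shift (7). */
static unsigned char bilinear_tap(unsigned char p0, unsigned char p1, int offset) {
  const int c0 = 128 - (offset << 4);  /* e.g. offset 3 -> 80 */
  const int c1 = offset << 4;          /* e.g. offset 3 -> 48 */
  return (unsigned char)((c0 * p0 + c1 * p1 + 64) >> 7);
}
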
--- a/vp8/common/x86/variance_ssse3.c
+++ /dev/null
@@ -1,157 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp8_rtcd.h"
-#include "vpx_config.h"
-#include "vp8/common/variance.h"
-#include "vpx_ports/mem.h"
-
-extern void vp8_half_horiz_vert_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp8_half_horiz_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp8_half_vert_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp8_filter_block2d_bil_var_ssse3
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int xoffset,
- int yoffset,
- int *sum,
- unsigned int *sumsquared
-);
-
-unsigned int vp8_sub_pixel_variance16x16_ssse3
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- int xsum0;
- unsigned int xxsum0;
-
- /* note we could avoid these if statements if the calling function
- * just called the appropriate functions inside.
- */
- if (xoffset == 4 && yoffset == 0)
- {
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- }
- else if (xoffset == 0 && yoffset == 4)
- {
- vp8_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- }
- else if (xoffset == 4 && yoffset == 4)
- {
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- }
- else
- {
- vp8_filter_block2d_bil_var_ssse3(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum0, &xxsum0);
- }
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-unsigned int vp8_sub_pixel_variance16x8_ssse3
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-
-)
-{
- int xsum0;
- unsigned int xxsum0;
-
- if (xoffset == 4 && yoffset == 0)
- {
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- }
- else if (xoffset == 0 && yoffset == 4)
- {
- vp8_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- }
- else if (xoffset == 4 && yoffset == 4)
- {
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- }
- else
- {
- vp8_filter_block2d_bil_var_ssse3(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum0, &xxsum0);
- }
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
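
Both wrappers in the file above finish by converting the accumulated (sum, sse) pair into a variance with the identity variance = SSE - sum^2 / N, where N is the block's pixel count and the division is a shift: >> 8 for the 256-pixel 16x16 block and >> 7 for the 128-pixel 16x8 block. A minimal sketch of that closing step (illustrative helper, not from the tree):

/* The closing step of every wrapper in this file: block variance from the
 * accumulated sum of differences and sum of squared differences.
 * log2_pixels is 8 for 16x16, 7 for 16x8/8x16, 6 for 8x8, 4 for 4x4. */
static unsigned int variance_from_sums(int sum, unsigned int sse_acc,
                                       unsigned int *sse, int log2_pixels) {
  *sse = sse_acc;
  return sse_acc - (((unsigned int)sum * sum) >> log2_pixels);
}
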
--- a/vp8/common/x86/vp8_variance_impl_mmx.asm
+++ /dev/null
@@ -1,353 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define mmx_filter_shift 7
-
-;void vp8_filter_block2d_bil4x4_var_mmx
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned short *HFilter,
-; unsigned short *VFilter,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
-sym(vp8_filter_block2d_bil4x4_var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
-
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
-
- mov rax, arg(4) ;HFilter ;
- mov rdx, arg(5) ;VFilter ;
-
- mov rsi, arg(0) ;ref_ptr ;
- mov rdi, arg(2) ;src_ptr ;
-
- mov rcx, 4 ;
- pxor mm0, mm0 ;
-
- movd mm1, [rsi] ;
- movd mm3, [rsi+1] ;
-
- punpcklbw mm1, mm0 ;
- pmullw mm1, [rax] ;
-
- punpcklbw mm3, mm0 ;
- pmullw mm3, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- movq mm5, mm1
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
- add rsi, r8
-%endif
-
-.filter_block2d_bil4x4_var_mmx_loop:
-
- movd mm1, [rsi] ;
- movd mm3, [rsi+1] ;
-
- punpcklbw mm1, mm0 ;
- pmullw mm1, [rax] ;
-
- punpcklbw mm3, mm0 ;
- pmullw mm3, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- movq mm3, mm5 ;
-
- movq mm5, mm1 ;
- pmullw mm3, [rdx] ;
-
- pmullw mm1, [rdx+8] ;
- paddw mm1, mm3 ;
-
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- psraw mm1, mmx_filter_shift ;
-
- movd mm3, [rdi] ;
- punpcklbw mm3, mm0 ;
-
- psubw mm1, mm3 ;
- paddw mm6, mm1 ;
-
- pmaddwd mm1, mm1 ;
- paddd mm7, mm1 ;
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
- add rdi, dword ptr arg(3) ;src_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
- add rsi, r8
- add rdi, r9
-%endif
- sub rcx, 1 ;
- jnz .filter_block2d_bil4x4_var_mmx_loop ;
-
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rdi, arg(6) ;sum
- mov rsi, arg(7) ;sumsquared
-
- movd dword ptr [rdi], mm2 ;
- movd dword ptr [rsi], mm4 ;
-
-
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-
-;void vp8_filter_block2d_bil_var_mmx
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; unsigned short *HFilter,
-; unsigned short *VFilter,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
-sym(vp8_filter_block2d_bil_var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
- mov rax, arg(5) ;HFilter ;
-
- mov rdx, arg(6) ;VFilter ;
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
-
- pxor mm0, mm0 ;
- movq mm1, [rsi] ;
-
- movq mm3, [rsi+1] ;
- movq mm2, mm1 ;
-
- movq mm4, mm3 ;
- punpcklbw mm1, mm0 ;
-
- punpckhbw mm2, mm0 ;
- pmullw mm1, [rax] ;
-
- pmullw mm2, [rax] ;
- punpcklbw mm3, mm0 ;
-
- punpckhbw mm4, mm0 ;
- pmullw mm3, [rax+8] ;
-
- pmullw mm4, [rax+8] ;
- paddw mm1, mm3 ;
-
- paddw mm2, mm4 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm2, mmx_filter_shift ;
- movq mm5, mm1
-
- packuswb mm5, mm2 ;
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- add rsi, r8
-%endif
-
-.filter_block2d_bil_var_mmx_loop:
-
- movq mm1, [rsi] ;
- movq mm3, [rsi+1] ;
-
- movq mm2, mm1 ;
- movq mm4, mm3 ;
-
- punpcklbw mm1, mm0 ;
- punpckhbw mm2, mm0 ;
-
- pmullw mm1, [rax] ;
- pmullw mm2, [rax] ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- pmullw mm3, [rax+8] ;
- pmullw mm4, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm2, mm4 ;
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- psraw mm1, mmx_filter_shift ;
-
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
- psraw mm2, mmx_filter_shift ;
-
- movq mm3, mm5 ;
- movq mm4, mm5 ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- movq mm5, mm1 ;
- packuswb mm5, mm2 ;
-
- pmullw mm3, [rdx] ;
- pmullw mm4, [rdx] ;
-
- pmullw mm1, [rdx+8] ;
- pmullw mm2, [rdx+8] ;
-
- paddw mm1, mm3 ;
- paddw mm2, mm4 ;
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- psraw mm2, mmx_filter_shift ;
-
- movq mm3, [rdi] ;
- movq mm4, mm3 ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- psubw mm1, mm3 ;
- psubw mm2, mm4 ;
-
- paddw mm6, mm1 ;
- pmaddwd mm1, mm1 ;
-
- paddw mm6, mm2 ;
- pmaddwd mm2, mm2 ;
-
- paddd mm7, mm1 ;
- paddd mm7, mm2 ;
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
- add rdi, dword ptr arg(3) ;src_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
- add rsi, r8
- add rdi, r9
-%endif
- sub rcx, 1 ;
- jnz .filter_block2d_bil_var_mmx_loop ;
-
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rdi, arg(7) ;sum
- mov rsi, arg(8) ;sumsquared
-
- movd dword ptr [rdi], mm2 ;
- movd dword ptr [rsi], mm4 ;
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-;short mmx_bi_rd[4] = { 64, 64, 64, 64};
-align 16
-mmx_bi_rd:
- times 4 dw 64
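
Unlike the SSSE3 path, the MMX kernels removed above evaluate the bilinear filter with 16-bit coefficients (pmullw against the HFilter/VFilter tables, each tap stored replicated four times), again with +64 rounding (mmx_bi_rd) and a 7-bit shift (mmx_filter_shift). A sketch of one output pixel of the two-pass filter, with the taps passed as plain ints and the intermediate byte packing ignored (illustrative only, not the production kernel):

/* One output pixel of the two-pass bilinear filter implemented by
 * vp8_filter_block2d_bil_var_mmx: a horizontal 2-tap filter on two adjacent
 * reference rows, then a vertical 2-tap filter on the two intermediate
 * results, each pass rounded by 64 and shifted by 7. */
static int bilinear_2d_pixel(const unsigned char *row0, const unsigned char *row1,
                             int j, int h0, int h1, int v0, int v1) {
  const int fp0 = (h0 * row0[j] + h1 * row0[j + 1] + 64) >> 7;  /* first pass, row i   */
  const int fp1 = (h0 * row1[j] + h1 * row1[j + 1] + 64) >> 7;  /* first pass, row i+1 */
  return (v0 * fp0 + v1 * fp1 + 64) >> 7;                       /* second pass         */
}
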
--- a/vp8/common/x86/vp8_variance_mmx.c
+++ /dev/null
@@ -1,244 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp8_rtcd.h"
-#include "vpx_config.h"
-#include "vp8/common/variance.h"
-#include "vpx_ports/mem.h"
-#include "vp8/common/x86/filter_x86.h"
-
-extern void filter_block1d_h6_mmx
-(
- const unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- short *filter
-);
-extern void filter_block1d_v6_mmx
-(
- const short *src_ptr,
- unsigned char *output_ptr,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- short *filter
-);
-
-extern void vp8_filter_block2d_bil4x4_var_mmx
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const short *HFilter,
- const short *VFilter,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp8_filter_block2d_bil_var_mmx
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- const short *HFilter,
- const short *VFilter,
- int *sum,
- unsigned int *sumsquared
-);
-
-unsigned int vp8_sub_pixel_variance4x4_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse)
-
-{
- int xsum;
- unsigned int xxsum;
- vp8_filter_block2d_bil4x4_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line,
- vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 4));
-}
-
-
-unsigned int vp8_sub_pixel_variance8x8_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
-
- int xsum;
- unsigned int xxsum;
- vp8_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 6));
-}
-
-unsigned int vp8_sub_pixel_variance16x16_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
-
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
-
- vp8_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
- &xsum0, &xxsum0
- );
-
-
- vp8_filter_block2d_bil_var_mmx(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
- &xsum1, &xxsum1
- );
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-
-
-}
-
-unsigned int vp8_sub_pixel_variance16x8_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
-
- vp8_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
- &xsum0, &xxsum0
- );
-
-
- vp8_filter_block2d_bil_var_mmx(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
- &xsum1, &xxsum1
- );
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp8_sub_pixel_variance8x16_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- int xsum;
- unsigned int xxsum;
- vp8_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 7));
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_h_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0,
- ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_v_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4,
- ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4,
- ref_ptr, recon_stride, sse);
-}
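
For 16-wide blocks, the MMX wrappers above simply run the 8-wide bilinear kernel twice, once per half, and add the partial results before applying the variance identity; the halfpixvar helpers are thin wrappers that re-enter the sub-pixel path with offsets of 4. A sketch of the combining step (illustrative helper, not from the tree):

/* How vp8_sub_pixel_variance16x16_mmx combines its two 8-wide passes:
 * partial sums and SSEs add directly, and the variance is then taken over
 * the full 256-pixel block, hence the >> 8. */
static unsigned int combine_16x16_halves(int sum_left, unsigned int sse_left,
                                         int sum_right, unsigned int sse_right,
                                         unsigned int *sse) {
  const int sum = sum_left + sum_right;
  *sse = sse_left + sse_right;
  return *sse - (((unsigned int)sum * sum) >> 8);
}
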
--- a/vp8/common/x86/vp8_variance_sse2.c
+++ /dev/null
@@ -1,403 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp8_rtcd.h"
-#include "vpx_config.h"
-#include "vp8/common/variance.h"
-#include "vpx_ports/mem.h"
-#include "vp8/common/x86/filter_x86.h"
-
-extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
-extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
-extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
-extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
-
-extern void vp8_filter_block2d_bil4x4_var_mmx
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const short *HFilter,
- const short *VFilter,
- int *sum,
- unsigned int *sumsquared
-);
-
-void vp8_filter_block2d_bil_var_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int xoffset,
- int yoffset,
- int *sum,
- unsigned int *sumsquared
-);
-void vp8_half_horiz_vert_variance8x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-void vp8_half_horiz_vert_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-void vp8_half_horiz_variance8x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-void vp8_half_horiz_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-void vp8_half_vert_variance8x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-void vp8_half_vert_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-
-unsigned int vp8_sub_pixel_variance4x4_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- int xsum;
- unsigned int xxsum;
- vp8_filter_block2d_bil4x4_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line,
- vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 4));
-}
-
-
-unsigned int vp8_sub_pixel_variance8x8_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- int xsum;
- unsigned int xxsum;
-
- if (xoffset == 4 && yoffset == 0)
- {
- vp8_half_horiz_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum, &xxsum);
- }
- else if (xoffset == 0 && yoffset == 4)
- {
- vp8_half_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum, &xxsum);
- }
- else if (xoffset == 4 && yoffset == 4)
- {
- vp8_half_horiz_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum, &xxsum);
- }
- else
- {
- vp8_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum, &xxsum);
- }
-
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 6));
-}
-
-unsigned int vp8_sub_pixel_variance16x16_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
-
- /* note we could avoid these if statements if the calling function
- * just called the appropriate functions inside.
- */
- if (xoffset == 4 && yoffset == 0)
- {
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- }
- else if (xoffset == 0 && yoffset == 4)
- {
- vp8_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- }
- else if (xoffset == 4 && yoffset == 4)
- {
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- }
- else
- {
- vp8_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum0, &xxsum0
- );
-
- vp8_filter_block2d_bil_var_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum1, &xxsum1
- );
- xsum0 += xsum1;
- xxsum0 += xxsum1;
- }
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-unsigned int vp8_sub_pixel_variance16x8_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-
-)
-{
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
- if (xoffset == 4 && yoffset == 0)
- {
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- }
- else if (xoffset == 0 && yoffset == 4)
- {
- vp8_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- }
- else if (xoffset == 4 && yoffset == 4)
- {
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- }
- else
- {
- vp8_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum0, &xxsum0);
-
- vp8_filter_block2d_bil_var_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum1, &xxsum1);
- xsum0 += xsum1;
- xxsum0 += xxsum1;
- }
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp8_sub_pixel_variance8x16_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- int xsum;
- unsigned int xxsum;
-
- if (xoffset == 4 && yoffset == 0)
- {
- vp8_half_horiz_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum, &xxsum);
- }
- else if (xoffset == 0 && yoffset == 4)
- {
- vp8_half_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum, &xxsum);
- }
- else if (xoffset == 4 && yoffset == 4)
- {
- vp8_half_horiz_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum, &xxsum);
- }
- else
- {
- vp8_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum, &xxsum);
- }
-
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 7));
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_h_wmt(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse)
-{
- int xsum0;
- unsigned int xxsum0;
-
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_v_wmt(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse)
-{
- int xsum0;
- unsigned int xxsum0;
- vp8_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse)
-{
- int xsum0;
- unsigned int xxsum0;
-
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
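
The SSE2 wrappers above special-case offsets of 4 because the offsets index the 8-entry bilinear tables, i.e. they are eighths of a pixel, and the half-pel tap pair (64, 64) with +64 rounding reduces exactly to (p0 + p1 + 1) >> 1, which is what pavgb computes; every other non-zero offset falls through to the coefficient-based filter. A small sketch of that predicate (illustrative only):

/* True for the three offset combinations the wrappers route to the cheaper
 * pavgb-based half-pixel kernels: (4,0), (0,4) and (4,4) in 1/8-pel units. */
static int is_half_pel_only(int xoffset, int yoffset) {
  return (xoffset == 0 || xoffset == 4) &&
         (yoffset == 0 || yoffset == 4) &&
         (xoffset | yoffset) != 0;
}
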
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -16,7 +16,7 @@
#include "./vpx_scale_rtcd.h"
#include "block.h"
#include "onyx_int.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
#include "encodeintra.h"
#include "vp8/common/setupintrarecon.h"
#include "vp8/common/systemdependent.h"
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -13,7 +13,7 @@
#define VP8_ENCODER_MCOMP_H_
#include "block.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
#ifdef __cplusplus
extern "C" {
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -2132,10 +2132,10 @@
cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
- cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16;
- cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h;
- cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v;
- cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vp8_variance_halfpixvar16x16_hv;
+ cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16;
+ cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vpx_variance_halfpixvar16x16_h;
+ cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vpx_variance_halfpixvar16x16_v;
+ cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vpx_variance_halfpixvar16x16_hv;
cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3;
cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8;
cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
@@ -2142,7 +2142,7 @@
cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8;
- cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8;
+ cpi->fn_ptr[BLOCK_16X8].svf = vpx_sub_pixel_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
@@ -2152,7 +2152,7 @@
cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16;
- cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16;
+ cpi->fn_ptr[BLOCK_8X16].svf = vpx_sub_pixel_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
@@ -2162,7 +2162,7 @@
cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8;
- cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8;
+ cpi->fn_ptr[BLOCK_8X8].svf = vpx_sub_pixel_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
@@ -2172,7 +2172,7 @@
cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4;
- cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4;
+ cpi->fn_ptr[BLOCK_4X4].svf = vpx_sub_pixel_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -18,7 +18,7 @@
#include "treewriter.h"
#include "tokenize.h"
#include "vp8/common/onyxc_int.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
#include "encodemb.h"
#include "quantize.h"
#include "vp8/common/entropy.h"
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -22,7 +22,7 @@
#include "encodemb.h"
#include "vp8/common/reconinter.h"
#include "vp8/common/reconintra4x4.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
#include "mcomp.h"
#include "rdopt.h"
#include "vpx_mem/vpx_mem.h"
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -29,7 +29,7 @@
#include "vp8/common/quant_common.h"
#include "encodemb.h"
#include "quantize.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
#include "mcomp.h"
#include "rdopt.h"
#include "vpx_mem/vpx_mem.h"
@@ -500,9 +500,9 @@
if ((mv_row | mv_col) & 7)
{
- vp8_sub_pixel_variance8x8(uptr, pre_stride,
+ vpx_sub_pixel_variance8x8(uptr, pre_stride,
mv_col & 7, mv_row & 7, upred_ptr, uv_stride, &sse2);
- vp8_sub_pixel_variance8x8(vptr, pre_stride,
+ vpx_sub_pixel_variance8x8(vptr, pre_stride,
mv_col & 7, mv_row & 7, vpred_ptr, uv_stride, &sse1);
sse2 += sse1;
}
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -63,8 +63,6 @@
VP8_COMMON_SRCS-yes += common/reconintra4x4.c
VP8_COMMON_SRCS-yes += common/setupintrarecon.c
VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
-VP8_COMMON_SRCS-yes += common/variance_c.c
-VP8_COMMON_SRCS-yes += common/variance.h
VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
@@ -86,8 +84,6 @@
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_variance_mmx.c
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_variance_impl_mmx.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
@@ -96,12 +92,8 @@
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp8_variance_sse2.c
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_ssse3.c
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm
ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
@@ -129,7 +121,6 @@
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/variance_arm.c
# common (media)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.c
@@ -149,9 +140,6 @@
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
-VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
# common (neon intrinsics)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c
@@ -170,6 +158,5 @@
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
--- a/vp9/common/mips/msa/vp9_convolve_avg_msa.c
+++ b/vp9/common/mips/msa/vp9_convolve_avg_msa.c
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, int32_t height) {
--- a/vp9/common/mips/msa/vp9_convolve_copy_msa.c
+++ b/vp9/common/mips/msa/vp9_convolve_copy_msa.c
@@ -9,7 +9,7 @@
*/
#include <string.h>
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, int32_t height) {
--- a/vp9/common/mips/msa/vp9_convolve_msa.h
+++ b/vp9/common/mips/msa/vp9_convolve_msa.h
@@ -12,7 +12,7 @@
#define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
#include "vp9/common/vp9_filter.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
extern const uint8_t mc_filt_mask_arr[16 * 3];
--- a/vp9/common/mips/msa/vp9_idct_msa.h
+++ b/vp9/common/mips/msa/vp9_idct_msa.h
@@ -13,7 +13,7 @@
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_idct.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
v8i16 k0_m = __msa_fill_h(cnst0); \
--- a/vp9/common/mips/msa/vp9_intra_predict_msa.c
+++ b/vp9/common/mips/msa/vp9_intra_predict_msa.c
@@ -9,7 +9,7 @@
*/
#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) { \
out0 = __msa_subs_u_h(out0, in0); \
--- a/vp9/common/mips/msa/vp9_loopfilter_msa.h
+++ b/vp9/common/mips/msa/vp9_loopfilter_msa.h
@@ -11,7 +11,7 @@
#ifndef VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
#define VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
p1_out, p0_out, q0_out, q1_out) { \
--- a/vp9/common/mips/msa/vp9_macros_msa.h
+++ /dev/null
@@ -1,1885 +1,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
-#define VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
-
-#include <msa.h>
-
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
-#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
-
-#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
-#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
-
-#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
-
-#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
-#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
-
-#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
-
-#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
-
-#if (__mips_isa_rev >= 6)
-#define LH(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint16_t val_m; \
- \
- __asm__ __volatile__ ( \
- "lh %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
-
-#define LW(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val_m; \
- \
- __asm__ __volatile__ ( \
- "lw %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
-
-#if (__mips == 64)
-#define LD(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint64_t val_m = 0; \
- \
- __asm__ __volatile__ ( \
- "ld %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
-#else // !(__mips == 64)
-#define LD(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m = 0; \
- \
- val0_m = LW(psrc_m); \
- val1_m = LW(psrc_m + 4); \
- \
- val_m = (uint64_t)(val1_m); \
- val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
- val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
- \
- val_m; \
-})
-#endif // (__mips == 64)
-
-#define SH(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint16_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "sh %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-
-#define SW(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint32_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "sw %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-
-#define SD(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint64_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "sd %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-#else // !(__mips_isa_rev >= 6)
-#define LH(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint16_t val_m; \
- \
- __asm__ __volatile__ ( \
- "ulh %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
-
-#define LW(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val_m; \
- \
- __asm__ __volatile__ ( \
- "ulw %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
-
-#if (__mips == 64)
-#define LD(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint64_t val_m = 0; \
- \
- __asm__ __volatile__ ( \
- "uld %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
-#else // !(__mips == 64)
-#define LD(psrc) ({ \
- const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m = 0; \
- \
- val0_m = LW(psrc_m1); \
- val1_m = LW(psrc_m1 + 4); \
- \
- val_m = (uint64_t)(val1_m); \
- val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
- val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
- \
- val_m; \
-})
-#endif // (__mips == 64)
-
-#define SH(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint16_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "ush %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-
-#define SW(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint32_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "usw %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-
-#define SD(val, pdst) { \
- uint8_t *pdst_m1 = (uint8_t *)(pdst); \
- uint32_t val0_m, val1_m; \
- \
- val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \
- val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
- \
- SW(val0_m, pdst_m1); \
- SW(val1_m, pdst_m1 + 4); \
-}
-#endif // (__mips_isa_rev >= 6)
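
For readers unfamiliar with the pre-R6 fallback above: on 32-bit targets the 64-bit LD() is assembled from two 32-bit loads, with the first word placed in the low half. A minimal scalar sketch of that composition (the function name is hypothetical and not part of the header, with memcpy standing in for the LW loads):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical scalar model of the 32-bit LD() path: two 32-bit loads are
 * combined as (val1 << 32) | val0, i.e. the word at psrc fills the low half
 * and the word at psrc + 4 fills the high half. */
static uint64_t load_doubleword_scalar(const uint8_t *psrc) {
  uint32_t val0, val1;
  memcpy(&val0, psrc, sizeof(val0));      /* stands in for LW(psrc)     */
  memcpy(&val1, psrc + 4, sizeof(val1));  /* stands in for LW(psrc + 4) */
  return ((uint64_t)val1 << 32) | (uint64_t)val0;
}

int main(void) {
  const uint8_t buf[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  printf("0x%016llx\n", (unsigned long long)load_doubleword_scalar(buf));
  return 0;
}

On a little-endian target the result equals a plain 64-bit load of the same eight bytes.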
-
-/* Description : Load 4 words with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1, out2, out3
- Details : Load word in 'out0' from (psrc)
- Load word in 'out1' from (psrc + stride)
- Load word in 'out2' from (psrc + 2 * stride)
- Load word in 'out3' from (psrc + 3 * stride)
-*/
-#define LW4(psrc, stride, out0, out1, out2, out3) { \
- out0 = LW((psrc)); \
- out1 = LW((psrc) + stride); \
- out2 = LW((psrc) + 2 * stride); \
- out3 = LW((psrc) + 3 * stride); \
-}
-
-/* Description : Load double words with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Details : Load double word in 'out0' from (psrc)
- Load double word in 'out1' from (psrc + stride)
-*/
-#define LD2(psrc, stride, out0, out1) { \
- out0 = LD((psrc)); \
- out1 = LD((psrc) + stride); \
-}
-#define LD4(psrc, stride, out0, out1, out2, out3) { \
- LD2((psrc), stride, out0, out1); \
- LD2((psrc) + 2 * stride, stride, out2, out3); \
-}
-
-/* Description : Store 4 words with stride
- Arguments : Inputs - in0, in1, in2, in3, pdst, stride
- Details : Store word from 'in0' to (pdst)
- Store word from 'in1' to (pdst + stride)
- Store word from 'in2' to (pdst + 2 * stride)
- Store word from 'in3' to (pdst + 3 * stride)
-*/
-#define SW4(in0, in1, in2, in3, pdst, stride) { \
- SW(in0, (pdst)) \
- SW(in1, (pdst) + stride); \
- SW(in2, (pdst) + 2 * stride); \
- SW(in3, (pdst) + 3 * stride); \
-}
-
-/* Description : Store 4 double words with stride
- Arguments : Inputs - in0, in1, in2, in3, pdst, stride
- Details : Store double word from 'in0' to (pdst)
- Store double word from 'in1' to (pdst + stride)
- Store double word from 'in2' to (pdst + 2 * stride)
- Store double word from 'in3' to (pdst + 3 * stride)
-*/
-#define SD4(in0, in1, in2, in3, pdst, stride) { \
- SD(in0, (pdst)) \
- SD(in1, (pdst) + stride); \
- SD(in2, (pdst) + 2 * stride); \
- SD(in3, (pdst) + 3 * stride); \
-}
-
-/* Description : Load vectors with 16 byte elements with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Load 16 byte elements in 'out0' from (psrc)
- Load 16 byte elements in 'out1' from (psrc + stride)
-*/
-#define LD_B2(RTYPE, psrc, stride, out0, out1) { \
- out0 = LD_B(RTYPE, (psrc)); \
- out1 = LD_B(RTYPE, (psrc) + stride); \
-}
-#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
-#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
-
-#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
- LD_B2(RTYPE, (psrc), stride, out0, out1); \
- LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \
-}
-#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
-#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
-
-#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \
- LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- out4 = LD_B(RTYPE, (psrc) + 4 * stride); \
-}
-#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
-#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
-
-#define LD_B7(RTYPE, psrc, stride, \
- out0, out1, out2, out3, out4, out5, out6) { \
- LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
- LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
-}
-#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
-
-#define LD_B8(RTYPE, psrc, stride, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
-}
-#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
-#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
-
-/* Description : Load vectors with 8 halfword elements with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Details : Load 8 halfword elements in 'out0' from (psrc)
- Load 8 halfword elements in 'out1' from (psrc + stride)
-*/
-#define LD_H2(RTYPE, psrc, stride, out0, out1) { \
- out0 = LD_H(RTYPE, (psrc)); \
- out1 = LD_H(RTYPE, (psrc) + (stride)); \
-}
-#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
-
-#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
- LD_H2(RTYPE, (psrc), stride, out0, out1); \
- LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
-}
-#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
-
-#define LD_H8(RTYPE, psrc, stride, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
-}
-#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
-
-#define LD_H16(RTYPE, psrc, stride, \
- out0, out1, out2, out3, out4, out5, out6, out7, \
- out8, out9, out10, out11, out12, out13, out14, out15) { \
- LD_H8(RTYPE, (psrc), stride, \
- out0, out1, out2, out3, out4, out5, out6, out7); \
- LD_H8(RTYPE, (psrc) + 8 * stride, stride, \
- out8, out9, out10, out11, out12, out13, out14, out15); \
-}
-#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
-
-/* Description : Load 4x4 block of signed halfword elements from 1D source
- data into 4 vectors (Each vector with 4 signed halfwords)
- Arguments : Input - psrc
- Outputs - out0, out1, out2, out3
-*/
-#define LD4x4_SH(psrc, out0, out1, out2, out3) { \
- out0 = LD_SH(psrc); \
- out2 = LD_SH(psrc + 8); \
- out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
- out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
-}
-
-/* Description : Load 2 vectors of signed word elements with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Return Type - signed word
-*/
-#define LD_SW2(psrc, stride, out0, out1) { \
- out0 = LD_SW((psrc)); \
- out1 = LD_SW((psrc) + stride); \
-}
-
-/* Description : Store vectors of 16 byte elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 16 byte elements from 'in0' to (pdst)
- Store 16 byte elements from 'in1' to (pdst + stride)
-*/
-#define ST_B2(RTYPE, in0, in1, pdst, stride) { \
- ST_B(RTYPE, in0, (pdst)); \
- ST_B(RTYPE, in1, (pdst) + stride); \
-}
-#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
-
-#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
- ST_B2(RTYPE, in0, in1, (pdst), stride); \
- ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
-}
-#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
-
-#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- pdst, stride) { \
- ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
- ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
-}
-#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
-
-/* Description : Store vectors of 8 halfword elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 8 halfword elements from 'in0' to (pdst)
- Store 8 halfword elements from 'in1' to (pdst + stride)
-*/
-#define ST_H2(RTYPE, in0, in1, pdst, stride) { \
- ST_H(RTYPE, in0, (pdst)); \
- ST_H(RTYPE, in1, (pdst) + stride); \
-}
-#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
-
-#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
- ST_H2(RTYPE, in0, in1, (pdst), stride); \
- ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
-}
-#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
-
-#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \
- ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
- ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
-}
-#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
-
-/* Description : Store vectors of word elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 4 word elements from 'in0' to (pdst)
- Store 4 word elements from 'in1' to (pdst + stride)
-*/
-#define ST_SW2(in0, in1, pdst, stride) { \
- ST_SW(in0, (pdst)); \
- ST_SW(in1, (pdst) + stride); \
-}
-
-/* Description : Store 2x4 byte block to destination memory from input vector
- Arguments : Inputs - in, stidx, pdst, stride
- Details : Index 'stidx' halfword element from 'in' vector is copied to
- the GP register and stored to (pdst)
- Index 'stidx+1' halfword element from 'in' vector is copied to
- the GP register and stored to (pdst + stride)
- Index 'stidx+2' halfword element from 'in' vector is copied to
- the GP register and stored to (pdst + 2 * stride)
- Index 'stidx+3' halfword element from 'in' vector is copied to
- the GP register and stored to (pdst + 3 * stride)
-*/
-#define ST2x4_UB(in, stidx, pdst, stride) { \
- uint16_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \
- out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \
- out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \
- out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \
- \
- SH(out0_m, pblk_2x4_m); \
- SH(out1_m, pblk_2x4_m + stride); \
- SH(out2_m, pblk_2x4_m + 2 * stride); \
- SH(out3_m, pblk_2x4_m + 3 * stride); \
-}
-
-/* Description : Store 4x2 byte block to destination memory from input vector
- Arguments : Inputs - in, pdst, stride
- Details : Index 0 word element from 'in' vector is copied to the GP
- register and stored to (pdst)
- Index 1 word element from 'in' vector is copied to the GP
- register and stored to (pdst + stride)
-*/
-#define ST4x2_UB(in, pdst, stride) { \
- uint32_t out0_m, out1_m; \
- uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_w((v4i32)in, 0); \
- out1_m = __msa_copy_u_w((v4i32)in, 1); \
- \
- SW(out0_m, pblk_4x2_m); \
- SW(out1_m, pblk_4x2_m + stride); \
-}
-
-/* Description : Store 4x4 byte block to destination memory from input vector
- Arguments : Inputs - in0, in1, pdst, stride
- Details : 'Idx0' word element from input vector 'in0' is copied to the
- GP register and stored to (pdst)
- 'Idx1' word element from input vector 'in0' is copied to the
- GP register and stored to (pdst + stride)
- 'Idx2' word element from input vector 'in0' is copied to the
- GP register and stored to (pdst + 2 * stride)
- 'Idx3' word element from input vector 'in0' is copied to the
- GP register and stored to (pdst + 3 * stride)
-*/
-#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \
- uint32_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_w((v4i32)in0, idx0); \
- out1_m = __msa_copy_u_w((v4i32)in0, idx1); \
- out2_m = __msa_copy_u_w((v4i32)in1, idx2); \
- out3_m = __msa_copy_u_w((v4i32)in1, idx3); \
- \
- SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
-}
-#define ST4x8_UB(in0, in1, pdst, stride) { \
- uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
- \
- ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
- ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
-}
-
-/* Description : Store 8x1 byte block to destination memory from input vector
- Arguments : Inputs - in, pdst
- Details : Index 0 double word element from 'in' vector is copied to the
- GP register and stored to (pdst)
-*/
-#define ST8x1_UB(in, pdst) { \
- uint64_t out0_m; \
- \
- out0_m = __msa_copy_u_d((v2i64)in, 0); \
- SD(out0_m, pdst); \
-}
-
-/* Description : Store 8x2 byte block to destination memory from input vector
- Arguments : Inputs - in, pdst, stride
- Details : Index 0 double word element from 'in' vector is copied to the
- GP register and stored to (pdst)
- Index 1 double word element from 'in' vector is copied to the
- GP register and stored to (pdst + stride)
-*/
-#define ST8x2_UB(in, pdst, stride) { \
- uint64_t out0_m, out1_m; \
- uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_d((v2i64)in, 0); \
- out1_m = __msa_copy_u_d((v2i64)in, 1); \
- \
- SD(out0_m, pblk_8x2_m); \
- SD(out1_m, pblk_8x2_m + stride); \
-}
-
-/* Description : Store 8x4 byte block to destination memory from input
- vectors
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Index 0 double word element from 'in0' vector is copied to the
- GP register and stored to (pdst)
- Index 1 double word element from 'in0' vector is copied to the
- GP register and stored to (pdst + stride)
- Index 0 double word element from 'in1' vector is copied to the
- GP register and stored to (pdst + 2 * stride)
- Index 1 double word element from 'in1' vector is copied to the
- GP register and stored to (pdst + 3 * stride)
-*/
-#define ST8x4_UB(in0, in1, pdst, stride) { \
- uint64_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_d((v2i64)in0, 0); \
- out1_m = __msa_copy_u_d((v2i64)in0, 1); \
- out2_m = __msa_copy_u_d((v2i64)in1, 0); \
- out3_m = __msa_copy_u_d((v2i64)in1, 1); \
- \
- SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
-}
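
The ST2x4 / ST4x4 / ST8x4 helpers above all follow the same pattern: extract row-sized pieces from a vector into GP registers and store them to a strided destination. A rough scalar equivalent of the 4x4 case, with a hypothetical function name and memcpy standing in for the copy/SW steps:

#include <stdint.h>
#include <string.h>

/* Hypothetical scalar model of ST4x4_UB: four 32-bit rows taken from a
 * 16-byte "vector" are written to four consecutive lines of a strided
 * destination (word indices 0..3 assumed, as in the ST4x8_UB usage above). */
static void store_4x4_scalar(const uint8_t vec[16], uint8_t *dst, int stride) {
  int row;
  for (row = 0; row < 4; ++row) {
    memcpy(dst + row * stride, vec + 4 * row, 4);  /* one SW() per row */
  }
}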
-
-/* Description : Average with rounding: (in0 + in1 + 1) / 2.
- Arguments : Inputs - in0, in1, in2, in3,
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each unsigned byte element from 'in0' vector is added with
- each unsigned byte element from 'in1' vector. Then average
- with rounding is calculated and written to 'out0'
-*/
-#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \
- out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \
-}
-#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
-
-#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
- AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \
-}
-#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
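
Per element, AVER_UB2/AVER_UB4 compute the rounded unsigned average described above. A scalar sketch (hypothetical helper, widened so the +1 cannot overflow):

#include <stdint.h>

/* Rounded unsigned average, (a + b + 1) >> 1, computed in 16-bit arithmetic. */
static uint8_t aver_u8(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint16_t)a + (uint16_t)b + 1) >> 1);
}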
-
-/* Description : Immediate number of elements to slide with zero
- Arguments : Inputs - in0, in1, slide_val
- Outputs - out0, out1
- Return Type - as per RTYPE
-   Details     : Byte elements from the 'zero_m' vector are slid into 'in0' by
-                 the value specified in 'slide_val'
-*/
-#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \
- v16i8 zero_m = { 0 }; \
- out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
- out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
-}
-#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
-
-#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \
- out0, out1, out2, out3, slide_val) { \
- SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
- SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
-}
-#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
-
-/* Description : Immediate number of elements to slide
- Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
- Outputs - out0, out1
- Return Type - as per RTYPE
-   Details     : Byte elements from the 'in0_0' vector are slid into 'in1_0' by
-                 the value specified in 'slide_val'
-*/
-#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \
- out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \
- out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \
-}
-#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
-
-#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \
- out0, out1, out2, slide_val) { \
- SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
- out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \
-}
-#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
-#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
-
-/* Description : Shuffle byte vector elements as per mask vector
- Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Byte elements from 'in0' & 'in1' are copied selectively to
- 'out0' as per control vector 'mask0'
-*/
-#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \
- out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
- out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
-}
-#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
-#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
-#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
-
-#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \
- out0, out1, out2, out3) { \
- VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
- VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
-}
-#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
-#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
-
-/* Description : Dot product of byte vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Unsigned byte elements from 'mult0' are multiplied with
- unsigned byte elements from 'cnst0' producing a result
- twice the size of input i.e. unsigned halfword.
-                 The multiplication results of adjacent odd-even elements
- are added together and written to the 'out0' vector
-*/
-#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \
- out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \
-}
-#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
-
-#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \
- cnst0, cnst1, cnst2, cnst3, \
- out0, out1, out2, out3) { \
- DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
-}
-#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
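
The dot-product macros all share the shape described in the comments above: each adjacent (even, odd) pair of input elements is multiplied by the matching pair from the constant vector and the two products are summed into one double-width lane. A scalar sketch of the unsigned byte case (hypothetical helper; the store narrows modulo 2^16, mirroring an unsigned halfword lane):

#include <stdint.h>
#include <stddef.h>

/* Scalar model of one DOTP_UB2 output vector: 16 bytes in, 8 halfwords out. */
static void dotp_ub_scalar(const uint8_t mult[16], const uint8_t cnst[16],
                           uint16_t out[8]) {
  size_t i;
  for (i = 0; i < 8; ++i) {
    out[i] = (uint16_t)(mult[2 * i] * cnst[2 * i] +
                        mult[2 * i + 1] * cnst[2 * i + 1]);
  }
}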
-
-/* Description : Dot product of byte vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed byte elements from 'mult0' are multiplied with
- signed byte elements from 'cnst0' producing a result
- twice the size of input i.e. signed halfword.
-                 The multiplication results of adjacent odd-even elements
- are added together and written to the 'out0' vector
-*/
-#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \
- out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \
-}
-#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
-
-#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \
- cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \
- DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
-}
-#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
-
-/* Description : Dot product of halfword vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed halfword elements from 'mult0' are multiplied with
- signed halfword elements from 'cnst0' producing a result
- twice the size of input i.e. signed word.
-                 The multiplication results of adjacent odd-even elements
- are added together and written to the 'out0' vector
-*/
-#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \
- out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \
-}
-#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
-
-#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \
- cnst0, cnst1, cnst2, cnst3, \
- out0, out1, out2, out3) { \
- DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
-}
-#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
-
-/* Description : Dot product of word vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed word elements from 'mult0' are multiplied with
- signed word elements from 'cnst0' producing a result
- twice the size of input i.e. signed double word.
-                 The multiplication results of adjacent odd-even elements
- are added together and written to the 'out0' vector
-*/
-#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \
- out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \
-}
-#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
-
-/* Description : Dot product & addition of byte vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed byte elements from 'mult0' are multiplied with
- signed byte elements from 'cnst0' producing a result
- twice the size of input i.e. signed halfword.
-                 The multiplication results of adjacent odd-even elements
- are added to the 'out0' vector
-*/
-#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \
- out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \
-}
-#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
-
-#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \
- cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \
- DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
-}
-#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
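
DPADD_* differs from DOTP_* only in that the pairwise products are accumulated into the existing output lanes instead of overwriting them. A scalar sketch of the signed byte variant (hypothetical helper; lanes are assumed to wrap like 16-bit vector elements):

#include <stdint.h>
#include <stddef.h>

/* Scalar model of DPADD_SB2 on one vector: dot products of adjacent
 * (even, odd) signed byte pairs are added into the accumulator lanes. */
static void dpadd_sb_scalar(const int8_t mult[16], const int8_t cnst[16],
                            int16_t acc[8]) {
  size_t i;
  for (i = 0; i < 8; ++i) {
    acc[i] = (int16_t)(acc[i] + mult[2 * i] * cnst[2 * i] +
                       mult[2 * i + 1] * cnst[2 * i + 1]);
  }
}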
-
-/* Description : Dot product & addition of halfword vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed halfword elements from 'mult0' are multiplied with
- signed halfword elements from 'cnst0' producing a result
- twice the size of input i.e. signed word.
-                 The multiplication results of adjacent odd-even elements
- are added to the 'out0' vector
-*/
-#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
- out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
-}
-#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
-
-/* Description : Dot product & addition of double word vector elements
- Arguments : Inputs - mult0, mult1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each signed word element from 'mult0' is multiplied with itself
- producing an intermediate result twice the size of input
- i.e. signed double word
-                 The multiplication results of adjacent odd-even elements
- are added to the 'out0' vector
-*/
-#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) { \
- out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
- out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
-}
-#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
-
-/* Description : Minimum values between unsigned elements of
- either vector are copied to the output vector
- Arguments : Inputs - in0, in1, min_vec
- Outputs - in place operation
- Return Type - as per RTYPE
-   Details     : The minimum of the unsigned halfword element values from
-                 'in0' and 'min_vec' is written to output vector 'in0'
-*/
-#define MIN_UH2(RTYPE, in0, in1, min_vec) { \
- in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
- in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
-}
-#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
-
-#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) { \
- MIN_UH2(RTYPE, in0, in1, min_vec); \
- MIN_UH2(RTYPE, in2, in3, min_vec); \
-}
-#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
-
-/* Description : Clips all signed halfword elements of input vector
- between 0 & 255
- Arguments : Input - in
- Output - out_m
- Return Type - signed halfword
-*/
-#define CLIP_SH_0_255(in) ({ \
- v8i16 max_m = __msa_ldi_h(255); \
- v8i16 out_m; \
- \
- out_m = __msa_maxi_s_h((v8i16)in, 0); \
- out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
- out_m; \
-})
-#define CLIP_SH2_0_255(in0, in1) { \
- in0 = CLIP_SH_0_255(in0); \
- in1 = CLIP_SH_0_255(in1); \
-}
-#define CLIP_SH4_0_255(in0, in1, in2, in3) { \
- CLIP_SH2_0_255(in0, in1); \
- CLIP_SH2_0_255(in2, in3); \
-}
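
The clip helpers reduce, per lane, to an ordinary clamp into the 8-bit pixel range. A scalar sketch (hypothetical helper):

#include <stdint.h>

/* Clamp a signed halfword result into the unsigned 8-bit pixel range,
 * matching the maxi_s_h(.., 0) / min_s_h(.., 255) pair used above. */
static int16_t clip_0_255(int16_t v) {
  if (v < 0) return 0;
  if (v > 255) return 255;
  return v;
}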
-
-/* Description : Horizontal addition of 4 signed word elements of input vector
- Arguments : Input - in (signed word vector)
- Output - sum_m (i32 sum)
- Return Type - signed word (GP)
- Details : 4 signed word elements of 'in' vector are added together and
- the resulting integer sum is returned
-*/
-#define HADD_SW_S32(in) ({ \
- v2i64 res0_m, res1_m; \
- int32_t sum_m; \
- \
- res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
- res1_m = __msa_splati_d(res0_m, 1); \
- res0_m = res0_m + res1_m; \
- sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \
- sum_m; \
-})
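
HADD_SW_S32 is a horizontal reduction: the four word lanes are summed and the low 32 bits of the total are returned. A scalar sketch (hypothetical helper; the 64-bit intermediate mirrors the hadd_s_d step and the final cast mirrors copy_s_w):

#include <stdint.h>

static int32_t hadd_sw_s32_scalar(const int32_t v[4]) {
  const int64_t sum = (int64_t)v[0] + v[1] + v[2] + v[3];  /* widen first   */
  return (int32_t)sum;                                     /* keep low word */
}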
-
-/* Description : Horizontal addition of unsigned byte vector elements
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each unsigned odd byte element from 'in0' is added to
- even unsigned byte element from 'in0' (pairwise) and the
- halfword result is written to 'out0'
-*/
-#define HADD_UB2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
- out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
-}
-#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
-
-#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \
- HADD_UB2(RTYPE, in0, in1, out0, out1); \
- HADD_UB2(RTYPE, in2, in3, out2, out3); \
-}
-#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
-
-/* Description : Horizontal subtraction of unsigned byte vector elements
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each unsigned odd byte element from 'in0' is subtracted from
- even unsigned byte element from 'in0' (pairwise) and the
- halfword result is written to 'out0'
-*/
-#define HSUB_UB2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
- out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
-}
-#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
-
-/* Description : Horizontal subtraction of signed halfword vector elements
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each signed odd halfword element from 'in0' is subtracted from
- even signed halfword element from 'in0' (pairwise) and the
- word result is written to 'out0'
-*/
-#define HSUB_UH2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
- out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
-}
-#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
-
-/* Description : Set element n of input vector to GPR value
- Arguments : Inputs - in0, in1, in2, in3
- Output - out
- Return Type - as per RTYPE
- Details : Set element 0 in vector 'out' to value specified in 'in0'
-*/
-#define INSERT_W2(RTYPE, in0, in1, out) { \
- out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
-}
-#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
-
-#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \
- out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
-}
-#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
-#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
-
-#define INSERT_D2(RTYPE, in0, in1, out) { \
- out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
- out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
-}
-#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
-#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
-
-/* Description : Interleave even byte elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even byte elements of 'in0' and 'in1' are interleaved
- and written to 'out0'
-*/
-#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
- out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
-}
-#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
-#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
-
-/* Description : Interleave even halfword elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even halfword elements of 'in0' and 'in1' are interleaved
- and written to 'out0'
-*/
-#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
- out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
-}
-#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
-#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
-#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
-
-/* Description : Interleave even word elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even word elements of 'in0' and 'in1' are interleaved
- and written to 'out0'
-*/
-#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
- out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
-}
-#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
-
-/* Description : Interleave even double word elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even double word elements of 'in0' and 'in1' are interleaved
- and written to 'out0'
-*/
-#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
- out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
-}
-#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
-
-/* Description : Interleave left half of byte elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Left half of byte elements of 'in0' and 'in1' are interleaved
- and written to 'out0'.
-*/
-#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
-}
-#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
-#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
-#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
-#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
-
-#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
-#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
-
-/* Description : Interleave left half of halfword elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Left half of halfword elements of 'in0' and 'in1' are
- interleaved and written to 'out0'.
-*/
-#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
-}
-#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
-
-/* Description : Interleave left half of word elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Left half of word elements of 'in0' and 'in1' are interleaved
- and written to 'out0'.
-*/
-#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
- out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
-}
-#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
-#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
-
-/* Description : Interleave right half of byte elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of byte elements of 'in0' and 'in1' are interleaved
- and written to out0.
-*/
-#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
-}
-#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
-#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
-#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
-#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
-
-#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
-#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
-#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
-#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
-
-#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- in8, in9, in10, in11, in12, in13, in14, in15, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3); \
- ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \
- out4, out5, out6, out7); \
-}
-#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
-
-/* Description : Interleave right half of halfword elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of halfword elements of 'in0' and 'in1' are
- interleaved and written to 'out0'.
-*/
-#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
-}
-#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
-
-#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
-
-#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
- out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
-}
-#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
-#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
-
-#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
-
-/* Description : Interleave right half of double word elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of double word elements of 'in0' and 'in1' are
- interleaved and written to 'out0'.
-*/
-#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
- out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
-}
-#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
-#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
-#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
-
-#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) { \
- ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
- out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \
-}
-#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
-
-#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
-#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
-
-/* Description : Interleave both left and right half of input vectors
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of byte elements from 'in0' and 'in1' are
- interleaved and written to 'out0'
-*/
-#define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
-}
-#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
-#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
-#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
-#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
-
-#define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
-}
-#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
-#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
-
-#define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
- out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
-}
-#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
-#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
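
The interleave families above differ only in element width and in whether they take the right (low) or left (high) halves of the sources. As an orientation aid, here is a scalar sketch of ILVRL_B2 on byte elements; 'a' and 'b' stand for the first and second macro operands, and the lane order shown (second operand supplies the even output lanes) is my reading of the MSA interleave instructions, so treat it as an assumption rather than a specification:

#include <stdint.h>
#include <stddef.h>

/* Assumed scalar model of ILVRL_B2(RTYPE, a, b, out_r, out_l). */
static void ilvrl_b_scalar(const uint8_t a[16], const uint8_t b[16],
                           uint8_t out_r[16], uint8_t out_l[16]) {
  size_t i;
  for (i = 0; i < 8; ++i) {
    out_r[2 * i] = b[i];          /* right interleave weaves the low halves */
    out_r[2 * i + 1] = a[i];
    out_l[2 * i] = b[8 + i];      /* left interleave weaves the high halves */
    out_l[2 * i + 1] = a[8 + i];
  }
}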
-
-/* Description : Saturate the halfword element values to the max
- unsigned value of (sat_val + 1) bits
- The element data width remains unchanged
- Arguments : Inputs - in0, in1, sat_val
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Each unsigned halfword element from 'in0' is saturated to the
- value generated with (sat_val + 1) bit range.
- The results are written in place
-*/
-#define SAT_UH2(RTYPE, in0, in1, sat_val) { \
- in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
- in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
-}
-#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
-
-#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) { \
- SAT_UH2(RTYPE, in0, in1, sat_val); \
- SAT_UH2(RTYPE, in2, in3, sat_val) \
-}
-#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
-
-/* Description : Saturate the halfword element values to the max
-                 signed value of (sat_val + 1) bits
- The element data width remains unchanged
- Arguments : Inputs - in0, in1, sat_val
- Outputs - in place operation
- Return Type - as per RTYPE
-   Details     : Each signed halfword element from 'in0' is saturated to the
- value generated with (sat_val + 1) bit range
- The results are written in place
-*/
-#define SAT_SH2(RTYPE, in0, in1, sat_val) { \
- in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
- in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
-}
-#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
-
-#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) { \
- SAT_SH2(RTYPE, in0, in1, sat_val); \
- SAT_SH2(RTYPE, in2, in3, sat_val); \
-}
-#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Indexed halfword element values are replicated to all
- elements in output vector
- Arguments : Inputs - in, idx0, idx1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : 'idx0' element value from 'in' vector is replicated to all
- elements in 'out0' vector
- Valid index range for halfword operation is 0-7
-*/
-#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) { \
- out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \
- out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \
-}
-#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
-
-#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
- out0, out1, out2, out3) { \
- SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
- SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
-}
-#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
-#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even byte elements of vector pairs
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even byte elements of 'in0' are copied to the left half of
- 'out0' & even byte elements of 'in1' are copied to the right
- half of 'out0'.
-*/
-#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
-}
-#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
-#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
-#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
-
-#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
- PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
-#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
-#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even halfword elements of vector pairs
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even halfword elements of 'in0' are copied to the left half of
- 'out0' & even halfword elements of 'in1' are copied to the
- right half of 'out0'.
-*/
-#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
-}
-#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
-#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
-
-#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
- PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even double word elements of vector pairs
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even double elements of 'in0' are copied to the left half of
- 'out0' & even double elements of 'in1' are copied to the right
- half of 'out0'.
-*/
-#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
- out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
-}
-#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
-#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
-
-#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
- PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
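
Pack-even is the usual way these kernels narrow double-width results back to pixels: keep every even-indexed element of each source and concatenate the two halves. A scalar sketch for bytes, following the operand placement stated in the PCKEV_B comment above (hypothetical helper):

#include <stdint.h>
#include <stddef.h>

/* Even bytes of 'b' fill the right (low) half of the result and even bytes
 * of 'a' fill the left (high) half, as described for PCKEV_B2(.., a, b, ..). */
static void pckev_b_scalar(const uint8_t a[16], const uint8_t b[16],
                           uint8_t out[16]) {
  size_t i;
  for (i = 0; i < 8; ++i) {
    out[i] = b[2 * i];
    out[8 + i] = a[2 * i];
  }
}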
-
-/* Description : Each byte element is logically xor'ed with immediate 128
- Arguments : Inputs - in0, in1
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Each unsigned byte element from input vector 'in0' is
- logically xor'ed with 128 and the result is stored in-place.
-*/
-#define XORI_B2_128(RTYPE, in0, in1) { \
- in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
- in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
-}
-#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
-#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
-
-#define XORI_B3_128(RTYPE, in0, in1, in2) { \
- XORI_B2_128(RTYPE, in0, in1); \
- in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
-}
-#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
-
-#define XORI_B4_128(RTYPE, in0, in1, in2, in3) { \
- XORI_B2_128(RTYPE, in0, in1); \
- XORI_B2_128(RTYPE, in2, in3); \
-}
-#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
-#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
-
-#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) { \
- XORI_B4_128(RTYPE, in0, in1, in2, in3); \
- XORI_B3_128(RTYPE, in4, in5, in6); \
-}
-#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
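
A short note on why the XORI_B*_128 helpers exist (my reading, not stated in the header): XOR with 0x80 converts an unsigned pixel in [0, 255] to the two's-complement byte for pixel - 128, which lets unsigned pixel data be fed through the signed dot-product macros and converted back with the same operation:

#include <stdint.h>

/* pixel -> bias-shifted signed byte and back; both directions are the same
 * XOR. Two's-complement behaviour is assumed for the int8_t conversion. */
static int8_t pixel_to_signed(uint8_t px) { return (int8_t)(px ^ 0x80); }
static uint8_t signed_to_pixel(int8_t v) { return (uint8_t)(v ^ 0x80); }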
-
-/* Description : Average of signed halfword elements -> (a + b) / 2
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3
- Return Type - as per RTYPE
- Details : Each signed halfword element from 'in0' is added to each
- signed halfword element of 'in1' with full precision resulting
- in one extra bit in the result. The result is then divided by
- 2 and written to 'out0'
-*/
-#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \
- out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \
- out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \
-}
-#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Addition of signed halfword elements and signed saturation
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed halfword elements from 'in0' are added to signed
- halfword elements of 'in1'. The result is then signed saturated
- between halfword data type range
-*/
-#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
-}
-#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
-
-#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Shift left all elements of vector (generic for all data types)
- Arguments : Inputs - in0, in1, in2, in3, shift
- Outputs - in place operation
- Return Type - as per input vector RTYPE
- Details : Each element of vector 'in0' is left shifted by 'shift' and
- the result is written in-place.
-*/
-#define SLLI_4V(in0, in1, in2, in3, shift) { \
- in0 = in0 << shift; \
- in1 = in1 << shift; \
- in2 = in2 << shift; \
- in3 = in3 << shift; \
-}
-
-/* Description : Arithmetic shift right all elements of vector
- (generic for all data types)
- Arguments : Inputs - in0, in1, in2, in3, shift
- Outputs - in place operation
- Return Type - as per input vector RTYPE
- Details : Each element of vector 'in0' is right shifted by 'shift' and
- the result is written in-place. 'shift' is a GP variable.
-*/
-#define SRA_4V(in0, in1, in2, in3, shift) { \
- in0 = in0 >> shift; \
- in1 = in1 >> shift; \
- in2 = in2 >> shift; \
- in3 = in3 >> shift; \
-}
-
-/* Description : Shift right arithmetic rounded words
- Arguments : Inputs - in0, in1, shift
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Each element of vector 'in0' is shifted right arithmetically by
- the number of bits in the corresponding element in the vector
- 'shift'. The last discarded bit is added to shifted value for
- rounding and the result is written in-place.
- 'shift' is a vector.
-*/
-#define SRAR_W2(RTYPE, in0, in1, shift) { \
- in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
- in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
-}
-
-#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) { \
- SRAR_W2(RTYPE, in0, in1, shift) \
- SRAR_W2(RTYPE, in2, in3, shift) \
-}
-#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
-
-/* Description : Shift right arithmetic rounded (immediate)
- Arguments : Inputs - in0, in1, shift
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Each element of vector 'in0' is shifted right arithmetically by
- the value in 'shift'. The last discarded bit is added to the
- shifted value for rounding and the result is written in-place.
- 'shift' is an immediate value.
-*/
-#define SRARI_H2(RTYPE, in0, in1, shift) { \
- in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
- in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
-}
-#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
-#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
-
-#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) { \
- SRARI_H2(RTYPE, in0, in1, shift); \
- SRARI_H2(RTYPE, in2, in3, shift); \
-}
-#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
-#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
-
-#define SRARI_W2(RTYPE, in0, in1, shift) { \
- in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
- in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
-}
-#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
-
-#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \
- SRARI_W2(RTYPE, in0, in1, shift); \
- SRARI_W2(RTYPE, in2, in3, shift); \
-}
-#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
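
The "shift right arithmetic rounded" operations above follow the rounding rule in the comments: the last bit shifted out is added back. Per element that is equivalent to adding half of the divisor before the shift, as in this scalar sketch (hypothetical helper; the 64-bit intermediate only guards the rounding add against overflow):

#include <stdint.h>

static int32_t srari_w_scalar(int32_t x, int shift) {
  if (shift == 0) return x;  /* nothing is shifted out, so no rounding */
  return (int32_t)(((int64_t)x + ((int64_t)1 << (shift - 1))) >> shift);
}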
-
-/* Description : Logical shift right all elements of vector (immediate)
- Arguments : Inputs - in0, in1, in2, in3, shift
- Outputs - out0, out1, out2, out3
- Return Type - as per RTYPE
-   Details     : Each element of vector 'in0' is right shifted by 'shift' and
-                 the result is written to 'out0'. 'shift' is an immediate value.
-*/
-#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) { \
- out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \
- out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \
- out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \
- out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \
-}
-#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
-
-/* Description : Multiplication of pairs of vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Details : Each element from 'in0' is multiplied with elements from 'in1'
- and the result is written to 'out0'
-*/
-#define MUL2(in0, in1, in2, in3, out0, out1) { \
- out0 = in0 * in1; \
- out1 = in2 * in3; \
-}
-#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- MUL2(in0, in1, in2, in3, out0, out1); \
- MUL2(in4, in5, in6, in7, out2, out3); \
-}
-
-/* Description : Addition of 2 pairs of vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Details : Each element in 'in0' is added to 'in1' and result is written
- to 'out0'.
-*/
-#define ADD2(in0, in1, in2, in3, out0, out1) { \
- out0 = in0 + in1; \
- out1 = in2 + in3; \
-}
-#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ADD2(in0, in1, in2, in3, out0, out1); \
- ADD2(in4, in5, in6, in7, out2, out3); \
-}
-
-/* Description : Subtraction of 2 pairs of vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Details : Each element in 'in1' is subtracted from 'in0' and result is
- written to 'out0'.
-*/
-#define SUB2(in0, in1, in2, in3, out0, out1) { \
- out0 = in0 - in1; \
- out1 = in2 - in3; \
-}
-#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- out0 = in0 - in1; \
- out1 = in2 - in3; \
- out2 = in4 - in5; \
- out3 = in6 - in7; \
-}
-
-/* Description : Sign extend halfword elements from right half of the vector
- Arguments : Input - in (halfword vector)
- Output - out (sign extended word vector)
- Return Type - signed word
-   Details     : Sign bit of halfword elements from input vector 'in' is
-                 extracted and interleaved with the same vector 'in' to
-                 generate 4 word elements keeping sign intact
-*/
-#define UNPCK_R_SH_SW(in, out) { \
- v8i16 sign_m; \
- \
- sign_m = __msa_clti_s_h((v8i16)in, 0); \
- out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
-}
-
-/* Description : Zero extend unsigned byte elements to halfword elements
- Arguments : Input - in (unsigned byte vector)
- Outputs - out0, out1 (unsigned halfword vectors)
- Return Type - signed halfword
- Details : Zero extended right half of vector is returned in 'out0'
- Zero extended left half of vector is returned in 'out1'
-*/
-#define UNPCK_UB_SH(in, out0, out1) { \
- v16i8 zero_m = { 0 }; \
- \
- ILVRL_B2_SH(zero_m, in, out0, out1); \
-}
-
-/* Description : Sign extend halfword elements from input vector and return
- the result in pair of vectors
- Arguments : Input - in (halfword vector)
- Outputs - out0, out1 (sign extended word vectors)
- Return Type - signed word
-   Details     : Sign bit of halfword elements from input vector 'in' is
-                 extracted and interleaved right with the same vector 'in' to
-                 generate 4 signed word elements in 'out0',
-                 then interleaved left with the same vector 'in' to
-                 generate 4 signed word elements in 'out1'
-*/
-#define UNPCK_SH_SW(in, out0, out1) { \
- v8i16 tmp_m; \
- \
- tmp_m = __msa_clti_s_h((v8i16)in, 0); \
- ILVRL_H2_SW(tmp_m, in, out0, out1); \
-}
-
-/* Description : Butterfly of 4 input vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1, out2, out3
- Details : Butterfly operation
-*/
-#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \
- out0 = in0 + in3; \
- out1 = in1 + in2; \
- \
- out2 = in1 - in2; \
- out3 = in0 - in3; \
-}
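
Seen per lane, the butterfly macros are the standard add/subtract stage of the transforms: the outermost inputs pair with each other and the innermost inputs pair with each other. A scalar sketch of the 4-input case (hypothetical helper):

#include <stdint.h>

static void butterfly4_scalar(int32_t in0, int32_t in1, int32_t in2,
                              int32_t in3, int32_t out[4]) {
  out[0] = in0 + in3;  /* sums of the (outer, outer) and (inner, inner) pairs */
  out[1] = in1 + in2;
  out[2] = in1 - in2;  /* matching differences, in reverse order */
  out[3] = in0 - in3;
}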
-
-/* Description : Butterfly of 8 input vectors
- Arguments : Inputs - in0 ... in7
- Outputs - out0 .. out7
- Details : Butterfly operation
-*/
-#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- out0 = in0 + in7; \
- out1 = in1 + in6; \
- out2 = in2 + in5; \
- out3 = in3 + in4; \
- \
- out4 = in3 - in4; \
- out5 = in2 - in5; \
- out6 = in1 - in6; \
- out7 = in0 - in7; \
-}
-
-/* Description : Butterfly of 16 input vectors
- Arguments : Inputs - in0 ... in15
- Outputs - out0 .. out15
- Details : Butterfly operation
-*/
-#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \
- in8, in9, in10, in11, in12, in13, in14, in15, \
- out0, out1, out2, out3, out4, out5, out6, out7, \
- out8, out9, out10, out11, out12, out13, out14, out15) { \
- out0 = in0 + in15; \
- out1 = in1 + in14; \
- out2 = in2 + in13; \
- out3 = in3 + in12; \
- out4 = in4 + in11; \
- out5 = in5 + in10; \
- out6 = in6 + in9; \
- out7 = in7 + in8; \
- \
- out8 = in7 - in8; \
- out9 = in6 - in9; \
- out10 = in5 - in10; \
- out11 = in4 - in11; \
- out12 = in3 - in12; \
- out13 = in2 - in13; \
- out14 = in1 - in14; \
- out15 = in0 - in15; \
-}
-
-/* Description : Transpose input 8x8 byte block
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - as per RTYPE
-*/
-#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
- \
- ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
- tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
- ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
- ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
- ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
- SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
- SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
-}
-#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
-
-/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
- in8, in9, in10, in11, in12, in13, in14, in15
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - unsigned byte
-*/
-#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
- in8, in9, in10, in11, in12, in13, in14, in15, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
- \
- ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
- ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
- ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
- ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
- \
- tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
- tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
- tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
- tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
- out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
- tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
- out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
- tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
- \
- ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
- out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- \
- tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
- tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
- out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- \
- ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
- out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- \
- tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
- tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
- out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
-}
-
-/* Description : Transpose 4x4 block with half word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1, out2, out3
- Return Type - signed halfword
-*/
-#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \
- v8i16 s0_m, s1_m; \
- \
- ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
- ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
- out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
- out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \
-}
-
-/* Description : Transpose 4x8 block with half word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - signed halfword
-*/
-#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
- v8i16 zero_m = { 0 }; \
- \
- ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
- tmp0_n, tmp1_n, tmp2_n, tmp3_n); \
- ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \
- ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \
- \
- out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
- out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
- out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
- out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
- \
- out4 = zero_m; \
- out5 = zero_m; \
- out6 = zero_m; \
- out7 = zero_m; \
-}
-
-/* Description : Transpose 8x4 block with half word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - signed halfword
-*/
-#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \
- v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- \
- ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \
- ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \
- ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
- ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
-}
-
-/* Description : Transpose 8x8 block with half word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - as per RTYPE
-*/
-#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v8i16 s0_m, s1_m; \
- v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
- \
- ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
- ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
- ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
- ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
- PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \
- tmp3_m, tmp7_m, out0, out2, out4, out6); \
- out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
- out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
- out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
- out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
-}
-#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
-
-/* Description : Transpose 4x4 block with word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1, out2, out3
- Return Type - signed word
-*/
-#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) { \
- v4i32 s0_m, s1_m, s2_m, s3_m; \
- \
- ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
- ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
- \
- out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \
- out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \
- out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \
- out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \
-}
-
-/* Description : Add block 4x4
- Arguments : Inputs - in0, in1, in2, in3, pdst, stride
- Details : Least significant 4 bytes from each input vector are added to
- the destination bytes, clipped between 0-255 and stored.
-*/
-#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \
- uint32_t src0_m, src1_m, src2_m, src3_m; \
- v8i16 inp0_m, inp1_m, res0_m, res1_m; \
- v16i8 dst0_m = { 0 }; \
- v16i8 dst1_m = { 0 }; \
- v16i8 zero_m = { 0 }; \
- \
- ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \
- LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
- INSERT_W2_SB(src0_m, src1_m, dst0_m); \
- INSERT_W2_SB(src2_m, src3_m, dst1_m); \
- ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
- ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
- CLIP_SH2_0_255(res0_m, res1_m); \
- PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
- ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \
-}
-
-/* Description : Pack even elements of input vectors & xor with 128
- Arguments : Inputs - in0, in1
- Output - out_m
- Return Type - unsigned byte
- Details : Signed byte even elements from 'in0' and 'in1' are packed
- together in one vector and the resulting vector is xor'ed with
- 128 to shift the range from signed to unsigned byte
-*/
-#define PCKEV_XORI128_UB(in0, in1) ({ \
- v16u8 out_m; \
- \
- out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
- out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \
- out_m; \
-})
-
-/* Description : Converts inputs to unsigned bytes, interleave, average & store
- as 8x4 unsigned byte block
- Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
- pdst, stride
-*/
-#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \
- dst0, dst1, dst2, dst3, pdst, stride) { \
- v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- \
- tmp0_m = PCKEV_XORI128_UB(in0, in1); \
- tmp1_m = PCKEV_XORI128_UB(in2, in3); \
- ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
- AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
-}
-
-/* Description : Pack even byte elements and store byte vector in destination
- memory
- Arguments : Inputs - in0, in1, pdst
-*/
-#define PCKEV_ST_SB(in0, in1, pdst) { \
- v16i8 tmp_m; \
- \
- tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
- ST_SB(tmp_m, (pdst)); \
-}
-
-/* Description : Horizontal 2 tap filter kernel code
- Arguments : Inputs - in0, in1, mask, coeff, shift
-*/
-#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({ \
- v16i8 tmp0_m; \
- v8u16 tmp1_m; \
- \
- tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
- tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \
- tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \
- tmp1_m = __msa_sat_u_h(tmp1_m, shift); \
- \
- tmp1_m; \
-})
-#endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */
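Note: the MSA macros deleted above are not lost; equivalent definitions now live in vpx_dsp/mips/macros_msa.h, which the hunks below include in their place. For orientation, a scalar sketch of what two of the relocated macros compute per lane (illustrative only; butterfly_8 and horiz_2tap_px are hypothetical names, and the 255 clamp assumes FILTER_BITS == 7 as used by the callers):

    /* Scalar model of BUTTERFLY_8: pairwise sums in the first half of the
     * outputs, mirrored differences in the second half. */
    static void butterfly_8(const int in[8], int out[8]) {
      int i;
      for (i = 0; i < 4; ++i) {
        out[i]     = in[i] + in[7 - i];
        out[7 - i] = in[i] - in[7 - i];
      }
    }

    /* Scalar model of HORIZ_2TAP_FILT_UH: 2-tap dot product, rounding right
     * shift by FILTER_BITS, then unsigned saturation of the result. */
    static unsigned int horiz_2tap_px(unsigned int a, unsigned int b,
                                      unsigned int f0, unsigned int f1) {
      const unsigned int v = (a * f0 + b * f1 + (1u << 6)) >> 7;
      return v > 255 ? 255 : v;
    }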
--- a/vp9/common/mips/msa/vp9_mfqe_msa.c
+++ b/vp9/common/mips/msa/vp9_mfqe_msa.c
@@ -10,7 +10,7 @@
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
uint8_t *dst_ptr, int32_t dst_stride,
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -802,88 +802,6 @@
#
if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
-
-# variance
-add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x16 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance8x8 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
-
-# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
-add_proto qw/unsigned int vp9_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
-#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
-
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_avg_8x8 sse2 neon msa/;
@@ -1084,241 +1002,6 @@
specialize qw/vp9_temporal_filter_apply sse2 msa/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance4x4/;
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance4x4/;
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance4x4/;
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
-
# ENCODEMB INVOKE
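With these prototypes removed from vp9_rtcd_defs.pl, the encoder is expected to reach the sub-pixel variance kernels through the vpx_dsp RTCD layer instead. A minimal sketch of the implied call-site change, assuming the vpx_sub_pixel_variance16x16 entry point dispatched by vpx_dsp_rtcd (the wrapper name below is hypothetical; the argument list is the same as the deleted vp9_ prototype):

    #include "./vpx_dsp_rtcd.h"
    #include "vpx/vpx_integer.h"

    /* Hypothetical helper showing the rename only; no behavior changes. */
    static unsigned int subpel_var_16x16(const uint8_t *src, int src_stride,
                                         int xoffset, int yoffset,
                                         const uint8_t *ref, int ref_stride) {
      unsigned int sse;
      /* was: vp9_sub_pixel_variance16x16(...) */
      return vpx_sub_pixel_variance16x16(src, src_stride, xoffset, yoffset,
                                         ref, ref_stride, &sse);
    }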
--- a/vp9/encoder/arm/neon/vp9_variance_neon.c
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include "./vp9_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "./vpx_config.h"
-
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
-
-#include "vp9/common/vp9_filter.h"
-
-static const uint8_t bilinear_filters[8][2] = {
- { 128, 0, },
- { 112, 16, },
- { 96, 32, },
- { 80, 48, },
- { 64, 64, },
- { 48, 80, },
- { 32, 96, },
- { 16, 112, },
-};
-
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *vp9_filter) {
- const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]);
- const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]);
- unsigned int i;
- for (i = 0; i < output_height; ++i) {
- const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
- const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
- const uint16x8_t a = vmull_u8(src_0, f0);
- const uint16x8_t b = vmlal_u8(a, src_1, f1);
- const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
- vst1_u8(&output_ptr[0], out);
- // Next row...
- src_ptr += src_pixels_per_line;
- output_ptr += output_width;
- }
-}
-
-static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *vp9_filter) {
- const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]);
- const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]);
- unsigned int i, j;
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; j += 16) {
- const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
- const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
- const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
- const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
- const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
- const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
- const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
- const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
- vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
- }
- // Next row...
- src_ptr += src_pixels_per_line;
- output_ptr += output_width;
- }
-}
-
-unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
- int src_stride,
- int xoffset,
- int yoffset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
-
- var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
- 9, 8,
- bilinear_filters[xoffset]);
- var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
- 8, bilinear_filters[yoffset]);
- return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
-}
-
-unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
- int src_stride,
- int xoffset,
- int yoffset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
-
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
- 17, 16,
- bilinear_filters[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
- 16, bilinear_filters[yoffset]);
- return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
-}
-
-unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
- int src_stride,
- int xoffset,
- int yoffset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
-
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
- 33, 32,
- bilinear_filters[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
- 32, bilinear_filters[yoffset]);
- return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
-}
-
-unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
- int src_stride,
- int xoffset,
- int yoffset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
-
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
- 65, 64,
- bilinear_filters[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
- 64, bilinear_filters[yoffset]);
- return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
-}
--- a/vp9/encoder/mips/msa/vp9_avg_msa.c
+++ b/vp9/encoder/mips/msa/vp9_avg_msa.c
@@ -9,7 +9,7 @@
*/
#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
uint32_t sum_out;
--- a/vp9/encoder/mips/msa/vp9_error_msa.c
+++ b/vp9/encoder/mips/msa/vp9_error_msa.c
@@ -9,7 +9,7 @@
*/
#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \
static int64_t block_error_##BSize##size_msa(const int16_t *coeff_ptr, \
--- a/vp9/encoder/mips/msa/vp9_fdct_msa.h
+++ b/vp9/encoder/mips/msa/vp9_fdct_msa.h
@@ -13,7 +13,7 @@
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_idct.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
v8i16 k0_m = __msa_fill_h(cnst0); \
--- a/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
+++ b/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
@@ -9,7 +9,7 @@
*/
#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr,
uint32_t stride,
--- a/vp9/encoder/mips/msa/vp9_variance_msa.c
+++ /dev/null
@@ -1,768 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_ports/mem.h"
-#include "vp9/common/vp9_filter.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
-
-static const uint8_t bilinear_filters[8][2] = {
- { 128, 0, },
- { 112, 16, },
- { 96, 32, },
- { 80, 48, },
- { 64, 64, },
- { 48, 80, },
- { 32, 96, },
- { 16, 112, },
-};
-
-#define CALC_MSE_AVG_B(src, ref, var, sub) { \
- v16u8 src_l0_m, src_l1_m; \
- v8i16 res_l0_m, res_l1_m; \
- \
- ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
- HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
- DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
- \
- sub += res_l0_m + res_l1_m; \
-}
-
-#define VARIANCE_WxH(sse, diff, shift) \
- sse - (((uint32_t)diff * diff) >> shift)
-
-#define VARIANCE_LARGE_WxH(sse, diff, shift) \
- sse - (((int64_t)diff * diff) >> shift)
-
-static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 filt0, ref = { 0 };
- v16i8 src0, src1, src2, src3;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3;
- v8u16 const255;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- vec0, vec1, vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
- src0, src1, src2, src3);
- ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
- src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
- CALC_MSE_AVG_B(src0, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 filt0, out, ref0, ref1, ref2, ref3;
- v16i8 src0, src1, src2, src3;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3, const255;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- vec0, vec1, vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
- src0, src1, src2, src3);
- out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
- CALC_MSE_AVG_B(out, ref0, var, avg);
- out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
- CALC_MSE_AVG_B(out, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v16u8 dst0, dst1, dst2, dst3, filt0;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
- v8u16 const255;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- dst += (4 * dst_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- out0, out1, out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
- out4, out5, out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- MIN_UH4_UH(out0, out1, out2, out3, const255);
- MIN_UH4_UH(out4, out5, out6, out7, const255);
- PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6,
- src0, src1, src2, src3);
- CALC_MSE_AVG_B(src0, dst0, var, avg);
- CALC_MSE_AVG_B(src1, dst1, var, avg);
- CALC_MSE_AVG_B(src2, dst2, var, avg);
- CALC_MSE_AVG_B(src3, dst3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4, out;
- v16u8 src10_r, src32_r, src21_r, src43_r;
- v16u8 ref = { 0 };
- v16u8 src2110, src4332;
- v16u8 filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
- v8u16 tmp0, tmp1;
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
- src10_r, src21_r, src32_r, src43_r);
- ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
- DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- CALC_MSE_AVG_B(out, ref, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 ref0, ref1, ref2, ref3;
- v8u16 vec0, vec1, vec2, vec3;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
- vec0, vec1, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- tmp0, tmp1, tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out0, out1, out2, out3;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
- src0 = src4;
-
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- CALC_MSE_AVG_B(out2, ref2, var, avg);
- CALC_MSE_AVG_B(out3, ref3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
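
The 32- and 64-wide wrappers reuse the 16-wide kernel column by column, summing the per-column SSE and difference totals. A generalized sketch of that split; the kernel typedef is simplified (the real kernels also take the filter pointer) and the names are hypothetical:

#include <stdint.h>

typedef uint32_t (*col16_fn)(const uint8_t *src, int src_stride,
                             const uint8_t *dst, int dst_stride,
                             int height, int32_t *diff);

static uint32_t wide_sse_diff(const uint8_t *src, int src_stride,
                              const uint8_t *dst, int dst_stride,
                              int ncols16, int height, int32_t *diff,
                              col16_fn kernel) {
  uint32_t sse = 0;
  int32_t sum = 0;
  int i;
  for (i = 0; i < ncols16; ++i) {
    int32_t col_diff;
    sse += kernel(src + 16 * i, src_stride, dst + 16 * i, dst_stride,
                  height, &col_diff);
    sum += col_diff;  /* per-column sums are combined, as in diff0[] above */
  }
  *diff = sum;
  return sse;
}
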
-
-static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter_horiz,
- const uint8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out, ref = { 0 };
- v16u8 filt_vt, filt_hz, vec0, vec1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
- v8u16 tmp0, tmp1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
- hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- CALC_MSE_AVG_B(out, ref, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter_horiz,
- const uint8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out0, out1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 hz_out0, hz_out1;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt_vt, filt_hz, vec0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp0 = __msa_dotp_u_h(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp1 = __msa_dotp_u_h(vec0, filt_vt);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp2 = __msa_dotp_u_h(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp3 = __msa_dotp_u_h(vec0, filt_vt);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter_horiz,
- const uint8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 filt_hz, filt_vt, vec0, vec1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
- v8u16 tmp0, tmp1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- LD_UB2(src, 8, src0, src1);
- src += src_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src0, src2, src4, src6);
- LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- CALC_MSE_AVG_B(src2, ref2, var, avg);
- CALC_MSE_AVG_B(src3, ref3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter_horiz,
- const uint8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height,
- &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter_horiz,
- const uint8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height,
- &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
-#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
-#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
-
-#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
-#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
-
-#define VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
-uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
- int32_t src_stride, \
- int32_t xoffset, \
- int32_t yoffset, \
- const uint8_t *ref, \
- int32_t ref_stride, \
- uint32_t *sse) { \
- int32_t diff; \
- uint32_t var; \
- const uint8_t *h_filter = bilinear_filters[xoffset]; \
- const uint8_t *v_filter = bilinear_filters[yoffset]; \
- \
- if (yoffset) { \
- if (xoffset) { \
- *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride, \
- ref, ref_stride, \
- h_filter, v_filter, \
- ht, &diff); \
- } else { \
- *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride, \
- ref, ref_stride, \
- v_filter, ht, &diff); \
- } \
- \
- var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
- } else { \
- if (xoffset) { \
- *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride, \
- ref, ref_stride, \
- h_filter, ht, &diff); \
- \
- var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
- } else { \
- var = vpx_variance##wd##x##ht##_msa(src, src_stride, \
- ref, ref_stride, sse); \
- } \
- } \
- \
- return var; \
-}
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
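
The VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA wrapper dispatches on the offsets: both zero falls through to vpx_variance##wd##x##ht##_msa, a single non-zero offset picks the h- or v-only kernel, and both non-zero picks the hv kernel; the VARIANCE_*Wx*H macros then fold the kernel's (sse, diff) pair into a variance. Assuming their third argument is log2(width * height), the fold is the usual var = sse - diff^2 / N, sketched here with a hypothetical helper:

#include <stdint.h>

/* shift == log2(width * height); e.g. 8 for 16x16, 12 for 64x64. */
static uint32_t variance_from_sse_diff(uint32_t sse, int32_t diff, int shift) {
  /* 64-bit product: for 64x64 blocks diff can reach 64 * 64 * 255. */
  return (uint32_t)(sse - (uint32_t)(((int64_t)diff * diff) >> shift));
}
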
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1023,8 +1023,8 @@
vpx_highbd_sad32x16_bits8,
vpx_highbd_sad32x16_avg_bits8,
vpx_highbd_8_variance32x16,
- vp9_highbd_sub_pixel_variance32x16,
- vp9_highbd_sub_pixel_avg_variance32x16,
+ vpx_highbd_8_sub_pixel_variance32x16,
+ vpx_highbd_8_sub_pixel_avg_variance32x16,
NULL,
NULL,
vpx_highbd_sad32x16x4d_bits8)
@@ -1033,8 +1033,8 @@
vpx_highbd_sad16x32_bits8,
vpx_highbd_sad16x32_avg_bits8,
vpx_highbd_8_variance16x32,
- vp9_highbd_sub_pixel_variance16x32,
- vp9_highbd_sub_pixel_avg_variance16x32,
+ vpx_highbd_8_sub_pixel_variance16x32,
+ vpx_highbd_8_sub_pixel_avg_variance16x32,
NULL,
NULL,
vpx_highbd_sad16x32x4d_bits8)
@@ -1043,8 +1043,8 @@
vpx_highbd_sad64x32_bits8,
vpx_highbd_sad64x32_avg_bits8,
vpx_highbd_8_variance64x32,
- vp9_highbd_sub_pixel_variance64x32,
- vp9_highbd_sub_pixel_avg_variance64x32,
+ vpx_highbd_8_sub_pixel_variance64x32,
+ vpx_highbd_8_sub_pixel_avg_variance64x32,
NULL,
NULL,
vpx_highbd_sad64x32x4d_bits8)
@@ -1053,8 +1053,8 @@
vpx_highbd_sad32x64_bits8,
vpx_highbd_sad32x64_avg_bits8,
vpx_highbd_8_variance32x64,
- vp9_highbd_sub_pixel_variance32x64,
- vp9_highbd_sub_pixel_avg_variance32x64,
+ vpx_highbd_8_sub_pixel_variance32x64,
+ vpx_highbd_8_sub_pixel_avg_variance32x64,
NULL,
NULL,
vpx_highbd_sad32x64x4d_bits8)
@@ -1063,8 +1063,8 @@
vpx_highbd_sad32x32_bits8,
vpx_highbd_sad32x32_avg_bits8,
vpx_highbd_8_variance32x32,
- vp9_highbd_sub_pixel_variance32x32,
- vp9_highbd_sub_pixel_avg_variance32x32,
+ vpx_highbd_8_sub_pixel_variance32x32,
+ vpx_highbd_8_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits8,
vpx_highbd_sad32x32x8_bits8,
vpx_highbd_sad32x32x4d_bits8)
@@ -1073,8 +1073,8 @@
vpx_highbd_sad64x64_bits8,
vpx_highbd_sad64x64_avg_bits8,
vpx_highbd_8_variance64x64,
- vp9_highbd_sub_pixel_variance64x64,
- vp9_highbd_sub_pixel_avg_variance64x64,
+ vpx_highbd_8_sub_pixel_variance64x64,
+ vpx_highbd_8_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits8,
vpx_highbd_sad64x64x8_bits8,
vpx_highbd_sad64x64x4d_bits8)
@@ -1083,8 +1083,8 @@
vpx_highbd_sad16x16_bits8,
vpx_highbd_sad16x16_avg_bits8,
vpx_highbd_8_variance16x16,
- vp9_highbd_sub_pixel_variance16x16,
- vp9_highbd_sub_pixel_avg_variance16x16,
+ vpx_highbd_8_sub_pixel_variance16x16,
+ vpx_highbd_8_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits8,
vpx_highbd_sad16x16x8_bits8,
vpx_highbd_sad16x16x4d_bits8)
@@ -1093,8 +1093,8 @@
vpx_highbd_sad16x8_bits8,
vpx_highbd_sad16x8_avg_bits8,
vpx_highbd_8_variance16x8,
- vp9_highbd_sub_pixel_variance16x8,
- vp9_highbd_sub_pixel_avg_variance16x8,
+ vpx_highbd_8_sub_pixel_variance16x8,
+ vpx_highbd_8_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits8,
vpx_highbd_sad16x8x8_bits8,
vpx_highbd_sad16x8x4d_bits8)
@@ -1103,8 +1103,8 @@
vpx_highbd_sad8x16_bits8,
vpx_highbd_sad8x16_avg_bits8,
vpx_highbd_8_variance8x16,
- vp9_highbd_sub_pixel_variance8x16,
- vp9_highbd_sub_pixel_avg_variance8x16,
+ vpx_highbd_8_sub_pixel_variance8x16,
+ vpx_highbd_8_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits8,
vpx_highbd_sad8x16x8_bits8,
vpx_highbd_sad8x16x4d_bits8)
@@ -1113,8 +1113,8 @@
vpx_highbd_sad8x8_bits8,
vpx_highbd_sad8x8_avg_bits8,
vpx_highbd_8_variance8x8,
- vp9_highbd_sub_pixel_variance8x8,
- vp9_highbd_sub_pixel_avg_variance8x8,
+ vpx_highbd_8_sub_pixel_variance8x8,
+ vpx_highbd_8_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits8,
vpx_highbd_sad8x8x8_bits8,
vpx_highbd_sad8x8x4d_bits8)
@@ -1123,8 +1123,8 @@
vpx_highbd_sad8x4_bits8,
vpx_highbd_sad8x4_avg_bits8,
vpx_highbd_8_variance8x4,
- vp9_highbd_sub_pixel_variance8x4,
- vp9_highbd_sub_pixel_avg_variance8x4,
+ vpx_highbd_8_sub_pixel_variance8x4,
+ vpx_highbd_8_sub_pixel_avg_variance8x4,
NULL,
vpx_highbd_sad8x4x8_bits8,
vpx_highbd_sad8x4x4d_bits8)
@@ -1133,8 +1133,8 @@
vpx_highbd_sad4x8_bits8,
vpx_highbd_sad4x8_avg_bits8,
vpx_highbd_8_variance4x8,
- vp9_highbd_sub_pixel_variance4x8,
- vp9_highbd_sub_pixel_avg_variance4x8,
+ vpx_highbd_8_sub_pixel_variance4x8,
+ vpx_highbd_8_sub_pixel_avg_variance4x8,
NULL,
vpx_highbd_sad4x8x8_bits8,
vpx_highbd_sad4x8x4d_bits8)
@@ -1143,8 +1143,8 @@
vpx_highbd_sad4x4_bits8,
vpx_highbd_sad4x4_avg_bits8,
vpx_highbd_8_variance4x4,
- vp9_highbd_sub_pixel_variance4x4,
- vp9_highbd_sub_pixel_avg_variance4x4,
+ vpx_highbd_8_sub_pixel_variance4x4,
+ vpx_highbd_8_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits8,
vpx_highbd_sad4x4x8_bits8,
vpx_highbd_sad4x4x4d_bits8)
@@ -1155,8 +1155,8 @@
vpx_highbd_sad32x16_bits10,
vpx_highbd_sad32x16_avg_bits10,
vpx_highbd_10_variance32x16,
- vp9_highbd_10_sub_pixel_variance32x16,
- vp9_highbd_10_sub_pixel_avg_variance32x16,
+ vpx_highbd_10_sub_pixel_variance32x16,
+ vpx_highbd_10_sub_pixel_avg_variance32x16,
NULL,
NULL,
vpx_highbd_sad32x16x4d_bits10)
@@ -1165,8 +1165,8 @@
vpx_highbd_sad16x32_bits10,
vpx_highbd_sad16x32_avg_bits10,
vpx_highbd_10_variance16x32,
- vp9_highbd_10_sub_pixel_variance16x32,
- vp9_highbd_10_sub_pixel_avg_variance16x32,
+ vpx_highbd_10_sub_pixel_variance16x32,
+ vpx_highbd_10_sub_pixel_avg_variance16x32,
NULL,
NULL,
vpx_highbd_sad16x32x4d_bits10)
@@ -1175,8 +1175,8 @@
vpx_highbd_sad64x32_bits10,
vpx_highbd_sad64x32_avg_bits10,
vpx_highbd_10_variance64x32,
- vp9_highbd_10_sub_pixel_variance64x32,
- vp9_highbd_10_sub_pixel_avg_variance64x32,
+ vpx_highbd_10_sub_pixel_variance64x32,
+ vpx_highbd_10_sub_pixel_avg_variance64x32,
NULL,
NULL,
vpx_highbd_sad64x32x4d_bits10)
@@ -1185,8 +1185,8 @@
vpx_highbd_sad32x64_bits10,
vpx_highbd_sad32x64_avg_bits10,
vpx_highbd_10_variance32x64,
- vp9_highbd_10_sub_pixel_variance32x64,
- vp9_highbd_10_sub_pixel_avg_variance32x64,
+ vpx_highbd_10_sub_pixel_variance32x64,
+ vpx_highbd_10_sub_pixel_avg_variance32x64,
NULL,
NULL,
vpx_highbd_sad32x64x4d_bits10)
@@ -1195,8 +1195,8 @@
vpx_highbd_sad32x32_bits10,
vpx_highbd_sad32x32_avg_bits10,
vpx_highbd_10_variance32x32,
- vp9_highbd_10_sub_pixel_variance32x32,
- vp9_highbd_10_sub_pixel_avg_variance32x32,
+ vpx_highbd_10_sub_pixel_variance32x32,
+ vpx_highbd_10_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits10,
vpx_highbd_sad32x32x8_bits10,
vpx_highbd_sad32x32x4d_bits10)
@@ -1205,8 +1205,8 @@
vpx_highbd_sad64x64_bits10,
vpx_highbd_sad64x64_avg_bits10,
vpx_highbd_10_variance64x64,
- vp9_highbd_10_sub_pixel_variance64x64,
- vp9_highbd_10_sub_pixel_avg_variance64x64,
+ vpx_highbd_10_sub_pixel_variance64x64,
+ vpx_highbd_10_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits10,
vpx_highbd_sad64x64x8_bits10,
vpx_highbd_sad64x64x4d_bits10)
@@ -1215,8 +1215,8 @@
vpx_highbd_sad16x16_bits10,
vpx_highbd_sad16x16_avg_bits10,
vpx_highbd_10_variance16x16,
- vp9_highbd_10_sub_pixel_variance16x16,
- vp9_highbd_10_sub_pixel_avg_variance16x16,
+ vpx_highbd_10_sub_pixel_variance16x16,
+ vpx_highbd_10_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits10,
vpx_highbd_sad16x16x8_bits10,
vpx_highbd_sad16x16x4d_bits10)
@@ -1225,8 +1225,8 @@
vpx_highbd_sad16x8_bits10,
vpx_highbd_sad16x8_avg_bits10,
vpx_highbd_10_variance16x8,
- vp9_highbd_10_sub_pixel_variance16x8,
- vp9_highbd_10_sub_pixel_avg_variance16x8,
+ vpx_highbd_10_sub_pixel_variance16x8,
+ vpx_highbd_10_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits10,
vpx_highbd_sad16x8x8_bits10,
vpx_highbd_sad16x8x4d_bits10)
@@ -1235,8 +1235,8 @@
vpx_highbd_sad8x16_bits10,
vpx_highbd_sad8x16_avg_bits10,
vpx_highbd_10_variance8x16,
- vp9_highbd_10_sub_pixel_variance8x16,
- vp9_highbd_10_sub_pixel_avg_variance8x16,
+ vpx_highbd_10_sub_pixel_variance8x16,
+ vpx_highbd_10_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits10,
vpx_highbd_sad8x16x8_bits10,
vpx_highbd_sad8x16x4d_bits10)
@@ -1245,8 +1245,8 @@
vpx_highbd_sad8x8_bits10,
vpx_highbd_sad8x8_avg_bits10,
vpx_highbd_10_variance8x8,
- vp9_highbd_10_sub_pixel_variance8x8,
- vp9_highbd_10_sub_pixel_avg_variance8x8,
+ vpx_highbd_10_sub_pixel_variance8x8,
+ vpx_highbd_10_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits10,
vpx_highbd_sad8x8x8_bits10,
vpx_highbd_sad8x8x4d_bits10)
@@ -1255,8 +1255,8 @@
vpx_highbd_sad8x4_bits10,
vpx_highbd_sad8x4_avg_bits10,
vpx_highbd_10_variance8x4,
- vp9_highbd_10_sub_pixel_variance8x4,
- vp9_highbd_10_sub_pixel_avg_variance8x4,
+ vpx_highbd_10_sub_pixel_variance8x4,
+ vpx_highbd_10_sub_pixel_avg_variance8x4,
NULL,
vpx_highbd_sad8x4x8_bits10,
vpx_highbd_sad8x4x4d_bits10)
@@ -1265,8 +1265,8 @@
vpx_highbd_sad4x8_bits10,
vpx_highbd_sad4x8_avg_bits10,
vpx_highbd_10_variance4x8,
- vp9_highbd_10_sub_pixel_variance4x8,
- vp9_highbd_10_sub_pixel_avg_variance4x8,
+ vpx_highbd_10_sub_pixel_variance4x8,
+ vpx_highbd_10_sub_pixel_avg_variance4x8,
NULL,
vpx_highbd_sad4x8x8_bits10,
vpx_highbd_sad4x8x4d_bits10)
@@ -1275,8 +1275,8 @@
vpx_highbd_sad4x4_bits10,
vpx_highbd_sad4x4_avg_bits10,
vpx_highbd_10_variance4x4,
- vp9_highbd_10_sub_pixel_variance4x4,
- vp9_highbd_10_sub_pixel_avg_variance4x4,
+ vpx_highbd_10_sub_pixel_variance4x4,
+ vpx_highbd_10_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits10,
vpx_highbd_sad4x4x8_bits10,
vpx_highbd_sad4x4x4d_bits10)
@@ -1287,8 +1287,8 @@
vpx_highbd_sad32x16_bits12,
vpx_highbd_sad32x16_avg_bits12,
vpx_highbd_12_variance32x16,
- vp9_highbd_12_sub_pixel_variance32x16,
- vp9_highbd_12_sub_pixel_avg_variance32x16,
+ vpx_highbd_12_sub_pixel_variance32x16,
+ vpx_highbd_12_sub_pixel_avg_variance32x16,
NULL,
NULL,
vpx_highbd_sad32x16x4d_bits12)
@@ -1297,8 +1297,8 @@
vpx_highbd_sad16x32_bits12,
vpx_highbd_sad16x32_avg_bits12,
vpx_highbd_12_variance16x32,
- vp9_highbd_12_sub_pixel_variance16x32,
- vp9_highbd_12_sub_pixel_avg_variance16x32,
+ vpx_highbd_12_sub_pixel_variance16x32,
+ vpx_highbd_12_sub_pixel_avg_variance16x32,
NULL,
NULL,
vpx_highbd_sad16x32x4d_bits12)
@@ -1307,8 +1307,8 @@
vpx_highbd_sad64x32_bits12,
vpx_highbd_sad64x32_avg_bits12,
vpx_highbd_12_variance64x32,
- vp9_highbd_12_sub_pixel_variance64x32,
- vp9_highbd_12_sub_pixel_avg_variance64x32,
+ vpx_highbd_12_sub_pixel_variance64x32,
+ vpx_highbd_12_sub_pixel_avg_variance64x32,
NULL,
NULL,
vpx_highbd_sad64x32x4d_bits12)
@@ -1317,8 +1317,8 @@
vpx_highbd_sad32x64_bits12,
vpx_highbd_sad32x64_avg_bits12,
vpx_highbd_12_variance32x64,
- vp9_highbd_12_sub_pixel_variance32x64,
- vp9_highbd_12_sub_pixel_avg_variance32x64,
+ vpx_highbd_12_sub_pixel_variance32x64,
+ vpx_highbd_12_sub_pixel_avg_variance32x64,
NULL,
NULL,
vpx_highbd_sad32x64x4d_bits12)
@@ -1327,8 +1327,8 @@
vpx_highbd_sad32x32_bits12,
vpx_highbd_sad32x32_avg_bits12,
vpx_highbd_12_variance32x32,
- vp9_highbd_12_sub_pixel_variance32x32,
- vp9_highbd_12_sub_pixel_avg_variance32x32,
+ vpx_highbd_12_sub_pixel_variance32x32,
+ vpx_highbd_12_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits12,
vpx_highbd_sad32x32x8_bits12,
vpx_highbd_sad32x32x4d_bits12)
@@ -1337,8 +1337,8 @@
vpx_highbd_sad64x64_bits12,
vpx_highbd_sad64x64_avg_bits12,
vpx_highbd_12_variance64x64,
- vp9_highbd_12_sub_pixel_variance64x64,
- vp9_highbd_12_sub_pixel_avg_variance64x64,
+ vpx_highbd_12_sub_pixel_variance64x64,
+ vpx_highbd_12_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits12,
vpx_highbd_sad64x64x8_bits12,
vpx_highbd_sad64x64x4d_bits12)
@@ -1347,8 +1347,8 @@
vpx_highbd_sad16x16_bits12,
vpx_highbd_sad16x16_avg_bits12,
vpx_highbd_12_variance16x16,
- vp9_highbd_12_sub_pixel_variance16x16,
- vp9_highbd_12_sub_pixel_avg_variance16x16,
+ vpx_highbd_12_sub_pixel_variance16x16,
+ vpx_highbd_12_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits12,
vpx_highbd_sad16x16x8_bits12,
vpx_highbd_sad16x16x4d_bits12)
@@ -1357,8 +1357,8 @@
vpx_highbd_sad16x8_bits12,
vpx_highbd_sad16x8_avg_bits12,
vpx_highbd_12_variance16x8,
- vp9_highbd_12_sub_pixel_variance16x8,
- vp9_highbd_12_sub_pixel_avg_variance16x8,
+ vpx_highbd_12_sub_pixel_variance16x8,
+ vpx_highbd_12_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits12,
vpx_highbd_sad16x8x8_bits12,
vpx_highbd_sad16x8x4d_bits12)
@@ -1367,8 +1367,8 @@
vpx_highbd_sad8x16_bits12,
vpx_highbd_sad8x16_avg_bits12,
vpx_highbd_12_variance8x16,
- vp9_highbd_12_sub_pixel_variance8x16,
- vp9_highbd_12_sub_pixel_avg_variance8x16,
+ vpx_highbd_12_sub_pixel_variance8x16,
+ vpx_highbd_12_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits12,
vpx_highbd_sad8x16x8_bits12,
vpx_highbd_sad8x16x4d_bits12)
@@ -1377,8 +1377,8 @@
vpx_highbd_sad8x8_bits12,
vpx_highbd_sad8x8_avg_bits12,
vpx_highbd_12_variance8x8,
- vp9_highbd_12_sub_pixel_variance8x8,
- vp9_highbd_12_sub_pixel_avg_variance8x8,
+ vpx_highbd_12_sub_pixel_variance8x8,
+ vpx_highbd_12_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits12,
vpx_highbd_sad8x8x8_bits12,
vpx_highbd_sad8x8x4d_bits12)
@@ -1387,8 +1387,8 @@
vpx_highbd_sad8x4_bits12,
vpx_highbd_sad8x4_avg_bits12,
vpx_highbd_12_variance8x4,
- vp9_highbd_12_sub_pixel_variance8x4,
- vp9_highbd_12_sub_pixel_avg_variance8x4,
+ vpx_highbd_12_sub_pixel_variance8x4,
+ vpx_highbd_12_sub_pixel_avg_variance8x4,
NULL,
vpx_highbd_sad8x4x8_bits12,
vpx_highbd_sad8x4x4d_bits12)
@@ -1397,8 +1397,8 @@
vpx_highbd_sad4x8_bits12,
vpx_highbd_sad4x8_avg_bits12,
vpx_highbd_12_variance4x8,
- vp9_highbd_12_sub_pixel_variance4x8,
- vp9_highbd_12_sub_pixel_avg_variance4x8,
+ vpx_highbd_12_sub_pixel_variance4x8,
+ vpx_highbd_12_sub_pixel_avg_variance4x8,
NULL,
vpx_highbd_sad4x8x8_bits12,
vpx_highbd_sad4x8x4d_bits12)
@@ -1407,8 +1407,8 @@
vpx_highbd_sad4x4_bits12,
vpx_highbd_sad4x4_avg_bits12,
vpx_highbd_12_variance4x4,
- vp9_highbd_12_sub_pixel_variance4x4,
- vp9_highbd_12_sub_pixel_avg_variance4x4,
+ vpx_highbd_12_sub_pixel_variance4x4,
+ vpx_highbd_12_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits12,
vpx_highbd_sad4x4x8_bits12,
vpx_highbd_sad4x4x4d_bits12)
@@ -1832,62 +1832,62 @@
cpi->fn_ptr[BT].sdx4df = SDX4DF;
BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
- vpx_variance32x16, vp9_sub_pixel_variance32x16,
- vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
+ vpx_variance32x16, vpx_sub_pixel_variance32x16,
+ vpx_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
- vpx_variance16x32, vp9_sub_pixel_variance16x32,
- vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
+ vpx_variance16x32, vpx_sub_pixel_variance16x32,
+ vpx_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
- vpx_variance64x32, vp9_sub_pixel_variance64x32,
- vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
+ vpx_variance64x32, vpx_sub_pixel_variance64x32,
+ vpx_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
- vpx_variance32x64, vp9_sub_pixel_variance32x64,
- vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
+ vpx_variance32x64, vpx_sub_pixel_variance32x64,
+ vpx_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
- vpx_variance32x32, vp9_sub_pixel_variance32x32,
- vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
+ vpx_variance32x32, vpx_sub_pixel_variance32x32,
+ vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
vpx_sad32x32x4d)
BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
- vpx_variance64x64, vp9_sub_pixel_variance64x64,
- vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
+ vpx_variance64x64, vpx_sub_pixel_variance64x64,
+ vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
vpx_sad64x64x4d)
BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
- vpx_variance16x16, vp9_sub_pixel_variance16x16,
- vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
+ vpx_variance16x16, vpx_sub_pixel_variance16x16,
+ vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
vpx_sad16x16x4d)
BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
- vpx_variance16x8, vp9_sub_pixel_variance16x8,
- vp9_sub_pixel_avg_variance16x8,
+ vpx_variance16x8, vpx_sub_pixel_variance16x8,
+ vpx_sub_pixel_avg_variance16x8,
vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
- vpx_variance8x16, vp9_sub_pixel_variance8x16,
- vp9_sub_pixel_avg_variance8x16,
+ vpx_variance8x16, vpx_sub_pixel_variance8x16,
+ vpx_sub_pixel_avg_variance8x16,
vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
- vpx_variance8x8, vp9_sub_pixel_variance8x8,
- vp9_sub_pixel_avg_variance8x8,
+ vpx_variance8x8, vpx_sub_pixel_variance8x8,
+ vpx_sub_pixel_avg_variance8x8,
vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
- vpx_variance8x4, vp9_sub_pixel_variance8x4,
- vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
+ vpx_variance8x4, vpx_sub_pixel_variance8x4,
+ vpx_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
- vpx_variance4x8, vp9_sub_pixel_variance4x8,
- vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
+ vpx_variance4x8, vpx_sub_pixel_variance4x8,
+ vpx_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
- vpx_variance4x4, vp9_sub_pixel_variance4x4,
- vp9_sub_pixel_avg_variance4x4,
+ vpx_variance4x4, vpx_sub_pixel_variance4x4,
+ vpx_sub_pixel_avg_variance4x4,
vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
#if CONFIG_VP9_HIGHBITDEPTH
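
The vp9_encoder.c hunks only repoint the per-block-size function tables (the BFP entries and their bit-depth-specific highbd counterparts) from the vp9_-prefixed sub-pixel variance routines to the vpx_dsp ones; the table layout and every caller stay untouched. A toy illustration of why the swap is transparent at the call sites; the struct and helper names here are hypothetical:

#include <stdint.h>

typedef unsigned int (*subpix_var_fn)(const uint8_t *src, int src_stride,
                                      int xoffset, int yoffset,
                                      const uint8_t *ref, int ref_stride,
                                      unsigned int *sse);

struct toy_fn_ptrs {
  subpix_var_fn svf;  /* sub-pixel variance for one block size */
};

/* Callers only see the pointer, so pointing svf at
 * vpx_sub_pixel_variance16x16 instead of the old vp9_ version changes
 * nothing here. */
static unsigned int toy_subpel_var(const struct toy_fn_ptrs *fp,
                                   const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int xo, int yo, unsigned int *sse) {
  return fp->svf(src, src_stride, xo, yo, ref, ref_stride, sse);
}
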
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -40,7 +40,7 @@
#include "vp9/encoder/vp9_speed_features.h"
#include "vp9/encoder/vp9_svc_layercontext.h"
#include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_variance.h"
+#include "vpx_dsp/variance.h"
#if CONFIG_VP9_TEMPORAL_DENOISING
#include "vp9/encoder/vp9_denoiser.h"
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -35,7 +35,7 @@
#include "vp9/encoder/vp9_mcomp.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rd.h"
-#include "vp9/encoder/vp9_variance.h"
+#include "vpx_dsp/variance.h"
#define OUTPUT_FPF 0
#define ARF_STATS_OUTPUT 0
@@ -298,7 +298,7 @@
}
}
-static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
+static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
switch (bsize) {
case BLOCK_8X8:
return vpx_mse8x8;
@@ -315,13 +315,13 @@
const struct buf_2d *src,
const struct buf_2d *ref) {
unsigned int sse;
- const vp9_variance_fn_t fn = get_block_variance_fn(bsize);
+ const vpx_variance_fn_t fn = get_block_variance_fn(bsize);
fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
return sse;
}
#if CONFIG_VP9_HIGHBITDEPTH
-static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
int bd) {
switch (bd) {
default:
@@ -368,7 +368,7 @@
const struct buf_2d *ref,
int bd) {
unsigned int sse;
- const vp9_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
+ const vpx_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
return sse;
}
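
In vp9_firstpass.c only the typedef changes: get_block_variance_fn() still returns one of the vpx_mse* routines and the caller in the -315 hunk still reads back the filled-in sse as the block's raw prediction error. A small usage sketch against the vpx_variance_fn_t signature seen above; the include lines assume the header layout after this change:

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/variance.h"

/* Raw 8x8 prediction error via the function-pointer type. */
static unsigned int block8x8_sse(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  const vpx_variance_fn_t fn = vpx_mse8x8;
  fn(src, src_stride, ref, ref_stride, &sse);
  return sse;
}
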
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -13,7 +13,7 @@
#define VP9_ENCODER_VP9_MCOMP_H_
#include "vp9/encoder/vp9_block.h"
-#include "vp9/encoder/vp9_variance.h"
+#include "vpx_dsp/variance.h"
#ifdef __cplusplus
extern "C" {
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -37,7 +37,6 @@
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_variance.h"
#define RD_THRESH_POW 1.25
#define RD_MULT_EPB_RATIO 64
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -39,7 +39,6 @@
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_rdopt.h"
-#include "vp9/encoder/vp9_variance.h"
#include "vp9/encoder/vp9_aq_variance.h"
#define LAST_FRAME_MODE_MASK ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
--- a/vp9/encoder/vp9_variance.c
+++ /dev/null
@@ -1,380 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
-
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_filter.h"
-
-#include "vp9/encoder/vp9_variance.h"
-
-static const uint8_t bilinear_filters[8][2] = {
- { 128, 0, },
- { 112, 16, },
- { 96, 32, },
- { 80, 48, },
- { 64, 64, },
- { 48, 80, },
- { 32, 96, },
- { 16, 112, },
-};
-
-// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
-// or vertical direction to produce the filtered output block. Used to implement
-// first-pass of 2-D separable filter.
-//
-// Produces int32_t output to retain precision for next pass. Two filter taps
-// should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the filter is
-// applied horizontally (pixel_step=1) or vertically (pixel_step=stride). It
-// defines the offset required to move from one input to the next.
-static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
- uint16_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *vp9_filter) {
- unsigned int i, j;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
- (int)src_ptr[pixel_step] * vp9_filter[1],
- FILTER_BITS);
-
- src_ptr++;
- }
-
- // Next row...
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
-// or vertical direction to produce the filtered output block. Used to implement
-// second-pass of 2-D separable filter.
-//
-// Requires 32-bit input as produced by filter_block2d_bil_first_pass. Two
-// filter taps should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the
-// filter is applied horizontally (pixel_step=1) or vertically (pixel_step=
-// stride). It defines the offset required to move from one input to the next.
-static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *vp9_filter) {
- unsigned int i, j;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
- (int)src_ptr[pixel_step] * vp9_filter[1],
- FILTER_BITS);
- src_ptr++;
- }
-
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-#define SUBPIX_VAR(W, H) \
-unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
-\
- var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
- bilinear_filters[xoffset]); \
- var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
-}
-
-#define SUBPIX_AVG_VAR(W, H) \
-unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- unsigned int *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
-\
- var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
- bilinear_filters[xoffset]); \
- var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
-\
- return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
-}
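
The only thing SUBPIX_AVG_VAR adds over SUBPIX_VAR is the vpx_comp_avg_pred() step: the bilinearly filtered block is rounded-averaged with a second predictor before the variance is taken. A scalar sketch of that averaging, with a hypothetical name and flattened buffers:

#include <stdint.h>

/* comp[i] = round((pred[i] + second_pred[i]) / 2) over a W*H block. */
static void sk_comp_avg(uint8_t *comp, const uint8_t *pred,
                        const uint8_t *second_pred, int n) {
  int i;
  for (i = 0; i < n; ++i)
    comp[i] = (uint8_t)((pred[i] + second_pred[i] + 1) >> 1);
}
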
-
-SUBPIX_VAR(4, 4)
-SUBPIX_AVG_VAR(4, 4)
-
-SUBPIX_VAR(4, 8)
-SUBPIX_AVG_VAR(4, 8)
-
-SUBPIX_VAR(8, 4)
-SUBPIX_AVG_VAR(8, 4)
-
-SUBPIX_VAR(8, 8)
-SUBPIX_AVG_VAR(8, 8)
-
-SUBPIX_VAR(8, 16)
-SUBPIX_AVG_VAR(8, 16)
-
-SUBPIX_VAR(16, 8)
-SUBPIX_AVG_VAR(16, 8)
-
-SUBPIX_VAR(16, 16)
-SUBPIX_AVG_VAR(16, 16)
-
-SUBPIX_VAR(16, 32)
-SUBPIX_AVG_VAR(16, 32)
-
-SUBPIX_VAR(32, 16)
-SUBPIX_AVG_VAR(32, 16)
-
-SUBPIX_VAR(32, 32)
-SUBPIX_AVG_VAR(32, 32)
-
-SUBPIX_VAR(32, 64)
-SUBPIX_AVG_VAR(32, 64)
-
-SUBPIX_VAR(64, 32)
-SUBPIX_AVG_VAR(64, 32)
-
-SUBPIX_VAR(64, 64)
-SUBPIX_AVG_VAR(64, 64)
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void highbd_var_filter_block2d_bil_first_pass(
- const uint8_t *src_ptr8,
- uint16_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *vp9_filter) {
- unsigned int i, j;
- uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- output_ptr[j] =
- ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
- (int)src_ptr[pixel_step] * vp9_filter[1],
- FILTER_BITS);
-
- src_ptr++;
- }
-
- // Next row...
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-static void highbd_var_filter_block2d_bil_second_pass(
- const uint16_t *src_ptr,
- uint16_t *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *vp9_filter) {
- unsigned int i, j;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- output_ptr[j] =
- ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
- (int)src_ptr[pixel_step] * vp9_filter[1],
- FILTER_BITS);
- src_ptr++;
- }
-
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-#define HIGHBD_SUBPIX_VAR(W, H) \
-unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
- dst_stride, sse); \
-} \
-\
-unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
- W, dst, dst_stride, sse); \
-} \
-\
-unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
- W, dst, dst_stride, sse); \
-}
-
-#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
-unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- unsigned int *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
-\
- return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
- dst_stride, sse); \
-} \
-\
-unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- unsigned int *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
-\
- return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
- W, dst, dst_stride, sse); \
-} \
-\
-unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- unsigned int *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
-\
- return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
- W, dst, dst_stride, sse); \
-}
-
-HIGHBD_SUBPIX_VAR(4, 4)
-HIGHBD_SUBPIX_AVG_VAR(4, 4)
-
-HIGHBD_SUBPIX_VAR(4, 8)
-HIGHBD_SUBPIX_AVG_VAR(4, 8)
-
-HIGHBD_SUBPIX_VAR(8, 4)
-HIGHBD_SUBPIX_AVG_VAR(8, 4)
-
-HIGHBD_SUBPIX_VAR(8, 8)
-HIGHBD_SUBPIX_AVG_VAR(8, 8)
-
-HIGHBD_SUBPIX_VAR(8, 16)
-HIGHBD_SUBPIX_AVG_VAR(8, 16)
-
-HIGHBD_SUBPIX_VAR(16, 8)
-HIGHBD_SUBPIX_AVG_VAR(16, 8)
-
-HIGHBD_SUBPIX_VAR(16, 16)
-HIGHBD_SUBPIX_AVG_VAR(16, 16)
-
-HIGHBD_SUBPIX_VAR(16, 32)
-HIGHBD_SUBPIX_AVG_VAR(16, 32)
-
-HIGHBD_SUBPIX_VAR(32, 16)
-HIGHBD_SUBPIX_AVG_VAR(32, 16)
-
-HIGHBD_SUBPIX_VAR(32, 32)
-HIGHBD_SUBPIX_AVG_VAR(32, 32)
-
-HIGHBD_SUBPIX_VAR(32, 64)
-HIGHBD_SUBPIX_AVG_VAR(32, 64)
-
-HIGHBD_SUBPIX_VAR(64, 32)
-HIGHBD_SUBPIX_AVG_VAR(64, 32)
-
-HIGHBD_SUBPIX_VAR(64, 64)
-HIGHBD_SUBPIX_AVG_VAR(64, 64)
-#endif // CONFIG_VP9_HIGHBITDEPTH
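
The file removed above is the plain-C reference for sub-pixel variance: a horizontal 2-tap pass into a (H + 1) x W uint16_t buffer, a vertical 2-tap pass back to 8-bit, then ordinary variance against the reference block (the equivalent vpx_sub_pixel_variance*_c code now lives in vpx_dsp, and the highbd variants follow the same shape on 16-bit pixels). A self-contained scalar sketch for one block size; every sk_-prefixed name is illustrative, and the rounding shift of 7 assumes the taps sum to 128 as in the bilinear_filters table above:

#include <stdint.h>

static const uint8_t sk_bilinear[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};

static unsigned int sk_subpel_variance8x8(const uint8_t *src, int src_stride,
                                          int xoffset, int yoffset,
                                          const uint8_t *ref, int ref_stride,
                                          unsigned int *sse) {
  uint16_t mid[9 * 8];  /* (H + 1) x W first-pass output */
  uint8_t pred[8 * 8];
  int64_t sum = 0;
  uint64_t sq = 0;
  int r, c;

  /* First pass: horizontal 2-tap filter, one extra row for the second pass. */
  for (r = 0; r < 9; ++r)
    for (c = 0; c < 8; ++c)
      mid[r * 8 + c] =
          (uint16_t)((src[r * src_stride + c] * sk_bilinear[xoffset][0] +
                      src[r * src_stride + c + 1] * sk_bilinear[xoffset][1] +
                      64) >> 7);

  /* Second pass: vertical 2-tap filter down to the 8x8 prediction. */
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      pred[r * 8 + c] =
          (uint8_t)((mid[r * 8 + c] * sk_bilinear[yoffset][0] +
                     mid[(r + 1) * 8 + c] * sk_bilinear[yoffset][1] +
                     64) >> 7);

  /* Variance of the filtered prediction against the reference block. */
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c) {
      const int d = pred[r * 8 + c] - ref[r * ref_stride + c];
      sum += d;
      sq += (uint64_t)(d * d);
    }
  *sse = (unsigned int)sq;
  return (unsigned int)(sq - (uint64_t)((sum * sum) >> 6));  /* 6 = log2(64) */
}
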
--- a/vp9/encoder/vp9_variance.h
+++ /dev/null
@@ -1,81 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_ENCODER_VP9_VARIANCE_H_
-#define VP9_ENCODER_VP9_VARIANCE_H_
-
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int ref_stride);
-
-typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- const uint8_t *second_pred);
-
-typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int *sad_array);
-
-typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t* const ref_ptr[],
- int ref_stride, unsigned int *sad_array);
-
-typedef unsigned int (*vp9_variance_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int *sse);
-
-typedef unsigned int (*vp9_subpixvariance_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- int xoffset,
- int yoffset,
- const uint8_t *ref_ptr,
- int Refstride,
- unsigned int *sse);
-
-typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- int xoffset,
- int yoffset,
- const uint8_t *ref_ptr,
- int Refstride,
- unsigned int *sse,
- const uint8_t *second_pred);
-
-typedef struct vp9_variance_vtable {
- vp9_sad_fn_t sdf;
- vp9_sad_avg_fn_t sdaf;
- vp9_variance_fn_t vf;
- vp9_subpixvariance_fn_t svf;
- vp9_subp_avg_variance_fn_t svaf;
- vp9_sad_multi_fn_t sdx3f;
- vp9_sad_multi_fn_t sdx8f;
- vp9_sad_multi_d_fn_t sdx4df;
-} vp9_variance_fn_ptr_t;
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VP9_ENCODER_VP9_VARIANCE_H_
--- a/vp9/encoder/x86/vp9_highbd_subpel_variance.asm
+++ /dev/null
@@ -1,1039 +1,0 @@
-;
-; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_8: times 8 dw 8
-bilin_filter_m_sse2: times 8 dw 16
- times 8 dw 0
- times 8 dw 14
- times 8 dw 2
- times 8 dw 12
- times 8 dw 4
- times 8 dw 10
- times 8 dw 6
- times 16 dw 8
- times 8 dw 6
- times 8 dw 10
- times 8 dw 4
- times 8 dw 12
- times 8 dw 2
- times 8 dw 14
-
-SECTION .text
-
-; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
-; int x_offset, int y_offset,
-; const uint8_t *dst, ptrdiff_t dst_stride,
-; int height, unsigned int *sse);
-;
-; This function returns the SE and stores SSE in the given pointer.
-
-%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
- psubw %3, %4
- psubw %1, %2
- mova %4, %3 ; make copies to manipulate to calc sum
- mova %2, %1 ; use originals for calc sse
- pmaddwd %3, %3
- paddw %4, %2
- pmaddwd %1, %1
- movhlps %2, %4
- paddd %6, %3
- paddw %4, %2
- pxor %2, %2
- pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
- punpcklwd %4, %2 ; sign-extend word to dword
- paddd %6, %1
- paddd %5, %4
-
-%endmacro
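
SUM_SSE consumes two register pairs of 16-bit differences per invocation: the squared terms accumulate into the SSE register (sixth argument, m7 at the call sites) and the word sums are sign-extended to dwords before accumulating into the sum register (fifth argument, m6). The same bookkeeping in scalar form, with a hypothetical helper name and the 16-bit samples this high-bit-depth file operates on:

#include <stdint.h>

static void sum_sse_accumulate(const uint16_t *src, const uint16_t *dst,
                               int n, int32_t *sum, uint32_t *sse) {
  int i;
  for (i = 0; i < n; ++i) {
    const int d = (int)src[i] - (int)dst[i];
    *sum += d;                  /* signed sum of differences (m6) */
    *sse += (uint32_t)(d * d);  /* sum of squared differences (m7) */
  }
}
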
-
-%macro STORE_AND_RET 0
-%if mmsize == 16
- ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
- ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
- ; We have to sign-extend it before adding the words within the register
- ; and outputing to a dword.
- movhlps m3, m7
- movhlps m4, m6
- paddd m7, m3
- paddd m6, m4
- pshufd m3, m7, 0x1
- pshufd m4, m6, 0x1
- paddd m7, m3
- paddd m6, m4
- mov r1, ssem ; r1 = unsigned int *sse
- movd [r1], m7 ; store sse
- movd rax, m6 ; store sum as return value
-%endif
- RET
-%endmacro
-
-%macro INC_SRC_BY_SRC_STRIDE 0
-%if ARCH_X86=1 && CONFIG_PIC=1
- lea srcq, [srcq + src_stridemp*2]
-%else
- lea srcq, [srcq + src_strideq*2]
-%endif
-%endmacro
-
-%macro INC_SRC_BY_SRC_2STRIDE 0
-%if ARCH_X86=1 && CONFIG_PIC=1
- lea srcq, [srcq + src_stridemp*4]
-%else
- lea srcq, [srcq + src_strideq*4]
-%endif
-%endmacro
-
-%macro SUBPEL_VARIANCE 1-2 0 ; W
-%define bilin_filter_m bilin_filter_m_sse2
-%define filter_idx_shift 5
-
-
-%ifdef PIC ; 64bit PIC
- %if %2 == 1 ; avg
- cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, height, sse
- %define sec_str sec_strideq
- %else
- cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
- y_offset, dst, dst_stride, height, sse
- %endif
- %define h heightd
- %define bilin_filter sseq
-%else
- %if ARCH_X86=1 && CONFIG_PIC=1
- %if %2 == 1 ; avg
- cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, \
- height, sse, g_bilin_filter, g_pw_8
- %define h dword heightm
- %define sec_str sec_stridemp
-
- ; Store bilin_filter and pw_8 location in stack
- GET_GOT eax
- add esp, 4 ; restore esp
-
- lea ecx, [GLOBAL(bilin_filter_m)]
- mov g_bilin_filterm, ecx
-
- lea ecx, [GLOBAL(pw_8)]
- mov g_pw_8m, ecx
-
- LOAD_IF_USED 0, 1 ; load eax, ecx back
- %else
- cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, dst, dst_stride, height, \
- sse, g_bilin_filter, g_pw_8
- %define h heightd
-
- ; Store bilin_filter and pw_8 location in stack
- GET_GOT eax
- add esp, 4 ; restore esp
-
- lea ecx, [GLOBAL(bilin_filter_m)]
- mov g_bilin_filterm, ecx
-
- lea ecx, [GLOBAL(pw_8)]
- mov g_pw_8m, ecx
-
- LOAD_IF_USED 0, 1 ; load eax, ecx back
- %endif
- %else
- %if %2 == 1 ; avg
- cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
- 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, \
- height, sse
- %if ARCH_X86_64
- %define h heightd
- %define sec_str sec_strideq
- %else
- %define h dword heightm
- %define sec_str sec_stridemp
- %endif
- %else
- cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, dst, dst_stride, height, sse
- %define h heightd
- %endif
-
- %define bilin_filter bilin_filter_m
- %endif
-%endif
-
- ASSERT %1 <= 16 ; m6 overflows if w > 16
- pxor m6, m6 ; sum
- pxor m7, m7 ; sse
-
-%if %1 < 16
- sar h, 1
-%endif
-%if %2 == 1 ; avg
- shl sec_str, 1
-%endif
-
- ; FIXME(rbultje) replace by jumptable?
- test x_offsetd, x_offsetd
- jnz .x_nonzero
- ; x_offset == 0
- test y_offsetd, y_offsetd
- jnz .x_zero_y_nonzero
-
- ; x_offset == 0 && y_offset == 0
-.x_zero_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m2, [srcq + 16]
- mova m1, [dstq]
- mova m3, [dstq + 16]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m2, [secq+16]
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m2, [srcq + src_strideq*2]
- mova m1, [dstq]
- mova m3, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m2, [secq]
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_zero_y_zero_loop
- STORE_AND_RET
-
-.x_zero_y_nonzero:
- cmp y_offsetd, 8
- jne .x_zero_y_nonhalf
-
- ; x_offset == 0 && y_offset == 0.5
-.x_zero_y_half_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m4, [srcq+src_strideq*2]
- movu m5, [srcq+src_strideq*2+16]
- mova m2, [dstq]
- mova m3, [dstq+16]
- pavgw m0, m4
- pavgw m1, m5
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m1, [srcq+src_strideq*2]
- movu m5, [srcq+src_strideq*4]
- mova m2, [dstq]
- mova m3, [dstq+dst_strideq*2]
- pavgw m0, m1
- pavgw m1, m5
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m1, [secq]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_zero_y_half_loop
- STORE_AND_RET
-
-.x_zero_y_nonhalf:
- ; x_offset == 0 && y_offset == bilin interpolation
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+y_offsetq]
- mova m9, [bilin_filter+y_offsetq+16]
- mova m10, [pw_8]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ; x86-32 or mmx
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0, reuse x_offset reg
-%define tempq x_offsetq
- add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
-.x_zero_y_other_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq + 16]
- movu m4, [srcq+src_strideq*2]
- movu m5, [srcq+src_strideq*2+16]
- mova m2, [dstq]
- mova m3, [dstq+16]
- ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
- ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
- ; instructions is the same (5), but it is 1 mul instead of 2, so might be
- ; slightly faster because of pmullw latency. It would also cut our rodata
- ; tables in half for this function, and save 1-2 registers on x86-64.
- pmullw m1, filter_y_a
- pmullw m5, filter_y_b
- paddw m1, filter_rnd
- pmullw m0, filter_y_a
- pmullw m4, filter_y_b
- paddw m0, filter_rnd
- paddw m1, m5
- paddw m0, m4
- psrlw m1, 4
- psrlw m0, 4
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m1, [srcq+src_strideq*2]
- movu m5, [srcq+src_strideq*4]
- mova m4, m1
- mova m2, [dstq]
- mova m3, [dstq+dst_strideq*2]
- pmullw m1, filter_y_a
- pmullw m5, filter_y_b
- paddw m1, filter_rnd
- pmullw m0, filter_y_a
- pmullw m4, filter_y_b
- paddw m0, filter_rnd
- paddw m1, m5
- paddw m0, m4
- psrlw m1, 4
- psrlw m0, 4
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m1, [secq]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_zero_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonzero:
- cmp x_offsetd, 8
- jne .x_nonhalf
- ; x_offset == 0.5
- test y_offsetd, y_offsetd
- jnz .x_half_y_nonzero
-
- ; x_offset == 0.5 && y_offset == 0
-.x_half_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq + 16]
- movu m4, [srcq + 2]
- movu m5, [srcq + 18]
- mova m2, [dstq]
- mova m3, [dstq + 16]
- pavgw m0, m4
- pavgw m1, m5
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m1, [srcq + src_strideq*2]
- movu m4, [srcq + 2]
- movu m5, [srcq + src_strideq*2 + 2]
- mova m2, [dstq]
- mova m3, [dstq + dst_strideq*2]
- pavgw m0, m4
- pavgw m1, m5
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m1, [secq]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_half_y_zero_loop
- STORE_AND_RET
-
-.x_half_y_nonzero:
- cmp y_offsetd, 8
- jne .x_half_y_nonhalf
-
- ; x_offset == 0.5 && y_offset == 0.5
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+2]
- movu m3, [srcq+18]
- lea srcq, [srcq + src_strideq*2]
- pavgw m0, m2
- pavgw m1, m3
-.x_half_y_half_loop:
- movu m2, [srcq]
- movu m3, [srcq + 16]
- movu m4, [srcq + 2]
- movu m5, [srcq + 18]
- pavgw m2, m4
- pavgw m3, m5
- pavgw m0, m2
- pavgw m1, m3
- mova m4, [dstq]
- mova m5, [dstq + 16]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m4, m1, m5, m6, m7
- mova m0, m2
- mova m1, m3
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m2, [srcq+2]
- lea srcq, [srcq + src_strideq*2]
- pavgw m0, m2
-.x_half_y_half_loop:
- movu m2, [srcq]
- movu m3, [srcq + src_strideq*2]
- movu m4, [srcq + 2]
- movu m5, [srcq + src_strideq*2 + 2]
- pavgw m2, m4
- pavgw m3, m5
- pavgw m0, m2
- pavgw m2, m3
- mova m4, [dstq]
- mova m5, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m2, [secq]
-%endif
- SUM_SSE m0, m4, m2, m5, m6, m7
- mova m0, m3
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_half_y_half_loop
- STORE_AND_RET
-
-.x_half_y_nonhalf:
- ; x_offset == 0.5 && y_offset == bilin interpolation
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+y_offsetq]
- mova m9, [bilin_filter+y_offsetq+16]
- mova m10, [pw_8]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ; x86_32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0.5. We can reuse x_offset reg
-%define tempq x_offsetq
- add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+2]
- movu m3, [srcq+18]
- lea srcq, [srcq + src_strideq*2]
- pavgw m0, m2
- pavgw m1, m3
-.x_half_y_other_loop:
- movu m2, [srcq]
- movu m3, [srcq+16]
- movu m4, [srcq+2]
- movu m5, [srcq+18]
- pavgw m2, m4
- pavgw m3, m5
- mova m4, m2
- mova m5, m3
- pmullw m1, filter_y_a
- pmullw m3, filter_y_b
- paddw m1, filter_rnd
- paddw m1, m3
- pmullw m0, filter_y_a
- pmullw m2, filter_y_b
- paddw m0, filter_rnd
- psrlw m1, 4
- paddw m0, m2
- mova m2, [dstq]
- psrlw m0, 4
- mova m3, [dstq+16]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
- mova m0, m4
- mova m1, m5
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m2, [srcq+2]
- lea srcq, [srcq + src_strideq*2]
- pavgw m0, m2
-.x_half_y_other_loop:
- movu m2, [srcq]
- movu m3, [srcq+src_strideq*2]
- movu m4, [srcq+2]
- movu m5, [srcq+src_strideq*2+2]
- pavgw m2, m4
- pavgw m3, m5
- mova m4, m2
- mova m5, m3
- pmullw m4, filter_y_a
- pmullw m3, filter_y_b
- paddw m4, filter_rnd
- paddw m4, m3
- pmullw m0, filter_y_a
- pmullw m2, filter_y_b
- paddw m0, filter_rnd
- psrlw m4, 4
- paddw m0, m2
- mova m2, [dstq]
- psrlw m0, 4
- mova m3, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m4, [secq]
-%endif
- SUM_SSE m0, m2, m4, m3, m6, m7
- mova m0, m5
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_half_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonhalf:
- test y_offsetd, y_offsetd
- jnz .x_nonhalf_y_nonzero
-
- ; x_offset == bilin interpolation && y_offset == 0
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+x_offsetq]
- mova m9, [bilin_filter+x_offsetq+16]
- mova m10, [pw_8]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0. We can reuse y_offset reg.
-%define tempq y_offsetq
- add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
-.x_other_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+2]
- movu m3, [srcq+18]
- mova m4, [dstq]
- mova m5, [dstq+16]
- pmullw m1, filter_x_a
- pmullw m3, filter_x_b
- paddw m1, filter_rnd
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- paddw m1, m3
- paddw m0, m2
- psrlw m1, 4
- psrlw m0, 4
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m4, m1, m5, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m1, [srcq+src_strideq*2]
- movu m2, [srcq+2]
- movu m3, [srcq+src_strideq*2+2]
- mova m4, [dstq]
- mova m5, [dstq+dst_strideq*2]
- pmullw m1, filter_x_a
- pmullw m3, filter_x_b
- paddw m1, filter_rnd
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- paddw m1, m3
- paddw m0, m2
- psrlw m1, 4
- psrlw m0, 4
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m1, [secq]
-%endif
- SUM_SSE m0, m4, m1, m5, m6, m7
-
- lea srcq, [srcq+src_strideq*4]
- lea dstq, [dstq+dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_other_y_zero_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonhalf_y_nonzero:
- cmp y_offsetd, 8
- jne .x_nonhalf_y_nonhalf
-
- ; x_offset == bilin interpolation && y_offset == 0.5
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+x_offsetq]
- mova m9, [bilin_filter+x_offsetq+16]
- mova m10, [pw_8]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0.5. We can reuse y_offset reg.
-%define tempq y_offsetq
- add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+2]
- movu m3, [srcq+18]
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- pmullw m1, filter_x_a
- pmullw m3, filter_x_b
- paddw m1, filter_rnd
- paddw m0, m2
- paddw m1, m3
- psrlw m0, 4
- psrlw m1, 4
- lea srcq, [srcq+src_strideq*2]
-.x_other_y_half_loop:
- movu m2, [srcq]
- movu m3, [srcq+16]
- movu m4, [srcq+2]
- movu m5, [srcq+18]
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m2, filter_rnd
- pmullw m3, filter_x_a
- pmullw m5, filter_x_b
- paddw m3, filter_rnd
- paddw m2, m4
- paddw m3, m5
- mova m4, [dstq]
- mova m5, [dstq+16]
- psrlw m2, 4
- psrlw m3, 4
- pavgw m0, m2
- pavgw m1, m3
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m4, m1, m5, m6, m7
- mova m0, m2
- mova m1, m3
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m2, [srcq+2]
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- paddw m0, m2
- psrlw m0, 4
- lea srcq, [srcq+src_strideq*2]
-.x_other_y_half_loop:
- movu m2, [srcq]
- movu m3, [srcq+src_strideq*2]
- movu m4, [srcq+2]
- movu m5, [srcq+src_strideq*2+2]
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m2, filter_rnd
- pmullw m3, filter_x_a
- pmullw m5, filter_x_b
- paddw m3, filter_rnd
- paddw m2, m4
- paddw m3, m5
- mova m4, [dstq]
- mova m5, [dstq+dst_strideq*2]
- psrlw m2, 4
- psrlw m3, 4
- pavgw m0, m2
- pavgw m2, m3
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m2, [secq]
-%endif
- SUM_SSE m0, m4, m2, m5, m6, m7
- mova m0, m3
-
- lea srcq, [srcq+src_strideq*4]
- lea dstq, [dstq+dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_other_y_half_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonhalf_y_nonhalf:
-; loading the filter - this is the same as in the 8-bit depth version
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+x_offsetq]
- mova m9, [bilin_filter+x_offsetq+16]
- mova m10, [bilin_filter+y_offsetq]
- mova m11, [bilin_filter+y_offsetq+16]
- mova m12, [pw_8]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_y_a m10
-%define filter_y_b m11
-%define filter_rnd m12
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; In this case, there is NO unused register, so the src_stride register is used.
-; Later, src_stride has to be reloaded from the stack when it is needed.
-%define tempq src_strideq
- mov tempq, g_bilin_filterm
- add x_offsetq, tempq
- add y_offsetq, tempq
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
- add y_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-; end of load filter
-
- ; x_offset == bilin interpolation && y_offset == bilin interpolation
-%if %1 == 16
- movu m0, [srcq]
- movu m2, [srcq+2]
- movu m1, [srcq+16]
- movu m3, [srcq+18]
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- pmullw m1, filter_x_a
- pmullw m3, filter_x_b
- paddw m1, filter_rnd
- paddw m0, m2
- paddw m1, m3
- psrlw m0, 4
- psrlw m1, 4
-
- INC_SRC_BY_SRC_STRIDE
-
-.x_other_y_other_loop:
- movu m2, [srcq]
- movu m4, [srcq+2]
- movu m3, [srcq+16]
- movu m5, [srcq+18]
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m2, filter_rnd
- pmullw m3, filter_x_a
- pmullw m5, filter_x_b
- paddw m3, filter_rnd
- paddw m2, m4
- paddw m3, m5
- psrlw m2, 4
- psrlw m3, 4
- mova m4, m2
- mova m5, m3
- pmullw m0, filter_y_a
- pmullw m2, filter_y_b
- paddw m0, filter_rnd
- pmullw m1, filter_y_a
- pmullw m3, filter_y_b
- paddw m0, m2
- paddw m1, filter_rnd
- mova m2, [dstq]
- paddw m1, m3
- psrlw m0, 4
- psrlw m1, 4
- mova m3, [dstq+16]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
- mova m0, m4
- mova m1, m5
-
- INC_SRC_BY_SRC_STRIDE
- lea dstq, [dstq + dst_strideq * 2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m2, [srcq+2]
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- paddw m0, m2
- psrlw m0, 4
-
- INC_SRC_BY_SRC_STRIDE
-
-.x_other_y_other_loop:
- movu m2, [srcq]
- movu m4, [srcq+2]
- movu m3, [srcq+src_strideq*2]
- movu m5, [srcq+src_strideq*2+2]
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m2, filter_rnd
- pmullw m3, filter_x_a
- pmullw m5, filter_x_b
- paddw m3, filter_rnd
- paddw m2, m4
- paddw m3, m5
- psrlw m2, 4
- psrlw m3, 4
- mova m4, m2
- mova m5, m3
- pmullw m0, filter_y_a
- pmullw m2, filter_y_b
- paddw m0, filter_rnd
- pmullw m4, filter_y_a
- pmullw m3, filter_y_b
- paddw m0, m2
- paddw m4, filter_rnd
- mova m2, [dstq]
- paddw m4, m3
- psrlw m0, 4
- psrlw m4, 4
- mova m3, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m4, [secq]
-%endif
- SUM_SSE m0, m2, m4, m3, m6, m7
- mova m0, m5
-
- INC_SRC_BY_SRC_2STRIDE
- lea dstq, [dstq + dst_strideq * 4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_other_y_other_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET
-%endmacro
-
-INIT_XMM sse2
-SUBPEL_VARIANCE 8
-SUBPEL_VARIANCE 16
-
-INIT_XMM sse2
-SUBPEL_VARIANCE 8, 1
-SUBPEL_VARIANCE 16, 1
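
The high-bit-depth SUBPEL_VARIANCE kernels removed above differ from the plain variant only in the avg path: the bilinear prediction is averaged with a second predictor via pavgw (which rounds up) before SUM_SSE accumulates the error terms. A minimal scalar sketch of that step for 16-bit pixels, with hypothetical helper and buffer names (the real kernels work on whole SIMD rows):

    #include <stdint.h>

    /* Sketch only: pavgw plus SUM_SSE for one row of 16-bit pixels. */
    static void highbd_avg_row_sum_sse(const uint16_t *pred,
                                       const uint16_t *second_pred,
                                       const uint16_t *dst, int width,
                                       int64_t *sum, uint64_t *sse) {
      int i;
      for (i = 0; i < width; ++i) {
        const int comp = (pred[i] + second_pred[i] + 1) >> 1;  /* pavgw */
        const int diff = comp - dst[i];
        *sum += diff;
        *sse += (int64_t)diff * diff;
      }
    }
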
--- a/vp9/encoder/x86/vp9_highbd_variance_sse2.c
+++ /dev/null
@@ -1,349 +1,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#include "./vpx_config.h"
-#include "vp9/common/vp9_common.h"
-
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_ports/mem.h"
-
-#define DECL(w, opt) \
-int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
- ptrdiff_t src_stride, \
- int x_offset, int y_offset, \
- const uint16_t *dst, \
- ptrdiff_t dst_stride, \
- int height, unsigned int *sse);
-#define DECLS(opt1, opt2) \
-DECL(8, opt1); \
-DECL(16, opt1)
-
-DECLS(sse2, sse);
-// DECLS(ssse3, ssse3);
-#undef DECLS
-#undef DECL
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
-uint32_t vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
- int src_stride, \
- int x_offset, \
- int y_offset, \
- const uint8_t *dst8, \
- int dst_stride, \
- uint32_t *sse_ptr) { \
- uint32_t sse; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, h, \
- &sse); \
- if (w > wf) { \
- unsigned int sse2; \
- int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
- src_stride, \
- x_offset, y_offset, \
- dst + 16, \
- dst_stride, \
- h, &sse2); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- h, &sse2); \
- se += se2; \
- sse += sse2; \
- se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, \
- dst + 48, dst_stride, h, &sse2); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-} \
-\
-uint32_t vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
- uint32_t sse; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- h, &sse); \
- if (w > wf) { \
- uint32_t sse2; \
- int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
- src_stride, \
- x_offset, y_offset, \
- dst + 16, \
- dst_stride, \
- h, &sse2); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- h, &sse2); \
- se += se2; \
- sse += sse2; \
- se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
- x_offset, y_offset, \
- dst + 48, dst_stride, \
- h, &sse2); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- se = ROUND_POWER_OF_TWO(se, 2); \
- sse = ROUND_POWER_OF_TWO(sse, 4); \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-} \
-\
-uint32_t vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
- int start_row; \
- uint32_t sse; \
- int se = 0; \
- uint64_t long_sse = 0; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- for (start_row = 0; start_row < h; start_row +=16) { \
- uint32_t sse2; \
- int height = h - start_row < 16 ? h - start_row : 16; \
- int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + (start_row * src_stride), src_stride, \
- x_offset, y_offset, dst + (start_row * dst_stride), \
- dst_stride, height, &sse2); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf) { \
- se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 16 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
- dst_stride, height, &sse2); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf * 2) { \
- se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 32 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
- dst_stride, height, &sse2); \
- se += se2; \
- long_sse += sse2; \
- se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
- dst_stride, height, &sse2); \
- se += se2; \
- long_sse += sse2; \
- }\
- } \
- } \
- se = ROUND_POWER_OF_TWO(se, 4); \
- sse = ROUND_POWER_OF_TWO(long_sse, 8); \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-}
-
-#define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
-FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
-FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
-FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
-FN(8, 4, 8, 3, 2, opt1, (int64_t));
-
-
-FNS(sse2, sse);
-
-#undef FNS
-#undef FN
-
-#define DECL(w, opt) \
-int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
- ptrdiff_t src_stride, \
- int x_offset, int y_offset, \
- const uint16_t *dst, \
- ptrdiff_t dst_stride, \
- const uint16_t *sec, \
- ptrdiff_t sec_stride, \
- int height, \
- unsigned int *sse);
-#define DECLS(opt1) \
-DECL(16, opt1) \
-DECL(8, opt1)
-
-DECLS(sse2);
-#undef DECL
-#undef DECLS
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
-uint32_t vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
- const uint8_t *sec8) { \
- uint32_t sse; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
- int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src, src_stride, x_offset, \
- y_offset, dst, dst_stride, sec, w, h, &sse); \
- if (w > wf) { \
- uint32_t sse2; \
- int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 16, src_stride, x_offset, y_offset, \
- dst + 16, dst_stride, sec + 16, w, h, &sse2); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 32, src_stride, x_offset, y_offset, \
- dst + 32, dst_stride, sec + 32, w, h, &sse2); \
- se += se2; \
- sse += sse2; \
- se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, \
- dst + 48, dst_stride, sec + 48, w, h, &sse2); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-} \
-\
-uint32_t vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
- const uint8_t *sec8) { \
- uint32_t sse; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
- int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src, src_stride, x_offset, \
- y_offset, dst, dst_stride, \
- sec, w, h, &sse); \
- if (w > wf) { \
- uint32_t sse2; \
- int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 16, src_stride, \
- x_offset, y_offset, \
- dst + 16, dst_stride, \
- sec + 16, w, h, &sse2); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- sec + 32, w, h, &sse2); \
- se += se2; \
- sse += sse2; \
- se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 48, src_stride, \
- x_offset, y_offset, \
- dst + 48, dst_stride, \
- sec + 48, w, h, &sse2); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- se = ROUND_POWER_OF_TWO(se, 2); \
- sse = ROUND_POWER_OF_TWO(sse, 4); \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-} \
-\
-uint32_t vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
- const uint8_t *sec8) { \
- int start_row; \
- uint32_t sse; \
- int se = 0; \
- uint64_t long_sse = 0; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
- for (start_row = 0; start_row < h; start_row +=16) { \
- uint32_t sse2; \
- int height = h - start_row < 16 ? h - start_row : 16; \
- int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + (start_row * src_stride), src_stride, x_offset, \
- y_offset, dst + (start_row * dst_stride), dst_stride, \
- sec + (start_row * w), w, height, &sse2); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf) { \
- se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 16 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, \
- dst + 16 + (start_row * dst_stride), dst_stride, \
- sec + 16 + (start_row * w), w, height, &sse2); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf * 2) { \
- se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 32 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, \
- dst + 32 + (start_row * dst_stride), dst_stride, \
- sec + 32 + (start_row * w), w, height, &sse2); \
- se += se2; \
- long_sse += sse2; \
- se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 48 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, \
- dst + 48 + (start_row * dst_stride), dst_stride, \
- sec + 48 + (start_row * w), w, height, &sse2); \
- se += se2; \
- long_sse += sse2; \
- } \
- } \
- } \
- se = ROUND_POWER_OF_TWO(se, 4); \
- sse = ROUND_POWER_OF_TWO(long_sse, 8); \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-}
-
-
-#define FNS(opt1) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
-FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
-FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
-FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
-FN(8, 4, 8, 3, 2, opt1, (int64_t));
-
-FNS(sse2);
-
-#undef FNS
-#undef FN
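
The FN() wrappers in the file above stitch the 8- and 16-wide column kernels into a whole-block variance: column se/sse results are summed, the 10- and 12-bit variants rescale the totals back to 8-bit units with ROUND_POWER_OF_TWO (the 12-bit path also accumulates sse in 64 bits, 16 rows at a time), and the return value is sse - se*se / (w*h), with w*h written as a shift by wlog2 + hlog2. A sketch of that final combination step, with an illustrative helper name rather than the library API:

    #include <stdint.h>

    /* Matches the library's ROUND_POWER_OF_TWO; relies on arithmetic right
     * shift for a negative sum of errors, as the original code does. */
    #define ROUND_POW2(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    /* Sketch only: the combination performed at the end of each FN() body. */
    static uint32_t highbd_combine_variance(int se, uint64_t sse,
                                            int wlog2, int hlog2,
                                            int bit_depth) {
      if (bit_depth == 10) {
        se = ROUND_POW2(se, 2);
        sse = ROUND_POW2(sse, 4);
      } else if (bit_depth == 12) {
        se = ROUND_POW2(se, 4);
        sse = ROUND_POW2(sse, 8);
      }
      /* variance = SSE - SE^2 / (W*H), with W*H == 1 << (wlog2 + hlog2) */
      return (uint32_t)(sse - (uint64_t)(((int64_t)se * se) >> (wlog2 + hlog2)));
    }
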
--- a/vp9/encoder/x86/vp9_subpel_variance.asm
+++ /dev/null
@@ -1,1396 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_8: times 8 dw 8
-bilin_filter_m_sse2: times 8 dw 16
- times 8 dw 0
- times 8 dw 14
- times 8 dw 2
- times 8 dw 12
- times 8 dw 4
- times 8 dw 10
- times 8 dw 6
- times 16 dw 8
- times 8 dw 6
- times 8 dw 10
- times 8 dw 4
- times 8 dw 12
- times 8 dw 2
- times 8 dw 14
-
-bilin_filter_m_ssse3: times 8 db 16, 0
- times 8 db 14, 2
- times 8 db 12, 4
- times 8 db 10, 6
- times 16 db 8
- times 8 db 6, 10
- times 8 db 4, 12
- times 8 db 2, 14
-
-SECTION .text
-
-; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
-; int x_offset, int y_offset,
-; const uint8_t *dst, ptrdiff_t dst_stride,
-; int height, unsigned int *sse);
-;
-; This function returns the SE and stores SSE in the given pointer.
-
-%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
- psubw %3, %4
- psubw %1, %2
- paddw %5, %3
- pmaddwd %3, %3
- paddw %5, %1
- pmaddwd %1, %1
- paddd %6, %3
- paddd %6, %1
-%endmacro
-
-%macro STORE_AND_RET 0
-%if mmsize == 16
- ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
- ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
- ; We have to sign-extend it before adding the words within the register
-  ; and outputting to a dword.
- pcmpgtw m5, m6 ; mask for 0 > x
- movhlps m3, m7
- punpcklwd m4, m6, m5
- punpckhwd m6, m5 ; sign-extend m6 word->dword
- paddd m7, m3
- paddd m6, m4
- pshufd m3, m7, 0x1
- movhlps m4, m6
- paddd m7, m3
- paddd m6, m4
- mov r1, ssem ; r1 = unsigned int *sse
- pshufd m4, m6, 0x1
- movd [r1], m7 ; store sse
- paddd m6, m4
- movd raxd, m6 ; store sum as return value
-%else ; mmsize == 8
- pshufw m4, m6, 0xe
- pshufw m3, m7, 0xe
- paddw m6, m4
- paddd m7, m3
- pcmpgtw m5, m6 ; mask for 0 > x
- mov r1, ssem ; r1 = unsigned int *sse
- punpcklwd m6, m5 ; sign-extend m6 word->dword
- movd [r1], m7 ; store sse
- pshufw m4, m6, 0xe
- paddd m6, m4
- movd raxd, m6 ; store sum as return value
-%endif
- RET
-%endmacro
-
-%macro INC_SRC_BY_SRC_STRIDE 0
-%if ARCH_X86=1 && CONFIG_PIC=1
- add srcq, src_stridemp
-%else
- add srcq, src_strideq
-%endif
-%endmacro
-
-%macro SUBPEL_VARIANCE 1-2 0 ; W
-%if cpuflag(ssse3)
-%define bilin_filter_m bilin_filter_m_ssse3
-%define filter_idx_shift 4
-%else
-%define bilin_filter_m bilin_filter_m_sse2
-%define filter_idx_shift 5
-%endif
-; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
-; 11, not 13, if the registers are ordered correctly. May make a minor speed
-; difference on Win64
-
-%ifdef PIC ; 64bit PIC
- %if %2 == 1 ; avg
- cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, height, sse
- %define sec_str sec_strideq
- %else
- cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
- y_offset, dst, dst_stride, height, sse
- %endif
- %define h heightd
- %define bilin_filter sseq
-%else
- %if ARCH_X86=1 && CONFIG_PIC=1
- %if %2 == 1 ; avg
- cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, \
- height, sse, g_bilin_filter, g_pw_8
- %define h dword heightm
- %define sec_str sec_stridemp
-
- ;Store bilin_filter and pw_8 location in stack
- GET_GOT eax
- add esp, 4 ; restore esp
-
- lea ecx, [GLOBAL(bilin_filter_m)]
- mov g_bilin_filterm, ecx
-
- lea ecx, [GLOBAL(pw_8)]
- mov g_pw_8m, ecx
-
- LOAD_IF_USED 0, 1 ; load eax, ecx back
- %else
- cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
- y_offset, dst, dst_stride, height, sse, \
- g_bilin_filter, g_pw_8
- %define h heightd
-
- ;Store bilin_filter and pw_8 location in stack
- GET_GOT eax
- add esp, 4 ; restore esp
-
- lea ecx, [GLOBAL(bilin_filter_m)]
- mov g_bilin_filterm, ecx
-
- lea ecx, [GLOBAL(pw_8)]
- mov g_pw_8m, ecx
-
- LOAD_IF_USED 0, 1 ; load eax, ecx back
- %endif
- %else
- %if %2 == 1 ; avg
- cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
- 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, \
- height, sse
- %if ARCH_X86_64
- %define h heightd
- %define sec_str sec_strideq
- %else
- %define h dword heightm
- %define sec_str sec_stridemp
- %endif
- %else
- cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
- y_offset, dst, dst_stride, height, sse
- %define h heightd
- %endif
-
- %define bilin_filter bilin_filter_m
- %endif
-%endif
-
- ASSERT %1 <= 16 ; m6 overflows if w > 16
- pxor m6, m6 ; sum
- pxor m7, m7 ; sse
- ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
- ; could perhaps use it for something more productive then
- pxor m5, m5 ; dedicated zero register
-%if %1 < 16
- sar h, 1
-%if %2 == 1 ; avg
- shl sec_str, 1
-%endif
-%endif
-
- ; FIXME(rbultje) replace by jumptable?
- test x_offsetd, x_offsetd
- jnz .x_nonzero
- ; x_offset == 0
- test y_offsetd, y_offsetd
- jnz .x_zero_y_nonzero
-
- ; x_offset == 0 && y_offset == 0
-.x_zero_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- mova m1, [dstq]
-%if %2 == 1 ; avg
- pavgb m0, [secq]
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
-%endif
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%if %2 == 0 ; !avg
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
-%if %2 == 1 ; avg
-%if mmsize == 16
- movhps m0, [srcq+src_strideq]
-%else ; mmsize == 8
- punpckldq m0, [srcq+src_strideq]
-%endif
-%else ; !avg
- movh m2, [srcq+src_strideq]
-%endif
- movh m1, [dstq]
- movh m3, [dstq+dst_strideq]
-%if %2 == 1 ; avg
- pavgb m0, [secq]
- punpcklbw m3, m5
- punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else ; !avg
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m3, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_zero_y_zero_loop
- STORE_AND_RET
-
-.x_zero_y_nonzero:
- cmp y_offsetd, 8
- jne .x_zero_y_nonhalf
-
- ; x_offset == 0 && y_offset == 0.5
-.x_zero_y_half_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m4, [srcq+src_strideq]
- mova m1, [dstq]
- pavgb m0, m4
- punpckhbw m3, m1, m5
-%if %2 == 1 ; avg
- pavgb m0, [secq]
-%endif
- punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
- movh m2, [srcq+src_strideq]
-%if %2 == 1 ; avg
-%if mmsize == 16
- movhps m2, [srcq+src_strideq*2]
-%else ; mmsize == 8
-%if %1 == 4
- movh m1, [srcq+src_strideq*2]
- punpckldq m2, m1
-%else
- punpckldq m2, [srcq+src_strideq*2]
-%endif
-%endif
- movh m1, [dstq]
-%if mmsize == 16
- movlhps m0, m2
-%else ; mmsize == 8
- punpckldq m0, m2
-%endif
- movh m3, [dstq+dst_strideq]
- pavgb m0, m2
- punpcklbw m1, m5
- pavgb m0, [secq]
- punpcklbw m3, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else ; !avg
- movh m4, [srcq+src_strideq*2]
- movh m1, [dstq]
- pavgb m0, m2
- movh m3, [dstq+dst_strideq]
- pavgb m2, m4
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m3, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_zero_y_half_loop
- STORE_AND_RET
-
-.x_zero_y_nonhalf:
- ; x_offset == 0 && y_offset == bilin interpolation
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+y_offsetq+16]
-%endif
- mova m10, [pw_8]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ; x86-32 or mmx
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0, reuse x_offset reg
-%define tempq x_offsetq
- add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
-.x_zero_y_other_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m4, [srcq+src_strideq]
- mova m1, [dstq]
-%if cpuflag(ssse3)
- punpckhbw m2, m0, m4
- punpcklbw m0, m4
- pmaddubsw m2, filter_y_a
- pmaddubsw m0, filter_y_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
-%else
- punpckhbw m2, m0, m5
- punpckhbw m3, m4, m5
- punpcklbw m0, m5
- punpcklbw m4, m5
- ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
-  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
- ; instructions is the same (5), but it is 1 mul instead of 2, so might be
- ; slightly faster because of pmullw latency. It would also cut our rodata
- ; tables in half for this function, and save 1-2 registers on x86-64.
- pmullw m2, filter_y_a
- pmullw m3, filter_y_b
- paddw m2, filter_rnd
- pmullw m0, filter_y_a
- pmullw m4, filter_y_b
- paddw m0, filter_rnd
- paddw m2, m3
- paddw m0, m4
-%endif
- psraw m2, 4
- psraw m0, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
- movh m2, [srcq+src_strideq]
- movh m4, [srcq+src_strideq*2]
- movh m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
- movh m1, [dstq]
- punpcklbw m0, m2
- punpcklbw m2, m4
- pmaddubsw m0, filter_y_a
- pmaddubsw m2, filter_y_a
- punpcklbw m3, m5
- paddw m2, filter_rnd
- paddw m0, filter_rnd
-%else
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m4, m5
- pmullw m0, filter_y_a
- pmullw m1, m2, filter_y_b
- punpcklbw m3, m5
- paddw m0, filter_rnd
- pmullw m2, filter_y_a
- pmullw m4, filter_y_b
- paddw m0, m1
- paddw m2, filter_rnd
- movh m1, [dstq]
- paddw m2, m4
-%endif
- psraw m0, 4
- psraw m2, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_zero_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonzero:
- cmp x_offsetd, 8
- jne .x_nonhalf
- ; x_offset == 0.5
- test y_offsetd, y_offsetd
- jnz .x_half_y_nonzero
-
- ; x_offset == 0.5 && y_offset == 0
-.x_half_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m4, [srcq+1]
- mova m1, [dstq]
- pavgb m0, m4
- punpckhbw m3, m1, m5
-%if %2 == 1 ; avg
- pavgb m0, [secq]
-%endif
- punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
- movh m4, [srcq+1]
-%if %2 == 1 ; avg
-%if mmsize == 16
- movhps m0, [srcq+src_strideq]
- movhps m4, [srcq+src_strideq+1]
-%else ; mmsize == 8
- punpckldq m0, [srcq+src_strideq]
- punpckldq m4, [srcq+src_strideq+1]
-%endif
- movh m1, [dstq]
- movh m3, [dstq+dst_strideq]
- pavgb m0, m4
- punpcklbw m3, m5
- pavgb m0, [secq]
- punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else ; !avg
- movh m2, [srcq+src_strideq]
- movh m1, [dstq]
- pavgb m0, m4
- movh m4, [srcq+src_strideq+1]
- movh m3, [dstq+dst_strideq]
- pavgb m2, m4
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m3, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_half_y_zero_loop
- STORE_AND_RET
-
-.x_half_y_nonzero:
- cmp y_offsetd, 8
- jne .x_half_y_nonhalf
-
- ; x_offset == 0.5 && y_offset == 0.5
-%if %1 == 16
- movu m0, [srcq]
- movu m3, [srcq+1]
- add srcq, src_strideq
- pavgb m0, m3
-.x_half_y_half_loop:
- movu m4, [srcq]
- movu m3, [srcq+1]
- mova m1, [dstq]
- pavgb m4, m3
- punpckhbw m3, m1, m5
- pavgb m0, m4
-%if %2 == 1 ; avg
- punpcklbw m1, m5
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
- movh m3, [srcq+1]
- add srcq, src_strideq
- pavgb m0, m3
-.x_half_y_half_loop:
- movh m2, [srcq]
- movh m3, [srcq+1]
-%if %2 == 1 ; avg
-%if mmsize == 16
- movhps m2, [srcq+src_strideq]
- movhps m3, [srcq+src_strideq+1]
-%else
-%if %1 == 4
- movh m1, [srcq+src_strideq]
- punpckldq m2, m1
- movh m1, [srcq+src_strideq+1]
- punpckldq m3, m1
-%else
- punpckldq m2, [srcq+src_strideq]
- punpckldq m3, [srcq+src_strideq+1]
-%endif
-%endif
- pavgb m2, m3
-%if mmsize == 16
- movlhps m0, m2
- movhlps m4, m2
-%else ; mmsize == 8
- punpckldq m0, m2
- pshufw m4, m2, 0xe
-%endif
- movh m1, [dstq]
- pavgb m0, m2
- movh m3, [dstq+dst_strideq]
- pavgb m0, [secq]
- punpcklbw m3, m5
- punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else ; !avg
- movh m4, [srcq+src_strideq]
- movh m1, [srcq+src_strideq+1]
- pavgb m2, m3
- pavgb m4, m1
- pavgb m0, m2
- pavgb m2, m4
- movh m1, [dstq]
- movh m3, [dstq+dst_strideq]
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m3, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_half_y_half_loop
- STORE_AND_RET
-
-.x_half_y_nonhalf:
- ; x_offset == 0.5 && y_offset == bilin interpolation
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+y_offsetq+16]
-%endif
- mova m10, [pw_8]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ; x86_32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0.5. We can reuse x_offset reg
-%define tempq x_offsetq
- add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
-%if %1 == 16
- movu m0, [srcq]
- movu m3, [srcq+1]
- add srcq, src_strideq
- pavgb m0, m3
-.x_half_y_other_loop:
- movu m4, [srcq]
- movu m2, [srcq+1]
- mova m1, [dstq]
- pavgb m4, m2
-%if cpuflag(ssse3)
- punpckhbw m2, m0, m4
- punpcklbw m0, m4
- pmaddubsw m2, filter_y_a
- pmaddubsw m0, filter_y_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
- psraw m2, 4
-%else
- punpckhbw m2, m0, m5
- punpckhbw m3, m4, m5
- pmullw m2, filter_y_a
- pmullw m3, filter_y_b
- paddw m2, filter_rnd
- punpcklbw m0, m5
- paddw m2, m3
- punpcklbw m3, m4, m5
- pmullw m0, filter_y_a
- pmullw m3, filter_y_b
- paddw m0, filter_rnd
- psraw m2, 4
- paddw m0, m3
-%endif
- punpckhbw m3, m1, m5
- psraw m0, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
- movh m3, [srcq+1]
- add srcq, src_strideq
- pavgb m0, m3
-%if notcpuflag(ssse3)
- punpcklbw m0, m5
-%endif
-.x_half_y_other_loop:
- movh m2, [srcq]
- movh m1, [srcq+1]
- movh m4, [srcq+src_strideq]
- movh m3, [srcq+src_strideq+1]
- pavgb m2, m1
- pavgb m4, m3
- movh m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
- movh m1, [dstq]
- punpcklbw m0, m2
- punpcklbw m2, m4
- pmaddubsw m0, filter_y_a
- pmaddubsw m2, filter_y_a
- punpcklbw m3, m5
- paddw m0, filter_rnd
- paddw m2, filter_rnd
-%else
- punpcklbw m2, m5
- punpcklbw m4, m5
- pmullw m0, filter_y_a
- pmullw m1, m2, filter_y_b
- punpcklbw m3, m5
- paddw m0, filter_rnd
- pmullw m2, filter_y_a
- paddw m0, m1
- pmullw m1, m4, filter_y_b
- paddw m2, filter_rnd
- paddw m2, m1
- movh m1, [dstq]
-%endif
- psraw m0, 4
- psraw m2, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_half_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonhalf:
- test y_offsetd, y_offsetd
- jnz .x_nonhalf_y_nonzero
-
- ; x_offset == bilin interpolation && y_offset == 0
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+x_offsetq+16]
-%endif
- mova m10, [pw_8]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0. We can reuse y_offset reg.
-%define tempq y_offsetq
- add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
-.x_other_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m4, [srcq+1]
- mova m1, [dstq]
-%if cpuflag(ssse3)
- punpckhbw m2, m0, m4
- punpcklbw m0, m4
- pmaddubsw m2, filter_x_a
- pmaddubsw m0, filter_x_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
-%else
- punpckhbw m2, m0, m5
- punpckhbw m3, m4, m5
- punpcklbw m0, m5
- punpcklbw m4, m5
- pmullw m2, filter_x_a
- pmullw m3, filter_x_b
- paddw m2, filter_rnd
- pmullw m0, filter_x_a
- pmullw m4, filter_x_b
- paddw m0, filter_rnd
- paddw m2, m3
- paddw m0, m4
-%endif
- psraw m2, 4
- psraw m0, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
- movh m1, [srcq+1]
- movh m2, [srcq+src_strideq]
- movh m4, [srcq+src_strideq+1]
- movh m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
- punpcklbw m0, m1
- movh m1, [dstq]
- punpcklbw m2, m4
- pmaddubsw m0, filter_x_a
- pmaddubsw m2, filter_x_a
- punpcklbw m3, m5
- paddw m0, filter_rnd
- paddw m2, filter_rnd
-%else
- punpcklbw m0, m5
- punpcklbw m1, m5
- punpcklbw m2, m5
- punpcklbw m4, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- punpcklbw m3, m5
- paddw m0, filter_rnd
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m0, m1
- paddw m2, filter_rnd
- movh m1, [dstq]
- paddw m2, m4
-%endif
- psraw m0, 4
- psraw m2, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_other_y_zero_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonhalf_y_nonzero:
- cmp y_offsetd, 8
- jne .x_nonhalf_y_nonhalf
-
- ; x_offset == bilin interpolation && y_offset == 0.5
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+x_offsetq+16]
-%endif
- mova m10, [pw_8]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0.5. We can reuse y_offset reg.
-%define tempq y_offsetq
- add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+1]
-%if cpuflag(ssse3)
- punpckhbw m2, m0, m1
- punpcklbw m0, m1
- pmaddubsw m2, filter_x_a
- pmaddubsw m0, filter_x_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
-%else
- punpckhbw m2, m0, m5
- punpckhbw m3, m1, m5
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- paddw m0, filter_rnd
- pmullw m2, filter_x_a
- pmullw m3, filter_x_b
- paddw m2, filter_rnd
- paddw m0, m1
- paddw m2, m3
-%endif
- psraw m0, 4
- psraw m2, 4
- add srcq, src_strideq
- packuswb m0, m2
-.x_other_y_half_loop:
- movu m4, [srcq]
- movu m3, [srcq+1]
-%if cpuflag(ssse3)
- mova m1, [dstq]
- punpckhbw m2, m4, m3
- punpcklbw m4, m3
- pmaddubsw m2, filter_x_a
- pmaddubsw m4, filter_x_a
- paddw m2, filter_rnd
- paddw m4, filter_rnd
- psraw m2, 4
- psraw m4, 4
- packuswb m4, m2
- pavgb m0, m4
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
-%else
- punpckhbw m2, m4, m5
- punpckhbw m1, m3, m5
- punpcklbw m4, m5
- punpcklbw m3, m5
- pmullw m4, filter_x_a
- pmullw m3, filter_x_b
- paddw m4, filter_rnd
- pmullw m2, filter_x_a
- pmullw m1, filter_x_b
- paddw m2, filter_rnd
- paddw m4, m3
- paddw m2, m1
- mova m1, [dstq]
- psraw m4, 4
- psraw m2, 4
- punpckhbw m3, m1, m5
- ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
- ; have a 1-register shortage to be able to store the backup of the bilin
-  ; filtered second line as words, as a cache for the next line. Packing into
- ; a byte costs 1 pack and 2 unpacks, but saves a register.
- packuswb m4, m2
- punpcklbw m1, m5
- pavgb m0, m4
-%endif
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- pavgb m0, [secq]
-%endif
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
- movh m1, [srcq+1]
-%if cpuflag(ssse3)
- punpcklbw m0, m1
- pmaddubsw m0, filter_x_a
- paddw m0, filter_rnd
-%else
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- paddw m0, filter_rnd
- paddw m0, m1
-%endif
- add srcq, src_strideq
- psraw m0, 4
-.x_other_y_half_loop:
- movh m2, [srcq]
- movh m1, [srcq+1]
- movh m4, [srcq+src_strideq]
- movh m3, [srcq+src_strideq+1]
-%if cpuflag(ssse3)
- punpcklbw m2, m1
- punpcklbw m4, m3
- pmaddubsw m2, filter_x_a
- pmaddubsw m4, filter_x_a
- movh m1, [dstq]
- movh m3, [dstq+dst_strideq]
- paddw m2, filter_rnd
- paddw m4, filter_rnd
-%else
- punpcklbw m2, m5
- punpcklbw m1, m5
- punpcklbw m4, m5
- punpcklbw m3, m5
- pmullw m2, filter_x_a
- pmullw m1, filter_x_b
- paddw m2, filter_rnd
- pmullw m4, filter_x_a
- pmullw m3, filter_x_b
- paddw m4, filter_rnd
- paddw m2, m1
- movh m1, [dstq]
- paddw m4, m3
- movh m3, [dstq+dst_strideq]
-%endif
- psraw m2, 4
- psraw m4, 4
- pavgw m0, m2
- pavgw m2, m4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline - also consider going to bytes here
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpcklbw m3, m5
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_other_y_half_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonhalf_y_nonhalf:
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl x_offsetd, filter_idx_shift
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+x_offsetq+16]
-%endif
- mova m10, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m11, [bilin_filter+y_offsetq+16]
-%endif
- mova m12, [pw_8]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_y_a m10
-%define filter_y_b m11
-%define filter_rnd m12
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; In this case, there is NO unused register, so the src_stride register is used.
-; Later, src_stride has to be reloaded from the stack when it is needed.
-%define tempq src_strideq
- mov tempq, g_bilin_filterm
- add x_offsetq, tempq
- add y_offsetq, tempq
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
- add y_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
- ; x_offset == bilin interpolation && y_offset == bilin interpolation
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+1]
-%if cpuflag(ssse3)
- punpckhbw m2, m0, m1
- punpcklbw m0, m1
- pmaddubsw m2, filter_x_a
- pmaddubsw m0, filter_x_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
-%else
- punpckhbw m2, m0, m5
- punpckhbw m3, m1, m5
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- paddw m0, filter_rnd
- pmullw m2, filter_x_a
- pmullw m3, filter_x_b
- paddw m2, filter_rnd
- paddw m0, m1
- paddw m2, m3
-%endif
- psraw m0, 4
- psraw m2, 4
-
- INC_SRC_BY_SRC_STRIDE
-
- packuswb m0, m2
-.x_other_y_other_loop:
-%if cpuflag(ssse3)
- movu m4, [srcq]
- movu m3, [srcq+1]
- mova m1, [dstq]
- punpckhbw m2, m4, m3
- punpcklbw m4, m3
- pmaddubsw m2, filter_x_a
- pmaddubsw m4, filter_x_a
- punpckhbw m3, m1, m5
- paddw m2, filter_rnd
- paddw m4, filter_rnd
- psraw m2, 4
- psraw m4, 4
- packuswb m4, m2
- punpckhbw m2, m0, m4
- punpcklbw m0, m4
- pmaddubsw m2, filter_y_a
- pmaddubsw m0, filter_y_a
- punpcklbw m1, m5
- paddw m2, filter_rnd
- paddw m0, filter_rnd
- psraw m2, 4
- psraw m0, 4
-%else
- movu m3, [srcq]
- movu m4, [srcq+1]
- punpckhbw m1, m3, m5
- punpckhbw m2, m4, m5
- punpcklbw m3, m5
- punpcklbw m4, m5
- pmullw m3, filter_x_a
- pmullw m4, filter_x_b
- paddw m3, filter_rnd
- pmullw m1, filter_x_a
- pmullw m2, filter_x_b
- paddw m1, filter_rnd
- paddw m3, m4
- paddw m1, m2
- psraw m3, 4
- psraw m1, 4
- packuswb m4, m3, m1
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- pmullw m2, filter_y_a
- pmullw m1, filter_y_b
- paddw m2, filter_rnd
- pmullw m0, filter_y_a
- pmullw m3, filter_y_b
- paddw m2, m1
- mova m1, [dstq]
- paddw m0, filter_rnd
- psraw m2, 4
- paddw m0, m3
- punpckhbw m3, m1, m5
- psraw m0, 4
- punpcklbw m1, m5
-%endif
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- INC_SRC_BY_SRC_STRIDE
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
- movh m1, [srcq+1]
-%if cpuflag(ssse3)
- punpcklbw m0, m1
- pmaddubsw m0, filter_x_a
- paddw m0, filter_rnd
-%else
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- paddw m0, filter_rnd
- paddw m0, m1
-%endif
- psraw m0, 4
-%if cpuflag(ssse3)
- packuswb m0, m0
-%endif
-
- INC_SRC_BY_SRC_STRIDE
-
-.x_other_y_other_loop:
- movh m2, [srcq]
- movh m1, [srcq+1]
-
- INC_SRC_BY_SRC_STRIDE
- movh m4, [srcq]
- movh m3, [srcq+1]
-
-%if cpuflag(ssse3)
- punpcklbw m2, m1
- punpcklbw m4, m3
- pmaddubsw m2, filter_x_a
- pmaddubsw m4, filter_x_a
- movh m3, [dstq+dst_strideq]
- movh m1, [dstq]
- paddw m2, filter_rnd
- paddw m4, filter_rnd
- psraw m2, 4
- psraw m4, 4
- packuswb m2, m2
- packuswb m4, m4
- punpcklbw m0, m2
- punpcklbw m2, m4
- pmaddubsw m0, filter_y_a
- pmaddubsw m2, filter_y_a
- punpcklbw m3, m5
- paddw m0, filter_rnd
- paddw m2, filter_rnd
- psraw m0, 4
- psraw m2, 4
- punpcklbw m1, m5
-%else
- punpcklbw m2, m5
- punpcklbw m1, m5
- punpcklbw m4, m5
- punpcklbw m3, m5
- pmullw m2, filter_x_a
- pmullw m1, filter_x_b
- paddw m2, filter_rnd
- pmullw m4, filter_x_a
- pmullw m3, filter_x_b
- paddw m4, filter_rnd
- paddw m2, m1
- paddw m4, m3
- psraw m2, 4
- psraw m4, 4
- pmullw m0, filter_y_a
- pmullw m3, m2, filter_y_b
- paddw m0, filter_rnd
- pmullw m2, filter_y_a
- pmullw m1, m4, filter_y_b
- paddw m2, filter_rnd
- paddw m0, m3
- movh m3, [dstq+dst_strideq]
- paddw m2, m1
- movh m1, [dstq]
- psraw m0, 4
- psraw m2, 4
- punpcklbw m3, m5
- punpcklbw m1, m5
-%endif
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- INC_SRC_BY_SRC_STRIDE
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_other_y_other_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET
-%endmacro
-
-; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
-; between the ssse3 and non-ssse3 version. It may make sense to merge their
-; code in the sense that the ssse3 version would jump to the appropriate
-; location in the sse/2 version, rather than duplicating that code in the
-; binary.
-
-INIT_MMX sse
-SUBPEL_VARIANCE 4
-INIT_XMM sse2
-SUBPEL_VARIANCE 8
-SUBPEL_VARIANCE 16
-
-INIT_MMX ssse3
-SUBPEL_VARIANCE 4
-INIT_XMM ssse3
-SUBPEL_VARIANCE 8
-SUBPEL_VARIANCE 16
-
-INIT_MMX sse
-SUBPEL_VARIANCE 4, 1
-INIT_XMM sse2
-SUBPEL_VARIANCE 8, 1
-SUBPEL_VARIANCE 16, 1
-
-INIT_MMX ssse3
-SUBPEL_VARIANCE 4, 1
-INIT_XMM ssse3
-SUBPEL_VARIANCE 8, 1
-SUBPEL_VARIANCE 16, 1
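
Every SUBPEL_VARIANCE variant deleted above implements the same two-pass bilinear scheme: x_offset selects a horizontal (f0, f1) pair and y_offset a vertical one from bilin_filter_m (each pair sums to 16, with pw_8 as the rounding constant and a right shift by 4), the zero and half-pel offsets fall back to plain copies or pavgb, and SUM_SSE accumulates the sum and sum of squares of the prediction error, which STORE_AND_RET reduces to the returned SE and the stored SSE. A scalar model of the whole computation, offered only as a sketch; the filter pairs are passed in directly, so nothing is assumed about how the offsets index the table:

    #include <stdint.h>

    /* Sketch only: scalar equivalent of one NxH sub-pixel variance, with the
     * final variance folded in (the asm returns SE and stores SSE instead).
     * Like the kernels, it reads one extra row and column of the source. */
    static uint32_t subpel_variance_model(const uint8_t *src, int src_stride,
                                          int fx0, int fx1, int fy0, int fy1,
                                          const uint8_t *dst, int dst_stride,
                                          int w, int h, uint32_t *sse_out) {
      int64_t sum = 0;
      uint64_t sse = 0;
      int x, y;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x) {
          /* horizontal bilinear on the current and the next row */
          const int a = (fx0 * src[y * src_stride + x] +
                         fx1 * src[y * src_stride + x + 1] + 8) >> 4;
          const int b = (fx0 * src[(y + 1) * src_stride + x] +
                         fx1 * src[(y + 1) * src_stride + x + 1] + 8) >> 4;
          /* vertical bilinear, then the error against the reference */
          const int pred = (fy0 * a + fy1 * b + 8) >> 4;
          const int diff = pred - dst[y * dst_stride + x];
          sum += diff;
          sse += (int64_t)diff * diff;
        }
      }
      *sse_out = (uint32_t)sse;
      return (uint32_t)(sse - (uint64_t)((sum * sum) / (w * h)));
    }
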
--- a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
+++ /dev/null
@@ -1,525 +1,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <immintrin.h> // AVX2
-
-#include "./vp9_rtcd.h"
-#include "vpx_ports/mem.h"
-#include "vp9/encoder/vp9_variance.h"
-
-DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
- 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
- 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
- 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
- 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
- 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
- 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
- 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
- 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
- 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
- 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
- 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
- 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
- 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
-};
-
-#define FILTER_SRC(filter) \
- /* filter the source */ \
- exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
- exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
- \
- /* add 8 to source */ \
- exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
- exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
- \
- /* divide source by 16 */ \
- exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
- exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-
-#define MERGE_WITH_SRC(src_reg, reg) \
- exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
- exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
-
-#define LOAD_SRC_DST \
- /* load source and destination */ \
- src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
- dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
-
-#define AVG_NEXT_SRC(src_reg, size_stride) \
- src_next_reg = _mm256_loadu_si256((__m256i const *) \
- (src + size_stride)); \
- /* average between current and next stride source */ \
- src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
-
-#define MERGE_NEXT_SRC(src_reg, size_stride) \
- src_next_reg = _mm256_loadu_si256((__m256i const *) \
- (src + size_stride)); \
- MERGE_WITH_SRC(src_reg, src_next_reg)
-
-#define CALC_SUM_SSE_INSIDE_LOOP \
- /* expand each byte to 2 bytes */ \
- exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
- exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
- /* source - dest */ \
- exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
- exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
-  /* calculate sum */ \
- sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
- exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
- sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
- exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
- /* calculate sse */ \
- sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
- sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
-// final calculation of sum and sse
-#define CALC_SUM_AND_SSE \
- res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
- sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
- sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
- sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
- sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
- sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
- \
- sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
- sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
- \
- sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
- sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
- *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
- _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
- sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
- sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
- sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
- _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
-
-
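
The macros above carry the per-pixel arithmetic of the AVX2 kernels that follow: FILTER_SRC uses _mm256_maddubs_epi16 to multiply interleaved unsigned source bytes by the signed (f0, f1) filter bytes and sum each pair, then adds 8 and shifts right by 4, and CALC_SUM_SSE_INSIDE_LOOP accumulates the signed error and its square before CALC_SUM_AND_SSE reduces the accumulators to scalars. Per byte pair that amounts to the following sketch (not library code):

    #include <stdint.h>

    /* Sketch only: the value one 16-bit lane holds after FILTER_SRC. */
    static int16_t bilin_filter_pair(uint8_t a, uint8_t b, int f0, int f1) {
      return (int16_t)((f0 * a + f1 * b + 8) >> 4);  /* maddubs, add pw8, srai 4 */
    }
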
-unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- int height,
- unsigned int *sse) {
- __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
- __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
- __m256i zero_reg;
- int i, sum;
- sum_reg = _mm256_set1_epi16(0);
- sse_reg = _mm256_set1_epi16(0);
- zero_reg = _mm256_set1_epi16(0);
-
- // x_offset = 0 and y_offset = 0
- if (x_offset == 0) {
- if (y_offset == 0) {
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
-        // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- // x_offset = 0 and y_offset = 8
- } else if (y_offset == 8) {
- __m256i src_next_reg;
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, src_stride)
-        // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- // x_offset = 0 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg;
-
- y_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, src_stride)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- }
- // x_offset = 8 and y_offset = 0
- } else if (x_offset == 8) {
- if (y_offset == 0) {
- __m256i src_next_reg;
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- // x_offset = 8 and y_offset = 8
- } else if (y_offset == 8) {
- __m256i src_next_reg, src_avg;
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height ; i++) {
- src_avg = src_reg;
- src+= src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- // average between previous average to current average
- src_avg = _mm256_avg_epu8(src_avg, src_reg);
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_avg, zero_reg)
- // save current source average
- CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
- }
- // x_offset = 8 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg, src_avg;
- y_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height ; i++) {
- // save current source average
- src_avg = src_reg;
- src+= src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- MERGE_WITH_SRC(src_avg, src_reg)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
- }
- }
- // x_offset = bilin interpolation and y_offset = 0
- } else {
- if (y_offset == 0) {
- __m256i filter, pw8, src_next_reg;
- x_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = 8
- } else if (y_offset == 8) {
- __m256i filter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height ; i++) {
- src+= src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // average between previous pack to the current
- src_pack = _mm256_avg_epu8(src_pack, src_reg);
- MERGE_WITH_SRC(src_pack, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src_pack = src_reg;
- dst+= dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = bilin interpolation
- } else {
- __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- xfilter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
- y_offset <<= 5;
- yfilter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
- MERGE_NEXT_SRC(src_reg, 1)
-
- FILTER_SRC(xfilter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height ; i++) {
- src+= src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(xfilter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // merge previous pack to current pack source
- MERGE_WITH_SRC(src_pack, src_reg)
- // filter the source
- FILTER_SRC(yfilter)
- src_pack = src_reg;
- CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
- }
- }
- }
- CALC_SUM_AND_SSE
- return sum;
-}
-
-unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- const uint8_t *sec,
- int sec_stride,
- int height,
- unsigned int *sse) {
- __m256i sec_reg;
- __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
- __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
- __m256i zero_reg;
- int i, sum;
- sum_reg = _mm256_set1_epi16(0);
- sse_reg = _mm256_set1_epi16(0);
- zero_reg = _mm256_set1_epi16(0);
-
- // x_offset = 0 and y_offset = 0
- if (x_offset == 0) {
- if (y_offset == 0) {
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec+= sec_stride;
- // expend each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- } else if (y_offset == 8) {
- __m256i src_next_reg;
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, src_stride)
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec+= sec_stride;
- // expend each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- // x_offset = 0 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg;
-
- y_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, src_stride)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec+= sec_stride;
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- }
- // x_offset = 8 and y_offset = 0
- } else if (x_offset == 8) {
- if (y_offset == 0) {
- __m256i src_next_reg;
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec+= sec_stride;
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- // x_offset = 8 and y_offset = 8
- } else if (y_offset == 8) {
- __m256i src_next_reg, src_avg;
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height ; i++) {
- // save current source average
- src_avg = src_reg;
- src+= src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- // average between previous average to current average
- src_avg = _mm256_avg_epu8(src_avg, src_reg);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_avg = _mm256_avg_epu8(src_avg, sec_reg);
- sec+= sec_stride;
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_avg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
- }
- // x_offset = 8 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg, src_avg;
- y_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height ; i++) {
- // save current source average
- src_avg = src_reg;
- src+= src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- MERGE_WITH_SRC(src_avg, src_reg)
- FILTER_SRC(filter)
- src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_avg = _mm256_avg_epu8(src_avg, sec_reg);
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_avg, zero_reg)
- sec+= sec_stride;
- CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
- }
- }
- // x_offset = bilin interpolation and y_offset = 0
- } else {
- if (y_offset == 0) {
- __m256i filter, pw8, src_next_reg;
- x_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- MERGE_WITH_SRC(src_reg, zero_reg)
- sec+= sec_stride;
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = 8
- } else if (y_offset == 8) {
- __m256i filter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height ; i++) {
- src+= src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // average between previous pack to the current
- src_pack = _mm256_avg_epu8(src_pack, src_reg);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_pack = _mm256_avg_epu8(src_pack, sec_reg);
- sec+= sec_stride;
- MERGE_WITH_SRC(src_pack, zero_reg)
- src_pack = src_reg;
- CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = bilin interpolation
- } else {
- __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- xfilter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
- y_offset <<= 5;
- yfilter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
- MERGE_NEXT_SRC(src_reg, 1)
-
- FILTER_SRC(xfilter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height ; i++) {
- src+= src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(xfilter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // merge previous pack to current pack source
- MERGE_WITH_SRC(src_pack, src_reg)
- // filter the source
- FILTER_SRC(yfilter)
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_pack = _mm256_avg_epu8(src_pack, sec_reg);
- MERGE_WITH_SRC(src_pack, zero_reg)
- src_pack = src_reg;
- sec+= sec_stride;
- CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
- }
- }
- }
- CALC_SUM_AND_SSE
- return sum;
-}
--- a/vp9/encoder/x86/vp9_variance_avx2.c
+++ /dev/null
@@ -1,104 +1,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
-
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_ports/mem.h"
-
-unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
- int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride,
- int height,
- unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- const uint8_t *sec,
- int sec_stride,
- int height,
- unsigned int *sseptr);
-
-unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- unsigned int sse1;
- const int se1 = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- 64, &sse1);
- unsigned int sse2;
- const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
- x_offset, y_offset,
- dst + 32, dst_stride,
- 64, &sse2);
- const int se = se1 + se2;
- *sse = sse1 + sse2;
- return *sse - (((int64_t)se * se) >> 12);
-}
-
-unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- 32, sse);
- return *sse - (((int64_t)se * se) >> 10);
-}
-
-unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse,
- const uint8_t *sec) {
- unsigned int sse1;
- const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- sec, 64, 64, &sse1);
- unsigned int sse2;
- const int se2 =
- vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
- y_offset, dst + 32, dst_stride,
- sec + 32, 64, 64, &sse2);
- const int se = se1 + se2;
-
- *sse = sse1 + sse2;
-
- return *sse - (((int64_t)se * se) >> 12);
-}
-
-unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse,
- const uint8_t *sec) {
- // processing 32 element in parallel
- const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- sec, 32, 32, sse);
- return *sse - (((int64_t)se * se) >> 10);
-}
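For readers following the refactor, the closing expression in each of the wrappers above is the usual variance identity: variance = SSE - sum^2 / N, where N is the pixel count of the block, so the shift is 12 for 64x64 (2^12 pixels) and 10 for 32x32. A minimal scalar sketch of that final step (the function name is illustrative, not part of the library):

static unsigned int variance_from_sums(unsigned int sse, int sum,
                                       int log2_count) {
  /* variance = SSE - (sum * sum) / N, with N = 1 << log2_count */
  return sse - (unsigned int)(((int64_t)sum * sum) >> log2_count);
}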
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ /dev/null
@@ -1,182 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
-
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_ports/mem.h"
-
-// The 2 unused parameters are place holders for PIC enabled build.
-#define DECL(w, opt) \
-int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
- ptrdiff_t src_stride, \
- int x_offset, int y_offset, \
- const uint8_t *dst, \
- ptrdiff_t dst_stride, \
- int height, unsigned int *sse, \
- void *unused0, void *unused)
-#define DECLS(opt1, opt2) \
-DECL(4, opt2); \
-DECL(8, opt1); \
-DECL(16, opt1)
-
-DECLS(sse2, sse);
-DECLS(ssse3, ssse3);
-#undef DECLS
-#undef DECL
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
-unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
- int src_stride, \
- int x_offset, \
- int y_offset, \
- const uint8_t *dst, \
- int dst_stride, \
- unsigned int *sse_ptr) { \
- unsigned int sse; \
- int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
- y_offset, dst, dst_stride, \
- h, &sse, NULL, NULL); \
- if (w > wf) { \
- unsigned int sse2; \
- int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
- x_offset, y_offset, \
- dst + 16, dst_stride, \
- h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
- x_offset, y_offset, \
- dst + 48, dst_stride, \
- h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-}
-
-#define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
-FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
-FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
-FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
-FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
-FN(4, 4, 4, 2, 2, opt2, (unsigned int))
-
-FNS(sse2, sse);
-FNS(ssse3, ssse3);
-
-#undef FNS
-#undef FN
-
-// The 2 unused parameters are place holders for PIC enabled build.
-#define DECL(w, opt) \
-int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
- ptrdiff_t src_stride, \
- int x_offset, int y_offset, \
- const uint8_t *dst, \
- ptrdiff_t dst_stride, \
- const uint8_t *sec, \
- ptrdiff_t sec_stride, \
- int height, unsigned int *sse, \
- void *unused0, void *unused)
-#define DECLS(opt1, opt2) \
-DECL(4, opt2); \
-DECL(8, opt1); \
-DECL(16, opt1)
-
-DECLS(sse2, sse);
-DECLS(ssse3, ssse3);
-#undef DECL
-#undef DECLS
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
-unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
- int src_stride, \
- int x_offset, \
- int y_offset, \
- const uint8_t *dst, \
- int dst_stride, \
- unsigned int *sseptr, \
- const uint8_t *sec) { \
- unsigned int sse; \
- int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
- y_offset, dst, dst_stride, \
- sec, w, h, &sse, NULL, \
- NULL); \
- if (w > wf) { \
- unsigned int sse2; \
- int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
- x_offset, y_offset, \
- dst + 16, dst_stride, \
- sec + 16, w, h, &sse2, \
- NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- sec + 32, w, h, &sse2, \
- NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
- x_offset, y_offset, \
- dst + 48, dst_stride, \
- sec + 48, w, h, &sse2, \
- NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- *sseptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-}
-
-#define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
-FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
-FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
-FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
-FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
-FN(4, 4, 4, 2, 2, opt2, (unsigned int))
-
-FNS(sse2, sse);
-FNS(ssse3, ssse3);
-
-#undef FNS
-#undef FN
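One detail worth spelling out in the FN()/FNS() machinery above is the cast parameter: sum * sum can overflow 32 bits for the larger blocks, which is why 32x16 and larger use (int64_t) while 16x16 and smaller keep (unsigned int). A quick, hedged bound, assuming 8-bit pixels so each per-pixel difference is at most 255:

#include <stdio.h>

int main(void) {
  const long long max_sum_16x16 = 16 * 16 * 255;  /* 65280: its square still fits in 32 unsigned bits */
  const long long max_sum_32x16 = 32 * 16 * 255;  /* 130560: its square needs 64-bit arithmetic */
  printf("16x16 worst-case sum^2: %lld\n", max_sum_16x16 * max_sum_16x16);
  printf("32x16 worst-case sum^2: %lld\n", max_sum_32x16 * max_sum_32x16);
  return 0;
}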
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -131,7 +131,6 @@
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
# common (msa)
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_macros_msa.h
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_horiz_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_vert_msa.c
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -58,7 +58,6 @@
VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.h
VP9_CX_SRCS-yes += encoder/vp9_tokenize.h
VP9_CX_SRCS-yes += encoder/vp9_treewriter.h
-VP9_CX_SRCS-yes += encoder/vp9_variance.h
VP9_CX_SRCS-yes += encoder/vp9_mcomp.c
VP9_CX_SRCS-yes += encoder/vp9_encoder.c
VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
@@ -84,7 +83,6 @@
VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
-VP9_CX_SRCS-yes += encoder/vp9_variance.c
VP9_CX_SRCS-yes += encoder/vp9_aq_variance.c
VP9_CX_SRCS-yes += encoder/vp9_aq_variance.h
VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.c
@@ -103,7 +101,6 @@
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@@ -114,13 +111,7 @@
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
endif
-endif
ifeq ($(ARCH_X86_64),yes)
ifeq ($(CONFIG_USE_X86INC),yes)
@@ -143,7 +134,6 @@
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2_impl.h
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
@@ -150,7 +140,6 @@
endif
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
@@ -160,6 +149,5 @@
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
-VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_variance_msa.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
--- /dev/null
+++ b/vpx_dsp/arm/bilinear_filter_media.asm
@@ -1,0 +1,237 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vpx_filter_block2d_bil_first_pass_media|
+ EXPORT |vpx_filter_block2d_bil_second_pass_media|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;-------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 unsigned short *dst_ptr,
+; r2 unsigned int src_pitch,
+; r3 unsigned int height,
+; stack unsigned int width,
+; stack const short *vpx_filter
+;-------------------------------------
+; The output is transposed and stored in the output array to make it easy for second pass filtering.
+|vpx_filter_block2d_bil_first_pass_media| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vpx_filter address
+ ldr r4, [sp, #36] ; width
+
+ mov r12, r3 ; outer-loop counter
+
+ add r7, r2, r4 ; preload next row
+ pld [r0, r7]
+
+ sub r2, r2, r4 ; src increment for height loop
+
+ ldr r5, [r11] ; load up filter coefficients
+
+ mov r3, r3, lsl #1 ; height*2
+ add r3, r3, #2 ; plus 2 so the transposed output pitch is (height+1) elements
+
+ mov r11, r1 ; save dst_ptr for each row
+
+ cmp r5, #128 ; if filter coef = 128, then skip the filter
+ beq bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+ ldrb r6, [r0] ; load source data
+ ldrb r7, [r0, #1]
+ ldrb r8, [r0, #2]
+ mov lr, r4, lsr #2 ; 4-in-parallel loop counter
+
+|bil_width_loop_1st_v6|
+ ldrb r9, [r0, #3]
+ ldrb r10, [r0, #4]
+
+ pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
+ pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
+
+ smuad r6, r6, r5 ; apply the filter
+ pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
+ smuad r7, r7, r5
+ pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
+
+ smuad r8, r8, r5
+ smuad r9, r9, r5
+
+ add r0, r0, #4
+ subs lr, lr, #1
+
+ add r6, r6, #0x40 ; round_shift_and_clamp
+ add r7, r7, #0x40
+ usat r6, #16, r6, asr #7
+ usat r7, #16, r7, asr #7
+
+ strh r6, [r1], r3 ; result is transposed and stored
+
+ add r8, r8, #0x40 ; round_shift_and_clamp
+ strh r7, [r1], r3
+ add r9, r9, #0x40
+ usat r8, #16, r8, asr #7
+ usat r9, #16, r9, asr #7
+
+ strh r8, [r1], r3 ; result is transposed and stored
+
+ ldrneb r6, [r0] ; load source data
+ strh r9, [r1], r3
+
+ ldrneb r7, [r0, #1]
+ ldrneb r8, [r0, #2]
+
+ bne bil_width_loop_1st_v6
+
+ add r0, r0, r2 ; move to next input row
+ subs r12, r12, #1
+
+ add r9, r2, r4, lsl #1 ; adding back block width
+ pld [r0, r9] ; preload next row
+
+ add r11, r11, #2 ; move over to next column
+ mov r1, r11
+
+ bne bil_height_loop_1st_v6
+
+ ldmia sp!, {r4 - r11, pc}
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+ mov lr, r4, lsr #2 ; loop counter
+
+|bil_width_loop_null_1st|
+ ldrb r6, [r0] ; load data
+ ldrb r7, [r0, #1]
+ ldrb r8, [r0, #2]
+ ldrb r9, [r0, #3]
+
+ strh r6, [r1], r3 ; store it to immediate buffer
+ add r0, r0, #4
+ strh r7, [r1], r3
+ subs lr, lr, #1
+ strh r8, [r1], r3
+ strh r9, [r1], r3
+
+ bne bil_width_loop_null_1st
+
+ subs r12, r12, #1
+ add r0, r0, r2 ; move to next input line
+ add r11, r11, #2 ; move over to next column
+ mov r1, r11
+
+ bne bil_height_loop_null_1st
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP ; |vpx_filter_block2d_bil_first_pass_media|
+
+
+;---------------------------------
+; r0 unsigned short *src_ptr,
+; r1 unsigned char *dst_ptr,
+; r2 int dst_pitch,
+; r3 unsigned int height,
+; stack unsigned int width,
+; stack const short *vpx_filter
+;---------------------------------
+|vpx_filter_block2d_bil_second_pass_media| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vpx_filter address
+ ldr r4, [sp, #36] ; width
+
+ ldr r5, [r11] ; load up filter coefficients
+ mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
+ mov r11, r1
+
+ cmp r5, #128 ; if filter coef = 128, then skip the filter
+ beq bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+ ldr r6, [r0] ; load the data
+ ldr r8, [r0, #4]
+ ldrh r10, [r0, #8]
+ mov lr, r3, lsr #2 ; loop counter
+
+|bil_width_loop_2nd|
+ pkhtb r7, r6, r8 ; src[1] | src[2]
+ pkhtb r9, r8, r10 ; src[3] | src[4]
+
+ smuad r6, r6, r5 ; apply filter
+ smuad r8, r8, r5 ; apply filter
+
+ subs lr, lr, #1
+
+ smuadx r7, r7, r5 ; apply filter
+ smuadx r9, r9, r5 ; apply filter
+
+ add r0, r0, #8
+
+ add r6, r6, #0x40 ; round_shift_and_clamp
+ add r7, r7, #0x40
+ usat r6, #8, r6, asr #7
+ usat r7, #8, r7, asr #7
+ strb r6, [r1], r2 ; the result is transposed back and stored
+
+ add r8, r8, #0x40 ; round_shift_and_clamp
+ strb r7, [r1], r2
+ add r9, r9, #0x40
+ usat r8, #8, r8, asr #7
+ usat r9, #8, r9, asr #7
+ strb r8, [r1], r2 ; the result is transposed back and stored
+
+ ldrne r6, [r0] ; load data
+ strb r9, [r1], r2
+ ldrne r8, [r0, #4]
+ ldrneh r10, [r0, #8]
+
+ bne bil_width_loop_2nd
+
+ subs r12, r12, #1
+ add r0, r0, #4 ; update src for next row
+ add r11, r11, #1
+ mov r1, r11
+
+ bne bil_height_loop_2nd
+ ldmia sp!, {r4 - r11, pc}
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+ mov lr, r3, lsr #2
+
+|bil_width_loop_null_2nd|
+ ldr r6, [r0], #4 ; load data
+ subs lr, lr, #1
+ ldr r8, [r0], #4
+
+ strb r6, [r1], r2 ; store data
+ mov r7, r6, lsr #16
+ strb r7, [r1], r2
+ mov r9, r8, lsr #16
+ strb r8, [r1], r2
+ strb r9, [r1], r2
+
+ bne bil_width_loop_null_2nd
+
+ subs r12, r12, #1
+ add r0, r0, #4
+ add r11, r11, #1
+ mov r1, r11
+
+ bne bil_height_loop_null_2nd
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vpx_filter_block2d_bil_second_pass_media|
+
+ END
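As a reading aid, here is a hedged scalar model of what the first-pass routine above computes (the function name and comments are illustrative, not part of the library): a two-tap horizontal filter whose coefficients sum to 128, rounded by the add-0x40 / shift-by-7 step, with results stored transposed so the second pass can walk columns as rows. The "height*2 + 2" byte pitch set up in the assembly corresponds to a transposed pitch of (height + 1) 16-bit elements.

#include <stdint.h>

static void bil_first_pass_model(const uint8_t *src, uint16_t *dst,
                                 unsigned int src_pitch, unsigned int height,
                                 unsigned int width, const int16_t *filter) {
  unsigned int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      /* two-tap bilinear blend; filter[0] + filter[1] == 128 */
      const int sum = src[c] * filter[0] + src[c + 1] * filter[1];
      /* transposed store: element pitch of (height + 1) per column */
      dst[c * (height + 1) + r] = (uint16_t)((sum + 64) >> 7);
    }
    src += src_pitch;
  }
}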
--- /dev/null
+++ b/vpx_dsp/arm/subpel_variance_media.c
@@ -1,0 +1,105 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_MEDIA
+static const int16_t bilinear_filters_media[8][2] = {
+ { 128, 0 },
+ { 112, 16 },
+ { 96, 32 },
+ { 80, 48 },
+ { 64, 64 },
+ { 48, 80 },
+ { 32, 96 },
+ { 16, 112 }
+};
+
+extern void vpx_filter_block2d_bil_first_pass_media(const uint8_t *src_ptr,
+ uint16_t *dst_ptr,
+ uint32_t src_pitch,
+ uint32_t height,
+ uint32_t width,
+ const int16_t *filter);
+
+extern void vpx_filter_block2d_bil_second_pass_media(const uint16_t *src_ptr,
+ uint8_t *dst_ptr,
+ int32_t src_pitch,
+ uint32_t height,
+ uint32_t width,
+ const int16_t *filter);
+
+
+unsigned int vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset, int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ uint16_t first_pass[10*8];
+ uint8_t second_pass[8*8];
+ const int16_t *HFilter, *VFilter;
+
+ HFilter = bilinear_filters_media[xoffset];
+ VFilter = bilinear_filters_media[yoffset];
+
+ vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
+ src_pixels_per_line,
+ 9, 8, HFilter);
+ vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
+ 8, 8, 8, VFilter);
+
+ return vpx_variance8x8_media(second_pass, 8, dst_ptr,
+ dst_pixels_per_line, sse);
+}
+
+unsigned int vpx_sub_pixel_variance16x16_media(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ uint16_t first_pass[36*16];
+ uint8_t second_pass[20*16];
+ const int16_t *HFilter, *VFilter;
+ unsigned int var;
+
+ if (xoffset == 4 && yoffset == 0) {
+ var = vpx_variance_halfpixvar16x16_h_media(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ sse);
+ } else if (xoffset == 0 && yoffset == 4) {
+ var = vpx_variance_halfpixvar16x16_v_media(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ sse);
+ } else if (xoffset == 4 && yoffset == 4) {
+ var = vpx_variance_halfpixvar16x16_hv_media(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ sse);
+ } else {
+ HFilter = bilinear_filters_media[xoffset];
+ VFilter = bilinear_filters_media[yoffset];
+
+ vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
+ src_pixels_per_line,
+ 17, 16, HFilter);
+ vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
+ 16, 16, 16, VFilter);
+
+ var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
+ dst_pixels_per_line, sse);
+ }
+ return var;
+}
+#endif // HAVE_MEDIA
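Note why the first-pass calls above run over one extra row (9 rows for the 8x8 block, 17 for 16x16): producing H vertically filtered rows in the second pass requires H + 1 intermediate rows, because each output pixel blends a row with the row below it. In short (illustrative names, not library identifiers):

    intermediate_rows = output_height + 1    /* 8x8 -> 9, 16x16 -> 17 */
    intermediate_cols = output_width         /* the horizontal pass keeps the block width */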
--- /dev/null
+++ b/vpx_dsp/arm/subpel_variance_neon.c
@@ -1,0 +1,152 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#include "vpx_dsp/variance.h"
+
+static const uint8_t bilinear_filters[8][2] = {
+ { 128, 0, },
+ { 112, 16, },
+ { 96, 32, },
+ { 80, 48, },
+ { 64, 64, },
+ { 48, 80, },
+ { 32, 96, },
+ { 16, 112, },
+};
+
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ const uint8x8_t f0 = vmov_n_u8(filter[0]);
+ const uint8x8_t f1 = vmov_n_u8(filter[1]);
+ unsigned int i;
+ for (i = 0; i < output_height; ++i) {
+ const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
+ const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
+ const uint16x8_t a = vmull_u8(src_0, f0);
+ const uint16x8_t b = vmlal_u8(a, src_1, f1);
+ const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
+ vst1_u8(&output_ptr[0], out);
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ const uint8x8_t f0 = vmov_n_u8(filter[0]);
+ const uint8x8_t f1 = vmov_n_u8(filter[1]);
+ unsigned int i, j;
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 16) {
+ const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
+ const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
+ const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
+ const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
+ const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
+ const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
+ const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
+ const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
+ vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+}
+
+unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src,
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
+
+ var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
+ 9, 8,
+ bilinear_filters[xoffset]);
+ var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
+ 8, bilinear_filters[yoffset]);
+ return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
+}
+
+unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+ 17, 16,
+ bilinear_filters[xoffset]);
+ var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
+ 16, bilinear_filters[yoffset]);
+ return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
+}
+
+unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+ 33, 32,
+ bilinear_filters[xoffset]);
+ var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
+ 32, bilinear_filters[yoffset]);
+ return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+}
+
+unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+ 65, 64,
+ bilinear_filters[xoffset]);
+ var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
+ 64, bilinear_filters[yoffset]);
+ return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
+}
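For reference, a hedged scalar equivalent of the two NEON helpers above (the function name is illustrative): each output byte is a two-tap bilinear blend of a pixel and the pixel pixel_step bytes away, where vrshrn_n_u16(..., FILTER_BITS) corresponds to a rounding shift by 7, matching filter taps that sum to 128.

#include <stdint.h>

static void var_filter_block2d_bil_model(const uint8_t *src, uint8_t *dst,
                                         unsigned int src_stride,
                                         int pixel_step,
                                         unsigned int out_height,
                                         unsigned int out_width,
                                         const uint8_t *filter) {
  unsigned int i, j;
  for (i = 0; i < out_height; ++i) {
    for (j = 0; j < out_width; ++j) {
      const int sum = src[j] * filter[0] + src[j + pixel_step] * filter[1];
      dst[j] = (uint8_t)((sum + 64) >> 7);  /* rounding shift by FILTER_BITS (7) */
    }
    src += src_stride;
    dst += out_width;
  }
}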
--- /dev/null
+++ b/vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm
@@ -1,0 +1,182 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vpx_variance_halfpixvar16x16_h_media|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vpx_variance_halfpixvar16x16_h_media| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #4] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #8] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #12] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
+
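The mvn/uhsub8/eor sequence used throughout this routine computes a rounded byte-wise average, (a + b + 1) >> 1, without widening to 16 bits. A hedged scalar check of the identity it relies on (UHSUB8 halves the 9-bit signed difference; the final EOR with 0x80808080 adds 128 per byte; the model assumes arithmetic right shift of negative values, as on the ARM target):

#include <assert.h>

int main(void) {
  int a, b;
  for (a = 0; a < 256; ++a) {
    for (b = 0; b < 256; ++b) {
      const int diff = a - (255 - b);         /* MVN, then subtract            */
      const int halved = diff >> 1;           /* arithmetic halving (UHSUB8)   */
      const int avg = (halved + 128) & 0xff;  /* EOR with 0x80 in each byte    */
      assert(avg == ((a + b + 1) >> 1));
    }
  }
  return 0;
}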
--- /dev/null
+++ b/vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm
@@ -1,0 +1,222 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vpx_variance_halfpixvar16x16_hv_media|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vpx_variance_halfpixvar16x16_hv_media| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ add r9, r0, r1 ; pointer to pixels on the next row
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load source pixels a, row N
+ ldr r6, [r0, #1] ; load source pixels b, row N
+ ldr r5, [r9, #0] ; load source pixels c, row N+1
+ ldr r7, [r9, #1] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #0] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load source pixels a, row N
+ ldr r6, [r0, #5] ; load source pixels b, row N
+ ldr r5, [r9, #4] ; load source pixels c, row N+1
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ ldr r7, [r9, #5] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #4] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load source pixels a, row N
+ ldr r6, [r0, #9] ; load source pixels b, row N
+ ldr r5, [r9, #8] ; load source pixels c, row N+1
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ ldr r7, [r9, #9] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #8] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load source pixels a, row N
+ ldr r6, [r0, #13] ; load source pixels b, row N
+ ldr r5, [r9, #12] ; load source pixels c, row N+1
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+ ldr r7, [r9, #13] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #12] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ subs r12, r12, #1
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
--- /dev/null
+++ b/vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm
@@ -1,0 +1,184 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vpx_variance_halfpixvar16x16_v_media|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vpx_variance_halfpixvar16x16_v_media| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ add r9, r0, r1 ; set src pointer to next row
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r6, [r9, #0] ; load 4 src pixels from next row
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r6, [r9, #4] ; load 4 src pixels from next row
+ ldr r5, [r2, #4] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r6, [r9, #8] ; load 4 src pixels from next row
+ ldr r5, [r2, #8] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r6, [r9, #12] ; load 4 src pixels from next row
+ ldr r5, [r2, #12] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
+
--- a/vpx_dsp/mips/macros_msa.h
+++ b/vpx_dsp/mips/macros_msa.h
@@ -24,10 +24,34 @@
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
+#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
+
+#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
+
#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
+#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
+
#if (__mips_isa_rev >= 6)
+#define LH(psrc) ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint16_t val_m; \
+ \
+ __asm__ __volatile__ ( \
+ "lh %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [psrc_m] "m" (*psrc_m) \
+ ); \
+ \
+ val_m; \
+})
+
#define LW(psrc) ({ \
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
uint32_t val_m; \
@@ -73,6 +97,18 @@
})
#endif // (__mips == 64)
+#define SH(val, pdst) { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint16_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "sh %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m" (*pdst_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
#define SW(val, pdst) { \
uint8_t *pdst_m = (uint8_t *)(pdst); \
const uint32_t val_m = (val); \
@@ -97,6 +133,20 @@
); \
}
#else // !(__mips_isa_rev >= 6)
+#define LH(psrc) ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint16_t val_m; \
+ \
+ __asm__ __volatile__ ( \
+ "ulh %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [psrc_m] "m" (*psrc_m) \
+ ); \
+ \
+ val_m; \
+})
+
#define LW(psrc) ({ \
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
uint32_t val_m; \
@@ -111,18 +161,6 @@
val_m; \
})
-#define SW(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint32_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "usw %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-
#if (__mips == 64)
#define LD(psrc) ({ \
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
@@ -154,6 +192,30 @@
})
#endif // (__mips == 64)
+#define SH(val, pdst) { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint16_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "ush %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m" (*pdst_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
+#define SW(val, pdst) { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "usw %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m" (*pdst_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
#define SD(val, pdst) { \
uint8_t *pdst_m1 = (uint8_t *)(pdst); \
uint32_t val0_m, val1_m; \
@@ -196,6 +258,34 @@
LD2((psrc) + 2 * stride, stride, out2, out3); \
}
+/* Description : Store 4 words with stride
+ Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ Details : Store word from 'in0' to (pdst)
+ Store word from 'in1' to (pdst + stride)
+ Store word from 'in2' to (pdst + 2 * stride)
+ Store word from 'in3' to (pdst + 3 * stride)
+*/
+#define SW4(in0, in1, in2, in3, pdst, stride) { \
+ SW(in0, (pdst)) \
+ SW(in1, (pdst) + stride); \
+ SW(in2, (pdst) + 2 * stride); \
+ SW(in3, (pdst) + 3 * stride); \
+}
+
+/* Description : Store 4 double words with stride
+ Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ Details : Store double word from 'in0' to (pdst)
+ Store double word from 'in1' to (pdst + stride)
+ Store double word from 'in2' to (pdst + 2 * stride)
+ Store double word from 'in3' to (pdst + 3 * stride)
+*/
+#define SD4(in0, in1, in2, in3, pdst, stride) { \
+ SD(in0, (pdst)) \
+ SD(in1, (pdst) + stride); \
+ SD(in2, (pdst) + 2 * stride); \
+ SD(in3, (pdst) + 3 * stride); \
+}
+
/* Description : Load vectors with 16 byte elements with stride
Arguments : Inputs - psrc, stride
Outputs - out0, out1
@@ -228,7 +318,15 @@
out4 = LD_B(RTYPE, (psrc) + 4 * stride); \
}
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
+#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
+#define LD_B7(RTYPE, psrc, stride, \
+ out0, out1, out2, out3, out4, out5, out6) { \
+ LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
+ LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
+}
+#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
+
#define LD_B8(RTYPE, psrc, stride, \
out0, out1, out2, out3, out4, out5, out6, out7) { \
LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
@@ -247,6 +345,7 @@
out0 = LD_H(RTYPE, (psrc)); \
out1 = LD_H(RTYPE, (psrc) + (stride)); \
}
+#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
LD_H2(RTYPE, (psrc), stride, out0, out1); \
@@ -254,6 +353,229 @@
}
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
+#define LD_H8(RTYPE, psrc, stride, \
+ out0, out1, out2, out3, out4, out5, out6, out7) { \
+ LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
+ LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
+}
+#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
+
+#define LD_H16(RTYPE, psrc, stride, \
+ out0, out1, out2, out3, out4, out5, out6, out7, \
+ out8, out9, out10, out11, out12, out13, out14, out15) { \
+ LD_H8(RTYPE, (psrc), stride, \
+ out0, out1, out2, out3, out4, out5, out6, out7); \
+ LD_H8(RTYPE, (psrc) + 8 * stride, stride, \
+ out8, out9, out10, out11, out12, out13, out14, out15); \
+}
+#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
+
+/* Description : Load 4x4 block of signed halfword elements from 1D source
+ data into 4 vectors (Each vector with 4 signed halfwords)
+ Arguments : Input - psrc
+ Outputs - out0, out1, out2, out3
+*/
+#define LD4x4_SH(psrc, out0, out1, out2, out3) { \
+ out0 = LD_SH(psrc); \
+ out2 = LD_SH(psrc + 8); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
+}
+
+/* Description : Load 2 vectors of signed word elements with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Return Type - signed word
+*/
+#define LD_SW2(psrc, stride, out0, out1) { \
+ out0 = LD_SW((psrc)); \
+ out1 = LD_SW((psrc) + stride); \
+}
+
+/* Description : Store vectors of 16 byte elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 16 byte elements from 'in0' to (pdst)
+ Store 16 byte elements from 'in1' to (pdst + stride)
+*/
+#define ST_B2(RTYPE, in0, in1, pdst, stride) { \
+ ST_B(RTYPE, in0, (pdst)); \
+ ST_B(RTYPE, in1, (pdst) + stride); \
+}
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
+ ST_B2(RTYPE, in0, in1, (pdst), stride); \
+ ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+}
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+
+#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ pdst, stride) { \
+ ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
+ ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
+}
+#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
+
+/* Description : Store vectors of 8 halfword elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 8 halfword elements from 'in0' to (pdst)
+ Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride) { \
+ ST_H(RTYPE, in0, (pdst)); \
+ ST_H(RTYPE, in1, (pdst) + stride); \
+}
+#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
+
+#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
+ ST_H2(RTYPE, in0, in1, (pdst), stride); \
+ ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+}
+#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
+
+#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \
+ ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
+ ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
+}
+#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
+
+/* Description : Store vectors of word elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 4 word elements from 'in0' to (pdst)
+ Store 4 word elements from 'in1' to (pdst + stride)
+*/
+#define ST_SW2(in0, in1, pdst, stride) { \
+ ST_SW(in0, (pdst)); \
+ ST_SW(in1, (pdst) + stride); \
+}
+
+/* Description : Store 2x4 byte block to destination memory from input vector
+ Arguments : Inputs - in, stidx, pdst, stride
+ Details : Index 'stidx' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst)
+ Index 'stidx+1' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst + stride)
+ Index 'stidx+2' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst + 2 * stride)
+ Index 'stidx+3' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst + 3 * stride)
+*/
+#define ST2x4_UB(in, stidx, pdst, stride) { \
+ uint16_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \
+ out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \
+ out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \
+ out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \
+ \
+ SH(out0_m, pblk_2x4_m); \
+ SH(out1_m, pblk_2x4_m + stride); \
+ SH(out2_m, pblk_2x4_m + 2 * stride); \
+ SH(out3_m, pblk_2x4_m + 3 * stride); \
+}
+
+/* Description : Store 4x2 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst, stride
+ Details : Index 0 word element from 'in' vector is copied to the GP
+ register and stored to (pdst)
+ Index 1 word element from 'in' vector is copied to the GP
+ register and stored to (pdst + stride)
+*/
+#define ST4x2_UB(in, pdst, stride) { \
+ uint32_t out0_m, out1_m; \
+ uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_w((v4i32)in, 0); \
+ out1_m = __msa_copy_u_w((v4i32)in, 1); \
+ \
+ SW(out0_m, pblk_4x2_m); \
+ SW(out1_m, pblk_4x2_m + stride); \
+}
+
+/* Description : Store 4x4 byte block to destination memory from input vector
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : 'Idx0' word element from input vector 'in0' is copied to the
+ GP register and stored to (pdst)
+ 'Idx1' word element from input vector 'in0' is copied to the
+ GP register and stored to (pdst + stride)
+ 'Idx2' word element from input vector 'in0' is copied to the
+ GP register and stored to (pdst + 2 * stride)
+ 'Idx3' word element from input vector 'in0' is copied to the
+ GP register and stored to (pdst + 3 * stride)
+*/
+#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \
+ uint32_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_w((v4i32)in0, idx0); \
+ out1_m = __msa_copy_u_w((v4i32)in0, idx1); \
+ out2_m = __msa_copy_u_w((v4i32)in1, idx2); \
+ out3_m = __msa_copy_u_w((v4i32)in1, idx3); \
+ \
+ SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
+}
+#define ST4x8_UB(in0, in1, pdst, stride) { \
+ uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
+ \
+ ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
+ ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
+}
+
+/* Description : Store 8x1 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst
+ Details : Index 0 double word element from 'in' vector is copied to the
+ GP register and stored to (pdst)
+*/
+#define ST8x1_UB(in, pdst) { \
+ uint64_t out0_m; \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in, 0); \
+ SD(out0_m, pdst); \
+}
+
+/* Description : Store 8x2 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst, stride
+ Details : Index 0 double word element from 'in' vector is copied to the
+ GP register and stored to (pdst)
+ Index 1 double word element from 'in' vector is copied to the
+ GP register and stored to (pdst + stride)
+*/
+#define ST8x2_UB(in, pdst, stride) { \
+ uint64_t out0_m, out1_m; \
+ uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in, 0); \
+ out1_m = __msa_copy_u_d((v2i64)in, 1); \
+ \
+ SD(out0_m, pblk_8x2_m); \
+ SD(out1_m, pblk_8x2_m + stride); \
+}
+
+/* Description : Store 8x4 byte block to destination memory from input
+ vectors
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Index 0 double word element from 'in0' vector is copied to the
+ GP register and stored to (pdst)
+ Index 1 double word element from 'in0' vector is copied to the
+ GP register and stored to (pdst + stride)
+ Index 0 double word element from 'in1' vector is copied to the
+ GP register and stored to (pdst + 2 * stride)
+ Index 1 double word element from 'in1' vector is copied to the
+ GP register and stored to (pdst + 3 * stride)
+*/
+#define ST8x4_UB(in0, in1, pdst, stride) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in0, 0); \
+ out1_m = __msa_copy_u_d((v2i64)in0, 1); \
+ out2_m = __msa_copy_u_d((v2i64)in1, 0); \
+ out3_m = __msa_copy_u_d((v2i64)in1, 1); \
+ \
+ SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
+}
+
/* Description : average with rounding (in0 + in1 + 1) / 2.
Arguments : Inputs - in0, in1, in2, in3,
Outputs - out0, out1
@@ -275,6 +597,27 @@
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
+/* Description : Immediate number of elements to slide with zero
+ Arguments : Inputs - in0, in1, slide_val
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Byte elements from 'zero_m' vector are slid into 'in0' by
+ the value specified in 'slide_val'
+*/
+#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \
+ v16i8 zero_m = { 0 }; \
+ out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
+ out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
+}
+#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
+
+#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \
+ out0, out1, out2, out3, slide_val) { \
+ SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
+ SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
+}
+#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
+
/* Description : Immediate number of elements to slide
Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
Outputs - out0, out1
@@ -287,7 +630,149 @@
out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
+#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
+#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \
+ out0, out1, out2, slide_val) { \
+ SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
+ out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \
+}
+#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
+#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
+
+/* Description : Shuffle byte vector elements as per mask vector
+ Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Byte elements from 'in0' & 'in1' are copied selectively to
+ 'out0' as per control vector 'mask0'
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \
+ out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
+}
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
+#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
+
+#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \
+ out0, out1, out2, out3) { \
+ VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
+ VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
+}
+#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
+#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of byte vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Unsigned byte elements from 'mult0' are multiplied with
+ unsigned byte elements from 'cnst0' producing a result
+ twice the size of input i.e. unsigned halfword.
+ The multiplication result of adjacent odd-even elements
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
+ out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \
+ out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \
+}
+#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
+
+#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \
+ cnst0, cnst1, cnst2, cnst3, \
+ out0, out1, out2, out3) { \
+ DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+}
+#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
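For reference, the per-lane arithmetic behind DOTP_UB2/DOTP_UB4 can be written as a scalar loop. This sketch is not part of the patch; it lets the halfword lanes wrap, which the filter code here never triggers because the taps sum to at most 128.

#include <stdint.h>

/* Scalar model of dotp_u_h: each halfword lane is the sum of the two
 * adjacent even/odd byte products. */
static void dotp_u_h_model(const uint8_t mult[16], const uint8_t cnst[16],
                           uint16_t out[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    out[i] = (uint16_t)(mult[2 * i] * cnst[2 * i] +
                        mult[2 * i + 1] * cnst[2 * i + 1]);
  }
}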
+
+/* Description : Dot product of byte vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed byte elements from 'mult0' are multiplied with
+ signed byte elements from 'cnst0' producing a result
+ twice the size of input i.e. signed halfword.
+ The multiplication result of adjacent odd-even elements
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
+ out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \
+}
+#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
+
+#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \
+ cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \
+ DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+}
+#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of halfword vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed halfword elements from 'mult0' are multiplied with
+ signed halfword elements from 'cnst0' producing a result
+ twice the size of input i.e. signed word.
+ The multiplication result of adjacent odd-even elements
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
+ out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \
+}
+#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
+
+#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \
+ cnst0, cnst1, cnst2, cnst3, \
+ out0, out1, out2, out3) { \
+ DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+}
+#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
+
+/* Description : Dot product of word vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed word elements from 'mult0' are multiplied with
+ signed word elements from 'cnst0' producing a result
+ twice the size of input i.e. signed double word.
+ The multiplication result of adjacent odd-even elements
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
+ out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \
+}
+#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
+
+/* Description : Dot product & addition of byte vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed byte elements from 'mult0' are multiplied with
+ signed byte elements from 'cnst0' producing a result
+ twice the size of input i.e. signed halfword.
+ The multiplication result of adjacent odd-even elements
+ are added to the 'out0' vector
+*/
+#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
+ out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \
+ out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \
+}
+#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
+
+#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \
+ cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \
+ DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+}
+#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
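The DPADD_* forms differ from DOTP_* only in accumulating onto the existing output lanes, which is how the multi-tap filters build up their sums. A scalar sketch of DPADD_SB2 per lane, illustrative only:

#include <stdint.h>

/* Scalar model of dpadd_s_h: the signed-byte dot product of each adjacent
 * even/odd pair is added onto the existing halfword accumulator lane. */
static void dpadd_s_h_model(const int8_t mult[16], const int8_t cnst[16],
                            int16_t acc[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    acc[i] = (int16_t)(acc[i] + mult[2 * i] * cnst[2 * i] +
                       mult[2 * i + 1] * cnst[2 * i + 1]);
  }
}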
+
/* Description : Dot product & addition of halfword vector elements
Arguments : Inputs - mult0, mult1, cnst0, cnst1
Outputs - out0, out1
@@ -309,7 +794,7 @@
Outputs - out0, out1
Return Type - as per RTYPE
Details : Each signed word element from 'mult0' is multiplied with itself
- producing an intermediate result twice the size of it
+ producing an intermediate result twice the size of input
i.e. signed double word
The multiplication result of adjacent odd-even elements
are added to the 'out0' vector
@@ -320,6 +805,49 @@
}
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
+/* Description : Element-wise minimum of the unsigned halfword elements of the
+ input vectors and 'min_vec', written back in place
+ Arguments : Inputs - in0, in1, min_vec
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Minimum of unsigned halfword element values from 'in0' and
+ 'min_vec' are written to output vector 'in0'
+*/
+#define MIN_UH2(RTYPE, in0, in1, min_vec) { \
+ in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
+ in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
+}
+#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
+
+#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) { \
+ MIN_UH2(RTYPE, in0, in1, min_vec); \
+ MIN_UH2(RTYPE, in2, in3, min_vec); \
+}
+#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Clips all signed halfword elements of input vector
+ between 0 & 255
+ Arguments : Input - in
+ Output - out_m
+ Return Type - signed halfword
+*/
+#define CLIP_SH_0_255(in) ({ \
+ v8i16 max_m = __msa_ldi_h(255); \
+ v8i16 out_m; \
+ \
+ out_m = __msa_maxi_s_h((v8i16)in, 0); \
+ out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
+ out_m; \
+})
+#define CLIP_SH2_0_255(in0, in1) { \
+ in0 = CLIP_SH_0_255(in0); \
+ in1 = CLIP_SH_0_255(in1); \
+}
+#define CLIP_SH4_0_255(in0, in1, in2, in3) { \
+ CLIP_SH2_0_255(in0, in1); \
+ CLIP_SH2_0_255(in2, in3); \
+}
+
/* Description : Horizontal addition of 4 signed word elements of input vector
Arguments : Input - in (signed word vector)
Output - sum_m (i32 sum)
@@ -358,6 +886,26 @@
sum_m; \
})
+/* Description : Horizontal addition of unsigned byte vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each unsigned odd byte element from 'in0' is added to the
+ adjacent even unsigned byte element from 'in0' (pairwise) and
+ the halfword result is written to 'out0'
+*/
+#define HADD_UB2(RTYPE, in0, in1, out0, out1) { \
+ out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
+ out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
+}
+#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
+
+#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \
+ HADD_UB2(RTYPE, in0, in1, out0, out1); \
+ HADD_UB2(RTYPE, in2, in3, out2, out3); \
+}
+#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
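Because HADD_UB2 passes the same vector as both operands, each output lane is simply the sum of one adjacent byte pair; HSUB_UH2 below forms the pairwise difference in the same layout. A scalar sketch of the addition case, not part of the patch:

#include <stdint.h>

/* Scalar model of hadd_u_h(in, in): adjacent byte pairs are summed into
 * halfword lanes. */
static void hadd_ub_model(const uint8_t in[16], uint16_t out[8]) {
  int i;
  for (i = 0; i < 8; ++i)
    out[i] = (uint16_t)(in[2 * i] + in[2 * i + 1]);
}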
+
/* Description : Horizontal subtraction of unsigned byte vector elements
Arguments : Inputs - in0, in1
Outputs - out0, out1
@@ -393,6 +941,20 @@
sad_m; \
})
+/* Description : Horizontal subtraction of signed halfword vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each signed odd halfword element from 'in0' is subtracted from
+ the adjacent even signed halfword element from 'in0' (pairwise)
+ and the word result is written to 'out0'
+*/
+#define HSUB_UH2(RTYPE, in0, in1, out0, out1) { \
+ out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
+ out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
+}
+#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
+
/* Description : Set element n input vector to GPR value
Arguments : Inputs - in0, in1, in2, in3
Output - out
@@ -399,6 +961,12 @@
Return Type - as per RTYPE
Details : Set element 0 in vector 'out' to value specified in 'in0'
*/
+#define INSERT_W2(RTYPE, in0, in1, out) { \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
+}
+#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
+
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \
out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
@@ -415,6 +983,211 @@
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+/* Description : Interleave even byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even byte elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
+}
+#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
+#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave even halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even halfword elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
+ out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
+}
+#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
+#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
+#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave even word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
+ out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
+}
+#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even double word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even double word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
+ out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
+}
+#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave left half of byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of byte elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'.
+*/
+#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
+}
+#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
+#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
+#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
+#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
+
+#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
+#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
+
+/* Description : Interleave left half of halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of halfword elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
+}
+#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave left half of word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'.
+*/
+#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
+}
+#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
+#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave right half of byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of byte elements of 'in0' and 'in1' are interleaved
+ and written to out0.
+*/
+#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
+}
+#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
+#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
+#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
+#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
+
+#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
+#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
+#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
+#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
+
+#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15, \
+ out0, out1, out2, out3, out4, out5, out6, out7) { \
+ ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3); \
+ ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \
+ out4, out5, out6, out7); \
+}
+#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
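The interleave macros are the workhorses of the transposes and zero extensions below. A scalar sketch of the right-half byte interleave; the operand order shown is my reading, chosen so that interleaving with a zero vector in the first slot behaves as the zero-extension that UNPCK_UB_SH later relies on:

#include <stdint.h>

/* Scalar model of ilvr_b(a, b): the low 8 bytes of the two sources are
 * interleaved; the second source supplies the even output positions, so
 * interleaving with zero in 'a' zero-extends 'b' to halfwords. */
static void ilvr_b_model(const uint8_t a[16], const uint8_t b[16],
                         uint8_t out[16]) {
  int i;
  for (i = 0; i < 8; ++i) {
    out[2 * i] = b[i];
    out[2 * i + 1] = a[i];
  }
}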
+
+/* Description : Interleave right half of halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of halfword elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
+}
+#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
+
+#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
+
+#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
+}
+#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
+#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
+
+#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave right half of double word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of double word elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
+ out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
+}
+#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
+#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
+#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
+
+#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) { \
+ ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \
+}
+#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
+
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
+#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+
/* Description : Interleave both left and right half of input vectors
Arguments : Inputs - in0, in1
Outputs - out0, out1
@@ -427,13 +1200,138 @@
out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \
out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
}
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
+}
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the max
+ unsigned value of (sat_val + 1) bits
+ The element data width remains unchanged
+ Arguments : Inputs - in0, in1, sat_val
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each unsigned halfword element from 'in0' is saturated to the
+ value generated with (sat_val + 1) bit range.
+ The results are written in place
+*/
+#define SAT_UH2(RTYPE, in0, in1, sat_val) { \
+ in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
+ in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
+}
+#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
+
+#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) { \
+ SAT_UH2(RTYPE, in0, in1, sat_val); \
+ SAT_UH2(RTYPE, in2, in3, sat_val) \
+}
+#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the signed
+ range representable in (sat_val + 1) bits
+ The element data width remains unchanged
+ Arguments : Inputs - in0, in1, sat_val
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each signed halfword element from 'in0' is saturated to the
+ signed range of (sat_val + 1) bits
+ The results are written in place
+*/
+#define SAT_SH2(RTYPE, in0, in1, sat_val) { \
+ in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
+ in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
+}
+#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
+
+#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) { \
+ SAT_SH2(RTYPE, in0, in1, sat_val); \
+ SAT_SH2(RTYPE, in2, in3, sat_val); \
+}
+#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
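A scalar sketch of the two saturation flavours, illustrative only and assuming 0 <= sat_val <= 15:

#include <stdint.h>

/* SAT_UH clamps to the largest (sat_val + 1)-bit unsigned value;
 * SAT_SH clamps to the signed (sat_val + 1)-bit range. */
static uint16_t sat_u_h_model(uint16_t x, int sat_val) {
  const uint16_t max = (uint16_t)((1u << (sat_val + 1)) - 1);
  return x > max ? max : x;
}

static int16_t sat_s_h_model(int16_t x, int sat_val) {
  const int32_t max = (1 << sat_val) - 1;
  const int32_t min = -(1 << sat_val);
  return (int16_t)(x > max ? max : (x < min ? min : x));
}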
+
+/* Description : Indexed halfword element values are replicated to all
+ elements in output vector
+ Arguments : Inputs - in, idx0, idx1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : 'idx0' element value from 'in' vector is replicated to all
+ elements in 'out0' vector
+ Valid index range for halfword operation is 0-7
+*/
+#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) { \
+ out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \
+ out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \
+}
+#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
+
+#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
+ out0, out1, out2, out3) { \
+ SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
+ SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
+}
+#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
+#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even byte elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even byte elements of 'in0' are copied to the left half of
+ 'out0' & even byte elements of 'in1' are copied to the right
+ half of 'out0'.
+*/
+#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
+}
+#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
+#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
+#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
+
+#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
+#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
+#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even halfword elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even halfword elements of 'in0' are copied to the left half of
+ 'out0' & even halfword elements of 'in1' are copied to the
+ right half of 'out0'.
+*/
+#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
+}
+#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
+#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
+
+#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
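Following the description above, a scalar sketch of the even-element pack for bytes; the halfword and double word variants are analogous:

#include <stdint.h>

/* Scalar model of pckev_b(in0, in1): even-indexed bytes of 'in1' fill the
 * low (right) half of the result, even-indexed bytes of 'in0' the high
 * (left) half. */
static void pckev_b_model(const uint8_t in0[16], const uint8_t in1[16],
                          uint8_t out[16]) {
  int i;
  for (i = 0; i < 8; ++i) {
    out[i] = in1[2 * i];
    out[8 + i] = in0[2 * i];
  }
}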
+
/* Description : Pack even double word elements of vector pairs
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1
@@ -447,6 +1345,7 @@
out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
+#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3) { \
@@ -455,6 +1354,256 @@
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
+/* Description : Each byte element is logically xor'ed with immediate 128
+ Arguments : Inputs - in0, in1
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each unsigned byte element from input vector 'in0' is
+ logically xor'ed with 128 and the result is stored in-place.
+*/
+#define XORI_B2_128(RTYPE, in0, in1) { \
+ in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
+ in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
+}
+#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
+#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
+
+#define XORI_B3_128(RTYPE, in0, in1, in2) { \
+ XORI_B2_128(RTYPE, in0, in1); \
+ in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
+}
+#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
+
+#define XORI_B4_128(RTYPE, in0, in1, in2, in3) { \
+ XORI_B2_128(RTYPE, in0, in1); \
+ XORI_B2_128(RTYPE, in2, in3); \
+}
+#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
+#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
+
+#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) { \
+ XORI_B4_128(RTYPE, in0, in1, in2, in3); \
+ XORI_B3_128(RTYPE, in4, in5, in6); \
+}
+#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
+
+/* Description : Average of signed halfword elements -> (a + b) / 2
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3
+ Return Type - as per RTYPE
+ Details : Each signed halfword element from 'in0' is added to each
+ signed halfword element of 'in1' with full precision resulting
+ in one extra bit in the result. The result is then divided by
+ 2 and written to 'out0'
+*/
+#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \
+ out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \
+ out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \
+}
+#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
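Note the difference from the AVER_* macros at the top of this header: AVE_* performs the plain halving described above, while AVER_* adds a rounding bit first, i.e. (in0 + in1 + 1) / 2. A scalar comparison, illustrative only:

#include <stdint.h>

/* Plain halving (arithmetic shift by one, carry kept in the wide
 * intermediate) versus halving with rounding. */
static int16_t ave_s_h_model(int16_t a, int16_t b) {
  return (int16_t)(((int32_t)a + b) >> 1);
}

static uint8_t aver_u_b_model(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}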
+
+/* Description : Addition of signed halfword elements and signed saturation
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed halfword elements from 'in0' are added to signed
+ halfword elements of 'in1'. The result is then saturated to the
+ signed halfword range
+*/
+#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
+}
+#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
+
+#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Shift left all elements of vector (generic for all data types)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - in place operation
+ Return Type - as per input vector RTYPE
+ Details : Each element of vector 'in0' is left shifted by 'shift' and
+ the result is written in-place.
+*/
+#define SLLI_4V(in0, in1, in2, in3, shift) { \
+ in0 = in0 << shift; \
+ in1 = in1 << shift; \
+ in2 = in2 << shift; \
+ in3 = in3 << shift; \
+}
+
+/* Description : Arithmetic shift right all elements of vector
+ (generic for all data types)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - in place operation
+ Return Type - as per input vector RTYPE
+ Details : Each element of vector 'in0' is right shifted by 'shift' and
+ the result is written in-place. 'shift' is a GP variable.
+*/
+#define SRA_4V(in0, in1, in2, in3, shift) { \
+ in0 = in0 >> shift; \
+ in1 = in1 >> shift; \
+ in2 = in2 >> shift; \
+ in3 = in3 >> shift; \
+}
+
+/* Description : Shift right arithmetic rounded words
+ Arguments : Inputs - in0, in1, shift
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each element of vector 'in0' is shifted right arithmetically by
+ the number of bits in the corresponding element in the vector
+ 'shift'. The last discarded bit is added to shifted value for
+ rounding and the result is written in-place.
+ 'shift' is a vector.
+*/
+#define SRAR_W2(RTYPE, in0, in1, shift) { \
+ in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
+ in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
+}
+
+#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) { \
+ SRAR_W2(RTYPE, in0, in1, shift) \
+ SRAR_W2(RTYPE, in2, in3, shift) \
+}
+#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
+
+/* Description : Shift right arithmetic rounded (immediate)
+ Arguments : Inputs - in0, in1, shift
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each element of vector 'in0' is shifted right arithmetically by
+ the value in 'shift'. The last discarded bit is added to the
+ shifted value for rounding and the result is written in-place.
+ 'shift' is an immediate value.
+*/
+#define SRARI_H2(RTYPE, in0, in1, shift) { \
+ in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
+ in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
+}
+#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
+#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
+
+#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) { \
+ SRARI_H2(RTYPE, in0, in1, shift); \
+ SRARI_H2(RTYPE, in2, in3, shift); \
+}
+#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
+#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
+
+#define SRARI_W2(RTYPE, in0, in1, shift) { \
+ in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
+ in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
+}
+#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
+
+#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \
+ SRARI_W2(RTYPE, in0, in1, shift); \
+ SRARI_W2(RTYPE, in2, in3, shift); \
+}
+#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
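A scalar sketch of the rounded arithmetic right shift performed by SRARI_* (and by SRAR_* with a per-element vector shift count), assuming the usual arithmetic behaviour of >> on signed values:

#include <stdint.h>

/* Scalar model of srari_w: the last bit shifted out is added back before
 * shifting, i.e. round to nearest; a shift of 0 leaves the value as is. */
static int32_t srari_w_model(int32_t x, int shift) {
  if (shift == 0) return x;
  return (x + (1 << (shift - 1))) >> shift;
}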
+
+/* Description : Logical shift right all elements of vector (immediate)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - out0, out1, out2, out3
+ Return Type - as per RTYPE
+ Details : Each element of vector 'in0' is logically right shifted by
+ 'shift' and the result is written to 'out0'. 'shift' is an
+ immediate value.
+*/
+#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) { \
+ out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \
+ out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \
+ out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \
+ out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \
+}
+#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Multiplication of pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element from 'in0' is multiplied with elements from 'in1'
+ and the result is written to 'out0'
+*/
+#define MUL2(in0, in1, in2, in3, out0, out1) { \
+ out0 = in0 * in1; \
+ out1 = in2 * in3; \
+}
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ MUL2(in0, in1, in2, in3, out0, out1); \
+ MUL2(in4, in5, in6, in7, out2, out3); \
+}
+
+/* Description : Addition of 2 pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element in 'in0' is added to 'in1' and result is written
+ to 'out0'.
+*/
+#define ADD2(in0, in1, in2, in3, out0, out1) { \
+ out0 = in0 + in1; \
+ out1 = in2 + in3; \
+}
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ ADD2(in0, in1, in2, in3, out0, out1); \
+ ADD2(in4, in5, in6, in7, out2, out3); \
+}
+
+/* Description : Subtraction of 2 pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element in 'in1' is subtracted from 'in0' and result is
+ written to 'out0'.
+*/
+#define SUB2(in0, in1, in2, in3, out0, out1) { \
+ out0 = in0 - in1; \
+ out1 = in2 - in3; \
+}
+#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ out0 = in0 - in1; \
+ out1 = in2 - in3; \
+ out2 = in4 - in5; \
+ out3 = in6 - in7; \
+}
+
+/* Description : Sign extend halfword elements from right half of the vector
+ Arguments : Input - in (halfword vector)
+ Output - out (sign extended word vector)
+ Return Type - signed word
+ Details : Sign bit of halfword elements from input vector 'in' is
+ extracted and interleaved with the same vector 'in' to generate
+ 4 sign-extended word elements
+*/
+#define UNPCK_R_SH_SW(in, out) { \
+ v8i16 sign_m; \
+ \
+ sign_m = __msa_clti_s_h((v8i16)in, 0); \
+ out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
+}
+
+/* Description : Zero extend unsigned byte elements to halfword elements
+ Arguments : Input - in (unsigned byte vector)
+ Outputs - out0, out1 (unsigned halfword vectors)
+ Return Type - signed halfword
+ Details : Zero extended right half of vector is returned in 'out0'
+ Zero extended left half of vector is returned in 'out1'
+*/
+#define UNPCK_UB_SH(in, out0, out1) { \
+ v16i8 zero_m = { 0 }; \
+ \
+ ILVRL_B2_SH(zero_m, in, out0, out1); \
+}
+
/* Description : Sign extend halfword elements from input vector and return
the result in pair of vectors
Arguments : Input - in (halfword vector)
@@ -473,52 +1622,312 @@
ILVRL_H2_SW(tmp_m, in, out0, out1); \
}
-/* Description : Store 4 double words with stride
+/* Description : Butterfly of 4 input vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Details : Butterfly operation
+*/
+#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \
+ out0 = in0 + in3; \
+ out1 = in1 + in2; \
+ \
+ out2 = in1 - in2; \
+ out3 = in0 - in3; \
+}
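BUTTERFLY_4 is the classic transform building block: mirrored ends are summed, then differenced. Since the macro is type-generic, plain integers illustrate it; a worked example, not part of the patch:

/* With (in0..in3) = (1, 2, 3, 4):
 *   out0 = in0 + in3 = 5,  out1 = in1 + in2 = 5,
 *   out2 = in1 - in2 = -1, out3 = in0 - in3 = -3. */
static void butterfly4_example(void) {
  const int in0 = 1, in1 = 2, in2 = 3, in3 = 4;
  const int out0 = in0 + in3, out1 = in1 + in2;
  const int out2 = in1 - in2, out3 = in0 - in3;
  (void)out0; (void)out1; (void)out2; (void)out3;
}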
+
+/* Description : Butterfly of 8 input vectors
+ Arguments : Inputs - in0 ... in7
+ Outputs - out0 .. out7
+ Details : Butterfly operation
+*/
+#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7) { \
+ out0 = in0 + in7; \
+ out1 = in1 + in6; \
+ out2 = in2 + in5; \
+ out3 = in3 + in4; \
+ \
+ out4 = in3 - in4; \
+ out5 = in2 - in5; \
+ out6 = in1 - in6; \
+ out7 = in0 - in7; \
+}
+
+/* Description : Butterfly of 16 input vectors
+ Arguments : Inputs - in0 ... in15
+ Outputs - out0 .. out15
+ Details : Butterfly operation
+*/
+#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15, \
+ out0, out1, out2, out3, out4, out5, out6, out7, \
+ out8, out9, out10, out11, out12, out13, out14, out15) { \
+ out0 = in0 + in15; \
+ out1 = in1 + in14; \
+ out2 = in2 + in13; \
+ out3 = in3 + in12; \
+ out4 = in4 + in11; \
+ out5 = in5 + in10; \
+ out6 = in6 + in9; \
+ out7 = in7 + in8; \
+ \
+ out8 = in7 - in8; \
+ out9 = in6 - in9; \
+ out10 = in5 - in10; \
+ out11 = in4 - in11; \
+ out12 = in3 - in12; \
+ out13 = in2 - in13; \
+ out14 = in1 - in14; \
+ out15 = in0 - in15; \
+}
+
+/* Description : Transpose input 8x8 byte block
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7) { \
+ v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
+ tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
+ ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
+ ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
+ ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
+ SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
+ SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
+}
+#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
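The interleave/slide sequence above implements an 8x8 byte transpose. A scalar reference for the end result, assuming each input and output vector carries one 8-byte row in its low half:

#include <stdint.h>

/* Scalar reference: rows and columns of an 8x8 byte block are swapped. */
static void transpose8x8_model(const uint8_t in[8][8], uint8_t out[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      out[c][r] = in[r][c];
}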
+
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
+ in8, in9, in10, in11, in12, in13, in14, in15
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - unsigned byte
+*/
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15, \
+ out0, out1, out2, out3, out4, out5, out6, out7) { \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
+ ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
+ ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
+ ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
+ \
+ tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
+ tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
+ tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
+ tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
+ out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
+ tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
+ out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
+ tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
+ \
+ ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
+ out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
+ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
+ out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
+ out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
+ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
+ out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+}
+
+/* Description : Transpose 4x4 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - signed halfword
+*/
+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \
+ v8i16 s0_m, s1_m; \
+ \
+ ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
+ ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \
+}
+
+/* Description : Transpose 4x8 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - signed halfword
+*/
+#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7) { \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
+ v8i16 zero_m = { 0 }; \
+ \
+ ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
+ tmp0_n, tmp1_n, tmp2_n, tmp3_n); \
+ ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \
+ ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \
+ \
+ out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
+ out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
+ \
+ out4 = zero_m; \
+ out5 = zero_m; \
+ out6 = zero_m; \
+ out7 = zero_m; \
+}
+
+/* Description : Transpose 8x4 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - signed halfword
+*/
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \
+ ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \
+ ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
+ ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
+}
+
+/* Description : Transpose 8x8 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7) { \
+ v8i16 s0_m, s1_m; \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
+ ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
+ ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
+ ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
+ PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \
+ tmp3_m, tmp7_m, out0, out2, out4, out6); \
+ out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
+ out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
+ out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
+ out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
+}
+#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
+
+/* Description : Transpose 4x4 block with word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - signed word
+*/
+#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) { \
+ v4i32 s0_m, s1_m, s2_m, s3_m; \
+ \
+ ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
+ ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
+ \
+ out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \
+ out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \
+ out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \
+ out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \
+}
+
+/* Description : Add block 4x4
Arguments : Inputs - in0, in1, in2, in3, pdst, stride
- Details : Store double word from 'in0' to (pdst)
- Store double word from 'in1' to (pdst + stride)
- Store double word from 'in2' to (pdst + 2 * stride)
- Store double word from 'in3' to (pdst + 3 * stride)
+ Details : Least significant 4 bytes from each input vector are added to
+ the destination bytes, clipped between 0-255 and stored.
*/
-#define SD4(in0, in1, in2, in3, pdst, stride) { \
- SD(in0, (pdst)) \
- SD(in1, (pdst) + stride); \
- SD(in2, (pdst) + 2 * stride); \
- SD(in3, (pdst) + 3 * stride); \
+#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \
+ uint32_t src0_m, src1_m, src2_m, src3_m; \
+ v8i16 inp0_m, inp1_m, res0_m, res1_m; \
+ v16i8 dst0_m = { 0 }; \
+ v16i8 dst1_m = { 0 }; \
+ v16i8 zero_m = { 0 }; \
+ \
+ ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \
+ LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
+ INSERT_W2_SB(src0_m, src1_m, dst0_m); \
+ INSERT_W2_SB(src2_m, src3_m, dst1_m); \
+ ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
+ ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
+ CLIP_SH2_0_255(res0_m, res1_m); \
+ PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
+ ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \
}
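ADDBLK_ST4x4_UB is the usual reconstruction step: residuals are added to the prediction already in the destination, clipped to pixel range and written back. A scalar sketch, assuming each input vector's low four halfword lanes hold one residual row:

#include <stdint.h>

/* Scalar model: dst = clip(dst + residual) over a 4x4 block. */
static void addblk_4x4_model(const int16_t res[4][4], uint8_t *dst,
                             int stride) {
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int v = dst[r * stride + c] + res[r][c];
      dst[r * stride + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}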
-/* Description : Store vectors of 8 halfword elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 8 halfword elements from 'in0' to (pdst)
- Store 8 halfword elements from 'in1' to (pdst + stride)
+/* Description : Pack even elements of input vectors & xor with 128
+ Arguments : Inputs - in0, in1
+ Output - out_m
+ Return Type - unsigned byte
+ Details : Signed byte even elements from 'in0' and 'in1' are packed
+ together in one vector and the resulting vector is xor'ed with
+ 128 to shift the range from signed to unsigned byte
*/
-#define ST_H2(RTYPE, in0, in1, pdst, stride) { \
- ST_H(RTYPE, in0, (pdst)); \
- ST_H(RTYPE, in1, (pdst) + stride); \
+#define PCKEV_XORI128_UB(in0, in1) ({ \
+ v16u8 out_m; \
+ \
+ out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
+ out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \
+ out_m; \
+})
+
+/* Description : Convert inputs to unsigned bytes, interleave, average and
+                 store as an 8x4 unsigned byte block
+ Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+ pdst, stride
+*/
+#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \
+ dst0, dst1, dst2, dst3, pdst, stride) { \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = PCKEV_XORI128_UB(in0, in1); \
+ tmp1_m = PCKEV_XORI128_UB(in2, in3); \
+ ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
+ AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
+ ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
}
-#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
-/* Description : Store 8x4 byte block to destination memory from input
- vectors
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Index 0 double word element from 'in0' vector is copied to the
- GP register and stored to (pdst)
- Index 1 double word element from 'in0' vector is copied to the
- GP register and stored to (pdst + stride)
- Index 0 double word element from 'in1' vector is copied to the
- GP register and stored to (pdst + 2 * stride)
- Index 1 double word element from 'in1' vector is copied to the
- GP register and stored to (pdst + 3 * stride)
+/* Description : Pack even byte elements and store byte vector in destination
+ memory
+ Arguments : Inputs - in0, in1, pdst
*/
-#define ST8x4_UB(in0, in1, pdst, stride) { \
- uint64_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_d((v2i64)in0, 0); \
- out1_m = __msa_copy_u_d((v2i64)in0, 1); \
- out2_m = __msa_copy_u_d((v2i64)in1, 0); \
- out3_m = __msa_copy_u_d((v2i64)in1, 1); \
- \
- SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
+#define PCKEV_ST_SB(in0, in1, pdst) { \
+ v16i8 tmp_m; \
+ \
+ tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
+ ST_SB(tmp_m, (pdst)); \
}
+
+/* Description : Horizontal 2-tap filter kernel
+   Arguments   : Inputs - in0, in1, mask, coeff, shift
+                 Return Type - unsigned halfword
+*/
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({ \
+ v16i8 tmp0_m; \
+ v8u16 tmp1_m; \
+ \
+ tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
+ tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \
+ tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \
+ tmp1_m = __msa_sat_u_h(tmp1_m, shift); \
+ \
+ tmp1_m; \
+})
#endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */
--- /dev/null
+++ b/vpx_dsp/mips/sub_pixel_variance_msa.c
@@ -1,0 +1,767 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/variance.h"
+
+static const uint8_t bilinear_filters_msa[8][2] = {
+ { 128, 0, },
+ { 112, 16, },
+ { 96, 32, },
+ { 80, 48, },
+ { 64, 64, },
+ { 48, 80, },
+ { 32, 96, },
+ { 16, 112, },
+};
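+/* Each filter pair sums to 128 (1 << FILTER_BITS); entry k holds the weights
+   { 128 - 16 * k, 16 * k } for a k/8-pel offset, so entry 0 is a plain copy
+   and entry 4 an exact half-pel average. */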
+
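+/* Accumulate the squared differences between 'src' and 'ref' into the v4i32
+   accumulator 'var' and the signed pixel differences into the v8i16
+   accumulator 'sub'; callers reduce both with horizontal adds at the end. */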
+#define CALC_MSE_AVG_B(src, ref, var, sub) { \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
+ HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
+ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
+ \
+ sub += res_l0_m + res_l1_m; \
+}
+
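+/* variance = sse - sum * sum / (width * height); 'shift' is log2(w * h). The
+   LARGE variant widens the product to int64_t so that the squared sum cannot
+   overflow 32 bits for blocks of 512 pixels or more. */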
+#define VARIANCE_WxH(sse, diff, shift) \
+ sse - (((uint32_t)diff * diff) >> shift)
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+ sse - (((int64_t)diff * diff) >> shift)
+
+static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 filt0, ref = { 0 };
+ v16i8 src0, src1, src2, src3;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 const255;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+ vec0, vec1, vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+ src0, src1, src2, src3);
+ ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
+ src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
+ CALC_MSE_AVG_B(src0, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 filt0, out, ref0, ref1, ref2, ref3;
+ v16i8 src0, src1, src2, src3;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 vec0, vec1, vec2, vec3, const255;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+ vec0, vec1, vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+ src0, src1, src2, src3);
+ out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
+ CALC_MSE_AVG_B(out, ref0, var, avg);
+ out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
+ CALC_MSE_AVG_B(out, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v16u8 dst0, dst1, dst2, dst3, filt0;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8u16 const255;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ dst += (4 * dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+ out0, out1, out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+ out4, out5, out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ MIN_UH4_UH(out0, out1, out2, out3, const255);
+ MIN_UH4_UH(out4, out5, out6, out7, const255);
+ PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6,
+ src0, src1, src2, src3);
+ CALC_MSE_AVG_B(src0, dst0, var, avg);
+ CALC_MSE_AVG_B(src1, dst1, var, avg);
+ CALC_MSE_AVG_B(src2, dst2, var, avg);
+ CALC_MSE_AVG_B(src3, dst3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4, out;
+ v16u8 src10_r, src32_r, src21_r, src43_r;
+ v16u8 ref = { 0 };
+ v16u8 src2110, src4332;
+ v16u8 filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+ v8u16 tmp0, tmp1;
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ src10_r, src21_r, src32_r, src43_r);
+ ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+ DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
+ vec0, vec1, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+ tmp0, tmp1, tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out0, out1, out2, out3;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+ src0 = src4;
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter_horiz,
+ const uint8_t *filter_vert,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out, ref = { 0 };
+ v16u8 filt_vt, filt_hz, vec0, vec1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
+ v8u16 tmp0, tmp1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+ hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter_horiz,
+ const uint8_t *filter_vert,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out0, out1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 hz_out0, hz_out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt_vt, filt_hz, vec0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter_horiz,
+ const uint8_t *filter_vert,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
+ v8u16 tmp0, tmp1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ LD_UB2(src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src0, src2, src4, src6);
+ LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ CALC_MSE_AVG_B(src2, ref2, var, avg);
+ CALC_MSE_AVG_B(src3, ref3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter_horiz,
+ const uint8_t *filter_vert,
+ int32_t height,
+ int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height,
+ &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter_horiz,
+ const uint8_t *filter_vert,
+ int32_t height,
+ int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height,
+ &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
+#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
+#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
+
+#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
+#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
+
+#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
+uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
+ int32_t src_stride, \
+ int32_t xoffset, \
+ int32_t yoffset, \
+ const uint8_t *ref, \
+ int32_t ref_stride, \
+ uint32_t *sse) { \
+ int32_t diff; \
+ uint32_t var; \
+ const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
+ const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
+ \
+ if (yoffset) { \
+ if (xoffset) { \
+ *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride, \
+ ref, ref_stride, \
+ h_filter, v_filter, \
+ ht, &diff); \
+ } else { \
+ *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride, \
+ ref, ref_stride, \
+ v_filter, ht, &diff); \
+ } \
+ \
+ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ } else { \
+ if (xoffset) { \
+ *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride, \
+ ref, ref_stride, \
+ h_filter, ht, &diff); \
+ \
+ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ } else { \
+ var = vpx_variance##wd##x##ht##_msa(src, src_stride, \
+ ref, ref_stride, sse); \
+ } \
+ } \
+ \
+ return var; \
+}
+
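+/* For example, VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16) below defines
+   vpx_sub_pixel_variance16x16_msa(): with both offsets zero it falls through
+   to vpx_variance16x16_msa(), otherwise it runs the horizontal, vertical or
+   2-D bilinear path and finishes with VARIANCE_16Wx16H(). */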
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -14,13 +14,26 @@
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
-unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride) {
+#include "vpx_dsp/variance.h"
+
+static const uint8_t bilinear_filters[8][2] = {
+ { 128, 0 },
+ { 112, 16 },
+ { 96, 32 },
+ { 80, 48 },
+ { 64, 64 },
+ { 48, 80 },
+ { 32, 96 },
+ { 16, 112 },
+};
+
+uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride) {
int distortion = 0;
int r, c;
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
+ for (r = 0; r < 4; ++r) {
+ for (c = 0; c < 4; ++c) {
int diff = a[c] - b[c];
distortion += diff * diff;
}
@@ -32,7 +45,7 @@
return distortion;
}
-unsigned int vpx_get_mb_ss_c(const int16_t *a) {
+uint32_t vpx_get_mb_ss_c(const int16_t *a) {
unsigned int i, sum = 0;
for (i = 0; i < 256; ++i) {
@@ -42,16 +55,38 @@
return sum;
}
+uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0,
+ b, b_stride, sse);
+}
+
+uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4,
+ b, b_stride, sse);
+}
+
+uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4,
+ b, b_stride, sse);
+}
+
static void variance(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
+ int w, int h, uint32_t *sse, int *sum) {
int i, j;
*sum = 0;
*sse = 0;
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
@@ -62,15 +97,113 @@
}
}
+// Applies a 1-D 2-tap bilinear filter to the source block in either the
+// horizontal or the vertical direction to produce the filtered output block.
+// Used to implement the first pass of the 2-D separable filter.
+//
+// Produces uint16_t output to retain precision for the next pass. The two
+// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride), i.e. the offset required to move from one input
+// pixel to the next.
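+//
+// For example, with filter = bilinear_filters[2] = { 96, 32 } each output is
+// b[j] = ROUND_POWER_OF_TWO(96 * a[0] + 32 * a[pixel_step], FILTER_BITS),
+// i.e. a 3/4 : 1/4 blend of the two neighboring input pixels.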
+static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
+ (int)a[pixel_step] * filter[1],
+ FILTER_BITS);
+
+ ++a;
+ }
+
+ a += src_pixels_per_line - output_width;
+ b += output_width;
+ }
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either the
+// horizontal or the vertical direction to produce the filtered output block.
+// Used to implement the second pass of the 2-D separable filter.
+//
+// Requires 16-bit input as produced by var_filter_block2d_bil_first_pass. The
+// two filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride), i.e. the offset required to move from one input
+// pixel to the next. Output is 8-bit.
+static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
+ (int)a[pixel_step] * filter[1],
+ FILTER_BITS);
+ ++a;
+ }
+
+ a += src_pixels_per_line - output_width;
+ b += output_width;
+ }
+}
+
#define VAR(W, H) \
-unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, \
- unsigned int *sse) { \
+uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
int sum; \
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
+#define SUBPIX_VAR(W, H) \
+uint32_t vpx_sub_pixel_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+\
+ var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+\
+ return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
+}
+
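+/* For example, VARIANCES(16, 16) below expands SUBPIX_VAR(16, 16) into
+ * vpx_sub_pixel_variance16x16_c(). A half-pel horizontal estimate against a
+ * caller-provided reference would look like:
+ *
+ *   uint32_t sse;
+ *   const uint32_t var =
+ *       vpx_sub_pixel_variance16x16_c(src, src_stride, 4, 0,
+ *                                     ref, ref_stride, &sse);
+ *
+ * which is exactly what vpx_variance_halfpixvar16x16_h_c() above does. */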
+#define SUBPIX_AVG_VAR(W, H) \
+uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(const uint8_t *a, \
+ int a_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *b, \
+ int b_stride, \
+ uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+\
+ var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+\
+ vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
+\
+ return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
+}
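+/* SUBPIX_AVG_VAR additionally averages the filtered block with 'second_pred'
+   via vpx_comp_avg_pred() (rounding up) before measuring the variance against
+   'b'. */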
+
/* Identical to the variance call except it takes an additional parameter, sum,
* and returns that value using pass-by-reference instead of returning
* sse - sum^2 / w*h
@@ -78,7 +211,7 @@
#define GET_VAR(W, H) \
void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
- unsigned int *sse, int *sum) { \
+ uint32_t *sse, int *sum) { \
variance(a, a_stride, b, b_stride, W, H, sse, sum); \
}
@@ -87,28 +220,34 @@
* variable.
*/
#define MSE(W, H) \
-unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, \
- unsigned int *sse) { \
+uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
int sum; \
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse; \
}
-VAR(64, 64)
-VAR(64, 32)
-VAR(32, 64)
-VAR(32, 32)
-VAR(32, 16)
-VAR(16, 32)
-VAR(16, 16)
-VAR(16, 8)
-VAR(8, 16)
-VAR(8, 8)
-VAR(8, 4)
-VAR(4, 8)
-VAR(4, 4)
+/* All three forms of the variance are available in the same sizes. */
+#define VARIANCES(W, H) \
+ VAR(W, H) \
+ SUBPIX_VAR(W, H) \
+ SUBPIX_AVG_VAR(W, H)
+VARIANCES(64, 64)
+VARIANCES(64, 32)
+VARIANCES(32, 64)
+VARIANCES(32, 32)
+VARIANCES(32, 16)
+VARIANCES(16, 32)
+VARIANCES(16, 16)
+VARIANCES(16, 8)
+VARIANCES(8, 16)
+VARIANCES(8, 8)
+VARIANCES(8, 4)
+VARIANCES(4, 8)
+VARIANCES(4, 4)
+
GET_VAR(16, 16)
GET_VAR(8, 8)
@@ -117,12 +256,13 @@
MSE(8, 16)
MSE(8, 8)
-void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, const uint8_t *ref, int ref_stride) {
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height,
+ const uint8_t *ref, int ref_stride) {
int i, j;
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
@@ -143,8 +283,8 @@
*sum = 0;
*sse = 0;
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
@@ -156,60 +296,60 @@
static void highbd_8_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
+ int w, int h, uint32_t *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sse = (unsigned int)sse_long;
+ *sse = (uint32_t)sse_long;
*sum = (int)sum_long;
}
static void highbd_10_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
+ int w, int h, uint32_t *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
}
static void highbd_12_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
+ int w, int h, uint32_t *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}
#define HIGHBD_VAR(W, H) \
-unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
- int a_stride, \
- const uint8_t *b, \
- int b_stride, \
- unsigned int *sse) { \
+uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
+ int a_stride, \
+ const uint8_t *b, \
+ int b_stride, \
+ uint32_t *sse) { \
int sum; \
highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
-unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
- int a_stride, \
- const uint8_t *b, \
- int b_stride, \
- unsigned int *sse) { \
+uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
+ int a_stride, \
+ const uint8_t *b, \
+ int b_stride, \
+ uint32_t *sse) { \
int sum; \
highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
-unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
- int a_stride, \
- const uint8_t *b, \
- int b_stride, \
- unsigned int *sse) { \
+uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
+ int a_stride, \
+ const uint8_t *b, \
+ int b_stride, \
+ uint32_t *sse) { \
int sum; \
highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
@@ -217,54 +357,243 @@
#define HIGHBD_GET_VAR(S) \
void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- unsigned int *sse, int *sum) { \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse, int *sum) { \
highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
- unsigned int *sse, int *sum) { \
+ uint32_t *sse, int *sum) { \
highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
- unsigned int *sse, int *sum) { \
+ uint32_t *sse, int *sum) { \
highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
}
#define HIGHBD_MSE(W, H) \
-unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
- int src_stride, \
- const uint8_t *ref, \
- int ref_stride, \
- unsigned int *sse) { \
+uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
+ int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride, \
+ uint32_t *sse) { \
int sum; \
highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
} \
\
-unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
- int src_stride, \
- const uint8_t *ref, \
- int ref_stride, \
- unsigned int *sse) { \
+uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
+ int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride, \
+ uint32_t *sse) { \
int sum; \
highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
} \
\
-unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
- int src_stride, \
- const uint8_t *ref, \
- int ref_stride, \
- unsigned int *sse) { \
+uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
+ int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride, \
+ uint32_t *sse) { \
int sum; \
highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
}
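+/* The high-bitdepth sub-pixel paths below mirror the 8-bit ones but operate on
+   uint16_t pixels reached through CONVERT_TO_SHORTPTR()/CONVERT_TO_BYTEPTR()
+   and keep the intermediate filter output in 16 bits. */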
+static void highbd_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8,
+ uint16_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ output_ptr[j] =
+ ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
+ (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
+
+ ++src_ptr;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+static void highbd_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr,
+ uint16_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ output_ptr[j] =
+ ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
+ (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
+ ++src_ptr;
+ }
+
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+#define HIGHBD_SUBPIX_VAR(W, H) \
+uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+\
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+\
+ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+ dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+\
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+\
+ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ W, dst, dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+\
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+\
+ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ W, dst, dst_stride, sse); \
+}
+
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+\
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+\
+ vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+\
+ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+\
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+\
+ vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+\
+ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
+ W, dst, dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+\
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+\
+ vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+\
+ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
+ W, dst, dst_stride, sse); \
+}
+
+/* All three forms of the variance are available in the same sizes. */
+#define HIGHBD_VARIANCES(W, H) \
+ HIGHBD_VAR(W, H) \
+ HIGHBD_SUBPIX_VAR(W, H) \
+ HIGHBD_SUBPIX_AVG_VAR(W, H)
+
+HIGHBD_VARIANCES(64, 64)
+HIGHBD_VARIANCES(64, 32)
+HIGHBD_VARIANCES(32, 64)
+HIGHBD_VARIANCES(32, 32)
+HIGHBD_VARIANCES(32, 16)
+HIGHBD_VARIANCES(16, 32)
+HIGHBD_VARIANCES(16, 16)
+HIGHBD_VARIANCES(16, 8)
+HIGHBD_VARIANCES(8, 16)
+HIGHBD_VARIANCES(8, 8)
+HIGHBD_VARIANCES(8, 4)
+HIGHBD_VARIANCES(4, 8)
+HIGHBD_VARIANCES(4, 4)
+
HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)
@@ -273,20 +602,6 @@
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
-HIGHBD_VAR(64, 64)
-HIGHBD_VAR(64, 32)
-HIGHBD_VAR(32, 64)
-HIGHBD_VAR(32, 32)
-HIGHBD_VAR(32, 16)
-HIGHBD_VAR(16, 32)
-HIGHBD_VAR(16, 16)
-HIGHBD_VAR(16, 8)
-HIGHBD_VAR(8, 16)
-HIGHBD_VAR(8, 8)
-HIGHBD_VAR(8, 4)
-HIGHBD_VAR(4, 8)
-HIGHBD_VAR(4, 4)
-
void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
int width, int height, const uint8_t *ref8,
int ref_stride) {
@@ -293,8 +608,8 @@
int i, j;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
--- /dev/null
+++ b/vpx_dsp/variance.h
@@ -1,0 +1,94 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_VARIANCE_H_
+#define VPX_DSP_VARIANCE_H_
+
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+#define FILTER_WEIGHT 128
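+/* FILTER_WEIGHT == 1 << FILTER_BITS; the two taps of each bilinear filter pair
+   sum to this value. */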
+
+typedef unsigned int(*vpx_sad_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b_ptr, int b_stride);
+
+typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *second_pred);
+
+typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride,
+ uint8_t *b, int b_stride, int n);
+
+typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sad_array);
+
+typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *const b_array[],
+ int b_stride,
+ unsigned int *sad_array);
+
+typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+
+typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+
+typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a_ptr,
+ int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b_ptr,
+ int b_stride,
+ unsigned int *sse,
+ const uint8_t *second_pred);
+#if CONFIG_VP8
+typedef struct variance_vtable {
+ vpx_sad_fn_t sdf;
+ vpx_variance_fn_t vf;
+ vpx_subpixvariance_fn_t svf;
+ vpx_variance_fn_t svf_halfpix_h;
+ vpx_variance_fn_t svf_halfpix_v;
+ vpx_variance_fn_t svf_halfpix_hv;
+ vpx_sad_multi_fn_t sdx3f;
+ vpx_sad_multi_fn_t sdx8f;
+ vpx_sad_multi_d_fn_t sdx4df;
+#if ARCH_X86 || ARCH_X86_64
+ vp8_copy32xn_fn_t copymem;
+#endif
+} vp8_variance_fn_ptr_t;
+#endif // CONFIG_VP8
+
+#if CONFIG_VP9
+typedef struct vp9_variance_vtable {
+ vpx_sad_fn_t sdf;
+ vpx_sad_avg_fn_t sdaf;
+ vpx_variance_fn_t vf;
+ vpx_subpixvariance_fn_t svf;
+ vpx_subp_avg_variance_fn_t svaf;
+ vpx_sad_multi_fn_t sdx3f;
+ vpx_sad_multi_fn_t sdx8f;
+ vpx_sad_multi_d_fn_t sdx4df;
+} vp9_variance_fn_ptr_t;
+#endif // CONFIG_VP9
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_DSP_VARIANCE_H_
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -10,6 +10,8 @@
DSP_SRCS-yes += vpx_dsp.mk
+DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
+
ifeq ($(CONFIG_ENCODERS),yes)
DSP_SRCS-yes += sad.c
DSP_SRCS-yes += subtract.c
@@ -19,7 +21,6 @@
DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c
-DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c
@@ -45,21 +46,36 @@
ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
DSP_SRCS-yes += variance.c
+DSP_SRCS-yes += variance.h
+DSP_SRCS-$(HAVE_MEDIA) += arm/bilinear_filter_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA) += arm/subpel_variance_media.c
+DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_h_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_hv_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_v_media$(ASM)
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM)
+DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c
DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c
DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm
-DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm # Contains SSE2 and SSSE3
+endif # CONFIG_USE_X86INC
+
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
+endif # CONFIG_USE_X86INC
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -412,6 +412,9 @@
if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+#
+# Variance
+#
add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance64x64 sse2 avx2 neon msa/;
@@ -451,7 +454,9 @@
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance4x4 mmx sse2 msa/;
-
+#
+# Specialty Variance
+#
add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
@@ -478,6 +483,99 @@
add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+#
+# Subpixel Variance
+#
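+# Each add_proto below declares the C prototype for the run-time dispatch
+# table; the specialize line that follows lists the optimized per-ISA
+# implementations that may replace the _c version.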
+add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance16x16 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance16x8 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance8x16 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance8x8 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
+
+#
+# Specialty Subpixel
+#
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_variance_halfpixvar16x16_h mmx media/;
+
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_variance_halfpixvar16x16_v mmx media/;
+
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_variance_halfpixvar16x16_hv mmx media/;
+
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance64x64 sse2/;
@@ -615,6 +713,226 @@
specialize qw/vpx_highbd_12_mse8x8 sse2/;
add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+
+ #
+ # Subpixel Variance
+ #
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
--- /dev/null
+++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -1,0 +1,1041 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%define program_name vpx
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
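+; The bilinear filter table below holds eight 32-byte entries, one per sub-pel
+; offset step; each entry is a pair of 8-lane word vectors (filter_a, filter_b)
+; whose taps sum to 16, running from (16, 0) down to (2, 14). The code selects
+; an entry with offset << filter_idx_shift (32 bytes per entry, hence a shift
+; of 5).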
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+SECTION .text
+
+; int vpx_highbd_sub_pixel_varianceNxh(const uint16_t *src, ptrdiff_t src_stride,
+;                                      int x_offset, int y_offset,
+;                                      const uint16_t *dst, ptrdiff_t dst_stride,
+;                                      int height, unsigned int *sse);
+;
+; This function returns the sum of pixel differences (SE) and stores the sum
+; of squared differences (SSE) in the given pointer.
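+;
+; A rough sketch of how these two outputs are consumed (see the FN macro added
+; to vpx_dsp/x86/highbd_variance_sse2.c in this change): the C wrappers call
+; the kernel once per 8- or 16-pixel wide column of the block, add up the
+; partial SE/SSE values, and return
+;   variance = SSE - ((int64_t)SE * SE >> (log2(W) + log2(H)))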
+
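+; SUM_SSE below: for the two (src, dst) register pairs, accumulate the signed
+; pixel differences into %5 (sum) and their squares into %6 (sse). Unlike the
+; 8-bit version, differences of up-to-12-bit samples summed over a whole block
+; do not fit in a signed word, so the per-call word sums are sign-extended and
+; accumulated as dwords right here.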
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ mova %4, %3 ; make copies to manipulate to calc sum
+ mova %2, %1 ; use originals for calc sse
+ pmaddwd %3, %3
+ paddw %4, %2
+ pmaddwd %1, %1
+ movhlps %2, %4
+ paddd %6, %3
+ paddw %4, %2
+ pxor %2, %2
+ pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
+ punpcklwd %4, %2 ; sign-extend word to dword
+ paddd %6, %1
+ paddd %5, %4
+
+%endmacro
+
+%macro STORE_AND_RET 0
+%if mmsize == 16
+ ; For the high-bitdepth path, SUM_SSE has already sign-extended the partial
+ ; sums to dwords, so m6 (sum) and m7 (sse) each hold four dword lanes here;
+ ; just fold the lanes and store the results.
+ movhlps m3, m7
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ pshufd m4, m6, 0x1
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ movd [r1], m7 ; store sse
+ movd rax, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ lea srcq, [srcq + src_stridemp*2]
+%else
+ lea srcq, [srcq + src_strideq*2]
+%endif
+%endmacro
+
+%macro INC_SRC_BY_SRC_2STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ lea srcq, [srcq + src_stridemp*4]
+%else
+ lea srcq, [srcq + src_strideq*4]
+%endif
+%endmacro
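+
+; A note on the two helpers above: on 32-bit PIC builds there is no spare
+; register for src_stride, so it is read via its in-memory argument form
+; (src_stridemp) each time. Strides are in pixels, hence the *2 (one row of
+; 16-bit samples) and *4 (two rows) scaling.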
+
+%macro SUBPEL_VARIANCE 1-2 0 ; %1 = block width, %2 = 1 for the avg (second_pred) variant
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+
+%ifdef PIC ; 64bit PIC
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %endif
+ %define h heightd
+ %define bilin_filter sseq
+%else
+ %if ARCH_X86=1 && CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse, g_bilin_filter, g_pw_8
+ %define h dword heightm
+ %define sec_str sec_stridemp
+
+ ; Store bilin_filter and pw_8 location in stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, height, \
+ sse, g_bilin_filter, g_pw_8
+ %define h heightd
+
+ ; Store bilin_filter and pw_8 location in stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %endif
+ %else
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+ %if ARCH_X86_64
+ %define h heightd
+ %define sec_str sec_strideq
+ %else
+ %define h dword heightm
+ %define sec_str sec_stridemp
+ %endif
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, height, sse
+ %define h heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+
+%if %1 < 16
+ sar h, 1
+%endif
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq + 16]
+ mova m1, [dstq]
+ mova m3, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m2, [secq+16]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq + src_strideq*2]
+ mova m1, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pavgw m0, m1
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_zero_y_half_loop
+ STORE_AND_RET
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so it might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m4, m1
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonzero:
+ cmp x_offsetd, 8
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ mova m2, [dstq]
+ mova m3, [dstq + 16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ mova m2, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_half_y_zero_loop
+ STORE_AND_RET
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m1, m3
+ mova m4, [dstq]
+ mova m5, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m2, m3
+ mova m4, [dstq]
+ mova m5, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_half_y_half_loop
+ STORE_AND_RET
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m1, filter_rnd
+ paddw m1, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m1, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m4, filter_rnd
+ paddw m4, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m4, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m2, [srcq+2]
+ movu m3, [srcq+src_strideq*2+2]
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m1, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m2, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; load the bilinear filter - this is the same as in the 8-bit-depth version
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [bilin_filter+y_offsetq]
+ mova m11, [bilin_filter+y_offsetq+16]
+ mova m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register, so use the src_stride register.
+; Later, src_stride has to be reloaded from the stack when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+; end of load filter
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ movu m1, [srcq+16]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ movu m3, [srcq+16]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m1, filter_rnd
+ mova m2, [dstq]
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq + dst_strideq * 2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ movu m3, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m4, filter_rnd
+ mova m2, [dstq]
+ paddw m4, m3
+ psrlw m0, 4
+ psrlw m4, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ INC_SRC_BY_SRC_2STRIDE
+ lea dstq, [dstq + dst_strideq * 4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+%endmacro
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -8,9 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
@@ -243,3 +241,341 @@
sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
return *sse;
}
+
+#if CONFIG_USE_X86INC
+#define DECL(w, opt) \
+ int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint16_t *dst, \
+ ptrdiff_t dst_stride, \
+ int height, unsigned int *sse);
+#define DECLS(opt1, opt2) \
+ DECL(8, opt1); \
+ DECL(16, opt1)
+
+DECLS(sse2, sse);
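+// These declare the SSE2 column kernels (8 and 16 pixels wide) defined in
+// vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm; each one handles a single
+// wf-wide column and reports that column's sum of differences and SSE.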
+// TODO(johannkoenig): enable the ssse3 functions or delete them.
+// DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
+ int src_stride, \
+ int x_offset, \
+ int y_offset, \
+ const uint8_t *dst8, \
+ int dst_stride, \
+ uint32_t *sse_ptr) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, h, \
+ &sse); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+ src_stride, \
+ x_offset, y_offset, \
+ dst + 16, \
+ dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, \
+ dst + 48, dst_stride, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ h, &sse); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+ src_stride, \
+ x_offset, y_offset, \
+ dst + 16, \
+ dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ }\
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
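+
+// A sketch of what one expansion of FN provides: FN(64, 32, 16, 6, 5, sse2,
+// (int64_t)) defines vpx_highbd_{8,10,12}_sub_pixel_variance64x32_sse2().
+// Each wrapper covers the block with calls to the 16-wide kernel at column
+// offsets 0, 16, 32 and 48, sums the partial results and returns
+//   sse - ((int64_t)se * se >> (wlog2 + hlog2)),
+// with the 10- and 12-bit versions first scaling se/sse down
+// (ROUND_POWER_OF_TWO) so the totals stay within 32-bit range.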
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+FNS(sse2, sse);
+
+#undef FNS
+#undef FN
+
+#define DECL(w, opt) \
+int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint16_t *dst, \
+ ptrdiff_t dst_stride, \
+ const uint16_t *sec, \
+ ptrdiff_t sec_stride, \
+ int height, \
+ unsigned int *sse);
+#define DECLS(opt1) \
+DECL(16, opt1) \
+DECL(8, opt1)
+
+DECLS(sse2);
+#undef DECL
+#undef DECLS
+
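+// The avg wrappers below use the same column splitting; the second predictor
+// is packed at the block width, so the kernels are passed w as its stride and
+// the wrappers advance it by 16 per column (and by start_row * w in the
+// 12-bit strip loop).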
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, sec, w, h, &sse); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, \
+ dst + 16, dst_stride, sec + 16, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, \
+ dst + 32, dst_stride, sec + 32, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, \
+ dst + 48, dst_stride, sec + 48, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, \
+ sec, w, h, &sse); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, \
+ x_offset, y_offset, \
+ dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + (start_row * dst_stride), dst_stride, \
+ sec + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, \
+ dst + 16 + (start_row * dst_stride), dst_stride, \
+ sec + 16 + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, \
+ dst + 32 + (start_row * dst_stride), dst_stride, \
+ sec + 32 + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, \
+ dst + 48 + (start_row * dst_stride), dst_stride, \
+ sec + 48 + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+
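+// FN(w, h, wf, wlog2, hlog2, opt, cast): wf is the width handled by each
+// helper call; wlog2/hlog2 give the shift used for sum^2 / (w * h).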
+#define FNS(opt1) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+FNS(sse2);
+
+#undef FNS
+#undef FN
+#endif // CONFIG_USE_X86INC
--- /dev/null
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -1,0 +1,1398 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%define program_name vpx
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
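+; Bilinear coefficient pairs; each pair sums to 16, so filtered values are
+; rounded with pw_8 and shifted right by 4.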
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+bilin_filter_m_ssse3: times 8 db 16, 0
+ times 8 db 14, 2
+ times 8 db 12, 4
+ times 8 db 10, 6
+ times 16 db 8
+ times 8 db 6, 10
+ times 8 db 4, 12
+ times 8 db 2, 14
+
+SECTION .text
+
+; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint8_t *dst, ptrdiff_t dst_stride,
+; int height, unsigned int *sse);
+;
+; This function returns the SE and stores SSE in the given pointer.
+
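+; SUM_SSE subtracts dst from src for two halves of a row, accumulates the
+; signed differences into the sum register (words) and the squared
+; differences (via pmaddwd) into the sse register (dwords).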
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ paddw %5, %3
+ pmaddwd %3, %3
+ paddw %5, %1
+ pmaddwd %1, %1
+ paddd %6, %3
+ paddd %6, %1
+%endmacro
+
+%macro STORE_AND_RET 0
+%if mmsize == 16
+ ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+ ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
+ pcmpgtw m5, m6 ; mask for 0 > x
+ movhlps m3, m7
+ punpcklwd m4, m6, m5
+ punpckhwd m6, m5 ; sign-extend m6 word->dword
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ pshufd m4, m6, 0x1
+ movd [r1], m7 ; store sse
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%else ; mmsize == 8
+ pshufw m4, m6, 0xe
+ pshufw m3, m7, 0xe
+ paddw m6, m4
+ paddd m7, m3
+ pcmpgtw m5, m6 ; mask for 0 > x
+ mov r1, ssem ; r1 = unsigned int *sse
+ punpcklwd m6, m5 ; sign-extend m6 word->dword
+ movd [r1], m7 ; store sse
+ pshufw m4, m6, 0xe
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
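+; On 32-bit PIC builds the src_stride register is reused for the filter
+; pointers (see .x_nonhalf_y_nonhalf), so the stride is re-read from memory.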
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+%else
+ add srcq, src_strideq
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%if cpuflag(ssse3)
+%define bilin_filter_m bilin_filter_m_ssse3
+%define filter_idx_shift 4
+%else
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+%endif
+; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
+; 11, not 13, if the registers are ordered correctly. May make a minor speed
+; difference on Win64
+
+%ifdef PIC ; 64bit PIC
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %endif
+ %define h heightd
+ %define bilin_filter sseq
+%else
+ %if ARCH_X86=1 && CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse, g_bilin_filter, g_pw_8
+ %define h dword heightm
+ %define sec_str sec_stridemp
+
+    ; Store the bilin_filter and pw_8 locations on the stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse, \
+ g_bilin_filter, g_pw_8
+ %define h heightd
+
+    ; Store the bilin_filter and pw_8 locations on the stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %endif
+ %else
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+ %if ARCH_X86_64
+ %define h heightd
+ %define sec_str sec_strideq
+ %else
+ %define h dword heightm
+ %define sec_str sec_stridemp
+ %endif
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %define h heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+ ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
+ ; could perhaps use it for something more productive then
+ pxor m5, m5 ; dedicated zero register
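+  ; blocks narrower than 16 are processed two rows per loop iteration below,
+  ; so halve the row count (and double the second-pred stride in the avg case)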
+%if %1 < 16
+ sar h, 1
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ mova m1, [dstq]
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%if %2 == 0 ; !avg
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m0, [srcq+src_strideq]
+%else ; mmsize == 8
+ punpckldq m0, [srcq+src_strideq]
+%endif
+%else ; !avg
+ movh m2, [srcq+src_strideq]
+%endif
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m2, [srcq+src_strideq*2]
+%else ; mmsize == 8
+%if %1 == 4
+ movh m1, [srcq+src_strideq*2]
+ punpckldq m2, m1
+%else
+ punpckldq m2, [srcq+src_strideq*2]
+%endif
+%endif
+ movh m1, [dstq]
+%if mmsize == 16
+ movlhps m0, m2
+%else ; mmsize == 8
+ punpckldq m0, m2
+%endif
+ movh m3, [dstq+dst_strideq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
+ movh m4, [srcq+src_strideq*2]
+ movh m1, [dstq]
+ pavgb m0, m2
+ movh m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_zero_y_half_loop
+ STORE_AND_RET
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
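+  ; (The single-multiply form follows from the identity
+  ;  (num-x)*in1 + x*in2 == num*in1 + x*(in2-in1).)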
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+ movh m4, [srcq+src_strideq*2]
+ movh m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movh m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movh m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonzero:
+ cmp x_offsetd, 8
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [dstq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m4, [srcq+1]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m0, [srcq+src_strideq]
+ movhps m4, [srcq+src_strideq+1]
+%else ; mmsize == 8
+ punpckldq m0, [srcq+src_strideq]
+ punpckldq m4, [srcq+src_strideq+1]
+%endif
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+ pavgb m0, [secq]
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
+ movh m2, [srcq+src_strideq]
+ movh m1, [dstq]
+ pavgb m0, m4
+ movh m4, [srcq+src_strideq+1]
+ movh m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_half_y_zero_loop
+ STORE_AND_RET
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m3
+ punpckhbw m3, m1, m5
+ pavgb m0, m4
+%if %2 == 1 ; avg
+ punpcklbw m1, m5
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movh m2, [srcq]
+ movh m3, [srcq+1]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m2, [srcq+src_strideq]
+ movhps m3, [srcq+src_strideq+1]
+%else
+%if %1 == 4
+ movh m1, [srcq+src_strideq]
+ punpckldq m2, m1
+ movh m1, [srcq+src_strideq+1]
+ punpckldq m3, m1
+%else
+ punpckldq m2, [srcq+src_strideq]
+ punpckldq m3, [srcq+src_strideq+1]
+%endif
+%endif
+ pavgb m2, m3
+%if mmsize == 16
+ movlhps m0, m2
+ movhlps m4, m2
+%else ; mmsize == 8
+ punpckldq m0, m2
+ pshufw m4, m2, 0xe
+%endif
+ movh m1, [dstq]
+ pavgb m0, m2
+ movh m3, [dstq+dst_strideq]
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
+ movh m4, [srcq+src_strideq]
+ movh m1, [srcq+src_strideq+1]
+ pavgb m2, m3
+ pavgb m4, m1
+ pavgb m0, m2
+ pavgb m2, m4
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_half_y_half_loop
+ STORE_AND_RET
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_other_loop:
+ movu m4, [srcq]
+ movu m2, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m2
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ punpcklbw m0, m5
+ paddw m2, m3
+ punpcklbw m3, m4, m5
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+%endif
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+%if notcpuflag(ssse3)
+ punpcklbw m0, m5
+%endif
+.x_half_y_other_loop:
+ movh m2, [srcq]
+ movh m1, [srcq+1]
+ movh m4, [srcq+src_strideq]
+ movh m3, [srcq+src_strideq+1]
+ pavgb m2, m1
+ pavgb m4, m3
+ movh m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movh m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ paddw m0, m1
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m2, m1
+ movh m1, [dstq]
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m1, [srcq+1]
+ movh m2, [srcq+src_strideq]
+ movh m4, [srcq+src_strideq+1]
+ movh m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ movh m1, [dstq]
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_x_a
+ pmaddubsw m2, filter_x_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movh m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+ add srcq, src_strideq
+ packuswb m0, m2
+.x_other_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+%if cpuflag(ssse3)
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%else
+ punpckhbw m2, m4, m5
+ punpckhbw m1, m3, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ paddw m4, m3
+ paddw m2, m1
+ mova m1, [dstq]
+ psraw m4, 4
+ psraw m2, 4
+ punpckhbw m3, m1, m5
+ ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
+ ; have a 1-register shortage to be able to store the backup of the bilin
+ ; filtered second line as words as cache for the next line. Packing into
+ ; a byte costs 1 pack and 2 unpacks, but saves a register.
+ packuswb m4, m2
+ punpcklbw m1, m5
+ pavgb m0, m4
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ pavgb m0, [secq]
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ add srcq, src_strideq
+ psraw m0, 4
+.x_other_y_half_loop:
+ movh m2, [srcq]
+ movh m1, [srcq+1]
+ movh m4, [srcq+src_strideq]
+ movh m3, [srcq+src_strideq+1]
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ movh m1, [dstq]
+ paddw m4, m3
+ movh m3, [dstq+dst_strideq]
+%endif
+ psraw m2, 4
+ psraw m4, 4
+ pavgw m0, m2
+ pavgw m2, m4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline - also consider going to bytes here
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m11, [bilin_filter+y_offsetq+16]
+%endif
+ mova m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register, so the src_stride register is
+; used; src_stride then has to be reloaded from the stack when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+ packuswb m0, m2
+.x_other_y_other_loop:
+%if cpuflag(ssse3)
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ punpckhbw m3, m1, m5
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ punpcklbw m1, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+ psraw m0, 4
+%else
+ movu m3, [srcq]
+ movu m4, [srcq+1]
+ punpckhbw m1, m3, m5
+ punpckhbw m2, m4, m5
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ pmullw m3, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m3, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m1, filter_rnd
+ paddw m3, m4
+ paddw m1, m2
+ psraw m3, 4
+ psraw m1, 4
+ packuswb m4, m3, m1
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ pmullw m2, filter_y_a
+ pmullw m1, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, m1
+ mova m1, [dstq]
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ INC_SRC_BY_SRC_STRIDE
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ psraw m0, 4
+%if cpuflag(ssse3)
+ packuswb m0, m0
+%endif
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movh m2, [srcq]
+ movh m1, [srcq+1]
+
+ INC_SRC_BY_SRC_STRIDE
+ movh m4, [srcq]
+ movh m3, [srcq+1]
+
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movh m3, [dstq+dst_strideq]
+ movh m1, [dstq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m2, m2
+ packuswb m4, m4
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m1, m5
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ paddw m4, m3
+ psraw m2, 4
+ psraw m4, 4
+ pmullw m0, filter_y_a
+ pmullw m3, m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m0, m3
+ movh m3, [dstq+dst_strideq]
+ paddw m2, m1
+ movh m1, [dstq]
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+%endmacro
+
+; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
+; between the ssse3 and non-ssse3 versions. It may make sense to merge their
+; code in the sense that the ssse3 version would jump to the appropriate
+; location in the sse/2 version, rather than duplicating that code in the
+; binary.
+
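+; The 4-wide variants run on 64-bit mm registers (INIT_MMX with the SSE
+; extensions, e.g. pavgb on mm registers); the 8- and 16-wide variants use
+; full xmm registers. A second macro argument of 1 selects the avg
+; (second-pred) variant.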
+INIT_MMX sse
+SUBPEL_VARIANCE 4
+INIT_XMM sse2
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_MMX ssse3
+SUBPEL_VARIANCE 4
+INIT_XMM ssse3
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_MMX sse
+SUBPEL_VARIANCE 4, 1
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
+
+INIT_MMX ssse3
+SUBPEL_VARIANCE 4, 1
+INIT_XMM ssse3
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
--- a/vpx_dsp/x86/variance_avx2.c
+++ b/vpx_dsp/x86/variance_avx2.c
@@ -91,3 +91,93 @@
sse, &sum, vpx_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 11);
}
+
+unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ int height,
+ unsigned int *sse);
+
+unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ const uint8_t *sec,
+ int sec_stride,
+ int height,
+ unsigned int *sseptr);
+
+unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
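+  // Split the 64-wide block into two 32-wide halves, run the 32xh kernel on
+  // each half, and combine the partial sums and SSEs.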
+ unsigned int sse1;
+ const int se1 = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+ y_offset, dst, dst_stride,
+ 64, &sse1);
+ unsigned int sse2;
+ const int se2 = vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride,
+ x_offset, y_offset,
+ dst + 32, dst_stride,
+ 64, &sse2);
+ const int se = se1 + se2;
+ *sse = sse1 + sse2;
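+  // variance = SSE - SE^2 / N, with N = 64 * 64 = 2^12, hence the shift.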
+ return *sse - (((int64_t)se * se) >> 12);
+}
+
+unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ const int se = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+ y_offset, dst, dst_stride,
+ 32, sse);
+ return *sse - (((int64_t)se * se) >> 10);
+}
+
+unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse,
+ const uint8_t *sec) {
+ unsigned int sse1;
+ const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+ y_offset, dst, dst_stride,
+ sec, 64, 64, &sse1);
+ unsigned int sse2;
+ const int se2 =
+ vpx_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
+ y_offset, dst + 32, dst_stride,
+ sec + 32, 64, 64, &sse2);
+ const int se = se1 + se2;
+
+ *sse = sse1 + sse2;
+
+ return *sse - (((int64_t)se * se) >> 12);
+}
+
+unsigned int vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse,
+ const uint8_t *sec) {
+ // Process 32 elements in parallel.
+ const int se = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+ y_offset, dst, dst_stride,
+ sec, 32, 32, sse);
+ return *sse - (((int64_t)se * se) >> 10);
+}
--- a/vpx_dsp/x86/variance_impl_avx2.c
+++ b/vpx_dsp/x86/variance_impl_avx2.c
@@ -11,7 +11,28 @@
#include <immintrin.h> // AVX2
#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+};
+
+
void vpx_get16x16var_avx2(const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
@@ -212,4 +233,495 @@
*((int*)Sum)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
}
+}
+
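+// Each bilinear coefficient pair below sums to 16, so a filtered pixel is
+// (f0 * a + f1 * b + 8) >> 4.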
+#define FILTER_SRC(filter) \
+ /* filter the source */ \
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
+ \
+ /* add 8 to source */ \
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
+ \
+ /* divide source by 16 */ \
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define MERGE_WITH_SRC(src_reg, reg) \
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
+
+#define LOAD_SRC_DST \
+ /* load source and destination */ \
+ src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
+ dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
+
+#define AVG_NEXT_SRC(src_reg, size_stride) \
+ src_next_reg = _mm256_loadu_si256((__m256i const *) \
+ (src + size_stride)); \
+ /* average between current and next stride source */ \
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+#define MERGE_NEXT_SRC(src_reg, size_stride) \
+ src_next_reg = _mm256_loadu_si256((__m256i const *) \
+ (src + size_stride)); \
+ MERGE_WITH_SRC(src_reg, src_next_reg)
+
+#define CALC_SUM_SSE_INSIDE_LOOP \
+ /* expand each byte to 2 bytes */ \
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+ /* source - dest */ \
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+  /* calculate sum */ \
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+ /* calculate sse */ \
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+// Final reduction: sign-extend the 16-bit sums to 32 bits, then horizontally
+// add the per-lane partial sums and SSEs into scalar results.
+#define CALC_SUM_AND_SSE \
+ res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+ sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
+ sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+ \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+ \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
+
+unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ int height,
+ unsigned int *sse) {
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+ __m256i zero_reg;
+ int i, sum;
+ sum_reg = _mm256_set1_epi16(0);
+ sse_reg = _mm256_set1_epi16(0);
+ zero_reg = _mm256_set1_epi16(0);
+
+ // x_offset = 0 and y_offset = 0
+ if (x_offset == 0) {
+ if (y_offset == 0) {
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = 0 and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i src_next_reg;
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, src_stride)
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = 0 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg;
+
+ y_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, src_stride)
+ FILTER_SRC(filter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ }
+ // x_offset = 8 and y_offset = 0
+ } else if (x_offset == 8) {
+ if (y_offset == 0) {
+ __m256i src_next_reg;
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = 8 and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i src_next_reg, src_avg;
+      // load the source and a second source offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ AVG_NEXT_SRC(src_reg, 1)
+ for (i = 0; i < height ; i++) {
+      // save current source average
+      src_avg = src_reg;
+ src+= src_stride;
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+      // average the previous row average with the current one
+ src_avg = _mm256_avg_epu8(src_avg, src_reg);
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_avg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst+= dst_stride;
+ }
+ // x_offset = 8 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg, src_avg;
+ y_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+      // load the source and a second source offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ AVG_NEXT_SRC(src_reg, 1)
+ for (i = 0; i < height ; i++) {
+ // save current source average
+ src_avg = src_reg;
+ src+= src_stride;
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+ MERGE_WITH_SRC(src_avg, src_reg)
+ FILTER_SRC(filter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst+= dst_stride;
+ }
+ }
+ // x_offset = bilin interpolation and y_offset = 0
+ } else {
+ if (y_offset == 0) {
+ __m256i filter, pw8, src_next_reg;
+ x_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i filter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+      // pack the 16-bit results back to 8 bits in each 128-bit lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height ; i++) {
+ src+= src_stride;
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average the previously packed row with the current one
+ src_pack = _mm256_avg_epu8(src_pack, src_reg);
+ MERGE_WITH_SRC(src_pack, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src_pack = src_reg;
+ dst+= dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = bilin interpolation
+ } else {
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ xfilter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + x_offset));
+ y_offset <<= 5;
+ yfilter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+      // load the source and a second source offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ MERGE_NEXT_SRC(src_reg, 1)
+
+ FILTER_SRC(xfilter)
+      // pack the 16-bit results back to 8 bits in each 128-bit lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height ; i++) {
+ src+= src_stride;
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(xfilter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // interleave the previously packed row with the current one
+ MERGE_WITH_SRC(src_pack, src_reg)
+ // filter the source
+ FILTER_SRC(yfilter)
+ src_pack = src_reg;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst+= dst_stride;
+ }
+ }
+ }
+ CALC_SUM_AND_SSE
+ return sum;
+}
+
+unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ const uint8_t *sec,
+ int sec_stride,
+ int height,
+ unsigned int *sse) {
+ __m256i sec_reg;
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+ __m256i zero_reg;
+ int i, sum;
+ sum_reg = _mm256_set1_epi16(0);
+ sse_reg = _mm256_set1_epi16(0);
+ zero_reg = _mm256_set1_epi16(0);
+
+ // x_offset = 0 and y_offset = 0
+ if (x_offset == 0) {
+ if (y_offset == 0) {
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ sec+= sec_stride;
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ } else if (y_offset == 8) {
+ __m256i src_next_reg;
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, src_stride)
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ sec+= sec_stride;
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = 0 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg;
+
+ y_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, src_stride)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ sec+= sec_stride;
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ }
+ // x_offset = 8 and y_offset = 0
+ } else if (x_offset == 8) {
+ if (y_offset == 0) {
+ __m256i src_next_reg;
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ sec+= sec_stride;
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = 8 and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i src_next_reg, src_avg;
+      // load the source and a second source offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ AVG_NEXT_SRC(src_reg, 1)
+ for (i = 0; i < height ; i++) {
+ // save current source average
+ src_avg = src_reg;
+ src+= src_stride;
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+        // average the previous row average with the current one
+ src_avg = _mm256_avg_epu8(src_avg, src_reg);
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+ sec+= sec_stride;
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_avg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst+= dst_stride;
+ }
+ // x_offset = 8 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg, src_avg;
+ y_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+      // load the source and a second source offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ AVG_NEXT_SRC(src_reg, 1)
+ for (i = 0; i < height ; i++) {
+ // save current source average
+ src_avg = src_reg;
+ src+= src_stride;
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+ MERGE_WITH_SRC(src_avg, src_reg)
+ FILTER_SRC(filter)
+ src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_avg, zero_reg)
+ sec+= sec_stride;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst+= dst_stride;
+ }
+ }
+ // x_offset = bilin interpolation and y_offset = 0
+ } else {
+ if (y_offset == 0) {
+ __m256i filter, pw8, src_next_reg;
+ x_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ sec+= sec_stride;
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i filter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+      // pack the 16-bit results back to 8 bits in each 128-bit lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height ; i++) {
+ src+= src_stride;
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average the previously packed row with the current one
+ src_pack = _mm256_avg_epu8(src_pack, src_reg);
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+ sec+= sec_stride;
+ MERGE_WITH_SRC(src_pack, zero_reg)
+ src_pack = src_reg;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst+= dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = bilin interpolation
+ } else {
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ xfilter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + x_offset));
+ y_offset <<= 5;
+ yfilter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+      // load the source and a second source offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ MERGE_NEXT_SRC(src_reg, 1)
+
+ FILTER_SRC(xfilter)
+      // pack the 16-bit results back to 8 bits in each 128-bit lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height ; i++) {
+ src+= src_stride;
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(xfilter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // interleave the previously packed row with the current one
+ MERGE_WITH_SRC(src_pack, src_reg)
+ // filter the source
+ FILTER_SRC(yfilter)
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+ MERGE_WITH_SRC(src_pack, zero_reg)
+ src_pack = src_reg;
+ sec+= sec_stride;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst+= dst_stride;
+ }
+ }
+ }
+ CALC_SUM_AND_SSE
+ return sum;
}
--- a/vpx_dsp/x86/variance_impl_mmx.asm
+++ b/vpx_dsp/x86/variance_impl_mmx.asm
@@ -11,6 +11,8 @@
%include "vpx_ports/x86_abi_support.asm"
+%define mmx_filter_shift 7
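+; The bilinear coefficient pairs used by these routines sum to 128, so each
+; filtered value is rounded with mmx_bi_rd (64) and shifted right by 7.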
+
;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
global sym(vpx_get_mb_ss_mmx) PRIVATE
sym(vpx_get_mb_ss_mmx):
@@ -52,7 +54,6 @@
movsxd rcx, dword ptr [rsp+4]
add rax, rcx
-
; begin epilog
add rsp, 8
pop rdi
@@ -62,7 +63,6 @@
pop rbp
ret
-
;void vpx_get8x8var_mmx
;(
; unsigned char *src_ptr,
@@ -83,7 +83,6 @@
sub rsp, 16
; end prolog
-
pxor mm5, mm5 ; Blank mmx6
pxor mm6, mm6 ; Blank mmx7
pxor mm7, mm7 ; Blank mmx7
@@ -117,7 +116,6 @@
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
-
; Row 2
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
@@ -298,7 +296,6 @@
mov dword ptr [rdi], edx
xor rax, rax ; return 0
-
; begin epilog
add rsp, 16
pop rbx
@@ -308,8 +305,6 @@
pop rbp
ret
-
-
;void
;vpx_get4x4var_mmx
;(
@@ -331,7 +326,6 @@
sub rsp, 16
; end prolog
-
pxor mm5, mm5 ; Blank mmx6
pxor mm6, mm6 ; Blank mmx7
pxor mm7, mm7 ; Blank mmx7
@@ -354,7 +348,6 @@
movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
-
; Row 2
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher prrcision
@@ -393,7 +386,6 @@
pmaddwd mm0, mm0 ; square and accumulate
paddd mm7, mm0 ; accumulate in mm7
-
; Now accumulate the final results.
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
@@ -413,7 +405,6 @@
mov dword ptr [rdi], edx
xor rax, rax ; return 0
-
; begin epilog
add rsp, 16
pop rbx
@@ -422,3 +413,332 @@
UNSHADOW_ARGS
pop rbp
ret
+
+;void vpx_filter_block2d_bil4x4_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE
+sym(vpx_filter_block2d_bil4x4_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+
+ mov rax, arg(4) ;HFilter ;
+ mov rdx, arg(5) ;VFilter ;
+
+ mov rsi, arg(0) ;ref_ptr ;
+ mov rdi, arg(2) ;src_ptr ;
+
+ mov rcx, 4 ;
+ pxor mm0, mm0 ;
+
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm5, mm1
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rsi, r8
+%endif
+
+.filter_block2d_bil4x4_var_mmx_loop:
+
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm3, mm5 ;
+
+ movq mm5, mm1 ;
+ pmullw mm3, [rdx] ;
+
+ pmullw mm1, [rdx+8] ;
+ paddw mm1, mm3 ;
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm1, mmx_filter_shift ;
+
+ movd mm3, [rdi] ;
+ punpcklbw mm3, mm0 ;
+
+ psubw mm1, mm3 ;
+ paddw mm6, mm1 ;
+
+ pmaddwd mm1, mm1 ;
+ paddd mm7, mm1 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz .filter_block2d_bil4x4_var_mmx_loop ;
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rdi, arg(6) ;sum
+ mov rsi, arg(7) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ;
+ movd dword ptr [rsi], mm4 ;
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block2d_bil_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE
+sym(vpx_filter_block2d_bil_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov rax, arg(5) ;HFilter ;
+
+ mov rdx, arg(6) ;VFilter ;
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor mm0, mm0 ;
+ movq mm1, [rsi] ;
+
+ movq mm3, [rsi+1] ;
+ movq mm2, mm1 ;
+
+ movq mm4, mm3 ;
+ punpcklbw mm1, mm0 ;
+
+ punpckhbw mm2, mm0 ;
+ pmullw mm1, [rax] ;
+
+ pmullw mm2, [rax] ;
+ punpcklbw mm3, mm0 ;
+
+ punpckhbw mm4, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ pmullw mm4, [rax+8] ;
+ paddw mm1, mm3 ;
+
+ paddw mm2, mm4 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm2, mmx_filter_shift ;
+ movq mm5, mm1
+
+ packuswb mm5, mm2 ;
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ add rsi, r8
+%endif
+
+.filter_block2d_bil_var_mmx_loop:
+
+ movq mm1, [rsi] ;
+ movq mm3, [rsi+1] ;
+
+ movq mm2, mm1 ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm1, mm0 ;
+ punpckhbw mm2, mm0 ;
+
+ pmullw mm1, [rax] ;
+ pmullw mm2, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, [rax+8] ;
+ pmullw mm4, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm1, mmx_filter_shift ;
+
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, mm5 ;
+ movq mm4, mm5 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ movq mm5, mm1 ;
+ packuswb mm5, mm2 ;
+
+ pmullw mm3, [rdx] ;
+ pmullw mm4, [rdx] ;
+
+ pmullw mm1, [rdx+8] ;
+ pmullw mm2, [rdx+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, [rdi] ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ psubw mm1, mm3 ;
+ psubw mm2, mm4 ;
+
+ paddw mm6, mm1 ;
+ pmaddwd mm1, mm1 ;
+
+ paddw mm6, mm2 ;
+ pmaddwd mm2, mm2 ;
+
+ paddd mm7, mm1 ;
+ paddd mm7, mm2 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz .filter_block2d_bil_var_mmx_loop ;
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rdi, arg(7) ;sum
+ mov rsi, arg(8) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ;
+ movd dword ptr [rsi], mm4 ;
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+;short mmx_bi_rd[4] = { 64, 64, 64, 64};
+align 16
+mmx_bi_rd:
+ times 4 dw 64
--- a/vpx_dsp/x86/variance_mmx.c
+++ b/vpx_dsp/x86/variance_mmx.c
@@ -10,12 +10,45 @@
#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const int16_t, bilinear_filters_mmx[8][8]) = {
+ { 128, 128, 128, 128, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 112, 112, 112, 112 }
+};
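+
+// Each row of bilinear_filters_mmx holds the two taps for one eighth-pel
+// offset, replicated four times so a row can be loaded directly into an MMX
+// register. The taps in every row sum to 128, which the rounding constant in
+// the assembly (mmx_bi_rd = 64) and the post-filter shift normalize away.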
+
extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse, int *sum);
-unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
+extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const int16_t *HFilter,
+ const int16_t *VFilter,
+ int *sum,
+ unsigned int *sumsquared);
+
+extern void vpx_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ const int16_t *HFilter,
+ const int16_t *VFilter,
+ int *sum,
+ unsigned int *sumsquared);
+
+unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int var;
int avg;
@@ -25,8 +58,8 @@
return (var - (((unsigned int)avg * avg) >> 4));
}
-unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
+unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int var;
int avg;
@@ -37,8 +70,8 @@
return (var - (((unsigned int)avg * avg) >> 6));
}
-unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
+unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3;
@@ -55,8 +88,8 @@
return var;
}
-unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
+unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3, avg;
@@ -74,8 +107,8 @@
return (var - (((unsigned int)avg * avg) >> 8));
}
-unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
+unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
@@ -89,8 +122,8 @@
return (var - (((unsigned int)avg * avg) >> 7));
}
-unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
+unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
@@ -104,4 +137,113 @@
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
+}
+
+uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int xsum;
+ unsigned int xxsum;
+ vpx_filter_block2d_bil4x4_var_mmx(a, a_stride, b, b_stride,
+ bilinear_filters_mmx[xoffset],
+ bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum);
+ *sse = xxsum;
+ return (xxsum - (((unsigned int)xsum * xsum) >> 4));
+}
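+
+// The return value is the usual variance identity, variance = SSE - sum^2 / N,
+// with the division done as a shift: >> 4 because a 4x4 block has 16 pixels.
+// The wrappers below use >> 6 (8x8, 64 pixels), >> 8 (16x16, 256 pixels) and
+// >> 7 (16x8 and 8x16, 128 pixels) accordingly.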
+
+uint32_t vpx_sub_pixel_variance8x8_mmx(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int xsum;
+ uint32_t xxsum;
+ vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
+ bilinear_filters_mmx[xoffset],
+ bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum);
+ *sse = xxsum;
+ return (xxsum - (((uint32_t)xsum * xsum) >> 6));
+}
+
+uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+ vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
+ bilinear_filters_mmx[xoffset],
+ bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0);
+
+ vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 16,
+ bilinear_filters_mmx[xoffset],
+ bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1);
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
+}
+
+uint32_t vpx_sub_pixel_variance16x8_mmx(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+ vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
+ bilinear_filters_mmx[xoffset],
+ bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0);
+
+ vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 8,
+ bilinear_filters_mmx[xoffset],
+ bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1);
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 7));
+}
+
+uint32_t vpx_sub_pixel_variance8x16_mmx(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int xsum;
+ unsigned int xxsum;
+ vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
+ bilinear_filters_mmx[xoffset],
+ bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum);
+ *sse = xxsum;
+ return (xxsum - (((uint32_t)xsum * xsum) >> 7));
+}
+
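+// The half-pixel helpers below pass offset 4 because row 4 of
+// bilinear_filters_mmx holds taps { 64, 64 }, an equal-weight average of the
+// two neighboring pixels, so the generic eighth-pel path covers the half-pel
+// positions as well.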
+uint32_t vpx_variance_halfpixvar16x16_h_mmx(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 0, b, b_stride, sse);
+}
+
+uint32_t vpx_variance_halfpixvar16x16_v_mmx(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 0, 4, b, b_stride, sse);
+}
+
+uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 4, b, b_stride, sse);
}
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -307,3 +307,171 @@
vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
+
+#if CONFIG_USE_X86INC
+// The two unused parameters are placeholders for the PIC-enabled build.
+// These declarations are for functions defined in subpel_variance.asm.
+#define DECL(w, opt) \
+ int vpx_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ int height, unsigned int *sse, \
+ void *unused0, void *unused)
+#define DECLS(opt1, opt2) \
+ DECL(4, opt2); \
+ DECL(8, opt1); \
+ DECL(16, opt1)
+
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
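+
+// For reference, DECLS(sse2, sse) above expands DECL(16, sse2) into the strip
+// kernel prototype
+//   int vpx_sub_pixel_variance16xh_sse2(const uint8_t *src,
+//                                       ptrdiff_t src_stride,
+//                                       int x_offset, int y_offset,
+//                                       const uint8_t *dst,
+//                                       ptrdiff_t dst_stride,
+//                                       int height, unsigned int *sse,
+//                                       void *unused0, void *unused);
+// one such kernel exists per width (4, 8, 16) and instruction set.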
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
+ int src_stride, \
+ int x_offset, \
+ int y_offset, \
+ const uint8_t *dst, \
+ int dst_stride, \
+ unsigned int *sse_ptr) { \
+ unsigned int sse; \
+ int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, \
+ h, &sse, NULL, NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
+ x_offset, y_offset, \
+ dst + 16, dst_stride, \
+ h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
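+
+// FN builds a full w x h block from the wf-wide strip kernels declared above:
+// blocks wider than wf add strips at column offsets +16, +32 and +48, the
+// (se, sse) pairs are summed, and the last line applies
+// variance = sse - se*se / (w*h) via the (wlog2 + hlog2) shift. For example,
+// FN(64, 32, 16, 6, 5, sse2, (int64_t)) emits vpx_sub_pixel_variance64x32_sse2,
+// which calls vpx_sub_pixel_variance16xh_sse2 on four 16-wide columns of 32
+// rows. The int64_t cast is used for block sizes of 512 or more pixels, where
+// se*se can overflow 32 bits.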
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
+FN(16, 8, 16, 4, 3, opt1, (uint32_t)); \
+FN(8, 16, 8, 3, 4, opt1, (uint32_t)); \
+FN(8, 8, 8, 3, 3, opt1, (uint32_t)); \
+FN(8, 4, 8, 3, 2, opt1, (uint32_t)); \
+FN(4, 8, 4, 2, 3, opt2, (uint32_t)); \
+FN(4, 4, 4, 2, 2, opt2, (uint32_t))
+
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
+
+// The two unused parameters are placeholders for the PIC-enabled build.
+#define DECL(w, opt) \
+int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ const uint8_t *sec, \
+ ptrdiff_t sec_stride, \
+ int height, unsigned int *sse, \
+ void *unused0, void *unused)
+#define DECLS(opt1, opt2) \
+DECL(4, opt2); \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
+ int src_stride, \
+ int x_offset, \
+ int y_offset, \
+ const uint8_t *dst, \
+ int dst_stride, \
+ unsigned int *sseptr, \
+ const uint8_t *sec) { \
+ unsigned int sse; \
+ int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, \
+ sec, w, h, &sse, NULL, \
+ NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
+ x_offset, y_offset, \
+ dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2, \
+ NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2, \
+ NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2, \
+ NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sseptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
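+
+// Same column-stitching scheme as the plain sub-pixel variance macro above,
+// except the second predictor is threaded through: sec is advanced by the same
+// column offsets and passed with stride w, and the asm kernel averages it into
+// the filtered source block before taking the differences against dst.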
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
+FN(16, 8, 16, 4, 3, opt1, (uint32_t)); \
+FN(8, 16, 8, 3, 4, opt1, (uint32_t)); \
+FN(8, 8, 8, 3, 3, opt1, (uint32_t)); \
+FN(8, 4, 8, 3, 2, opt1, (uint32_t)); \
+FN(4, 8, 4, 2, 3, opt2, (uint32_t)); \
+FN(4, 4, 4, 2, 2, opt2, (uint32_t))
+
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
+#endif // CONFIG_USE_X86INC