shithub: libvpx

--- a/test/hadamard_test.cc

+++ b/test/hadamard_test.cc

@@ -302,6 +302,13 @@

 class Hadamard32x32Test : public HadamardTestBase {};

+void HadamardSpeedTest32x32(HadamardFunc const func, int times) {  // Speed-test driver for a 32x32 Hadamard function.

+  DECLARE_ALIGNED(16, int16_t, input[1024]);  // 1024 = 32 * 32 input values.

+  DECLARE_ALIGNED(16, tran_low_t, output[1024]);  // 1024 = 32 * 32 output values.

+  memset(input, 1, sizeof(input));  // Sets every byte to 0x01, so each int16_t holds 0x0101.

+  HadamardSpeedTest("Hadamard32x32", func, input, 32, output, times);  // 32 is presumably the input stride and times the call count; confirm against HadamardSpeedTest.

+}

 TEST_P(Hadamard32x32Test, CompareReferenceRandom) {

   CompareReferenceRandom<32>();

@@ -308,6 +315,17 @@

 TEST_P(Hadamard32x32Test, VaryStride) { VaryStride<32>(); }

+TEST_P(Hadamard32x32Test, DISABLED_Speed) {  // DISABLED_ prefix: gtest skips this unless --gtest_also_run_disabled_tests is passed.

+  HadamardSpeedTest32x32(h_func_, 10);  // Same function under test at three increasing values of `times`.

+  HadamardSpeedTest32x32(h_func_, 10000);

+  HadamardSpeedTest32x32(h_func_, 10000000);

+}

 INSTANTIATE_TEST_CASE_P(C, Hadamard32x32Test,

                         ::testing::Values(&vpx_hadamard_32x32_c));  // Runs the Hadamard32x32Test cases on the C implementation.

+#if HAVE_SSE2

+INSTANTIATE_TEST_CASE_P(SSE2, Hadamard32x32Test,

+                        ::testing::Values(&vpx_hadamard_32x32_sse2));  // Same test cases on the SSE2 implementation.

+#endif  // HAVE_SSE2

 }  // namespace

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -783,7 +783,7 @@

     specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;

     add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";

-    specialize qw/vpx_hadamard_32x32/;

+    specialize qw/vpx_hadamard_32x32 sse2/;

     add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";

     specialize qw/vpx_satd avx2 sse2 neon/;

@@ -795,7 +795,7 @@

     specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;

     add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";

-    specialize qw/vpx_hadamard_32x32/;

+    specialize qw/vpx_hadamard_32x32 sse2/;

     add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";

     specialize qw/vpx_satd avx2 sse2 neon msa/;

--- a/vpx_dsp/x86/avg_intrin_sse2.c

+++ b/vpx_dsp/x86/avg_intrin_sse2.c

@@ -372,6 +372,45 @@

+void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,

+                             tran_low_t *coeff) {  // 32x32 Hadamard: four 16x16 calls, then an in-place combining pass over coeff.

+  int idx;

+  for (idx = 0; idx < 4; ++idx) {  // One 16x16 call per quadrant of the 32x32 source.

+    const int16_t *src_ptr =

+        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;  // idx >> 1 selects the row half, idx & 1 the column half.

+    vpx_hadamard_16x16_sse2(src_ptr, src_stride, coeff + idx * 256);  // Quadrant idx's output starts at coeff + idx * 256.

+  }

+  for (idx = 0; idx < 256; idx += 8) {  // 32 iterations; coeff advances by 8 per iteration (see end of loop).

+    __m128i coeff0 = load_tran_low(coeff);  // Same offset is read from each of the four 256-entry regions.

+    __m128i coeff1 = load_tran_low(coeff + 256);

+    __m128i coeff2 = load_tran_low(coeff + 512);

+    __m128i coeff3 = load_tran_low(coeff + 768);

+    __m128i b0 = _mm_add_epi16(coeff0, coeff1);  // NOTE(review): 16-bit lane add/sub wraps on overflow; confirm the 16x16 outputs cannot overflow int16 here.

+    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);

+    __m128i b2 = _mm_add_epi16(coeff2, coeff3);

+    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

+    b0 = _mm_srai_epi16(b0, 2);  // Arithmetic right shift of each 16-bit lane by 2.

+    b1 = _mm_srai_epi16(b1, 2);

+    b2 = _mm_srai_epi16(b2, 2);

+    b3 = _mm_srai_epi16(b3, 2);

+    coeff0 = _mm_add_epi16(b0, b2);  // Second add/sub stage, combining the two pairs.

+    coeff1 = _mm_add_epi16(b1, b3);

+    store_tran_low(coeff0, coeff);  // Results are written back to the offsets they were loaded from.

+    store_tran_low(coeff1, coeff + 256);

+    coeff2 = _mm_sub_epi16(b0, b2);

+    coeff3 = _mm_sub_epi16(b1, b3);

+    store_tran_low(coeff2, coeff + 512);

+    store_tran_low(coeff3, coeff + 768);

+    coeff += 8;  // Presumably 8 coefficients are handled per load/store; confirm against load_tran_low.

+  }

+}

 int vpx_satd_sse2(const tran_low_t *coeff, int length) {

   int i;

   const __m128i zero = _mm_setzero_si128();