shithub: libvpx

Download patch

ref: dd2c6cc34af8352f41b234eab8b4d9b9d1bfa9c4
parent: 9e6fa9bfb8296ce9fe0ad94bf63c5335f8ca35dc
author: Scott LaVarnway <[email protected]>
date: Mon Jul 23 10:18:52 EDT 2018

VPX: Improve HBD vpx_hadamard_32x32_avx2()

~14% improvement.

BUG=webm:1546

Change-Id: I0b25f62f053e13c2185e4e8bd54e52250251efd0

--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -131,9 +131,9 @@
                       _mm256_permute2x128_si256(src[6], src[7], 0x31));
 }
 
-void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride,
-                             tran_low_t *coeff) {
-  int idx;
+static INLINE void hadamard_16x16_avx2(int16_t const *src_diff,
+                                       ptrdiff_t src_stride, tran_low_t *coeff,
+                                       int is_final) {
 #if CONFIG_VP9_HIGHBITDEPTH
   DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
   int16_t *t_coeff = temp_coeff;
@@ -140,7 +140,8 @@
 #else
   int16_t *t_coeff = coeff;
 #endif
-
+  int16_t *coeff16 = (int16_t *)coeff;
+  int idx;
   for (idx = 0; idx < 2; ++idx) {
     int16_t const *src_ptr = src_diff + idx * 8 * src_stride;
     hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
@@ -161,32 +162,54 @@
     b1 = _mm256_srai_epi16(b1, 1);
     b2 = _mm256_srai_epi16(b2, 1);
     b3 = _mm256_srai_epi16(b3, 1);
-
-    store_tran_low(_mm256_add_epi16(b0, b2), coeff);
-    store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
-    store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
-    store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
-
-    coeff += 16;
+    if (is_final) {
+      store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+      store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
+      store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
+      store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
+      coeff += 16;
+    } else {
+      _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
+      _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
+      _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2));
+      _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3));
+      coeff16 += 16;
+    }
     t_coeff += 16;
   }
 }
 
+void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride,
+                             tran_low_t *coeff) {
+  hadamard_16x16_avx2(src_diff, src_stride, coeff, 1);
+}
+
 void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                              tran_low_t *coeff) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  // For high bitdepth builds, avoid the store_tran_low() (mult/unpack/store)
+  // and load_tran_low() (load/pack) round trip between stages: write the
+  // 16-bit first-stage output to an intermediate buffer and only call
+  // store_tran_low() in the final stage.
+  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
+  int16_t *t_coeff = temp_coeff;
+#else
+  int16_t *t_coeff = coeff;
+#endif
   int idx;
   for (idx = 0; idx < 4; ++idx) {
     // src_diff: 9 bit, dynamic range [-255, 255]
     const int16_t *src_ptr =
         src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
-    vpx_hadamard_16x16_avx2(src_ptr, src_stride, coeff + idx * 256);
+    hadamard_16x16_avx2(src_ptr, src_stride,
+                        (tran_low_t *)(t_coeff + idx * 256), 0);
   }
 
   for (idx = 0; idx < 256; idx += 16) {
-    const __m256i coeff0 = load_tran_low(coeff);
-    const __m256i coeff1 = load_tran_low(coeff + 256);
-    const __m256i coeff2 = load_tran_low(coeff + 512);
-    const __m256i coeff3 = load_tran_low(coeff + 768);
+    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
 
     __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
     __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
@@ -204,6 +227,7 @@
     store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);
 
     coeff += 16;
+    t_coeff += 16;
   }
 }