shithub: libvpx

Download patch

ref: 43a30d3a1a6b627fa05ba63f4c51414ced781ccb
parent: 4a8c248744500f9caf00588ca312efce5659e45e
author: Johann <[email protected]>
date: Mon Nov 12 06:30:03 EST 2018

quantize: use aarch64 vmaxv

Simplify the max value calculation on aarch64 by using vmaxv. This is much
faster for 4x4, but with diminishing returns as the block size grows.

Only the vp9 quantize has a speed test hooked up. Anticipate
similar results for the other quantize versions.

Before:
[ RUN      ] NEON/VP9QuantizeTest.DISABLED_Speed/2
[    BENCH ]      Bypass calculations       4x4  31.6 ms ( ±0.0 ms )
[    BENCH ]        Full calculations       4x4  31.6 ms ( ±0.0 ms )
[    BENCH ]      Bypass calculations       8x8  17.7 ms ( ±0.0 ms )
[    BENCH ]        Full calculations       8x8  17.7 ms ( ±0.0 ms )
[    BENCH ]      Bypass calculations     16x16  14.2 ms ( ±0.0 ms )
[    BENCH ]        Full calculations     16x16  14.2 ms ( ±0.0 ms )
[       OK ] NEON/VP9QuantizeTest.DISABLED_Speed/2 (1906 ms)
[ RUN      ] NEON/VP9QuantizeTest.DISABLED_Speed/3
[    BENCH ]      Bypass calculations     32x32  18.6 ms ( ±0.0 ms )
[    BENCH ]        Full calculations     32x32  18.6 ms ( ±0.0 ms )

After:
[ RUN      ] NEON/VP9QuantizeTest.DISABLED_Speed/2
[    BENCH ]      Bypass calculations       4x4  29.1 ms ( ±0.0 ms )
[    BENCH ]        Full calculations       4x4  29.1 ms ( ±0.0 ms )
[    BENCH ]      Bypass calculations       8x8  16.9 ms ( ±0.0 ms )
[    BENCH ]        Full calculations       8x8  16.9 ms ( ±0.0 ms )
[    BENCH ]      Bypass calculations     16x16  14.1 ms ( ±0.0 ms )
[    BENCH ]        Full calculations     16x16  14.1 ms ( ±0.0 ms )
[       OK ] NEON/VP9QuantizeTest.DISABLED_Speed/2 (1803 ms)
[ RUN      ] NEON/VP9QuantizeTest.DISABLED_Speed/3
[    BENCH ]      Bypass calculations     32x32  18.6 ms ( ±0.0 ms )
[    BENCH ]        Full calculations     32x32  18.6 ms ( ±0.0 ms )

Change-Id: Ic95812b3fdbd4e47b4dcb8ed46c68a9617de38d2

--- a/vp8/encoder/arm/neon/fastquantizeb_neon.c
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -26,9 +26,11 @@
                    zig_zag1 = vld1q_u16(inv_zig_zag + 8);
   int16x8_t x0, x1, sz0, sz1, y0, y1;
   uint16x8_t eob0, eob1;
+#ifndef __aarch64__
   uint16x4_t eob_d16;
   uint32x2_t eob_d32;
   uint32x4_t eob_q32;
+#endif  // __aarch64__
 
   /* sign of z: z >> 15 */
   sz0 = vshrq_n_s16(z0, 15);
@@ -66,11 +68,17 @@
 
   /* select the largest value */
   eob0 = vmaxq_u16(eob0, eob1);
+#ifdef __aarch64__
+  *d->eob = (int8_t)vmaxvq_u16(eob0);
+#else
   eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
   eob_q32 = vmovl_u16(eob_d16);
   eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32));
   eob_d32 = vpmax_u32(eob_d32, eob_d32);
 
+  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
+#endif  // __aarch64__
+
   /* qcoeff = x */
   vst1q_s16(d->qcoeff, x0);
   vst1q_s16(d->qcoeff + 8, x1);
@@ -78,6 +86,4 @@
   /* dqcoeff = x * dequant */
   vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0));
   vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1));
-
-  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
 }
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -97,6 +97,9 @@
     store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);
     store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff);
   }
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_s16(v_eobmax_76543210);
+#else
   {
     const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
                                              vget_high_s16(v_eobmax_76543210));
@@ -111,6 +114,7 @@
 
     *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
   }
+#endif  // __aarch64__
 }
 
 static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
@@ -226,6 +230,9 @@
       dqcoeff_ptr += 8;
     }
 
+#ifdef __aarch64__
+    *eob_ptr = vmaxvq_u16(eob_max);
+#else
     {
       const uint16x4_t eob_max_0 =
           vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
@@ -233,5 +240,6 @@
       const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
       vst1_lane_u16(eob_ptr, eob_max_2, 0);
     }
+#endif  // __aarch64__
   }
 }
--- a/vpx_dsp/arm/quantize_neon.c
+++ b/vpx_dsp/arm/quantize_neon.c
@@ -135,6 +135,9 @@
     } while (n_coeffs > 0);
   }
 
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
   {
     const uint16x4_t eob_max_0 =
         vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
@@ -142,6 +145,7 @@
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
+#endif  // __aarch64__
 }
 
 static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
@@ -288,6 +292,9 @@
     }
   }
 
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
   {
     const uint16x4_t eob_max_0 =
         vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
@@ -295,4 +302,5 @@
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
+#endif  // __aarch64__
 }