shithub: libvpx

Download patch

ref: ddb3d7a8a182bf0179b8c96d6d1a242ba5e4df6e
parent: 881c8ec816eb0790613dc674ee8f7a0e102e1835
parent: d8424d2890ddce0d92778e055eee145655cf034e
author: Linfeng Zhang <[email protected]>
date: Thu Mar 15 14:05:08 EDT 2018

Merge changes I9e0bf2c7,I695b4090

* changes:
  Fix a bug in vp9_highbd_iht8x8_64_add_neon
  Fix a bug in vp9_highbd_iht4x4_16_add_neon()

--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -629,14 +629,10 @@
 
 static const FuncInfo ht_neon_func_info[] = {
 #if CONFIG_VP9_HIGHBITDEPTH
-// TODO(linfengz): reenable these functions once test vector failures are
-// addressed.
-#if 0
   { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4,
     2 },
   { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>, 8,
     2 },
-#endif
 #endif
   { &vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1 },
   { &vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 1 },
--- a/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
+++ b/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
@@ -23,34 +23,55 @@
 static INLINE void highbd_iadst4(int32x4_t *const io) {
   const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 };
   const int32x4_t sinpi = vld1q_s32(sinpis);
-  int32x4_t s[8];
+  int64x2x2_t s[7], t[4];
+  int32x4_t s7;
 
-  s[0] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 0);
-  s[1] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 1);
-  s[2] = vmulq_lane_s32(io[1], vget_high_s32(sinpi), 0);
-  s[3] = vmulq_lane_s32(io[2], vget_high_s32(sinpi), 1);
-  s[4] = vmulq_lane_s32(io[2], vget_low_s32(sinpi), 0);
-  s[5] = vmulq_lane_s32(io[3], vget_low_s32(sinpi), 1);
-  s[6] = vmulq_lane_s32(io[3], vget_high_s32(sinpi), 1);
-  s[7] = vsubq_s32(io[0], io[2]);
-  s[7] = vaddq_s32(s[7], io[3]);
+  s[0].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 0);
+  s[0].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 0);
+  s[1].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 1);
+  s[1].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 1);
+  s[2].val[0] = vmull_lane_s32(vget_low_s32(io[1]), vget_high_s32(sinpi), 0);
+  s[2].val[1] = vmull_lane_s32(vget_high_s32(io[1]), vget_high_s32(sinpi), 0);
+  s[3].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_high_s32(sinpi), 1);
+  s[3].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_high_s32(sinpi), 1);
+  s[4].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_low_s32(sinpi), 0);
+  s[4].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_low_s32(sinpi), 0);
+  s[5].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_low_s32(sinpi), 1);
+  s[5].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_low_s32(sinpi), 1);
+  s[6].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_high_s32(sinpi), 1);
+  s[6].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_high_s32(sinpi), 1);
+  s7 = vsubq_s32(io[0], io[2]);
+  s7 = vaddq_s32(s7, io[3]);
 
-  s[0] = vaddq_s32(s[0], s[3]);
-  s[0] = vaddq_s32(s[0], s[5]);
-  s[1] = vsubq_s32(s[1], s[4]);
-  s[1] = vsubq_s32(s[1], s[6]);
+  s[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]);
+  s[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]);
+  s[0].val[0] = vaddq_s64(s[0].val[0], s[5].val[0]);
+  s[0].val[1] = vaddq_s64(s[0].val[1], s[5].val[1]);
+  s[1].val[0] = vsubq_s64(s[1].val[0], s[4].val[0]);
+  s[1].val[1] = vsubq_s64(s[1].val[1], s[4].val[1]);
+  s[1].val[0] = vsubq_s64(s[1].val[0], s[6].val[0]);
+  s[1].val[1] = vsubq_s64(s[1].val[1], s[6].val[1]);
   s[3] = s[2];
-  s[2] = vmulq_lane_s32(s[7], vget_high_s32(sinpi), 0);
+  s[2].val[0] = vmull_lane_s32(vget_low_s32(s7), vget_high_s32(sinpi), 0);
+  s[2].val[1] = vmull_lane_s32(vget_high_s32(s7), vget_high_s32(sinpi), 0);
 
-  io[0] = vaddq_s32(s[0], s[3]);
-  io[1] = vaddq_s32(s[1], s[3]);
-  io[2] = s[2];
-  io[3] = vaddq_s32(s[0], s[1]);
-  io[3] = vsubq_s32(io[3], s[3]);
-  io[0] = vrshrq_n_s32(io[0], DCT_CONST_BITS);
-  io[1] = vrshrq_n_s32(io[1], DCT_CONST_BITS);
-  io[2] = vrshrq_n_s32(io[2], DCT_CONST_BITS);
-  io[3] = vrshrq_n_s32(io[3], DCT_CONST_BITS);
+  t[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]);
+  t[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]);
+  t[1].val[0] = vaddq_s64(s[1].val[0], s[3].val[0]);
+  t[1].val[1] = vaddq_s64(s[1].val[1], s[3].val[1]);
+  t[2] = s[2];
+  t[3].val[0] = vaddq_s64(s[0].val[0], s[1].val[0]);
+  t[3].val[1] = vaddq_s64(s[0].val[1], s[1].val[1]);
+  t[3].val[0] = vsubq_s64(t[3].val[0], s[3].val[0]);
+  t[3].val[1] = vsubq_s64(t[3].val[1], s[3].val[1]);
+  io[0] = vcombine_s32(vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS));
+  io[1] = vcombine_s32(vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS));
+  io[2] = vcombine_s32(vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS));
+  io[3] = vcombine_s32(vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS));
 }
 
 void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
--- a/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c
+++ b/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c
@@ -132,10 +132,10 @@
   return vcombine_s32(out_lo, out_hi);
 }
 
-static INLINE void iadst8_bd10(int32x4_t *const io0, int32x4_t *const io1,
-                               int32x4_t *const io2, int32x4_t *const io3,
-                               int32x4_t *const io4, int32x4_t *const io5,
-                               int32x4_t *const io6, int32x4_t *const io7) {
+static INLINE void highbd_iadst8(int32x4_t *const io0, int32x4_t *const io1,
+                                 int32x4_t *const io2, int32x4_t *const io3,
+                                 int32x4_t *const io4, int32x4_t *const io5,
+                                 int32x4_t *const io6, int32x4_t *const io7) {
   const int32x4_t c0 =
       create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64);
   const int32x4_t c1 =
@@ -143,80 +143,6 @@
   const int32x4_t c2 =
       create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64);
   int32x4_t x[8], t[4];
-  int32x4_t s[8];
-
-  x[0] = *io7;
-  x[1] = *io0;
-  x[2] = *io5;
-  x[3] = *io2;
-  x[4] = *io3;
-  x[5] = *io4;
-  x[6] = *io1;
-  x[7] = *io6;
-
-  // stage 1
-  iadst_butterfly_lane_0_1_bd10_neon(x[0], x[1], vget_low_s32(c0), &s[0],
-                                     &s[1]);
-  iadst_butterfly_lane_0_1_bd10_neon(x[2], x[3], vget_high_s32(c0), &s[2],
-                                     &s[3]);
-  iadst_butterfly_lane_0_1_bd10_neon(x[4], x[5], vget_low_s32(c1), &s[4],
-                                     &s[5]);
-  iadst_butterfly_lane_0_1_bd10_neon(x[6], x[7], vget_high_s32(c1), &s[6],
-                                     &s[7]);
-
-  x[0] = add_dct_const_round_shift_low_8_bd10(s[0], s[4]);
-  x[1] = add_dct_const_round_shift_low_8_bd10(s[1], s[5]);
-  x[2] = add_dct_const_round_shift_low_8_bd10(s[2], s[6]);
-  x[3] = add_dct_const_round_shift_low_8_bd10(s[3], s[7]);
-  x[4] = sub_dct_const_round_shift_low_8_bd10(s[0], s[4]);
-  x[5] = sub_dct_const_round_shift_low_8_bd10(s[1], s[5]);
-  x[6] = sub_dct_const_round_shift_low_8_bd10(s[2], s[6]);
-  x[7] = sub_dct_const_round_shift_low_8_bd10(s[3], s[7]);
-
-  // stage 2
-  t[0] = x[0];
-  t[1] = x[1];
-  t[2] = x[2];
-  t[3] = x[3];
-  iadst_butterfly_lane_0_1_bd10_neon(x[4], x[5], vget_high_s32(c2), &s[4],
-                                     &s[5]);
-  iadst_butterfly_lane_1_0_bd10_neon(x[7], x[6], vget_high_s32(c2), &s[7],
-                                     &s[6]);
-
-  x[0] = vaddq_s32(t[0], t[2]);
-  x[1] = vaddq_s32(t[1], t[3]);
-  x[2] = vsubq_s32(t[0], t[2]);
-  x[3] = vsubq_s32(t[1], t[3]);
-  x[4] = add_dct_const_round_shift_low_8_bd10(s[4], s[6]);
-  x[5] = add_dct_const_round_shift_low_8_bd10(s[5], s[7]);
-  x[6] = sub_dct_const_round_shift_low_8_bd10(s[4], s[6]);
-  x[7] = sub_dct_const_round_shift_low_8_bd10(s[5], s[7]);
-
-  // stage 3
-  iadst_half_butterfly_bd10_neon(x + 2, vget_low_s32(c2));
-  iadst_half_butterfly_bd10_neon(x + 6, vget_low_s32(c2));
-
-  *io0 = x[0];
-  *io1 = vnegq_s32(x[4]);
-  *io2 = x[6];
-  *io3 = vnegq_s32(x[2]);
-  *io4 = x[3];
-  *io5 = vnegq_s32(x[7]);
-  *io6 = x[5];
-  *io7 = vnegq_s32(x[1]);
-}
-
-static INLINE void iadst8_bd12(int32x4_t *const io0, int32x4_t *const io1,
-                               int32x4_t *const io2, int32x4_t *const io3,
-                               int32x4_t *const io4, int32x4_t *const io5,
-                               int32x4_t *const io6, int32x4_t *const io7) {
-  const int32x4_t c0 =
-      create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64);
-  const int32x4_t c1 =
-      create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64);
-  const int32x4_t c2 =
-      create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64);
-  int32x4_t x[8], t[4];
   int64x2_t s[8][2];
 
   x[0] = *io7;
@@ -394,31 +320,17 @@
         const int32x4_t cospis1 =
             vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28
 
-        if (bd == 10) {
-          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
-                                 &a[4], &a[5], &a[6], &a[7]);
-          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
-                                 &a[12], &a[13], &a[14], &a[15]);
-          transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
-                            &a[11]);
-          iadst8_bd10(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
-          transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
-                            &a[15]);
-          iadst8_bd10(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
+        idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+                               &a[4], &a[5], &a[6], &a[7]);
+        idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
+                               &a[12], &a[13], &a[14], &a[15]);
+        transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
+                          &a[11]);
+        highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
+        transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
+                          &a[15]);
+        highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                       &a[15]);
-        } else {
-          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
-                                 &a[4], &a[5], &a[6], &a[7]);
-          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
-                                 &a[12], &a[13], &a[14], &a[15]);
-          transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
-                            &a[11]);
-          iadst8_bd12(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
-          transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
-                            &a[15]);
-          iadst8_bd12(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
-                      &a[15]);
-        }
         break;
       }
 
@@ -427,67 +339,36 @@
         const int32x4_t cospis1 =
             vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28
 
-        if (bd == 10) {
-          transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
-                            &a[7]);
-          iadst8_bd10(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
-          transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13],
-                            &a[14], &a[15]);
-          iadst8_bd10(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
+        transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
+                          &a[7]);
+        highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+        transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
+                          &a[15]);
+        highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
                       &a[15]);
-          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
-                                 &a[2], &a[10], &a[3], &a[11]);
-          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
-                                 &a[6], &a[14], &a[7], &a[15]);
-        } else {
-          transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
-                            &a[7]);
-          iadst8_bd12(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
-          transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13],
-                            &a[14], &a[15]);
-          iadst8_bd12(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
-                      &a[15]);
-          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
-                                 &a[2], &a[10], &a[3], &a[11]);
-          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
-                                 &a[6], &a[14], &a[7], &a[15]);
-        }
+        idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
+                               &a[2], &a[10], &a[3], &a[11]);
+        idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
+                               &a[6], &a[14], &a[7], &a[15]);
         break;
       }
 
       default: {
         assert(tx_type == ADST_ADST);
-        if (bd == 10) {
-          transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
-                            &a[7]);
-          iadst8_bd10(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
-          transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13],
-                            &a[14], &a[15]);
-          iadst8_bd10(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
+        transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
+                          &a[7]);
+        highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+        transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
+                          &a[15]);
+        highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
                       &a[15]);
-          transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
-                            &a[11]);
-          iadst8_bd10(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
-          transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
-                            &a[15]);
-          iadst8_bd10(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
+        transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
+                          &a[11]);
+        highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
+        transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
+                          &a[15]);
+        highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                       &a[15]);
-        } else {
-          transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
-                            &a[7]);
-          iadst8_bd12(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
-          transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13],
-                            &a[14], &a[15]);
-          iadst8_bd12(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
-                      &a[15]);
-          transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
-                            &a[11]);
-          iadst8_bd12(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
-          transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
-                            &a[15]);
-          iadst8_bd12(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
-                      &a[15]);
-        }
         break;
       }
     }
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -103,8 +103,8 @@
   add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";
 
   if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
-    specialize qw/vp9_highbd_iht4x4_16_add sse4_1/;
-    specialize qw/vp9_highbd_iht8x8_64_add sse4_1/;
+    specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/;
+    specialize qw/vp9_highbd_iht8x8_64_add neon sse4_1/;
     specialize qw/vp9_highbd_iht16x16_256_add sse4_1/;
   }
 }