shithub: libvpx

--- a/test/dct_test.cc

+++ b/test/dct_test.cc

@@ -639,12 +639,8 @@

 #endif

 #endif

   { &vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1 },

-// TODO(linfengz): reenable these functions once test vector failures are

-// addressed.

-#if 0

   { &vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 1 },

   { &vp9_fht16x16_c, &iht_wrapper<vp9_iht16x16_256_add_neon>, 16, 1 }

-#endif

};

 INSTANTIATE_TEST_CASE_P(

--- a/test/fdct8x8_test.cc

+++ b/test/fdct8x8_test.cc

@@ -675,9 +675,8 @@

                         ::testing::Values(make_tuple(&vpx_fdct8x8_neon,

                                                      &vpx_idct8x8_64_add_neon,

                                                      0, VPX_BITS_8)));

-// TODO(linfengz): reenable these functions once test vector failures are

-// addressed.

-#if 0   // !CONFIG_VP9_HIGHBITDEPTH

+#if !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     NEON, FwdTrans8x8HT,

     ::testing::Values(

--- a/vp9/common/arm/neon/vp9_iht16x16_add_neon.c

+++ b/vp9/common/arm/neon/vp9_iht16x16_add_neon.c

@@ -221,30 +221,10 @@

   x[15] = sub_dct_const_round_shift_low_8(s13, s15);

   // stage 4

-  {

-    const int16x8_t sum = vaddq_s16(x[2], x[3]);

-    const int16x8_t sub = vsubq_s16(x[2], x[3]);

-    x[2] = iadst_half_butterfly_neg_neon(sum, c_16_n16_8_24);

-    x[3] = iadst_half_butterfly_pos_neon(sub, c_16_n16_8_24);

-  }

-  {

-    const int16x8_t sum = vaddq_s16(x[7], x[6]);

-    const int16x8_t sub = vsubq_s16(x[7], x[6]);

-    x[6] = iadst_half_butterfly_pos_neon(sum, c_16_n16_8_24);

-    x[7] = iadst_half_butterfly_pos_neon(sub, c_16_n16_8_24);

-  }

-  {

-    const int16x8_t sum = vaddq_s16(x[11], x[10]);

-    const int16x8_t sub = vsubq_s16(x[11], x[10]);

-    x[10] = iadst_half_butterfly_pos_neon(sum, c_16_n16_8_24);

-    x[11] = iadst_half_butterfly_pos_neon(sub, c_16_n16_8_24);

-  }

-  {

-    const int16x8_t sum = vaddq_s16(x[14], x[15]);

-    const int16x8_t sub = vsubq_s16(x[14], x[15]);

-    x[14] = iadst_half_butterfly_neg_neon(sum, c_16_n16_8_24);

-    x[15] = iadst_half_butterfly_pos_neon(sub, c_16_n16_8_24);

-  }

+  iadst_half_butterfly_neg_neon(&x[3], &x[2], c_16_n16_8_24);

+  iadst_half_butterfly_pos_neon(&x[7], &x[6], c_16_n16_8_24);

+  iadst_half_butterfly_pos_neon(&x[11], &x[10], c_16_n16_8_24);

+  iadst_half_butterfly_neg_neon(&x[15], &x[14], c_16_n16_8_24);

   out[0] = x[0];

   out[1] = vnegq_s16(x[8]);

--- a/vp9/common/arm/neon/vp9_iht_neon.h

+++ b/vp9/common/arm/neon/vp9_iht_neon.h

@@ -59,34 +59,55 @@

 static INLINE void iadst_half_butterfly_neon(int16x8_t *const x,

                                              const int16x4_t c) {

-  const int16x8_t sum = vaddq_s16(x[0], x[1]);

-  const int16x8_t sub = vsubq_s16(x[0], x[1]);

+  // Don't add/sub before multiply, which will overflow in iadst8.

+  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0);

+  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0);

+  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0);

+  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0);

   int32x4_t t0[2], t1[2];

-  t0[0] = vmull_lane_s16(vget_low_s16(sum), c, 0);

-  t0[1] = vmull_lane_s16(vget_high_s16(sum), c, 0);

-  t1[0] = vmull_lane_s16(vget_low_s16(sub), c, 0);

-  t1[1] = vmull_lane_s16(vget_high_s16(sub), c, 0);

+  t0[0] = vaddq_s32(x0_lo, x1_lo);

+  t0[1] = vaddq_s32(x0_hi, x1_hi);

+  t1[0] = vsubq_s32(x0_lo, x1_lo);

+  t1[1] = vsubq_s32(x0_hi, x1_hi);

   x[0] = dct_const_round_shift_low_8(t0);

   x[1] = dct_const_round_shift_low_8(t1);

-static INLINE int16x8_t iadst_half_butterfly_neg_neon(const int16x8_t in,

-                                                      const int16x4_t c) {

-  int32x4_t t[2];

+static INLINE void iadst_half_butterfly_neg_neon(int16x8_t *const x0,

+                                                 int16x8_t *const x1,

+                                                 const int16x4_t c) {

+  // Don't add/sub before multiply, which will overflow in iadst8.

+  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 1);

+  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 1);

+  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 1);

+  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 1);

+  int32x4_t t0[2], t1[2];

-  t[0] = vmull_lane_s16(vget_low_s16(in), c, 1);

-  t[1] = vmull_lane_s16(vget_high_s16(in), c, 1);

-  return dct_const_round_shift_low_8(t);

+  t0[0] = vaddq_s32(x0_lo, x1_lo);

+  t0[1] = vaddq_s32(x0_hi, x1_hi);

+  t1[0] = vsubq_s32(x0_lo, x1_lo);

+  t1[1] = vsubq_s32(x0_hi, x1_hi);

+  *x1 = dct_const_round_shift_low_8(t0);

+  *x0 = dct_const_round_shift_low_8(t1);

-static INLINE int16x8_t iadst_half_butterfly_pos_neon(const int16x8_t in,

-                                                      const int16x4_t c) {

-  int32x4_t t[2];

+static INLINE void iadst_half_butterfly_pos_neon(int16x8_t *const x0,

+                                                 int16x8_t *const x1,

+                                                 const int16x4_t c) {

+  // Don't add/sub before multiply, which will overflow in iadst8.

+  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 0);

+  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 0);

+  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 0);

+  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 0);

+  int32x4_t t0[2], t1[2];

-  t[0] = vmull_lane_s16(vget_low_s16(in), c, 0);

-  t[1] = vmull_lane_s16(vget_high_s16(in), c, 0);

-  return dct_const_round_shift_low_8(t);

+  t0[0] = vaddq_s32(x0_lo, x1_lo);

+  t0[1] = vaddq_s32(x0_hi, x1_hi);

+  t1[0] = vsubq_s32(x0_lo, x1_lo);

+  t1[1] = vsubq_s32(x0_hi, x1_hi);

+  *x1 = dct_const_round_shift_low_8(t0);

+  *x0 = dct_const_round_shift_low_8(t1);

 static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0,

--- a/vp9/common/vp9_rtcd_defs.pl

+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -68,8 +68,8 @@

   # Note that there are more specializations appended when

   # CONFIG_VP9_HIGHBITDEPTH is off.

   specialize qw/vp9_iht4x4_16_add neon sse2/;

-  specialize qw/vp9_iht8x8_64_add sse2/;

-  specialize qw/vp9_iht16x16_256_add sse2/;

+  specialize qw/vp9_iht8x8_64_add neon sse2/;

+  specialize qw/vp9_iht16x16_256_add neon sse2/;

   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {

     # Note that these specializations are appended to the above ones.

     specialize qw/vp9_iht4x4_16_add dspr2 msa/;