shithub: libvpx

--- a/test/partial_idct_test.cc

+++ b/test/partial_idct_test.cc

@@ -450,6 +450,24 @@

 #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE

 const PartialInvTxfmParam neon_partial_idct_tests[] = {

 #if CONFIG_VP9_HIGHBITDEPTH

+  make_tuple(

+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,

+      &highbd_wrapper<vpx_highbd_idct32x32_1_add_neon>, TX_32X32, 1, 8, 2),

+  make_tuple(

+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,

+      &highbd_wrapper<vpx_highbd_idct32x32_1_add_neon>, TX_32X32, 1, 10, 2),

+  make_tuple(

+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,

+      &highbd_wrapper<vpx_highbd_idct32x32_1_add_neon>, TX_32X32, 1, 12, 2),

+  make_tuple(

+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,

+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_neon>, TX_16X16, 1, 8, 2),

+  make_tuple(

+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,

+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_neon>, TX_16X16, 1, 10, 2),

+  make_tuple(

+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,

+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_neon>, TX_16X16, 1, 12, 2),

   make_tuple(&vpx_highbd_fdct8x8_c,

              &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,

              &highbd_wrapper<vpx_highbd_idct8x8_64_add_neon>, TX_8X8, 64, 8, 2),

--- /dev/null

+++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c

@@ -1,0 +1,73 @@

+/*

+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <arm_neon.h>

+#include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/arm/idct_neon.h"

+#include "vpx_dsp/inv_txfm.h"

+static INLINE void highbd_idct16x16_1_add_pos_kernel(uint16_t **dest,  // Adds res to the 16 uint16_t values at *dest, caps each sum at max, then steps *dest.

+                                                     const int stride,  // Element offset added to *dest after the stores.

+                                                     const int16x8_t res,  // Added to every lane; the caller in this file only passes a non-negative value.

+                                                     const int16x8_t max) {  // Per-lane upper bound for the sums.

+  const uint16x8_t a0 = vld1q_u16(*dest);  // Elements 0..7.

+  const uint16x8_t a1 = vld1q_u16(*dest + 8);  // Elements 8..15.

+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));  // Sum is formed in signed 16-bit lanes.

+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));

+  const int16x8_t c0 = vminq_s16(b0, max);  // Upper clamp only; no clamp at zero is applied in this kernel.

+  const int16x8_t c1 = vminq_s16(b1, max);

+  vst1q_u16(*dest, vreinterpretq_u16_s16(c0));

+  vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));

+  *dest += stride;  // Updates the caller's pointer so the next call handles the next position.

+}

+static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest,  // Adds res to the 16 uint16_t values at *dest, saturates to unsigned, then steps *dest.

+                                                     const int stride,  // Element offset added to *dest after the stores.

+                                                     const int16x8_t res) {  // Added to every lane; the caller in this file only passes a negative value.

+  const uint16x8_t a0 = vld1q_u16(*dest);  // Elements 0..7.

+  const uint16x8_t a1 = vld1q_u16(*dest + 8);  // Elements 8..15.

+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));  // Sum is formed in signed 16-bit lanes.

+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));

+  const uint16x8_t c0 = vqshluq_n_s16(b0, 0);  // Shift by 0 with signed-to-unsigned saturation: negative lanes become 0.

+  const uint16x8_t c1 = vqshluq_n_s16(b1, 0);

+  vst1q_u16(*dest, c0);

+  vst1q_u16(*dest + 8, c1);

+  *dest += stride;  // Updates the caller's pointer so the next call handles the next position.

+}

+void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8,  // Derives one value from input[0] and adds it to 16 runs of 16 uint16_t values at dest8.

+                                     int stride, int bd) {  // stride: element step between runs; bd: passed to HIGHBD_WRAPLOW and used for the (1 << bd) - 1 cap (presumably the bit depth).

+  const tran_low_t out0 =  // Only input[0] is read from the coefficient buffer.

+      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

+  const tran_low_t out1 =  // Same scale-and-round step applied a second time.

+      HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);

+  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);  // Stored as int16_t so it can be broadcast to 16-bit lanes.

+  const int16x8_t dc = vdupq_n_s16(a1);  // Same value in all eight lanes.

+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // Pixels are accessed through this uint16_t pointer, not dest8 itself.

+  int i;

+  if (a1 >= 0) {  // Sign of a1 selects the kernel: non-negative uses the upper-cap kernel.

+    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);  // Upper bound handed to the pos kernel.

+    for (i = 0; i < 4; ++i) {  // 4 iterations x 4 calls = 16 kernel calls.

+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);

+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);

+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);

+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);

+    }

+  } else {  // Negative a1: the neg kernel saturates sums to unsigned and takes no max.

+    for (i = 0; i < 4; ++i) {  // 4 iterations x 4 calls = 16 kernel calls.

+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);

+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);

+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);

+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);

+    }

+  }

+}

--- /dev/null

+++ b/vpx_dsp/arm/highbd_idct32x32_add_neon.c

@@ -1,0 +1,89 @@

+/*

+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <arm_neon.h>

+#include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/arm/idct_neon.h"

+#include "vpx_dsp/inv_txfm.h"

+static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest,  // Adds res to the 32 uint16_t values at *dest, caps each sum at max, then steps *dest.

+                                                     const int stride,  // Element offset added to *dest after the stores.

+                                                     const int16x8_t res,  // Added to every lane; the caller in this file only passes a non-negative value.

+                                                     const int16x8_t max) {  // Per-lane upper bound for the sums.

+  const uint16x8_t a0 = vld1q_u16(*dest);  // Elements 0..7.

+  const uint16x8_t a1 = vld1q_u16(*dest + 8);  // Elements 8..15.

+  const uint16x8_t a2 = vld1q_u16(*dest + 16);  // Elements 16..23.

+  const uint16x8_t a3 = vld1q_u16(*dest + 24);  // Elements 24..31.

+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));  // Sums are formed in signed 16-bit lanes.

+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));

+  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));

+  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));

+  const int16x8_t c0 = vminq_s16(b0, max);  // Upper clamp only; no clamp at zero is applied in this kernel.

+  const int16x8_t c1 = vminq_s16(b1, max);

+  const int16x8_t c2 = vminq_s16(b2, max);

+  const int16x8_t c3 = vminq_s16(b3, max);

+  vst1q_u16(*dest, vreinterpretq_u16_s16(c0));

+  vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));

+  vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2));

+  vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3));

+  *dest += stride;  // Updates the caller's pointer so the next call handles the next position.

+}

+static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest,  // Adds res to the 32 uint16_t values at *dest, saturates to unsigned, then steps *dest.

+                                                     const int stride,  // Element offset added to *dest after the stores.

+                                                     const int16x8_t res) {  // Added to every lane; the caller in this file only passes a negative value.

+  const uint16x8_t a0 = vld1q_u16(*dest);  // Elements 0..7.

+  const uint16x8_t a1 = vld1q_u16(*dest + 8);  // Elements 8..15.

+  const uint16x8_t a2 = vld1q_u16(*dest + 16);  // Elements 16..23.

+  const uint16x8_t a3 = vld1q_u16(*dest + 24);  // Elements 24..31.

+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));  // Sums are formed in signed 16-bit lanes.

+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));

+  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));

+  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));

+  const uint16x8_t c0 = vqshluq_n_s16(b0, 0);  // Shift by 0 with signed-to-unsigned saturation: negative lanes become 0.

+  const uint16x8_t c1 = vqshluq_n_s16(b1, 0);

+  const uint16x8_t c2 = vqshluq_n_s16(b2, 0);

+  const uint16x8_t c3 = vqshluq_n_s16(b3, 0);

+  vst1q_u16(*dest, c0);

+  vst1q_u16(*dest + 8, c1);

+  vst1q_u16(*dest + 16, c2);

+  vst1q_u16(*dest + 24, c3);

+  *dest += stride;  // Updates the caller's pointer so the next call handles the next position.

+}

+void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8,  // Derives one value from input[0] and adds it to 32 runs of 32 uint16_t values at dest8.

+                                     int stride, int bd) {  // stride: element step between runs; bd: passed to HIGHBD_WRAPLOW and used for the (1 << bd) - 1 cap (presumably the bit depth).

+  const tran_low_t out0 =  // Only input[0] is read from the coefficient buffer.

+      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

+  const tran_low_t out1 =  // Same scale-and-round step applied a second time.

+      HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);

+  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);  // Stored as int16_t so it can be broadcast to 16-bit lanes.

+  const int16x8_t dc = vdupq_n_s16(a1);  // Same value in all eight lanes.

+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // Pixels are accessed through this uint16_t pointer, not dest8 itself.

+  int i;

+  if (a1 >= 0) {  // Sign of a1 selects the kernel: non-negative uses the upper-cap kernel.

+    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);  // Upper bound handed to the pos kernel.

+    for (i = 0; i < 8; ++i) {  // 8 iterations x 4 calls = 32 kernel calls.

+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);

+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);

+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);

+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);

+    }

+  } else {  // Negative a1: the neg kernel saturates sums to unsigned and takes no max.

+    for (i = 0; i < 8; ++i) {  // 8 iterations x 4 calls = 32 kernel calls.

+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);

+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);

+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);

+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);

+    }

+  }

+}

--- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c

+++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c

@@ -15,21 +15,29 @@

 #include "vpx_dsp/arm/transpose_neon.h"

 #include "vpx_dsp/inv_txfm.h"

-static INLINE void highbd_idct8x8_1_add_kernel(uint16_t **dest,

-                                               const int stride,

-                                               const int16x8_t res,

-                                               const int16x8_t max) {

+static INLINE void highbd_idct8x8_1_add_pos_kernel(uint16_t **dest,

+                                                   const int stride,

+                                                   const int16x8_t res,

+                                                   const int16x8_t max) {

   const uint16x8_t a = vld1q_u16(*dest);

   const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));

   const int16x8_t c = vminq_s16(b, max);

-  const uint16x8_t d = vqshluq_n_s16(c, 0);

-  vst1q_u16(*dest, d);

+  vst1q_u16(*dest, vreinterpretq_u16_s16(c));

   *dest += stride;

+static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest,  // Adds res to the 8 uint16_t values at *dest, saturates to unsigned, then steps *dest.

+                                                   const int stride,  // Element offset added to *dest after the store.

+                                                   const int16x8_t res) {  // Added to every lane; used by the caller's a1 < 0 branch.

+  const uint16x8_t a = vld1q_u16(*dest);  // Elements 0..7.

+  const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));  // Sum is formed in signed 16-bit lanes.

+  const uint16x8_t c = vqshluq_n_s16(b, 0);  // Shift by 0 with signed-to-unsigned saturation: negative lanes become 0.

+  vst1q_u16(*dest, c);

+  *dest += stride;  // Updates the caller's pointer so the next call handles the next position.

+}

 void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,

                                    int stride, int bd) {

-  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);

   const tran_low_t out0 =

       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

   const tran_low_t out1 =

@@ -38,14 +46,26 @@

   const int16x8_t dc = vdupq_n_s16(a1);

   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);

-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);

-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);

-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);

-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);

-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);

-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);

-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);

+  if (a1 >= 0) {

+    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);

+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);

+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);

+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);

+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);

+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);

+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);

+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);

+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);

+  } else {

+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);

+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);

+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);

+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);

+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);

+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);

+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);

+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);

+  }

 static INLINE void idct8x8_12_half1d_bd10(

--- a/vpx_dsp/vpx_dsp.mk

+++ b/vpx_dsp/vpx_dsp.mk

@@ -224,6 +224,8 @@

 else  # CONFIG_VP9_HIGHBITDEPTH

 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct4x4_add_neon.c

 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct8x8_add_neon.c

+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct16x16_add_neon.c

+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_add_neon.c

 endif  # !CONFIG_VP9_HIGHBITDEPTH

 ifeq ($(HAVE_NEON_ASM),yes)

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -624,6 +624,7 @@

   specialize qw/vpx_highbd_idct8x8_1_add neon/;

   add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

+  specialize qw/vpx_highbd_idct16x16_1_add neon/;

   add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

@@ -630,7 +631,7 @@

   add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

   add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

-  specialize qw/vpx_highbd_idct32x32_1_add sse2/;

+  specialize qw/vpx_highbd_idct32x32_1_add neon sse2/;

   add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";