ref: 86e340c76e73c5336b2a39142ab858431d441d6b
parent: 5d4aa325a6be1be3688eb893aa64b0f91b4ff07a
author: James Zern <[email protected]>
date: Tue Dec 6 15:52:34 EST 2016
enable vpx_idct32x32_1024_add_neon in hbd builds BUG=webm:1294 Change-Id: Ibdda54e6d1303b0f73bc7bc71417e4041d7618de
--- a/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_add_neon.c
@@ -12,6 +12,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
@@ -150,51 +151,97 @@
*qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, 14), vrshrn_n_s32(q10s32, 14));
}
+// Loads eight int16x8_t vectors from |in| into *s0..*s7 using vld1q_s16().
+// Successive loads start 32 int16_t elements apart, i.e. at in + 0, in + 32,
+// ..., in + 224. The 32-element step is the same value the replaced inline
+// code used as its row stride. |in| itself is a by-value pointer, so the
+// caller's pointer is not advanced.
+static INLINE void load_s16x8q(const int16_t *in, int16x8_t *s0, int16x8_t *s1,
+ int16x8_t *s2, int16x8_t *s3, int16x8_t *s4,
+ int16x8_t *s5, int16x8_t *s6, int16x8_t *s7) {
+ *s0 = vld1q_s16(in);
+ in += 32;
+ *s1 = vld1q_s16(in);
+ in += 32;
+ *s2 = vld1q_s16(in);
+ in += 32;
+ *s3 = vld1q_s16(in);
+ in += 32;
+ *s4 = vld1q_s16(in);
+ in += 32;
+ *s5 = vld1q_s16(in);
+ in += 32;
+ *s6 = vld1q_s16(in);
+ in += 32;
+ *s7 = vld1q_s16(in);
+}
+
+// Passes the eight vectors a0..a7 through transpose_s16_8x8() and then writes
+// them back-to-back to *out with vst1q_s16(). *out is advanced by 8 int16_t
+// after every store, so on return the caller's pointer is 64 elements past
+// where it started and ready for the next call. a0..a7 are taken by value, so
+// the transpose only modifies local copies.
+static INLINE void transpose_and_store_s16_8x8(int16x8_t a0, int16x8_t a1,
+ int16x8_t a2, int16x8_t a3,
+ int16x8_t a4, int16x8_t a5,
+ int16x8_t a6, int16x8_t a7,
+ int16_t **out) {
+ transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ // Store the transposed vectors contiguously, 8 elements apiece.
+ vst1q_s16(*out, a0);
+ *out += 8;
+ vst1q_s16(*out, a1);
+ *out += 8;
+ vst1q_s16(*out, a2);
+ *out += 8;
+ vst1q_s16(*out, a3);
+ *out += 8;
+ vst1q_s16(*out, a4);
+ *out += 8;
+ vst1q_s16(*out, a5);
+ *out += 8;
+ vst1q_s16(*out, a6);
+ *out += 8;
+ vst1q_s16(*out, a7);
+ *out += 8;
+}
+
static INLINE void idct32_transpose_pair(const int16_t *input, int16_t *t_buf) {
- const int16_t *in;
int i;
- const int stride = 32;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
for (i = 0; i < 4; i++, input += 8) {
- in = input;
- q8s16 = vld1q_s16(in);
- in += stride;
- q9s16 = vld1q_s16(in);
- in += stride;
- q10s16 = vld1q_s16(in);
- in += stride;
- q11s16 = vld1q_s16(in);
- in += stride;
- q12s16 = vld1q_s16(in);
- in += stride;
- q13s16 = vld1q_s16(in);
- in += stride;
- q14s16 = vld1q_s16(in);
- in += stride;
- q15s16 = vld1q_s16(in);
+ load_s16x8q(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf);
+ }
+}
- transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
- &q14s16, &q15s16);
+#if CONFIG_VP9_HIGHBITDEPTH
+// tran_low_t counterpart of load_s16x8q(): fills *s0..*s7 with eight
+// int16x8_t vectors obtained through load_tran_low_to_s16q(), reading at
+// in + 0, in + 32, ..., in + 224 (offsets counted in tran_low_t elements).
+// Only built when CONFIG_VP9_HIGHBITDEPTH is set.
+// NOTE(review): load_tran_low_to_s16q() is not defined in this file; it
+// presumably comes from vpx_dsp/arm/idct_neon.h and narrows the coefficients
+// to 16 bits -- confirm against that header.
+static INLINE void load_s16x8q_tran_low(const tran_low_t *in, int16x8_t *s0,
+ int16x8_t *s1, int16x8_t *s2,
+ int16x8_t *s3, int16x8_t *s4,
+ int16x8_t *s5, int16x8_t *s6,
+ int16x8_t *s7) {
+ *s0 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s1 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s2 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s3 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s4 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s5 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s6 = load_tran_low_to_s16q(in);
+ in += 32;
+ *s7 = load_tran_low_to_s16q(in);
+}
- vst1q_s16(t_buf, q8s16);
- t_buf += 8;
- vst1q_s16(t_buf, q9s16);
- t_buf += 8;
- vst1q_s16(t_buf, q10s16);
- t_buf += 8;
- vst1q_s16(t_buf, q11s16);
- t_buf += 8;
- vst1q_s16(t_buf, q12s16);
- t_buf += 8;
- vst1q_s16(t_buf, q13s16);
- t_buf += 8;
- vst1q_s16(t_buf, q14s16);
- t_buf += 8;
- vst1q_s16(t_buf, q15s16);
- t_buf += 8;
+// Same loop as idct32_transpose_pair(), but reads tran_low_t input through
+// load_s16x8q_tran_low(). Each of the 4 iterations loads an 8x8 tile whose
+// rows are 32 elements apart, transposes it, and appends the 64 results to
+// |t_buf|; |input| then moves 8 elements along to the next tile. In total
+// 4 * 64 = 256 int16_t values are written to |t_buf|.
+static INLINE void idct32_transpose_pair_tran_low(const tran_low_t *input,
+ int16_t *t_buf) {
+ int i;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ for (i = 0; i < 4; i++, input += 8) {
+ load_s16x8q_tran_low(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ // &t_buf lets the helper advance the local write cursor by 64 elements.
+ transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf);
}
}
+#else // !CONFIG_VP9_HIGHBITDEPTH
+#define idct32_transpose_pair_tran_low idct32_transpose_pair
+#endif // CONFIG_VP9_HIGHBITDEPTH
static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16,
int16x8_t q3s16, int16x8_t q6s16,
@@ -383,16 +430,22 @@
int16_t trans_buf[32 * 8];
int16_t pass1[32 * 32];
int16_t pass2[32 * 32];
- int16_t *out;
+ int16_t *in, *out;
int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
idct32_pass_loop++,
- input = pass1, // the input of pass2 is the result of pass1
+ in = pass1, // the input of pass2 is the result of pass1
out = pass2) {
- for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop
- idct32_transpose_pair(input, trans_buf);
+ for (i = 0; i < 4; i++, out += 8) { // idct32_bands_loop
+ if (idct32_pass_loop == 0) {
+ idct32_transpose_pair_tran_low(input, trans_buf);
+ input += 32 * 8;
+ } else {
+ idct32_transpose_pair(in, trans_buf);
+ in += 32 * 8;
+ }
// -----------------------------------------
// BLOCK A: 16-19,28-31
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -201,8 +201,6 @@
DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM)
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c
-
DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/idct8x8_msa.c
@@ -240,6 +238,7 @@
DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c
endif # CONFIG_VP9
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -696,7 +696,7 @@
specialize qw/vpx_idct16x16_1_add neon sse2/;
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64";
+ specialize qw/vpx_idct32x32_1024_add neon sse2/, "$ssse3_x86_64";
add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct32x32_135_add neon sse2/, "$ssse3_x86_64";