ref: 6fc2e57c2ca6c9a7b1eecb2c7d93b65222b6727d
parent: fb60204d4c36a4041daaca2f1461b731fa2dfaa2
author: Linfeng Zhang <[email protected]>
date: Wed Apr 5 10:41:35 EDT 2017
Update 32x32 high bitdepth idct NEON optimization. Preparation for CONVERT_TO_BYTEPTR/SHORTPTR clean up. BUG=webm:1388 Change-Id: I928d30a5698023bb90888d783cf81c51ec183760
--- a/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
@@ -726,9 +726,10 @@
highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
}
-void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest,
+void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
int i;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
if (bd == 8) {
int16_t temp[32 * 16];
@@ -742,7 +743,6 @@
dest += 8;
}
} else {
- uint16_t *dst = CONVERT_TO_SHORTPTR(dest);
int32_t temp[32 * 16];
int32_t *t = temp;
vpx_highbd_idct32_12_neon(input, temp);
@@ -749,9 +749,9 @@
vpx_highbd_idct32_12_neon(input + 32 * 8, temp + 8);
for (i = 0; i < 32; i += 8) {
- vpx_highbd_idct32_16_neon(t, dst, stride, bd);
+ vpx_highbd_idct32_16_neon(t, dest, stride, bd);
t += (16 * 8);
- dst += 8;
+ dest += 8;
}
}
}
--- a/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
@@ -594,9 +594,10 @@
highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
}
-void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest,
+void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
int i;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
if (bd == 8) {
int16_t temp[32 * 8];
@@ -610,7 +611,6 @@
dest += 8;
}
} else {
- uint16_t *dst = CONVERT_TO_SHORTPTR(dest);
int32_t temp[32 * 8];
int32_t *t = temp;
@@ -617,9 +617,9 @@
vpx_highbd_idct32_6_neon(input, t);
for (i = 0; i < 32; i += 8) {
- vpx_highbd_idct32_8_neon(t, dst, stride, bd);
+ vpx_highbd_idct32_8_neon(t, dest, stride, bd);
t += (8 * 8);
- dst += 8;
+ dest += 8;
}
}
}
--- a/vpx_dsp/arm/idct32x32_135_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_135_add_neon.c
@@ -371,7 +371,7 @@
vst1q_s16(output, vsubq_s16(s7[0], s6[31]));
}
-void vpx_idct32_16_neon(const int16_t *const input, uint8_t *const output,
+void vpx_idct32_16_neon(const int16_t *const input, void *const output,
const int stride, const int highbd_flag) {
int16x8_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
out[32];
@@ -646,17 +646,17 @@
out[31] = final_sub(s7[0], s6[31]);
if (highbd_flag) {
- uint16_t *const outputT = CONVERT_TO_SHORTPTR(output);
- highbd_add_and_store_bd8(out, outputT, stride);
+ highbd_add_and_store_bd8(out, output, stride);
} else {
+ uint8_t *const outputT = (uint8_t *)output;
add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
- out[7], output, stride);
+ out[7], outputT, stride);
add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13],
- out[14], out[15], output + (8 * stride), stride);
+ out[14], out[15], outputT + (8 * stride), stride);
add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21],
- out[22], out[23], output + (16 * stride), stride);
+ out[22], out[23], outputT + (16 * stride), stride);
add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29],
- out[30], out[31], output + (24 * stride), stride);
+ out[30], out[31], outputT + (24 * stride), stride);
}
}
--- a/vpx_dsp/arm/idct32x32_34_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_34_add_neon.c
@@ -265,7 +265,7 @@
vst1q_s16(output, vsubq_s16(s1[0], s2[31]));
}
-void vpx_idct32_8_neon(const int16_t *input, uint8_t *output, int stride,
+void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride,
const int highbd_flag) {
int16x8_t in[8], s1[32], s2[32], s3[32], out[32];
@@ -486,17 +486,17 @@
out[31] = final_sub(s1[0], s2[31]);
if (highbd_flag) {
- uint16_t *const outputT = CONVERT_TO_SHORTPTR(output);
- highbd_add_and_store_bd8(out, outputT, stride);
+ highbd_add_and_store_bd8(out, output, stride);
} else {
+ uint8_t *const outputT = (uint8_t *)output;
add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
- out[7], output, stride);
+ out[7], outputT, stride);
add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13],
- out[14], out[15], output + (8 * stride), stride);
+ out[14], out[15], outputT + (8 * stride), stride);
add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21],
- out[22], out[23], output + (16 * stride), stride);
+ out[22], out[23], outputT + (16 * stride), stride);
add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29],
- out[30], out[31], output + (24 * stride), stride);
+ out[30], out[31], outputT + (24 * stride), stride);
}
}
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@@ -982,11 +982,11 @@
const int stride, const int highbd_flag);
void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output);
-void vpx_idct32_16_neon(const int16_t *const input, uint8_t *const output,
+void vpx_idct32_16_neon(const int16_t *const input, void *const output,
const int stride, const int highbd_flag);
void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output);
-void vpx_idct32_8_neon(const int16_t *input, uint8_t *output, int stride,
+void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride,
const int highbd_flag);
#endif // VPX_DSP_ARM_IDCT_NEON_H_