ref: 90f889a56d14b9335d6710af9d4034c101927b3b
parent: 72746c079d0b8631dad90f7c172ae19193405571
parent: 911bb980b1d2501ee9fa053ff3f6faffbba0b0a2
author: Linfeng Zhang <[email protected]>
date: Thu Jan 5 20:16:18 EST 2017
Merge "Clean DC only idct NEON intrinsics"
--- a/vpx_dsp/arm/idct16x16_1_add_neon.c
+++ b/vpx_dsp/arm/idct16x16_1_add_neon.c
@@ -11,49 +11,66 @@
#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/inv_txfm.h"
-#include "vpx_ports/mem.h"
+static INLINE void idct16x16_1_add_pos_kernel(uint8_t **dest, const int stride,
+ const uint8x16_t res) {
+ const uint8x16_t a = vld1q_u8(*dest);
+ const uint8x16_t b = vqaddq_u8(a, res);
+ vst1q_u8(*dest, b);
+ *dest += stride;
+}
+
+static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride,
+ const uint8x16_t res) {
+ const uint8x16_t a = vld1q_u8(*dest);
+ const uint8x16_t b = vqsubq_u8(a, res);
+ vst1q_u8(*dest, b);
+ *dest += stride;
+}
+
void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
- uint8x8_t d2u8, d3u8, d30u8, d31u8;
- uint64x1_t d2u64, d3u64, d4u64, d5u64;
- uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
- int16x8_t q0s16;
- uint8_t *d1, *d2;
- int16_t i, j, a1;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
- out = dct_const_round_shift(out * cospi_16_64);
- a1 = ROUND_POWER_OF_TWO(out, 6);
+ const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
- q0s16 = vdupq_n_s16(a1);
- q0u16 = vreinterpretq_u16_s16(q0s16);
-
- for (d1 = d2 = dest, i = 0; i < 4; i++) {
- for (j = 0; j < 2; j++) {
- d2u64 = vld1_u64((const uint64_t *)d1);
- d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
- d1 += stride;
- d4u64 = vld1_u64((const uint64_t *)d1);
- d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
- d1 += stride;
-
- q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
- q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
- q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
- q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
-
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
- d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
- d2 += stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
- vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
- d2 += stride;
- }
+ if (a1 >= 0) {
+ const uint8x16_t dc = create_dcq(a1);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ } else {
+ const uint8x16_t dc = create_dcq(-a1);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
}
}
--- a/vpx_dsp/arm/idct32x32_1_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_1_add_neon.c
@@ -10,127 +10,48 @@
#include <arm_neon.h>
-#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/inv_txfm.h"
-#include "vpx_ports/mem.h"
-static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
- uint8x16_t *q9u8, uint8x16_t *q10u8,
- uint8x16_t *q11u8, uint8x16_t *q12u8,
- uint8x16_t *q13u8, uint8x16_t *q14u8,
- uint8x16_t *q15u8) {
- *q8u8 = vld1q_u8(d);
- d += d_stride;
- *q9u8 = vld1q_u8(d);
- d += d_stride;
- *q10u8 = vld1q_u8(d);
- d += d_stride;
- *q11u8 = vld1q_u8(d);
- d += d_stride;
- *q12u8 = vld1q_u8(d);
- d += d_stride;
- *q13u8 = vld1q_u8(d);
- d += d_stride;
- *q14u8 = vld1q_u8(d);
- d += d_stride;
- *q15u8 = vld1q_u8(d);
+static INLINE void idct32x32_1_add_pos_kernel(uint8_t **dest, const int stride,
+ const uint8x16_t res) {
+ const uint8x16_t a0 = vld1q_u8(*dest);
+ const uint8x16_t a1 = vld1q_u8(*dest + 16);
+ const uint8x16_t b0 = vqaddq_u8(a0, res);
+ const uint8x16_t b1 = vqaddq_u8(a1, res);
+ vst1q_u8(*dest, b0);
+ vst1q_u8(*dest + 16, b1);
+ *dest += stride;
}
-static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
- uint8x16_t *q9u8, uint8x16_t *q10u8,
- uint8x16_t *q11u8, uint8x16_t *q12u8,
- uint8x16_t *q13u8, uint8x16_t *q14u8,
- uint8x16_t *q15u8) {
- *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
- *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
- *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
- *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
- *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
- *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
- *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
- *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
+static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride,
+ const uint8x16_t res) {
+ const uint8x16_t a0 = vld1q_u8(*dest);
+ const uint8x16_t a1 = vld1q_u8(*dest + 16);
+ const uint8x16_t b0 = vqsubq_u8(a0, res);
+ const uint8x16_t b1 = vqsubq_u8(a1, res);
+ vst1q_u8(*dest, b0);
+ vst1q_u8(*dest + 16, b1);
+ *dest += stride;
}
-static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
- uint8x16_t *q9u8, uint8x16_t *q10u8,
- uint8x16_t *q11u8, uint8x16_t *q12u8,
- uint8x16_t *q13u8, uint8x16_t *q14u8,
- uint8x16_t *q15u8) {
- *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
- *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
- *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
- *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
- *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
- *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
- *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
- *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
-}
-
-static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
- uint8x16_t *q9u8, uint8x16_t *q10u8,
- uint8x16_t *q11u8, uint8x16_t *q12u8,
- uint8x16_t *q13u8, uint8x16_t *q14u8,
- uint8x16_t *q15u8) {
- vst1q_u8(d, *q8u8);
- d += d_stride;
- vst1q_u8(d, *q9u8);
- d += d_stride;
- vst1q_u8(d, *q10u8);
- d += d_stride;
- vst1q_u8(d, *q11u8);
- d += d_stride;
- vst1q_u8(d, *q12u8);
- d += d_stride;
- vst1q_u8(d, *q13u8);
- d += d_stride;
- vst1q_u8(d, *q14u8);
- d += d_stride;
- vst1q_u8(d, *q15u8);
-}
-
void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
- uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
- int i, j, dest_stride8;
- uint8_t *d;
- int16_t a1;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ int i;
+ const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
- out = dct_const_round_shift(out * cospi_16_64);
- a1 = ROUND_POWER_OF_TWO(out, 6);
-
- dest_stride8 = stride * 8;
- if (a1 >= 0) { // diff_positive_32_32
- a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
- q0u8 = vdupq_n_u8((uint8_t)a1);
- for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
- d = dest;
- for (j = 0; j < 4; j++) {
- LD_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8,
- &q15u8);
- ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
- &q14u8, &q15u8);
- ST_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8,
- &q15u8);
- d += dest_stride8;
- }
+ if (a1 >= 0) {
+ const uint8x16_t dc = create_dcq(a1);
+ for (i = 0; i < 32; i++) {
+ idct32x32_1_add_pos_kernel(&dest, stride, dc);
}
- } else { // diff_negative_32_32
- a1 = -a1;
- a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
- q0u8 = vdupq_n_u8((uint8_t)a1);
- for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop
- d = dest;
- for (j = 0; j < 4; j++) {
- LD_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8,
- &q15u8);
- SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
- &q14u8, &q15u8);
- ST_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8,
- &q15u8);
- d += dest_stride8;
- }
+ } else {
+ const uint8x16_t dc = create_dcq(-a1);
+ for (i = 0; i < 32; i++) {
+ idct32x32_1_add_neg_kernel(&dest, stride, dc);
}
}
}
--- a/vpx_dsp/arm/idct4x4_1_add_neon.c
+++ b/vpx_dsp/arm/idct4x4_1_add_neon.c
@@ -14,28 +14,32 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/inv_txfm.h"
+static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride,
+ const int16x8_t res,
+ uint32x2_t *const d) {
+ uint16x8_t a;
+ uint8x8_t b;
+ *d = vld1_lane_u32((const uint32_t *)*dest, *d, 0);
+ *d = vld1_lane_u32((const uint32_t *)(*dest + stride), *d, 1);
+ a = vaddw_u8(vreinterpretq_u16_s16(res), vreinterpret_u8_u32(*d));
+ b = vqmovun_s16(vreinterpretq_s16_u16(a));
+ vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 0);
+ *dest += stride;
+ vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 1);
+ *dest += stride;
+}
+
void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
- int i;
- const int16_t out0 = dct_const_round_shift((int16_t)input[0] * cospi_16_64);
- const int16_t out1 = dct_const_round_shift(out0 * cospi_16_64);
+ const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
const int16x8_t dc = vdupq_n_s16(a1);
uint32x2_t d = vdup_n_u32(0);
- uint16x8_t a;
- uint8x8_t b;
assert(!((intptr_t)dest % sizeof(uint32_t)));
assert(!(stride % sizeof(uint32_t)));
- for (i = 0; i < 2; i++) {
- d = vld1_lane_u32((const uint32_t *)dest, d, 0);
- d = vld1_lane_u32((const uint32_t *)(dest + stride), d, 1);
- a = vaddw_u8(vreinterpretq_u16_s16(dc), vreinterpret_u8_u32(d));
- b = vqmovun_s16(vreinterpretq_s16_u16(a));
- vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(b), 0);
- dest += stride;
- vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(b), 1);
- dest += stride;
- }
+ idct4x4_1_add_kernel(&dest, stride, dc, &d);
+ idct4x4_1_add_kernel(&dest, stride, dc, &d);
}
--- a/vpx_dsp/arm/idct8x8_1_add_neon.c
+++ b/vpx_dsp/arm/idct8x8_1_add_neon.c
@@ -12,47 +12,53 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/inv_txfm.h"
-#include "vpx_ports/mem.h"
-void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest,
- int stride) {
- int i;
- const int16_t out0 = dct_const_round_shift(input[0] * cospi_16_64);
- const int16_t out1 = dct_const_round_shift(out0 * cospi_16_64);
- const int16_t out2 = ROUND_POWER_OF_TWO(out1, 5);
- const int16x8_t dc = vdupq_n_s16(out2);
- const uint16x8_t dc_u16 = vreinterpretq_u16_s16(dc);
- const uint8_t *dst = dest;
- uint8x8_t d0, d1, d2, d3;
- uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16;
+static INLINE uint8x8_t create_dcd(const int16_t dc) {
+ int16x8_t t = vdupq_n_s16(dc);
+ return vqmovun_s16(t);
+}
- for (i = 0; i < 2; i++) {
- d0 = vld1_u8(dst);
- dst += stride;
- d1 = vld1_u8(dst);
- dst += stride;
- d2 = vld1_u8(dst);
- dst += stride;
- d3 = vld1_u8(dst);
- dst += stride;
+static INLINE void idct8x8_1_add_pos_kernel(uint8_t **dest, const int stride,
+ const uint8x8_t res) {
+ const uint8x8_t a = vld1_u8(*dest);
+ const uint8x8_t b = vqadd_u8(a, res);
+ vst1_u8(*dest, b);
+ *dest += stride;
+}
- d0_u16 = vaddw_u8(dc_u16, d0);
- d1_u16 = vaddw_u8(dc_u16, d1);
- d2_u16 = vaddw_u8(dc_u16, d2);
- d3_u16 = vaddw_u8(dc_u16, d3);
+static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride,
+ const uint8x8_t res) {
+ const uint8x8_t a = vld1_u8(*dest);
+ const uint8x8_t b = vqsub_u8(a, res);
+ vst1_u8(*dest, b);
+ *dest += stride;
+}
- d0 = vqmovun_s16(vreinterpretq_s16_u16(d0_u16));
- d1 = vqmovun_s16(vreinterpretq_s16_u16(d1_u16));
- d2 = vqmovun_s16(vreinterpretq_s16_u16(d2_u16));
- d3 = vqmovun_s16(vreinterpretq_s16_u16(d3_u16));
+void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
- vst1_u8(dest, d0);
- dest += stride;
- vst1_u8(dest, d1);
- dest += stride;
- vst1_u8(dest, d2);
- dest += stride;
- vst1_u8(dest, d3);
- dest += stride;
+ if (a1 >= 0) {
+ const uint8x8_t dc = create_dcd(a1);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ } else {
+ const uint8x8_t dc = create_dcd(-a1);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
}
}
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@@ -181,6 +181,12 @@
vst1_u8(b, b7);
}
+static INLINE uint8x16_t create_dcq(const int16_t dc) {
+ // Clip both sides and gcc may compile to assembly 'usat'.
+ const int16_t t = (dc < 0) ? 0 : ((dc > 255) ? 255 : dc);
+ return vdupq_n_u8((uint8_t)t);
+}
+
static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis,
int16x8_t *const a0,
int16x8_t *const a1) {