ref: 615566aa81327767f89543927048dbbab1156e6d
parent: 9c2bb7f3422e607ca6a3da9c62f47a63abf9b967
parent: 429e6528097850b08b675e1fa8d75eef59a10e32
author: Linfeng Zhang <[email protected]>
date: Tue Feb 14 19:46:29 EST 2017
Merge "Replace 14 with DCT_CONST_BITS in idct NEON functions' shifts"
--- a/vpx_dsp/arm/highbd_idct16x16_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -19,14 +19,14 @@
int32x4x2_t *const d1) {
int32x2x2_t t32[4];
- t32[0].val[0] = vrshrn_n_s64(t[0].val[0], 14);
- t32[0].val[1] = vrshrn_n_s64(t[0].val[1], 14);
- t32[1].val[0] = vrshrn_n_s64(t[1].val[0], 14);
- t32[1].val[1] = vrshrn_n_s64(t[1].val[1], 14);
- t32[2].val[0] = vrshrn_n_s64(t[2].val[0], 14);
- t32[2].val[1] = vrshrn_n_s64(t[2].val[1], 14);
- t32[3].val[0] = vrshrn_n_s64(t[3].val[0], 14);
- t32[3].val[1] = vrshrn_n_s64(t[3].val[1], 14);
+ t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS);
+ t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS);
+ t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS);
+ t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS);
+ t32[2].val[0] = vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS);
+ t32[2].val[1] = vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS);
+ t32[3].val[0] = vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS);
+ t32[3].val[1] = vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS);
d0->val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]);
d0->val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]);
d1->val[0] = vcombine_s32(t32[2].val[0], t32[2].val[1]);
--- a/vpx_dsp/arm/highbd_idct4x4_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@@ -82,10 +82,10 @@
b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1);
b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1);
b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1);
- b0 = vrshrq_n_s32(b0, 14);
- b1 = vrshrq_n_s32(b1, 14);
- b2 = vrshrq_n_s32(b2, 14);
- b3 = vrshrq_n_s32(b3, 14);
+ b0 = vrshrq_n_s32(b0, DCT_CONST_BITS);
+ b1 = vrshrq_n_s32(b1, DCT_CONST_BITS);
+ b2 = vrshrq_n_s32(b2, DCT_CONST_BITS);
+ b3 = vrshrq_n_s32(b3, DCT_CONST_BITS);
*a0 = vaddq_s32(b0, b3);
*a1 = vaddq_s32(b1, b2);
*a2 = vsubq_s32(b1, b2);
@@ -119,10 +119,14 @@
c5 = vsubq_s64(c5, c9);
c6 = vaddq_s64(c6, c10);
c7 = vaddq_s64(c7, c11);
- b0 = vcombine_s32(vrshrn_n_s64(c0, 14), vrshrn_n_s64(c1, 14));
- b1 = vcombine_s32(vrshrn_n_s64(c2, 14), vrshrn_n_s64(c3, 14));
- b2 = vcombine_s32(vrshrn_n_s64(c4, 14), vrshrn_n_s64(c5, 14));
- b3 = vcombine_s32(vrshrn_n_s64(c6, 14), vrshrn_n_s64(c7, 14));
+ b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS),
+ vrshrn_n_s64(c1, DCT_CONST_BITS));
+ b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS),
+ vrshrn_n_s64(c3, DCT_CONST_BITS));
+ b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS),
+ vrshrn_n_s64(c5, DCT_CONST_BITS));
+ b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS),
+ vrshrn_n_s64(c7, DCT_CONST_BITS));
*a0 = vaddq_s32(b0, b3);
*a1 = vaddq_s32(b1, b2);
*a2 = vsubq_s32(b1, b2);
--- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -82,18 +82,18 @@
step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
- step1[4] = vrshrq_n_s32(step1[4], 14);
- step1[5] = vrshrq_n_s32(step1[5], 14);
- step1[6] = vrshrq_n_s32(step1[6], 14);
- step1[7] = vrshrq_n_s32(step1[7], 14);
+ step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+ step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+ step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+ step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
// stage 2
step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
- step2[1] = vrshrq_n_s32(step2[1], 14);
- step2[2] = vrshrq_n_s32(step2[2], 14);
- step2[3] = vrshrq_n_s32(step2[3], 14);
+ step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+ step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+ step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
step2[4] = vaddq_s32(step1[4], step1[5]);
step2[5] = vsubq_s32(step1[4], step1[5]);
@@ -109,8 +109,8 @@
step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
- step1[5] = vrshrq_n_s32(step1[5], 14);
- step1[6] = vrshrq_n_s32(step1[6], 14);
+ step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+ step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
// stage 4
*io0 = vaddq_s32(step1[0], step2[7]);
@@ -154,14 +154,14 @@
t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1);
t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0);
t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0);
- t32[0] = vrshrn_n_s64(t64[0], 14);
- t32[1] = vrshrn_n_s64(t64[1], 14);
- t32[2] = vrshrn_n_s64(t64[2], 14);
- t32[3] = vrshrn_n_s64(t64[3], 14);
- t32[4] = vrshrn_n_s64(t64[4], 14);
- t32[5] = vrshrn_n_s64(t64[5], 14);
- t32[6] = vrshrn_n_s64(t64[6], 14);
- t32[7] = vrshrn_n_s64(t64[7], 14);
+ t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+ t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+ t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+ t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+ t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+ t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
step1[4] = vcombine_s32(t32[0], t32[1]);
step1[5] = vcombine_s32(t32[2], t32[3]);
step1[6] = vcombine_s32(t32[4], t32[5]);
@@ -174,12 +174,12 @@
t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
- t32[2] = vrshrn_n_s64(t64[2], 14);
- t32[3] = vrshrn_n_s64(t64[3], 14);
- t32[4] = vrshrn_n_s64(t64[4], 14);
- t32[5] = vrshrn_n_s64(t64[5], 14);
- t32[6] = vrshrn_n_s64(t64[6], 14);
- t32[7] = vrshrn_n_s64(t64[7], 14);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+ t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+ t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+ t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+ t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
step2[1] = vcombine_s32(t32[2], t32[3]);
step2[2] = vcombine_s32(t32[4], t32[5]);
step2[3] = vcombine_s32(t32[6], t32[7]);
@@ -205,10 +205,10 @@
vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
vget_high_s32(cospis0), 0);
- t32[0] = vrshrn_n_s64(t64[0], 14);
- t32[1] = vrshrn_n_s64(t64[1], 14);
- t32[2] = vrshrn_n_s64(t64[2], 14);
- t32[3] = vrshrn_n_s64(t64[3], 14);
+ t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+ t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
step1[5] = vcombine_s32(t32[0], t32[1]);
step1[6] = vcombine_s32(t32[2], t32[3]);
@@ -377,10 +377,10 @@
step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0);
step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1);
- step1[4] = vrshrq_n_s32(step1[4], 14);
- step1[5] = vrshrq_n_s32(step1[5], 14);
- step1[6] = vrshrq_n_s32(step1[6], 14);
- step1[7] = vrshrq_n_s32(step1[7], 14);
+ step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+ step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+ step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+ step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
// stage 2
step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
@@ -392,10 +392,10 @@
step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1);
step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1);
- step2[0] = vrshrq_n_s32(step2[0], 14);
- step2[1] = vrshrq_n_s32(step2[1], 14);
- step2[2] = vrshrq_n_s32(step2[2], 14);
- step2[3] = vrshrq_n_s32(step2[3], 14);
+ step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS);
+ step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+ step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+ step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
step2[4] = vaddq_s32(step1[4], step1[5]);
step2[5] = vsubq_s32(step1[4], step1[5]);
@@ -411,8 +411,8 @@
step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
- step1[5] = vrshrq_n_s32(step1[5], 14);
- step1[6] = vrshrq_n_s32(step1[6], 14);
+ step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+ step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
// stage 4
*io0 = vaddq_s32(step1[0], step2[7]);
@@ -473,14 +473,14 @@
t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0);
t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1);
t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1);
- t32[0] = vrshrn_n_s64(t64[0], 14);
- t32[1] = vrshrn_n_s64(t64[1], 14);
- t32[2] = vrshrn_n_s64(t64[2], 14);
- t32[3] = vrshrn_n_s64(t64[3], 14);
- t32[4] = vrshrn_n_s64(t64[4], 14);
- t32[5] = vrshrn_n_s64(t64[5], 14);
- t32[6] = vrshrn_n_s64(t64[6], 14);
- t32[7] = vrshrn_n_s64(t64[7], 14);
+ t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+ t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+ t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+ t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+ t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+ t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
step1[4] = vcombine_s32(t32[0], t32[1]);
step1[5] = vcombine_s32(t32[2], t32[3]);
step1[6] = vcombine_s32(t32[4], t32[5]);
@@ -501,14 +501,14 @@
t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1);
t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1);
t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1);
- t32[0] = vrshrn_n_s64(t64[0], 14);
- t32[1] = vrshrn_n_s64(t64[1], 14);
- t32[2] = vrshrn_n_s64(t64[2], 14);
- t32[3] = vrshrn_n_s64(t64[3], 14);
- t32[4] = vrshrn_n_s64(t64[4], 14);
- t32[5] = vrshrn_n_s64(t64[5], 14);
- t32[6] = vrshrn_n_s64(t64[6], 14);
- t32[7] = vrshrn_n_s64(t64[7], 14);
+ t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+ t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+ t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+ t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+ t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+ t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
step2[0] = vcombine_s32(t32[0], t32[1]);
step2[1] = vcombine_s32(t32[2], t32[3]);
step2[2] = vcombine_s32(t32[4], t32[5]);
@@ -535,10 +535,10 @@
vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
vget_high_s32(cospis0), 0);
- t32[0] = vrshrn_n_s64(t64[0], 14);
- t32[1] = vrshrn_n_s64(t64[1], 14);
- t32[2] = vrshrn_n_s64(t64[2], 14);
- t32[3] = vrshrn_n_s64(t64[3], 14);
+ t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+ t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
step1[5] = vcombine_s32(t32[0], t32[1]);
step1[6] = vcombine_s32(t32[2], t32[3]);
--- a/vpx_dsp/arm/idct16x16_add_neon.c
+++ b/vpx_dsp/arm/idct16x16_add_neon.c
@@ -16,8 +16,8 @@
static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
int16x4_t *const d1) {
- *d0 = vrshrn_n_s32(t32[0], 14);
- *d1 = vrshrn_n_s32(t32[1], 14);
+ *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
}
static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
--- a/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_add_neon.c
@@ -147,8 +147,10 @@
q11s32 = vaddq_s32(q12s32, q11s32);
q10s32 = vaddq_s32(q10s32, q15s32);
- *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, 14), vrshrn_n_s32(q9s32, 14));
- *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, 14), vrshrn_n_s32(q10s32, 14));
+ *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, DCT_CONST_BITS),
+ vrshrn_n_s32(q9s32, DCT_CONST_BITS));
+ *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, DCT_CONST_BITS),
+ vrshrn_n_s32(q10s32, DCT_CONST_BITS));
}
static INLINE void load_s16x8q(const int16_t *in, int16x8_t *s0, int16x8_t *s1,
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@@ -15,6 +15,7 @@
#include "./vpx_config.h"
#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/vpx_dsp_common.h"
DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
@@ -93,14 +94,14 @@
//------------------------------------------------------------------------------
-// Multiply a by a_const. Saturate, shift and narrow by 14.
+// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
const int16_t a_const) {
- // Shift by 14 + rounding will be within 16 bits for well formed streams.
- // See WRAPLOW and dct_const_round_shift for details.
+ // Shift by DCT_CONST_BITS + rounding will be within 16 bits for well formed
+ // streams. See WRAPLOW and dct_const_round_shift for details.
// This instruction doubles the result and returns the high half, essentially
// resulting in a right shift by 15. By multiplying the constant first that
- // becomes a right shift by 14.
+ // becomes a right shift by DCT_CONST_BITS.
// The largest possible value used here is
// vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728) which falls *just*
// within the range of int16_t (+32767 / -32768) even when negated.
@@ -107,7 +108,7 @@
return vqrdmulhq_n_s16(a, a_const * 2);
}
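
As an aside (not part of the patch), a scalar model of the doubling trick the comment above describes; vqrdmulh_lane_model is an illustrative name, not a NEON intrinsic:

/* Scalar model of one lane of vqrdmulhq_n_s16(a, a_const * 2): VQRDMULH
 * computes saturate((2 * a * b + (1 << 15)) >> 16), i.e. a rounding shift by
 * 15 of the doubled product, so pre-doubling the constant turns it into a
 * rounding shift of a * a_const by DCT_CONST_BITS. Sketch only. */
#include <stdint.h>
#include <stdio.h>

#define DCT_CONST_BITS 14

static int16_t vqrdmulh_lane_model(int16_t a, int16_t b) {
  const int64_t prod = 2 * (int64_t)a * b + (1 << 15);
  int64_t r = prod >> 16; /* arithmetic shift assumed for negative values */
  if (r > INT16_MAX) r = INT16_MAX; /* saturation, only hit for -32768 * -32768 */
  if (r < INT16_MIN) r = INT16_MIN;
  return (int16_t)r;
}

int main(void) {
  const int16_t a = 1234;
  const int16_t cospi_1_64 = 16364; /* largest constant used, per the comment */
  const int16_t neon_style = vqrdmulh_lane_model(a, (int16_t)(cospi_1_64 * 2));
  const int16_t round_shift =
      (int16_t)(((int32_t)a * cospi_1_64 + (1 << (DCT_CONST_BITS - 1))) >>
                DCT_CONST_BITS);
  printf("%d %d\n", neon_style, round_shift); /* both print 1232 */
  return 0;
}
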
-// Add a and b, then multiply by ab_const. Shift and narrow by 14.
+// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
// In both add_ and its pair, sub_, the input for well-formed streams will be
@@ -121,10 +122,12 @@
int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
temp_low = vmulq_n_s32(temp_low, ab_const);
temp_high = vmulq_n_s32(temp_high, ab_const);
- return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+ return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+ vrshrn_n_s32(temp_high, DCT_CONST_BITS));
}
-// Subtract b from a, then multiply by ab_const. Shift and narrow by 14.
+// Subtract b from a, then multiply by ab_const. Shift and narrow by
+// DCT_CONST_BITS.
static INLINE int16x8_t sub_multiply_shift_and_narrow_s16(
const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
@@ -131,11 +134,12 @@
int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
temp_low = vmulq_n_s32(temp_low, ab_const);
temp_high = vmulq_n_s32(temp_high, ab_const);
- return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+ return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+ vrshrn_n_s32(temp_high, DCT_CONST_BITS));
}
// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
-// 14.
+// DCT_CONST_BITS.
static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
const int16x8_t a, const int16_t a_const, const int16x8_t b,
const int16_t b_const) {
@@ -143,7 +147,8 @@
int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const);
temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const);
temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const);
- return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+ return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+ vrshrn_n_s32(temp_high, DCT_CONST_BITS));
}
// Shift the output down by 6 and add it to the destination buffer.
@@ -233,10 +238,10 @@
c3 = vmull_lane_s16(b2, cospis, 1);
c2 = vmlsl_lane_s16(c2, b3, cospis, 1);
c3 = vmlal_lane_s16(c3, b3, cospis, 3);
- b0 = vrshrn_n_s32(c0, 14);
- b1 = vrshrn_n_s32(c1, 14);
- b2 = vrshrn_n_s32(c2, 14);
- b3 = vrshrn_n_s32(c3, 14);
+ b0 = vrshrn_n_s32(c0, DCT_CONST_BITS);
+ b1 = vrshrn_n_s32(c1, DCT_CONST_BITS);
+ b2 = vrshrn_n_s32(c2, DCT_CONST_BITS);
+ b3 = vrshrn_n_s32(c3, DCT_CONST_BITS);
d0 = vcombine_s16(b0, b1);
d1 = vcombine_s16(b3, b2);
*a0 = vaddq_s16(d0, d1);
@@ -278,8 +283,8 @@
t32[1] = vmull_lane_s16(step2[6], cospis0, 2);
t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2);
t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2);
- step1[5] = vrshrn_n_s32(t32[0], 14);
- step1[6] = vrshrn_n_s32(t32[1], 14);
+ step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
// stage 4
*io0 = vadd_s16(step1[0], step2[7]);
@@ -337,10 +342,10 @@
t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
- t16[0] = vrshrn_n_s32(t32[0], 14);
- t16[1] = vrshrn_n_s32(t32[1], 14);
- t16[2] = vrshrn_n_s32(t32[2], 14);
- t16[3] = vrshrn_n_s32(t32[3], 14);
+ t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+ t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+ t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
step1[5] = vcombine_s16(t16[0], t16[1]);
step1[6] = vcombine_s16(t16[2], t16[3]);
@@ -405,14 +410,14 @@
t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2);
t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3);
t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3);
- t16[0] = vrshrn_n_s32(t32[0], 14);
- t16[1] = vrshrn_n_s32(t32[1], 14);
- t16[2] = vrshrn_n_s32(t32[2], 14);
- t16[3] = vrshrn_n_s32(t32[3], 14);
- t16[4] = vrshrn_n_s32(t32[4], 14);
- t16[5] = vrshrn_n_s32(t32[5], 14);
- t16[6] = vrshrn_n_s32(t32[6], 14);
- t16[7] = vrshrn_n_s32(t32[7], 14);
+ t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+ t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+ t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
+ t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
+ t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
+ t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
+ t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
step1[4] = vcombine_s16(t16[0], t16[1]);
step1[5] = vcombine_s16(t16[2], t16[3]);
step1[6] = vcombine_s16(t16[4], t16[5]);
@@ -433,14 +438,14 @@
t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1);
t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3);
t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3);
- t16[0] = vrshrn_n_s32(t32[0], 14);
- t16[1] = vrshrn_n_s32(t32[1], 14);
- t16[2] = vrshrn_n_s32(t32[2], 14);
- t16[3] = vrshrn_n_s32(t32[3], 14);
- t16[4] = vrshrn_n_s32(t32[4], 14);
- t16[5] = vrshrn_n_s32(t32[5], 14);
- t16[6] = vrshrn_n_s32(t32[6], 14);
- t16[7] = vrshrn_n_s32(t32[7], 14);
+ t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+ t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+ t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
+ t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
+ t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
+ t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
+ t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
step2[0] = vcombine_s16(t16[0], t16[1]);
step2[1] = vcombine_s16(t16[2], t16[3]);
step2[2] = vcombine_s16(t16[4], t16[5]);
@@ -463,10 +468,10 @@
t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
- t16[0] = vrshrn_n_s32(t32[0], 14);
- t16[1] = vrshrn_n_s32(t32[1], 14);
- t16[2] = vrshrn_n_s32(t32[2], 14);
- t16[3] = vrshrn_n_s32(t32[3], 14);
+ t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+ t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+ t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
step1[5] = vcombine_s16(t16[0], t16[1]);
step1[6] = vcombine_s16(t16[2], t16[3]);
@@ -486,10 +491,10 @@
int16x8_t *const d1) {
int16x4_t t16[4];
- t16[0] = vrshrn_n_s32(t32[0], 14);
- t16[1] = vrshrn_n_s32(t32[1], 14);
- t16[2] = vrshrn_n_s32(t32[2], 14);
- t16[3] = vrshrn_n_s32(t32[3], 14);
+ t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+ t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+ t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
*d0 = vcombine_s16(t16[0], t16[1]);
*d1 = vcombine_s16(t16[2], t16[3]);
}