shithub: libvpx

ref: 615566aa81327767f89543927048dbbab1156e6d
parent: 9c2bb7f3422e607ca6a3da9c62f47a63abf9b967
parent: 429e6528097850b08b675e1fa8d75eef59a10e32
author: Linfeng Zhang <[email protected]>
date: Tue Feb 14 19:46:29 EST 2017

Merge "Replace 14 with DCT_CONST_BITS in idct NEON functions' shifts"

--- a/vpx_dsp/arm/highbd_idct16x16_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -19,14 +19,14 @@
                                                      int32x4x2_t *const d1) {
   int32x2x2_t t32[4];
 
-  t32[0].val[0] = vrshrn_n_s64(t[0].val[0], 14);
-  t32[0].val[1] = vrshrn_n_s64(t[0].val[1], 14);
-  t32[1].val[0] = vrshrn_n_s64(t[1].val[0], 14);
-  t32[1].val[1] = vrshrn_n_s64(t[1].val[1], 14);
-  t32[2].val[0] = vrshrn_n_s64(t[2].val[0], 14);
-  t32[2].val[1] = vrshrn_n_s64(t[2].val[1], 14);
-  t32[3].val[0] = vrshrn_n_s64(t[3].val[0], 14);
-  t32[3].val[1] = vrshrn_n_s64(t[3].val[1], 14);
+  t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS);
+  t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS);
+  t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS);
+  t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS);
+  t32[2].val[0] = vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS);
+  t32[2].val[1] = vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS);
+  t32[3].val[0] = vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS);
+  t32[3].val[1] = vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS);
   d0->val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]);
   d0->val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]);
   d1->val[0] = vcombine_s32(t32[2].val[0], t32[2].val[1]);
--- a/vpx_dsp/arm/highbd_idct4x4_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@@ -82,10 +82,10 @@
   b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1);
   b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1);
   b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1);
-  b0 = vrshrq_n_s32(b0, 14);
-  b1 = vrshrq_n_s32(b1, 14);
-  b2 = vrshrq_n_s32(b2, 14);
-  b3 = vrshrq_n_s32(b3, 14);
+  b0 = vrshrq_n_s32(b0, DCT_CONST_BITS);
+  b1 = vrshrq_n_s32(b1, DCT_CONST_BITS);
+  b2 = vrshrq_n_s32(b2, DCT_CONST_BITS);
+  b3 = vrshrq_n_s32(b3, DCT_CONST_BITS);
   *a0 = vaddq_s32(b0, b3);
   *a1 = vaddq_s32(b1, b2);
   *a2 = vsubq_s32(b1, b2);
@@ -119,10 +119,14 @@
   c5 = vsubq_s64(c5, c9);
   c6 = vaddq_s64(c6, c10);
   c7 = vaddq_s64(c7, c11);
-  b0 = vcombine_s32(vrshrn_n_s64(c0, 14), vrshrn_n_s64(c1, 14));
-  b1 = vcombine_s32(vrshrn_n_s64(c2, 14), vrshrn_n_s64(c3, 14));
-  b2 = vcombine_s32(vrshrn_n_s64(c4, 14), vrshrn_n_s64(c5, 14));
-  b3 = vcombine_s32(vrshrn_n_s64(c6, 14), vrshrn_n_s64(c7, 14));
+  b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS),
+                    vrshrn_n_s64(c1, DCT_CONST_BITS));
+  b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS),
+                    vrshrn_n_s64(c3, DCT_CONST_BITS));
+  b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS),
+                    vrshrn_n_s64(c5, DCT_CONST_BITS));
+  b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS),
+                    vrshrn_n_s64(c7, DCT_CONST_BITS));
   *a0 = vaddq_s32(b0, b3);
   *a1 = vaddq_s32(b1, b2);
   *a2 = vsubq_s32(b1, b2);
--- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -82,18 +82,18 @@
   step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
   step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
   step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
-  step1[4] = vrshrq_n_s32(step1[4], 14);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
-  step1[7] = vrshrq_n_s32(step1[7], 14);
+  step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+  step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
 
   // stage 2
   step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
   step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
   step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
-  step2[1] = vrshrq_n_s32(step2[1], 14);
-  step2[2] = vrshrq_n_s32(step2[2], 14);
-  step2[3] = vrshrq_n_s32(step2[3], 14);
+  step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+  step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+  step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
 
   step2[4] = vaddq_s32(step1[4], step1[5]);
   step2[5] = vsubq_s32(step1[4], step1[5]);
@@ -109,8 +109,8 @@
   step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
   step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
   step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
 
   // stage 4
   *io0 = vaddq_s32(step1[0], step2[7]);
@@ -154,14 +154,14 @@
   t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1);
   t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0);
   t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
   step1[4] = vcombine_s32(t32[0], t32[1]);
   step1[5] = vcombine_s32(t32[2], t32[3]);
   step1[6] = vcombine_s32(t32[4], t32[5]);
@@ -174,12 +174,12 @@
   t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
   t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
   t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
   step2[1] = vcombine_s32(t32[2], t32[3]);
   step2[2] = vcombine_s32(t32[4], t32[5]);
   step2[3] = vcombine_s32(t32[6], t32[7]);
@@ -205,10 +205,10 @@
       vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
   t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
                           vget_high_s32(cospis0), 0);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
   step1[5] = vcombine_s32(t32[0], t32[1]);
   step1[6] = vcombine_s32(t32[2], t32[3]);
 
@@ -377,10 +377,10 @@
   step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0);
   step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1);
 
-  step1[4] = vrshrq_n_s32(step1[4], 14);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
-  step1[7] = vrshrq_n_s32(step1[7], 14);
+  step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+  step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
 
   // stage 2
   step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
@@ -392,10 +392,10 @@
   step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1);
   step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1);
 
-  step2[0] = vrshrq_n_s32(step2[0], 14);
-  step2[1] = vrshrq_n_s32(step2[1], 14);
-  step2[2] = vrshrq_n_s32(step2[2], 14);
-  step2[3] = vrshrq_n_s32(step2[3], 14);
+  step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS);
+  step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+  step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+  step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
 
   step2[4] = vaddq_s32(step1[4], step1[5]);
   step2[5] = vsubq_s32(step1[4], step1[5]);
@@ -411,8 +411,8 @@
   step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
   step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
   step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
 
   // stage 4
   *io0 = vaddq_s32(step1[0], step2[7]);
@@ -473,14 +473,14 @@
   t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0);
   t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1);
   t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
   step1[4] = vcombine_s32(t32[0], t32[1]);
   step1[5] = vcombine_s32(t32[2], t32[3]);
   step1[6] = vcombine_s32(t32[4], t32[5]);
@@ -501,14 +501,14 @@
   t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1);
   t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1);
   t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
   step2[0] = vcombine_s32(t32[0], t32[1]);
   step2[1] = vcombine_s32(t32[2], t32[3]);
   step2[2] = vcombine_s32(t32[4], t32[5]);
@@ -535,10 +535,10 @@
       vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
   t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
                           vget_high_s32(cospis0), 0);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
   step1[5] = vcombine_s32(t32[0], t32[1]);
   step1[6] = vcombine_s32(t32[2], t32[3]);
 
--- a/vpx_dsp/arm/idct16x16_add_neon.c
+++ b/vpx_dsp/arm/idct16x16_add_neon.c
@@ -16,8 +16,8 @@
 
 static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
                                 int16x4_t *const d1) {
-  *d0 = vrshrn_n_s32(t32[0], 14);
-  *d1 = vrshrn_n_s32(t32[1], 14);
+  *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
 }
 
 static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
--- a/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_add_neon.c
@@ -147,8 +147,10 @@
   q11s32 = vaddq_s32(q12s32, q11s32);
   q10s32 = vaddq_s32(q10s32, q15s32);
 
-  *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, 14), vrshrn_n_s32(q9s32, 14));
-  *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, 14), vrshrn_n_s32(q10s32, 14));
+  *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, DCT_CONST_BITS),
+                        vrshrn_n_s32(q9s32, DCT_CONST_BITS));
+  *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, DCT_CONST_BITS),
+                        vrshrn_n_s32(q10s32, DCT_CONST_BITS));
 }
 
 static INLINE void load_s16x8q(const int16_t *in, int16x8_t *s0, int16x8_t *s1,
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@@ -15,6 +15,7 @@
 
 #include "./vpx_config.h"
 #include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
 DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
@@ -93,14 +94,14 @@
 
 //------------------------------------------------------------------------------
 
-// Multiply a by a_const. Saturate, shift and narrow by 14.
+// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
 static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
                                                       const int16_t a_const) {
-  // Shift by 14 + rounding will be within 16 bits for well formed streams.
-  // See WRAPLOW and dct_const_round_shift for details.
+  // Shift by DCT_CONST_BITS + rounding will be within 16 bits for well formed
+  // streams. See WRAPLOW and dct_const_round_shift for details.
   // This instruction doubles the result and returns the high half, essentially
   // resulting in a right shift by 15. By multiplying the constant first that
-  // becomes a right shift by 14.
+  // becomes a right shift by DCT_CONST_BITS.
   // The largest possible value used here is
  // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728), which falls *just*
   // within the range of int16_t (+32767 / -32768) even when negated.
@@ -107,7 +108,7 @@
   return vqrdmulhq_n_s16(a, a_const * 2);
 }
 
-// Add a and b, then multiply by ab_const. Shift and narrow by 14.
+// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
 static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
     const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
  // In both add_ and its pair, sub_, the input for well-formed streams will be
@@ -121,10 +122,12 @@
   int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
   temp_low = vmulq_n_s32(temp_low, ab_const);
   temp_high = vmulq_n_s32(temp_high, ab_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
 }
 
-// Subtract b from a, then multiply by ab_const. Shift and narrow by 14.
+// Subtract b from a, then multiply by ab_const. Shift and narrow by
+// DCT_CONST_BITS.
 static INLINE int16x8_t sub_multiply_shift_and_narrow_s16(
     const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
   int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
@@ -131,11 +134,12 @@
   int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
   temp_low = vmulq_n_s32(temp_low, ab_const);
   temp_high = vmulq_n_s32(temp_high, ab_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
 }
 
 // Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
-// 14.
+// DCT_CONST_BITS.
 static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
     const int16x8_t a, const int16_t a_const, const int16x8_t b,
     const int16_t b_const) {
@@ -143,7 +147,8 @@
   int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const);
   temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const);
   temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
 }
 
 // Shift the output down by 6 and add it to the destination buffer.
@@ -233,10 +238,10 @@
   c3 = vmull_lane_s16(b2, cospis, 1);
   c2 = vmlsl_lane_s16(c2, b3, cospis, 1);
   c3 = vmlal_lane_s16(c3, b3, cospis, 3);
-  b0 = vrshrn_n_s32(c0, 14);
-  b1 = vrshrn_n_s32(c1, 14);
-  b2 = vrshrn_n_s32(c2, 14);
-  b3 = vrshrn_n_s32(c3, 14);
+  b0 = vrshrn_n_s32(c0, DCT_CONST_BITS);
+  b1 = vrshrn_n_s32(c1, DCT_CONST_BITS);
+  b2 = vrshrn_n_s32(c2, DCT_CONST_BITS);
+  b3 = vrshrn_n_s32(c3, DCT_CONST_BITS);
   d0 = vcombine_s16(b0, b1);
   d1 = vcombine_s16(b3, b2);
   *a0 = vaddq_s16(d0, d1);
@@ -278,8 +283,8 @@
   t32[1] = vmull_lane_s16(step2[6], cospis0, 2);
   t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2);
   t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2);
-  step1[5] = vrshrn_n_s32(t32[0], 14);
-  step1[6] = vrshrn_n_s32(t32[1], 14);
+  step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
 
   // stage 4
   *io0 = vadd_s16(step1[0], step2[7]);
@@ -337,10 +342,10 @@
   t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
   t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
   t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
   step1[5] = vcombine_s16(t16[0], t16[1]);
   step1[6] = vcombine_s16(t16[2], t16[3]);
 
@@ -405,14 +410,14 @@
   t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2);
   t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3);
   t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
-  t16[4] = vrshrn_n_s32(t32[4], 14);
-  t16[5] = vrshrn_n_s32(t32[5], 14);
-  t16[6] = vrshrn_n_s32(t32[6], 14);
-  t16[7] = vrshrn_n_s32(t32[7], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
+  t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
+  t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
+  t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
+  t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
   step1[4] = vcombine_s16(t16[0], t16[1]);
   step1[5] = vcombine_s16(t16[2], t16[3]);
   step1[6] = vcombine_s16(t16[4], t16[5]);
@@ -433,14 +438,14 @@
   t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1);
   t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3);
   t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
-  t16[4] = vrshrn_n_s32(t32[4], 14);
-  t16[5] = vrshrn_n_s32(t32[5], 14);
-  t16[6] = vrshrn_n_s32(t32[6], 14);
-  t16[7] = vrshrn_n_s32(t32[7], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
+  t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
+  t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
+  t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
+  t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
   step2[0] = vcombine_s16(t16[0], t16[1]);
   step2[1] = vcombine_s16(t16[2], t16[3]);
   step2[2] = vcombine_s16(t16[4], t16[5]);
@@ -463,10 +468,10 @@
   t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
   t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
   t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
   step1[5] = vcombine_s16(t16[0], t16[1]);
   step1[6] = vcombine_s16(t16[2], t16[3]);
 
@@ -486,10 +491,10 @@
                                               int16x8_t *const d1) {
   int16x4_t t16[4];
 
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
   *d0 = vcombine_s16(t16[0], t16[1]);
   *d1 = vcombine_s16(t16[2], t16[3]);
 }
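
As noted in the idct_neon.h comments above, multiply_shift_and_narrow_s16 pre-doubles the cosine constant because vqrdmulhq_n_s16 is a saturating rounding doubling multiply that keeps the high half, an effective right shift by 15; passing a_const * 2 therefore produces the same result as a rounded shift by DCT_CONST_BITS. A minimal scalar sketch of that equivalence, assuming arithmetic right shifts of negative values as on typical targets (helper names are illustrative, not from the tree):

#include <assert.h>
#include <stdint.h>

#define DCT_CONST_BITS 14

/* Scalar model of one lane of vqrdmulhq_n_s16: saturating rounding doubling
 * multiply returning the high half, i.e. (2*a*b + (1 << 15)) >> 16, clamped
 * to the int16_t range. */
static int16_t sqrdmulh(int16_t a, int16_t b) {
  int32_t r = (2 * (int32_t)a * b + (1 << 15)) >> 16;
  if (r > INT16_MAX) r = INT16_MAX;
  if (r < INT16_MIN) r = INT16_MIN;
  return (int16_t)r;
}

/* Reference path: multiply, then rounding shift by DCT_CONST_BITS, as the
 * vrshrn_n_s32/vrshrn_n_s64 calls in the patch do. */
static int16_t mul_round_shift(int16_t a, int16_t c) {
  return (int16_t)(((int32_t)a * c + (1 << (DCT_CONST_BITS - 1))) >>
                   DCT_CONST_BITS);
}

int main(void) {
  /* Largest constant used: 2 * 16364 = 32728 still fits in int16_t. */
  const int16_t cospi_1_64 = 16364;
  int16_t a;
  for (a = -1024; a <= 1024; ++a) {
    /* Pre-doubling the constant turns the >> 15 of the doubling multiply
     * into an effective >> DCT_CONST_BITS. */
    assert(sqrdmulh(a, (int16_t)(cospi_1_64 * 2)) ==
           mul_round_shift(a, cospi_1_64));
  }
  return 0;
}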