shithub: libvpx

Download patch

ref: 6fc2e57c2ca6c9a7b1eecb2c7d93b65222b6727d
parent: fb60204d4c36a4041daaca2f1461b731fa2dfaa2
author: Linfeng Zhang <[email protected]>
date: Wed Apr 5 10:41:35 EDT 2017

Update 32x32 high bitdepth idct NEON optimization

Preparation for the CONVERT_TO_BYTEPTR/CONVERT_TO_SHORTPTR cleanup.

BUG=webm:1388

Change-Id: I928d30a5698023bb90888d783cf81c51ec183760

--- a/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
@@ -726,9 +726,10 @@
   highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
 }
 
-void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest,
+void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest8,
                                        int stride, int bd) {
   int i;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   if (bd == 8) {
     int16_t temp[32 * 16];
@@ -742,7 +743,6 @@
       dest += 8;
     }
   } else {
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dest);
     int32_t temp[32 * 16];
     int32_t *t = temp;
     vpx_highbd_idct32_12_neon(input, temp);
@@ -749,9 +749,9 @@
     vpx_highbd_idct32_12_neon(input + 32 * 8, temp + 8);
 
     for (i = 0; i < 32; i += 8) {
-      vpx_highbd_idct32_16_neon(t, dst, stride, bd);
+      vpx_highbd_idct32_16_neon(t, dest, stride, bd);
       t += (16 * 8);
-      dst += 8;
+      dest += 8;
     }
   }
 }
--- a/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
@@ -594,9 +594,10 @@
   highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
 }
 
-void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest,
+void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest8,
                                       int stride, int bd) {
   int i;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   if (bd == 8) {
     int16_t temp[32 * 8];
@@ -610,7 +611,6 @@
       dest += 8;
     }
   } else {
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dest);
     int32_t temp[32 * 8];
     int32_t *t = temp;
 
@@ -617,9 +617,9 @@
     vpx_highbd_idct32_6_neon(input, t);
 
     for (i = 0; i < 32; i += 8) {
-      vpx_highbd_idct32_8_neon(t, dst, stride, bd);
+      vpx_highbd_idct32_8_neon(t, dest, stride, bd);
       t += (8 * 8);
-      dst += 8;
+      dest += 8;
     }
   }
 }
--- a/vpx_dsp/arm/idct32x32_135_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_135_add_neon.c
@@ -371,7 +371,7 @@
   vst1q_s16(output, vsubq_s16(s7[0], s6[31]));
 }
 
-void vpx_idct32_16_neon(const int16_t *const input, uint8_t *const output,
+void vpx_idct32_16_neon(const int16_t *const input, void *const output,
                         const int stride, const int highbd_flag) {
   int16x8_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
       out[32];
@@ -646,17 +646,17 @@
   out[31] = final_sub(s7[0], s6[31]);
 
   if (highbd_flag) {
-    uint16_t *const outputT = CONVERT_TO_SHORTPTR(output);
-    highbd_add_and_store_bd8(out, outputT, stride);
+    highbd_add_and_store_bd8(out, output, stride);
   } else {
+    uint8_t *const outputT = (uint8_t *)output;
     add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
-                         out[7], output, stride);
+                         out[7], outputT, stride);
     add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13],
-                         out[14], out[15], output + (8 * stride), stride);
+                         out[14], out[15], outputT + (8 * stride), stride);
     add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21],
-                         out[22], out[23], output + (16 * stride), stride);
+                         out[22], out[23], outputT + (16 * stride), stride);
     add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29],
-                         out[30], out[31], output + (24 * stride), stride);
+                         out[30], out[31], outputT + (24 * stride), stride);
   }
 }
 
--- a/vpx_dsp/arm/idct32x32_34_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_34_add_neon.c
@@ -265,7 +265,7 @@
   vst1q_s16(output, vsubq_s16(s1[0], s2[31]));
 }
 
-void vpx_idct32_8_neon(const int16_t *input, uint8_t *output, int stride,
+void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride,
                        const int highbd_flag) {
   int16x8_t in[8], s1[32], s2[32], s3[32], out[32];
 
@@ -486,17 +486,17 @@
   out[31] = final_sub(s1[0], s2[31]);
 
   if (highbd_flag) {
-    uint16_t *const outputT = CONVERT_TO_SHORTPTR(output);
-    highbd_add_and_store_bd8(out, outputT, stride);
+    highbd_add_and_store_bd8(out, output, stride);
   } else {
+    uint8_t *const outputT = (uint8_t *)output;
     add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6],
-                         out[7], output, stride);
+                         out[7], outputT, stride);
     add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13],
-                         out[14], out[15], output + (8 * stride), stride);
+                         out[14], out[15], outputT + (8 * stride), stride);
     add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21],
-                         out[22], out[23], output + (16 * stride), stride);
+                         out[22], out[23], outputT + (16 * stride), stride);
     add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29],
-                         out[30], out[31], output + (24 * stride), stride);
+                         out[30], out[31], outputT + (24 * stride), stride);
   }
 }
 
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@@ -982,11 +982,11 @@
                         const int stride, const int highbd_flag);
 
 void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output);
-void vpx_idct32_16_neon(const int16_t *const input, uint8_t *const output,
+void vpx_idct32_16_neon(const int16_t *const input, void *const output,
                         const int stride, const int highbd_flag);
 
 void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output);
-void vpx_idct32_8_neon(const int16_t *input, uint8_t *output, int stride,
+void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride,
                        const int highbd_flag);
 
 #endif  // VPX_DSP_ARM_IDCT_NEON_H_