shithub: libvpx

Download patch

ref: 9c72e85e4cfc87a4346701139bc25a56d43761c0
parent: cbb991b6b862a4c3b304a2a01261d5199ad480ce
author: Linfeng Zhang <[email protected]>
date: Mon Jun 12 11:45:50 EDT 2017

Remove array_transpose_8x8() in x86

array_transpose_8x8() is a duplicate of transpose_16bit_8x8(); all callers now use the latter.

Change-Id: Iaa5dd63b5cccb044974a65af22c90e13418e311f

--- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
@@ -15,6 +15,7 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/x86/fwd_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 #include "vpx_ports/mem.h"
 
@@ -706,58 +707,6 @@
   store_output(&res[7], (output + 7 * stride));
 }
 
-// perform in-place transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-  // 00 10 01 11 02 12 03 13
-  // 20 30 21 31 22 32 23 33
-  // 04 14 05 15 06 16 07 17
-  // 24 34 25 35 26 36 27 37
-  // 40 50 41 51 42 52 43 53
-  // 60 70 61 71 62 72 63 73
-  // 44 54 45 55 46 56 47 57
-  // 64 74 65 75 66 76 67 77
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-  // 00 10 20 30 01 11 21 31
-  // 40 50 60 70 41 51 61 71
-  // 02 12 22 32 03 13 23 33
-  // 42 52 62 72 43 53 63 73
-  // 04 14 24 34 05 15 25 35
-  // 44 54 64 74 45 55 65 75
-  // 06 16 26 36 07 17 27 37
-  // 46 56 66 76 47 57 67 77
-  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
-  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
-  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
-  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
-  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
-  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
-  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
-  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-  // 00 10 20 30 40 50 60 70
-  // 01 11 21 31 41 51 61 71
-  // 02 12 22 32 42 52 62 72
-  // 03 13 23 33 43 53 63 73
-  // 04 14 24 34 44 54 64 74
-  // 05 15 25 35 45 55 65 75
-  // 06 16 26 36 46 56 66 76
-  // 07 17 27 37 47 57 67 77
-}
-
 static void fdct8_sse2(__m128i *in) {
   // constants
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
@@ -895,7 +844,7 @@
   in[7] = _mm_packs_epi32(v6, v7);
 
   // transpose
-  array_transpose_8x8(in, in);
+  transpose_16bit_8x8(in, in);
 }
 
 static void fadst8_sse2(__m128i *in) {
@@ -1125,7 +1074,7 @@
   in[7] = _mm_sub_epi16(k__const_0, s1);
 
   // transpose
-  array_transpose_8x8(in, in);
+  transpose_16bit_8x8(in, in);
 }
 
 void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
@@ -1184,10 +1133,10 @@
 
 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
   __m128i tbuf[8];
-  array_transpose_8x8(res0, res0);
-  array_transpose_8x8(res1, tbuf);
-  array_transpose_8x8(res0 + 8, res1);
-  array_transpose_8x8(res1 + 8, res1 + 8);
+  transpose_16bit_8x8(res0, res0);
+  transpose_16bit_8x8(res1, tbuf);
+  transpose_16bit_8x8(res0 + 8, res1);
+  transpose_16bit_8x8(res1 + 8, res1 + 8);
 
   res0[8] = tbuf[0];
   res0[9] = tbuf[1];
--- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -182,8 +182,8 @@
 
     if (test) {
       // Use fact only first 4 rows contain non-zero coeffs
-      array_transpose_8x8(inptr, inptr);
-      array_transpose_8x8(inptr + 8, inptr + 16);
+      transpose_16bit_8x8(inptr, inptr);
+      transpose_16bit_8x8(inptr + 8, inptr + 16);
       for (i = 0; i < 4; i++) {
         sign_bits = _mm_cmplt_epi16(inptr[i], zero);
         temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -63,7 +63,7 @@
     test = _mm_movemask_epi8(temp1);
 
     if (test) {
-      array_transpose_8x8(inptr, inptr);
+      transpose_16bit_8x8(inptr, inptr);
       for (i = 0; i < 8; i++) {
         sign_bits = _mm_cmplt_epi16(inptr[i], zero);
         temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -325,7 +325,7 @@
   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
 
   // transpose
-  array_transpose_8x8(in, in);
+  transpose_16bit_8x8(in, in);
 
   // properly aligned for butterfly input
   in0 = in[7];
@@ -787,8 +787,8 @@
     in[7] = load_input_data(input + 8 * 14);
     in[15] = load_input_data(input + 8 * 15);
 
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
+    transpose_16bit_8x8(in, in);
+    transpose_16bit_8x8(in + 8, in + 8);
 
     IDCT16
 
@@ -816,8 +816,8 @@
   for (i = 0; i < 2; i++) {
     int j;
     // 1-D idct
-    array_transpose_8x8(l + i * 8, in);
-    array_transpose_8x8(r + i * 8, in + 8);
+    transpose_16bit_8x8(l + i * 8, in);
+    transpose_16bit_8x8(r + i * 8, in + 8);
 
     IDCT16
 
@@ -2131,7 +2131,7 @@
   in[6] = load_input_data(input + 192);
   in[7] = load_input_data(input + 224);
 
-  array_transpose_8x8(in, in);
+  transpose_16bit_8x8(in, in);
   IDCT32_34
 
   // 1_D: Store 32 intermediate results for each 8x32 block.
@@ -2170,7 +2170,7 @@
   for (i = 0; i < 4; i++) {
     int j;
     // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + i * 8, in);
+    transpose_16bit_8x8(col + i * 8, in);
     IDCT32_34
 
     // 2_D: Calculate the results and store them to destination.
@@ -2392,10 +2392,10 @@
     }
 
     // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-    array_transpose_8x8(in + 16, in + 16);
-    array_transpose_8x8(in + 24, in + 24);
+    transpose_16bit_8x8(in, in);
+    transpose_16bit_8x8(in + 8, in + 8);
+    transpose_16bit_8x8(in + 16, in + 16);
+    transpose_16bit_8x8(in + 24, in + 24);
 
     IDCT32
 
@@ -2438,10 +2438,10 @@
     j = i << 3;
 
     // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + j, in);
-    array_transpose_8x8(col + j + 32, in + 8);
-    array_transpose_8x8(col + j + 64, in + 16);
-    array_transpose_8x8(col + j + 96, in + 24);
+    transpose_16bit_8x8(col + j, in);
+    transpose_16bit_8x8(col + j + 32, in + 8);
+    transpose_16bit_8x8(col + j + 64, in + 16);
+    transpose_16bit_8x8(col + j + 96, in + 24);
 
     IDCT32
 
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -15,38 +15,9 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 
-// perform 8x8 transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-
-  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
-  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
-  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
-  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
-  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
-  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
-  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
-  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-}
-
 static INLINE void idct8x8_12_transpose_16bit_4x8(const __m128i *const in,
                                                   __m128i *const out) {
   // Unpack 16 bit elements. Goes from:
@@ -104,10 +75,10 @@
 
 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
   __m128i tbuf[8];
-  array_transpose_8x8(res0, res0);
-  array_transpose_8x8(res1, tbuf);
-  array_transpose_8x8(res0 + 8, res1);
-  array_transpose_8x8(res1 + 8, res1 + 8);
+  transpose_16bit_8x8(res0, res0);
+  transpose_16bit_8x8(res1, tbuf);
+  transpose_16bit_8x8(res0 + 8, res1);
+  transpose_16bit_8x8(res1 + 8, res1 + 8);
 
   res0[8] = tbuf[0];
   res0[9] = tbuf[1];
--- a/vpx_dsp/x86/inv_txfm_ssse3.c
+++ b/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -632,7 +632,7 @@
   in[6] = load_input_data(input + 192);
   in[7] = load_input_data(input + 224);
 
-  array_transpose_8x8(in, in);
+  transpose_16bit_8x8(in, in);
   idct32_34_first_half(in, stp1);
   idct32_34_second_half(in, stp1);
 
@@ -641,7 +641,7 @@
   for (i = 0; i < 4; i++) {
     int j;
     // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + i * 8, in);
+    transpose_16bit_8x8(col + i * 8, in);
     idct32_34_first_half(in, stp1);
     idct32_34_second_half(in, stp1);
 
@@ -672,10 +672,10 @@
 
 static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0,
                                     __m128i *out1) {
-  array_transpose_8x8(in0, out0);
-  array_transpose_8x8(&in0[8], out1);
-  array_transpose_8x8(in1, &out0[8]);
-  array_transpose_8x8(&in1[8], &out1[8]);
+  transpose_16bit_8x8(in0, out0);
+  transpose_16bit_8x8(&in0[8], out1);
+  transpose_16bit_8x8(in1, &out0[8]);
+  transpose_16bit_8x8(&in1[8], &out1[8]);
 }
 
 // Group the coefficient calculation into smaller functions
@@ -1306,10 +1306,10 @@
     input += 32 << 3;
 
     // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-    array_transpose_8x8(in + 16, in + 16);
-    array_transpose_8x8(in + 24, in + 24);
+    transpose_16bit_8x8(in, in);
+    transpose_16bit_8x8(in + 8, in + 8);
+    transpose_16bit_8x8(in + 16, in + 16);
+    transpose_16bit_8x8(in + 24, in + 24);
 
     idct32_full_8x32(in, col + (i << 5));
   }
@@ -1318,10 +1318,10 @@
   for (i = 0; i < 4; ++i) {
     j = i << 3;
     // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + j, in);
-    array_transpose_8x8(col + j + 32, in + 8);
-    array_transpose_8x8(col + j + 64, in + 16);
-    array_transpose_8x8(col + j + 96, in + 24);
+    transpose_16bit_8x8(col + j, in);
+    transpose_16bit_8x8(col + j + 32, in + 8);
+    transpose_16bit_8x8(col + j + 64, in + 16);
+    transpose_16bit_8x8(col + j + 96, in + 24);
 
     idct32_full_8x32(in, in);
     store_buffer_8x32(in, dest, stride);