shithub: libvpx

--- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c

+++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c

@@ -1131,23 +1131,6 @@

   write_buffer_8x8(output + 8 * stride, in1 + 8, stride);

-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {

-  __m128i tbuf[8];

-  transpose_16bit_8x8(res0, res0);

-  transpose_16bit_8x8(res1, tbuf);

-  transpose_16bit_8x8(res0 + 8, res1);

-  transpose_16bit_8x8(res1 + 8, res1 + 8);

-  res0[8] = tbuf[0];

-  res0[9] = tbuf[1];

-  res0[10] = tbuf[2];

-  res0[11] = tbuf[3];

-  res0[12] = tbuf[4];

-  res0[13] = tbuf[5];

-  res0[14] = tbuf[6];

-  res0[15] = tbuf[7];

-}

 static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {

   // perform rounding operations

   right_shift_8x8(res0, 2);

@@ -1951,13 +1934,13 @@

 static void fdct16_sse2(__m128i *in0, __m128i *in1) {

   fdct16_8col(in0);

   fdct16_8col(in1);

-  array_transpose_16x16(in0, in1);

+  transpose_16bit_16x16(in0, in1);

 static void fadst16_sse2(__m128i *in0, __m128i *in1) {

   fadst16_8col(in0);

   fadst16_8col(in1);

-  array_transpose_16x16(in0, in1);

+  transpose_16bit_16x16(in0, in1);

 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,

--- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c

+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c

@@ -66,7 +66,7 @@

     test = _mm_movemask_epi8(temp1);

     if (test) {

-      array_transpose_16x16(inptr, inptr + 16);

+      transpose_16bit_16x16(inptr, inptr + 16);

       for (i = 0; i < 16; i++) {

         sign_bits = _mm_cmplt_epi16(inptr[i], zero);

         temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);

--- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c

+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c

@@ -165,7 +165,7 @@

     if (test) {

       // Use fact only first 4 rows contain non-zero coeffs

-      array_transpose_4X8(inptr, inptr);

+      transpose_16bit_4x8(inptr, inptr);

       for (i = 0; i < 4; i++) {

         sign_bits = _mm_cmplt_epi16(inptr[i], zero);

         temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);

--- a/vpx_dsp/x86/inv_txfm_sse2.c

+++ b/vpx_dsp/x86/inv_txfm_sse2.c

@@ -1462,13 +1462,13 @@

 void idct16_sse2(__m128i *in0, __m128i *in1) {

-  array_transpose_16x16(in0, in1);

+  transpose_16bit_16x16(in0, in1);

   idct16_8col(in0);

   idct16_8col(in1);

 void iadst16_sse2(__m128i *in0, __m128i *in1) {

-  array_transpose_16x16(in0, in1);

+  transpose_16bit_16x16(in0, in1);

   iadst16_8col(in0);

   iadst16_8col(in1);

@@ -1616,7 +1616,7 @@

   // Second 1-D inverse transform, performed per 8x16 block

   for (i = 0; i < 2; i++) {

     int j;

-    array_transpose_4X8(l + 8 * i, in);

+    transpose_16bit_4x8(l + 8 * i, in);

     IDCT16_10

--- a/vpx_dsp/x86/inv_txfm_sse2.h

+++ b/vpx_dsp/x86/inv_txfm_sse2.h

@@ -56,40 +56,6 @@

   out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);

-static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {

-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);

-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);

-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);

-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);

-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);

-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);

-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);

-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);

-  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);

-  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);

-  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);

-  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);

-}

-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {

-  __m128i tbuf[8];

-  transpose_16bit_8x8(res0, res0);

-  transpose_16bit_8x8(res1, tbuf);

-  transpose_16bit_8x8(res0 + 8, res1);

-  transpose_16bit_8x8(res1 + 8, res1 + 8);

-  res0[8] = tbuf[0];

-  res0[9] = tbuf[1];

-  res0[10] = tbuf[2];

-  res0[11] = tbuf[3];

-  res0[12] = tbuf[4];

-  res0[13] = tbuf[5];

-  res0[14] = tbuf[6];

-  res0[15] = tbuf[7];

-}

 static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) {

   const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING));

   return _mm_srai_epi32(t, DCT_CONST_BITS);

--- a/vpx_dsp/x86/inv_txfm_ssse3.c

+++ b/vpx_dsp/x86/inv_txfm_ssse3.c

@@ -670,14 +670,6 @@

-static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0,

-                                    __m128i *out1) {

-  transpose_16bit_8x8(in0, out0);

-  transpose_16bit_8x8(&in0[8], out1);

-  transpose_16bit_8x8(in1, &out0[8]);

-  transpose_16bit_8x8(&in1[8], &out1[8]);

-}

 // Group the coefficient calculation into smaller functions

 // to prevent stack spillover:

 // quarter_1: 0-7

@@ -986,7 +978,7 @@

   switch (cols) {

     case left_16: {

       int i;

-      array_transpose_16x16(in0, in1);

+      transpose_16bit_16x16(in0, in1);

       for (i = 0; i < 16; ++i) {

         store[i] = in0[16 + i];

         store[16 + i] = in1[16 + i];

@@ -994,7 +986,10 @@

       break;

     case right_16: {

-      array_transpose_16x16_2(store, &store[16], in0, in1);

+      transpose_16bit_8x8(store, in0);

+      transpose_16bit_8x8(&store[8], in1);

+      transpose_16bit_8x8(&store[16], &in0[8]);

+      transpose_16bit_8x8(&store[24], &in1[8]);

       break;

     default: { assert(0); }

@@ -1013,7 +1008,7 @@

   load_buffer_16x16(input, col0, col1);

   // columns

-  array_transpose_16x16(col0, col1);

+  transpose_16bit_16x16(col0, col1);

   idct32_135(col0, col1);

   // rows

--- a/vpx_dsp/x86/transpose_sse2.h

+++ b/vpx_dsp/x86/transpose_sse2.h

@@ -33,6 +33,48 @@

   out[1] = _mm_unpackhi_epi32(tr0_0, tr0_1);

+// Transposes a block of 8 rows by 4 columns of 16-bit elements. Only the low

+// four 16-bit lanes of in[0]..in[7] are used (the XX lanes below are ignored).

+// The four transposed rows, each eight 16-bit elements wide, are written to

+// out[0]..out[3]. Every input is read before any output is stored, so 'out'

+// may point to the same array as 'in'.

+static INLINE void transpose_16bit_4x8(const __m128i *const in,

+                                       __m128i *const out) {

+  // Unpack 16 bit elements. Goes from:

+  // in[0]: 00 01 02 03  XX XX XX XX

+  // in[1]: 10 11 12 13  XX XX XX XX

+  // in[2]: 20 21 22 23  XX XX XX XX

+  // in[3]: 30 31 32 33  XX XX XX XX

+  // in[4]: 40 41 42 43  XX XX XX XX

+  // in[5]: 50 51 52 53  XX XX XX XX

+  // in[6]: 60 61 62 63  XX XX XX XX

+  // in[7]: 70 71 72 73  XX XX XX XX

+  // to:

+  // tr0_0: 00 10 01 11  02 12 03 13

+  // tr0_1: 20 30 21 31  22 32 23 33

+  // tr0_2: 40 50 41 51  42 52 43 53

+  // tr0_3: 60 70 61 71  62 72 63 73

+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);

+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);

+  const __m128i tr0_2 = _mm_unpacklo_epi16(in[4], in[5]);

+  const __m128i tr0_3 = _mm_unpacklo_epi16(in[6], in[7]);

+  // Unpack 32 bit elements resulting in:

+  // tr1_0: 00 10 20 30  01 11 21 31

+  // tr1_1: 40 50 60 70  41 51 61 71

+  // tr1_2: 02 12 22 32  03 13 23 33

+  // tr1_3: 42 52 62 72  43 53 63 73

+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);

+  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);

+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);

+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);

+  // Unpack 64 bit elements resulting in:

+  // out[0]: 00 10 20 30  40 50 60 70

+  // out[1]: 01 11 21 31  41 51 61 71

+  // out[2]: 02 12 22 32  42 52 62 72

+  // out[3]: 03 13 23 33  43 53 63 73

+  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);

+  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);

+  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);

+  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);

+}

 static INLINE void transpose_16bit_8x8(const __m128i *const in,

                                        __m128i *const out) {

   // Unpack 16 bit elements. Goes from:

@@ -97,6 +139,25 @@

   out[5] = _mm_unpackhi_epi64(tr1_2, tr1_3);

   out[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);

   out[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);

+}

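+// Transpose a 16x16 block of 16-bit elements in-place. 'left' and 'right' each

+// point to 16 vectors: the left and right 8 columns of the 16 rows.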

+static INLINE void transpose_16bit_16x16(__m128i *const left,

+                                         __m128i *const right) {

+  __m128i tbuf[8];

+  transpose_16bit_8x8(left, left);

+  transpose_16bit_8x8(right, tbuf);

+  transpose_16bit_8x8(left + 8, right);

+  transpose_16bit_8x8(right + 8, right + 8);

+  left[8] = tbuf[0];

+  left[9] = tbuf[1];

+  left[10] = tbuf[2];

+  left[11] = tbuf[3];

+  left[12] = tbuf[4];

+  left[13] = tbuf[5];

+  left[14] = tbuf[6];

+  left[15] = tbuf[7];

 static INLINE void transpose_32bit_4x4(__m128i *const a0, __m128i *const a1,