ref: 6ff5de68ddae842a92459ac0ee39fc141f92bdf9
parent: ee1fcb0e694747f1d6f9e7ffd095ed645d827e9d
parent: b8a4b5dd8d2ae895ff08e88c2cd2b9b8c8bf17c5
author: Linfeng Zhang <[email protected]>
date: Fri Jun 23 20:51:07 EDT 2017
Merge "Cosmetics, 8x8 idct SSE2 optimization"
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -175,46 +175,52 @@
}
static INLINE void idct8(const __m128i *const in, __m128i *const out) {
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- /* Stage1 */
- multiplication_and_add(&in[1], &in[7], &in[3], &in[5], &stg1_0, &stg1_1,
- &stg1_2, &stg1_3, &stp1_4, &stp1_7, &stp1_5, &stp1_6);
+ const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64);  // used in stages 2 and 3
+ const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);  // used in stages 2 and 3
+ __m128i step1[8], step2[8];
- /* Stage2 */
- multiplication_and_add(&in[0], &in[4], &in[2], &in[6], &stg2_0, &stg2_1,
- &stg2_2, &stg2_3, &stp2_0, &stp2_1, &stp2_2, &stp2_3);
+ // stage 1: odd inputs in[1,7,3,5] -> step1[4,7,5,6]
+ {
+ const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ multiplication_and_add(&in[1], &in[7], &in[3], &in[5], &cp_28_n4, &cp_4_28,
+ &cp_n20_12, &cp_12_20, &step1[4], &step1[7],
+ &step1[5], &step1[6]);
+ }
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+ // stage 2: even inputs in[0,4,2,6] -> step2[0..3]; add/sub step1[4..7] -> step2[4..7]
+ {
+ const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ multiplication_and_add(&in[0], &in[4], &in[2], &in[6], &cp_16_16,
+ &cp_16_n16, &cp_24_n8, &cp_8_24, &step2[0],
+ &step2[1], &step2[2], &step2[3]);
+ }
- /* Stage3 */
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
- multiplication_and_add_2(&stp2_6, &stp2_5, &stg2_1, &stg2_0, &stp1_5,
- &stp1_6);
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
- /* Stage4 */
- out[0] = _mm_add_epi16(stp1_0, stp2_7);
- out[1] = _mm_add_epi16(stp1_1, stp1_6);
- out[2] = _mm_add_epi16(stp1_2, stp1_5);
- out[3] = _mm_add_epi16(stp1_3, stp2_4);
- out[4] = _mm_sub_epi16(stp1_3, stp2_4);
- out[5] = _mm_sub_epi16(stp1_2, stp1_5);
- out[6] = _mm_sub_epi16(stp1_1, stp1_6);
- out[7] = _mm_sub_epi16(stp1_0, stp2_7);
+ // stage 3: add/sub step2[0..3] -> step1[0..3]; step2[6,5] -> step1[5,6]
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16,
+ &step1[5], &step1[6]);
+
+ // stage 4: out[k] / out[7 - k] = sum / difference of the same operand pair
+ out[0] = _mm_add_epi16(step1[0], step2[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step2[4]);
+ out[4] = _mm_sub_epi16(step1[3], step2[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step2[7]);
}
void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
@@ -481,70 +487,59 @@
void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
const __m128i zero = _mm_setzero_si128();
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ __m128i in[8], step1[8], step2[8], tmp[4];
- __m128i in[8];
- __m128i stp1_2, stp1_3, stp1_4, stp1_5;
- __m128i stp2_0, stp2_2, stp2_4, stp2_5, stp2_6;
- __m128i tmp[4];
+ in[0] = load_input_data(input + 0 * 8);
+ in[1] = load_input_data(input + 1 * 8);
+ in[2] = load_input_data(input + 2 * 8);
+ in[3] = load_input_data(input + 3 * 8);
- // Rows. Load 4-row input data.
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8 * 1);
- in[2] = load_input_data(input + 8 * 2);
- in[3] = load_input_data(input + 8 * 3);
-
- // 8x4 Transpose
transpose_16bit_4x4(in, in);
- // Stage1
- {
- const __m128i lo_17 = _mm_unpackhi_epi16(in[0], zero);
- const __m128i lo_35 = _mm_unpackhi_epi16(in[1], zero);
+ // in[0]: 00 10 20 30 01 11 21 31
+ // in[1]: 02 12 22 32 03 13 23 33
- stp1_4 = idct_calc_wraplow_sse2(stg1_0, stg1_1, lo_17);
- stp1_5 = idct_calc_wraplow_sse2(stg1_2, stg1_3, lo_35);
+ // stage 1
+ {
+ const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i lo_1 = _mm_unpackhi_epi16(in[0], zero);
+ const __m128i lo_3 = _mm_unpackhi_epi16(in[1], zero);
+ step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1); // step1 4&7
+ step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3); // step1 5&6
}
- // Stage2
+ // stage 2
{
- const __m128i lo_04 = _mm_unpacklo_epi16(in[0], zero);
- const __m128i lo_26 = _mm_unpacklo_epi16(in[1], zero);
-
- stp2_0 = idct_calc_wraplow_sse2(stg2_0, stg2_1, lo_04);
- stp2_2 = idct_calc_wraplow_sse2(stg2_3, stg2_2, lo_26);
-
- tmp[0] = _mm_add_epi16(stp1_4, stp1_5);
- tmp[1] = _mm_sub_epi16(stp1_4, stp1_5);
-
- stp2_4 = tmp[0];
- stp2_5 = _mm_unpacklo_epi64(tmp[1], zero);
- stp2_6 = _mm_unpackhi_epi64(tmp[1], zero);
+ const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i lo_0 = _mm_unpacklo_epi16(in[0], zero);
+ const __m128i lo_2 = _mm_unpacklo_epi16(in[1], zero);
+ step2[0] = idct_calc_wraplow_sse2(cp_16_16, cp_16_n16, lo_0); // step2 0&1
+ step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2); // step2 3&2
+ step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6
+ step2[6] = _mm_unpackhi_epi64(step2[5], zero); // step2 6
}
- // Stage3
+ // stage 3
{
- const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-
- tmp[0] = _mm_add_epi16(stp2_0, stp2_2);
- tmp[1] = _mm_sub_epi16(stp2_0, stp2_2);
- stp1_2 = _mm_unpackhi_epi64(tmp[1], tmp[0]);
- stp1_3 = _mm_unpacklo_epi64(tmp[1], tmp[0]);
- stp1_5 = idct_calc_wraplow_sse2(stg3_0, stg2_0, lo_56); // stg3_1 = stg2_0
+ const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]);
+ tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1
+ tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2
+ step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1
+ step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0
+ step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65); // step1 5&6
}
- // Stage4
- tmp[0] = _mm_add_epi16(stp1_3, stp2_4);
- tmp[1] = _mm_add_epi16(stp1_2, stp1_5);
- tmp[2] = _mm_sub_epi16(stp1_3, stp2_4);
- tmp[3] = _mm_sub_epi16(stp1_2, stp1_5);
+ // stage 4
+ tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0
+ tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1
+ tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7
+ tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6
idct8x8_12_transpose_16bit_4x8(tmp, in);
in[4] = in[5] = in[6] = in[7] = zero;