ref: aac1ef7f80151bff1be06a32361d9cf6c31ef455
parent: 8fdab8a4a4374bff18c02560eb52a6d1b6adc48e
author: Yaowu Xu <[email protected]>
date: Wed Jan 27 10:25:42 EST 2016
Enable hbd_build to use SSSE3 optimized functions. This commit changes the SSSE3 assembly functions for idct32x32 to support high bitdepth builds. On test clip fdJc1_IBKJA.248.webm, this cuts the speed difference between hbd and lbd builds from 3-4% to 1-2%. Change-Id: Ic3390e0113bc1ca5bba8ec80d1795ad31b484fca
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -803,14 +803,15 @@
specialize qw/vpx_idct16x16_1_add sse2/;
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct32x32_1024_add sse2/;
+ specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct32x32_135_add sse2/;
+ specialize qw/vpx_idct32x32_135_add sse2/, "$ssse3_x86_64_x86inc";
+ # TODO: add a dedicated SSE2 idct32x32 for the 135-eob case; until then it aliases the 1024 version.
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct32x32_34_add sse2/;
+ specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1_add sse2/;
--- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -765,7 +765,25 @@
lea r4, [rsp + transposed_in]
idct32x32_34_transpose:
+%if CONFIG_VP9_HIGHBITDEPTH
mova m0, [r3 + 0]
+ packssdw m0, [r3 + 16]
+ mova m1, [r3 + 32 * 4]
+ packssdw m1, [r3 + 32 * 4 + 16]
+ mova m2, [r3 + 32 * 8]
+ packssdw m2, [r3 + 32 * 8 + 16]
+ mova m3, [r3 + 32 * 12]
+ packssdw m3, [r3 + 32 * 12 + 16]
+ mova m4, [r3 + 32 * 16]
+ packssdw m4, [r3 + 32 * 16 + 16]
+ mova m5, [r3 + 32 * 20]
+ packssdw m5, [r3 + 32 * 20 + 16]
+ mova m6, [r3 + 32 * 24]
+ packssdw m6, [r3 + 32 * 24 + 16]
+ mova m7, [r3 + 32 * 28]
+ packssdw m7, [r3 + 32 * 28 + 16]
+%else
+ mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8]
mova m3, [r3 + 16 * 12]
@@ -773,6 +791,7 @@
mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28]
+%endif
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
@@ -1176,7 +1195,25 @@
mov r7, 2
idct32x32_135_transpose:
+%if CONFIG_VP9_HIGHBITDEPTH
mova m0, [r3 + 0]
+ packssdw m0, [r3 + 16]
+ mova m1, [r3 + 32 * 4]
+ packssdw m1, [r3 + 32 * 4 + 16]
+ mova m2, [r3 + 32 * 8]
+ packssdw m2, [r3 + 32 * 8 + 16]
+ mova m3, [r3 + 32 * 12]
+ packssdw m3, [r3 + 32 * 12 + 16]
+ mova m4, [r3 + 32 * 16]
+ packssdw m4, [r3 + 32 * 16 + 16]
+ mova m5, [r3 + 32 * 20]
+ packssdw m5, [r3 + 32 * 20 + 16]
+ mova m6, [r3 + 32 * 24]
+ packssdw m6, [r3 + 32 * 24 + 16]
+ mova m7, [r3 + 32 * 28]
+ packssdw m7, [r3 + 32 * 28 + 16]
+%else
+ mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8]
mova m3, [r3 + 16 * 12]
@@ -1184,7 +1221,7 @@
mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28]
-
+%endif
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
mova [r4 + 0], m0
@@ -1196,7 +1233,11 @@
mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7
+%if CONFIG_VP9_HIGHBITDEPTH
+ add r3, 32
+%else
add r3, 16
+%endif
add r4, 16 * 8
dec r7
jne idct32x32_135_transpose
@@ -1203,7 +1244,11 @@
IDCT32X32_135 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8]
+%if CONFIG_VP9_HIGHBITDEPTH
+ lea inputq, [inputq + 32 * 32]
+%else
lea inputq, [inputq + 16 * 32]
+%endif
dec r6
jnz idct32x32_135
@@ -1614,7 +1659,25 @@
mov r7, 4
idct32x32_1024_transpose:
+%if CONFIG_VP9_HIGHBITDEPTH
mova m0, [r3 + 0]
+ packssdw m0, [r3 + 16]
+ mova m1, [r3 + 32 * 4]
+ packssdw m1, [r3 + 32 * 4 + 16]
+ mova m2, [r3 + 32 * 8]
+ packssdw m2, [r3 + 32 * 8 + 16]
+ mova m3, [r3 + 32 * 12]
+ packssdw m3, [r3 + 32 * 12 + 16]
+ mova m4, [r3 + 32 * 16]
+ packssdw m4, [r3 + 32 * 16 + 16]
+ mova m5, [r3 + 32 * 20]
+ packssdw m5, [r3 + 32 * 20 + 16]
+ mova m6, [r3 + 32 * 24]
+ packssdw m6, [r3 + 32 * 24 + 16]
+ mova m7, [r3 + 32 * 28]
+ packssdw m7, [r3 + 32 * 28 + 16]
+%else
+ mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8]
mova m3, [r3 + 16 * 12]
@@ -1622,6 +1685,7 @@
mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28]
+%endif
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
@@ -1633,8 +1697,11 @@
mova [r4 + 16 * 5], m5
mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7
-
+%if CONFIG_VP9_HIGHBITDEPTH
+ add r3, 32
+%else
add r3, 16
+%endif
add r4, 16 * 8
dec r7
jne idct32x32_1024_transpose
@@ -1642,7 +1709,11 @@
IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8]
+%if CONFIG_VP9_HIGHBITDEPTH
+ lea inputq, [inputq + 32 * 32]
+%else
lea inputq, [inputq + 16 * 32]
+%endif
dec r6
jnz idct32x32_1024