ref: b229710811f618247f712e4567b8949ec696ce0b
parent: 432136ef565b52bd1896556603e5a0bb07417e32
author: Yaowu Xu <[email protected]>
date: Thu Jan 28 11:29:29 EST 2016
SSSE3 idct8x8 functions for highbitdepth build. This commit changes the SSSE3-optimized idct8x8 functions to work with the highbitdepth build. With this commit and the previous one that enabled the SSSE3 idct32x32 functions, tests showed virtually no difference in decoding speed for the file fdJc1_IBKJA.248.webm between the build with the --enable-vp9-highbitdepth option and the build without the option. Change-Id: Ibe0634149ec70e8b921e6b30171664b8690a9c45
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -762,7 +762,7 @@
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1_add/;
-
+
add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct4x4_16_add/;
@@ -785,10 +785,10 @@
specialize qw/vpx_idct4x4_1_add sse2/;
add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct8x8_64_add sse2/;
+ specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct8x8_12_add sse2/;
+ specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_1_add sse2/;
--- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -220,8 +220,25 @@
mova m12, [pw_11585x2]
lea r3, [2 * strideq]
-
+%if CONFIG_VP9_HIGHBITDEPTH
mova m0, [inputq + 0]
+ packssdw m0, [inputq + 16]
+ mova m1, [inputq + 32]
+ packssdw m1, [inputq + 48]
+ mova m2, [inputq + 64]
+ packssdw m2, [inputq + 80]
+ mova m3, [inputq + 96]
+ packssdw m3, [inputq + 112]
+ mova m4, [inputq + 128]
+ packssdw m4, [inputq + 144]
+ mova m5, [inputq + 160]
+ packssdw m5, [inputq + 176]
+ mova m6, [inputq + 192]
+ packssdw m6, [inputq + 208]
+ mova m7, [inputq + 224]
+ packssdw m7, [inputq + 240]
+%else
+ mova m0, [inputq + 0]
mova m1, [inputq + 16]
mova m2, [inputq + 32]
mova m3, [inputq + 48]
@@ -229,7 +246,7 @@
mova m5, [inputq + 80]
mova m6, [inputq + 96]
mova m7, [inputq + 112]
-
+%endif
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
IDCT8_1D
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
@@ -254,10 +271,21 @@
lea r3, [2 * strideq]
+%if CONFIG_VP9_HIGHBITDEPTH
+ mova m0, [inputq + 0]
+ packssdw m0, [inputq + 16]
+ mova m1, [inputq + 32]
+ packssdw m1, [inputq + 48]
+ mova m2, [inputq + 64]
+ packssdw m2, [inputq + 80]
+ mova m3, [inputq + 96]
+ packssdw m3, [inputq + 112]
+%else
mova m0, [inputq + 0]
mova m1, [inputq + 16]
mova m2, [inputq + 32]
mova m3, [inputq + 48]
+%endif
punpcklwd m0, m1
punpcklwd m2, m3