ref: b90166665fa7af95bca24e102fef121b9ec51407
parent: a8c8bf1c99d7876334fc406ecae87a51b2e1ee35
parent: ad0646cb848e9facce33b856d3b05a095fc929f2
author: Linfeng Zhang <[email protected]>
date: Fri Jun 3 12:35:14 EDT 2016
Merge "Slow pshufb removal in 3 intra prediction functions."
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -191,14 +191,15 @@
INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
- vpx_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
+ vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL,
+ NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL,
vpx_tm_predictor_4x4_sse2)
#endif // HAVE_SSE2 && CONFIG_USE_X86INC
#if HAVE_SSSE3 && CONFIG_USE_X86INC
INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
- NULL, vpx_d45_predictor_4x4_ssse3, NULL, NULL,
- vpx_d153_predictor_4x4_ssse3, vpx_d207_predictor_4x4_ssse3,
+ NULL, NULL, NULL, NULL,
+ vpx_d153_predictor_4x4_ssse3, NULL,
vpx_d63_predictor_4x4_ssse3, NULL)
#endif // HAVE_SSSE3 && CONFIG_USE_X86INC
@@ -240,13 +241,13 @@
INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
- vpx_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, NULL,
- NULL, vpx_tm_predictor_8x8_sse2)
+ vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL,
+ NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
#endif // HAVE_SSE2 && CONFIG_USE_X86INC
#if HAVE_SSSE3 && CONFIG_USE_X86INC
INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL,
- NULL, vpx_d45_predictor_8x8_ssse3, NULL, NULL,
+ NULL, NULL, NULL, NULL,
vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3,
vpx_d63_predictor_8x8_ssse3, NULL)
#endif // HAVE_SSSE3 && CONFIG_USE_X86INC
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -55,13 +55,13 @@
#
add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d207_predictor_4x4/, "$ssse3_x86inc";
+specialize qw/vpx_d207_predictor_4x4/, "$sse2_x86inc";
add_proto qw/void vpx_d207e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d207e_predictor_4x4/;
add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_4x4 neon/, "$ssse3_x86inc";
+specialize qw/vpx_d45_predictor_4x4 neon/, "$sse2_x86inc";
add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d45e_predictor_4x4/;
@@ -118,7 +118,7 @@
specialize qw/vpx_d207e_predictor_8x8/;
add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_8x8 neon/, "$ssse3_x86inc";
+specialize qw/vpx_d45_predictor_8x8 neon/, "$sse2_x86inc";
add_proto qw/void vpx_d45e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d45e_predictor_8x8/;
--- a/vpx_dsp/x86/intrapred_sse2.asm
+++ b/vpx_dsp/x86/intrapred_sse2.asm
@@ -11,6 +11,7 @@
%include "third_party/x86inc/x86inc.asm"
SECTION_RODATA
+pb_1: times 16 db 1
pw_4: times 8 dw 4
pw_8: times 8 dw 8
pw_16: times 8 dw 16
@@ -22,6 +23,115 @@
pw2_32: times 8 dw 16
SECTION .text
+
+; ------------------------------------------
+; input: x, y, z, result (macro args %1, %2, %3, %4); z (%3) is clobbered
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated per byte, without widening to words, as:
+; result = avg(x,z) ; pavgb rounds up: (x+z+1)>>1
+; result -= xor(x,z) & 1 ; cancels that round-up when x+z is odd => (x+z)>>1
+; result = avg(result,y) ; (((x+z)>>1)+y+1)>>1 == (x+2y+z+2)>>2
+; ------------------------------------------
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+ pavgb %4, %1, %3 ; result = avg(x, z)
+ pxor %3, %1 ; z ^= x: bit 0 of each byte is set exactly when x+z is odd
+ pand %3, [GLOBAL(pb_1)] ; keep only bit 0 of every byte (pb_1 is 16 bytes of 1); uses GLOBAL(), callers in this file run GET_GOT first
+ psubb %4, %3 ; remove the rounding bias introduced by the first pavgb
+ pavgb %4, %2 ; result = avg(result, y)
+%endmacro
+
+INIT_XMM sse2 ; SSE2 d45 4x4 predictor: rows are the 3-tap filtered above[] bytes, advanced one byte per row
+cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
+ GET_GOT goffsetq ; required before GLOBAL(pb_1) is referenced inside the filter macro
+
+ movq m0, [aboveq] ; m0 low qword = above[0..7]
+ DEFINE_ARGS dst, stride, temp ; above is not read again below; presumably temp takes over its register (x86inc renames in order - confirm)
+ psrldq m1, m0, 1 ; m1 = above[1..7], 0
+ psrldq m2, m0, 2 ; m2 = above[2..7], 0, 0
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; m3[i] = (above[i] + 2*above[i+1] + above[i+2] + 2) >> 2; m2 is clobbered
+
+ ; store 4 lines: row r receives m3 bytes r..r+3 (m3 is shifted down one byte between rows)
+ movd [dstq ], m3
+ psrlq m3, 8
+ movd [dstq+strideq ], m3
+ lea dstq, [dstq+strideq*2]
+ psrlq m3, 8
+ movd [dstq ], m3
+ psrlq m3, 8
+ movd [dstq+strideq ], m3
+ psrlq m0, 56 ; low byte of m0 = above[7]
+ movd tempq, m0
+ mov [dstq+strideq+3], tempb ; last pixel of row 3 becomes plain above[7]; the byte it replaces was filtered against a shifted-in zero
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2 ; SSE2 d45 8x8 predictor: filtered above[] bytes padded with above[7], advanced one byte per row
+cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
+ GET_GOT goffsetq ; required before GLOBAL(pb_1) is referenced inside the filter macro
+
+ movu m1, [aboveq] ; unaligned 16-byte load: m1 = above[0..15]
+ pslldq m0, m1, 1 ; m0 = 0, above[0..14] (left neighbour of each byte of m1)
+ psrldq m2, m1, 1 ; m2 = above[1..15], 0 (right neighbour of each byte of m1)
+ DEFINE_ARGS dst, stride, stride3 ; above is not read again below; third name is now stride3
+ lea stride3q, [strideq*3]
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; m3[k] = (above[k-1] + 2*above[k] + above[k+1] + 2) >> 2; m2 is clobbered, m0 is preserved
+ punpckhbw m0, m0 ; 7 7 (high half of m0 starts at above[7]; each byte duplicated)
+ punpcklwd m0, m0 ; 7 7 7 7
+ punpckldq m0, m0 ; 7 7 7 7 7 7 7 7 (low qword = above[7] x 8)
+ punpcklqdq m3, m0 ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7 (low 8 = filtered bytes, "-1" used a shifted-in zero; high 8 = raw above[7])
+
+ ; store 4 lines: shift one byte out first, so the "-1" byte is never written; row r = m3 bytes r+1..r+8
+ psrldq m3, 1
+ movq [dstq ], m3
+ psrldq m3, 1
+ movq [dstq+strideq ], m3
+ psrldq m3, 1
+ movq [dstq+strideq*2], m3
+ psrldq m3, 1
+ movq [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+
+ ; store next 4 lines: same pattern; the final row is the eight above[7] bytes
+ psrldq m3, 1
+ movq [dstq ], m3
+ psrldq m3, 1
+ movq [dstq+strideq ], m3
+ psrldq m3, 1
+ movq [dstq+strideq*2], m3
+ psrldq m3, 1
+ movq [dstq+stride3q ], m3
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2 ; SSE2 d207 4x4 predictor: built from the 4 left[] bytes only; the third argument is never referenced
+cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset
+ GET_GOT goffsetq ; NOTE(review): goffset is a 5th name but only 4 GPRs are declared (the removed SSSE3 version declared 5); presumably GET_GOT saves/restores it itself - confirm
+
+ movd m0, [leftq] ; abcd [byte]
+ punpcklbw m4, m0, m0 ; aabb ccdd
+ punpcklwd m4, m4 ; aaaa bbbb cccc dddd
+ psrldq m4, 12 ; dddd (kept in m4 for the last row)
+ punpckldq m0, m4 ; abcd dddd (left[] extended by repeating its last byte)
+ psrldq m1, m0, 1 ; bcdd
+ psrldq m2, m0, 2 ; cddd
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; a2bc b2cd c3d d (3-tap results in m3; m2 is clobbered)
+ pavgb m1, m0 ; ab, bc, cd, d [byte] (2-tap averages of neighbouring left bytes)
+
+ punpcklbw m1, m3 ; ab, a2bc, bc, b2cd, cd, c3d, d, d (2-tap and 3-tap results interleaved)
+ movd [dstq ], m1
+ psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d (each row starts 2 bytes further on)
+ movd [dstq+strideq], m1
+
+ lea dstq, [dstq+strideq*2]
+ psrlq m1, 16 ; cd, c3d, d, d
+ movd [dstq ], m1
+ movd [dstq+strideq], m4 ; d, d, d, d
+ RESTORE_GOT
+ RET
INIT_XMM sse2
cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
--- a/vpx_dsp/x86/intrapred_ssse3.asm
+++ b/vpx_dsp/x86/intrapred_ssse3.asm
@@ -13,7 +13,6 @@
SECTION_RODATA
pb_1: times 16 db 1
-sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
@@ -28,77 +27,9 @@
sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
SECTION .text
-INIT_MMX ssse3
-cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
- GET_GOT goffsetq
-
- movq m0, [aboveq]
- pshufb m2, m0, [GLOBAL(sh_b23456777)]
- pshufb m1, m0, [GLOBAL(sh_b01234577)]
- pshufb m0, [GLOBAL(sh_b12345677)]
- pavgb m3, m2, m1
- pxor m2, m1
- pand m2, [GLOBAL(pb_1)]
- psubb m3, m2
- pavgb m0, m3
-
- ; store 4 lines
- movd [dstq ], m0
- psrlq m0, 8
- movd [dstq+strideq], m0
- lea dstq, [dstq+strideq*2]
- psrlq m0, 8
- movd [dstq ], m0
- psrlq m0, 8
- movd [dstq+strideq], m0
-
- RESTORE_GOT
- RET
-
-INIT_MMX ssse3
-cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
- GET_GOT goffsetq
-
- movq m0, [aboveq]
- mova m1, [GLOBAL(sh_b12345677)]
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- pshufb m2, m0, [GLOBAL(sh_b23456777)]
- pavgb m3, m2, m0
- pxor m2, m0
- pshufb m0, m1
- pand m2, [GLOBAL(pb_1)]
- psubb m3, m2
- pavgb m0, m3
-
- ; store 4 lines
- movq [dstq ], m0
- pshufb m0, m1
- movq [dstq+strideq ], m0
- pshufb m0, m1
- movq [dstq+strideq*2], m0
- pshufb m0, m1
- movq [dstq+stride3q ], m0
- pshufb m0, m1
- lea dstq, [dstq+strideq*4]
-
- ; store next 4 lines
- movq [dstq ], m0
- pshufb m0, m1
- movq [dstq+strideq ], m0
- pshufb m0, m1
- movq [dstq+strideq*2], m0
- pshufb m0, m1
- movq [dstq+stride3q ], m0
-
- RESTORE_GOT
- RET
-
INIT_XMM ssse3
cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
GET_GOT goffsetq
@@ -712,28 +643,6 @@
mova [dstq+stride3q ], m2
mova [dstq+stride3q+16 ], m3
- RESTORE_GOT
- RET
-
-INIT_MMX ssse3
-cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset
- GET_GOT goffsetq
- movd m0, [leftq] ; abcd [byte]
- pshufb m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte]
- pshufb m3, m0, [GLOBAL(sh_b2333)] ; cddd
-
- X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2
- pavgb m1, m0 ; ab, bc, cd, d [byte]
-
- punpcklbw m1, m2 ; ab, a2bc, bc, b2cd, cd, c3d, d, d
- movd [dstq ], m1
- psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d
- movd [dstq+strideq], m1
- lea dstq, [dstq+strideq*2]
- psrlq m1, 16 ; cd, c3d, d, d
- movd [dstq ], m1
- pshufw m1, m1, q1111 ; d, d, d, d
- movd [dstq+strideq], m1
RESTORE_GOT
RET