ref: 26a6ce4c6d25ec6254af67551698679340ce581c
parent: 355bfa21930740c7670debd65f13532591a947cb
author: Jian Zhou <[email protected]>
date: Tue Dec 22 11:51:57 EST 2015
Code cleanup of highbd_tm_predictor_32x32: remove the ARCH_X86_64 constraint. No performance hit on either big-core or small-core CPUs. Change-Id: I39860b62b7a0ae4acaafdca7d68f3e5820133a81
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -132,7 +132,6 @@
#if HAVE_SSE2
#if CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_USE_X86INC
-#if ARCH_X86_64
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
::testing::Values(
make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
@@ -159,34 +158,7 @@
&vpx_highbd_tm_predictor_4x4_c, 4, 8),
make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
&vpx_highbd_tm_predictor_8x8_c, 8, 8)));
-#else
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
- ::testing::Values(
- make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
- &vpx_highbd_dc_predictor_32x32_c, 32, 8),
- make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
- &vpx_highbd_tm_predictor_16x16_c, 16, 8),
- make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
- &vpx_highbd_dc_predictor_4x4_c, 4, 8),
- make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
- &vpx_highbd_dc_predictor_8x8_c, 8, 8),
- make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
- &vpx_highbd_dc_predictor_16x16_c, 16, 8),
- make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
- &vpx_highbd_v_predictor_4x4_c, 4, 8),
- make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
- &vpx_highbd_v_predictor_8x8_c, 8, 8),
- make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
- &vpx_highbd_v_predictor_16x16_c, 16, 8),
- make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
- &vpx_highbd_v_predictor_32x32_c, 32, 8),
- make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
- &vpx_highbd_tm_predictor_4x4_c, 4, 8),
- make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
- &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
-#endif // !ARCH_X86_64
-#if ARCH_X86_64
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
::testing::Values(
make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
@@ -219,37 +191,7 @@
&vpx_highbd_tm_predictor_4x4_c, 4, 10),
make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
&vpx_highbd_tm_predictor_8x8_c, 8, 10)));
-#else
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
- ::testing::Values(
- make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
- &vpx_highbd_dc_predictor_32x32_c, 32,
- 10),
- make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
- &vpx_highbd_tm_predictor_16x16_c, 16,
- 10),
- make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
- &vpx_highbd_dc_predictor_4x4_c, 4, 10),
- make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
- &vpx_highbd_dc_predictor_8x8_c, 8, 10),
- make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
- &vpx_highbd_dc_predictor_16x16_c, 16,
- 10),
- make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
- &vpx_highbd_v_predictor_4x4_c, 4, 10),
- make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
- &vpx_highbd_v_predictor_8x8_c, 8, 10),
- make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
- &vpx_highbd_v_predictor_16x16_c, 16, 10),
- make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
- &vpx_highbd_v_predictor_32x32_c, 32, 10),
- make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
- &vpx_highbd_tm_predictor_4x4_c, 4, 10),
- make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
- &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
-#endif // !ARCH_X86_64
-#if ARCH_X86_64
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
::testing::Values(
make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
@@ -282,35 +224,7 @@
&vpx_highbd_tm_predictor_4x4_c, 4, 12),
make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
&vpx_highbd_tm_predictor_8x8_c, 8, 12)));
-#else
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
- ::testing::Values(
- make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
- &vpx_highbd_dc_predictor_32x32_c, 32,
- 12),
- make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
- &vpx_highbd_tm_predictor_16x16_c, 16,
- 12),
- make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
- &vpx_highbd_dc_predictor_4x4_c, 4, 12),
- make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
- &vpx_highbd_dc_predictor_8x8_c, 8, 12),
- make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
- &vpx_highbd_dc_predictor_16x16_c, 16,
- 12),
- make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
- &vpx_highbd_v_predictor_4x4_c, 4, 12),
- make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
- &vpx_highbd_v_predictor_8x8_c, 8, 12),
- make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
- &vpx_highbd_v_predictor_16x16_c, 16, 12),
- make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
- &vpx_highbd_v_predictor_32x32_c, 32, 12),
- make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
- &vpx_highbd_tm_predictor_4x4_c, 4, 12),
- make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
- &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
-#endif // !ARCH_X86_64
+
#endif // CONFIG_USE_X86INC
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_SSE2
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -435,7 +435,7 @@
specialize qw/vpx_highbd_v_predictor_32x32/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86_64_x86inc";
+ specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86inc";
--- a/vpx_dsp/x86/highbd_intrapred_sse2.asm
+++ b/vpx_dsp/x86/highbd_intrapred_sse2.asm
@@ -385,9 +385,8 @@
jnz .loop
REP_RET
-%if ARCH_X86_64
INIT_XMM sse2
-cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps
movd m0, [aboveq-2]
mova m1, [aboveq]
mova m2, [aboveq+16]
@@ -395,70 +394,60 @@
mova m4, [aboveq+48]
pshuflw m0, m0, 0x0
; Get the values to compute the maximum value at this bit depth
- mov oned, 1
- pxor m10, m10
- pxor m11, m11
- pinsrw m10, oned, 0
- pinsrw m11, bpsd, 0
- pshuflw m10, m10, 0x0
+ pcmpeqw m5, m5
+ movd m6, bpsd
+ psllw m5, m6
+ pcmpeqw m7, m7
+ pxor m6, m6 ; min possible value
+ pxor m5, m7 ; max possible value
+ punpcklqdq m0, m0
DEFINE_ARGS dst, stride, line, left
- punpcklqdq m10, m10
mov lineq, -16
- mova m5, m10
- punpcklqdq m0, m0
- psllw m10, m11
- add leftq, 64
- psubw m10, m5 ; max possible value
- pxor m11, m11 ; min possible value
psubw m1, m0
psubw m2, m0
psubw m3, m0
psubw m4, m0
.loop:
- movd m5, [leftq+lineq*4]
- movd m6, [leftq+lineq*4+2]
- pshuflw m5, m5, 0x0
- pshuflw m6, m6, 0x0
- punpcklqdq m5, m5
- punpcklqdq m6, m6
- paddw m7, m5, m1
- paddw m8, m5, m2
- paddw m9, m5, m3
- paddw m5, m4
- ;Clamp these values to the bit-depth
- pminsw m7, m10
- pminsw m8, m10
- pminsw m9, m10
- pminsw m5, m10
- pmaxsw m7, m11
- pmaxsw m8, m11
- pmaxsw m9, m11
- pmaxsw m5, m11
- ;Store these values
- mova [dstq ], m7
- mova [dstq +16], m8
- mova [dstq +32], m9
- mova [dstq +48], m5
- paddw m7, m6, m1
- paddw m8, m6, m2
- paddw m9, m6, m3
- paddw m6, m4
- ;Clamp these values to the bit-depth
- pminsw m7, m10
- pminsw m8, m10
- pminsw m9, m10
- pminsw m6, m10
- pmaxsw m7, m11
- pmaxsw m8, m11
- pmaxsw m9, m11
- pmaxsw m6, m11
- ;Store these values
- mova [dstq+strideq*2 ], m7
- mova [dstq+strideq*2+16], m8
- mova [dstq+strideq*2+32], m9
- mova [dstq+strideq*2+48], m6
+ movd m7, [leftq]
+ pshuflw m7, m7, 0x0
+ punpcklqdq m7, m7 ; l1 l1 l1 l1 l1 l1 l1 l1
+ paddw m0, m7, m1
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq ], m0
+ paddw m0, m7, m2
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq +16], m0
+ paddw m0, m7, m3
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq +32], m0
+ paddw m0, m7, m4
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq +48], m0
+ movd m7, [leftq+2]
+ pshuflw m7, m7, 0x0
+ punpcklqdq m7, m7 ; l2 l2 l2 l2 l2 l2 l2 l2
+ paddw m0, m7, m1
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq+strideq*2 ], m0
+ paddw m0, m7, m2
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq+strideq*2+16], m0
+ paddw m0, m7, m3
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq+strideq*2+32], m0
+ paddw m0, m7, m4
+ pminsw m0, m5
+ pmaxsw m0, m6
+ mova [dstq+strideq*2+48], m0
lea dstq, [dstq+strideq*4]
+ lea leftq, [leftq+4]
inc lineq
jnz .loop
REP_RET
-%endif