shithub: libvpx

Download patch

ref: 26a6ce4c6d25ec6254af67551698679340ce581c
parent: 355bfa21930740c7670debd65f13532591a947cb
author: Jian Zhou <[email protected]>
date: Tue Dec 22 11:51:57 EST 2015

Code cleanup of highbd_tm_predictor_32x32

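Remove the ARCH_X86_64 constraint: the function now uses only eight
XMM registers (m0-m7) and no extra general-purpose register, so it can
also be built for 32-bit x86. No performance hit on either big-core or
small-core CPUs.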

Change-Id: I39860b62b7a0ae4acaafdca7d68f3e5820133a81

--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -132,7 +132,6 @@
 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
 #if CONFIG_USE_X86INC
-#if ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
                         ::testing::Values(
                             make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
@@ -159,34 +158,7 @@
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
-#else
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
-                        ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
-                                       &vpx_highbd_dc_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
-                                       &vpx_highbd_tm_predictor_16x16_c, 16, 8),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
-                                       &vpx_highbd_dc_predictor_4x4_c, 4, 8),
-                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
-                                       &vpx_highbd_dc_predictor_8x8_c, 8, 8),
-                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
-                                       &vpx_highbd_dc_predictor_16x16_c, 16, 8),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
-                                       &vpx_highbd_v_predictor_4x4_c, 4, 8),
-                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
-                                       &vpx_highbd_v_predictor_8x8_c, 8, 8),
-                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
-                                       &vpx_highbd_v_predictor_16x16_c, 16, 8),
-                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
-                                       &vpx_highbd_v_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
-                                       &vpx_highbd_tm_predictor_4x4_c, 4, 8),
-                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
-                                       &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
-#endif  // !ARCH_X86_64
 
-#if ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
                         ::testing::Values(
                             make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
@@ -219,37 +191,7 @@
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
-#else
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
-                        ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
-                                       &vpx_highbd_dc_predictor_32x32_c, 32,
-                                       10),
-                            make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
-                                       &vpx_highbd_tm_predictor_16x16_c, 16,
-                                       10),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
-                                       &vpx_highbd_dc_predictor_4x4_c, 4, 10),
-                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
-                                       &vpx_highbd_dc_predictor_8x8_c, 8, 10),
-                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
-                                       &vpx_highbd_dc_predictor_16x16_c, 16,
-                                       10),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
-                                       &vpx_highbd_v_predictor_4x4_c, 4, 10),
-                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
-                                       &vpx_highbd_v_predictor_8x8_c, 8, 10),
-                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
-                                       &vpx_highbd_v_predictor_16x16_c, 16, 10),
-                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
-                                       &vpx_highbd_v_predictor_32x32_c, 32, 10),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
-                                       &vpx_highbd_tm_predictor_4x4_c, 4, 10),
-                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
-                                       &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
-#endif  // !ARCH_X86_64
 
-#if ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
                         ::testing::Values(
                             make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
@@ -282,35 +224,7 @@
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
-#else
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
-                        ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
-                                       &vpx_highbd_dc_predictor_32x32_c, 32,
-                                       12),
-                            make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
-                                       &vpx_highbd_tm_predictor_16x16_c, 16,
-                                       12),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
-                                       &vpx_highbd_dc_predictor_4x4_c, 4, 12),
-                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
-                                       &vpx_highbd_dc_predictor_8x8_c, 8, 12),
-                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
-                                       &vpx_highbd_dc_predictor_16x16_c, 16,
-                                       12),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
-                                       &vpx_highbd_v_predictor_4x4_c, 4, 12),
-                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
-                                       &vpx_highbd_v_predictor_8x8_c, 8, 12),
-                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
-                                       &vpx_highbd_v_predictor_16x16_c, 16, 12),
-                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
-                                       &vpx_highbd_v_predictor_32x32_c, 32, 12),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
-                                       &vpx_highbd_tm_predictor_4x4_c, 4, 12),
-                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
-                                       &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
-#endif  // !ARCH_X86_64
+
 #endif  // CONFIG_USE_X86INC
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -435,7 +435,7 @@
   specialize qw/vpx_highbd_v_predictor_32x32/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86_64_x86inc";
+  specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86inc";
--- a/vpx_dsp/x86/highbd_intrapred_sse2.asm
+++ b/vpx_dsp/x86/highbd_intrapred_sse2.asm
@@ -385,9 +385,8 @@
   jnz .loop
   REP_RET
 
-%if ARCH_X86_64
 INIT_XMM sse2
-cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps
   movd                  m0, [aboveq-2]
   mova                  m1, [aboveq]
   mova                  m2, [aboveq+16]
@@ -395,70 +394,60 @@
   mova                  m4, [aboveq+48]
   pshuflw               m0, m0, 0x0
   ; Get the values to compute the maximum value at this bit depth
-  mov                 oned, 1
-  pxor                 m10, m10
-  pxor                 m11, m11
-  pinsrw               m10, oned, 0
-  pinsrw               m11, bpsd, 0
-  pshuflw              m10, m10, 0x0
+  pcmpeqw               m5, m5
+  movd                  m6, bpsd
+  psllw                 m5, m6
+  pcmpeqw               m7, m7
+  pxor                  m6, m6         ; min possible value
+  pxor                  m5, m7         ; max possible value
+  punpcklqdq            m0, m0
   DEFINE_ARGS dst, stride, line, left
-  punpcklqdq           m10, m10
   mov                lineq, -16
-  mova                  m5, m10
-  punpcklqdq            m0, m0
-  psllw                m10, m11
-  add                leftq, 64
-  psubw                m10, m5 ; max possible value
-  pxor                 m11, m11 ; min possible value
   psubw                 m1, m0
   psubw                 m2, m0
   psubw                 m3, m0
   psubw                 m4, m0
 .loop:
-  movd                  m5, [leftq+lineq*4]
-  movd                  m6, [leftq+lineq*4+2]
-  pshuflw               m5, m5, 0x0
-  pshuflw               m6, m6, 0x0
-  punpcklqdq            m5, m5
-  punpcklqdq            m6, m6
-  paddw                 m7, m5, m1
-  paddw                 m8, m5, m2
-  paddw                 m9, m5, m3
-  paddw                 m5, m4
-  ;Clamp these values to the bit-depth
-  pminsw                m7, m10
-  pminsw                m8, m10
-  pminsw                m9, m10
-  pminsw                m5, m10
-  pmaxsw                m7, m11
-  pmaxsw                m8, m11
-  pmaxsw                m9, m11
-  pmaxsw                m5, m11
-  ;Store these values
-  mova   [dstq           ], m7
-  mova   [dstq        +16], m8
-  mova   [dstq        +32], m9
-  mova   [dstq        +48], m5
-  paddw                 m7, m6, m1
-  paddw                 m8, m6, m2
-  paddw                 m9, m6, m3
-  paddw                 m6, m4
-  ;Clamp these values to the bit-depth
-  pminsw                m7, m10
-  pminsw                m8, m10
-  pminsw                m9, m10
-  pminsw                m6, m10
-  pmaxsw                m7, m11
-  pmaxsw                m8, m11
-  pmaxsw                m9, m11
-  pmaxsw                m6, m11
-  ;Store these values
-  mova   [dstq+strideq*2   ], m7
-  mova   [dstq+strideq*2+16], m8
-  mova   [dstq+strideq*2+32], m9
-  mova   [dstq+strideq*2+48], m6
+  movd                  m7, [leftq]
+  pshuflw               m7, m7, 0x0
+  punpcklqdq            m7, m7         ; l1 l1 l1 l1 l1 l1 l1 l1
+  paddw                 m0, m7, m1
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq           ], m0
+  paddw                 m0, m7, m2
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq        +16], m0
+  paddw                 m0, m7, m3
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq        +32], m0
+  paddw                 m0, m7, m4
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq        +48], m0
+  movd                  m7, [leftq+2]
+  pshuflw               m7, m7, 0x0
+  punpcklqdq            m7, m7         ; l2 l2 l2 l2 l2 l2 l2 l2
+  paddw                 m0, m7, m1
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq+strideq*2 ], m0
+  paddw                 m0, m7, m2
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq+strideq*2+16], m0
+  paddw                 m0, m7, m3
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq+strideq*2+32], m0
+  paddw                 m0, m7, m4
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq+strideq*2+48], m0
   lea                 dstq, [dstq+strideq*4]
+  lea                leftq, [leftq+4]
   inc                lineq
   jnz .loop
   REP_RET
-%endif