shithub: libvpx

Download patch

ref: dbe2d8c33c289804f10e5fd0c76b10dabedc65a0
parent: c84d3abeb8f854db1aaea54670db1d6789e7bf05
parent: 355bfa21930740c7670debd65f13532591a947cb
author: Jian Zhou <[email protected]>
date: Mon Dec 28 13:16:13 EST 2015

Merge changes I0139f8e9,I7d2545fc

* changes:
  Code clean of highbd_tm_predictor_16x16
  Code clean of highbd_dc_predictor_32x32

--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -162,6 +162,10 @@
 #else
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
                         ::testing::Values(
+                            make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
+                                       &vpx_highbd_dc_predictor_32x32_c, 32, 8),
+                            make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
+                                       &vpx_highbd_tm_predictor_16x16_c, 16, 8),
                             make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
@@ -218,6 +222,12 @@
 #else
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
                         ::testing::Values(
+                            make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
+                                       &vpx_highbd_dc_predictor_32x32_c, 32,
+                                       10),
+                            make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
+                                       &vpx_highbd_tm_predictor_16x16_c, 16,
+                                       10),
                             make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
@@ -275,6 +285,12 @@
 #else
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
                         ::testing::Values(
+                            make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
+                                       &vpx_highbd_dc_predictor_32x32_c, 32,
+                                       12),
+                            make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
+                                       &vpx_highbd_tm_predictor_16x16_c, 16,
+                                       12),
                             make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -387,7 +387,7 @@
   specialize qw/vpx_highbd_v_predictor_16x16/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_16x16/, "$sse2_x86_64_x86inc";
+  specialize qw/vpx_highbd_tm_predictor_16x16/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_16x16/, "$sse2_x86inc";
@@ -438,7 +438,7 @@
   specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86_64_x86inc";
 
   add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86_64_x86inc";
+  specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_32x32/;
--- a/vpx_dsp/x86/highbd_intrapred_sse2.asm
+++ b/vpx_dsp/x86/highbd_intrapred_sse2.asm
@@ -118,30 +118,29 @@
   RESTORE_GOT
   REP_RET
 
-%if ARCH_X86_64
 INIT_XMM sse2
-cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset
+cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
-  pxor                  m1, m1
   mova                  m0, [aboveq]
   mova                  m2, [aboveq+16]
   mova                  m3, [aboveq+32]
   mova                  m4, [aboveq+48]
-  mova                  m5, [leftq]
-  mova                  m6, [leftq+16]
-  mova                  m7, [leftq+32]
-  mova                  m8, [leftq+48]
+  paddw                 m0, m2
+  paddw                 m3, m4
+  mova                  m2, [leftq]
+  mova                  m4, [leftq+16]
+  mova                  m5, [leftq+32]
+  mova                  m6, [leftq+48]
+  paddw                 m2, m4
+  paddw                 m5, m6
+  paddw                 m0, m3
+  paddw                 m2, m5
+  pxor                  m1, m1
+  paddw                 m0, m2
   DEFINE_ARGS dst, stride, stride3, lines4
   lea             stride3q, [strideq*3]
   mov              lines4d, 8
-  paddw                 m0, m2
-  paddw                 m0, m3
-  paddw                 m0, m4
-  paddw                 m0, m5
-  paddw                 m0, m6
-  paddw                 m0, m7
-  paddw                 m0, m8
   movhlps               m2, m0
   paddw                 m0, m2
   punpcklwd             m0, m1
@@ -177,7 +176,6 @@
 
   RESTORE_GOT
   REP_RET
-%endif
 
 INIT_XMM sse2
 cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
@@ -340,61 +338,54 @@
   jnz .loop
   REP_RET
 
-%if ARCH_X86_64
 INIT_XMM sse2
-cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps
   movd                  m2, [aboveq-2]
   mova                  m0, [aboveq]
   mova                  m1, [aboveq+16]
   pshuflw               m2, m2, 0x0
   ; Get the values to compute the maximum value at this bit depth
-  mov                 oned, 1
-  pxor                  m7, m7
-  pxor                  m8, m8
-  pinsrw                m7, oned, 0
-  pinsrw                m8, bpsd, 0
-  pshuflw               m7, m7, 0x0
+  pcmpeqw               m3, m3         ; all ones in every word
+  movd                  m4, bpsd       ; shift count = bit depth
+  punpcklqdq            m2, m2         ; tl tl tl tl tl tl tl tl (tl = above[-1])
+  psllw                 m3, m4         ; 0xffff << bps in every word
+  pcmpeqw               m5, m5         ; all ones, used to invert m3
+  pxor                  m4, m4         ; min possible value (0)
+  pxor                  m3, m5         ; max possible value, (1 << bps) - 1
   DEFINE_ARGS dst, stride, line, left
-  punpcklqdq            m7, m7
   mov                lineq, -8
-  mova                  m5, m7
-  punpcklqdq            m2, m2
-  psllw                 m7, m8
-  add                leftq, 32
-  psubw                 m7, m5 ; max possible value
-  pxor                  m8, m8 ; min possible value
   psubw                 m0, m2
   psubw                 m1, m2
 .loop:
-  movd                  m2, [leftq+lineq*4]
-  movd                  m3, [leftq+lineq*4+2]
-  pshuflw               m2, m2, 0x0
-  pshuflw               m3, m3, 0x0
-  punpcklqdq            m2, m2
-  punpcklqdq            m3, m3
-  paddw                 m4, m2, m0
-  paddw                 m5, m3, m0
+  movd                  m7, [leftq]    ; load two left pixels (l1, l2) at once
+  pshuflw               m5, m7, 0x0    ; broadcast l1 to the low four words
+  pshuflw               m2, m7, 0x55   ; broadcast l2 to the low four words
+  punpcklqdq            m5, m5         ; l1 l1 l1 l1 l1 l1 l1 l1
+  punpcklqdq            m2, m2         ; l2 l2 l2 l2 l2 l2 l2 l2
+  paddw                 m6, m5, m0     ; t1-tl+l1 to t8-tl+l1
+  paddw                 m5, m1         ; t9-tl+l1 to t16-tl+l1
+  pminsw                m6, m3
+  pminsw                m5, m3
+  pmaxsw                m6, m4         ; Clamp to the bit-depth
+  pmaxsw                m5, m4
+  mova   [dstq           ], m6
+  mova   [dstq        +16], m5
+  paddw                 m6, m2, m0     ; t1-tl+l2 to t8-tl+l2
   paddw                 m2, m1
-  paddw                 m3, m1
-  ;Clamp to the bit-depth
-  pminsw                m4, m7
-  pminsw                m5, m7
-  pminsw                m2, m7
-  pminsw                m3, m7
-  pmaxsw                m4, m8
-  pmaxsw                m5, m8
-  pmaxsw                m2, m8
-  pmaxsw                m3, m8
-  ;Store the values
-  mova   [dstq             ], m4
-  mova   [dstq+strideq*2   ], m5
-  mova   [dstq          +16], m2
-  mova   [dstq+strideq*2+16], m3
+  pminsw                m6, m3
+  pminsw                m2, m3
+  pmaxsw                m6, m4         ; Clamp to the bit-depth
+  pmaxsw                m2, m4
+  mova   [dstq+strideq*2 ], m6
+  mova [dstq+strideq*2+16], m2
   lea                 dstq, [dstq+strideq*4]
   inc                lineq
+  lea                leftq, [leftq+4] ; next two left pixels; lea keeps the flags set by inc
+
   jnz .loop
   REP_RET
 
+%if ARCH_X86_64
 INIT_XMM sse2
 cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
   movd                  m0, [aboveq-2]