shithub: libvpx

ref: e4bdbd3c0b670b4af61e44f087535033031b2851
parent: 84e3639454de7243da971b74ba15dbc906b9a792
parent: 3fb55d24e86cbedd11fc718430b0758c3e8f01f0
author: James Zern <[email protected]>
date: Fri May 20 15:11:06 EDT 2016

Merge "Revert "Code clean of sub_pixel_variance4xh""

--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1026,8 +1026,8 @@
                       make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
                       make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
                       make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));
+                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse, 0)));
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxSubpelAvgVarianceTest,
@@ -1043,8 +1043,8 @@
         make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
         make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
         make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
-        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
-        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
+        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse, 0),
+        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse, 0)));
 #endif  // CONFIG_USE_X86INC
 
 #if CONFIG_VP9_HIGHBITDEPTH
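In the instantiations above, the first two tuple elements are log2(width)
and log2(height), so make_tuple(2, 3, ...) binds the 4x8 kernel and
make_tuple(2, 2, ...) the 4x4 one; the test changes therefore only swap
which specialized symbol those two rows exercise. A minimal sketch of the
decoding, assuming only the log2 convention visible in the tuples:

#include <stdio.h>

int main(void) {
  const int log2w = 2, log2h = 3;  /* from make_tuple(2, 3, ...) */
  /* The suite derives block dimensions by shifting: 4x8 here. */
  printf("block: %dx%d\n", 1 << log2w, 1 << log2h);
  return 0;
}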
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1493,10 +1493,10 @@
   specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -1532,10 +1532,10 @@
   specialize qw/vpx_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
 
 #
 # Specialty Subpixel
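Each specialize line feeds libvpx's run-time CPU dispatch: for every listed
extension, RTCD generation declares a suffixed symbol and selects the best
available one at startup, which is why moving the 4xh entries from
"$sse2_x86inc" to "$sse_x86inc" changes which assembly build they resolve
to. A hedged C sketch of that dispatch shape (symbol names from the diff;
the setup function is illustrative, not the generated vpx_dsp_rtcd.h
verbatim):

#include <stdint.h>

uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *, int, int, int,
                                     const uint8_t *, int, uint32_t *);
uint32_t vpx_sub_pixel_variance4x4_sse(const uint8_t *, int, int, int,
                                       const uint8_t *, int, uint32_t *);

/* Function pointer filled in once CPU features are known. */
uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *, int, int, int,
                                      const uint8_t *, int, uint32_t *);

static void setup_rtcd(int have_sse) {
  vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_c;
  if (have_sse) vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_sse;
}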
--- a/vpx_dsp/x86/subpel_variance_sse2.asm
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -57,8 +57,8 @@
   paddd                %6, %1
 %endmacro
 
-%macro STORE_AND_RET 1
-%if %1 > 4
+%macro STORE_AND_RET 0
+%if mmsize == 16
   ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
   ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
   ; We have to sign-extend it before adding the words within the register
@@ -78,9 +78,9 @@
   movd               [r1], m7           ; store sse
   paddd                m6, m4
   movd               raxd, m6           ; store sum as return value
-%else ; 4xh
-  pshuflw              m4, m6, 0xe
-  pshuflw              m3, m7, 0xe
+%else ; mmsize == 8
+  pshufw               m4, m6, 0xe
+  pshufw               m3, m7, 0xe
   paddw                m6, m4
   paddd                m7, m3
   pcmpgtw              m5, m6           ; mask for 0 > x
@@ -87,7 +87,7 @@
   mov                  r1, ssem         ; r1 = unsigned int *sse
   punpcklwd            m6, m5           ; sign-extend m6 word->dword
   movd               [r1], m7           ; store sse
-  pshuflw              m4, m6, 0xe
+  pshufw               m4, m6, 0xe
   paddd                m6, m4
   movd               raxd, m6           ; store sum as return value
 %endif
@@ -226,14 +226,8 @@
   punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
 %endif
-%if %1 > 4
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
-%else
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-
 %if %2 == 0 ; !avg
   punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
@@ -245,40 +239,22 @@
 %else ; %1 < 16
   movh                 m0, [srcq]
 %if %2 == 1 ; avg
-%if %1 > 4
+%if mmsize == 16
   movhps               m0, [srcq+src_strideq]
-%else ; 4xh
-  movd                 m1, [srcq+src_strideq]
-  punpckldq            m0, m1
+%else ; mmsize == 8
+  punpckldq            m0, [srcq+src_strideq]
 %endif
 %else ; !avg
   movh                 m2, [srcq+src_strideq]
 %endif
-
-%if %1 > 4
   movh                 m1, [dstq]
   movh                 m3, [dstq+dst_strideq]
-%else ; 4xh
-  movd                 m1, [dstq]
-  movd                 m3, [dstq+dst_strideq]
-%endif
-
 %if %2 == 1 ; avg
-%if %1 > 4
   pavgb                m0, [secq]
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-%endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
-%if %1 > 4
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
-%else ; 4xh
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
 %else ; !avg
   punpcklbw            m0, m5
   punpcklbw            m2, m5
@@ -295,7 +271,7 @@
 %endif
   dec                   block_height
   jg .x_zero_y_zero_loop
-  STORE_AND_RET %1
+  STORE_AND_RET
 
 .x_zero_y_nonzero:
   cmp           y_offsetd, 4
@@ -323,9 +299,9 @@
   movh                 m0, [srcq]
   movh                 m2, [srcq+src_strideq]
 %if %2 == 1 ; avg
-%if %1 > 4
+%if mmsize == 16
   movhps               m2, [srcq+src_strideq*2]
-%else ; 4xh
+%else ; mmsize == 8
 %if %1 == 4
   movh                 m1, [srcq+src_strideq*2]
   punpckldq            m2, m1
@@ -334,26 +310,18 @@
 %endif
 %endif
   movh                 m1, [dstq]
-%if %1 > 4
+%if mmsize == 16
   movlhps              m0, m2
-%else ; 4xh
+%else ; mmsize == 8
   punpckldq            m0, m2
 %endif
   movh                 m3, [dstq+dst_strideq]
   pavgb                m0, m2
   punpcklbw            m1, m5
-%if %1 > 4
   pavgb                m0, [secq]
   punpcklbw            m3, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
-%else ; 4xh
-  movh                 m4, [secq]
-  pavgb                m0, m4
-  punpcklbw            m3, m5
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
 %else ; !avg
   movh                 m4, [srcq+src_strideq*2]
   movh                 m1, [dstq]
@@ -375,7 +343,7 @@
 %endif
   dec                   block_height
   jg .x_zero_y_half_loop
-  STORE_AND_RET %1
+  STORE_AND_RET
 
 .x_zero_y_nonhalf:
   ; x_offset == 0 && y_offset == bilin interpolation
@@ -383,7 +351,7 @@
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
@@ -488,21 +456,11 @@
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps              m0, m2
-%endif
   packuswb             m0, m2
-%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
-%else ; 4xh
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
 %endif
-%endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
@@ -517,7 +475,7 @@
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET %1
+  STORE_AND_RET
 
 .x_nonzero:
   cmp           x_offsetd, 4
@@ -548,31 +506,21 @@
   movh                 m0, [srcq]
   movh                 m4, [srcq+1]
 %if %2 == 1 ; avg
-%if %1 > 4
+%if mmsize == 16
   movhps               m0, [srcq+src_strideq]
   movhps               m4, [srcq+src_strideq+1]
-%else ; 4xh
-  movd                 m1, [srcq+src_strideq]
-  punpckldq            m0, m1
-  movd                 m2, [srcq+src_strideq+1]
-  punpckldq            m4, m2
+%else ; mmsize == 8
+  punpckldq            m0, [srcq+src_strideq]
+  punpckldq            m4, [srcq+src_strideq+1]
 %endif
   movh                 m1, [dstq]
   movh                 m3, [dstq+dst_strideq]
   pavgb                m0, m4
   punpcklbw            m3, m5
-%if %1 > 4
   pavgb                m0, [secq]
   punpcklbw            m1, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
-%else ; 4xh
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m1, m5
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
 %else ; !avg
   movh                 m2, [srcq+src_strideq]
   movh                 m1, [dstq]
@@ -595,7 +543,7 @@
 %endif
   dec                   block_height
   jg .x_half_y_zero_loop
-  STORE_AND_RET %1
+  STORE_AND_RET
 
 .x_half_y_nonzero:
   cmp           y_offsetd, 4
@@ -638,7 +586,7 @@
   movh                 m2, [srcq]
   movh                 m3, [srcq+1]
 %if %2 == 1 ; avg
-%if %1 > 4
+%if mmsize == 16
   movhps               m2, [srcq+src_strideq]
   movhps               m3, [srcq+src_strideq+1]
 %else
@@ -653,31 +601,21 @@
 %endif
 %endif
   pavgb                m2, m3
-%if %1 > 4
+%if mmsize == 16
   movlhps              m0, m2
   movhlps              m4, m2
-%else ; 4xh
+%else ; mmsize == 8
   punpckldq            m0, m2
-  pshuflw              m4, m2, 0xe
+  pshufw               m4, m2, 0xe
 %endif
   movh                 m1, [dstq]
   pavgb                m0, m2
   movh                 m3, [dstq+dst_strideq]
-%if %1 > 4
   pavgb                m0, [secq]
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-%endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
-%if %1 > 4
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
-%else
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
 %else ; !avg
   movh                 m4, [srcq+src_strideq]
   movh                 m1, [srcq+src_strideq+1]
@@ -703,7 +641,7 @@
 %endif
   dec                   block_height
   jg .x_half_y_half_loop
-  STORE_AND_RET %1
+  STORE_AND_RET
 
 .x_half_y_nonhalf:
   ; x_offset == 0.5 && y_offset == bilin interpolation
@@ -711,7 +649,7 @@
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
@@ -828,21 +766,11 @@
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps              m0, m2
-%endif
   packuswb             m0, m2
-%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
 %endif
-%endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
@@ -858,7 +786,7 @@
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET %1
+  STORE_AND_RET
 
 .x_nonhalf:
   test          y_offsetd, y_offsetd
@@ -869,7 +797,7 @@
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
@@ -971,21 +899,11 @@
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps              m0, m2
-%endif
   packuswb             m0, m2
-%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
 %endif
-%endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
@@ -1000,7 +918,7 @@
 %undef filter_x_a
 %undef filter_x_b
 %undef filter_rnd
-  STORE_AND_RET %1
+  STORE_AND_RET
 
 .x_nonhalf_y_nonzero:
   cmp           y_offsetd, 4
@@ -1011,7 +929,7 @@
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
@@ -1171,21 +1089,11 @@
   pavgw                m2, m4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline - also consider going to bytes here
-%if %1 == 4
-  movlhps              m0, m2
-%endif
   packuswb             m0, m2
-%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
 %endif
-%endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
@@ -1202,7 +1110,7 @@
 %undef filter_x_a
 %undef filter_x_b
 %undef filter_rnd
-  STORE_AND_RET %1
+  STORE_AND_RET
 
 .x_nonhalf_y_nonhalf:
 %ifdef PIC
@@ -1210,7 +1118,7 @@
 %endif
   shl           x_offsetd, filter_idx_shift
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
@@ -1437,21 +1345,11 @@
 %endif
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps              m0, m2
-%endif
   packuswb             m0, m2
-%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
 %endif
-%endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
 
@@ -1468,7 +1366,7 @@
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET %1
+  STORE_AND_RET
 %endmacro
 
 ; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
@@ -1477,22 +1375,26 @@
 ; location in the sse/2 version, rather than duplicating that code in the
 ; binary.
 
-INIT_XMM sse2
+INIT_MMX sse
 SUBPEL_VARIANCE  4
+INIT_XMM sse2
 SUBPEL_VARIANCE  8
 SUBPEL_VARIANCE 16
 
-INIT_XMM ssse3
+INIT_MMX ssse3
 SUBPEL_VARIANCE  4
+INIT_XMM ssse3
 SUBPEL_VARIANCE  8
 SUBPEL_VARIANCE 16
 
-INIT_XMM sse2
+INIT_MMX sse
 SUBPEL_VARIANCE  4, 1
+INIT_XMM sse2
 SUBPEL_VARIANCE  8, 1
 SUBPEL_VARIANCE 16, 1
 
-INIT_XMM ssse3
+INIT_MMX ssse3
 SUBPEL_VARIANCE  4, 1
+INIT_XMM ssse3
 SUBPEL_VARIANCE  8, 1
 SUBPEL_VARIANCE 16, 1
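The assembly changes hinge on x86inc's register-size dispatch: INIT_MMX sse
assembles SUBPEL_VARIANCE 4 with 8-byte mm registers (mmsize == 8), while
INIT_XMM sse2 uses 16-byte xmm registers (mmsize == 16), so the restored
code branches on mmsize instead of the width argument %1. The 4xh path also
swaps SSE2's pshuflw for pshufw, the MMX/SSE shuffle of 16-bit lanes. A
minimal C model of what pshufw m4, m6, 0xe computes, assuming only the
documented 2-bits-per-lane immediate encoding:

#include <stdint.h>

/* Each 2-bit field of imm selects a source 16-bit lane; imm = 0x0e
 * (binary 00 00 11 10) yields {src[2], src[3], src[0], src[0]},
 * moving the high 32 bits down so the two halves can be summed. */
static void pshufw_model(const uint16_t src[4], uint16_t dst[4],
                         unsigned imm) {
  for (int i = 0; i < 4; ++i)
    dst[i] = src[(imm >> (2 * i)) & 3];
}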
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -320,11 +320,11 @@
                                           int height, unsigned int *sse, \
                                           void *unused0, void *unused)
 #define DECLS(opt1, opt2) \
-  DECL(4, opt1); \
+  DECL(4, opt2); \
   DECL(8, opt1); \
   DECL(16, opt1)
 
-DECLS(sse2, sse2);
+DECLS(sse2, sse);
 DECLS(ssse3, ssse3);
 #undef DECLS
 #undef DECL
@@ -380,10 +380,10 @@
 FN(8,  16,  8, 3, 4, opt1, (int32_t), (int32_t)); \
 FN(8,   8,  8, 3, 3, opt1, (int32_t), (int32_t)); \
 FN(8,   4,  8, 3, 2, opt1, (int32_t), (int32_t)); \
-FN(4,   8,  4, 2, 3, opt1, (int32_t), (int32_t)); \
-FN(4,   4,  4, 2, 2, opt1, (int32_t), (int32_t))
+FN(4,   8,  4, 2, 3, opt2, (int32_t), (int32_t)); \
+FN(4,   4,  4, 2, 2, opt2, (int32_t), (int32_t))
 
-FNS(sse2, sse2);
+FNS(sse2, sse);
 FNS(ssse3, ssse3);
 
 #undef FNS
@@ -401,11 +401,11 @@
                                             int height, unsigned int *sse, \
                                             void *unused0, void *unused)
 #define DECLS(opt1, opt2) \
-DECL(4, opt1); \
+DECL(4, opt2); \
 DECL(8, opt1); \
 DECL(16, opt1)
 
-DECLS(sse2, sse2);
+DECLS(sse2, sse);
 DECLS(ssse3, ssse3);
 #undef DECL
 #undef DECLS
@@ -466,8 +466,8 @@
 FN(8,  16,  8, 3, 4, opt1, (uint32_t), (int32_t)); \
 FN(8,   8,  8, 3, 3, opt1, (uint32_t), (int32_t)); \
 FN(8,   4,  8, 3, 2, opt1, (uint32_t), (int32_t)); \
-FN(4,   8,  4, 2, 3, opt1, (uint32_t), (int32_t)); \
-FN(4,   4,  4, 2, 2, opt1, (uint32_t), (int32_t))
+FN(4,   8,  4, 2, 3, opt2, (uint32_t), (int32_t)); \
+FN(4,   4,  4, 2, 2, opt2, (uint32_t), (int32_t))
 
 FNS(sse2, sse);
 FNS(ssse3, ssse3);
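The two-suffix DECLS/FNS scheme exists precisely so the 4-wide rows can
diverge from the rest: opt1 tags the 8- and 16-wide kernels (sse2) while
opt2 now tags the 4-wide ones (sse), so FNS(sse2, sse) emits
vpx_sub_pixel_variance4x4_sse next to vpx_sub_pixel_variance8x8_sse2. A
simplified sketch of the token pasting, using hypothetical one-dimension
names rather than the file's full FN body:

/* opt1 suffixes the wide kernels, opt2 the 4-wide ones. */
#define DECL(w, opt) void sub_pixel_variance_##w##_##opt(void)
#define DECLS(opt1, opt2) \
  DECL(4, opt2);          \
  DECL(8, opt1);          \
  DECL(16, opt1)

DECLS(sse2, sse); /* ..._4_sse, ..._8_sse2, ..._16_sse2 */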