ref: e4bdbd3c0b670b4af61e44f087535033031b2851
parent: 84e3639454de7243da971b74ba15dbc906b9a792
parent: 3fb55d24e86cbedd11fc718430b0758c3e8f01f0
author: James Zern <[email protected]>
date: Fri May 20 15:11:06 EDT 2016
Merge "Revert "Code clean of sub_pixel_variance4xh""
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1026,8 +1026,8 @@
make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
- make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
- make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));
+ make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse, 0),
+ make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse, 0)));
INSTANTIATE_TEST_CASE_P(
SSE2, VpxSubpelAvgVarianceTest,
@@ -1043,8 +1043,8 @@
make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
- make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
- make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
+ make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse, 0),
+ make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse, 0)));
#endif // CONFIG_USE_X86INC
#if CONFIG_VP9_HIGHBITDEPTH
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1493,10 +1493,10 @@
specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+ specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+ specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -1532,10 +1532,10 @@
specialize qw/vpx_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+ specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+ specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
#
# Specialty Subpixel
--- a/vpx_dsp/x86/subpel_variance_sse2.asm
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -57,8 +57,8 @@
paddd %6, %1
%endmacro
-%macro STORE_AND_RET 1
-%if %1 > 4
+%macro STORE_AND_RET 0
+%if mmsize == 16
; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
; We have to sign-extend it before adding the words within the register
@@ -78,9 +78,9 @@
movd [r1], m7 ; store sse
paddd m6, m4
movd raxd, m6 ; store sum as return value
-%else ; 4xh
- pshuflw m4, m6, 0xe
- pshuflw m3, m7, 0xe
+%else ; mmsize == 8
+ pshufw m4, m6, 0xe
+ pshufw m3, m7, 0xe
paddw m6, m4
paddd m7, m3
pcmpgtw m5, m6 ; mask for 0 > x
@@ -87,7 +87,7 @@
mov r1, ssem ; r1 = unsigned int *sse
punpcklwd m6, m5 ; sign-extend m6 word->dword
movd [r1], m7 ; store sse
- pshuflw m4, m6, 0xe
+ pshufw m4, m6, 0xe
paddd m6, m4
movd raxd, m6 ; store sum as return value
%endif
@@ -226,14 +226,8 @@
punpckhbw m3, m1, m5
punpcklbw m1, m5
%endif
-%if %1 > 4
punpckhbw m2, m0, m5
punpcklbw m0, m5
-%else
- punpcklbw m0, m5
- movhlps m2, m0
-%endif
-
%if %2 == 0 ; !avg
punpckhbw m3, m1, m5
punpcklbw m1, m5
@@ -245,40 +239,22 @@
%else ; %1 < 16
movh m0, [srcq]
%if %2 == 1 ; avg
-%if %1 > 4
+%if mmsize == 16
movhps m0, [srcq+src_strideq]
-%else ; 4xh
- movd m1, [srcq+src_strideq]
- punpckldq m0, m1
+%else ; mmsize == 8
+ punpckldq m0, [srcq+src_strideq]
%endif
%else ; !avg
movh m2, [srcq+src_strideq]
%endif
-
-%if %1 > 4
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
-%else ; 4xh
- movd m1, [dstq]
- movd m3, [dstq+dst_strideq]
-%endif
-
%if %2 == 1 ; avg
-%if %1 > 4
pavgb m0, [secq]
-%else
- movh m2, [secq]
- pavgb m0, m2
-%endif
punpcklbw m3, m5
punpcklbw m1, m5
-%if %1 > 4
punpckhbw m2, m0, m5
punpcklbw m0, m5
-%else ; 4xh
- punpcklbw m0, m5
- movhlps m2, m0
-%endif
%else ; !avg
punpcklbw m0, m5
punpcklbw m2, m5
@@ -295,7 +271,7 @@
%endif
dec block_height
jg .x_zero_y_zero_loop
- STORE_AND_RET %1
+ STORE_AND_RET
.x_zero_y_nonzero:
cmp y_offsetd, 4
@@ -323,9 +299,9 @@
movh m0, [srcq]
movh m2, [srcq+src_strideq]
%if %2 == 1 ; avg
-%if %1 > 4
+%if mmsize == 16
movhps m2, [srcq+src_strideq*2]
-%else ; 4xh
+%else ; mmsize == 8
%if %1 == 4
movh m1, [srcq+src_strideq*2]
punpckldq m2, m1
@@ -334,26 +310,18 @@
%endif
%endif
movh m1, [dstq]
-%if %1 > 4
+%if mmsize == 16
movlhps m0, m2
-%else ; 4xh
+%else ; mmsize == 8
punpckldq m0, m2
%endif
movh m3, [dstq+dst_strideq]
pavgb m0, m2
punpcklbw m1, m5
-%if %1 > 4
pavgb m0, [secq]
punpcklbw m3, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
-%else ; 4xh
- movh m4, [secq]
- pavgb m0, m4
- punpcklbw m3, m5
- punpcklbw m0, m5
- movhlps m2, m0
-%endif
%else ; !avg
movh m4, [srcq+src_strideq*2]
movh m1, [dstq]
@@ -375,7 +343,7 @@
%endif
dec block_height
jg .x_zero_y_half_loop
- STORE_AND_RET %1
+ STORE_AND_RET
.x_zero_y_nonhalf:
; x_offset == 0 && y_offset == bilin interpolation
@@ -383,7 +351,7 @@
lea bilin_filter, [bilin_filter_m]
%endif
shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+y_offsetq+16]
@@ -488,21 +456,11 @@
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
-%if %1 == 4
- movlhps m0, m2
-%endif
packuswb m0, m2
-%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
-%else ; 4xh
- movh m2, [secq]
- pavgb m0, m2
- punpcklbw m0, m5
- movhlps m2, m0
%endif
-%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
@@ -517,7 +475,7 @@
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
- STORE_AND_RET %1
+ STORE_AND_RET
.x_nonzero:
cmp x_offsetd, 4
@@ -548,31 +506,21 @@
movh m0, [srcq]
movh m4, [srcq+1]
%if %2 == 1 ; avg
-%if %1 > 4
+%if mmsize == 16
movhps m0, [srcq+src_strideq]
movhps m4, [srcq+src_strideq+1]
-%else ; 4xh
- movd m1, [srcq+src_strideq]
- punpckldq m0, m1
- movd m2, [srcq+src_strideq+1]
- punpckldq m4, m2
+%else ; mmsize == 8
+ punpckldq m0, [srcq+src_strideq]
+ punpckldq m4, [srcq+src_strideq+1]
%endif
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
pavgb m0, m4
punpcklbw m3, m5
-%if %1 > 4
pavgb m0, [secq]
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
-%else ; 4xh
- movh m2, [secq]
- pavgb m0, m2
- punpcklbw m1, m5
- punpcklbw m0, m5
- movhlps m2, m0
-%endif
%else ; !avg
movh m2, [srcq+src_strideq]
movh m1, [dstq]
@@ -595,7 +543,7 @@
%endif
dec block_height
jg .x_half_y_zero_loop
- STORE_AND_RET %1
+ STORE_AND_RET
.x_half_y_nonzero:
cmp y_offsetd, 4
@@ -638,7 +586,7 @@
movh m2, [srcq]
movh m3, [srcq+1]
%if %2 == 1 ; avg
-%if %1 > 4
+%if mmsize == 16
movhps m2, [srcq+src_strideq]
movhps m3, [srcq+src_strideq+1]
%else
@@ -653,31 +601,21 @@
%endif
%endif
pavgb m2, m3
-%if %1 > 4
+%if mmsize == 16
movlhps m0, m2
movhlps m4, m2
-%else ; 4xh
+%else ; mmsize == 8
punpckldq m0, m2
- pshuflw m4, m2, 0xe
+ pshufw m4, m2, 0xe
%endif
movh m1, [dstq]
pavgb m0, m2
movh m3, [dstq+dst_strideq]
-%if %1 > 4
pavgb m0, [secq]
-%else
- movh m2, [secq]
- pavgb m0, m2
-%endif
punpcklbw m3, m5
punpcklbw m1, m5
-%if %1 > 4
punpckhbw m2, m0, m5
punpcklbw m0, m5
-%else
- punpcklbw m0, m5
- movhlps m2, m0
-%endif
%else ; !avg
movh m4, [srcq+src_strideq]
movh m1, [srcq+src_strideq+1]
@@ -703,7 +641,7 @@
%endif
dec block_height
jg .x_half_y_half_loop
- STORE_AND_RET %1
+ STORE_AND_RET
.x_half_y_nonhalf:
; x_offset == 0.5 && y_offset == bilin interpolation
@@ -711,7 +649,7 @@
lea bilin_filter, [bilin_filter_m]
%endif
shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+y_offsetq+16]
@@ -828,21 +766,11 @@
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
-%if %1 == 4
- movlhps m0, m2
-%endif
packuswb m0, m2
-%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
-%else
- movh m2, [secq]
- pavgb m0, m2
- punpcklbw m0, m5
- movhlps m2, m0
%endif
-%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
@@ -858,7 +786,7 @@
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
- STORE_AND_RET %1
+ STORE_AND_RET
.x_nonhalf:
test y_offsetd, y_offsetd
@@ -869,7 +797,7 @@
lea bilin_filter, [bilin_filter_m]
%endif
shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
@@ -971,21 +899,11 @@
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
-%if %1 == 4
- movlhps m0, m2
-%endif
packuswb m0, m2
-%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
-%else
- movh m2, [secq]
- pavgb m0, m2
- punpcklbw m0, m5
- movhlps m2, m0
%endif
-%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
@@ -1000,7 +918,7 @@
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
- STORE_AND_RET %1
+ STORE_AND_RET
.x_nonhalf_y_nonzero:
cmp y_offsetd, 4
@@ -1011,7 +929,7 @@
lea bilin_filter, [bilin_filter_m]
%endif
shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
@@ -1171,21 +1089,11 @@
pavgw m2, m4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline - also consider going to bytes here
-%if %1 == 4
- movlhps m0, m2
-%endif
packuswb m0, m2
-%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
-%else
- movh m2, [secq]
- pavgb m0, m2
- punpcklbw m0, m5
- movhlps m2, m0
%endif
-%endif
punpcklbw m3, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
@@ -1202,7 +1110,7 @@
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
- STORE_AND_RET %1
+ STORE_AND_RET
.x_nonhalf_y_nonhalf:
%ifdef PIC
@@ -1210,7 +1118,7 @@
%endif
shl x_offsetd, filter_idx_shift
shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if ARCH_X86_64 && mmsize == 16
mova m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
mova m9, [bilin_filter+x_offsetq+16]
@@ -1437,21 +1345,11 @@
%endif
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
-%if %1 == 4
- movlhps m0, m2
-%endif
packuswb m0, m2
-%if %1 > 4
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
-%else
- movh m2, [secq]
- pavgb m0, m2
- punpcklbw m0, m5
- movhlps m2, m0
%endif
-%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
@@ -1468,7 +1366,7 @@
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
- STORE_AND_RET %1
+ STORE_AND_RET
%endmacro
; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
@@ -1477,22 +1375,26 @@
; location in the sse/2 version, rather than duplicating that code in the
; binary.
-INIT_XMM sse2
+INIT_MMX sse
SUBPEL_VARIANCE 4
+INIT_XMM sse2
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16
-INIT_XMM ssse3
+INIT_MMX ssse3
SUBPEL_VARIANCE 4
+INIT_XMM ssse3
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16
-INIT_XMM sse2
+INIT_MMX sse
SUBPEL_VARIANCE 4, 1
+INIT_XMM sse2
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
-INIT_XMM ssse3
+INIT_MMX ssse3
SUBPEL_VARIANCE 4, 1
+INIT_XMM ssse3
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -320,11 +320,11 @@
int height, unsigned int *sse, \
void *unused0, void *unused)
#define DECLS(opt1, opt2) \
- DECL(4, opt1); \
+ DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)
-DECLS(sse2, sse2);
+DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
@@ -380,10 +380,10 @@
FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \
FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \
FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \
-FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \
-FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))
+FN(4, 8, 4, 2, 3, opt2, (int32_t), (int32_t)); \
+FN(4, 4, 4, 2, 2, opt2, (int32_t), (int32_t))
-FNS(sse2, sse2);
+FNS(sse2, sse);
FNS(ssse3, ssse3);
#undef FNS
@@ -401,11 +401,11 @@
int height, unsigned int *sse, \
void *unused0, void *unused)
#define DECLS(opt1, opt2) \
-DECL(4, opt1); \
+DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)
-DECLS(sse2, sse2);
+DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS
@@ -466,8 +466,8 @@
FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \
FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \
FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \
-FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \
-FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))
+FN(4, 8, 4, 2, 3, opt2, (uint32_t), (int32_t)); \
+FN(4, 4, 4, 2, 2, opt2, (uint32_t), (int32_t))
FNS(sse2, sse);
FNS(ssse3, ssse3);