ref: afd2f68daef62d1185ba1e65971fdd3a7fc1d8eb
parent: 16c0ec711c02062d10cf9ad53e1a4978792ecb43
author: James Zern <[email protected]>
date: Tue Aug 4 13:52:57 EDT 2015
Revert "VP9_COPY_CONVOLVE_SSE2 optimization" This reverts commit a5e97d874b16ae5826b68515f1e35ffb44361cf8. Additionally: Revert "vpx_convolve_copy_sse2: fix win64" This reverts commit 22a8474fe7ec30d96f746dc6e4b23771758c071e. This change performs poorly on various x86_64 devices affecting performance by 1-3% at 1080P. Performance on chromebook like devices was mixed neutral to slightly negative, so there should be minimal change there. Change-Id: I95831233b4b84ee96369baa192a2d4cc7639658c
--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -16,289 +16,140 @@
%macro convolve_fn 1
INIT_XMM sse2
-cglobal convolve_%1, 4, 7, 8, src, src_stride, dst, dst_stride, \
+cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
fx, fxs, fy, fys, w, h
- mov r4d, dword wm
- cmp r4d, 4
+ mov r4d, dword wm
+ cmp r4d, 4
je .w4
- cmp r4d, 8
+ cmp r4d, 8
je .w8
- cmp r4d, 16
+ cmp r4d, 16
je .w16
- cmp r4d, 32
+ cmp r4d, 32
je .w32
- ; 64xh
- mov r4d, dword hm
- shr r4d, 1 ; ASSUMPTION: hm is at least EVEN
- sub r4d, 1
-
- movu m0, [srcq]
- movu m4, [srcq+src_strideq]
- movu m1, [srcq+16]
- movu m5, [srcq+src_strideq+16]
- movu m2, [srcq+32]
- movu m6, [srcq+src_strideq+32]
- movu m3, [srcq+48]
- movu m7, [srcq+src_strideq+48]
-
+ mov r4d, dword hm
.loop64:
- prefetcht0 [srcq+64 ]
- prefetcht0 [srcq+src_strideq+64]
-
- lea srcq, [srcq+src_strideq*2]
-
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+ add srcq, src_strideq
%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+16]
-
- mova [dstq ], m0
- movu m0, [srcq]
-
- mova [dstq+16], m1
- movu m1, [srcq+16]
-
- pavgb m2, [dstq+32]
- mova [dstq+32], m2
- movu m2, [srcq+32]
- pavgb m3, [dstq+48]
- mova [dstq+48], m3
- movu m3, [srcq+48]
- pavgb m4, [dstq+dst_strideq]
-
- mova [dstq+dst_strideq], m4
- movu m4, [srcq+src_strideq]
-
- pavgb m5, [dstq+dst_strideq+16]
- mova [dstq+dst_strideq+16], m5
- movu m5, [srcq+src_strideq+16]
- pavgb m6, [dstq+dst_strideq+32]
- mova [dstq+dst_strideq+32], m6
- movu m6, [srcq+src_strideq+32]
- pavgb m7, [dstq+dst_strideq+48]
- mova [dstq+dst_strideq+48], m7
- movu m7, [srcq+src_strideq+48]
-
- lea dstq, [dstq+dst_strideq*2]
-%else
- mova [dstq ], m0
- movu m0, [srcq]
-
- mova [dstq+16], m1
- movu m1, [srcq+16]
- mova [dstq+32], m2
- movu m2, [srcq+32]
- mova [dstq+48], m3
- movu m3, [srcq+48]
-
- mova [dstq+dst_strideq], m4
- movu m4, [srcq+src_strideq]
-
- mova [dstq+dst_strideq+16], m5
- movu m5, [srcq+src_strideq+16]
- mova [dstq+dst_strideq+32], m6
- movu m6, [srcq+src_strideq+32]
- mova [dstq+dst_strideq+48], m7
- movu m7, [srcq+src_strideq+48]
-
- lea dstq, [dstq+dst_strideq*2]
+ pavgb m0, [dstq]
+ pavgb m1, [dstq+16]
+ pavgb m2, [dstq+32]
+ pavgb m3, [dstq+48]
%endif
- dec r4d
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ add dstq, dst_strideq
+ dec r4d
jnz .loop64
-
-%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+16]
- pavgb m2, [dstq+32]
- pavgb m3, [dstq+48]
- pavgb m4, [dstq+dst_strideq]
- pavgb m5, [dstq+dst_strideq+16]
- pavgb m6, [dstq+dst_strideq+32]
- pavgb m7, [dstq+dst_strideq+48]
-%endif
- mova [dstq ], m0
- mova [dstq+16], m1
- mova [dstq+32], m2
- mova [dstq+48], m3
-
- mova [dstq+dst_strideq ], m4
- mova [dstq+dst_strideq+16], m5
- mova [dstq+dst_strideq+32], m6
- mova [dstq+dst_strideq+48], m7
-
RET
.w32:
- mov r4d, dword hm
- sub r4d, 2
-
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+src_strideq]
- movu m3, [srcq+src_strideq+16]
-
+ mov r4d, dword hm
.loop32:
- prefetcht0 [srcq+64]
- prefetcht0 [srcq+src_strideq+64]
-
- lea srcq, [srcq+src_strideq*2]
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+src_strideq]
+ movu m3, [srcq+src_strideq+16]
+ lea srcq, [srcq+src_strideq*2]
%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+16]
- pavgb m2, [dstq+dst_strideq]
- pavgb m3, [dstq+dst_strideq+16]
+ pavgb m0, [dstq]
+ pavgb m1, [dstq +16]
+ pavgb m2, [dstq+dst_strideq]
+ pavgb m3, [dstq+dst_strideq+16]
%endif
- mova [dstq], m0
- movu m0, [srcq]
-
- mova [dstq+16], m1
- movu m1, [srcq+16]
-
- mova [dstq+dst_strideq], m2
- movu m2, [srcq+src_strideq]
-
- mova [dstq+dst_strideq+16], m3
- movu m3, [srcq+src_strideq+16]
-
- lea dstq, [dstq+dst_strideq*2]
-
- sub r4d, 2
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+dst_strideq ], m2
+ mova [dstq+dst_strideq+16], m3
+ lea dstq, [dstq+dst_strideq*2]
+ sub r4d, 2
jnz .loop32
-
-%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+16]
- pavgb m2, [dstq+dst_strideq]
- pavgb m3, [dstq+dst_strideq+16]
-%endif
- mova [dstq ], m0
- mova [dstq+16], m1
-
- mova [dstq+dst_strideq ], m2
- mova [dstq+dst_strideq+16], m3
-
RET
.w16:
- mov r4d, dword hm
- sub r4d, 4
-
- movu m0, [srcq]
- movu m1, [srcq+src_strideq]
-
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
.loop16:
- lea srcq, [srcq+src_strideq]
- prefetcht0 [srcq+src_strideq*4]
- lea srcq, [srcq+src_strideq]
- prefetcht0 [srcq+src_strideq*2]
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq]
+ movu m2, [srcq+src_strideq*2]
+ movu m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+dst_strideq]
+ pavgb m0, [dstq]
+ pavgb m1, [dstq+dst_strideq]
+ pavgb m2, [dstq+dst_strideq*2]
+ pavgb m3, [dstq+r6q]
%endif
- mova [dstq ], m0
- mova [dstq+dst_strideq], m1
-
- lea dstq, [dstq+dst_strideq*2]
-
- movu m0, [srcq]
- movu m1, [srcq+src_strideq]
-
- sub r4d, 2
+ mova [dstq ], m0
+ mova [dstq+dst_strideq ], m1
+ mova [dstq+dst_strideq*2], m2
+ mova [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
jnz .loop16
-
- lea srcq, [srcq+src_strideq*2]
-%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+dst_strideq]
-%endif
- mova [dstq ], m0
- mova [dstq+dst_strideq], m1
-
- lea dstq, [dstq+dst_strideq*2]
-
- movu m0, [srcq]
- movu m1, [srcq+src_strideq]
-
-%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+dst_strideq]
-%endif
-
- mova [dstq ], m0
- mova [dstq+dst_strideq], m1
-
RET
INIT_MMX sse
.w8:
- mov r4d, dword hm
- sub r4d, 2
-
- movu m0, [srcq]
- movu m1, [srcq+src_strideq]
-
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
.loop8:
- lea srcq, [srcq+src_strideq]
- prefetcht0 [srcq+src_strideq*4]
- lea srcq, [srcq+src_strideq]
- prefetcht0 [srcq+src_strideq*2]
-
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq]
+ movu m2, [srcq+src_strideq*2]
+ movu m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+dst_strideq]
+ pavgb m0, [dstq]
+ pavgb m1, [dstq+dst_strideq]
+ pavgb m2, [dstq+dst_strideq*2]
+ pavgb m3, [dstq+r6q]
%endif
- mova [dstq ], m0
- mova [dstq+dst_strideq], m1
-
- movu m0, [srcq]
- movu m1, [srcq+src_strideq]
-
- lea dstq, [dstq+dst_strideq*2]
-
- sub r4d, 2
+ mova [dstq ], m0
+ mova [dstq+dst_strideq ], m1
+ mova [dstq+dst_strideq*2], m2
+ mova [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
jnz .loop8
-
-%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+dst_strideq]
-%endif
- mova [dstq ], m0
- mova [dstq+dst_strideq], m1
-
RET
.w4:
- mov r4d, dword hm
-
- lea r5q, [src_strideq*3]
- lea r6q, [dst_strideq*3]
-
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
.loop4:
- movh m0, [srcq]
- movh m1, [srcq+src_strideq]
- movh m2, [srcq+src_strideq*2]
- movh m3, [srcq+r5q]
-
- lea srcq, [srcq+src_strideq*4]
+ movh m0, [srcq]
+ movh m1, [srcq+src_strideq]
+ movh m2, [srcq+src_strideq*2]
+ movh m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
- movh m4, [dstq]
- movh m5, [dstq+dst_strideq]
- movh m6, [dstq+dst_strideq*2]
- movh m7, [dstq+r6q]
-
- pavgb m0, m4
- pavgb m1, m5
- pavgb m2, m6
- pavgb m3, m7
+ movh m4, [dstq]
+ movh m5, [dstq+dst_strideq]
+ movh m6, [dstq+dst_strideq*2]
+ movh m7, [dstq+r6q]
+ pavgb m0, m4
+ pavgb m1, m5
+ pavgb m2, m6
+ pavgb m3, m7
%endif
- movh [dstq ], m0
- movh [dstq+dst_strideq ], m1
- movh [dstq+dst_strideq*2], m2
- movh [dstq+r6q ], m3
-
- lea dstq, [dstq+dst_strideq*4]
-
- sub r4d, 4
+ movh [dstq ], m0
+ movh [dstq+dst_strideq ], m1
+ movh [dstq+dst_strideq*2], m2
+ movh [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
jnz .loop4
RET
%endmacro
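The %ifidn %1, avg checks select between the two variants at
macro-expansion time, so a single body yields both convolve_copy and
convolve_avg. The macro is expanded once per variant outside the hunk
shown here, which in the upstream file amounts to:

convolve_fn copy
convolve_fn avg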