ref: cfd5e0221c51e79986a9b44104e5a81fe33dfa5f
parent: 168eea5d60aafc9b13555ee0da7661330918de4b
author: James Zern <[email protected]>
date: Sat Jun 25 07:26:27 EDT 2016
Revert "Update vpx subpixel 1d filter ssse3 asm" This reverts commit 1517fb74fd40eaab67246e8fb81d5c321bb33b06. Fixes a segfault in windows x64 builds. Change-Id: I6a6959cd7e64a28376849a9f2b11fc852a7c1fbe
--- a/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
+++ b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
@@ -23,7 +23,11 @@
; z = signed SAT(x + y)
SECTION .text
-%define LOCAL_VARS_SIZE 16*6
+%if ARCH_X86_64
+ %define LOCAL_VARS_SIZE 16*4
+%else
+ %define LOCAL_VARS_SIZE 16*6
+%endif
%macro SETUP_LOCAL_VARS 0
; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
@@ -50,11 +54,11 @@
mova k6k7, m3
%if ARCH_X86_64
%define krd m12
- %define tmp0 [rsp + 16*4]
- %define tmp1 [rsp + 16*5]
+ %define tmp m13
mova krd, [GLOBAL(pw_64)]
%else
- %define krd [rsp + 16*4]
+ %define tmp [rsp + 16*4]
+ %define krd [rsp + 16*5]
%if CONFIG_PIC=0
mova m6, [GLOBAL(pw_64)]
%else
@@ -67,22 +71,40 @@
%endif
%endm
-;-------------------------------------------------------------------------------
-%if ARCH_X86_64
- %define LOCAL_VARS_SIZE_H4 0
-%else
- %define LOCAL_VARS_SIZE_H4 16*4
-%endif
+%macro HORIZx4_ROW 2 ; arg1 = 16 source bytes (clobbered), arg2 = output: filtered row packed to bytes (low 4 bytes are stored by the caller); also clobbers m3-m5
+ mova %2, %1 ; second copy so the low and high halves can be widened separately
+ punpcklbw %1, %1 ; arg1 = low 8 source bytes, each byte duplicated
+ punpckhbw %2, %2 ; arg2 = high 8 source bytes, each byte duplicated
+ mova m3, %2
+ palignr %2, %1, 1 ; shift the duplicated stream by 1: adjacent pairs (b0,b1),(b1,b2),...
+ palignr m3, %1, 5 ; shift by 5: adjacent pairs starting at (b2,b3)
+
+ pmaddubsw %2, k0k1k4k5 ; low qword: pairs * k0k1, high qword: pairs * k4k5
+ pmaddubsw m3, k2k3k6k7 ; low qword: pairs * k2k3, high qword: pairs * k6k7
+ mova m4, %2 ;k0k1 products (low qword)
+ mova m5, m3 ;k2k3 products (low qword)
+ psrldq %2, 8 ;k4k5 products moved down to the low qword
+ psrldq m3, 8 ;k6k7 products moved down to the low qword
+ paddsw %2, m4 ; k4k5 + k0k1 (only the low four words are meaningful from here on)
+ paddsw m5, m3 ; k2k3 + k6k7
+ paddsw %2, m5 ; saturating sum of all four partial products
+ paddsw %2, krd ; add krd (loaded from pw_64 on the x86-64 path) ahead of the shift by 7
+ psraw %2, 7
+ packuswb %2, %2 ; saturate the words to unsigned bytes
+%endm
+
+;-------------------------------------------------------------------------------
%macro SUBPIX_HFILTER4 1
-cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
+cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
src, sstride, dst, dstride, height, filter
mova m4, [filterq]
packsswb m4, m4
%if ARCH_X86_64
- %define k0k1k4k5 m8
- %define k2k3k6k7 m9
- %define krd m10
+ %define k0k1k4k5 m8
+ %define k2k3k6k7 m9
+ %define krd m10
+ %define orig_height r7d
mova krd, [GLOBAL(pw_64)]
pshuflw k0k1k4k5, m4, 0b ;k0_k1
pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
@@ -89,9 +111,10 @@
pshuflw k2k3k6k7, m4, 01010101b ;k2_k3
pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
%else
- %define k0k1k4k5 [rsp + 16*0]
- %define k2k3k6k7 [rsp + 16*1]
- %define krd [rsp + 16*2]
+ %define k0k1k4k5 [rsp + 16*0]
+ %define k2k3k6k7 [rsp + 16*1]
+ %define krd [rsp + 16*2]
+ %define orig_height [rsp + 16*3]
pshuflw m6, m4, 0b ;k0_k1
pshufhw m6, m6, 10101010b ;k0_k1_k4_k5
pshuflw m7, m4, 01010101b ;k2_k3
@@ -108,46 +131,61 @@
mova k2k3k6k7, m7
mova krd, m1
%endif
- dec heightd
-
+ mov orig_height, heightd
+ shr heightd, 1
.loop:
;Do two rows at once
- movu m4, [srcq - 3]
- movu m5, [srcq + sstrideq - 3]
- punpckhbw m1, m4, m4
- punpcklbw m4, m4
- punpckhbw m3, m5, m5
- punpcklbw m5, m5
- palignr m0, m1, m4, 1
- pmaddubsw m0, k0k1k4k5
- palignr m1, m4, 5
+ movh m0, [srcq - 3]
+ movh m1, [srcq + 5]
+ punpcklqdq m0, m1
+ mova m1, m0
+ movh m2, [srcq + sstrideq - 3]
+ movh m3, [srcq + sstrideq + 5]
+ punpcklqdq m2, m3
+ mova m3, m2
+ punpcklbw m0, m0
+ punpckhbw m1, m1
+ punpcklbw m2, m2
+ punpckhbw m3, m3
+ mova m4, m1
+ palignr m4, m0, 1
+ pmaddubsw m4, k0k1k4k5
+ palignr m1, m0, 5
pmaddubsw m1, k2k3k6k7
- palignr m2, m3, m5, 1
- pmaddubsw m2, k0k1k4k5
- palignr m3, m5, 5
+ mova m7, m3
+ palignr m7, m2, 1
+ pmaddubsw m7, k0k1k4k5
+ palignr m3, m2, 5
pmaddubsw m3, k2k3k6k7
- punpckhqdq m4, m0, m2
- punpcklqdq m0, m2
- punpckhqdq m5, m1, m3
- punpcklqdq m1, m3
- paddsw m0, m4
- paddsw m1, m5
-%ifidn %1, h8_avg
- movd m4, [dstq]
- movd m5, [dstq + dstrideq]
-%endif
- paddsw m0, m1
- paddsw m0, krd
- psraw m0, 7
- packuswb m0, m0
- psrldq m1, m0, 4
+ mova m0, m4 ;k0k1
+ mova m5, m1 ;k2k3
+ mova m2, m7 ;k0k1 upper
+ psrldq m4, 8 ;k4k5
+ psrldq m1, 8 ;k6k7
+ paddsw m4, m0
+ paddsw m5, m1
+ mova m1, m3 ;k2k3 upper
+ psrldq m7, 8 ;k4k5 upper
+ psrldq m3, 8 ;k6k7 upper
+ paddsw m7, m2
+ paddsw m4, m5
+ paddsw m1, m3
+ paddsw m7, m1
+ paddsw m4, krd
+ psraw m4, 7
+ packuswb m4, m4
+ paddsw m7, krd
+ psraw m7, 7
+ packuswb m7, m7
%ifidn %1, h8_avg
- pavgb m0, m4
- pavgb m1, m5
+ movd m0, [dstq]
+ pavgb m4, m0
+ movd m2, [dstq + dstrideq]
+ pavgb m7, m2
%endif
- movd [dstq], m0
- movd [dstq + dstrideq], m1
+ movd [dstq], m4
+ movd [dstq + dstrideq], m7
lea srcq, [srcq + sstrideq ]
prefetcht0 [srcq + 4 * sstrideq - 3]
@@ -155,89 +193,127 @@
lea dstq, [dstq + 2 * dstrideq ]
prefetcht0 [srcq + 2 * sstrideq - 3]
- sub heightd, 2
- jg .loop
+ dec heightd
+ jnz .loop
; Do last row if output_height is odd
- jne .done
+ mov heightd, orig_height
+ and heightd, 1
+ je .done
- movu m4, [srcq - 3]
- punpckhbw m1, m4, m4
- punpcklbw m4, m4
- palignr m0, m1, m4, 1
- palignr m1, m4, 5
- pmaddubsw m0, k0k1k4k5
- pmaddubsw m1, k2k3k6k7
- psrldq m2, m0, 8
- psrldq m3, m1, 8
- paddsw m0, m2
- paddsw m1, m3
- paddsw m0, m1
- paddsw m0, krd
- psraw m0, 7
- packuswb m0, m0
+ movh m0, [srcq - 3] ; load src
+ movh m1, [srcq + 5]
+ punpcklqdq m0, m1
+
+ HORIZx4_ROW m0, m1
%ifidn %1, h8_avg
- movd m4, [dstq]
- pavgb m0, m4
+ movd m0, [dstq]
+ pavgb m1, m0
%endif
- movd [dstq], m0
+ movd [dstq], m1
.done
- REP_RET
+ RET
%endm
+%macro HORIZx8_ROW 5 ; arg1 = 16 source bytes, arg2-arg5 = scratch registers; the caller reads the packed result back through arg1 (see SWAP at the end)
+ mova %2, %1 ; second copy so the low and high halves can be widened separately
+ punpcklbw %1, %1 ; arg1 = low 8 source bytes, each byte duplicated
+ punpckhbw %2, %2 ; arg2 = high 8 source bytes, each byte duplicated
+
+ mova %3, %2
+ mova %4, %2
+ mova %5, %2
+
+ palignr %2, %1, 1 ; adjacent byte pairs starting at (b0,b1)
+ palignr %3, %1, 5 ; adjacent byte pairs starting at (b2,b3)
+ palignr %4, %1, 9 ; adjacent byte pairs starting at (b4,b5)
+ palignr %5, %1, 13 ; adjacent byte pairs starting at (b6,b7)
+
+ pmaddubsw %2, k0k1 ; each pair multiplied by its two filter taps and summed into a 16-bit word
+ pmaddubsw %3, k2k3
+ pmaddubsw %4, k4k5
+ pmaddubsw %5, k6k7
+ paddsw %2, %4 ; k0k1 + k4k5
+ paddsw %5, %3 ; k6k7 + k2k3
+ paddsw %2, %5 ; saturating sum of all four partial products
+ paddsw %2, krd ; add krd ahead of the shift by 7
+ psraw %2, 7
+ packuswb %2, %2 ; saturate the words to unsigned bytes
+ SWAP %1, %2 ; NOTE(review): SWAP is defined outside this view (presumably the x86inc register-renaming macro); the caller uses the register it passed as arg1 as the result
+%endm
+
;-------------------------------------------------------------------------------
%macro SUBPIX_HFILTER8 1
-cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
+cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 14, LOCAL_VARS_SIZE, \
src, sstride, dst, dstride, height, filter
mova m4, [filterq]
SETUP_LOCAL_VARS
- dec heightd
+%if ARCH_X86_64
+ %define orig_height r7d
+%else
+ %define orig_height heightmp
+%endif
+ mov orig_height, heightd
+ shr heightd, 1
.loop:
- ;Do two rows at once
- movu m0, [srcq - 3]
- movu m4, [srcq + sstrideq - 3]
- punpckhbw m1, m0, m0
+ movh m0, [srcq - 3]
+ movh m3, [srcq + 5]
+ movh m4, [srcq + sstrideq - 3]
+ movh m7, [srcq + sstrideq + 5]
+ punpcklqdq m0, m3
+ mova m1, m0
punpcklbw m0, m0
- palignr m5, m1, m0, 13
+ punpckhbw m1, m1
+ mova m5, m1
+ palignr m5, m0, 13
pmaddubsw m5, k6k7
- palignr m2, m1, m0, 5
- palignr m3, m1, m0, 9
+ mova m2, m1
+ mova m3, m1
palignr m1, m0, 1
pmaddubsw m1, k0k1
- punpckhbw m6, m4, m4
+ punpcklqdq m4, m7
+ mova m6, m4
punpcklbw m4, m4
+ palignr m2, m0, 5
+ punpckhbw m6, m6
+ palignr m3, m0, 9
+ mova m7, m6
pmaddubsw m2, k2k3
pmaddubsw m3, k4k5
- palignr m7, m6, m4, 13
- palignr m0, m6, m4, 5
+ palignr m7, m4, 13
+ mova m0, m6
+ palignr m0, m4, 5
pmaddubsw m7, k6k7
paddsw m1, m3
paddsw m2, m5
paddsw m1, m2
-%ifidn %1, h8_avg
- movh m2, [dstq]
- movhps m2, [dstq + dstrideq]
-%endif
- palignr m5, m6, m4, 9
+ mova m5, m6
palignr m6, m4, 1
pmaddubsw m0, k2k3
pmaddubsw m6, k0k1
+ palignr m5, m4, 9
paddsw m1, krd
pmaddubsw m5, k4k5
psraw m1, 7
paddsw m0, m7
+%ifidn %1, h8_avg
+ movh m7, [dstq]
+ movh m2, [dstq + dstrideq]
+%endif
+ packuswb m1, m1
paddsw m6, m5
paddsw m6, m0
paddsw m6, krd
psraw m6, 7
- packuswb m1, m6
+ packuswb m6, m6
%ifidn %1, h8_avg
- pavgb m1, m2
+ pavgb m1, m7
+ pavgb m6, m2
%endif
- movh [dstq], m1
- movhps [dstq + dstrideq], m1
+ movh [dstq], m1
+ movh [dstq + dstrideq], m6
lea srcq, [srcq + sstrideq ]
prefetcht0 [srcq + 4 * sstrideq - 3]
@@ -244,67 +320,78 @@
lea srcq, [srcq + sstrideq ]
lea dstq, [dstq + 2 * dstrideq ]
prefetcht0 [srcq + 2 * sstrideq - 3]
- sub heightd, 2
- jg .loop
+ dec heightd
+ jnz .loop
- ; Do last row if output_height is odd
- jne .done
+ ;Do last row if output_height is odd
+ mov heightd, orig_height
+ and heightd, 1
+ je .done
- movu m0, [srcq - 3]
- punpckhbw m3, m0, m0
- punpcklbw m0, m0
- palignr m1, m3, m0, 1
- palignr m2, m3, m0, 5
- palignr m4, m3, m0, 13
- palignr m3, m0, 9
- pmaddubsw m1, k0k1
- pmaddubsw m2, k2k3
- pmaddubsw m3, k4k5
- pmaddubsw m4, k6k7
- paddsw m1, m3
- paddsw m4, m2
- paddsw m1, m4
- paddsw m1, krd
- psraw m1, 7
- packuswb m1, m1
+ movh m0, [srcq - 3]
+ movh m3, [srcq + 5]
+ punpcklqdq m0, m3
+
+ HORIZx8_ROW m0, m1, m2, m3, m4
+
%ifidn %1, h8_avg
- movh m0, [dstq]
- pavgb m1, m0
+ movh m1, [dstq]
+ pavgb m0, m1
%endif
- movh [dstq], m1
+ movh [dstq], m0
.done:
- REP_RET
+ RET
%endm
;-------------------------------------------------------------------------------
%macro SUBPIX_HFILTER16 1
-cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
+cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \
src, sstride, dst, dstride, height, filter
mova m4, [filterq]
SETUP_LOCAL_VARS
-
.loop:
prefetcht0 [srcq + 2 * sstrideq -3]
- movu m0, [srcq - 3]
- movu m4, [srcq - 2]
+ movh m0, [srcq - 3]
+ movh m4, [srcq + 5]
+ movh m6, [srcq + 13]
+ punpcklqdq m0, m4
+ mova m7, m0
+ punpckhbw m0, m0
+ mova m1, m0
+ punpcklqdq m4, m6
+ mova m3, m0
+ punpcklbw m7, m7
+
+ palignr m3, m7, 13
+ mova m2, m0
+ pmaddubsw m3, k6k7
+ palignr m0, m7, 1
pmaddubsw m0, k0k1
- pmaddubsw m4, k0k1
- movu m1, [srcq - 1]
- movu m5, [srcq + 0]
+ palignr m1, m7, 5
pmaddubsw m1, k2k3
- pmaddubsw m5, k2k3
- movu m2, [srcq + 1]
- movu m6, [srcq + 2]
+ palignr m2, m7, 9
pmaddubsw m2, k4k5
+ paddsw m1, m3
+ mova m3, m4
+ punpckhbw m4, m4
+ mova m5, m4
+ punpcklbw m3, m3
+ mova m7, m4
+ palignr m5, m3, 5
+ mova m6, m4
+ palignr m4, m3, 1
+ pmaddubsw m4, k0k1
+ pmaddubsw m5, k2k3
+ palignr m6, m3, 9
pmaddubsw m6, k4k5
- movu m3, [srcq + 3]
- movu m7, [srcq + 4]
- pmaddubsw m3, k6k7
+ palignr m7, m3, 13
pmaddubsw m7, k6k7
paddsw m0, m2
- paddsw m1, m3
paddsw m0, m1
+%ifidn %1, h8_avg
+ mova m1, [dstq]
+%endif
paddsw m4, m6
paddsw m5, m7
paddsw m4, m5
@@ -312,11 +399,9 @@
paddsw m4, krd
psraw m0, 7
psraw m4, 7
- packuswb m0, m0
- packuswb m4, m4
- punpcklbw m0, m4
+ packuswb m0, m4
%ifidn %1, h8_avg
- pavgb m0, [dstq]
+ pavgb m0, m1
%endif
lea srcq, [srcq + sstrideq]
mova [dstq], m0
@@ -323,7 +408,7 @@
lea dstq, [dstq + dstrideq]
dec heightd
jnz .loop
- REP_RET
+ RET
%endm
INIT_XMM ssse3
@@ -335,457 +420,204 @@
SUBPIX_HFILTER4 h8_avg
;-------------------------------------------------------------------------------
-
-; TODO(Linfeng): Detect cpu type and choose the code with better performance.
-%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
-
%macro SUBPIX_VFILTER 2
-cglobal filter_block1d%2_%1, 6, 6, 15, LOCAL_VARS_SIZE, \
+cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
src, sstride, dst, dstride, height, filter
mova m4, [filterq]
SETUP_LOCAL_VARS
-
-%ifidn %2, 8
- %define movx movh
+%if ARCH_X86_64
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
%else
- %define movx movd
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
%endif
+ mov src1q, srcq
+ add src1q, sstrideq
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
- dec heightd
-
-%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
-
-%if ARCH_X86_64
- %define src1q r7
- %define sstride6q r8
- %define dst_stride dstrideq
+%ifidn %2, 8
+ %define movx movh
%else
- %define src1q filterq
- %define sstride6q dstrideq
- %define dst_stride dstridemp
+ %define movx movd
%endif
- mov src1q, srcq
- add src1q, sstrideq
- lea sstride6q, [sstrideq + sstrideq * 4]
- add sstride6q, sstrideq ;pitch * 6
-
.loop:
- ;Do two rows at once
- movx m0, [srcq ] ;A
- movx m1, [src1q ] ;B
- punpcklbw m0, m1 ;A B
- movx m2, [srcq + sstrideq * 2 ] ;C
- pmaddubsw m0, k0k1
- mova m6, m2
- movx m3, [src1q + sstrideq * 2] ;D
- punpcklbw m2, m3 ;C D
- pmaddubsw m2, k2k3
- movx m4, [srcq + sstrideq * 4 ] ;E
- mova m7, m4
- movx m5, [src1q + sstrideq * 4] ;F
- punpcklbw m4, m5 ;E F
- pmaddubsw m4, k4k5
- punpcklbw m1, m6 ;A B next iter
- movx m6, [srcq + sstride6q ] ;G
- punpcklbw m5, m6 ;E F next iter
- punpcklbw m3, m7 ;C D next iter
- pmaddubsw m5, k4k5
- movx m7, [src1q + sstride6q ] ;H
- punpcklbw m6, m7 ;G H
- pmaddubsw m6, k6k7
- pmaddubsw m3, k2k3
- pmaddubsw m1, k0k1
- paddsw m0, m4
- paddsw m2, m6
- movx m6, [srcq + sstrideq * 8 ] ;H next iter
- punpcklbw m7, m6
- pmaddubsw m7, k6k7
- paddsw m0, m2
- paddsw m0, krd
- psraw m0, 7
- paddsw m1, m5
- packuswb m0, m0
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ punpcklbw m0, m1 ;A B
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ pmaddubsw m0, k0k1
+ mova m6, m2
+ movx m3, [src1q + sstrideq * 2] ;D
+ punpcklbw m2, m3 ;C D
+ pmaddubsw m2, k2k3
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ mova m7, m4
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m4, k4k5
+ punpcklbw m1, m6 ;A B next iter
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m5, m6 ;E F next iter
+ punpcklbw m3, m7 ;C D next iter
+ pmaddubsw m5, k4k5
+ movx m7, [src1q + sstride6q ] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m6, k6k7
+ pmaddubsw m3, k2k3
+ pmaddubsw m1, k0k1
+ paddsw m0, m4
+ paddsw m2, m6
+ movx m6, [srcq + sstrideq * 8 ] ;H next iter
+ punpcklbw m7, m6
+ pmaddubsw m7, k6k7
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ paddsw m1, m5
+ packuswb m0, m0
- paddsw m3, m7
- paddsw m1, m3
- paddsw m1, krd
- psraw m1, 7
- lea srcq, [srcq + sstrideq * 2 ]
- lea src1q, [src1q + sstrideq * 2]
- packuswb m1, m1
+ paddsw m3, m7
+ paddsw m1, m3
+ paddsw m1, krd
+ psraw m1, 7
+ lea srcq, [srcq + sstrideq * 2 ]
+ lea src1q, [src1q + sstrideq * 2]
+ packuswb m1, m1
%ifidn %1, v8_avg
- movx m2, [dstq]
- pavgb m0, m2
+ movx m2, [dstq]
+ pavgb m0, m2
%endif
- movx [dstq], m0
- add dstq, dst_stride
+ movx [dstq], m0
+ add dstq, dst_stride
%ifidn %1, v8_avg
- movx m3, [dstq]
- pavgb m1, m3
+ movx m3, [dstq]
+ pavgb m1, m3
%endif
- movx [dstq], m1
- add dstq, dst_stride
- sub heightd, 2
- jg .loop
+ movx [dstq], m1
+ add dstq, dst_stride
+ sub heightd, 2
+ cmp heightd, 1
+ jg .loop
- ; Do last row if output_height is odd
- jne .done
+ cmp heightd, 0
+ je .done
- movx m0, [srcq ] ;A
- movx m1, [srcq + sstrideq ] ;B
- movx m6, [srcq + sstride6q ] ;G
- punpcklbw m0, m1 ;A B
- movx m7, [src1q + sstride6q ] ;H
- pmaddubsw m0, k0k1
- movx m2, [srcq + sstrideq * 2 ] ;C
- punpcklbw m6, m7 ;G H
- movx m3, [src1q + sstrideq * 2] ;D
- pmaddubsw m6, k6k7
- movx m4, [srcq + sstrideq * 4 ] ;E
- punpcklbw m2, m3 ;C D
- movx m5, [src1q + sstrideq * 4] ;F
- punpcklbw m4, m5 ;E F
- pmaddubsw m2, k2k3
- pmaddubsw m4, k4k5
- paddsw m2, m6
- paddsw m0, m4
- paddsw m0, m2
- paddsw m0, krd
- psraw m0, 7
- packuswb m0, m0
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m0, m1 ;A B
+ movx m7, [src1q + sstride6q ] ;H
+ pmaddubsw m0, k0k1
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ punpcklbw m6, m7 ;G H
+ movx m3, [src1q + sstrideq * 2] ;D
+ pmaddubsw m6, k6k7
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ punpcklbw m2, m3 ;C D
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ paddsw m2, m6
+ paddsw m0, m4
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
%ifidn %1, v8_avg
- movx m1, [dstq]
- pavgb m0, m1
+ movx m1, [dstq]
+ pavgb m0, m1
%endif
- movx [dstq], m0
-
-%else
- ; ARCH_X86_64
-
- movx m0, [srcq ] ;A
- movx m1, [srcq + sstrideq ] ;B
- lea srcq, [srcq + sstrideq * 2 ]
- movx m2, [srcq] ;C
- movx m3, [srcq + sstrideq] ;D
- lea srcq, [srcq + sstrideq * 2 ]
- movx m4, [srcq] ;E
- movx m5, [srcq + sstrideq] ;F
- lea srcq, [srcq + sstrideq * 2 ]
- movx m6, [srcq] ;G
- punpcklbw m0, m1 ;A B
- punpcklbw m1, m2 ;A B next iter
- punpcklbw m2, m3 ;C D
- punpcklbw m3, m4 ;C D next iter
- punpcklbw m4, m5 ;E F
- punpcklbw m5, m6 ;E F next iter
-
-.loop:
- ;Do two rows at once
- movx m7, [srcq + sstrideq] ;H
- lea srcq, [srcq + sstrideq * 2 ]
- movx m14, [srcq] ;H next iter
- punpcklbw m6, m7 ;G H
- punpcklbw m7, m14 ;G H next iter
- pmaddubsw m8, m0, k0k1
- pmaddubsw m9, m1, k0k1
- mova m0, m2
- mova m1, m3
- pmaddubsw m10, m2, k2k3
- pmaddubsw m11, m3, k2k3
- mova m2, m4
- mova m3, m5
- pmaddubsw m4, k4k5
- pmaddubsw m5, k4k5
- paddsw m8, m4
- paddsw m9, m5
- mova m4, m6
- mova m5, m7
- pmaddubsw m6, k6k7
- pmaddubsw m7, k6k7
- paddsw m10, m6
- paddsw m11, m7
- paddsw m8, m10
- paddsw m9, m11
- mova m6, m14
- paddsw m8, krd
- paddsw m9, krd
- psraw m8, 7
- psraw m9, 7
-%ifidn %2, 4
- packuswb m8, m8
- packuswb m9, m9
-%else
- packuswb m8, m9
-%endif
-
-%ifidn %1, v8_avg
- movx m7, [dstq]
-%ifidn %2, 4
- movx m10, [dstq + dstrideq]
- pavgb m9, m10
-%else
- movhpd m7, [dstq + dstrideq]
-%endif
- pavgb m8, m7
-%endif
- movx [dstq], m8
-%ifidn %2, 4
- movx [dstq + dstrideq], m9
-%else
- movhpd [dstq + dstrideq], m8
-%endif
-
- lea dstq, [dstq + dstrideq * 2 ]
- sub heightd, 2
- jg .loop
-
- ; Do last row if output_height is odd
- jne .done
-
- movx m7, [srcq + sstrideq] ;H
- punpcklbw m6, m7 ;G H
- pmaddubsw m0, k0k1
- pmaddubsw m2, k2k3
- pmaddubsw m4, k4k5
- pmaddubsw m6, k6k7
- paddsw m0, m4
- paddsw m2, m6
- paddsw m0, m2
- paddsw m0, krd
- psraw m0, 7
- packuswb m0, m0
-%ifidn %1, v8_avg
- movx m1, [dstq]
- pavgb m0, m1
-%endif
- movx [dstq], m0
-
-%endif ; ARCH_X86_64
-
+ movx [dstq], m0
.done:
- REP_RET
-
+ RET
%endm
;-------------------------------------------------------------------------------
%macro SUBPIX_VFILTER16 1
-cglobal filter_block1d16_%1, 6, 6, 16, LOCAL_VARS_SIZE, \
+cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
+ mova m4, [filterq]
SETUP_LOCAL_VARS
-
-%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
-
%if ARCH_X86_64
- %define src1q r7
- %define sstride6q r8
- %define dst_stride dstrideq
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
%else
- %define src1q filterq
- %define sstride6q dstrideq
- %define dst_stride dstridemp
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
%endif
- lea src1q, [srcq + sstrideq]
- lea sstride6q, [sstrideq + sstrideq * 4]
- add sstride6q, sstrideq ;pitch * 6
+ mov src1q, srcq
+ add src1q, sstrideq
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
.loop:
- movh m0, [srcq ] ;A
- movh m1, [src1q ] ;B
- movh m2, [srcq + sstrideq * 2 ] ;C
- movh m3, [src1q + sstrideq * 2] ;D
- movh m4, [srcq + sstrideq * 4 ] ;E
- movh m5, [src1q + sstrideq * 4] ;F
+ movh m0, [srcq ] ;A
+ movh m1, [srcq + sstrideq ] ;B
+ movh m2, [srcq + sstrideq * 2 ] ;C
+ movh m3, [src1q + sstrideq * 2] ;D
+ movh m4, [srcq + sstrideq * 4 ] ;E
+ movh m5, [src1q + sstrideq * 4] ;F
- punpcklbw m0, m1 ;A B
- movh m6, [srcq + sstride6q] ;G
- punpcklbw m2, m3 ;C D
- movh m7, [src1q + sstride6q] ;H
- punpcklbw m4, m5 ;E F
- pmaddubsw m0, k0k1
- movh m3, [srcq + 8] ;A
- pmaddubsw m2, k2k3
- punpcklbw m6, m7 ;G H
- movh m5, [srcq + sstrideq + 8] ;B
- pmaddubsw m4, k4k5
- punpcklbw m3, m5 ;A B
- movh m7, [srcq + sstrideq * 2 + 8] ;C
- pmaddubsw m6, k6k7
- movh m5, [src1q + sstrideq * 2 + 8] ;D
- punpcklbw m7, m5 ;C D
- paddsw m2, m6
- pmaddubsw m3, k0k1
- movh m1, [srcq + sstrideq * 4 + 8] ;E
- paddsw m0, m4
- pmaddubsw m7, k2k3
- movh m6, [src1q + sstrideq * 4 + 8] ;F
- punpcklbw m1, m6 ;E F
- paddsw m0, m2
- paddsw m0, krd
- movh m2, [srcq + sstride6q + 8] ;G
- pmaddubsw m1, k4k5
- movh m5, [src1q + sstride6q + 8] ;H
- psraw m0, 7
- punpcklbw m2, m5 ;G H
- pmaddubsw m2, k6k7
- paddsw m7, m2
- paddsw m3, m1
- paddsw m3, m7
- paddsw m3, krd
- psraw m3, 7
- packuswb m0, m3
-
- add srcq, sstrideq
- add src1q, sstrideq
+ punpcklbw m0, m1 ;A B
+ movh m6, [srcq + sstride6q] ;G
+ punpcklbw m2, m3 ;C D
+ movh m7, [src1q + sstride6q] ;H
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m0, k0k1
+ movh m3, [srcq + 8] ;A
+ pmaddubsw m2, k2k3
+ punpcklbw m6, m7 ;G H
+ movh m5, [srcq + sstrideq + 8] ;B
+ pmaddubsw m4, k4k5
+ punpcklbw m3, m5 ;A B
+ movh m7, [srcq + sstrideq * 2 + 8] ;C
+ pmaddubsw m6, k6k7
+ movh m5, [src1q + sstrideq * 2 + 8] ;D
+ punpcklbw m7, m5 ;C D
+ paddsw m2, m6
+ pmaddubsw m3, k0k1
+ movh m1, [srcq + sstrideq * 4 + 8] ;E
+ paddsw m0, m4
+ pmaddubsw m7, k2k3
+ movh m6, [src1q + sstrideq * 4 + 8] ;F
+ punpcklbw m1, m6 ;E F
+ paddsw m0, m2
+ paddsw m0, krd
+ movh m2, [srcq + sstride6q + 8] ;G
+ pmaddubsw m1, k4k5
+ movh m5, [src1q + sstride6q + 8] ;H
+ psraw m0, 7
+ punpcklbw m2, m5 ;G H
+ pmaddubsw m2, k6k7
%ifidn %1, v8_avg
- pavgb m0, [dstq]
+ mova m4, [dstq]
%endif
- mova [dstq], m0
- add dstq, dst_stride
- dec heightd
- jnz .loop
- REP_RET
+ movh [dstq], m0
+ paddsw m7, m2
+ paddsw m3, m1
+ paddsw m3, m7
+ paddsw m3, krd
+ psraw m3, 7
+ packuswb m0, m3
-%else
- ; ARCH_X86_64
- dec heightd
-
- movu m1, [srcq ] ;A
- movu m3, [srcq + sstrideq ] ;B
- lea srcq, [srcq + sstrideq * 2]
- punpcklbw m0, m1, m3 ;A B
- punpckhbw m1, m3 ;A B
- movu m5, [srcq] ;C
- punpcklbw m2, m3, m5 ;A B next iter
- punpckhbw m3, m5 ;A B next iter
- mova tmp0, m2 ;store to stack
- mova tmp1, m3 ;store to stack
- movu m7, [srcq + sstrideq] ;D
- lea srcq, [srcq + sstrideq * 2]
- punpcklbw m4, m5, m7 ;C D
- punpckhbw m5, m7 ;C D
- movu m9, [srcq] ;E
- punpcklbw m6, m7, m9 ;C D next iter
- punpckhbw m7, m9 ;C D next iter
- movu m11, [srcq + sstrideq] ;F
- lea srcq, [srcq + sstrideq * 2]
- punpcklbw m8, m9, m11 ;E F
- punpckhbw m9, m11 ;E F
- movu m2, [srcq] ;G
- punpcklbw m10, m11, m2 ;E F next iter
- punpckhbw m11, m2 ;E F next iter
-
-.loop:
- ;Do two rows at once
- pmaddubsw m13, m0, k0k1
- mova m0, m4
- pmaddubsw m14, m8, k4k5
- pmaddubsw m15, m4, k2k3
- mova m4, m8
- paddsw m13, m14
- movu m3, [srcq + sstrideq] ;H
- lea srcq, [srcq + sstrideq * 2]
- punpcklbw m14, m2, m3 ;G H
- mova m8, m14
- pmaddubsw m14, k6k7
- paddsw m15, m14
- paddsw m13, m15
- paddsw m13, krd
- psraw m13, 7
-
- pmaddubsw m14, m1, k0k1
- pmaddubsw m1, m9, k4k5
- pmaddubsw m15, m5, k2k3
- paddsw m14, m1
- mova m1, m5
- mova m5, m9
- punpckhbw m2, m3 ;G H
- mova m9, m2
- pmaddubsw m2, k6k7
- paddsw m15, m2
- paddsw m14, m15
- paddsw m14, krd
- psraw m14, 7
- packuswb m13, m14
+ add srcq, sstrideq
+ add src1q, sstrideq
%ifidn %1, v8_avg
- pavgb m13, [dstq]
+ pavgb m0, m4
%endif
- mova [dstq], m13
-
- ; next iter
- pmaddubsw m15, tmp0, k0k1
- pmaddubsw m14, m10, k4k5
- pmaddubsw m13, m6, k2k3
- paddsw m15, m14
- mova tmp0, m6
- mova m6, m10
- movu m2, [srcq] ;G next iter
- punpcklbw m14, m3, m2 ;G H next iter
- mova m10, m14
- pmaddubsw m14, k6k7
- paddsw m13, m14
- paddsw m15, m13
- paddsw m15, krd
- psraw m15, 7
-
- pmaddubsw m14, tmp1, k0k1
- mova tmp1, m7
- pmaddubsw m13, m7, k2k3
- mova m7, m11
- pmaddubsw m11, k4k5
- paddsw m14, m11
- punpckhbw m3, m2 ;G H next iter
- mova m11, m3
- pmaddubsw m3, k6k7
- paddsw m13, m3
- paddsw m14, m13
- paddsw m14, krd
- psraw m14, 7
- packuswb m15, m14
-%ifidn %1, v8_avg
- pavgb m15, [dstq + dstrideq]
-%endif
- mova [dstq + dstrideq], m15
- lea dstq, [dstq + dstrideq * 2]
- sub heightd, 2
- jg .loop
-
- ; Do last row if output_height is odd
- jne .done
-
- movu m3, [srcq + sstrideq] ;H
- punpcklbw m6, m2, m3 ;G H
- punpckhbw m2, m3 ;G H
- pmaddubsw m0, k0k1
- pmaddubsw m1, k0k1
- pmaddubsw m4, k2k3
- pmaddubsw m5, k2k3
- pmaddubsw m8, k4k5
- pmaddubsw m9, k4k5
- pmaddubsw m6, k6k7
- pmaddubsw m2, k6k7
- paddsw m0, m8
- paddsw m1, m9
- paddsw m4, m6
- paddsw m5, m2
- paddsw m0, m4
- paddsw m1, m5
- paddsw m0, krd
- paddsw m1, krd
- psraw m0, 7
- psraw m1, 7
- packuswb m0, m1
-%ifidn %1, v8_avg
- pavgb m0, [dstq]
-%endif
- mova [dstq], m0
-
-.done:
- REP_RET
-
-%endif ; ARCH_X86_64
-
+ mova [dstq], m0
+ add dstq, dst_stride
+ dec heightd
+ jnz .loop
+ RET
%endm
INIT_XMM ssse3