ref: 578535509a789530c6bb9de061419a34fc8a1447
parent: 4769d4d0b8917e16106747442e24336cf44f5ad8
author: Sindre Aamås <[email protected]>
date: Tue Mar 7 09:36:08 EST 2017
[Processing/x86] Simplify downsample_bilinear X86_32_PICASM handling. Utilize program-counter-relative offsets to simplify the X86_32_PICASM code. In order for this to work with nasm, data constants are placed in the text segment. Placing constants in the text segment is avoided for some routines where putting the constants on the stack is convenient and efficient.
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -57,7 +57,11 @@
; Local Data (Read Only)
;***********************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=32
+%else
SECTION .rodata align=32
+%endif
;***********************************************************************
; Various memory constants (trigonometric values or rounding values)
@@ -64,6 +68,7 @@
;***********************************************************************
ALIGN 32
+%ifndef X86_32_PICASM
db80h_256:
times 32 db 80h
shufb_0000000088888888:
@@ -74,6 +79,7 @@
times 4 db 4
times 4 db 8
times 4 db 12
+%endif
shufb_mask_low:
db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
shufb_mask_high:
@@ -1253,20 +1259,7 @@
pmaddwd xmm2, xmm1
pshufd xmm1, xmm2, 00000001b
paddd xmm2, xmm1
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0x00000000
- push 0x00000000
- push 0x00000000
- push 0x00004000
- movdqa xmm1, [esp]
- mov esp, r0
- pop r0
-%else
movdqa xmm1, [add_extra_half]
-%endif
paddd xmm2, xmm1
psrld xmm2, 15
@@ -1567,20 +1560,7 @@
pmaddwd xmm2, xmm1
pshufd xmm1, xmm2, 00000001b
paddd xmm2, xmm1
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0x00000000
- push 0x00000000
- push 0x00000000
- push 0x00004000
- movdqa xmm1, [esp]
- mov esp, r0
- pop r0
-%else
movdqa xmm1, [add_extra_half]
-%endif
paddd xmm2, xmm1
psrld xmm2, 15
@@ -1657,6 +1637,12 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
+%ifdef X86_32_PICASM
+ %define i_height dword arg6
+%else
+ %define i_height r5
+%endif
+ INIT_X86_32_PIC_NOPRESERVE r5
%ifndef X86_32
push r12
@@ -1664,7 +1650,7 @@
%endif
mov r6, r1 ;Save the tailer for the unasigned size
- imul r6, r5
+ imul r6, i_height
add r6, r0
movdqa xmm7, [r6]
@@ -1697,52 +1683,15 @@
;1st line
movdqa xmm0, [r2] ;F * e E * d D * c C * b B * a A
movdqa xmm1, xmm0
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x80808080 ;shufb_mask_onethird_low_1
- push 0x80808080
- push 0x80800f0c
- push 0x09060300
- push 0x80808080 ;shufb_mask_onethird_high_1
- push 0x80808080
- push 0x8080800d
- push 0x0a070401
- push 0x80808080 ;shufb_mask_onethird_low_2
- push 0x800e0b08
- push 0x05028080
- push 0x80808080
- push 0x80808080 ;shufb_mask_onethird_high_2
- push 0x800f0c09
- push 0x06030080
- push 0x80808080
- push 0x0d0a0704 ;shufb_mask_onethird_low_3
- push 0x01808080
- push 0x80808080
- push 0x80808080
- push 0x0e0b0805 ;shufb_mask_onethird_high_3
- push 0x02808080
- push 0x80808080
- push 0x80808080
- movdqa xmm5, [esp+80]
- movdqa xmm6, [esp+64]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_1]
- movdqa xmm6, [shufb_mask_onethird_high_1]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
pshufb xmm0, xmm5 ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
pshufb xmm1, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
movdqa xmm2, [r2+16] ;k K * j J * i I * h H * g G * f
movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+48]
- movdqa xmm6, [esp+32]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_2]
- movdqa xmm6, [shufb_mask_onethird_high_2]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
pshufb xmm2, xmm5 ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
pshufb xmm3, xmm6 ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
@@ -1751,13 +1700,8 @@
movdqa xmm2, [r2+32] ;* p P * o O * n N * m M * l L *
movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+16]
- movdqa xmm6, [esp]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_3]
- movdqa xmm6, [shufb_mask_onethird_high_3]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
pshufb xmm2, xmm5 ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
pshufb xmm3, xmm6 ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
@@ -1768,25 +1712,15 @@
;2nd line
movdqa xmm2, [r2+r3] ;F' * e' E' * d' D' * c' C' * b' B' * a' A'
movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+80]
- movdqa xmm6, [esp+64]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_1]
- movdqa xmm6, [shufb_mask_onethird_high_1]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
pshufb xmm2, xmm5 ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
pshufb xmm3, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e' d' c' b' a' -> xmm3
movdqa xmm1, [r2+r3+16] ;k' K' * j' J' * i' I' * h' H' * g' G' * f'
movdqa xmm4, xmm1
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+48]
- movdqa xmm6, [esp+32]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_2]
- movdqa xmm6, [shufb_mask_onethird_high_2]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
pshufb xmm1, xmm5 ;0 0 0 0 0 K' J' I' H' G' 0 0 0 0 0 0 -> xmm1
pshufb xmm4, xmm6 ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
@@ -1795,15 +1729,8 @@
movdqa xmm1, [r2+r3+32] ; * p' P' * o' O' * n' N' * m' M' * l' L' *
movdqa xmm4, xmm1
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+16]
- movdqa xmm6, [esp]
- mov esp, r0
- pop r0
-%else
- movdqa xmm5, [shufb_mask_onethird_low_3]
- movdqa xmm6, [shufb_mask_onethird_high_3]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
pshufb xmm1, xmm5 ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
pshufb xmm4, xmm6 ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
@@ -1832,7 +1759,7 @@
lea r0, [r0+r1]
lea r0, [r0+r6] ;current dst lien + 1 line
- dec r5
+ dec i_height
jg near .yloops_onethird_sse3
movdqa [r0], xmm7 ;restore the tailer for the unasigned size
@@ -1841,6 +1768,7 @@
pop r12
%endif
+ DEINIT_X86_32_PIC
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -1847,6 +1775,7 @@
pop r6
%endif
ret
+%undef i_height
;***********************************************************************
; void DyadicBilinearOneThirdDownsampler_sse4( unsigned char* pDst, const int iDstStride,
@@ -1866,6 +1795,12 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
+%ifdef X86_32_PICASM
+ %define i_height dword arg6
+%else
+ %define i_height r5
+%endif
+ INIT_X86_32_PIC_NOPRESERVE r5
%ifndef X86_32
push r12
@@ -1873,7 +1808,7 @@
%endif
mov r6, r1 ;Save the tailer for the unasigned size
- imul r6, r5
+ imul r6, i_height
add r6, r0
movdqa xmm7, [r6]
@@ -1906,52 +1841,15 @@
;1st line
movntdqa xmm0, [r2] ;F * e E * d D * c C * b B * a A
movdqa xmm1, xmm0
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x80808080 ;shufb_mask_onethird_low_1
- push 0x80808080
- push 0x80800f0c
- push 0x09060300
- push 0x80808080 ;shufb_mask_onethird_high_1
- push 0x80808080
- push 0x8080800d
- push 0x0a070401
- push 0x80808080 ;shufb_mask_onethird_low_2
- push 0x800e0b08
- push 0x05028080
- push 0x80808080
- push 0x80808080 ;shufb_mask_onethird_high_2
- push 0x800f0c09
- push 0x06030080
- push 0x80808080
- push 0x0d0a0704 ;shufb_mask_onethird_low_3
- push 0x01808080
- push 0x80808080
- push 0x80808080
- push 0x0e0b0805 ;shufb_mask_onethird_high_3
- push 0x02808080
- push 0x80808080
- push 0x80808080
- movdqa xmm5, [esp+80]
- movdqa xmm6, [esp+64]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_1]
- movdqa xmm6, [shufb_mask_onethird_high_1]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
pshufb xmm0, xmm5 ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
pshufb xmm1, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
movntdqa xmm2, [r2+16] ;k K * j J * i I * h H * g G * f
movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+48]
- movdqa xmm6, [esp+32]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_2]
- movdqa xmm6, [shufb_mask_onethird_high_2]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
pshufb xmm2, xmm5 ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
pshufb xmm3, xmm6 ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
@@ -1960,13 +1858,8 @@
movntdqa xmm2, [r2+32] ;* p P * o O * n N * m M * l L *
movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+16]
- movdqa xmm6, [esp]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_3]
- movdqa xmm6, [shufb_mask_onethird_high_3]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
pshufb xmm2, xmm5 ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
pshufb xmm3, xmm6 ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
@@ -1977,25 +1870,15 @@
;2nd line
movntdqa xmm2, [r2+r3] ;F' * e' E' * d' D' * c' C' * b' B' * a' A'
movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+80]
- movdqa xmm6, [esp+64]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_1]
- movdqa xmm6, [shufb_mask_onethird_high_1]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
pshufb xmm2, xmm5 ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
pshufb xmm3, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e' d' c' b' a' -> xmm3
movntdqa xmm1, [r2+r3+16] ;k' K' * j' J' * i' I' * h' H' * g' G' * f'
movdqa xmm4, xmm1
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+48]
- movdqa xmm6, [esp+32]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_2]
- movdqa xmm6, [shufb_mask_onethird_high_2]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
pshufb xmm1, xmm5 ;0 0 0 0 0 K' J' I' H' G' 0 0 0 0 0 0 -> xmm1
pshufb xmm4, xmm6 ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
@@ -2004,15 +1887,8 @@
movntdqa xmm1, [r2+r3+32] ; * p' P' * o' O' * n' N' * m' M' * l' L' *
movdqa xmm4, xmm1
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+16]
- movdqa xmm6, [esp]
- mov esp, r0
- pop r0
-%else
- movdqa xmm5, [shufb_mask_onethird_low_3]
- movdqa xmm6, [shufb_mask_onethird_high_3]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
pshufb xmm1, xmm5 ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
pshufb xmm4, xmm6 ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
@@ -2041,7 +1917,7 @@
lea r0, [r0+r1]
lea r0, [r0+r6] ;current dst lien + 1 line
- dec r5
+ dec i_height
jg near .yloops_onethird_sse4
movdqa [r0], xmm7 ;restore the tailer for the unasigned size
@@ -2050,6 +1926,7 @@
pop r12
%endif
+ DEINIT_X86_32_PIC
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -2056,6 +1933,7 @@
pop r6
%endif
ret
+%undef i_height
;***********************************************************************
; void DyadicBilinearQuarterDownsampler_sse( unsigned char* pDst, const int iDstStride,
@@ -2256,20 +2134,10 @@
add r6, r0
movq xmm7, [r6]
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x80808080
- push 0x0d090501
- push 0x80808080
- push 0x0c080400
- movdqa xmm6, [esp]
- mov esp, r0
- pop r0
-%else
- movdqa xmm6, [shufb_mask_quarter]
-%endif
+ INIT_X86_32_PIC_NOPRESERVE r4
+ movdqa xmm6, [pic(shufb_mask_quarter)]
+ DEINIT_X86_32_PIC
+
.yloops_quarter_sse3:
;mov eax, [esp+40] ; iSrcWidth
;sar eax, $02 ; iSrcWidth >> 2
@@ -2378,20 +2246,9 @@
add r6, r0
movq xmm7, [r6]
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x80808080
- push 0x0d090501
- push 0x80808080
- push 0x0c080400
- movdqa xmm6, [esp]
- mov esp, r0
- pop r0
-%else
- movdqa xmm6, [shufb_mask_quarter] ;mask
-%endif
+ INIT_X86_32_PIC_NOPRESERVE r4
+ movdqa xmm6, [pic(shufb_mask_quarter)] ;mask
+ DEINIT_X86_32_PIC
.yloops_quarter_sse4:
%ifdef X86_32
@@ -2534,20 +2391,7 @@
%macro SSSE3_BilinearFastDownsample4xOrLess_8px 0
movdqa xmm_tmp0, xmm_xpos_int
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x08080808
- push 0x08080808
- push 0x00000000
- push 0x00000000
- pshufb xmm_tmp0, [esp]
- mov esp, r0
- pop r0
-%else
- pshufb xmm_tmp0, [shufb_0000000088888888]
-%endif
+ pshufb xmm_tmp0, xmm_shufb_0000000088888888
psubb xmm_xpos_int, xmm_tmp0
SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac
mov r_tmp0, i_xpos
@@ -2555,24 +2399,7 @@
lddqu xmm_tmp3, [p_src_row0 + r_tmp0]
lddqu xmm_tmp4, [p_src_row1 + r_tmp0]
movdqa xmm_tmp2, xmm_xpos_int
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x80808080 ;db80h_256
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- punpcklbw xmm_tmp2, [esp]
- mov esp, r5
- pop r5
-%else
- punpcklbw xmm_tmp2, [db80h_256]
-%endif
+ punpcklbw xmm_tmp2, xmm_db80h
pshufb xmm_tmp3, xmm_tmp2
pshufb xmm_tmp4, xmm_tmp2
SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
@@ -2585,24 +2412,7 @@
lddqu xmm_tmp3, [p_src_row0 + r_tmp0]
lddqu xmm_tmp4, [p_src_row1 + r_tmp0]
movdqa xmm_tmp2, xmm_xpos_int
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x80808080 ;db80h_256
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- punpckhbw xmm_tmp2, [esp]
- mov esp, r5
- pop r5
-%else
- punpckhbw xmm_tmp2, [db80h_256]
-%endif
+ punpckhbw xmm_tmp2, xmm_db80h
pshufb xmm_tmp3, xmm_tmp2
pshufb xmm_tmp4, xmm_tmp2
SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
@@ -2741,43 +2551,13 @@
%macro SSE41_BilinearAccurateDownsample4xOrLess_8px 0
movdqa xmm_tmp0, xmm_xpos_int
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x08080808
- push 0x08080808
- push 0x00000000
- push 0x00000000
- pshufb xmm_tmp0, [esp]
- mov esp, r0
- pop r0
-%else
- pshufb xmm_tmp0, [shufb_0000000088888888]
-%endif
+ pshufb xmm_tmp0, xmm_shufb_0000000088888888
psubb xmm_xpos_int, xmm_tmp0
SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
mov r_tmp0, i_xpos
shr r_tmp0, 16
movdqa xmm_tmp3, xmm_xpos_int
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x80808080 ;db80h_256
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- punpcklbw xmm_tmp3, [esp]
- mov esp, r5
- pop r5
-%else
- punpcklbw xmm_tmp3, [db80h_256]
-%endif
+ punpcklbw xmm_tmp3, xmm_db80h
lddqu xmm_tmp4, [p_src_row0 + r_tmp0]
lddqu xmm_tmp2, [p_src_row1 + r_tmp0]
lea r_tmp0, [i_xpos + 4 * i_scalex]
@@ -2789,24 +2569,7 @@
pmaddwd xmm_tmp2, xmm_tmp0
SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
movdqa xmm_tmp2, xmm_xpos_int
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x80808080 ;db80h_256
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- punpckhbw xmm_tmp2, [esp]
- mov esp, r5
- pop r5
-%else
- punpckhbw xmm_tmp2, [db80h_256]
-%endif
+ punpckhbw xmm_tmp2, xmm_db80h
lddqu xmm_tmp4, [p_src_row0 + r_tmp0]
lddqu xmm_tmp3, [p_src_row1 + r_tmp0]
pshufb xmm_tmp4, xmm_tmp2
@@ -2987,7 +2750,11 @@
movd xmm0, arg8
movd xmm1, esp
and esp, -16
+%ifdef X86_32_PICASM
+ sub esp, 8 * 4 + 9 * 16
+%else
sub esp, 8 * 4 + 7 * 16
+%endif
movd [esp], xmm1
%define p_dst r0
%define i_dst_stride_less_width [esp + 1 * 4]
@@ -3021,6 +2788,22 @@
%define xmm_0 [esp + 8 * 4 + 4 * 16]
%define xmm_xpos_int_begin [esp + 8 * 4 + 5 * 16]
%define xmm_xpos_frac_begin [esp + 8 * 4 + 6 * 16]
+%ifdef X86_32_PICASM
+ %define xmm_db80h [esp + 8 * 4 + 7 * 16]
+ %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 8 * 16]
+ pxor xmm_tmp4, xmm_tmp4
+ pcmpeqb xmm_tmp5, xmm_tmp5
+ psubb xmm_tmp4, xmm_tmp5
+ movdqa xmm_tmp3, xmm_tmp4
+ psllw xmm_tmp3, 3
+ pslldq xmm_tmp3, 8
+ movdqa xmm_shufb_0000000088888888, xmm_tmp3
+ psllw xmm_tmp4, 7
+ movdqa xmm_db80h, xmm_tmp4
+%else
+ %define xmm_db80h [db80h_256]
+ %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+%endif
mov i_dst_stride_less_width, r1
mov i_dst_width, r2
mov i_dst_height, r3
@@ -3067,6 +2850,8 @@
%define xmm_tmp5 xmm6
%define xmm_xpos_int_begin xmm14
%define xmm_xpos_frac_begin xmm15
+ %define xmm_db80h [db80h_256]
+ %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
pxor xmm_0, xmm_0
%endif
@@ -3230,6 +3015,8 @@
%undef xmm_xfrac0_begin
%undef xmm_xfrac1_begin
%undef xmm_xfrac_inc
+%undef xmm_db80h
+%undef xmm_shufb_0000000088888888
;**************************************************************************************************************
;void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
@@ -3265,7 +3052,11 @@
movd xmm0, arg8
movd xmm1, esp
and esp, -16
+%ifdef X86_32_PICASM
+ sub esp, 8 * 4 + 10 * 16
+%else
sub esp, 8 * 4 + 8 * 16
+%endif
movd [esp], xmm1
%define p_dst r0
%define i_dst_stride_less_width [esp + 1 * 4]
@@ -3300,6 +3091,22 @@
%define xmm_7fff [esp + 8 * 4 + 5 * 16]
%define xmm_xpos_int_begin [esp + 8 * 4 + 6 * 16]
%define xmm_xpos_frac_begin [esp + 8 * 4 + 7 * 16]
+%ifdef X86_32_PICASM
+ %define xmm_db80h [esp + 8 * 4 + 8 * 16]
+ %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 9 * 16]
+ pxor xmm_tmp4, xmm_tmp4
+ pcmpeqb xmm_tmp5, xmm_tmp5
+ psubb xmm_tmp4, xmm_tmp5
+ movdqa xmm_tmp3, xmm_tmp4
+ psllw xmm_tmp3, 3
+ pslldq xmm_tmp3, 8
+ movdqa xmm_shufb_0000000088888888, xmm_tmp3
+ psllw xmm_tmp4, 7
+ movdqa xmm_db80h, xmm_tmp4
+%else
+ %define xmm_db80h [db80h_256]
+ %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+%endif
mov i_dst_stride_less_width, r1
mov i_dst_width, r2
mov i_dst_height, r3
@@ -3350,6 +3157,8 @@
%define xmm_7fff xmm13
%define xmm_xpos_int_begin xmm14
%define xmm_xpos_frac_begin xmm15
+ %define xmm_db80h [db80h_256]
+ %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
pxor xmm_0, xmm_0
pcmpeqw xmm_7fff, xmm_7fff
psrlw xmm_7fff, 1
@@ -3517,6 +3326,8 @@
%undef xmm_xfrac0_begin
%undef xmm_xfrac1_begin
%undef xmm_xfrac_inc
+%undef xmm_db80h
+%undef xmm_shufb_0000000088888888
%ifdef HAVE_AVX2
; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5
@@ -3585,20 +3396,7 @@
%endmacro
%macro AVX2_BilinearFastDownsample4xOrLess_16px 0
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x08080808
- push 0x08080808
- push 0x00000000
- push 0x00000000
- vbroadcasti128 ymm_tmp0, [esp]
- mov esp, r0
- pop r0
-%else
- vbroadcasti128 ymm_tmp0, [shufb_0000000088888888]
-%endif
+ vbroadcasti128 ymm_tmp0, xmm_shufb_0000000088888888
vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
@@ -3642,20 +3440,7 @@
%endmacro
%macro AVX2_BilinearFastDownsample8xOrLess_16px 0
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0x0c0c0c0c
- push 0x08080808
- push 0x04040404
- push 0x00000000
- vbroadcasti128 ymm_tmp0, [esp]
- mov esp, r0
- pop r0
-%else
- vbroadcasti128 ymm_tmp0, [shufb_000044448888CCCC]
-%endif
+ vbroadcasti128 ymm_tmp0, xmm_shufb_000044448888CCCC
vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
mov r_tmp0, i_xpos
@@ -3894,20 +3679,7 @@
%endmacro
%macro AVX2_BilinearAccurateDownsample4xOrLess_16px 0
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x08080808 ;shufb_0000000088888888
- push 0x08080808
- push 0x00000000
- push 0x00000000
- vbroadcasti128 ymm_tmp0, [esp]
- mov esp, r5
- pop r5
-%else
- vbroadcasti128 ymm_tmp0, [shufb_0000000088888888]
-%endif
+ vbroadcasti128 ymm_tmp0, xmm_shufb_0000000088888888
vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
@@ -3922,24 +3694,7 @@
lea r_tmp0, [i_xpos + 2 * i_scalex2]
lea i_xpos, [r_tmp0 + 4 * i_scalex2]
shr r_tmp0, 16
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x80808080 ;db80h_256
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- vpunpcklbw ymm_tmp3, ymm_xpos_int, [esp]
- mov esp, r5
- pop r5
-%else
- vpunpcklbw ymm_tmp3, ymm_xpos_int, [db80h_256]
-%endif
+ vpunpcklbw ymm_tmp3, ymm_xpos_int, ymm_db80h
vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
vpshufb ymm_tmp2, ymm_tmp2, ymm_tmp3
vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp0
@@ -3952,24 +3707,7 @@
shr r_tmp0, 16
vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
vinserti128 ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x80808080 ;db80h_256
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- vpunpckhbw ymm_tmp3, ymm_xpos_int, [esp]
- mov esp, r5
- pop r5
-%else
- vpunpckhbw ymm_tmp3, ymm_xpos_int, [db80h_256]
-%endif
+ vpunpckhbw ymm_tmp3, ymm_xpos_int, ymm_db80h
vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
vpshufb ymm_tmp2, ymm_tmp2, ymm_tmp3
vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp1
@@ -3985,20 +3723,7 @@
%endmacro
%macro AVX2_BilinearAccurateDownsample8xOrLess_16px 0
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x0c0c0c0c ;shufb_000044448888cccc
- push 0x08080808
- push 0x04040404
- push 0x00000000
- vbroadcasti128 ymm_tmp0, [esp]
- mov esp, r5
- pop r5
-%else
- vbroadcasti128 ymm_tmp0, [shufb_000044448888CCCC]
-%endif
+ vbroadcasti128 ymm_tmp0, xmm_shufb_000044448888CCCC
vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
mov r_tmp0, i_xpos
@@ -4019,24 +3744,7 @@
shr r_tmp0, 16
vinserti128 ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1
vinserti128 ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x80808080 ;db80h_256
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- vpunpcklbw ymm_tmp3, ymm_xpos_int, [esp]
- mov esp, r5
- pop r5
-%else
- vpunpcklbw ymm_tmp3, ymm_xpos_int, [db80h_256]
-%endif
+ vpunpcklbw ymm_tmp3, ymm_xpos_int, ymm_db80h
vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
vpshufb ymm_tmp5, ymm_tmp5, ymm_tmp3
vpshufb ymm_tmp0, ymm_tmp0, ymm_tmp3
@@ -4313,7 +4021,11 @@
vmovd xmm0, arg8
vmovd xmm1, esp
and esp, -32
+%ifdef X86_32_PICASM
+ sub esp, 8 * 4 + 9 * 32
+%else
sub esp, 8 * 4 + 8 * 32
+%endif
vmovd [esp], xmm1
%define p_dst r0
%define i_dst_stride_less_width [esp + 1 * 4]
@@ -4354,6 +4066,22 @@
%define ymm_ffff [esp + 8 * 4 + 5 * 32]
%define ymm_xpos_int_begin [esp + 8 * 4 + 6 * 32]
%define ymm_xpos_frac_begin [esp + 8 * 4 + 7 * 32]
+%ifdef X86_32_PICASM
+ %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 8 * 32]
+ %define xmm_shufb_000044448888CCCC [esp + 8 * 4 + 8 * 32 + 16]
+ vpxor ymm_tmp4, ymm_tmp4, ymm_tmp4
+ vpcmpeqb ymm_tmp5, ymm_tmp5, ymm_tmp5
+ vpsubb ymm_tmp4, ymm_tmp4, ymm_tmp5
+ vpsllw ymm_tmp3, ymm_tmp4, 3
+ vpslldq ymm_tmp3, ymm_tmp3, 8
+ vmovdqa xmm_shufb_0000000088888888, xmm_tmp3
+ vpsllq ymm_tmp5, ymm_tmp4, 34
+ vpaddb ymm_tmp5, ymm_tmp5, ymm_tmp3
+ vmovdqa xmm_shufb_000044448888CCCC, xmm_tmp5
+%else
+ %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+ %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
+%endif
mov i_dst_stride_less_width, r1
mov i_dst_width, r2
mov i_dst_height, r3
@@ -4409,6 +4137,8 @@
%define ymm_ffff ymm13
%define ymm_xpos_int_begin ymm14
%define ymm_xpos_frac_begin ymm15
+ %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+ %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
vpxor ymm_0, ymm_0, ymm_0
vpcmpeqw ymm_ffff, ymm_ffff, ymm_ffff
%endif
@@ -4597,6 +4327,8 @@
%undef ymm_xfrac0_begin
%undef ymm_xfrac1_begin
%undef ymm_xfrac_inc
+%undef xmm_shufb_0000000088888888
+%undef xmm_shufb_000044448888CCCC
;**************************************************************************************************************
;void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
@@ -4632,7 +4364,11 @@
vmovd xmm0, arg8
vmovd xmm1, esp
and esp, -32
+%ifdef X86_32_PICASM
+ sub esp, 8 * 4 + 10 * 32
+%else
sub esp, 8 * 4 + 8 * 32
+%endif
vmovd [esp], xmm1
%define p_dst r0
%define i_dst_stride_less_width [esp + 1 * 4]
@@ -4673,6 +4409,26 @@
%define ymm_7fff [esp + 8 * 4 + 5 * 32]
%define ymm_xpos_int_begin [esp + 8 * 4 + 6 * 32]
%define ymm_xpos_frac_begin [esp + 8 * 4 + 7 * 32]
+%ifdef X86_32_PICASM
+ %define ymm_db80h [esp + 8 * 4 + 8 * 32]
+ %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 9 * 32]
+ %define xmm_shufb_000044448888CCCC [esp + 8 * 4 + 9 * 32 + 16]
+ vpxor ymm_tmp4, ymm_tmp4, ymm_tmp4
+ vpcmpeqb ymm_tmp5, ymm_tmp5, ymm_tmp5
+ vpsubb ymm_tmp4, ymm_tmp4, ymm_tmp5
+ vpsllw ymm_tmp3, ymm_tmp4, 3
+ vpslldq ymm_tmp3, ymm_tmp3, 8
+ vmovdqa xmm_shufb_0000000088888888, xmm_tmp3
+ vpsllq ymm_tmp5, ymm_tmp4, 34
+ vpaddb ymm_tmp5, ymm_tmp5, ymm_tmp3
+ vmovdqa xmm_shufb_000044448888CCCC, xmm_tmp5
+ vpsllw ymm_tmp4, ymm_tmp4, 7
+ vmovdqa ymm_db80h, ymm_tmp4
+%else
+ %define ymm_db80h [db80h_256]
+ %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+ %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
+%endif
mov i_dst_stride_less_width, r1
mov i_dst_width, r2
mov i_dst_height, r3
@@ -4729,6 +4485,9 @@
%define ymm_7fff ymm13
%define ymm_xpos_int_begin ymm14
%define ymm_xpos_frac_begin ymm15
+ %define ymm_db80h [db80h_256]
+ %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+ %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
vpxor ymm_0, ymm_0, ymm_0
vpcmpeqw ymm_7fff, ymm_7fff, ymm_7fff
vpsrlw ymm_7fff, ymm_7fff, 1
@@ -4920,5 +4679,8 @@
%undef ymm_xfrac0_begin
%undef ymm_xfrac1_begin
%undef ymm_xfrac_inc
-%endif
+%undef ymm_db80h
+%undef xmm_shufb_0000000088888888
+%undef xmm_shufb_000044448888CCCC
+%endif