ref: c4636c3f4e1043684089a2cbade3af085d55da77
parent: 3b372806c06f10f634f6e8aa48aaab311422fde0
parent: f711d0a2a838691f3695595149de982f9b8a56c7
author: ruil2 <[email protected]>
date: Mon Mar 20 12:32:39 EDT 2017
Merge pull request #2677 from saamas/x86-32-picasm-improvements X86_32_PICASM improvements
--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -668,3 +668,68 @@
vpcmpeqw %1, %1, %1
vpsrlw %1, %1, 1
%endmacro
+
+
+;***********************************************************************
+; Utility macros for X86_32 PIC support
+;***********************************************************************
+
+; Used internally by other macros. %1=register that receives the PC, %2=nonzero to push (preserve) that register first. Without X86_32_PICASM no code is emitted and pic(addr) is addr unchanged.
+%macro INIT_X86_32_PIC_ 2
+%ifdef X86_32_PICASM
+ %xdefine pic_ptr %1
+ %xdefine pic_ptr_preserve %2
+ %if pic_ptr_preserve
+ %assign push_num push_num+1
+ push pic_ptr ; save the caller's value; popped again by the DEINIT macros
+ %endif
+ call %%get_pc ; pushes the address of %%pic_refpoint as the return address
+%%pic_refpoint: ; reference point: pic_ptr holds this label's run-time address
+ jmp %%pic_init_done ; reached via the ret below; skip over the helper
+%%get_pc:
+ mov pic_ptr, [esp] ; pic_ptr = return address = run-time address of %%pic_refpoint
+ ret ; a real ret (rather than a pop) keeps call and ret paired
+%%pic_init_done: ; pic(addr) below rebases addr on pic_ptr: pic_ptr + (addr - %%pic_refpoint)
+ %define pic(data_addr) (pic_ptr+(data_addr)-%%pic_refpoint)
+%else
+ %define pic(data_addr) (data_addr)
+%endif
+%endmacro
+
+; Get the program counter and define a helper macro "pic(addr)" that converts an
+; absolute address to a program counter-relative one if X86_32_PICASM is defined.
+; Otherwise define "pic(addr)" as an identity function and emit no code.
+; %1=register to store PC/EIP in. Its previous value is pushed (push_num is incremented) until DEINIT_X86_32_PIC pops it.
+%macro INIT_X86_32_PIC 1
+ INIT_X86_32_PIC_ %1, 1 ; 1 = preserve the register on the stack
+%endmacro
+
+; Same as INIT_X86_32_PIC, but the register argument is not saved: nothing is pushed, push_num is unchanged and the register's previous value is lost.
+%macro INIT_X86_32_PIC_NOPRESERVE 1
+ INIT_X86_32_PIC_ %1, 0 ; 0 = do not preserve the register
+%endmacro
+
+; Clean up after INIT_X86_32_PIC or INIT_X86_32_PIC_NOPRESERVE.
+; Pop the register used to hold PC/EIP if it was preserved (decrementing push_num), then undefine pic, pic_ptr and pic_ptr_preserve.
+%macro DEINIT_X86_32_PIC 0
+%ifdef X86_32_PICASM
+ %if pic_ptr_preserve
+ pop pic_ptr ; restore the value pushed by INIT_X86_32_PIC
+ %assign push_num push_num-1
+ %endif
+ %undef pic_ptr
+ %undef pic_ptr_preserve
+%endif
+ %undef pic
+%endmacro
+
+; Run-time half of DEINIT_X86_32_PIC only: pops the preserved register but keeps all assembly-time state (pic, pic_ptr, pic_ptr_preserve, push_num).
+; For the early epilogues of functions with multiple epilogues; the last epilogue in source order must use DEINIT_X86_32_PIC.
+%macro DEINIT_X86_32_PIC_KEEPDEF 0
+%ifdef X86_32_PICASM
+ %if pic_ptr_preserve
+ pop pic_ptr ; restore the value pushed by INIT_X86_32_PIC
+ ; push_num is left as is: code after this epilogue is entered by a branch taken before the pop, so pic_ptr is still on its stack.
+ %endif
+%endif
+%endmacro
--- a/codec/common/x86/dct.asm
+++ b/codec/common/x86/dct.asm
@@ -60,7 +60,11 @@
%define prefixed(a) a
%endif
+%ifdef X86_32_PICASM
+SECTION .text align=32
+%else
SECTION .rodata align=32
+%endif
;***********************************************************************
; Constant
@@ -392,40 +396,14 @@
; Do 2 horizontal 4-pt DCTs in parallel packed as 8 words in an xmm register.
; out=%1 in=%1 clobber=%2
%macro SSE2_DCT_HORIZONTAL 2
- pshuflw %2, %1, 1bh ; [x[3],x[2],x[1],x[0]] low qw
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0xffff0001 ;wels_p1m1p1m1w_128
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- push 0x0001ffff ;wels_p1m1m1p1w_128
- push 0xffff0001
- push 0x0001ffff
- push 0xffff0001
- push 0x00020001 ;wels_p1p2p1p2w_128
- push 0x00020001
- push 0x00020001
- push 0x00020001
- pmullw %1, [esp+32] ; [x[0],-x[1],x[2],-x[3], ...]
-%else
- pmullw %1, [wels_p1m1p1m1w_128] ; [x[0],-x[1],x[2],-x[3], ...]
-%endif
- pshufhw %2, %2, 1bh ; [x[3],x[2],x[1],x[0]] high qw
- paddw %1, %2 ; s = [x[0]+x[3],-x[1]+x[2],x[2]+x[1],-x[3]+x[0], ...]
- pshufd %2, %1, 0b1h ; [s[2],s[3],s[0],s[1], ...]
-%ifdef X86_32_PICASM
- pmullw %1, [esp+16] ; [s[0],-s[1],-s[2],s[3], ...]
- pmullw %2, [esp] ; [s[2],2*s[3],s[0],2*s[1], ...]]
- mov esp, r0
- pop r0
-%else
- pmullw %1, [wels_p1m1m1p1w_128] ; [s[0],-s[1],-s[2],s[3], ...]
- pmullw %2, [wels_p1p2p1p2w_128] ; [s[2],2*s[3],s[0],2*s[1], ...]]
-%endif
- paddw %1, %2 ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
+ pshuflw %2, %1, 1bh ; [x[3],x[2],x[1],x[0]] low qw
+ pmullw %1, [pic(wels_p1m1p1m1w_128)] ; [x[0],-x[1],x[2],-x[3], ...]
+ pshufhw %2, %2, 1bh ; [x[3],x[2],x[1],x[0]] high qw
+ paddw %1, %2 ; s = [x[0]+x[3],-x[1]+x[2],x[2]+x[1],-x[3]+x[0], ...]
+ pshufd %2, %1, 0b1h ; [s[2],s[3],s[0],s[1], ...]
+ pmullw %1, [pic(wels_p1m1m1p1w_128)] ; [s[0],-s[1],-s[2],s[3], ...]
+ pmullw %2, [pic(wels_p1p2p1p2w_128)] ; [s[2],2*s[3],s[0],2*s[1], ...]]
+ paddw %1, %2 ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
%endmacro
; Do 2 horizontal 4-pt IDCTs in parallel packed as 8 words in an xmm register.
@@ -436,22 +414,7 @@
;
; out=%1 in=%1 wels_p1m1m1p1w_128=%2 clobber=%3,%4
%macro SSE2_IDCT_HORIZONTAL 4
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x80000000 ;wels_p0m8000p0m8000w_128
- push 0x80000000
- push 0x80000000
- push 0x80000000
- push 0xffffffff ;wels_p1p1m1m1w_128
- push 0x00010001
- push 0xffffffff
- push 0x00010001
- movdqa %3, [esp+16]
-%else
- movdqa %3, [wels_p0m8000p0m8000w_128]
-%endif
+ movdqa %3, [pic(wels_p0m8000p0m8000w_128)]
pmulhw %3, %1 ; x[0:7] * [0,-8000h,0,-8000h, ...] >> 16
pshufd %4, %1, 0b1h ; [x[2],x[3],x[0],x[1], ...]
pmullw %4, %2 ; [x[2],-x[3],-x[0],x[1], ...]
@@ -458,13 +421,7 @@
paddw %1, %3 ; [x[0]+0,x[1]+(-x[1]>>1),x[2]+0,x[3]+(-x[3]>>1), ...]
paddw %1, %4 ; s = [x[0]+x[2],(x[1]>>1)-x[3],x[2]-x[0],(x[3]>>1)+x[1], ...]
pshuflw %3, %1, 1bh ; [s[3],s[2],s[1],s[0]] low qw
-%ifdef X86_32_PICASM
- pmullw %1, [esp] ; [s[0],s[1],-s[2],-s[3], ...]
- mov esp, r0
- pop r0
-%else
- pmullw %1, [wels_p1p1m1m1w_128] ; [s[0],s[1],-s[2],-s[3], ...]
-%endif
+ pmullw %1, [pic(wels_p1p1m1m1w_128)] ; [s[0],s[1],-s[2],-s[3], ...]
pshufhw %3, %3, 1bh ; [s[3],s[2],s[1],s[0]] high qw
pmullw %3, %2 ; [s[3],-s[2],-s[1],s[0], ...]
paddw %1, %3 ; y = [s[0]+s[3],s[1]-s[2],-s[2]-s[1],-s[3]+s[0], ...]
@@ -481,24 +438,9 @@
punpckhqdq %2, %1 ; s03 = [x0+x3,x0-x3]
punpcklqdq %3, %1 ; s12 = [x1+x2,x1-x2]
movdqa %1, %2
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x00020002 ;wels_4xp1w_4xp2w
- push 0x00020002
- push 0x00010001
- push 0x00010001
- pmullw %1, [esp] ; [s03[0],2*s03[1]]
- paddw %1, %3 ; [y0,y1] = [s03[0]+s12[0],2*s03[1]+s12[1]]
- pmullw %3, [esp] ; [s12[0],2*s12[1]]
- mov esp, r0
- pop r0
-%else
- pmullw %1, [wels_4xp1w_4xp2w] ; [s03[0],2*s03[1]]
- paddw %1, %3 ; [y0,y1] = [s03[0]+s12[0],2*s03[1]+s12[1]]
- pmullw %3, [wels_4xp1w_4xp2w] ; [s12[0],2*s12[1]]
-%endif
+ pmullw %1, [pic(wels_4xp1w_4xp2w)] ; [s03[0],2*s03[1]]
+ paddw %1, %3 ; [y0,y1] = [s03[0]+s12[0],2*s03[1]+s12[1]]
+ pmullw %3, [pic(wels_4xp1w_4xp2w)] ; [s12[0],2*s12[1]]
psubw %2, %3 ; [y2,y3] = [s03[0]-s12[0],s03[1]-2*s12[1]]
%endmacro
@@ -506,20 +448,7 @@
; Output is scrambled to save a negation.
; [y1,y0]=%1 [y2,y3]=%2 [x0,x1]=%1 [x2,x3]=%2 clobber=%3,%4
%macro SSE2_IDCT_4x4P 4
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x80008000 ;wels_4xp0w_4xm8000w
- push 0x80008000
- push 0x00000000
- push 0x00000000
- movdqa %4, [esp]
- mov esp, r0
- pop r0
-%else
- movdqa %4, [wels_4xp0w_4xm8000w]
-%endif
+ movdqa %4, [pic(wels_4xp0w_4xm8000w)]
movdqa %3, %1
pmulhw %3, %4 ; x[0:1] * [0,-8000h] >> 16
pmulhw %4, %2 ; x[2:3] * [0,-8000h] >> 16
@@ -540,6 +469,7 @@
;***********************************************************************
WELS_EXTERN WelsDctFourT4_sse2
%assign push_num 0
+ INIT_X86_32_PIC r5
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
@@ -582,6 +512,7 @@
POP_XMM
LOAD_5_PARA_POP
+ DEINIT_X86_32_PIC
ret
;***********************************************************************
@@ -589,6 +520,7 @@
;***********************************************************************
WELS_EXTERN WelsIDctFourT4Rec_sse2
%assign push_num 0
+ INIT_X86_32_PIC r5
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -596,18 +528,7 @@
;Load 4x8
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x0001ffff ;wels_p1m1m1p1w_128
- push 0xffff0001
- push 0x0001ffff
- push 0xffff0001
- movdqa xmm7, [esp]
-%else
- movdqa xmm7, [wels_p1m1m1p1w_128]
-%endif
+ movdqa xmm7, [pic(wels_p1m1m1p1w_128)]
SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6
SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6
SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6
@@ -626,13 +547,7 @@
lea r2, [r2 + 2 * r3]
SSE2_Load4x8p r4+64, xmm0, xmm1, xmm4, xmm2, xmm5
-%ifdef X86_32_PICASM
- movdqa xmm7, [esp]
- mov esp, r5
- pop r5
-%else
- movdqa xmm7, [wels_p1m1m1p1w_128]
-%endif
+ movdqa xmm7, [pic(wels_p1m1m1p1w_128)]
SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6
SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6
SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6
@@ -648,6 +563,7 @@
SSE2_StoreDiff16p xmm0, xmm4, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
POP_XMM
LOAD_5_PARA_POP
+ DEINIT_X86_32_PIC
ret
;***********************************************************************
@@ -655,6 +571,7 @@
;***********************************************************************
WELS_EXTERN WelsDctT4_sse2
%assign push_num 0
+ INIT_X86_32_PIC r5
LOAD_5_PARA
PUSH_XMM 5
SIGN_EXTENSION r2, r2d
@@ -673,6 +590,7 @@
POP_XMM
LOAD_5_PARA_POP
+ DEINIT_X86_32_PIC
ret
;***********************************************************************
@@ -690,6 +608,7 @@
%assign push_num 0
LOAD_5_PARA
.begin:
+ INIT_X86_32_PIC r5
PUSH_XMM 6
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
@@ -696,20 +615,7 @@
SSE2_Load2x4P xmm0, r4
SSE2_Load2x4P xmm1, r4+16
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xfffffff0
- push 0x0001ffff ;wels_p1m1m1p1w_128
- push 0xffff0001
- push 0x0001ffff
- push 0xffff0001
- movdqa xmm4, [esp]
- mov esp, r5
- pop r5
-%else
- movdqa xmm4, [wels_p1m1m1p1w_128]
-%endif
+ movdqa xmm4, [pic(wels_p1m1m1p1w_128)]
SSE2_IDCT_HORIZONTAL xmm0, xmm4, xmm2, xmm3
SSE2_IDCT_HORIZONTAL xmm1, xmm4, xmm2, xmm3
SSE2_IDCT_4x4P xmm0, xmm1, xmm2, xmm3
@@ -721,6 +627,7 @@
SSE2_StoreDiff2x4P r0+r1, r0+2*r1, xmm1, r2+r3, r2+2*r3, xmm5, xmm4, xmm2, xmm3
POP_XMM
+ DEINIT_X86_32_PIC
LOAD_5_PARA_POP
ret
@@ -815,20 +722,7 @@
vpshufb y%9, y%9, y%8
vpaddsw y%4, y%4, y%9
vpackuswb y%3, y%3, y%4
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0x0d0f0e0c ;wels_shufb0231_128
- push 0x090b0a08
- push 0x05070604
- push 0x01030200
- vbroadcasti128 y%4, [esp]
- mov esp, r0
- pop r0
-%else
- vbroadcasti128 y%4, [wels_shufb0231_128]
-%endif
+ vbroadcasti128 y%4, [pic(wels_shufb0231_128)]
vpshufb y%3, y%3, y%4
vextracti128 x%4, y%3, 1
vmovlps [%1 ], x%3
@@ -906,20 +800,7 @@
AVX2_Loadzx4x4P %8, %4, %5, y%7, %9, %10
vpaddsw y%3, y%3, y%8
vpackuswb y%3, y%3, y%3
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0x0d0f0e0c ;wels_shufb0231_128
- push 0x090b0a08
- push 0x05070604
- push 0x01030200
- vbroadcasti128 y%8, [esp]
- mov esp, r0
- pop r0
-%else
- vbroadcasti128 y%8, [wels_shufb0231_128]
-%endif
+ vbroadcasti128 y%8, [pic(wels_shufb0231_128)]
vpshufb y%3, y%3, y%8
vextracti128 x%8, y%3, 1
vmovd [%1 ], x%3
@@ -965,39 +846,10 @@
; Uses scrambled input to save a negation.
; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 wels_shufb2301=%2 clobber=%3
%macro AVX2_DCT_HORIZONTAL 3
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0xffff0001 ;wels_p1m1p1m1w_256
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- push 0xfffeffff ;wels_p1m2m1m2w_256
- push 0x00020001
- push 0xfffeffff
- push 0x00020001
- push 0xfffeffff
- push 0x00020001
- push 0xfffeffff
- push 0x00020001
- vpsignw %3, %1, [esp+32] ; [x0,-x3,x1,-x2]
-%else
- vpsignw %3, %1, [wels_p1m1p1m1w_256] ; [x0,-x3,x1,-x2]
-%endif
+ vpsignw %3, %1, [pic(wels_p1m1p1m1w_256)] ; [x0,-x3,x1,-x2]
vpshufb %1, %1, %2 ; [x3,x0,x2,x1]
vpaddw %1, %1, %3 ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1]
-%ifdef X86_32_PICASM
- vpmullw %3, %1, [esp] ; [s[0],2*s[1],-s[2],-2*s[3], ...]
- mov esp, r0
- pop r0
-%else
- vpmullw %3, %1, [wels_p1p2m1m2w_256] ; [s[0],2*s[1],-s[2],-2*s[3], ...]
-%endif
+ vpmullw %3, %1, [pic(wels_p1p2m1m2w_256)] ; [s[0],2*s[1],-s[2],-2*s[3], ...]
vpshufd %1, %1, 0b1h ; [s[2],s[3],s[0],s[1], ...]
vpaddw %1, %1, %3 ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1], ...]
%endmacro
@@ -1008,40 +860,11 @@
%macro AVX2_IDCT_HORIZONTAL 3
vpsraw %3, %1, 1 ; [x0>>1,x1>>1,x2>>1,x3>>1]
vpblendw %3, %1, %3, 10101010b ; [x0,x1>>1,x2,x3>>1]
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0xffffffff ;wels_p1p1m1m1w_256
- push 0x00010001
- push 0xffffffff
- push 0x00010001
- push 0xffffffff
- push 0x00010001
- push 0xffffffff
- push 0x00010001
- push 0xffff0001 ;wels_p1m1p1m1w_256
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- vpsignw %1, %1, [esp+32] ; [x0,x1,-x2,-x3]
-%else
- vpsignw %1, %1, [wels_p1p1m1m1w_256] ; [x0,x1,-x2,-x3]
-%endif
+ vpsignw %1, %1, [pic(wels_p1p1m1m1w_256)] ; [x0,x1,-x2,-x3]
vpshufd %3, %3, 0b1h ; [x2,x3>>1,x0,x1>>1]
vpaddw %1, %3, %1 ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3]
vpshufb %3, %1, %2 ; [s[1],s[0],s[3],s[2], ...]
-%ifdef X86_32_PICASM
- vpsignw %1, %1, [esp] ; [s[0],-s[1],s[2],-s[3], ...]
- mov esp, r0
- pop r0
-%else
- vpsignw %1, %1, [wels_p1m1p1m1w_256] ; [s[0],-s[1],s[2],-s[3], ...]
-%endif
+ vpsignw %1, %1, [pic(wels_p1m1p1m1w_256)] ; [s[0],-s[1],s[2],-s[3], ...]
vpaddw %1, %1, %3 ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2], ...]
%endmacro
@@ -1049,39 +872,10 @@
; Uses scrambled input to save a negation.
; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 clobber=%2
%macro AVX2_DCT_4x4P 2
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0xffffffff ;wels_4xp1w_4xm1w_256
- push 0xffffffff
- push 0x00010001
- push 0x00010001
- push 0xffffffff
- push 0xffffffff
- push 0x00010001
- push 0x00010001
- push 0xfffefffe ;wels_4xp1w_4xp2w_4xm1w_4xm2w
- push 0xfffefffe
- push 0xffffffff
- push 0xffffffff
- push 0x00020002
- push 0x00020002
- push 0x00010001
- push 0x00010001
- vpsignw %2, %1, [esp+32] ; [x0,-x3,x1,-x2]
-%else
- vpsignw %2, %1, [wels_4xp1w_4xm1w_256] ; [x0,-x3,x1,-x2]
-%endif
+ vpsignw %2, %1, [pic(wels_4xp1w_4xm1w_256)] ; [x0,-x3,x1,-x2]
vpshufd %1, %1, 4eh ; [x3,x0,x2,x1]
vpaddw %1, %1, %2 ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1]
-%ifdef X86_32_PICASM
- vpmullw %2, %1, [esp] ; [s[0],2*s[1],-s[2],-2*s[3]]
- mov esp, r0
- pop r0
-%else
- vpmullw %2, %1, [wels_4xp1w_4xp2w_4xm1w_4xm2w] ; [s[0],2*s[1],-s[2],-2*s[3]]
-%endif
+ vpmullw %2, %1, [pic(wels_4xp1w_4xp2w_4xm1w_4xm2w)] ; [s[0],2*s[1],-s[2],-2*s[3]]
vpermq %1, %1, 4eh ; [s[2],s[3],s[0],s[1]]
vpaddw %1, %1, %2 ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1]]
%endmacro
@@ -1092,40 +886,11 @@
%macro AVX2_IDCT_4x4P 2
vpsraw %2, %1, 1 ; [x0>>1,x1>>1,x2>>1,x3>>1]
vpblendw %2, %1, %2, 11110000b ; [x0,x1>>1,x2,x3>>1]
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0xffffffff ;wels_8xp1w_8xm1w
- push 0xffffffff
- push 0xffffffff
- push 0xffffffff
- push 0x00010001
- push 0x00010001
- push 0x00010001
- push 0x00010001
- push 0xffffffff ;wels_4xp1w_4xm1w_256
- push 0xffffffff
- push 0x00010001
- push 0x00010001
- push 0xffffffff
- push 0xffffffff
- push 0x00010001
- push 0x00010001
- vpsignw %1, %1, [esp+32] ; [x0,x1,-x2,-x3]
-%else
- vpsignw %1, %1, [wels_8xp1w_8xm1w] ; [x0,x1,-x2,-x3]
-%endif
+ vpsignw %1, %1, [pic(wels_8xp1w_8xm1w)] ; [x0,x1,-x2,-x3]
vpermq %2, %2, 4eh ; [x2,x3>>1,x0,x1>>1]
vpaddw %1, %2, %1 ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3]
vpshufd %2, %1, 4eh ; [s[1],s[0],s[3],s[2]]
-%ifdef X86_32_PICASM
- vpmullw %1, %1, [esp] ; [s[0],-s[1],s[2],-s[3], ...]
- mov esp, r0
- pop r0
-%else
- vpmullw %1, %1, [wels_4xp1w_4xm1w_256] ; [s[0],-s[1],s[2],-s[3], ...]
-%endif
+ vpmullw %1, %1, [pic(wels_4xp1w_4xm1w_256)] ; [s[0],-s[1],s[2],-s[3], ...]
vpaddw %1, %1, %2 ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2]]
%endmacro
@@ -1134,27 +899,13 @@
;***********************************************************************
WELS_EXTERN WelsDctFourT4_avx2
%assign push_num 0
+ INIT_X86_32_PIC r5
LOAD_5_PARA
PUSH_XMM 7
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r4, r4d
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x80068005 ;wels_shufb0312_movzxw_128
- push 0x80078004
- push 0x80028001
- push 0x80038000
- push 0x0d0c0f0e ;wels_shufb2301_128
- push 0x09080b0a
- push 0x05040706
- push 0x01000302
- vbroadcasti128 ymm6, [esp+16]
-%else
- vbroadcasti128 ymm6, [wels_shufb0312_movzxw_128]
-%endif
+ vbroadcasti128 ymm6, [pic(wels_shufb0312_movzxw_128)]
;Load 4x16
AVX2_LoadDiff16P mm0, r1, r2, r3, r4, mm6, mm4, mm5
@@ -1169,13 +920,7 @@
AVX2_LoadDiff16P mm3, r1, r2, r3, r4, mm6, mm4, mm5
AVX2_DCT ymm0, ymm1, ymm2, ymm3, ymm5
-%ifdef X86_32_PICASM
- vbroadcasti128 ymm6, [esp]
- mov esp, r5
- pop r5
-%else
- vbroadcasti128 ymm6, [wels_shufb2301_128]
-%endif
+ vbroadcasti128 ymm6, [pic(wels_shufb2301_128)]
AVX2_DCT_HORIZONTAL ymm0, ymm6, ymm5
AVX2_DCT_HORIZONTAL ymm1, ymm6, ymm5
AVX2_DCT_HORIZONTAL ymm2, ymm6, ymm5
@@ -1186,6 +931,7 @@
POP_XMM
LOAD_5_PARA_POP
+ DEINIT_X86_32_PIC
ret
;***********************************************************************
@@ -1203,31 +949,13 @@
%assign push_num 0
LOAD_5_PARA
.begin:
+ INIT_X86_32_PIC r5
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
AVX2_Load4x16P mm0, mm1, mm2, mm3, r4, mm5
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x0d0c0f0e ;wels_shufb2301_128
- push 0x09080b0a
- push 0x05040706
- push 0x01000302
- push 0x80068005 ;wels_shufb0312_movzxw_128
- push 0x80078004
- push 0x80028001
- push 0x80038000
- push 0x00200020 ;wels_dw32_128
- push 0x00200020
- push 0x00200020
- push 0x00200020
- vbroadcasti128 ymm6, [esp+32]
-%else
- vbroadcasti128 ymm6, [wels_shufb2301_128]
-%endif
+ vbroadcasti128 ymm6, [pic(wels_shufb2301_128)]
AVX2_IDCT_HORIZONTAL ymm0, ymm6, ymm5
AVX2_IDCT_HORIZONTAL ymm1, ymm6, ymm5
AVX2_IDCT_HORIZONTAL ymm2, ymm6, ymm5
@@ -1234,15 +962,8 @@
AVX2_IDCT_HORIZONTAL ymm3, ymm6, ymm5
AVX2_IDCT ymm0, ymm1, ymm2, ymm3, ymm5
-%ifdef X86_32_PICASM
- vbroadcasti128 ymm6, [esp+16]
- vbroadcasti128 ymm7, [esp]
- mov esp, r5
- pop r5
-%else
- vbroadcasti128 ymm6, [wels_shufb0312_movzxw_128]
- vbroadcasti128 ymm7, [wels_dw32_128]
-%endif
+ vbroadcasti128 ymm6, [pic(wels_shufb0312_movzxw_128)]
+ vbroadcasti128 ymm7, [pic(wels_dw32_128)]
AVX2_StoreDiff32P r0, r1, mm0, mm1, r2, r3, mm7, mm6, mm5, mm4
add r2, r3
add r0, r1
@@ -1250,6 +971,7 @@
vzeroupper
POP_XMM
+ DEINIT_X86_32_PIC
LOAD_5_PARA_POP
ret
@@ -1258,36 +980,16 @@
;***********************************************************************
WELS_EXTERN WelsDctT4_avx2
%assign push_num 0
+ INIT_X86_32_PIC r5
LOAD_5_PARA
PUSH_XMM 5
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r4, r4d
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x80068005 ;wels_shufb0312_movzxw_128
- push 0x80078004
- push 0x80028001
- push 0x80038000
- push 0x0d0c0f0e ;wels_shufb2301_128
- push 0x09080b0a
- push 0x05040706
- push 0x01000302
- vbroadcasti128 ymm1, [esp+16]
-%else
- vbroadcasti128 ymm1, [wels_shufb0312_movzxw_128]
-%endif
+ vbroadcasti128 ymm1, [pic(wels_shufb0312_movzxw_128)]
AVX2_LoadDiff4x4P mm0, r1, r2, r3, r4, mm1, mm2, mm3, mm4
AVX2_DCT_4x4P ymm0, ymm2
-%ifdef X86_32_PICASM
- vbroadcasti128 ymm1, [esp]
- mov esp, r5
- pop r5
-%else
- vbroadcasti128 ymm1, [wels_shufb2301_128]
-%endif
+ vbroadcasti128 ymm1, [pic(wels_shufb2301_128)]
AVX2_DCT_HORIZONTAL ymm0, ymm1, ymm2
AVX2_Store4x4P r0, mm0
vzeroupper
@@ -1294,6 +996,7 @@
POP_XMM
LOAD_5_PARA_POP
+ DEINIT_X86_32_PIC
ret
;***********************************************************************
@@ -1311,46 +1014,22 @@
%assign push_num 0
LOAD_5_PARA
.begin:
+ INIT_X86_32_PIC r5
PUSH_XMM 6
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
AVX2_Load4x4P mm0, r4
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x0d0c0f0e ;wels_shufb2301_128
- push 0x09080b0a
- push 0x05040706
- push 0x01000302
- push 0x80068005 ;wels_shufb0312_movzxw_128
- push 0x80078004
- push 0x80028001
- push 0x80038000
- push 0x00200020 ;wels_dw32_128
- push 0x00200020
- push 0x00200020
- push 0x00200020
- vbroadcasti128 ymm4, [esp+32]
-%else
- vbroadcasti128 ymm4, [wels_shufb2301_128]
-%endif
+ vbroadcasti128 ymm4, [pic(wels_shufb2301_128)]
AVX2_IDCT_HORIZONTAL ymm0, ymm4, ymm1
AVX2_IDCT_4x4P ymm0, ymm1
-%ifdef X86_32_PICASM
- vbroadcasti128 ymm4, [esp+16]
- vbroadcasti128 ymm5, [esp]
- mov esp, r5
- pop r5
-%else
- vbroadcasti128 ymm4, [wels_shufb0312_movzxw_128]
- vbroadcasti128 ymm5, [wels_dw32_128]
-%endif
+ vbroadcasti128 ymm4, [pic(wels_shufb0312_movzxw_128)]
+ vbroadcasti128 ymm5, [pic(wels_dw32_128)]
AVX2_StoreDiff4x4P r0, r1, mm0, r2, r3, mm5, mm4, mm1, mm2, mm3
vzeroupper
POP_XMM
+ DEINIT_X86_32_PIC
LOAD_5_PARA_POP
ret
%endif
--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -45,7 +45,11 @@
; Macros and other preprocessor constants
;*******************************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
SECTION .rodata align=16
+%endif
ALIGN 16
FOUR_16B_SSE2: dw 4, 4, 4, 4, 4, 4, 4, 4
@@ -157,25 +161,9 @@
; Unbias and split into a non-negative and a non-positive part.
; Clip each part to iTc via minub.
; Add/subtract each part to/from p0/q0 and clip.
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- sub esp, 16
- and esp, -16
- push 0x60606060 ;WELS_DB96_16
- push 0x60606060
- push 0x60606060
- push 0x60606060
- movdqa %6, [esp]
+ movdqa %6, [pic(WELS_DB96_16)]
psubusb %6, %8
- psubusb %8, [esp]
- mov esp, r0
- pop r0
-%else
- movdqa %6, [WELS_DB96_16]
- psubusb %6, %8
- psubusb %8, [WELS_DB96_16]
-%endif
+ psubusb %8, [pic(WELS_DB96_16)]
pminub %6, %5
pminub %8, %5
psubusb %2, %6
@@ -192,6 +180,7 @@
WELS_EXTERN DeblockLumaLt4V_ssse3
%assign push_num 0
+ INIT_X86_32_PIC r5
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -198,21 +187,8 @@
movd xmm1, arg3d
movd xmm2, arg4d
pxor xmm3, xmm3
-%ifdef X86_32_PICASM
- push r4
- mov r4, esp
- sub esp, 16
- and esp, -16
- push 0x7f7f7f7f
- push 0x7f7f7f7f
- push 0x7f7f7f7f
- push 0x7f7f7f7f
- pxor xmm1, [esp]
- pxor xmm2, [esp]
-%else
- pxor xmm1, [WELS_DB127_16]
- pxor xmm2, [WELS_DB127_16]
-%endif
+ pxor xmm1, [pic(WELS_DB127_16)]
+ pxor xmm2, [pic(WELS_DB127_16)]
pshufb xmm1, xmm3 ; iAlpha ^ 0x7f
pshufb xmm2, xmm3 ; iBeta ^ 0x7f
mov r2, r1 ; iStride
@@ -225,40 +201,22 @@
MOVDQ xmm0, [r0 + 0 * r2] ; q0
movdqa xmm4, xmm6
SSE2_AbsDiffUB xmm6, xmm0, xmm3 ; |p0 - q0|
-%ifdef X86_32_PICASM
- SSE2_CmpltUB xmm6, xmm1, [esp] ; bDeltaP0Q0 = |p0 - q0| < iAlpha
-%else
- SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16] ; bDeltaP0Q0 = |p0 - q0| < iAlpha
-%endif
+ SSE2_CmpltUB xmm6, xmm1, [pic(WELS_DB127_16)] ; bDeltaP0Q0 = |p0 - q0| < iAlpha
MOVDQ xmm1, [r0 + 1 * r2] ; q1
SSE2_AbsDiffUB xmm7, xmm4, xmm3 ; |p1 - p0|
SSE2_AbsDiffUB xmm0, xmm1, xmm3 ; |q1 - q0|
pmaxub xmm7, xmm0 ; max(|p1 - p0|, |q1 - q0|)
-%ifdef X86_32_PICASM
- SSE2_CmpltUB xmm7, xmm2, [esp] ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
-%else
- SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16] ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
-%endif
+ SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)] ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
pand xmm6, xmm7 ; bDeltaP0Q0P1P0Q1Q0 = bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
MOVDQ xmm7, [r3 + 2 * r1] ; p2
movdqa xmm0, xmm7
SSE2_AbsDiffUB xmm7, xmm4, xmm3 ; |p2 - p0|
-%ifdef X86_32_PICASM
- SSE2_CmpltUB xmm7, xmm2, [esp] ; bDeltaP2P0 = |p2 - p0| < iBeta
-%else
- SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16] ; bDeltaP2P0 = |p2 - p0| < iBeta
-%endif
+ SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)] ; bDeltaP2P0 = |p2 - p0| < iBeta
MOVDQ xmm5, [r0 + 2 * r2] ; q2
MOVDQ xmm3, [r0 + 0 * r2] ; q0
movdqa xmm1, xmm5
SSE2_AbsDiffUB xmm5, xmm3, xmm4 ; |q2 - q0|
-%ifdef X86_32_PICASM
- SSE2_CmpltUB xmm5, xmm2, [esp] ; bDeltaQ2Q0 = |q2 - q0| < iBeta
- mov esp, r4
- pop r4
-%else
- SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16] ; bDeltaQ2Q0 = |q2 - q0| < iBeta
-%endif
+ SSE2_CmpltUB xmm5, xmm2, [pic(WELS_DB127_16)] ; bDeltaQ2Q0 = |q2 - q0| < iBeta
pavgb xmm3, [r3 + 0 * r1]
pcmpeqw xmm2, xmm2 ; FFh
@@ -273,21 +231,7 @@
pxor xmm1, xmm2
movd xmm3, [r4]
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- sub esp, 16
- and esp, -16
- push 0x03030303 ;WELS_SHUFB0000111122223333
- push 0x02020202
- push 0x01010101
- push 0x00000000
- pshufb xmm3, [esp] ; iTc
- mov esp, r0
- pop r0
-%else
- pshufb xmm3, [WELS_SHUFB0000111122223333] ; iTc
-%endif
+ pshufb xmm3, [pic(WELS_SHUFB0000111122223333)] ; iTc
movdqa xmm4, xmm3 ; iTc0 = iTc
pcmpgtb xmm3, xmm2 ; iTc > -1 ? 0xff : 0x00
pand xmm6, xmm3 ; bDeltaP0Q0P1P0Q1Q0 &= iTc > -1
@@ -315,6 +259,7 @@
POP_XMM
LOAD_5_PARA_POP
+ DEINIT_X86_32_PIC
ret
@@ -380,6 +325,7 @@
WELS_EXTERN DeblockLumaEq4V_ssse3
%assign push_num 0
+ INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 10
SIGN_EXTENSION r1, r1d
@@ -389,21 +335,8 @@
add r2, 1
movd xmm3, r2d
pxor xmm4, xmm4
-%ifdef X86_32_PICASM
- push r4
- mov r4, esp
- sub esp, 16
- and esp, -16
- push 0x7f7f7f7f ;WELS_DB127_16
- push 0x7f7f7f7f
- push 0x7f7f7f7f
- push 0x7f7f7f7f
- pxor xmm1, [esp]
- pxor xmm2, [esp]
-%else
- pxor xmm1, [WELS_DB127_16]
- pxor xmm2, [WELS_DB127_16]
-%endif
+ pxor xmm1, [pic(WELS_DB127_16)]
+ pxor xmm2, [pic(WELS_DB127_16)]
pshufb xmm1, xmm4 ; iAlpha ^ 0x7f
pshufb xmm2, xmm4 ; iBeta ^ 0x7f
pshufb xmm3, xmm4 ; (iAlpha >> 2) + 1
@@ -418,41 +351,23 @@
movdqa xmm4, xmm6
SSE2_AbsDiffUB xmm6, xmm0, xmm5 ; |p0 - q0|
SSE2_CmpgeUB xmm3, xmm6 ; |p0 - q0| < (iAlpha >> 2) + 2
-%ifdef X86_32_PICASM
- SSE2_CmpltUB xmm6, xmm1, [esp] ; bDeltaP0Q0 = |p0 - q0| < iAlpha
-%else
- SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16] ; bDeltaP0Q0 = |p0 - q0| < iAlpha
-%endif
+ SSE2_CmpltUB xmm6, xmm1, [pic(WELS_DB127_16)] ; bDeltaP0Q0 = |p0 - q0| < iAlpha
MOVDQ xmm1, [r0 + 1 * r2] ; q1
SSE2_AbsDiffUB xmm7, xmm4, xmm5 ; |p1 - p0|
SSE2_AbsDiffUB xmm0, xmm1, xmm5 ; |q1 - q0|
pmaxub xmm7, xmm0 ; max(|p1 - p0|, |q1 - q0|)
-%ifdef X86_32_PICASM
- SSE2_CmpltUB xmm7, xmm2, [esp] ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
-%else
- SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16] ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
-%endif
+ SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)] ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
pand xmm6, xmm7 ; & bDeltaP0Q0
MOVDQ xmm7, [r3 + 2 * r1] ; p2
SSE2_AbsDiffUB xmm7, xmm4, xmm5 ; |p2 - p0|
-%ifdef X86_32_PICASM
- SSE2_CmpltUB xmm7, xmm2, [esp] ; bDeltaP2P0 = |p2 - p0| < iBeta
-%else
- SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16] ; bDeltaP2P0 = |p2 - p0| < iBeta
-%endif
+ SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)] ; bDeltaP2P0 = |p2 - p0| < iBeta
pand xmm7, xmm3 ; &= |p0 - q0| < (iAlpha >> 2) + 2
MOVDQ xmm0, [r0 + 0 * r2] ; q0
MOVDQ xmm5, [r0 + 2 * r2] ; q2
SSE2_AbsDiffUB xmm5, xmm0, xmm4 ; |q2 - q0|
-%ifdef X86_32_PICASM
- SSE2_CmpltUB xmm5, xmm2, [esp] ; bDeltaQ2Q0 = |q2 - q0| < iBeta
- mov esp, r4
- pop r4
-%else
- SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16] ; bDeltaQ2Q0 = |q2 - q0| < iBeta
-%endif
+ SSE2_CmpltUB xmm5, xmm2, [pic(WELS_DB127_16)] ; bDeltaQ2Q0 = |q2 - q0| < iBeta
pand xmm5, xmm3 ; &= |p0 - q0| < (iAlpha >> 2) + 2
%ifdef X86_32
@@ -461,26 +376,12 @@
mov r2, esp
sub esp, 16
and esp, -16
-%ifdef X86_32_PICASM
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- sub esp, 16
movdqa [esp], xmm5
- SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [esp+16]
+ SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [pic(WELS_DB1_16)]
movdqa xmm5, [esp]
- neg r1
- SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [esp+16]
mov esp, r2
-%else
- movdqa [esp], xmm5
- SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [WELS_DB1_16]
- movdqa xmm5, [esp]
- mov esp, r2
neg r1
- SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [WELS_DB1_16]
-%endif
+ SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [pic(WELS_DB1_16)]
%else
movdqa xmm9, [WELS_DB1_16]
SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm8, xmm4, 1, xmm9
@@ -489,6 +390,7 @@
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
ret
@@ -649,6 +551,7 @@
WELS_EXTERN DeblockChromaLt4V_ssse3
%assign push_num 0
+ INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
@@ -681,6 +584,7 @@
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
ret
@@ -737,7 +641,9 @@
lea r3, [3 * r2 - 1] ; 3 * iStride - 1
SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6
+ INIT_X86_32_PIC r1
SSSE3_DeblockChromaLt4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, r5, xmm2, xmm3, xmm6, 0
+ DEINIT_X86_32_PIC
SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
POP_XMM
--- a/codec/common/x86/mc_luma.asm
+++ b/codec/common/x86/mc_luma.asm
@@ -44,7 +44,11 @@
;*******************************************************************************
; Local Data (Read Only)
;*******************************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=32
+%else
SECTION .rodata align=32
+%endif
;*******************************************************************************
; Various memory constants (trigonometric values or rounding values)
@@ -120,12 +124,6 @@
psllw %1, 4
%endmacro
-%macro MOVEIMM_DW32 1
- pcmpeqw %1, %1
- psrlw %1, 15
- psllw %1, 5
-%endmacro
-
%endif
;*******************************************************************************
@@ -197,12 +195,7 @@
%macro FILTER_HV_W8 9
paddw %1, %6
-%ifdef X86_32_PICASM
- MOVEIMM_DW16 %8
- paddw %1, %8
-%else
- paddw %1, [h264_w0x10_1]
-%endif
+ paddw %1, [pic(h264_w0x10_1)]
movdqa %8, %3
movdqa %7, %2
paddw %8, %4
@@ -221,12 +214,7 @@
%macro FILTER_HV_W4 9
paddw %1, %6
-%ifdef X86_32_PICASM
-MOVEIMM_DW16 %8
-paddw %1, %8
-%else
-paddw %1, [h264_w0x10_1]
-%endif
+paddw %1, [pic(h264_w0x10_1)]
movdqa %8, %3
movdqa %7, %2
paddw %8, %4
@@ -457,6 +445,7 @@
;*******************************************************************************
WELS_EXTERN McHorVer02WidthEq8_sse2
%assign push_num 0
+ INIT_X86_32_PIC r5
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -530,6 +519,7 @@
.xx_exit:
POP_XMM
LOAD_5_PARA_POP
+ DEINIT_X86_32_PIC
ret
;***********************************************************************
@@ -550,6 +540,7 @@
;***********************************************************************
WELS_EXTERN McHorVer02Height9Or17_sse2
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -671,6 +662,7 @@
%endif
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
@@ -684,6 +676,7 @@
;***********************************************************************
WELS_EXTERN McHorVer02Height5_sse2
%assign push_num 0
+INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -805,6 +798,7 @@
%endif
POP_XMM
LOAD_6_PARA_POP
+DEINIT_X86_32_PIC
ret
@@ -819,6 +813,7 @@
;***********************************************************************
WELS_EXTERN McHorVer20Width9Or17_sse2
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -855,12 +850,7 @@
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
-%ifdef X86_32_PICASM
- MOVEIMM_DW16 xmm6
- paddw xmm0, xmm6
-%else
- paddw xmm0, [h264_w0x10_1]
-%endif
+ paddw xmm0, [pic(h264_w0x10_1)]
psraw xmm0, 5
packuswb xmm0, xmm0
movd [r2], xmm0
@@ -877,11 +867,7 @@
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
-%ifdef X86_32_PICASM
- paddw xmm2, xmm6
-%else
- paddw xmm2, [h264_w0x10_1]
-%endif
+ paddw xmm2, [pic(h264_w0x10_1)]
psraw xmm2, 5
packuswb xmm2, xmm2
movq [r2+1], xmm2
@@ -892,6 +878,7 @@
jnz .yloop_width_9
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
@@ -918,12 +905,7 @@
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
-%ifdef X86_32_PICASM
- MOVEIMM_DW16 xmm6
- paddw xmm0, xmm6
-%else
- paddw xmm0, [h264_w0x10_1]
-%endif
+ paddw xmm0, [pic(h264_w0x10_1)]
psraw xmm0, 5
packuswb xmm0, xmm0
movq [r2], xmm0
@@ -951,12 +933,7 @@
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
-%ifdef X86_32_PICASM
- MOVEIMM_DW16 xmm6
- paddw xmm0, xmm6
-%else
- paddw xmm0, [h264_w0x10_1]
-%endif
+ paddw xmm0, [pic(h264_w0x10_1)]
psraw xmm0, 5
packuswb xmm0, xmm0
movd [r2+8], xmm0
@@ -974,11 +951,7 @@
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
-%ifdef X86_32_PICASM
- paddw xmm2, xmm6
-%else
- paddw xmm2, [h264_w0x10_1]
-%endif
+ paddw xmm2, [pic(h264_w0x10_1)]
psraw xmm2, 5
packuswb xmm2, xmm2
movq [r2+9], xmm2
@@ -988,6 +961,7 @@
jnz .yloop_width_17
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
@@ -1002,6 +976,7 @@
;***********************************************************************
WELS_EXTERN McHorVer20Width5_sse2
%assign push_num 0
+INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -1035,12 +1010,7 @@
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
-%ifdef X86_32_PICASM
-MOVEIMM_DW16 xmm6
-paddw xmm0, xmm6
-%else
-paddw xmm0, [h264_w0x10_1]
-%endif
+paddw xmm0, [pic(h264_w0x10_1)]
psraw xmm0, 5
packuswb xmm0, xmm0
movd [r2], xmm0
@@ -1057,11 +1027,7 @@
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
-%ifdef X86_32_PICASM
-paddw xmm2, xmm6
-%else
-paddw xmm2, [h264_w0x10_1]
-%endif
+paddw xmm2, [pic(h264_w0x10_1)]
psraw xmm2, 5
packuswb xmm2, xmm2
movd [r2+1], xmm2
@@ -1072,6 +1038,7 @@
jnz .yloop_width_5
POP_XMM
LOAD_6_PARA_POP
+DEINIT_X86_32_PIC
ret
@@ -1238,12 +1205,7 @@
psubw %1, %7
psraw %1, 2
paddw %8, %1
-%ifdef X86_32_PICASM
- MOVEIMM_DW32 %7
- paddw %8, %7
-%else
- paddw %8, [h264_mc_hc_32]
-%endif
+ paddw %8, [pic(h264_mc_hc_32)]
psraw %8, 6
packuswb %8, %8
movq %9, %8
@@ -1260,6 +1222,7 @@
WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -1377,6 +1340,7 @@
%endif
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
;***********************************************************************
@@ -1391,6 +1355,7 @@
WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -1507,6 +1472,7 @@
%endif
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
@@ -1595,12 +1561,7 @@
psubw %1, %7
psraw %1, 2
paddw %8, %1
-%ifdef X86_32_PICASM
-MOVEIMM_DW32 %7
-paddw %8, %7
-%else
-paddw %8, [h264_mc_hc_32]
-%endif
+paddw %8, [pic(h264_mc_hc_32)]
psraw %8, 6
packuswb %8, %8
movd %9, %8
@@ -1619,6 +1580,7 @@
WELS_EXTERN McHorVer22Width4VerLastAlign_sse2
%assign push_num 0
+INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -1736,6 +1698,7 @@
%endif
POP_XMM
LOAD_6_PARA_POP
+DEINIT_X86_32_PIC
ret
@@ -1751,6 +1714,7 @@
WELS_EXTERN McHorVer22Width4VerLastUnAlign_sse2
%assign push_num 0
+INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -1867,6 +1831,7 @@
%endif
POP_XMM
LOAD_6_PARA_POP
+DEINIT_X86_32_PIC
ret
@@ -1879,12 +1844,7 @@
movdqa %7, %3
pmaddubsw %7, %6
paddw %1, %7
-%ifdef X86_32_PICASM
- MOVEIMM_DW16 %7
- paddw %1, %7
-%else
- paddw %1, [h264_w0x10_1]
-%endif
+ paddw %1, [pic(h264_w0x10_1)]
psraw %1, 5
%endmacro
@@ -1901,12 +1861,7 @@
movdqa %7, %4
pmaddubsw %7, %6
paddw %1, %7
-%ifdef X86_32_PICASM
- MOVEIMM_DW16 %7
- paddw %1, %7
-%else
- paddw %1, [h264_w0x10_1]
-%endif
+ paddw %1, [pic(h264_w0x10_1)]
psraw %1, 5
%endmacro
@@ -1916,20 +1871,7 @@
pshufb %1, %2
pshufb %5, %3
pshufd %6, %1, 10110001b
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x14141414 ;db20_128
- push 0x14141414
- push 0x14141414
- push 0x14141414
- pmaddubsw %1, [esp]
- mov esp, r0
- pop r0
-%else
- pmaddubsw %1, [db20_128]
-%endif
+ pmaddubsw %1, [pic(db20_128)]
pmaddubsw %5, %4
pmaddubsw %6, %4
paddw %1, %5
@@ -1939,12 +1881,7 @@
; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
%macro SSSE3_FilterHorizontal_8px 6
SSSE3_FilterHorizontalbw_8px %1, %2, %3, %4, %5, %6
-%ifdef X86_32_PICASM
- MOVEIMM_DW16 %5
- paddw %1, %5
-%else
- paddw %1, [h264_w0x10_1]
-%endif
+ paddw %1, [pic(h264_w0x10_1)]
psraw %1, 5
%endmacro
@@ -1959,20 +1896,7 @@
pshufb %7, %4
punpcklqdq %6, %7
pshufd %7, %1, 10110001b
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x14141414 ;db20_128
- push 0x14141414
- push 0x14141414
- push 0x14141414
- pmaddubsw %1, [esp]
- mov esp, r0
- pop r0
-%else
- pmaddubsw %1, [db20_128]
-%endif
+ pmaddubsw %1, [pic(db20_128)]
pmaddubsw %6, %5
pmaddubsw %7, %5
paddw %1, %6
@@ -1982,31 +1906,13 @@
; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
%macro SSSE3_FilterHorizontal_2x4px 7
SSSE3_FilterHorizontalbw_2x4px %1, %2, %3, %4, %5, %6, %7
-%ifdef X86_32_PICASM
- MOVEIMM_DW16 %6
- paddw %1, %6
-%else
- paddw %1, [h264_w0x10_1]
-%endif
+ paddw %1, [pic(h264_w0x10_1)]
psraw %1, 5
%endmacro
; pixels=%1 -32768>>scale=%2 tmp=%3
%macro SSSE3_FilterHorizontalbw_2px 3
-%ifdef X86_32_PICASM
- push r1
- mov r1, esp
- and esp, 0xfffffff0
- push 0x0000fe0a
- push 0xd8d80afe
- push 0x0000fe0a
- push 0xd8d80afe
- pmaddubsw %1, [esp]
- mov esp, r1
- pop r1
-%else
- pmaddubsw %1, [maddubsw_m2p10_m40m40_p10m2_p0p0_128]
-%endif
+ pmaddubsw %1, [pic(maddubsw_m2p10_m40m40_p10m2_p0p0_128)]
pmaddwd %1, %2
pshufd %3, %1, 10110001b
paddd %1, %3
@@ -2014,33 +1920,8 @@
; pixels=%1 tmp=%2
%macro SSSE3_FilterHorizontal_2px 2
-%ifdef X86_32_PICASM
- push r1
- mov r1, esp
- and esp, 0xfffffff0
- push 0x0000fe0a
- push 0xd8d80afe
- push 0x0000fe0a
- push 0xd8d80afe
- pmaddubsw %1, [esp]
- push 0xfc00fc00
- push 0xfc00fc00
- push 0xfc00fc00
- push 0xfc00fc00
- pmaddwd %1, [esp]
- pshufd %2, %1, 10110001b
- paddd %1, %2
- push 0x00008000
- push 0x00008000
- push 0x00008000
- push 0x00008000
- paddd %1, [esp]
- mov esp, r1
- pop r1
-%else
- SSSE3_FilterHorizontalbw_2px %1, [dwm1024_128], %2
- paddd %1, [dd32768_128]
-%endif
+ SSSE3_FilterHorizontalbw_2px %1, [pic(dwm1024_128)], %2
+ paddd %1, [pic(dd32768_128)]
%endmacro
; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7
@@ -2055,14 +1936,8 @@
paddw %7, %4
paddw %1, %7
psraw %1, 2
-%ifdef X86_32_PICASM
+ paddw %7, [pic(h264_mc_hc_32)]
paddw %1, %7
- MOVEIMM_DW32 %7
- paddw %1, %7
-%else
- paddw %7, [h264_mc_hc_32]
- paddw %1, %7
-%endif
psraw %1, 6
%endmacro
@@ -2080,7 +1955,11 @@
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
+%ifdef X86_32_PICASM
+%define i_width dword arg5
+%else
%define i_width r4
+%endif
%define i_height r5
%define i_srcstride3 r6
%assign push_num 0
@@ -2094,28 +1973,14 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
+ INIT_X86_32_PIC_NOPRESERVE r4
sub p_src, i_srcstride
sub p_src, i_srcstride
lea i_srcstride3, [3 * i_srcstride]
+ %assign push_num_begin push_num
cmp i_width, 4
jg .width8or16
-%ifdef X86_32_PICASM
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- movdqu xmm6, [esp]
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- movdqu xmm7, [esp]
- push 0x14141414 ;db20_128
- push 0x14141414
- push 0x14141414
- push 0x14141414
-%endif
movd xmm0, [p_src]
movd xmm4, [p_src + i_srcstride]
punpcklbw xmm0, xmm4
@@ -2134,14 +1999,8 @@
movd xmm3, [p_src]
punpcklbw xmm4, xmm3
punpcklqdq xmm2, xmm4
-%ifdef X86_32_PICASM
- movdqu xmm5, [esp]
- SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, xmm6, xmm5, xmm7, xmm4
- add esp, 48
-%else
- movdqa xmm5, [db20_128]
- SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+ movdqa xmm5, [pic(db20_128)]
+ SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm0, xmm0
movd [p_dst], xmm0
psrlq xmm0, 32
@@ -2152,11 +2011,7 @@
movd xmm0, [p_src + 2 * i_srcstride]
punpcklbw xmm4, xmm0
punpcklqdq xmm3, xmm4
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, xmm6, xmm5, xmm7, xmm4
-%else
- SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+ SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm1, xmm1
movd [p_dst], xmm1
psrlq xmm1, 32
@@ -2167,14 +2022,11 @@
movd xmm4, [p_src + i_srcstride3]
punpcklbw xmm0, xmm4
jg .width4_height_ge8
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, xmm6, xmm5, xmm7, xmm4
-%else
- SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+ SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm2, xmm2
movd [p_dst], xmm2
.width4_height_le5_done:
+ DEINIT_X86_32_PIC_KEEPDEF
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -2186,11 +2038,7 @@
movd xmm1, [p_src]
punpcklbw xmm4, xmm1
punpcklqdq xmm0, xmm4
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, xmm6, xmm5, xmm7, xmm4
-%else
- SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+ SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm2, xmm2
movd [p_dst], xmm2
psrlq xmm2, 32
@@ -2201,11 +2049,7 @@
movd xmm2, [p_src + 2 * i_srcstride]
punpcklbw xmm4, xmm2
punpcklqdq xmm1, xmm4
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, xmm6, xmm5, xmm7, xmm4
-%else
- SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+ SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm3, xmm3
movd [p_dst], xmm3
psrlq xmm3, 32
@@ -2215,14 +2059,11 @@
lea p_dst, [p_dst + 2 * i_dststride]
movd xmm4, [p_src + i_srcstride3]
punpcklbw xmm2, xmm4
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, xmm6, xmm5, xmm7, xmm4
-%else
- SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
-%endif
+ SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm0, xmm0
movd [p_dst], xmm0
.width4_height_ge8_done:
+ DEINIT_X86_32_PIC_KEEPDEF
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -2231,38 +2072,16 @@
ret
.width8or16:
+ %assign push_num push_num_begin
sub i_height, 1
push i_height
+ %assign push_num push_num + 1
%xdefine i_ycnt i_height
%define i_height [r7]
.xloop:
push p_src
push p_dst
-%ifdef X86_32_PICASM
- push i_width
- mov i_width, esp
- and esp, 0xfffffff0
- push 0xfb01fb01 ;[esp+64]maddubsw_p1m5_128
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0x14141414 ;[esp+48]db20_128
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x01fb01fb ;[esp+32]maddubsw_m5p1_128
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x14fb14fb ;[esp+16]maddubsw_m5p20_128
- push 0x14fb14fb
- push 0x14fb14fb
- push 0x14fb14fb
- push 0xfb14fb14 ;[esp] maddubsw_p20m5_128
- push 0xfb14fb14
- push 0xfb14fb14
- push 0xfb14fb14
-%endif
+ %assign push_num push_num + 2
test i_ycnt, 1
jnz .yloop_begin_even
movq xmm0, [p_src]
@@ -2276,11 +2095,7 @@
movq xmm5, [p_src + i_srcstride]
lea p_src, [p_src + 2 * i_srcstride]
punpcklbw xmm4, xmm5
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [esp+64], [esp+48], [esp+32], xmm7
-%else
- SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm7
-%endif
+ SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm7
packuswb xmm0, xmm0
movlps [p_dst], xmm0
add p_dst, i_dststride
@@ -2297,36 +2112,20 @@
punpcklbw xmm4, xmm5
.yloop:
movq xmm6, [p_src]
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [esp+16], [esp], xmm0, xmm7
-%else
- SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm0, xmm7
-%endif
+ SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm0, xmm7
movq xmm7, [p_src + i_srcstride]
punpcklbw xmm6, xmm7
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [esp+64], [esp+48], [esp+32], xmm0
-%else
- SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm0
-%endif
+ SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm0
packuswb xmm1, xmm2
movlps [p_dst], xmm1
movhps [p_dst + i_dststride], xmm1
lea p_dst, [p_dst + 2 * i_dststride]
movq xmm0, [p_src + 2 * i_srcstride]
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [esp+16], [esp], xmm2, xmm1
-%else
- SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm2, xmm1
-%endif
+ SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm2, xmm1
movq xmm1, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
punpcklbw xmm0, xmm1
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [esp+64], [esp+48], [esp+32], xmm2
-%else
- SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm2
-%endif
+ SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm2
packuswb xmm3, xmm4
movlps [p_dst], xmm3
movhps [p_dst + i_dststride], xmm3
@@ -2334,36 +2133,20 @@
jle .yloop_exit
lea p_dst, [p_dst + 2 * i_dststride]
movq xmm2, [p_src]
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [esp+16], [esp], xmm4, xmm3
-%else
- SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm4, xmm3
-%endif
+ SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm4, xmm3
movq xmm3, [p_src + i_srcstride]
punpcklbw xmm2, xmm3
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [esp+64], [esp+48], [esp+32], xmm4
-%else
- SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm4
-%endif
+ SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm5, xmm6
movlps [p_dst], xmm5
movhps [p_dst + i_dststride], xmm5
lea p_dst, [p_dst + 2 * i_dststride]
movq xmm4, [p_src + 2 * i_srcstride]
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [esp+16], [esp], xmm6, xmm5
-%else
- SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm6, xmm5
-%endif
+ SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm6, xmm5
movq xmm5, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
punpcklbw xmm4, xmm5
-%ifdef X86_32_PICASM
- SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [esp+64], [esp+48], [esp+32], xmm6
-%else
- SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm6
-%endif
+ SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm6
packuswb xmm7, xmm0
movlps [p_dst], xmm7
movhps [p_dst + i_dststride], xmm7
@@ -2371,12 +2154,9 @@
sub i_ycnt, 8
jg .yloop
.yloop_exit:
-%ifdef X86_32_PICASM
- mov esp, i_width
- pop i_width
-%endif
pop p_dst
pop p_src
+ %assign push_num push_num - 2
sub i_width, 8
jle .width8or16_done
add p_src, 8
@@ -2385,6 +2165,8 @@
jmp .xloop
.width8or16_done:
pop i_ycnt
+ %assign push_num push_num - 1
+ DEINIT_X86_32_PIC
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -2418,6 +2200,7 @@
%define i_width r4
%define i_height r5
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
@@ -2424,28 +2207,9 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
-%ifdef X86_32_PICASM
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- movdqu xmm4, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- movdqu xmm5, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- movdqu xmm6, [esp]
- add esp, 48
-%else
- movdqa xmm4, [shufb_32435465768798A9]
- movdqa xmm5, [shufb_011267784556ABBC]
- movdqa xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ movdqa xmm4, [pic(shufb_32435465768798A9)]
+ movdqa xmm5, [pic(shufb_011267784556ABBC)]
+ movdqa xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
cmp i_width, 8
je .width8_yloop
jg .width16_yloop
@@ -2463,6 +2227,7 @@
jg .width4_yloop
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
.width8_yloop:
movdqu xmm0, [p_src - 2]
@@ -2478,6 +2243,7 @@
jg .width8_yloop
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
.width16_yloop:
movdqu xmm0, [p_src - 2]
@@ -2492,6 +2258,7 @@
jg .width16_yloop
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -2518,6 +2285,7 @@
%define i_width r4
%define i_height r5
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -2524,28 +2292,9 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
-%ifdef X86_32_PICASM
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- movdqu xmm5, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- movdqu xmm6, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- movdqu xmm7, [esp]
- add esp, 48
-%else
- movdqa xmm5, [shufb_32435465768798A9]
- movdqa xmm6, [shufb_011267784556ABBC]
- movdqa xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ movdqa xmm5, [pic(shufb_32435465768798A9)]
+ movdqa xmm6, [pic(shufb_011267784556ABBC)]
+ movdqa xmm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
cmp i_width, 9
je .width9_yloop
jg .width17_yloop
@@ -2563,6 +2312,7 @@
jg .width5_yloop
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
.width9_yloop:
movdqu xmm0, [p_src - 2]
@@ -2586,6 +2336,7 @@
jg .width9_yloop
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
.width17_yloop:
movdqu xmm0, [p_src - 2]
@@ -2615,6 +2366,7 @@
jg .width17_yloop
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -2637,6 +2389,7 @@
%define p_dst r2
%define i_height r3
%assign push_num 0
+ INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
@@ -2643,28 +2396,9 @@
SIGN_EXTENSION r3, r3d
sub p_src, i_srcstride
sub p_src, i_srcstride
-%ifdef X86_32_PICASM
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- movdqu xmm4, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- movdqu xmm5, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- movdqu xmm6, [esp]
- add esp, 48
-%else
- movdqa xmm4, [shufb_32435465768798A9]
- movdqa xmm5, [shufb_011267784556ABBC]
- movdqa xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ movdqa xmm4, [pic(shufb_32435465768798A9)]
+ movdqa xmm5, [pic(shufb_011267784556ABBC)]
+ movdqa xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 1
.yloop:
movdqu xmm0, [p_src - 2]
@@ -2681,6 +2415,7 @@
movlps [p_dst], xmm0
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -2702,6 +2437,7 @@
%define i_height r3
%define i_srcstride 8
%assign push_num 0
+ INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
@@ -2746,6 +2482,7 @@
.done:
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef p_dst
@@ -2769,6 +2506,7 @@
%define i_dststride r3
%define i_height r4
%assign push_num 0
+ INIT_X86_32_PIC r5
LOAD_5_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
@@ -2776,28 +2514,9 @@
SIGN_EXTENSION r4, r4d
sub p_src, i_srcstride
sub p_src, i_srcstride
-%ifdef X86_32_PICASM
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- movdqu xmm4, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- movdqu xmm5, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- movdqu xmm6, [esp]
- add esp, 48
-%else
- movdqa xmm4, [shufb_32435465768798A9]
- movdqa xmm5, [shufb_011267784556ABBC]
- movdqa xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ movdqa xmm4, [pic(shufb_32435465768798A9)]
+ movdqa xmm5, [pic(shufb_011267784556ABBC)]
+ movdqa xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 1
.yloop:
movdqu xmm0, [p_src - 2]
@@ -2818,6 +2537,7 @@
.done:
POP_XMM
LOAD_5_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -2846,6 +2566,7 @@
push r5
%assign push_num 1
%endif
+ INIT_X86_32_PIC r6
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -2936,6 +2657,7 @@
.done:
POP_XMM
LOAD_5_PARA_POP
+ DEINIT_X86_32_PIC
%ifdef X86_32
pop r5
%endif
@@ -2965,6 +2687,7 @@
%define i_width r4
%define i_height r5
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -2975,28 +2698,9 @@
sub p_src, i_srcstride
pcmpeqw xmm4, xmm4
psllw xmm4, 15 ; dw -32768
-%ifdef X86_32_PICASM
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- movdqu xmm5, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- movdqu xmm6, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- movdqu xmm7, [esp]
- add esp, 48
-%else
- movdqa xmm5, [shufb_32435465768798A9]
- movdqa xmm6, [shufb_011267784556ABBC]
- movdqa xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ movdqa xmm5, [pic(shufb_32435465768798A9)]
+ movdqa xmm6, [pic(shufb_011267784556ABBC)]
+ movdqa xmm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
cmp i_width, 9
jne .width17_yloop
@@ -3019,6 +2723,7 @@
jg .width9_yloop
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
.width17_yloop:
@@ -3047,6 +2752,7 @@
jg .width17_yloop
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -3070,7 +2776,11 @@
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
+%ifdef X86_32_PICASM
+%define i_width dword arg5
+%else
%define i_width r4
+%endif
%define i_height r5
%define i_srcstride3 r6
%assign push_num 0
@@ -3084,14 +2794,23 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
+ INIT_X86_32_PIC_NOPRESERVE r4
sub i_height, 1
push i_height
+ %assign push_num push_num + 1
lea i_srcstride3, [3 * i_srcstride]
test i_width, 1
jz .width_loop
push p_src
push p_dst
+ %assign push_num push_num + 2
+%ifdef X86_32_PICASM
+ add p_src, i_width
+ add p_src, i_width
+ sub p_src, 2
+%else
lea p_src, [p_src + 2 * i_width - 2]
+%endif
add p_dst, i_width
movd xmm0, [p_src]
punpcklwd xmm0, [p_src + i_srcstride]
@@ -3186,11 +2905,13 @@
.unalign_done:
pop p_dst
pop p_src
+ %assign push_num push_num - 2
mov i_height, [r7]
sub i_width, 1
.width_loop:
push p_src
push p_dst
+ %assign push_num push_num + 2
movdqa xmm0, [p_src]
movdqa xmm1, [p_src + i_srcstride]
movdqa xmm2, [p_src + 2 * i_srcstride]
@@ -3245,6 +2966,7 @@
.x_loop_dec:
pop p_dst
pop p_src
+ %assign push_num push_num - 2
sub i_width, 8
jle .done
mov i_height, [r7]
@@ -3258,6 +2980,8 @@
pop p_src
.done:
pop i_height
+ %assign push_num push_num - 1
+ DEINIT_X86_32_PIC
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -3280,24 +3004,7 @@
vpshufb %5, %1, %3
vpshufb %1, %1, %2
vpshufd %6, %1, 10110001b
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- vpmaddubsw %1, %1, [esp]
- mov esp, r0
- pop r0
-%else
- vpmaddubsw %1, %1, [db20_256]
-%endif
+ vpmaddubsw %1, %1, [pic(db20_256)]
vpmaddubsw %5, %5, %4
vpmaddubsw %6, %6, %4
vpaddw %1, %1, %5
@@ -3307,14 +3014,7 @@
; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 db20=%4 tmp=%5,%6
%macro AVX2_FilterHorizontal_16px 6
AVX2_FilterHorizontalbw_16px %1, %2, %3, %4, %5, %6
-%ifdef X86_32_PICASM
- vpcmpeqw %6, %6, %6
- vpsrlw %6, %6, 15
- vpsllw %6, %6, 4
- vpaddw %1, %1, %6
-%else
- vpaddw %1, %1, [h264_w0x10_256]
-%endif
+ vpaddw %1, %1, [pic(h264_w0x10_256)]
vpsraw %1, %1, 5
%endmacro
@@ -3327,24 +3027,7 @@
vpunpcklqdq %1, %1, %2
vpunpcklqdq %6, %6, %7
vpshufd %7, %1, 10110001b
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- vpmaddubsw %1, %1, [esp]
- mov esp, r0
- pop r0
-%else
- vpmaddubsw %1, %1, [db20_256]
-%endif
+ vpmaddubsw %1, %1, [pic(db20_256)]
vpmaddubsw %6, %6, %5
vpmaddubsw %7, %7, %5
vpaddw %1, %1, %6
@@ -3354,20 +3037,13 @@
; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 db20=%5 tmp=%6,%7
%macro AVX2_FilterHorizontal_4x4px 7
AVX2_FilterHorizontalbw_4x4px %1, %2, %3, %4, %5, %6, %7
-%ifdef X86_32_PICASM
- vpcmpeqw %7, %7, %7
- vpsrlw %7, %7, 15
- vpsllw %7, %7, 4
- vpaddw %1, %1, %7
-%else
- vpaddw %1, %1, [h264_w0x10_256]
-%endif
+ vpaddw %1, %1, [pic(h264_w0x10_256)]
vpsraw %1, %1, 5
%endmacro
; pixels=%1 -32768>>scale=%2 tmp=%3
%macro AVX2_FilterHorizontalbw_4px 3
- vpmaddubsw %1, %1, [maddubsw_m2p10_m40m40_p10m2_p0p0_256]
+ vpmaddubsw %1, %1, [pic(maddubsw_m2p10_m40m40_p10m2_p0p0_256)]
vpmaddwd %1, %1, %2
vpshufd %3, %1, 10110001b
vpaddd %1, %1, %3
@@ -3375,45 +3051,8 @@
; pixels=%1 tmp=%2
%macro AVX2_FilterHorizontal_4px 2
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0x0000fe0a ;maddubsw_m2p10_m40m40_p10m2_p0p0_256
- push 0xd8d80afe
- push 0x0000fe0a
- push 0xd8d80afe
- push 0x0000fe0a
- push 0xd8d80afe
- push 0x0000fe0a
- push 0xd8d80afe
- push 0xfc00fc00 ;dwm1024_256
- push 0xfc00fc00
- push 0xfc00fc00
- push 0xfc00fc00
- push 0xfc00fc00
- push 0xfc00fc00
- push 0xfc00fc00
- push 0xfc00fc00
- push 0x00008000 ;dd32768_256
- push 0x00008000
- push 0x00008000
- push 0x00008000
- push 0x00008000
- push 0x00008000
- push 0x00008000
- push 0x00008000
- vpmaddubsw %1, %1, [esp+64]
- vpmaddwd %1, %1, [esp+32]
- vpshufd %2, %1, 10110001b
- vpaddd %1, %1, %2
- vpaddd %1, %1, [esp]
- mov esp, r0
- pop r0
-%else
- AVX2_FilterHorizontalbw_4px %1, [dwm1024_256], %2
- vpaddd %1, %1, [dd32768_256]
-%endif
+ AVX2_FilterHorizontalbw_4px %1, [pic(dwm1024_256)], %2
+ vpaddd %1, %1, [pic(dd32768_256)]
%endmacro
; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7
@@ -3423,14 +3062,7 @@
vpaddw %1, %1, %7
vpmaddubsw %7, %3, %6
vpaddw %1, %1, %7
-%ifdef X86_32_PICASM
- vpcmpeqw %7, %7, %7
- vpsrlw %7, %7, 15
- vpsllw %7, %7, 4
- vpaddw %1, %1, %7
-%else
- vpaddw %1, %1, [h264_w0x10_256]
-%endif
+ vpaddw %1, %1, [pic(h264_w0x10_256)]
vpsraw %1, %1, 5
%endmacro
@@ -3444,14 +3076,7 @@
vpaddw %1, %1, %7
vpmaddubsw %7, %4, %6
vpaddw %1, %1, %7
-%ifdef X86_32_PICASM
- vpcmpeqw %7, %7, %7
- vpsrlw %7, %7, 15
- vpsllw %7, %7, 4
- vpaddw %1, %1, %7
-%else
- vpaddw %1, %1, [h264_w0x10_256]
-%endif
+ vpaddw %1, %1, [pic(h264_w0x10_256)]
vpsraw %1, %1, 5
%endmacro
@@ -3465,24 +3090,7 @@
vpaddw %7, %3, %4
vpaddw %1, %1, %7
vpsraw %1, %1, 2
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0x00200020
- push 0x00200020
- push 0x00200020
- push 0x00200020
- push 0x00200020
- push 0x00200020
- push 0x00200020
- push 0x00200020
- vpaddw %7, %7, [esp]
- mov esp, r0
- pop r0
-%else
- vpaddw %7, %7, [dw32_256]
-%endif
+ vpaddw %7, %7, [pic(dw32_256)]
vpaddw %1, %1, %7
vpsraw %1, %1, 6
%endmacro
@@ -3501,7 +3109,11 @@
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
+%ifdef X86_32_PICASM
+%define i_width dword arg5
+%else
%define i_width r4
+%endif
%define i_height r5
%define i_srcstride3 r6
%assign push_num 0
@@ -3515,6 +3127,7 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
+ INIT_X86_32_PIC_NOPRESERVE r4
sub p_src, i_srcstride
sub p_src, i_srcstride
lea i_srcstride3, [3 * i_srcstride]
@@ -3522,32 +3135,6 @@
je .width8
jg .width16
; .width4:
-%ifdef X86_32_PICASM
- push i_width
- mov i_width, esp
- and esp, 0xffffffe0
- sub esp, 16
- push 0x14141414 ;db20_128
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0xfb01fb01 ;maddubsw_p1m5_256
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0x01fb01fb ;maddubsw_m5p1_256
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
-%endif
vmovd xmm0, [p_src]
vpbroadcastd xmm5, [p_src + i_srcstride]
vpunpcklbw xmm0, xmm0, xmm5
@@ -3574,13 +3161,8 @@
vpunpcklbw ymm5, ymm5, ymm4
vpblendd ymm3, ymm3, ymm5, 11001100b
vpblendd ymm2, ymm2, ymm3, 11110000b
-%ifdef X86_32_PICASM
- vbroadcasti128 ymm6, [esp+64]
- AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [esp+32], ymm6, [esp], ymm5
-%else
- vbroadcasti128 ymm6, [db20_128]
- AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [maddubsw_p1m5_256], ymm6, [maddubsw_m5p1_256], ymm5
-%endif
+ vbroadcasti128 ymm6, [pic(db20_128)]
+ AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [pic(maddubsw_p1m5_256)], ymm6, [pic(maddubsw_m5p1_256)], ymm5
vpackuswb ymm0, ymm0, ymm0
vmovd [p_dst], xmm0
vpsrlq xmm5, xmm0, 32
@@ -3596,11 +3178,7 @@
vpbroadcastd ymm5, [p_src + i_srcstride3]
vpunpcklbw ymm4, ymm4, ymm5
jg .width4_height_ge8
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [esp+32], xmm6, [esp], xmm5
-%else
- AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [maddubsw_p1m5_256], xmm6, [maddubsw_m5p1_256], xmm5
-%endif
+ AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [pic(maddubsw_p1m5_256)], xmm6, [pic(maddubsw_m5p1_256)], xmm5
vpackuswb xmm2, xmm2, xmm2
vmovd [p_dst], xmm2
jmp .width4_done
@@ -3616,11 +3194,7 @@
vpunpcklbw ymm5, ymm5, ymm0
vpblendd ymm1, ymm1, ymm5, 11001100b
vpblendd ymm4, ymm4, ymm1, 11110000b
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [esp+32], ymm6, [esp], ymm5
-%else
- AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [maddubsw_p1m5_256], ymm6, [maddubsw_m5p1_256], ymm5
-%endif
+ AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [pic(maddubsw_p1m5_256)], ymm6, [pic(maddubsw_m5p1_256)], ymm5
vpackuswb ymm2, ymm2, ymm2
vmovd [p_dst], xmm2
vpsrlq xmm5, xmm2, 32
@@ -3635,19 +3209,12 @@
lea p_dst, [p_dst + 2 * i_dststride]
vmovd xmm5, [p_src + i_srcstride3]
vpunpcklbw xmm0, xmm0, xmm5
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [esp+32], xmm6, [esp], xmm5
-%else
- AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [maddubsw_p1m5_256], xmm6, [maddubsw_m5p1_256], xmm5
-%endif
+ AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [pic(maddubsw_p1m5_256)], xmm6, [pic(maddubsw_m5p1_256)], xmm5
vpackuswb xmm4, xmm4, xmm4
vmovd [p_dst], xmm4
.width4_done:
-%ifdef X86_32_PICASM
- mov esp, i_width
- pop i_width
-%endif
vzeroupper
+ DEINIT_X86_32_PIC_KEEPDEF
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -3656,32 +3223,6 @@
ret
.width8:
-%ifdef X86_32_PICASM
- push i_width
- mov i_width, esp
- and esp, 0xffffffe0
- sub esp, 16
- push 0x14141414 ;db20_128
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0xfb01fb01 ;maddubsw_p1m5_256
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0x01fb01fb ;maddubsw_m5p1_256
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
-%endif
sub i_height, 1
vmovq xmm0, [p_src]
vmovq xmm4, [p_src + i_srcstride]
@@ -3701,13 +3242,8 @@
vmovq xmm3, [p_src + 2 * i_srcstride]
vpunpcklbw xmm4, xmm4, xmm3
vinserti128 ymm2, ymm2, xmm4, 1
-%ifdef X86_32_PICASM
- vbroadcasti128 ymm5, [esp+64]
- AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [esp+32], ymm5, [esp], ymm4
-%else
- vbroadcasti128 ymm5, [db20_128]
- AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
-%endif
+ vbroadcasti128 ymm5, [pic(db20_128)]
+ AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
vmovq xmm4, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
vpunpcklbw xmm3, xmm3, xmm4
@@ -3714,11 +3250,7 @@
vmovq xmm6, [p_src]
vpunpcklbw xmm4, xmm4, xmm6
vinserti128 ymm3, ymm3, xmm4, 1
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [esp+32], ymm5, [esp], ymm4
-%else
- AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
-%endif
+ AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
vpackuswb ymm0, ymm0, ymm1
vmovlps [p_dst], xmm0
vextracti128 xmm1, ymm0, 1
@@ -3732,11 +3264,7 @@
vmovq xmm4, [p_src + i_srcstride]
vpunpcklbw xmm0, xmm6, xmm4
jg .width8_height_ge8
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [esp+32], xmm5, [esp], xmm4
-%else
- AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [maddubsw_p1m5_256], xmm5, [maddubsw_m5p1_256], xmm4
-%endif
+ AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_256)], xmm5, [pic(maddubsw_m5p1_256)], xmm4
vpackuswb xmm2, xmm2, xmm2
vmovlps [p_dst], xmm2
jmp .width8_done
@@ -3744,11 +3272,7 @@
vmovq xmm1, [p_src + 2 * i_srcstride]
vpunpcklbw xmm4, xmm4, xmm1
vinserti128 ymm0, ymm0, xmm4, 1
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [esp+32], ymm5, [esp], ymm4
-%else
- AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
-%endif
+ AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
vmovq xmm4, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
vpunpcklbw xmm1, xmm1, xmm4
@@ -3755,11 +3279,7 @@
vmovq xmm6, [p_src]
vpunpcklbw xmm4, xmm4, xmm6
vinserti128 ymm1, ymm1, xmm4, 1
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [esp+32], ymm5, [esp], ymm4
-%else
- AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
-%endif
+ AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
vpackuswb ymm2, ymm2, ymm3
vmovlps [p_dst], xmm2
vextracti128 xmm3, ymm2, 1
@@ -3773,19 +3293,12 @@
jl .width8_done
vmovq xmm4, [p_src + i_srcstride]
vpunpcklbw xmm2, xmm6, xmm4
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [esp+32], xmm5, [esp], xmm4
-%else
- AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [maddubsw_p1m5_256], xmm5, [maddubsw_m5p1_256], xmm4
-%endif
+ AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_256)], xmm5, [pic(maddubsw_m5p1_256)], xmm4
vpackuswb xmm0, xmm0, xmm0
vmovlps [p_dst], xmm0
.width8_done:
-%ifdef X86_32_PICASM
- mov esp, i_width
- pop i_width
-%endif
vzeroupper
+ DEINIT_X86_32_PIC_KEEPDEF
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -3794,51 +3307,6 @@
ret
.width16:
-%ifdef X86_32_PICASM
- push i_width
- mov i_width, esp
- and esp, 0xffffffe0
- push 0x14141414 ;db20_128
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0x14141414
- push 0xfb01fb01 ;maddubsw_p1m5_256
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0xfb01fb01
- push 0x01fb01fb ;maddubsw_m5p1_256
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x01fb01fb
- push 0x14fb14fb ;maddubsw_m5p20_256
- push 0x14fb14fb
- push 0x14fb14fb
- push 0x14fb14fb
- push 0x14fb14fb
- push 0x14fb14fb
- push 0x14fb14fb
- push 0x14fb14fb
- push 0xfb14fb14 ;maddubsw_p20m5_256
- push 0xfb14fb14
- push 0xfb14fb14
- push 0xfb14fb14
- push 0xfb14fb14
- push 0xfb14fb14
- push 0xfb14fb14
- push 0xfb14fb14
-%endif
sub i_height, 1
test i_height, 1
jnz .width16_yloop_begin_even
@@ -3865,11 +3333,7 @@
lea p_src, [p_src + 2 * i_srcstride]
vpblendd ymm5, ymm5, ymm6, 11110000b
vpunpcklbw ymm4, ymm4, ymm5
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [esp+96], [esp+128], [esp+64], ymm7
-%else
- AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm7
-%endif
+ AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm7
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 1000b
vmovdqa [p_dst], xmm0
@@ -3899,20 +3363,12 @@
vmovq xmm6, [p_src]
vpbroadcastq ymm7, [p_src + 8]
vpblendd ymm6, ymm6, ymm7, 11110000b
-%ifdef X86_32_PICASM
- AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [esp+32], [esp], ymm0, ymm7
-%else
- AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm0, ymm7
-%endif
+ AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm0, ymm7
vmovq xmm7, [p_src + i_srcstride]
vpbroadcastq ymm0, [p_src + i_srcstride + 8]
vpblendd ymm7, ymm7, ymm0, 11110000b
vpunpcklbw ymm6, ymm6, ymm7
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [esp+96], [esp+128], [esp+64], ymm0
-%else
- AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm0
-%endif
+ AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm0
vpackuswb ymm1, ymm1, ymm2
vpermq ymm1, ymm1, 11011000b
vmovdqa [p_dst], xmm1
@@ -3921,21 +3377,13 @@
vmovq xmm0, [p_src + 2 * i_srcstride]
vpbroadcastq ymm1, [p_src + 2 * i_srcstride + 8]
vpblendd ymm0, ymm0, ymm1, 11110000b
-%ifdef X86_32_PICASM
- AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [esp+32], [esp], ymm2, ymm1
-%else
- AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm2, ymm1
-%endif
+ AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm2, ymm1
vmovq xmm1, [p_src + i_srcstride3]
vpbroadcastq ymm2, [p_src + i_srcstride3 + 8]
lea p_src, [p_src + 4 * i_srcstride]
vpblendd ymm1, ymm1, ymm2, 11110000b
vpunpcklbw ymm0, ymm0, ymm1
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [esp+96], [esp+128], [esp+64], ymm2
-%else
- AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm2
-%endif
+ AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm2
vpackuswb ymm3, ymm3, ymm4
vpermq ymm3, ymm3, 11011000b
vmovdqa [p_dst], xmm3
@@ -3944,20 +3392,12 @@
vmovq xmm2, [p_src]
vpbroadcastq ymm3, [p_src + 8]
vpblendd ymm2, ymm2, ymm3, 11110000b
-%ifdef X86_32_PICASM
- AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [esp+32], [esp], ymm4, ymm3
-%else
- AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm4, ymm3
-%endif
+ AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm4, ymm3
vmovq xmm3, [p_src + i_srcstride]
vpbroadcastq ymm4, [p_src + i_srcstride + 8]
vpblendd ymm3, ymm3, ymm4, 11110000b
vpunpcklbw ymm2, ymm2, ymm3
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [esp+96], [esp+128], [esp+64], ymm4
-%else
- AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm4
-%endif
+ AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm4
vpackuswb ymm5, ymm5, ymm6
vpermq ymm5, ymm5, 11011000b
vmovdqa [p_dst], xmm5
@@ -3966,21 +3406,13 @@
vmovq xmm4, [p_src + 2 * i_srcstride]
vpbroadcastq ymm5, [p_src + 2 * i_srcstride + 8]
vpblendd ymm4, ymm4, ymm5, 11110000b
-%ifdef X86_32_PICASM
- AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [esp+32], [esp], ymm6, ymm5
-%else
- AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm6, ymm5
-%endif
+ AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm6, ymm5
vmovq xmm5, [p_src + i_srcstride3]
vpbroadcastq ymm6, [p_src + i_srcstride3 + 8]
lea p_src, [p_src + 4 * i_srcstride]
vpblendd ymm5, ymm5, ymm6, 11110000b
vpunpcklbw ymm4, ymm4, ymm5
-%ifdef X86_32_PICASM
- AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [esp+96], [esp+128], [esp+64], ymm6
-%else
- AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm6
-%endif
+ AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm6
vpackuswb ymm7, ymm7, ymm0
vpermq ymm7, ymm7, 11011000b
vmovdqa [p_dst], xmm7
@@ -3988,11 +3420,8 @@
lea p_dst, [p_dst + 2 * i_dststride]
sub i_height, 8
jg .width16_yloop
-%ifdef X86_32_PICASM
- mov esp, i_width
- pop i_width
-%endif
vzeroupper
+ DEINIT_X86_32_PIC
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -4026,6 +3455,7 @@
%define i_width r4
%define i_height r5
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
@@ -4032,32 +3462,9 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
-%ifdef X86_32_PICASM
- push r1
- mov r1, esp
- and esp, 0xfffffff0
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- vbroadcasti128 ymm4, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- vbroadcasti128 ymm5, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- vbroadcasti128 ymm6, [esp]
- mov esp, r1
- pop r1
-%else
- vbroadcasti128 ymm4, [shufb_32435465768798A9]
- vbroadcasti128 ymm5, [shufb_011267784556ABBC]
- vbroadcasti128 ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ vbroadcasti128 ymm4, [pic(shufb_32435465768798A9)]
+ vbroadcasti128 ymm5, [pic(shufb_011267784556ABBC)]
+ vbroadcasti128 ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
cmp i_width, 8
je .width8
jg .width16_yloop
@@ -4086,6 +3493,7 @@
vzeroupper
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
.width8:
lea i_srcstride3, [3 * i_srcstride]
@@ -4110,6 +3518,7 @@
vzeroupper
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
%undef i_srcstride3
.width16_yloop:
@@ -4129,6 +3538,7 @@
vzeroupper
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -4155,6 +3565,7 @@
%define i_width r4
%define i_height r5
%assign push_num 0
+ INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -4161,32 +3572,9 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
-%ifdef X86_32_PICASM
- push r1
- mov r1, esp
- and esp, 0xfffffff0
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- vbroadcasti128 ymm5, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- vbroadcasti128 ymm6, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- vbroadcasti128 ymm7, [esp]
- mov esp, r1
- pop r1
-%else
- vbroadcasti128 ymm5, [shufb_32435465768798A9]
- vbroadcasti128 ymm6, [shufb_011267784556ABBC]
- vbroadcasti128 ymm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ vbroadcasti128 ymm5, [pic(shufb_32435465768798A9)]
+ vbroadcasti128 ymm6, [pic(shufb_011267784556ABBC)]
+ vbroadcasti128 ymm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
cmp i_width, 9
je .width9
jg .width17
@@ -4210,6 +3598,7 @@
vzeroupper
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
.width9:
%xdefine i_srcstride3 i_width
@@ -4248,6 +3637,7 @@
vzeroupper
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC_KEEPDEF
ret
.width17:
lea i_srcstride3, [3 * i_srcstride]
@@ -4291,6 +3681,7 @@
vzeroupper
POP_XMM
LOAD_6_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef i_srcstride3
%undef p_src
@@ -4320,6 +3711,7 @@
push r4
%assign push_num 1
%endif
+ INIT_X86_32_PIC r5
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
@@ -4327,32 +3719,9 @@
sub p_src, i_srcstride
sub p_src, i_srcstride
lea i_srcstride3, [3 * i_srcstride]
-%ifdef X86_32_PICASM
- push r1
- mov r1, esp
- and esp, 0xfffffff0
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- vbroadcasti128 ymm4, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- vbroadcasti128 ymm5, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- vbroadcasti128 ymm6, [esp]
- mov esp, r1
- pop r1
-%else
- vbroadcasti128 ymm4, [shufb_32435465768798A9]
- vbroadcasti128 ymm5, [shufb_011267784556ABBC]
- vbroadcasti128 ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ vbroadcasti128 ymm4, [pic(shufb_32435465768798A9)]
+ vbroadcasti128 ymm5, [pic(shufb_011267784556ABBC)]
+ vbroadcasti128 ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 3
.yloop:
vmovdqu xmm0, [p_src - 2]
@@ -4372,6 +3741,7 @@
vzeroupper
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
%ifdef X86_32
pop r4
%endif
@@ -4403,6 +3773,7 @@
push r4
%assign push_num 1
%endif
+ INIT_X86_32_PIC r5
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
@@ -4443,6 +3814,7 @@
vzeroupper
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
%ifdef X86_32
pop r4
%endif
@@ -4469,6 +3841,7 @@
%define i_height r3
%define i_dststride 16
%assign push_num 0
+ INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 6
SIGN_EXTENSION r1, r1d
@@ -4475,32 +3848,9 @@
SIGN_EXTENSION r3, r3d
sub p_src, i_srcstride
sub p_src, i_srcstride
-%ifdef X86_32_PICASM
- push r1
- mov r1, esp
- and esp, 0xfffffff0
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- vbroadcasti128 ymm3, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- vbroadcasti128 ymm4, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- vbroadcasti128 ymm5, [esp]
- mov esp, r1
- pop r1
-%else
- vbroadcasti128 ymm3, [shufb_32435465768798A9]
- vbroadcasti128 ymm4, [shufb_011267784556ABBC]
- vbroadcasti128 ymm5, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ vbroadcasti128 ymm3, [pic(shufb_32435465768798A9)]
+ vbroadcasti128 ymm4, [pic(shufb_011267784556ABBC)]
+ vbroadcasti128 ymm5, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 1
.yloop:
vmovdqu xmm0, [p_src - 2]
@@ -4519,6 +3869,7 @@
vzeroupper
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -4541,6 +3892,7 @@
%define i_height r3
%define i_srcstride 16
%assign push_num 0
+ INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
@@ -4614,6 +3966,7 @@
vzeroupper
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef p_dst
@@ -4641,6 +3994,7 @@
push r4
%assign push_num 1
%endif
+ INIT_X86_32_PIC r5
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
@@ -4687,6 +4041,7 @@
vzeroupper
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
%ifdef X86_32
pop r4
%endif
@@ -4713,6 +4068,7 @@
%define i_height r3
%define i_dststride 32
%assign push_num 0
+ INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
@@ -4719,32 +4075,9 @@
SIGN_EXTENSION r3, r3d
sub p_src, i_srcstride
sub p_src, i_srcstride
-%ifdef X86_32_PICASM
- push r1
- mov r1, esp
- and esp, 0xfffffff0
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- vbroadcasti128 ymm4, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- vbroadcasti128 ymm5, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- vbroadcasti128 ymm6, [esp]
- mov esp, r1
- pop r1
-%else
- vbroadcasti128 ymm4, [shufb_32435465768798A9]
- vbroadcasti128 ymm5, [shufb_011267784556ABBC]
- vbroadcasti128 ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ vbroadcasti128 ymm4, [pic(shufb_32435465768798A9)]
+ vbroadcasti128 ymm5, [pic(shufb_011267784556ABBC)]
+ vbroadcasti128 ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 1
.yloop:
vmovdqu xmm0, [p_src - 2]
@@ -4768,6 +4101,7 @@
vzeroupper
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -4790,6 +4124,7 @@
%define i_height r3
%define i_srcstride 32
%assign push_num 0
+ INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
@@ -4869,6 +4204,7 @@
vzeroupper
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
@@ -4896,6 +4232,7 @@
push r4
%assign push_num 1
%endif
+ INIT_X86_32_PIC r5
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -4903,47 +4240,9 @@
sub p_src, i_srcstride
sub p_src, i_srcstride
lea i_srcstride3, [3 * i_srcstride]
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x090a0809 ;shufb_32435465768798A9
- push 0x07080607
- push 0x05060405
- push 0x03040203
- vbroadcasti128 ymm5, [esp]
- push 0x0c0b0b0a
- push 0x06050504
- push 0x08070706
- push 0x02010100
- vbroadcasti128 ymm6, [esp]
- push 0x01fb01fb
- push 0xfb01fb01
- push 0x01fb01fb
- push 0xfb01fb01
- vbroadcasti128 ymm7, [esp]
- sub esp, 16
- push 0x0000fe0a ;maddubsw_m2p10_m40m40_p10m2_p0p0_256
- push 0xd8d80afe
- push 0x0000fe0a
- push 0xd8d80afe
- push 0x0000fe0a
- push 0xd8d80afe
- push 0x0000fe0a
- push 0xd8d80afe
- push 0x80008000 ;dwm32768_256
- push 0x80008000
- push 0x80008000
- push 0x80008000
- push 0x80008000
- push 0x80008000
- push 0x80008000
- push 0x80008000
-%else
- vbroadcasti128 ymm5, [shufb_32435465768798A9]
- vbroadcasti128 ymm6, [shufb_011267784556ABBC]
- vbroadcasti128 ymm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
-%endif
+ vbroadcasti128 ymm5, [pic(shufb_32435465768798A9)]
+ vbroadcasti128 ymm6, [pic(shufb_011267784556ABBC)]
+ vbroadcasti128 ymm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 3
.yloop:
vmovdqu xmm0, [p_src - 2]
@@ -4961,14 +4260,7 @@
vinserti128 ymm0, ymm0, [p_src + i_srcstride3 + 6], 1
lea p_src, [p_src + 4 * i_srcstride]
vpunpckhqdq ymm4, ymm4, ymm0
-%ifdef X86_32_PICASM
- vpmaddubsw ymm4, ymm4, [esp+32]
- vpmaddwd ymm4, ymm4, [esp]
- vpshufd ymm2, ymm4, 10110001b
- vpaddd ymm4, ymm4, ymm2
-%else
- AVX2_FilterHorizontalbw_4px ymm4, [dwm32768_256], ymm2
-%endif
+ AVX2_FilterHorizontalbw_4px ymm4, [pic(dwm32768_256)], ymm2
vmovlps [p_dst + 26], xmm4
vmovdqa [p_dst + 16], xmm3
vextracti128 xmm2, ymm4, 1
@@ -4991,16 +4283,7 @@
vmovdqu xmm3, [p_src + i_srcstride - 2]
vinserti128 ymm3, ymm3, [p_src + i_srcstride + 6], 1
vpunpckhqdq ymm4, ymm0, ymm3
-%ifdef X86_32_PICASM
- vpmaddubsw ymm4, ymm4, [esp+32]
- vpmaddwd ymm4, ymm4, [esp]
- vpshufd ymm2, ymm4, 10110001b
- vpaddd ymm4, ymm4, ymm2
- mov esp, r5
- pop r5
-%else
- AVX2_FilterHorizontalbw_4px ymm4, [dwm32768_256], ymm2
-%endif
+ AVX2_FilterHorizontalbw_4px ymm4, [pic(dwm32768_256)], ymm2
AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
vextracti128 xmm4, ymm4, 1
@@ -5011,6 +4294,7 @@
vzeroupper
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
%ifdef X86_32
pop r4
%endif
@@ -5037,7 +4321,11 @@
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
+%ifdef X86_32_PICASM
+%define i_width dword arg5
+%else
%define i_width r4
+%endif
%define i_height r5
%define i_srcstride3 r6
%assign push_num 0
@@ -5051,6 +4339,7 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
+ INIT_X86_32_PIC_NOPRESERVE r4
sub i_height, 1
lea i_srcstride3, [3 * i_srcstride]
test i_width, 1
@@ -5058,7 +4347,14 @@
push i_height
push p_src
push p_dst
+ %assign push_num push_num + 3
+%ifdef X86_32_PICASM
+ add p_src, i_width
+ add p_src, i_width
+ sub p_src, 2
+%else
lea p_src, [p_src + 2 * i_width - 2]
+%endif
add p_dst, i_width
vmovd xmm0, [p_src]
vpunpcklwd xmm0, xmm0, [p_src + i_srcstride]
@@ -5119,6 +4415,7 @@
pop p_dst
pop p_src
pop i_height
+ %assign push_num push_num - 3
.align_begin:
vmovdqa ymm0, [p_src]
vmovdqa ymm1, [p_src + i_srcstride]
@@ -5175,6 +4472,7 @@
vmovdqa [p_dst], xmm0
.done:
vzeroupper
+ DEINIT_X86_32_PIC
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
--- a/codec/common/x86/satd_sad.asm
+++ b/codec/common/x86/satd_sad.asm
@@ -53,7 +53,11 @@
;***********************************************************************
; Data
;***********************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
SECTION .rodata align=16
+%endif
align 16
HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
@@ -772,29 +776,12 @@
mov r12, r2
%endif
+ INIT_X86_32_PIC r2
pxor xmm4, xmm4
-%ifdef X86_32_PICASM
- push 0xff01ff01
- push 0xff01ff01
- push 0x01010101
- push 0x01010101
- movdqu xmm5, [esp]
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- movdqu xmm6, [esp]
- push 0x00010001
- push 0x00010001
- push 0x00010001
- push 0x00010001
- movdqu xmm7, [esp]
- add esp, 48
-%else
- movdqa xmm5, [HSumSubDB1]
- movdqa xmm6, [HSumSubDW1]
- movdqa xmm7, [PDW1]
-%endif
+ movdqa xmm5, [pic(HSumSubDB1)]
+ movdqa xmm6, [pic(HSumSubDW1)]
+ movdqa xmm7, [pic(PDW1)]
+ DEINIT_X86_32_PIC
sub r0, r1
movdqu xmm0, [r0]
movhlps xmm1, xmm0
@@ -916,9 +903,9 @@
ret
%macro SSE41_ChromaGetX38x8Satd 0
- movdqa xmm5, [HSumSubDB1]
- movdqa xmm6, [HSumSubDW1]
- movdqa xmm7, [PDW1]
+ movdqa xmm5, [pic(HSumSubDB1)]
+ movdqa xmm6, [pic(HSumSubDW1)]
+ movdqa xmm7, [pic(PDW1)]
sub r0, r1
movq xmm0, [r0]
punpcklqdq xmm0, xmm0
@@ -940,7 +927,7 @@
SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
movdqa [r6+16], xmm0 ;H
;(sum+2)>>2
- movdqa xmm6, [PDQ2]
+ movdqa xmm6, [pic(PDQ2)]
movdqa xmm5, xmm4
punpckhqdq xmm5, xmm1
paddd xmm5, xmm6
@@ -993,88 +980,8 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
loop_chroma_satdx3:
-%ifdef X86_32_PICASM
- mov r0, esp
- and esp, 0xfffffff0
- push 0xff01ff01
- push 0xff01ff01
- push 0x01010101
- push 0x01010101
- movdqa xmm5, [esp]
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- movdqa xmm6, [esp]
- push 0x00010001
- push 0x00010001
- push 0x00010001
- push 0x00010001
- movdqa xmm7, [esp]
- mov esp, r0
- mov r0, [esp + push_num*4 + 4]
-
- sub r0, r1
- movq xmm0, [r0]
- punpcklqdq xmm0, xmm0
- SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
- movdqa [r6], xmm0 ;V
- add r0, r1
- pinsrb xmm0, byte[r0-1], 0
- pinsrb xmm0, byte[r0+r1-1], 1
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 2
- pinsrb xmm0, byte[r0+r1-1], 3
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 4
- pinsrb xmm0, byte[r0+r1-1], 5
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 6
- pinsrb xmm0, byte[r0+r1-1], 7
- punpcklqdq xmm0, xmm0
- SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
-;movdqa [r6+16], xmm0 ;H
-;(sum+2)>>2
- mov DWORD [r6+16], 0x0002
- mov DWORD [r6+20], 0x0000
- mov DWORD [r6+24], 0x0002
- mov DWORD [r6+28], 0x0000
- movdqa xmm6, [r6+16]
- movdqa [r6+16], xmm0 ;H
-
- movdqa xmm5, xmm4
- punpckhqdq xmm5, xmm1
- paddd xmm5, xmm6
- psrld xmm5, 2
-;(sum1+sum2+4)>>3
- paddd xmm6, xmm6
- paddd xmm4, xmm1
- paddd xmm4, xmm6
- psrld xmm4, 3
-;satd *16
- pslld xmm5, 4
- pslld xmm4, 4
-;temp satd
- movdqa xmm6, xmm4
- punpcklqdq xmm4, xmm5
- psllq xmm4, 32
- psrlq xmm4, 32
- movdqa [r6+32], xmm4
- punpckhqdq xmm5, xmm6
- psllq xmm5, 32
- psrlq xmm5, 32
- movdqa [r6+48], xmm5
-
- pxor xmm4, xmm4 ;V
- pxor xmm5, xmm5 ;H
- pxor xmm6, xmm6 ;DC
- mov r0, 0
- SSE41_ChromaGetX38x4Satd r0, 0
- inc r0
- SSE41_ChromaGetX38x4Satd r0, 0
-%else
+ INIT_X86_32_PIC r4
SSE41_ChromaGetX38x8Satd
-%endif
SSEReg2MMX xmm4, mm0,mm1
SSEReg2MMX xmm5, mm2,mm3
SSEReg2MMX xmm6, mm5,mm6
@@ -1081,89 +988,8 @@
mov r0, arg8
mov r2, arg9
-%ifdef X86_32_PICASM
- mov r0, esp
- and esp, 0xfffffff0
- push 0xff01ff01
- push 0xff01ff01
- push 0x01010101
- push 0x01010101
- movdqa xmm5, [esp]
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- push 0xffff0001
- movdqa xmm6, [esp]
- push 0x00010001
- push 0x00010001
- push 0x00010001
- push 0x00010001
- movdqa xmm7, [esp]
- mov esp, r0
- mov r0, arg8
-
- sub r0, r1
- movq xmm0, [r0]
- punpcklqdq xmm0, xmm0
- SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
- movdqa [r6], xmm0 ;V
- add r0, r1
- pinsrb xmm0, byte[r0-1], 0
- pinsrb xmm0, byte[r0+r1-1], 1
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 2
- pinsrb xmm0, byte[r0+r1-1], 3
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 4
- pinsrb xmm0, byte[r0+r1-1], 5
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 6
- pinsrb xmm0, byte[r0+r1-1], 7
- punpcklqdq xmm0, xmm0
- SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
- ;movdqa [r6+16], xmm0 ;H
-;(sum+2)>>2
-
- mov DWORD [r6+16], 0x0002
- mov DWORD [r6+20], 0x0000
- mov DWORD [r6+24], 0x0002
- mov DWORD [r6+28], 0x0000
- movdqa xmm6, [r6+16]
- movdqa [r6+16], xmm0 ;H
-
- movdqa xmm5, xmm4
- punpckhqdq xmm5, xmm1
- paddd xmm5, xmm6
- psrld xmm5, 2
-;(sum1+sum2+4)>>3
- paddd xmm6, xmm6
- paddd xmm4, xmm1
- paddd xmm4, xmm6
- psrld xmm4, 3
-;satd *16
- pslld xmm5, 4
- pslld xmm4, 4
-;temp satd
- movdqa xmm6, xmm4
- punpcklqdq xmm4, xmm5
- psllq xmm4, 32
- psrlq xmm4, 32
- movdqa [r6+32], xmm4
- punpckhqdq xmm5, xmm6
- psllq xmm5, 32
- psrlq xmm5, 32
- movdqa [r6+48], xmm5
-
- pxor xmm4, xmm4 ;V
- pxor xmm5, xmm5 ;H
- pxor xmm6, xmm6 ;DC
- mov r0, 0
- SSE41_ChromaGetX38x4Satd r0, 0
- inc r0
- SSE41_ChromaGetX38x4Satd r0, 0
-%else
SSE41_ChromaGetX38x8Satd
-%endif
+ DEINIT_X86_32_PIC
MMXReg2SSE xmm0, xmm3, mm0, mm1
MMXReg2SSE xmm1, xmm3, mm2, mm3
@@ -1457,20 +1283,12 @@
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse41
%assign push_num 0
+ INIT_X86_32_PIC r5
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
-%ifdef X86_32_PICASM
- push 0xff01ff01
- push 0x01010101
- push 0xff01ff01
- push 0x01010101
- movdqu xmm4, [esp]
- add esp, 16
-%else
- movdqa xmm4,[HSwapSumSubDB1]
-%endif
+ movdqa xmm4,[pic(HSwapSumSubDB1)]
movd xmm2,[r2]
movd xmm5,[r2+r3]
shufps xmm2,xmm5,0
@@ -1511,6 +1329,7 @@
SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
ret
;***********************************************************************
@@ -1524,21 +1343,13 @@
push r5
%endif
%assign push_num 2
+ INIT_X86_32_PIC r6
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
-%ifdef X86_32_PICASM
- push 0xff01ff01
- push 0xff01ff01
- push 0x01010101
- push 0x01010101
- movdqu xmm7, [esp]
- add esp, 16
-%else
- movdqa xmm7, [HSumSubDB1]
-%endif
+ movdqa xmm7, [pic(HSumSubDB1)]
lea r4, [r1+r1*2]
lea r5, [r3+r3*2]
pxor xmm6, xmm6
@@ -1549,6 +1360,7 @@
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
%ifdef X86_32
pop r5
pop r4
@@ -1572,16 +1384,9 @@
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
-%ifdef X86_32_PICASM
- push 0xff01ff01
- push 0xff01ff01
- push 0x01010101
- push 0x01010101
- movdqu xmm7, [esp]
- add esp, 16
-%else
- movdqa xmm7, [HSumSubDB1]
-%endif
+ INIT_X86_32_PIC_NOPRESERVE r4
+ movdqa xmm7, [pic(HSumSubDB1)]
+ DEINIT_X86_32_PIC
lea r4, [r1+r1*2]
lea r5, [r3+r3*2]
pxor xmm6, xmm6
@@ -1614,6 +1419,7 @@
push r5
%endif
%assign push_num 2
+ INIT_X86_32_PIC r6
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -1621,16 +1427,7 @@
push r0
push r2
-%ifdef X86_32_PICASM
- push 0xff01ff01
- push 0xff01ff01
- push 0x01010101
- push 0x01010101
- movdqu xmm7, [esp]
- add esp, 16
-%else
- movdqa xmm7, [HSumSubDB1]
-%endif
+ movdqa xmm7, [pic(HSumSubDB1)]
lea r4, [r1+r1*2]
lea r5, [r3+r3*2]
pxor xmm6, xmm6
@@ -1650,6 +1447,7 @@
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
POP_XMM
LOAD_4_PARA_POP
+ DEINIT_X86_32_PIC
%ifdef X86_32
pop r5
pop r4
@@ -1677,16 +1475,9 @@
push r0
push r2
-%ifdef X86_32_PICASM
- push 0xff01ff01
- push 0xff01ff01
- push 0x01010101
- push 0x01010101
- movdqu xmm7, [esp]
- add esp, 16
-%else
- movdqa xmm7, [HSumSubDB1]
-%endif
+ INIT_X86_32_PIC_NOPRESERVE r4
+ movdqa xmm7, [pic(HSumSubDB1)]
+ DEINIT_X86_32_PIC
lea r4, [r1+r1*2]
lea r5, [r3+r3*2]
pxor xmm6, xmm6
@@ -1863,19 +1654,9 @@
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
-%ifdef X86_32_PICASM
- mov r1, esp
- and esp, 0xfffffff0
- push 0xff01ff01
- push 0xff01ff01
- push 0x01010101
- push 0x01010101
- vbroadcasti128 ymm7, [esp]
- mov esp, r1
- mov r1, [esp + push_num*4 + 8]
-%else
- vbroadcasti128 ymm7, [HSumSubDB1]
-%endif
+ INIT_X86_32_PIC_NOPRESERVE r5
+ vbroadcasti128 ymm7, [pic(HSumSubDB1)]
+ DEINIT_X86_32_PIC
lea r5, [3 * r1]
lea r6, [3 * r3]
vpxor ymm6, ymm6, ymm6
@@ -1941,22 +1722,11 @@
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
-%ifdef X86_32_PICASM
- mov r0, esp
- and esp, 0xfffffff0
- push 0xff01ff01
- push 0xff01ff01
- push 0x01010101
- push 0x01010101
- vpbroadcastq xmm0, [esp]
- vpbroadcastq ymm6, [esp + 8]
- mov esp, r0
- mov r0, [esp + push_num*4 + 4]
-%else
- vpbroadcastq xmm0, [HSumSubDB1]
- vpbroadcastq ymm6, [HSumSubDB1 + 8]
-%endif
+ INIT_X86_32_PIC_NOPRESERVE r5
+ vpbroadcastq xmm0, [pic(HSumSubDB1)]
+ vpbroadcastq ymm6, [pic(HSumSubDB1 + 8)]
vpblendd ymm6, ymm0, ymm6, 11110000b
+ DEINIT_X86_32_PIC
lea r5, [3 * r1]
lea r6, [3 * r3]
vpxor ymm5, ymm5, ymm5
--- a/codec/decoder/core/x86/intra_pred.asm
+++ b/codec/decoder/core/x86/intra_pred.asm
@@ -49,7 +49,11 @@
; Local Data (Read Only)
;*******************************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
SECTION .rodata align=16
+%endif
align 16
sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
@@ -132,20 +136,7 @@
%macro COPY_16_TIMES 2
movdqa %2, [%1-16]
psrldq %2, 15
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xfffffff0
- push 0x01010101 ;mmx_01bytes
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmuludq %2, [esp]
- mov esp, r5
- pop r5
-%else
- pmuludq %2, [mmx_01bytes]
-%endif
+ pmuludq %2, [pic(mmx_01bytes)]
pshufd %2, %2, 0
%endmacro
@@ -152,20 +143,7 @@
%macro COPY_16_TIMESS 3
movdqa %2, [%1+%3-16]
psrldq %2, 15
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xfffffff0
- push 0x01010101 ;mmx_01bytes
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmuludq %2, [esp]
- mov esp, r5
- pop r5
-%else
- pmuludq %2, [mmx_01bytes]
-%endif
+ pmuludq %2, [pic(mmx_01bytes)]
pshufd %2, %2, 0
%endmacro
@@ -203,52 +181,26 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
-%ifdef X86_32_PICASM
- push r3
- mov r3, esp
- and esp, 0xfffffff0
- push 0x01010101 ;mmx_01bytes
- push 0x01010101
- push 0x01010101
- push 0x01010101
-%endif
movzx r2, byte [r0-1]
movd xmm0, r2d
-%ifdef X86_32_PICASM
- pmuludq xmm0, [esp]
-%else
- pmuludq xmm0, [mmx_01bytes]
-%endif
+ pmuludq xmm0, [pic(mmx_01bytes)]
movzx r2, byte [r0+r1-1]
movd xmm1, r2d
-%ifdef X86_32_PICASM
- pmuludq xmm1, [esp]
-%else
- pmuludq xmm1, [mmx_01bytes]
-%endif
+ pmuludq xmm1, [pic(mmx_01bytes)]
lea r0, [r0+r1]
movzx r2, byte [r0+r1-1]
movd xmm2, r2d
-%ifdef X86_32_PICASM
- pmuludq xmm2, [esp]
-%else
- pmuludq xmm2, [mmx_01bytes]
-%endif
+ pmuludq xmm2, [pic(mmx_01bytes)]
movzx r2, byte [r0+2*r1-1]
movd xmm3, r2d
-%ifdef X86_32_PICASM
- pmuludq xmm3, [esp]
- mov esp, r3
- pop r3
-%else
- pmuludq xmm3, [mmx_01bytes]
-%endif
+ pmuludq xmm3, [pic(mmx_01bytes)]
sub r0, r1
movd [r0], xmm0
@@ -257,6 +209,7 @@
movd [r0], xmm2
movd [r0+r1], xmm3
+ DEINIT_X86_32_PIC
ret
;*******************************************************************************
@@ -266,6 +219,7 @@
push r3
push r4
%assign push_num 2
+ INIT_X86_32_PIC r5
LOAD_2_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -276,37 +230,11 @@
;for H
pxor xmm7, xmm7
movq xmm0, [r0]
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x00010002
- push 0x00030004
- push 0x00050006
- push 0x00070008
- movdqa xmm5, [esp]
- mov esp, r0
- pop r0
-%else
- movdqa xmm5, [sse2_plane_dec]
-%endif
+ movdqa xmm5, [pic(sse2_plane_dec)]
punpcklbw xmm0, xmm7
pmullw xmm0, xmm5
movq xmm1, [r0 + 9]
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x00080007 ;sse2_plane_inc
- push 0x00060005
- push 0x00040003
- push 0x00020001
- movdqa xmm6, [esp]
- mov esp, r0
- pop r0
-%else
- movdqa xmm6, [sse2_plane_inc]
-%endif
+ movdqa xmm6, [pic(sse2_plane_inc)]
punpcklbw xmm1, xmm7
pmullw xmm1, xmm6
psubw xmm1, xmm0
@@ -361,19 +289,7 @@
SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
xor r2, r2
-%ifdef X86_32_PICASM
- mov r2, esp
- and esp, 0xfffffff0
- push 0x0000ffff ;sse2_plane_inc_minus
- push 0xfffefffd
- push 0xfffcfffb
- push 0xfffafff9
- movdqa xmm5, [esp]
- mov esp, r2
- xor r2, r2
-%else
- movdqa xmm5, [sse2_plane_inc_minus]
-%endif
+ movdqa xmm5, [pic(sse2_plane_inc_minus)]
get_i16x16_luma_pred_plane_sse2_1:
movdqa xmm2, xmm1
@@ -393,6 +309,7 @@
jnz get_i16x16_luma_pred_plane_sse2_1
POP_XMM
+ DEINIT_X86_32_PIC
pop r4
pop r3
ret
@@ -414,6 +331,7 @@
WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2
%assign push_num 0
+ INIT_X86_32_PIC_NOPRESERVE r2
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -430,6 +348,7 @@
SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+ DEINIT_X86_32_PIC
ret
;*******************************************************************************
@@ -477,6 +396,7 @@
push r3
push r4
%assign push_num 2
+ INIT_X86_32_PIC r5
LOAD_2_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -486,30 +406,11 @@
pxor mm7, mm7
movq mm0, [r0]
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xfffffff0
- push 0x00010002 ;sse2_plane_dec_c
- push 0x00030004
- push 0x00040003 ;sse2_plane_inc_c
- push 0x00020001
- push 0x00040003 ;
- push 0x00020001
- push 0x0000ffff
- push 0xfffefffd
- movq mm5, [esp+24]
-%else
- movq mm5, [sse2_plane_dec_c]
-%endif
+ movq mm5, [pic(sse2_plane_dec_c)]
punpcklbw mm0, mm7
pmullw mm0, mm5
movq mm1, [r0 + 5]
-%ifdef X86_32_PICASM
- movq mm6, [esp+16]
-%else
- movq mm6, [sse2_plane_inc_c]
-%endif
+ movq mm6, [pic(sse2_plane_inc_c)]
punpcklbw mm1, mm7
pmullw mm1, mm6
psubw mm1, mm0
@@ -561,13 +462,7 @@
SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
xor r2, r2
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp]
- mov esp, r5
- pop r5
-%else
- movdqa xmm5, [sse2_plane_mul_b_c]
-%endif
+ movdqa xmm5, [pic(sse2_plane_mul_b_c)]
get_i_chroma_pred_plane_sse2_1:
movdqa xmm2, xmm1
@@ -583,6 +478,7 @@
jnz get_i_chroma_pred_plane_sse2_1
POP_XMM
+ DEINIT_X86_32_PIC
pop r4
pop r3
WELSEMMS
@@ -602,6 +498,7 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r2, r0
@@ -629,20 +526,7 @@
movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
pxor mm1,mm4 ;find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pand mm1,[esp] ;set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm1,[mmx_01bytes] ;set the odd bit
-%endif
+ pand mm1,[pic(mmx_01bytes)] ;set the odd bit
psubusb mm3,mm1 ;decrease 1 from odd bytes
pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
@@ -655,6 +539,7 @@
movd [r0+r1],mm2
psrlq mm2,8
movd [r0],mm2
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -667,20 +552,7 @@
movq %1, [%3-8]
psrlq %1, 38h
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmullw %1, [esp]
- mov esp, r0
- pop r0
-%else
- pmullw %1, [mmx_01bytes]
-%endif
+ pmullw %1, [pic(mmx_01bytes)]
pshufw %1, %1, 0
movq [%4], %1
%endmacro
@@ -689,20 +561,7 @@
movq %1, [%3+r1-8]
psrlq %1, 38h
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmullw %1, [esp]
- mov esp, r0
- pop r0
-%else
- pmullw %1, [mmx_01bytes]
-%endif
+ pmullw %1, [pic(mmx_01bytes)]
pshufw %1, %1, 0
movq [%4], %1
%endmacro
@@ -709,6 +568,7 @@
WELS_EXTERN WelsDecoderIChromaPredH_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r2, r0
@@ -716,20 +576,7 @@
movq mm0, [r2-8]
psrlq mm0, 38h
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmullw mm0, [esp]
- mov esp, r0
- pop r0
-%else
- pmullw mm0, [mmx_01bytes]
-%endif
+ pmullw mm0, [pic(mmx_01bytes)]
pshufw mm0, mm0, 0
movq [r0], mm0
@@ -753,6 +600,7 @@
lea r0, [r0+2*r1]
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -816,6 +664,7 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r2, r0
@@ -841,18 +690,7 @@
pavgb mm1, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm4, [esp]
- mov esp, r0
- pop r0
-%else
- pand mm4, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm4, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm1, mm4 ; decrease 1 from odd bytes
pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
@@ -876,6 +714,7 @@
movd [r0+2*r1], mm3
psrlq mm3, 10h
movd [r0+r1], mm3
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -909,6 +748,7 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r2, r0
@@ -937,18 +777,7 @@
pavgb mm2, mm0
pxor mm5, mm0 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm5, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm5, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm5, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm2, mm5 ; decrease 1 from odd bytes
pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
@@ -970,6 +799,7 @@
movd [r0+r1], mm1
psrlq mm1, 10h
movd [r0+2*r1], mm1
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -1005,6 +835,7 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r2, r0
@@ -1030,18 +861,7 @@
pavgb mm2, mm0
pxor mm3, mm0 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm3, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm3, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm3, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm2, mm3 ; decrease 1 from odd bytes
movq mm3, mm0
@@ -1071,6 +891,7 @@
pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
lea r0, [r0+2*r1]
movd [r0+r1], mm5
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -1102,6 +923,7 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r2, r0
@@ -1121,18 +943,7 @@
movq mm3, mm1
pavgb mm1, mm2
pxor mm3, mm2 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm3, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm3, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm3, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm1, mm3 ; decrease 1 from odd bytes
pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
@@ -1146,6 +957,7 @@
psrlq mm0, 8h
lea r0, [r0+2*r1]
movd [r0+r1], mm0
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -1181,6 +993,7 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r2, r0
@@ -1199,18 +1012,7 @@
movq mm4, mm2
pavgb mm2, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm4, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm4, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm4, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm2, mm4 ; decrease 1 from odd bytes
pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
@@ -1223,6 +1025,7 @@
psrlq mm2, 8h
lea r0, [r0+2*r1]
movd [r0+r1], mm2
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -1234,6 +1037,7 @@
push r3
push r4
%assign push_num 2
+ INIT_X86_32_PIC r5
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r4, r0
@@ -1275,18 +1079,7 @@
movq mm1, mm2
paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x00000000
- push 0x00000002
- movq mm4, [esp]
- mov esp, r0
- pop r0
-%else
- movq mm4, [mmx_0x02]
-%endif
+ movq mm4, [pic(mmx_0x02)]
paddq mm0, mm4
psrlq mm0, 0x02
@@ -1302,30 +1095,13 @@
paddq mm1, mm4
psrlq mm1, 0x03
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pmuludq mm0, [esp]
- pmuludq mm3, [esp]
-%else
- pmuludq mm0, [mmx_01bytes]
- pmuludq mm3, [mmx_01bytes]
-%endif
+ pmuludq mm0, [pic(mmx_01bytes)]
+ pmuludq mm3, [pic(mmx_01bytes)]
psllq mm0, 0x20
pxor mm0, mm3 ; mm0 = m_up
-%ifdef X86_32_PICASM
- pmuludq mm2, [esp]
- pmuludq mm1, [esp]
- mov esp, r5
- pop r5
-%else
- pmuludq mm2, [mmx_01bytes]
- pmuludq mm1, [mmx_01bytes]
-%endif
+ pmuludq mm2, [pic(mmx_01bytes)]
+ pmuludq mm1, [pic(mmx_01bytes)]
psllq mm1, 0x20
pxor mm1, mm2 ; mm1 = m_down
@@ -1342,6 +1118,7 @@
lea r4, [r4+2*r1]
movq [r4+r1], mm1
+ DEINIT_X86_32_PIC
pop r4
pop r3
WELSEMMS
@@ -1357,6 +1134,7 @@
push r3
push r4
%assign push_num 2
+ INIT_X86_32_PIC r5
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r4, r0
@@ -1385,20 +1163,7 @@
movd xmm1, r2d
paddw xmm0, xmm1
psrld xmm0, 0x05
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmuludq xmm0, [esp]
- mov esp, r0
- pop r0
-%else
- pmuludq xmm0, [mmx_01bytes]
-%endif
+ pmuludq xmm0, [pic(mmx_01bytes)]
pshufd xmm0, xmm0, 0
movdqa [r4], xmm0
@@ -1432,6 +1197,7 @@
movdqa [r4+r1], xmm0
+ DEINIT_X86_32_PIC
pop r4
pop r3
@@ -1518,24 +1284,12 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
lea r2, [2*r1+r1] ; 3*kiStride
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- movdqa xmm0, [esp]
- mov esp, r0
- pop r0
-%else
- movdqa xmm0, [sse2_dc_0x80]
-%endif
+ movdqa xmm0, [pic(sse2_dc_0x80)]
movdqa xmm1, xmm0
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
@@ -1557,6 +1311,7 @@
movdqa [r0+2*r1], xmm0
movdqa [r0+r2], xmm1
+ DEINIT_X86_32_PIC
ret
;*******************************************************************************
@@ -1680,21 +1435,11 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
lea r2, [2*r1+r1]
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x80808080
- push 0x80808080
- movq mm0, [esp]
- mov esp, r0
- pop r0
-%else
- movq mm0, [sse2_dc_0x80]
-%endif
+ movq mm0, [pic(sse2_dc_0x80)]
movq mm1, mm0
movq [r0], mm0
movq [r0+r1], mm1
@@ -1705,6 +1450,7 @@
movq [r0+r1], mm1
movq [r0+2*r1], mm0
movq [r0+r2], mm1
+ DEINIT_X86_32_PIC
emms
ret
--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -75,9 +75,7 @@
#ifdef X86_ASM
-#ifndef X86_32_PICASM
int32_t WelsGetNoneZeroCount_sse2 (int16_t* pLevel);
-#endif
int32_t WelsGetNoneZeroCount_sse42 (int16_t* pLevel);
/****************************************************************************
@@ -86,9 +84,7 @@
void WelsScan4x4Ac_sse2 (int16_t* zig_value, int16_t* pDct);
void WelsScan4x4DcAc_ssse3 (int16_t* pLevel, int16_t* pDct);
void WelsScan4x4DcAc_sse2 (int16_t* pLevel, int16_t* pDct);
-#ifndef X86_32_PICASM
int32_t WelsCalculateSingleCtr4x4_sse2 (int16_t* pDct);
-#endif
/****************************************************************************
* DCT functions
--- a/codec/encoder/core/inc/set_mb_syn_cavlc.h
+++ b/codec/encoder/core/inc/set_mb_syn_cavlc.h
@@ -78,12 +78,10 @@
int32_t CavlcParamCal_c (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
int32_t iEndIdx);
#ifdef X86_ASM
-#ifndef X86_32_PICASM
int32_t CavlcParamCal_sse2 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
int32_t iEndIdx);
int32_t CavlcParamCal_sse42 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
int32_t iEndIdx);
-#endif
#endif
#if defined(__cplusplus)
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -500,9 +500,7 @@
pFuncList->pfCopy8x16Aligned = WelsCopy8x16_mmx;
}
if (uiCpuFlag & WELS_CPU_SSE2) {
-#ifndef X86_32_PICASM
pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_sse2;
-#endif
pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_sse2;
pFuncList->pfQuantization4x4 = WelsQuant4x4_sse2;
@@ -516,9 +514,7 @@
pFuncList->pfScan4x4 = WelsScan4x4DcAc_sse2;
pFuncList->pfScan4x4Ac = WelsScan4x4Ac_sse2;
-#ifndef X86_32_PICASM
pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_sse2;
-#endif
pFuncList->pfDctT4 = WelsDctT4_sse2;
pFuncList->pfDctFourT4 = WelsDctFourT4_sse2;
--- a/codec/encoder/core/src/set_mb_syn_cavlc.cpp
+++ b/codec/encoder/core/src/set_mb_syn_cavlc.cpp
@@ -291,19 +291,15 @@
pFuncList->pfCavlcParamCal = CavlcParamCal_c;
#if defined(X86_32_ASM)
-#ifndef X86_32_PICASM
if (uiCpuFlag & WELS_CPU_SSE2) {
pFuncList->pfCavlcParamCal = CavlcParamCal_sse2;
}
#endif
-#endif
#ifdef X86_ASM
-#ifndef X86_32_PICASM
if (uiCpuFlag & WELS_CPU_SSE42) {
pFuncList->pfCavlcParamCal = CavlcParamCal_sse42;
}
-#endif
#endif
if (iEntropyCodingModeFlag) {
pFuncList->pfStashMBStatus = StashMBStatusCabac;
--- a/codec/encoder/core/x86/coeff.asm
+++ b/codec/encoder/core/x86/coeff.asm
@@ -42,7 +42,11 @@
%include "asm_inc.asm"
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
SECTION .rodata align=16
+%endif
align 16
@@ -369,7 +373,6 @@
%ifdef X86_32
-%ifndef X86_32_PICASM
;***********************************************************************
;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;***********************************************************************
@@ -377,10 +380,12 @@
push ebx
push edi
push esi
+ %assign push_num 3
+ INIT_X86_32_PIC ebp
- mov eax, [esp+16] ;coffLevel
- mov edi, [esp+24] ;Level
- mov ebx, [esp+32] ;endIdx
+ mov eax, arg1 ;coffLevel
+ mov edi, arg3 ;Level
+ mov ebx, arg5 ;endIdx
cmp ebx, 3
jne .Level16
pxor xmm1, xmm1
@@ -400,7 +405,7 @@
pmovmskb edx, xmm0
cmp edx, 0
je near .return
- movdqa xmm6, [sse2_b_1]
+ movdqa xmm6, [pic(sse2_b_1)]
pcmpeqw xmm7, xmm7 ;generate -1
mov ebx, 0xff
;pinsrw xmm6, ebx, 3
@@ -407,7 +412,7 @@
mov bl, dh
- lea ebx, [byte_1pos_table+8*ebx]
+ lea ebx, [pic(byte_1pos_table+8*ebx)]
movq xmm0, [ebx]
pextrw ecx, xmm0, 3
shr ecx, 8
@@ -438,7 +443,7 @@
add edi, 2
.LowByteFind0:
and edx, 0xff
- lea ebx, [byte_1pos_table+8*edx]
+ lea ebx, [pic(byte_1pos_table+8*edx)]
movq xmm1, [ebx]
pextrw esi, xmm1, 3
or esi, 0xff
@@ -466,7 +471,7 @@
mov edx, [eax]
mov [edi], dx
.getLevelEnd:
- mov edx, [esp+28] ;total_coeffs
+ mov edx, arg4 ;total_coeffs
;mov ebx, ecx
;and ebx, 0xff
movzx ebx, byte cl
@@ -473,7 +478,7 @@
add cl, ch
mov [edx], cl
;getRun
- movq xmm5, [sse2_b8]
+ movq xmm5, [pic(sse2_b8)]
paddb xmm0, xmm5
pxor xmm2, xmm2
pxor xmm3, xmm3
@@ -499,18 +504,17 @@
paddb xmm1, xmm7
psrldq xmm0, 1
psubb xmm1, xmm0
- mov ecx, [esp+20] ;run
+ mov ecx, arg2 ;run
movdqa [ecx], xmm1
;getRunEnd
.return:
+ DEINIT_X86_32_PIC
pop esi
pop edi
pop ebx
ret
-%endif ;%ifndef X86_32_PICASM
%endif ;%ifdef X86_32
-%ifndef X86_32_PICASM
;***********************************************************************
;int32_t CavlcParamCal_sse42(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;***********************************************************************
@@ -524,17 +528,21 @@
push r5
push r6
%assign push_num 4
+%ifdef X86_32_PICASM
+ %define p_total_coeffs r1
+%else
%define p_total_coeffs r0
+%endif
%define r_tmp r1
%define r_tmpd r1d
%define r_tmpb r1b
%define p_level r2
%define p_coeff_level r3
+ %define p_run r6
%define r_mask r5
%define r_maskd r5d
- %define p_run r6
- %define p_shufb_lut wels_cavlc_param_cal_shufb_lut
- %define p_run_lut wels_cavlc_param_cal_run_lut
+ %define p_shufb_lut pic(wels_cavlc_param_cal_shufb_lut)
+ %define p_run_lut pic(wels_cavlc_param_cal_run_lut)
mov p_coeff_level, arg1
mov p_run, arg2
mov p_level, arg3
@@ -571,6 +579,7 @@
%define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
lea p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
%endif
+ INIT_X86_32_PIC_NOPRESERVE r0
; Acquire a bitmask indicating which words are non-zero.
; Assume p_coeff_level is 16-byte-aligned and at least 32 bytes if endIdx > 3.
@@ -588,7 +597,7 @@
.load_done:
movdqa [p_run], xmm1 ; Zero-initialize because we may read back implied zeros.
pcmpeqb xmm0, xmm1
- pshufb xmm0, [wels_shufb_rev]
+ pshufb xmm0, [pic(wels_shufb_rev)]
pmovmskb r_maskd, xmm0
xor r_maskd, 0FFFFh
%undef i_endidxd
@@ -605,12 +614,18 @@
%xdefine i_total_zeros p_total_coeffs
%endif
%undef p_total_coeffs
+%ifdef X86_32_PICASM
+ push r_tmp2
+ %undef i_total_zeros
+ %define i_total_zeros dword [esp]
+%else
mov i_total_zeros, r_tmp2
+%endif
jz .done
- mov i_total_zeros, 16
- sub i_total_zeros, r_tmp2
bsf r_tmpd, r_maskd ; Find first set bit.
- sub i_total_zeros, r_tmp
+ lea r_tmp2, [r_tmp2 + r_tmp - 16]
+ neg r_tmp2
+ mov i_total_zeros, r_tmp2
; Skip trailing zeros.
; Restrict to multiples of 4 to retain alignment and avoid out-of-bound stores.
and r_tmpd, -4
@@ -649,8 +664,13 @@
jnz .loop
.done:
%ifnidni retrq, i_total_zeros
+ %ifdef X86_32_PICASM
+ pop retrq
+ %else
mov retrq, i_total_zeros
+ %endif
%endif
+ DEINIT_X86_32_PIC
%ifdef X86_32
pop r6
pop r5
@@ -673,5 +693,3 @@
%undef r_tmp2d
%undef p_shufb_lut
%undef p_run_lut
-
-%endif ;ifndef X86_32_PICASM
--- a/codec/encoder/core/x86/intra_pred.asm
+++ b/codec/encoder/core/x86/intra_pred.asm
@@ -45,7 +45,11 @@
; Local Data (Read Only)
;***********************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
SECTION .rodata align=16
+%endif
align 16
sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
@@ -144,20 +148,7 @@
%macro COPY_16_TIMES 2
movdqa %2, [%1-16]
psrldq %2, 15
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmuludq %2, [esp]
- mov esp, r0
- pop r0
-%else
- pmuludq %2, [mmx_01bytes]
-%endif
+ pmuludq %2, [pic(mmx_01bytes)]
pshufd %2, %2, 0
%endmacro
@@ -164,20 +155,7 @@
%macro COPY_16_TIMESS 3
movdqa %2, [%1+%3-16]
psrldq %2, 15
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmuludq %2, [esp]
- mov esp, r0
- pop r0
-%else
- pmuludq %2, [mmx_01bytes]
-%endif
+ pmuludq %2, [pic(mmx_01bytes)]
pshufd %2, %2, 0
%endmacro
@@ -215,30 +193,16 @@
WELS_EXTERN WelsI4x4LumaPredH_sse2
push r3
%assign push_num 1
+ INIT_X86_32_PIC r4
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
movzx r3, byte [r1-1]
movd xmm0, r3d
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmuludq xmm0, [esp]
-%else
- pmuludq xmm0, [mmx_01bytes]
-%endif
+ pmuludq xmm0, [pic(mmx_01bytes)]
movzx r3, byte [r1+r2-1]
movd xmm1, r3d
-%ifdef X86_32_PICASM
- pmuludq xmm1, [esp]
-%else
- pmuludq xmm1, [mmx_01bytes]
-%endif
+ pmuludq xmm1, [pic(mmx_01bytes)]
unpcklps xmm0, xmm1
@@ -245,26 +209,17 @@
lea r1, [r1+r2*2]
movzx r3, byte [r1-1]
movd xmm2, r3d
-%ifdef X86_32_PICASM
- pmuludq xmm2, [esp]
-%else
- pmuludq xmm2, [mmx_01bytes]
-%endif
+ pmuludq xmm2, [pic(mmx_01bytes)]
movzx r3, byte [r1+r2-1]
movd xmm3, r3d
-%ifdef X86_32_PICASM
- pmuludq xmm3, [esp]
- mov esp, r0
- pop r0
-%else
- pmuludq xmm3, [mmx_01bytes]
-%endif
+ pmuludq xmm3, [pic(mmx_01bytes)]
unpcklps xmm2, xmm3
unpcklpd xmm0, xmm2
movdqa [r0], xmm0
+ DEINIT_X86_32_PIC
pop r3
ret
@@ -275,6 +230,7 @@
push r3
push r4
%assign push_num 2
+ INIT_X86_32_PIC r5
LOAD_3_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
@@ -284,34 +240,11 @@
;for H
pxor xmm7, xmm7
movq xmm0, [r1]
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xfffffff0
- push 0x00010002 ;sse2_plane_dec
- push 0x00030004
- push 0x00050006
- push 0x00070008
- push 0x00080007 ;sse_plane_inc
- push 0x00060005
- push 0x00040003
- push 0x00020001
- push 0x0000ffff ;sse_plane_inc_minus
- push 0xfffefffd
- push 0xfffcfffb
- push 0xfffafff9
- movdqa xmm5, [esp+32]
-%else
- movdqa xmm5, [sse2_plane_dec]
-%endif
+ movdqa xmm5, [pic(sse2_plane_dec)]
punpcklbw xmm0, xmm7
pmullw xmm0, xmm5
movq xmm1, [r1 + 9]
-%ifdef X86_32_PICASM
- movdqa xmm6, [esp+16]
-%else
- movdqa xmm6, [sse2_plane_inc]
-%endif
+ movdqa xmm6, [pic(sse2_plane_inc)]
punpcklbw xmm1, xmm7
pmullw xmm1, xmm6
psubw xmm1, xmm0
@@ -357,13 +290,7 @@
SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
xor r3, r3
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp]
- mov esp, r5
- pop r5
-%else
- movdqa xmm5, [sse2_plane_inc_minus]
-%endif
+ movdqa xmm5, [pic(sse2_plane_inc_minus)]
get_i16x16_luma_pred_plane_sse2_1:
movdqa xmm2, xmm1
@@ -382,6 +309,7 @@
cmp r3, 16
jnz get_i16x16_luma_pred_plane_sse2_1
POP_XMM
+ DEINIT_X86_32_PIC
pop r4
pop r3
ret
@@ -393,6 +321,7 @@
push r3
push r4
%assign push_num 2
+ INIT_X86_32_PIC r5
LOAD_3_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
@@ -401,30 +330,11 @@
pxor mm7, mm7
movq mm0, [r1]
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xfffffff0
- push 0x00010002 ;sse2_plane_dec_c
- push 0x00030004
- push 0x00040003 ;sse2_plane_inc_c
- push 0x00020001
- push 0x00040003 ;sse2_plane_mul_b_c
- push 0x00020001
- push 0x0000ffff
- push 0xfffefffd
- movq mm5, [esp+24]
-%else
- movq mm5, [sse2_plane_dec_c]
-%endif
+ movq mm5, [pic(sse2_plane_dec_c)]
punpcklbw mm0, mm7
pmullw mm0, mm5
movq mm1, [r1 + 5]
-%ifdef X86_32_PICASM
- movq mm6, [esp+16]
-%else
- movq mm6, [sse2_plane_inc_c]
-%endif
+ movq mm6, [pic(sse2_plane_inc_c)]
punpcklbw mm1, mm7
pmullw mm1, mm6
psubw mm1, mm0
@@ -474,13 +384,7 @@
SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
xor r3, r3
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp]
- mov esp, r5
- pop r5
-%else
- movdqa xmm5, [sse2_plane_mul_b_c]
-%endif
+ movdqa xmm5, [pic(sse2_plane_mul_b_c)]
get_i_chroma_pred_plane_sse2_1:
movdqa xmm2, xmm1
@@ -495,6 +399,7 @@
cmp r3, 8
jnz get_i_chroma_pred_plane_sse2_1
POP_XMM
+ DEINIT_X86_32_PIC
pop r4
pop r3
WELSEMMS
@@ -514,6 +419,7 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDR_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
movq mm1,[r1+r2-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
@@ -539,18 +445,7 @@
movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
pxor mm1,mm4 ;find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm1,[esp] ;set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm1,[mmx_01bytes] ;set the odd bit
-%endif
+ pand mm1,[pic(mmx_01bytes)] ;set the odd bit
psubusb mm3,mm1 ;decrease 1 from odd bytes
pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
@@ -561,6 +456,7 @@
movd [r0+4],mm2
psrlq mm2,8
movd [r0],mm2
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -619,20 +515,7 @@
psrlq %1, 38h
;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmullw %1, [esp]
- mov esp, r0
- pop r0
-%else
- pmullw %1, [mmx_01bytes]
-%endif
+ pmullw %1, [pic(mmx_01bytes)]
pshufw %1, %1, 0
movq [%4], %1
%endmacro
@@ -642,20 +525,7 @@
psrlq %1, 38h
;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmullw %1, [esp]
- mov esp, r0
- pop r0
-%else
- pmullw %1, [mmx_01bytes]
-%endif
+ pmullw %1, [pic(mmx_01bytes)]
pshufw %1, %1, 0
movq [%4], %1
%endmacro
@@ -662,6 +532,7 @@
WELS_EXTERN WelsIChromaPredH_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
movq mm0, [r1-8]
@@ -668,20 +539,7 @@
psrlq mm0, 38h
;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmullw mm0, [esp]
- mov esp, r0
- pop r0
-%else
- pmullw mm0, [mmx_01bytes]
-%endif
+ pmullw mm0, [pic(mmx_01bytes)]
pshufw mm0, mm0, 0
movq [r0], mm0
@@ -701,6 +559,7 @@
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+56
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -767,6 +626,7 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHD_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
sub r1, r2
@@ -791,18 +651,7 @@
pavgb mm1, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm4, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm4, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm4, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm1, mm4 ; decrease 1 from odd bytes
pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
@@ -824,6 +673,7 @@
movd [r0+8], mm3
psrlq mm3, 10h
movd [r0+4], mm3
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -855,6 +705,7 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHU_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
movd mm0, [r1-4] ; mm0[3] = l0
@@ -881,18 +732,7 @@
pavgb mm2, mm0
pxor mm5, mm0 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm5, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm5, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm5, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm2, mm5 ; decrease 1 from odd bytes
pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
@@ -912,6 +752,7 @@
movd [r0+4], mm1
psrlq mm1, 10h
movd [r0+8], mm1
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -947,6 +788,7 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVR_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
sub r1, r2
@@ -971,18 +813,7 @@
pavgb mm2, mm0
pxor mm3, mm0 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm3, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm3, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm3, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm2, mm3 ; decrease 1 from odd bytes
movq mm3, mm0
@@ -1011,6 +842,7 @@
psllq mm2, 8h
pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
movd [r0+12], mm5
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -1042,6 +874,7 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
sub r1, r2
@@ -1060,18 +893,7 @@
movq mm3, mm1
pavgb mm1, mm2
pxor mm3, mm2 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm3, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm3, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm3, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm1, mm3 ; decrease 1 from odd bytes
pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
@@ -1084,6 +906,7 @@
movd [r0+8], mm0
psrlq mm0, 8h
movd [r0+12], mm0
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -1119,6 +942,7 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVL_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
sub r1, r2
@@ -1135,18 +959,7 @@
movq mm4, mm2
pavgb mm2, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm4, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm4, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm4, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm2, mm4 ; decrease 1 from odd bytes
pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
@@ -1158,6 +971,7 @@
movd [r0+4], mm2
psrlq mm2, 8h
movd [r0+12], mm2
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -1169,6 +983,7 @@
push r3
push r4
%assign push_num 2
+ INIT_X86_32_PIC r5
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
sub r1, r2
@@ -1208,18 +1023,7 @@
movq mm1, mm2
paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x00000000
- push 0x00000002
- movq mm4, [esp]
- mov esp, r0
- pop r0
-%else
- movq mm4, [mmx_0x02]
-%endif
+ movq mm4, [pic(mmx_0x02)]
paddq mm0, mm4
psrlq mm0, 0x02
@@ -1235,32 +1039,13 @@
paddq mm1, mm4
psrlq mm1, 0x03
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmuludq mm0, [esp]
- pmuludq mm3, [esp]
-%else
- pmuludq mm0, [mmx_01bytes]
- pmuludq mm3, [mmx_01bytes]
-%endif
+ pmuludq mm0, [pic(mmx_01bytes)]
+ pmuludq mm3, [pic(mmx_01bytes)]
psllq mm0, 0x20
pxor mm0, mm3 ; mm0 = m_up
-%ifdef X86_32_PICASM
- pmuludq mm2, [esp]
- pmuludq mm1, [esp]
- mov esp, r0
- pop r0
-%else
- pmuludq mm2, [mmx_01bytes]
- pmuludq mm1, [mmx_01bytes]
-%endif
+ pmuludq mm2, [pic(mmx_01bytes)]
+ pmuludq mm1, [pic(mmx_01bytes)]
psllq mm1, 0x20
pxor mm1, mm2 ; mm1 = m_down
@@ -1274,6 +1059,7 @@
movq [r0+0x30], mm1
movq [r0+0x38], mm1
+ DEINIT_X86_32_PIC
pop r4
pop r3
WELSEMMS
@@ -1289,6 +1075,7 @@
push r3
push r4
%assign push_num 2
+ INIT_X86_32_PIC r5
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
sub r1, r2
@@ -1316,20 +1103,7 @@
movd xmm1, r3d
paddw xmm0, xmm1
psrld xmm0, 0x05
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmuludq xmm0, [esp]
- mov esp, r0
- pop r0
-%else
- pmuludq xmm0, [mmx_01bytes]
-%endif
+ pmuludq xmm0, [pic(mmx_01bytes)]
pshufd xmm0, xmm0, 0
movdqa [r0], xmm0
@@ -1349,6 +1123,7 @@
movdqa [r0+0xe0], xmm0
movdqa [r0+0xf0], xmm0
+ DEINIT_X86_32_PIC
pop r4
pop r3
ret
--- a/codec/encoder/core/x86/sample_sc.asm
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -34,7 +34,11 @@
;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
SECTION .rodata align=16
+%endif
ALIGN 16
mv_x_inc_x4 dw 0x10, 0x10, 0x10, 0x10
@@ -696,26 +700,12 @@
mov ebx, [height]
mov [i_height], ebx
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x00100010 ;mv_x_inc_x4
- push 0x00100010
- push 0x00040004 ;mv_y_inc_x4
- push 0x00040004
- push 0x000c0008 ;mx_x_offset_x4
- push 0x00040000
- movq xmm7, [esp+16] ; x_qpel inc
- movq xmm6, [esp+8] ; y_qpel inc
- movq xmm5, [esp] ; x_qpel vector
- mov esp, r0
- pop r0
-%else
- movq xmm7, [mv_x_inc_x4] ; x_qpel inc
- movq xmm6, [mv_y_inc_x4] ; y_qpel inc
- movq xmm5, [mx_x_offset_x4] ; x_qpel vector
-%endif
+ %assign push_num 5
+ INIT_X86_32_PIC_NOPRESERVE ecx
+ movq xmm7, [pic(mv_x_inc_x4)] ; x_qpel inc
+ movq xmm6, [pic(mv_y_inc_x4)] ; y_qpel inc
+ movq xmm5, [pic(mx_x_offset_x4)] ; x_qpel vector
+ DEINIT_X86_32_PIC
pxor xmm4, xmm4
pxor xmm3, xmm3 ; y_qpel vector
HASH_HEIGHT_LOOP_SSE2:
@@ -1415,24 +1405,9 @@
push r13
mov r12, r2
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x00100010 ;mv_x_inc_x4
- push 0x00100010
- push 0x00040004 ;mv_y_inc_x4
- push 0x00040004
- push 0x000c0008 ;mx_x_offset_x4
- push 0x00040000
- movq xmm7, [esp+16] ; x_qpel inc
- movq xmm6, [esp+8] ; y_qpel inc
- movq xmm5, [esp] ; x_qpel vector
-%else
movq xmm7, [mv_x_inc_x4] ; x_qpel inc
movq xmm6, [mv_y_inc_x4] ; y_qpel inc
movq xmm5, [mx_x_offset_x4] ; x_qpel vector
-%endif
pxor xmm4, xmm4
pxor xmm3, xmm3 ; y_qpel vector
HASH_HEIGHT_LOOP_SSE2:
--- a/codec/encoder/core/x86/score.asm
+++ b/codec/encoder/core/x86/score.asm
@@ -49,7 +49,11 @@
;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
SECTION .rodata align=16
+%endif
;align 16
;se2_2 dw 2, 2, 2, 2, 2, 2, 2, 2
@@ -200,6 +204,7 @@
;***********************************************************************
WELS_EXTERN WelsScan4x4DcAc_ssse3
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
movdqa xmm0, [r1]
movdqa xmm1, [r1+16]
@@ -207,29 +212,12 @@
pextrw r1d, xmm1, 0 ; eax = [8]
pinsrw xmm0, r1d, 7 ; xmm0[7] = [8]
pinsrw xmm1, r2d, 0 ; xmm1[0] = [7]
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x0d0c0706 ;pb_scanacdc_maska
- push 0x05040b0a
- push 0x0f0e0908
- push 0x03020100
- push 0x0f0e0d0c ;pb_scanacdc_maskb
- push 0x07060100
- push 0x05040b0a
- push 0x09080302
- pshufb xmm1, [esp]
- pshufb xmm0, [esp+16]
- mov esp, r0
- pop r0
-%else
- pshufb xmm1, [pb_scanacdc_maskb]
- pshufb xmm0, [pb_scanacdc_maska]
-%endif
+ pshufb xmm1, [pic(pb_scanacdc_maskb)]
+ pshufb xmm0, [pic(pb_scanacdc_maska)]
movdqa [r0],xmm0
movdqa [r0+16], xmm1
+ DEINIT_X86_32_PIC
ret
;***********************************************************************
;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
@@ -268,7 +256,6 @@
ret
-%ifndef X86_32_PICASM
;***********************************************************************
;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;***********************************************************************
@@ -279,6 +266,7 @@
%else
%assign push_num 0
%endif
+ INIT_X86_32_PIC r4
LOAD_1_PARA
movdqa xmm0, [r0]
movdqa xmm1, [r0+16]
@@ -309,16 +297,17 @@
.find1end:
sub r1, r2
sub r1, 1
- lea r2, [i_ds_table]
+ lea r2, [pic(i_ds_table)]
add r0b, [r2+r1]
mov r1, r3
and r3, 0xff
shr r1, 8
and r1, 0xff
- lea r2 , [low_mask_table]
+ lea r2 , [pic(low_mask_table)]
add r0b, [r2 +r3]
- lea r2, [high_mask_table]
+ lea r2, [pic(high_mask_table)]
add r0b, [r2+r1]
+ DEINIT_X86_32_PIC
%ifdef X86_32
pop r3
%else
@@ -325,15 +314,14 @@
mov retrd, r0d
%endif
ret
-%endif ;ifndef X86_32_PICASM
-%ifndef X86_32_PICASM
;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;***********************************************************************
WELS_EXTERN WelsGetNoneZeroCount_sse2
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_1_PARA
movdqa xmm0, [r0]
movdqa xmm1, [r0+16]
@@ -350,14 +338,14 @@
; and ecx, 0xff ; we do not need this due to high 16bits equal to 0 yet
; xor retr, retr
;add al, [nozero_count_table+r2]
- lea r0 , [nozero_count_table]
+ lea r0 , [pic(nozero_count_table)]
movzx r2, byte [r0+r2]
movzx r1, byte [r0+r1]
mov retrq, r2
add retrq, r1
;add al, [nozero_count_table+r1]
+ DEINIT_X86_32_PIC
ret
-%endif ;%ifndef X86_32_PICASM
;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse42(int16_t* level);
--- a/codec/processing/src/x86/denoisefilter.asm
+++ b/codec/processing/src/x86/denoisefilter.asm
@@ -44,7 +44,11 @@
;***********************************************************************
; Constant
;***********************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
SECTION .rodata align=16
+%endif
sse2_32 times 8 dw 32
sse2_20 times 8 dw 20
@@ -147,20 +151,7 @@
movdqa %2, %1
psrldq %2, 2
punpcklbw %2, %4
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x00140014
- push 0x00140014
- push 0x00140014
- push 0x00140014
- pmullw %2, [esp]
- mov esp, r0
- pop r0
-%else
- pmullw %2, [sse2_20]
-%endif
+ pmullw %2, [pic(sse2_20)]
paddw %3, %2
movdqa %2, %1
@@ -254,6 +245,7 @@
%assign push_num 1
+ INIT_X86_32_PIC r4
LOAD_2_PARA
mov r3, r1
@@ -285,6 +277,7 @@
movq [r0 + 2], xmm3
+ DEINIT_X86_32_PIC
pop r3
%assign push_num 0
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -57,7 +57,11 @@
; Local Data (Read Only)
;***********************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=32
+%else
SECTION .rodata align=32
+%endif
;***********************************************************************
; Various memory constants (trigonometric values or rounding values)
@@ -64,6 +68,7 @@
;***********************************************************************
ALIGN 32
+%ifndef X86_32_PICASM
db80h_256:
times 32 db 80h
shufb_0000000088888888:
@@ -74,6 +79,7 @@
times 4 db 4
times 4 db 8
times 4 db 12
+%endif
shufb_mask_low:
db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
shufb_mask_high:
@@ -1253,20 +1259,7 @@
pmaddwd xmm2, xmm1
pshufd xmm1, xmm2, 00000001b
paddd xmm2, xmm1
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0x00000000
- push 0x00000000
- push 0x00000000
- push 0x00004000
- movdqa xmm1, [esp]
- mov esp, r0
- pop r0
-%else
movdqa xmm1, [add_extra_half]
-%endif
paddd xmm2, xmm1
psrld xmm2, 15
@@ -1567,20 +1560,7 @@
pmaddwd xmm2, xmm1
pshufd xmm1, xmm2, 00000001b
paddd xmm2, xmm1
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0x00000000
- push 0x00000000
- push 0x00000000
- push 0x00004000
- movdqa xmm1, [esp]
- mov esp, r0
- pop r0
-%else
movdqa xmm1, [add_extra_half]
-%endif
paddd xmm2, xmm1
psrld xmm2, 15
@@ -1657,6 +1637,12 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
+%ifdef X86_32_PICASM
+ %define i_height dword arg6
+%else
+ %define i_height r5
+%endif
+ INIT_X86_32_PIC_NOPRESERVE r5
%ifndef X86_32
push r12
@@ -1664,7 +1650,7 @@
%endif
mov r6, r1 ;Save the tailer for the unasigned size
- imul r6, r5
+ imul r6, i_height
add r6, r0
movdqa xmm7, [r6]
@@ -1697,52 +1683,15 @@
;1st line
movdqa xmm0, [r2] ;F * e E * d D * c C * b B * a A
movdqa xmm1, xmm0
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x80808080 ;shufb_mask_onethird_low_1
- push 0x80808080
- push 0x80800f0c
- push 0x09060300
- push 0x80808080 ;shufb_mask_onethird_high_1
- push 0x80808080
- push 0x8080800d
- push 0x0a070401
- push 0x80808080 ;shufb_mask_onethird_low_2
- push 0x800e0b08
- push 0x05028080
- push 0x80808080
- push 0x80808080 ;shufb_mask_onethird_high_2
- push 0x800f0c09
- push 0x06030080
- push 0x80808080
- push 0x0d0a0704 ;shufb_mask_onethird_low_3
- push 0x01808080
- push 0x80808080
- push 0x80808080
- push 0x0e0b0805 ;shufb_mask_onethird_high_3
- push 0x02808080
- push 0x80808080
- push 0x80808080
- movdqa xmm5, [esp+80]
- movdqa xmm6, [esp+64]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_1]
- movdqa xmm6, [shufb_mask_onethird_high_1]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
pshufb xmm0, xmm5 ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
pshufb xmm1, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
movdqa xmm2, [r2+16] ;k K * j J * i I * h H * g G * f
movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+48]
- movdqa xmm6, [esp+32]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_2]
- movdqa xmm6, [shufb_mask_onethird_high_2]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
pshufb xmm2, xmm5 ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
pshufb xmm3, xmm6 ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
@@ -1751,13 +1700,8 @@
movdqa xmm2, [r2+32] ;* p P * o O * n N * m M * l L *
movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+16]
- movdqa xmm6, [esp]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_3]
- movdqa xmm6, [shufb_mask_onethird_high_3]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
pshufb xmm2, xmm5 ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
pshufb xmm3, xmm6 ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
@@ -1768,25 +1712,15 @@
;2nd line
movdqa xmm2, [r2+r3] ;F' * e' E' * d' D' * c' C' * b' B' * a' A'
movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+80]
- movdqa xmm6, [esp+64]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_1]
- movdqa xmm6, [shufb_mask_onethird_high_1]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
pshufb xmm2, xmm5 ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
pshufb xmm3, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e' d' c' b' a' -> xmm3
movdqa xmm1, [r2+r3+16] ;k' K' * j' J' * i' I' * h' H' * g' G' * f'
movdqa xmm4, xmm1
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+48]
- movdqa xmm6, [esp+32]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_2]
- movdqa xmm6, [shufb_mask_onethird_high_2]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
pshufb xmm1, xmm5 ;0 0 0 0 0 K' J' I' H' G' 0 0 0 0 0 0 -> xmm1
pshufb xmm4, xmm6 ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
@@ -1795,15 +1729,8 @@
movdqa xmm1, [r2+r3+32] ; * p' P' * o' O' * n' N' * m' M' * l' L' *
movdqa xmm4, xmm1
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+16]
- movdqa xmm6, [esp]
- mov esp, r0
- pop r0
-%else
- movdqa xmm5, [shufb_mask_onethird_low_3]
- movdqa xmm6, [shufb_mask_onethird_high_3]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
pshufb xmm1, xmm5 ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
pshufb xmm4, xmm6 ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
@@ -1832,7 +1759,7 @@
lea r0, [r0+r1]
lea r0, [r0+r6] ;current dst lien + 1 line
- dec r5
+ dec i_height
jg near .yloops_onethird_sse3
movdqa [r0], xmm7 ;restore the tailer for the unasigned size
@@ -1841,6 +1768,7 @@
pop r12
%endif
+ DEINIT_X86_32_PIC
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -1847,6 +1775,7 @@
pop r6
%endif
ret
+%undef i_height
;***********************************************************************
; void DyadicBilinearOneThirdDownsampler_sse4( unsigned char* pDst, const int iDstStride,
@@ -1866,6 +1795,12 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
+%ifdef X86_32_PICASM
+ %define i_height dword arg6
+%else
+ %define i_height r5
+%endif
+ INIT_X86_32_PIC_NOPRESERVE r5
%ifndef X86_32
push r12
@@ -1873,7 +1808,7 @@
%endif
mov r6, r1 ;Save the tailer for the unasigned size
- imul r6, r5
+ imul r6, i_height
add r6, r0
movdqa xmm7, [r6]
@@ -1906,52 +1841,15 @@
;1st line
movntdqa xmm0, [r2] ;F * e E * d D * c C * b B * a A
movdqa xmm1, xmm0
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x80808080 ;shufb_mask_onethird_low_1
- push 0x80808080
- push 0x80800f0c
- push 0x09060300
- push 0x80808080 ;shufb_mask_onethird_high_1
- push 0x80808080
- push 0x8080800d
- push 0x0a070401
- push 0x80808080 ;shufb_mask_onethird_low_2
- push 0x800e0b08
- push 0x05028080
- push 0x80808080
- push 0x80808080 ;shufb_mask_onethird_high_2
- push 0x800f0c09
- push 0x06030080
- push 0x80808080
- push 0x0d0a0704 ;shufb_mask_onethird_low_3
- push 0x01808080
- push 0x80808080
- push 0x80808080
- push 0x0e0b0805 ;shufb_mask_onethird_high_3
- push 0x02808080
- push 0x80808080
- push 0x80808080
- movdqa xmm5, [esp+80]
- movdqa xmm6, [esp+64]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_1]
- movdqa xmm6, [shufb_mask_onethird_high_1]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
pshufb xmm0, xmm5 ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
pshufb xmm1, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
movntdqa xmm2, [r2+16] ;k K * j J * i I * h H * g G * f
movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+48]
- movdqa xmm6, [esp+32]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_2]
- movdqa xmm6, [shufb_mask_onethird_high_2]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
pshufb xmm2, xmm5 ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
pshufb xmm3, xmm6 ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
@@ -1960,13 +1858,8 @@
movntdqa xmm2, [r2+32] ;* p P * o O * n N * m M * l L *
movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+16]
- movdqa xmm6, [esp]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_3]
- movdqa xmm6, [shufb_mask_onethird_high_3]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
pshufb xmm2, xmm5 ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
pshufb xmm3, xmm6 ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
@@ -1977,25 +1870,15 @@
;2nd line
movntdqa xmm2, [r2+r3] ;F' * e' E' * d' D' * c' C' * b' B' * a' A'
movdqa xmm3, xmm2
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+80]
- movdqa xmm6, [esp+64]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_1]
- movdqa xmm6, [shufb_mask_onethird_high_1]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
pshufb xmm2, xmm5 ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
pshufb xmm3, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e' d' c' b' a' -> xmm3
movntdqa xmm1, [r2+r3+16] ;k' K' * j' J' * i' I' * h' H' * g' G' * f'
movdqa xmm4, xmm1
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+48]
- movdqa xmm6, [esp+32]
-%else
- movdqa xmm5, [shufb_mask_onethird_low_2]
- movdqa xmm6, [shufb_mask_onethird_high_2]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
pshufb xmm1, xmm5 ;0 0 0 0 0 K' J' I' H' G' 0 0 0 0 0 0 -> xmm1
pshufb xmm4, xmm6 ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
@@ -2004,15 +1887,8 @@
movntdqa xmm1, [r2+r3+32] ; * p' P' * o' O' * n' N' * m' M' * l' L' *
movdqa xmm4, xmm1
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp+16]
- movdqa xmm6, [esp]
- mov esp, r0
- pop r0
-%else
- movdqa xmm5, [shufb_mask_onethird_low_3]
- movdqa xmm6, [shufb_mask_onethird_high_3]
-%endif
+ movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
+ movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
pshufb xmm1, xmm5 ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
pshufb xmm4, xmm6 ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
@@ -2041,7 +1917,7 @@
lea r0, [r0+r1]
lea r0, [r0+r6] ;current dst lien + 1 line
- dec r5
+ dec i_height
jg near .yloops_onethird_sse4
movdqa [r0], xmm7 ;restore the tailer for the unasigned size
@@ -2050,6 +1926,7 @@
pop r12
%endif
+ DEINIT_X86_32_PIC
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
@@ -2056,6 +1933,7 @@
pop r6
%endif
ret
+%undef i_height
;***********************************************************************
; void DyadicBilinearQuarterDownsampler_sse( unsigned char* pDst, const int iDstStride,
@@ -2256,20 +2134,10 @@
add r6, r0
movq xmm7, [r6]
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x80808080
- push 0x0d090501
- push 0x80808080
- push 0x0c080400
- movdqa xmm6, [esp]
- mov esp, r0
- pop r0
-%else
- movdqa xmm6, [shufb_mask_quarter]
-%endif
+ INIT_X86_32_PIC_NOPRESERVE r4
+ movdqa xmm6, [pic(shufb_mask_quarter)]
+ DEINIT_X86_32_PIC
+
.yloops_quarter_sse3:
;mov eax, [esp+40] ; iSrcWidth
;sar eax, $02 ; iSrcWidth >> 2
@@ -2378,20 +2246,9 @@
add r6, r0
movq xmm7, [r6]
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x80808080
- push 0x0d090501
- push 0x80808080
- push 0x0c080400
- movdqa xmm6, [esp]
- mov esp, r0
- pop r0
-%else
- movdqa xmm6, [shufb_mask_quarter] ;mask
-%endif
+ INIT_X86_32_PIC_NOPRESERVE r4
+ movdqa xmm6, [pic(shufb_mask_quarter)] ;mask
+ DEINIT_X86_32_PIC
.yloops_quarter_sse4:
%ifdef X86_32
@@ -2534,20 +2391,7 @@
%macro SSSE3_BilinearFastDownsample4xOrLess_8px 0
movdqa xmm_tmp0, xmm_xpos_int
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x08080808
- push 0x08080808
- push 0x00000000
- push 0x00000000
- pshufb xmm_tmp0, [esp]
- mov esp, r0
- pop r0
-%else
- pshufb xmm_tmp0, [shufb_0000000088888888]
-%endif
+ pshufb xmm_tmp0, xmm_shufb_0000000088888888
psubb xmm_xpos_int, xmm_tmp0
SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac
mov r_tmp0, i_xpos
@@ -2555,24 +2399,7 @@
lddqu xmm_tmp3, [p_src_row0 + r_tmp0]
lddqu xmm_tmp4, [p_src_row1 + r_tmp0]
movdqa xmm_tmp2, xmm_xpos_int
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x80808080 ;db80h_256
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- punpcklbw xmm_tmp2, [esp]
- mov esp, r5
- pop r5
-%else
- punpcklbw xmm_tmp2, [db80h_256]
-%endif
+ punpcklbw xmm_tmp2, xmm_db80h
pshufb xmm_tmp3, xmm_tmp2
pshufb xmm_tmp4, xmm_tmp2
SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
@@ -2585,24 +2412,7 @@
lddqu xmm_tmp3, [p_src_row0 + r_tmp0]
lddqu xmm_tmp4, [p_src_row1 + r_tmp0]
movdqa xmm_tmp2, xmm_xpos_int
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x80808080 ;db80h_256
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- punpckhbw xmm_tmp2, [esp]
- mov esp, r5
- pop r5
-%else
- punpckhbw xmm_tmp2, [db80h_256]
-%endif
+ punpckhbw xmm_tmp2, xmm_db80h
pshufb xmm_tmp3, xmm_tmp2
pshufb xmm_tmp4, xmm_tmp2
SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
@@ -2741,43 +2551,13 @@
%macro SSE41_BilinearAccurateDownsample4xOrLess_8px 0
movdqa xmm_tmp0, xmm_xpos_int
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x08080808
- push 0x08080808
- push 0x00000000
- push 0x00000000
- pshufb xmm_tmp0, [esp]
- mov esp, r0
- pop r0
-%else
- pshufb xmm_tmp0, [shufb_0000000088888888]
-%endif
+ pshufb xmm_tmp0, xmm_shufb_0000000088888888
psubb xmm_xpos_int, xmm_tmp0
SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
mov r_tmp0, i_xpos
shr r_tmp0, 16
movdqa xmm_tmp3, xmm_xpos_int
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x80808080 ;db80h_256
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- punpcklbw xmm_tmp3, [esp]
- mov esp, r5
- pop r5
-%else
- punpcklbw xmm_tmp3, [db80h_256]
-%endif
+ punpcklbw xmm_tmp3, xmm_db80h
lddqu xmm_tmp4, [p_src_row0 + r_tmp0]
lddqu xmm_tmp2, [p_src_row1 + r_tmp0]
lea r_tmp0, [i_xpos + 4 * i_scalex]
@@ -2789,24 +2569,7 @@
pmaddwd xmm_tmp2, xmm_tmp0
SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
movdqa xmm_tmp2, xmm_xpos_int
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x80808080 ;db80h_256
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- punpckhbw xmm_tmp2, [esp]
- mov esp, r5
- pop r5
-%else
- punpckhbw xmm_tmp2, [db80h_256]
-%endif
+ punpckhbw xmm_tmp2, xmm_db80h
lddqu xmm_tmp4, [p_src_row0 + r_tmp0]
lddqu xmm_tmp3, [p_src_row1 + r_tmp0]
pshufb xmm_tmp4, xmm_tmp2
@@ -2987,7 +2750,11 @@
movd xmm0, arg8
movd xmm1, esp
and esp, -16
+%ifdef X86_32_PICASM
+ sub esp, 8 * 4 + 9 * 16
+%else
sub esp, 8 * 4 + 7 * 16
+%endif
movd [esp], xmm1
%define p_dst r0
%define i_dst_stride_less_width [esp + 1 * 4]
@@ -3021,6 +2788,22 @@
%define xmm_0 [esp + 8 * 4 + 4 * 16]
%define xmm_xpos_int_begin [esp + 8 * 4 + 5 * 16]
%define xmm_xpos_frac_begin [esp + 8 * 4 + 6 * 16]
+%ifdef X86_32_PICASM
+ %define xmm_db80h [esp + 8 * 4 + 7 * 16]
+ %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 8 * 16]
+ pxor xmm_tmp4, xmm_tmp4
+ pcmpeqb xmm_tmp5, xmm_tmp5
+ psubb xmm_tmp4, xmm_tmp5
+ movdqa xmm_tmp3, xmm_tmp4
+ psllw xmm_tmp3, 3
+ pslldq xmm_tmp3, 8
+ movdqa xmm_shufb_0000000088888888, xmm_tmp3
+ psllw xmm_tmp4, 7
+ movdqa xmm_db80h, xmm_tmp4
+%else
+ %define xmm_db80h [db80h_256]
+ %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+%endif
mov i_dst_stride_less_width, r1
mov i_dst_width, r2
mov i_dst_height, r3
@@ -3067,6 +2850,8 @@
%define xmm_tmp5 xmm6
%define xmm_xpos_int_begin xmm14
%define xmm_xpos_frac_begin xmm15
+ %define xmm_db80h [db80h_256]
+ %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
pxor xmm_0, xmm_0
%endif
@@ -3230,6 +3015,8 @@
%undef xmm_xfrac0_begin
%undef xmm_xfrac1_begin
%undef xmm_xfrac_inc
+%undef xmm_db80h
+%undef xmm_shufb_0000000088888888
;**************************************************************************************************************
;void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
@@ -3265,7 +3052,11 @@
movd xmm0, arg8
movd xmm1, esp
and esp, -16
+%ifdef X86_32_PICASM
+ sub esp, 8 * 4 + 10 * 16
+%else
sub esp, 8 * 4 + 8 * 16
+%endif
movd [esp], xmm1
%define p_dst r0
%define i_dst_stride_less_width [esp + 1 * 4]
@@ -3300,6 +3091,22 @@
%define xmm_7fff [esp + 8 * 4 + 5 * 16]
%define xmm_xpos_int_begin [esp + 8 * 4 + 6 * 16]
%define xmm_xpos_frac_begin [esp + 8 * 4 + 7 * 16]
+%ifdef X86_32_PICASM
+ %define xmm_db80h [esp + 8 * 4 + 8 * 16]
+ %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 9 * 16]
+ pxor xmm_tmp4, xmm_tmp4
+ pcmpeqb xmm_tmp5, xmm_tmp5
+ psubb xmm_tmp4, xmm_tmp5
+ movdqa xmm_tmp3, xmm_tmp4
+ psllw xmm_tmp3, 3
+ pslldq xmm_tmp3, 8
+ movdqa xmm_shufb_0000000088888888, xmm_tmp3
+ psllw xmm_tmp4, 7
+ movdqa xmm_db80h, xmm_tmp4
+%else
+ %define xmm_db80h [db80h_256]
+ %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+%endif
mov i_dst_stride_less_width, r1
mov i_dst_width, r2
mov i_dst_height, r3
@@ -3350,6 +3157,8 @@
%define xmm_7fff xmm13
%define xmm_xpos_int_begin xmm14
%define xmm_xpos_frac_begin xmm15
+ %define xmm_db80h [db80h_256]
+ %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
pxor xmm_0, xmm_0
pcmpeqw xmm_7fff, xmm_7fff
psrlw xmm_7fff, 1
@@ -3517,6 +3326,8 @@
%undef xmm_xfrac0_begin
%undef xmm_xfrac1_begin
%undef xmm_xfrac_inc
+%undef xmm_db80h
+%undef xmm_shufb_0000000088888888
%ifdef HAVE_AVX2
; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5
@@ -3585,20 +3396,7 @@
%endmacro
%macro AVX2_BilinearFastDownsample4xOrLess_16px 0
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x08080808
- push 0x08080808
- push 0x00000000
- push 0x00000000
- vbroadcasti128 ymm_tmp0, [esp]
- mov esp, r0
- pop r0
-%else
- vbroadcasti128 ymm_tmp0, [shufb_0000000088888888]
-%endif
+ vbroadcasti128 ymm_tmp0, xmm_shufb_0000000088888888
vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
@@ -3642,20 +3440,7 @@
%endmacro
%macro AVX2_BilinearFastDownsample8xOrLess_16px 0
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xffffffe0
- push 0x0c0c0c0c
- push 0x08080808
- push 0x04040404
- push 0x00000000
- vbroadcasti128 ymm_tmp0, [esp]
- mov esp, r0
- pop r0
-%else
- vbroadcasti128 ymm_tmp0, [shufb_000044448888CCCC]
-%endif
+ vbroadcasti128 ymm_tmp0, xmm_shufb_000044448888CCCC
vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
mov r_tmp0, i_xpos
@@ -3894,20 +3679,7 @@
%endmacro
%macro AVX2_BilinearAccurateDownsample4xOrLess_16px 0
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x08080808 ;shufb_0000000088888888
- push 0x08080808
- push 0x00000000
- push 0x00000000
- vbroadcasti128 ymm_tmp0, [esp]
- mov esp, r5
- pop r5
-%else
- vbroadcasti128 ymm_tmp0, [shufb_0000000088888888]
-%endif
+ vbroadcasti128 ymm_tmp0, xmm_shufb_0000000088888888
vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
@@ -3922,24 +3694,7 @@
lea r_tmp0, [i_xpos + 2 * i_scalex2]
lea i_xpos, [r_tmp0 + 4 * i_scalex2]
shr r_tmp0, 16
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x80808080 ;db80h_256
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- vpunpcklbw ymm_tmp3, ymm_xpos_int, [esp]
- mov esp, r5
- pop r5
-%else
- vpunpcklbw ymm_tmp3, ymm_xpos_int, [db80h_256]
-%endif
+ vpunpcklbw ymm_tmp3, ymm_xpos_int, ymm_db80h
vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
vpshufb ymm_tmp2, ymm_tmp2, ymm_tmp3
vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp0
@@ -3952,24 +3707,7 @@
shr r_tmp0, 16
vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
vinserti128 ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x80808080 ;db80h_256
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- vpunpckhbw ymm_tmp3, ymm_xpos_int, [esp]
- mov esp, r5
- pop r5
-%else
- vpunpckhbw ymm_tmp3, ymm_xpos_int, [db80h_256]
-%endif
+ vpunpckhbw ymm_tmp3, ymm_xpos_int, ymm_db80h
vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
vpshufb ymm_tmp2, ymm_tmp2, ymm_tmp3
vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp1
@@ -3985,20 +3723,7 @@
%endmacro
%macro AVX2_BilinearAccurateDownsample8xOrLess_16px 0
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x0c0c0c0c ;shufb_000044448888cccc
- push 0x08080808
- push 0x04040404
- push 0x00000000
- vbroadcasti128 ymm_tmp0, [esp]
- mov esp, r5
- pop r5
-%else
- vbroadcasti128 ymm_tmp0, [shufb_000044448888CCCC]
-%endif
+ vbroadcasti128 ymm_tmp0, xmm_shufb_000044448888CCCC
vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
mov r_tmp0, i_xpos
@@ -4019,24 +3744,7 @@
shr r_tmp0, 16
vinserti128 ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1
vinserti128 ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xffffffe0
- push 0x80808080 ;db80h_256
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- vpunpcklbw ymm_tmp3, ymm_xpos_int, [esp]
- mov esp, r5
- pop r5
-%else
- vpunpcklbw ymm_tmp3, ymm_xpos_int, [db80h_256]
-%endif
+ vpunpcklbw ymm_tmp3, ymm_xpos_int, ymm_db80h
vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
vpshufb ymm_tmp5, ymm_tmp5, ymm_tmp3
vpshufb ymm_tmp0, ymm_tmp0, ymm_tmp3
@@ -4313,7 +4021,11 @@
vmovd xmm0, arg8
vmovd xmm1, esp
and esp, -32
+%ifdef X86_32_PICASM
+ sub esp, 8 * 4 + 9 * 32
+%else
sub esp, 8 * 4 + 8 * 32
+%endif
vmovd [esp], xmm1
%define p_dst r0
%define i_dst_stride_less_width [esp + 1 * 4]
@@ -4354,6 +4066,22 @@
%define ymm_ffff [esp + 8 * 4 + 5 * 32]
%define ymm_xpos_int_begin [esp + 8 * 4 + 6 * 32]
%define ymm_xpos_frac_begin [esp + 8 * 4 + 7 * 32]
+%ifdef X86_32_PICASM
+ %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 8 * 32]
+ %define xmm_shufb_000044448888CCCC [esp + 8 * 4 + 8 * 32 + 16]
+ vpxor ymm_tmp4, ymm_tmp4, ymm_tmp4
+ vpcmpeqb ymm_tmp5, ymm_tmp5, ymm_tmp5
+ vpsubb ymm_tmp4, ymm_tmp4, ymm_tmp5
+ vpsllw ymm_tmp3, ymm_tmp4, 3
+ vpslldq ymm_tmp3, ymm_tmp3, 8
+ vmovdqa xmm_shufb_0000000088888888, xmm_tmp3
+ vpsllq ymm_tmp5, ymm_tmp4, 34
+ vpaddb ymm_tmp5, ymm_tmp5, ymm_tmp3
+ vmovdqa xmm_shufb_000044448888CCCC, xmm_tmp5
+%else
+ %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+ %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
+%endif
mov i_dst_stride_less_width, r1
mov i_dst_width, r2
mov i_dst_height, r3
@@ -4409,6 +4137,8 @@
%define ymm_ffff ymm13
%define ymm_xpos_int_begin ymm14
%define ymm_xpos_frac_begin ymm15
+ %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+ %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
vpxor ymm_0, ymm_0, ymm_0
vpcmpeqw ymm_ffff, ymm_ffff, ymm_ffff
%endif
@@ -4597,6 +4327,8 @@
%undef ymm_xfrac0_begin
%undef ymm_xfrac1_begin
%undef ymm_xfrac_inc
+%undef xmm_shufb_0000000088888888
+%undef xmm_shufb_000044448888CCCC
;**************************************************************************************************************
;void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
@@ -4632,7 +4364,11 @@
vmovd xmm0, arg8
vmovd xmm1, esp
and esp, -32
+%ifdef X86_32_PICASM
+ sub esp, 8 * 4 + 10 * 32
+%else
sub esp, 8 * 4 + 8 * 32
+%endif
vmovd [esp], xmm1
%define p_dst r0
%define i_dst_stride_less_width [esp + 1 * 4]
@@ -4673,6 +4409,26 @@
%define ymm_7fff [esp + 8 * 4 + 5 * 32]
%define ymm_xpos_int_begin [esp + 8 * 4 + 6 * 32]
%define ymm_xpos_frac_begin [esp + 8 * 4 + 7 * 32]
+%ifdef X86_32_PICASM
+ %define ymm_db80h [esp + 8 * 4 + 8 * 32]
+ %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 9 * 32]
+ %define xmm_shufb_000044448888CCCC [esp + 8 * 4 + 9 * 32 + 16]
+ vpxor ymm_tmp4, ymm_tmp4, ymm_tmp4
+ vpcmpeqb ymm_tmp5, ymm_tmp5, ymm_tmp5
+ vpsubb ymm_tmp4, ymm_tmp4, ymm_tmp5
+ vpsllw ymm_tmp3, ymm_tmp4, 3
+ vpslldq ymm_tmp3, ymm_tmp3, 8
+ vmovdqa xmm_shufb_0000000088888888, xmm_tmp3
+ vpsllq ymm_tmp5, ymm_tmp4, 34
+ vpaddb ymm_tmp5, ymm_tmp5, ymm_tmp3
+ vmovdqa xmm_shufb_000044448888CCCC, xmm_tmp5
+ vpsllw ymm_tmp4, ymm_tmp4, 7
+ vmovdqa ymm_db80h, ymm_tmp4
+%else
+ %define ymm_db80h [db80h_256]
+ %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+ %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
+%endif
mov i_dst_stride_less_width, r1
mov i_dst_width, r2
mov i_dst_height, r3
@@ -4729,6 +4485,9 @@
%define ymm_7fff ymm13
%define ymm_xpos_int_begin ymm14
%define ymm_xpos_frac_begin ymm15
+ %define ymm_db80h [db80h_256]
+ %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
+ %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
vpxor ymm_0, ymm_0, ymm_0
vpcmpeqw ymm_7fff, ymm_7fff, ymm_7fff
vpsrlw ymm_7fff, ymm_7fff, 1
@@ -4920,5 +4679,8 @@
%undef ymm_xfrac0_begin
%undef ymm_xfrac1_begin
%undef ymm_xfrac_inc
-%endif
+%undef ymm_db80h
+%undef xmm_shufb_0000000088888888
+%undef xmm_shufb_000044448888CCCC
+%endif
--- a/test/encoder/EncUT_Cavlc.cpp
+++ b/test/encoder/EncUT_Cavlc.cpp
@@ -77,18 +77,14 @@
}
#ifdef X86_32_ASM
-#ifndef X86_32_PICASM
TEST (CavlcTest, CavlcParamCal_sse2) {
TestCavlcParamCal (CavlcParamCal_sse2);
}
#endif
-#endif
#ifdef X86_ASM
-#ifndef X86_32_PICASM
TEST (CavlcTest, CavlcParamCal_sse42) {
if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)
TestCavlcParamCal (CavlcParamCal_sse42);
}
-#endif
#endif
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -222,7 +222,6 @@
}
#endif //HAVE_AVX2
-#ifndef X86_32_PICASM
TEST (EncodeMbAuxTest, WelsCalculateSingleCtr4x4_sse2) {
CMemoryAlign cMemoryAlign (0);
ALLOC_MEMORY (int16_t, iDctC, 16);
@@ -236,7 +235,6 @@
FREE_MEMORY (iDctC);
FREE_MEMORY (iDctS);
}
-#endif //#ifndef X86_32_PICASM
#endif
void copy (uint8_t* pDst, int32_t iDStride, uint8_t* pSrc, int32_t iSStride, int32_t iWidth, int32_t iHeight) {
@@ -304,11 +302,9 @@
TestGetNoneZeroCount (WelsGetNoneZeroCount_c);
}
#ifdef X86_ASM
-#ifndef X86_32_PICASM
TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) {
TestGetNoneZeroCount (WelsGetNoneZeroCount_sse2);
}
-#endif
TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse42) {
if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)
TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42);