ref: c82c19022b5a0bed5845a4e6dcb03a36991224b9
parent: f36959bf4b897b66e54bf5e236618aaff517ec67
author: Sindre Aamås <[email protected]>
date: Tue Mar 7 09:19:18 EST 2017
[Decoder/x86] Simplify intra_pred X86_32_PICASM handling

Utilize program counter-relative offsets to simplify X86_32_PICASM code.

In order for this to work with nasm, data constants are placed in the text segment.
--- a/codec/decoder/core/x86/intra_pred.asm
+++ b/codec/decoder/core/x86/intra_pred.asm
@@ -49,7 +49,11 @@
; Local Data (Read Only)
;*******************************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
SECTION .rodata align=16
+%endif
align 16
sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
@@ -132,20 +136,7 @@
%macro COPY_16_TIMES 2
movdqa %2, [%1-16]
psrldq %2, 15
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xfffffff0
- push 0x01010101 ;mmx_01bytes
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmuludq %2, [esp]
- mov esp, r5
- pop r5
-%else
- pmuludq %2, [mmx_01bytes]
-%endif
+ pmuludq %2, [pic(mmx_01bytes)]
pshufd %2, %2, 0
%endmacro
@@ -152,20 +143,7 @@
%macro COPY_16_TIMESS 3
movdqa %2, [%1+%3-16]
psrldq %2, 15
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xfffffff0
- push 0x01010101 ;mmx_01bytes
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmuludq %2, [esp]
- mov esp, r5
- pop r5
-%else
- pmuludq %2, [mmx_01bytes]
-%endif
+ pmuludq %2, [pic(mmx_01bytes)]
pshufd %2, %2, 0
%endmacro
@@ -203,52 +181,26 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
-%ifdef X86_32_PICASM
- push r3
- mov r3, esp
- and esp, 0xfffffff0
- push 0x01010101 ;mmx_01bytes
- push 0x01010101
- push 0x01010101
- push 0x01010101
-%endif
movzx r2, byte [r0-1]
movd xmm0, r2d
-%ifdef X86_32_PICASM
- pmuludq xmm0, [esp]
-%else
- pmuludq xmm0, [mmx_01bytes]
-%endif
+ pmuludq xmm0, [pic(mmx_01bytes)]
movzx r2, byte [r0+r1-1]
movd xmm1, r2d
-%ifdef X86_32_PICASM
- pmuludq xmm1, [esp]
-%else
- pmuludq xmm1, [mmx_01bytes]
-%endif
+ pmuludq xmm1, [pic(mmx_01bytes)]
lea r0, [r0+r1]
movzx r2, byte [r0+r1-1]
movd xmm2, r2d
-%ifdef X86_32_PICASM
- pmuludq xmm2, [esp]
-%else
- pmuludq xmm2, [mmx_01bytes]
-%endif
+ pmuludq xmm2, [pic(mmx_01bytes)]
movzx r2, byte [r0+2*r1-1]
movd xmm3, r2d
-%ifdef X86_32_PICASM
- pmuludq xmm3, [esp]
- mov esp, r3
- pop r3
-%else
- pmuludq xmm3, [mmx_01bytes]
-%endif
+ pmuludq xmm3, [pic(mmx_01bytes)]
sub r0, r1
movd [r0], xmm0
@@ -257,6 +209,7 @@
movd [r0], xmm2
movd [r0+r1], xmm3
+ DEINIT_X86_32_PIC
ret
;*******************************************************************************
@@ -266,6 +219,7 @@
push r3
push r4
%assign push_num 2
+ INIT_X86_32_PIC r5
LOAD_2_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -276,37 +230,11 @@
;for H
pxor xmm7, xmm7
movq xmm0, [r0]
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x00010002
- push 0x00030004
- push 0x00050006
- push 0x00070008
- movdqa xmm5, [esp]
- mov esp, r0
- pop r0
-%else
- movdqa xmm5, [sse2_plane_dec]
-%endif
+ movdqa xmm5, [pic(sse2_plane_dec)]
punpcklbw xmm0, xmm7
pmullw xmm0, xmm5
movq xmm1, [r0 + 9]
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x00080007 ;sse2_plane_inc
- push 0x00060005
- push 0x00040003
- push 0x00020001
- movdqa xmm6, [esp]
- mov esp, r0
- pop r0
-%else
- movdqa xmm6, [sse2_plane_inc]
-%endif
+ movdqa xmm6, [pic(sse2_plane_inc)]
punpcklbw xmm1, xmm7
pmullw xmm1, xmm6
psubw xmm1, xmm0
@@ -361,19 +289,7 @@
SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
xor r2, r2
-%ifdef X86_32_PICASM
- mov r2, esp
- and esp, 0xfffffff0
- push 0x0000ffff ;sse2_plane_inc_minus
- push 0xfffefffd
- push 0xfffcfffb
- push 0xfffafff9
- movdqa xmm5, [esp]
- mov esp, r2
- xor r2, r2
-%else
- movdqa xmm5, [sse2_plane_inc_minus]
-%endif
+ movdqa xmm5, [pic(sse2_plane_inc_minus)]
get_i16x16_luma_pred_plane_sse2_1:
movdqa xmm2, xmm1
@@ -393,6 +309,7 @@
jnz get_i16x16_luma_pred_plane_sse2_1
POP_XMM
+ DEINIT_X86_32_PIC
pop r4
pop r3
ret
@@ -414,6 +331,7 @@
WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2
%assign push_num 0
+ INIT_X86_32_PIC_NOPRESERVE r2
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -430,6 +348,7 @@
SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+ DEINIT_X86_32_PIC
ret
;*******************************************************************************
@@ -477,6 +396,7 @@
push r3
push r4
%assign push_num 2
+ INIT_X86_32_PIC r5
LOAD_2_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -486,30 +406,11 @@
pxor mm7, mm7
movq mm0, [r0]
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xfffffff0
- push 0x00010002 ;sse2_plane_dec_c
- push 0x00030004
- push 0x00040003 ;sse2_plane_inc_c
- push 0x00020001
- push 0x00040003 ;
- push 0x00020001
- push 0x0000ffff
- push 0xfffefffd
- movq mm5, [esp+24]
-%else
- movq mm5, [sse2_plane_dec_c]
-%endif
+ movq mm5, [pic(sse2_plane_dec_c)]
punpcklbw mm0, mm7
pmullw mm0, mm5
movq mm1, [r0 + 5]
-%ifdef X86_32_PICASM
- movq mm6, [esp+16]
-%else
- movq mm6, [sse2_plane_inc_c]
-%endif
+ movq mm6, [pic(sse2_plane_inc_c)]
punpcklbw mm1, mm7
pmullw mm1, mm6
psubw mm1, mm0
@@ -561,13 +462,7 @@
SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
xor r2, r2
-%ifdef X86_32_PICASM
- movdqa xmm5, [esp]
- mov esp, r5
- pop r5
-%else
- movdqa xmm5, [sse2_plane_mul_b_c]
-%endif
+ movdqa xmm5, [pic(sse2_plane_mul_b_c)]
get_i_chroma_pred_plane_sse2_1:
movdqa xmm2, xmm1
@@ -583,6 +478,7 @@
jnz get_i_chroma_pred_plane_sse2_1
POP_XMM
+ DEINIT_X86_32_PIC
pop r4
pop r3
WELSEMMS
@@ -602,6 +498,7 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r2, r0
@@ -629,20 +526,7 @@
movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
pxor mm1,mm4 ;find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pand mm1,[esp] ;set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm1,[mmx_01bytes] ;set the odd bit
-%endif
+ pand mm1,[pic(mmx_01bytes)] ;set the odd bit
psubusb mm3,mm1 ;decrease 1 from odd bytes
pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
@@ -655,6 +539,7 @@
movd [r0+r1],mm2
psrlq mm2,8
movd [r0],mm2
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -667,20 +552,7 @@
movq %1, [%3-8]
psrlq %1, 38h
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmullw %1, [esp]
- mov esp, r0
- pop r0
-%else
- pmullw %1, [mmx_01bytes]
-%endif
+ pmullw %1, [pic(mmx_01bytes)]
pshufw %1, %1, 0
movq [%4], %1
%endmacro
@@ -689,20 +561,7 @@
movq %1, [%3+r1-8]
psrlq %1, 38h
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmullw %1, [esp]
- mov esp, r0
- pop r0
-%else
- pmullw %1, [mmx_01bytes]
-%endif
+ pmullw %1, [pic(mmx_01bytes)]
pshufw %1, %1, 0
movq [%4], %1
%endmacro
@@ -709,6 +568,7 @@
WELS_EXTERN WelsDecoderIChromaPredH_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r2, r0
@@ -716,20 +576,7 @@
movq mm0, [r2-8]
psrlq mm0, 38h
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmullw mm0, [esp]
- mov esp, r0
- pop r0
-%else
- pmullw mm0, [mmx_01bytes]
-%endif
+ pmullw mm0, [pic(mmx_01bytes)]
pshufw mm0, mm0, 0
movq [r0], mm0
@@ -753,6 +600,7 @@
lea r0, [r0+2*r1]
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -816,6 +664,7 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r2, r0
@@ -841,18 +690,7 @@
pavgb mm1, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm4, [esp]
- mov esp, r0
- pop r0
-%else
- pand mm4, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm4, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm1, mm4 ; decrease 1 from odd bytes
pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
@@ -876,6 +714,7 @@
movd [r0+2*r1], mm3
psrlq mm3, 10h
movd [r0+r1], mm3
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -909,6 +748,7 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r2, r0
@@ -937,18 +777,7 @@
pavgb mm2, mm0
pxor mm5, mm0 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm5, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm5, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm5, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm2, mm5 ; decrease 1 from odd bytes
pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
@@ -970,6 +799,7 @@
movd [r0+r1], mm1
psrlq mm1, 10h
movd [r0+2*r1], mm1
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -1005,6 +835,7 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r2, r0
@@ -1030,18 +861,7 @@
pavgb mm2, mm0
pxor mm3, mm0 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm3, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm3, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm3, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm2, mm3 ; decrease 1 from odd bytes
movq mm3, mm0
@@ -1071,6 +891,7 @@
pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
lea r0, [r0+2*r1]
movd [r0+r1], mm5
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -1102,6 +923,7 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r2, r0
@@ -1121,18 +943,7 @@
movq mm3, mm1
pavgb mm1, mm2
pxor mm3, mm2 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm3, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm3, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm3, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm1, mm3 ; decrease 1 from odd bytes
pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
@@ -1146,6 +957,7 @@
psrlq mm0, 8h
lea r0, [r0+2*r1]
movd [r0+r1], mm0
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -1181,6 +993,7 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r2, r0
@@ -1199,18 +1012,7 @@
movq mm4, mm2
pavgb mm2, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pand mm4, [esp] ; set the odd bit
- mov esp, r0
- pop r0
-%else
- pand mm4, [mmx_01bytes] ; set the odd bit
-%endif
+ pand mm4, [pic(mmx_01bytes)] ; set the odd bit
psubusb mm2, mm4 ; decrease 1 from odd bytes
pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
@@ -1223,6 +1025,7 @@
psrlq mm2, 8h
lea r0, [r0+2*r1]
movd [r0+r1], mm2
+ DEINIT_X86_32_PIC
WELSEMMS
ret
@@ -1234,6 +1037,7 @@
push r3
push r4
%assign push_num 2
+ INIT_X86_32_PIC r5
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r4, r0
@@ -1275,18 +1079,7 @@
movq mm1, mm2
paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x00000000
- push 0x00000002
- movq mm4, [esp]
- mov esp, r0
- pop r0
-%else
- movq mm4, [mmx_0x02]
-%endif
+ movq mm4, [pic(mmx_0x02)]
paddq mm0, mm4
psrlq mm0, 0x02
@@ -1302,30 +1095,13 @@
paddq mm1, mm4
psrlq mm1, 0x03
-%ifdef X86_32_PICASM
- push r5
- mov r5, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- pmuludq mm0, [esp]
- pmuludq mm3, [esp]
-%else
- pmuludq mm0, [mmx_01bytes]
- pmuludq mm3, [mmx_01bytes]
-%endif
+ pmuludq mm0, [pic(mmx_01bytes)]
+ pmuludq mm3, [pic(mmx_01bytes)]
psllq mm0, 0x20
pxor mm0, mm3 ; mm0 = m_up
-%ifdef X86_32_PICASM
- pmuludq mm2, [esp]
- pmuludq mm1, [esp]
- mov esp, r5
- pop r5
-%else
- pmuludq mm2, [mmx_01bytes]
- pmuludq mm1, [mmx_01bytes]
-%endif
+ pmuludq mm2, [pic(mmx_01bytes)]
+ pmuludq mm1, [pic(mmx_01bytes)]
psllq mm1, 0x20
pxor mm1, mm2 ; mm2 = m_down
@@ -1342,6 +1118,7 @@
lea r4, [r4+2*r1]
movq [r4+r1], mm1
+ DEINIT_X86_32_PIC
pop r4
pop r3
WELSEMMS
@@ -1357,6 +1134,7 @@
push r3
push r4
%assign push_num 2
+ INIT_X86_32_PIC r5
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
mov r4, r0
@@ -1385,20 +1163,7 @@
movd xmm1, r2d
paddw xmm0, xmm1
psrld xmm0, 0x05
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x01010101
- push 0x01010101
- push 0x01010101
- push 0x01010101
- pmuludq xmm0, [esp]
- mov esp, r0
- pop r0
-%else
- pmuludq xmm0, [mmx_01bytes]
-%endif
+ pmuludq xmm0, [pic(mmx_01bytes)]
pshufd xmm0, xmm0, 0
movdqa [r4], xmm0
@@ -1432,6 +1197,7 @@
movdqa [r4+r1], xmm0
+ DEINIT_X86_32_PIC
pop r4
pop r3
@@ -1518,24 +1284,12 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
lea r2, [2*r1+r1] ; 3*kiStride
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x80808080
- push 0x80808080
- push 0x80808080
- push 0x80808080
- movdqa xmm0, [esp]
- mov esp, r0
- pop r0
-%else
- movdqa xmm0, [sse2_dc_0x80]
-%endif
+ movdqa xmm0, [pic(sse2_dc_0x80)]
movdqa xmm1, xmm0
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
@@ -1557,6 +1311,7 @@
movdqa [r0+2*r1], xmm0
movdqa [r0+r2], xmm1
+ DEINIT_X86_32_PIC
ret
;*******************************************************************************
@@ -1680,21 +1435,11 @@
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx
%assign push_num 0
+ INIT_X86_32_PIC r3
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
lea r2, [2*r1+r1]
-%ifdef X86_32_PICASM
- push r0
- mov r0, esp
- and esp, 0xfffffff0
- push 0x80808080
- push 0x80808080
- movq mm0, [esp]
- mov esp, r0
- pop r0
-%else
- movq mm0, [sse2_dc_0x80]
-%endif
+ movq mm0, [pic(sse2_dc_0x80)]
movq mm1, mm0
movq [r0], mm0
movq [r0+r1], mm1
@@ -1705,6 +1450,7 @@
movq [r0+r1], mm1
movq [r0+2*r1], mm0
movq [r0+r2], mm1
+ DEINIT_X86_32_PIC
emms
ret