shithub: openh264

Download patch

ref: 659ff14af50bc184bac74dd058010143cf9e5734
parent: c82c19022b5a0bed5845a4e6dcb03a36991224b9
author: Sindre Aamås <[email protected]>
date: Tue Mar 7 09:21:17 EST 2017

[Encoder/x86] Simplify intra_pred X86_32_PICASM handling

Utilize program counter-relative offsets to simplify X86_32_PICASM
code.

In order for this to work with nasm, data constants are placed in
the text segment.

--- a/codec/encoder/core/x86/intra_pred.asm
+++ b/codec/encoder/core/x86/intra_pred.asm
@@ -45,7 +45,11 @@
 ; Local Data (Read Only)
 ;***********************************************************************
 
+%ifdef X86_32_PICASM
+SECTION .text align=16  ; 32-bit PIC build: constants live in .text so nasm can reach them via PC-relative offsets
+%else
 SECTION .rodata align=16
+%endif
 
 align 16
 sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
@@ -144,20 +148,7 @@
 %macro COPY_16_TIMES 2
     movdqa      %2, [%1-16]
     psrldq      %2, 15
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmuludq     %2, [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmuludq     %2, [mmx_01bytes]
-%endif
+    pmuludq     %2, [pic(mmx_01bytes)]  ; byte left by psrldq * 0x01010101 -> 4 equal bytes; NOTE(review): pic() presumably needs the caller to have run INIT_X86_32_PIC - confirm
     pshufd      %2, %2, 0
 %endmacro
 
@@ -164,20 +155,7 @@
 %macro COPY_16_TIMESS 3
     movdqa      %2, [%1+%3-16]
     psrldq      %2, 15
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmuludq     %2, [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmuludq     %2, [mmx_01bytes]
-%endif
+    pmuludq     %2, [pic(mmx_01bytes)]  ; byte left by psrldq * 0x01010101 -> 4 equal bytes; NOTE(review): pic() presumably needs the caller to have run INIT_X86_32_PIC - confirm
     pshufd      %2, %2, 0
 %endmacro
 
@@ -215,30 +193,16 @@
 WELS_EXTERN WelsI4x4LumaPredH_sse2
     push r3
     %assign push_num 1
+    INIT_X86_32_PIC r4      ; NOTE(review): presumably sets up r4 as the base that pic() uses on X86_32_PICASM builds - confirm in the macro definition
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     movzx       r3, byte [r1-1]
     movd        xmm0,   r3d
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmuludq     xmm0,   [esp]
-%else
-    pmuludq     xmm0,   [mmx_01bytes]
-%endif
+    pmuludq     xmm0,   [pic(mmx_01bytes)]  ; byte [r1-1] * 0x01010101 -> that byte repeated in all 4 bytes of the low dword
 
     movzx       r3, byte [r1+r2-1]
     movd        xmm1,   r3d
-%ifdef X86_32_PICASM
-    pmuludq     xmm1,   [esp]
-%else
-    pmuludq     xmm1,   [mmx_01bytes]
-%endif
+    pmuludq     xmm1,   [pic(mmx_01bytes)]  ; same replication for byte [r1+r2-1]
 
     unpcklps    xmm0,   xmm1
 
@@ -245,26 +209,17 @@
     lea         r1, [r1+r2*2]
     movzx       r3, byte [r1-1]
     movd        xmm2,   r3d
-%ifdef X86_32_PICASM
-    pmuludq     xmm2,   [esp]
-%else
-    pmuludq     xmm2,   [mmx_01bytes]
-%endif
+    pmuludq     xmm2,   [pic(mmx_01bytes)]  ; same replication for byte [r1-1] after r1 advanced by 2*r2
 
     movzx       r3, byte [r1+r2-1]
     movd        xmm3,   r3d
-%ifdef X86_32_PICASM
-    pmuludq     xmm3,   [esp]
-    mov         esp,    r0
-    pop         r0
-%else
-    pmuludq     xmm3,   [mmx_01bytes]
-%endif
+    pmuludq     xmm3,   [pic(mmx_01bytes)]  ; same replication for byte [r1+r2-1]
 
     unpcklps    xmm2,   xmm3
     unpcklpd    xmm0,   xmm2
 
     movdqa      [r0],   xmm0
+    DEINIT_X86_32_PIC       ; NOTE(review): presumably undoes INIT_X86_32_PIC before the saved r3 is popped - confirm
     pop r3
     ret
 
@@ -275,6 +230,7 @@
     push r3
     push r4
     %assign push_num 2
+    INIT_X86_32_PIC r5
     LOAD_3_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r2, r2d
@@ -284,34 +240,11 @@
     ;for H
     pxor    xmm7,   xmm7
     movq    xmm0,   [r1]
-%ifdef X86_32_PICASM
-    push    r5
-    mov     r5, esp
-    and     esp, 0xfffffff0
-    push    0x00010002    ;sse2_plane_dec
-    push    0x00030004
-    push    0x00050006
-    push    0x00070008
-    push    0x00080007    ;sse_plane_inc
-    push    0x00060005
-    push    0x00040003
-    push    0x00020001
-    push    0x0000ffff    ;sse_plane_inc_minus
-    push    0xfffefffd
-    push    0xfffcfffb
-    push    0xfffafff9
-    movdqa  xmm5,   [esp+32]
-%else
-    movdqa  xmm5,   [sse2_plane_dec]
-%endif
+    movdqa  xmm5,   [pic(sse2_plane_dec)]
     punpcklbw xmm0, xmm7
     pmullw  xmm0,   xmm5
     movq    xmm1,   [r1 + 9]
-%ifdef X86_32_PICASM
-    movdqa  xmm6,   [esp+16]
-%else
-    movdqa  xmm6,   [sse2_plane_inc]
-%endif
+    movdqa  xmm6,   [pic(sse2_plane_inc)]
     punpcklbw xmm1, xmm7
     pmullw  xmm1,   xmm6
     psubw   xmm1,   xmm0
@@ -357,13 +290,7 @@
     SSE2_Copy8Times xmm0, r3d       ; xmm0 = s,s,s,s,s,s,s,s
 
     xor     r3, r3
-%ifdef X86_32_PICASM
-    movdqa  xmm5,   [esp]
-    mov     esp,    r5
-    pop     r5
-%else
-    movdqa  xmm5,   [sse2_plane_inc_minus]
-%endif
+    movdqa  xmm5,   [pic(sse2_plane_inc_minus)]
 
 get_i16x16_luma_pred_plane_sse2_1:
     movdqa  xmm2,   xmm1
@@ -382,6 +309,7 @@
     cmp     r3, 16
     jnz get_i16x16_luma_pred_plane_sse2_1
     POP_XMM
+    DEINIT_X86_32_PIC
     pop r4
     pop r3
     ret
@@ -393,6 +321,7 @@
     push r3
     push r4
     %assign push_num 2
+    INIT_X86_32_PIC r5
     LOAD_3_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r2, r2d
@@ -401,30 +330,11 @@
 
     pxor    mm7,    mm7
     movq    mm0,    [r1]
-%ifdef X86_32_PICASM
-    push    r5
-    mov     r5, esp
-    and     esp, 0xfffffff0
-    push    0x00010002    ;sse2_plane_dec_c
-    push    0x00030004
-    push    0x00040003    ;sse2_plane_inc_c
-    push    0x00020001
-    push    0x00040003    ;sse2_plane_mul_b_c
-    push    0x00020001
-    push    0x0000ffff
-    push    0xfffefffd
-    movq    mm5,    [esp+24]
-%else
-    movq    mm5,    [sse2_plane_dec_c]
-%endif
+    movq    mm5,    [pic(sse2_plane_dec_c)]
     punpcklbw mm0,  mm7
     pmullw  mm0,    mm5
     movq    mm1,    [r1 + 5]
-%ifdef X86_32_PICASM
-    movq    mm6,    [esp+16]
-%else
-    movq    mm6,    [sse2_plane_inc_c]
-%endif
+    movq    mm6,    [pic(sse2_plane_inc_c)]
     punpcklbw mm1,  mm7
     pmullw  mm1,    mm6
     psubw   mm1,    mm0
@@ -474,13 +384,7 @@
     SSE2_Copy8Times xmm0, r3d   ; xmm0 = s,s,s,s,s,s,s,s
 
     xor     r3, r3
-%ifdef X86_32_PICASM
-    movdqa  xmm5,   [esp]
-    mov     esp,    r5
-    pop     r5
-%else
-    movdqa  xmm5,   [sse2_plane_mul_b_c]
-%endif
+    movdqa  xmm5,   [pic(sse2_plane_mul_b_c)]
 
 get_i_chroma_pred_plane_sse2_1:
     movdqa  xmm2,   xmm1
@@ -495,6 +399,7 @@
     cmp     r3, 8
     jnz get_i_chroma_pred_plane_sse2_1
     POP_XMM
+    DEINIT_X86_32_PIC
     pop r4
     pop r3
     WELSEMMS
@@ -514,6 +419,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredDDR_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     movq        mm1,[r1+r2-8]       ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
@@ -539,18 +445,7 @@
     movq        mm4,mm3             ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
     pavgb       mm3,mm1             ;mm3=([11]+[21]+1)/2
     pxor        mm1,mm4             ;find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm1,[esp]   ;set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm1,[mmx_01bytes]   ;set the odd bit
-%endif
+    pand        mm1,[pic(mmx_01bytes)]   ;set the odd bit
     psubusb     mm3,mm1             ;decrease 1 from odd bytes
     pavgb       mm2,mm3             ;mm2=(([11]+[21]+1)/2+1+[16])/2
 
@@ -561,6 +456,7 @@
     movd        [r0+4],mm2
     psrlq       mm2,8
     movd        [r0],mm2
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -619,20 +515,7 @@
     psrlq       %1,     38h
 
     ;pmuludq        %1,     [mmx_01bytes]       ;extend to 4 bytes
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmullw      %1,     [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmullw      %1,     [mmx_01bytes]
-%endif
+    pmullw      %1,     [pic(mmx_01bytes)]
     pshufw      %1,     %1, 0
     movq        [%4],   %1
 %endmacro
@@ -642,20 +525,7 @@
     psrlq       %1,     38h
 
     ;pmuludq        %1,     [mmx_01bytes]       ;extend to 4 bytes
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmullw      %1,     [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmullw      %1,     [mmx_01bytes]
-%endif
+    pmullw      %1,     [pic(mmx_01bytes)]
     pshufw      %1,     %1, 0
     movq        [%4],   %1
 %endmacro
@@ -662,6 +532,7 @@
 
 WELS_EXTERN WelsIChromaPredH_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     movq        mm0,    [r1-8]
@@ -668,20 +539,7 @@
     psrlq       mm0,    38h
 
     ;pmuludq        mm0,    [mmx_01bytes]       ;extend to 4 bytes
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmullw      mm0,        [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmullw      mm0,        [mmx_01bytes]
-%endif
+    pmullw      mm0,        [pic(mmx_01bytes)]
     pshufw      mm0,    mm0,    0
     movq        [r0],   mm0
 
@@ -701,6 +559,7 @@
     MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48
 
     MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1,r0+56
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -767,6 +626,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHD_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     sub         r1, r2
@@ -791,18 +651,7 @@
     pavgb       mm1, mm0
 
     pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm4, [esp]      ; set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm4, [mmx_01bytes]      ; set the odd bit
-%endif
+    pand        mm4, [pic(mmx_01bytes)] ; set the odd bit
     psubusb     mm1, mm4                ; decrease 1 from odd bytes
 
     pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
@@ -824,6 +673,7 @@
     movd        [r0+8], mm3
     psrlq       mm3, 10h
     movd        [r0+4], mm3
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -855,6 +705,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHU_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     movd        mm0, [r1-4]            ; mm0[3] = l0
@@ -881,18 +732,7 @@
     pavgb       mm2, mm0
 
     pxor        mm5, mm0                ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm5, [esp]      ; set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm5, [mmx_01bytes]      ; set the odd bit
-%endif
+    pand        mm5, [pic(mmx_01bytes)] ; set the odd bit
     psubusb     mm2, mm5                ; decrease 1 from odd bytes
 
     pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
@@ -912,6 +752,7 @@
     movd        [r0+4], mm1
     psrlq       mm1, 10h
     movd        [r0+8], mm1
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -947,6 +788,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVR_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     sub         r1, r2
@@ -971,18 +813,7 @@
     pavgb       mm2, mm0
 
     pxor        mm3, mm0                ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm3, [esp]      ; set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm3, [mmx_01bytes]      ; set the odd bit
-%endif
+    pand        mm3, [pic(mmx_01bytes)] ; set the odd bit
     psubusb     mm2, mm3                ; decrease 1 from odd bytes
 
     movq        mm3, mm0
@@ -1011,6 +842,7 @@
     psllq       mm2, 8h
     pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
     movd        [r0+12], mm5
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -1042,6 +874,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredDDL_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     sub         r1, r2
@@ -1060,18 +893,7 @@
     movq        mm3, mm1
     pavgb       mm1, mm2
     pxor        mm3, mm2                ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm3, [esp]      ; set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm3, [mmx_01bytes]      ; set the odd bit
-%endif
+    pand        mm3, [pic(mmx_01bytes)] ; set the odd bit
     psubusb     mm1, mm3                ; decrease 1 from odd bytes
 
     pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
@@ -1084,6 +906,7 @@
     movd        [r0+8], mm0
     psrlq       mm0, 8h
     movd        [r0+12], mm0
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -1119,6 +942,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVL_mmx
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     sub         r1, r2
@@ -1135,18 +959,7 @@
     movq        mm4, mm2
     pavgb       mm2, mm0
     pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    pand        mm4, [esp]      ; set the odd bit
-    mov         esp, r0
-    pop         r0
-%else
-    pand        mm4, [mmx_01bytes]      ; set the odd bit
-%endif
+    pand        mm4, [pic(mmx_01bytes)] ; set the odd bit
     psubusb     mm2, mm4                ; decrease 1 from odd bytes
 
     pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
@@ -1158,6 +971,7 @@
     movd        [r0+4], mm2
     psrlq       mm2, 8h
     movd        [r0+12], mm2
+    DEINIT_X86_32_PIC
     WELSEMMS
     ret
 
@@ -1169,6 +983,7 @@
     push r3
     push r4
     %assign push_num 2
+    INIT_X86_32_PIC r5
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     sub         r1, r2
@@ -1208,18 +1023,7 @@
     movq        mm1, mm2
     paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
 
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x00000000
-    push        0x00000002
-    movq        mm4, [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    movq        mm4, [mmx_0x02]
-%endif
+    movq        mm4, [pic(mmx_0x02)]
 
     paddq       mm0, mm4
     psrlq       mm0, 0x02
@@ -1235,32 +1039,13 @@
     paddq       mm1, mm4
     psrlq       mm1, 0x03
 
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmuludq     mm0, [esp]
-    pmuludq     mm3, [esp]
-%else
-    pmuludq     mm0, [mmx_01bytes]
-    pmuludq     mm3, [mmx_01bytes]
-%endif
+    pmuludq     mm0, [pic(mmx_01bytes)]
+    pmuludq     mm3, [pic(mmx_01bytes)]
     psllq       mm0, 0x20
     pxor        mm0, mm3                 ; mm0 = m_up
 
-%ifdef X86_32_PICASM
-    pmuludq     mm2, [esp]
-    pmuludq     mm1, [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmuludq     mm2, [mmx_01bytes]
-    pmuludq     mm1, [mmx_01bytes]
-%endif
+    pmuludq     mm2, [pic(mmx_01bytes)]
+    pmuludq     mm1, [pic(mmx_01bytes)]
     psllq       mm1, 0x20
     pxor        mm1, mm2                 ; mm2 = m_down
 
@@ -1274,6 +1059,7 @@
     movq        [r0+0x30], mm1
     movq        [r0+0x38], mm1
 
+    DEINIT_X86_32_PIC
     pop r4
     pop r3
     WELSEMMS
@@ -1289,6 +1075,7 @@
     push r3
     push r4
     %assign push_num 2
+    INIT_X86_32_PIC r5
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
     sub         r1, r2
@@ -1316,20 +1103,7 @@
     movd        xmm1, r3d
     paddw       xmm0, xmm1
     psrld       xmm0, 0x05
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    push        0x01010101
-    pmuludq     xmm0, [esp]
-    mov         esp, r0
-    pop         r0
-%else
-    pmuludq     xmm0, [mmx_01bytes]
-%endif
+    pmuludq     xmm0, [pic(mmx_01bytes)]
     pshufd      xmm0, xmm0, 0
 
     movdqa      [r0], xmm0
@@ -1349,6 +1123,7 @@
     movdqa      [r0+0xe0], xmm0
     movdqa      [r0+0xf0], xmm0
 
+    DEINIT_X86_32_PIC
     pop r4
     pop r3
     ret