shithub: openh264

Download patch

ref: a009153741ef3b6a789a546c33a814c002d75fb5
parent: 9909c306f1172661c2b09e4bee2428fd953e868a
author: Sindre Aamås <[email protected]>
date: Thu Feb 25 11:00:26 EST 2016

[Common/x86] DeblockChromaEq4H_ssse3 optimizations

Use packed 8-bit operations rather than unpack to 16-bit.

~5.80x speedup on Haswell (x86-64).
~1.69x speedup on Haswell (x86 32-bit).

--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -509,7 +509,33 @@
     SSE2_DeblockP0Q0_Lt4 %1, %2, %3, %4, %8, %10, %5, %9
 %endmacro
 
+; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 xmmclobber=%7,%8,%9; p1 (%1) and iAlpha (%5) are also overwritten; results: p0out=%2 q0out=%3
+%macro SSSE3_DeblockChromaEq4 9
+    movdqa   %7, %3                   ; copy of q0
+    SSE2_AbsDiffUB %7, %2, %8         ; |p0 - q0|
+    SSE2_CmpgeUB %7, %5               ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
+    movdqa   %8, %4                   ; copy of q1
+    SSE2_AbsDiffUB %8, %3, %5         ; |q1 - q0| (iAlpha already consumed; %5 presumably serves as scratch here)
+    movdqa   %9, %1                   ; copy of p1
+    SSE2_AbsDiffUB %9, %2, %5         ; |p1 - p0|
+    pmaxub   %8, %9                   ; max(|q1 - q0|, |p1 - p0|)
+    pxor     %9, %9                   ; all-zero pshufb control
+    movd     %5, %6                   ; %5 is reloaded with iBeta
+    pshufb   %5, %9                   ; iBeta: byte 0 replicated into every lane
+    SSE2_CmpgeUB %8, %5               ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
+    por      %7, %8                   ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0
+    WELS_DB1 %5                       ; presumably fills %5 with 0x01 bytes for SSE2_AvgbFloor1 (confirm against the macro)
+    movdqa   %8, %2                   ; copy of p0
+    SSE2_AvgbFloor1 %8, %4, %5, %9    ; (p0 + q1) >> 1
+    pavgb    %8, %1                   ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1
+    movdqa   %9, %7                   ; keep the mask for the q0 blend; %7 is handed to SSE2_AvgbFloor1 in between
+    SSE2_Blend %2, %8, %7             ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
+    SSE2_AvgbFloor1 %1, %3, %5, %7    ; (q0 + p1) >> 1, computed in place over p1
+    pavgb    %1, %4                   ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1
+    SSE2_Blend %3, %1, %9             ; q0out = bDeltaP0Q0P1P0Q1Q0 ? q0' : q0
+%endmacro
 
+
 ;******************************************************************************
 ; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
 ;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
@@ -572,42 +598,15 @@
     movhps   xmm0, [r1 + 0 * r2]              ; q0 cr
     movq     xmm2, [r0 + 1 * r3]              ; p0 cb
     movhps   xmm2, [r1 + 1 * r3]              ; p0 cr
-
-    movdqa   xmm4, xmm0
-    SSE2_AbsDiffUB xmm4, xmm2, xmm5           ; |p0 - q0|
-    SSE2_CmpgeUB xmm4, xmm7                   ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
-
     movq     xmm1, [r0 + 1 * r2]              ; q1 cb
     movhps   xmm1, [r1 + 1 * r2]              ; q1 cr
     movq     xmm3, [r0 + 2 * r3]              ; p1 cb
     movhps   xmm3, [r1 + 2 * r3]              ; p1 cr
 
-    movdqa   xmm5, xmm1
-    SSE2_AbsDiffUB xmm5, xmm0, xmm7           ; |q1 - q0|
-    movdqa   xmm6, xmm3
-    SSE2_AbsDiffUB xmm6, xmm2, xmm7           ; |p1 - p0|
-    pmaxub   xmm5, xmm6                       ; max(|q1 - q0|, |p1 - p0|)
+    SSSE3_DeblockChromaEq4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, xmm4, xmm5, xmm6
 
-    pxor     xmm6, xmm6
-    movd     xmm7, arg5d
-    pshufb   xmm7, xmm6                       ; iBeta
-
-    SSE2_CmpgeUB xmm5, xmm7                   ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
-    por      xmm4, xmm5                       ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0
-
-    WELS_DB1 xmm7
-    movdqa   xmm5, xmm2
-    SSE2_AvgbFloor1 xmm2, xmm1, xmm7, xmm6    ; (p0 + q1) >> 1
-    pavgb    xmm2, xmm3                       ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1
-    movdqa   xmm6, xmm4
-    SSE2_Blend xmm5, xmm2, xmm4               ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
-
-    SSE2_AvgbFloor1 xmm3, xmm0, xmm7, xmm4    ; (q0 + p1) >> 1
-    pavgb    xmm3, xmm1                       ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1
-    SSE2_Blend xmm0, xmm3, xmm6               ; q0out = bDeltaP0Q0P1P0Q1Q0 ? q0' : q0
-
-    movlps   [r0 + 1 * r3], xmm5              ; store p0 cb
-    movhps   [r1 + 1 * r3], xmm5              ; store p0 cr
+    movlps   [r0 + 1 * r3], xmm2              ; store p0 cb
+    movhps   [r1 + 1 * r3], xmm2              ; store p0 cr
     movlps   [r0 + 0 * r2], xmm0              ; store q0 cb
     movhps   [r1 + 0 * r2], xmm0              ; store q0 cr
 
@@ -640,550 +639,6 @@
     ret
 
 
-%ifdef  WIN64
-
-
-WELS_EXTERN DeblockChromaEq4H_ssse3
-    mov         rax,rsp
-    mov         [rax+20h],rbx
-    push        rdi
-    PUSH_XMM 16
-    sub         rsp,140h
-    mov         rdi,rdx
-    lea         eax,[r8*4]
-    movsxd      r10,eax
-    mov         eax,[rcx-2]
-    mov         [rsp+10h],eax
-    lea         rbx,[r10+rdx-2]
-    lea         r11,[r10+rcx-2]
-    movdqa      xmm5,[rsp+10h]
-    movsxd      r10,r8d
-    mov         eax,[r10+rcx-2]
-    lea         rdx,[r10+r10*2]
-    mov         [rsp+20h],eax
-    mov         eax,[rcx+r10*2-2]
-    mov         [rsp+30h],eax
-    mov         eax,[rdx+rcx-2]
-    movdqa      xmm2,[rsp+20h]
-    mov         [rsp+40h],eax
-    mov         eax, [rdi-2]
-    movdqa      xmm4,[rsp+30h]
-    mov         [rsp+50h],eax
-    mov         eax,[r10+rdi-2]
-    movdqa      xmm3,[rsp+40h]
-    mov         [rsp+60h],eax
-    mov         eax,[rdi+r10*2-2]
-    punpckldq   xmm5,[rsp+50h]
-    mov         [rsp+70h],eax
-    mov         eax, [rdx+rdi-2]
-    punpckldq   xmm2, [rsp+60h]
-    mov          [rsp+80h],eax
-    mov         eax,[r11]
-    punpckldq   xmm4, [rsp+70h]
-    mov         [rsp+50h],eax
-    mov         eax,[rbx]
-    punpckldq   xmm3,[rsp+80h]
-    mov         [rsp+60h],eax
-    mov         eax,[r10+r11]
-    movdqa      xmm0, [rsp+50h]
-    punpckldq   xmm0, [rsp+60h]
-    punpcklqdq  xmm5,xmm0
-    movdqa      [rsp+50h],xmm0
-    mov         [rsp+50h],eax
-    mov         eax,[r10+rbx]
-    movdqa      xmm0,[rsp+50h]
-    movdqa      xmm1,xmm5
-    mov         [rsp+60h],eax
-    mov         eax,[r11+r10*2]
-    punpckldq   xmm0, [rsp+60h]
-    punpcklqdq  xmm2,xmm0
-    punpcklbw   xmm1,xmm2
-    punpckhbw   xmm5,xmm2
-    movdqa      [rsp+50h],xmm0
-    mov         [rsp+50h],eax
-    mov         eax,[rbx+r10*2]
-    movdqa      xmm0,[rsp+50h]
-    mov         [rsp+60h],eax
-    mov         eax, [rdx+r11]
-    movdqa      xmm15,xmm1
-    punpckldq   xmm0,[rsp+60h]
-    punpcklqdq  xmm4,xmm0
-    movdqa      [rsp+50h],xmm0
-    mov         [rsp+50h],eax
-    mov         eax, [rdx+rbx]
-    movdqa      xmm0,[rsp+50h]
-    mov         [rsp+60h],eax
-    punpckldq   xmm0, [rsp+60h]
-    punpcklqdq  xmm3,xmm0
-    movdqa      xmm0,xmm4
-    punpcklbw   xmm0,xmm3
-    punpckhbw   xmm4,xmm3
-    punpcklwd   xmm15,xmm0
-    punpckhwd   xmm1,xmm0
-    movdqa      xmm0,xmm5
-    movdqa      xmm12,xmm15
-    punpcklwd   xmm0,xmm4
-    punpckhwd   xmm5,xmm4
-    punpckldq   xmm12,xmm0
-    punpckhdq   xmm15,xmm0
-    movdqa      xmm0,xmm1
-    movdqa      xmm11,xmm12
-    punpckldq   xmm0,xmm5
-    punpckhdq   xmm1,xmm5
-    punpcklqdq  xmm11,xmm0
-    punpckhqdq  xmm12,xmm0
-    movsx       eax,r9w
-    movdqa      xmm14,xmm15
-    punpcklqdq  xmm14,xmm1
-    punpckhqdq  xmm15,xmm1
-    pxor        xmm1,xmm1
-    movd        xmm0,eax
-    movdqa      xmm4,xmm12
-    movdqa      xmm8,xmm11
-    movsx       eax,word [rsp+170h + 160] ; iBeta
-    punpcklwd   xmm0,xmm0
-    punpcklbw   xmm4,xmm1
-    punpckhbw   xmm12,xmm1
-    movdqa      xmm9,xmm14
-    movdqa      xmm7,xmm15
-    movdqa      xmm10,xmm15
-    pshufd      xmm13,xmm0,0
-    punpcklbw   xmm9,xmm1
-    punpckhbw   xmm14,xmm1
-    movdqa      xmm6,xmm13
-    movd        xmm0,eax
-    movdqa      [rsp],xmm11
-    mov         eax,2
-    cwde
-    punpckhbw   xmm11,xmm1
-    punpckhbw   xmm10,xmm1
-    punpcklbw   xmm7,xmm1
-    punpcklwd   xmm0,xmm0
-    punpcklbw   xmm8,xmm1
-    pshufd      xmm3,xmm0,0
-    movdqa      xmm1,xmm8
-    movdqa      xmm0,xmm4
-    psubw       xmm0,xmm9
-    psubw       xmm1,xmm4
-    movdqa      xmm2,xmm3
-    pabsw       xmm0,xmm0
-    pcmpgtw     xmm6,xmm0
-    pabsw       xmm0,xmm1
-    movdqa      xmm1,xmm3
-    pcmpgtw     xmm2,xmm0
-    pand        xmm6,xmm2
-    movdqa      xmm0,xmm7
-    movdqa      xmm2,xmm3
-    psubw       xmm0,xmm9
-    pabsw       xmm0,xmm0
-    pcmpgtw     xmm1,xmm0
-    pand        xmm6,xmm1
-    movdqa      xmm0,xmm12
-    movdqa      xmm1,xmm11
-    psubw       xmm0,xmm14
-    psubw       xmm1,xmm12
-    movdqa      xmm5,xmm6
-    pabsw       xmm0,xmm0
-    pcmpgtw     xmm13,xmm0
-    pabsw       xmm0,xmm1
-    movdqa      xmm1,xmm8
-    pcmpgtw     xmm2,xmm0
-    paddw       xmm1,xmm8
-    movdqa      xmm0,xmm10
-    pand        xmm13,xmm2
-    psubw       xmm0,xmm14
-    paddw       xmm1,xmm4
-    movdqa      xmm2,xmm11
-    pabsw       xmm0,xmm0
-    paddw       xmm2,xmm11
-    paddw       xmm1,xmm7
-    pcmpgtw     xmm3,xmm0
-    paddw       xmm2,xmm12
-    movd        xmm0,eax
-    pand        xmm13,xmm3
-    paddw       xmm2,xmm10
-    punpcklwd   xmm0,xmm0
-    pshufd      xmm3,xmm0,0
-    movdqa      xmm0,xmm6
-    paddw       xmm1,xmm3
-    pandn       xmm0,xmm4
-    paddw       xmm2,xmm3
-    psraw       xmm1,2
-    pand        xmm5,xmm1
-    por         xmm5,xmm0
-    paddw       xmm7,xmm7
-    paddw       xmm10,xmm10
-    psraw       xmm2,2
-    movdqa      xmm1,xmm13
-    movdqa      xmm0,xmm13
-    pandn       xmm0,xmm12
-    pand        xmm1,xmm2
-    paddw       xmm7,xmm9
-    por         xmm1,xmm0
-    paddw       xmm10,xmm14
-    paddw       xmm7,xmm8
-    movdqa      xmm0,xmm13
-    packuswb    xmm5,xmm1
-    paddw       xmm7,xmm3
-    paddw       xmm10,xmm11
-    movdqa      xmm1,xmm6
-    paddw       xmm10,xmm3
-    pandn       xmm6,xmm9
-    psraw       xmm7,2
-    pand        xmm1,xmm7
-    psraw       xmm10,2
-    pandn       xmm13,xmm14
-    pand        xmm0,xmm10
-    por         xmm1,xmm6
-    movdqa      xmm6,[rsp]
-    movdqa      xmm4,xmm6
-    por         xmm0,xmm13
-    punpcklbw   xmm4,xmm5
-    punpckhbw   xmm6,xmm5
-    movdqa      xmm3,xmm4
-    packuswb    xmm1,xmm0
-    movdqa      xmm0,xmm1
-    punpckhbw   xmm1,xmm15
-    punpcklbw   xmm0,xmm15
-    punpcklwd   xmm3,xmm0
-    punpckhwd   xmm4,xmm0
-    movdqa      xmm0,xmm6
-    movdqa      xmm2,xmm3
-    punpcklwd   xmm0,xmm1
-    punpckhwd   xmm6,xmm1
-    movdqa      xmm1,xmm4
-    punpckldq   xmm2,xmm0
-    punpckhdq   xmm3,xmm0
-    punpckldq   xmm1,xmm6
-    movdqa      xmm0,xmm2
-    punpcklqdq  xmm0,xmm1
-    punpckhdq   xmm4,xmm6
-    punpckhqdq  xmm2,xmm1
-    movdqa      [rsp+10h],xmm0
-    movdqa      [rsp+60h],xmm2
-    movdqa      xmm0,xmm3
-    mov         eax,[rsp+10h]
-    mov         [rcx-2],eax
-    mov         eax,[rsp+60h]
-    punpcklqdq  xmm0,xmm4
-    punpckhqdq  xmm3,xmm4
-    mov         [r10+rcx-2],eax
-    movdqa      [rsp+20h],xmm0
-    mov         eax, [rsp+20h]
-    movdqa      [rsp+70h],xmm3
-    mov         [rcx+r10*2-2],eax
-    mov         eax,[rsp+70h]
-    mov         [rdx+rcx-2],eax
-    mov         eax,[rsp+18h]
-    mov         [r11],eax
-    mov         eax,[rsp+68h]
-    mov         [r10+r11],eax
-    mov         eax,[rsp+28h]
-    mov         [r11+r10*2],eax
-    mov         eax,[rsp+78h]
-    mov         [rdx+r11],eax
-    mov         eax,[rsp+14h]
-    mov         [rdi-2],eax
-    mov         eax,[rsp+64h]
-    mov         [r10+rdi-2],eax
-    mov         eax,[rsp+24h]
-    mov         [rdi+r10*2-2],eax
-    mov         eax, [rsp+74h]
-    mov         [rdx+rdi-2],eax
-    mov         eax, [rsp+1Ch]
-    mov         [rbx],eax
-    mov         eax, [rsp+6Ch]
-    mov         [r10+rbx],eax
-    mov         eax,[rsp+2Ch]
-    mov         [rbx+r10*2],eax
-    mov         eax,[rsp+7Ch]
-    mov         [rdx+rbx],eax
-    lea         rsp,[rsp+140h]
-    POP_XMM
-    mov         rbx, [rsp+28h]
-    pop         rdi
-    ret
-
-
-
-%elifdef  UNIX64
-
-
-WELS_EXTERN DeblockChromaEq4H_ssse3
-    mov         rax,rsp
-    push        rbx
-    push        rbp
-    push        r12
-
-    mov         rbp,   r8
-    mov         r8,    rdx
-    mov         r9,    rcx
-    mov         rcx,   rdi
-    mov         rdx,   rsi
-    mov         rdi,   rdx
-
-    sub         rsp,140h
-    lea         eax,[r8*4]
-    movsxd      r10,eax
-    mov         eax,[rcx-2]
-    mov         [rsp+10h],eax
-    lea         rbx,[r10+rdx-2]
-    lea         r11,[r10+rcx-2]
-
-    movdqa      xmm5,[rsp+10h]
-    movsxd      r10,r8d
-    mov         eax,[r10+rcx-2]
-    lea         rdx,[r10+r10*2]
-    mov         [rsp+20h],eax
-    mov         eax,[rcx+r10*2-2]
-    mov         [rsp+30h],eax
-    mov         eax,[rdx+rcx-2]
-    movdqa      xmm2,[rsp+20h]
-    mov         [rsp+40h],eax
-    mov         eax, [rdi-2]
-    movdqa      xmm4,[rsp+30h]
-    mov         [rsp+50h],eax
-    mov         eax,[r10+rdi-2]
-    movdqa      xmm3,[rsp+40h]
-    mov         [rsp+60h],eax
-    mov         eax,[rdi+r10*2-2]
-    punpckldq   xmm5,[rsp+50h]
-    mov         [rsp+70h],eax
-    mov         eax, [rdx+rdi-2]
-    punpckldq   xmm2, [rsp+60h]
-    mov          [rsp+80h],eax
-    mov         eax,[r11]
-    punpckldq   xmm4, [rsp+70h]
-    mov         [rsp+50h],eax
-    mov         eax,[rbx]
-    punpckldq   xmm3,[rsp+80h]
-    mov         [rsp+60h],eax
-    mov         eax,[r10+r11]
-    movdqa      xmm0, [rsp+50h]
-    punpckldq   xmm0, [rsp+60h]
-    punpcklqdq  xmm5,xmm0
-    movdqa      [rsp+50h],xmm0
-    mov         [rsp+50h],eax
-    mov         eax,[r10+rbx]
-    movdqa      xmm0,[rsp+50h]
-    movdqa      xmm1,xmm5
-    mov         [rsp+60h],eax
-    mov         eax,[r11+r10*2]
-    punpckldq   xmm0, [rsp+60h]
-    punpcklqdq  xmm2,xmm0
-    punpcklbw   xmm1,xmm2
-    punpckhbw   xmm5,xmm2
-    movdqa      [rsp+50h],xmm0
-    mov         [rsp+50h],eax
-    mov         eax,[rbx+r10*2]
-    movdqa      xmm0,[rsp+50h]
-    mov         [rsp+60h],eax
-    mov         eax, [rdx+r11]
-    movdqa      xmm15,xmm1
-    punpckldq   xmm0,[rsp+60h]
-    punpcklqdq  xmm4,xmm0
-    movdqa      [rsp+50h],xmm0
-    mov         [rsp+50h],eax
-    mov         eax, [rdx+rbx]
-    movdqa      xmm0,[rsp+50h]
-    mov         [rsp+60h],eax
-    punpckldq   xmm0, [rsp+60h]
-    punpcklqdq  xmm3,xmm0
-    movdqa      xmm0,xmm4
-    punpcklbw   xmm0,xmm3
-    punpckhbw   xmm4,xmm3
-    punpcklwd   xmm15,xmm0
-    punpckhwd   xmm1,xmm0
-    movdqa      xmm0,xmm5
-    movdqa      xmm12,xmm15
-    punpcklwd   xmm0,xmm4
-    punpckhwd   xmm5,xmm4
-    punpckldq   xmm12,xmm0
-    punpckhdq   xmm15,xmm0
-    movdqa      xmm0,xmm1
-    movdqa      xmm11,xmm12
-    punpckldq   xmm0,xmm5
-    punpckhdq   xmm1,xmm5
-    punpcklqdq  xmm11,xmm0
-    punpckhqdq  xmm12,xmm0
-    movsx       eax,r9w
-    movdqa      xmm14,xmm15
-    punpcklqdq  xmm14,xmm1
-    punpckhqdq  xmm15,xmm1
-    pxor        xmm1,xmm1
-    movd        xmm0,eax
-    movdqa      xmm4,xmm12
-    movdqa      xmm8,xmm11
-    mov         eax, ebp ; iBeta
-    punpcklwd   xmm0,xmm0
-    punpcklbw   xmm4,xmm1
-    punpckhbw   xmm12,xmm1
-    movdqa      xmm9,xmm14
-    movdqa      xmm7,xmm15
-    movdqa      xmm10,xmm15
-    pshufd      xmm13,xmm0,0
-    punpcklbw   xmm9,xmm1
-    punpckhbw   xmm14,xmm1
-    movdqa      xmm6,xmm13
-    movd        xmm0,eax
-    movdqa      [rsp],xmm11
-    mov         eax,2
-    cwde
-    punpckhbw   xmm11,xmm1
-    punpckhbw   xmm10,xmm1
-    punpcklbw   xmm7,xmm1
-    punpcklwd   xmm0,xmm0
-    punpcklbw   xmm8,xmm1
-    pshufd      xmm3,xmm0,0
-    movdqa      xmm1,xmm8
-    movdqa      xmm0,xmm4
-    psubw       xmm0,xmm9
-    psubw       xmm1,xmm4
-    movdqa      xmm2,xmm3
-    pabsw       xmm0,xmm0
-    pcmpgtw     xmm6,xmm0
-    pabsw       xmm0,xmm1
-    movdqa      xmm1,xmm3
-    pcmpgtw     xmm2,xmm0
-    pand        xmm6,xmm2
-    movdqa      xmm0,xmm7
-    movdqa      xmm2,xmm3
-    psubw       xmm0,xmm9
-    pabsw       xmm0,xmm0
-    pcmpgtw     xmm1,xmm0
-    pand        xmm6,xmm1
-    movdqa      xmm0,xmm12
-    movdqa      xmm1,xmm11
-    psubw       xmm0,xmm14
-    psubw       xmm1,xmm12
-    movdqa      xmm5,xmm6
-    pabsw       xmm0,xmm0
-    pcmpgtw     xmm13,xmm0
-    pabsw       xmm0,xmm1
-    movdqa      xmm1,xmm8
-    pcmpgtw     xmm2,xmm0
-    paddw       xmm1,xmm8
-    movdqa      xmm0,xmm10
-    pand        xmm13,xmm2
-    psubw       xmm0,xmm14
-    paddw       xmm1,xmm4
-    movdqa      xmm2,xmm11
-    pabsw       xmm0,xmm0
-    paddw       xmm2,xmm11
-    paddw       xmm1,xmm7
-    pcmpgtw     xmm3,xmm0
-    paddw       xmm2,xmm12
-    movd        xmm0,eax
-    pand        xmm13,xmm3
-    paddw       xmm2,xmm10
-    punpcklwd   xmm0,xmm0
-    pshufd      xmm3,xmm0,0
-    movdqa      xmm0,xmm6
-    paddw       xmm1,xmm3
-    pandn       xmm0,xmm4
-    paddw       xmm2,xmm3
-    psraw       xmm1,2
-    pand        xmm5,xmm1
-    por         xmm5,xmm0
-    paddw       xmm7,xmm7
-    paddw       xmm10,xmm10
-    psraw       xmm2,2
-    movdqa      xmm1,xmm13
-    movdqa      xmm0,xmm13
-    pandn       xmm0,xmm12
-    pand        xmm1,xmm2
-    paddw       xmm7,xmm9
-    por         xmm1,xmm0
-    paddw       xmm10,xmm14
-    paddw       xmm7,xmm8
-    movdqa      xmm0,xmm13
-    packuswb    xmm5,xmm1
-    paddw       xmm7,xmm3
-    paddw       xmm10,xmm11
-    movdqa      xmm1,xmm6
-    paddw       xmm10,xmm3
-    pandn       xmm6,xmm9
-    psraw       xmm7,2
-    pand        xmm1,xmm7
-    psraw       xmm10,2
-    pandn       xmm13,xmm14
-    pand        xmm0,xmm10
-    por         xmm1,xmm6
-    movdqa      xmm6,[rsp]
-    movdqa      xmm4,xmm6
-    por         xmm0,xmm13
-    punpcklbw   xmm4,xmm5
-    punpckhbw   xmm6,xmm5
-    movdqa      xmm3,xmm4
-    packuswb    xmm1,xmm0
-    movdqa      xmm0,xmm1
-    punpckhbw   xmm1,xmm15
-    punpcklbw   xmm0,xmm15
-    punpcklwd   xmm3,xmm0
-    punpckhwd   xmm4,xmm0
-    movdqa      xmm0,xmm6
-    movdqa      xmm2,xmm3
-    punpcklwd   xmm0,xmm1
-    punpckhwd   xmm6,xmm1
-    movdqa      xmm1,xmm4
-    punpckldq   xmm2,xmm0
-    punpckhdq   xmm3,xmm0
-    punpckldq   xmm1,xmm6
-    movdqa      xmm0,xmm2
-    punpcklqdq  xmm0,xmm1
-    punpckhdq   xmm4,xmm6
-    punpckhqdq  xmm2,xmm1
-    movdqa      [rsp+10h],xmm0
-    movdqa      [rsp+60h],xmm2
-    movdqa      xmm0,xmm3
-    mov         eax,[rsp+10h]
-    mov         [rcx-2],eax
-    mov         eax,[rsp+60h]
-    punpcklqdq  xmm0,xmm4
-    punpckhqdq  xmm3,xmm4
-    mov         [r10+rcx-2],eax
-    movdqa      [rsp+20h],xmm0
-    mov         eax, [rsp+20h]
-    movdqa      [rsp+70h],xmm3
-    mov         [rcx+r10*2-2],eax
-    mov         eax,[rsp+70h]
-    mov         [rdx+rcx-2],eax
-    mov         eax,[rsp+18h]
-    mov         [r11],eax
-    mov         eax,[rsp+68h]
-    mov         [r10+r11],eax
-    mov         eax,[rsp+28h]
-    mov         [r11+r10*2],eax
-    mov         eax,[rsp+78h]
-    mov         [rdx+r11],eax
-    mov         eax,[rsp+14h]
-    mov         [rdi-2],eax
-    mov         eax,[rsp+64h]
-    mov         [r10+rdi-2],eax
-    mov         eax,[rsp+24h]
-    mov         [rdi+r10*2-2],eax
-    mov         eax, [rsp+74h]
-    mov         [rdx+rdi-2],eax
-    mov         eax, [rsp+1Ch]
-    mov         [rbx],eax
-    mov         eax, [rsp+6Ch]
-    mov         [r10+rbx],eax
-    mov         eax,[rsp+2Ch]
-    mov         [rbx+r10*2],eax
-    mov         eax,[rsp+7Ch]
-    mov         [rdx+rbx],eax
-    lea         r11,[rsp+140h]
-    mov         rbx, [r11+28h]
-    mov         rsp,r11
-    pop         r12
-    pop         rbp
-    pop         rbx
-    ret
-
-
-
-%elifdef  X86_32
-
 ;***************************************************************************
 ;  void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
 ;          int32_t iAlpha, int32_t iBeta)
@@ -1190,284 +645,30 @@
 ;***************************************************************************
 
 WELS_EXTERN DeblockChromaEq4H_ssse3
-    push        ebp
-    mov         ebp,esp
-    and         esp,0FFFFFFF0h
-    sub         esp,0C8h
-    mov         ecx,dword [ebp+8]
-    mov         edx,dword [ebp+0Ch]
-    mov         eax,dword [ebp+10h]
-    sub         ecx,2
-    sub         edx,2
-    push        esi
-    lea         esi,[eax+eax*2]
-    mov         dword [esp+18h],ecx
-    mov         dword [esp+4],edx
-    lea         ecx,[ecx+eax*4]
-    lea         edx,[edx+eax*4]
-    lea         eax,[esp+7Ch]
-    push        edi
-    mov         dword [esp+14h],esi
-    mov         dword [esp+18h],ecx
-    mov         dword [esp+0Ch],edx
-    mov         dword [esp+10h],eax
-    mov         esi,dword [esp+1Ch]
-    mov         ecx,dword [ebp+10h]
-    mov         edx,dword [esp+14h]
-    movd        xmm0,dword [esi]
-    movd        xmm1,dword [esi+ecx]
-    movd        xmm2,dword [esi+ecx*2]
-    movd        xmm3,dword [esi+edx]
-    mov         esi,dword  [esp+8]
-    movd        xmm4,dword [esi]
-    movd        xmm5,dword [esi+ecx]
-    movd        xmm6,dword [esi+ecx*2]
-    movd        xmm7,dword [esi+edx]
-    punpckldq   xmm0,xmm4
-    punpckldq   xmm1,xmm5
-    punpckldq   xmm2,xmm6
-    punpckldq   xmm3,xmm7
-    mov         esi,dword [esp+18h]
-    mov         edi,dword [esp+0Ch]
-    movd        xmm4,dword [esi]
-    movd        xmm5,dword [edi]
-    punpckldq   xmm4,xmm5
-    punpcklqdq  xmm0,xmm4
-    movd        xmm4,dword [esi+ecx]
-    movd        xmm5,dword [edi+ecx]
-    punpckldq   xmm4,xmm5
-    punpcklqdq  xmm1,xmm4
-    movd        xmm4,dword [esi+ecx*2]
-    movd        xmm5,dword [edi+ecx*2]
-    punpckldq   xmm4,xmm5
-    punpcklqdq  xmm2,xmm4
-    movd        xmm4,dword [esi+edx]
-    movd        xmm5,dword [edi+edx]
-    punpckldq   xmm4,xmm5
-    punpcklqdq  xmm3,xmm4
-    movdqa      xmm6,xmm0
-    punpcklbw   xmm0,xmm1
-    punpckhbw   xmm6,xmm1
-    movdqa      xmm7,xmm2
-    punpcklbw   xmm2,xmm3
-    punpckhbw   xmm7,xmm3
-    movdqa      xmm4,xmm0
-    movdqa      xmm5,xmm6
-    punpcklwd   xmm0,xmm2
-    punpckhwd   xmm4,xmm2
-    punpcklwd   xmm6,xmm7
-    punpckhwd   xmm5,xmm7
-    movdqa      xmm1,xmm0
-    movdqa      xmm2,xmm4
-    punpckldq   xmm0,xmm6
-    punpckhdq   xmm1,xmm6
-    punpckldq   xmm4,xmm5
-    punpckhdq   xmm2,xmm5
-    movdqa      xmm5,xmm0
-    movdqa      xmm6,xmm1
-    punpcklqdq  xmm0,xmm4
-    punpckhqdq  xmm5,xmm4
-    punpcklqdq  xmm1,xmm2
-    punpckhqdq  xmm6,xmm2
-    mov         edi,dword [esp+10h]
-    movdqa      [edi],xmm0
-    movdqa      [edi+10h],xmm5
-    movdqa      [edi+20h],xmm1
-    movdqa      [edi+30h],xmm6
-    movsx       ecx,word [ebp+14h]
-    movsx       edx,word [ebp+18h]
-    movdqa      xmm6,[esp+80h]
-    movdqa      xmm4,[esp+90h]
-    movdqa      xmm5,[esp+0A0h]
-    movdqa      xmm7,[esp+0B0h]
-    pxor        xmm0,xmm0
-    movd        xmm1,ecx
-    movdqa      xmm2,xmm1
-    punpcklwd   xmm2,xmm1
-    pshufd      xmm1,xmm2,0
-    movd        xmm2,edx
-    movdqa      xmm3,xmm2
-    punpcklwd   xmm3,xmm2
-    pshufd      xmm2,xmm3,0
-    movdqa      xmm3,xmm6
-    punpckhbw   xmm6,xmm0
-    movdqa      [esp+60h],xmm6
-    movdqa      xmm6,[esp+90h]
-    punpckhbw   xmm6,xmm0
-    movdqa      [esp+30h],xmm6
-    movdqa      xmm6,[esp+0A0h]
-    punpckhbw   xmm6,xmm0
-    movdqa      [esp+40h],xmm6
-    movdqa      xmm6,[esp+0B0h]
-    punpckhbw   xmm6,xmm0
-    movdqa      [esp+70h],xmm6
-    punpcklbw   xmm7,xmm0
-    punpcklbw   xmm4,xmm0
-    punpcklbw   xmm5,xmm0
-    punpcklbw   xmm3,xmm0
-    movdqa      [esp+50h],xmm7
-    movdqa      xmm6,xmm4
-    psubw       xmm6,xmm5
-    pabsw       xmm6,xmm6
-    movdqa      xmm0,xmm1
-    pcmpgtw     xmm0,xmm6
-    movdqa      xmm6,xmm3
-    psubw       xmm6,xmm4
-    pabsw       xmm6,xmm6
-    movdqa      xmm7,xmm2
-    pcmpgtw     xmm7,xmm6
-    movdqa      xmm6,[esp+50h]
-    psubw       xmm6,xmm5
-    pabsw       xmm6,xmm6
-    pand        xmm0,xmm7
-    movdqa      xmm7,xmm2
-    pcmpgtw     xmm7,xmm6
-    movdqa      xmm6,[esp+30h]
-    psubw       xmm6,[esp+40h]
-    pabsw       xmm6,xmm6
-    pcmpgtw     xmm1,xmm6
-    movdqa      xmm6,[esp+60h]
-    psubw       xmm6,[esp+30h]
-    pabsw       xmm6,xmm6
-    pand        xmm0,xmm7
-    movdqa      xmm7,xmm2
-    pcmpgtw     xmm7,xmm6
-    movdqa      xmm6,[esp+70h]
-    psubw       xmm6,[esp+40h]
-    pabsw       xmm6,xmm6
-    pand        xmm1,xmm7
-    pcmpgtw     xmm2,xmm6
-    pand        xmm1,xmm2
-    mov         eax,2
-    movsx       ecx,ax
-    movd        xmm2,ecx
-    movdqa      xmm6,xmm2
-    punpcklwd   xmm6,xmm2
-    pshufd      xmm2,xmm6,0
-    movdqa      [esp+20h],xmm2
-    movdqa      xmm2,xmm3
-    paddw       xmm2,xmm3
-    paddw       xmm2,xmm4
-    paddw       xmm2,[esp+50h]
-    paddw       xmm2,[esp+20h]
-    psraw       xmm2,2
-    movdqa      xmm6,xmm0
-    pand        xmm6,xmm2
-    movdqa      xmm2,xmm0
-    pandn       xmm2,xmm4
-    por         xmm6,xmm2
-    movdqa      xmm2,[esp+60h]
-    movdqa      xmm7,xmm2
-    paddw       xmm7,xmm2
-    paddw       xmm7,[esp+30h]
-    paddw       xmm7,[esp+70h]
-    paddw       xmm7,[esp+20h]
-    movdqa      xmm4,xmm1
-    movdqa      xmm2,xmm1
-    pandn       xmm2,[esp+30h]
-    psraw       xmm7,2
-    pand        xmm4,xmm7
-    por         xmm4,xmm2
-    movdqa      xmm2,[esp+50h]
-    packuswb    xmm6,xmm4
-    movdqa      [esp+90h],xmm6
-    movdqa      xmm6,xmm2
-    paddw       xmm6,xmm2
-    movdqa      xmm2,[esp+20h]
-    paddw       xmm6,xmm5
-    paddw       xmm6,xmm3
-    movdqa      xmm4,xmm0
-    pandn       xmm0,xmm5
-    paddw       xmm6,xmm2
-    psraw       xmm6,2
-    pand        xmm4,xmm6
-    por         xmm4,xmm0
-    movdqa      xmm0,[esp+70h]
-    movdqa      xmm5,xmm0
-    paddw       xmm5,xmm0
-    movdqa      xmm0,[esp+40h]
-    paddw       xmm5,xmm0
-    paddw       xmm5,[esp+60h]
-    movdqa      xmm3,xmm1
-    paddw       xmm5,xmm2
-    psraw       xmm5,2
-    pand        xmm3,xmm5
-    pandn       xmm1,xmm0
-    por         xmm3,xmm1
-    packuswb    xmm4,xmm3
-    movdqa      [esp+0A0h],xmm4
-    mov         esi,dword [esp+10h]
-    movdqa      xmm0,[esi]
-    movdqa      xmm1,[esi+10h]
-    movdqa      xmm2,[esi+20h]
-    movdqa      xmm3,[esi+30h]
-    movdqa      xmm6,xmm0
-    punpcklbw   xmm0,xmm1
-    punpckhbw   xmm6,xmm1
-    movdqa      xmm7,xmm2
-    punpcklbw   xmm2,xmm3
-    punpckhbw   xmm7,xmm3
-    movdqa      xmm4,xmm0
-    movdqa      xmm5,xmm6
-    punpcklwd   xmm0,xmm2
-    punpckhwd   xmm4,xmm2
-    punpcklwd   xmm6,xmm7
-    punpckhwd   xmm5,xmm7
-    movdqa      xmm1,xmm0
-    movdqa      xmm2,xmm4
-    punpckldq   xmm0,xmm6
-    punpckhdq   xmm1,xmm6
-    punpckldq   xmm4,xmm5
-    punpckhdq   xmm2,xmm5
-    movdqa      xmm5,xmm0
-    movdqa      xmm6,xmm1
-    punpcklqdq  xmm0,xmm4
-    punpckhqdq  xmm5,xmm4
-    punpcklqdq  xmm1,xmm2
-    punpckhqdq  xmm6,xmm2
-    mov         esi,dword [esp+1Ch]
-    mov         ecx,dword [ebp+10h]
-    mov         edx,dword [esp+14h]
-    mov         edi,dword [esp+8]
-    movd        dword [esi],xmm0
-    movd        dword [esi+ecx],xmm5
-    movd        dword [esi+ecx*2],xmm1
-    movd        dword [esi+edx],xmm6
-    psrldq      xmm0,4
-    psrldq      xmm5,4
-    psrldq      xmm1,4
-    psrldq      xmm6,4
-    mov         esi,dword [esp+18h]
-    movd        dword [edi],xmm0
-    movd        dword [edi+ecx],xmm5
-    movd        dword [edi+ecx*2],xmm1
-    movd        dword [edi+edx],xmm6
-    psrldq      xmm0,4
-    psrldq      xmm5,4
-    psrldq      xmm1,4
-    psrldq      xmm6,4
-    movd        dword [esi],xmm0
-    movd        dword [esi+ecx],xmm5
-    movd        dword [esi+ecx*2],xmm1
-    movd        dword [esi+edx],xmm6
-    psrldq      xmm0,4
-    psrldq      xmm5,4
-    psrldq      xmm1,4
-    psrldq      xmm6,4
-    mov         edi,dword [esp+0Ch]
-    movd        dword [edi],xmm0
-    movd        dword [edi+ecx],xmm5
-    movd        dword [edi+ecx*2],xmm1
-    movd        dword [edi+edx],xmm6
-    pop         edi
-    pop         esi
-    mov         esp,ebp
-    pop         ebp
-    ret
+    %assign push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r2, r2d                      ; r2 = iStride (presumably widened to pointer size; confirm against the macro)
+    movd       xmm7, arg4d                      ; iAlpha in the low dword
+    pxor       xmm0, xmm0                       ; all-zero pshufb control
+    pshufb     xmm7, xmm0                       ; iAlpha: byte 0 replicated into every lane
+    lea        r3, [3 * r2 - 1]                 ; 3 * iStride - 1 (r3 is reused; iAlpha was read from arg4d above)

-
+    SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6 ; registers used below as p1=xmm0 p0=xmm1 q0=xmm4 q1=xmm5
+    SSSE3_DeblockChromaEq4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, xmm2, xmm3, xmm6 ; filtered p0 -> xmm1, filtered q0 -> xmm4
+%ifdef X86_32
+    push r4                                     ; r4/r5 are passed to the store macro; saved around it on X86_32 only
+    push r5
+    SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0 ; presumably writes p0 (xmm1) and q0 (xmm4) back; confirm against the macro
+    pop r5
+    pop r4
+%else
+    SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0 ; same store; r4/r5 are not saved on this path
 %endif

+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
 
 
 ;********************************************************************************