shithub: openh264

ref: 9909c306f1172661c2b09e4bee2428fd953e868a
parent: 040974f7355a2829ada1fe7451bead94eaa6aec1
author: Sindre Aamås <[email protected]>
date: Thu Feb 25 10:57:20 EST 2016

[Common/x86] DeblockChromaLt4H_ssse3 optimizations

Use packed 8-bit operations rather than unpacking to 16-bit.

~5.72x speedup on Haswell (x86-64).
~1.85x speedup on Haswell (x86 32-bit).
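
For reference, the per-pixel update being vectorized 16 lanes at a time is the one documented in the new SSE2_DeblockP0Q0_Lt4 macro below. A scalar C sketch of it (function and helper names here are illustrative, not taken from the codec):

    #include <stdint.h>

    static int clip3(int lo, int hi, int v) { return v < lo ? lo : v > hi ? hi : v; }

    /* bS < 4 update of one p0/q0 pair: the delta is clamped to [-iTc, iTc],
     * the results to [0, 255]. */
    static void deblock_lt4_pair(uint8_t *p0, uint8_t *q0,
                                 uint8_t p1, uint8_t q1, int iTc)
    {
        int d = (*q0 - *p0 + ((p1 - q1) >> 2) + 1) >> 1;
        d = clip3(-iTc, iTc, d);
        *p0 = (uint8_t)clip3(0, 255, *p0 + d);
        *q0 = (uint8_t)clip3(0, 255, *q0 - d);
    }

The old code widened the samples to 16-bit words before doing this arithmetic; the new code keeps them as packed unsigned bytes and leans on pavgb/psubusb/paddusb/pminub, which is where the speedup comes from.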

--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -113,6 +113,7 @@
 %define r1w  dx
 %define r2w  r8w
 %define r3w  r9w
+%define r4w  ax
 %define r6w  r11w
 
 %define r0b  cl
@@ -182,6 +183,7 @@
 %define r1w  si
 %define r2w  dx
 %define r3w  cx
+%define r4w  r8w
 %define r6w  r10w
 
 %define r0b  dil
@@ -249,6 +251,7 @@
 %define r1w cx
 %define r2w dx
 %define r3w bx
+%define r4w si
 %define r6w bp
 
 %define r0b al
--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -130,7 +130,45 @@
     por      %1, %3
 %endmacro
 
+; Compute
+; p0 = clip(p0 + clip((q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1, -iTc, iTc), 0, 255)
+; q0 = clip(q0 - clip((q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1, -iTc, iTc), 0, 255)
+; 16-wide parallel in packed byte representation in xmm registers.
+;
+; p1=%1 p0=%2 q0=%3 q1=%4 iTc=%5 FFh=%6 xmmclobber=%7,%8
+%macro SSE2_DeblockP0Q0_Lt4 8
+    ; (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1 clipped to [-96, 159] and biased to [0, 255].
+    ; A limited range is sufficient because the value is clipped to [-iTc, iTc] later.
+    ; Bias so that unsigned saturation can be used.
+    ; Get ((p1 - q1) >> 2) + 192 via a pxor and two pavgbs.
+    ; q0 - p0 is split into a non-negative and non-positive part. The latter is
+    ; subtracted from the biased value.
+    movdqa     %7, %2
+    psubusb    %7, %3  ; clip(p0 - q0, 0, 255)
+    ; ((p1 - q1) >> 2) + 0xc0
+    pxor       %4, %6  ; q1 ^ 0xff aka -q1 - 1 & 0xff
+    pavgb      %1, %4  ; (((p1 - q1 + 0x100) >> 1)
+    pavgb      %1, %6  ;  + 0x100) >> 1
+    psubusb    %1, %7  ; -= clip(p0 - q0, 0, 255) saturate.
+    movdqa     %8, %3
+    psubusb    %8, %2  ; (clip(q0 - p0, 0, 255)
+    pavgb      %8, %1  ;  + clip(((p1 - q1 + 0x300) >> 2) - clip(p0 - q0, 0, 255), 0, 255) + 1) >> 1
 
+    ; Unbias and split into a non-negative and a non-positive part.
+    ; Clip each part to iTc via minub.
+    ; Add/subtract each part to/from p0/q0 and clip.
+    movdqa     %6, [WELS_DB96_16]
+    psubusb    %6, %8
+    psubusb    %8, [WELS_DB96_16]
+    pminub     %6, %5
+    pminub     %8, %5
+    psubusb    %2, %6
+    paddusb    %2, %8  ; p0
+    paddusb    %3, %6
+    psubusb    %3, %8  ; q0
+%endmacro
+
+
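
To make the biasing trick above concrete, here is a one-lane C model of SSE2_DeblockP0Q0_Lt4, with pavgb/psubusb/paddusb/pminub written out as scalar helpers (a sketch for illustration; the names are invented, this is not code from the repository):

    #include <stdint.h>

    static uint8_t avgb (uint8_t a, uint8_t b) { return (uint8_t)((a + b + 1) >> 1); }                  /* pavgb   */
    static uint8_t subus(uint8_t a, uint8_t b) { return a > b ? (uint8_t)(a - b) : 0; }                 /* psubusb */
    static uint8_t addus(uint8_t a, uint8_t b) { int s = a + b; return (uint8_t)(s > 255 ? 255 : s); }  /* paddusb */
    static uint8_t minub(uint8_t a, uint8_t b) { return a < b ? a : b; }                                /* pminub  */

    static void lt4_p0q0_lane(uint8_t p1, uint8_t *p0, uint8_t *q0, uint8_t q1, uint8_t iTc)
    {
        uint8_t t = avgb(avgb(p1, (uint8_t)(q1 ^ 0xff)), 0xff); /* ((p1 - q1) >> 2) + 0xc0             */
        t = subus(t, subus(*p0, *q0));                          /* -= clip(p0 - q0, 0, 255), saturated */
        uint8_t biased = avgb(subus(*q0, *p0), t);              /* delta clipped to [-96, 159], + 96   */
        uint8_t dneg = minub(subus(96, biased), iTc);           /* max(-delta, 0), clipped to iTc      */
        uint8_t dpos = minub(subus(biased, 96), iTc);           /* max(delta, 0), clipped to iTc       */
        *p0 = addus(subus(*p0, dneg), dpos);                    /* p0 + clip(delta, -iTc, iTc)         */
        *q0 = subus(addus(*q0, dneg), dpos);                    /* q0 - clip(delta, -iTc, iTc)         */
    }

The constant 96 plays the role of [WELS_DB96_16] in the macro; delta is the (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1 term from the comment above.
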
 ;*******************************************************************************
 ;    void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
 ;                                 int32_t iBeta, int8_t * pTC)
@@ -208,38 +246,11 @@
     SSE2_ClipUB xmm1, xmm6, xmm5, xmm7  ; clip q1.
     MOVDQ    [r0 + 1 * r2], xmm1        ; store q1.
 
-    ; (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1 clipped to [-96, 159] and biased to [0, 255].
-    ; A limited range is sufficient because the value is clipped to [-iTc, iTc] later.
-    ; Bias so that unsigned saturation can be used.
-    ; Get ((p1 - q1) >> 2) + 192 via a pxor and two pavgbs.
-    ; q0 - p0 is split into a non-negative and non-positive part. The latter is
-    ; subtracted from the biased value.
-    MOVDQ    xmm1, [r3 + 0 * r1] ; p0
-    MOVDQ    xmm0, [r0 + 0 * r2] ; q0
-    movdqa   xmm7, xmm1
-    psubusb  xmm7, xmm0  ; clip(p0 - q0, 0, 255)
-    ; ((p1 - q1) >> 2) + 0xc0
-    pxor     xmm6, xmm2  ; q1 ^ 0xff aka -q1 - 1 & 0xff
-    pavgb    xmm4, xmm6  ; (((p1 - q1 + 0x100) >> 1)
-    pavgb    xmm4, xmm2  ;  + 0x100) >> 1
-    psubusb  xmm4, xmm7  ; -= clip(p0 - q0, 0, 255) saturate.
-    psubusb  xmm0, xmm1  ; (clip(q0 - p0, 0, 255)
-    pavgb    xmm0, xmm4  ;  + clip(((p1 - q1 + 0x300) >> 2) - clip(p0 - q0, 0, 255), 0, 255) + 1) >> 1
-
-    ; Unbias and split into a non-negative and a non-positive part.
-    ; Clip each part to iTc via minub.
-    ; Add/subtract each part to/from p0/q0 and clip.
-    movdqa   xmm6, [WELS_DB96_16]
-    psubusb  xmm6, xmm0
-    psubusb  xmm0, [WELS_DB96_16]
-    pminub   xmm6, xmm3
-    pminub   xmm0, xmm3
-    psubusb  xmm1, xmm6
-    paddusb  xmm1, xmm0
-    paddusb  xmm6, [r0 + 0 * r2]
-    psubusb  xmm6, xmm0
+    MOVDQ    xmm1, [r3 + 0 * r1]  ; p0
+    MOVDQ    xmm0, [r0 + 0 * r2]  ; q0
+    SSE2_DeblockP0Q0_Lt4 xmm4, xmm1, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7
     MOVDQ    [r3 + 0 * r1], xmm1  ; store p0.
-    MOVDQ    [r0 + 0 * r2], xmm6  ; store q0.
+    MOVDQ    [r0 + 0 * r2], xmm0  ; store q0.
 
     POP_XMM
     LOAD_5_PARA_POP
@@ -375,6 +386,130 @@
     ret
 
 
+; [out:p1,p0,q0,q1]=%1,%2,%3,%4 pPixCb=%5 pPixCr=%6 iStride=%7 3*iStride-1=%8 xmmclobber=%9,%10,%11
+%macro SSE2_LoadCbCr_4x16H 11
+    movd       %1,  [%5 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cb line 0
+    movd       %2,  [%5 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cb line 2
+    punpcklbw  %1,  %2                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 0,2
+    movd       %2,  [%5 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cb line 4
+    movd       %9,  [%5 + 2 * %8]      ; [p1,p0,q0,q1] cb line 6
+    punpcklbw  %2,  %9                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 4,6
+    punpcklwd  %1,  %2                 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cb line 0,2,4,6
+    movd       %2,  [%6 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cr line 0
+    movd       %9,  [%6 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cr line 2
+    punpcklbw  %2,  %9                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 0,2
+    movd       %9,  [%6 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cr line 4
+    movd       %10, [%6 + 2 * %8]      ; [p1,p0,q0,q1] cr line 6
+    punpcklbw  %9,  %10                ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 4,6
+    punpcklwd  %2,  %9                 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cr line 0,2,4,6
+    add        %5,  %7                 ; pPixCb += iStride
+    add        %6,  %7                 ; pPixCr += iStride
+    movd       %9,  [%5 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cb line 1
+    movd       %10, [%5 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cb line 3
+    punpcklbw  %9,  %10                ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 1,3
+    movd       %10, [%5 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cb line 5
+    movd       %3,  [%5 + 2 * %8]      ; [p1,p0,q0,q1] cb line 7
+    punpcklbw  %10, %3                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 5,7
+    punpcklwd  %9,  %10                ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cb line 1,3,5,7
+    movd       %10, [%6 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cr line 1
+    movd       %3,  [%6 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cr line 3
+    punpcklbw  %10, %3                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 1,3
+    movd       %3,  [%6 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cr line 5
+    movd       %4,  [%6 + 2 * %8]      ; [p1,p0,q0,q1] cr line 7
+    punpcklbw  %3,  %4                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 5,7
+    punpcklwd  %10, %3                 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cr line 1,3,5,7
+    movdqa     %3,  %1
+    punpckldq  %1,  %2                 ; [p1,p1,p1,p1,p1,p1,p1,p1,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 0,2,4,6
+    punpckhdq  %3,  %2                 ; [q0,q0,q0,q0,q0,q0,q0,q0,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 0,2,4,6
+    movdqa     %11, %9
+    punpckldq  %9,  %10                ; [p1,p1,p1,p1,p1,p1,p1,p1,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 1,3,5,7
+    punpckhdq  %11, %10                ; [q0,q0,q0,q0,q0,q0,q0,q0,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 1,3,5,7
+    movdqa     %2,  %1
+    punpcklqdq %1,  %9                 ; [p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1] cb/cr line 0,2,4,6,1,3,5,7
+    punpckhqdq %2,  %9                 ; [p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 0,2,4,6,1,3,5,7
+    movdqa     %4,  %3
+    punpcklqdq %3,  %11                ; [q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0] cb/cr line 0,2,4,6,1,3,5,7
+    punpckhqdq %4,  %11                ; [q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 0,2,4,6,1,3,5,7
+%endmacro
+
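
A compact way to read the shuffle sequence above: after SSE2_LoadCbCr_4x16H, lane k of the four outputs holds the p1/p0/q0/q1 samples of one particular plane and row. A C model of that mapping (illustrative only; the arrays stand in for the xmm outputs):

    #include <stdint.h>

    static void load_cbcr_4x16h_model(const uint8_t *pPixCb, const uint8_t *pPixCr,
                                      int iStride,
                                      uint8_t p1[16], uint8_t p0[16],
                                      uint8_t q0[16], uint8_t q1[16])
    {
        /* lanes 0-3:  cb rows 0,2,4,6   lanes 4-7:   cr rows 0,2,4,6
           lanes 8-11: cb rows 1,3,5,7   lanes 12-15: cr rows 1,3,5,7 */
        static const int rows[16] = {0,2,4,6, 0,2,4,6, 1,3,5,7, 1,3,5,7};
        for (int k = 0; k < 16; k++) {
            const uint8_t *s = ((k & 4) ? pPixCr : pPixCb) + rows[k] * iStride;
            p1[k] = s[-2]; p0[k] = s[-1]; q0[k] = s[0]; q1[k] = s[1];
        }
    }

Packing both chroma planes and all eight rows into the 16 lanes is what lets a single filter pass cover the whole edge.
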
+; pPixCb+iStride=%1 pPixCr+iStride=%2 iStride=%3 3*iStride-1=%4 p0=%5 q0=%6 rclobber=%7 dwclobber={%8,%9} xmmclobber=%10
+%macro SSE2_StoreCbCr_4x16H 10
+    movdqa     %10, %5
+    punpcklbw  %10, %6                 ; [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 0,2,4,6
+    punpckhbw  %5, %6                  ; [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 1,3,5,7
+    mov        %7, r7                  ; preserve stack pointer
+    and        r7, -16                 ; align stack pointer
+    sub        r7, 32                  ; allocate stack space
+    movdqa     [r7     ], %10          ; store [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 0,2,4,6 on the stack
+    movdqa     [r7 + 16], %5           ; store [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 1,3,5,7 on the stack
+    mov        %8, [r7 + 16]           ; [p0,q0,p0,q0] cb line 1,3
+    mov        [%1 + 0 * %3 - 1], %9   ; store [p0,q0] cb line 1
+    shr        %8, 16                  ; [p0,q0] cb line 3
+    mov        [%1 + 2 * %3 - 1], %9   ; store [p0,q0] cb line 3
+    mov        %8, [r7 + 20]           ; [p0,q0,p0,q0] cb line 5,7
+    mov        [%1 + 4 * %3 - 1], %9   ; store [p0,q0] cb line 5
+    shr        %8, 16                  ; [p0,q0] cb line 7
+    mov        [%1 + 2 * %4 + 1], %9   ; store [p0,q0] cb line 7
+    mov        %8, [r7 + 24]           ; [p0,q0,p0,q0] cr line 1,3
+    mov        [%2 + 0 * %3 - 1], %9   ; store [p0,q0] cr line 1
+    shr        %8, 16                  ; [p0,q0] cr line 3
+    mov        [%2 + 2 * %3 - 1], %9   ; store [p0,q0] cr line 3
+    mov        %8, [r7 + 28]           ; [p0,q0,p0,q0] cr line 5,7
+    mov        [%2 + 4 * %3 - 1], %9   ; store [p0,q0] cr line 5
+    shr        %8, 16                  ; [p0,q0] cr line 7
+    mov        [%2 + 2 * %4 + 1], %9   ; store [p0,q0] cr line 7
+    sub        %1, %3                  ; pPixCb -= iStride
+    sub        %2, %3                  ; pPixCr -= iStride
+    mov        %8, [r7     ]           ; [p0,q0,p0,q0] cb line 0,2
+    mov        [%1 + 0 * %3 - 1], %9   ; store [p0,q0] cb line 0
+    shr        %8, 16                  ; [p0,q0] cb line 2
+    mov        [%1 + 2 * %3 - 1], %9   ; store [p0,q0] cb line 2
+    mov        %8, [r7 +  4]           ; [p0,q0,p0,q0] cb line 4,6
+    mov        [%1 + 4 * %3 - 1], %9   ; store [p0,q0] cb line 4
+    shr        %8, 16                  ; [p0,q0] cb line 6
+    mov        [%1 + 2 * %4 + 1], %9   ; store [p0,q0] cb line 6
+    mov        %8, [r7 +  8]           ; [p0,q0,p0,q0] cr line 0,2
+    mov        [%2 + 0 * %3 - 1], %9   ; store [p0,q0] cr line 0
+    shr        %8, 16                  ; [p0,q0] cr line 2
+    mov        [%2 + 2 * %3 - 1], %9   ; store [p0,q0] cr line 2
+    mov        %8, [r7 + 12]           ; [p0,q0,p0,q0] cr line 4,6
+    mov        [%2 + 4 * %3 - 1], %9   ; store [p0,q0] cr line 4
+    shr        %8, 16                  ; [p0,q0] cr line 6
+    mov        [%2 + 2 * %4 + 1], %9   ; store [p0,q0] cr line 6
+    mov        r7, %7                  ; restore stack pointer
+%endmacro
+
+; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 pTC=%7 xmmclobber=%8,%9,%10 interleaveTC=%11
+%macro SSSE3_DeblockChromaLt4 11
+    movdqa     %8, %3
+    SSE2_AbsDiffUB %8, %2, %9           ; |p0 - q0|
+    SSE2_CmpgeUB %8, %5                 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
+    movdqa     %9, %4
+    SSE2_AbsDiffUB %9, %3, %5           ; |q1 - q0|
+    movdqa     %10, %1
+    SSE2_AbsDiffUB %10, %2, %5          ; |p1 - p0|
+    pmaxub     %9, %10                  ; max(|q1 - q0|, |p1 - p0|)
+    pxor       %10, %10
+    movd       %5, %6
+    pshufb     %5, %10                  ; iBeta
+    SSE2_CmpgeUB %9, %5                 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
+    por        %8, %9                   ; | !bDeltaP0Q0
+    movd       %5, [%7]
+%if %11
+    punpckldq  %5, %5
+    punpcklbw  %5, %5                   ; iTc
+%else
+    pshufd     %5, %5, 0                ; iTc
+%endif
+    pcmpeqw    %10, %10                 ; FFh
+    movdqa     %9, %5
+    pcmpgtb    %9, %10                  ; iTc > -1 ? FFh : 00h
+    pandn      %8, %5                   ; iTc & bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
+    pand       %8, %9                   ; &= (iTc > -1 ? FFh : 00h)
+    SSE2_DeblockP0Q0_Lt4 %1, %2, %3, %4, %8, %10, %5, %9
+%endmacro
+
+
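
At the lane level, the gating that SSSE3_DeblockChromaLt4 performs before invoking SSE2_DeblockP0Q0_Lt4 amounts to the following (scalar sketch; the bDelta names mirror the comments above, the function name is made up):

    #include <stdint.h>
    #include <stdlib.h>  /* abs */

    /* Lanes that fail the alpha/beta tests, or whose tc entry is negative,
     * end up with an effective iTc of 0, so clip(delta, -iTc, iTc) leaves
     * their p0/q0 untouched. */
    static uint8_t lt4_effective_tc(uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
                                    int iAlpha, int iBeta, int8_t tc)
    {
        int bDeltaP0Q0 = abs(p0 - q0) < iAlpha;
        int bDeltaP1P0 = abs(p1 - p0) < iBeta;
        int bDeltaQ1Q0 = abs(q1 - q0) < iBeta;
        if (!(bDeltaP0Q0 && bDeltaP1P0 && bDeltaQ1Q0)) return 0;  /* pandn with the OR of the inverted tests */
        if (tc < 0) return 0;                                     /* pand with the (iTc > -1) mask           */
        return (uint8_t)tc;
    }

The interleaveTC flag only changes how the four pTC bytes are replicated across the 16 lanes, so that each lane picks up the tc entry matching the lane order produced by the caller.
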
 ;******************************************************************************
 ; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
 ;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
@@ -395,73 +530,18 @@
     movhps   xmm0, [r1 + 0 * r2]              ; q0 cr
     movq     xmm2, [r0 + 1 * r3]              ; p0 cb
     movhps   xmm2, [r1 + 1 * r3]              ; p0 cr
-
-    movdqa   xmm4, xmm0
-    SSE2_AbsDiffUB xmm4, xmm2, xmm5           ; |p0 - q0|
-    SSE2_CmpgeUB xmm4, xmm7                   ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
-
     movq     xmm1, [r0 + 1 * r2]              ; q1 cb
     movhps   xmm1, [r1 + 1 * r2]              ; q1 cr
     movq     xmm3, [r0 + 2 * r3]              ; p1 cb
     movhps   xmm3, [r1 + 2 * r3]              ; p1 cr
 
-    movdqa   xmm5, xmm1
-    SSE2_AbsDiffUB xmm5, xmm0, xmm7           ; |q1 - q0|
-    movdqa   xmm6, xmm3
-    SSE2_AbsDiffUB xmm6, xmm2, xmm7           ; |p1 - p0|
-    pmaxub   xmm5, xmm6                       ; max(|q1 - q0|, |p1 - p0|)
-
-    pxor     xmm6, xmm6
-    movd     xmm7, arg5d
-    pshufb   xmm7, xmm6                       ; iBeta
-
-    SSE2_CmpgeUB xmm5, xmm7                   ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
-    por      xmm4, xmm5                       ; | !bDeltaP0Q0
-
 %ifidni arg6, r5
-    movd     xmm7, [arg6]
+    SSSE3_DeblockChromaLt4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, arg6, xmm4, xmm5, xmm6, 1
 %else
     mov      r2, arg6
-    movd     xmm7, [r2]
+    SSSE3_DeblockChromaLt4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, r2,   xmm4, xmm5, xmm6, 1
 %endif
-    punpckldq xmm7, xmm7
-    punpcklbw xmm7, xmm7                      ; iTc
-    pcmpeqw  xmm6, xmm6                       ; FFh
-    movdqa   xmm5, xmm7
-    pcmpgtb  xmm5, xmm6                       ; iTc > -1 ? FFh : FFh
-    pandn    xmm4, xmm7                       ; iTc & bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
-    pand     xmm4, xmm5                       ; &= (iTc > -1 ? FFh : 00h)
 
-    ; (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1 clipped to [-96, 159] and biased to [0, 255].
-    ; A limited range is sufficient because the value is clipped to [-iTc, iTc] later.
-    ; Bias so that unsigned saturation can be used.
-    ; Get ((p1 - q1) >> 2) + 192 via a pxor and two pavgbs.
-    ; q0 - p0 is split into a non-negative and non-positive part. The latter is
-    ; subtracted from the biased value.
-    movdqa   xmm7, xmm2
-    psubusb  xmm7, xmm0  ; clip(p0 - q0, 0, 255)
-    ; ((p1 - q1) >> 2) + 0xc0
-    pxor     xmm1, xmm6  ; q1 ^ 0xff aka -q1 - 1 & 0xff
-    pavgb    xmm3, xmm1  ; (((p1 - q1 + 0x100) >> 1)
-    pavgb    xmm3, xmm6  ;  + 0x100) >> 1
-    psubusb  xmm3, xmm7  ; -= clip(p0 - q0, 0, 255) saturate.
-    movdqa   xmm5, xmm0
-    psubusb  xmm5, xmm2  ; (clip(q0 - p0, 0, 255)
-    pavgb    xmm5, xmm3  ;  + clip(((p1 - q1 + 0x300) >> 2) - clip(p0 - q0, 0, 255), 0, 255) + 1) >> 1
-
-    ; Unbias and split into a non-negative and a non-positive part.
-    ; Clip each part to iTc via minub.
-    ; Add/subtract each part to/from p0/q0 and clip.
-    movdqa   xmm6, [WELS_DB96_16]
-    psubusb  xmm6, xmm5
-    psubusb  xmm5, [WELS_DB96_16]
-    pminub   xmm6, xmm4
-    pminub   xmm5, xmm4
-    psubusb  xmm2, xmm6
-    paddusb  xmm2, xmm5
-    paddusb  xmm0, xmm6
-    psubusb  xmm0, xmm5
-
     movlps   [r0 + 1 * r3], xmm2              ; store p0 cb
     movhps   [r1 + 1 * r3], xmm2              ; store p0 cr
     movlps   [r0         ], xmm0              ; store q0 cb
@@ -536,6 +616,30 @@
     ret
 
 
+;*******************************************************************************
+;    void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+
+WELS_EXTERN DeblockChromaLt4H_ssse3
+    %assign push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r2, r2d
+    movd       xmm7, arg4d
+    pxor       xmm0, xmm0
+    pshufb     xmm7, xmm0                       ; iAlpha
+    lea        r3, [3 * r2 - 1]                 ; 3 * iStride - 1
+
+    SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6
+    SSSE3_DeblockChromaLt4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, r5, xmm2, xmm3, xmm6, 0
+    SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
+
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+
+
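
Putting the three macros together, a scalar sketch of the work the new function does per chroma plane (illustrative only, not the codec's reference code; the SIMD version handles Cb and Cr and all eight rows in one 16-lane pass):

    #include <stdint.h>
    #include <stdlib.h>  /* abs */

    /* Horizontal bS < 4 chroma edge: per row, [p1,p0,q0,q1] sit at
     * pPix - 2 .. pPix + 1, one pTC entry covers two rows, and a negative
     * entry disables filtering for those rows. */
    static void deblock_chroma_lt4_h_plane(uint8_t *pPix, int iStride,
                                           int iAlpha, int iBeta, const int8_t *pTC)
    {
        for (int row = 0; row < 8; row++, pPix += iStride) {
            int p1 = pPix[-2], p0 = pPix[-1], q0 = pPix[0], q1 = pPix[1];
            int tc = pTC[row >> 1];
            if (tc < 0) continue;
            if (abs(p0 - q0) >= iAlpha) continue;
            if (abs(p1 - p0) >= iBeta || abs(q1 - q0) >= iBeta) continue;
            int d = (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1;
            if (d < -tc) d = -tc; else if (d > tc) d = tc;
            p0 += d; q0 -= d;
            pPix[-1] = (uint8_t)(p0 < 0 ? 0 : p0 > 255 ? 255 : p0);
            pPix[0]  = (uint8_t)(q0 < 0 ? 0 : q0 > 255 ? 255 : q0);
        }
    }

SSE2_LoadCbCr_4x16H gathers and transposes those 4-byte groups from both planes, SSSE3_DeblockChromaLt4 applies the masking and the p0/q0 update, and SSE2_StoreCbCr_4x16H writes only the [p0,q0] byte pairs back, two bytes per row, through a small aligned stack buffer.
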
 %ifdef  WIN64
 
 
@@ -802,289 +906,6 @@
 
 
 
-WELS_EXTERN DeblockChromaLt4H_ssse3
-    mov         rax,rsp
-    push        rbx
-    push        rbp
-    push        rsi
-    push        rdi
-    push        r12
-    PUSH_XMM 16
-    sub         rsp,170h
-
-    movsxd      rsi,r8d
-    lea         eax,[r8*4]
-    mov         r11d,r9d
-    movsxd      r10,eax
-    mov         eax, [rcx-2]
-    mov         r12,rdx
-    mov         [rsp+40h],eax
-    mov         eax, [rsi+rcx-2]
-    lea         rbx,[r10+rcx-2]
-    movdqa      xmm5,[rsp+40h]
-    mov         [rsp+50h],eax
-    mov         eax, [rcx+rsi*2-2]
-    lea         rbp,[r10+rdx-2]
-    movdqa      xmm2, [rsp+50h]
-    mov         [rsp+60h],eax
-    lea         r10,[rsi+rsi*2]
-    mov         rdi,rcx
-    mov         eax,[r10+rcx-2]
-    movdqa      xmm4,[rsp+60h]
-    mov         [rsp+70h],eax
-    mov         eax,[rdx-2]
-    mov         [rsp+80h],eax
-    mov         eax, [rsi+rdx-2]
-    movdqa      xmm3,[rsp+70h]
-    mov         [rsp+90h],eax
-    mov         eax,[rdx+rsi*2-2]
-    punpckldq   xmm5,[rsp+80h]
-    mov         [rsp+0A0h],eax
-    mov         eax, [r10+rdx-2]
-    punpckldq   xmm2,[rsp+90h]
-    mov         [rsp+0B0h],eax
-    mov         eax, [rbx]
-    punpckldq   xmm4,[rsp+0A0h]
-    mov         [rsp+80h],eax
-    mov         eax,[rbp]
-    punpckldq   xmm3,[rsp+0B0h]
-    mov         [rsp+90h],eax
-    mov         eax,[rsi+rbx]
-    movdqa      xmm0,[rsp+80h]
-    punpckldq   xmm0,[rsp+90h]
-    punpcklqdq  xmm5,xmm0
-    movdqa      [rsp+80h],xmm0
-    mov         [rsp+80h],eax
-    mov         eax,[rsi+rbp]
-    movdqa      xmm0,[rsp+80h]
-    movdqa      xmm1,xmm5
-    mov         [rsp+90h],eax
-    mov         eax,[rbx+rsi*2]
-    punpckldq   xmm0,[rsp+90h]
-    punpcklqdq  xmm2,xmm0
-    punpcklbw   xmm1,xmm2
-    punpckhbw   xmm5,xmm2
-    movdqa      [rsp+80h],xmm0
-    mov         [rsp+80h],eax
-    mov         eax,[rbp+rsi*2]
-    movdqa      xmm0, [rsp+80h]
-    mov         [rsp+90h],eax
-    mov         eax,[r10+rbx]
-    movdqa      xmm7,xmm1
-    punpckldq   xmm0,[rsp+90h]
-    punpcklqdq  xmm4,xmm0
-    movdqa      [rsp+80h],xmm0
-    mov         [rsp+80h],eax
-    mov         eax, [r10+rbp]
-    movdqa      xmm0,[rsp+80h]
-    mov         [rsp+90h],eax
-    punpckldq   xmm0,[rsp+90h]
-    punpcklqdq  xmm3,xmm0
-    movdqa      xmm0,xmm4
-    punpcklbw   xmm0,xmm3
-    punpckhbw   xmm4,xmm3
-    punpcklwd   xmm7,xmm0
-    punpckhwd   xmm1,xmm0
-    movdqa      xmm0,xmm5
-    movdqa      xmm6,xmm7
-    punpcklwd   xmm0,xmm4
-    punpckhwd   xmm5,xmm4
-    punpckldq   xmm6,xmm0
-    punpckhdq   xmm7,xmm0
-    movdqa      xmm0,xmm1
-    punpckldq   xmm0,xmm5
-    mov         rax, [rsp+1C8h+160]    ; pTC
-    punpckhdq   xmm1,xmm5
-    movdqa      xmm9,xmm6
-    punpckhqdq  xmm6,xmm0
-    punpcklqdq  xmm9,xmm0
-    movdqa      xmm2,xmm7
-    movdqa      xmm13,xmm6
-    movdqa      xmm4,xmm9
-    movdqa      [rsp+10h],xmm9
-    punpcklqdq  xmm2,xmm1
-    punpckhqdq  xmm7,xmm1
-    pxor        xmm1,xmm1
-    movsx       ecx,byte [rax+3]
-    movsx       edx,byte [rax+2]
-    movsx       r8d,byte [rax+1]
-    movsx       r9d,byte [rax]
-    movdqa      xmm10,xmm1
-    movdqa      xmm15,xmm2
-    punpckhbw   xmm2,xmm1
-    punpckhbw   xmm6,xmm1
-    punpcklbw   xmm4,xmm1
-    movsx       eax,r11w
-    mov         word [rsp+0Eh],cx
-    mov         word [rsp+0Ch],cx
-    movdqa      xmm3,xmm7
-    movdqa      xmm8,xmm7
-    movdqa      [rsp+20h],xmm7
-    punpcklbw   xmm15,xmm1
-    punpcklbw   xmm13,xmm1
-    punpcklbw   xmm3,xmm1
-    mov         word [rsp+0Ah],dx
-    mov         word [rsp+8],dx
-    mov         word [rsp+6],r8w
-    movd        xmm0,eax
-    movdqa      [rsp+30h],xmm6
-    punpckhbw   xmm9,xmm1
-    punpckhbw   xmm8,xmm1
-    punpcklwd   xmm0,xmm0
-    movsx       eax,word [rsp+1C0h+160]   ; iBeta
-    mov         word [rsp+4],r8w
-    mov         word [rsp+2],r9w
-    pshufd      xmm12,xmm0,0
-    mov         word [rsp],r9w
-    movd        xmm0,eax
-    mov         eax,4
-    cwde
-    movdqa      xmm14, [rsp]
-    movdqa      [rsp],xmm2
-    movdqa      xmm2,xmm12
-    punpcklwd   xmm0,xmm0
-    pshufd      xmm11,xmm0,0
-    psubw       xmm10,xmm14
-    movd        xmm0,eax
-    movdqa      xmm7,xmm14
-    movdqa      xmm6,xmm14
-    pcmpgtw     xmm7,xmm1
-    punpcklwd   xmm0,xmm0
-    pshufd      xmm5,xmm0,0
-    movdqa      xmm0,xmm4
-    movdqa      xmm1,xmm15
-    psubw       xmm4,xmm13
-    psubw       xmm0,xmm3
-    psubw       xmm1,xmm13
-    psubw       xmm3,xmm15
-    psllw       xmm1,2
-    paddw       xmm1,xmm0
-    paddw       xmm1,xmm5
-    movdqa      xmm0,xmm10
-    psraw       xmm1,3
-    pmaxsw      xmm0,xmm1
-    pminsw      xmm6,xmm0
-    movdqa      xmm1,xmm11
-    movdqa      xmm0,xmm13
-    psubw       xmm0,xmm15
-    pabsw       xmm0,xmm0
-    pcmpgtw     xmm2,xmm0
-    pabsw       xmm0,xmm4
-    pcmpgtw     xmm1,xmm0
-    pabsw       xmm0,xmm3
-    pand        xmm2,xmm1
-    movdqa      xmm1,xmm11
-    movdqa      xmm3,[rsp+30h]
-    pcmpgtw     xmm1,xmm0
-    movdqa      xmm0,xmm9
-    pand        xmm2,xmm1
-    psubw       xmm0,xmm8
-    psubw       xmm9,xmm3
-    pand        xmm2,xmm7
-    pand        xmm6,xmm2
-    psubw       xmm15,xmm6
-    paddw       xmm13,xmm6
-    movdqa      xmm2,[rsp]
-    movdqa      xmm1,xmm2
-    psubw       xmm1,xmm3
-    psubw       xmm8,xmm2
-    psllw       xmm1,2
-    paddw       xmm1,xmm0
-    paddw       xmm1,xmm5
-    movdqa      xmm0,xmm3
-    movdqa      xmm5,[rsp+10h]
-    psubw       xmm0,xmm2
-    psraw       xmm1,3
-    movdqa      xmm4,xmm5
-    pabsw       xmm0,xmm0
-    pmaxsw      xmm10,xmm1
-    movdqa      xmm1,xmm11
-    pcmpgtw     xmm12,xmm0
-    pabsw       xmm0,xmm9
-    pminsw      xmm14,xmm10
-    pcmpgtw     xmm1,xmm0
-    pabsw       xmm0,xmm8
-    pcmpgtw     xmm11,xmm0
-    pand        xmm12,xmm1
-    movdqa      xmm1,[rsp+20h]
-    pand        xmm12,xmm11
-    pand        xmm12,xmm7
-    pand        xmm14,xmm12
-    paddw       xmm3,xmm14
-    psubw       xmm2,xmm14
-    packuswb    xmm13,xmm3
-    packuswb    xmm15,xmm2
-    punpcklbw   xmm4,xmm13
-    punpckhbw   xmm5,xmm13
-    movdqa      xmm0,xmm15
-    punpcklbw   xmm0,xmm1
-    punpckhbw   xmm15,xmm1
-    movdqa      xmm3,xmm4
-    punpcklwd   xmm3,xmm0
-    punpckhwd   xmm4,xmm0
-    movdqa      xmm0,xmm5
-    movdqa      xmm2,xmm3
-    movdqa      xmm1,xmm4
-    punpcklwd   xmm0,xmm15
-    punpckhwd   xmm5,xmm15
-    punpckldq   xmm2,xmm0
-    punpckhdq   xmm3,xmm0
-    punpckldq   xmm1,xmm5
-    movdqa      xmm0,xmm2
-    punpcklqdq  xmm0,xmm1
-    punpckhdq   xmm4,xmm5
-    punpckhqdq  xmm2,xmm1
-    movdqa      [rsp+40h],xmm0
-    movdqa      xmm0,xmm3
-    movdqa      [rsp+90h],xmm2
-    mov         eax,[rsp+40h]
-    mov         [rdi-2],eax
-    mov         eax, [rsp+90h]
-    punpcklqdq  xmm0,xmm4
-    punpckhqdq  xmm3,xmm4
-    mov         [rsi+rdi-2],eax
-    movdqa      [rsp+50h],xmm0
-    mov         eax,[rsp+50h]
-    movdqa      [rsp+0A0h],xmm3
-    mov         [rdi+rsi*2-2],eax
-    mov         eax,[rsp+0A0h]
-    mov         [r10+rdi-2],eax
-    mov         eax,[rsp+48h]
-    mov         [rbx],eax
-    mov         eax,[rsp+98h]
-    mov         [rsi+rbx],eax
-    mov         eax,[rsp+58h]
-    mov         [rbx+rsi*2],eax
-    mov         eax, [rsp+0A8h]
-    mov         [r10+rbx],eax
-    mov         eax, [rsp+44h]
-    mov         [r12-2],eax
-    mov         eax,[rsp+94h]
-    mov         [rsi+r12-2],eax
-    mov         eax,[rsp+54h]
-    mov         [r12+rsi*2-2],eax
-    mov         eax, [rsp+0A4h]
-    mov         [r10+r12-2],eax
-    mov         eax,[rsp+4Ch]
-    mov         [rbp],eax
-    mov         eax,[rsp+9Ch]
-    mov         [rsi+rbp],eax
-    mov         eax, [rsp+5Ch]
-    mov         [rbp+rsi*2],eax
-    mov         eax,[rsp+0ACh]
-    mov         [r10+rbp],eax
-    lea         r11,[rsp+170h]
-    mov         rsp,r11
-    POP_XMM
-    pop         r12
-    pop         rdi
-    pop         rsi
-    pop         rbp
-    pop         rbx
-    ret
-
-
-
 %elifdef  UNIX64
 
 
@@ -1360,294 +1181,7 @@
     ret
 
 
-WELS_EXTERN DeblockChromaLt4H_ssse3
-    mov         rax,rsp
-    push        rbx
-    push        rbp
-    push        r12
-    push        r13
-    push        r14
-    sub         rsp,170h
 
-    mov         r13,   r8
-    mov         r14,   r9
-    mov         r8,    rdx
-    mov         r9,    rcx
-    mov         rdx,   rdi
-    mov         rcx,   rsi
-
-    movsxd      rsi,r8d
-    lea         eax,[r8*4]
-    mov         r11d,r9d
-    movsxd      r10,eax
-    mov         eax, [rcx-2]
-    mov         r12,rdx
-    mov         [rsp+40h],eax
-    mov         eax, [rsi+rcx-2]
-    lea         rbx,[r10+rcx-2]
-    movdqa      xmm5,[rsp+40h]
-    mov         [rsp+50h],eax
-    mov         eax, [rcx+rsi*2-2]
-    lea         rbp,[r10+rdx-2]
-    movdqa      xmm2, [rsp+50h]
-    mov         [rsp+60h],eax
-    lea         r10,[rsi+rsi*2]
-    mov         rdi,rcx
-    mov         eax,[r10+rcx-2]
-    movdqa      xmm4,[rsp+60h]
-    mov         [rsp+70h],eax
-    mov         eax,[rdx-2]
-    mov         [rsp+80h],eax
-    mov         eax, [rsi+rdx-2]
-    movdqa      xmm3,[rsp+70h]
-    mov         [rsp+90h],eax
-    mov         eax,[rdx+rsi*2-2]
-    punpckldq   xmm5,[rsp+80h]
-    mov         [rsp+0A0h],eax
-    mov         eax, [r10+rdx-2]
-    punpckldq   xmm2,[rsp+90h]
-    mov         [rsp+0B0h],eax
-    mov         eax, [rbx]
-    punpckldq   xmm4,[rsp+0A0h]
-    mov         [rsp+80h],eax
-    mov         eax,[rbp]
-    punpckldq   xmm3,[rsp+0B0h]
-    mov         [rsp+90h],eax
-    mov         eax,[rsi+rbx]
-    movdqa      xmm0,[rsp+80h]
-    punpckldq   xmm0,[rsp+90h]
-    punpcklqdq  xmm5,xmm0
-    movdqa      [rsp+80h],xmm0
-    mov         [rsp+80h],eax
-    mov         eax,[rsi+rbp]
-    movdqa      xmm0,[rsp+80h]
-    movdqa      xmm1,xmm5
-    mov         [rsp+90h],eax
-    mov         eax,[rbx+rsi*2]
-    punpckldq   xmm0,[rsp+90h]
-    punpcklqdq  xmm2,xmm0
-    punpcklbw   xmm1,xmm2
-    punpckhbw   xmm5,xmm2
-    movdqa      [rsp+80h],xmm0
-    mov         [rsp+80h],eax
-    mov         eax,[rbp+rsi*2]
-    movdqa      xmm0, [rsp+80h]
-    mov         [rsp+90h],eax
-    mov         eax,[r10+rbx]
-    movdqa      xmm7,xmm1
-    punpckldq   xmm0,[rsp+90h]
-    punpcklqdq  xmm4,xmm0
-    movdqa      [rsp+80h],xmm0
-    mov         [rsp+80h],eax
-    mov         eax, [r10+rbp]
-    movdqa      xmm0,[rsp+80h]
-    mov         [rsp+90h],eax
-    punpckldq   xmm0,[rsp+90h]
-    punpcklqdq  xmm3,xmm0
-    movdqa      xmm0,xmm4
-    punpcklbw   xmm0,xmm3
-    punpckhbw   xmm4,xmm3
-    punpcklwd   xmm7,xmm0
-    punpckhwd   xmm1,xmm0
-    movdqa      xmm0,xmm5
-    movdqa      xmm6,xmm7
-    punpcklwd   xmm0,xmm4
-    punpckhwd   xmm5,xmm4
-    punpckldq   xmm6,xmm0
-    punpckhdq   xmm7,xmm0
-    movdqa      xmm0,xmm1
-    punpckldq   xmm0,xmm5
-    mov         rax, r14    ; pTC
-    punpckhdq   xmm1,xmm5
-    movdqa      xmm9,xmm6
-    punpckhqdq  xmm6,xmm0
-    punpcklqdq  xmm9,xmm0
-    movdqa      xmm2,xmm7
-    movdqa      xmm13,xmm6
-    movdqa      xmm4,xmm9
-    movdqa      [rsp+10h],xmm9
-    punpcklqdq  xmm2,xmm1
-    punpckhqdq  xmm7,xmm1
-    pxor        xmm1,xmm1
-    movsx       ecx,byte [rax+3]
-    movsx       edx,byte [rax+2]
-    movsx       r8d,byte [rax+1]
-    movsx       r9d,byte [rax]
-    movdqa      xmm10,xmm1
-    movdqa      xmm15,xmm2
-    punpckhbw   xmm2,xmm1
-    punpckhbw   xmm6,xmm1
-    punpcklbw   xmm4,xmm1
-    movsx       eax,r11w
-    mov         word [rsp+0Eh],cx
-    mov         word [rsp+0Ch],cx
-    movdqa      xmm3,xmm7
-    movdqa      xmm8,xmm7
-    movdqa      [rsp+20h],xmm7
-    punpcklbw   xmm15,xmm1
-    punpcklbw   xmm13,xmm1
-    punpcklbw   xmm3,xmm1
-    mov         word [rsp+0Ah],dx
-    mov         word [rsp+8],dx
-    mov         word [rsp+6],r8w
-    movd        xmm0,eax
-    movdqa      [rsp+30h],xmm6
-    punpckhbw   xmm9,xmm1
-    punpckhbw   xmm8,xmm1
-    punpcklwd   xmm0,xmm0
-    mov         eax, r13d   ; iBeta
-    mov         word [rsp+4],r8w
-    mov         word [rsp+2],r9w
-    pshufd      xmm12,xmm0,0
-    mov         word [rsp],r9w
-    movd        xmm0,eax
-    mov         eax,4
-    cwde
-    movdqa      xmm14, [rsp]
-    movdqa      [rsp],xmm2
-    movdqa      xmm2,xmm12
-    punpcklwd   xmm0,xmm0
-    pshufd      xmm11,xmm0,0
-    psubw       xmm10,xmm14
-    movd        xmm0,eax
-    movdqa      xmm7,xmm14
-    movdqa      xmm6,xmm14
-    pcmpgtw     xmm7,xmm1
-    punpcklwd   xmm0,xmm0
-    pshufd      xmm5,xmm0,0
-    movdqa      xmm0,xmm4
-    movdqa      xmm1,xmm15
-    psubw       xmm4,xmm13
-    psubw       xmm0,xmm3
-    psubw       xmm1,xmm13
-    psubw       xmm3,xmm15
-    psllw       xmm1,2
-    paddw       xmm1,xmm0
-    paddw       xmm1,xmm5
-    movdqa      xmm0,xmm10
-    psraw       xmm1,3
-    pmaxsw      xmm0,xmm1
-    pminsw      xmm6,xmm0
-    movdqa      xmm1,xmm11
-    movdqa      xmm0,xmm13
-    psubw       xmm0,xmm15
-    pabsw       xmm0,xmm0
-    pcmpgtw     xmm2,xmm0
-    pabsw       xmm0,xmm4
-    pcmpgtw     xmm1,xmm0
-    pabsw       xmm0,xmm3
-    pand        xmm2,xmm1
-    movdqa      xmm1,xmm11
-    movdqa      xmm3,[rsp+30h]
-    pcmpgtw     xmm1,xmm0
-    movdqa      xmm0,xmm9
-    pand        xmm2,xmm1
-    psubw       xmm0,xmm8
-    psubw       xmm9,xmm3
-    pand        xmm2,xmm7
-    pand        xmm6,xmm2
-    psubw       xmm15,xmm6
-    paddw       xmm13,xmm6
-    movdqa      xmm2,[rsp]
-    movdqa      xmm1,xmm2
-    psubw       xmm1,xmm3
-    psubw       xmm8,xmm2
-    psllw       xmm1,2
-    paddw       xmm1,xmm0
-    paddw       xmm1,xmm5
-    movdqa      xmm0,xmm3
-    movdqa      xmm5,[rsp+10h]
-    psubw       xmm0,xmm2
-    psraw       xmm1,3
-    movdqa      xmm4,xmm5
-    pabsw       xmm0,xmm0
-    pmaxsw      xmm10,xmm1
-    movdqa      xmm1,xmm11
-    pcmpgtw     xmm12,xmm0
-    pabsw       xmm0,xmm9
-    pminsw      xmm14,xmm10
-    pcmpgtw     xmm1,xmm0
-    pabsw       xmm0,xmm8
-    pcmpgtw     xmm11,xmm0
-    pand        xmm12,xmm1
-    movdqa      xmm1,[rsp+20h]
-    pand        xmm12,xmm11
-    pand        xmm12,xmm7
-    pand        xmm14,xmm12
-    paddw       xmm3,xmm14
-    psubw       xmm2,xmm14
-    packuswb    xmm13,xmm3
-    packuswb    xmm15,xmm2
-    punpcklbw   xmm4,xmm13
-    punpckhbw   xmm5,xmm13
-    movdqa      xmm0,xmm15
-    punpcklbw   xmm0,xmm1
-    punpckhbw   xmm15,xmm1
-    movdqa      xmm3,xmm4
-    punpcklwd   xmm3,xmm0
-    punpckhwd   xmm4,xmm0
-    movdqa      xmm0,xmm5
-    movdqa      xmm2,xmm3
-    movdqa      xmm1,xmm4
-    punpcklwd   xmm0,xmm15
-    punpckhwd   xmm5,xmm15
-    punpckldq   xmm2,xmm0
-    punpckhdq   xmm3,xmm0
-    punpckldq   xmm1,xmm5
-    movdqa      xmm0,xmm2
-    punpcklqdq  xmm0,xmm1
-    punpckhdq   xmm4,xmm5
-    punpckhqdq  xmm2,xmm1
-    movdqa      [rsp+40h],xmm0
-    movdqa      xmm0,xmm3
-    movdqa      [rsp+90h],xmm2
-    mov         eax,[rsp+40h]
-    mov         [rdi-2],eax
-    mov         eax, [rsp+90h]
-    punpcklqdq  xmm0,xmm4
-    punpckhqdq  xmm3,xmm4
-    mov         [rsi+rdi-2],eax
-    movdqa      [rsp+50h],xmm0
-    mov         eax,[rsp+50h]
-    movdqa      [rsp+0A0h],xmm3
-    mov         [rdi+rsi*2-2],eax
-    mov         eax,[rsp+0A0h]
-    mov         [r10+rdi-2],eax
-    mov         eax,[rsp+48h]
-    mov         [rbx],eax
-    mov         eax,[rsp+98h]
-    mov         [rsi+rbx],eax
-    mov         eax,[rsp+58h]
-    mov         [rbx+rsi*2],eax
-    mov         eax, [rsp+0A8h]
-    mov         [r10+rbx],eax
-    mov         eax, [rsp+44h]
-    mov         [r12-2],eax
-    mov         eax,[rsp+94h]
-    mov         [rsi+r12-2],eax
-    mov         eax,[rsp+54h]
-    mov         [r12+rsi*2-2],eax
-    mov         eax, [rsp+0A4h]
-    mov         [r10+r12-2],eax
-    mov         eax,[rsp+4Ch]
-    mov         [rbp],eax
-    mov         eax,[rsp+9Ch]
-    mov         [rsi+rbp],eax
-    mov         eax, [rsp+5Ch]
-    mov         [rbp+rsi*2],eax
-    mov         eax,[rsp+0ACh]
-    mov         [r10+rbp],eax
-    lea         r11,[rsp+170h]
-    mov         rsp,r11
-    pop         r14
-    pop         r13
-    pop         r12
-    pop         rbp
-    pop         rbx
-    ret
-
-
-
 %elifdef  X86_32
 
 ;***************************************************************************
@@ -1921,315 +1455,6 @@
     psrldq      xmm1,4
     psrldq      xmm6,4
     mov         edi,dword [esp+0Ch]
-    movd        dword [edi],xmm0
-    movd        dword [edi+ecx],xmm5
-    movd        dword [edi+ecx*2],xmm1
-    movd        dword [edi+edx],xmm6
-    pop         edi
-    pop         esi
-    mov         esp,ebp
-    pop         ebp
-    ret
-
-;*******************************************************************************
-;    void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN DeblockChromaLt4H_ssse3
-    push        ebp
-    mov         ebp,esp
-    and         esp,0FFFFFFF0h
-    sub         esp,108h
-    mov         ecx,dword [ebp+8]
-    mov         edx,dword [ebp+0Ch]
-    mov         eax,dword [ebp+10h]
-    sub         ecx,2
-    sub         edx,2
-    push        esi
-    lea         esi,[eax+eax*2]
-    mov         dword [esp+10h],ecx
-    mov         dword [esp+4],edx
-    lea         ecx,[ecx+eax*4]
-    lea         edx,[edx+eax*4]
-    lea         eax,[esp+6Ch]
-    push        edi
-    mov         dword [esp+0Ch],esi
-    mov         dword [esp+18h],ecx
-    mov         dword [esp+10h],edx
-    mov         dword [esp+1Ch],eax
-    mov         esi,dword [esp+14h]
-    mov         ecx,dword [ebp+10h]
-    mov         edx,dword [esp+0Ch]
-    movd        xmm0,dword [esi]
-    movd        xmm1,dword [esi+ecx]
-    movd        xmm2,dword [esi+ecx*2]
-    movd        xmm3,dword [esi+edx]
-    mov         esi,dword [esp+8]
-    movd        xmm4,dword [esi]
-    movd        xmm5,dword [esi+ecx]
-    movd        xmm6,dword [esi+ecx*2]
-    movd        xmm7,dword [esi+edx]
-    punpckldq   xmm0,xmm4
-    punpckldq   xmm1,xmm5
-    punpckldq   xmm2,xmm6
-    punpckldq   xmm3,xmm7
-    mov         esi,dword [esp+18h]
-    mov         edi,dword [esp+10h]
-    movd        xmm4,dword [esi]
-    movd        xmm5,dword [edi]
-    punpckldq   xmm4,xmm5
-    punpcklqdq  xmm0,xmm4
-    movd        xmm4,dword [esi+ecx]
-    movd        xmm5,dword [edi+ecx]
-    punpckldq   xmm4,xmm5
-    punpcklqdq  xmm1,xmm4
-    movd        xmm4,dword [esi+ecx*2]
-    movd        xmm5,dword [edi+ecx*2]
-    punpckldq   xmm4,xmm5
-    punpcklqdq  xmm2,xmm4
-    movd        xmm4,dword [esi+edx]
-    movd        xmm5,dword [edi+edx]
-    punpckldq   xmm4,xmm5
-    punpcklqdq  xmm3,xmm4
-    movdqa      xmm6,xmm0
-    punpcklbw   xmm0,xmm1
-    punpckhbw   xmm6,xmm1
-    movdqa      xmm7,xmm2
-    punpcklbw   xmm2,xmm3
-    punpckhbw   xmm7,xmm3
-    movdqa      xmm4,xmm0
-    movdqa      xmm5,xmm6
-    punpcklwd   xmm0,xmm2
-    punpckhwd   xmm4,xmm2
-    punpcklwd   xmm6,xmm7
-    punpckhwd   xmm5,xmm7
-    movdqa      xmm1,xmm0
-    movdqa      xmm2,xmm4
-    punpckldq   xmm0,xmm6
-    punpckhdq   xmm1,xmm6
-    punpckldq   xmm4,xmm5
-    punpckhdq   xmm2,xmm5
-    movdqa      xmm5,xmm0
-    movdqa      xmm6,xmm1
-    punpcklqdq  xmm0,xmm4
-    punpckhqdq  xmm5,xmm4
-    punpcklqdq  xmm1,xmm2
-    punpckhqdq  xmm6,xmm2
-    mov         edi,dword [esp+1Ch]
-    movdqa      [edi],xmm0
-    movdqa      [edi+10h],xmm5
-    movdqa      [edi+20h],xmm1
-    movdqa      [edi+30h],xmm6
-    mov         eax,dword [ebp+1Ch]
-    movsx       cx,byte [eax+3]
-    movsx       dx,byte [eax+2]
-    movsx       si,byte [eax+1]
-    movsx       ax,byte [eax]
-    movzx       edi,cx
-    movzx       ecx,cx
-    movd        xmm2,ecx
-    movzx       ecx,dx
-    movzx       edx,dx
-    movd        xmm3,ecx
-    movd        xmm4,edx
-    movzx       ecx,si
-    movzx       edx,si
-    movd        xmm5,ecx
-    pxor        xmm0,xmm0
-    movd        xmm6,edx
-    movzx       ecx,ax
-    movdqa      [esp+60h],xmm0
-    movzx       edx,ax
-    movsx       eax,word [ebp+14h]
-    punpcklwd   xmm6,xmm2
-    movd        xmm1,edi
-    movd        xmm7,ecx
-    movsx       ecx,word [ebp+18h]
-    movd        xmm0,edx
-    punpcklwd   xmm7,xmm3
-    punpcklwd   xmm5,xmm1
-    movdqa      xmm1,[esp+60h]
-    punpcklwd   xmm7,xmm5
-    movdqa      xmm5,[esp+0A0h]
-    punpcklwd   xmm0,xmm4
-    punpcklwd   xmm0,xmm6
-    movdqa      xmm6, [esp+70h]
-    punpcklwd   xmm0,xmm7
-    movdqa      xmm7,[esp+80h]
-    movdqa      xmm2,xmm1
-    psubw       xmm2,xmm0
-    movdqa      [esp+0D0h],xmm2
-    movd        xmm2,eax
-    movdqa      xmm3,xmm2
-    punpcklwd   xmm3,xmm2
-    pshufd      xmm4,xmm3,0
-    movd        xmm2,ecx
-    movdqa      xmm3,xmm2
-    punpcklwd   xmm3,xmm2
-    pshufd      xmm2,xmm3,0
-    movdqa      xmm3, [esp+90h]
-    movdqa      [esp+50h],xmm2
-    movdqa      xmm2,xmm6
-    punpcklbw   xmm2,xmm1
-    punpckhbw   xmm6,xmm1
-    movdqa      [esp+40h],xmm2
-    movdqa      [esp+0B0h],xmm6
-    movdqa      xmm6,[esp+90h]
-    movdqa      xmm2,xmm7
-    punpckhbw   xmm7,xmm1
-    punpckhbw   xmm6,xmm1
-    punpcklbw   xmm2,xmm1
-    punpcklbw   xmm3,xmm1
-    punpcklbw   xmm5,xmm1
-    movdqa      [esp+0F0h],xmm7
-    movdqa      [esp+0C0h],xmm6
-    movdqa      xmm6, [esp+0A0h]
-    punpckhbw   xmm6,xmm1
-    movdqa      [esp+0E0h],xmm6
-    mov         edx,4
-    movsx       eax,dx
-    movd        xmm6,eax
-    movdqa      xmm7,xmm6
-    punpcklwd   xmm7,xmm6
-    pshufd      xmm6,xmm7,0
-    movdqa      [esp+30h],xmm6
-    movdqa      xmm7, [esp+40h]
-    psubw       xmm7,xmm5
-    movdqa      xmm6,xmm0
-    pcmpgtw     xmm6,xmm1
-    movdqa      [esp+60h],xmm6
-    movdqa      xmm1, [esp+0D0h]
-    movdqa      xmm6,xmm3
-    psubw       xmm6,xmm2
-    psllw       xmm6,2
-    paddw       xmm6,xmm7
-    paddw       xmm6,[esp+30h]
-    psraw       xmm6,3
-    pmaxsw      xmm1,xmm6
-    movdqa      xmm7,[esp+50h]
-    movdqa      [esp+20h],xmm0
-    movdqa      xmm6, [esp+20h]
-    pminsw      xmm6,xmm1
-    movdqa      [esp+20h],xmm6
-    movdqa      xmm6,xmm4
-    movdqa      xmm1,xmm2
-    psubw       xmm1,xmm3
-    pabsw       xmm1,xmm1
-    pcmpgtw     xmm6,xmm1
-    movdqa      xmm1, [esp+40h]
-    psubw       xmm1,xmm2
-    pabsw       xmm1,xmm1
-    pcmpgtw     xmm7,xmm1
-    movdqa      xmm1, [esp+50h]
-    pand        xmm6,xmm7
-    movdqa      xmm7, [esp+50h]
-    psubw       xmm5,xmm3
-    pabsw       xmm5,xmm5
-    pcmpgtw     xmm1,xmm5
-    movdqa      xmm5, [esp+0B0h]
-    psubw       xmm5,[esp+0E0h]
-    pand        xmm6,xmm1
-    pand        xmm6, [esp+60h]
-    movdqa      xmm1, [esp+20h]
-    pand        xmm1,xmm6
-    movdqa      xmm6, [esp+0C0h]
-    movdqa      [esp+40h],xmm1
-    movdqa      xmm1, [esp+0F0h]
-    psubw       xmm6,xmm1
-    psllw       xmm6,2
-    paddw       xmm6,xmm5
-    paddw       xmm6, [esp+30h]
-    movdqa      xmm5, [esp+0D0h]
-    psraw       xmm6,3
-    pmaxsw      xmm5,xmm6
-    pminsw      xmm0,xmm5
-    movdqa      xmm5,[esp+0C0h]
-    movdqa      xmm6,xmm1
-    psubw       xmm6,xmm5
-    pabsw       xmm6,xmm6
-    pcmpgtw     xmm4,xmm6
-    movdqa      xmm6,[esp+0B0h]
-    psubw       xmm6,xmm1
-    pabsw       xmm6,xmm6
-    pcmpgtw     xmm7,xmm6
-    movdqa      xmm6, [esp+0E0h]
-    pand        xmm4,xmm7
-    movdqa      xmm7, [esp+50h]
-    psubw       xmm6,xmm5
-    pabsw       xmm6,xmm6
-    pcmpgtw     xmm7,xmm6
-    pand        xmm4,xmm7
-    pand        xmm4,[esp+60h]
-    pand        xmm0,xmm4
-    movdqa      xmm4, [esp+40h]
-    paddw       xmm2,xmm4
-    paddw       xmm1,xmm0
-    psubw       xmm3,xmm4
-    psubw       xmm5,xmm0
-    packuswb    xmm2,xmm1
-    packuswb    xmm3,xmm5
-    movdqa      [esp+80h],xmm2
-    movdqa      [esp+90h],xmm3
-    mov         esi,dword [esp+1Ch]
-    movdqa      xmm0, [esi]
-    movdqa      xmm1, [esi+10h]
-    movdqa      xmm2, [esi+20h]
-    movdqa      xmm3, [esi+30h]
-    movdqa      xmm6,xmm0
-    punpcklbw   xmm0,xmm1
-    punpckhbw   xmm6,xmm1
-    movdqa      xmm7,xmm2
-    punpcklbw   xmm2,xmm3
-    punpckhbw   xmm7,xmm3
-    movdqa      xmm4,xmm0
-    movdqa      xmm5,xmm6
-    punpcklwd   xmm0,xmm2
-    punpckhwd   xmm4,xmm2
-    punpcklwd   xmm6,xmm7
-    punpckhwd   xmm5,xmm7
-    movdqa      xmm1,xmm0
-    movdqa      xmm2,xmm4
-    punpckldq   xmm0,xmm6
-    punpckhdq   xmm1,xmm6
-    punpckldq   xmm4,xmm5
-    punpckhdq   xmm2,xmm5
-    movdqa      xmm5,xmm0
-    movdqa      xmm6,xmm1
-    punpcklqdq  xmm0,xmm4
-    punpckhqdq  xmm5,xmm4
-    punpcklqdq  xmm1,xmm2
-    punpckhqdq  xmm6,xmm2
-    mov         esi,dword [esp+14h]
-    mov         ecx,dword [ebp+10h]
-    mov         edx,dword [esp+0Ch]
-    mov         edi,dword [esp+8]
-    movd        dword [esi],xmm0
-    movd        dword [esi+ecx],xmm5
-    movd        dword [esi+ecx*2],xmm1
-    movd        dword [esi+edx],xmm6
-    psrldq      xmm0,4
-    psrldq      xmm5,4
-    psrldq      xmm1,4
-    psrldq      xmm6,4
-    mov         esi,dword [esp+18h]
-    movd        dword [edi],xmm0
-    movd        dword [edi+ecx],xmm5
-    movd        dword [edi+ecx*2],xmm1
-    movd        dword [edi+edx],xmm6
-    psrldq      xmm0,4
-    psrldq      xmm5,4
-    psrldq      xmm1,4
-    psrldq      xmm6,4
-    movd        dword [esi],xmm0
-    movd        dword [esi+ecx],xmm5
-    movd        dword [esi+ecx*2],xmm1
-    movd        dword [esi+edx],xmm6
-    psrldq      xmm0,4
-    psrldq      xmm5,4
-    psrldq      xmm1,4
-    psrldq      xmm6,4
-    mov         edi,dword [esp+10h]
     movd        dword [edi],xmm0
     movd        dword [edi+ecx],xmm5
     movd        dword [edi+ecx*2],xmm1