ref: a009153741ef3b6a789a546c33a814c002d75fb5
parent: 9909c306f1172661c2b09e4bee2428fd953e868a
author: Sindre Aamås <[email protected]>
date: Thu Feb 25 11:00:26 EST 2016
[Common/x86] DeblockChromaEq4H_ssse3 optimizations. Use packed 8-bit operations rather than unpack to 16-bit. ~5.80x speedup on Haswell (x86-64). ~1.69x speedup on Haswell (x86 32-bit).
--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -509,7 +509,33 @@
SSE2_DeblockP0Q0_Lt4 %1, %2, %3, %4, %8, %10, %5, %9
%endmacro
+; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 xmmclobber=%7,%8,%9 ; results are left in %2 (p0out) and %3 (q0out); %1 (p1) and %5 (iAlpha) are overwritten as well; %6 is a 32-bit operand read with movd
+%macro SSSE3_DeblockChromaEq4 9
+ movdqa %7, %3 ; working copy of q0 so %3 stays intact for the final blend
+ SSE2_AbsDiffUB %7, %2, %8 ; |p0 - q0|
+ SSE2_CmpgeUB %7, %5 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
+ movdqa %8, %4 ; working copy of q1
+ SSE2_AbsDiffUB %8, %3, %5 ; |q1 - q0| (iAlpha in %5 is not needed past this point; %5 is passed as the helper's third operand, presumably scratch)
+ movdqa %9, %1 ; working copy of p1
+ SSE2_AbsDiffUB %9, %2, %5 ; |p1 - p0|
+ pmaxub %8, %9 ; max(|q1 - q0|, |p1 - p0|)
+ pxor %9, %9 ; all-zero shuffle control for the pshufb below
+ movd %5, %6 ; reuse %5: low dword = iBeta
+ pshufb %5, %9 ; iBeta: zero control replicates byte 0 into all 16 byte lanes
+ SSE2_CmpgeUB %8, %5 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
+ por %7, %8 ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0
+ WELS_DB1 %5 ; NOTE(review): presumably loads an all-bytes-1 constant for SSE2_AvgbFloor1 -- confirm against the macro definition
+ movdqa %8, %2 ; working copy of p0; %2 keeps the unfiltered p0 for the blend
+ SSE2_AvgbFloor1 %8, %4, %5, %9 ; (p0 + q1) >> 1
+ pavgb %8, %1 ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1
+ movdqa %9, %7 ; keep a second copy of the mask for the q0 blend; %7 is passed as the fourth operand of SSE2_AvgbFloor1 two lines below
+ SSE2_Blend %2, %8, %7 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
+ SSE2_AvgbFloor1 %1, %3, %5, %7 ; (q0 + p1) >> 1
+ pavgb %1, %4 ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1
+ SSE2_Blend %3, %1, %9 ; q0out = bDeltaP0Q0P1P0Q1Q0 ? q0' : q0
+%endmacro
+
;******************************************************************************
; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
@@ -572,42 +598,15 @@
movhps xmm0, [r1 + 0 * r2] ; q0 cr
movq xmm2, [r0 + 1 * r3] ; p0 cb
movhps xmm2, [r1 + 1 * r3] ; p0 cr
-
- movdqa xmm4, xmm0
- SSE2_AbsDiffUB xmm4, xmm2, xmm5 ; |p0 - q0|
- SSE2_CmpgeUB xmm4, xmm7 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
-
movq xmm1, [r0 + 1 * r2] ; q1 cb
movhps xmm1, [r1 + 1 * r2] ; q1 cr
movq xmm3, [r0 + 2 * r3] ; p1 cb
movhps xmm3, [r1 + 2 * r3] ; p1 cr
- movdqa xmm5, xmm1
- SSE2_AbsDiffUB xmm5, xmm0, xmm7 ; |q1 - q0|
- movdqa xmm6, xmm3
- SSE2_AbsDiffUB xmm6, xmm2, xmm7 ; |p1 - p0|
- pmaxub xmm5, xmm6 ; max(|q1 - q0|, |p1 - p0|)
+ SSSE3_DeblockChromaEq4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, xmm4, xmm5, xmm6
- pxor xmm6, xmm6
- movd xmm7, arg5d
- pshufb xmm7, xmm6 ; iBeta
-
- SSE2_CmpgeUB xmm5, xmm7 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
- por xmm4, xmm5 ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0
-
- WELS_DB1 xmm7
- movdqa xmm5, xmm2
- SSE2_AvgbFloor1 xmm2, xmm1, xmm7, xmm6 ; (p0 + q1) >> 1
- pavgb xmm2, xmm3 ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1
- movdqa xmm6, xmm4
- SSE2_Blend xmm5, xmm2, xmm4 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
-
- SSE2_AvgbFloor1 xmm3, xmm0, xmm7, xmm4 ; (q0 + p1) >> 1
- pavgb xmm3, xmm1 ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1
- SSE2_Blend xmm0, xmm3, xmm6 ; q0out = bDeltaP0Q0P1P0Q1Q0 ? q0' : q0
-
- movlps [r0 + 1 * r3], xmm5 ; store p0 cb
- movhps [r1 + 1 * r3], xmm5 ; store p0 cr
+ movlps [r0 + 1 * r3], xmm2 ; store p0 cb
+ movhps [r1 + 1 * r3], xmm2 ; store p0 cr
movlps [r0 + 0 * r2], xmm0 ; store q0 cb
movhps [r1 + 0 * r2], xmm0 ; store q0 cr
@@ -640,550 +639,6 @@
ret
-%ifdef WIN64
-
-
-WELS_EXTERN DeblockChromaEq4H_ssse3
- mov rax,rsp
- mov [rax+20h],rbx
- push rdi
- PUSH_XMM 16
- sub rsp,140h
- mov rdi,rdx
- lea eax,[r8*4]
- movsxd r10,eax
- mov eax,[rcx-2]
- mov [rsp+10h],eax
- lea rbx,[r10+rdx-2]
- lea r11,[r10+rcx-2]
- movdqa xmm5,[rsp+10h]
- movsxd r10,r8d
- mov eax,[r10+rcx-2]
- lea rdx,[r10+r10*2]
- mov [rsp+20h],eax
- mov eax,[rcx+r10*2-2]
- mov [rsp+30h],eax
- mov eax,[rdx+rcx-2]
- movdqa xmm2,[rsp+20h]
- mov [rsp+40h],eax
- mov eax, [rdi-2]
- movdqa xmm4,[rsp+30h]
- mov [rsp+50h],eax
- mov eax,[r10+rdi-2]
- movdqa xmm3,[rsp+40h]
- mov [rsp+60h],eax
- mov eax,[rdi+r10*2-2]
- punpckldq xmm5,[rsp+50h]
- mov [rsp+70h],eax
- mov eax, [rdx+rdi-2]
- punpckldq xmm2, [rsp+60h]
- mov [rsp+80h],eax
- mov eax,[r11]
- punpckldq xmm4, [rsp+70h]
- mov [rsp+50h],eax
- mov eax,[rbx]
- punpckldq xmm3,[rsp+80h]
- mov [rsp+60h],eax
- mov eax,[r10+r11]
- movdqa xmm0, [rsp+50h]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[r10+rbx]
- movdqa xmm0,[rsp+50h]
- movdqa xmm1,xmm5
- mov [rsp+60h],eax
- mov eax,[r11+r10*2]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[rbx+r10*2]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- mov eax, [rdx+r11]
- movdqa xmm15,xmm1
- punpckldq xmm0,[rsp+60h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax, [rdx+rbx]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm15,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm12,xmm15
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm12,xmm0
- punpckhdq xmm15,xmm0
- movdqa xmm0,xmm1
- movdqa xmm11,xmm12
- punpckldq xmm0,xmm5
- punpckhdq xmm1,xmm5
- punpcklqdq xmm11,xmm0
- punpckhqdq xmm12,xmm0
- movsx eax,r9w
- movdqa xmm14,xmm15
- punpcklqdq xmm14,xmm1
- punpckhqdq xmm15,xmm1
- pxor xmm1,xmm1
- movd xmm0,eax
- movdqa xmm4,xmm12
- movdqa xmm8,xmm11
- movsx eax,word [rsp+170h + 160] ; iBeta
- punpcklwd xmm0,xmm0
- punpcklbw xmm4,xmm1
- punpckhbw xmm12,xmm1
- movdqa xmm9,xmm14
- movdqa xmm7,xmm15
- movdqa xmm10,xmm15
- pshufd xmm13,xmm0,0
- punpcklbw xmm9,xmm1
- punpckhbw xmm14,xmm1
- movdqa xmm6,xmm13
- movd xmm0,eax
- movdqa [rsp],xmm11
- mov eax,2
- cwde
- punpckhbw xmm11,xmm1
- punpckhbw xmm10,xmm1
- punpcklbw xmm7,xmm1
- punpcklwd xmm0,xmm0
- punpcklbw xmm8,xmm1
- pshufd xmm3,xmm0,0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm4
- psubw xmm0,xmm9
- psubw xmm1,xmm4
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm12
- movdqa xmm1,xmm11
- psubw xmm0,xmm14
- psubw xmm1,xmm12
- movdqa xmm5,xmm6
- pabsw xmm0,xmm0
- pcmpgtw xmm13,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm2,xmm0
- paddw xmm1,xmm8
- movdqa xmm0,xmm10
- pand xmm13,xmm2
- psubw xmm0,xmm14
- paddw xmm1,xmm4
- movdqa xmm2,xmm11
- pabsw xmm0,xmm0
- paddw xmm2,xmm11
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm12
- movd xmm0,eax
- pand xmm13,xmm3
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm4
- paddw xmm2,xmm3
- psraw xmm1,2
- pand xmm5,xmm1
- por xmm5,xmm0
- paddw xmm7,xmm7
- paddw xmm10,xmm10
- psraw xmm2,2
- movdqa xmm1,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm2
- paddw xmm7,xmm9
- por xmm1,xmm0
- paddw xmm10,xmm14
- paddw xmm7,xmm8
- movdqa xmm0,xmm13
- packuswb xmm5,xmm1
- paddw xmm7,xmm3
- paddw xmm10,xmm11
- movdqa xmm1,xmm6
- paddw xmm10,xmm3
- pandn xmm6,xmm9
- psraw xmm7,2
- pand xmm1,xmm7
- psraw xmm10,2
- pandn xmm13,xmm14
- pand xmm0,xmm10
- por xmm1,xmm6
- movdqa xmm6,[rsp]
- movdqa xmm4,xmm6
- por xmm0,xmm13
- punpcklbw xmm4,xmm5
- punpckhbw xmm6,xmm5
- movdqa xmm3,xmm4
- packuswb xmm1,xmm0
- movdqa xmm0,xmm1
- punpckhbw xmm1,xmm15
- punpcklbw xmm0,xmm15
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm6
- movdqa xmm2,xmm3
- punpcklwd xmm0,xmm1
- punpckhwd xmm6,xmm1
- movdqa xmm1,xmm4
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm6
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm6
- punpckhqdq xmm2,xmm1
- movdqa [rsp+10h],xmm0
- movdqa [rsp+60h],xmm2
- movdqa xmm0,xmm3
- mov eax,[rsp+10h]
- mov [rcx-2],eax
- mov eax,[rsp+60h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [r10+rcx-2],eax
- movdqa [rsp+20h],xmm0
- mov eax, [rsp+20h]
- movdqa [rsp+70h],xmm3
- mov [rcx+r10*2-2],eax
- mov eax,[rsp+70h]
- mov [rdx+rcx-2],eax
- mov eax,[rsp+18h]
- mov [r11],eax
- mov eax,[rsp+68h]
- mov [r10+r11],eax
- mov eax,[rsp+28h]
- mov [r11+r10*2],eax
- mov eax,[rsp+78h]
- mov [rdx+r11],eax
- mov eax,[rsp+14h]
- mov [rdi-2],eax
- mov eax,[rsp+64h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+24h]
- mov [rdi+r10*2-2],eax
- mov eax, [rsp+74h]
- mov [rdx+rdi-2],eax
- mov eax, [rsp+1Ch]
- mov [rbx],eax
- mov eax, [rsp+6Ch]
- mov [r10+rbx],eax
- mov eax,[rsp+2Ch]
- mov [rbx+r10*2],eax
- mov eax,[rsp+7Ch]
- mov [rdx+rbx],eax
- lea rsp,[rsp+140h]
- POP_XMM
- mov rbx, [rsp+28h]
- pop rdi
- ret
-
-
-
-%elifdef UNIX64
-
-
-WELS_EXTERN DeblockChromaEq4H_ssse3
- mov rax,rsp
- push rbx
- push rbp
- push r12
-
- mov rbp, r8
- mov r8, rdx
- mov r9, rcx
- mov rcx, rdi
- mov rdx, rsi
- mov rdi, rdx
-
- sub rsp,140h
- lea eax,[r8*4]
- movsxd r10,eax
- mov eax,[rcx-2]
- mov [rsp+10h],eax
- lea rbx,[r10+rdx-2]
- lea r11,[r10+rcx-2]
-
- movdqa xmm5,[rsp+10h]
- movsxd r10,r8d
- mov eax,[r10+rcx-2]
- lea rdx,[r10+r10*2]
- mov [rsp+20h],eax
- mov eax,[rcx+r10*2-2]
- mov [rsp+30h],eax
- mov eax,[rdx+rcx-2]
- movdqa xmm2,[rsp+20h]
- mov [rsp+40h],eax
- mov eax, [rdi-2]
- movdqa xmm4,[rsp+30h]
- mov [rsp+50h],eax
- mov eax,[r10+rdi-2]
- movdqa xmm3,[rsp+40h]
- mov [rsp+60h],eax
- mov eax,[rdi+r10*2-2]
- punpckldq xmm5,[rsp+50h]
- mov [rsp+70h],eax
- mov eax, [rdx+rdi-2]
- punpckldq xmm2, [rsp+60h]
- mov [rsp+80h],eax
- mov eax,[r11]
- punpckldq xmm4, [rsp+70h]
- mov [rsp+50h],eax
- mov eax,[rbx]
- punpckldq xmm3,[rsp+80h]
- mov [rsp+60h],eax
- mov eax,[r10+r11]
- movdqa xmm0, [rsp+50h]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[r10+rbx]
- movdqa xmm0,[rsp+50h]
- movdqa xmm1,xmm5
- mov [rsp+60h],eax
- mov eax,[r11+r10*2]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[rbx+r10*2]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- mov eax, [rdx+r11]
- movdqa xmm15,xmm1
- punpckldq xmm0,[rsp+60h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax, [rdx+rbx]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm15,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm12,xmm15
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm12,xmm0
- punpckhdq xmm15,xmm0
- movdqa xmm0,xmm1
- movdqa xmm11,xmm12
- punpckldq xmm0,xmm5
- punpckhdq xmm1,xmm5
- punpcklqdq xmm11,xmm0
- punpckhqdq xmm12,xmm0
- movsx eax,r9w
- movdqa xmm14,xmm15
- punpcklqdq xmm14,xmm1
- punpckhqdq xmm15,xmm1
- pxor xmm1,xmm1
- movd xmm0,eax
- movdqa xmm4,xmm12
- movdqa xmm8,xmm11
- mov eax, ebp ; iBeta
- punpcklwd xmm0,xmm0
- punpcklbw xmm4,xmm1
- punpckhbw xmm12,xmm1
- movdqa xmm9,xmm14
- movdqa xmm7,xmm15
- movdqa xmm10,xmm15
- pshufd xmm13,xmm0,0
- punpcklbw xmm9,xmm1
- punpckhbw xmm14,xmm1
- movdqa xmm6,xmm13
- movd xmm0,eax
- movdqa [rsp],xmm11
- mov eax,2
- cwde
- punpckhbw xmm11,xmm1
- punpckhbw xmm10,xmm1
- punpcklbw xmm7,xmm1
- punpcklwd xmm0,xmm0
- punpcklbw xmm8,xmm1
- pshufd xmm3,xmm0,0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm4
- psubw xmm0,xmm9
- psubw xmm1,xmm4
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm12
- movdqa xmm1,xmm11
- psubw xmm0,xmm14
- psubw xmm1,xmm12
- movdqa xmm5,xmm6
- pabsw xmm0,xmm0
- pcmpgtw xmm13,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm2,xmm0
- paddw xmm1,xmm8
- movdqa xmm0,xmm10
- pand xmm13,xmm2
- psubw xmm0,xmm14
- paddw xmm1,xmm4
- movdqa xmm2,xmm11
- pabsw xmm0,xmm0
- paddw xmm2,xmm11
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm12
- movd xmm0,eax
- pand xmm13,xmm3
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm4
- paddw xmm2,xmm3
- psraw xmm1,2
- pand xmm5,xmm1
- por xmm5,xmm0
- paddw xmm7,xmm7
- paddw xmm10,xmm10
- psraw xmm2,2
- movdqa xmm1,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm2
- paddw xmm7,xmm9
- por xmm1,xmm0
- paddw xmm10,xmm14
- paddw xmm7,xmm8
- movdqa xmm0,xmm13
- packuswb xmm5,xmm1
- paddw xmm7,xmm3
- paddw xmm10,xmm11
- movdqa xmm1,xmm6
- paddw xmm10,xmm3
- pandn xmm6,xmm9
- psraw xmm7,2
- pand xmm1,xmm7
- psraw xmm10,2
- pandn xmm13,xmm14
- pand xmm0,xmm10
- por xmm1,xmm6
- movdqa xmm6,[rsp]
- movdqa xmm4,xmm6
- por xmm0,xmm13
- punpcklbw xmm4,xmm5
- punpckhbw xmm6,xmm5
- movdqa xmm3,xmm4
- packuswb xmm1,xmm0
- movdqa xmm0,xmm1
- punpckhbw xmm1,xmm15
- punpcklbw xmm0,xmm15
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm6
- movdqa xmm2,xmm3
- punpcklwd xmm0,xmm1
- punpckhwd xmm6,xmm1
- movdqa xmm1,xmm4
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm6
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm6
- punpckhqdq xmm2,xmm1
- movdqa [rsp+10h],xmm0
- movdqa [rsp+60h],xmm2
- movdqa xmm0,xmm3
- mov eax,[rsp+10h]
- mov [rcx-2],eax
- mov eax,[rsp+60h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [r10+rcx-2],eax
- movdqa [rsp+20h],xmm0
- mov eax, [rsp+20h]
- movdqa [rsp+70h],xmm3
- mov [rcx+r10*2-2],eax
- mov eax,[rsp+70h]
- mov [rdx+rcx-2],eax
- mov eax,[rsp+18h]
- mov [r11],eax
- mov eax,[rsp+68h]
- mov [r10+r11],eax
- mov eax,[rsp+28h]
- mov [r11+r10*2],eax
- mov eax,[rsp+78h]
- mov [rdx+r11],eax
- mov eax,[rsp+14h]
- mov [rdi-2],eax
- mov eax,[rsp+64h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+24h]
- mov [rdi+r10*2-2],eax
- mov eax, [rsp+74h]
- mov [rdx+rdi-2],eax
- mov eax, [rsp+1Ch]
- mov [rbx],eax
- mov eax, [rsp+6Ch]
- mov [r10+rbx],eax
- mov eax,[rsp+2Ch]
- mov [rbx+r10*2],eax
- mov eax,[rsp+7Ch]
- mov [rdx+rbx],eax
- lea r11,[rsp+140h]
- mov rbx, [r11+28h]
- mov rsp,r11
- pop r12
- pop rbp
- pop rbx
- ret
-
-
-
-%elifdef X86_32
-
;***************************************************************************
; void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta)
@@ -1190,284 +645,30 @@
;***************************************************************************
WELS_EXTERN DeblockChromaEq4H_ssse3
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0C8h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+18h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+7Ch]
- push edi
- mov dword [esp+14h],esi
- mov dword [esp+18h],ecx
- mov dword [esp+0Ch],edx
- mov dword [esp+10h],eax
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+0Ch]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+10h]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- movsx ecx,word [ebp+14h]
- movsx edx,word [ebp+18h]
- movdqa xmm6,[esp+80h]
- movdqa xmm4,[esp+90h]
- movdqa xmm5,[esp+0A0h]
- movdqa xmm7,[esp+0B0h]
- pxor xmm0,xmm0
- movd xmm1,ecx
- movdqa xmm2,xmm1
- punpcklwd xmm2,xmm1
- pshufd xmm1,xmm2,0
- movd xmm2,edx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3,xmm6
- punpckhbw xmm6,xmm0
- movdqa [esp+60h],xmm6
- movdqa xmm6,[esp+90h]
- punpckhbw xmm6,xmm0
- movdqa [esp+30h],xmm6
- movdqa xmm6,[esp+0A0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+40h],xmm6
- movdqa xmm6,[esp+0B0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+70h],xmm6
- punpcklbw xmm7,xmm0
- punpcklbw xmm4,xmm0
- punpcklbw xmm5,xmm0
- punpcklbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm6,xmm4
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- movdqa xmm0,xmm1
- pcmpgtw xmm0,xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm4
- pabsw xmm6,xmm6
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+30h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pcmpgtw xmm1,xmm6
- movdqa xmm6,[esp+60h]
- psubw xmm6,[esp+30h]
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+70h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pand xmm1,xmm7
- pcmpgtw xmm2,xmm6
- pand xmm1,xmm2
- mov eax,2
- movsx ecx,ax
- movd xmm2,ecx
- movdqa xmm6,xmm2
- punpcklwd xmm6,xmm2
- pshufd xmm2,xmm6,0
- movdqa [esp+20h],xmm2
- movdqa xmm2,xmm3
- paddw xmm2,xmm3
- paddw xmm2,xmm4
- paddw xmm2,[esp+50h]
- paddw xmm2,[esp+20h]
- psraw xmm2,2
- movdqa xmm6,xmm0
- pand xmm6,xmm2
- movdqa xmm2,xmm0
- pandn xmm2,xmm4
- por xmm6,xmm2
- movdqa xmm2,[esp+60h]
- movdqa xmm7,xmm2
- paddw xmm7,xmm2
- paddw xmm7,[esp+30h]
- paddw xmm7,[esp+70h]
- paddw xmm7,[esp+20h]
- movdqa xmm4,xmm1
- movdqa xmm2,xmm1
- pandn xmm2,[esp+30h]
- psraw xmm7,2
- pand xmm4,xmm7
- por xmm4,xmm2
- movdqa xmm2,[esp+50h]
- packuswb xmm6,xmm4
- movdqa [esp+90h],xmm6
- movdqa xmm6,xmm2
- paddw xmm6,xmm2
- movdqa xmm2,[esp+20h]
- paddw xmm6,xmm5
- paddw xmm6,xmm3
- movdqa xmm4,xmm0
- pandn xmm0,xmm5
- paddw xmm6,xmm2
- psraw xmm6,2
- pand xmm4,xmm6
- por xmm4,xmm0
- movdqa xmm0,[esp+70h]
- movdqa xmm5,xmm0
- paddw xmm5,xmm0
- movdqa xmm0,[esp+40h]
- paddw xmm5,xmm0
- paddw xmm5,[esp+60h]
- movdqa xmm3,xmm1
- paddw xmm5,xmm2
- psraw xmm5,2
- pand xmm3,xmm5
- pandn xmm1,xmm0
- por xmm3,xmm1
- packuswb xmm4,xmm3
- movdqa [esp+0A0h],xmm4
- mov esi,dword [esp+10h]
- movdqa xmm0,[esi]
- movdqa xmm1,[esi+10h]
- movdqa xmm2,[esi+20h]
- movdqa xmm3,[esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+0Ch]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r2, r2d
+ movd xmm7, arg4d ; low dword of xmm7 = iAlpha (4th argument per the prototype above)
+ pxor xmm0, xmm0 ; all-zero shuffle control for the pshufb below
+ pshufb xmm7, xmm0 ; iAlpha: zero control replicates byte 0 into all 16 byte lanes
+ lea r3, [3 * r2 - 1] ; 3 * iStride - 1
-
+ SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6 ; xmm0/xmm1/xmm4/xmm5 are consumed below as p1/p0/q0/q1
+ SSSE3_DeblockChromaEq4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, xmm2, xmm3, xmm6 ; iBeta is the 5th argument; filtered p0 ends up in xmm1 and q0 in xmm4
+%ifdef X86_32
+ push r4 ; r4 and r5 are handed to the store macro below (presumably as scratch); preserved around it on x86-32 only
+ push r5
+ SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0 ; passes the filtered p0 (xmm1) and q0 (xmm4) to the store macro
+ pop r5 ; restore in reverse push order
+ pop r4
+%else
+ SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0 ; same store, without the r4/r5 save/restore
%endif
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
;********************************************************************************