ref: 8f9a5469beb962c22b6d8bbe78f01ec79fb33a55
parent: dcf08c6d413b0c9b56fef0b8602049c6a58b184b
author: Martin Storsjö <[email protected]>
date: Fri Dec 13 04:40:57 EST 2013
Convert source files to unix newlines

Most files were converted in ff6b669176, but some (non-C++ source files)
were left with Windows newlines.
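[Editorial note: the effect of this commit is a byte-level rewrite of each file's line endings from CRLF to LF. As a reference for readers, below is a minimal sketch of that kind of conversion; the commit does not record which tool actually performed it, so the script and its name are assumptions, not the author's method.]

    #!/usr/bin/env python3
    # crlf_to_lf.py -- hypothetical helper, not part of this commit.
    # Rewrites CRLF line endings as LF, in place, for each file given
    # on the command line. Operates on raw bytes so the files' text
    # encoding is irrelevant, and only writes when something changed.
    import sys

    def crlf_to_lf(path):
        with open(path, "rb") as f:
            data = f.read()
        converted = data.replace(b"\r\n", b"\n")
        if converted != data:
            with open(path, "wb") as f:
                f.write(converted)

    if __name__ == "__main__":
        for path in sys.argv[1:]:
            crlf_to_lf(path)

    # Usage (hypothetical): python crlf_to_lf.py codec/decoder/core/asm/dct.asm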
--- a/codec/decoder/core/asm/dct.asm
+++ b/codec/decoder/core/asm/dct.asm
@@ -1,129 +1,129 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* dct.asm
-;*
-;* Abstract
-;* WelsDctFourT4_sse2
-;*
-;* History
-;* 8/4/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-%macro MMX_SumSubDiv2 3
- movq %3, %2
- psraw %3, $1
- paddw %3, %1
- psraw %1, $1
- psubw %1, %2
-%endmacro
-
-%macro MMX_SumSub 3
- movq %3, %2
- psubw %2, %1
- paddw %1, %3
-%endmacro
-
-%macro MMX_IDCT 6
- MMX_SumSub %4, %5, %6
- MMX_SumSubDiv2 %3, %2, %1
- MMX_SumSub %1, %4, %6
- MMX_SumSub %3, %5, %6
-%endmacro
-
-
-%macro MMX_StoreDiff4P 5
- movd %2, %5
- punpcklbw %2, %4
- paddw %1, %3
- psraw %1, $6
- paddsw %1, %2
- packuswb %1, %2
- movd %5, %1
-%endmacro
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-WELS_EXTERN IdctResAddPred_mmx
-
-ALIGN 16
-;*******************************************************************************
-; void_t __cdecl IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
-;*******************************************************************************
-
-IdctResAddPred_mmx:
-
-%define pushsize 0
-%define pPred esp+pushsize+4
-%define kiStride esp+pushsize+8
-%define pRs esp+pushsize+12
-
- mov eax, [pRs ]
- mov edx, [pPred ]
- mov ecx, [kiStride]
- movq mm0, [eax+ 0]
- movq mm1, [eax+ 8]
- movq mm2, [eax+16]
- movq mm3, [eax+24]
-
- MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
- MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
- MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
- MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
-
- WELS_Zero mm7
- WELS_DW32 mm6
-
- MMX_StoreDiff4P mm3, mm0, mm6, mm7, [edx]
- MMX_StoreDiff4P mm4, mm0, mm6, mm7, [edx+ecx]
- lea edx, [edx+2*ecx]
- MMX_StoreDiff4P mm1, mm0, mm6, mm7, [edx]
- MMX_StoreDiff4P mm2, mm0, mm6, mm7, [edx+ecx]
-
-%undef pushsize
-%undef pPred
-%undef kiStride
-%undef pRs
- emms
- ret
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* dct.asm
+;*
+;* Abstract
+;* WelsDctFourT4_sse2
+;*
+;* History
+;* 8/4/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+%macro MMX_SumSubDiv2 3
+ movq %3, %2
+ psraw %3, $1
+ paddw %3, %1
+ psraw %1, $1
+ psubw %1, %2
+%endmacro
+
+%macro MMX_SumSub 3
+ movq %3, %2
+ psubw %2, %1
+ paddw %1, %3
+%endmacro
+
+%macro MMX_IDCT 6
+ MMX_SumSub %4, %5, %6
+ MMX_SumSubDiv2 %3, %2, %1
+ MMX_SumSub %1, %4, %6
+ MMX_SumSub %3, %5, %6
+%endmacro
+
+
+%macro MMX_StoreDiff4P 5
+ movd %2, %5
+ punpcklbw %2, %4
+ paddw %1, %3
+ psraw %1, $6
+ paddsw %1, %2
+ packuswb %1, %2
+ movd %5, %1
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+WELS_EXTERN IdctResAddPred_mmx
+
+ALIGN 16
+;*******************************************************************************
+; void_t __cdecl IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
+;*******************************************************************************
+
+IdctResAddPred_mmx:
+
+%define pushsize 0
+%define pPred esp+pushsize+4
+%define kiStride esp+pushsize+8
+%define pRs esp+pushsize+12
+
+ mov eax, [pRs ]
+ mov edx, [pPred ]
+ mov ecx, [kiStride]
+ movq mm0, [eax+ 0]
+ movq mm1, [eax+ 8]
+ movq mm2, [eax+16]
+ movq mm3, [eax+24]
+
+ MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
+ MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
+ MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
+ MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
+
+ WELS_Zero mm7
+ WELS_DW32 mm6
+
+ MMX_StoreDiff4P mm3, mm0, mm6, mm7, [edx]
+ MMX_StoreDiff4P mm4, mm0, mm6, mm7, [edx+ecx]
+ lea edx, [edx+2*ecx]
+ MMX_StoreDiff4P mm1, mm0, mm6, mm7, [edx]
+ MMX_StoreDiff4P mm2, mm0, mm6, mm7, [edx+ecx]
+
+%undef pushsize
+%undef pPred
+%undef kiStride
+%undef pRs
+ emms
+ ret
--- a/codec/decoder/core/asm/deblock.asm
+++ b/codec/decoder/core/asm/deblock.asm
@@ -1,2113 +1,2113 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* deblock.asm
-;*
-;* Abstract
-;* edge loop
-;*
-;* History
-;* 08/07/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-SECTION .text
-
-;********************************************************************************
-; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN DeblockChromaEq4V_sse2
-
-ALIGN 16
-DeblockChromaEq4V_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,68h
- mov edx,[ebp+10h] ; iStride
- mov eax,[ebp+8] ; pPixCb
- mov ecx,[ebp+0Ch] ; pPixCr
- movq xmm4,[ecx]
- movq xmm5,[edx+ecx]
- push esi
- push edi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- movq xmm1,[edi]
- mov edi,ecx
- sub edi,esi
- movq xmm2,[edi]
- punpcklqdq xmm1,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm2,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm3,[edi]
- punpcklqdq xmm2,xmm3
- movq xmm3,[eax]
- punpcklqdq xmm3,xmm4
- movq xmm4,[edx+eax]
- mov edx, [ebp + 14h]
- punpcklqdq xmm4,xmm5
- movd xmm5,edx
- mov edx, [ebp + 18h]
- pxor xmm0,xmm0
- movdqa xmm6,xmm5
- punpcklwd xmm6,xmm5
- pshufd xmm5,xmm6,0
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,xmm1
- punpckhbw xmm1,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+40h],xmm1
- movdqa [esp+60h],xmm7
- movdqa xmm7,xmm2
- punpcklbw xmm7,xmm0
- movdqa [esp+10h],xmm7
- movdqa xmm7,xmm3
- punpcklbw xmm7,xmm0
- punpckhbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm7,xmm4
- punpckhbw xmm4,xmm0
- punpckhbw xmm2,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+30h],xmm3
- movdqa xmm3,[esp+10h]
- movdqa xmm1,xmm3
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa [esp+20h],xmm4
- movdqa xmm0,xmm5
- pcmpgtw xmm0,xmm1
- movdqa xmm1,[esp+60h]
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- pand xmm0,xmm4
- movdqa xmm1,xmm7
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,xmm2
- psubw xmm1,[esp+30h]
- pabsw xmm1,xmm1
- pcmpgtw xmm5,xmm1
- movdqa xmm1,[esp+40h]
- pand xmm0,xmm4
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,[esp+20h]
- psubw xmm1,[esp+30h]
- pand xmm5,xmm4
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- pand xmm5,xmm6
- mov edx,2
- movsx edx,dx
- movd xmm1,edx
- movdqa xmm4,xmm1
- punpcklwd xmm4,xmm1
- pshufd xmm1,xmm4,0
- movdqa xmm4,[esp+60h]
- movdqa xmm6,xmm4
- paddw xmm6,xmm4
- paddw xmm6,xmm3
- paddw xmm6,xmm7
- movdqa [esp+10h],xmm1
- paddw xmm6,[esp+10h]
- psraw xmm6,2
- movdqa xmm4,xmm0
- pandn xmm4,xmm3
- movdqa xmm3,[esp+40h]
- movdqa xmm1,xmm0
- pand xmm1,xmm6
- por xmm1,xmm4
- movdqa xmm6,xmm3
- paddw xmm6,xmm3
- movdqa xmm3,[esp+10h]
- paddw xmm6,xmm2
- paddw xmm6,[esp+20h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm4,xmm5
- pand xmm4,xmm6
- movdqa xmm6,xmm5
- pandn xmm6,xmm2
- por xmm4,xmm6
- packuswb xmm1,xmm4
- movdqa xmm4,[esp+50h]
- movdqa xmm6,xmm7
- paddw xmm6,xmm7
- paddw xmm6,xmm4
- paddw xmm6,[esp+60h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm2,xmm0
- pand xmm2,xmm6
- pandn xmm0,xmm4
- por xmm2,xmm0
- movdqa xmm0,[esp+20h]
- movdqa xmm6,xmm0
- paddw xmm6,xmm0
- movdqa xmm0,[esp+30h]
- paddw xmm6,xmm0
- paddw xmm6,[esp+40h]
- movdqa xmm4,xmm5
- paddw xmm6,xmm3
- movq [esi],xmm1
- psraw xmm6,2
- pand xmm4,xmm6
- pandn xmm5,xmm0
- por xmm4,xmm5
- packuswb xmm2,xmm4
- movq [eax],xmm2
- psrldq xmm1,8
- movq [edi],xmm1
- pop edi
- psrldq xmm2,8
- movq [ecx],xmm2
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN DeblockChromaLt4V_sse2
-
-DeblockChromaLt4V_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0E4h
- push ebx
- push esi
- mov esi, [ebp+1Ch] ; pTC
- movsx ebx, byte [esi+2]
- push edi
- movsx di,byte [esi+3]
- mov word [esp+0Ch],bx
- movsx bx,byte [esi+1]
- movsx esi,byte [esi]
- mov word [esp+0Eh],si
- movzx esi,di
- movd xmm1,esi
- movzx esi,di
- movd xmm2,esi
- mov si,word [esp+0Ch]
- mov edx, [ebp + 10h]
- mov eax, [ebp + 08h]
- movzx edi,si
- movzx esi,si
- mov ecx, [ebp + 0Ch]
- movd xmm4,esi
- movzx esi,bx
- movd xmm5,esi
- movd xmm3,edi
- movzx esi,bx
- movd xmm6,esi
- mov si,word [esp+0Eh]
- movzx edi,si
- movzx esi,si
- punpcklwd xmm6,xmm2
- pxor xmm0,xmm0
- movdqa [esp+40h],xmm0
- movd xmm7,edi
- movd xmm0,esi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+40h]
- punpcklwd xmm0,xmm4
- movq xmm4,[edx+ecx]
- punpcklwd xmm7,xmm3
- movq xmm3,[eax]
- punpcklwd xmm0,xmm6
- movq xmm6,[edi]
- punpcklwd xmm7,xmm5
- punpcklwd xmm0,xmm7
- mov edi,ecx
- sub edi,esi
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+60h],xmm2
- movq xmm2, [edi]
- punpcklqdq xmm6,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm7,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm2,[edi]
- punpcklqdq xmm7,xmm2
- movq xmm2,[ecx]
- punpcklqdq xmm3,xmm2
- movq xmm2,[edx+eax]
- movsx edx,word [ebp + 14h]
- punpcklqdq xmm2,xmm4
- movdqa [esp+0E0h],xmm2
- movd xmm2,edx
- movsx edx,word [ebp + 18h]
- movdqa xmm4,xmm2
- punpcklwd xmm4,xmm2
- movd xmm2,edx
- movdqa xmm5,xmm2
- punpcklwd xmm5,xmm2
- pshufd xmm2,xmm5,0
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- movdqa [esp+0D0h],xmm3
- pshufd xmm4,xmm4,0
- movdqa [esp+30h],xmm2
- punpckhbw xmm6,xmm1
- movdqa [esp+80h],xmm6
- movdqa xmm6,[esp+0D0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+70h],xmm6
- movdqa xmm6, [esp+0E0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+90h],xmm6
- movdqa xmm5, [esp+0E0h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0A0h],xmm7
- punpcklbw xmm3,xmm1
- mov edx,4
- punpcklbw xmm2,xmm1
- movsx edx,dx
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,[esp+30h]
- movdqa [esp+20h],xmm6
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa xmm1,[esp+60h]
- movdqa [esp+40h],xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6, [esp+20h]
- movdqa xmm7, [esp+50h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa [esp+10h],xmm0
- movdqa xmm6, [esp+10h]
- pminsw xmm6,xmm1
- movdqa [esp+10h],xmm6
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm6,xmm4
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+30h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1,[esp+50h]
- pand xmm6,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5,[esp+80h]
- psubw xmm5,[esp+90h]
- pand xmm6,xmm1
- pand xmm6,[esp+40h]
- movdqa xmm1,[esp+10h]
- pand xmm1,xmm6
- movdqa xmm6,[esp+70h]
- movdqa [esp+30h],xmm1
- movdqa xmm1,[esp+0A0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6,[esp+20h]
- movdqa xmm5,[esp+60h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+70h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+80h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+90h]
- pand xmm4,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+40h]
- pand xmm0,xmm4
- movdqa xmm4,[esp+30h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- packuswb xmm2,xmm1
- movq [esi],xmm2
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm3,xmm5
- movq [eax],xmm3
- psrldq xmm2,8
- movq [edi],xmm2
- pop edi
- pop esi
- psrldq xmm3,8
- movq [ecx],xmm3
- pop ebx
- mov esp,ebp
- pop ebp
- ret
-
-;***************************************************************************
-; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta)
-;***************************************************************************
-
-WELS_EXTERN DeblockChromaEq4H_sse2
-
-ALIGN 16
-
-DeblockChromaEq4H_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0C8h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+18h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+7Ch]
- push edi
- mov dword [esp+14h],esi
- mov dword [esp+18h],ecx
- mov dword [esp+0Ch],edx
- mov dword [esp+10h],eax
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+0Ch]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+10h]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- movsx ecx,word [ebp+14h]
- movsx edx,word [ebp+18h]
- movdqa xmm6,[esp+80h]
- movdqa xmm4,[esp+90h]
- movdqa xmm5,[esp+0A0h]
- movdqa xmm7,[esp+0B0h]
- pxor xmm0,xmm0
- movd xmm1,ecx
- movdqa xmm2,xmm1
- punpcklwd xmm2,xmm1
- pshufd xmm1,xmm2,0
- movd xmm2,edx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3,xmm6
- punpckhbw xmm6,xmm0
- movdqa [esp+60h],xmm6
- movdqa xmm6,[esp+90h]
- punpckhbw xmm6,xmm0
- movdqa [esp+30h],xmm6
- movdqa xmm6,[esp+0A0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+40h],xmm6
- movdqa xmm6,[esp+0B0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+70h],xmm6
- punpcklbw xmm7,xmm0
- punpcklbw xmm4,xmm0
- punpcklbw xmm5,xmm0
- punpcklbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm6,xmm4
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- movdqa xmm0,xmm1
- pcmpgtw xmm0,xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm4
- pabsw xmm6,xmm6
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+30h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pcmpgtw xmm1,xmm6
- movdqa xmm6,[esp+60h]
- psubw xmm6,[esp+30h]
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+70h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pand xmm1,xmm7
- pcmpgtw xmm2,xmm6
- pand xmm1,xmm2
- mov eax,2
- movsx ecx,ax
- movd xmm2,ecx
- movdqa xmm6,xmm2
- punpcklwd xmm6,xmm2
- pshufd xmm2,xmm6,0
- movdqa [esp+20h],xmm2
- movdqa xmm2,xmm3
- paddw xmm2,xmm3
- paddw xmm2,xmm4
- paddw xmm2,[esp+50h]
- paddw xmm2,[esp+20h]
- psraw xmm2,2
- movdqa xmm6,xmm0
- pand xmm6,xmm2
- movdqa xmm2,xmm0
- pandn xmm2,xmm4
- por xmm6,xmm2
- movdqa xmm2,[esp+60h]
- movdqa xmm7,xmm2
- paddw xmm7,xmm2
- paddw xmm7,[esp+30h]
- paddw xmm7,[esp+70h]
- paddw xmm7,[esp+20h]
- movdqa xmm4,xmm1
- movdqa xmm2,xmm1
- pandn xmm2,[esp+30h]
- psraw xmm7,2
- pand xmm4,xmm7
- por xmm4,xmm2
- movdqa xmm2,[esp+50h]
- packuswb xmm6,xmm4
- movdqa [esp+90h],xmm6
- movdqa xmm6,xmm2
- paddw xmm6,xmm2
- movdqa xmm2,[esp+20h]
- paddw xmm6,xmm5
- paddw xmm6,xmm3
- movdqa xmm4,xmm0
- pandn xmm0,xmm5
- paddw xmm6,xmm2
- psraw xmm6,2
- pand xmm4,xmm6
- por xmm4,xmm0
- movdqa xmm0,[esp+70h]
- movdqa xmm5,xmm0
- paddw xmm5,xmm0
- movdqa xmm0,[esp+40h]
- paddw xmm5,xmm0
- paddw xmm5,[esp+60h]
- movdqa xmm3,xmm1
- paddw xmm5,xmm2
- psraw xmm5,2
- pand xmm3,xmm5
- pandn xmm1,xmm0
- por xmm3,xmm1
- packuswb xmm4,xmm3
- movdqa [esp+0A0h],xmm4
- mov esi,dword [esp+10h]
- movdqa xmm0,[esi]
- movdqa xmm1,[esi+10h]
- movdqa xmm2,[esi+20h]
- movdqa xmm3,[esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+0Ch]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-;*******************************************************************************
-; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN DeblockChromaLt4H_sse2
-
-ALIGN 16
-
-DeblockChromaLt4H_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,108h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+10h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+6Ch]
- push edi
- mov dword [esp+0Ch],esi
- mov dword [esp+18h],ecx
- mov dword [esp+10h],edx
- mov dword [esp+1Ch],eax
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+10h]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+1Ch]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- mov eax,dword [ebp+1Ch]
- movsx cx,byte [eax+3]
- movsx dx,byte [eax+2]
- movsx si,byte [eax+1]
- movsx ax,byte [eax]
- movzx edi,cx
- movzx ecx,cx
- movd xmm2,ecx
- movzx ecx,dx
- movzx edx,dx
- movd xmm3,ecx
- movd xmm4,edx
- movzx ecx,si
- movzx edx,si
- movd xmm5,ecx
- pxor xmm0,xmm0
- movd xmm6,edx
- movzx ecx,ax
- movdqa [esp+60h],xmm0
- movzx edx,ax
- movsx eax,word [ebp+14h]
- punpcklwd xmm6,xmm2
- movd xmm1,edi
- movd xmm7,ecx
- movsx ecx,word [ebp+18h]
- movd xmm0,edx
- punpcklwd xmm7,xmm3
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+60h]
- punpcklwd xmm7,xmm5
- movdqa xmm5,[esp+0A0h]
- punpcklwd xmm0,xmm4
- punpcklwd xmm0,xmm6
- movdqa xmm6, [esp+70h]
- punpcklwd xmm0,xmm7
- movdqa xmm7,[esp+80h]
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+0D0h],xmm2
- movd xmm2,eax
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm4,xmm3,0
- movd xmm2,ecx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3, [esp+90h]
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- movdqa [esp+40h],xmm2
- movdqa [esp+0B0h],xmm6
- movdqa xmm6,[esp+90h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm2,xmm1
- punpcklbw xmm3,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0F0h],xmm7
- movdqa [esp+0C0h],xmm6
- movdqa xmm6, [esp+0A0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+0E0h],xmm6
- mov edx,4
- movsx eax,dx
- movd xmm6,eax
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa [esp+30h],xmm6
- movdqa xmm7, [esp+40h]
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa [esp+60h],xmm6
- movdqa xmm1, [esp+0D0h]
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6,[esp+30h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa xmm7,[esp+50h]
- movdqa [esp+20h],xmm0
- movdqa xmm6, [esp+20h]
- pminsw xmm6,xmm1
- movdqa [esp+20h],xmm6
- movdqa xmm6,xmm4
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+40h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1, [esp+50h]
- pand xmm6,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5, [esp+0B0h]
- psubw xmm5,[esp+0E0h]
- pand xmm6,xmm1
- pand xmm6, [esp+60h]
- movdqa xmm1, [esp+20h]
- pand xmm1,xmm6
- movdqa xmm6, [esp+0C0h]
- movdqa [esp+40h],xmm1
- movdqa xmm1, [esp+0F0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6, [esp+30h]
- movdqa xmm5, [esp+0D0h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+0C0h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+0B0h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6, [esp+0E0h]
- pand xmm4,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+60h]
- pand xmm0,xmm4
- movdqa xmm4, [esp+40h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm2,xmm1
- packuswb xmm3,xmm5
- movdqa [esp+80h],xmm2
- movdqa [esp+90h],xmm3
- mov esi,dword [esp+1Ch]
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+10h]
- movdqa xmm2, [esi+20h]
- movdqa xmm3, [esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+10h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-
-
-;*******************************************************************************
-; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-; int32_t iBeta, int8_t * pTC)
-;*******************************************************************************
-
-
-WELS_EXTERN DeblockLumaLt4V_sse2
-
-ALIGN 16
-
-DeblockLumaLt4V_sse2:
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 420 ; 000001a4H
- mov eax, dword [ebp+8]
- mov ecx, dword [ebp+12]
-
- pxor xmm0, xmm0
- push ebx
- mov edx, dword [ebp+24]
- movdqa [esp+424-384], xmm0
- push esi
-
- lea esi, [ecx+ecx*2]
- push edi
- mov edi, eax
- sub edi, esi
- movdqa xmm0, [edi]
-
- lea esi, [ecx+ecx]
- movdqa [esp+432-208], xmm0
- mov edi, eax
- sub edi, esi
- movdqa xmm0, [edi]
- movdqa [esp+448-208], xmm0
-
- mov ebx, eax
- sub ebx, ecx
- movdqa xmm0, [ebx]
- movdqa [esp+464-208], xmm0
-
- movdqa xmm0, [eax]
-
- add ecx, eax
- movdqa [esp+480-208], xmm0
- movdqa xmm0, [ecx]
- mov dword [esp+432-404], ecx
-
- movsx ecx, word [ebp+16]
- movdqa [esp+496-208], xmm0
- movdqa xmm0, [esi+eax]
-
- movsx si, byte [edx]
- movdqa [esp+512-208], xmm0
- movd xmm0, ecx
- movsx ecx, word [ebp+20]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- pshufd xmm0, xmm1, 0
- movdqa [esp+432-112], xmm0
- movd xmm0, ecx
- movsx cx, byte [edx+1]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- mov dword [esp+432-408], ebx
- movzx ebx, cx
- pshufd xmm0, xmm1, 0
- movd xmm1, ebx
- movzx ebx, cx
- movd xmm2, ebx
- movzx ebx, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, si
- movd xmm5, ecx
- movzx ecx, si
- movd xmm6, ecx
- movzx ecx, si
- movd xmm7, ecx
- movzx ecx, si
- movdqa [esp+432-336], xmm0
- movd xmm0, ecx
-
- movsx cx, byte [edx+3]
- movsx dx, byte [edx+2]
- movd xmm3, ebx
- punpcklwd xmm0, xmm4
- movzx esi, cx
- punpcklwd xmm6, xmm2
- punpcklwd xmm5, xmm1
- punpcklwd xmm0, xmm6
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- punpcklwd xmm0, xmm7
- movdqa [esp+432-400], xmm0
- movd xmm0, esi
- movzx esi, cx
- movd xmm2, esi
- movzx esi, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, dx
- movd xmm3, esi
- movd xmm5, ecx
- punpcklwd xmm5, xmm0
-
- movdqa xmm0, [esp+432-384]
- movzx ecx, dx
- movd xmm6, ecx
- movzx ecx, dx
- movzx edx, dx
- punpcklwd xmm6, xmm2
- movd xmm7, ecx
- movd xmm1, edx
-
- movdqa xmm2, [esp+448-208]
- punpcklbw xmm2, xmm0
-
- mov ecx, 4
- movsx edx, cx
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- movdqa xmm5, [esp+496-208]
- movdqa xmm3, [esp+464-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-240], xmm5
- movdqa xmm5, [esp+512-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-352], xmm5
- punpcklwd xmm1, xmm4
- movdqa xmm4, [esp+432-208]
- punpcklwd xmm1, xmm6
- movdqa xmm6, [esp+480-208]
- punpcklwd xmm1, xmm7
- punpcklbw xmm6, xmm0
- punpcklbw xmm3, xmm0
- punpcklbw xmm4, xmm0
- movdqa xmm7, xmm3
- psubw xmm7, xmm4
- pabsw xmm7, xmm7
- movdqa [esp+432-272], xmm4
- movdqa xmm4, [esp+432-336]
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-352]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5
- movdqa xmm5, xmm3
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5
- movdqa xmm5, [esp+432-400]
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5
- movdqa xmm5, xmm6
- psubw xmm5, xmm3
- movdqa [esp+432-32], xmm6
- psubw xmm6, [esp+432-240]
- movdqa xmm7, xmm5
- movdqa [esp+432-384], xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6
-
- pand xmm5, xmm7
- movdqa xmm6, xmm3
- psubw xmm6, xmm2
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+432-400]
- pand xmm5, xmm7
- movdqa xmm7, xmm6
- pcmpeqw xmm6, xmm0
- pcmpgtw xmm7, xmm0
- por xmm7, xmm6
- pand xmm5, xmm7
- movdqa [esp+432-320], xmm5
- movd xmm5, edx
- movdqa xmm6, xmm5
- punpcklwd xmm6, xmm5
- pshufd xmm5, xmm6, 0
- movdqa [esp+432-336], xmm5
- movdqa xmm5, [esp+432-224]
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm0
- psubw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- psllw xmm5, 2
- movdqa xmm7, xmm2
- psubw xmm7, [esp+432-240]
- paddw xmm7, xmm5
- paddw xmm7, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- psraw xmm7, 3
- pmaxsw xmm6, xmm7
- pminsw xmm5, xmm6
-
- pand xmm5, [esp+432-320]
- movdqa xmm6, [esp+432-400]
- movdqa [esp+432-64], xmm5
- movdqa [esp+432-384], xmm6
- movdqa xmm5, xmm0
- psubw xmm5, xmm6
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm5, [esp+432-304]
- movdqa xmm7, xmm2
- paddw xmm7, xmm2
- psubw xmm5, xmm7
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- pminsw xmm5, xmm6
-
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-288]
- movdqa xmm6, [esp+432-240]
- movdqa [esp+432-96], xmm5
- movdqa xmm5, [esp+432-352]
- paddw xmm5, [esp+432-304]
- movdqa xmm7, xmm6
- paddw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
- psubw xmm5, xmm7
-
- movdqa xmm7, [esp+496-208]
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-400]
- pminsw xmm5, xmm6
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-256]
- movdqa xmm6, [esp+448-208]
- punpckhbw xmm7, xmm0
- movdqa [esp+432-352], xmm7
-
- movdqa xmm7, [esp+512-208]
- punpckhbw xmm6, xmm0
- movdqa [esp+432-48], xmm5
- movdqa xmm5, [esp+432-208]
- movdqa [esp+432-368], xmm6
- movdqa xmm6, [esp+464-208]
- punpckhbw xmm7, xmm0
- punpckhbw xmm5, xmm0
- movdqa [esp+432-384], xmm7
- punpckhbw xmm6, xmm0
- movdqa [esp+432-400], xmm6
-
- movdqa xmm7, [esp+432-400]
- movdqa xmm6, [esp+480-208]
- psubw xmm7, xmm5
- movdqa [esp+432-16], xmm5
- pabsw xmm7, xmm7
- punpckhbw xmm6, xmm0
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5
-
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-384]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5
-
- movdqa xmm5, [esp+432-400]
- movdqa [esp+432-80], xmm6
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5
-
- movdqa xmm5, xmm1
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5
- movdqa xmm5, xmm6
- psubw xmm5, [esp+432-400]
- psubw xmm6, [esp+432-352]
- movdqa [esp+432-272], xmm5
- movdqa xmm7, xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7
- movdqa xmm7, xmm4
- pabsw xmm6, xmm6
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
-
- pand xmm5, xmm7
- movdqa xmm7, [esp+432-400]
- psubw xmm7, xmm6
- psubw xmm6, [esp+432-352]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
- pand xmm5, xmm4
-
- paddw xmm2, [esp+432-96]
- movdqa xmm4, xmm1
- pcmpgtw xmm4, xmm0
- movdqa xmm7, xmm1
- pcmpeqw xmm7, xmm0
- por xmm4, xmm7
- pand xmm5, xmm4
- movdqa xmm4, [esp+432-224]
- movdqa [esp+432-320], xmm5
- movdqa xmm5, [esp+432-272]
- movdqa xmm7, xmm0
- psubw xmm7, xmm4
- psubw xmm0, xmm1
- psllw xmm5, 2
- paddw xmm6, xmm5
- paddw xmm6, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- movdqa [esp+432-336], xmm0
- psraw xmm6, 3
- pmaxsw xmm7, xmm6
- pminsw xmm4, xmm7
- pand xmm4, [esp+432-320]
- movdqa xmm6, xmm0
- movdqa xmm0, [esp+432-16]
- paddw xmm0, [esp+432-304]
- movdqa [esp+432-272], xmm4
- movdqa xmm4, [esp+432-368]
- paddw xmm4, xmm4
- psubw xmm0, xmm4
-
- movdqa xmm4, [esp+432-64]
- psraw xmm0, 1
- pmaxsw xmm6, xmm0
- movdqa xmm0, [esp+432-400]
- movdqa xmm7, xmm1
- pminsw xmm7, xmm6
- movdqa xmm6, [esp+432-320]
- pand xmm7, xmm6
- pand xmm7, [esp+432-288]
- paddw xmm5, xmm7
- packuswb xmm2, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm0, xmm5
- paddw xmm3, xmm4
- packuswb xmm3, xmm0
-
- movdqa xmm0, [esp+432-32]
- psubw xmm0, xmm4
- movdqa xmm4, [esp+432-80]
- psubw xmm4, xmm5
-
- movdqa xmm5, [esp+432-240]
- paddw xmm5, [esp+432-48]
- packuswb xmm0, xmm4
- movdqa xmm4, [esp+432-384]
- paddw xmm4, [esp+432-304]
- movdqa [esp+480-208], xmm0
- movdqa xmm0, [esp+432-352]
- movdqa xmm7, xmm0
- paddw xmm0, xmm0
-
- mov ecx, dword [esp+432-408]
-
- mov edx, dword [esp+432-404]
- psubw xmm4, xmm0
- movdqa xmm0, [esp+432-336]
- movdqa [edi], xmm2
- psraw xmm4, 1
- pmaxsw xmm0, xmm4
- pminsw xmm1, xmm0
- movdqa xmm0, [esp+480-208]
-
- pop edi
- pand xmm1, xmm6
- pand xmm1, [esp+428-256]
- movdqa [ecx], xmm3
- paddw xmm7, xmm1
- pop esi
- packuswb xmm5, xmm7
- movdqa [eax], xmm0
- movdqa [edx], xmm5
- pop ebx
- mov esp, ebp
- pop ebp
- ret
-
-
-;*******************************************************************************
-; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-; int32_t iBeta)
-;*******************************************************************************
-
-WELS_EXTERN DeblockLumaEq4V_sse2
-
-ALIGN 16
-
-DeblockLumaEq4V_sse2:
-
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 628 ; 00000274H
- mov eax, dword [ebp+8]
- mov ecx, dword [ebp+12]
- push ebx
- push esi
-
- lea edx, [ecx*4]
- pxor xmm0, xmm0
- movdqa xmm2, xmm0
-
- movdqa xmm0, [ecx+eax]
- mov esi, eax
- sub esi, edx
- movdqa xmm3, [esi]
- movdqa xmm5, [eax]
- push edi
- lea edi, [ecx+ecx]
- lea ebx, [ecx+ecx*2]
- mov dword [esp+640-600], edi
- mov esi, eax
- sub esi, edi
- movdqa xmm1, [esi]
- movdqa [esp+720-272], xmm0
- mov edi, eax
- sub edi, ecx
- movdqa xmm4, [edi]
- add ecx, eax
- mov dword [esp+640-596], ecx
-
- mov ecx, dword [esp+640-600]
- movdqa xmm0, [ecx+eax]
- movdqa [esp+736-272], xmm0
-
- movdqa xmm0, [eax+ebx]
- mov edx, eax
- sub edx, ebx
-
- movsx ebx, word [ebp+16]
- movdqa xmm6, [edx]
- add ecx, eax
- movdqa [esp+752-272], xmm0
- movd xmm0, ebx
-
- movsx ebx, word [ebp+20]
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
- movdqa [esp+640-320], xmm0
- movd xmm0, ebx
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
-
- movdqa xmm7, [esp+736-272]
- punpcklbw xmm7, xmm2
- movdqa [esp+640-416], xmm7
- movdqa [esp+640-512], xmm0
- movdqa xmm0, xmm1
- movdqa [esp+672-272], xmm1
- movdqa xmm1, xmm4
- movdqa [esp+704-272], xmm5
- punpcklbw xmm5, xmm2
- punpcklbw xmm1, xmm2
-
- movdqa xmm7, xmm5
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- punpcklbw xmm0, xmm2
- movdqa [esp+688-272], xmm4
- movdqa xmm4, [esp+720-272]
- movdqa [esp+640-480], xmm0
-
- movdqa xmm7, xmm1
- psubw xmm7, xmm0
-
- movdqa xmm0, [esp+640-512]
- pabsw xmm7, xmm7
- punpcklbw xmm4, xmm2
- pcmpgtw xmm0, xmm7
- movdqa [esp+640-384], xmm4
- movdqa xmm7, xmm5
- psubw xmm7, xmm4
- movdqa xmm4, [esp+640-512]
- movdqa [esp+656-272], xmm6
- punpcklbw xmm6, xmm2
- pabsw xmm7, xmm7
- movdqa [esp+640-48], xmm2
- movdqa [esp+640-368], xmm6
- movdqa [esp+640-144], xmm1
- movdqa [esp+640-400], xmm5
- pcmpgtw xmm4, xmm7
- pand xmm0, xmm4
- movdqa xmm4, [esp+640-320]
- pcmpgtw xmm4, [esp+640-560]
- pand xmm0, xmm4
-
- mov ebx, 2
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, [esp+640-320]
- psraw xmm4, 2
- pshufd xmm7, xmm7, 0
- paddw xmm4, xmm7
- movdqa [esp+640-576], xmm4
- pcmpgtw xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
-
- movdqa xmm4, [esp+640-512]
- movdqa [esp+640-624], xmm7
- movdqa xmm7, xmm1
- psubw xmm7, xmm6
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
-
- pand xmm4, [esp+640-560]
- movdqa [esp+640-544], xmm4
- movdqa xmm4, [esp+640-512]
- movdqa xmm7, xmm5
- psubw xmm7, [esp+640-416]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
-
- pand xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
-
- movdqa xmm4, [esp+640-544]
- pandn xmm4, xmm6
- movdqa [esp+640-16], xmm4
- mov ebx, 4
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, xmm3
- punpcklbw xmm4, xmm2
- psllw xmm4, 1
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, [esp+640-480]
-
- movdqa xmm6, [esp+640-560]
- pshufd xmm7, xmm7, 0
- paddw xmm4, xmm1
- movdqa [esp+640-592], xmm7
- paddw xmm4, xmm5
- paddw xmm4, xmm7
- movdqa xmm7, [esp+640-416]
- pandn xmm6, xmm7
- movdqa [esp+640-80], xmm6
- movdqa xmm6, [esp+752-272]
- punpcklbw xmm6, xmm2
- psllw xmm6, 1
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-384]
-
- movdqa xmm7, [esp+640-480]
- paddw xmm6, xmm5
- paddw xmm6, xmm1
- paddw xmm6, [esp+640-592]
- psraw xmm6, 3
- pand xmm6, [esp+640-560]
- movdqa [esp+640-112], xmm6
- movdqa xmm6, [esp+640-544]
- pandn xmm6, xmm7
- movdqa [esp+640-336], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-528], xmm6
- movdqa xmm6, [esp+640-368]
- paddw xmm6, xmm7
- movdqa xmm7, xmm1
- psraw xmm4, 3
- pand xmm4, [esp+640-544]
- paddw xmm7, xmm5
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
-
- paddw xmm5, xmm1
- psraw xmm6, 2
- pand xmm7, xmm6
-
- movdqa xmm6, [esp+640-384]
- movdqa [esp+640-64], xmm7
- movdqa xmm7, [esp+640-560]
- pandn xmm7, xmm6
- movdqa [esp+640-304], xmm7
- movdqa xmm7, [esp+640-560]
- movdqa [esp+640-528], xmm7
- movdqa xmm7, [esp+640-416]
- paddw xmm7, xmm6
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pand xmm5, xmm7
- movdqa [esp+640-32], xmm5
-
- movdqa xmm5, [esp+640-544]
- movdqa [esp+640-528], xmm5
- movdqa xmm5, [esp+640-480]
- movdqa xmm7, xmm5
- paddw xmm7, xmm5
- movdqa xmm5, xmm1
- paddw xmm5, xmm6
- paddw xmm6, [esp+640-592]
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pandn xmm5, xmm7
- movdqa xmm7, [esp+640-480]
- paddw xmm7, xmm1
- paddw xmm7, [esp+640-400]
- movdqa xmm1, [esp+640-544]
- movdqa [esp+640-352], xmm5
- movdqa xmm5, [esp+640-368]
- psllw xmm7, 1
- paddw xmm7, xmm6
- paddw xmm5, xmm7
-
- movdqa xmm7, [esp+640-400]
- psraw xmm5, 3
- pand xmm1, xmm5
- movdqa xmm5, [esp+640-480]
- movdqa [esp+640-96], xmm1
- movdqa xmm1, [esp+640-560]
- movdqa [esp+640-528], xmm1
- movdqa xmm1, [esp+640-384]
- movdqa xmm6, xmm1
- paddw xmm6, xmm1
- paddw xmm1, [esp+640-400]
- paddw xmm1, [esp+640-144]
- paddw xmm7, xmm5
- paddw xmm5, [esp+640-592]
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
- psraw xmm6, 2
- psllw xmm1, 1
- paddw xmm1, xmm5
-
- movdqa xmm5, [esp+656-272]
- pandn xmm7, xmm6
- movdqa xmm6, [esp+640-416]
- paddw xmm6, xmm1
- movdqa xmm1, [esp+640-560]
- psraw xmm6, 3
- pand xmm1, xmm6
-
- movdqa xmm6, [esp+704-272]
- movdqa [esp+640-128], xmm1
- movdqa xmm1, [esp+672-272]
- punpckhbw xmm1, xmm2
- movdqa [esp+640-448], xmm1
- movdqa xmm1, [esp+688-272]
- punpckhbw xmm1, xmm2
- punpckhbw xmm6, xmm2
- movdqa [esp+640-288], xmm7
- punpckhbw xmm5, xmm2
- movdqa [esp+640-496], xmm1
- movdqa [esp+640-432], xmm6
-
- movdqa xmm7, [esp+720-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-464], xmm7
-
- movdqa xmm7, [esp+736-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-528], xmm7
-
- movdqa xmm7, xmm6
-
- psubw xmm6, [esp+640-464]
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- por xmm4, [esp+640-16]
- pabsw xmm6, xmm6
- movdqa xmm7, xmm1
- psubw xmm7, [esp+640-448]
-
- movdqa xmm1, [esp+640-512]
- pabsw xmm7, xmm7
- pcmpgtw xmm1, xmm7
- movdqa xmm7, [esp+640-512]
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+640-320]
- pand xmm1, xmm7
- movdqa xmm7, [esp+640-560]
- pcmpgtw xmm6, xmm7
- pand xmm1, xmm6
-
- movdqa xmm6, [esp+640-576]
- pcmpgtw xmm6, xmm7
-
- movdqa xmm7, [esp+640-496]
- punpckhbw xmm3, xmm2
- movdqa [esp+640-560], xmm6
- movdqa xmm6, [esp+640-512]
- psubw xmm7, xmm5
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
-
- pand xmm6, [esp+640-560]
- movdqa xmm7, [esp+640-432]
- psubw xmm7, [esp+640-528]
-
- psllw xmm3, 1
- movdqa [esp+640-544], xmm6
- movdqa xmm6, [esp+640-512]
-
- movdqa xmm2, [esp+640-544]
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, [esp+640-448]
- paddw xmm3, [esp+640-496]
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
- pand xmm6, [esp+640-560]
- movdqa [esp+640-560], xmm6
-
- movdqa xmm6, xmm0
- pand xmm6, xmm4
- movdqa xmm4, xmm0
- pandn xmm4, [esp+640-368]
- por xmm6, xmm4
- movdqa xmm4, [esp+640-432]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-592]
- psraw xmm3, 3
- pand xmm3, xmm2
- pandn xmm2, xmm5
- por xmm3, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm3
- movdqa xmm3, [esp+640-64]
- por xmm3, [esp+640-336]
- movdqa xmm2, xmm1
- pandn xmm2, xmm5
- por xmm7, xmm2
-
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-480]
- por xmm2, xmm3
- packuswb xmm6, xmm7
- movdqa [esp+640-336], xmm2
- movdqa [esp+656-272], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa xmm2, xmm5
- paddw xmm2, [esp+640-448]
- movdqa xmm3, xmm1
- movdqa xmm7, [esp+640-496]
- paddw xmm7, xmm4
- paddw xmm2, xmm7
- paddw xmm2, [esp+640-624]
- movdqa xmm7, [esp+640-544]
- psraw xmm2, 2
- pand xmm6, xmm2
- movdqa xmm2, [esp+640-448]
- pandn xmm7, xmm2
- por xmm6, xmm7
- pand xmm3, xmm6
- movdqa xmm6, xmm1
- pandn xmm6, xmm2
- paddw xmm2, [esp+640-496]
- paddw xmm2, xmm4
- por xmm3, xmm6
- movdqa xmm6, [esp+640-336]
- packuswb xmm6, xmm3
- psllw xmm2, 1
- movdqa [esp+672-272], xmm6
- movdqa xmm6, [esp+640-96]
- por xmm6, [esp+640-352]
-
- movdqa xmm3, xmm0
- pand xmm3, xmm6
- movdqa xmm6, xmm0
- pandn xmm6, [esp+640-144]
- por xmm3, xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-352], xmm3
- movdqa xmm3, [esp+640-464]
- paddw xmm3, [esp+640-592]
- paddw xmm2, xmm3
- movdqa xmm3, [esp+640-448]
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-496]
- psraw xmm5, 3
- pand xmm6, xmm5
- movdqa xmm5, [esp+640-464]
- paddw xmm2, xmm5
- paddw xmm5, [esp+640-432]
- movdqa xmm4, xmm3
- paddw xmm4, xmm3
- paddw xmm4, xmm2
- paddw xmm4, [esp+640-624]
- movdqa xmm2, [esp+640-544]
- paddw xmm3, [esp+640-592]
- psraw xmm4, 2
- pandn xmm2, xmm4
- por xmm6, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-496]
- movdqa xmm2, xmm1
- pandn xmm2, xmm6
- por xmm7, xmm2
- movdqa xmm2, [esp+640-352]
- packuswb xmm2, xmm7
- movdqa [esp+688-272], xmm2
- movdqa xmm2, [esp+640-128]
- por xmm2, [esp+640-288]
-
- movdqa xmm4, xmm0
- pand xmm4, xmm2
- paddw xmm5, xmm6
- movdqa xmm2, xmm0
- pandn xmm2, [esp+640-400]
- por xmm4, xmm2
- movdqa xmm2, [esp+640-528]
- psllw xmm5, 1
- paddw xmm5, xmm3
- movdqa xmm3, [esp+640-560]
- paddw xmm2, xmm5
- psraw xmm2, 3
- movdqa [esp+640-288], xmm4
- movdqa xmm4, [esp+640-560]
- pand xmm4, xmm2
- movdqa xmm2, [esp+640-464]
- movdqa xmm5, xmm2
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-432]
- paddw xmm2, [esp+640-448]
- movdqa xmm7, xmm1
- paddw xmm5, xmm2
- paddw xmm5, [esp+640-624]
- movdqa xmm6, [esp+640-560]
- psraw xmm5, 2
- pandn xmm3, xmm5
- por xmm4, xmm3
- movdqa xmm3, [esp+640-32]
- por xmm3, [esp+640-304]
- pand xmm7, xmm4
- movdqa xmm4, [esp+640-432]
- movdqa xmm5, [esp+640-464]
- movdqa xmm2, xmm1
- pandn xmm2, xmm4
- paddw xmm4, [esp+640-496]
- por xmm7, xmm2
- movdqa xmm2, [esp+640-288]
- packuswb xmm2, xmm7
- movdqa [esp+704-272], xmm2
-
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-384]
- por xmm2, xmm3
- movdqa [esp+640-304], xmm2
- movdqa xmm2, [esp+640-528]
- movdqa xmm3, xmm2
- paddw xmm3, [esp+640-464]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-624]
- psraw xmm3, 2
- pand xmm6, xmm3
- movdqa xmm3, [esp+640-560]
- movdqa xmm4, xmm3
- pandn xmm4, xmm5
- por xmm6, xmm4
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-304]
- movdqa xmm4, xmm1
- pandn xmm4, xmm5
- por xmm7, xmm4
-
- movdqa xmm4, xmm0
- pandn xmm0, [esp+640-416]
- packuswb xmm6, xmm7
- movdqa xmm7, [esp+640-112]
- por xmm7, [esp+640-80]
- pand xmm4, xmm7
- por xmm4, xmm0
- movdqa xmm0, [esp+752-272]
- punpckhbw xmm0, [esp+640-48]
- psllw xmm0, 1
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm5
- paddw xmm0, [esp+640-432]
- paddw xmm0, [esp+640-496]
- paddw xmm0, [esp+640-592]
- psraw xmm0, 3
- pand xmm0, xmm3
- movdqa xmm7, xmm1
- pandn xmm3, xmm2
- por xmm0, xmm3
- pand xmm7, xmm0
-
- movdqa xmm0, [esp+656-272]
- movdqa [edx], xmm0
-
- movdqa xmm0, [esp+672-272]
-
- mov edx, dword [esp+640-596]
- movdqa [esi], xmm0
- movdqa xmm0, [esp+688-272]
- movdqa [edi], xmm0
- movdqa xmm0, [esp+704-272]
-
- pop edi
- pandn xmm1, xmm2
- movdqa [eax], xmm0
- por xmm7, xmm1
- pop esi
- packuswb xmm4, xmm7
- movdqa [edx], xmm6
- movdqa [ecx], xmm4
- pop ebx
- mov esp, ebp
- pop ebp
- ret
-
-
-;********************************************************************************
-;
-; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
-;
-;********************************************************************************
-
-WELS_EXTERN DeblockLumaTransposeH2V_sse2
-
-ALIGN 16
-
-DeblockLumaTransposeH2V_sse2:
- push ebp
- push ebx
- mov ebp, esp
- and esp,0FFFFFFF0h
- sub esp, 10h
-
- mov eax, [ebp + 0Ch]
- mov ecx, [ebp + 10h]
- lea edx, [eax + ecx * 8]
- lea ebx, [ecx*3]
-
- movq xmm0, [eax]
- movq xmm7, [edx]
- punpcklqdq xmm0, xmm7
- movq xmm1, [eax + ecx]
- movq xmm7, [edx + ecx]
- punpcklqdq xmm1, xmm7
- movq xmm2, [eax + ecx*2]
- movq xmm7, [edx + ecx*2]
- punpcklqdq xmm2, xmm7
- movq xmm3, [eax + ebx]
- movq xmm7, [edx + ebx]
- punpcklqdq xmm3, xmm7
-
- lea eax, [eax + ecx * 4]
- lea edx, [edx + ecx * 4]
- movq xmm4, [eax]
- movq xmm7, [edx]
- punpcklqdq xmm4, xmm7
- movq xmm5, [eax + ecx]
- movq xmm7, [edx + ecx]
- punpcklqdq xmm5, xmm7
- movq xmm6, [eax + ecx*2]
- movq xmm7, [edx + ecx*2]
- punpcklqdq xmm6, xmm7
-
- movdqa [esp], xmm0
- movq xmm7, [eax + ebx]
- movq xmm0, [edx + ebx]
- punpcklqdq xmm7, xmm0
- movdqa xmm0, [esp]
-
- SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
- ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
- mov eax, [ebp + 14h]
- movdqa [eax], xmm4
- movdqa [eax + 10h], xmm2
- movdqa [eax + 20h], xmm3
- movdqa [eax + 30h], xmm7
- movdqa [eax + 40h], xmm5
- movdqa [eax + 50h], xmm1
- movdqa [eax + 60h], xmm6
- movdqa [eax + 70h], xmm0
-
- mov esp, ebp
- pop ebx
- pop ebp
- ret
-
-
-
-;*******************************************************************************************
-;
-; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-;
-;*******************************************************************************************
-
-WELS_EXTERN DeblockLumaTransposeV2H_sse2
-
-ALIGN 16
-
-DeblockLumaTransposeV2H_sse2:
- push ebp
- mov ebp, esp
-
- and esp, 0FFFFFFF0h
- sub esp, 10h
-
- mov eax, [ebp + 10h]
- mov ecx, [ebp + 0Ch]
- mov edx, [ebp + 08h]
-
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 10h]
- movdqa xmm2, [eax + 20h]
- movdqa xmm3, [eax + 30h]
- movdqa xmm4, [eax + 40h]
- movdqa xmm5, [eax + 50h]
- movdqa xmm6, [eax + 60h]
- movdqa xmm7, [eax + 70h]
-
- SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
- ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
- lea eax, [ecx * 3]
-
- movq [edx], xmm4
- movq [edx + ecx], xmm2
- movq [edx + ecx*2], xmm3
- movq [edx + eax], xmm7
-
- lea edx, [edx + ecx*4]
- movq [edx], xmm5
- movq [edx + ecx], xmm1
- movq [edx + ecx*2], xmm6
- movq [edx + eax], xmm0
-
- psrldq xmm4, 8
- psrldq xmm2, 8
- psrldq xmm3, 8
- psrldq xmm7, 8
- psrldq xmm5, 8
- psrldq xmm1, 8
- psrldq xmm6, 8
- psrldq xmm0, 8
-
- lea edx, [edx + ecx*4]
- movq [edx], xmm4
- movq [edx + ecx], xmm2
- movq [edx + ecx*2], xmm3
- movq [edx + eax], xmm7
-
- lea edx, [edx + ecx*4]
- movq [edx], xmm5
- movq [edx + ecx], xmm1
- movq [edx + ecx*2], xmm6
- movq [edx + eax], xmm0
-
-
- mov esp, ebp
- pop ebp
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* deblock.asm
+;*
+;* Abstract
+;* edge loop
+;*
+;* History
+;* 08/07/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata pData
+%else
+SECTION .rodata align=16
+%endif
+
+SECTION .text
+
+;********************************************************************************
+; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta)
+;********************************************************************************
+WELS_EXTERN DeblockChromaEq4V_sse2
+
+ALIGN 16
+DeblockChromaEq4V_sse2:
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,68h
+ mov edx,[ebp+10h] ; iStride
+ mov eax,[ebp+8] ; pPixCb
+ mov ecx,[ebp+0Ch] ; pPixCr
+ movq xmm4,[ecx]
+ movq xmm5,[edx+ecx]
+ push esi
+ push edi
+ lea esi,[edx+edx]
+ mov edi,eax
+ sub edi,esi
+ movq xmm1,[edi]
+ mov edi,ecx
+ sub edi,esi
+ movq xmm2,[edi]
+ punpcklqdq xmm1,xmm2
+ mov esi,eax
+ sub esi,edx
+ movq xmm2,[esi]
+ mov edi,ecx
+ sub edi,edx
+ movq xmm3,[edi]
+ punpcklqdq xmm2,xmm3
+ movq xmm3,[eax]
+ punpcklqdq xmm3,xmm4
+ movq xmm4,[edx+eax]
+ mov edx, [ebp + 14h]
+ punpcklqdq xmm4,xmm5
+ movd xmm5,edx
+ mov edx, [ebp + 18h]
+ pxor xmm0,xmm0
+ movdqa xmm6,xmm5
+ punpcklwd xmm6,xmm5
+ pshufd xmm5,xmm6,0
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,xmm1
+ punpckhbw xmm1,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+40h],xmm1
+ movdqa [esp+60h],xmm7
+ movdqa xmm7,xmm2
+ punpcklbw xmm7,xmm0
+ movdqa [esp+10h],xmm7
+ movdqa xmm7,xmm3
+ punpcklbw xmm7,xmm0
+ punpckhbw xmm3,xmm0
+ movdqa [esp+50h],xmm7
+ movdqa xmm7,xmm4
+ punpckhbw xmm4,xmm0
+ punpckhbw xmm2,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+30h],xmm3
+ movdqa xmm3,[esp+10h]
+ movdqa xmm1,xmm3
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa [esp+20h],xmm4
+ movdqa xmm0,xmm5
+ pcmpgtw xmm0,xmm1
+ movdqa xmm1,[esp+60h]
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ pand xmm0,xmm4
+ movdqa xmm1,xmm7
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,xmm2
+ psubw xmm1,[esp+30h]
+ pabsw xmm1,xmm1
+ pcmpgtw xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ pand xmm0,xmm4
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,[esp+20h]
+ psubw xmm1,[esp+30h]
+ pand xmm5,xmm4
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ pand xmm5,xmm6
+ mov edx,2
+ movsx edx,dx
+ movd xmm1,edx
+ movdqa xmm4,xmm1
+ punpcklwd xmm4,xmm1
+        pshufd     xmm1,xmm4,0 ; xmm1 = the rounding constant 2 in all eight words
+ movdqa xmm4,[esp+60h]
+ movdqa xmm6,xmm4
+ paddw xmm6,xmm4
+ paddw xmm6,xmm3
+ paddw xmm6,xmm7
+ movdqa [esp+10h],xmm1
+ paddw xmm6,[esp+10h]
+ psraw xmm6,2
+ movdqa xmm4,xmm0
+ pandn xmm4,xmm3
+ movdqa xmm3,[esp+40h]
+ movdqa xmm1,xmm0
+ pand xmm1,xmm6
+ por xmm1,xmm4
+ movdqa xmm6,xmm3
+ paddw xmm6,xmm3
+ movdqa xmm3,[esp+10h]
+ paddw xmm6,xmm2
+ paddw xmm6,[esp+20h]
+ paddw xmm6,xmm3
+ psraw xmm6,2
+ movdqa xmm4,xmm5
+ pand xmm4,xmm6
+ movdqa xmm6,xmm5
+ pandn xmm6,xmm2
+ por xmm4,xmm6
+ packuswb xmm1,xmm4
+ movdqa xmm4,[esp+50h]
+ movdqa xmm6,xmm7
+ paddw xmm6,xmm7
+ paddw xmm6,xmm4
+ paddw xmm6,[esp+60h]
+ paddw xmm6,xmm3
+ psraw xmm6,2
+ movdqa xmm2,xmm0
+ pand xmm2,xmm6
+ pandn xmm0,xmm4
+ por xmm2,xmm0
+ movdqa xmm0,[esp+20h]
+ movdqa xmm6,xmm0
+ paddw xmm6,xmm0
+ movdqa xmm0,[esp+30h]
+ paddw xmm6,xmm0
+ paddw xmm6,[esp+40h]
+ movdqa xmm4,xmm5
+ paddw xmm6,xmm3
+ movq [esi],xmm1
+ psraw xmm6,2
+ pand xmm4,xmm6
+ pandn xmm5,xmm0
+ por xmm4,xmm5
+ packuswb xmm2,xmm4
+ movq [eax],xmm2
+ psrldq xmm1,8
+ movq [edi],xmm1
+ pop edi
+ psrldq xmm2,8
+ movq [ecx],xmm2
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
+;******************************************************************************
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
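+; Scalar reference for the filter below (H.264 normal chroma deblocking,
+; bS < 4). Each of the four signed bytes in pTC covers two columns, and
+; columns with tc <= 0 are left untouched. Where the alpha/beta tests of
+; the bS == 4 case also pass:
+;   d   = Clip3( -tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3 )
+;   p0' = Clip1( p0 + d )
+;   q0' = Clip1( q0 - d )
+; pmaxsw/pminsw against -tc and tc implement the Clip3.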
+
+WELS_EXTERN DeblockChromaLt4V_sse2
+
+DeblockChromaLt4V_sse2:
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0E4h
+ push ebx
+ push esi
+ mov esi, [ebp+1Ch] ; pTC
+ movsx ebx, byte [esi+2]
+ push edi
+ movsx di,byte [esi+3]
+ mov word [esp+0Ch],bx
+ movsx bx,byte [esi+1]
+ movsx esi,byte [esi]
+ mov word [esp+0Eh],si
+ movzx esi,di
+ movd xmm1,esi
+ movzx esi,di
+ movd xmm2,esi
+ mov si,word [esp+0Ch]
+ mov edx, [ebp + 10h]
+ mov eax, [ebp + 08h]
+ movzx edi,si
+ movzx esi,si
+ mov ecx, [ebp + 0Ch]
+ movd xmm4,esi
+ movzx esi,bx
+ movd xmm5,esi
+ movd xmm3,edi
+ movzx esi,bx
+ movd xmm6,esi
+ mov si,word [esp+0Eh]
+ movzx edi,si
+ movzx esi,si
+ punpcklwd xmm6,xmm2
+ pxor xmm0,xmm0
+ movdqa [esp+40h],xmm0
+ movd xmm7,edi
+ movd xmm0,esi
+ lea esi,[edx+edx]
+ mov edi,eax
+ sub edi,esi
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ punpcklwd xmm0,xmm4
+ movq xmm4,[edx+ecx]
+ punpcklwd xmm7,xmm3
+ movq xmm3,[eax]
+ punpcklwd xmm0,xmm6
+ movq xmm6,[edi]
+ punpcklwd xmm7,xmm5
+ punpcklwd xmm0,xmm7
+ mov edi,ecx
+ sub edi,esi
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+60h],xmm2
+ movq xmm2, [edi]
+ punpcklqdq xmm6,xmm2
+ mov esi,eax
+ sub esi,edx
+ movq xmm7,[esi]
+ mov edi,ecx
+ sub edi,edx
+ movq xmm2,[edi]
+ punpcklqdq xmm7,xmm2
+ movq xmm2,[ecx]
+ punpcklqdq xmm3,xmm2
+ movq xmm2,[edx+eax]
+ movsx edx,word [ebp + 14h]
+ punpcklqdq xmm2,xmm4
+ movdqa [esp+0E0h],xmm2
+ movd xmm2,edx
+ movsx edx,word [ebp + 18h]
+ movdqa xmm4,xmm2
+ punpcklwd xmm4,xmm2
+ movd xmm2,edx
+ movdqa xmm5,xmm2
+ punpcklwd xmm5,xmm2
+ pshufd xmm2,xmm5,0
+ movdqa [esp+50h],xmm2
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ movdqa [esp+0D0h],xmm3
+ pshufd xmm4,xmm4,0
+ movdqa [esp+30h],xmm2
+ punpckhbw xmm6,xmm1
+ movdqa [esp+80h],xmm6
+ movdqa xmm6,[esp+0D0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+70h],xmm6
+ movdqa xmm6, [esp+0E0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+90h],xmm6
+ movdqa xmm5, [esp+0E0h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0A0h],xmm7
+ punpcklbw xmm3,xmm1
+ mov edx,4
+ punpcklbw xmm2,xmm1
+ movsx edx,dx
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,[esp+30h]
+ movdqa [esp+20h],xmm6
+ psubw xmm7,xmm5
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1,[esp+60h]
+ movdqa [esp+40h],xmm6
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6, [esp+20h]
+ movdqa xmm7, [esp+50h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6
+ movdqa [esp+10h],xmm0
+ movdqa xmm6, [esp+10h]
+ pminsw xmm6,xmm1
+ movdqa [esp+10h],xmm6
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm6,xmm4
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1, [esp+30h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1
+ movdqa xmm1,[esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5
+ movdqa xmm5,[esp+80h]
+ psubw xmm5,[esp+90h]
+ pand xmm6,xmm1
+ pand xmm6,[esp+40h]
+ movdqa xmm1,[esp+10h]
+ pand xmm1,xmm6
+ movdqa xmm6,[esp+70h]
+ movdqa [esp+30h],xmm1
+ movdqa xmm1,[esp+0A0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6,[esp+20h]
+ movdqa xmm5,[esp+60h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5
+ movdqa xmm5,[esp+70h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+80h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+90h]
+ pand xmm4,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+40h]
+ pand xmm0,xmm4
+ movdqa xmm4,[esp+30h]
+ paddw xmm2,xmm4
+ paddw xmm1,xmm0
+ packuswb xmm2,xmm1
+ movq [esi],xmm2
+ psubw xmm3,xmm4
+ psubw xmm5,xmm0
+ packuswb xmm3,xmm5
+ movq [eax],xmm3
+ psrldq xmm2,8
+ movq [edi],xmm2
+ pop edi
+ pop esi
+ psrldq xmm3,8
+ movq [ecx],xmm3
+ pop ebx
+ mov esp,ebp
+ pop ebp
+ ret
+
+;***************************************************************************
+; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta)
+;***************************************************************************
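+; Same strong (bS == 4) chroma filter as DeblockChromaEq4V_sse2, but for
+; a vertical edge: p1,p0,q0,q1 lie along each row, starting two bytes to
+; the left of pPixCb/pPixCr. The rows are gathered four bytes at a time,
+; transposed on the stack with the punpck ladder below so that columns
+; become rows, filtered with the same formulas, then transposed back and
+; scattered out as dwords.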
+
+WELS_EXTERN DeblockChromaEq4H_sse2
+
+ALIGN 16
+
+DeblockChromaEq4H_sse2:
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0C8h
+ mov ecx,dword [ebp+8]
+ mov edx,dword [ebp+0Ch]
+ mov eax,dword [ebp+10h]
+ sub ecx,2
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2]
+ mov dword [esp+18h],ecx
+ mov dword [esp+4],edx
+ lea ecx,[ecx+eax*4]
+ lea edx,[edx+eax*4]
+ lea eax,[esp+7Ch]
+ push edi
+ mov dword [esp+14h],esi
+ mov dword [esp+18h],ecx
+ mov dword [esp+0Ch],edx
+ mov dword [esp+10h],eax
+ mov esi,dword [esp+1Ch]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+14h]
+ movd xmm0,dword [esi]
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h]
+ mov edi,dword [esp+0Ch]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+10h]
+ movdqa [edi],xmm0
+ movdqa [edi+10h],xmm5
+ movdqa [edi+20h],xmm1
+ movdqa [edi+30h],xmm6
+ movsx ecx,word [ebp+14h]
+ movsx edx,word [ebp+18h]
+ movdqa xmm6,[esp+80h]
+ movdqa xmm4,[esp+90h]
+ movdqa xmm5,[esp+0A0h]
+ movdqa xmm7,[esp+0B0h]
+ pxor xmm0,xmm0
+ movd xmm1,ecx
+ movdqa xmm2,xmm1
+ punpcklwd xmm2,xmm1
+ pshufd xmm1,xmm2,0
+ movd xmm2,edx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+ movdqa xmm3,xmm6
+ punpckhbw xmm6,xmm0
+ movdqa [esp+60h],xmm6
+ movdqa xmm6,[esp+90h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+30h],xmm6
+ movdqa xmm6,[esp+0A0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+40h],xmm6
+ movdqa xmm6,[esp+0B0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+70h],xmm6
+ punpcklbw xmm7,xmm0
+ punpcklbw xmm4,xmm0
+ punpcklbw xmm5,xmm0
+ punpcklbw xmm3,xmm0
+ movdqa [esp+50h],xmm7
+ movdqa xmm6,xmm4
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ movdqa xmm0,xmm1
+ pcmpgtw xmm0,xmm6
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm4
+ pabsw xmm6,xmm6
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+30h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pcmpgtw xmm1,xmm6
+ movdqa xmm6,[esp+60h]
+ psubw xmm6,[esp+30h]
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+70h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pand xmm1,xmm7
+ pcmpgtw xmm2,xmm6
+ pand xmm1,xmm2
+ mov eax,2
+ movsx ecx,ax
+ movd xmm2,ecx
+ movdqa xmm6,xmm2
+ punpcklwd xmm6,xmm2
+ pshufd xmm2,xmm6,0
+ movdqa [esp+20h],xmm2
+ movdqa xmm2,xmm3
+ paddw xmm2,xmm3
+ paddw xmm2,xmm4
+ paddw xmm2,[esp+50h]
+ paddw xmm2,[esp+20h]
+ psraw xmm2,2
+ movdqa xmm6,xmm0
+ pand xmm6,xmm2
+ movdqa xmm2,xmm0
+ pandn xmm2,xmm4
+ por xmm6,xmm2
+ movdqa xmm2,[esp+60h]
+ movdqa xmm7,xmm2
+ paddw xmm7,xmm2
+ paddw xmm7,[esp+30h]
+ paddw xmm7,[esp+70h]
+ paddw xmm7,[esp+20h]
+ movdqa xmm4,xmm1
+ movdqa xmm2,xmm1
+ pandn xmm2,[esp+30h]
+ psraw xmm7,2
+ pand xmm4,xmm7
+ por xmm4,xmm2
+ movdqa xmm2,[esp+50h]
+ packuswb xmm6,xmm4
+ movdqa [esp+90h],xmm6
+ movdqa xmm6,xmm2
+ paddw xmm6,xmm2
+ movdqa xmm2,[esp+20h]
+ paddw xmm6,xmm5
+ paddw xmm6,xmm3
+ movdqa xmm4,xmm0
+ pandn xmm0,xmm5
+ paddw xmm6,xmm2
+ psraw xmm6,2
+ pand xmm4,xmm6
+ por xmm4,xmm0
+ movdqa xmm0,[esp+70h]
+ movdqa xmm5,xmm0
+ paddw xmm5,xmm0
+ movdqa xmm0,[esp+40h]
+ paddw xmm5,xmm0
+ paddw xmm5,[esp+60h]
+ movdqa xmm3,xmm1
+ paddw xmm5,xmm2
+ psraw xmm5,2
+ pand xmm3,xmm5
+ pandn xmm1,xmm0
+ por xmm3,xmm1
+ packuswb xmm4,xmm3
+ movdqa [esp+0A0h],xmm4
+ mov esi,dword [esp+10h]
+ movdqa xmm0,[esi]
+ movdqa xmm1,[esi+10h]
+ movdqa xmm2,[esi+20h]
+ movdqa xmm3,[esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+1Ch]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+14h]
+ mov edi,dword [esp+8]
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+0Ch]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
+;*******************************************************************************
+; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
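+; Vertical-edge variant of DeblockChromaLt4V_sse2: the same tc-clipped
+; delta filter, wrapped in the same gather/transpose/filter/transpose/
+; scatter pattern as DeblockChromaEq4H_sse2 above.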
+
+WELS_EXTERN DeblockChromaLt4H_sse2
+
+ALIGN 16
+
+DeblockChromaLt4H_sse2:
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,108h
+ mov ecx,dword [ebp+8]
+ mov edx,dword [ebp+0Ch]
+ mov eax,dword [ebp+10h]
+ sub ecx,2
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2]
+ mov dword [esp+10h],ecx
+ mov dword [esp+4],edx
+ lea ecx,[ecx+eax*4]
+ lea edx,[edx+eax*4]
+ lea eax,[esp+6Ch]
+ push edi
+ mov dword [esp+0Ch],esi
+ mov dword [esp+18h],ecx
+ mov dword [esp+10h],edx
+ mov dword [esp+1Ch],eax
+ mov esi,dword [esp+14h]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+0Ch]
+ movd xmm0,dword [esi]
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h]
+ mov edi,dword [esp+10h]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+1Ch]
+ movdqa [edi],xmm0
+ movdqa [edi+10h],xmm5
+ movdqa [edi+20h],xmm1
+ movdqa [edi+30h],xmm6
+ mov eax,dword [ebp+1Ch]
+ movsx cx,byte [eax+3]
+ movsx dx,byte [eax+2]
+ movsx si,byte [eax+1]
+ movsx ax,byte [eax]
+ movzx edi,cx
+ movzx ecx,cx
+ movd xmm2,ecx
+ movzx ecx,dx
+ movzx edx,dx
+ movd xmm3,ecx
+ movd xmm4,edx
+ movzx ecx,si
+ movzx edx,si
+ movd xmm5,ecx
+ pxor xmm0,xmm0
+ movd xmm6,edx
+ movzx ecx,ax
+ movdqa [esp+60h],xmm0
+ movzx edx,ax
+ movsx eax,word [ebp+14h]
+ punpcklwd xmm6,xmm2
+ movd xmm1,edi
+ movd xmm7,ecx
+ movsx ecx,word [ebp+18h]
+ movd xmm0,edx
+ punpcklwd xmm7,xmm3
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+60h]
+ punpcklwd xmm7,xmm5
+ movdqa xmm5,[esp+0A0h]
+ punpcklwd xmm0,xmm4
+ punpcklwd xmm0,xmm6
+ movdqa xmm6, [esp+70h]
+ punpcklwd xmm0,xmm7
+ movdqa xmm7,[esp+80h]
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+0D0h],xmm2
+ movd xmm2,eax
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm4,xmm3,0
+ movd xmm2,ecx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+ movdqa xmm3, [esp+90h]
+ movdqa [esp+50h],xmm2
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa [esp+40h],xmm2
+ movdqa [esp+0B0h],xmm6
+ movdqa xmm6,[esp+90h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm2,xmm1
+ punpcklbw xmm3,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0F0h],xmm7
+ movdqa [esp+0C0h],xmm6
+ movdqa xmm6, [esp+0A0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+0E0h],xmm6
+ mov edx,4
+ movsx eax,dx
+ movd xmm6,eax
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa [esp+30h],xmm6
+ movdqa xmm7, [esp+40h]
+ psubw xmm7,xmm5
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa [esp+60h],xmm6
+ movdqa xmm1, [esp+0D0h]
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6,[esp+30h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6
+ movdqa xmm7,[esp+50h]
+ movdqa [esp+20h],xmm0
+ movdqa xmm6, [esp+20h]
+ pminsw xmm6,xmm1
+ movdqa [esp+20h],xmm6
+ movdqa xmm6,xmm4
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1, [esp+40h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1
+ movdqa xmm1, [esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5
+ movdqa xmm5, [esp+0B0h]
+ psubw xmm5,[esp+0E0h]
+ pand xmm6,xmm1
+ pand xmm6, [esp+60h]
+ movdqa xmm1, [esp+20h]
+ pand xmm1,xmm6
+ movdqa xmm6, [esp+0C0h]
+ movdqa [esp+40h],xmm1
+ movdqa xmm1, [esp+0F0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6, [esp+30h]
+ movdqa xmm5, [esp+0D0h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5
+ movdqa xmm5,[esp+0C0h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+0B0h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6, [esp+0E0h]
+ pand xmm4,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+60h]
+ pand xmm0,xmm4
+ movdqa xmm4, [esp+40h]
+ paddw xmm2,xmm4
+ paddw xmm1,xmm0
+ psubw xmm3,xmm4
+ psubw xmm5,xmm0
+ packuswb xmm2,xmm1
+ packuswb xmm3,xmm5
+ movdqa [esp+80h],xmm2
+ movdqa [esp+90h],xmm3
+ mov esi,dword [esp+1Ch]
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi+10h]
+ movdqa xmm2, [esi+20h]
+ movdqa xmm3, [esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+14h]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+0Ch]
+ mov edi,dword [esp+8]
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+10h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
+
+
+;*******************************************************************************
+; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; int32_t iBeta, int8_t * pTC)
+;*******************************************************************************
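+; Scalar reference for the filter below (H.264 normal luma deblocking,
+; bS < 4, across a horizontal edge; pTC entries with tc0 < 0 disable
+; their 4-pixel column group). With tc0 = pTC[i] and
+; tc = tc0 + (|p2-p0| < iBeta) + (|q2-q0| < iBeta):
+;   d   = Clip3( -tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3 )
+;   p0' = Clip1( p0 + d ),  q0' = Clip1( q0 - d )
+; and, where the corresponding side test passes:
+;   p1' = p1 + Clip3( -tc0, tc0, (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1 )
+; (symmetrically for q1'). The psubw of pcmpgtw masks (0 or -1) below is
+; what increments tc0 to tc.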
+
+
+WELS_EXTERN DeblockLumaLt4V_sse2
+
+ALIGN 16
+
+DeblockLumaLt4V_sse2:
+ push ebp
+ mov ebp, esp
+ and esp, -16 ; fffffff0H
+ sub esp, 420 ; 000001a4H
+ mov eax, dword [ebp+8]
+ mov ecx, dword [ebp+12]
+
+ pxor xmm0, xmm0
+ push ebx
+ mov edx, dword [ebp+24]
+ movdqa [esp+424-384], xmm0
+ push esi
+
+ lea esi, [ecx+ecx*2]
+ push edi
+ mov edi, eax
+ sub edi, esi
+ movdqa xmm0, [edi]
+
+ lea esi, [ecx+ecx]
+ movdqa [esp+432-208], xmm0
+ mov edi, eax
+ sub edi, esi
+ movdqa xmm0, [edi]
+ movdqa [esp+448-208], xmm0
+
+ mov ebx, eax
+ sub ebx, ecx
+ movdqa xmm0, [ebx]
+ movdqa [esp+464-208], xmm0
+
+ movdqa xmm0, [eax]
+
+ add ecx, eax
+ movdqa [esp+480-208], xmm0
+ movdqa xmm0, [ecx]
+ mov dword [esp+432-404], ecx
+
+ movsx ecx, word [ebp+16]
+ movdqa [esp+496-208], xmm0
+ movdqa xmm0, [esi+eax]
+
+ movsx si, byte [edx]
+ movdqa [esp+512-208], xmm0
+ movd xmm0, ecx
+ movsx ecx, word [ebp+20]
+ movdqa xmm1, xmm0
+ punpcklwd xmm1, xmm0
+ pshufd xmm0, xmm1, 0
+ movdqa [esp+432-112], xmm0
+ movd xmm0, ecx
+ movsx cx, byte [edx+1]
+ movdqa xmm1, xmm0
+ punpcklwd xmm1, xmm0
+ mov dword [esp+432-408], ebx
+ movzx ebx, cx
+ pshufd xmm0, xmm1, 0
+ movd xmm1, ebx
+ movzx ebx, cx
+ movd xmm2, ebx
+ movzx ebx, cx
+ movzx ecx, cx
+ movd xmm4, ecx
+ movzx ecx, si
+ movd xmm5, ecx
+ movzx ecx, si
+ movd xmm6, ecx
+ movzx ecx, si
+ movd xmm7, ecx
+ movzx ecx, si
+ movdqa [esp+432-336], xmm0
+ movd xmm0, ecx
+
+ movsx cx, byte [edx+3]
+ movsx dx, byte [edx+2]
+ movd xmm3, ebx
+ punpcklwd xmm0, xmm4
+ movzx esi, cx
+ punpcklwd xmm6, xmm2
+ punpcklwd xmm5, xmm1
+ punpcklwd xmm0, xmm6
+ punpcklwd xmm7, xmm3
+ punpcklwd xmm7, xmm5
+ punpcklwd xmm0, xmm7
+ movdqa [esp+432-400], xmm0
+ movd xmm0, esi
+ movzx esi, cx
+ movd xmm2, esi
+ movzx esi, cx
+ movzx ecx, cx
+ movd xmm4, ecx
+ movzx ecx, dx
+ movd xmm3, esi
+ movd xmm5, ecx
+ punpcklwd xmm5, xmm0
+
+ movdqa xmm0, [esp+432-384]
+ movzx ecx, dx
+ movd xmm6, ecx
+ movzx ecx, dx
+ movzx edx, dx
+ punpcklwd xmm6, xmm2
+ movd xmm7, ecx
+ movd xmm1, edx
+
+ movdqa xmm2, [esp+448-208]
+ punpcklbw xmm2, xmm0
+
+ mov ecx, 4
+ movsx edx, cx
+ punpcklwd xmm7, xmm3
+ punpcklwd xmm7, xmm5
+ movdqa xmm5, [esp+496-208]
+ movdqa xmm3, [esp+464-208]
+ punpcklbw xmm5, xmm0
+ movdqa [esp+432-240], xmm5
+ movdqa xmm5, [esp+512-208]
+ punpcklbw xmm5, xmm0
+ movdqa [esp+432-352], xmm5
+ punpcklwd xmm1, xmm4
+ movdqa xmm4, [esp+432-208]
+ punpcklwd xmm1, xmm6
+ movdqa xmm6, [esp+480-208]
+ punpcklwd xmm1, xmm7
+ punpcklbw xmm6, xmm0
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+ movdqa xmm7, xmm3
+ psubw xmm7, xmm4
+ pabsw xmm7, xmm7
+ movdqa [esp+432-272], xmm4
+ movdqa xmm4, [esp+432-336]
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-288], xmm5
+ movdqa xmm7, xmm6
+ psubw xmm7, [esp+432-352]
+ pabsw xmm7, xmm7
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-256], xmm5
+ movdqa xmm5, xmm3
+	pavgw	xmm5, xmm6 ; (p0 + q0 + 1) >> 1, the shared tap for the p1'/q1' updates
+ movdqa [esp+432-304], xmm5
+ movdqa xmm5, [esp+432-400]
+ psubw xmm5, [esp+432-288]
+ psubw xmm5, [esp+432-256]
+ movdqa [esp+432-224], xmm5
+ movdqa xmm5, xmm6
+ psubw xmm5, xmm3
+ movdqa [esp+432-32], xmm6
+ psubw xmm6, [esp+432-240]
+ movdqa xmm7, xmm5
+ movdqa [esp+432-384], xmm5
+ movdqa xmm5, [esp+432-112]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm5, xmm7
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm4
+ pcmpgtw xmm7, xmm6
+
+ pand xmm5, xmm7
+ movdqa xmm6, xmm3
+ psubw xmm6, xmm2
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm4
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+432-400]
+ pand xmm5, xmm7
+ movdqa xmm7, xmm6
+ pcmpeqw xmm6, xmm0
+ pcmpgtw xmm7, xmm0
+ por xmm7, xmm6
+ pand xmm5, xmm7
+ movdqa [esp+432-320], xmm5
+ movd xmm5, edx
+ movdqa xmm6, xmm5
+ punpcklwd xmm6, xmm5
+ pshufd xmm5, xmm6, 0
+ movdqa [esp+432-336], xmm5
+ movdqa xmm5, [esp+432-224]
+ movdqa [esp+432-368], xmm5
+ movdqa xmm6, xmm0
+ psubw xmm6, xmm5
+ movdqa xmm5, [esp+432-384]
+ psllw xmm5, 2
+ movdqa xmm7, xmm2
+ psubw xmm7, [esp+432-240]
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+432-336]
+ movdqa xmm5, [esp+432-368]
+ psraw xmm7, 3
+ pmaxsw xmm6, xmm7
+ pminsw xmm5, xmm6
+
+ pand xmm5, [esp+432-320]
+ movdqa xmm6, [esp+432-400]
+ movdqa [esp+432-64], xmm5
+ movdqa [esp+432-384], xmm6
+ movdqa xmm5, xmm0
+ psubw xmm5, xmm6
+ movdqa [esp+432-368], xmm5
+ movdqa xmm6, xmm5
+ movdqa xmm5, [esp+432-272]
+ paddw xmm5, [esp+432-304]
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm2
+ psubw xmm5, xmm7
+ psraw xmm5, 1
+ pmaxsw xmm6, xmm5
+ movdqa xmm5, [esp+432-384]
+ pminsw xmm5, xmm6
+
+ pand xmm5, [esp+432-320]
+ pand xmm5, [esp+432-288]
+ movdqa xmm6, [esp+432-240]
+ movdqa [esp+432-96], xmm5
+ movdqa xmm5, [esp+432-352]
+ paddw xmm5, [esp+432-304]
+ movdqa xmm7, xmm6
+ paddw xmm7, xmm6
+ movdqa xmm6, [esp+432-368]
+ psubw xmm5, xmm7
+
+ movdqa xmm7, [esp+496-208]
+ psraw xmm5, 1
+ pmaxsw xmm6, xmm5
+ movdqa xmm5, [esp+432-400]
+ pminsw xmm5, xmm6
+ pand xmm5, [esp+432-320]
+ pand xmm5, [esp+432-256]
+ movdqa xmm6, [esp+448-208]
+ punpckhbw xmm7, xmm0
+ movdqa [esp+432-352], xmm7
+
+ movdqa xmm7, [esp+512-208]
+ punpckhbw xmm6, xmm0
+ movdqa [esp+432-48], xmm5
+ movdqa xmm5, [esp+432-208]
+ movdqa [esp+432-368], xmm6
+ movdqa xmm6, [esp+464-208]
+ punpckhbw xmm7, xmm0
+ punpckhbw xmm5, xmm0
+ movdqa [esp+432-384], xmm7
+ punpckhbw xmm6, xmm0
+ movdqa [esp+432-400], xmm6
+
+ movdqa xmm7, [esp+432-400]
+ movdqa xmm6, [esp+480-208]
+ psubw xmm7, xmm5
+ movdqa [esp+432-16], xmm5
+ pabsw xmm7, xmm7
+ punpckhbw xmm6, xmm0
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-288], xmm5
+
+ movdqa xmm7, xmm6
+ psubw xmm7, [esp+432-384]
+ pabsw xmm7, xmm7
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-256], xmm5
+
+ movdqa xmm5, [esp+432-400]
+ movdqa [esp+432-80], xmm6
+ pavgw xmm5, xmm6
+ movdqa [esp+432-304], xmm5
+
+ movdqa xmm5, xmm1
+ psubw xmm5, [esp+432-288]
+ psubw xmm5, [esp+432-256]
+ movdqa [esp+432-224], xmm5
+ movdqa xmm5, xmm6
+ psubw xmm5, [esp+432-400]
+ psubw xmm6, [esp+432-352]
+ movdqa [esp+432-272], xmm5
+ movdqa xmm7, xmm5
+ movdqa xmm5, [esp+432-112]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm5, xmm7
+ movdqa xmm7, xmm4
+ pabsw xmm6, xmm6
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+432-368]
+
+ pand xmm5, xmm7
+ movdqa xmm7, [esp+432-400]
+ psubw xmm7, xmm6
+ psubw xmm6, [esp+432-352]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
+ pand xmm5, xmm4
+
+ paddw xmm2, [esp+432-96]
+ movdqa xmm4, xmm1
+ pcmpgtw xmm4, xmm0
+ movdqa xmm7, xmm1
+ pcmpeqw xmm7, xmm0
+ por xmm4, xmm7
+ pand xmm5, xmm4
+ movdqa xmm4, [esp+432-224]
+ movdqa [esp+432-320], xmm5
+ movdqa xmm5, [esp+432-272]
+ movdqa xmm7, xmm0
+ psubw xmm7, xmm4
+ psubw xmm0, xmm1
+ psllw xmm5, 2
+ paddw xmm6, xmm5
+ paddw xmm6, [esp+432-336]
+ movdqa xmm5, [esp+432-368]
+ movdqa [esp+432-336], xmm0
+ psraw xmm6, 3
+ pmaxsw xmm7, xmm6
+ pminsw xmm4, xmm7
+ pand xmm4, [esp+432-320]
+ movdqa xmm6, xmm0
+ movdqa xmm0, [esp+432-16]
+ paddw xmm0, [esp+432-304]
+ movdqa [esp+432-272], xmm4
+ movdqa xmm4, [esp+432-368]
+ paddw xmm4, xmm4
+ psubw xmm0, xmm4
+
+ movdqa xmm4, [esp+432-64]
+ psraw xmm0, 1
+ pmaxsw xmm6, xmm0
+ movdqa xmm0, [esp+432-400]
+ movdqa xmm7, xmm1
+ pminsw xmm7, xmm6
+ movdqa xmm6, [esp+432-320]
+ pand xmm7, xmm6
+ pand xmm7, [esp+432-288]
+ paddw xmm5, xmm7
+ packuswb xmm2, xmm5
+ movdqa xmm5, [esp+432-272]
+ paddw xmm0, xmm5
+ paddw xmm3, xmm4
+ packuswb xmm3, xmm0
+
+ movdqa xmm0, [esp+432-32]
+ psubw xmm0, xmm4
+ movdqa xmm4, [esp+432-80]
+ psubw xmm4, xmm5
+
+ movdqa xmm5, [esp+432-240]
+ paddw xmm5, [esp+432-48]
+ packuswb xmm0, xmm4
+ movdqa xmm4, [esp+432-384]
+ paddw xmm4, [esp+432-304]
+ movdqa [esp+480-208], xmm0
+ movdqa xmm0, [esp+432-352]
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm0
+
+ mov ecx, dword [esp+432-408]
+
+ mov edx, dword [esp+432-404]
+ psubw xmm4, xmm0
+ movdqa xmm0, [esp+432-336]
+ movdqa [edi], xmm2
+ psraw xmm4, 1
+ pmaxsw xmm0, xmm4
+ pminsw xmm1, xmm0
+ movdqa xmm0, [esp+480-208]
+
+ pop edi
+ pand xmm1, xmm6
+	pand	xmm1, [esp+428-256] ; the pop edi above raised esp by 4, so 428-256 is the same slot as 432-256
+ movdqa [ecx], xmm3
+ paddw xmm7, xmm1
+ pop esi
+ packuswb xmm5, xmm7
+ movdqa [eax], xmm0
+ movdqa [edx], xmm5
+ pop ebx
+ mov esp, ebp
+ pop ebp
+ ret
+
+
+;*******************************************************************************
+; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; int32_t iBeta)
+;*******************************************************************************
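+; Scalar reference for the filter below (H.264 strong luma deblocking,
+; bS == 4, across a horizontal edge). Columns must pass
+; |p0-q0| < iAlpha, |p1-p0| < iBeta and |q1-q0| < iBeta; where
+; additionally |p0-q0| < (iAlpha >> 2) + 2 and |p2-p0| < iBeta:
+;   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
+;   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
+;   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
+; otherwise only p0' = (2*p1 + p0 + q1 + 2) >> 2; the q side is
+; symmetric.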
+
+WELS_EXTERN DeblockLumaEq4V_sse2
+
+ALIGN 16
+
+DeblockLumaEq4V_sse2:
+
+ push ebp
+ mov ebp, esp
+ and esp, -16 ; fffffff0H
+ sub esp, 628 ; 00000274H
+ mov eax, dword [ebp+8]
+ mov ecx, dword [ebp+12]
+ push ebx
+ push esi
+
+ lea edx, [ecx*4]
+ pxor xmm0, xmm0
+ movdqa xmm2, xmm0
+
+ movdqa xmm0, [ecx+eax]
+ mov esi, eax
+ sub esi, edx
+ movdqa xmm3, [esi]
+ movdqa xmm5, [eax]
+ push edi
+ lea edi, [ecx+ecx]
+ lea ebx, [ecx+ecx*2]
+ mov dword [esp+640-600], edi
+ mov esi, eax
+ sub esi, edi
+ movdqa xmm1, [esi]
+ movdqa [esp+720-272], xmm0
+ mov edi, eax
+ sub edi, ecx
+ movdqa xmm4, [edi]
+ add ecx, eax
+ mov dword [esp+640-596], ecx
+
+ mov ecx, dword [esp+640-600]
+ movdqa xmm0, [ecx+eax]
+ movdqa [esp+736-272], xmm0
+
+ movdqa xmm0, [eax+ebx]
+ mov edx, eax
+ sub edx, ebx
+
+ movsx ebx, word [ebp+16]
+ movdqa xmm6, [edx]
+ add ecx, eax
+ movdqa [esp+752-272], xmm0
+ movd xmm0, ebx
+
+ movsx ebx, word [ebp+20]
+ movdqa xmm7, xmm0
+ punpcklwd xmm7, xmm0
+ pshufd xmm0, xmm7, 0
+ movdqa [esp+640-320], xmm0
+ movd xmm0, ebx
+ movdqa xmm7, xmm0
+ punpcklwd xmm7, xmm0
+ pshufd xmm0, xmm7, 0
+
+ movdqa xmm7, [esp+736-272]
+ punpcklbw xmm7, xmm2
+ movdqa [esp+640-416], xmm7
+ movdqa [esp+640-512], xmm0
+ movdqa xmm0, xmm1
+ movdqa [esp+672-272], xmm1
+ movdqa xmm1, xmm4
+ movdqa [esp+704-272], xmm5
+ punpcklbw xmm5, xmm2
+ punpcklbw xmm1, xmm2
+
+ movdqa xmm7, xmm5
+ psubw xmm7, xmm1
+ pabsw xmm7, xmm7
+ movdqa [esp+640-560], xmm7
+ punpcklbw xmm0, xmm2
+ movdqa [esp+688-272], xmm4
+ movdqa xmm4, [esp+720-272]
+ movdqa [esp+640-480], xmm0
+
+ movdqa xmm7, xmm1
+ psubw xmm7, xmm0
+
+ movdqa xmm0, [esp+640-512]
+ pabsw xmm7, xmm7
+ punpcklbw xmm4, xmm2
+ pcmpgtw xmm0, xmm7
+ movdqa [esp+640-384], xmm4
+ movdqa xmm7, xmm5
+ psubw xmm7, xmm4
+ movdqa xmm4, [esp+640-512]
+ movdqa [esp+656-272], xmm6
+ punpcklbw xmm6, xmm2
+ pabsw xmm7, xmm7
+ movdqa [esp+640-48], xmm2
+ movdqa [esp+640-368], xmm6
+ movdqa [esp+640-144], xmm1
+ movdqa [esp+640-400], xmm5
+ pcmpgtw xmm4, xmm7
+ pand xmm0, xmm4
+ movdqa xmm4, [esp+640-320]
+ pcmpgtw xmm4, [esp+640-560]
+ pand xmm0, xmm4
+
+ mov ebx, 2
+ movsx ebx, bx
+ movd xmm4, ebx
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm4
+ movdqa xmm4, [esp+640-320]
+ psraw xmm4, 2
+ pshufd xmm7, xmm7, 0
+ paddw xmm4, xmm7
+ movdqa [esp+640-576], xmm4
+ pcmpgtw xmm4, [esp+640-560]
+ movdqa [esp+640-560], xmm4
+
+ movdqa xmm4, [esp+640-512]
+ movdqa [esp+640-624], xmm7
+ movdqa xmm7, xmm1
+ psubw xmm7, xmm6
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
+
+ pand xmm4, [esp+640-560]
+ movdqa [esp+640-544], xmm4
+ movdqa xmm4, [esp+640-512]
+ movdqa xmm7, xmm5
+ psubw xmm7, [esp+640-416]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
+
+ pand xmm4, [esp+640-560]
+ movdqa [esp+640-560], xmm4
+
+ movdqa xmm4, [esp+640-544]
+ pandn xmm4, xmm6
+ movdqa [esp+640-16], xmm4
+ mov ebx, 4
+ movsx ebx, bx
+ movd xmm4, ebx
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm4
+ movdqa xmm4, xmm3
+ punpcklbw xmm4, xmm2
+ psllw xmm4, 1
+ paddw xmm4, xmm6
+ paddw xmm4, xmm6
+ paddw xmm4, xmm6
+ paddw xmm4, [esp+640-480]
+
+ movdqa xmm6, [esp+640-560]
+ pshufd xmm7, xmm7, 0
+ paddw xmm4, xmm1
+ movdqa [esp+640-592], xmm7
+ paddw xmm4, xmm5
+ paddw xmm4, xmm7
+ movdqa xmm7, [esp+640-416]
+ pandn xmm6, xmm7
+ movdqa [esp+640-80], xmm6
+ movdqa xmm6, [esp+752-272]
+ punpcklbw xmm6, xmm2
+ psllw xmm6, 1
+ paddw xmm6, xmm7
+ paddw xmm6, xmm7
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-384]
+
+ movdqa xmm7, [esp+640-480]
+ paddw xmm6, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, [esp+640-592]
+ psraw xmm6, 3
+ pand xmm6, [esp+640-560]
+ movdqa [esp+640-112], xmm6
+ movdqa xmm6, [esp+640-544]
+ pandn xmm6, xmm7
+ movdqa [esp+640-336], xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa [esp+640-528], xmm6
+ movdqa xmm6, [esp+640-368]
+ paddw xmm6, xmm7
+ movdqa xmm7, xmm1
+ psraw xmm4, 3
+ pand xmm4, [esp+640-544]
+ paddw xmm7, xmm5
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-624]
+ movdqa xmm7, [esp+640-528]
+
+ paddw xmm5, xmm1
+ psraw xmm6, 2
+ pand xmm7, xmm6
+
+ movdqa xmm6, [esp+640-384]
+ movdqa [esp+640-64], xmm7
+ movdqa xmm7, [esp+640-560]
+ pandn xmm7, xmm6
+ movdqa [esp+640-304], xmm7
+ movdqa xmm7, [esp+640-560]
+ movdqa [esp+640-528], xmm7
+ movdqa xmm7, [esp+640-416]
+ paddw xmm7, xmm6
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+640-624]
+ movdqa xmm5, [esp+640-528]
+ psraw xmm7, 2
+ pand xmm5, xmm7
+ movdqa [esp+640-32], xmm5
+
+ movdqa xmm5, [esp+640-544]
+ movdqa [esp+640-528], xmm5
+ movdqa xmm5, [esp+640-480]
+ movdqa xmm7, xmm5
+ paddw xmm7, xmm5
+ movdqa xmm5, xmm1
+ paddw xmm5, xmm6
+ paddw xmm6, [esp+640-592]
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+640-624]
+ movdqa xmm5, [esp+640-528]
+ psraw xmm7, 2
+ pandn xmm5, xmm7
+ movdqa xmm7, [esp+640-480]
+ paddw xmm7, xmm1
+ paddw xmm7, [esp+640-400]
+ movdqa xmm1, [esp+640-544]
+ movdqa [esp+640-352], xmm5
+ movdqa xmm5, [esp+640-368]
+ psllw xmm7, 1
+ paddw xmm7, xmm6
+ paddw xmm5, xmm7
+
+ movdqa xmm7, [esp+640-400]
+ psraw xmm5, 3
+ pand xmm1, xmm5
+ movdqa xmm5, [esp+640-480]
+ movdqa [esp+640-96], xmm1
+ movdqa xmm1, [esp+640-560]
+ movdqa [esp+640-528], xmm1
+ movdqa xmm1, [esp+640-384]
+ movdqa xmm6, xmm1
+ paddw xmm6, xmm1
+ paddw xmm1, [esp+640-400]
+ paddw xmm1, [esp+640-144]
+ paddw xmm7, xmm5
+ paddw xmm5, [esp+640-592]
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-624]
+ movdqa xmm7, [esp+640-528]
+ psraw xmm6, 2
+ psllw xmm1, 1
+ paddw xmm1, xmm5
+
+ movdqa xmm5, [esp+656-272]
+ pandn xmm7, xmm6
+ movdqa xmm6, [esp+640-416]
+ paddw xmm6, xmm1
+ movdqa xmm1, [esp+640-560]
+ psraw xmm6, 3
+ pand xmm1, xmm6
+
+ movdqa xmm6, [esp+704-272]
+ movdqa [esp+640-128], xmm1
+ movdqa xmm1, [esp+672-272]
+ punpckhbw xmm1, xmm2
+ movdqa [esp+640-448], xmm1
+ movdqa xmm1, [esp+688-272]
+ punpckhbw xmm1, xmm2
+ punpckhbw xmm6, xmm2
+ movdqa [esp+640-288], xmm7
+ punpckhbw xmm5, xmm2
+ movdqa [esp+640-496], xmm1
+ movdqa [esp+640-432], xmm6
+
+ movdqa xmm7, [esp+720-272]
+ punpckhbw xmm7, xmm2
+ movdqa [esp+640-464], xmm7
+
+ movdqa xmm7, [esp+736-272]
+ punpckhbw xmm7, xmm2
+ movdqa [esp+640-528], xmm7
+
+ movdqa xmm7, xmm6
+
+ psubw xmm6, [esp+640-464]
+ psubw xmm7, xmm1
+ pabsw xmm7, xmm7
+ movdqa [esp+640-560], xmm7
+ por xmm4, [esp+640-16]
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm1
+ psubw xmm7, [esp+640-448]
+
+ movdqa xmm1, [esp+640-512]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm1, xmm7
+ movdqa xmm7, [esp+640-512]
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+640-320]
+ pand xmm1, xmm7
+ movdqa xmm7, [esp+640-560]
+ pcmpgtw xmm6, xmm7
+ pand xmm1, xmm6
+
+ movdqa xmm6, [esp+640-576]
+ pcmpgtw xmm6, xmm7
+
+ movdqa xmm7, [esp+640-496]
+ punpckhbw xmm3, xmm2
+ movdqa [esp+640-560], xmm6
+ movdqa xmm6, [esp+640-512]
+ psubw xmm7, xmm5
+ pabsw xmm7, xmm7
+ pcmpgtw xmm6, xmm7
+
+ pand xmm6, [esp+640-560]
+ movdqa xmm7, [esp+640-432]
+ psubw xmm7, [esp+640-528]
+
+ psllw xmm3, 1
+ movdqa [esp+640-544], xmm6
+ movdqa xmm6, [esp+640-512]
+
+ movdqa xmm2, [esp+640-544]
+ paddw xmm3, xmm5
+ paddw xmm3, xmm5
+ paddw xmm3, xmm5
+ paddw xmm3, [esp+640-448]
+ paddw xmm3, [esp+640-496]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm6, xmm7
+ pand xmm6, [esp+640-560]
+ movdqa [esp+640-560], xmm6
+
+ movdqa xmm6, xmm0
+ pand xmm6, xmm4
+ movdqa xmm4, xmm0
+ pandn xmm4, [esp+640-368]
+ por xmm6, xmm4
+ movdqa xmm4, [esp+640-432]
+ paddw xmm3, xmm4
+ paddw xmm3, [esp+640-592]
+ psraw xmm3, 3
+ pand xmm3, xmm2
+ pandn xmm2, xmm5
+ por xmm3, xmm2
+ movdqa xmm7, xmm1
+ pand xmm7, xmm3
+ movdqa xmm3, [esp+640-64]
+ por xmm3, [esp+640-336]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm5
+ por xmm7, xmm2
+
+ movdqa xmm2, xmm0
+ pand xmm2, xmm3
+ movdqa xmm3, xmm0
+ pandn xmm3, [esp+640-480]
+ por xmm2, xmm3
+ packuswb xmm6, xmm7
+ movdqa [esp+640-336], xmm2
+ movdqa [esp+656-272], xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa xmm2, xmm5
+ paddw xmm2, [esp+640-448]
+ movdqa xmm3, xmm1
+ movdqa xmm7, [esp+640-496]
+ paddw xmm7, xmm4
+ paddw xmm2, xmm7
+ paddw xmm2, [esp+640-624]
+ movdqa xmm7, [esp+640-544]
+ psraw xmm2, 2
+ pand xmm6, xmm2
+ movdqa xmm2, [esp+640-448]
+ pandn xmm7, xmm2
+ por xmm6, xmm7
+ pand xmm3, xmm6
+ movdqa xmm6, xmm1
+ pandn xmm6, xmm2
+ paddw xmm2, [esp+640-496]
+ paddw xmm2, xmm4
+ por xmm3, xmm6
+ movdqa xmm6, [esp+640-336]
+ packuswb xmm6, xmm3
+ psllw xmm2, 1
+ movdqa [esp+672-272], xmm6
+ movdqa xmm6, [esp+640-96]
+ por xmm6, [esp+640-352]
+
+ movdqa xmm3, xmm0
+ pand xmm3, xmm6
+ movdqa xmm6, xmm0
+ pandn xmm6, [esp+640-144]
+ por xmm3, xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa [esp+640-352], xmm3
+ movdqa xmm3, [esp+640-464]
+ paddw xmm3, [esp+640-592]
+ paddw xmm2, xmm3
+ movdqa xmm3, [esp+640-448]
+ paddw xmm5, xmm2
+ movdqa xmm2, [esp+640-496]
+ psraw xmm5, 3
+ pand xmm6, xmm5
+ movdqa xmm5, [esp+640-464]
+ paddw xmm2, xmm5
+ paddw xmm5, [esp+640-432]
+ movdqa xmm4, xmm3
+ paddw xmm4, xmm3
+ paddw xmm4, xmm2
+ paddw xmm4, [esp+640-624]
+ movdqa xmm2, [esp+640-544]
+ paddw xmm3, [esp+640-592]
+ psraw xmm4, 2
+ pandn xmm2, xmm4
+ por xmm6, xmm2
+ movdqa xmm7, xmm1
+ pand xmm7, xmm6
+ movdqa xmm6, [esp+640-496]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm6
+ por xmm7, xmm2
+ movdqa xmm2, [esp+640-352]
+ packuswb xmm2, xmm7
+ movdqa [esp+688-272], xmm2
+ movdqa xmm2, [esp+640-128]
+ por xmm2, [esp+640-288]
+
+ movdqa xmm4, xmm0
+ pand xmm4, xmm2
+ paddw xmm5, xmm6
+ movdqa xmm2, xmm0
+ pandn xmm2, [esp+640-400]
+ por xmm4, xmm2
+ movdqa xmm2, [esp+640-528]
+ psllw xmm5, 1
+ paddw xmm5, xmm3
+ movdqa xmm3, [esp+640-560]
+ paddw xmm2, xmm5
+ psraw xmm2, 3
+ movdqa [esp+640-288], xmm4
+ movdqa xmm4, [esp+640-560]
+ pand xmm4, xmm2
+ movdqa xmm2, [esp+640-464]
+ movdqa xmm5, xmm2
+ paddw xmm5, xmm2
+ movdqa xmm2, [esp+640-432]
+ paddw xmm2, [esp+640-448]
+ movdqa xmm7, xmm1
+ paddw xmm5, xmm2
+ paddw xmm5, [esp+640-624]
+ movdqa xmm6, [esp+640-560]
+ psraw xmm5, 2
+ pandn xmm3, xmm5
+ por xmm4, xmm3
+ movdqa xmm3, [esp+640-32]
+ por xmm3, [esp+640-304]
+ pand xmm7, xmm4
+ movdqa xmm4, [esp+640-432]
+ movdqa xmm5, [esp+640-464]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm4
+ paddw xmm4, [esp+640-496]
+ por xmm7, xmm2
+ movdqa xmm2, [esp+640-288]
+ packuswb xmm2, xmm7
+ movdqa [esp+704-272], xmm2
+
+ movdqa xmm2, xmm0
+ pand xmm2, xmm3
+ movdqa xmm3, xmm0
+ pandn xmm3, [esp+640-384]
+ por xmm2, xmm3
+ movdqa [esp+640-304], xmm2
+ movdqa xmm2, [esp+640-528]
+ movdqa xmm3, xmm2
+ paddw xmm3, [esp+640-464]
+ paddw xmm3, xmm4
+ paddw xmm3, [esp+640-624]
+ psraw xmm3, 2
+ pand xmm6, xmm3
+ movdqa xmm3, [esp+640-560]
+ movdqa xmm4, xmm3
+ pandn xmm4, xmm5
+ por xmm6, xmm4
+ movdqa xmm7, xmm1
+ pand xmm7, xmm6
+ movdqa xmm6, [esp+640-304]
+ movdqa xmm4, xmm1
+ pandn xmm4, xmm5
+ por xmm7, xmm4
+
+ movdqa xmm4, xmm0
+ pandn xmm0, [esp+640-416]
+ packuswb xmm6, xmm7
+ movdqa xmm7, [esp+640-112]
+ por xmm7, [esp+640-80]
+ pand xmm4, xmm7
+ por xmm4, xmm0
+ movdqa xmm0, [esp+752-272]
+ punpckhbw xmm0, [esp+640-48]
+ psllw xmm0, 1
+ paddw xmm0, xmm2
+ paddw xmm0, xmm2
+ paddw xmm0, xmm2
+ paddw xmm0, xmm5
+ paddw xmm0, [esp+640-432]
+ paddw xmm0, [esp+640-496]
+ paddw xmm0, [esp+640-592]
+ psraw xmm0, 3
+ pand xmm0, xmm3
+ movdqa xmm7, xmm1
+ pandn xmm3, xmm2
+ por xmm0, xmm3
+ pand xmm7, xmm0
+
+ movdqa xmm0, [esp+656-272]
+ movdqa [edx], xmm0
+
+ movdqa xmm0, [esp+672-272]
+
+ mov edx, dword [esp+640-596]
+ movdqa [esi], xmm0
+ movdqa xmm0, [esp+688-272]
+ movdqa [edi], xmm0
+ movdqa xmm0, [esp+704-272]
+
+ pop edi
+ pandn xmm1, xmm2
+ movdqa [eax], xmm0
+ por xmm7, xmm1
+ pop esi
+ packuswb xmm4, xmm7
+ movdqa [edx], xmm6
+ movdqa [ecx], xmm4
+ pop ebx
+ mov esp, ebp
+ pop ebp
+ ret
+
+
+;********************************************************************************
+;
+; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+;
+;********************************************************************************
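+; Gathers the 16x8 block around a vertical luma edge (8 bytes per row,
+; rows 0..7 at pPixY and rows 8..15 at pPixY + 8*iStride), transposes it
+; with SSE2_TransTwo8x8B, and writes the eight resulting 16-byte rows to
+; pDst, so the vertical-filter routines above can be reused on the edge.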
+
+WELS_EXTERN DeblockLumaTransposeH2V_sse2
+
+ALIGN 16
+
+DeblockLumaTransposeH2V_sse2:
+ push ebp
+ push ebx
+ mov ebp, esp
+ and esp,0FFFFFFF0h
+ sub esp, 10h
+
+ mov eax, [ebp + 0Ch]
+ mov ecx, [ebp + 10h]
+ lea edx, [eax + ecx * 8]
+ lea ebx, [ecx*3]
+
+ movq xmm0, [eax]
+ movq xmm7, [edx]
+ punpcklqdq xmm0, xmm7
+ movq xmm1, [eax + ecx]
+ movq xmm7, [edx + ecx]
+ punpcklqdq xmm1, xmm7
+ movq xmm2, [eax + ecx*2]
+ movq xmm7, [edx + ecx*2]
+ punpcklqdq xmm2, xmm7
+ movq xmm3, [eax + ebx]
+ movq xmm7, [edx + ebx]
+ punpcklqdq xmm3, xmm7
+
+ lea eax, [eax + ecx * 4]
+ lea edx, [edx + ecx * 4]
+ movq xmm4, [eax]
+ movq xmm7, [edx]
+ punpcklqdq xmm4, xmm7
+ movq xmm5, [eax + ecx]
+ movq xmm7, [edx + ecx]
+ punpcklqdq xmm5, xmm7
+ movq xmm6, [eax + ecx*2]
+ movq xmm7, [edx + ecx*2]
+ punpcklqdq xmm6, xmm7
+
+ movdqa [esp], xmm0
+ movq xmm7, [eax + ebx]
+ movq xmm0, [edx + ebx]
+ punpcklqdq xmm7, xmm0
+ movdqa xmm0, [esp]
+
+ SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+ ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+
+ mov eax, [ebp + 14h]
+ movdqa [eax], xmm4
+ movdqa [eax + 10h], xmm2
+ movdqa [eax + 20h], xmm3
+ movdqa [eax + 30h], xmm7
+ movdqa [eax + 40h], xmm5
+ movdqa [eax + 50h], xmm1
+ movdqa [eax + 60h], xmm6
+ movdqa [eax + 70h], xmm0
+
+ mov esp, ebp
+ pop ebx
+ pop ebp
+ ret
+
+
+
+;*******************************************************************************************
+;
+; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+;
+;*******************************************************************************************
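+; Inverse of DeblockLumaTransposeH2V_sse2: reloads the eight filtered
+; 16-byte rows from pSrc, transposes them back with SSE2_TransTwo8x8B,
+; and scatters the low and high 8-byte halves out as 16 picture rows.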
+
+WELS_EXTERN DeblockLumaTransposeV2H_sse2
+
+ALIGN 16
+
+DeblockLumaTransposeV2H_sse2:
+ push ebp
+ mov ebp, esp
+
+ and esp, 0FFFFFFF0h
+ sub esp, 10h
+
+ mov eax, [ebp + 10h]
+ mov ecx, [ebp + 0Ch]
+ mov edx, [ebp + 08h]
+
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 10h]
+ movdqa xmm2, [eax + 20h]
+ movdqa xmm3, [eax + 30h]
+ movdqa xmm4, [eax + 40h]
+ movdqa xmm5, [eax + 50h]
+ movdqa xmm6, [eax + 60h]
+ movdqa xmm7, [eax + 70h]
+
+ SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+ ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+
+ lea eax, [ecx * 3]
+
+ movq [edx], xmm4
+ movq [edx + ecx], xmm2
+ movq [edx + ecx*2], xmm3
+ movq [edx + eax], xmm7
+
+ lea edx, [edx + ecx*4]
+ movq [edx], xmm5
+ movq [edx + ecx], xmm1
+ movq [edx + ecx*2], xmm6
+ movq [edx + eax], xmm0
+
+ psrldq xmm4, 8
+ psrldq xmm2, 8
+ psrldq xmm3, 8
+ psrldq xmm7, 8
+ psrldq xmm5, 8
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+ psrldq xmm0, 8
+
+ lea edx, [edx + ecx*4]
+ movq [edx], xmm4
+ movq [edx + ecx], xmm2
+ movq [edx + ecx*2], xmm3
+ movq [edx + eax], xmm7
+
+ lea edx, [edx + ecx*4]
+ movq [edx], xmm5
+ movq [edx + ecx], xmm1
+ movq [edx + ecx*2], xmm6
+ movq [edx + eax], xmm0
+
+
+ mov esp, ebp
+ pop ebp
+    ret
\ No newline at end of file
--- a/codec/decoder/core/asm/mc_chroma.asm
+++ b/codec/decoder/core/asm/mc_chroma.asm
@@ -1,317 +1,317 @@
-;*!
-;* \copy
-;* Copyright (c) 2004-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* mc_chroma.asm
-;*
-;* Abstract
-;* mmx motion compensation for chroma
-;*
-;* History
-;* 10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
- dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
- dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
- push esi
- push edi
- push ebx
-
- mov eax, [esp +12 + 20]
- movd mm3, [eax]
- WELS_Zero mm7
- punpcklbw mm3, mm3
- movq mm4, mm3
- punpcklwd mm3, mm3
- punpckhwd mm4, mm4
-
- movq mm5, mm3
- punpcklbw mm3, mm7
- punpckhbw mm5, mm7
-
- movq mm6, mm4
- punpcklbw mm4, mm7
- punpckhbw mm6, mm7
-
- mov esi, [esp +12+ 4]
- mov eax, [esp + 12 + 8]
- mov edi, [esp + 12 + 12]
- mov edx, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- lea ebx, [esi + eax]
- movd mm0, [esi]
- movd mm1, [esi+1]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
-.xloop:
-
- pmullw mm0, mm3
- pmullw mm1, mm5
- paddw mm0, mm1
-
- movd mm1, [ebx]
- punpcklbw mm1, mm7
- movq mm2, mm1
- pmullw mm1, mm4
- paddw mm0, mm1
-
- movd mm1, [ebx+1]
- punpcklbw mm1, mm7
- movq mm7, mm1
- pmullw mm1,mm6
- paddw mm0, mm1
- movq mm1,mm7
-
- paddw mm0, [h264_d0x20_mmx]
- psrlw mm0, 6
-
- WELS_Zero mm7
- packuswb mm0, mm7
- movd [edi], mm0
-
- movq mm0, mm2
-
- lea edi, [edi +edx ]
- lea ebx, [ebx + eax]
-
- dec ecx
- jnz near .xloop
- WELSEMMS
- pop ebx
- pop edi
- pop esi
- ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
- push esi
- push edi
- push ebx
-
- mov eax, [esp +12 + 20]
- movd xmm3, [eax]
- WELS_Zero xmm7
- punpcklbw xmm3, xmm3
- punpcklwd xmm3, xmm3
-
- movdqa xmm4, xmm3
- punpckldq xmm3, xmm3
- punpckhdq xmm4, xmm4
- movdqa xmm5, xmm3
- movdqa xmm6, xmm4
-
- punpcklbw xmm3, xmm7
- punpckhbw xmm5, xmm7
- punpcklbw xmm4, xmm7
- punpckhbw xmm6, xmm7
-
- mov esi, [esp +12+ 4]
- mov eax, [esp + 12 + 8]
- mov edi, [esp + 12 + 12]
- mov edx, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- lea ebx, [esi + eax]
- movq xmm0, [esi]
- movq xmm1, [esi+1]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
-.xloop:
-
- pmullw xmm0, xmm3
- pmullw xmm1, xmm5
- paddw xmm0, xmm1
-
- movq xmm1, [ebx]
- punpcklbw xmm1, xmm7
- movdqa xmm2, xmm1
- pmullw xmm1, xmm4
- paddw xmm0, xmm1
-
- movq xmm1, [ebx+1]
- punpcklbw xmm1, xmm7
- movdqa xmm7, xmm1
- pmullw xmm1, xmm6
- paddw xmm0, xmm1
- movdqa xmm1,xmm7
-
- paddw xmm0, [h264_d0x20_sse2]
- psrlw xmm0, 6
-
- WELS_Zero xmm7
- packuswb xmm0, xmm7
- movq [edi], xmm0
-
- movdqa xmm0, xmm2
-
- lea edi, [edi +edx ]
- lea ebx, [ebx + eax]
-
- dec ecx
- jnz near .xloop
-
- pop ebx
- pop edi
- pop esi
- ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
- push ebx
- push esi
- push edi
-
- mov eax, [esp + 12 + 20]
-
- pxor xmm7, xmm7
- movd xmm5, [eax]
- punpcklwd xmm5, xmm5
- punpckldq xmm5, xmm5
- movdqa xmm6, xmm5
- punpcklqdq xmm5, xmm5
- punpckhqdq xmm6, xmm6
-
- mov eax, [esp + 12 + 4]
- mov edx, [esp + 12 + 8]
- mov esi, [esp + 12 + 12]
- mov edi, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- sub esi, edi
- sub esi, edi
- movdqa xmm7, [h264_d0x20_sse2]
-
- movdqu xmm0, [eax]
- movdqa xmm1, xmm0
- psrldq xmm1, 1
- punpcklbw xmm0, xmm1
-
-.hloop_chroma:
- lea esi, [esi+2*edi]
-
- movdqu xmm2, [eax+edx]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm4, xmm2
-
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm2, xmm6
- paddw xmm0, xmm2
- paddw xmm0, xmm7
- psrlw xmm0, 6
- packuswb xmm0, xmm0
- movq [esi],xmm0
-
- lea eax, [eax+2*edx]
- movdqu xmm2, [eax]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm0, xmm2
-
- pmaddubsw xmm4, xmm5
- pmaddubsw xmm2, xmm6
- paddw xmm4, xmm2
- paddw xmm4, xmm7
- psrlw xmm4, 6
- packuswb xmm4, xmm4
- movq [esi+edi],xmm4
-
- sub ecx, 2
- jnz .hloop_chroma
- pop edi
- pop esi
- pop ebx
-
- ret
-
-
+;*!
+;* \copy
+;* Copyright (c) 2004-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* mc_chroma.asm
+;*
+;* Abstract
+;* mmx motion compensation for chroma
+;*
+;* History
+;* 10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+ dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+ dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *src,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
+; int32_t iHeigh );
+;*******************************************************************************
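+; Scalar reference for the interpolation below (bilinear chroma motion
+; compensation, 4 pixels per row). pABCD holds the four weights derived
+; from the fractional MV, with A+B+C+D == 64:
+;   pDst[x] = (A*src[x]            + B*src[x+1]
+;            + C*src[x+iSrcStride] + D*src[x+iSrcStride+1] + 32) >> 6
+; The bottom source row of each iteration is kept in registers and
+; reused as the next iteration's top row, so only one new row is loaded
+; per output row.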
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:
+ push esi
+ push edi
+ push ebx
+
+ mov eax, [esp +12 + 20]
+ movd mm3, [eax]
+ WELS_Zero mm7
+ punpcklbw mm3, mm3
+ movq mm4, mm3
+ punpcklwd mm3, mm3
+ punpckhwd mm4, mm4
+
+ movq mm5, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm5, mm7
+
+ movq mm6, mm4
+ punpcklbw mm4, mm7
+ punpckhbw mm6, mm7
+
+ mov esi, [esp +12+ 4]
+ mov eax, [esp + 12 + 8]
+ mov edi, [esp + 12 + 12]
+ mov edx, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
+ lea ebx, [esi + eax]
+ movd mm0, [esi]
+ movd mm1, [esi+1]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+.xloop:
+
+ pmullw mm0, mm3
+ pmullw mm1, mm5
+ paddw mm0, mm1
+
+ movd mm1, [ebx]
+ punpcklbw mm1, mm7
+ movq mm2, mm1
+ pmullw mm1, mm4
+ paddw mm0, mm1
+
+ movd mm1, [ebx+1]
+ punpcklbw mm1, mm7
+ movq mm7, mm1
+ pmullw mm1,mm6
+ paddw mm0, mm1
+ movq mm1,mm7
+
+ paddw mm0, [h264_d0x20_mmx]
+ psrlw mm0, 6
+
+ WELS_Zero mm7
+ packuswb mm0, mm7
+ movd [edi], mm0
+
+ movq mm0, mm2
+
+ lea edi, [edi +edx ]
+ lea ebx, [ebx + eax]
+
+ dec ecx
+ jnz near .xloop
+ WELSEMMS
+ pop ebx
+ pop edi
+ pop esi
+ ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
+; int32_t iheigh );
+;*******************************************************************************
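+; Same bilinear formula as McChromaWidthEq4_mmx above, widened to eight
+; pixels per row in SSE2 registers; h264_d0x20_sse2 supplies the +32
+; rounding term before the shift by 6 completes the divide by 64.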
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:
+ push esi
+ push edi
+ push ebx
+
+ mov eax, [esp +12 + 20]
+ movd xmm3, [eax]
+ WELS_Zero xmm7
+ punpcklbw xmm3, xmm3
+ punpcklwd xmm3, xmm3
+
+ movdqa xmm4, xmm3
+ punpckldq xmm3, xmm3
+ punpckhdq xmm4, xmm4
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm4
+
+ punpcklbw xmm3, xmm7
+ punpckhbw xmm5, xmm7
+ punpcklbw xmm4, xmm7
+ punpckhbw xmm6, xmm7
+
+ mov esi, [esp +12+ 4]
+ mov eax, [esp + 12 + 8]
+ mov edi, [esp + 12 + 12]
+ mov edx, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
+ lea ebx, [esi + eax]
+ movq xmm0, [esi]
+ movq xmm1, [esi+1]
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
+.xloop:
+
+ pmullw xmm0, xmm3
+ pmullw xmm1, xmm5
+ paddw xmm0, xmm1
+
+ movq xmm1, [ebx]
+ punpcklbw xmm1, xmm7
+ movdqa xmm2, xmm1
+ pmullw xmm1, xmm4
+ paddw xmm0, xmm1
+
+ movq xmm1, [ebx+1]
+ punpcklbw xmm1, xmm7
+ movdqa xmm7, xmm1
+ pmullw xmm1, xmm6
+ paddw xmm0, xmm1
+ movdqa xmm1,xmm7
+
+ paddw xmm0, [h264_d0x20_sse2]
+ psrlw xmm0, 6
+
+ WELS_Zero xmm7
+ packuswb xmm0, xmm7
+ movq [edi], xmm0
+
+ movdqa xmm0, xmm2
+
+ lea edi, [edi +edx ]
+ lea ebx, [ebx + eax]
+
+ dec ecx
+ jnz near .xloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
+; int32_t iHeigh);
+;***********************************************************************
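+; Same bilinear formula again, restructured for SSSE3: each source row
+; is interleaved with its one-byte-shifted copy so that pmaddubsw can
+; apply the (A,B) and (C,D) weight pairs in one multiply-add per row;
+; the loop produces two output rows per pass.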
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:
+ push ebx
+ push esi
+ push edi
+
+ mov eax, [esp + 12 + 20]
+
+ pxor xmm7, xmm7
+ movd xmm5, [eax]
+ punpcklwd xmm5, xmm5
+ punpckldq xmm5, xmm5
+ movdqa xmm6, xmm5
+ punpcklqdq xmm5, xmm5
+ punpckhqdq xmm6, xmm6
+
+ mov eax, [esp + 12 + 4]
+ mov edx, [esp + 12 + 8]
+ mov esi, [esp + 12 + 12]
+ mov edi, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
+ sub esi, edi
+ sub esi, edi
+ movdqa xmm7, [h264_d0x20_sse2]
+
+ movdqu xmm0, [eax]
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+ punpcklbw xmm0, xmm1
+
+.hloop_chroma:
+ lea esi, [esi+2*edi]
+
+ movdqu xmm2, [eax+edx]
+ movdqa xmm3, xmm2
+ psrldq xmm3, 1
+ punpcklbw xmm2, xmm3
+ movdqa xmm4, xmm2
+
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm2, xmm6
+ paddw xmm0, xmm2
+ paddw xmm0, xmm7
+ psrlw xmm0, 6
+ packuswb xmm0, xmm0
+ movq [esi],xmm0
+
+ lea eax, [eax+2*edx]
+ movdqu xmm2, [eax]
+ movdqa xmm3, xmm2
+ psrldq xmm3, 1
+ punpcklbw xmm2, xmm3
+ movdqa xmm0, xmm2
+
+ pmaddubsw xmm4, xmm5
+ pmaddubsw xmm2, xmm6
+ paddw xmm4, xmm2
+ paddw xmm4, xmm7
+ psrlw xmm4, 6
+ packuswb xmm4, xmm4
+ movq [esi+edi],xmm4
+
+ sub ecx, 2
+ jnz .hloop_chroma
+ pop edi
+ pop esi
+ pop ebx
+
+ ret
+
+
--- a/codec/encoder/core/asm/deblock.asm
+++ b/codec/encoder/core/asm/deblock.asm
@@ -1,2113 +1,2113 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* deblock.asm
-;*
-;* Abstract
-;* edge loop
-;*
-;* History
-;* 08/07/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-SECTION .text
-
-;********************************************************************************
-; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN DeblockChromaEq4V_sse2
-
-ALIGN 16
-DeblockChromaEq4V_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,68h
- mov edx,[ebp+10h] ; iStride
- mov eax,[ebp+8] ; pPixCb
- mov ecx,[ebp+0Ch] ; pPixCr
- movq xmm4,[ecx]
- movq xmm5,[edx+ecx]
- push esi
- push edi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- movq xmm1,[edi]
- mov edi,ecx
- sub edi,esi
- movq xmm2,[edi]
- punpcklqdq xmm1,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm2,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm3,[edi]
- punpcklqdq xmm2,xmm3
- movq xmm3,[eax]
- punpcklqdq xmm3,xmm4
- movq xmm4,[edx+eax]
- mov edx, [ebp + 14h]
- punpcklqdq xmm4,xmm5
- movd xmm5,edx
- mov edx, [ebp + 18h]
- pxor xmm0,xmm0
- movdqa xmm6,xmm5
- punpcklwd xmm6,xmm5
- pshufd xmm5,xmm6,0
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,xmm1
- punpckhbw xmm1,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+40h],xmm1
- movdqa [esp+60h],xmm7
- movdqa xmm7,xmm2
- punpcklbw xmm7,xmm0
- movdqa [esp+10h],xmm7
- movdqa xmm7,xmm3
- punpcklbw xmm7,xmm0
- punpckhbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm7,xmm4
- punpckhbw xmm4,xmm0
- punpckhbw xmm2,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+30h],xmm3
- movdqa xmm3,[esp+10h]
- movdqa xmm1,xmm3
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa [esp+20h],xmm4
- movdqa xmm0,xmm5
- pcmpgtw xmm0,xmm1
- movdqa xmm1,[esp+60h]
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- pand xmm0,xmm4
- movdqa xmm1,xmm7
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,xmm2
- psubw xmm1,[esp+30h]
- pabsw xmm1,xmm1
- pcmpgtw xmm5,xmm1
- movdqa xmm1,[esp+40h]
- pand xmm0,xmm4
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,[esp+20h]
- psubw xmm1,[esp+30h]
- pand xmm5,xmm4
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- pand xmm5,xmm6
- mov edx,2
- movsx edx,dx
- movd xmm1,edx
- movdqa xmm4,xmm1
- punpcklwd xmm4,xmm1
- pshufd xmm1,xmm4,0
- movdqa xmm4,[esp+60h]
- movdqa xmm6,xmm4
- paddw xmm6,xmm4
- paddw xmm6,xmm3
- paddw xmm6,xmm7
- movdqa [esp+10h],xmm1
- paddw xmm6,[esp+10h]
- psraw xmm6,2
- movdqa xmm4,xmm0
- pandn xmm4,xmm3
- movdqa xmm3,[esp+40h]
- movdqa xmm1,xmm0
- pand xmm1,xmm6
- por xmm1,xmm4
- movdqa xmm6,xmm3
- paddw xmm6,xmm3
- movdqa xmm3,[esp+10h]
- paddw xmm6,xmm2
- paddw xmm6,[esp+20h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm4,xmm5
- pand xmm4,xmm6
- movdqa xmm6,xmm5
- pandn xmm6,xmm2
- por xmm4,xmm6
- packuswb xmm1,xmm4
- movdqa xmm4,[esp+50h]
- movdqa xmm6,xmm7
- paddw xmm6,xmm7
- paddw xmm6,xmm4
- paddw xmm6,[esp+60h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm2,xmm0
- pand xmm2,xmm6
- pandn xmm0,xmm4
- por xmm2,xmm0
- movdqa xmm0,[esp+20h]
- movdqa xmm6,xmm0
- paddw xmm6,xmm0
- movdqa xmm0,[esp+30h]
- paddw xmm6,xmm0
- paddw xmm6,[esp+40h]
- movdqa xmm4,xmm5
- paddw xmm6,xmm3
- movq [esi],xmm1
- psraw xmm6,2
- pand xmm4,xmm6
- pandn xmm5,xmm0
- por xmm4,xmm5
- packuswb xmm2,xmm4
- movq [eax],xmm2
- psrldq xmm1,8
- movq [edi],xmm1
- pop edi
- psrldq xmm2,8
- movq [ecx],xmm2
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN DeblockChromaLt4V_sse2
-
-DeblockChromaLt4V_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0E4h
- push ebx
- push esi
- mov esi, [ebp+1Ch] ; pTC
- movsx ebx, byte [esi+2]
- push edi
- movsx di,byte [esi+3]
- mov word [esp+0Ch],bx
- movsx bx,byte [esi+1]
- movsx esi,byte [esi]
- mov word [esp+0Eh],si
- movzx esi,di
- movd xmm1,esi
- movzx esi,di
- movd xmm2,esi
- mov si,word [esp+0Ch]
- mov edx, [ebp + 10h]
- mov eax, [ebp + 08h]
- movzx edi,si
- movzx esi,si
- mov ecx, [ebp + 0Ch]
- movd xmm4,esi
- movzx esi,bx
- movd xmm5,esi
- movd xmm3,edi
- movzx esi,bx
- movd xmm6,esi
- mov si,word [esp+0Eh]
- movzx edi,si
- movzx esi,si
- punpcklwd xmm6,xmm2
- pxor xmm0,xmm0
- movdqa [esp+40h],xmm0
- movd xmm7,edi
- movd xmm0,esi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+40h]
- punpcklwd xmm0,xmm4
- movq xmm4,[edx+ecx]
- punpcklwd xmm7,xmm3
- movq xmm3,[eax]
- punpcklwd xmm0,xmm6
- movq xmm6,[edi]
- punpcklwd xmm7,xmm5
- punpcklwd xmm0,xmm7
- mov edi,ecx
- sub edi,esi
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+60h],xmm2
- movq xmm2, [edi]
- punpcklqdq xmm6,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm7,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm2,[edi]
- punpcklqdq xmm7,xmm2
- movq xmm2,[ecx]
- punpcklqdq xmm3,xmm2
- movq xmm2,[edx+eax]
- movsx edx,word [ebp + 14h]
- punpcklqdq xmm2,xmm4
- movdqa [esp+0E0h],xmm2
- movd xmm2,edx
- movsx edx,word [ebp + 18h]
- movdqa xmm4,xmm2
- punpcklwd xmm4,xmm2
- movd xmm2,edx
- movdqa xmm5,xmm2
- punpcklwd xmm5,xmm2
- pshufd xmm2,xmm5,0
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- movdqa [esp+0D0h],xmm3
- pshufd xmm4,xmm4,0
- movdqa [esp+30h],xmm2
- punpckhbw xmm6,xmm1
- movdqa [esp+80h],xmm6
- movdqa xmm6,[esp+0D0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+70h],xmm6
- movdqa xmm6, [esp+0E0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+90h],xmm6
- movdqa xmm5, [esp+0E0h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0A0h],xmm7
- punpcklbw xmm3,xmm1
- mov edx,4
- punpcklbw xmm2,xmm1
- movsx edx,dx
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,[esp+30h]
- movdqa [esp+20h],xmm6
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa xmm1,[esp+60h]
- movdqa [esp+40h],xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6, [esp+20h]
- movdqa xmm7, [esp+50h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa [esp+10h],xmm0
- movdqa xmm6, [esp+10h]
- pminsw xmm6,xmm1
- movdqa [esp+10h],xmm6
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm6,xmm4
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+30h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1,[esp+50h]
- pand xmm6,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5,[esp+80h]
- psubw xmm5,[esp+90h]
- pand xmm6,xmm1
- pand xmm6,[esp+40h]
- movdqa xmm1,[esp+10h]
- pand xmm1,xmm6
- movdqa xmm6,[esp+70h]
- movdqa [esp+30h],xmm1
- movdqa xmm1,[esp+0A0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6,[esp+20h]
- movdqa xmm5,[esp+60h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+70h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+80h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+90h]
- pand xmm4,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+40h]
- pand xmm0,xmm4
- movdqa xmm4,[esp+30h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- packuswb xmm2,xmm1
- movq [esi],xmm2
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm3,xmm5
- movq [eax],xmm3
- psrldq xmm2,8
- movq [edi],xmm2
- pop edi
- pop esi
- psrldq xmm3,8
- movq [ecx],xmm3
- pop ebx
- mov esp,ebp
- pop ebp
- ret
-
-;***************************************************************************
-; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta)
-;***************************************************************************
-
-WELS_EXTERN DeblockChromaEq4H_sse2
-
-ALIGN 16
-
-DeblockChromaEq4H_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0C8h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+18h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+7Ch]
- push edi
- mov dword [esp+14h],esi
- mov dword [esp+18h],ecx
- mov dword [esp+0Ch],edx
- mov dword [esp+10h],eax
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+0Ch]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+10h]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- movsx ecx,word [ebp+14h]
- movsx edx,word [ebp+18h]
- movdqa xmm6,[esp+80h]
- movdqa xmm4,[esp+90h]
- movdqa xmm5,[esp+0A0h]
- movdqa xmm7,[esp+0B0h]
- pxor xmm0,xmm0
- movd xmm1,ecx
- movdqa xmm2,xmm1
- punpcklwd xmm2,xmm1
- pshufd xmm1,xmm2,0
- movd xmm2,edx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3,xmm6
- punpckhbw xmm6,xmm0
- movdqa [esp+60h],xmm6
- movdqa xmm6,[esp+90h]
- punpckhbw xmm6,xmm0
- movdqa [esp+30h],xmm6
- movdqa xmm6,[esp+0A0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+40h],xmm6
- movdqa xmm6,[esp+0B0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+70h],xmm6
- punpcklbw xmm7,xmm0
- punpcklbw xmm4,xmm0
- punpcklbw xmm5,xmm0
- punpcklbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm6,xmm4
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- movdqa xmm0,xmm1
- pcmpgtw xmm0,xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm4
- pabsw xmm6,xmm6
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+30h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pcmpgtw xmm1,xmm6
- movdqa xmm6,[esp+60h]
- psubw xmm6,[esp+30h]
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+70h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pand xmm1,xmm7
- pcmpgtw xmm2,xmm6
- pand xmm1,xmm2
- mov eax,2
- movsx ecx,ax
- movd xmm2,ecx
- movdqa xmm6,xmm2
- punpcklwd xmm6,xmm2
- pshufd xmm2,xmm6,0
- movdqa [esp+20h],xmm2
- movdqa xmm2,xmm3
- paddw xmm2,xmm3
- paddw xmm2,xmm4
- paddw xmm2,[esp+50h]
- paddw xmm2,[esp+20h]
- psraw xmm2,2
- movdqa xmm6,xmm0
- pand xmm6,xmm2
- movdqa xmm2,xmm0
- pandn xmm2,xmm4
- por xmm6,xmm2
- movdqa xmm2,[esp+60h]
- movdqa xmm7,xmm2
- paddw xmm7,xmm2
- paddw xmm7,[esp+30h]
- paddw xmm7,[esp+70h]
- paddw xmm7,[esp+20h]
- movdqa xmm4,xmm1
- movdqa xmm2,xmm1
- pandn xmm2,[esp+30h]
- psraw xmm7,2
- pand xmm4,xmm7
- por xmm4,xmm2
- movdqa xmm2,[esp+50h]
- packuswb xmm6,xmm4
- movdqa [esp+90h],xmm6
- movdqa xmm6,xmm2
- paddw xmm6,xmm2
- movdqa xmm2,[esp+20h]
- paddw xmm6,xmm5
- paddw xmm6,xmm3
- movdqa xmm4,xmm0
- pandn xmm0,xmm5
- paddw xmm6,xmm2
- psraw xmm6,2
- pand xmm4,xmm6
- por xmm4,xmm0
- movdqa xmm0,[esp+70h]
- movdqa xmm5,xmm0
- paddw xmm5,xmm0
- movdqa xmm0,[esp+40h]
- paddw xmm5,xmm0
- paddw xmm5,[esp+60h]
- movdqa xmm3,xmm1
- paddw xmm5,xmm2
- psraw xmm5,2
- pand xmm3,xmm5
- pandn xmm1,xmm0
- por xmm3,xmm1
- packuswb xmm4,xmm3
- movdqa [esp+0A0h],xmm4
- mov esi,dword [esp+10h]
- movdqa xmm0,[esi]
- movdqa xmm1,[esi+10h]
- movdqa xmm2,[esi+20h]
- movdqa xmm3,[esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+0Ch]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-;*******************************************************************************
-; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN DeblockChromaLt4H_sse2
-
-ALIGN 16
-
-DeblockChromaLt4H_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,108h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+10h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+6Ch]
- push edi
- mov dword [esp+0Ch],esi
- mov dword [esp+18h],ecx
- mov dword [esp+10h],edx
- mov dword [esp+1Ch],eax
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+10h]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+1Ch]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- mov eax,dword [ebp+1Ch]
- movsx cx,byte [eax+3]
- movsx dx,byte [eax+2]
- movsx si,byte [eax+1]
- movsx ax,byte [eax]
- movzx edi,cx
- movzx ecx,cx
- movd xmm2,ecx
- movzx ecx,dx
- movzx edx,dx
- movd xmm3,ecx
- movd xmm4,edx
- movzx ecx,si
- movzx edx,si
- movd xmm5,ecx
- pxor xmm0,xmm0
- movd xmm6,edx
- movzx ecx,ax
- movdqa [esp+60h],xmm0
- movzx edx,ax
- movsx eax,word [ebp+14h]
- punpcklwd xmm6,xmm2
- movd xmm1,edi
- movd xmm7,ecx
- movsx ecx,word [ebp+18h]
- movd xmm0,edx
- punpcklwd xmm7,xmm3
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+60h]
- punpcklwd xmm7,xmm5
- movdqa xmm5,[esp+0A0h]
- punpcklwd xmm0,xmm4
- punpcklwd xmm0,xmm6
- movdqa xmm6, [esp+70h]
- punpcklwd xmm0,xmm7
- movdqa xmm7,[esp+80h]
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+0D0h],xmm2
- movd xmm2,eax
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm4,xmm3,0
- movd xmm2,ecx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3, [esp+90h]
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- movdqa [esp+40h],xmm2
- movdqa [esp+0B0h],xmm6
- movdqa xmm6,[esp+90h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm2,xmm1
- punpcklbw xmm3,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0F0h],xmm7
- movdqa [esp+0C0h],xmm6
- movdqa xmm6, [esp+0A0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+0E0h],xmm6
- mov edx,4
- movsx eax,dx
- movd xmm6,eax
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa [esp+30h],xmm6
- movdqa xmm7, [esp+40h]
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa [esp+60h],xmm6
- movdqa xmm1, [esp+0D0h]
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6,[esp+30h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa xmm7,[esp+50h]
- movdqa [esp+20h],xmm0
- movdqa xmm6, [esp+20h]
- pminsw xmm6,xmm1
- movdqa [esp+20h],xmm6
- movdqa xmm6,xmm4
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+40h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1, [esp+50h]
- pand xmm6,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5, [esp+0B0h]
- psubw xmm5,[esp+0E0h]
- pand xmm6,xmm1
- pand xmm6, [esp+60h]
- movdqa xmm1, [esp+20h]
- pand xmm1,xmm6
- movdqa xmm6, [esp+0C0h]
- movdqa [esp+40h],xmm1
- movdqa xmm1, [esp+0F0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6, [esp+30h]
- movdqa xmm5, [esp+0D0h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+0C0h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+0B0h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6, [esp+0E0h]
- pand xmm4,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+60h]
- pand xmm0,xmm4
- movdqa xmm4, [esp+40h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm2,xmm1
- packuswb xmm3,xmm5
- movdqa [esp+80h],xmm2
- movdqa [esp+90h],xmm3
- mov esi,dword [esp+1Ch]
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+10h]
- movdqa xmm2, [esi+20h]
- movdqa xmm3, [esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+10h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-
-
-;*******************************************************************************
-; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-; int32_t iBeta, int8_t * pTC)
-;*******************************************************************************
-
-
-WELS_EXTERN DeblockLumaLt4V_sse2
-
-ALIGN 16
-
-DeblockLumaLt4V_sse2:
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 420 ; 000001a4H
- mov eax, dword [ebp+8]
- mov ecx, dword [ebp+12]
-
- pxor xmm0, xmm0
- push ebx
- mov edx, dword [ebp+24]
- movdqa [esp+424-384], xmm0
- push esi
-
- lea esi, [ecx+ecx*2]
- push edi
- mov edi, eax
- sub edi, esi
- movdqa xmm0, [edi]
-
- lea esi, [ecx+ecx]
- movdqa [esp+432-208], xmm0
- mov edi, eax
- sub edi, esi
- movdqa xmm0, [edi]
- movdqa [esp+448-208], xmm0
-
- mov ebx, eax
- sub ebx, ecx
- movdqa xmm0, [ebx]
- movdqa [esp+464-208], xmm0
-
- movdqa xmm0, [eax]
-
- add ecx, eax
- movdqa [esp+480-208], xmm0
- movdqa xmm0, [ecx]
- mov dword [esp+432-404], ecx
-
- movsx ecx, word [ebp+16]
- movdqa [esp+496-208], xmm0
- movdqa xmm0, [esi+eax]
-
- movsx si, byte [edx]
- movdqa [esp+512-208], xmm0
- movd xmm0, ecx
- movsx ecx, word [ebp+20]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- pshufd xmm0, xmm1, 0
- movdqa [esp+432-112], xmm0
- movd xmm0, ecx
- movsx cx, byte [edx+1]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- mov dword [esp+432-408], ebx
- movzx ebx, cx
- pshufd xmm0, xmm1, 0
- movd xmm1, ebx
- movzx ebx, cx
- movd xmm2, ebx
- movzx ebx, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, si
- movd xmm5, ecx
- movzx ecx, si
- movd xmm6, ecx
- movzx ecx, si
- movd xmm7, ecx
- movzx ecx, si
- movdqa [esp+432-336], xmm0
- movd xmm0, ecx
-
- movsx cx, byte [edx+3]
- movsx dx, byte [edx+2]
- movd xmm3, ebx
- punpcklwd xmm0, xmm4
- movzx esi, cx
- punpcklwd xmm6, xmm2
- punpcklwd xmm5, xmm1
- punpcklwd xmm0, xmm6
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- punpcklwd xmm0, xmm7
- movdqa [esp+432-400], xmm0
- movd xmm0, esi
- movzx esi, cx
- movd xmm2, esi
- movzx esi, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, dx
- movd xmm3, esi
- movd xmm5, ecx
- punpcklwd xmm5, xmm0
-
- movdqa xmm0, [esp+432-384]
- movzx ecx, dx
- movd xmm6, ecx
- movzx ecx, dx
- movzx edx, dx
- punpcklwd xmm6, xmm2
- movd xmm7, ecx
- movd xmm1, edx
-
- movdqa xmm2, [esp+448-208]
- punpcklbw xmm2, xmm0
-
- mov ecx, 4
- movsx edx, cx
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- movdqa xmm5, [esp+496-208]
- movdqa xmm3, [esp+464-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-240], xmm5
- movdqa xmm5, [esp+512-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-352], xmm5
- punpcklwd xmm1, xmm4
- movdqa xmm4, [esp+432-208]
- punpcklwd xmm1, xmm6
- movdqa xmm6, [esp+480-208]
- punpcklwd xmm1, xmm7
- punpcklbw xmm6, xmm0
- punpcklbw xmm3, xmm0
- punpcklbw xmm4, xmm0
- movdqa xmm7, xmm3
- psubw xmm7, xmm4
- pabsw xmm7, xmm7
- movdqa [esp+432-272], xmm4
- movdqa xmm4, [esp+432-336]
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-352]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5
- movdqa xmm5, xmm3
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5
- movdqa xmm5, [esp+432-400]
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5
- movdqa xmm5, xmm6
- psubw xmm5, xmm3
- movdqa [esp+432-32], xmm6
- psubw xmm6, [esp+432-240]
- movdqa xmm7, xmm5
- movdqa [esp+432-384], xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6
-
- pand xmm5, xmm7
- movdqa xmm6, xmm3
- psubw xmm6, xmm2
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+432-400]
- pand xmm5, xmm7
- movdqa xmm7, xmm6
- pcmpeqw xmm6, xmm0
- pcmpgtw xmm7, xmm0
- por xmm7, xmm6
- pand xmm5, xmm7
- movdqa [esp+432-320], xmm5
- movd xmm5, edx
- movdqa xmm6, xmm5
- punpcklwd xmm6, xmm5
- pshufd xmm5, xmm6, 0
- movdqa [esp+432-336], xmm5
- movdqa xmm5, [esp+432-224]
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm0
- psubw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- psllw xmm5, 2
- movdqa xmm7, xmm2
- psubw xmm7, [esp+432-240]
- paddw xmm7, xmm5
- paddw xmm7, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- psraw xmm7, 3
- pmaxsw xmm6, xmm7
- pminsw xmm5, xmm6
-
- pand xmm5, [esp+432-320]
- movdqa xmm6, [esp+432-400]
- movdqa [esp+432-64], xmm5
- movdqa [esp+432-384], xmm6
- movdqa xmm5, xmm0
- psubw xmm5, xmm6
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm5, [esp+432-304]
- movdqa xmm7, xmm2
- paddw xmm7, xmm2
- psubw xmm5, xmm7
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- pminsw xmm5, xmm6
-
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-288]
- movdqa xmm6, [esp+432-240]
- movdqa [esp+432-96], xmm5
- movdqa xmm5, [esp+432-352]
- paddw xmm5, [esp+432-304]
- movdqa xmm7, xmm6
- paddw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
- psubw xmm5, xmm7
-
- movdqa xmm7, [esp+496-208]
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-400]
- pminsw xmm5, xmm6
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-256]
- movdqa xmm6, [esp+448-208]
- punpckhbw xmm7, xmm0
- movdqa [esp+432-352], xmm7
-
- movdqa xmm7, [esp+512-208]
- punpckhbw xmm6, xmm0
- movdqa [esp+432-48], xmm5
- movdqa xmm5, [esp+432-208]
- movdqa [esp+432-368], xmm6
- movdqa xmm6, [esp+464-208]
- punpckhbw xmm7, xmm0
- punpckhbw xmm5, xmm0
- movdqa [esp+432-384], xmm7
- punpckhbw xmm6, xmm0
- movdqa [esp+432-400], xmm6
-
- movdqa xmm7, [esp+432-400]
- movdqa xmm6, [esp+480-208]
- psubw xmm7, xmm5
- movdqa [esp+432-16], xmm5
- pabsw xmm7, xmm7
- punpckhbw xmm6, xmm0
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5
-
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-384]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5
-
- movdqa xmm5, [esp+432-400]
- movdqa [esp+432-80], xmm6
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5
-
- movdqa xmm5, xmm1
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5
- movdqa xmm5, xmm6
- psubw xmm5, [esp+432-400]
- psubw xmm6, [esp+432-352]
- movdqa [esp+432-272], xmm5
- movdqa xmm7, xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7
- movdqa xmm7, xmm4
- pabsw xmm6, xmm6
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
-
- pand xmm5, xmm7
- movdqa xmm7, [esp+432-400]
- psubw xmm7, xmm6
- psubw xmm6, [esp+432-352]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
- pand xmm5, xmm4
-
- paddw xmm2, [esp+432-96]
- movdqa xmm4, xmm1
- pcmpgtw xmm4, xmm0
- movdqa xmm7, xmm1
- pcmpeqw xmm7, xmm0
- por xmm4, xmm7
- pand xmm5, xmm4
- movdqa xmm4, [esp+432-224]
- movdqa [esp+432-320], xmm5
- movdqa xmm5, [esp+432-272]
- movdqa xmm7, xmm0
- psubw xmm7, xmm4
- psubw xmm0, xmm1
- psllw xmm5, 2
- paddw xmm6, xmm5
- paddw xmm6, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- movdqa [esp+432-336], xmm0
- psraw xmm6, 3
- pmaxsw xmm7, xmm6
- pminsw xmm4, xmm7
- pand xmm4, [esp+432-320]
- movdqa xmm6, xmm0
- movdqa xmm0, [esp+432-16]
- paddw xmm0, [esp+432-304]
- movdqa [esp+432-272], xmm4
- movdqa xmm4, [esp+432-368]
- paddw xmm4, xmm4
- psubw xmm0, xmm4
-
- movdqa xmm4, [esp+432-64]
- psraw xmm0, 1
- pmaxsw xmm6, xmm0
- movdqa xmm0, [esp+432-400]
- movdqa xmm7, xmm1
- pminsw xmm7, xmm6
- movdqa xmm6, [esp+432-320]
- pand xmm7, xmm6
- pand xmm7, [esp+432-288]
- paddw xmm5, xmm7
- packuswb xmm2, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm0, xmm5
- paddw xmm3, xmm4
- packuswb xmm3, xmm0
-
- movdqa xmm0, [esp+432-32]
- psubw xmm0, xmm4
- movdqa xmm4, [esp+432-80]
- psubw xmm4, xmm5
-
- movdqa xmm5, [esp+432-240]
- paddw xmm5, [esp+432-48]
- packuswb xmm0, xmm4
- movdqa xmm4, [esp+432-384]
- paddw xmm4, [esp+432-304]
- movdqa [esp+480-208], xmm0
- movdqa xmm0, [esp+432-352]
- movdqa xmm7, xmm0
- paddw xmm0, xmm0
-
- mov ecx, dword [esp+432-408]
-
- mov edx, dword [esp+432-404]
- psubw xmm4, xmm0
- movdqa xmm0, [esp+432-336]
- movdqa [edi], xmm2
- psraw xmm4, 1
- pmaxsw xmm0, xmm4
- pminsw xmm1, xmm0
- movdqa xmm0, [esp+480-208]
-
- pop edi
- pand xmm1, xmm6
- pand xmm1, [esp+428-256]
- movdqa [ecx], xmm3
- paddw xmm7, xmm1
- pop esi
- packuswb xmm5, xmm7
- movdqa [eax], xmm0
- movdqa [edx], xmm5
- pop ebx
- mov esp, ebp
- pop ebp
- ret
-
-
-;*******************************************************************************
-; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-; int32_t iBeta)
-;*******************************************************************************
-
-WELS_EXTERN DeblockLumaEq4V_sse2
-
-ALIGN 16
-
-DeblockLumaEq4V_sse2:
-
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 628 ; 00000274H
- mov eax, dword [ebp+8]
- mov ecx, dword [ebp+12]
- push ebx
- push esi
-
- lea edx, [ecx*4]
- pxor xmm0, xmm0
- movdqa xmm2, xmm0
-
- movdqa xmm0, [ecx+eax]
- mov esi, eax
- sub esi, edx
- movdqa xmm3, [esi]
- movdqa xmm5, [eax]
- push edi
- lea edi, [ecx+ecx]
- lea ebx, [ecx+ecx*2]
- mov dword [esp+640-600], edi
- mov esi, eax
- sub esi, edi
- movdqa xmm1, [esi]
- movdqa [esp+720-272], xmm0
- mov edi, eax
- sub edi, ecx
- movdqa xmm4, [edi]
- add ecx, eax
- mov dword [esp+640-596], ecx
-
- mov ecx, dword [esp+640-600]
- movdqa xmm0, [ecx+eax]
- movdqa [esp+736-272], xmm0
-
- movdqa xmm0, [eax+ebx]
- mov edx, eax
- sub edx, ebx
-
- movsx ebx, word [ebp+16]
- movdqa xmm6, [edx]
- add ecx, eax
- movdqa [esp+752-272], xmm0
- movd xmm0, ebx
-
- movsx ebx, word [ebp+20]
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
- movdqa [esp+640-320], xmm0
- movd xmm0, ebx
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
-
- movdqa xmm7, [esp+736-272]
- punpcklbw xmm7, xmm2
- movdqa [esp+640-416], xmm7
- movdqa [esp+640-512], xmm0
- movdqa xmm0, xmm1
- movdqa [esp+672-272], xmm1
- movdqa xmm1, xmm4
- movdqa [esp+704-272], xmm5
- punpcklbw xmm5, xmm2
- punpcklbw xmm1, xmm2
-
- movdqa xmm7, xmm5
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- punpcklbw xmm0, xmm2
- movdqa [esp+688-272], xmm4
- movdqa xmm4, [esp+720-272]
- movdqa [esp+640-480], xmm0
-
- movdqa xmm7, xmm1
- psubw xmm7, xmm0
-
- movdqa xmm0, [esp+640-512]
- pabsw xmm7, xmm7
- punpcklbw xmm4, xmm2
- pcmpgtw xmm0, xmm7
- movdqa [esp+640-384], xmm4
- movdqa xmm7, xmm5
- psubw xmm7, xmm4
- movdqa xmm4, [esp+640-512]
- movdqa [esp+656-272], xmm6
- punpcklbw xmm6, xmm2
- pabsw xmm7, xmm7
- movdqa [esp+640-48], xmm2
- movdqa [esp+640-368], xmm6
- movdqa [esp+640-144], xmm1
- movdqa [esp+640-400], xmm5
- pcmpgtw xmm4, xmm7
- pand xmm0, xmm4
- movdqa xmm4, [esp+640-320]
- pcmpgtw xmm4, [esp+640-560]
- pand xmm0, xmm4
-
- mov ebx, 2
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, [esp+640-320]
- psraw xmm4, 2
- pshufd xmm7, xmm7, 0
- paddw xmm4, xmm7
- movdqa [esp+640-576], xmm4
- pcmpgtw xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
-
- movdqa xmm4, [esp+640-512]
- movdqa [esp+640-624], xmm7
- movdqa xmm7, xmm1
- psubw xmm7, xmm6
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
-
- pand xmm4, [esp+640-560]
- movdqa [esp+640-544], xmm4
- movdqa xmm4, [esp+640-512]
- movdqa xmm7, xmm5
- psubw xmm7, [esp+640-416]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
-
- pand xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
-
- movdqa xmm4, [esp+640-544]
- pandn xmm4, xmm6
- movdqa [esp+640-16], xmm4
- mov ebx, 4
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, xmm3
- punpcklbw xmm4, xmm2
- psllw xmm4, 1
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, [esp+640-480]
-
- movdqa xmm6, [esp+640-560]
- pshufd xmm7, xmm7, 0
- paddw xmm4, xmm1
- movdqa [esp+640-592], xmm7
- paddw xmm4, xmm5
- paddw xmm4, xmm7
- movdqa xmm7, [esp+640-416]
- pandn xmm6, xmm7
- movdqa [esp+640-80], xmm6
- movdqa xmm6, [esp+752-272]
- punpcklbw xmm6, xmm2
- psllw xmm6, 1
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-384]
-
- movdqa xmm7, [esp+640-480]
- paddw xmm6, xmm5
- paddw xmm6, xmm1
- paddw xmm6, [esp+640-592]
- psraw xmm6, 3
- pand xmm6, [esp+640-560]
- movdqa [esp+640-112], xmm6
- movdqa xmm6, [esp+640-544]
- pandn xmm6, xmm7
- movdqa [esp+640-336], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-528], xmm6
- movdqa xmm6, [esp+640-368]
- paddw xmm6, xmm7
- movdqa xmm7, xmm1
- psraw xmm4, 3
- pand xmm4, [esp+640-544]
- paddw xmm7, xmm5
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
-
- paddw xmm5, xmm1
- psraw xmm6, 2
- pand xmm7, xmm6
-
- movdqa xmm6, [esp+640-384]
- movdqa [esp+640-64], xmm7
- movdqa xmm7, [esp+640-560]
- pandn xmm7, xmm6
- movdqa [esp+640-304], xmm7
- movdqa xmm7, [esp+640-560]
- movdqa [esp+640-528], xmm7
- movdqa xmm7, [esp+640-416]
- paddw xmm7, xmm6
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pand xmm5, xmm7
- movdqa [esp+640-32], xmm5
-
- movdqa xmm5, [esp+640-544]
- movdqa [esp+640-528], xmm5
- movdqa xmm5, [esp+640-480]
- movdqa xmm7, xmm5
- paddw xmm7, xmm5
- movdqa xmm5, xmm1
- paddw xmm5, xmm6
- paddw xmm6, [esp+640-592]
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pandn xmm5, xmm7
- movdqa xmm7, [esp+640-480]
- paddw xmm7, xmm1
- paddw xmm7, [esp+640-400]
- movdqa xmm1, [esp+640-544]
- movdqa [esp+640-352], xmm5
- movdqa xmm5, [esp+640-368]
- psllw xmm7, 1
- paddw xmm7, xmm6
- paddw xmm5, xmm7
-
- movdqa xmm7, [esp+640-400]
- psraw xmm5, 3
- pand xmm1, xmm5
- movdqa xmm5, [esp+640-480]
- movdqa [esp+640-96], xmm1
- movdqa xmm1, [esp+640-560]
- movdqa [esp+640-528], xmm1
- movdqa xmm1, [esp+640-384]
- movdqa xmm6, xmm1
- paddw xmm6, xmm1
- paddw xmm1, [esp+640-400]
- paddw xmm1, [esp+640-144]
- paddw xmm7, xmm5
- paddw xmm5, [esp+640-592]
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
- psraw xmm6, 2
- psllw xmm1, 1
- paddw xmm1, xmm5
-
- movdqa xmm5, [esp+656-272]
- pandn xmm7, xmm6
- movdqa xmm6, [esp+640-416]
- paddw xmm6, xmm1
- movdqa xmm1, [esp+640-560]
- psraw xmm6, 3
- pand xmm1, xmm6
-
- movdqa xmm6, [esp+704-272]
- movdqa [esp+640-128], xmm1
- movdqa xmm1, [esp+672-272]
- punpckhbw xmm1, xmm2
- movdqa [esp+640-448], xmm1
- movdqa xmm1, [esp+688-272]
- punpckhbw xmm1, xmm2
- punpckhbw xmm6, xmm2
- movdqa [esp+640-288], xmm7
- punpckhbw xmm5, xmm2
- movdqa [esp+640-496], xmm1
- movdqa [esp+640-432], xmm6
-
- movdqa xmm7, [esp+720-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-464], xmm7
-
- movdqa xmm7, [esp+736-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-528], xmm7
-
- movdqa xmm7, xmm6
-
- psubw xmm6, [esp+640-464]
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- por xmm4, [esp+640-16]
- pabsw xmm6, xmm6
- movdqa xmm7, xmm1
- psubw xmm7, [esp+640-448]
-
- movdqa xmm1, [esp+640-512]
- pabsw xmm7, xmm7
- pcmpgtw xmm1, xmm7
- movdqa xmm7, [esp+640-512]
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+640-320]
- pand xmm1, xmm7
- movdqa xmm7, [esp+640-560]
- pcmpgtw xmm6, xmm7
- pand xmm1, xmm6
-
- movdqa xmm6, [esp+640-576]
- pcmpgtw xmm6, xmm7
-
- movdqa xmm7, [esp+640-496]
- punpckhbw xmm3, xmm2
- movdqa [esp+640-560], xmm6
- movdqa xmm6, [esp+640-512]
- psubw xmm7, xmm5
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
-
- pand xmm6, [esp+640-560]
- movdqa xmm7, [esp+640-432]
- psubw xmm7, [esp+640-528]
-
- psllw xmm3, 1
- movdqa [esp+640-544], xmm6
- movdqa xmm6, [esp+640-512]
-
- movdqa xmm2, [esp+640-544]
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, [esp+640-448]
- paddw xmm3, [esp+640-496]
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
- pand xmm6, [esp+640-560]
- movdqa [esp+640-560], xmm6
-
- movdqa xmm6, xmm0
- pand xmm6, xmm4
- movdqa xmm4, xmm0
- pandn xmm4, [esp+640-368]
- por xmm6, xmm4
- movdqa xmm4, [esp+640-432]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-592]
- psraw xmm3, 3
- pand xmm3, xmm2
- pandn xmm2, xmm5
- por xmm3, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm3
- movdqa xmm3, [esp+640-64]
- por xmm3, [esp+640-336]
- movdqa xmm2, xmm1
- pandn xmm2, xmm5
- por xmm7, xmm2
-
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-480]
- por xmm2, xmm3
- packuswb xmm6, xmm7
- movdqa [esp+640-336], xmm2
- movdqa [esp+656-272], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa xmm2, xmm5
- paddw xmm2, [esp+640-448]
- movdqa xmm3, xmm1
- movdqa xmm7, [esp+640-496]
- paddw xmm7, xmm4
- paddw xmm2, xmm7
- paddw xmm2, [esp+640-624]
- movdqa xmm7, [esp+640-544]
- psraw xmm2, 2
- pand xmm6, xmm2
- movdqa xmm2, [esp+640-448]
- pandn xmm7, xmm2
- por xmm6, xmm7
- pand xmm3, xmm6
- movdqa xmm6, xmm1
- pandn xmm6, xmm2
- paddw xmm2, [esp+640-496]
- paddw xmm2, xmm4
- por xmm3, xmm6
- movdqa xmm6, [esp+640-336]
- packuswb xmm6, xmm3
- psllw xmm2, 1
- movdqa [esp+672-272], xmm6
- movdqa xmm6, [esp+640-96]
- por xmm6, [esp+640-352]
-
- movdqa xmm3, xmm0
- pand xmm3, xmm6
- movdqa xmm6, xmm0
- pandn xmm6, [esp+640-144]
- por xmm3, xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-352], xmm3
- movdqa xmm3, [esp+640-464]
- paddw xmm3, [esp+640-592]
- paddw xmm2, xmm3
- movdqa xmm3, [esp+640-448]
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-496]
- psraw xmm5, 3
- pand xmm6, xmm5
- movdqa xmm5, [esp+640-464]
- paddw xmm2, xmm5
- paddw xmm5, [esp+640-432]
- movdqa xmm4, xmm3
- paddw xmm4, xmm3
- paddw xmm4, xmm2
- paddw xmm4, [esp+640-624]
- movdqa xmm2, [esp+640-544]
- paddw xmm3, [esp+640-592]
- psraw xmm4, 2
- pandn xmm2, xmm4
- por xmm6, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-496]
- movdqa xmm2, xmm1
- pandn xmm2, xmm6
- por xmm7, xmm2
- movdqa xmm2, [esp+640-352]
- packuswb xmm2, xmm7
- movdqa [esp+688-272], xmm2
- movdqa xmm2, [esp+640-128]
- por xmm2, [esp+640-288]
-
- movdqa xmm4, xmm0
- pand xmm4, xmm2
- paddw xmm5, xmm6
- movdqa xmm2, xmm0
- pandn xmm2, [esp+640-400]
- por xmm4, xmm2
- movdqa xmm2, [esp+640-528]
- psllw xmm5, 1
- paddw xmm5, xmm3
- movdqa xmm3, [esp+640-560]
- paddw xmm2, xmm5
- psraw xmm2, 3
- movdqa [esp+640-288], xmm4
- movdqa xmm4, [esp+640-560]
- pand xmm4, xmm2
- movdqa xmm2, [esp+640-464]
- movdqa xmm5, xmm2
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-432]
- paddw xmm2, [esp+640-448]
- movdqa xmm7, xmm1
- paddw xmm5, xmm2
- paddw xmm5, [esp+640-624]
- movdqa xmm6, [esp+640-560]
- psraw xmm5, 2
- pandn xmm3, xmm5
- por xmm4, xmm3
- movdqa xmm3, [esp+640-32]
- por xmm3, [esp+640-304]
- pand xmm7, xmm4
- movdqa xmm4, [esp+640-432]
- movdqa xmm5, [esp+640-464]
- movdqa xmm2, xmm1
- pandn xmm2, xmm4
- paddw xmm4, [esp+640-496]
- por xmm7, xmm2
- movdqa xmm2, [esp+640-288]
- packuswb xmm2, xmm7
- movdqa [esp+704-272], xmm2
-
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-384]
- por xmm2, xmm3
- movdqa [esp+640-304], xmm2
- movdqa xmm2, [esp+640-528]
- movdqa xmm3, xmm2
- paddw xmm3, [esp+640-464]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-624]
- psraw xmm3, 2
- pand xmm6, xmm3
- movdqa xmm3, [esp+640-560]
- movdqa xmm4, xmm3
- pandn xmm4, xmm5
- por xmm6, xmm4
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-304]
- movdqa xmm4, xmm1
- pandn xmm4, xmm5
- por xmm7, xmm4
-
- movdqa xmm4, xmm0
- pandn xmm0, [esp+640-416]
- packuswb xmm6, xmm7
- movdqa xmm7, [esp+640-112]
- por xmm7, [esp+640-80]
- pand xmm4, xmm7
- por xmm4, xmm0
- movdqa xmm0, [esp+752-272]
- punpckhbw xmm0, [esp+640-48]
- psllw xmm0, 1
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm5
- paddw xmm0, [esp+640-432]
- paddw xmm0, [esp+640-496]
- paddw xmm0, [esp+640-592]
- psraw xmm0, 3
- pand xmm0, xmm3
- movdqa xmm7, xmm1
- pandn xmm3, xmm2
- por xmm0, xmm3
- pand xmm7, xmm0
-
- movdqa xmm0, [esp+656-272]
- movdqa [edx], xmm0
-
- movdqa xmm0, [esp+672-272]
-
- mov edx, dword [esp+640-596]
- movdqa [esi], xmm0
- movdqa xmm0, [esp+688-272]
- movdqa [edi], xmm0
- movdqa xmm0, [esp+704-272]
-
- pop edi
- pandn xmm1, xmm2
- movdqa [eax], xmm0
- por xmm7, xmm1
- pop esi
- packuswb xmm4, xmm7
- movdqa [edx], xmm6
- movdqa [ecx], xmm4
- pop ebx
- mov esp, ebp
- pop ebp
- ret
-
-
-;********************************************************************************
-;
-; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
-;
-;********************************************************************************
-
-WELS_EXTERN DeblockLumaTransposeH2V_sse2
-
-ALIGN 16
-
-DeblockLumaTransposeH2V_sse2:
- push ebp
- push ebx
- mov ebp, esp
- and esp,0FFFFFFF0h
- sub esp, 10h
-
- mov eax, [ebp + 0Ch]
- mov ecx, [ebp + 10h]
- lea edx, [eax + ecx * 8]
- lea ebx, [ecx*3]
-
- movq xmm0, [eax]
- movq xmm7, [edx]
- punpcklqdq xmm0, xmm7
- movq xmm1, [eax + ecx]
- movq xmm7, [edx + ecx]
- punpcklqdq xmm1, xmm7
- movq xmm2, [eax + ecx*2]
- movq xmm7, [edx + ecx*2]
- punpcklqdq xmm2, xmm7
- movq xmm3, [eax + ebx]
- movq xmm7, [edx + ebx]
- punpcklqdq xmm3, xmm7
-
- lea eax, [eax + ecx * 4]
- lea edx, [edx + ecx * 4]
- movq xmm4, [eax]
- movq xmm7, [edx]
- punpcklqdq xmm4, xmm7
- movq xmm5, [eax + ecx]
- movq xmm7, [edx + ecx]
- punpcklqdq xmm5, xmm7
- movq xmm6, [eax + ecx*2]
- movq xmm7, [edx + ecx*2]
- punpcklqdq xmm6, xmm7
-
- movdqa [esp], xmm0
- movq xmm7, [eax + ebx]
- movq xmm0, [edx + ebx]
- punpcklqdq xmm7, xmm0
- movdqa xmm0, [esp]
-
- SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
- ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
- mov eax, [ebp + 14h]
- movdqa [eax], xmm4
- movdqa [eax + 10h], xmm2
- movdqa [eax + 20h], xmm3
- movdqa [eax + 30h], xmm7
- movdqa [eax + 40h], xmm5
- movdqa [eax + 50h], xmm1
- movdqa [eax + 60h], xmm6
- movdqa [eax + 70h], xmm0
-
- mov esp, ebp
- pop ebx
- pop ebp
- ret
-
-
-
-;*******************************************************************************************
-;
-; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-;
-;*******************************************************************************************
-
-WELS_EXTERN DeblockLumaTransposeV2H_sse2
-
-ALIGN 16
-
-DeblockLumaTransposeV2H_sse2:
- push ebp
- mov ebp, esp
-
- and esp, 0FFFFFFF0h
- sub esp, 10h
-
- mov eax, [ebp + 10h]
- mov ecx, [ebp + 0Ch]
- mov edx, [ebp + 08h]
-
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 10h]
- movdqa xmm2, [eax + 20h]
- movdqa xmm3, [eax + 30h]
- movdqa xmm4, [eax + 40h]
- movdqa xmm5, [eax + 50h]
- movdqa xmm6, [eax + 60h]
- movdqa xmm7, [eax + 70h]
-
- SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
- ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
- lea eax, [ecx * 3]
-
- movq [edx], xmm4
- movq [edx + ecx], xmm2
- movq [edx + ecx*2], xmm3
- movq [edx + eax], xmm7
-
- lea edx, [edx + ecx*4]
- movq [edx], xmm5
- movq [edx + ecx], xmm1
- movq [edx + ecx*2], xmm6
- movq [edx + eax], xmm0
-
- psrldq xmm4, 8
- psrldq xmm2, 8
- psrldq xmm3, 8
- psrldq xmm7, 8
- psrldq xmm5, 8
- psrldq xmm1, 8
- psrldq xmm6, 8
- psrldq xmm0, 8
-
- lea edx, [edx + ecx*4]
- movq [edx], xmm4
- movq [edx + ecx], xmm2
- movq [edx + ecx*2], xmm3
- movq [edx + eax], xmm7
-
- lea edx, [edx + ecx*4]
- movq [edx], xmm5
- movq [edx + ecx], xmm1
- movq [edx + ecx*2], xmm6
- movq [edx + eax], xmm0
-
-
- mov esp, ebp
- pop ebp
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* deblock.asm
+;*
+;* Abstract
+;* edge loop
+;*
+;* History
+;* 08/07/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata pData
+%else
+SECTION .rodata align=16
+%endif
+
+SECTION .text
+
+;********************************************************************************
+; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta)
+;********************************************************************************
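+; Strong (bS == 4) chroma filter: p1,p0,q0,q1 (two samples on each side of
+; the edge, here one stride apart) are widened to words, the alpha/beta
+; gradient tests build a write mask, and p0/q0 are replaced by the spec
+; averages. A scalar sketch of the arithmetic (illustrative only):
+;   if (|p0-q0| < iAlpha && |p1-p0| < iBeta && |q1-q0| < iBeta) {
+;       p0 = (2*p1 + p0 + q1 + 2) >> 2;
+;       q0 = (2*q1 + q0 + p1 + 2) >> 2;
+;   }
+; Cb and Cr are packed into one xmm register via punpcklqdq and filtered
+; together, eight pixels per plane.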
+WELS_EXTERN DeblockChromaEq4V_sse2
+
+ALIGN 16
+DeblockChromaEq4V_sse2:
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,68h
+ mov edx,[ebp+10h] ; iStride
+ mov eax,[ebp+8] ; pPixCb
+ mov ecx,[ebp+0Ch] ; pPixCr
+ movq xmm4,[ecx]
+ movq xmm5,[edx+ecx]
+ push esi
+ push edi
+ lea esi,[edx+edx]
+ mov edi,eax
+ sub edi,esi
+ movq xmm1,[edi]
+ mov edi,ecx
+ sub edi,esi
+ movq xmm2,[edi]
+ punpcklqdq xmm1,xmm2
+ mov esi,eax
+ sub esi,edx
+ movq xmm2,[esi]
+ mov edi,ecx
+ sub edi,edx
+ movq xmm3,[edi]
+ punpcklqdq xmm2,xmm3
+ movq xmm3,[eax]
+ punpcklqdq xmm3,xmm4
+ movq xmm4,[edx+eax]
+ mov edx, [ebp + 14h]
+ punpcklqdq xmm4,xmm5
+ movd xmm5,edx
+ mov edx, [ebp + 18h]
+ pxor xmm0,xmm0
+ movdqa xmm6,xmm5
+ punpcklwd xmm6,xmm5
+ pshufd xmm5,xmm6,0
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,xmm1
+ punpckhbw xmm1,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+40h],xmm1
+ movdqa [esp+60h],xmm7
+ movdqa xmm7,xmm2
+ punpcklbw xmm7,xmm0
+ movdqa [esp+10h],xmm7
+ movdqa xmm7,xmm3
+ punpcklbw xmm7,xmm0
+ punpckhbw xmm3,xmm0
+ movdqa [esp+50h],xmm7
+ movdqa xmm7,xmm4
+ punpckhbw xmm4,xmm0
+ punpckhbw xmm2,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+30h],xmm3
+ movdqa xmm3,[esp+10h]
+ movdqa xmm1,xmm3
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa [esp+20h],xmm4
+ movdqa xmm0,xmm5
+ pcmpgtw xmm0,xmm1
+ movdqa xmm1,[esp+60h]
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ pand xmm0,xmm4
+ movdqa xmm1,xmm7
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,xmm2
+ psubw xmm1,[esp+30h]
+ pabsw xmm1,xmm1
+ pcmpgtw xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ pand xmm0,xmm4
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,[esp+20h]
+ psubw xmm1,[esp+30h]
+ pand xmm5,xmm4
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ pand xmm5,xmm6
+ mov edx,2
+ movsx edx,dx
+ movd xmm1,edx
+ movdqa xmm4,xmm1
+ punpcklwd xmm4,xmm1
+ pshufd xmm1,xmm4,0
+ movdqa xmm4,[esp+60h]
+ movdqa xmm6,xmm4
+ paddw xmm6,xmm4
+ paddw xmm6,xmm3
+ paddw xmm6,xmm7
+ movdqa [esp+10h],xmm1
+ paddw xmm6,[esp+10h]
+ psraw xmm6,2
+ movdqa xmm4,xmm0
+ pandn xmm4,xmm3
+ movdqa xmm3,[esp+40h]
+ movdqa xmm1,xmm0
+ pand xmm1,xmm6
+ por xmm1,xmm4
+ movdqa xmm6,xmm3
+ paddw xmm6,xmm3
+ movdqa xmm3,[esp+10h]
+ paddw xmm6,xmm2
+ paddw xmm6,[esp+20h]
+ paddw xmm6,xmm3
+ psraw xmm6,2
+ movdqa xmm4,xmm5
+ pand xmm4,xmm6
+ movdqa xmm6,xmm5
+ pandn xmm6,xmm2
+ por xmm4,xmm6
+ packuswb xmm1,xmm4
+ movdqa xmm4,[esp+50h]
+ movdqa xmm6,xmm7
+ paddw xmm6,xmm7
+ paddw xmm6,xmm4
+ paddw xmm6,[esp+60h]
+ paddw xmm6,xmm3
+ psraw xmm6,2
+ movdqa xmm2,xmm0
+ pand xmm2,xmm6
+ pandn xmm0,xmm4
+ por xmm2,xmm0
+ movdqa xmm0,[esp+20h]
+ movdqa xmm6,xmm0
+ paddw xmm6,xmm0
+ movdqa xmm0,[esp+30h]
+ paddw xmm6,xmm0
+ paddw xmm6,[esp+40h]
+ movdqa xmm4,xmm5
+ paddw xmm6,xmm3
+ movq [esi],xmm1
+ psraw xmm6,2
+ pand xmm4,xmm6
+ pandn xmm5,xmm0
+ por xmm4,xmm5
+ packuswb xmm2,xmm4
+ movq [eax],xmm2
+ psrldq xmm1,8
+ movq [edi],xmm1
+ pop edi
+ psrldq xmm2,8
+ movq [ecx],xmm2
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
+;******************************************************************************
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
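+; Normal (bS < 4) chroma filter: pTC supplies a clipping threshold tc per
+; edge group, and p0/q0 move toward each other by a clamped delta. Scalar
+; sketch of the per-pixel arithmetic (illustrative only):
+;   if (tc > 0 && |p0-q0| < iAlpha && |p1-p0| < iBeta && |q1-q0| < iBeta) {
+;       d  = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3;  // psllw 2 / +4 / psraw 3
+;       d  = min(max(d, -tc), tc);                     // pmaxsw / pminsw
+;       p0 = clip255(p0 + d);                          // packuswb saturates
+;       q0 = clip255(q0 - d);
+;   }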
+
+WELS_EXTERN DeblockChromaLt4V_sse2
+
+DeblockChromaLt4V_sse2:
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0E4h
+ push ebx
+ push esi
+ mov esi, [ebp+1Ch] ; pTC
+ movsx ebx, byte [esi+2]
+ push edi
+ movsx di,byte [esi+3]
+ mov word [esp+0Ch],bx
+ movsx bx,byte [esi+1]
+ movsx esi,byte [esi]
+ mov word [esp+0Eh],si
+ movzx esi,di
+ movd xmm1,esi
+ movzx esi,di
+ movd xmm2,esi
+ mov si,word [esp+0Ch]
+ mov edx, [ebp + 10h]
+ mov eax, [ebp + 08h]
+ movzx edi,si
+ movzx esi,si
+ mov ecx, [ebp + 0Ch]
+ movd xmm4,esi
+ movzx esi,bx
+ movd xmm5,esi
+ movd xmm3,edi
+ movzx esi,bx
+ movd xmm6,esi
+ mov si,word [esp+0Eh]
+ movzx edi,si
+ movzx esi,si
+ punpcklwd xmm6,xmm2
+ pxor xmm0,xmm0
+ movdqa [esp+40h],xmm0
+ movd xmm7,edi
+ movd xmm0,esi
+ lea esi,[edx+edx]
+ mov edi,eax
+ sub edi,esi
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ punpcklwd xmm0,xmm4
+ movq xmm4,[edx+ecx]
+ punpcklwd xmm7,xmm3
+ movq xmm3,[eax]
+ punpcklwd xmm0,xmm6
+ movq xmm6,[edi]
+ punpcklwd xmm7,xmm5
+ punpcklwd xmm0,xmm7
+ mov edi,ecx
+ sub edi,esi
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+60h],xmm2
+ movq xmm2, [edi]
+ punpcklqdq xmm6,xmm2
+ mov esi,eax
+ sub esi,edx
+ movq xmm7,[esi]
+ mov edi,ecx
+ sub edi,edx
+ movq xmm2,[edi]
+ punpcklqdq xmm7,xmm2
+ movq xmm2,[ecx]
+ punpcklqdq xmm3,xmm2
+ movq xmm2,[edx+eax]
+ movsx edx,word [ebp + 14h]
+ punpcklqdq xmm2,xmm4
+ movdqa [esp+0E0h],xmm2
+ movd xmm2,edx
+ movsx edx,word [ebp + 18h]
+ movdqa xmm4,xmm2
+ punpcklwd xmm4,xmm2
+ movd xmm2,edx
+ movdqa xmm5,xmm2
+ punpcklwd xmm5,xmm2
+ pshufd xmm2,xmm5,0
+ movdqa [esp+50h],xmm2
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ movdqa [esp+0D0h],xmm3
+ pshufd xmm4,xmm4,0
+ movdqa [esp+30h],xmm2
+ punpckhbw xmm6,xmm1
+ movdqa [esp+80h],xmm6
+ movdqa xmm6,[esp+0D0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+70h],xmm6
+ movdqa xmm6, [esp+0E0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+90h],xmm6
+ movdqa xmm5, [esp+0E0h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0A0h],xmm7
+ punpcklbw xmm3,xmm1
+ mov edx,4
+ punpcklbw xmm2,xmm1
+ movsx edx,dx
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,[esp+30h]
+ movdqa [esp+20h],xmm6
+ psubw xmm7,xmm5
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1,[esp+60h]
+ movdqa [esp+40h],xmm6
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6, [esp+20h]
+ movdqa xmm7, [esp+50h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6
+ movdqa [esp+10h],xmm0
+ movdqa xmm6, [esp+10h]
+ pminsw xmm6,xmm1
+ movdqa [esp+10h],xmm6
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm6,xmm4
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1, [esp+30h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1
+ movdqa xmm1,[esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5
+ movdqa xmm5,[esp+80h]
+ psubw xmm5,[esp+90h]
+ pand xmm6,xmm1
+ pand xmm6,[esp+40h]
+ movdqa xmm1,[esp+10h]
+ pand xmm1,xmm6
+ movdqa xmm6,[esp+70h]
+ movdqa [esp+30h],xmm1
+ movdqa xmm1,[esp+0A0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6,[esp+20h]
+ movdqa xmm5,[esp+60h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5
+ movdqa xmm5,[esp+70h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+80h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+90h]
+ pand xmm4,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+40h]
+ pand xmm0,xmm4
+ movdqa xmm4,[esp+30h]
+ paddw xmm2,xmm4
+ paddw xmm1,xmm0
+ packuswb xmm2,xmm1
+ movq [esi],xmm2
+ psubw xmm3,xmm4
+ psubw xmm5,xmm0
+ packuswb xmm3,xmm5
+ movq [eax],xmm3
+ psrldq xmm2,8
+ movq [edi],xmm2
+ pop edi
+ pop esi
+ psrldq xmm3,8
+ movq [ecx],xmm3
+ pop ebx
+ mov esp,ebp
+ pop ebp
+ ret
+
+;***************************************************************************
+; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta)
+;***************************************************************************
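+; Horizontal variant of the strong chroma filter: the four bytes straddling
+; the edge in each of 16 rows (8 Cb + 8 Cr) are gathered with movd loads,
+; transposed on the stack into rows of p1, p0, q0, q1, run through the same
+; arithmetic as DeblockChromaEq4V_sse2, and transposed back before the movd
+; stores. Sketch of the data movement (illustrative):
+;   load 16 x [p1 p0 q0 q1] -> transpose -> 4 x 16 lanes -> filter
+;   -> transpose -> store 16 x [p1 p0' q0' q1]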
+
+WELS_EXTERN DeblockChromaEq4H_sse2
+
+ALIGN 16
+
+DeblockChromaEq4H_sse2:
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0C8h
+ mov ecx,dword [ebp+8]
+ mov edx,dword [ebp+0Ch]
+ mov eax,dword [ebp+10h]
+ sub ecx,2
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2]
+ mov dword [esp+18h],ecx
+ mov dword [esp+4],edx
+ lea ecx,[ecx+eax*4]
+ lea edx,[edx+eax*4]
+ lea eax,[esp+7Ch]
+ push edi
+ mov dword [esp+14h],esi
+ mov dword [esp+18h],ecx
+ mov dword [esp+0Ch],edx
+ mov dword [esp+10h],eax
+ mov esi,dword [esp+1Ch]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+14h]
+ movd xmm0,dword [esi]
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h]
+ mov edi,dword [esp+0Ch]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+10h]
+ movdqa [edi],xmm0
+ movdqa [edi+10h],xmm5
+ movdqa [edi+20h],xmm1
+ movdqa [edi+30h],xmm6
+ movsx ecx,word [ebp+14h]
+ movsx edx,word [ebp+18h]
+ movdqa xmm6,[esp+80h]
+ movdqa xmm4,[esp+90h]
+ movdqa xmm5,[esp+0A0h]
+ movdqa xmm7,[esp+0B0h]
+ pxor xmm0,xmm0
+ movd xmm1,ecx
+ movdqa xmm2,xmm1
+ punpcklwd xmm2,xmm1
+ pshufd xmm1,xmm2,0
+ movd xmm2,edx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+ movdqa xmm3,xmm6
+ punpckhbw xmm6,xmm0
+ movdqa [esp+60h],xmm6
+ movdqa xmm6,[esp+90h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+30h],xmm6
+ movdqa xmm6,[esp+0A0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+40h],xmm6
+ movdqa xmm6,[esp+0B0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+70h],xmm6
+ punpcklbw xmm7,xmm0
+ punpcklbw xmm4,xmm0
+ punpcklbw xmm5,xmm0
+ punpcklbw xmm3,xmm0
+ movdqa [esp+50h],xmm7
+ movdqa xmm6,xmm4
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ movdqa xmm0,xmm1
+ pcmpgtw xmm0,xmm6
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm4
+ pabsw xmm6,xmm6
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+30h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pcmpgtw xmm1,xmm6
+ movdqa xmm6,[esp+60h]
+ psubw xmm6,[esp+30h]
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+70h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pand xmm1,xmm7
+ pcmpgtw xmm2,xmm6
+ pand xmm1,xmm2
+ mov eax,2
+ movsx ecx,ax
+ movd xmm2,ecx
+ movdqa xmm6,xmm2
+ punpcklwd xmm6,xmm2
+ pshufd xmm2,xmm6,0
+ movdqa [esp+20h],xmm2
+ movdqa xmm2,xmm3
+ paddw xmm2,xmm3
+ paddw xmm2,xmm4
+ paddw xmm2,[esp+50h]
+ paddw xmm2,[esp+20h]
+ psraw xmm2,2
+ movdqa xmm6,xmm0
+ pand xmm6,xmm2
+ movdqa xmm2,xmm0
+ pandn xmm2,xmm4
+ por xmm6,xmm2
+ movdqa xmm2,[esp+60h]
+ movdqa xmm7,xmm2
+ paddw xmm7,xmm2
+ paddw xmm7,[esp+30h]
+ paddw xmm7,[esp+70h]
+ paddw xmm7,[esp+20h]
+ movdqa xmm4,xmm1
+ movdqa xmm2,xmm1
+ pandn xmm2,[esp+30h]
+ psraw xmm7,2
+ pand xmm4,xmm7
+ por xmm4,xmm2
+ movdqa xmm2,[esp+50h]
+ packuswb xmm6,xmm4
+ movdqa [esp+90h],xmm6
+ movdqa xmm6,xmm2
+ paddw xmm6,xmm2
+ movdqa xmm2,[esp+20h]
+ paddw xmm6,xmm5
+ paddw xmm6,xmm3
+ movdqa xmm4,xmm0
+ pandn xmm0,xmm5
+ paddw xmm6,xmm2
+ psraw xmm6,2
+ pand xmm4,xmm6
+ por xmm4,xmm0
+ movdqa xmm0,[esp+70h]
+ movdqa xmm5,xmm0
+ paddw xmm5,xmm0
+ movdqa xmm0,[esp+40h]
+ paddw xmm5,xmm0
+ paddw xmm5,[esp+60h]
+ movdqa xmm3,xmm1
+ paddw xmm5,xmm2
+ psraw xmm5,2
+ pand xmm3,xmm5
+ pandn xmm1,xmm0
+ por xmm3,xmm1
+ packuswb xmm4,xmm3
+ movdqa [esp+0A0h],xmm4
+ mov esi,dword [esp+10h]
+ movdqa xmm0,[esi]
+ movdqa xmm1,[esi+10h]
+ movdqa xmm2,[esi+20h]
+ movdqa xmm3,[esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+1Ch]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+14h]
+ mov edi,dword [esp+8]
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+0Ch]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
+;*******************************************************************************
+; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
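+
+; Hedged scalar sketch of the bS<4 chroma filter implemented below, assuming
+; the standard H.264 weak-filter form (pTC supplies per-group clipping values
+; tc; Clip3 clamps to a range, Clip1 to [0,255]):
+;   d  = Clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3);
+;   p0 = Clip1(p0 + d);
+;   q0 = Clip1(q0 - d);
+; applied only where abs(p0-q0) < iAlpha, abs(p1-p0) < iBeta and
+; abs(q1-q0) < iBeta.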
+
+WELS_EXTERN DeblockChromaLt4H_sse2
+
+ALIGN 16
+
+DeblockChromaLt4H_sse2:
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,108h
+ mov ecx,dword [ebp+8]
+ mov edx,dword [ebp+0Ch]
+ mov eax,dword [ebp+10h]
+ sub ecx,2
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2]
+ mov dword [esp+10h],ecx
+ mov dword [esp+4],edx
+ lea ecx,[ecx+eax*4]
+ lea edx,[edx+eax*4]
+ lea eax,[esp+6Ch]
+ push edi
+ mov dword [esp+0Ch],esi
+ mov dword [esp+18h],ecx
+ mov dword [esp+10h],edx
+ mov dword [esp+1Ch],eax
+ mov esi,dword [esp+14h]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+0Ch]
+ movd xmm0,dword [esi]
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h]
+ mov edi,dword [esp+10h]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+1Ch]
+ movdqa [edi],xmm0
+ movdqa [edi+10h],xmm5
+ movdqa [edi+20h],xmm1
+ movdqa [edi+30h],xmm6
+ mov eax,dword [ebp+1Ch]
+ movsx cx,byte [eax+3]
+ movsx dx,byte [eax+2]
+ movsx si,byte [eax+1]
+ movsx ax,byte [eax]
+ movzx edi,cx
+ movzx ecx,cx
+ movd xmm2,ecx
+ movzx ecx,dx
+ movzx edx,dx
+ movd xmm3,ecx
+ movd xmm4,edx
+ movzx ecx,si
+ movzx edx,si
+ movd xmm5,ecx
+ pxor xmm0,xmm0
+ movd xmm6,edx
+ movzx ecx,ax
+ movdqa [esp+60h],xmm0
+ movzx edx,ax
+ movsx eax,word [ebp+14h]
+ punpcklwd xmm6,xmm2
+ movd xmm1,edi
+ movd xmm7,ecx
+ movsx ecx,word [ebp+18h]
+ movd xmm0,edx
+ punpcklwd xmm7,xmm3
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+60h]
+ punpcklwd xmm7,xmm5
+ movdqa xmm5,[esp+0A0h]
+ punpcklwd xmm0,xmm4
+ punpcklwd xmm0,xmm6
+ movdqa xmm6, [esp+70h]
+ punpcklwd xmm0,xmm7
+ movdqa xmm7,[esp+80h]
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+0D0h],xmm2
+ movd xmm2,eax
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm4,xmm3,0
+ movd xmm2,ecx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+ movdqa xmm3, [esp+90h]
+ movdqa [esp+50h],xmm2
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa [esp+40h],xmm2
+ movdqa [esp+0B0h],xmm6
+ movdqa xmm6,[esp+90h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm2,xmm1
+ punpcklbw xmm3,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0F0h],xmm7
+ movdqa [esp+0C0h],xmm6
+ movdqa xmm6, [esp+0A0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+0E0h],xmm6
+ mov edx,4
+ movsx eax,dx
+ movd xmm6,eax
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa [esp+30h],xmm6
+ movdqa xmm7, [esp+40h]
+ psubw xmm7,xmm5
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa [esp+60h],xmm6
+ movdqa xmm1, [esp+0D0h]
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6,[esp+30h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6
+ movdqa xmm7,[esp+50h]
+ movdqa [esp+20h],xmm0
+ movdqa xmm6, [esp+20h]
+ pminsw xmm6,xmm1
+ movdqa [esp+20h],xmm6
+ movdqa xmm6,xmm4
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1, [esp+40h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1
+ movdqa xmm1, [esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5
+ movdqa xmm5, [esp+0B0h]
+ psubw xmm5,[esp+0E0h]
+ pand xmm6,xmm1
+ pand xmm6, [esp+60h]
+ movdqa xmm1, [esp+20h]
+ pand xmm1,xmm6
+ movdqa xmm6, [esp+0C0h]
+ movdqa [esp+40h],xmm1
+ movdqa xmm1, [esp+0F0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6, [esp+30h]
+ movdqa xmm5, [esp+0D0h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5
+ movdqa xmm5,[esp+0C0h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+0B0h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6, [esp+0E0h]
+ pand xmm4,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+60h]
+ pand xmm0,xmm4
+ movdqa xmm4, [esp+40h]
+ paddw xmm2,xmm4
+ paddw xmm1,xmm0
+ psubw xmm3,xmm4
+ psubw xmm5,xmm0
+ packuswb xmm2,xmm1
+ packuswb xmm3,xmm5
+ movdqa [esp+80h],xmm2
+ movdqa [esp+90h],xmm3
+ mov esi,dword [esp+1Ch]
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi+10h]
+ movdqa xmm2, [esi+20h]
+ movdqa xmm3, [esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+14h]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+0Ch]
+ mov edi,dword [esp+8]
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+10h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
+
+
+;*******************************************************************************
+; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; int32_t iBeta, int8_t * pTC)
+;*******************************************************************************
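+
+; Hedged reading aid: the SSE2 below applies the standard H.264 bS<4 luma
+; filter to 16 pixels at once. Per pixel:
+;   d   = Clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3);
+;   p0' = Clip1(p0 + d);   q0' = Clip1(q0 - d);
+;   if (abs(p2-p0) < iBeta)
+;     p1' = p1 + Clip3(-tc0, tc0, (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1);
+; (symmetrically for q1), where tc is tc0 plus one for each of the two side
+; conditions that hold, and pTC carries one tc0 per group of four pixels.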
+
+
+WELS_EXTERN DeblockLumaLt4V_sse2
+
+ALIGN 16
+
+DeblockLumaLt4V_sse2:
+ push ebp
+ mov ebp, esp
+ and esp, -16 ; fffffff0H
+ sub esp, 420 ; 000001a4H
+ mov eax, dword [ebp+8]
+ mov ecx, dword [ebp+12]
+
+ pxor xmm0, xmm0
+ push ebx
+ mov edx, dword [ebp+24]
+ movdqa [esp+424-384], xmm0
+ push esi
+
+ lea esi, [ecx+ecx*2]
+ push edi
+ mov edi, eax
+ sub edi, esi
+ movdqa xmm0, [edi]
+
+ lea esi, [ecx+ecx]
+ movdqa [esp+432-208], xmm0
+ mov edi, eax
+ sub edi, esi
+ movdqa xmm0, [edi]
+ movdqa [esp+448-208], xmm0
+
+ mov ebx, eax
+ sub ebx, ecx
+ movdqa xmm0, [ebx]
+ movdqa [esp+464-208], xmm0
+
+ movdqa xmm0, [eax]
+
+ add ecx, eax
+ movdqa [esp+480-208], xmm0
+ movdqa xmm0, [ecx]
+ mov dword [esp+432-404], ecx
+
+ movsx ecx, word [ebp+16]
+ movdqa [esp+496-208], xmm0
+ movdqa xmm0, [esi+eax]
+
+ movsx si, byte [edx]
+ movdqa [esp+512-208], xmm0
+ movd xmm0, ecx
+ movsx ecx, word [ebp+20]
+ movdqa xmm1, xmm0
+ punpcklwd xmm1, xmm0
+ pshufd xmm0, xmm1, 0
+ movdqa [esp+432-112], xmm0
+ movd xmm0, ecx
+ movsx cx, byte [edx+1]
+ movdqa xmm1, xmm0
+ punpcklwd xmm1, xmm0
+ mov dword [esp+432-408], ebx
+ movzx ebx, cx
+ pshufd xmm0, xmm1, 0
+ movd xmm1, ebx
+ movzx ebx, cx
+ movd xmm2, ebx
+ movzx ebx, cx
+ movzx ecx, cx
+ movd xmm4, ecx
+ movzx ecx, si
+ movd xmm5, ecx
+ movzx ecx, si
+ movd xmm6, ecx
+ movzx ecx, si
+ movd xmm7, ecx
+ movzx ecx, si
+ movdqa [esp+432-336], xmm0
+ movd xmm0, ecx
+
+ movsx cx, byte [edx+3]
+ movsx dx, byte [edx+2]
+ movd xmm3, ebx
+ punpcklwd xmm0, xmm4
+ movzx esi, cx
+ punpcklwd xmm6, xmm2
+ punpcklwd xmm5, xmm1
+ punpcklwd xmm0, xmm6
+ punpcklwd xmm7, xmm3
+ punpcklwd xmm7, xmm5
+ punpcklwd xmm0, xmm7
+ movdqa [esp+432-400], xmm0
+ movd xmm0, esi
+ movzx esi, cx
+ movd xmm2, esi
+ movzx esi, cx
+ movzx ecx, cx
+ movd xmm4, ecx
+ movzx ecx, dx
+ movd xmm3, esi
+ movd xmm5, ecx
+ punpcklwd xmm5, xmm0
+
+ movdqa xmm0, [esp+432-384]
+ movzx ecx, dx
+ movd xmm6, ecx
+ movzx ecx, dx
+ movzx edx, dx
+ punpcklwd xmm6, xmm2
+ movd xmm7, ecx
+ movd xmm1, edx
+
+ movdqa xmm2, [esp+448-208]
+ punpcklbw xmm2, xmm0
+
+ mov ecx, 4
+ movsx edx, cx
+ punpcklwd xmm7, xmm3
+ punpcklwd xmm7, xmm5
+ movdqa xmm5, [esp+496-208]
+ movdqa xmm3, [esp+464-208]
+ punpcklbw xmm5, xmm0
+ movdqa [esp+432-240], xmm5
+ movdqa xmm5, [esp+512-208]
+ punpcklbw xmm5, xmm0
+ movdqa [esp+432-352], xmm5
+ punpcklwd xmm1, xmm4
+ movdqa xmm4, [esp+432-208]
+ punpcklwd xmm1, xmm6
+ movdqa xmm6, [esp+480-208]
+ punpcklwd xmm1, xmm7
+ punpcklbw xmm6, xmm0
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+ movdqa xmm7, xmm3
+ psubw xmm7, xmm4
+ pabsw xmm7, xmm7
+ movdqa [esp+432-272], xmm4
+ movdqa xmm4, [esp+432-336]
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-288], xmm5
+ movdqa xmm7, xmm6
+ psubw xmm7, [esp+432-352]
+ pabsw xmm7, xmm7
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-256], xmm5
+ movdqa xmm5, xmm3
+ pavgw xmm5, xmm6
+ movdqa [esp+432-304], xmm5
+ movdqa xmm5, [esp+432-400]
+ psubw xmm5, [esp+432-288]
+ psubw xmm5, [esp+432-256]
+ movdqa [esp+432-224], xmm5
+ movdqa xmm5, xmm6
+ psubw xmm5, xmm3
+ movdqa [esp+432-32], xmm6
+ psubw xmm6, [esp+432-240]
+ movdqa xmm7, xmm5
+ movdqa [esp+432-384], xmm5
+ movdqa xmm5, [esp+432-112]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm5, xmm7
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm4
+ pcmpgtw xmm7, xmm6
+
+ pand xmm5, xmm7
+ movdqa xmm6, xmm3
+ psubw xmm6, xmm2
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm4
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+432-400]
+ pand xmm5, xmm7
+ movdqa xmm7, xmm6
+ pcmpeqw xmm6, xmm0
+ pcmpgtw xmm7, xmm0
+ por xmm7, xmm6
+ pand xmm5, xmm7
+ movdqa [esp+432-320], xmm5
+ movd xmm5, edx
+ movdqa xmm6, xmm5
+ punpcklwd xmm6, xmm5
+ pshufd xmm5, xmm6, 0
+ movdqa [esp+432-336], xmm5
+ movdqa xmm5, [esp+432-224]
+ movdqa [esp+432-368], xmm5
+ movdqa xmm6, xmm0
+ psubw xmm6, xmm5
+ movdqa xmm5, [esp+432-384]
+ psllw xmm5, 2
+ movdqa xmm7, xmm2
+ psubw xmm7, [esp+432-240]
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+432-336]
+ movdqa xmm5, [esp+432-368]
+ psraw xmm7, 3
+ pmaxsw xmm6, xmm7
+ pminsw xmm5, xmm6
+
+ pand xmm5, [esp+432-320]
+ movdqa xmm6, [esp+432-400]
+ movdqa [esp+432-64], xmm5
+ movdqa [esp+432-384], xmm6
+ movdqa xmm5, xmm0
+ psubw xmm5, xmm6
+ movdqa [esp+432-368], xmm5
+ movdqa xmm6, xmm5
+ movdqa xmm5, [esp+432-272]
+ paddw xmm5, [esp+432-304]
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm2
+ psubw xmm5, xmm7
+ psraw xmm5, 1
+ pmaxsw xmm6, xmm5
+ movdqa xmm5, [esp+432-384]
+ pminsw xmm5, xmm6
+
+ pand xmm5, [esp+432-320]
+ pand xmm5, [esp+432-288]
+ movdqa xmm6, [esp+432-240]
+ movdqa [esp+432-96], xmm5
+ movdqa xmm5, [esp+432-352]
+ paddw xmm5, [esp+432-304]
+ movdqa xmm7, xmm6
+ paddw xmm7, xmm6
+ movdqa xmm6, [esp+432-368]
+ psubw xmm5, xmm7
+
+ movdqa xmm7, [esp+496-208]
+ psraw xmm5, 1
+ pmaxsw xmm6, xmm5
+ movdqa xmm5, [esp+432-400]
+ pminsw xmm5, xmm6
+ pand xmm5, [esp+432-320]
+ pand xmm5, [esp+432-256]
+ movdqa xmm6, [esp+448-208]
+ punpckhbw xmm7, xmm0
+ movdqa [esp+432-352], xmm7
+
+ movdqa xmm7, [esp+512-208]
+ punpckhbw xmm6, xmm0
+ movdqa [esp+432-48], xmm5
+ movdqa xmm5, [esp+432-208]
+ movdqa [esp+432-368], xmm6
+ movdqa xmm6, [esp+464-208]
+ punpckhbw xmm7, xmm0
+ punpckhbw xmm5, xmm0
+ movdqa [esp+432-384], xmm7
+ punpckhbw xmm6, xmm0
+ movdqa [esp+432-400], xmm6
+
+ movdqa xmm7, [esp+432-400]
+ movdqa xmm6, [esp+480-208]
+ psubw xmm7, xmm5
+ movdqa [esp+432-16], xmm5
+ pabsw xmm7, xmm7
+ punpckhbw xmm6, xmm0
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-288], xmm5
+
+ movdqa xmm7, xmm6
+ psubw xmm7, [esp+432-384]
+ pabsw xmm7, xmm7
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-256], xmm5
+
+ movdqa xmm5, [esp+432-400]
+ movdqa [esp+432-80], xmm6
+ pavgw xmm5, xmm6
+ movdqa [esp+432-304], xmm5
+
+ movdqa xmm5, xmm1
+ psubw xmm5, [esp+432-288]
+ psubw xmm5, [esp+432-256]
+ movdqa [esp+432-224], xmm5
+ movdqa xmm5, xmm6
+ psubw xmm5, [esp+432-400]
+ psubw xmm6, [esp+432-352]
+ movdqa [esp+432-272], xmm5
+ movdqa xmm7, xmm5
+ movdqa xmm5, [esp+432-112]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm5, xmm7
+ movdqa xmm7, xmm4
+ pabsw xmm6, xmm6
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+432-368]
+
+ pand xmm5, xmm7
+ movdqa xmm7, [esp+432-400]
+ psubw xmm7, xmm6
+ psubw xmm6, [esp+432-352]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
+ pand xmm5, xmm4
+
+ paddw xmm2, [esp+432-96]
+ movdqa xmm4, xmm1
+ pcmpgtw xmm4, xmm0
+ movdqa xmm7, xmm1
+ pcmpeqw xmm7, xmm0
+ por xmm4, xmm7
+ pand xmm5, xmm4
+ movdqa xmm4, [esp+432-224]
+ movdqa [esp+432-320], xmm5
+ movdqa xmm5, [esp+432-272]
+ movdqa xmm7, xmm0
+ psubw xmm7, xmm4
+ psubw xmm0, xmm1
+ psllw xmm5, 2
+ paddw xmm6, xmm5
+ paddw xmm6, [esp+432-336]
+ movdqa xmm5, [esp+432-368]
+ movdqa [esp+432-336], xmm0
+ psraw xmm6, 3
+ pmaxsw xmm7, xmm6
+ pminsw xmm4, xmm7
+ pand xmm4, [esp+432-320]
+ movdqa xmm6, xmm0
+ movdqa xmm0, [esp+432-16]
+ paddw xmm0, [esp+432-304]
+ movdqa [esp+432-272], xmm4
+ movdqa xmm4, [esp+432-368]
+ paddw xmm4, xmm4
+ psubw xmm0, xmm4
+
+ movdqa xmm4, [esp+432-64]
+ psraw xmm0, 1
+ pmaxsw xmm6, xmm0
+ movdqa xmm0, [esp+432-400]
+ movdqa xmm7, xmm1
+ pminsw xmm7, xmm6
+ movdqa xmm6, [esp+432-320]
+ pand xmm7, xmm6
+ pand xmm7, [esp+432-288]
+ paddw xmm5, xmm7
+ packuswb xmm2, xmm5
+ movdqa xmm5, [esp+432-272]
+ paddw xmm0, xmm5
+ paddw xmm3, xmm4
+ packuswb xmm3, xmm0
+
+ movdqa xmm0, [esp+432-32]
+ psubw xmm0, xmm4
+ movdqa xmm4, [esp+432-80]
+ psubw xmm4, xmm5
+
+ movdqa xmm5, [esp+432-240]
+ paddw xmm5, [esp+432-48]
+ packuswb xmm0, xmm4
+ movdqa xmm4, [esp+432-384]
+ paddw xmm4, [esp+432-304]
+ movdqa [esp+480-208], xmm0
+ movdqa xmm0, [esp+432-352]
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm0
+
+ mov ecx, dword [esp+432-408]
+
+ mov edx, dword [esp+432-404]
+ psubw xmm4, xmm0
+ movdqa xmm0, [esp+432-336]
+ movdqa [edi], xmm2
+ psraw xmm4, 1
+ pmaxsw xmm0, xmm4
+ pminsw xmm1, xmm0
+ movdqa xmm0, [esp+480-208]
+
+ pop edi
+ pand xmm1, xmm6
+ pand xmm1, [esp+428-256]
+ movdqa [ecx], xmm3
+ paddw xmm7, xmm1
+ pop esi
+ packuswb xmm5, xmm7
+ movdqa [eax], xmm0
+ movdqa [edx], xmm5
+ pop ebx
+ mov esp, ebp
+ pop ebp
+ ret
+
+
+;*******************************************************************************
+; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; int32_t iBeta)
+;*******************************************************************************
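+
+; Hedged reading aid: with bS==4 the standard H.264 strong luma filter is
+; used when abs(p0-q0) < (iAlpha >> 2) + 2 and the side gradients are below
+; iBeta:
+;   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
+;   p1' = (p2 + p1 + p0 + q0 + 2) >> 2;
+;   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
+; (mirrored on the q side); otherwise only p0/q0 receive the weaker
+; p0' = (2*p1 + p0 + q1 + 2) >> 2 style update.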
+
+WELS_EXTERN DeblockLumaEq4V_sse2
+
+ALIGN 16
+
+DeblockLumaEq4V_sse2:
+
+ push ebp
+ mov ebp, esp
+ and esp, -16 ; fffffff0H
+ sub esp, 628 ; 00000274H
+ mov eax, dword [ebp+8]
+ mov ecx, dword [ebp+12]
+ push ebx
+ push esi
+
+ lea edx, [ecx*4]
+ pxor xmm0, xmm0
+ movdqa xmm2, xmm0
+
+ movdqa xmm0, [ecx+eax]
+ mov esi, eax
+ sub esi, edx
+ movdqa xmm3, [esi]
+ movdqa xmm5, [eax]
+ push edi
+ lea edi, [ecx+ecx]
+ lea ebx, [ecx+ecx*2]
+ mov dword [esp+640-600], edi
+ mov esi, eax
+ sub esi, edi
+ movdqa xmm1, [esi]
+ movdqa [esp+720-272], xmm0
+ mov edi, eax
+ sub edi, ecx
+ movdqa xmm4, [edi]
+ add ecx, eax
+ mov dword [esp+640-596], ecx
+
+ mov ecx, dword [esp+640-600]
+ movdqa xmm0, [ecx+eax]
+ movdqa [esp+736-272], xmm0
+
+ movdqa xmm0, [eax+ebx]
+ mov edx, eax
+ sub edx, ebx
+
+ movsx ebx, word [ebp+16]
+ movdqa xmm6, [edx]
+ add ecx, eax
+ movdqa [esp+752-272], xmm0
+ movd xmm0, ebx
+
+ movsx ebx, word [ebp+20]
+ movdqa xmm7, xmm0
+ punpcklwd xmm7, xmm0
+ pshufd xmm0, xmm7, 0
+ movdqa [esp+640-320], xmm0
+ movd xmm0, ebx
+ movdqa xmm7, xmm0
+ punpcklwd xmm7, xmm0
+ pshufd xmm0, xmm7, 0
+
+ movdqa xmm7, [esp+736-272]
+ punpcklbw xmm7, xmm2
+ movdqa [esp+640-416], xmm7
+ movdqa [esp+640-512], xmm0
+ movdqa xmm0, xmm1
+ movdqa [esp+672-272], xmm1
+ movdqa xmm1, xmm4
+ movdqa [esp+704-272], xmm5
+ punpcklbw xmm5, xmm2
+ punpcklbw xmm1, xmm2
+
+ movdqa xmm7, xmm5
+ psubw xmm7, xmm1
+ pabsw xmm7, xmm7
+ movdqa [esp+640-560], xmm7
+ punpcklbw xmm0, xmm2
+ movdqa [esp+688-272], xmm4
+ movdqa xmm4, [esp+720-272]
+ movdqa [esp+640-480], xmm0
+
+ movdqa xmm7, xmm1
+ psubw xmm7, xmm0
+
+ movdqa xmm0, [esp+640-512]
+ pabsw xmm7, xmm7
+ punpcklbw xmm4, xmm2
+ pcmpgtw xmm0, xmm7
+ movdqa [esp+640-384], xmm4
+ movdqa xmm7, xmm5
+ psubw xmm7, xmm4
+ movdqa xmm4, [esp+640-512]
+ movdqa [esp+656-272], xmm6
+ punpcklbw xmm6, xmm2
+ pabsw xmm7, xmm7
+ movdqa [esp+640-48], xmm2
+ movdqa [esp+640-368], xmm6
+ movdqa [esp+640-144], xmm1
+ movdqa [esp+640-400], xmm5
+ pcmpgtw xmm4, xmm7
+ pand xmm0, xmm4
+ movdqa xmm4, [esp+640-320]
+ pcmpgtw xmm4, [esp+640-560]
+ pand xmm0, xmm4
+
+ mov ebx, 2
+ movsx ebx, bx
+ movd xmm4, ebx
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm4
+ movdqa xmm4, [esp+640-320]
+ psraw xmm4, 2
+ pshufd xmm7, xmm7, 0
+ paddw xmm4, xmm7
+ movdqa [esp+640-576], xmm4
+ pcmpgtw xmm4, [esp+640-560]
+ movdqa [esp+640-560], xmm4
+
+ movdqa xmm4, [esp+640-512]
+ movdqa [esp+640-624], xmm7
+ movdqa xmm7, xmm1
+ psubw xmm7, xmm6
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
+
+ pand xmm4, [esp+640-560]
+ movdqa [esp+640-544], xmm4
+ movdqa xmm4, [esp+640-512]
+ movdqa xmm7, xmm5
+ psubw xmm7, [esp+640-416]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
+
+ pand xmm4, [esp+640-560]
+ movdqa [esp+640-560], xmm4
+
+ movdqa xmm4, [esp+640-544]
+ pandn xmm4, xmm6
+ movdqa [esp+640-16], xmm4
+ mov ebx, 4
+ movsx ebx, bx
+ movd xmm4, ebx
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm4
+ movdqa xmm4, xmm3
+ punpcklbw xmm4, xmm2
+ psllw xmm4, 1
+ paddw xmm4, xmm6
+ paddw xmm4, xmm6
+ paddw xmm4, xmm6
+ paddw xmm4, [esp+640-480]
+
+ movdqa xmm6, [esp+640-560]
+ pshufd xmm7, xmm7, 0
+ paddw xmm4, xmm1
+ movdqa [esp+640-592], xmm7
+ paddw xmm4, xmm5
+ paddw xmm4, xmm7
+ movdqa xmm7, [esp+640-416]
+ pandn xmm6, xmm7
+ movdqa [esp+640-80], xmm6
+ movdqa xmm6, [esp+752-272]
+ punpcklbw xmm6, xmm2
+ psllw xmm6, 1
+ paddw xmm6, xmm7
+ paddw xmm6, xmm7
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-384]
+
+ movdqa xmm7, [esp+640-480]
+ paddw xmm6, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, [esp+640-592]
+ psraw xmm6, 3
+ pand xmm6, [esp+640-560]
+ movdqa [esp+640-112], xmm6
+ movdqa xmm6, [esp+640-544]
+ pandn xmm6, xmm7
+ movdqa [esp+640-336], xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa [esp+640-528], xmm6
+ movdqa xmm6, [esp+640-368]
+ paddw xmm6, xmm7
+ movdqa xmm7, xmm1
+ psraw xmm4, 3
+ pand xmm4, [esp+640-544]
+ paddw xmm7, xmm5
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-624]
+ movdqa xmm7, [esp+640-528]
+
+ paddw xmm5, xmm1
+ psraw xmm6, 2
+ pand xmm7, xmm6
+
+ movdqa xmm6, [esp+640-384]
+ movdqa [esp+640-64], xmm7
+ movdqa xmm7, [esp+640-560]
+ pandn xmm7, xmm6
+ movdqa [esp+640-304], xmm7
+ movdqa xmm7, [esp+640-560]
+ movdqa [esp+640-528], xmm7
+ movdqa xmm7, [esp+640-416]
+ paddw xmm7, xmm6
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+640-624]
+ movdqa xmm5, [esp+640-528]
+ psraw xmm7, 2
+ pand xmm5, xmm7
+ movdqa [esp+640-32], xmm5
+
+ movdqa xmm5, [esp+640-544]
+ movdqa [esp+640-528], xmm5
+ movdqa xmm5, [esp+640-480]
+ movdqa xmm7, xmm5
+ paddw xmm7, xmm5
+ movdqa xmm5, xmm1
+ paddw xmm5, xmm6
+ paddw xmm6, [esp+640-592]
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+640-624]
+ movdqa xmm5, [esp+640-528]
+ psraw xmm7, 2
+ pandn xmm5, xmm7
+ movdqa xmm7, [esp+640-480]
+ paddw xmm7, xmm1
+ paddw xmm7, [esp+640-400]
+ movdqa xmm1, [esp+640-544]
+ movdqa [esp+640-352], xmm5
+ movdqa xmm5, [esp+640-368]
+ psllw xmm7, 1
+ paddw xmm7, xmm6
+ paddw xmm5, xmm7
+
+ movdqa xmm7, [esp+640-400]
+ psraw xmm5, 3
+ pand xmm1, xmm5
+ movdqa xmm5, [esp+640-480]
+ movdqa [esp+640-96], xmm1
+ movdqa xmm1, [esp+640-560]
+ movdqa [esp+640-528], xmm1
+ movdqa xmm1, [esp+640-384]
+ movdqa xmm6, xmm1
+ paddw xmm6, xmm1
+ paddw xmm1, [esp+640-400]
+ paddw xmm1, [esp+640-144]
+ paddw xmm7, xmm5
+ paddw xmm5, [esp+640-592]
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-624]
+ movdqa xmm7, [esp+640-528]
+ psraw xmm6, 2
+ psllw xmm1, 1
+ paddw xmm1, xmm5
+
+ movdqa xmm5, [esp+656-272]
+ pandn xmm7, xmm6
+ movdqa xmm6, [esp+640-416]
+ paddw xmm6, xmm1
+ movdqa xmm1, [esp+640-560]
+ psraw xmm6, 3
+ pand xmm1, xmm6
+
+ movdqa xmm6, [esp+704-272]
+ movdqa [esp+640-128], xmm1
+ movdqa xmm1, [esp+672-272]
+ punpckhbw xmm1, xmm2
+ movdqa [esp+640-448], xmm1
+ movdqa xmm1, [esp+688-272]
+ punpckhbw xmm1, xmm2
+ punpckhbw xmm6, xmm2
+ movdqa [esp+640-288], xmm7
+ punpckhbw xmm5, xmm2
+ movdqa [esp+640-496], xmm1
+ movdqa [esp+640-432], xmm6
+
+ movdqa xmm7, [esp+720-272]
+ punpckhbw xmm7, xmm2
+ movdqa [esp+640-464], xmm7
+
+ movdqa xmm7, [esp+736-272]
+ punpckhbw xmm7, xmm2
+ movdqa [esp+640-528], xmm7
+
+ movdqa xmm7, xmm6
+
+ psubw xmm6, [esp+640-464]
+ psubw xmm7, xmm1
+ pabsw xmm7, xmm7
+ movdqa [esp+640-560], xmm7
+ por xmm4, [esp+640-16]
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm1
+ psubw xmm7, [esp+640-448]
+
+ movdqa xmm1, [esp+640-512]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm1, xmm7
+ movdqa xmm7, [esp+640-512]
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+640-320]
+ pand xmm1, xmm7
+ movdqa xmm7, [esp+640-560]
+ pcmpgtw xmm6, xmm7
+ pand xmm1, xmm6
+
+ movdqa xmm6, [esp+640-576]
+ pcmpgtw xmm6, xmm7
+
+ movdqa xmm7, [esp+640-496]
+ punpckhbw xmm3, xmm2
+ movdqa [esp+640-560], xmm6
+ movdqa xmm6, [esp+640-512]
+ psubw xmm7, xmm5
+ pabsw xmm7, xmm7
+ pcmpgtw xmm6, xmm7
+
+ pand xmm6, [esp+640-560]
+ movdqa xmm7, [esp+640-432]
+ psubw xmm7, [esp+640-528]
+
+ psllw xmm3, 1
+ movdqa [esp+640-544], xmm6
+ movdqa xmm6, [esp+640-512]
+
+ movdqa xmm2, [esp+640-544]
+ paddw xmm3, xmm5
+ paddw xmm3, xmm5
+ paddw xmm3, xmm5
+ paddw xmm3, [esp+640-448]
+ paddw xmm3, [esp+640-496]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm6, xmm7
+ pand xmm6, [esp+640-560]
+ movdqa [esp+640-560], xmm6
+
+ movdqa xmm6, xmm0
+ pand xmm6, xmm4
+ movdqa xmm4, xmm0
+ pandn xmm4, [esp+640-368]
+ por xmm6, xmm4
+ movdqa xmm4, [esp+640-432]
+ paddw xmm3, xmm4
+ paddw xmm3, [esp+640-592]
+ psraw xmm3, 3
+ pand xmm3, xmm2
+ pandn xmm2, xmm5
+ por xmm3, xmm2
+ movdqa xmm7, xmm1
+ pand xmm7, xmm3
+ movdqa xmm3, [esp+640-64]
+ por xmm3, [esp+640-336]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm5
+ por xmm7, xmm2
+
+ movdqa xmm2, xmm0
+ pand xmm2, xmm3
+ movdqa xmm3, xmm0
+ pandn xmm3, [esp+640-480]
+ por xmm2, xmm3
+ packuswb xmm6, xmm7
+ movdqa [esp+640-336], xmm2
+ movdqa [esp+656-272], xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa xmm2, xmm5
+ paddw xmm2, [esp+640-448]
+ movdqa xmm3, xmm1
+ movdqa xmm7, [esp+640-496]
+ paddw xmm7, xmm4
+ paddw xmm2, xmm7
+ paddw xmm2, [esp+640-624]
+ movdqa xmm7, [esp+640-544]
+ psraw xmm2, 2
+ pand xmm6, xmm2
+ movdqa xmm2, [esp+640-448]
+ pandn xmm7, xmm2
+ por xmm6, xmm7
+ pand xmm3, xmm6
+ movdqa xmm6, xmm1
+ pandn xmm6, xmm2
+ paddw xmm2, [esp+640-496]
+ paddw xmm2, xmm4
+ por xmm3, xmm6
+ movdqa xmm6, [esp+640-336]
+ packuswb xmm6, xmm3
+ psllw xmm2, 1
+ movdqa [esp+672-272], xmm6
+ movdqa xmm6, [esp+640-96]
+ por xmm6, [esp+640-352]
+
+ movdqa xmm3, xmm0
+ pand xmm3, xmm6
+ movdqa xmm6, xmm0
+ pandn xmm6, [esp+640-144]
+ por xmm3, xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa [esp+640-352], xmm3
+ movdqa xmm3, [esp+640-464]
+ paddw xmm3, [esp+640-592]
+ paddw xmm2, xmm3
+ movdqa xmm3, [esp+640-448]
+ paddw xmm5, xmm2
+ movdqa xmm2, [esp+640-496]
+ psraw xmm5, 3
+ pand xmm6, xmm5
+ movdqa xmm5, [esp+640-464]
+ paddw xmm2, xmm5
+ paddw xmm5, [esp+640-432]
+ movdqa xmm4, xmm3
+ paddw xmm4, xmm3
+ paddw xmm4, xmm2
+ paddw xmm4, [esp+640-624]
+ movdqa xmm2, [esp+640-544]
+ paddw xmm3, [esp+640-592]
+ psraw xmm4, 2
+ pandn xmm2, xmm4
+ por xmm6, xmm2
+ movdqa xmm7, xmm1
+ pand xmm7, xmm6
+ movdqa xmm6, [esp+640-496]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm6
+ por xmm7, xmm2
+ movdqa xmm2, [esp+640-352]
+ packuswb xmm2, xmm7
+ movdqa [esp+688-272], xmm2
+ movdqa xmm2, [esp+640-128]
+ por xmm2, [esp+640-288]
+
+ movdqa xmm4, xmm0
+ pand xmm4, xmm2
+ paddw xmm5, xmm6
+ movdqa xmm2, xmm0
+ pandn xmm2, [esp+640-400]
+ por xmm4, xmm2
+ movdqa xmm2, [esp+640-528]
+ psllw xmm5, 1
+ paddw xmm5, xmm3
+ movdqa xmm3, [esp+640-560]
+ paddw xmm2, xmm5
+ psraw xmm2, 3
+ movdqa [esp+640-288], xmm4
+ movdqa xmm4, [esp+640-560]
+ pand xmm4, xmm2
+ movdqa xmm2, [esp+640-464]
+ movdqa xmm5, xmm2
+ paddw xmm5, xmm2
+ movdqa xmm2, [esp+640-432]
+ paddw xmm2, [esp+640-448]
+ movdqa xmm7, xmm1
+ paddw xmm5, xmm2
+ paddw xmm5, [esp+640-624]
+ movdqa xmm6, [esp+640-560]
+ psraw xmm5, 2
+ pandn xmm3, xmm5
+ por xmm4, xmm3
+ movdqa xmm3, [esp+640-32]
+ por xmm3, [esp+640-304]
+ pand xmm7, xmm4
+ movdqa xmm4, [esp+640-432]
+ movdqa xmm5, [esp+640-464]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm4
+ paddw xmm4, [esp+640-496]
+ por xmm7, xmm2
+ movdqa xmm2, [esp+640-288]
+ packuswb xmm2, xmm7
+ movdqa [esp+704-272], xmm2
+
+ movdqa xmm2, xmm0
+ pand xmm2, xmm3
+ movdqa xmm3, xmm0
+ pandn xmm3, [esp+640-384]
+ por xmm2, xmm3
+ movdqa [esp+640-304], xmm2
+ movdqa xmm2, [esp+640-528]
+ movdqa xmm3, xmm2
+ paddw xmm3, [esp+640-464]
+ paddw xmm3, xmm4
+ paddw xmm3, [esp+640-624]
+ psraw xmm3, 2
+ pand xmm6, xmm3
+ movdqa xmm3, [esp+640-560]
+ movdqa xmm4, xmm3
+ pandn xmm4, xmm5
+ por xmm6, xmm4
+ movdqa xmm7, xmm1
+ pand xmm7, xmm6
+ movdqa xmm6, [esp+640-304]
+ movdqa xmm4, xmm1
+ pandn xmm4, xmm5
+ por xmm7, xmm4
+
+ movdqa xmm4, xmm0
+ pandn xmm0, [esp+640-416]
+ packuswb xmm6, xmm7
+ movdqa xmm7, [esp+640-112]
+ por xmm7, [esp+640-80]
+ pand xmm4, xmm7
+ por xmm4, xmm0
+ movdqa xmm0, [esp+752-272]
+ punpckhbw xmm0, [esp+640-48]
+ psllw xmm0, 1
+ paddw xmm0, xmm2
+ paddw xmm0, xmm2
+ paddw xmm0, xmm2
+ paddw xmm0, xmm5
+ paddw xmm0, [esp+640-432]
+ paddw xmm0, [esp+640-496]
+ paddw xmm0, [esp+640-592]
+ psraw xmm0, 3
+ pand xmm0, xmm3
+ movdqa xmm7, xmm1
+ pandn xmm3, xmm2
+ por xmm0, xmm3
+ pand xmm7, xmm0
+
+ movdqa xmm0, [esp+656-272]
+ movdqa [edx], xmm0
+
+ movdqa xmm0, [esp+672-272]
+
+ mov edx, dword [esp+640-596]
+ movdqa [esi], xmm0
+ movdqa xmm0, [esp+688-272]
+ movdqa [edi], xmm0
+ movdqa xmm0, [esp+704-272]
+
+ pop edi
+ pandn xmm1, xmm2
+ movdqa [eax], xmm0
+ por xmm7, xmm1
+ pop esi
+ packuswb xmm4, xmm7
+ movdqa [edx], xmm6
+ movdqa [ecx], xmm4
+ pop ebx
+ mov esp, ebp
+ pop ebp
+ ret
+
+
+;********************************************************************************
+;
+; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+;
+;********************************************************************************
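+
+; Hedged scalar model of the gather-and-transpose done here (assuming the
+; final store order restores ascending rows): sixteen 8-byte image rows
+; become eight 16-byte rows at pDst, ready for the vertical-edge filters:
+;   for (i = 0; i < 16; i++)
+;       for (j = 0; j < 8; j++)
+;           pDst[j*16 + i] = pPixY[i*iStride + j];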
+
+WELS_EXTERN DeblockLumaTransposeH2V_sse2
+
+ALIGN 16
+
+DeblockLumaTransposeH2V_sse2:
+ push ebp
+ push ebx
+ mov ebp, esp
+ and esp,0FFFFFFF0h
+ sub esp, 10h
+
+ mov eax, [ebp + 0Ch]
+ mov ecx, [ebp + 10h]
+ lea edx, [eax + ecx * 8]
+ lea ebx, [ecx*3]
+
+ movq xmm0, [eax]
+ movq xmm7, [edx]
+ punpcklqdq xmm0, xmm7
+ movq xmm1, [eax + ecx]
+ movq xmm7, [edx + ecx]
+ punpcklqdq xmm1, xmm7
+ movq xmm2, [eax + ecx*2]
+ movq xmm7, [edx + ecx*2]
+ punpcklqdq xmm2, xmm7
+ movq xmm3, [eax + ebx]
+ movq xmm7, [edx + ebx]
+ punpcklqdq xmm3, xmm7
+
+ lea eax, [eax + ecx * 4]
+ lea edx, [edx + ecx * 4]
+ movq xmm4, [eax]
+ movq xmm7, [edx]
+ punpcklqdq xmm4, xmm7
+ movq xmm5, [eax + ecx]
+ movq xmm7, [edx + ecx]
+ punpcklqdq xmm5, xmm7
+ movq xmm6, [eax + ecx*2]
+ movq xmm7, [edx + ecx*2]
+ punpcklqdq xmm6, xmm7
+
+ movdqa [esp], xmm0
+ movq xmm7, [eax + ebx]
+ movq xmm0, [edx + ebx]
+ punpcklqdq xmm7, xmm0
+ movdqa xmm0, [esp]
+
+ SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+ ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+
+ mov eax, [ebp + 14h]
+ movdqa [eax], xmm4
+ movdqa [eax + 10h], xmm2
+ movdqa [eax + 20h], xmm3
+ movdqa [eax + 30h], xmm7
+ movdqa [eax + 40h], xmm5
+ movdqa [eax + 50h], xmm1
+ movdqa [eax + 60h], xmm6
+ movdqa [eax + 70h], xmm0
+
+ mov esp, ebp
+ pop ebx
+ pop ebp
+ ret
+
+
+
+;*******************************************************************************************
+;
+; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+;
+;*******************************************************************************************
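+
+; Hedged scalar model of the inverse transpose (same ordering assumption):
+;   for (i = 0; i < 16; i++)
+;       for (j = 0; j < 8; j++)
+;           pPixY[i*iStride + j] = pSrc[j*16 + i];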
+
+WELS_EXTERN DeblockLumaTransposeV2H_sse2
+
+ALIGN 16
+
+DeblockLumaTransposeV2H_sse2:
+ push ebp
+ mov ebp, esp
+
+ and esp, 0FFFFFFF0h
+ sub esp, 10h
+
+ mov eax, [ebp + 10h]
+ mov ecx, [ebp + 0Ch]
+ mov edx, [ebp + 08h]
+
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 10h]
+ movdqa xmm2, [eax + 20h]
+ movdqa xmm3, [eax + 30h]
+ movdqa xmm4, [eax + 40h]
+ movdqa xmm5, [eax + 50h]
+ movdqa xmm6, [eax + 60h]
+ movdqa xmm7, [eax + 70h]
+
+ SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+ ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+
+ lea eax, [ecx * 3]
+
+ movq [edx], xmm4
+ movq [edx + ecx], xmm2
+ movq [edx + ecx*2], xmm3
+ movq [edx + eax], xmm7
+
+ lea edx, [edx + ecx*4]
+ movq [edx], xmm5
+ movq [edx + ecx], xmm1
+ movq [edx + ecx*2], xmm6
+ movq [edx + eax], xmm0
+
+ psrldq xmm4, 8
+ psrldq xmm2, 8
+ psrldq xmm3, 8
+ psrldq xmm7, 8
+ psrldq xmm5, 8
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+ psrldq xmm0, 8
+
+ lea edx, [edx + ecx*4]
+ movq [edx], xmm4
+ movq [edx + ecx], xmm2
+ movq [edx + ecx*2], xmm3
+ movq [edx + eax], xmm7
+
+ lea edx, [edx + ecx*4]
+ movq [edx], xmm5
+ movq [edx + ecx], xmm1
+ movq [edx + ecx*2], xmm6
+ movq [edx + eax], xmm0
+
+
+ mov esp, ebp
+ pop ebp
ret
\ No newline at end of file
--- a/codec/encoder/core/asm/mc_chroma.asm
+++ b/codec/encoder/core/asm/mc_chroma.asm
@@ -1,317 +1,317 @@
-;*!
-;* \copy
-;* Copyright (c) 2004-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* mc_chroma.asm
-;*
-;* Abstract
-;* mmx motion compensation for chroma
-;*
-;* History
-;* 10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Rounding constants used by the chroma interpolation (+32 before >> 6)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
- dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
- dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
- push esi
- push edi
- push ebx
-
- mov eax, [esp +12 + 20]
- movd mm3, [eax]
- WELS_Zero mm7
- punpcklbw mm3, mm3
- movq mm4, mm3
- punpcklwd mm3, mm3
- punpckhwd mm4, mm4
-
- movq mm5, mm3
- punpcklbw mm3, mm7
- punpckhbw mm5, mm7
-
- movq mm6, mm4
- punpcklbw mm4, mm7
- punpckhbw mm6, mm7
-
- mov esi, [esp +12+ 4]
- mov eax, [esp + 12 + 8]
- mov edi, [esp + 12 + 12]
- mov edx, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- lea ebx, [esi + eax]
- movd mm0, [esi]
- movd mm1, [esi+1]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
-.xloop:
-
- pmullw mm0, mm3
- pmullw mm1, mm5
- paddw mm0, mm1
-
- movd mm1, [ebx]
- punpcklbw mm1, mm7
- movq mm2, mm1
- pmullw mm1, mm4
- paddw mm0, mm1
-
- movd mm1, [ebx+1]
- punpcklbw mm1, mm7
- movq mm7, mm1
- pmullw mm1,mm6
- paddw mm0, mm1
- movq mm1,mm7
-
- paddw mm0, [h264_d0x20_mmx]
- psrlw mm0, 6
-
- WELS_Zero mm7
- packuswb mm0, mm7
- movd [edi], mm0
-
- movq mm0, mm2
-
- lea edi, [edi +edx ]
- lea ebx, [ebx + eax]
-
- dec ecx
- jnz near .xloop
- WELSEMMS
- pop ebx
- pop edi
- pop esi
- ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
- push esi
- push edi
- push ebx
-
- mov eax, [esp +12 + 20]
- movd xmm3, [eax]
- WELS_Zero xmm7
- punpcklbw xmm3, xmm3
- punpcklwd xmm3, xmm3
-
- movdqa xmm4, xmm3
- punpckldq xmm3, xmm3
- punpckhdq xmm4, xmm4
- movdqa xmm5, xmm3
- movdqa xmm6, xmm4
-
- punpcklbw xmm3, xmm7
- punpckhbw xmm5, xmm7
- punpcklbw xmm4, xmm7
- punpckhbw xmm6, xmm7
-
- mov esi, [esp +12+ 4]
- mov eax, [esp + 12 + 8]
- mov edi, [esp + 12 + 12]
- mov edx, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- lea ebx, [esi + eax]
- movq xmm0, [esi]
- movq xmm1, [esi+1]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
-.xloop:
-
- pmullw xmm0, xmm3
- pmullw xmm1, xmm5
- paddw xmm0, xmm1
-
- movq xmm1, [ebx]
- punpcklbw xmm1, xmm7
- movdqa xmm2, xmm1
- pmullw xmm1, xmm4
- paddw xmm0, xmm1
-
- movq xmm1, [ebx+1]
- punpcklbw xmm1, xmm7
- movdqa xmm7, xmm1
- pmullw xmm1, xmm6
- paddw xmm0, xmm1
- movdqa xmm1,xmm7
-
- paddw xmm0, [h264_d0x20_sse2]
- psrlw xmm0, 6
-
- WELS_Zero xmm7
- packuswb xmm0, xmm7
- movq [edi], xmm0
-
- movdqa xmm0, xmm2
-
- lea edi, [edi +edx ]
- lea ebx, [ebx + eax]
-
- dec ecx
- jnz near .xloop
-
- pop ebx
- pop edi
- pop esi
- ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
- push ebx
- push esi
- push edi
-
- mov eax, [esp + 12 + 20]
-
- pxor xmm7, xmm7
- movd xmm5, [eax]
- punpcklwd xmm5, xmm5
- punpckldq xmm5, xmm5
- movdqa xmm6, xmm5
- punpcklqdq xmm5, xmm5
- punpckhqdq xmm6, xmm6
-
- mov eax, [esp + 12 + 4]
- mov edx, [esp + 12 + 8]
- mov esi, [esp + 12 + 12]
- mov edi, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- sub esi, edi
- sub esi, edi
- movdqa xmm7, [h264_d0x20_sse2]
-
- movdqu xmm0, [eax]
- movdqa xmm1, xmm0
- psrldq xmm1, 1
- punpcklbw xmm0, xmm1
-
-.hloop_chroma:
- lea esi, [esi+2*edi]
-
- movdqu xmm2, [eax+edx]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm4, xmm2
-
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm2, xmm6
- paddw xmm0, xmm2
- paddw xmm0, xmm7
- psrlw xmm0, 6
- packuswb xmm0, xmm0
- movq [esi],xmm0
-
- lea eax, [eax+2*edx]
- movdqu xmm2, [eax]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm0, xmm2
-
- pmaddubsw xmm4, xmm5
- pmaddubsw xmm2, xmm6
- paddw xmm4, xmm2
- paddw xmm4, xmm7
- psrlw xmm4, 6
- packuswb xmm4, xmm4
- movq [esi+edi],xmm4
-
- sub ecx, 2
- jnz .hloop_chroma
- pop edi
- pop esi
- pop ebx
-
- ret
-
-
+;*!
+;* \copy
+;* Copyright (c) 2004-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* mc_chroma.asm
+;*
+;* Abstract
+;* mmx motion compensation for chroma
+;*
+;* History
+;* 10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Rounding constants used by the chroma interpolation (+32 before >> 6)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+ dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+ dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *src,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
+; int32_t iHeigh );
+;*******************************************************************************
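+
+; Hedged scalar model of the bilinear chroma interpolation computed per
+; output pixel (pABCD holds the four weights A,B,C,D, which sum to 64):
+;   pDst[x] = (A*pSrc[x]            + B*pSrc[x+1] +
+;              C*pSrc[x+iSrcStride] + D*pSrc[x+iSrcStride+1] + 32) >> 6;
+; The bottom source row of one iteration becomes the top row of the next,
+; so each iteration reads only one new source row.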
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:
+ push esi
+ push edi
+ push ebx
+
+ mov eax, [esp +12 + 20]
+ movd mm3, [eax]
+ WELS_Zero mm7
+ punpcklbw mm3, mm3
+ movq mm4, mm3
+ punpcklwd mm3, mm3
+ punpckhwd mm4, mm4
+
+ movq mm5, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm5, mm7
+
+ movq mm6, mm4
+ punpcklbw mm4, mm7
+ punpckhbw mm6, mm7
+
+ mov esi, [esp +12+ 4]
+ mov eax, [esp + 12 + 8]
+ mov edi, [esp + 12 + 12]
+ mov edx, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
+ lea ebx, [esi + eax]
+ movd mm0, [esi]
+ movd mm1, [esi+1]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+.xloop:
+
+ pmullw mm0, mm3
+ pmullw mm1, mm5
+ paddw mm0, mm1
+
+ movd mm1, [ebx]
+ punpcklbw mm1, mm7
+ movq mm2, mm1
+ pmullw mm1, mm4
+ paddw mm0, mm1
+
+ movd mm1, [ebx+1]
+ punpcklbw mm1, mm7
+ movq mm7, mm1
+ pmullw mm1,mm6
+ paddw mm0, mm1
+ movq mm1,mm7
+
+ paddw mm0, [h264_d0x20_mmx]
+ psrlw mm0, 6
+
+ WELS_Zero mm7
+ packuswb mm0, mm7
+ movd [edi], mm0
+
+ movq mm0, mm2
+
+ lea edi, [edi +edx ]
+ lea ebx, [ebx + eax]
+
+ dec ecx
+ jnz near .xloop
+ WELSEMMS
+ pop ebx
+ pop edi
+ pop esi
+ ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
+; int32_t iheigh );
+;*******************************************************************************
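+
+; Same bilinear weighting as McChromaWidthEq4_mmx above (hedged model:
+; pDst[x] = (A*S00 + B*S01 + C*S10 + D*S11 + 32) >> 6, with S00..S11 the
+; four neighbours of output pixel x), widened to eight pixels per row in
+; xmm registers.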
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:
+ push esi
+ push edi
+ push ebx
+
+ mov eax, [esp +12 + 20]
+ movd xmm3, [eax]
+ WELS_Zero xmm7
+ punpcklbw xmm3, xmm3
+ punpcklwd xmm3, xmm3
+
+ movdqa xmm4, xmm3
+ punpckldq xmm3, xmm3
+ punpckhdq xmm4, xmm4
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm4
+
+ punpcklbw xmm3, xmm7
+ punpckhbw xmm5, xmm7
+ punpcklbw xmm4, xmm7
+ punpckhbw xmm6, xmm7
+
+ mov esi, [esp +12+ 4]
+ mov eax, [esp + 12 + 8]
+ mov edi, [esp + 12 + 12]
+ mov edx, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
+ lea ebx, [esi + eax]
+ movq xmm0, [esi]
+ movq xmm1, [esi+1]
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
+.xloop:
+
+ pmullw xmm0, xmm3
+ pmullw xmm1, xmm5
+ paddw xmm0, xmm1
+
+ movq xmm1, [ebx]
+ punpcklbw xmm1, xmm7
+ movdqa xmm2, xmm1
+ pmullw xmm1, xmm4
+ paddw xmm0, xmm1
+
+ movq xmm1, [ebx+1]
+ punpcklbw xmm1, xmm7
+ movdqa xmm7, xmm1
+ pmullw xmm1, xmm6
+ paddw xmm0, xmm1
+ movdqa xmm1,xmm7
+
+ paddw xmm0, [h264_d0x20_sse2]
+ psrlw xmm0, 6
+
+ WELS_Zero xmm7
+ packuswb xmm0, xmm7
+ movq [edi], xmm0
+
+ movdqa xmm0, xmm2
+
+ lea edi, [edi +edx ]
+ lea ebx, [ebx + eax]
+
+ dec ecx
+ jnz near .xloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
+; int32_t iHeigh);
+;***********************************************************************
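+
+; Hedged note: this SSSE3 variant interleaves each row with its one-pixel
+; shift so bytes arrive as (src[x], src[x+1]) pairs; a single pmaddubsw
+; against the packed (A,B) weights in xmm5, or (C,D) in xmm6, then yields
+; A*src[x] + B*src[x+1] per word, with the usual +32 rounding and >> 6.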
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:
+ push ebx
+ push esi
+ push edi
+
+ mov eax, [esp + 12 + 20]
+
+ pxor xmm7, xmm7
+ movd xmm5, [eax]
+ punpcklwd xmm5, xmm5
+ punpckldq xmm5, xmm5
+ movdqa xmm6, xmm5
+ punpcklqdq xmm5, xmm5
+ punpckhqdq xmm6, xmm6
+
+ mov eax, [esp + 12 + 4]
+ mov edx, [esp + 12 + 8]
+ mov esi, [esp + 12 + 12]
+ mov edi, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
+ sub esi, edi
+ sub esi, edi
+ movdqa xmm7, [h264_d0x20_sse2]
+
+ movdqu xmm0, [eax]
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+ punpcklbw xmm0, xmm1
+
+.hloop_chroma:
+ lea esi, [esi+2*edi]
+
+ movdqu xmm2, [eax+edx]
+ movdqa xmm3, xmm2
+ psrldq xmm3, 1
+ punpcklbw xmm2, xmm3
+ movdqa xmm4, xmm2
+
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm2, xmm6
+ paddw xmm0, xmm2
+ paddw xmm0, xmm7
+ psrlw xmm0, 6
+ packuswb xmm0, xmm0
+ movq [esi],xmm0
+
+ lea eax, [eax+2*edx]
+ movdqu xmm2, [eax]
+ movdqa xmm3, xmm2
+ psrldq xmm3, 1
+ punpcklbw xmm2, xmm3
+ movdqa xmm0, xmm2
+
+ pmaddubsw xmm4, xmm5
+ pmaddubsw xmm2, xmm6
+ paddw xmm4, xmm2
+ paddw xmm4, xmm7
+ psrlw xmm4, 6
+ packuswb xmm4, xmm4
+ movq [esi+edi],xmm4
+
+ sub ecx, 2
+ jnz .hloop_chroma
+ pop edi
+ pop esi
+ pop ebx
+
+ ret
+
+
--- a/processing/build/linux/makefile
+++ b/processing/build/linux/makefile
@@ -1,94 +1,94 @@
-NASM = 1
-NAME = libwelsvp
-
-OUTDIR = ../../../bin/linux
-BINDIR = ../../bin
-OBJDIR = ../../obj
-SRCDIRS = ../../src/asm \
- ../../src/common \
- ../../src/adaptivequantization \
- ../../src/backgounddetection \
- ../../src/denoise \
- ../../src/downsample \
- ../../src/scenechangedetection \
- ../../src/vaacalc \
- ../../src/complexityanalysis
-SRCDIRS += ../../src/imagerotate
-
-
-TARGETLIB = $(BINDIR)/$(NAME).so
-
-CC = $(shell which gcc)
-AS = $(shell which nasm)
-GCC = gcc -m32
-
-CPPFLAGS = -Wall -g -O3
-ifeq ($(NASM), 1)
-CPPFLAGS += -DX86_ASM
-endif
-ASMFLAGS = -f elf -DNOPREFIX -I ../../src/asm/
-LDFLAGS = -lstdc++ -ldl
-
-SRCEXTS = .cpp
-ifeq ($(NASM), 1)
-SRCEXTS += .asm
-endif
-HDREXTS = .h
-SOURCES = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
-HEADERS = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
-SRC_CPP = $(filter %.cpp,$(SOURCES))
-SRC_ASM = $(filter %.asm,$(SOURCES))
-OBJS = $(addsuffix .o, $(basename $(SOURCES)))
-DEPS = $(OBJS:.o=.d)
-
-DEP_OPT = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \
- echo "-MM -MP"; else echo "-M"; fi )
-DEPEND_cpp.d = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
-DEPEND_asm.d = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
-COMPILE.cpp = $(GCC) $(CPPFLAGS) -c
-COMPILE.asm = $(AS) $(ASMFLAGS)
-LINK = $(GCC) $(LDFLAGS)
-
-.PHONY: all objs tags ctags clean distclean
-
-.SUFFIXES:
-
-all: $(TARGETLIB)
-
-%.d:%.cpp
- @echo -n $(dir $<) > $@
- @$(DEPEND_cpp.d) $< >> $@
-
-%.d:%.asm
- @echo -n $(dir $<) > $@
- @$(DEPEND_asm.d) $< >> $@
-
-objs:$(OBJS)
-
-%.o:%.cpp
- $(COMPILE.cpp) $< -o $@
-
-%.o:%.asm
- $(COMPILE.asm) $< -o $@
-
-tags: $(HEADERS) $(SOURCES)
- etags $(HEADERS) $(SOURCES)
-
-ctags: $(HEADERS) $(SOURCES)
- ctags $(HEADERS) $(SOURCES)
-
-$(TARGETLIB):$(OBJS)
- @if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi
- $(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@
- @echo produce the lib to $(TARGETLIB).
- @if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi
- @cp -f $(TARGETLIB) $(OUTDIR)
- @cp -f $(TARGETLIB) ../../../testbin
- @echo copy the lib to $(OUTDIR).
-
-clean:
- rm -f $(OBJS) $(TARGETLIB)
-
-distclean: clean
- rm -f $(DEPS) TAGS
-
+NASM = 1
+NAME = libwelsvp
+
+OUTDIR = ../../../bin/linux
+BINDIR = ../../bin
+OBJDIR = ../../obj
+SRCDIRS = ../../src/asm \
+ ../../src/common \
+ ../../src/adaptivequantization \
+ ../../src/backgounddetection \
+ ../../src/denoise \
+ ../../src/downsample \
+ ../../src/scenechangedetection \
+ ../../src/vaacalc \
+ ../../src/complexityanalysis
+SRCDIRS += ../../src/imagerotate
+
+
+TARGETLIB = $(BINDIR)/$(NAME).so
+
+CC = $(shell which gcc)
+AS = $(shell which nasm)
+GCC = gcc -m32
+
+CPPFLAGS = -Wall -g -O3
+ifeq ($(NASM), 1)
+CPPFLAGS += -DX86_ASM
+endif
+ASMFLAGS = -f elf -DNOPREFIX -I ../../src/asm/
+LDFLAGS = -lstdc++ -ldl
+
+SRCEXTS = .cpp
+ifeq ($(NASM), 1)
+SRCEXTS += .asm
+endif
+HDREXTS = .h
+SOURCES = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
+HEADERS = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
+SRC_CPP = $(filter %.cpp,$(SOURCES))
+SRC_ASM = $(filter %.asm,$(SOURCES))
+OBJS = $(addsuffix .o, $(basename $(SOURCES)))
+DEPS = $(OBJS:.o=.d)
+
+DEP_OPT = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \
+ echo "-MM -MP"; else echo "-M"; fi )
+DEPEND_cpp.d = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
+DEPEND_asm.d = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
+COMPILE.cpp = $(GCC) $(CPPFLAGS) -c
+COMPILE.asm = $(AS) $(ASMFLAGS)
+LINK = $(GCC) $(LDFLAGS)
+
+.PHONY: all objs tags ctags clean distclean
+
+.SUFFIXES:
+
+all: $(TARGETLIB)
+
+%.d:%.cpp
+ @echo -n $(dir $<) > $@
+ @$(DEPEND_cpp.d) $< >> $@
+
+%.d:%.asm
+ @echo -n $(dir $<) > $@
+ @$(DEPEND_asm.d) $< >> $@
+
+objs:$(OBJS)
+
+%.o:%.cpp
+ $(COMPILE.cpp) $< -o $@
+
+%.o:%.asm
+ $(COMPILE.asm) $< -o $@
+
+tags: $(HEADERS) $(SOURCES)
+ etags $(HEADERS) $(SOURCES)
+
+ctags: $(HEADERS) $(SOURCES)
+ ctags $(HEADERS) $(SOURCES)
+
+$(TARGETLIB):$(OBJS)
+ @if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi
+ $(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@
+ @echo produce the lib to $(TARGETLIB).
+ @if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi
+ @cp -f $(TARGETLIB) $(OUTDIR)
+ @cp -f $(TARGETLIB) ../../../testbin
+ @echo copy the lib to $(OUTDIR).
+
+clean:
+ rm -f $(OBJS) $(TARGETLIB)
+
+distclean: clean
+ rm -f $(DEPS) TAGS
+
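+# Example use (assumed invocation, run from this directory):
+#   make            # builds $(TARGETLIB) and copies it to $(OUTDIR) and testbin
+#   make distclean  # removes objects, the lib, generated .d files and TAGS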
--- a/processing/src/asm/denoisefilter.asm
+++ b/processing/src/asm/denoisefilter.asm
@@ -1,263 +1,263 @@
-;*!
-;* \copy
-;* Copyright (c) 2010-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* denoisefilter.asm
-;*
-;* Abstract
-;* denoise for SVC2.1
-;* History
-;* 4/13/2010 Created
-;* 7/30/2010 Modified
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Constant
-;***********************************************************************
-SECTION .rodata align=16
-
-sse2_32 times 8 dw 32
-sse2_20 times 8 dw 20
-
-
-BITS 32
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-%macro WEIGHT_LINE 9
- movq %2, %9
- punpcklbw %2, %7
- movdqa %8, %2
-
- movdqa %1, %6
- psubusb %1, %8
- psubusb %8, %6
- por %8, %1 ; ABS(curPixel - centerPixel);
-
- movdqa %1, %3
- psubusb %1, %8
-
- pmullw %1, %1
- psrlw %1, 5
- pmullw %2, %1
- paddusw %4, %1
- paddusw %5, %2
-%endmacro
-
-%macro WEIGHT_LINE1_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- paddw %3, %2
-%endmacro
-
-%macro WEIGHT_LINE2_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- paddw %3, %2
-%endmacro
-
-%macro WEIGHT_LINE3_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- pmullw %2, [sse2_20]
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-%endmacro
-
-ALIGN 16
-WELS_EXTERN BilateralLumaFilter8_sse2
-;***********************************************************************
-; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-; 1 2 3
-; 4 0 5
-; 6 7 8
-; 0: the center point
-%define pushsize 4
-%define pixel esp + pushsize + 4
-%define stride esp + pushsize + 8
-BilateralLumaFilter8_sse2:
- push ebx
-
- pxor xmm7, xmm7
- mov eax, [pixel]
- mov ebx, eax
- movq xmm6, [eax]
- punpcklbw xmm6, xmm7
- movdqa xmm3, [sse2_32]
- pxor xmm4, xmm4 ; nTotWeight
- pxor xmm5, xmm5 ; nSum
-
- dec eax
- mov ecx, [stride]
-
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 4
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 5
-
- sub eax, ecx
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 1
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 1] ; pixel 2
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 3
-
- lea eax, [eax + ecx * 2]
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 6
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 1] ; pixel 7
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 8
-
- pcmpeqw xmm0, xmm0
- psrlw xmm0, 15
- psllw xmm0, 8
- psubusw xmm0, xmm4
- pmullw xmm0, xmm6
- paddusw xmm5, xmm0
- psrlw xmm5, 8
- packuswb xmm5, xmm5
- movq [ebx], xmm5
-
- pop ebx
- ret
-
-WELS_EXTERN WaverageChromaFilter8_sse2
-;***********************************************************************
-; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-;5x5 filter:
-;1 1 2 1 1
-;1 2 4 2 1
-;2 4 20 4 2
-;1 2 4 2 1
-;1 1 2 1 1
-
-ALIGN 16
-WaverageChromaFilter8_sse2:
- mov edx, [esp + 4] ; pixels
- mov ecx, [esp + 8] ; stride
-
- mov eax, ecx
- add eax, eax
- sub edx, eax ; pixels - 2 * stride
- sub edx, 2
-
- pxor xmm0, xmm0
- pxor xmm3, xmm3
-
- movdqu xmm1, [edx]
- WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
-
- movdqu xmm1, [edx + ecx]
- WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
-
- add edx, eax
- movdqu xmm1, [edx]
- WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0
-
- movdqu xmm1, [edx + ecx]
- WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
-
- movdqu xmm1, [edx + ecx * 2]
- WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
-
- psrlw xmm3, 6
- packuswb xmm3, xmm3
- movq [edx + 2], xmm3
-
+;*!
+;* \copy
+;* Copyright (c) 2010-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* predenoise.asm
+;*
+;* Abstract
+;* denoise for SVC2.1
+;* History
+;* 4/13/2010 Created
+;* 7/30/2010 Modified
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Constant
+;***********************************************************************
+SECTION .rodata align=16
+
+sse2_32 times 8 dw 32
+sse2_20 times 8 dw 20
+
+
+BITS 32
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+%macro WEIGHT_LINE 9
+ movq %2, %9
+ punpcklbw %2, %7
+ movdqa %8, %2
+
+ movdqa %1, %6
+ psubusb %1, %8
+ psubusb %8, %6
+ por %8, %1 ; ABS(curPixel - centerPixel);
+
+ movdqa %1, %3
+ psubusb %1, %8
+
+ pmullw %1, %1
+ psrlw %1, 5
+ pmullw %2, %1
+ paddusw %4, %1
+ paddusw %5, %2
+%endmacro
+
+%macro WEIGHT_LINE1_UV 4
+ movdqa %2, %1
+ punpcklbw %2, %4
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 1
+ punpcklbw %2, %4
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 2
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 3
+ punpcklbw %2, %4
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklbw %2, %4
+ paddw %3, %2
+%endmacro
+
+%macro WEIGHT_LINE2_UV 4
+ movdqa %2, %1
+ punpcklbw %2, %4
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 1
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 2
+ punpcklbw %2, %4
+ psllw %2, 2
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 3
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklbw %2, %4
+ paddw %3, %2
+%endmacro
+
+%macro WEIGHT_LINE3_UV 4
+ movdqa %2, %1
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 1
+ punpcklbw %2, %4
+ psllw %2, 2
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 2
+ punpcklbw %2, %4
+ pmullw %2, [sse2_20]
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 3
+ punpcklbw %2, %4
+ psllw %2, 2
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
+%endmacro
+
+ALIGN 16
+WELS_EXTERN BilateralLumaFilter8_sse2
+;***********************************************************************
+; void BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+; 1 2 3
+; 4 0 5
+; 6 7 8
+; 0: the center point
+%define pushsize 4
+%define pixel esp + pushsize + 4
+%define stride esp + pushsize + 8
+BilateralLumaFilter8_sse2:
+ push ebx
+
+ pxor xmm7, xmm7
+ mov eax, [pixel]
+ mov ebx, eax
+ movq xmm6, [eax]
+ punpcklbw xmm6, xmm7
+ movdqa xmm3, [sse2_32]
+ pxor xmm4, xmm4 ; nTotWeight
+ pxor xmm5, xmm5 ; nSum
+
+ dec eax
+ mov ecx, [stride]
+
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 4
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 5
+
+ sub eax, ecx
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 1
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 1] ; pixel 2
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 3
+
+ lea eax, [eax + ecx * 2]
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 6
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 1] ; pixel 7
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 8
+
+ pcmpeqw xmm0, xmm0
+ psrlw xmm0, 15
+ psllw xmm0, 8
+ psubusw xmm0, xmm4
+ pmullw xmm0, xmm6
+ paddusw xmm5, xmm0
+ psrlw xmm5, 8
+ packuswb xmm5, xmm5
+ movq [ebx], xmm5
+
+ pop ebx
+ ret
+
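For reference, the per-pixel arithmetic that WEIGHT_LINE accumulates for BilateralLumaFilter8_sse2 can be written out in scalar C. This is a sketch reconstructed from the macro above, assuming the weight formula ((32 - |cur - center|)^2) >> 5; the name BilateralLumaFilter8_ref is hypothetical and not part of this patch.

    /* Hypothetical scalar model of BilateralLumaFilter8_sse2: 8 pixels are
     * filtered against their 3x3 neighbourhood; the centre pixel keeps
     * whatever weight (out of 256) the neighbours do not claim. */
    #include <stdint.h>
    #include <stdlib.h>

    static void BilateralLumaFilter8_ref(uint8_t *pixels, int stride) {
        uint8_t out[8];
        for (int i = 0; i < 8; i++) {
            int center = pixels[i];
            int totWeight = 0, sum = 0;                  /* nTotWeight / nSum */
            for (int dy = -1; dy <= 1; dy++) {
                for (int dx = -1; dx <= 1; dx++) {
                    if (dx == 0 && dy == 0) continue;    /* point 0, the centre */
                    int cur  = pixels[i + dy * stride + dx];
                    int base = 32 - abs(cur - center);
                    if (base < 0) base = 0;              /* psubusb saturates at 0 */
                    int weight = (base * base) >> 5;     /* pmullw + psrlw 5 */
                    totWeight += weight;                 /* paddusw into xmm4 */
                    sum       += weight * cur;           /* paddusw into xmm5 */
                }
            }
            /* the centre keeps the residual weight out of 256 */
            out[i] = (uint8_t)((sum + (256 - totWeight) * center) >> 8);
        }
        for (int i = 0; i < 8; i++) pixels[i] = out[i];
    }

The closing pcmpeqw/psrlw/psllw sequence in the asm simply materialises the per-lane constant 256 without a memory load before subtracting the accumulated weight.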
+WELS_EXTERN WaverageChromaFilter8_sse2
+;***********************************************************************
+; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;5x5 filter:
+;1 1 2 1 1
+;1 2 4 2 1
+;2 4 20 4 2
+;1 2 4 2 1
+;1 1 2 1 1
+
+ALIGN 16
+WaverageChromaFilter8_sse2:
+ mov edx, [esp + 4] ; pixels
+ mov ecx, [esp + 8] ; stride
+
+ mov eax, ecx
+ add eax, eax
+ sub edx, eax ; pixels - 2 * stride
+ sub edx, 2
+
+ pxor xmm0, xmm0
+ pxor xmm3, xmm3
+
+ movdqu xmm1, [edx]
+ WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
+
+ movdqu xmm1, [edx + ecx]
+ WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
+
+ add edx, eax
+ movdqu xmm1, [edx]
+ WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0
+
+ movdqu xmm1, [edx + ecx]
+ WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
+
+ movdqu xmm1, [edx + ecx * 2]
+ WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
+
+ psrlw xmm3, 6
+ packuswb xmm3, xmm3
+ movq [edx + 2], xmm3
+
ret
\ No newline at end of file
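A note on the 5x5 chroma kernel listed above for WaverageChromaFilter8_sse2: its rows sum to 6 + 10 + 32 + 10 + 6 = 64, which is why the routine ends with psrlw xmm3, 6. A scalar sketch of the same computation, with the hypothetical name WaverageChromaFilter8_ref:

    /* Hypothetical scalar equivalent of WaverageChromaFilter8_sse2: a 5x5
     * weighted average whose kernel sums to 64 (hence the final >> 6). */
    #include <stdint.h>

    static const int kKernel[5][5] = {
        {1, 1,  2, 1, 1},
        {1, 2,  4, 2, 1},
        {2, 4, 20, 4, 2},
        {1, 2,  4, 2, 1},
        {1, 1,  2, 1, 1},
    };

    static void WaverageChromaFilter8_ref(uint8_t *pixels, int stride) {
        uint8_t out[8];
        for (int i = 0; i < 8; i++) {
            int sum = 0;
            for (int dy = -2; dy <= 2; dy++)
                for (int dx = -2; dx <= 2; dx++)
                    sum += kKernel[dy + 2][dx + 2] * pixels[i + dy * stride + dx];
            out[i] = (uint8_t)(sum >> 6);               /* kernel sum is 64 */
        }
        for (int i = 0; i < 8; i++) pixels[i] = out[i];
    }

In the asm, the WEIGHT_LINE*_UV macros realise the row weights with shifts (psllw) for the power-of-two taps and a single pmullw against sse2_20 for the centre tap.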
--- a/processing/src/asm/downsample_bilinear.asm
+++ b/processing/src/asm/downsample_bilinear.asm
@@ -1,1225 +1,1225 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* upsampling.asm
-;*
-;* Abstract
-;* SIMD for pixel domain down sampling
-;*
-;* History
-;* 10/22/2009 Created
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-
-;***********************************************************************
-; Some constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-shufb_mask_low:
- db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
-shufb_mask_high:
- db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
-
-
-ALIGN 16
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth stored in ebx
- sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizontal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+8] ; 1st pSrc line + 8
- movq mm2, [esi+ecx] ; 2nd pSrc line
- movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
-
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm4, mm5 ; d c D C b a B A
- pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
-
- pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm5, mm6 ; h g H G f e F E
- pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
-
- pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm6, mm7 ; l k L K j i J I
- pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
-
- pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm7, mm0 ; p o P O n m N M
- pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
-
- ; to handle mm4, mm5, mm6, mm7
- movq mm0, mm4 ;
- punpckldq mm0, mm5 ; H G F E D C B A
- punpckhdq mm4, mm5 ; h g f e d c b a
-
- movq mm1, mm6
- punpckldq mm1, mm7 ; P O N M L K J I
- punpckhdq mm6, mm7 ; p o n m l k j i
-
-	; avg within MB horizontal width (16 x 2 lines)
- pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here; stored once the 2nd horizontal part is done
-
-	; 2nd part horizontal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm1, [esi+16] ; 1st pSrc line + 16
- movq mm2, [esi+24] ; 1st pSrc line + 24
- movq mm3, [esi+ecx+16] ; 2nd pSrc line + 16
- movq mm4, [esi+ecx+24] ; 2nd pSrc line + 24
-
- ; to handle mm1, mm2, mm3, mm4
- pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm5, mm6 ; d c D C b a B A
- pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5
-
- pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm6, mm7 ; h g H G f e F E
- pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6
-
- pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm7, mm1 ; l k L K j i J I
- pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7
-
- pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm1, mm2 ; p o P O n m N M
- pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1
-
- ; to handle mm5, mm6, mm7, mm1
- movq mm2, mm5
- punpckldq mm2, mm6 ; H G F E D C B A
- punpckhdq mm5, mm6 ; h g f e d c b a
-
- movq mm3, mm7
- punpckldq mm3, mm1 ; P O N M L K J I
- punpckhdq mm7, mm1 ; p o n m l k j i
-
-	; avg within MB horizontal width (16 x 2 lines)
- pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm2, mm3		; (temp_row1+temp_row2+1)>>1, completes the 2nd horizontal part
-
- movq [edi ], mm0
- movq [edi+8], mm2
-
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth stored in ebx
- sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
-.xloops:
-	; 1st part horizontal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+8] ; 1st pSrc line + 8
- movq mm2, [esi+ecx] ; 2nd pSrc line
- movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
-
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm4, mm5 ; d c D C b a B A
- pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
-
- pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm5, mm6 ; h g H G f e F E
- pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
-
- pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm6, mm7 ; l k L K j i J I
- pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
-
- pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm7, mm0 ; p o P O n m N M
- pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
-
- ; to handle mm4, mm5, mm6, mm7
- movq mm0, mm4 ;
- punpckldq mm0, mm5 ; H G F E D C B A
- punpckhdq mm4, mm5 ; h g f e d c b a
-
- movq mm1, mm6
- punpckldq mm1, mm7 ; P O N M L K J I
- punpckhdq mm6, mm7 ; p o n m l k j i
-
-	; avg within MB horizontal width (16 x 2 lines)
- pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, final average, written out below
-
- movq [edi ], mm0
-
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx8_sse:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth stored in ebx
- sar eax, $2 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 8 bytes
-.xloops:
-	; 1st part horizontal loop: x8 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A
- ;2nd Line Src: mm1: h H g G f F e E
- ;=> target:
- ;: H G F E D C B A
- ;: h g f e d c b a
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+ecx] ; 2nd pSrc line
-
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm2, mm3 ; d c D C b a B A
- pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm4
-
- pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm4, mm5 ; h g H G f e F E
- pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5
-
- ; to handle mm2, mm4
- movq mm0, mm2 ;
- punpckldq mm0, mm4 ; H G F E D C B A
- punpckhdq mm2, mm4 ; h g f e d c b a
-
-	; avg within MB horizontal width (16 x 2 lines)
- pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
- pshufw mm1, mm0, 04eh ; 01001110 B
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, final average, written out below
-
- movd [edi], mm0
-
- ; next unit
- lea esi, [esi+8]
- lea edi, [edi+4]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-
-
-; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_ssse3:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
-
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth stored in ebx
- sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizontal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
- ; xmm1: p P o O n N m M l L k K j J i I
- ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
- ; xmm3: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: P O N M L K J I H G F E D C B A
- ;: p o n m l k j i h g f e d c b a
- ;: P .. A
- ;: p .. a
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movdqa xmm0, [esi] ; 1st_src_line
- movdqa xmm1, [esi+16] ; 1st_src_line + 16
- movdqa xmm2, [esi+ecx] ; 2nd_src_line
- movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
-
- ; packing & avg
- movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- ; another implementation for xmm4 high bits
-; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm4
-
- movdqa xmm5, xmm1
- pshufb xmm1, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm1
-; psrlw xmm5, 8
- pavgb xmm1, xmm5
-
- movdqa xmm4, xmm2
- pshufb xmm2, xmm7
- pshufb xmm4, xmm6
-; psubb xmm4, xmm2
-; psrlw xmm4, 8
- pavgb xmm2, xmm4
-
- movdqa xmm5, xmm3
- pshufb xmm3, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm3
-; psrlw xmm5, 8
- pavgb xmm3, xmm5
-
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- pavgb xmm0, xmm2
-
- ; write pDst
- movdqa [edi], xmm0
-
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_ssse3:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth stored in ebx
- sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
-.xloops:
-	; horizontal loop: x16 bytes by source
- ; mem hi<- ->lo
- ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
- ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movdqa xmm0, [esi] ; 1st_src_line
- movdqa xmm1, [esi+ecx] ; 2nd_src_line
-
- ; packing & avg
- movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- ; another implementation for xmm2 high bits
-; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm2
-
- movdqa xmm3, xmm1
- pshufb xmm1, xmm7
- pshufb xmm3, xmm6
-; psubb xmm3, xmm1
-; psrlw xmm3, 8
- pavgb xmm1, xmm3
-
- pavgb xmm0, xmm1
- packuswb xmm0, xmm1
-
- ; write pDst
- movq [edi], xmm0
-
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse4:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
-
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth stored in ebx
- sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizontal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
- ; xmm1: p P o O n N m M l L k K j J i I
- ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
- ; xmm3: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: P O N M L K J I H G F E D C B A
- ;: p o n m l k j i h g f e d c b a
- ;: P .. A
- ;: p .. a
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movntdqa xmm0, [esi] ; 1st_src_line
- movntdqa xmm1, [esi+16] ; 1st_src_line + 16
- movntdqa xmm2, [esi+ecx] ; 2nd_src_line
- movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
-
- ; packing & avg
- movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm4
-
- movdqa xmm5, xmm1
- pshufb xmm1, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm1
-; psrlw xmm5, 8
- pavgb xmm1, xmm5
-
- movdqa xmm4, xmm2
- pshufb xmm2, xmm7
- pshufb xmm4, xmm6
-; psubb xmm4, xmm2
-; psrlw xmm4, 8
- pavgb xmm2, xmm4
-
- movdqa xmm5, xmm3
- pshufb xmm3, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm3
-; psrlw xmm5, 8
- pavgb xmm3, xmm5
-
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- pavgb xmm0, xmm2
-
- ; write pDst
- movdqa [edi], xmm0
-
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse4:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth stored in ebx
- sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
-.xloops:
-	; horizontal loop: x16 bytes by source
- ; mem hi<- ->lo
- ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
- ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movntdqa xmm0, [esi] ; 1st_src_line
- movntdqa xmm1, [esi+ecx] ; 2nd_src_line
-
- ; packing & avg
- movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm2
-
- movdqa xmm3, xmm1
- pshufb xmm1, xmm7
- pshufb xmm3, xmm6
-; psubb xmm3, xmm1
-; psrlw xmm3, 8
- pavgb xmm1, xmm3
-
- pavgb xmm0, xmm1
- packuswb xmm0, xmm1
-
- ; write pDst
- movq [edi], xmm0
-
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-
-
-
-
-WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
-;**************************************************************************************************************
-;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-; unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-ALIGN 16
-GeneralBilinearAccurateDownsampler_sse2:
- push ebp
- push esi
- push edi
- push ebx
-%define pushsize 16
-%define localsize 28
-%define pDstData esp + pushsize + localsize + 4
-%define dwDstStride esp + pushsize + localsize + 8
-%define dwDstWidth esp + pushsize + localsize + 12
-%define dwDstHeight esp + pushsize + localsize + 16
-%define pSrcData esp + pushsize + localsize + 20
-%define dwSrcStride esp + pushsize + localsize + 24
-%define dwSrcWidth esp + pushsize + localsize + 28
-%define dwSrcHeight esp + pushsize + localsize + 32
-%define scale esp + 0
-%define uiScaleX esp + pushsize + localsize + 36
-%define uiScaleY esp + pushsize + localsize + 40
-%define tmpHeight esp + 12
-%define yInverse esp + 16
-%define xInverse esp + 20
-%define dstStep esp + 24
- sub esp, localsize
-
- pxor xmm0, xmm0
- mov edx, 32767
- mov eax, [uiScaleX]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm1, eax ; uinc(uiScaleX mod 32767)
- movd xmm2, ebx ; -uinc
- psllq xmm1, 32
- por xmm1, xmm2 ; 0 0 uinc -uinc (dword)
- pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc
-
- mov eax, [uiScaleY]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm6, eax ; vinc(uiScaleY mod 32767)
- movd xmm2, ebx ; -vinc
- psllq xmm6, 32
- por xmm6, xmm2 ; 0 0 vinc -vinc (dword)
- pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc
-
- mov edx, 40003fffh
- movd xmm5, edx
- punpcklwd xmm5, xmm0 ; 16384 16383
- pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383
-
-
-DOWNSAMPLE:
-
- mov eax, [dwDstHeight]
- mov edi, [pDstData]
- mov edx, [dwDstStride]
- mov ecx, [dwDstWidth]
- sub edx, ecx
- mov [dstStep], edx ; stride - width
- dec eax
- mov [tmpHeight], eax
- mov eax, 16384
- mov [yInverse], eax
-
- pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383
-
-HEIGHT:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
- mov ebp, esi
- add ebp, [dwSrcStride]
-
- mov eax, 16384
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
- dec ecx
-
- movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383
-
-WIDTH:
- mov eax, [xInverse]
- shr eax, 15
-
- movd xmm1, [esi+eax] ; xxxxxxba
- movd xmm2, [ebp+eax] ; xxxxxxdc
- pxor xmm0, xmm0
- punpcklwd xmm1, xmm2 ; xxxxdcba
- punpcklbw xmm1, xmm0 ; 0d0c0b0a
- punpcklwd xmm1, xmm0 ; 000d000c000b000a
-
- movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
- pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
- movdqa xmm0, xmm2
- pmuludq xmm2, xmm1
- psrlq xmm0, 32
- psrlq xmm1, 32
- pmuludq xmm0, xmm1
- paddq xmm2, xmm0
- pshufd xmm1, xmm2, 00001110b
- paddq xmm2, xmm1
- psrlq xmm2, 29
-
- movd eax, xmm2
- inc eax
- shr eax, 1
- mov [edi], al
- inc edi
-
- mov eax, [uiScaleX]
- add [xInverse], eax
-
- paddw xmm3, xmm7 ; inc u
- psllw xmm3, 1
- psrlw xmm3, 1
-
- loop WIDTH
-
-WIDTH_END:
- mov eax, [xInverse]
- shr eax, 15
- mov cl, [esi+eax]
- mov [edi], cl
- inc edi
-
- mov eax, [uiScaleY]
- add [yInverse], eax
- add edi, [dstStep]
-
- paddw xmm4, xmm6 ; inc v
- psllw xmm4, 1
- psrlw xmm4, 1
-
- dec dword [tmpHeight]
- jg HEIGHT
-
-
-LAST_ROW:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
-
- mov eax, 16384
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
-
-LAST_ROW_WIDTH:
- mov eax, [xInverse]
- shr eax, 15
-
- mov al, [esi+eax]
- mov [edi], al
- inc edi
-
- mov eax, [uiScaleX]
- add [xInverse], eax
-
- loop LAST_ROW_WIDTH
-
-LAST_ROW_END:
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef pushsize
-%undef localsize
-%undef pSrcData
-%undef dwSrcWidth
-%undef dwSrcHeight
-%undef dwSrcStride
-%undef pDstData
-%undef dwDstWidth
-%undef dwDstHeight
-%undef dwDstStride
-%undef scale
-%undef uiScaleX
-%undef uiScaleY
-%undef tmpHeight
-%undef yInverse
-%undef xInverse
-%undef dstStep
- ret
-
-
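GeneralBilinearAccurateDownsampler_sse2 above, and the Fast variant that follows, walk the source image with fixed-point accumulators (yInverse in Q15; xInverse in Q16 for the Fast variant) and blend each 2x2 source neighbourhood per output pixel, falling back to nearest sampling on the last row and column. A hedged scalar sketch of that walk, following the Fast variant's layout; the exact rounding of the two asm versions differs slightly, and GeneralBilinearDownsampler_ref is a hypothetical name:

    /* Hypothetical scalar model of the fixed-point bilinear downsampling walk. */
    #include <stdint.h>

    static void GeneralBilinearDownsampler_ref(uint8_t *dst, int dstStride,
                                               int dstW, int dstH,
                                               const uint8_t *src, int srcStride,
                                               uint32_t uiScaleX, uint32_t uiScaleY) {
        uint32_t yInv = 16384;                          /* y position, Q15 */
        for (int j = 0; j < dstH; j++) {
            const uint8_t *row = src + (yInv >> 15) * (uint32_t)srcStride;
            uint32_t v = yInv & 0x7fff;                 /* vertical fraction, Q15 */
            uint32_t xInv = 32768;                      /* x position, Q16 */
            int lastRow = (j == dstH - 1);
            for (int i = 0; i < dstW; i++) {
                uint32_t x = xInv >> 16;
                if (lastRow || i == dstW - 1) {
                    dst[j * dstStride + i] = row[x];    /* edges: nearest sample */
                } else {
                    uint32_t u = (xInv & 0xffff) >> 1;  /* horizontal fraction, Q15 */
                    uint64_t top = row[x] * (32768 - u) + row[x + 1] * u;
                    uint64_t bot = row[x + srcStride] * (32768 - u)
                                 + row[x + srcStride + 1] * u;
                    /* blend the two rows and round back down from Q30 */
                    dst[j * dstStride + i] =
                        (uint8_t)((top * (32768 - v) + bot * v + (1ull << 29)) >> 30);
                }
                xInv += uiScaleX;
            }
            yInv += uiScaleY;
        }
    }

The SIMD versions vectorise exactly this inner product: the u/(1-u) and v/(1-v) weight pairs live in xmm registers and are stepped by the uinc/vinc vectors built in the prologues.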
-
-
-WELS_EXTERN GeneralBilinearFastDownsampler_sse2
-;**************************************************************************************************************
-;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-; unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-ALIGN 16
-GeneralBilinearFastDownsampler_sse2:
- push ebp
- push esi
- push edi
- push ebx
-%define pushsize 16
-%define localsize 28
-%define pDstData esp + pushsize + localsize + 4
-%define dwDstStride esp + pushsize + localsize + 8
-%define dwDstWidth esp + pushsize + localsize + 12
-%define dwDstHeight esp + pushsize + localsize + 16
-%define pSrcData esp + pushsize + localsize + 20
-%define dwSrcStride esp + pushsize + localsize + 24
-%define dwSrcWidth esp + pushsize + localsize + 28
-%define dwSrcHeight esp + pushsize + localsize + 32
-%define scale esp + 0
-%define uiScaleX esp + pushsize + localsize + 36
-%define uiScaleY esp + pushsize + localsize + 40
-%define tmpHeight esp + 12
-%define yInverse esp + 16
-%define xInverse esp + 20
-%define dstStep esp + 24
- sub esp, localsize
-
- pxor xmm0, xmm0
- mov edx, 65535
- mov eax, [uiScaleX]
- and eax, edx
- mov ebx, eax
- neg ebx
- and ebx, 65535
- movd xmm1, eax ; uinc(uiScaleX mod 65536)
- movd xmm2, ebx ; -uinc
- psllq xmm1, 32
- por xmm1, xmm2 ; 0 uinc 0 -uinc
- pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc
-
- mov eax, [uiScaleY]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm6, eax ; vinc(uiScaleY mod 32767)
- movd xmm2, ebx ; -vinc
- psllq xmm6, 32
- por xmm6, xmm2 ; 0 vinc 0 -vinc
- pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc
-
- mov edx, 80007fffh ; 32768 32767
- movd xmm5, edx
- pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767
- mov ebx, 16384
-
-
-FAST_DOWNSAMPLE:
-
- mov eax, [dwDstHeight]
- mov edi, [pDstData]
- mov edx, [dwDstStride]
- mov ecx, [dwDstWidth]
- sub edx, ecx
- mov [dstStep], edx ; stride - width
- dec eax
- mov [tmpHeight], eax
- mov eax, 16384
- mov [yInverse], eax
-
- pshuflw xmm4, xmm5, 01010000b
- psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383
-
-FAST_HEIGHT:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
- mov ebp, esi
- add ebp, [dwSrcStride]
-
- mov eax, 32768
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
- dec ecx
-
- movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767
-
-FAST_WIDTH:
- mov eax, [xInverse]
- shr eax, 16
-
- movd xmm1, [esi+eax] ; xxxxxxba
- movd xmm2, [ebp+eax] ; xxxxxxdc
- punpcklwd xmm1, xmm2 ; xxxxdcba
- punpcklbw xmm1, xmm0 ; 0d0c0b0a
-
- movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
- pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
- pmaddwd xmm2, xmm1
- pshufd xmm1, xmm2, 00000001b
- paddd xmm2, xmm1
- movd xmm1, ebx
- paddd xmm2, xmm1
- psrld xmm2, 15
-
- packuswb xmm2, xmm0
- movd eax, xmm2
- mov [edi], al
- inc edi
-
- mov eax, [uiScaleX]
- add [xInverse], eax
-
- paddw xmm3, xmm7 ; inc u
-
- loop FAST_WIDTH
-
-FAST_WIDTH_END:
- mov eax, [xInverse]
- shr eax, 16
- mov cl, [esi+eax]
- mov [edi], cl
- inc edi
-
- mov eax, [uiScaleY]
- add [yInverse], eax
- add edi, [dstStep]
-
- paddw xmm4, xmm6 ; inc v
- psllw xmm4, 1
- psrlw xmm4, 1
-
- dec dword [tmpHeight]
- jg FAST_HEIGHT
-
-
-FAST_LAST_ROW:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
-
- mov eax, 32768
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
-
-FAST_LAST_ROW_WIDTH:
- mov eax, [xInverse]
- shr eax, 16
-
- mov al, [esi+eax]
- mov [edi], al
- inc edi
-
- mov eax, [uiScaleX]
- add [xInverse], eax
-
- loop FAST_LAST_ROW_WIDTH
-
-FAST_LAST_ROW_END:
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef pushsize
-%undef localsize
-%undef pSrcData
-%undef dwSrcWidth
-%undef dwSrcHeight
-%undef dwSrcStride
-%undef pDstData
-%undef dwDstWidth
-%undef dwDstHeight
-%undef dwDstStride
-%undef scale
-%undef uiScaleX
-%undef uiScaleY
-%undef tmpHeight
-%undef yInverse
-%undef xInverse
-%undef dstStep
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* upsampling.asm
+;*
+;* Abstract
+;* SIMD for pixel domain down sampling
+;*
+;* History
+;* 10/22/2009 Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+
+;***********************************************************************
+; Some constants
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+shufb_mask_low:
+ db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
+shufb_mask_high:
+ db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
+
+
+ALIGN 16
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_sse:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth stored in ebx
+ sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 32 bytes
+.xloops:
+	; 1st part horizontal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+8] ; 1st pSrc line + 8
+ movq mm2, [esi+ecx] ; 2nd pSrc line
+ movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
+
+ ; to handle mm0, mm1, mm2, mm3
+ pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm4, mm5 ; d c D C b a B A
+ pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
+
+ pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm5, mm6 ; h g H G f e F E
+ pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
+
+ pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm6, mm7 ; l k L K j i J I
+ pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
+
+ pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm7, mm0 ; p o P O n m N M
+ pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
+
+ ; to handle mm4, mm5, mm6, mm7
+ movq mm0, mm4 ;
+ punpckldq mm0, mm5 ; H G F E D C B A
+ punpckhdq mm4, mm5 ; h g f e d c b a
+
+ movq mm1, mm6
+ punpckldq mm1, mm7 ; P O N M L K J I
+ punpckhdq mm6, mm7 ; p o n m l k j i
+
+	; avg within MB horizontal width (16 x 2 lines)
+ pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here; stored once the 2nd horizontal part is done
+
+	; 2nd part horizontal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm1, [esi+16] ; 1st pSrc line + 16
+ movq mm2, [esi+24] ; 1st pSrc line + 24
+ movq mm3, [esi+ecx+16] ; 2nd pSrc line + 16
+ movq mm4, [esi+ecx+24] ; 2nd pSrc line + 24
+
+ ; to handle mm1, mm2, mm3, mm4
+ pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm5, mm6 ; d c D C b a B A
+ pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5
+
+ pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm6, mm7 ; h g H G f e F E
+ pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6
+
+ pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm7, mm1 ; l k L K j i J I
+ pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7
+
+ pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm1, mm2 ; p o P O n m N M
+ pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1
+
+ ; to handle mm5, mm6, mm7, mm1
+ movq mm2, mm5
+ punpckldq mm2, mm6 ; H G F E D C B A
+ punpckhdq mm5, mm6 ; h g f e d c b a
+
+ movq mm3, mm7
+ punpckldq mm3, mm1 ; P O N M L K J I
+ punpckhdq mm7, mm1 ; p o n m l k j i
+
+	; avg within MB horizontal width (16 x 2 lines)
+ pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm2, mm3		; (temp_row1+temp_row2+1)>>1, completes the 2nd horizontal part
+
+ movq [edi ], mm0
+ movq [edi+8], mm2
+
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
+
+ dec eax
+ jg near .xloops
+
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ WELSEMMS
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_sse:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth stored in ebx
+ sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 16 bytes
+.xloops:
+	; 1st part horizontal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+8] ; 1st pSrc line + 8
+ movq mm2, [esi+ecx] ; 2nd pSrc line
+ movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
+
+ ; to handle mm0, mm1, mm2, mm3
+ pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm4, mm5 ; d c D C b a B A
+ pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
+
+ pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm5, mm6 ; h g H G f e F E
+ pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
+
+ pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm6, mm7 ; l k L K j i J I
+ pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
+
+ pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm7, mm0 ; p o P O n m N M
+ pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
+
+ ; to handle mm4, mm5, mm6, mm7
+ movq mm0, mm4 ;
+ punpckldq mm0, mm5 ; H G F E D C B A
+ punpckhdq mm4, mm5 ; h g f e d c b a
+
+ movq mm1, mm6
+ punpckldq mm1, mm7 ; P O N M L K J I
+ punpckhdq mm6, mm7 ; p o n m l k j i
+
+	; avg within MB horizontal width (16 x 2 lines)
+ pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, final average, written out below
+
+ movq [edi ], mm0
+
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
+
+ dec eax
+ jg near .xloops
+
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ WELSEMMS
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx8_sse:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth stored in ebx
+ sar eax, $2 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 8 bytes
+.xloops:
+	; 1st part horizontal loop: x8 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A
+ ;2nd Line Src: mm1: h H g G f F e E
+ ;=> target:
+ ;: H G F E D C B A
+ ;: h g f e d c b a
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+ecx] ; 2nd pSrc line
+
+ ; to handle mm0, mm1, mm2, mm3
+ pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm2, mm3 ; d c D C b a B A
+ pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm4
+
+ pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm4, mm5 ; h g H G f e F E
+ pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5
+
+ ; to handle mm2, mm4
+ movq mm0, mm2 ;
+ punpckldq mm0, mm4 ; H G F E D C B A
+ punpckhdq mm2, mm4 ; h g f e d c b a
+
+	; avg within MB horizontal width (16 x 2 lines)
+ pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
+ pshufw mm1, mm0, 04eh ; 01001110 B
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, final average, written out below
+
+ movd [edi], mm0
+
+ ; next unit
+ lea esi, [esi+8]
+ lea edi, [edi+4]
+
+ dec eax
+ jg near .xloops
+
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ WELSEMMS
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+
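All of the dyadic routines above reduce each 2x2 source block with three pavgb rounding averages rather than one exact (a+b+c+d+2)>>2, which can bias a result upward by one. A minimal scalar sketch of that reduction; DyadicBilinearDownsampler_ref is a hypothetical name, not code from this patch:

    /* Hypothetical scalar model of the dyadic downsamplers: each 2x2 block
     * collapses to one pixel via the same three rounding averages as pavgb. */
    #include <stdint.h>

    static void DyadicBilinearDownsampler_ref(uint8_t *dst, int dstStride,
                                              const uint8_t *src, int srcStride,
                                              int srcW, int srcH) {
        for (int y = 0; y < srcH / 2; y++) {
            const uint8_t *r0 = src + 2 * y * srcStride;
            const uint8_t *r1 = r0 + srcStride;
            for (int x = 0; x < srcW / 2; x++) {
                int top = (r0[2 * x] + r0[2 * x + 1] + 1) >> 1;   /* pavgb, row 1 */
                int bot = (r1[2 * x] + r1[2 * x + 1] + 1) >> 1;   /* pavgb, row 2 */
                dst[y * dstStride + x] = (uint8_t)((top + bot + 1) >> 1);
            }
        }
    }

The variants differ only in how they separate the even and odd source bytes per register width: pshufw/punpck juggling for the MMX paths versus pshufb masks for the ssse3/sse4 paths.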
+
+; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_ssse3:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1
+
+ movdqa xmm7, [shufb_mask_low] ; mask low
+ movdqa xmm6, [shufb_mask_high] ; mask high
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth stored in ebx
+ sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 32 bytes
+.xloops:
+	; 1st part horizontal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
+ ; xmm1: p P o O n N m M l L k K j J i I
+ ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
+ ; xmm3: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: P O N M L K J I H G F E D C B A
+ ;: p o n m l k j i h g f e d c b a
+ ;: P .. A
+ ;: p .. a
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa xmm0, [esi] ; 1st_src_line
+ movdqa xmm1, [esi+16] ; 1st_src_line + 16
+ movdqa xmm2, [esi+ecx] ; 2nd_src_line
+ movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
+
+ ; packing & avg
+ movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ ; another implementation for xmm4 high bits
+; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm4
+
+ movdqa xmm5, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm1
+; psrlw xmm5, 8
+ pavgb xmm1, xmm5
+
+ movdqa xmm4, xmm2
+ pshufb xmm2, xmm7
+ pshufb xmm4, xmm6
+; psubb xmm4, xmm2
+; psrlw xmm4, 8
+ pavgb xmm2, xmm4
+
+ movdqa xmm5, xmm3
+ pshufb xmm3, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm3
+; psrlw xmm5, 8
+ pavgb xmm3, xmm5
+
+ packuswb xmm0, xmm1
+ packuswb xmm2, xmm3
+ pavgb xmm0, xmm2
+
+ ; write pDst
+ movdqa [edi], xmm0
+
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
+
+ dec eax
+ jg near .xloops
+
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
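The ssse3 variant owes its speed-up to pshufb with the shufb_mask_low/high tables: any mask byte with bit 7 set (80h) writes zero, so a single shuffle splits the even and odd source bytes into zero-extended lanes that pavgb and packuswb can then combine. A small sketch of what the two masks extract; split_even_odd is a hypothetical helper for illustration only:

    /* Hypothetical illustration of the shufb_mask_low/high byte split. */
    #include <stdint.h>

    static void split_even_odd(const uint8_t src[16],
                               uint16_t even[8], uint16_t odd[8]) {
        for (int i = 0; i < 8; i++) {
            even[i] = src[2 * i];       /* shufb_mask_low:  bytes 0,2,4,... */
            odd[i]  = src[2 * i + 1];   /* shufb_mask_high: bytes 1,3,5,... */
        }
    }

The commented-out psubb/psrlw pairs in the loops are the alternative way of isolating the odd bytes that the masks replace.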
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_ssse3:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1
+ movdqa xmm7, [shufb_mask_low] ; mask low
+ movdqa xmm6, [shufb_mask_high] ; mask high
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+	mov ebx, eax		; save iDstWidth in ebx
+ sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 16 bytes
+.xloops:
+	; horizontal loop: x16 bytes by source
+ ; mem hi<- ->lo
+ ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
+ ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa xmm0, [esi] ; 1st_src_line
+ movdqa xmm1, [esi+ecx] ; 2nd_src_line
+
+ ; packing & avg
+ movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	; alternative implementation for the xmm2 high bytes (kept for reference):
+; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm2
+
+ movdqa xmm3, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm3, xmm6
+; psubb xmm3, xmm1
+; psrlw xmm3, 8
+ pavgb xmm1, xmm3
+
+ pavgb xmm0, xmm1
+ packuswb xmm0, xmm1
+
+ ; write pDst
+ movq [edi], xmm0
+
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
+
+ dec eax
+ jg near .xloops
+
+ ; next line
+	lea esi, [esi+2*ecx]		; advance two source rows (esi is past the row end)
+	lea esi, [esi+2*ebx]		; step back 2*iDstWidth to the start of the row
+	lea edi, [edi+edx]		; advance one dst row (edi is past the row end)
+	lea edi, [edi+ebx]		; step back iDstWidth to the start of the row
+
+ dec ebp
+ jg near .yloops
+
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+; about a 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
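+; Identical to DyadicBilinearDownsamplerWidthx32_ssse3 except that the
+; source rows are fetched with movntdqa (SSE4.1), a streaming load whose
+; non-temporal hint is intended to reduce cache pollution from source
+; pixels that are read only once.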
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_sse4:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1
+
+ movdqa xmm7, [shufb_mask_low] ; mask low
+ movdqa xmm6, [shufb_mask_high] ; mask high
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+	mov ebx, eax		; save iDstWidth in ebx
+ sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 32 bytes
+.xloops:
+	; 1st part horizontal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
+ ; xmm1: p P o O n N m M l L k K j J i I
+ ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
+ ; xmm3: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: P O N M L K J I H G F E D C B A
+ ;: p o n m l k j i h g f e d c b a
+ ;: P .. A
+ ;: p .. a
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movntdqa xmm0, [esi] ; 1st_src_line
+ movntdqa xmm1, [esi+16] ; 1st_src_line + 16
+ movntdqa xmm2, [esi+ecx] ; 2nd_src_line
+ movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
+
+ ; packing & avg
+ movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm4
+
+ movdqa xmm5, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm1
+; psrlw xmm5, 8
+ pavgb xmm1, xmm5
+
+ movdqa xmm4, xmm2
+ pshufb xmm2, xmm7
+ pshufb xmm4, xmm6
+; psubb xmm4, xmm2
+; psrlw xmm4, 8
+ pavgb xmm2, xmm4
+
+ movdqa xmm5, xmm3
+ pshufb xmm3, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm3
+; psrlw xmm5, 8
+ pavgb xmm3, xmm5
+
+ packuswb xmm0, xmm1
+ packuswb xmm2, xmm3
+ pavgb xmm0, xmm2
+
+ ; write pDst
+ movdqa [edi], xmm0
+
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
+
+ dec eax
+ jg near .xloops
+
+ ; next line
+	lea esi, [esi+2*ecx]		; advance two source rows (esi is past the row end)
+	lea esi, [esi+2*ebx]		; step back 2*iDstWidth to the start of the row
+	lea edi, [edi+edx]		; advance one dst row (edi is past the row end)
+	lea edi, [edi+ebx]		; step back iDstWidth to the start of the row
+
+ dec ebp
+ jg near .yloops
+
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
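+; Same as DyadicBilinearDownsamplerWidthx16_ssse3, with movntdqa (SSE4.1)
+; streaming loads substituted for the movdqa source reads.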
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_sse4:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1
+ movdqa xmm7, [shufb_mask_low] ; mask low
+ movdqa xmm6, [shufb_mask_high] ; mask high
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+	mov ebx, eax		; save iDstWidth in ebx
+ sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 16 bytes
+.xloops:
+	; horizontal loop: x16 bytes by source
+ ; mem hi<- ->lo
+ ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
+ ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movntdqa xmm0, [esi] ; 1st_src_line
+ movntdqa xmm1, [esi+ecx] ; 2nd_src_line
+
+ ; packing & avg
+ movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm2
+
+ movdqa xmm3, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm3, xmm6
+; psubb xmm3, xmm1
+; psrlw xmm3, 8
+ pavgb xmm1, xmm3
+
+ pavgb xmm0, xmm1
+ packuswb xmm0, xmm1
+
+ ; write pDst
+ movq [edi], xmm0
+
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
+
+ dec eax
+ jg near .xloops
+
+ ; next line
+	lea esi, [esi+2*ecx]		; advance two source rows (esi is past the row end)
+	lea esi, [esi+2*ebx]		; step back 2*iDstWidth to the start of the row
+	lea edi, [edi+edx]		; advance one dst row (edi is past the row end)
+	lea edi, [edi+ebx]		; step back iDstWidth to the start of the row
+
+ dec ebp
+ jg near .yloops
+
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+
+
+
+
+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
+;**************************************************************************************************************
+;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+; unsigned int uiScaleX, unsigned int uiScaleY );
+;{
+;**************************************************************************************************************
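+; Fixed-point bilinear interpolation, one output pixel per WIDTH iteration.
+; u = xInverse & 32767 and v = yInverse & 32767 are 15-bit fractions, with
+; (1-u) held as 32767-u. For source neighbours a,b on the current row and
+; c,d on the row below:
+;	dst = ( (1-u)(1-v)*a + u(1-v)*b + (1-u)v*c + u*v*d ) >> 30, rounded
+; pmaddwd forms the four 30-bit corner weights (each dword operand is
+; < 2^15, so its high word is 0 and the multiply-add degenerates to a
+; plain 32-bit product); pmuludq/paddq accumulate the weighted pixels in
+; 64 bits, and the trailing psrlq/inc/shr sequence rounds to nearest.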
+
+ALIGN 16
+GeneralBilinearAccurateDownsampler_sse2:
+ push ebp
+ push esi
+ push edi
+ push ebx
+%define pushsize 16
+%define localsize 28
+%define pDstData esp + pushsize + localsize + 4
+%define dwDstStride esp + pushsize + localsize + 8
+%define dwDstWidth esp + pushsize + localsize + 12
+%define dwDstHeight esp + pushsize + localsize + 16
+%define pSrcData esp + pushsize + localsize + 20
+%define dwSrcStride esp + pushsize + localsize + 24
+%define dwSrcWidth esp + pushsize + localsize + 28
+%define dwSrcHeight esp + pushsize + localsize + 32
+%define scale esp + 0
+%define uiScaleX esp + pushsize + localsize + 36
+%define uiScaleY esp + pushsize + localsize + 40
+%define tmpHeight esp + 12
+%define yInverse esp + 16
+%define xInverse esp + 20
+%define dstStep esp + 24
+ sub esp, localsize
+
+ pxor xmm0, xmm0
+ mov edx, 32767
+ mov eax, [uiScaleX]
+ and eax, 32767
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767
+	movd xmm1, eax			; uinc = uiScaleX & 32767
+ movd xmm2, ebx ; -uinc
+ psllq xmm1, 32
+ por xmm1, xmm2 ; 0 0 uinc -uinc (dword)
+ pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc
+
+ mov eax, [uiScaleY]
+ and eax, 32767
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767
+	movd xmm6, eax			; vinc = uiScaleY & 32767
+ movd xmm2, ebx ; -vinc
+ psllq xmm6, 32
+ por xmm6, xmm2 ; 0 0 vinc -vinc (dword)
+ pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc
+
+ mov edx, 40003fffh
+ movd xmm5, edx
+ punpcklwd xmm5, xmm0 ; 16384 16383
+ pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383
+
+
+DOWNSAMPLE:
+
+ mov eax, [dwDstHeight]
+ mov edi, [pDstData]
+ mov edx, [dwDstStride]
+ mov ecx, [dwDstWidth]
+ sub edx, ecx
+ mov [dstStep], edx ; stride - width
+ dec eax
+ mov [tmpHeight], eax
+ mov eax, 16384
+ mov [yInverse], eax
+
+ pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383
+
+HEIGHT:
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride]
+ add esi, eax ; get current row address
+ mov ebp, esi
+ add ebp, [dwSrcStride]
+
+ mov eax, 16384
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth]
+ dec ecx
+
+ movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383
+
+WIDTH:
+ mov eax, [xInverse]
+ shr eax, 15
+
+ movd xmm1, [esi+eax] ; xxxxxxba
+ movd xmm2, [ebp+eax] ; xxxxxxdc
+ pxor xmm0, xmm0
+ punpcklwd xmm1, xmm2 ; xxxxdcba
+ punpcklbw xmm1, xmm0 ; 0d0c0b0a
+ punpcklwd xmm1, xmm0 ; 000d000c000b000a
+
+	movdqa xmm2, xmm4			; xmm2 = v weights: v v (1-v) (1-v)
+	pmaddwd xmm2, xmm3			; * u weights -> uv (1-u)v u(1-v) (1-u)(1-v)
+ movdqa xmm0, xmm2
+ pmuludq xmm2, xmm1
+ psrlq xmm0, 32
+ psrlq xmm1, 32
+ pmuludq xmm0, xmm1
+ paddq xmm2, xmm0
+ pshufd xmm1, xmm2, 00001110b
+ paddq xmm2, xmm1
+ psrlq xmm2, 29
+
+ movd eax, xmm2
+	inc eax
+	shr eax, 1			; round to nearest
+ mov [edi], al
+ inc edi
+
+ mov eax, [uiScaleX]
+ add [xInverse], eax
+
+	paddw xmm3, xmm7			; u += uinc, (1-u) -= uinc
+	psllw xmm3, 1
+	psrlw xmm3, 1				; keep the 15-bit fractions (mod 32768)
+
+ loop WIDTH
+
+WIDTH_END:
+ mov eax, [xInverse]
+ shr eax, 15
+ mov cl, [esi+eax]
+ mov [edi], cl
+ inc edi
+
+ mov eax, [uiScaleY]
+ add [yInverse], eax
+ add edi, [dstStep]
+
+	paddw xmm4, xmm6			; v += vinc, (1-v) -= vinc
+	psllw xmm4, 1
+	psrlw xmm4, 1				; keep the 15-bit fractions (mod 32768)
+
+ dec dword [tmpHeight]
+ jg HEIGHT
+
+
+LAST_ROW:
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride]
+ add esi, eax ; get current row address
+
+ mov eax, 16384
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth]
+
+LAST_ROW_WIDTH:
+ mov eax, [xInverse]
+ shr eax, 15
+
+ mov al, [esi+eax]
+ mov [edi], al
+ inc edi
+
+ mov eax, [uiScaleX]
+ add [xInverse], eax
+
+ loop LAST_ROW_WIDTH
+
+LAST_ROW_END:
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef pushsize
+%undef localsize
+%undef pSrcData
+%undef dwSrcWidth
+%undef dwSrcHeight
+%undef dwSrcStride
+%undef pDstData
+%undef dwDstWidth
+%undef dwDstHeight
+%undef dwDstStride
+%undef scale
+%undef uiScaleX
+%undef uiScaleY
+%undef tmpHeight
+%undef yInverse
+%undef xInverse
+%undef dstStep
+ ret
+
+
+
+
+WELS_EXTERN GeneralBilinearFastDownsampler_sse2
+;**************************************************************************************************************
+;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+; unsigned int uiScaleX, unsigned int uiScaleY );
+;{
+;**************************************************************************************************************
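+; Lower-precision, faster variant of the accurate routine above: u is a
+; 16-bit fraction, v a 15-bit one. pmulhuw forms the four corner weights
+; as (u * v) >> 16, i.e. 15-bit words, so one pmaddwd against the packed
+; neighbours a,b,c,d plus a horizontal paddd yields the weighted sum,
+; rounded with (+16384) >> 15 instead of a 64-bit accumulation.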
+
+ALIGN 16
+GeneralBilinearFastDownsampler_sse2:
+ push ebp
+ push esi
+ push edi
+ push ebx
+%define pushsize 16
+%define localsize 28
+%define pDstData esp + pushsize + localsize + 4
+%define dwDstStride esp + pushsize + localsize + 8
+%define dwDstWidth esp + pushsize + localsize + 12
+%define dwDstHeight esp + pushsize + localsize + 16
+%define pSrcData esp + pushsize + localsize + 20
+%define dwSrcStride esp + pushsize + localsize + 24
+%define dwSrcWidth esp + pushsize + localsize + 28
+%define dwSrcHeight esp + pushsize + localsize + 32
+%define scale esp + 0
+%define uiScaleX esp + pushsize + localsize + 36
+%define uiScaleY esp + pushsize + localsize + 40
+%define tmpHeight esp + 12
+%define yInverse esp + 16
+%define xInverse esp + 20
+%define dstStep esp + 24
+ sub esp, localsize
+
+ pxor xmm0, xmm0
+ mov edx, 65535
+ mov eax, [uiScaleX]
+ and eax, edx
+ mov ebx, eax
+ neg ebx
+ and ebx, 65535
+ movd xmm1, eax ; uinc(uiScaleX mod 65536)
+ movd xmm2, ebx ; -uinc
+ psllq xmm1, 32
+ por xmm1, xmm2 ; 0 uinc 0 -uinc
+ pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc
+
+ mov eax, [uiScaleY]
+ and eax, 32767
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767
+	movd xmm6, eax			; vinc = uiScaleY & 32767
+ movd xmm2, ebx ; -vinc
+ psllq xmm6, 32
+ por xmm6, xmm2 ; 0 vinc 0 -vinc
+ pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc
+
+ mov edx, 80007fffh ; 32768 32767
+ movd xmm5, edx
+ pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767
+ mov ebx, 16384
+
+
+FAST_DOWNSAMPLE:
+
+ mov eax, [dwDstHeight]
+ mov edi, [pDstData]
+ mov edx, [dwDstStride]
+ mov ecx, [dwDstWidth]
+ sub edx, ecx
+ mov [dstStep], edx ; stride - width
+ dec eax
+ mov [tmpHeight], eax
+ mov eax, 16384
+ mov [yInverse], eax
+
+ pshuflw xmm4, xmm5, 01010000b
+ psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383
+
+FAST_HEIGHT:
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride]
+ add esi, eax ; get current row address
+ mov ebp, esi
+ add ebp, [dwSrcStride]
+
+ mov eax, 32768
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth]
+ dec ecx
+
+ movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767
+
+FAST_WIDTH:
+ mov eax, [xInverse]
+ shr eax, 16
+
+ movd xmm1, [esi+eax] ; xxxxxxba
+ movd xmm2, [ebp+eax] ; xxxxxxdc
+ punpcklwd xmm1, xmm2 ; xxxxdcba
+ punpcklbw xmm1, xmm0 ; 0d0c0b0a
+
+	movdqa xmm2, xmm4			; xmm2 = v weights: v v (1-v) (1-v)
+	pmulhuw xmm2, xmm3			; * u weights, high 16 bits -> 15-bit corner weights
+ pmaddwd xmm2, xmm1
+ pshufd xmm1, xmm2, 00000001b
+ paddd xmm2, xmm1
+	movd xmm1, ebx			; ebx = 16384, rounding bias
+	paddd xmm2, xmm1
+	psrld xmm2, 15			; (sum + 16384) >> 15
+
+ packuswb xmm2, xmm0
+ movd eax, xmm2
+ mov [edi], al
+ inc edi
+
+ mov eax, [uiScaleX]
+ add [xInverse], eax
+
+	paddw xmm3, xmm7			; u += uinc, (1-u) -= uinc
+
+ loop FAST_WIDTH
+
+FAST_WIDTH_END:
+ mov eax, [xInverse]
+ shr eax, 16
+ mov cl, [esi+eax]
+ mov [edi], cl
+ inc edi
+
+ mov eax, [uiScaleY]
+ add [yInverse], eax
+ add edi, [dstStep]
+
+	paddw xmm4, xmm6			; v += vinc, (1-v) -= vinc
+	psllw xmm4, 1
+	psrlw xmm4, 1				; keep the 15-bit fractions (mod 32768)
+
+ dec dword [tmpHeight]
+ jg FAST_HEIGHT
+
+
+FAST_LAST_ROW:
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride]
+ add esi, eax ; get current row address
+
+ mov eax, 32768
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth]
+
+FAST_LAST_ROW_WIDTH:
+ mov eax, [xInverse]
+ shr eax, 16
+
+ mov al, [esi+eax]
+ mov [edi], al
+ inc edi
+
+ mov eax, [uiScaleX]
+ add [xInverse], eax
+
+ loop FAST_LAST_ROW_WIDTH
+
+FAST_LAST_ROW_END:
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef pushsize
+%undef localsize
+%undef pSrcData
+%undef dwSrcWidth
+%undef dwSrcHeight
+%undef dwSrcStride
+%undef pDstData
+%undef dwDstWidth
+%undef dwDstHeight
+%undef dwDstStride
+%undef scale
+%undef uiScaleX
+%undef uiScaleY
+%undef tmpHeight
+%undef yInverse
+%undef xInverse
+%undef dstStep
ret
\ No newline at end of file
--- a/processing/src/asm/intra_pred.asm
+++ b/processing/src/asm/intra_pred.asm
@@ -1,145 +1,145 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* intra_pred.asm
-;*
-;* Abstract
-;* sse2 function for intra predict operations
-;*
-;* History
-;* 18/09/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "../../src/asm/asm_inc.asm"
-
-BITS 32
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata data
-%else
-SECTION .rodata align=16
-%endif
-
-
-align 16
-mmx_01bytes: times 16 db 1
-
-;***********************************************************************
-; macros
-;***********************************************************************
-%macro COPY_16_TIMES 2
- movdqa %2, [%1-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
-%endmacro
-
-%macro COPY_16_TIMESS 3
- movdqa %2, [%1+%3-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
-%endmacro
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-;***********************************************************************
-; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-
-%macro SSE2_PRED_H_16X16_TWO_LINE 1
- lea eax, [eax+ecx*2]
-
- COPY_16_TIMES eax, xmm0
- movdqa [edx+%1], xmm0
- COPY_16_TIMESS eax, xmm0, ecx
- movdqa [edx+%1+0x10], xmm0
-%endmacro
-
-WELS_EXTERN WelsI16x16LumaPredH_sse2
-WelsI16x16LumaPredH_sse2:
- mov edx, [esp+4] ; pred
- mov eax, [esp+8] ; pRef
- mov ecx, [esp+12] ; stride
-
- COPY_16_TIMES eax, xmm0
- movdqa [edx], xmm0
- COPY_16_TIMESS eax, xmm0, ecx
- movdqa [edx+0x10], xmm0
-
- SSE2_PRED_H_16X16_TWO_LINE 0x20
- SSE2_PRED_H_16X16_TWO_LINE 0x40
- SSE2_PRED_H_16X16_TWO_LINE 0x60
- SSE2_PRED_H_16X16_TWO_LINE 0x80
- SSE2_PRED_H_16X16_TWO_LINE 0xa0
- SSE2_PRED_H_16X16_TWO_LINE 0xc0
- SSE2_PRED_H_16X16_TWO_LINE 0xe0
-
- ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredV_sse2
-WelsI16x16LumaPredV_sse2:
- mov edx, [esp+4] ; pred
- mov eax, [esp+8] ; pRef
- mov ecx, [esp+12] ; stride
-
- sub eax, ecx
- movdqa xmm0, [eax]
-
- movdqa [edx], xmm0
- movdqa [edx+10h], xmm0
- movdqa [edx+20h], xmm0
- movdqa [edx+30h], xmm0
- movdqa [edx+40h], xmm0
- movdqa [edx+50h], xmm0
- movdqa [edx+60h], xmm0
- movdqa [edx+70h], xmm0
- movdqa [edx+80h], xmm0
- movdqa [edx+90h], xmm0
- movdqa [edx+160], xmm0
- movdqa [edx+176], xmm0
- movdqa [edx+192], xmm0
- movdqa [edx+208], xmm0
- movdqa [edx+224], xmm0
- movdqa [edx+240], xmm0
-
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* intra_pred.asm
+;*
+;* Abstract
+;*	sse2 functions for intra prediction operations
+;*
+;* History
+;* 18/09/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "../../src/asm/asm_inc.asm"
+
+BITS 32
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata data
+%else
+SECTION .rodata align=16
+%endif
+
+
+align 16
+mmx_01bytes: times 16 db 1
+
+;***********************************************************************
+; macros
+;***********************************************************************
+%macro COPY_16_TIMES 2
+ movdqa %2, [%1-16]
+ psrldq %2, 15
+ pmuludq %2, [mmx_01bytes]
+ pshufd %2, %2, 0
+%endmacro
+
+%macro COPY_16_TIMESS 3
+ movdqa %2, [%1+%3-16]
+ psrldq %2, 15
+ pmuludq %2, [mmx_01bytes]
+ pshufd %2, %2, 0
+%endmacro
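+
+; Both macros broadcast the byte just left of the row (at address-1) to
+; all 16 lanes: psrldq 15 isolates it in lane 0, pmuludq by mmx_01bytes
+; (0x01010101 in the low dword) replicates it across a dword, and
+; pshufd 0 splats that dword over the register. WelsI16x16LumaPredH uses
+; this to fill each predicted row with its left neighbour.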
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+;***********************************************************************
+; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+
+%macro SSE2_PRED_H_16X16_TWO_LINE 1
+ lea eax, [eax+ecx*2]
+
+ COPY_16_TIMES eax, xmm0
+ movdqa [edx+%1], xmm0
+ COPY_16_TIMESS eax, xmm0, ecx
+ movdqa [edx+%1+0x10], xmm0
+%endmacro
+
+WELS_EXTERN WelsI16x16LumaPredH_sse2
+WelsI16x16LumaPredH_sse2:
+ mov edx, [esp+4] ; pred
+ mov eax, [esp+8] ; pRef
+ mov ecx, [esp+12] ; stride
+
+ COPY_16_TIMES eax, xmm0
+ movdqa [edx], xmm0
+ COPY_16_TIMESS eax, xmm0, ecx
+ movdqa [edx+0x10], xmm0
+
+ SSE2_PRED_H_16X16_TWO_LINE 0x20
+ SSE2_PRED_H_16X16_TWO_LINE 0x40
+ SSE2_PRED_H_16X16_TWO_LINE 0x60
+ SSE2_PRED_H_16X16_TWO_LINE 0x80
+ SSE2_PRED_H_16X16_TWO_LINE 0xa0
+ SSE2_PRED_H_16X16_TWO_LINE 0xc0
+ SSE2_PRED_H_16X16_TWO_LINE 0xe0
+
+ ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
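+; Vertical prediction: the 16 bytes of the row directly above the block
+; (pRef - stride) are replicated into all 16 rows of pred.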
+WELS_EXTERN WelsI16x16LumaPredV_sse2
+WelsI16x16LumaPredV_sse2:
+ mov edx, [esp+4] ; pred
+ mov eax, [esp+8] ; pRef
+ mov ecx, [esp+12] ; stride
+
+ sub eax, ecx
+ movdqa xmm0, [eax]
+
+ movdqa [edx], xmm0
+ movdqa [edx+10h], xmm0
+ movdqa [edx+20h], xmm0
+ movdqa [edx+30h], xmm0
+ movdqa [edx+40h], xmm0
+ movdqa [edx+50h], xmm0
+ movdqa [edx+60h], xmm0
+ movdqa [edx+70h], xmm0
+ movdqa [edx+80h], xmm0
+ movdqa [edx+90h], xmm0
+	movdqa [edx+0a0h], xmm0
+	movdqa [edx+0b0h], xmm0
+	movdqa [edx+0c0h], xmm0
+	movdqa [edx+0d0h], xmm0
+	movdqa [edx+0e0h], xmm0
+	movdqa [edx+0f0h], xmm0
+
ret
\ No newline at end of file
--- a/processing/src/asm/sad.asm
+++ b/processing/src/asm/sad.asm
@@ -1,79 +1,79 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* pixel_sse2.asm
-;*
-;* Abstract
-;* WelsSampleSad8x8_sse21
-;*
-;* History
-;* 8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-%macro SAD_8x4 0
- movq xmm0, [eax]
- movq xmm1, [eax+ebx]
- lea eax, [eax+2*ebx]
- movhps xmm0, [eax]
- movhps xmm1, [eax+ebx]
-
- movq xmm2, [ecx]
- movq xmm3, [ecx+edx]
- lea ecx, [ecx+2*edx]
- movhps xmm2, [ecx]
- movhps xmm3, [ecx+edx]
- psadbw xmm0, xmm2
- psadbw xmm1, xmm3
- paddw xmm6, xmm0
- paddw xmm6, xmm1
-%endmacro
-
-
-
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and %1, 0x1f|(%3>>1)
-cmp %1, (32-%2)|(%3>>1)
-%endmacro
-
-
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	sad.asm
+;*
+;* Abstract
+;* WelsSampleSad8x8_sse21
+;*
+;* History
+;* 8/5/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+%macro SAD_8x4 0
+ movq xmm0, [eax]
+ movq xmm1, [eax+ebx]
+ lea eax, [eax+2*ebx]
+ movhps xmm0, [eax]
+ movhps xmm1, [eax+ebx]
+
+ movq xmm2, [ecx]
+ movq xmm3, [ecx+edx]
+ lea ecx, [ecx+2*edx]
+ movhps xmm2, [ecx]
+ movhps xmm3, [ecx+edx]
+ psadbw xmm0, xmm2
+ psadbw xmm1, xmm3
+ paddw xmm6, xmm0
+ paddw xmm6, xmm1
+%endmacro
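+
+; SAD_8x4 accumulates the SAD of an 8x4 block into xmm6: movq/movhps pack
+; two 8-byte rows into each xmm register, and psadbw yields one partial
+; sum per 8-byte half, added into xmm6.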
+
+
+
+%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
+and %1, 0x1f|(%3>>1)
+cmp %1, (32-%2)|(%3>>1)
+%endmacro
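+
+; CACHE_SPLIT_CHECK sets the flags for "a %2-byte access at address %1
+; would straddle a %3-byte cache line": the and keeps addr mod %3
+; (0x1f|(%3>>1) equals %3-1 for %3 = 32 or 64), and the cmp tests it
+; against (32-%2)|(%3>>1), which works out to %3-%2 for the access widths
+; used here, so the caller's conditional jump selects the split path.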
+
+
%macro SSE2_GetSad8x4 0
movq xmm0, [eax]
movq xmm1, [eax+ebx]
@@ -90,12 +90,12 @@
psadbw xmm1, xmm3
paddw xmm6, xmm0
paddw xmm6, xmm1
-%endmacro
+%endmacro
-;***********************************************************************
-; Code
-;***********************************************************************
+;***********************************************************************
+; Code
+;***********************************************************************
SECTION .text
WELS_EXTERN WelsSampleSad8x8_sse21
--- a/processing/src/asm/vaa.asm
+++ b/processing/src/asm/vaa.asm
@@ -1,1589 +1,1589 @@
-;*!
-;* \copy
-;* Copyright (c) 2010-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* vaa.asm
-;*
-;* Abstract
-;* sse2 for pVaa routines
-;*
-;* History
-;* 04/14/2010 Created
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;%macro SUM_SSE2 4 ; dst, pSrc, zero, pack1_8x2
-; movdqa %1, %2
-; punpcklbw %1, %3
-; punpckhbw %2, %3
-; paddw %1, %2
-; pmaddwd %1, %4
-; pshufd %2, %1, 04Eh ; 01001110 B
-; paddd %1, %2
-; pshufd %2, %1, 0B1h ; 10110001 B
-; paddd %1, %2
-;%endmacro ; END OF SUM_SSE2
-
-; by comparing it outperforms than phaddw(SSSE3) sets
-%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
- ; @sum_8x2 begin
- pshufd %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 0B1h ; 10110001 B
- paddw %1, %2
- ; end of @sum_8x2
-%endmacro ; END of SUM_WORD_8x2_SSE2
-
-%macro SUM_SQR_SSE2 3 ; dst, pSrc, zero
- movdqa %1, %2
- punpcklbw %1, %3
- punpckhbw %2, %3
- pmaddwd %1, %1
- pmaddwd %2, %2
- paddd %1, %2
- pshufd %2, %1, 04Eh ; 01001110 B
- paddd %1, %2
- pshufd %2, %1, 0B1h ; 10110001 B
- paddd %1, %2
-%endmacro ; END OF SUM_SQR_SSE2
-
-%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
- movdqa %1, [esi ] ; line 0
- movdqa %2, [esi+ecx] ; line 1
- movdqa %3, %1
- punpcklbw %1, xmm7
- punpckhbw %3, xmm7
- movdqa %4, %2
- punpcklbw %4, xmm7
- punpckhbw %2, xmm7
- paddw %1, %4
- paddw %2, %3
- movdqa %3, [esi+ebx] ; line 2
- movdqa %4, [esi+edx] ; line 3
- movdqa %5, %3
- punpcklbw %3, xmm7
- punpckhbw %5, xmm7
- movdqa %6, %4
- punpcklbw %6, xmm7
- punpckhbw %4, xmm7
- paddw %3, %6
- paddw %4, %5
- paddw %1, %3 ; block 0, 1
- paddw %2, %4 ; block 2, 3
- pshufd %3, %1, 0B1h
- pshufd %4, %2, 0B1h
- paddw %1, %3
- paddw %2, %4
- movdqa %3, %1
- movdqa %4, %2
- pshuflw %5, %1, 0B1h
- pshufhw %6, %3, 0B1h
- paddw %1, %5
- paddw %3, %6
- pshuflw %5, %2, 0B1h
- pshufhw %6, %4, 0B1h
- paddw %2, %5
- paddw %4, %6
- punpcklwd %1, %2
- punpckhwd %3, %4
- punpcklwd %1, %3
- psraw %1, $4
-%endmacro
-
-%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
- movdqa %1, [esi ] ; line 0
- movdqa %2, [esi+ecx] ; line 1
- movdqa %3, %1
- punpcklbw %1, xmm7
- punpckhbw %3, xmm7
- movdqa %4, %2
- punpcklbw %4, xmm7
- punpckhbw %2, xmm7
- paddw %1, %4
- paddw %2, %3
- movdqa %3, [esi+ebx] ; line 2
- movdqa %4, [esi+edx] ; line 3
- movdqa %5, %3
- punpcklbw %3, xmm7
- punpckhbw %5, xmm7
- movdqa %6, %4
- punpcklbw %6, xmm7
- punpckhbw %4, xmm7
- paddw %3, %6
- paddw %4, %5
- paddw %1, %3 ; block 0, 1
- paddw %2, %4 ; block 2, 3
- phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
- phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
- psraw %1, $4
-%endmacro
-
-%macro WELS_SAD_16x2_SSE2 0
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- movdqa xmm3, [esi+ebx]
- movdqa xmm4, [edi+ebx]
- psadbw xmm1, xmm2
- psadbw xmm3, xmm4
- paddd xmm6, xmm1
- paddd xmm6, xmm3
- lea esi, [esi+ebx*2]
- lea edi, [edi+ebx*2]
-%endmacro
-
-%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 0
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm2
- paddd xmm6, xmm3
-
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd xmm5, xmm3
-
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm4, xmm1
- paddd xmm4, xmm2
-
- add esi, ebx
- add edi, ebx
-%endmacro
-
-%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm2
- paddd xmm7, xmm3 ; sad
-
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; diff
-
- movdqa xmm2, xmm1
- psadbw xmm2, xmm0
- paddd xmm6, xmm2 ; sum
-
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm5, xmm1
- paddd xmm5, xmm2 ; sqsum
-
- movdqa xmm1, xmm3
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd xmm4, xmm1
- paddd xmm4, xmm3 ; sqdiff
-
- add esi, ebx
- add edi, ebx
-%endmacro
-
-%macro WELS_SAD_SD_MAD_16x1_SSE2 4
-%define sad_reg %1
-%define sum_cur_reg %2
-%define sum_ref_reg %3
-%define mad_reg %4
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd sum_cur_reg, xmm3 ; sum_cur
- movdqa xmm3, xmm2
- psadbw xmm3, xmm0
- paddd sum_ref_reg, xmm3 ; sum_ref
-
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; abs diff
- pmaxub mad_reg, xmm3 ; max abs diff
-
- psadbw xmm3, xmm0
- paddd sad_reg, xmm3 ; sad
-
- add esi, ebx
- add edi, ebx
-%endmacro
-
-
-%macro WELS_MAX_REG_SSE2 1 ; xmm1, xmm2, xmm3 can be used
-%define max_reg %1
- movdqa xmm1, max_reg
- psrldq xmm1, 4
- pmaxub max_reg, xmm1
- movdqa xmm1, max_reg
- psrldq xmm1, 2
- pmaxub max_reg, xmm1
- movdqa xmm1, max_reg
- psrldq xmm1, 1
- pmaxub max_reg, xmm1
-%endmacro
-
-%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 4
-%define sad_reg %1
-%define sum_reg %2
-%define mad_reg %3
-%define sqdiff_reg %4
- movdqa xmm1, [esi]
- movdqa xmm2, xmm1
- movdqa xmm3, xmm1
- punpcklbw xmm2, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
- paddd xmm2, xmm3
- movdqa xmm3, xmm2
- psllq xmm2, 32
- psrlq xmm3, 32
- psllq xmm3, 32
- paddd xmm2, xmm3
- paddd sad_reg, xmm2 ; sqsum
-
- movdqa xmm2, [edi]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd sum_reg, xmm3 ; sum_cur
- movdqa xmm3, xmm2
- psadbw xmm3, xmm0
- pslldq xmm3, 4
- paddd sum_reg, xmm3 ; sum_ref
-
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; abs diff
- pmaxub mad_reg, xmm3 ; max abs diff
-
- movdqa xmm1, xmm3
- psadbw xmm3, xmm0
- paddd sad_reg, xmm3 ; sad
-
- movdqa xmm3, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd sqdiff_reg, xmm1
- paddd sqdiff_reg, xmm3 ; sqdiff
-
- add esi, ebx
- add edi, ebx
-%endmacro
-
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata align=16
-
-;ALIGN 16
-;pack1_8x2:
-; dw 1, 1, 1, 1, 1, 1, 1, 1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN rc_sad_frame_sse2
-;***********************************************************************
-; uint32_t rc_sad_frame_sse2( uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
-;***********************************************************************
-ALIGN 16
-rc_sad_frame_sse2:
- push esi
- push edi
- push ebp
- push ebx
- push edx
-
- mov esi, [esp+24]
- mov edi, [esp+28]
- mov ebx, [esp+32]
- mov ecx, [esp+36]
- mov edx, [esp+40]
- pxor xmm0, xmm0
-.hloop:
- mov eax, ebx
- mov ebp, $0
-.wloop:
- movdqa xmm1, [esi+ebp]
- movdqa xmm2, [edi+ebp]
- psadbw xmm1, xmm2
- pshufd xmm2, xmm1, 0f6h ; 11110110 B ; movhlps for float
- paddd xmm1, xmm2
- paddd xmm0, xmm1
- add ebp, 010h
- dec eax
- jnz near .wloop
- lea esi, [esi+edx]
- lea edi, [edi+edx]
- dec ecx
- jnz near .hloop
-
- movd eax, xmm0
- pop edx
- pop ebx
- pop ebp
- pop edi
- pop esi
- ret
-
-
-WELS_EXTERN SampleVariance16x16_sse2
-;***********************************************************************
-; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
-;***********************************************************************
-ALIGN 16
-SampleVariance16x16_sse2:
- push esi
- push edi
- push ebx
-
- sub esp, 16
- %define SUM [esp]
- %define SUM_CUR [esp+4]
- %define SQR [esp+8]
- %define SQR_CUR [esp+12]
- %define PUSH_SIZE 28 ; 12 + 16
-
- mov edi, [esp+PUSH_SIZE+4] ; y_ref
- mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride
- mov esi, [esp+PUSH_SIZE+12] ; y_src
- mov eax, [esp+PUSH_SIZE+16] ; y_src_stride
- mov ecx, 010h ; height = 16
-
- pxor xmm7, xmm7
- movdqu SUM, xmm7
-
-.hloops:
- movdqa xmm0, [edi] ; y_ref
- movdqa xmm1, [esi] ; y_src
- movdqa xmm2, xmm0 ; store first for future process
- movdqa xmm3, xmm1
- ; sum += diff;
- movdqa xmm4, xmm0
- psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
- ; to be continued for sum
- pshufd xmm5, xmm4, 0C6h ; 11000110 B
- paddw xmm4, xmm5
- movd ebx, xmm4
- add SUM, ebx
-
- ; sqr += diff * diff;
- pmaxub xmm0, xmm1
- pminub xmm1, xmm2
- psubb xmm0, xmm1 ; diff
- SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
- movd ebx, xmm1
- add SQR, ebx
-
- ; sum_cur += y_src[x];
- movdqa xmm0, xmm3 ; cur_orig
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7
- punpckhbw xmm1, xmm7
- paddw xmm0, xmm1 ; 8x2
- SUM_WORD_8x2_SSE2 xmm0, xmm1
- movd ebx, xmm0
- and ebx, 0ffffh
- add SUM_CUR, ebx
-
- ; sqr_cur += y_src[x] * y_src[x];
- SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
- movd ebx, xmm0
- add SQR_CUR, ebx
-
- lea edi, [edi+edx]
- lea esi, [esi+eax]
- dec ecx
- jnz near .hloops
-
- mov ebx, 0
- mov bx, word SUM
- sar ebx, 8
- imul ebx, ebx
- mov ecx, SQR
- sar ecx, 8
- sub ecx, ebx
- mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture
- mov [edi], cx ; to store uiMotionIndex
- mov ebx, 0
- mov bx, word SUM_CUR
- sar ebx, 8
- imul ebx, ebx
- mov ecx, SQR_CUR
- sar ecx, 8
- sub ecx, ebx
- mov [edi+2], cx ; to store uiTextureIndex
-
- %undef SUM
- %undef SUM_CUR
- %undef SQR
- %undef SQR_CUR
- %undef PUSH_SIZE
-
- add esp, 16
- pop ebx
- pop edi
- pop esi
-
- ret
-
-; , 6/7/2010
-
-%ifndef NO_DYNAMIC_VP
-WELS_EXTERN AnalysisVaaInfoIntra_sse2
-;***********************************************************************
-; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t linesize );
-;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_sse2:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov ebp, esp
- and ebp, 0fh
- sub esp, ebp
- sub esp, 32
- %define PUSH_SIZE 52 ; 20 + 32
-
- mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
- mov ecx, [esp+ebp+PUSH_SIZE+8] ; linesize
-
- mov ebx, ecx
- sal ebx, $1 ; linesize x 2 [ebx]
- mov edx, ebx
- add edx, ecx ; linesize x 3 [edx]
- mov eax, ebx
- sal eax, $1 ; linesize x 4 [eax]
-
- pxor xmm7, xmm7
-
- ; loops
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp], xmm0
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+8], xmm0
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+16], xmm0
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+24], xmm0
-
- movdqa xmm0, [esp] ; block 0~7
- movdqa xmm1, [esp+16] ; block 8~15
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- SUM_WORD_8x2_SSE2 xmm0, xmm3
-
- pmullw xmm1, xmm1
- pmullw xmm2, xmm2
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
- punpcklwd xmm1, xmm7
- punpckhwd xmm3, xmm7
- punpcklwd xmm2, xmm7
- punpckhwd xmm4, xmm7
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm1, xmm3
- pshufd xmm2, xmm1, 01Bh
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
-
- movd ebx, xmm0
- and ebx, 0ffffh ; effective low word truncated
- mov ecx, ebx
- imul ebx, ecx
- sar ebx, $4
- movd eax, xmm1
- sub eax, ebx
-
- %undef PUSH_SIZE
- add esp, 32
- add esp, ebp
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN AnalysisVaaInfoIntra_ssse3
-;***********************************************************************
-; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t linesize );
-;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_ssse3:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov ebp, esp
- and ebp, 0fh
- sub esp, ebp
- sub esp, 32
- %define PUSH_SIZE 52 ; 20 + 32
-
- mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
- mov ecx, [esp+ebp+PUSH_SIZE+8] ; linesize
-
- mov ebx, ecx
- sal ebx, $1 ; linesize x 2 [ebx]
- mov edx, ebx
- add edx, ecx ; linesize x 3 [edx]
- mov eax, ebx
- sal eax, $1 ; linesize x 4 [eax]
-
- pxor xmm7, xmm7
-
- ; loops
- VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp], xmm0
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
- movq [esp+8], xmm1
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+16], xmm0
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
- movq [esp+24], xmm1
-
- movdqa xmm0, [esp] ; block 0~7
- movdqa xmm1, [esp+16] ; block 8~15
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets
-
- pmullw xmm1, xmm1
- pmullw xmm2, xmm2
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
- punpcklwd xmm1, xmm7
- punpckhwd xmm3, xmm7
- punpcklwd xmm2, xmm7
- punpckhwd xmm4, xmm7
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm1, xmm3
- pshufd xmm2, xmm1, 01Bh
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
-
- movd ebx, xmm0
- and ebx, 0ffffh ; effective low work truncated
- mov ecx, ebx
- imul ebx, ecx
- sar ebx, $4
- movd eax, xmm1
- sub eax, ebx
-
- %undef PUSH_SIZE
- add esp, 32
- add esp, ebp
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-%endif
-
-
-
-WELS_EXTERN abs_difference_mbrow_sse2
-;*************************************************************************************************************
-;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride,
-; int32_t gom_pixel_num, int32_t *pSum)
-;*************************************************************************************************************
-ALIGN 16
-abs_difference_mbrow_sse2:
-%define ref_orig esp + pushsize + 4
-%define cur_orig esp + pushsize + 8
-%define iPicStride esp + pushsize + 12
-%define gom_pixel_num esp + pushsize + 16
-%define pSum esp + pushsize + 20
-%define pushsize 12
- push esi
- push edi
- push ebx
- mov esi, [ref_orig]
- mov edi, [cur_orig]
- mov ebx, [iPicStride]
- mov eax, [gom_pixel_num]
- mov ecx, 16 ;MB_WIDTH_LUMA
- pxor xmm0, xmm0
-mb_width_loop_p:
- mov edx, esi
- add edx, eax ; end address
-gom_row_loop_p:
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- psadbw xmm1, xmm2
- paddd xmm0, xmm1
- add esi, 16
- add edi, 16
- cmp esi, edx
- jl gom_row_loop_p
-
- sub esi, eax
- sub edi, eax
- add esi, ebx
- add edi, ebx
- loop mb_width_loop_p
-
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- paddd xmm1, xmm0
- movd eax, xmm1
- mov edx, [pSum] ; pSum
- add [edx], eax
-
-%undef ref_orig
-%undef cur_orig
-%undef iPicStride
-%undef gom_pixel_num
-%undef pSum
-%undef pushsize
- pop ebx
- pop edi
- pop esi
- ret
-
-
-
-
-WELS_EXTERN sum_sqrsum_mbrow_sse2
-;*************************************************************************************************************
-;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride,
-; int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
-;*************************************************************************************************************
-ALIGN 16
-sum_sqrsum_mbrow_sse2:
-%define cur_orig esp + pushsize + 4
-%define iPicStride esp + pushsize + 8
-%define gom_pixel_num esp + pushsize + 12
-%define pSum esp + pushsize + 16
-%define pSqrSum esp + pushsize + 20
-%define pushsize 8
- push esi
- push ebx
- mov esi, [cur_orig]
- mov eax, [gom_pixel_num]
- mov ebx, [iPicStride]
- mov ecx, 16 ;MB_WIDTH_LUMA
- pxor xmm0, xmm0 ; zero
- pxor xmm1, xmm1 ; sum
- pxor xmm2, xmm2 ; sqr sum
-mb_width_loop_i:
- mov edx, esi
- add edx, eax ; end address
-gom_row_loop_i:
- movdqa xmm3, [esi]
- movdqa xmm4, xmm3
- psadbw xmm4, xmm0
- paddd xmm1, xmm4
- movdqa xmm4, xmm3
- punpcklbw xmm4, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm4, xmm4
- pmaddwd xmm3, xmm3
- paddd xmm2, xmm3
- paddd xmm2, xmm4
- add esi, 16
- cmp esi, edx
- jl gom_row_loop_i
-
- sub esi, eax
- add esi, ebx
- loop mb_width_loop_i
-
- movdqa xmm3, xmm1
- psrldq xmm3, 8
- paddd xmm1, xmm3
- movd eax, xmm1
- mov edx, [pSum]
- add [edx], eax
-
- movdqa xmm3, xmm2
- psrldq xmm3, 8
- paddd xmm2, xmm3
- movdqa xmm3, xmm2
- psrldq xmm3, 4
- paddd xmm2, xmm3
- movd eax, xmm2
- mov edx, [pSqrSum]
- add [edx], eax
-
-
-%undef cur_orig
-%undef iPicStride
-%undef gom_pixel_num
-%undef pSum
-%undef pSqrSum
-%undef pushsize
- pop ebx
- pop esi
- ret
-
-
-
-WELS_EXTERN VAACalcSad_sse2
-;*************************************************************************************************************
-;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSad_sse2:
-%define cur_data esp + pushsize + 4
-%define ref_data esp + pushsize + 8
-%define iPicWidth esp + pushsize + 12
-%define iPicHeight esp + pushsize + 16
-%define iPicStride esp + pushsize + 20
-%define psadframe esp + pushsize + 24
-%define psad8x8 esp + pushsize + 28
-%define pushsize 12
- push esi
- push edi
- push ebx
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
-height_loop:
- mov ecx, dword [iPicWidth]
- push esi
- push edi
-width_loop:
- pxor xmm6, xmm6 ;
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- paddd xmm7, xmm6
- movd [edx], xmm6
- psrldq xmm6, 8
- movd [edx+4], xmm6
-
- pxor xmm6, xmm6
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- paddd xmm7, xmm6
- movd [edx+8], xmm6
- psrldq xmm6, 8
- movd [edx+12], xmm6
-
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
-
- dec ecx
- jnz width_loop
-
- pop edi
- pop esi
- add esi, eax
- add edi, eax
-
- dec dword [iPicHeight]
- jnz height_loop
-
- mov edx, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [edx], xmm7
-
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef pushsize
- pop ebx
- pop edi
- pop esi
- ret
-
-
-WELS_EXTERN VAACalcSadVar_sse2
-;*************************************************************************************************************
-;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadVar_sse2:
-%define localsize 8
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define psum16x16 esp + pushsize + localsize + 32
-%define psqsum16x16 esp + pushsize + localsize + 36
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
-var_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
-var_width_loop:
- pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
- pxor xmm5, xmm5 ; pSum16x16
- pxor xmm4, xmm4 ; sqsum_16x16
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- paddd xmm7, xmm6
- movd [edx], xmm6
- psrldq xmm6, 8
- movd [edx+4], xmm6
-
- pxor xmm6, xmm6
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- paddd xmm7, xmm6
- movd [edx+8], xmm6
- psrldq xmm6, 8
- movd [edx+12], xmm6
-
- mov ebp, [psum16x16]
- movdqa xmm1, xmm5
- psrldq xmm1, 8
- paddd xmm5, xmm1
- movd [ebp], xmm5
- add dword [psum16x16], 4
-
- movdqa xmm5, xmm4
- psrldq xmm5, 8
- paddd xmm4, xmm5
- movdqa xmm3, xmm4
- psrldq xmm3, 4
- paddd xmm4, xmm3
-
- mov ebp, [psqsum16x16]
- movd [ebp], xmm4
- add dword [psqsum16x16], 4
-
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
-
- dec ecx
- jnz var_width_loop
-
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
-
- dec dword [iPicHeight]
- jnz var_height_loop
-
- mov edx, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [edx], xmm7
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
-
-
-
-WELS_EXTERN VAACalcSadSsd_sse2
-;*************************************************************************************************************
-;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadSsd_sse2:
-%define localsize 12
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define psum16x16 esp + pushsize + localsize + 32
-%define psqsum16x16 esp + pushsize + localsize + 36
-%define psqdiff16x16 esp + pushsize + localsize + 40
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define tmp_sadframe esp + 8
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov ecx, [iPicWidth]
- mov ecx, [iPicHeight]
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- mov ecx, [iPicWidth]
- mov ecx, [iPicHeight]
- pxor xmm0, xmm0
- movd [tmp_sadframe], xmm0
-sqdiff_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
-sqdiff_width_loop:
- pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
- pxor xmm6, xmm6 ; pSum16x16
- pxor xmm5, xmm5 ; sqsum_16x16 four dword
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- movdqa xmm1, xmm7
- movd [edx], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [edx+4], xmm7
- movd ebp, xmm1
- add [tmp_sadframe], ebp
-
- pxor xmm7, xmm7
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- movdqa xmm1, xmm7
- movd [edx+8], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [edx+12], xmm7
- movd ebp, xmm1
- add [tmp_sadframe], ebp
-
- mov ebp, [psum16x16]
- movdqa xmm1, xmm6
- psrldq xmm1, 8
- paddd xmm6, xmm1
- movd [ebp], xmm6
- add dword [psum16x16], 4
-
- mov ebp, [psqsum16x16]
- pshufd xmm6, xmm5, 14 ;00001110
- paddd xmm6, xmm5
- pshufd xmm5, xmm6, 1 ;00000001
- paddd xmm5, xmm6
- movd [ebp], xmm5
- add dword [psqsum16x16], 4
-
- mov ebp, [psqdiff16x16]
- pshufd xmm5, xmm4, 14 ; 00001110
- paddd xmm5, xmm4
- pshufd xmm4, xmm5, 1 ; 00000001
- paddd xmm4, xmm5
- movd [ebp], xmm4
- add dword [psqdiff16x16], 4
-
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
-
- dec ecx
- jnz sqdiff_width_loop
-
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
-
- dec dword [iPicHeight]
- jnz sqdiff_height_loop
-
- mov ebx, [tmp_sadframe]
- mov eax, [psadframe]
- mov [eax], ebx
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef psqdiff16x16
-%undef tmp_esi
-%undef tmp_edi
-%undef tmp_sadframe
-%undef pushsize
-%undef localsize
- ret
-
-
-
-
-
-WELS_EXTERN VAACalcSadBgd_sse2
-;*************************************************************************************************************
-;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadBgd_sse2:
-%define localsize 12
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define p_sd8x8 esp + pushsize + localsize + 32
-%define p_mad8x8 esp + pushsize + localsize + 36
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define tmp_ecx esp + 8
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- xor ebp, ebp
- pxor xmm0, xmm0
-bgd_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
-bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8
- pxor xmm6, xmm6 ; sum_cur_8x8
- pxor xmm5, xmm5 ; sum_ref_8x8
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
-
-
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4
-
- ;movdqa xmm1, xmm4
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm4, xmm0
- ;punpcklwd xmm4, xmm0
- ;movd [edx+4], xmm4
- ;add edx, 8
- ;mov [p_mad8x8], edx
- mov [tmp_ecx], ecx
- movhlps xmm1, xmm4
- movd ecx, xmm4
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
-
-
- pslldq xmm7, 4
- pslldq xmm6, 4
- pslldq xmm5, 4
-
-
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
-
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4
-
- ;movdqa xmm1, xmm4
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm4, xmm0
- ;punpcklwd xmm4, xmm0
- ;movd [edx+4], xmm4
- ;add edx, 8
- ;mov [p_mad8x8], edx
- movhlps xmm1, xmm4
- movd ecx, xmm4
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
-
- ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
-
- mov edx, [psad8x8]
- pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
- movdqa [edx], xmm1
- add edx, 16
- mov [psad8x8], edx ; sad8x8
-
- paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
- pshufd xmm2, xmm1, 00000011b
- paddd xmm1, xmm2
- movd edx, xmm1
- add ebp, edx ; sad frame
-
- mov edx, [p_sd8x8]
- psubd xmm6, xmm5
- pshufd xmm1, xmm6, 10001101b
- movdqa [edx], xmm1
- add edx, 16
- mov [p_sd8x8], edx
-
-
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
-
- mov ecx, [tmp_ecx]
- dec ecx
- jnz bgd_width_loop
-
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
-
- dec dword [iPicHeight]
- jnz bgd_height_loop
-
- mov edx, [psadframe]
- mov [edx], ebp
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef p_sd8x8
-%undef p_mad8x8
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
-
-
-
-WELS_EXTERN VAACalcSadSsdBgd_sse2
-;*************************************************************************************************************
-;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
-; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadSsdBgd_sse2:
-%define localsize 16
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define psum16x16 esp + pushsize + localsize + 32
-%define psqsum16x16 esp + pushsize + localsize + 36
-%define psqdiff16x16 esp + pushsize + localsize + 40
-%define p_sd8x8 esp + pushsize + localsize + 44
-%define p_mad8x8 esp + pushsize + localsize + 48
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define tmp_sadframe esp + 8
-%define tmp_ecx esp + 12
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- movd [tmp_sadframe], xmm0
-sqdiff_bgd_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
-sqdiff_bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
-
- mov edx, [psad8x8]
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [edx], xmm2
- movd [edx+4], xmm1
- add edx, 8
- mov [psad8x8], edx ; sad8x8
-
- paddd xmm1, xmm2
- movd edx, xmm1
- add [tmp_sadframe], edx ; iFrameSad
-
- mov edx, [psum16x16]
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd [edx], xmm1 ; sum
-
- mov edx, [p_sd8x8]
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [edx], xmm1
- add edx, 8
- mov [p_sd8x8], edx
-
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm5
- ;movdqa xmm1, xmm5
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm5, xmm0
- ;punpcklwd xmm5, xmm0
- ;movd [edx+4], xmm5
- ;add edx, 8
- ;mov [p_mad8x8], edx
- mov [tmp_ecx], ecx
- movhlps xmm1, xmm5
- movd ecx, xmm5
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
-
- psrlq xmm7, 32
- psllq xmm7, 32 ; clear sad
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
-
- mov edx, [psad8x8]
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [edx], xmm2
- movd [edx+4], xmm1
- add edx, 8
- mov [psad8x8], edx ; sad8x8
-
- paddd xmm1, xmm2
- movd edx, xmm1
- add [tmp_sadframe], edx ; iFrameSad
-
- mov edx, [psum16x16]
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd ebp, xmm1 ; sum
- add [edx], ebp
- add edx, 4
- mov [psum16x16], edx
-
- mov edx, [psqsum16x16]
- psrlq xmm7, 32
- pshufd xmm2, xmm7, 00001110b
- paddd xmm2, xmm7
- movd [edx], xmm2 ; sqsum
- add edx, 4
- mov [psqsum16x16], edx
-
- mov edx, [p_sd8x8]
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [edx], xmm1
- add edx, 8
- mov [p_sd8x8], edx
-
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm5
- ;movdqa xmm1, xmm5
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm5, xmm0
- ;punpcklwd xmm5, xmm0
- ;movd [edx+4], xmm5
- ;add edx, 8
- ;mov [p_mad8x8], edx
- movhlps xmm1, xmm5
- movd ecx, xmm5
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
-
- mov edx, [psqdiff16x16]
- pshufd xmm1, xmm4, 00001110b
- paddd xmm4, xmm1
- pshufd xmm1, xmm4, 00000001b
- paddd xmm4, xmm1
- movd [edx], xmm4
- add edx, 4
- mov [psqdiff16x16], edx
-
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
-
- mov ecx, [tmp_ecx]
- dec ecx
- jnz sqdiff_bgd_width_loop
-
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
-
- dec dword [iPicHeight]
- jnz sqdiff_bgd_height_loop
-
- mov edx, [psadframe]
- mov ebp, [tmp_sadframe]
- mov [edx], ebp
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef psqdiff16x16
-%undef p_sd8x8
-%undef p_mad8x8
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
+;*!
+;* \copy
+;* Copyright (c) 2010-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* vaa.asm
+;*
+;* Abstract
+;* sse2 for pVaa routines
+;*
+;* History
+;* 04/14/2010 Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;%macro SUM_SSE2 4 ; dst, pSrc, zero, pack1_8x2
+; movdqa %1, %2
+; punpcklbw %1, %3
+; punpckhbw %2, %3
+; paddw %1, %2
+; pmaddwd %1, %4
+; pshufd %2, %1, 04Eh ; 01001110 B
+; paddd %1, %2
+; pshufd %2, %1, 0B1h ; 10110001 B
+; paddd %1, %2
+;%endmacro ; END OF SUM_SSE2
+
+; by comparison, this outperforms the phaddw (SSSE3) instruction sequence
+%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
+ ; @sum_8x2 begin
+ pshufd %2, %1, 04Eh ; 01001110 B
+ paddw %1, %2
+ pshuflw %2, %1, 04Eh ; 01001110 B
+ paddw %1, %2
+ pshuflw %2, %1, 0B1h ; 10110001 B
+ paddw %1, %2
+ ; end of @sum_8x2
+%endmacro ; END of SUM_WORD_8x2_SSE2
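+
+; A minimal C sketch of my reading of SUM_WORD_8x2_SSE2 (an annotation added
+; for clarity, not part of the original source): the low word of dst ends up
+; holding the horizontal sum of its eight input words, modulo 2^16.
+;
+;   #include <stdint.h>
+;   uint16_t sum_word_8x2(const uint16_t w[8]) {
+;       uint32_t s = 0;
+;       for (int i = 0; i < 8; ++i)
+;           s += w[i];                /* callers mask the result with 0ffffh */
+;       return (uint16_t)s;
+;   }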
+
+%macro SUM_SQR_SSE2 3 ; dst, pSrc, zero
+ movdqa %1, %2
+ punpcklbw %1, %3
+ punpckhbw %2, %3
+ pmaddwd %1, %1
+ pmaddwd %2, %2
+ paddd %1, %2
+ pshufd %2, %1, 04Eh ; 01001110 B
+ paddd %1, %2
+ pshufd %2, %1, 0B1h ; 10110001 B
+ paddd %1, %2
+%endmacro ; END OF SUM_SQR_SSE2
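+
+; Hedged C reference for SUM_SQR_SSE2 (my reading of the shuffles above, not
+; an authoritative spec): every dword of dst receives the sum of squares of
+; the sixteen source bytes.
+;
+;   #include <stdint.h>
+;   uint32_t sum_sqr(const uint8_t p[16]) {
+;       uint32_t s = 0;
+;       for (int i = 0; i < 16; ++i)
+;           s += (uint32_t)p[i] * p[i];
+;       return s;
+;   }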
+
+%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
+ movdqa %1, [esi ] ; line 0
+ movdqa %2, [esi+ecx] ; line 1
+ movdqa %3, %1
+ punpcklbw %1, xmm7
+ punpckhbw %3, xmm7
+ movdqa %4, %2
+ punpcklbw %4, xmm7
+ punpckhbw %2, xmm7
+ paddw %1, %4
+ paddw %2, %3
+ movdqa %3, [esi+ebx] ; line 2
+ movdqa %4, [esi+edx] ; line 3
+ movdqa %5, %3
+ punpcklbw %3, xmm7
+ punpckhbw %5, xmm7
+ movdqa %6, %4
+ punpcklbw %6, xmm7
+ punpckhbw %4, xmm7
+ paddw %3, %6
+ paddw %4, %5
+ paddw %1, %3 ; block 0, 1
+ paddw %2, %4 ; block 2, 3
+ pshufd %3, %1, 0B1h
+ pshufd %4, %2, 0B1h
+ paddw %1, %3
+ paddw %2, %4
+ movdqa %3, %1
+ movdqa %4, %2
+ pshuflw %5, %1, 0B1h
+ pshufhw %6, %3, 0B1h
+ paddw %1, %5
+ paddw %3, %6
+ pshuflw %5, %2, 0B1h
+ pshufhw %6, %4, 0B1h
+ paddw %2, %5
+ paddw %4, %6
+ punpcklwd %1, %2
+ punpckhwd %3, %4
+ punpcklwd %1, %3
+ psraw %1, $4
+%endmacro
+
+%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
+ movdqa %1, [esi ] ; line 0
+ movdqa %2, [esi+ecx] ; line 1
+ movdqa %3, %1
+ punpcklbw %1, xmm7
+ punpckhbw %3, xmm7
+ movdqa %4, %2
+ punpcklbw %4, xmm7
+ punpckhbw %2, xmm7
+ paddw %1, %4
+ paddw %2, %3
+ movdqa %3, [esi+ebx] ; line 2
+ movdqa %4, [esi+edx] ; line 3
+ movdqa %5, %3
+ punpcklbw %3, xmm7
+ punpckhbw %5, xmm7
+ movdqa %6, %4
+ punpcklbw %6, xmm7
+ punpckhbw %4, xmm7
+ paddw %3, %6
+ paddw %4, %5
+ paddw %1, %3 ; block 0, 1
+ paddw %2, %4 ; block 2, 3
+ phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
+ phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
+ psraw %1, $4
+%endmacro
+
+%macro WELS_SAD_16x2_SSE2 0
+ movdqa xmm1, [esi]
+ movdqa xmm2, [edi]
+ movdqa xmm3, [esi+ebx]
+ movdqa xmm4, [edi+ebx]
+ psadbw xmm1, xmm2
+ psadbw xmm3, xmm4
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+ lea esi, [esi+ebx*2]
+ lea edi, [edi+ebx*2]
+%endmacro
+
+%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 0
+ movdqa xmm1, [esi]
+ movdqa xmm2, [edi]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm2
+ paddd xmm6, xmm3
+
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd xmm5, xmm3
+
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm4, xmm1
+ paddd xmm4, xmm2
+
+ add esi, ebx
+ add edi, ebx
+%endmacro
+
+%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0
+ movdqa xmm1, [esi]
+ movdqa xmm2, [edi]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm2
+ paddd xmm7, xmm3 ; sad
+
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; diff
+
+ movdqa xmm2, xmm1
+ psadbw xmm2, xmm0
+ paddd xmm6, xmm2 ; sum
+
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm5, xmm1
+ paddd xmm5, xmm2 ; sqsum
+
+ movdqa xmm1, xmm3
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd xmm4, xmm1
+ paddd xmm4, xmm3 ; sqdiff
+
+ add esi, ebx
+ add edi, ebx
+%endmacro
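+
+; Per 16-pixel row this macro accumulates, lane packing aside, the quantities
+; sketched below in C (the scalar form is an assumption for readability only):
+;
+;   #include <stdint.h>
+;   void sad_sum_sqsum_sqdiff_16x1(const uint8_t *cur, const uint8_t *ref,
+;                                  uint32_t *sad, uint32_t *sum,
+;                                  uint32_t *sqsum, uint32_t *sqdiff) {
+;       for (int i = 0; i < 16; ++i) {
+;           int d = cur[i] - ref[i];
+;           if (d < 0) d = -d;        /* |cur - ref| */
+;           *sad    += d;
+;           *sum    += cur[i];
+;           *sqsum  += (uint32_t)cur[i] * cur[i];
+;           *sqdiff += (uint32_t)d * d;
+;       }
+;   }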
+
+%macro WELS_SAD_SD_MAD_16x1_SSE2 4
+%define sad_reg %1
+%define sum_cur_reg %2
+%define sum_ref_reg %3
+%define mad_reg %4
+ movdqa xmm1, [esi]
+ movdqa xmm2, [edi]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd sum_cur_reg, xmm3 ; sum_cur
+ movdqa xmm3, xmm2
+ psadbw xmm3, xmm0
+ paddd sum_ref_reg, xmm3 ; sum_ref
+
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; abs diff
+ pmaxub mad_reg, xmm3 ; max abs diff
+
+ psadbw xmm3, xmm0
+ paddd sad_reg, xmm3 ; sad
+
+ add esi, ebx
+ add edi, ebx
+%endmacro
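+
+; Scalar sketch of WELS_SAD_SD_MAD_16x1_SSE2 (assumed equivalent, ignoring
+; the SIMD lane layout of the accumulator registers):
+;
+;   #include <stdint.h>
+;   void sad_sd_mad_16x1(const uint8_t *cur, const uint8_t *ref,
+;                        uint32_t *sad, uint32_t *sum_cur,
+;                        uint32_t *sum_ref, uint8_t *mad) {
+;       for (int i = 0; i < 16; ++i) {
+;           int d = cur[i] - ref[i];
+;           if (d < 0) d = -d;
+;           *sad     += d;
+;           *sum_cur += cur[i];
+;           *sum_ref += ref[i];
+;           if (d > *mad) *mad = (uint8_t)d;   /* max abs diff */
+;       }
+;   }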
+
+
+%macro WELS_MAX_REG_SSE2 1 ; xmm1, xmm2, xmm3 can be used
+%define max_reg %1
+ movdqa xmm1, max_reg
+ psrldq xmm1, 4
+ pmaxub max_reg, xmm1
+ movdqa xmm1, max_reg
+ psrldq xmm1, 2
+ pmaxub max_reg, xmm1
+ movdqa xmm1, max_reg
+ psrldq xmm1, 1
+ pmaxub max_reg, xmm1
+%endmacro
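+
+; Net effect, as far as the shifts above go: byte 0 of max_reg becomes the
+; maximum of its low eight bytes. A C sketch for reference:
+;
+;   #include <stdint.h>
+;   uint8_t max_low8(const uint8_t b[8]) {
+;       uint8_t m = b[0];
+;       for (int i = 1; i < 8; ++i)
+;           if (b[i] > m) m = b[i];
+;       return m;
+;   }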
+
+%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 4
+%define sad_reg %1
+%define sum_reg %2
+%define mad_reg %3
+%define sqdiff_reg %4
+ movdqa xmm1, [esi]
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm1
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+ paddd xmm2, xmm3
+ movdqa xmm3, xmm2
+ psllq xmm2, 32
+ psrlq xmm3, 32
+ psllq xmm3, 32
+ paddd xmm2, xmm3
+ paddd sad_reg, xmm2 ; sqsum
+
+ movdqa xmm2, [edi]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd sum_reg, xmm3 ; sum_cur
+ movdqa xmm3, xmm2
+ psadbw xmm3, xmm0
+ pslldq xmm3, 4
+ paddd sum_reg, xmm3 ; sum_ref
+
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; abs diff
+ pmaxub mad_reg, xmm3 ; max abs diff
+
+ movdqa xmm1, xmm3
+ psadbw xmm3, xmm0
+ paddd sad_reg, xmm3 ; sad
+
+ movdqa xmm3, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd sqdiff_reg, xmm1
+ paddd sqdiff_reg, xmm3 ; sqdiff
+
+ add esi, ebx
+ add edi, ebx
+%endmacro
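+
+; Scalar sketch, lane packing aside (see the register comments at the call
+; sites for how sad/sqsum and sum_cur/sum_ref share dword lanes):
+;
+;   #include <stdint.h>
+;   void sad_bgd_sqdiff_16x1(const uint8_t *cur, const uint8_t *ref,
+;                            uint32_t *sad, uint32_t *sqsum_cur,
+;                            uint32_t *sum_cur, uint32_t *sum_ref,
+;                            uint8_t *mad, uint32_t *sqdiff) {
+;       for (int i = 0; i < 16; ++i) {
+;           int d = cur[i] - ref[i];
+;           if (d < 0) d = -d;
+;           *sad       += d;
+;           *sqsum_cur += (uint32_t)cur[i] * cur[i];
+;           *sum_cur   += cur[i];
+;           *sum_ref   += ref[i];
+;           if (d > *mad) *mad = (uint8_t)d;
+;           *sqdiff    += (uint32_t)d * d;
+;       }
+;   }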
+
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata align=16
+
+;ALIGN 16
+;pack1_8x2:
+; dw 1, 1, 1, 1, 1, 1, 1, 1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN rc_sad_frame_sse2
+;***********************************************************************
+; uint32_t rc_sad_frame_sse2( uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
+;***********************************************************************
+ALIGN 16
+rc_sad_frame_sse2:
+ push esi
+ push edi
+ push ebp
+ push ebx
+ push edx
+
+ mov esi, [esp+24]
+ mov edi, [esp+28]
+ mov ebx, [esp+32]
+ mov ecx, [esp+36]
+ mov edx, [esp+40]
+ pxor xmm0, xmm0
+.hloop:
+ mov eax, ebx
+ mov ebp, $0
+.wloop:
+ movdqa xmm1, [esi+ebp]
+ movdqa xmm2, [edi+ebp]
+ psadbw xmm1, xmm2
+ pshufd xmm2, xmm1, 0f6h ; 11110110 B ; movhlps for float
+ paddd xmm1, xmm2
+ paddd xmm0, xmm1
+ add ebp, 010h
+ dec eax
+ jnz near .wloop
+ lea esi, [esi+edx]
+ lea edi, [edi+edx]
+ dec ecx
+ jnz near .hloop
+
+ movd eax, xmm0
+ pop edx
+ pop ebx
+ pop ebp
+ pop edi
+ pop esi
+ ret
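+
+; For reference, a plain C version of rc_sad_frame (a minimal sketch assuming
+; rows of mb_width*16 pixels, 16-byte aligned as the movdqa loads require):
+;
+;   #include <stdint.h>
+;   uint32_t rc_sad_frame_c(const uint8_t *ref, const uint8_t *cur,
+;                           int mb_width, int height, int stride) {
+;       uint32_t sad = 0;
+;       for (int y = 0; y < height; ++y) {
+;           for (int x = 0; x < mb_width * 16; ++x) {
+;               int d = ref[x] - cur[x];
+;               sad += (d < 0) ? -d : d;
+;           }
+;           ref += stride;
+;           cur += stride;
+;       }
+;       return sad;
+;   }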
+
+
+WELS_EXTERN SampleVariance16x16_sse2
+;***********************************************************************
+; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
+;***********************************************************************
+ALIGN 16
+SampleVariance16x16_sse2:
+ push esi
+ push edi
+ push ebx
+
+ sub esp, 16
+ %define SUM [esp]
+ %define SUM_CUR [esp+4]
+ %define SQR [esp+8]
+ %define SQR_CUR [esp+12]
+ %define PUSH_SIZE 28 ; 12 + 16
+
+ mov edi, [esp+PUSH_SIZE+4] ; y_ref
+ mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride
+ mov esi, [esp+PUSH_SIZE+12] ; y_src
+ mov eax, [esp+PUSH_SIZE+16] ; y_src_stride
+ mov ecx, 010h ; height = 16
+
+ pxor xmm7, xmm7
+ movdqu SUM, xmm7
+
+.hloops:
+ movdqa xmm0, [edi] ; y_ref
+ movdqa xmm1, [esi] ; y_src
+ movdqa xmm2, xmm0 ; store first for future process
+ movdqa xmm3, xmm1
+ ; sum += diff;
+ movdqa xmm4, xmm0
+ psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
+ ; to be continued for sum
+ pshufd xmm5, xmm4, 0C6h ; 11000110 B
+ paddw xmm4, xmm5
+ movd ebx, xmm4
+ add SUM, ebx
+
+ ; sqr += diff * diff;
+ pmaxub xmm0, xmm1
+ pminub xmm1, xmm2
+ psubb xmm0, xmm1 ; diff
+ SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+ movd ebx, xmm1
+ add SQR, ebx
+
+ ; sum_cur += y_src[x];
+ movdqa xmm0, xmm3 ; cur_orig
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7
+ punpckhbw xmm1, xmm7
+ paddw xmm0, xmm1 ; 8x2
+ SUM_WORD_8x2_SSE2 xmm0, xmm1
+ movd ebx, xmm0
+ and ebx, 0ffffh
+ add SUM_CUR, ebx
+
+ ; sqr_cur += y_src[x] * y_src[x];
+ SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+ movd ebx, xmm0
+ add SQR_CUR, ebx
+
+ lea edi, [edi+edx]
+ lea esi, [esi+eax]
+ dec ecx
+ jnz near .hloops
+
+ mov ebx, 0
+ mov bx, word SUM
+ sar ebx, 8
+ imul ebx, ebx
+ mov ecx, SQR
+ sar ecx, 8
+ sub ecx, ebx
+ mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture
+ mov [edi], cx ; to store uiMotionIndex
+ mov ebx, 0
+ mov bx, word SUM_CUR
+ sar ebx, 8
+ imul ebx, ebx
+ mov ecx, SQR_CUR
+ sar ecx, 8
+ sub ecx, ebx
+ mov [edi+2], cx ; to store uiTextureIndex
+
+ %undef SUM
+ %undef SUM_CUR
+ %undef SQR
+ %undef SQR_CUR
+ %undef PUSH_SIZE
+
+ add esp, 16
+ pop ebx
+ pop edi
+ pop esi
+
+ ret
+
+; 6/7/2010
+
+%ifndef NO_DYNAMIC_VP
+WELS_EXTERN AnalysisVaaInfoIntra_sse2
+;***********************************************************************
+; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t linesize );
+;***********************************************************************
+ALIGN 16
+AnalysisVaaInfoIntra_sse2:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ mov ebp, esp
+ and ebp, 0fh
+ sub esp, ebp
+ sub esp, 32
+ %define PUSH_SIZE 52 ; 20 + 32
+
+ mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
+ mov ecx, [esp+ebp+PUSH_SIZE+8] ; linesize
+
+ mov ebx, ecx
+ sal ebx, $1 ; linesize x 2 [ebx]
+ mov edx, ebx
+ add edx, ecx ; linesize x 3 [edx]
+ mov eax, ebx
+ sal eax, $1 ; linesize x 4 [eax]
+
+ pxor xmm7, xmm7
+
+ ; loops
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [esp], xmm0
+
+ lea esi, [esi+eax]
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [esp+8], xmm0
+
+ lea esi, [esi+eax]
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [esp+16], xmm0
+
+ lea esi, [esi+eax]
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [esp+24], xmm0
+
+ movdqa xmm0, [esp] ; block 0~7
+ movdqa xmm1, [esp+16] ; block 8~15
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ SUM_WORD_8x2_SSE2 xmm0, xmm3
+
+ pmullw xmm1, xmm1
+ pmullw xmm2, xmm2
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+ punpcklwd xmm1, xmm7
+ punpckhwd xmm3, xmm7
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm4, xmm7
+ paddd xmm1, xmm2
+ paddd xmm3, xmm4
+ paddd xmm1, xmm3
+ pshufd xmm2, xmm1, 01Bh
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0B1h
+ paddd xmm1, xmm2
+
+ movd ebx, xmm0
+ and ebx, 0ffffh ; effective low word truncated
+ mov ecx, ebx
+ imul ebx, ecx
+ sar ebx, $4
+ movd eax, xmm1
+ sub eax, ebx
+
+ %undef PUSH_SIZE
+ add esp, 32
+ add esp, ebp
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+WELS_EXTERN AnalysisVaaInfoIntra_ssse3
+;***********************************************************************
+; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t linesize );
+;***********************************************************************
+ALIGN 16
+AnalysisVaaInfoIntra_ssse3:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ mov ebp, esp
+ and ebp, 0fh
+ sub esp, ebp
+ sub esp, 32
+ %define PUSH_SIZE 52 ; 20 + 32
+
+ mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
+ mov ecx, [esp+ebp+PUSH_SIZE+8] ; linesize
+
+ mov ebx, ecx
+ sal ebx, $1 ; linesize x 2 [ebx]
+ mov edx, ebx
+ add edx, ecx ; linesize x 3 [edx]
+ mov eax, ebx
+ sal eax, $1 ; linesize x 4 [eax]
+
+ pxor xmm7, xmm7
+
+ ; loops
+ VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [esp], xmm0
+
+ lea esi, [esi+eax]
+ VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+ movq [esp+8], xmm1
+
+ lea esi, [esi+eax]
+ VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [esp+16], xmm0
+
+ lea esi, [esi+eax]
+ VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+ movq [esp+24], xmm1
+
+ movdqa xmm0, [esp] ; block 0~7
+ movdqa xmm1, [esp+16] ; block 8~15
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets
+
+ pmullw xmm1, xmm1
+ pmullw xmm2, xmm2
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+ punpcklwd xmm1, xmm7
+ punpckhwd xmm3, xmm7
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm4, xmm7
+ paddd xmm1, xmm2
+ paddd xmm3, xmm4
+ paddd xmm1, xmm3
+ pshufd xmm2, xmm1, 01Bh
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0B1h
+ paddd xmm1, xmm2
+
+ movd ebx, xmm0
+	and ebx, 0ffffh		; effective low word truncated
+ mov ecx, ebx
+ imul ebx, ecx
+ sar ebx, $4
+ movd eax, xmm1
+ sub eax, ebx
+
+ %undef PUSH_SIZE
+ add esp, 32
+ add esp, ebp
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+%endif
+
+
+
+WELS_EXTERN abs_difference_mbrow_sse2
+;*************************************************************************************************************
+;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride,
+; int32_t gom_pixel_num, int32_t *pSum)
+;*************************************************************************************************************
+ALIGN 16
+abs_difference_mbrow_sse2:
+%define ref_orig esp + pushsize + 4
+%define cur_orig esp + pushsize + 8
+%define iPicStride esp + pushsize + 12
+%define gom_pixel_num esp + pushsize + 16
+%define pSum esp + pushsize + 20
+%define pushsize 12
+ push esi
+ push edi
+ push ebx
+ mov esi, [ref_orig]
+ mov edi, [cur_orig]
+ mov ebx, [iPicStride]
+ mov eax, [gom_pixel_num]
+ mov ecx, 16 ;MB_WIDTH_LUMA
+ pxor xmm0, xmm0
+mb_width_loop_p:
+ mov edx, esi
+ add edx, eax ; end address
+gom_row_loop_p:
+ movdqa xmm1, [esi]
+ movdqa xmm2, [edi]
+ psadbw xmm1, xmm2
+ paddd xmm0, xmm1
+ add esi, 16
+ add edi, 16
+ cmp esi, edx
+ jl gom_row_loop_p
+
+ sub esi, eax
+ sub edi, eax
+ add esi, ebx
+ add edi, ebx
+ loop mb_width_loop_p
+
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddd xmm1, xmm0
+ movd eax, xmm1
+ mov edx, [pSum] ; pSum
+ add [edx], eax
+
+%undef ref_orig
+%undef cur_orig
+%undef iPicStride
+%undef gom_pixel_num
+%undef pSum
+%undef pushsize
+ pop ebx
+ pop edi
+ pop esi
+ ret
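+
+; Assumed-equivalent scalar form of abs_difference_mbrow (the 16 below is the
+; MB_WIDTH_LUMA constant loaded into ecx above, used here as the line count
+; of one macroblock row):
+;
+;   #include <stdint.h>
+;   void abs_difference_mbrow_c(const uint8_t *ref, const uint8_t *cur,
+;                               int32_t stride, int32_t gom_pixel_num,
+;                               int32_t *sum) {
+;       int32_t s = 0;
+;       for (int y = 0; y < 16; ++y) {
+;           for (int x = 0; x < gom_pixel_num; ++x) {
+;               int d = ref[x] - cur[x];
+;               s += (d < 0) ? -d : d;
+;           }
+;           ref += stride;
+;           cur += stride;
+;       }
+;       *sum += s;
+;   }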
+
+
+
+
+WELS_EXTERN sum_sqrsum_mbrow_sse2
+;*************************************************************************************************************
+;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride,
+; int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
+;*************************************************************************************************************
+ALIGN 16
+sum_sqrsum_mbrow_sse2:
+%define cur_orig esp + pushsize + 4
+%define iPicStride esp + pushsize + 8
+%define gom_pixel_num esp + pushsize + 12
+%define pSum esp + pushsize + 16
+%define pSqrSum esp + pushsize + 20
+%define pushsize 8
+ push esi
+ push ebx
+ mov esi, [cur_orig]
+ mov eax, [gom_pixel_num]
+ mov ebx, [iPicStride]
+ mov ecx, 16 ;MB_WIDTH_LUMA
+ pxor xmm0, xmm0 ; zero
+ pxor xmm1, xmm1 ; sum
+ pxor xmm2, xmm2 ; sqr sum
+mb_width_loop_i:
+ mov edx, esi
+ add edx, eax ; end address
+gom_row_loop_i:
+ movdqa xmm3, [esi]
+ movdqa xmm4, xmm3
+ psadbw xmm4, xmm0
+ paddd xmm1, xmm4
+ movdqa xmm4, xmm3
+ punpcklbw xmm4, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm4, xmm4
+ pmaddwd xmm3, xmm3
+ paddd xmm2, xmm3
+ paddd xmm2, xmm4
+ add esi, 16
+ cmp esi, edx
+ jl gom_row_loop_i
+
+ sub esi, eax
+ add esi, ebx
+ loop mb_width_loop_i
+
+ movdqa xmm3, xmm1
+ psrldq xmm3, 8
+ paddd xmm1, xmm3
+ movd eax, xmm1
+ mov edx, [pSum]
+ add [edx], eax
+
+ movdqa xmm3, xmm2
+ psrldq xmm3, 8
+ paddd xmm2, xmm3
+ movdqa xmm3, xmm2
+ psrldq xmm3, 4
+ paddd xmm2, xmm3
+ movd eax, xmm2
+ mov edx, [pSqrSum]
+ add [edx], eax
+
+
+%undef cur_orig
+%undef iPicStride
+%undef gom_pixel_num
+%undef pSum
+%undef pSqrSum
+%undef pushsize
+ pop ebx
+ pop esi
+ ret
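+
+; Matching scalar sketch for sum_sqrsum_mbrow (same caveats as above):
+;
+;   #include <stdint.h>
+;   void sum_sqrsum_mbrow_c(const uint8_t *cur, int32_t stride,
+;                           int32_t gom_pixel_num, int32_t *sum,
+;                           int32_t *sqr_sum) {
+;       int32_t s = 0, sq = 0;
+;       for (int y = 0; y < 16; ++y) {
+;           for (int x = 0; x < gom_pixel_num; ++x) {
+;               s  += cur[x];
+;               sq += cur[x] * cur[x];
+;           }
+;           cur += stride;
+;       }
+;       *sum     += s;
+;       *sqr_sum += sq;
+;   }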
+
+
+
+WELS_EXTERN VAACalcSad_sse2
+;*************************************************************************************************************
+;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSad_sse2:
+%define cur_data esp + pushsize + 4
+%define ref_data esp + pushsize + 8
+%define iPicWidth esp + pushsize + 12
+%define iPicHeight esp + pushsize + 16
+%define iPicStride esp + pushsize + 20
+%define psadframe esp + pushsize + 24
+%define psad8x8 esp + pushsize + 28
+%define pushsize 12
+ push esi
+ push edi
+ push ebx
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
+height_loop:
+ mov ecx, dword [iPicWidth]
+ push esi
+ push edi
+width_loop:
+	pxor	xmm6, xmm6		; hiQuad_loQuad pSad8x8
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ paddd xmm7, xmm6
+ movd [edx], xmm6
+ psrldq xmm6, 8
+ movd [edx+4], xmm6
+
+ pxor xmm6, xmm6
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ paddd xmm7, xmm6
+ movd [edx+8], xmm6
+ psrldq xmm6, 8
+ movd [edx+12], xmm6
+
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
+
+ dec ecx
+ jnz width_loop
+
+ pop edi
+ pop esi
+ add esi, eax
+ add edi, eax
+
+ dec dword [iPicHeight]
+ jnz height_loop
+
+ mov edx, [psadframe]
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5
+ movd [edx], xmm7
+
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef pushsize
+ pop ebx
+ pop edi
+ pop esi
+ ret
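+
+; Output layout, as I read the stores above: for each 16x16 block, psad8x8
+; receives the SADs of its four 8x8 quadrants in the order top-left,
+; top-right, bottom-left, bottom-right, and psadframe gets the frame total.
+; A hypothetical helper for one quadrant, in C:
+;
+;   #include <stdint.h>
+;   uint32_t sad8x8_c(const uint8_t *cur, const uint8_t *ref, int stride) {
+;       uint32_t s = 0;
+;       for (int y = 0; y < 8; ++y, cur += stride, ref += stride)
+;           for (int x = 0; x < 8; ++x) {
+;               int d = cur[x] - ref[x];
+;               s += (d < 0) ? -d : d;
+;           }
+;       return s;
+;   }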
+
+
+WELS_EXTERN VAACalcSadVar_sse2
+;*************************************************************************************************************
+;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadVar_sse2:
+%define localsize 8
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define psum16x16 esp + pushsize + localsize + 32
+%define psqsum16x16 esp + pushsize + localsize + 36
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define pushsize 16
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
+var_height_loop:
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi
+ mov [tmp_edi], edi
+var_width_loop:
+ pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
+ pxor xmm5, xmm5 ; pSum16x16
+ pxor xmm4, xmm4 ; sqsum_16x16
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ paddd xmm7, xmm6
+ movd [edx], xmm6
+ psrldq xmm6, 8
+ movd [edx+4], xmm6
+
+ pxor xmm6, xmm6
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ paddd xmm7, xmm6
+ movd [edx+8], xmm6
+ psrldq xmm6, 8
+ movd [edx+12], xmm6
+
+ mov ebp, [psum16x16]
+ movdqa xmm1, xmm5
+ psrldq xmm1, 8
+ paddd xmm5, xmm1
+ movd [ebp], xmm5
+ add dword [psum16x16], 4
+
+ movdqa xmm5, xmm4
+ psrldq xmm5, 8
+ paddd xmm4, xmm5
+ movdqa xmm3, xmm4
+ psrldq xmm3, 4
+ paddd xmm4, xmm3
+
+ mov ebp, [psqsum16x16]
+ movd [ebp], xmm4
+ add dword [psqsum16x16], 4
+
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
+
+ dec ecx
+ jnz var_width_loop
+
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax
+ add edi, eax
+
+ dec dword [iPicHeight]
+ jnz var_height_loop
+
+ mov edx, [psadframe]
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5
+ movd [edx], xmm7
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef tmp_esi
+%undef tmp_edi
+%undef pushsize
+%undef localsize
+ ret
+
+
+
+WELS_EXTERN VAACalcSadSsd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadSsd_sse2:
+%define localsize 12
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define psum16x16 esp + pushsize + localsize + 32
+%define psqsum16x16 esp + pushsize + localsize + 36
+%define psqdiff16x16 esp + pushsize + localsize + 40
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define tmp_sadframe esp + 8
+%define pushsize 16
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov ecx, [iPicWidth]
+ mov ecx, [iPicHeight]
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ mov ecx, [iPicWidth]
+ mov ecx, [iPicHeight]
+ pxor xmm0, xmm0
+ movd [tmp_sadframe], xmm0
+sqdiff_height_loop:
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi
+ mov [tmp_edi], edi
+sqdiff_width_loop:
+ pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
+ pxor xmm6, xmm6 ; pSum16x16
+ pxor xmm5, xmm5 ; sqsum_16x16 four dword
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ movdqa xmm1, xmm7
+ movd [edx], xmm7
+ psrldq xmm7, 8
+ paddd xmm1, xmm7
+ movd [edx+4], xmm7
+ movd ebp, xmm1
+ add [tmp_sadframe], ebp
+
+ pxor xmm7, xmm7
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ movdqa xmm1, xmm7
+ movd [edx+8], xmm7
+ psrldq xmm7, 8
+ paddd xmm1, xmm7
+ movd [edx+12], xmm7
+ movd ebp, xmm1
+ add [tmp_sadframe], ebp
+
+ mov ebp, [psum16x16]
+ movdqa xmm1, xmm6
+ psrldq xmm1, 8
+ paddd xmm6, xmm1
+ movd [ebp], xmm6
+ add dword [psum16x16], 4
+
+ mov ebp, [psqsum16x16]
+ pshufd xmm6, xmm5, 14 ;00001110
+ paddd xmm6, xmm5
+ pshufd xmm5, xmm6, 1 ;00000001
+ paddd xmm5, xmm6
+ movd [ebp], xmm5
+ add dword [psqsum16x16], 4
+
+ mov ebp, [psqdiff16x16]
+ pshufd xmm5, xmm4, 14 ; 00001110
+ paddd xmm5, xmm4
+ pshufd xmm4, xmm5, 1 ; 00000001
+ paddd xmm4, xmm5
+ movd [ebp], xmm4
+ add dword [psqdiff16x16], 4
+
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
+
+ dec ecx
+ jnz sqdiff_width_loop
+
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax
+ add edi, eax
+
+ dec dword [iPicHeight]
+ jnz sqdiff_height_loop
+
+ mov ebx, [tmp_sadframe]
+ mov eax, [psadframe]
+ mov [eax], ebx
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef psqdiff16x16
+%undef tmp_esi
+%undef tmp_edi
+%undef tmp_sadframe
+%undef pushsize
+%undef localsize
+ ret
+
+
+
+
+
+WELS_EXTERN VAACalcSadBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadBgd_sse2:
+%define localsize 12
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define p_sd8x8 esp + pushsize + localsize + 32
+%define p_mad8x8 esp + pushsize + localsize + 36
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define tmp_ecx esp + 8
+%define pushsize 16
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ xor ebp, ebp
+ pxor xmm0, xmm0
+bgd_height_loop:
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi
+ mov [tmp_edi], edi
+bgd_width_loop:
+ pxor xmm7, xmm7 ; pSad8x8
+ pxor xmm6, xmm6 ; sum_cur_8x8
+ pxor xmm5, xmm5 ; sum_ref_8x8
+ pxor xmm4, xmm4 ; pMad8x8
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+
+
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm4
+
+ ;movdqa xmm1, xmm4
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm4, xmm0
+ ;punpcklwd xmm4, xmm0
+ ;movd [edx+4], xmm4
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ mov [tmp_ecx], ecx
+ movhlps xmm1, xmm4
+ movd ecx, xmm4
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx
+
+
+ pslldq xmm7, 4
+ pslldq xmm6, 4
+ pslldq xmm5, 4
+
+
+ pxor xmm4, xmm4 ; pMad8x8
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm4
+
+ ;movdqa xmm1, xmm4
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm4, xmm0
+ ;punpcklwd xmm4, xmm0
+ ;movd [edx+4], xmm4
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ movhlps xmm1, xmm4
+ movd ecx, xmm4
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx
+
+ ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
+
+ mov edx, [psad8x8]
+ pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
+ movdqa [edx], xmm1
+ add edx, 16
+ mov [psad8x8], edx ; sad8x8
+
+ paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
+ pshufd xmm2, xmm1, 00000011b
+ paddd xmm1, xmm2
+ movd edx, xmm1
+ add ebp, edx ; sad frame
+
+ mov edx, [p_sd8x8]
+ psubd xmm6, xmm5
+ pshufd xmm1, xmm6, 10001101b
+ movdqa [edx], xmm1
+ add edx, 16
+ mov [p_sd8x8], edx
+
+
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
+
+ mov ecx, [tmp_ecx]
+ dec ecx
+ jnz bgd_width_loop
+
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax
+ add edi, eax
+
+ dec dword [iPicHeight]
+ jnz bgd_height_loop
+
+ mov edx, [psadframe]
+ mov [edx], ebp
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef p_sd8x8
+%undef p_mad8x8
+%undef tmp_esi
+%undef tmp_edi
+%undef pushsize
+%undef localsize
+ ret
+
+
+
+WELS_EXTERN VAACalcSadSsdBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadSsdBgd_sse2:
+%define localsize 16
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define psum16x16 esp + pushsize + localsize + 32
+%define psqsum16x16 esp + pushsize + localsize + 36
+%define psqdiff16x16 esp + pushsize + localsize + 40
+%define p_sd8x8 esp + pushsize + localsize + 44
+%define p_mad8x8 esp + pushsize + localsize + 48
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define tmp_sadframe esp + 8
+%define tmp_ecx esp + 12
+%define pushsize 16
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ movd [tmp_sadframe], xmm0
+sqdiff_bgd_height_loop:
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi
+ mov [tmp_edi], edi
+sqdiff_bgd_width_loop:
+ pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+
+ mov edx, [psad8x8]
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b
+ movd [edx], xmm2
+ movd [edx+4], xmm1
+ add edx, 8
+ mov [psad8x8], edx ; sad8x8
+
+ paddd xmm1, xmm2
+ movd edx, xmm1
+ add [tmp_sadframe], edx ; iFrameSad
+
+ mov edx, [psum16x16]
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd [edx], xmm1 ; sum
+
+ mov edx, [p_sd8x8]
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [edx], xmm1
+ add edx, 8
+ mov [p_sd8x8], edx
+
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm5
+ ;movdqa xmm1, xmm5
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm5, xmm0
+ ;punpcklwd xmm5, xmm0
+ ;movd [edx+4], xmm5
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ mov [tmp_ecx], ecx
+ movhlps xmm1, xmm5
+ movd ecx, xmm5
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx
+
+ psrlq xmm7, 32
+ psllq xmm7, 32 ; clear sad
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+
+ mov edx, [psad8x8]
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b
+ movd [edx], xmm2
+ movd [edx+4], xmm1
+ add edx, 8
+ mov [psad8x8], edx ; sad8x8
+
+ paddd xmm1, xmm2
+ movd edx, xmm1
+ add [tmp_sadframe], edx ; iFrameSad
+
+ mov edx, [psum16x16]
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd ebp, xmm1 ; sum
+ add [edx], ebp
+ add edx, 4
+ mov [psum16x16], edx
+
+ mov edx, [psqsum16x16]
+ psrlq xmm7, 32
+ pshufd xmm2, xmm7, 00001110b
+ paddd xmm2, xmm7
+ movd [edx], xmm2 ; sqsum
+ add edx, 4
+ mov [psqsum16x16], edx
+
+ mov edx, [p_sd8x8]
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [edx], xmm1
+ add edx, 8
+ mov [p_sd8x8], edx
+
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm5
+ ;movdqa xmm1, xmm5
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm5, xmm0
+ ;punpcklwd xmm5, xmm0
+ ;movd [edx+4], xmm5
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ movhlps xmm1, xmm5
+ movd ecx, xmm5
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx
+
+ mov edx, [psqdiff16x16]
+ pshufd xmm1, xmm4, 00001110b
+ paddd xmm4, xmm1
+ pshufd xmm1, xmm4, 00000001b
+ paddd xmm4, xmm1
+ movd [edx], xmm4
+ add edx, 4
+ mov [psqdiff16x16], edx
+
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
+
+ mov ecx, [tmp_ecx]
+ dec ecx
+ jnz sqdiff_bgd_width_loop
+
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax
+ add edi, eax
+
+ dec dword [iPicHeight]
+ jnz sqdiff_bgd_height_loop
+
+ mov edx, [psadframe]
+ mov ebp, [tmp_sadframe]
+ mov [edx], ebp
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef psqdiff16x16
+%undef p_sd8x8
+%undef p_mad8x8
+%undef tmp_esi
+%undef tmp_edi
+%undef pushsize
+%undef localsize
+ ret
--- a/processing/src/common/WelsVP.def
+++ b/processing/src/common/WelsVP.def
@@ -1,36 +1,36 @@
-;*!
-;* \copy
-;* Copyright (c) 2011-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-
-LIBRARY welsvp.dll
-EXPORTS
- CreateVpInterface PRIVATE
+;*!
+;* \copy
+;* Copyright (c) 2011-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+
+LIBRARY welsvp.dll
+EXPORTS
+ CreateVpInterface PRIVATE
DestroyVpInterface PRIVATE
\ No newline at end of file
--- a/testbin/layer2.cfg
+++ b/testbin/layer2.cfg
@@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth 320 # Input frame width
-SourceHeight 192 # Input frame height
-FrameRateIn 12 # Input frame rate [Hz]
-FrameRateOut 12 # Output frame rate [Hz]
-InputFile CiscoVT2people_320x192_12fps.yuv # Input file
-ReconFile rec_layer2.yuv # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc 66 # value of profile_idc (or 0 for auto detection)
-
-InitialQP 24 # Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate 600 # Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode 0 # 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize 1500
-SliceNum 1 # multiple slices number specified
-
-SlicesAssign0 960 # count number of MBs in slice #0
-SlicesAssign1 0 # count number of MBs in slice #1
-SlicesAssign2 0 # count number of MBs in slice #2
-SlicesAssign3 0 # count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4 0 # count number of MBs in slice #4
-SlicesAssign5 0 # count number of MBs in slice #5
-SlicesAssign6 0 # count number of MBs in slice #6
-SlicesAssign7 0 # count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE ####
-# 0 SM_SINGLE_SLICE | SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE | according to SlicesAssign | Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth 320 # Input frame width
+SourceHeight 192 # Input frame height
+FrameRateIn 12 # Input frame rate [Hz]
+FrameRateOut 12 # Output frame rate [Hz]
+InputFile CiscoVT2people_320x192_12fps.yuv # Input file
+ReconFile rec_layer2.yuv # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc 66 # value of profile_idc (or 0 for auto detection)
+
+InitialQP 24 # Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate		600			# Unit: kbps; also controlled by DisableRC
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode			0			# 0: single slice mode; >0: multiple slice mode, see below
+SliceSize 1500
+SliceNum 1 # multiple slices number specified
+
+SlicesAssign0 960 # count number of MBs in slice #0
+SlicesAssign1 0 # count number of MBs in slice #1
+SlicesAssign2 0 # count number of MBs in slice #2
+SlicesAssign3		0			# count number of MBs in slice #3 -- setting here is for better testing
+SlicesAssign4 0 # count number of MBs in slice #4
+SlicesAssign5 0 # count number of MBs in slice #5
+SlicesAssign6 0 # count number of MBs in slice #6
+SlicesAssign7 0 # count number of MBs in slice #7
+
+### DESIGN OF SLICE MODE ####
+# 0 SM_SINGLE_SLICE | SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enables dynamic slicing for multi-threading
+# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Needs the MB count of each slice as input. In addition, any other constraint present in slice_argument must also be met. Typically, if MB count and slice size are both constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Typically a single row of MBs per slice, plus a slice size constraint, which may involve re-encoding
+# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (slice_nums is unknown until the current frame has been encoded)
+
--- a/testbin/layer2_vd.cfg
+++ b/testbin/layer2_vd.cfg
@@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth 320 # Input frame width
-SourceHeight 192 # Input frame height
-FrameRateIn 12 # Input frame rate [Hz]
-FrameRateOut 12 # Output frame rate [Hz]
-InputFile CiscoVT2people_320x192_12fps.yuv # Input file
-ReconFile rec_layer2.yuv # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc 66 # value of profile_idc (or 0 for auto detection)
-
-InitialQP 24 # Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate 600 # Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode 0 # 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize 1500
-SliceNum 1 # multiple slices number specified
-
-SlicesAssign0 960 # count number of MBs in slice #0
-SlicesAssign1 0 # count number of MBs in slice #1
-SlicesAssign2 0 # count number of MBs in slice #2
-SlicesAssign3 0 # count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4 0 # count number of MBs in slice #4
-SlicesAssign5 0 # count number of MBs in slice #5
-SlicesAssign6 0 # count number of MBs in slice #6
-SlicesAssign7 0 # count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE, 100804, Sijia ####
-# 0 SM_SINGLE_SLICE | SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE | according to SlicesAssign | Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Specially for TP. Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth 320 # Input frame width
+SourceHeight 192 # Input frame height
+FrameRateIn 12 # Input frame rate [Hz]
+FrameRateOut 12 # Output frame rate [Hz]
+InputFile CiscoVT2people_320x192_12fps.yuv # Input file
+ReconFile rec_layer2.yuv # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc 66 # value of profile_idc (or 0 for auto detection)
+
+InitialQP 24 # Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate 600 # Unit: kbps, also controlled by DisableRC
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode 0 # 0: single slice mode; >0: multiple slice modes, see below;
+SliceSize 1500
+SliceNum 1 # number of slices when multiple slices are used
+
+SlicesAssign0 960 # number of MBs in slice #0
+SlicesAssign1 0 # number of MBs in slice #1
+SlicesAssign2 0 # number of MBs in slice #2
+SlicesAssign3 0 # number of MBs in slice #3 -- setting it here is for better testing
+SlicesAssign4 0 # number of MBs in slice #4
+SlicesAssign5 0 # number of MBs in slice #5
+SlicesAssign6 0 # number of MBs in slice #6
+SlicesAssign7 0 # number of MBs in slice #7
+
+### DESIGN OF SLICE MODE, 100804, Sijia ####
+# 0 SM_SINGLE_SLICE | SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enables dynamic slicing for multi-threading
+# 2 SM_RASTER_SLICE | according to SlicesAssign | Needs the MB count of each slice as input. In addition, any other constraints given in slice_argument must be honored. Typically, if MB count and slice size are both constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Specially for TP. Typically a single row of MBs per slice, plus a slice size constraint that may involve re-encoding
+# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (the slice count is unknown until the current frame has been encoded)
+
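As a quick sanity check on the SlicesAssign values above: H.264 macroblocks are 16x16 pixels, so this file's 320x192 input holds (320/16) * (192/16) = 20 * 12 = 240 MBs in total. The 960 in SlicesAssign0 therefore exceeds the picture size and looks carried over from a larger test input; with SliceMode 0 the SlicesAssign entries are presumably ignored anyway.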
--- a/testbin/layer2_vd_rc.cfg
+++ b/testbin/layer2_vd_rc.cfg
@@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth 320 # Input frame width
-SourceHeight 192 # Input frame height
-FrameRateIn 12 # Input frame rate [Hz]
-FrameRateOut 12 # Output frame rate [Hz]
-InputFile CiscoVT2people_320x192_12fps.yuv # Input file
-ReconFile rec_layer2.yuv # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc 66 # value of profile_idc (or 0 for auto detection)
-
-InitialQP 24 # Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate 600 # Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode 0 # 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize 1500
-SliceNum 1 # multiple slices number specified
-
-SlicesAssign0 960 # count number of MBs in slice #0
-SlicesAssign1 0 # count number of MBs in slice #1
-SlicesAssign2 0 # count number of MBs in slice #2
-SlicesAssign3 0 # count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4 0 # count number of MBs in slice #4
-SlicesAssign5 0 # count number of MBs in slice #5
-SlicesAssign6 0 # count number of MBs in slice #6
-SlicesAssign7 0 # count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE, 100804, Sijia ####
-# 0 SM_SINGLE_SLICE | SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE | according to SlicesAssign | Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Specially for TP. Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth 320 # Input frame width
+SourceHeight 192 # Input frame height
+FrameRateIn 12 # Input frame rate [Hz]
+FrameRateOut 12 # Output frame rate [Hz]
+InputFile CiscoVT2people_320x192_12fps.yuv # Input file
+ReconFile rec_layer2.yuv # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc 66 # value of profile_idc (or 0 for auto detection)
+
+InitialQP 24 # Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate 600 # Unit: kbps, also controlled by DisableRC
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode 0 # 0: single slice mode; >0: multiple slice modes, see below;
+SliceSize 1500
+SliceNum 1 # number of slices when multiple slices are used
+
+SlicesAssign0 960 # number of MBs in slice #0
+SlicesAssign1 0 # number of MBs in slice #1
+SlicesAssign2 0 # number of MBs in slice #2
+SlicesAssign3 0 # number of MBs in slice #3 -- setting it here is for better testing
+SlicesAssign4 0 # number of MBs in slice #4
+SlicesAssign5 0 # number of MBs in slice #5
+SlicesAssign6 0 # number of MBs in slice #6
+SlicesAssign7 0 # number of MBs in slice #7
+
+### DESIGN OF SLICE MODE, 100804, Sijia ####
+# 0 SM_SINGLE_SLICE | SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enables dynamic slicing for multi-threading
+# 2 SM_RASTER_SLICE | according to SlicesAssign | Needs the MB count of each slice as input. In addition, any other constraints given in slice_argument must be honored. Typically, if MB count and slice size are both constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Specially for TP. Typically a single row of MBs per slice, plus a slice size constraint that may involve re-encoding
+# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (the slice count is unknown until the current frame has been encoded)
+
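The SliceSize of 1500 above pairs naturally with SM_DYN_SLICE. A sketch (values illustrative) that caps each slice near the 1500-byte Ethernet MTU, so one slice fits roughly one IP packet:

SliceMode 4 # SM_DYN_SLICE: slice boundaries decided by size during encoding
SliceSize 1500 # upper bound per slice, in bytes
# per the design table above, SliceNum and SlicesAssign* are not consulted here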
--- a/testbin/welsenc.cfg
+++ b/testbin/welsenc.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile test.264 # Bitstream file
-MaxFrameRate 30 # Maximum frame rate [Hz]
-FramesToBeEncoded -1 # Number of frames (at input frame rate)
-
-GOPSize 4 # GOP Size (at maximum frame rate), 16
-IntraPeriod 0 # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition 1
-
-EnableFrameCropping 1 # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
- # 2: on except for slice boundaries,
- # 3: two stage. slice boundries on in second stage
- # 4: Luma on but Chroma off (w.r.t. idc=0)
- # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
- # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset 0 # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset 0 # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
- # 2: on except for slice boundaries,
- # 3: two stage. slice boundries on in second stage
- # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
- # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
- # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0 # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset 0 # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc 1 # 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC 1 # ENABLE RC
-TargetBitrate 5000 # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise 0 # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection 1 # Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection 1 # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization 1 # Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference 0 # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod 30 # Long Term Reference Marking Period
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl 0 # Control flag of adding prefix unit (0: off, 1: on)
- # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
- # Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers 1 # Number of layers
-//LayerCfg layer0.cfg # Layer 0 configuration file
-//LayerCfg layer1.cfg # Layer 1 configuration file
-LayerCfg layer2.cfg # Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile test.264 # Bitstream file
+MaxFrameRate 30 # Maximum frame rate [Hz]
+FramesToBeEncoded -1 # Number of frames (at input frame rate)
+
+GOPSize 4 # GOP Size (at maximum frame rate), 16
+IntraPeriod 0 # Intra Period (a multiple of GOP size, or -1)
+EnableSpsPpsIDAddition 1
+
+EnableFrameCropping 1 # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
+ # 2: on, except for slice boundaries,
+ # 3: two-stage; slice boundaries filtered in the second stage,
+ # 4: Luma on but Chroma off (w.r.t. idc=0),
+ # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+ # 6: two-stage Luma, slice boundaries filtered in the second stage, but Chroma off (w.r.t. idc=3)
+LoopFilterAlphaC0Offset 0 # AlphaOffset, valid range: -6..+6
+LoopFilterBetaOffset 0 # BetaOffset, valid range: -6..+6
+
+InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
+ # 2: on, except for slice boundaries,
+ # 3: two-stage; slice boundaries filtered in the second stage,
+ # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0),
+ # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+ # 6: two-stage Luma, slice boundaries filtered in the second stage, but Chroma off (w.r.t. idc=3)
+InterLayerLoopFilterAlphaC0Offset 0 # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset 0 # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc 1 # 0: auto (determined dynamically inside the encoder); 1: multi-threading disabled; >1: explicit number of threads
+
+#============================== RATE CONTROL ==============================
+EnableRC 1 # enable rate control (1: enable, 0: disable)
+TargetBitrate 5000 # Unit: kbps, also controlled by EnableRC
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise 0 # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection 1 # Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection 1 # BGD control (1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization 1 # Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference 0 # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod 30 # Long Term Reference Marking Period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl 0 # Control flag for adding prefix NAL units (0: off, 1: on)
+ # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+ # Can be disabled (set to 0) when no inter spatial layer prediction is used
+NumLayers 1 # Number of layers
+//LayerCfg layer0.cfg # Layer 0 configuration file
+//LayerCfg layer1.cfg # Layer 1 configuration file
+LayerCfg layer2.cfg # Layer 2 configuration file
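The commented-out LayerCfg lines show how a multi-layer (spatial SVC) setup would look. A sketch, assuming the referenced layer0.cfg and layer1.cfg are filled in:

NumLayers 3 # three spatial layers
LayerCfg layer0.cfg # Layer 0 configuration file (lowest resolution)
LayerCfg layer1.cfg # Layer 1 configuration file
LayerCfg layer2.cfg # Layer 2 configuration file (full resolution)
PrefixNALAddingCtrl 1 # per the note above, prefix NAL units shall be on in SVC contexts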
--- a/testbin/welsenc_vd_1d.cfg
+++ b/testbin/welsenc_vd_1d.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile test_vd_1d.264 # Bitstream file
-MaxFrameRate 30 # Maximum frame rate [Hz]
-FramesToBeEncoded -1 # Number of frames (at input frame rate)
-
-GOPSize 4 # GOP Size (at maximum frame rate), 16
-IntraPeriod 0 # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition 1
-
-EnableFrameCropping 1 # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
- # 2: on except for slice boundaries,
- # 3: two stage. slice boundries on in second stage
- # 4: Luma on but Chroma off (w.r.t. idc=0)
- # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
- # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset 0 # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset 0 # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
- # 2: on except for slice boundaries,
- # 3: two stage. slice boundries on in second stage
- # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
- # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
- # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0 # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset 0 # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc 1 # 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC 0 # ENABLE RC
-TargetBitrate 5000 # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise 0 # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection 1 # Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection 1 # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization 0 # Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference 1 # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod 30 # Long Term Reference Marking Period
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl 0 # Control flag of adding prefix unit (0: off, 1: on)
- # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
- # Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers 1 # Number of layers
-//LayerCfg layer0_vd.cfg # Layer 0 configuration file
-//LayerCfg layer1_vd.cfg # Layer 1 configuration file
-LayerCfg layer2_vd.cfg # Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile test_vd_1d.264 # Bitstream file
+MaxFrameRate 30 # Maximum frame rate [Hz]
+FramesToBeEncoded -1 # Number of frames (at input frame rate)
+
+GOPSize 4 # GOP Size (at maximum frame rate), 16
+IntraPeriod 0 # Intra Period (a multiple of GOP size, or -1)
+EnableSpsPpsIDAddition 1
+
+EnableFrameCropping 1 # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
+ # 2: on, except for slice boundaries,
+ # 3: two-stage; slice boundaries filtered in the second stage,
+ # 4: Luma on but Chroma off (w.r.t. idc=0),
+ # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+ # 6: two-stage Luma, slice boundaries filtered in the second stage, but Chroma off (w.r.t. idc=3)
+LoopFilterAlphaC0Offset 0 # AlphaOffset, valid range: -6..+6
+LoopFilterBetaOffset 0 # BetaOffset, valid range: -6..+6
+
+InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
+ # 2: on, except for slice boundaries,
+ # 3: two-stage; slice boundaries filtered in the second stage,
+ # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0),
+ # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+ # 6: two-stage Luma, slice boundaries filtered in the second stage, but Chroma off (w.r.t. idc=3)
+InterLayerLoopFilterAlphaC0Offset 0 # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset 0 # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc 1 # 0: auto (determined dynamically inside the encoder); 1: multi-threading disabled; >1: explicit number of threads
+
+#============================== RATE CONTROL ==============================
+EnableRC 0 # enable rate control (1: enable, 0: disable)
+TargetBitrate 5000 # Unit: kbps, also controlled by EnableRC
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise 0 # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection 1 # Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection 1 # BGD control (1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization 0 # Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference 1 # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod 30 # Long Term Reference Marking Period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl 0 # Control flag for adding prefix NAL units (0: off, 1: on)
+ # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+ # Can be disabled (set to 0) when no inter spatial layer prediction is used
+NumLayers 1 # Number of layers
+//LayerCfg layer0_vd.cfg # Layer 0 configuration file
+//LayerCfg layer1_vd.cfg # Layer 1 configuration file
+LayerCfg layer2_vd.cfg # Layer 2 configuration file
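Note the combination this file uses: EnableRC 0 turns rate control off, so TargetBitrate is presumably ignored and quality is driven by the layer config's InitialQP instead. A minimal fixed-quality sketch:

EnableRC 0 # rate control off; TargetBitrate has no effect
# in the layer cfg (cf. layer2_vd.cfg):
InitialQP 24 # a constant base QP sets quality instead of a bitrate target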
--- a/testbin/welsenc_vd_rc.cfg
+++ b/testbin/welsenc_vd_rc.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile test_vd_rc.264 # Bitstream file
-MaxFrameRate 30 # Maximum frame rate [Hz]
-FramesToBeEncoded -1 # Number of frames (at input frame rate), -1
-
-GOPSize 8 # GOP Size (at maximum frame rate), 16
-IntraPeriod 0 # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition 1
-
-EnableFrameCropping 1 # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
- # 2: on except for slice boundaries,
- # 3: two stage. slice boundries on in second stage
- # 4: Luma on but Chroma off (w.r.t. idc=0)
- # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
- # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset 0 # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset 0 # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
- # 2: on except for slice boundaries,
- # 3: two stage. slice boundries on in second stage
- # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
- # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
- # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0 # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset 0 # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc 1 # 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC 1 # ENABLE RC
-TargetBitrate 600 # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise 1 # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection 1 # Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection 1 # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization 1 # Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference 1 # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod 30 # Long Term Reference Marking Period
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl 0 # Control flag of adding prefix unit (0: off, 1: on)
- # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
- # Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers 1 # Number of layers
-//LayerCfg layer0_vd.cfg # Layer 0 configuration file
-//LayerCfg layer1_vd.cfg # Layer 1 configuration file
-LayerCfg layer2_vd_rc.cfg # Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile test_vd_rc.264 # Bitstream file
+MaxFrameRate 30 # Maximum frame rate [Hz]
+FramesToBeEncoded -1 # Number of frames (at input frame rate), -1
+
+GOPSize 8 # GOP Size (at maximum frame rate), 16
+IntraPeriod 0 # Intra Period (a multiple of GOP size, or -1)
+EnableSpsPpsIDAddition 1
+
+EnableFrameCropping 1 # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
+ # 2: on, except for slice boundaries,
+ # 3: two-stage; slice boundaries filtered in the second stage,
+ # 4: Luma on but Chroma off (w.r.t. idc=0),
+ # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+ # 6: two-stage Luma, slice boundaries filtered in the second stage, but Chroma off (w.r.t. idc=3)
+LoopFilterAlphaC0Offset 0 # AlphaOffset, valid range: -6..+6
+LoopFilterBetaOffset 0 # BetaOffset, valid range: -6..+6
+
+InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
+ # 2: on, except for slice boundaries,
+ # 3: two-stage; slice boundaries filtered in the second stage,
+ # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0),
+ # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+ # 6: two-stage Luma, slice boundaries filtered in the second stage, but Chroma off (w.r.t. idc=3)
+InterLayerLoopFilterAlphaC0Offset 0 # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset 0 # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc 1 # 0: auto (determined dynamically inside the encoder); 1: multi-threading disabled; >1: explicit number of threads
+
+#============================== RATE CONTROL ==============================
+EnableRC 1 # enable rate control (1: enable, 0: disable)
+TargetBitrate 600 # Unit: kbps, also controlled by EnableRC
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise 1 # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection 1 # Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection 1 # BGD control (1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization 1 # Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference 1 # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod 30 # Long Term Reference Marking Period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl 0 # Control flag for adding prefix NAL units (0: off, 1: on)
+ # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+ # Can be disabled (set to 0) when no inter spatial layer prediction is used
+NumLayers 1 # Number of layers
+//LayerCfg layer0_vd.cfg # Layer 0 configuration file
+//LayerCfg layer1_vd.cfg # Layer 1 configuration file
+LayerCfg layer2_vd_rc.cfg # Layer 2 configuration file
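Finally, one plausible way the pieces above combine for parallel encoding, based only on the comments in these files (whether one slice per thread is required is not stated here):

MultipleThreadIdc 4 # four encoding threads
SliceMode 1 # SM_FIXEDSLCNUM_SLICE, which the design table ties to multi-threading
SliceNum 4 # one slice per thread
LoopFilterDisableIDC 2 # deblock, but never across slice boundaries, so slices stay independent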