shithub: openh264

Download patch

ref: 08638a9396dd466fdb5c68068b5aab8853def896
parent: 3c40261256f6e89bd868f9fafba8b9c4fd9ea15d
parent: 04dba61d22ab43c1f428302f2abbaecca723fe30
author: Ethan Hugg <[email protected]>
date: Tue Jan 28 06:04:12 EST 2014

Merge pull request #252 from mstorsjo/share-processing-asm

Merge identical assembly code between the processing and encoder libs

--- a/codec/build/win32/enc/WelsEncCore.vcproj
+++ b/codec/build/win32/enc/WelsEncCore.vcproj
@@ -2106,7 +2106,7 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\encoder\core\asm\satd_sad.asm"
+				RelativePath="..\..\..\common\satd_sad.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
--- /dev/null
+++ b/codec/common/satd_sad.asm
@@ -1,0 +1,2344 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  satd_sad.asm
+;*
+;*  Abstract
+;*      WelsSampleSatd4x4_sse2
+;*      WelsSampleSatd8x8_sse2
+;*      WelsSampleSatd16x8_sse2
+;*      WelsSampleSatd8x16_sse2
+;*      WelsSampleSatd16x16_sse2
+;*
+;*      WelsSampleSad16x8_sse2
+;*      WelsSampleSad16x16_sse2
+;*
+;*  History
+;*      8/5/2009 Created
+;*     24/9/2009 modified
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Data
+;***********************************************************************
+SECTION .rodata align=16
+
+align 16
+HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
+align 16
+HSumSubDW1:   dw 1,-1,1,-1,1,-1,1,-1
+align 16
+PDW1:  dw 1,1,1,1,1,1,1,1
+align 16
+PDQ2:  dw 2,0,0,0,2,0,0,0
+align 16
+HSwapSumSubDB1:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 BEGIN
+;
+;***********************************************************************
+%macro MMX_DW_1_2REG 2
+      pxor %1, %1
+      pcmpeqw %2, %2
+      psubw %1, %2
+%endmacro
+
+%macro  SSE2_SumWHorizon1 2
+	movdqa      %2, %1
+	psrldq      %2, 8
+	paddusw     %1, %2
+	movdqa      %2, %1
+	psrldq      %2, 4
+	paddusw     %1, %2
+	movdqa      %2, %1
+	psrldq      %2, 2
+	paddusw     %1, %2
+%endmacro
+
+%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
+   SSE2_SumSub %1, %2, %5
+   SSE2_SumSub %3, %4, %5
+   SSE2_SumSub %2, %4, %5
+   SSE2_SumSub %1, %3, %5
+%endmacro
+
+%macro SSE2_SumAbs4 7
+	WELS_AbsW %1, %3
+	WELS_AbsW %2, %3
+	WELS_AbsW %4, %6
+	WELS_AbsW %5, %6
+	paddusw       %1, %2
+	paddusw       %4, %5
+	paddusw       %7, %1
+	paddusw       %7, %4
+%endmacro
+
+%macro  SSE2_SumWHorizon 3
+	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
+	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
+	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
+	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
+	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
+%endmacro
+
+%macro SSE2_GetSatd8x8 0
+	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+	lea                 r0, [r0+2*r1]
+	lea                 r2, [r2+2*r3]
+	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
+	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+
+	lea					r0,    [r0+2*r1]
+    lea					r2,    [r2+2*r3]
+	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+	lea                 r0, [r0+2*r1]
+	lea                 r2, [r2+2*r3]
+	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
+	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+%endmacro
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse2
+align 16
+WelsSampleSatd4x4_sse2:
+	;push      ebx
+	;mov       eax,  [esp+8]
+	;mov       ebx,  [esp+12]
+	;mov       ecx,  [esp+16]
+	;mov       edx,  [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+    movd      xmm0, [r0]
+    movd      xmm1, [r0+r1]
+    lea       r0 , [r0+2*r1]
+    movd      xmm2, [r0]
+    movd      xmm3, [r0+r1]
+    punpckldq xmm0, xmm2
+    punpckldq xmm1, xmm3
+
+    movd      xmm4, [r2]
+    movd      xmm5, [r2+r3]
+    lea       r2 , [r2+2*r3]
+    movd      xmm6, [r2]
+    movd      xmm7, [r2+r3]
+    punpckldq xmm4, xmm6
+    punpckldq xmm5, xmm7
+
+    pxor      xmm6, xmm6
+    punpcklbw xmm0, xmm6
+    punpcklbw xmm1, xmm6
+    punpcklbw xmm4, xmm6
+    punpcklbw xmm5, xmm6
+
+    psubw     xmm0, xmm4
+    psubw     xmm1, xmm5
+
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+    SSE2_XSawp qdq, xmm0, xmm2, xmm3
+
+    movdqa     xmm4, xmm0
+    paddw      xmm0, xmm3
+    psubw      xmm4, xmm3
+
+    movdqa         xmm2, xmm0
+    punpcklwd      xmm0, xmm4
+    punpckhwd      xmm4, xmm2
+
+	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
+	SSE2_XSawp     qdq, xmm0, xmm3, xmm5
+
+    movdqa         xmm7, xmm0
+    paddw          xmm0, xmm5
+    psubw          xmm7, xmm5
+
+	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
+
+    movdqa         xmm2, xmm0
+    paddw          xmm0, xmm1
+    psubw          xmm2, xmm1
+
+    WELS_AbsW  xmm0, xmm3
+    paddusw        xmm6, xmm0
+	WELS_AbsW  xmm2, xmm4
+    paddusw        xmm6, xmm2
+    SSE2_SumWHorizon1  xmm6, xmm4
+	movd           retrd,  xmm6
+    and            retrd,  0xffff
+    shr            retrd,  1
+	LOAD_4_PARA_POP
+	ret
+
+ ;***********************************************************************
+ ;
+ ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
+ ;
+ ;***********************************************************************
+ WELS_EXTERN WelsSampleSatd8x8_sse2
+align 16
+ WelsSampleSatd8x8_sse2:
+	 ;push   ebx
+	 ;mov    eax,    [esp+8]
+	 ;mov    ebx,    [esp+12]
+	 ;mov    ecx,    [esp+16]
+	 ;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm6,   xmm6
+    pxor   xmm7,   xmm7
+    SSE2_GetSatd8x8
+    psrlw   xmm6,  1
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	movd    retrd,   xmm6
+	LOAD_4_PARA_POP
+	ret
+
+ ;***********************************************************************
+ ;
+ ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
+ ;
+ ;***********************************************************************
+ WELS_EXTERN WelsSampleSatd8x16_sse2
+align 16
+ WelsSampleSatd8x16_sse2:
+	 ;push   ebx
+	 ;mov    eax,    [esp+8]
+	 ;mov    ebx,    [esp+12]
+	 ;mov    ecx,    [esp+16]
+	 ;mov    edx,    [esp+20]
+
+	 %assign  push_num 0
+	 LOAD_4_PARA
+	 SIGN_EXTENTION r1, r1d
+	 SIGN_EXTENTION r3, r3d
+	 pxor   xmm6,   xmm6
+     pxor   xmm7,   xmm7
+
+	 SSE2_GetSatd8x8
+     lea    r0,    [r0+2*r1]
+     lea    r2,    [r2+2*r3]
+	 SSE2_GetSatd8x8
+
+	 psrlw   xmm6,  1
+	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	 movd    retrd,   xmm6
+	 LOAD_4_PARA_POP
+	 ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse2
+align 16
+WelsSampleSatd16x8_sse2:
+	;push   ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	push r0
+	push r2
+	pxor   xmm6,   xmm6
+    pxor   xmm7,   xmm7
+
+	SSE2_GetSatd8x8
+
+	pop r2
+	pop r0
+	;mov    eax,    [esp+8]
+    ;mov    ecx,    [esp+16]
+    add    r0,    8
+    add    r2,    8
+	SSE2_GetSatd8x8
+
+	psrlw   xmm6,  1
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	movd    retrd,   xmm6
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x16_sse2
+align 16
+WelsSampleSatd16x16_sse2:
+	;push   ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	push r0
+	push r2
+	pxor   xmm6,   xmm6
+    pxor   xmm7,   xmm7
+
+	SSE2_GetSatd8x8
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	SSE2_GetSatd8x8
+
+	pop r2
+	pop r0
+	;mov    eax,    [esp+8]
+	;mov    ecx,    [esp+16]
+	add    r0,    8
+	add    r2,    8
+
+	SSE2_GetSatd8x8
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	SSE2_GetSatd8x8
+
+ ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
+    psrlw   xmm6,  1
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	movd    retrd,   xmm6
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 BEGIN
+;
+;***********************************************************************
+
+%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
+	pmaddubsw    %1, xmm5
+	movdqa       %2, %1
+	pmaddwd      %1, xmm7
+	pmaddwd      %2, xmm6
+	movdqa       %3, %1
+	punpckldq    %1, %2
+	punpckhdq    %2, %3
+	movdqa       %3, %1
+	punpcklqdq   %1, %2
+	punpckhqdq   %3, %2
+	paddd        xmm4, %1 ;for dc
+	paddd        xmm4, %3 ;for dc
+	packssdw     %1, %3
+	psllw        %1, 2
+%endmacro
+%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
+	pmaddubsw    %1, xmm5
+	movdqa       %2, %1
+	pmaddwd      %1, xmm7
+	pmaddwd      %2, xmm6
+	movdqa       %3, %1
+	punpckldq    %1, %2
+	punpckhdq    %2, %3
+	movdqa       %3, %1
+	punpcklqdq   %1, %2
+	punpckhqdq   %3, %2
+;    paddd        xmm4, %1 ;for dc
+;	 paddd        xmm4, %3 ;for dc
+	movdqa       %4, %1
+	punpcklqdq   %4, %3
+	packssdw     %1, %3
+	psllw        %1, 2
+%endmacro
+
+%macro SSE41_GetX38x4SatdDec 0
+	pxor        xmm7,   xmm7
+	movq        xmm0,   [eax]
+	movq        xmm1,   [eax+ebx]
+	lea         eax,    [eax+2*ebx]
+	movq        xmm2,   [eax]
+	movq        xmm3,   [eax+ebx]
+	lea         eax,    [eax+2*ebx]
+	punpcklbw   xmm0,   xmm7
+	punpcklbw   xmm1,   xmm7
+	punpcklbw   xmm2,   xmm7
+	punpcklbw   xmm3,   xmm7
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
+	;doesn't need another transpose
+%endmacro
+%macro SSE41_GetX38x4SatdV 2
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2],   0
+	pinsrw      xmm0,   word[esi+%2+8], 4
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2+2],  0
+	pinsrw      xmm0,   word[esi+%2+10], 4
+	psubsw      xmm0,   xmm1
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2+4],  0
+	pinsrw      xmm0,   word[esi+%2+12], 4
+	psubsw      xmm0,   xmm3
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2+6],  0
+	pinsrw      xmm0,   word[esi+%2+14], 4
+	psubsw      xmm0,   xmm2
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+%endmacro
+%macro SSE41_GetX38x4SatdH  3
+	movq        xmm0,   [esi+%3+8*%1]
+	punpcklqdq  xmm0,   xmm0
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm5,   xmm0
+	pabsw       xmm1,   xmm1
+	pabsw       xmm2,   xmm2
+	pabsw       xmm3,   xmm3
+	paddw       xmm2,   xmm1;for DC
+	paddw       xmm2,   xmm3;for DC
+	paddw       xmm5,   xmm2
+%endmacro
+%macro SSE41_I16X16GetX38x4SatdDC 0
+	pxor        xmm0,   xmm0
+	movq2dq     xmm0,   mm4
+	punpcklqdq  xmm0,   xmm0
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm6,   xmm0
+	paddw       xmm6,   xmm2
+%endmacro
+%macro SSE41_ChromaGetX38x4SatdDC 1
+	shl         %1,     4
+	movdqa      xmm0,   [esi+32+%1]
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm6,   xmm0
+	paddw       xmm6,   xmm2
+%endmacro
+%macro SSE41_I16x16GetX38x4Satd 2
+	SSE41_GetX38x4SatdDec
+	SSE41_GetX38x4SatdV   %1, %2
+	SSE41_GetX38x4SatdH   %1, %2, 32
+	SSE41_I16X16GetX38x4SatdDC
+%endmacro
+%macro SSE41_ChromaGetX38x4Satd 2
+	SSE41_GetX38x4SatdDec
+	SSE41_GetX38x4SatdV   %1, %2
+	SSE41_GetX38x4SatdH   %1, %2, 16
+	SSE41_ChromaGetX38x4SatdDC %1
+%endmacro
+%macro SSE41_HSum8W 3
+	pmaddwd     %1, %2
+	movhlps     %3, %1
+	paddd       %1, %3
+	pshuflw     %3, %1,0Eh
+	paddd       %1, %3
+%endmacro
+
+
+%ifdef X86_32
+WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
+WelsIntra16x16Combined3Satd_sse41:
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]
+	mov    eax,    [esp+24]
+	mov    ebx,    [esp+28]
+	mov    esi,    [esp+40] ;temp_satd
+	pxor        xmm4,   xmm4
+	movdqa      xmm5,   [HSumSubDB1]
+	movdqa      xmm6,   [HSumSubDW1]
+	movdqa      xmm7,   [PDW1]
+	sub         ecx,    edx
+	movdqu 		xmm0,   [ecx]
+	movhlps		xmm1,   xmm0
+	punpcklqdq  xmm0,   xmm0
+	punpcklqdq  xmm1,   xmm1
+	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+	movdqa      [esi],  xmm0 ;V
+	movdqa      [esi+16], xmm1
+	add         ecx,    edx
+	pinsrb      xmm0,   byte[ecx-1], 0
+	pinsrb      xmm0,   byte[ecx+edx-1], 1
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     2
+	pinsrb      xmm0,   byte[ecx+edx-1], 3
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     4
+	pinsrb      xmm0,   byte[ecx+edx-1], 5
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     6
+	pinsrb      xmm0,   byte[ecx+edx-1], 7
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     8
+	pinsrb      xmm0,   byte[ecx+edx-1], 9
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     10
+	pinsrb      xmm0,   byte[ecx+edx-1], 11
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     12
+	pinsrb      xmm0,   byte[ecx+edx-1], 13
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     14
+	pinsrb      xmm0,   byte[ecx+edx-1], 15
+	movhlps		xmm1,   xmm0
+	punpcklqdq  xmm0,   xmm0
+	punpcklqdq  xmm1,   xmm1
+	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+	movdqa      [esi+32], xmm0 ;H
+	movdqa      [esi+48], xmm1
+	movd        ecx,    xmm4 ;dc
+	add         ecx,    16   ;(sum+16)
+	shr         ecx,    5    ;((sum+16)>>5)
+	shl         ecx,    4    ;
+	movd        mm4,    ecx  ; mm4 copy DC
+	pxor        xmm4,   xmm4 ;V
+	pxor        xmm5,   xmm5 ;H
+	pxor        xmm6,   xmm6 ;DC
+	mov         ecx,    0
+	mov         edi,    0
+.loop16x16_get_satd:
+.loopStart1:
+	SSE41_I16x16GetX38x4Satd ecx, edi
+	inc          ecx
+	cmp         ecx, 4
+	jl          .loopStart1
+	cmp         edi, 16
+	je          .loop16x16_get_satd_end
+	mov         eax, [esp+24]
+	add         eax, 8
+	mov         ecx, 0
+	add         edi, 16
+	jmp         .loop16x16_get_satd
+ .loop16x16_get_satd_end:
+	MMX_DW_1_2REG    xmm0, xmm1
+	psrlw       xmm4, 1 ;/2
+	psrlw       xmm5, 1 ;/2
+	psrlw       xmm6, 1 ;/2
+	SSE41_HSum8W     xmm4, xmm0, xmm1
+	SSE41_HSum8W     xmm5, xmm0, xmm1
+	SSE41_HSum8W     xmm6, xmm0, xmm1
+
+	; comparing order: DC H V
+	movd      ebx, xmm6 ;DC
+	movd      edi, xmm5 ;H
+	movd      ecx, xmm4 ;V
+	mov      edx, [esp+36]
+	shl       edx, 1
+	add       edi, edx
+	add       ebx, edx
+	mov       edx, [esp+32]
+	cmp       ebx, edi
+	jge near   not_dc_16x16
+	cmp        ebx, ecx
+	jge near   not_dc_h_16x16
+
+	; for DC mode
+	mov       dword[edx], 2;I16_PRED_DC
+	mov       eax, ebx
+	jmp near return_satd_intra_16x16_x3
+not_dc_16x16:
+	; for H mode
+	cmp       edi, ecx
+	jge near   not_dc_h_16x16
+	mov       dword[edx], 1;I16_PRED_H
+	mov       eax, edi
+	jmp near return_satd_intra_16x16_x3
+not_dc_h_16x16:
+	; for V mode
+	mov       dword[edx], 0;I16_PRED_V
+	mov       eax, ecx
+return_satd_intra_16x16_x3:
+	WELSEMMS
+	pop         edi
+	pop         esi
+	pop         ebx
+ret
+
+%macro SSE41_ChromaGetX38x8Satd 0
+	movdqa      xmm5,   [HSumSubDB1]
+	movdqa      xmm6,   [HSumSubDW1]
+	movdqa      xmm7,   [PDW1]
+	sub         ecx,    edx
+	movq 		xmm0,   [ecx]
+	punpcklqdq  xmm0,   xmm0
+	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+	movdqa      [esi],  xmm0 ;V
+	add         ecx,    edx
+	pinsrb      xmm0,   byte[ecx-1], 0
+	pinsrb      xmm0,   byte[ecx+edx-1], 1
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     2
+	pinsrb      xmm0,   byte[ecx+edx-1], 3
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     4
+	pinsrb      xmm0,   byte[ecx+edx-1], 5
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     6
+	pinsrb      xmm0,   byte[ecx+edx-1], 7
+	punpcklqdq  xmm0,   xmm0
+	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+	movdqa      [esi+16], xmm0 ;H
+;(sum+2)>>2
+	movdqa      xmm6,   [PDQ2]
+	movdqa      xmm5,   xmm4
+	punpckhqdq  xmm5,   xmm1
+	paddd       xmm5,   xmm6
+	psrld       xmm5,   2
+;(sum1+sum2+4)>>3
+	paddd       xmm6,   xmm6
+	paddd       xmm4,   xmm1
+	paddd       xmm4,   xmm6
+	psrld       xmm4,   3
+;satd *16
+	pslld       xmm5,   4
+	pslld       xmm4,   4
+;temp satd
+	movdqa      xmm6,   xmm4
+	punpcklqdq  xmm4,   xmm5
+	psllq       xmm4,   32
+	psrlq       xmm4,   32
+	movdqa      [esi+32], xmm4
+	punpckhqdq  xmm5,   xmm6
+	psllq       xmm5,   32
+	psrlq       xmm5,   32
+	movdqa      [esi+48], xmm5
+
+	pxor        xmm4,   xmm4 ;V
+	pxor        xmm5,   xmm5 ;H
+	pxor        xmm6,   xmm6 ;DC
+	mov         ecx,    0
+loop_chroma_satdx3_cb_cr:
+	SSE41_ChromaGetX38x4Satd ecx, 0
+	inc             ecx
+	cmp             ecx, 2
+	jl              loop_chroma_satdx3_cb_cr
+%endmacro
+
+%macro SSEReg2MMX 3
+	movdq2q     %2, %1
+	movhlps     %1, %1
+	movdq2q     %3, %1
+%endmacro
+%macro MMXReg2SSE 4
+	movq2dq     %1, %3
+	movq2dq     %2, %4
+	punpcklqdq  %1, %2
+%endmacro
+;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
+
+WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
+WelsIntraChroma8x8Combined3Satd_sse41:
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]
+	mov    eax,    [esp+24]
+	mov    ebx,    [esp+28]
+	mov    esi,    [esp+40] ;temp_satd
+	xor    edi,    edi
+loop_chroma_satdx3:
+	SSE41_ChromaGetX38x8Satd
+	cmp             edi, 1
+	je              loop_chroma_satdx3end
+	inc             edi
+	SSEReg2MMX  xmm4, mm0,mm1
+	SSEReg2MMX  xmm5, mm2,mm3
+	SSEReg2MMX  xmm6, mm5,mm6
+	mov         ecx,  [esp+44]
+	mov         eax,  [esp+48]
+	jmp         loop_chroma_satdx3
+loop_chroma_satdx3end:
+	MMXReg2SSE  xmm0, xmm3, mm0, mm1
+	MMXReg2SSE  xmm1, xmm3, mm2, mm3
+	MMXReg2SSE  xmm2, xmm3, mm5, mm6
+
+	paddw       xmm4, xmm0
+	paddw       xmm5, xmm1
+	paddw       xmm6, xmm2
+
+	MMX_DW_1_2REG    xmm0, xmm1
+	psrlw       xmm4, 1 ;/2
+	psrlw       xmm5, 1 ;/2
+	psrlw       xmm6, 1 ;/2
+	SSE41_HSum8W     xmm4, xmm0, xmm1
+	SSE41_HSum8W     xmm5, xmm0, xmm1
+	SSE41_HSum8W     xmm6, xmm0, xmm1
+	; comparing order: DC H V
+	movd      ebx, xmm6 ;DC
+	movd      edi, xmm5 ;H
+	movd      ecx, xmm4 ;V
+	mov       edx, [esp+36]
+	shl       edx, 1
+	add       edi, edx
+	add       ecx, edx
+	mov       edx, [esp+32]
+	cmp       ebx, edi
+	jge near   not_dc_8x8
+	cmp        ebx, ecx
+	jge near   not_dc_h_8x8
+
+	; for DC mode
+	mov       dword[edx], 0;I8_PRED_DC
+	mov       eax, ebx
+	jmp near return_satd_intra_8x8_x3
+not_dc_8x8:
+	; for H mode
+	cmp       edi, ecx
+	jge near   not_dc_h_8x8
+	mov       dword[edx], 1;I8_PRED_H
+	mov       eax, edi
+	jmp near return_satd_intra_8x8_x3
+not_dc_h_8x8:
+	; for V mode
+	mov       dword[edx], 2;I8_PRED_V
+	mov       eax, ecx
+return_satd_intra_8x8_x3:
+	WELSEMMS
+	pop         edi
+	pop         esi
+	pop         ebx
+ret
+
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 END
+;
+;***********************************************************************
+%macro SSSE3_Get16BSadHVDC 2
+  movd        xmm6,%1
+  pshufb      xmm6,xmm1
+  movdqa      %1,  xmm6
+  movdqa      xmm0,%2
+  psadbw      xmm0,xmm7
+  paddw       xmm4,xmm0
+  movdqa      xmm0,%2
+  psadbw      xmm0,xmm5
+  paddw       xmm2,xmm0
+  psadbw      xmm6,%2
+  paddw       xmm3,xmm6
+%endmacro
+%macro WelsAddDCValue 4
+    movzx   %2, byte %1
+    mov    %3, %2
+    add     %4, %2
+%endmacro
+
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 BEGIN
+;
+;***********************************************************************
+WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
+WelsIntra16x16Combined3Sad_ssse3:
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]
+	mov    edi,    [esp+40] ;temp_sad
+	sub    ecx,    edx
+    movdqa      xmm5,[ecx]
+    pxor        xmm0,xmm0
+    psadbw      xmm0,xmm5
+    movhlps     xmm1,xmm0
+    paddw       xmm0,xmm1
+    movd        eax,xmm0
+
+    add         ecx,edx
+    lea         ebx, [edx+2*edx]
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    sub        edi, 192
+    add         eax,10h
+    shr         eax,5
+    movd        xmm7,eax
+    pxor        xmm1,xmm1
+    pshufb      xmm7,xmm1
+    pxor        xmm4,xmm4
+    pxor        xmm3,xmm3
+    pxor        xmm2,xmm2
+;sad begin
+	mov    eax,    [esp+24]
+	mov    ebx,    [esp+28]
+    lea         esi, [ebx+2*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+
+    pslldq      xmm3,4
+    por         xmm3,xmm2
+    movhlps     xmm1,xmm3
+    paddw       xmm3,xmm1
+    movhlps     xmm0,xmm4
+    paddw       xmm4,xmm0
+; comparing order: DC H V
+	movd        ebx, xmm4 ;DC
+	movd        ecx, xmm3 ;V
+	psrldq      xmm3, 4
+	movd        esi, xmm3 ;H
+	mov         eax, [esp+36] ;lamda
+	shl         eax, 1
+	add         esi, eax
+	add         ebx, eax
+	mov         edx, [esp+32]
+	cmp         ebx, esi
+	jge near   not_dc_16x16_sad
+	cmp        ebx, ecx
+	jge near   not_dc_h_16x16_sad
+	; for DC mode
+	mov       dword[edx], 2;I16_PRED_DC
+	mov       eax, ebx
+    sub        edi, 192
+%assign x 0
+%rep 16
+    movdqa    [edi+16*x], xmm7
+%assign x x+1
+%endrep
+	jmp near return_sad_intra_16x16_x3
+not_dc_16x16_sad:
+	; for H mode
+	cmp       esi, ecx
+	jge near   not_dc_h_16x16_sad
+	mov       dword[edx], 1;I16_PRED_H
+	mov       eax, esi
+	jmp near return_sad_intra_16x16_x3
+not_dc_h_16x16_sad:
+	; for V mode
+	mov       dword[edx], 0;I16_PRED_V
+	mov       eax, ecx
+    sub       edi, 192
+%assign x 0
+%rep 16
+    movdqa    [edi+16*x], xmm5
+%assign x x+1
+%endrep
+return_sad_intra_16x16_x3:
+	pop    edi
+	pop    esi
+	pop    ebx
+	ret
+%endif
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 END
+;
+;***********************************************************************
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 BEGIN
+;
+;***********************************************************************
+
+;SSE4.1
+%macro SSE41_GetSatd8x4 0
+	movq             xmm0, [r0]
+	punpcklqdq       xmm0, xmm0
+	pmaddubsw        xmm0, xmm7
+	movq             xmm1, [r0+r1]
+	punpcklqdq       xmm1, xmm1
+	pmaddubsw        xmm1, xmm7
+	movq             xmm2, [r2]
+	punpcklqdq       xmm2, xmm2
+	pmaddubsw        xmm2, xmm7
+	movq             xmm3, [r2+r3]
+	punpcklqdq       xmm3, xmm3
+	pmaddubsw        xmm3, xmm7
+	psubsw           xmm0, xmm2
+	psubsw           xmm1, xmm3
+	movq             xmm2, [r0+2*r1]
+	punpcklqdq       xmm2, xmm2
+	pmaddubsw        xmm2, xmm7
+	movq             xmm3, [r0+r4]
+	punpcklqdq       xmm3, xmm3
+	pmaddubsw        xmm3, xmm7
+	movq             xmm4, [r2+2*r3]
+	punpcklqdq       xmm4, xmm4
+	pmaddubsw        xmm4, xmm7
+	movq             xmm5, [r2+r5]
+	punpcklqdq       xmm5, xmm5
+	pmaddubsw        xmm5, xmm7
+	psubsw           xmm2, xmm4
+	psubsw           xmm3, xmm5
+	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
+	pabsw            xmm0, xmm0
+	pabsw            xmm2, xmm2
+	pabsw            xmm1, xmm1
+	pabsw            xmm3, xmm3
+	movdqa           xmm4, xmm3
+	pblendw          xmm3, xmm1, 0xAA
+	pslld            xmm1, 16
+	psrld            xmm4, 16
+	por              xmm1, xmm4
+	pmaxuw           xmm1, xmm3
+	paddw            xmm6, xmm1
+	movdqa           xmm4, xmm0
+	pblendw          xmm0, xmm2, 0xAA
+	pslld            xmm2, 16
+	psrld            xmm4, 16
+	por              xmm2, xmm4
+	pmaxuw           xmm0, xmm2
+	paddw            xmm6, xmm0
+%endmacro
+
+%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
+	MMX_DW_1_2REG    %3, %4
+	pmaddwd     %2, %3
+	movhlps     %4, %2
+	paddd       %2, %4
+	pshuflw     %4, %2,0Eh
+	paddd       %2, %4
+	movd		%1, %2
+%endmacro
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse41
+WelsSampleSatd4x4_sse41:
+	;push        ebx
+	;mov         eax,[esp+8]
+	;mov         ebx,[esp+12]
+	;mov         ecx,[esp+16]
+	;mov         edx,[esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movdqa      xmm4,[HSwapSumSubDB1]
+	movd        xmm2,[r2]
+	movd        xmm5,[r2+r3]
+	shufps      xmm2,xmm5,0
+	movd        xmm3,[r2+r3*2]
+	lea         r2, [r3*2+r2]
+	movd        xmm5,[r2+r3]
+	shufps      xmm3,xmm5,0
+	movd        xmm0,[r0]
+	movd        xmm5,[r0+r1]
+	shufps      xmm0,xmm5,0
+	movd        xmm1,[r0+r1*2]
+	lea         r0, [r1*2+r0]
+	movd        xmm5,[r0+r1]
+	shufps      xmm1,xmm5,0
+	pmaddubsw   xmm0,xmm4
+	pmaddubsw   xmm1,xmm4
+	pmaddubsw   xmm2,xmm4
+	pmaddubsw   xmm3,xmm4
+	psubw       xmm0,xmm2
+	psubw       xmm1,xmm3
+	movdqa      xmm2,xmm0
+	paddw       xmm0,xmm1
+	psubw       xmm1,xmm2
+	movdqa      xmm2,xmm0
+	punpcklqdq  xmm0,xmm1
+	punpckhqdq  xmm2,xmm1
+	movdqa      xmm1,xmm0
+	paddw       xmm0,xmm2
+	psubw       xmm2,xmm1
+	movdqa      xmm1,xmm0
+	pblendw     xmm0,xmm2,0AAh
+	pslld       xmm2,16
+	psrld       xmm1,16
+	por         xmm2,xmm1
+	pabsw       xmm0,xmm0
+	pabsw       xmm2,xmm2
+	pmaxsw      xmm0,xmm2
+	SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x8_sse41
+align 16
+WelsSampleSatd8x8_sse41:
+	;push   ebx
+	;push   esi
+	;push   edi
+	;mov    eax,    [esp+16]
+	;mov    ebx,    [esp+20]
+	;mov    ecx,    [esp+24]
+	;mov    edx,    [esp+28]
+%ifdef X86_32
+	push  r4
+	push  r5
+%endif
+	%assign  push_num 2
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movdqa      xmm7, [HSumSubDB1]
+	lea         r4,  [r1+r1*2]
+	lea         r5,  [r3+r3*2]
+	pxor		xmm6, xmm6
+	SSE41_GetSatd8x4
+	lea			r0,	 [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	SSE41_GetSatd8x4
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x16_sse41
+align 16
+WelsSampleSatd8x16_sse41:
+	;push   ebx
+	;push   esi
+	;push   edi
+	;push   ebp
+	;%define pushsize   16
+	;mov    eax,    [esp+pushsize+4]
+	;mov    ebx,    [esp+pushsize+8]
+	;mov    ecx,    [esp+pushsize+12]
+	;mov    edx,    [esp+pushsize+16]
+%ifdef X86_32
+	push  r4
+	push  r5
+	push  r6
+%endif
+	%assign  push_num 3
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movdqa      xmm7, [HSumSubDB1]
+	lea         r4,  [r1+r1*2]
+	lea         r5,  [r3+r3*2]
+	pxor        xmm6, xmm6
+	mov         r6,    0
+loop_get_satd_8x16:
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	inc         r6
+	cmp         r6,  4
+	jl          loop_get_satd_8x16
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r6
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse41
+align 16
+WelsSampleSatd16x8_sse41:
+	;push   ebx
+	;push   esi
+	;push   edi
+	;mov    eax,    [esp+16]
+	;mov    ebx,    [esp+20]
+	;mov    ecx,    [esp+24]
+	;mov    edx,    [esp+28]
+%ifdef X86_32
+	push  r4
+	push  r5
+%endif
+	%assign  push_num 2
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	push  r0
+	push  r2
+
+	movdqa      xmm7, [HSumSubDB1]
+	lea         r4,  [r1+r1*2]
+	lea         r5,  [r3+r3*2]
+	pxor		xmm6,   xmm6
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	SSE41_GetSatd8x4
+
+	pop  r2
+	pop  r0
+	;mov			eax,    [esp+16]
+	;mov			ecx,    [esp+24]
+	add			r0,    8
+	add			r2,    8
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	SSE41_GetSatd8x4
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSatd16x16_sse41
+align 16
+WelsSampleSatd16x16_sse41:
+	;push   ebx
+	;push   esi
+	;push   edi
+	;push   ebp
+	;%define pushsize   16
+	;mov    eax,    [esp+pushsize+4]
+	;mov    ebx,    [esp+pushsize+8]
+	;mov    ecx,    [esp+pushsize+12]
+	;mov    edx,    [esp+pushsize+16]
+%ifdef X86_32
+	push  r4
+	push  r5
+	push  r6
+%endif
+	%assign  push_num 3
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+
+	push  r0
+	push  r2
+
+	movdqa      xmm7, [HSumSubDB1]
+	lea         r4,  [r1+r1*2]
+	lea         r5,  [r3+r3*2]
+	pxor		xmm6,   xmm6
+	mov         r6,    0
+loop_get_satd_16x16_left:
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	inc         r6
+	cmp         r6,  4
+	jl          loop_get_satd_16x16_left
+
+	pop  r2
+	pop  r0
+	;mov			eax,    [esp+pushsize+4]
+	;mov			ecx,    [esp+pushsize+12]
+	add			r0,    8
+	add			r2,    8
+	mov         r6,    0
+loop_get_satd_16x16_right:
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	inc         r6
+	cmp         r6,  4
+	jl          loop_get_satd_16x16_right
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	;%undef pushsize
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r6
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+%macro SSE2_GetSad2x16 0
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqu xmm1,   [r2]
+	MOVDQ  xmm2,   [r0];[eax] must aligned 16
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
+	movdqu xmm1,   [r2+r3]
+	MOVDQ  xmm2,   [r0+r1]
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
+%endmacro
+
+
+%macro SSE2_GetSad4x16 0
+	movdqu xmm0,   [r2]
+	MOVDQ  xmm2,   [r0]
+	psadbw xmm0,   xmm2
+	paddw  xmm7,   xmm0
+	movdqu xmm1,   [r2+r3]
+	MOVDQ  xmm2,   [r0+r1]
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
+	movdqu xmm1,   [r2+2*r3]
+	MOVDQ  xmm2,   [r0+2*r1];[eax] must aligned 16
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
+	movdqu xmm1,   [r2+r5]
+	MOVDQ  xmm2,   [r0+r4]
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
+%endmacro
+
+
+%macro SSE2_GetSad8x4 0
+	movq   xmm0,   [r0]
+	movq   xmm1,   [r0+r1]
+	lea    r0,     [r0+2*r1]
+	movhps xmm0,   [r0]
+	movhps xmm1,   [r0+r1]
+
+	movq   xmm2,   [r2]
+	movq   xmm3,   [r2+r3]
+	lea    r2,     [r2+2*r3]
+	movhps xmm2,   [r2]
+	movhps xmm3,   [r2+r3]
+	psadbw xmm0,   xmm2
+	psadbw xmm1,   xmm3
+	paddw  xmm6,   xmm0
+	paddw  xmm6,   xmm1
+%endmacro
+
+;***********************************************************************
+;
+;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
+;First parameter can align to 16 bytes,
+;In wels, the third parameter can't align to 16 bytes.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x16_sse2
+align 16
+WelsSampleSad16x16_sse2:
+	;push ebx
+	;push edi
+	;push esi
+	;%define _STACK_SIZE		12
+	;mov eax, [esp+_STACK_SIZE+4 ]
+	;mov	ebx, [esp+_STACK_SIZE+8 ]
+	;mov ecx, [esp+_STACK_SIZE+12]
+	;mov edx, [esp+_STACK_SIZE+16]
+%ifdef X86_32
+	push  r4
+	push  r5
+%endif
+
+	%assign  push_num 2
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	lea r4, [3*r1]
+	lea r5, [3*r3]
+
+	pxor   xmm7,   xmm7
+	SSE2_GetSad4x16
+	lea	   r0,  [r0+4*r1]
+	lea	   r2,  [r2+4*r3]
+	SSE2_GetSad4x16
+	lea	   r0,  [r0+4*r1]
+	lea	   r2,  [r2+4*r3]
+	SSE2_GetSad4x16
+	lea	   r0,  [r0+4*r1]
+	lea	   r2,  [r2+4*r3]
+	SSE2_GetSad4x16
+	movhlps xmm0, xmm7
+	paddw xmm0, xmm7
+	movd retrd, xmm0
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
+;First parameter can align to 16 bytes,
+;In wels, the third parameter can't align to 16 bytes.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x8_sse2
+align 16
+WelsSampleSad16x8_sse2:
+	;push   ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movdqu xmm0,   [r2]
+	MOVDQ  xmm2,   [r0]
+	psadbw xmm0,   xmm2
+	movdqu xmm1,   [r2+r3]
+	MOVDQ  xmm2,   [r0+r1]
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
+
+	SSE2_GetSad2x16
+	SSE2_GetSad2x16
+	SSE2_GetSad2x16
+
+	movhlps     xmm1, xmm0
+	paddw       xmm0, xmm1
+	movd        retrd,  xmm0
+	LOAD_4_PARA_POP
+	ret
+
+
+
+WELS_EXTERN WelsSampleSad8x16_sse2
+WelsSampleSad8x16_sse2:
+	;push   ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+    pxor   xmm6,   xmm6
+
+	SSE2_GetSad8x4
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	SSE2_GetSad8x4
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+
+    movhlps    xmm0, xmm6
+	paddw      xmm0, xmm6
+	movd       retrd,  xmm0
+	LOAD_4_PARA_POP
+	ret
+
+
+%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
+and    %1,  0x1f|(%3>>1)
+cmp    %1,  (32-%2)|(%3>>1)
+%endmacro
+
+WELS_EXTERN WelsSampleSad8x8_sse21
+WelsSampleSad8x8_sse21:
+    ;mov    ecx,    [esp+12]
+	;mov    edx,    ecx
+    ;CACHE_SPLIT_CHECK edx, 8, 64
+	;jle    near   .pixel_sad_8x8_nsplit
+	;push   ebx
+	;push   edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+
+	%assign  push_num 0
+	mov		r2,  arg3
+	push	r2
+	CACHE_SPLIT_CHECK r2, 8, 64
+	jle    near   .pixel_sad_8x8_nsplit
+	pop		r2
+%ifdef X86_32
+	push	r3
+	push	r4
+	push	r5
+%endif
+	%assign  push_num 3
+	mov		r0,  arg1
+	mov		r1,  arg2
+	SIGN_EXTENTION r1, r1d
+    pxor   xmm7,   xmm7
+
+    ;ecx r2, edx r4, edi r5
+
+    mov    r5,    r2
+    and    r5,    0x07
+    sub    r2,    r5
+    mov    r4,    8
+    sub    r4,    r5
+
+    shl    r5,    3
+    shl    r4,    3
+    movd   xmm5,   r5d
+    movd   xmm6,   r4d
+	mov    r5,    8
+	add    r5,    r2
+    mov    r3,    arg4
+	SIGN_EXTENTION r3, r3d
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+    movhlps    xmm0, xmm7
+	paddw      xmm0, xmm7
+	movd       retrd,  xmm0
+%ifdef X86_32
+	pop	 r5
+	pop	 r4
+	pop	 r3
+%endif
+	jmp        .return
+
+.pixel_sad_8x8_nsplit:
+    ;push   ebx
+    ;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    edx,    [esp+20]
+
+	pop r2
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm6,   xmm6
+	SSE2_GetSad8x4
+    lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+    movhlps    xmm0, xmm6
+	paddw      xmm0, xmm6
+	movd       retrd,  xmm0
+	LOAD_4_PARA_POP
+.return:
+	ret
+
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 END
+;
+;***********************************************************************
+
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+
+%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
+	psadbw %1,   %4
+	paddw  xmm5, %1
+	psadbw %4,   %3
+	paddw  xmm4, %4
+	movdqu %4,   [%5-1]
+	psadbw %4,   %2
+	paddw  xmm6, %4
+	movdqu %4,   [%5+1]
+	psadbw %4,   %2
+	paddw  xmm7, %4
+%endmacro
+WELS_EXTERN WelsSampleSadFour16x16_sse2
+WelsSampleSadFour16x16_sse2:
+	;push ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movdqa xmm0,   [r0]
+	sub    r2,    r3
+	movdqu xmm3,   [r2]
+	psadbw xmm3,   xmm0
+	paddw  xmm4,   xmm3
+
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm3,   xmm1
+	paddw  xmm4,   xmm3
+
+	movdqu xmm2,   [r2+r3-1]
+	psadbw xmm2,   xmm0
+	paddw  xmm6,   xmm2
+
+	movdqu xmm3,   [r2+r3+1]
+	psadbw xmm3,   xmm0
+	paddw  xmm7,   xmm3
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm1,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+	movdqa xmm2,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm0,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm1,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+	movdqa xmm2,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm0,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r2,    [r2+2*r3]
+	movdqu xmm3,   [r2]
+	psadbw xmm2,   xmm3
+	paddw xmm5,   xmm2
+
+	movdqu xmm2,   [r2-1]
+	psadbw xmm2,   xmm0
+	paddw xmm6,   xmm2
+
+	movdqu xmm3,   [r2+1]
+	psadbw xmm3,   xmm0
+	paddw xmm7,   xmm3
+
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm0,   xmm3
+	paddw xmm5,   xmm0
+
+	;mov        ecx,  [esp+24]
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6
+	movdqa     [r4],xmm4
+	LOAD_5_PARA_POP
+	ret
+
+
+WELS_EXTERN WelsSampleSadFour16x8_sse2
+WelsSampleSadFour16x8_sse2:
+	;push ebx
+	;push edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	;mov    edi,    [esp+20]
+	;mov    edx,    [esp+24]
+
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movdqa xmm0,   [r0]
+	sub    r2,    r3
+	movdqu xmm3,   [r2]
+	psadbw xmm3,   xmm0
+	paddw xmm4,   xmm3
+
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm3,   xmm1
+	paddw xmm4,   xmm3
+
+	movdqu xmm2,   [r2+r3-1]
+	psadbw xmm2,   xmm0
+	paddw xmm6,   xmm2
+
+	movdqu xmm3,   [r2+r3+1]
+	psadbw xmm3,   xmm0
+	paddw xmm7,   xmm3
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm1,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+	movdqa xmm2,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm0,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+	lea    r2,    [r2+2*r3]
+	movdqu xmm3,   [r2]
+	psadbw xmm0,   xmm3
+	paddw xmm5,   xmm0
+
+	movdqu xmm0,   [r2-1]
+	psadbw xmm0,   xmm1
+	paddw xmm6,   xmm0
+
+	movdqu xmm3,   [r2+1]
+	psadbw xmm3,   xmm1
+	paddw xmm7,   xmm3
+
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm1,   xmm3
+	paddw xmm5,   xmm1
+
+	;mov        edi,  [esp+28]
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6
+	movdqa     [r4],xmm4
+	LOAD_5_PARA_POP
+	ret
+
+WELS_EXTERN WelsSampleSadFour8x16_sse2
+WelsSampleSadFour8x16_sse2:
+	;push ebx
+	;push edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	;mov    edi,    [esp+20]
+	;mov    edx,    [esp+24]
+
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+	sub    r2,    r3
+	movq   xmm3,   [r2]
+	movhps xmm3,   [r2+r3]
+	psadbw xmm3,   xmm0
+	paddw  xmm4,   xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	;mov        edi,  [esp+28]
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6
+	movdqa     [r4],xmm4
+	LOAD_5_PARA_POP
+	ret
+
+
+WELS_EXTERN WelsSampleSadFour8x8_sse2
+WelsSampleSadFour8x8_sse2:
+	;push ebx
+	;push edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	;mov    edi,    [esp+20]
+	;mov    edx,    [esp+24]
+
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+	sub    r2,    r3
+	movq   xmm3,   [r2]
+	movhps xmm3,   [r2+r3]
+	psadbw xmm3,   xmm0
+	paddw  xmm4,   xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	;mov        edi,  [esp+28]
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6
+	movdqa     [r4],xmm4
+	LOAD_5_PARA_POP
+	ret
+
+WELS_EXTERN WelsSampleSadFour4x4_sse2
+WelsSampleSadFour4x4_sse2:
+	;push ebx
+	;push edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	;mov    edi,    [esp+20]
+	;mov    edx,    [esp+24]
+
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movd   xmm0,   [r0]
+	movd   xmm1,   [r0+r1]
+	lea        r0,    [r0+2*r1]
+	movd       xmm2,   [r0]
+	movd       xmm3,   [r0+r1]
+	punpckldq  xmm0, xmm1
+	punpckldq  xmm2, xmm3
+	punpcklqdq xmm0, xmm2
+	sub        r2,  r3
+	movd       xmm1, [r2]
+	movd       xmm2, [r2+r3]
+	punpckldq  xmm1, xmm2
+	movd       xmm2, [r2+r3-1]
+	movd       xmm3, [r2+r3+1]
+
+	lea        r2,  [r2+2*r3]
+
+	movd       xmm4, [r2]
+	movd       xmm5, [r2-1]
+	punpckldq  xmm2, xmm5
+	movd       xmm5, [r2+1]
+	punpckldq  xmm3, xmm5
+
+	movd       xmm5, [r2+r3]
+	punpckldq  xmm4, xmm5
+
+	punpcklqdq xmm1, xmm4 ;-L
+
+	movd       xmm5, [r2+r3-1]
+	movd       xmm6, [r2+r3+1]
+
+	lea        r2,  [r2+2*r3]
+	movd       xmm7, [r2-1]
+	punpckldq  xmm5, xmm7
+	punpcklqdq xmm2, xmm5 ;-1
+	movd       xmm7, [r2+1]
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm3, xmm6 ;+1
+	movd       xmm6, [r2]
+	movd       xmm7, [r2+r3]
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6 ;+L
+	psadbw     xmm1, xmm0
+	psadbw     xmm2, xmm0
+	psadbw     xmm3, xmm0
+	psadbw     xmm4, xmm0
+
+	movhlps    xmm0, xmm1
+	paddw      xmm1, xmm0
+	movhlps    xmm0, xmm2
+	paddw      xmm2, xmm0
+	movhlps    xmm0, xmm3
+	paddw      xmm3, xmm0
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	;mov        edi,  [esp+28]
+	punpckldq  xmm1, xmm4
+	punpckldq  xmm2, xmm3
+	punpcklqdq xmm1, xmm2
+	movdqa     [r4],xmm1
+	LOAD_5_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 END
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSad4x4_mmx
+
+align 16
+;***********************************************************************
+;   int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
+;***********************************************************************
+WelsSampleSad4x4_mmx:
+    ;push    ebx
+	;%define pushsize     4
+	;%define pix1address	 esp+pushsize+4
+	;%define pix1stride   esp+pushsize+8
+	;%define pix2address  esp+pushsize+12
+	;%define pix2stride   esp+pushsize+16
+    ;mov		  eax, [pix1address]
+    ;mov		  ebx, [pix1stride ]
+    ;mov		  ecx, [pix2address]
+    ;mov		  edx, [pix2stride ]
+
+    %assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movd	  mm0, [r0]
+	movd	  mm1, [r0+r1]
+	punpckldq mm0, mm1
+
+	movd      mm3, [r2]
+	movd      mm4, [r2+r3]
+	punpckldq mm3, mm4
+	psadbw    mm0, mm3
+
+	lea       r0, [r0+2*r1]
+	lea       r2, [r2+2*r3]
+
+	movd      mm1, [r0]
+	movd      mm2, [r0+r1]
+	punpckldq mm1, mm2
+
+	movd      mm3, [r2]
+	movd      mm4, [r2+r3]
+	punpckldq mm3, mm4
+	psadbw    mm1, mm3
+	paddw     mm0, mm1
+
+    movd      retrd, mm0
+
+	WELSEMMS
+    LOAD_4_PARA_POP
+    ret
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -16,6 +16,7 @@
 	$(COMMON_SRCDIR)/./mb_copy.asm\
 	$(COMMON_SRCDIR)/./mc_chroma.asm\
 	$(COMMON_SRCDIR)/./mc_luma.asm\
+	$(COMMON_SRCDIR)/./satd_sad.asm\
 	$(COMMON_SRCDIR)/./vaa.asm\
 
 COMMON_OBJS += $(COMMON_ASM_SRCS:.asm=.o)
--- a/codec/encoder/core/asm/satd_sad.asm
+++ /dev/null
@@ -1,2344 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  satd_sad.asm
-;*
-;*  Abstract
-;*      WelsSampleSatd4x4_sse2
-;*      WelsSampleSatd8x8_sse2
-;*      WelsSampleSatd16x8_sse2
-;*      WelsSampleSatd8x16_sse2
-;*      WelsSampleSatd16x16_sse2
-;*
-;*      WelsSampleSad16x8_sse2
-;*      WelsSampleSad16x16_sse2
-;*
-;*  History
-;*      8/5/2009 Created
-;*     24/9/2009 modified
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Data
-;***********************************************************************
-SECTION .rodata align=16
-
-align 16
-HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
-align 16
-HSumSubDW1:   dw 1,-1,1,-1,1,-1,1,-1
-align 16
-PDW1:  dw 1,1,1,1,1,1,1,1
-align 16
-PDQ2:  dw 2,0,0,0,2,0,0,0
-align 16
-HSwapSumSubDB1:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 BEGIN
-;
-;***********************************************************************
-%macro MMX_DW_1_2REG 2
-      pxor %1, %1
-      pcmpeqw %2, %2
-      psubw %1, %2
-%endmacro
-
-%macro  SSE2_SumWHorizon1 2
-	movdqa      %2, %1
-	psrldq      %2, 8
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 4
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 2
-	paddusw     %1, %2
-%endmacro
-
-%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
-   SSE2_SumSub %1, %2, %5
-   SSE2_SumSub %3, %4, %5
-   SSE2_SumSub %2, %4, %5
-   SSE2_SumSub %1, %3, %5
-%endmacro
-
-%macro SSE2_SumAbs4 7
-	WELS_AbsW %1, %3
-	WELS_AbsW %2, %3
-	WELS_AbsW %4, %6
-	WELS_AbsW %5, %6
-	paddusw       %1, %2
-	paddusw       %4, %5
-	paddusw       %7, %1
-	paddusw       %7, %4
-%endmacro
-
-%macro  SSE2_SumWHorizon 3
-	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
-	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
-	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
-	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
-	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
-%endmacro
-
-%macro SSE2_GetSatd8x8 0
-	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
-	lea                 r0, [r0+2*r1]
-	lea                 r2, [r2+2*r3]
-	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
-
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-
-	lea					r0,    [r0+2*r1]
-    lea					r2,    [r2+2*r3]
-	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
-	lea                 r0, [r0+2*r1]
-	lea                 r2, [r2+2*r3]
-	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
-
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse2
-align 16
-WelsSampleSatd4x4_sse2:
-	;push      ebx
-	;mov       eax,  [esp+8]
-	;mov       ebx,  [esp+12]
-	;mov       ecx,  [esp+16]
-	;mov       edx,  [esp+20]
-
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-    movd      xmm0, [r0]
-    movd      xmm1, [r0+r1]
-    lea       r0 , [r0+2*r1]
-    movd      xmm2, [r0]
-    movd      xmm3, [r0+r1]
-    punpckldq xmm0, xmm2
-    punpckldq xmm1, xmm3
-
-    movd      xmm4, [r2]
-    movd      xmm5, [r2+r3]
-    lea       r2 , [r2+2*r3]
-    movd      xmm6, [r2]
-    movd      xmm7, [r2+r3]
-    punpckldq xmm4, xmm6
-    punpckldq xmm5, xmm7
-
-    pxor      xmm6, xmm6
-    punpcklbw xmm0, xmm6
-    punpcklbw xmm1, xmm6
-    punpcklbw xmm4, xmm6
-    punpcklbw xmm5, xmm6
-
-    psubw     xmm0, xmm4
-    psubw     xmm1, xmm5
-
-    movdqa    xmm2, xmm0
-    paddw     xmm0, xmm1
-    psubw     xmm2, xmm1
-    SSE2_XSawp qdq, xmm0, xmm2, xmm3
-
-    movdqa     xmm4, xmm0
-    paddw      xmm0, xmm3
-    psubw      xmm4, xmm3
-
-    movdqa         xmm2, xmm0
-    punpcklwd      xmm0, xmm4
-    punpckhwd      xmm4, xmm2
-
-	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
-	SSE2_XSawp     qdq, xmm0, xmm3, xmm5
-
-    movdqa         xmm7, xmm0
-    paddw          xmm0, xmm5
-    psubw          xmm7, xmm5
-
-	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
-
-    movdqa         xmm2, xmm0
-    paddw          xmm0, xmm1
-    psubw          xmm2, xmm1
-
-    WELS_AbsW  xmm0, xmm3
-    paddusw        xmm6, xmm0
-	WELS_AbsW  xmm2, xmm4
-    paddusw        xmm6, xmm2
-    SSE2_SumWHorizon1  xmm6, xmm4
-	movd           retrd,  xmm6
-    and            retrd,  0xffff
-    shr            retrd,  1
-	LOAD_4_PARA_POP
-	ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x8_sse2
-align 16
- WelsSampleSatd8x8_sse2:
-	 ;push   ebx
-	 ;mov    eax,    [esp+8]
-	 ;mov    ebx,    [esp+12]
-	 ;mov    ecx,    [esp+16]
-	 ;mov    edx,    [esp+20]
-
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	pxor   xmm6,   xmm6
-    pxor   xmm7,   xmm7
-    SSE2_GetSatd8x8
-    psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	LOAD_4_PARA_POP
-	ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x16_sse2
-align 16
- WelsSampleSatd8x16_sse2:
-	 ;push   ebx
-	 ;mov    eax,    [esp+8]
-	 ;mov    ebx,    [esp+12]
-	 ;mov    ecx,    [esp+16]
-	 ;mov    edx,    [esp+20]
-
-	 %assign  push_num 0
-	 LOAD_4_PARA
-	 SIGN_EXTENTION r1, r1d
-	 SIGN_EXTENTION r3, r3d
-	 pxor   xmm6,   xmm6
-     pxor   xmm7,   xmm7
-
-	 SSE2_GetSatd8x8
-     lea    r0,    [r0+2*r1]
-     lea    r2,    [r2+2*r3]
-	 SSE2_GetSatd8x8
-
-	 psrlw   xmm6,  1
-	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	 movd    retrd,   xmm6
-	 LOAD_4_PARA_POP
-	 ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse2
-align 16
-WelsSampleSatd16x8_sse2:
-	;push   ebx
-	;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    ecx,    [esp+16]
-	;mov    edx,    [esp+20]
-
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	push r0
-	push r2
-	pxor   xmm6,   xmm6
-    pxor   xmm7,   xmm7
-
-	SSE2_GetSatd8x8
-
-	pop r2
-	pop r0
-	;mov    eax,    [esp+8]
-    ;mov    ecx,    [esp+16]
-    add    r0,    8
-    add    r2,    8
-	SSE2_GetSatd8x8
-
-	psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	LOAD_4_PARA_POP
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x16_sse2
-align 16
-WelsSampleSatd16x16_sse2:
-	;push   ebx
-	;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    ecx,    [esp+16]
-	;mov    edx,    [esp+20]
-
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	push r0
-	push r2
-	pxor   xmm6,   xmm6
-    pxor   xmm7,   xmm7
-
-	SSE2_GetSatd8x8
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSatd8x8
-
-	pop r2
-	pop r0
-	;mov    eax,    [esp+8]
-	;mov    ecx,    [esp+16]
-	add    r0,    8
-	add    r2,    8
-
-	SSE2_GetSatd8x8
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSatd8x8
-
- ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
-    psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	LOAD_4_PARA_POP
-	ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
-	pmaddubsw    %1, xmm5
-	movdqa       %2, %1
-	pmaddwd      %1, xmm7
-	pmaddwd      %2, xmm6
-	movdqa       %3, %1
-	punpckldq    %1, %2
-	punpckhdq    %2, %3
-	movdqa       %3, %1
-	punpcklqdq   %1, %2
-	punpckhqdq   %3, %2
-	paddd        xmm4, %1 ;for dc
-	paddd        xmm4, %3 ;for dc
-	packssdw     %1, %3
-	psllw        %1, 2
-%endmacro
-%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
-	pmaddubsw    %1, xmm5
-	movdqa       %2, %1
-	pmaddwd      %1, xmm7
-	pmaddwd      %2, xmm6
-	movdqa       %3, %1
-	punpckldq    %1, %2
-	punpckhdq    %2, %3
-	movdqa       %3, %1
-	punpcklqdq   %1, %2
-	punpckhqdq   %3, %2
-;    paddd        xmm4, %1 ;for dc
-;	 paddd        xmm4, %3 ;for dc
-	movdqa       %4, %1
-	punpcklqdq   %4, %3
-	packssdw     %1, %3
-	psllw        %1, 2
-%endmacro
-
-%macro SSE41_GetX38x4SatdDec 0
-	pxor        xmm7,   xmm7
-	movq        xmm0,   [eax]
-	movq        xmm1,   [eax+ebx]
-	lea         eax,    [eax+2*ebx]
-	movq        xmm2,   [eax]
-	movq        xmm3,   [eax+ebx]
-	lea         eax,    [eax+2*ebx]
-	punpcklbw   xmm0,   xmm7
-	punpcklbw   xmm1,   xmm7
-	punpcklbw   xmm2,   xmm7
-	punpcklbw   xmm3,   xmm7
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
-	;doesn't need another transpose
-%endmacro
-%macro SSE41_GetX38x4SatdV 2
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2],   0
-	pinsrw      xmm0,   word[esi+%2+8], 4
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2+2],  0
-	pinsrw      xmm0,   word[esi+%2+10], 4
-	psubsw      xmm0,   xmm1
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2+4],  0
-	pinsrw      xmm0,   word[esi+%2+12], 4
-	psubsw      xmm0,   xmm3
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2+6],  0
-	pinsrw      xmm0,   word[esi+%2+14], 4
-	psubsw      xmm0,   xmm2
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-%endmacro
-%macro SSE41_GetX38x4SatdH  3
-	movq        xmm0,   [esi+%3+8*%1]
-	punpcklqdq  xmm0,   xmm0
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm5,   xmm0
-	pabsw       xmm1,   xmm1
-	pabsw       xmm2,   xmm2
-	pabsw       xmm3,   xmm3
-	paddw       xmm2,   xmm1;for DC
-	paddw       xmm2,   xmm3;for DC
-	paddw       xmm5,   xmm2
-%endmacro
-%macro SSE41_I16X16GetX38x4SatdDC 0
-	pxor        xmm0,   xmm0
-	movq2dq     xmm0,   mm4
-	punpcklqdq  xmm0,   xmm0
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2
-%endmacro
-%macro SSE41_ChromaGetX38x4SatdDC 1
-	shl         %1,     4
-	movdqa      xmm0,   [esi+32+%1]
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2
-%endmacro
-%macro SSE41_I16x16GetX38x4Satd 2
-	SSE41_GetX38x4SatdDec
-	SSE41_GetX38x4SatdV   %1, %2
-	SSE41_GetX38x4SatdH   %1, %2, 32
-	SSE41_I16X16GetX38x4SatdDC
-%endmacro
-%macro SSE41_ChromaGetX38x4Satd 2
-	SSE41_GetX38x4SatdDec
-	SSE41_GetX38x4SatdV   %1, %2
-	SSE41_GetX38x4SatdH   %1, %2, 16
-	SSE41_ChromaGetX38x4SatdDC %1
-%endmacro
-%macro SSE41_HSum8W 3
-	pmaddwd     %1, %2
-	movhlps     %3, %1
-	paddd       %1, %3
-	pshuflw     %3, %1,0Eh
-	paddd       %1, %3
-%endmacro
-
-
-%ifdef X86_32
-WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
-WelsIntra16x16Combined3Satd_sse41:
-	push   ebx
-	push   esi
-	push   edi
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]
-	mov    esi,    [esp+40] ;temp_satd
-	pxor        xmm4,   xmm4
-	movdqa      xmm5,   [HSumSubDB1]
-	movdqa      xmm6,   [HSumSubDW1]
-	movdqa      xmm7,   [PDW1]
-	sub         ecx,    edx
-	movdqu 		xmm0,   [ecx]
-	movhlps		xmm1,   xmm0
-	punpcklqdq  xmm0,   xmm0
-	punpcklqdq  xmm1,   xmm1
-	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
-	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
-	movdqa      [esi],  xmm0 ;V
-	movdqa      [esi+16], xmm1
-	add         ecx,    edx
-	pinsrb      xmm0,   byte[ecx-1], 0
-	pinsrb      xmm0,   byte[ecx+edx-1], 1
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     2
-	pinsrb      xmm0,   byte[ecx+edx-1], 3
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     4
-	pinsrb      xmm0,   byte[ecx+edx-1], 5
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     6
-	pinsrb      xmm0,   byte[ecx+edx-1], 7
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     8
-	pinsrb      xmm0,   byte[ecx+edx-1], 9
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     10
-	pinsrb      xmm0,   byte[ecx+edx-1], 11
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     12
-	pinsrb      xmm0,   byte[ecx+edx-1], 13
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     14
-	pinsrb      xmm0,   byte[ecx+edx-1], 15
-	movhlps		xmm1,   xmm0
-	punpcklqdq  xmm0,   xmm0
-	punpcklqdq  xmm1,   xmm1
-	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
-	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
-	movdqa      [esi+32], xmm0 ;H
-	movdqa      [esi+48], xmm1
-	movd        ecx,    xmm4 ;dc
-	add         ecx,    16   ;(sum+16)
-	shr         ecx,    5    ;((sum+16)>>5)
-	shl         ecx,    4    ;
-	movd        mm4,    ecx  ; mm4 copy DC
-	pxor        xmm4,   xmm4 ;V
-	pxor        xmm5,   xmm5 ;H
-	pxor        xmm6,   xmm6 ;DC
-	mov         ecx,    0
-	mov         edi,    0
-.loop16x16_get_satd:
-.loopStart1:
-	SSE41_I16x16GetX38x4Satd ecx, edi
-	inc          ecx
-	cmp         ecx, 4
-	jl          .loopStart1
-	cmp         edi, 16
-	je          .loop16x16_get_satd_end
-	mov         eax, [esp+24]
-	add         eax, 8
-	mov         ecx, 0
-	add         edi, 16
-	jmp         .loop16x16_get_satd
- .loop16x16_get_satd_end:
-	MMX_DW_1_2REG    xmm0, xmm1
-	psrlw       xmm4, 1 ;/2
-	psrlw       xmm5, 1 ;/2
-	psrlw       xmm6, 1 ;/2
-	SSE41_HSum8W     xmm4, xmm0, xmm1
-	SSE41_HSum8W     xmm5, xmm0, xmm1
-	SSE41_HSum8W     xmm6, xmm0, xmm1
-
-	; comparing order: DC H V
-	movd      ebx, xmm6 ;DC
-	movd      edi, xmm5 ;H
-	movd      ecx, xmm4 ;V
-	mov      edx, [esp+36]
-	shl       edx, 1
-	add       edi, edx
-	add       ebx, edx
-	mov       edx, [esp+32]
-	cmp       ebx, edi
-	jge near   not_dc_16x16
-	cmp        ebx, ecx
-	jge near   not_dc_h_16x16
-
-	; for DC mode
-	mov       dword[edx], 2;I16_PRED_DC
-	mov       eax, ebx
-	jmp near return_satd_intra_16x16_x3
-not_dc_16x16:
-	; for H mode
-	cmp       edi, ecx
-	jge near   not_dc_h_16x16
-	mov       dword[edx], 1;I16_PRED_H
-	mov       eax, edi
-	jmp near return_satd_intra_16x16_x3
-not_dc_h_16x16:
-	; for V mode
-	mov       dword[edx], 0;I16_PRED_V
-	mov       eax, ecx
-return_satd_intra_16x16_x3:
-	WELSEMMS
-	pop         edi
-	pop         esi
-	pop         ebx
-ret
-
-%macro SSE41_ChromaGetX38x8Satd 0
-	movdqa      xmm5,   [HSumSubDB1]
-	movdqa      xmm6,   [HSumSubDW1]
-	movdqa      xmm7,   [PDW1]
-	sub         ecx,    edx
-	movq 		xmm0,   [ecx]
-	punpcklqdq  xmm0,   xmm0
-	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
-	movdqa      [esi],  xmm0 ;V
-	add         ecx,    edx
-	pinsrb      xmm0,   byte[ecx-1], 0
-	pinsrb      xmm0,   byte[ecx+edx-1], 1
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     2
-	pinsrb      xmm0,   byte[ecx+edx-1], 3
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     4
-	pinsrb      xmm0,   byte[ecx+edx-1], 5
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     6
-	pinsrb      xmm0,   byte[ecx+edx-1], 7
-	punpcklqdq  xmm0,   xmm0
-	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
-	movdqa      [esi+16], xmm0 ;H
-;(sum+2)>>2
-	movdqa      xmm6,   [PDQ2]
-	movdqa      xmm5,   xmm4
-	punpckhqdq  xmm5,   xmm1
-	paddd       xmm5,   xmm6
-	psrld       xmm5,   2
-;(sum1+sum2+4)>>3
-	paddd       xmm6,   xmm6
-	paddd       xmm4,   xmm1
-	paddd       xmm4,   xmm6
-	psrld       xmm4,   3
-;satd *16
-	pslld       xmm5,   4
-	pslld       xmm4,   4
-;temp satd
-	movdqa      xmm6,   xmm4
-	punpcklqdq  xmm4,   xmm5
-	psllq       xmm4,   32
-	psrlq       xmm4,   32
-	movdqa      [esi+32], xmm4
-	punpckhqdq  xmm5,   xmm6
-	psllq       xmm5,   32
-	psrlq       xmm5,   32
-	movdqa      [esi+48], xmm5
-
-	pxor        xmm4,   xmm4 ;V
-	pxor        xmm5,   xmm5 ;H
-	pxor        xmm6,   xmm6 ;DC
-	mov         ecx,    0
-loop_chroma_satdx3_cb_cr:
-	SSE41_ChromaGetX38x4Satd ecx, 0
-	inc             ecx
-	cmp             ecx, 2
-	jl              loop_chroma_satdx3_cb_cr
-%endmacro
-
-%macro SSEReg2MMX 3
-	movdq2q     %2, %1
-	movhlps     %1, %1
-	movdq2q     %3, %1
-%endmacro
-%macro MMXReg2SSE 4
-	movq2dq     %1, %3
-	movq2dq     %2, %4
-	punpcklqdq  %1, %2
-%endmacro
-;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
-
-WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
-WelsIntraChroma8x8Combined3Satd_sse41:
-	push   ebx
-	push   esi
-	push   edi
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]
-	mov    esi,    [esp+40] ;temp_satd
-	xor    edi,    edi
-loop_chroma_satdx3:
-	SSE41_ChromaGetX38x8Satd
-	cmp             edi, 1
-	je              loop_chroma_satdx3end
-	inc             edi
-	SSEReg2MMX  xmm4, mm0,mm1
-	SSEReg2MMX  xmm5, mm2,mm3
-	SSEReg2MMX  xmm6, mm5,mm6
-	mov         ecx,  [esp+44]
-	mov         eax,  [esp+48]
-	jmp         loop_chroma_satdx3
-loop_chroma_satdx3end:
-	MMXReg2SSE  xmm0, xmm3, mm0, mm1
-	MMXReg2SSE  xmm1, xmm3, mm2, mm3
-	MMXReg2SSE  xmm2, xmm3, mm5, mm6
-
-	paddw       xmm4, xmm0
-	paddw       xmm5, xmm1
-	paddw       xmm6, xmm2
-
-	MMX_DW_1_2REG    xmm0, xmm1
-	psrlw       xmm4, 1 ;/2
-	psrlw       xmm5, 1 ;/2
-	psrlw       xmm6, 1 ;/2
-	SSE41_HSum8W     xmm4, xmm0, xmm1
-	SSE41_HSum8W     xmm5, xmm0, xmm1
-	SSE41_HSum8W     xmm6, xmm0, xmm1
-	; comparing order: DC H V
-	movd      ebx, xmm6 ;DC
-	movd      edi, xmm5 ;H
-	movd      ecx, xmm4 ;V
-	mov       edx, [esp+36]
-	shl       edx, 1
-	add       edi, edx
-	add       ecx, edx
-	mov       edx, [esp+32]
-	cmp       ebx, edi
-	jge near   not_dc_8x8
-	cmp        ebx, ecx
-	jge near   not_dc_h_8x8
-
-	; for DC mode
-	mov       dword[edx], 0;I8_PRED_DC
-	mov       eax, ebx
-	jmp near return_satd_intra_8x8_x3
-not_dc_8x8:
-	; for H mode
-	cmp       edi, ecx
-	jge near   not_dc_h_8x8
-	mov       dword[edx], 1;I8_PRED_H
-	mov       eax, edi
-	jmp near return_satd_intra_8x8_x3
-not_dc_h_8x8:
-	; for V mode
-	mov       dword[edx], 2;I8_PRED_V
-	mov       eax, ecx
-return_satd_intra_8x8_x3:
-	WELSEMMS
-	pop         edi
-	pop         esi
-	pop         ebx
-ret
-
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 END
-;
-;***********************************************************************
-%macro SSSE3_Get16BSadHVDC 2
-  movd        xmm6,%1
-  pshufb      xmm6,xmm1
-  movdqa      %1,  xmm6
-  movdqa      xmm0,%2
-  psadbw      xmm0,xmm7
-  paddw       xmm4,xmm0
-  movdqa      xmm0,%2
-  psadbw      xmm0,xmm5
-  paddw       xmm2,xmm0
-  psadbw      xmm6,%2
-  paddw       xmm3,xmm6
-%endmacro
-%macro WelsAddDCValue 4
-    movzx   %2, byte %1
-    mov    %3, %2
-    add     %4, %2
-%endmacro
-
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 BEGIN
-;
-;***********************************************************************
-WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
-WelsIntra16x16Combined3Sad_ssse3:
-	push   ebx
-	push   esi
-	push   edi
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	mov    edi,    [esp+40] ;temp_sad
-	sub    ecx,    edx
-    movdqa      xmm5,[ecx]
-    pxor        xmm0,xmm0
-    psadbw      xmm0,xmm5
-    movhlps     xmm1,xmm0
-    paddw       xmm0,xmm1
-    movd        eax,xmm0
-
-    add         ecx,edx
-    lea         ebx, [edx+2*edx]
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    lea         ecx, [ecx+4*edx]
-    add         edi, 64
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    lea         ecx, [ecx+4*edx]
-    add         edi, 64
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    lea         ecx, [ecx+4*edx]
-    add         edi, 64
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    sub        edi, 192
-    add         eax,10h
-    shr         eax,5
-    movd        xmm7,eax
-    pxor        xmm1,xmm1
-    pshufb      xmm7,xmm1
-    pxor        xmm4,xmm4
-    pxor        xmm3,xmm3
-    pxor        xmm2,xmm2
-;sad begin
-	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]
-    lea         esi, [ebx+2*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64
-    lea         eax, [eax+4*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64
-    lea         eax, [eax+4*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64
-    lea         eax, [eax+4*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-
-    pslldq      xmm3,4
-    por         xmm3,xmm2
-    movhlps     xmm1,xmm3
-    paddw       xmm3,xmm1
-    movhlps     xmm0,xmm4
-    paddw       xmm4,xmm0
-; comparing order: DC H V
-	movd        ebx, xmm4 ;DC
-	movd        ecx, xmm3 ;V
-	psrldq      xmm3, 4
-	movd        esi, xmm3 ;H
-	mov         eax, [esp+36] ;lamda
-	shl         eax, 1
-	add         esi, eax
-	add         ebx, eax
-	mov         edx, [esp+32]
-	cmp         ebx, esi
-	jge near   not_dc_16x16_sad
-	cmp        ebx, ecx
-	jge near   not_dc_h_16x16_sad
-	; for DC mode
-	mov       dword[edx], 2;I16_PRED_DC
-	mov       eax, ebx
-    sub        edi, 192
-%assign x 0
-%rep 16
-    movdqa    [edi+16*x], xmm7
-%assign x x+1
-%endrep
-	jmp near return_sad_intra_16x16_x3
-not_dc_16x16_sad:
-	; for H mode
-	cmp       esi, ecx
-	jge near   not_dc_h_16x16_sad
-	mov       dword[edx], 1;I16_PRED_H
-	mov       eax, esi
-	jmp near return_sad_intra_16x16_x3
-not_dc_h_16x16_sad:
-	; for V mode
-	mov       dword[edx], 0;I16_PRED_V
-	mov       eax, ecx
-    sub       edi, 192
-%assign x 0
-%rep 16
-    movdqa    [edi+16*x], xmm5
-%assign x x+1
-%endrep
-return_sad_intra_16x16_x3:
-	pop    edi
-	pop    esi
-	pop    ebx
-	ret
-%endif
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 END
-;
-;***********************************************************************
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 BEGIN
-;
-;***********************************************************************
-
-;SSE4.1
-%macro SSE41_GetSatd8x4 0
-	movq             xmm0, [r0]
-	punpcklqdq       xmm0, xmm0
-	pmaddubsw        xmm0, xmm7
-	movq             xmm1, [r0+r1]
-	punpcklqdq       xmm1, xmm1
-	pmaddubsw        xmm1, xmm7
-	movq             xmm2, [r2]
-	punpcklqdq       xmm2, xmm2
-	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [r2+r3]
-	punpcklqdq       xmm3, xmm3
-	pmaddubsw        xmm3, xmm7
-	psubsw           xmm0, xmm2
-	psubsw           xmm1, xmm3
-	movq             xmm2, [r0+2*r1]
-	punpcklqdq       xmm2, xmm2
-	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [r0+r4]
-	punpcklqdq       xmm3, xmm3
-	pmaddubsw        xmm3, xmm7
-	movq             xmm4, [r2+2*r3]
-	punpcklqdq       xmm4, xmm4
-	pmaddubsw        xmm4, xmm7
-	movq             xmm5, [r2+r5]
-	punpcklqdq       xmm5, xmm5
-	pmaddubsw        xmm5, xmm7
-	psubsw           xmm2, xmm4
-	psubsw           xmm3, xmm5
-	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
-	pabsw            xmm0, xmm0
-	pabsw            xmm2, xmm2
-	pabsw            xmm1, xmm1
-	pabsw            xmm3, xmm3
-	movdqa           xmm4, xmm3
-	pblendw          xmm3, xmm1, 0xAA
-	pslld            xmm1, 16
-	psrld            xmm4, 16
-	por              xmm1, xmm4
-	pmaxuw           xmm1, xmm3
-	paddw            xmm6, xmm1
-	movdqa           xmm4, xmm0
-	pblendw          xmm0, xmm2, 0xAA
-	pslld            xmm2, 16
-	psrld            xmm4, 16
-	por              xmm2, xmm4
-	pmaxuw           xmm0, xmm2
-	paddw            xmm6, xmm0
-%endmacro
-
-%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
-	MMX_DW_1_2REG    %3, %4
-	pmaddwd     %2, %3
-	movhlps     %4, %2
-	paddd       %2, %4
-	pshuflw     %4, %2,0Eh
-	paddd       %2, %4
-	movd		%1, %2
-%endmacro
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse41
-WelsSampleSatd4x4_sse41:
-	;push        ebx
-	;mov         eax,[esp+8]
-	;mov         ebx,[esp+12]
-	;mov         ecx,[esp+16]
-	;mov         edx,[esp+20]
-
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	movdqa      xmm4,[HSwapSumSubDB1]
-	movd        xmm2,[r2]
-	movd        xmm5,[r2+r3]
-	shufps      xmm2,xmm5,0
-	movd        xmm3,[r2+r3*2]
-	lea         r2, [r3*2+r2]
-	movd        xmm5,[r2+r3]
-	shufps      xmm3,xmm5,0
-	movd        xmm0,[r0]
-	movd        xmm5,[r0+r1]
-	shufps      xmm0,xmm5,0
-	movd        xmm1,[r0+r1*2]
-	lea         r0, [r1*2+r0]
-	movd        xmm5,[r0+r1]
-	shufps      xmm1,xmm5,0
-	pmaddubsw   xmm0,xmm4
-	pmaddubsw   xmm1,xmm4
-	pmaddubsw   xmm2,xmm4
-	pmaddubsw   xmm3,xmm4
-	psubw       xmm0,xmm2
-	psubw       xmm1,xmm3
-	movdqa      xmm2,xmm0
-	paddw       xmm0,xmm1
-	psubw       xmm1,xmm2
-	movdqa      xmm2,xmm0
-	punpcklqdq  xmm0,xmm1
-	punpckhqdq  xmm2,xmm1
-	movdqa      xmm1,xmm0
-	paddw       xmm0,xmm2
-	psubw       xmm2,xmm1
-	movdqa      xmm1,xmm0
-	pblendw     xmm0,xmm2,0AAh
-	pslld       xmm2,16
-	psrld       xmm1,16
-	por         xmm2,xmm1
-	pabsw       xmm0,xmm0
-	pabsw       xmm2,xmm2
-	pmaxsw      xmm0,xmm2
-	SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
-	LOAD_4_PARA_POP
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x8_sse41
-align 16
-WelsSampleSatd8x8_sse41:
-	;push   ebx
-	;push   esi
-	;push   edi
-	;mov    eax,    [esp+16]
-	;mov    ebx,    [esp+20]
-	;mov    ecx,    [esp+24]
-	;mov    edx,    [esp+28]
-%ifdef X86_32
-	push  r4
-	push  r5
-%endif
-	%assign  push_num 2
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6, xmm6
-	SSE41_GetSatd8x4
-	lea			r0,	 [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x16_sse41
-align 16
-WelsSampleSatd8x16_sse41:
-	;push   ebx
-	;push   esi
-	;push   edi
-	;push   ebp
-	;%define pushsize   16
-	;mov    eax,    [esp+pushsize+4]
-	;mov    ebx,    [esp+pushsize+8]
-	;mov    ecx,    [esp+pushsize+12]
-	;mov    edx,    [esp+pushsize+16]
-%ifdef X86_32
-	push  r4
-	push  r5
-	push  r6
-%endif
-	%assign  push_num 3
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor        xmm6, xmm6
-	mov         r6,    0
-loop_get_satd_8x16:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_8x16
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r6
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse41
-align 16
-WelsSampleSatd16x8_sse41:
-	;push   ebx
-	;push   esi
-	;push   edi
-	;mov    eax,    [esp+16]
-	;mov    ebx,    [esp+20]
-	;mov    ecx,    [esp+24]
-	;mov    edx,    [esp+28]
-%ifdef X86_32
-	push  r4
-	push  r5
-%endif
-	%assign  push_num 2
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	push  r0
-	push  r2
-
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6,   xmm6
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
-
-	pop  r2
-	pop  r0
-	;mov			eax,    [esp+16]
-	;mov			ecx,    [esp+24]
-	add			r0,    8
-	add			r2,    8
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-
-WELS_EXTERN WelsSampleSatd16x16_sse41
-align 16
-WelsSampleSatd16x16_sse41:
-	;push   ebx
-	;push   esi
-	;push   edi
-	;push   ebp
-	;%define pushsize   16
-	;mov    eax,    [esp+pushsize+4]
-	;mov    ebx,    [esp+pushsize+8]
-	;mov    ecx,    [esp+pushsize+12]
-	;mov    edx,    [esp+pushsize+16]
-%ifdef X86_32
-	push  r4
-	push  r5
-	push  r6
-%endif
-	%assign  push_num 3
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-
-	push  r0
-	push  r2
-
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6,   xmm6
-	mov         r6,    0
-loop_get_satd_16x16_left:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_16x16_left
-
-	pop  r2
-	pop  r0
-	;mov			eax,    [esp+pushsize+4]
-	;mov			ecx,    [esp+pushsize+12]
-	add			r0,    8
-	add			r2,    8
-	mov         r6,    0
-loop_get_satd_16x16_right:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_16x16_right
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	;%undef pushsize
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r6
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE2_GetSad2x16 0
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqu xmm1,   [r2]
-	MOVDQ  xmm2,   [r0];[eax] must aligned 16
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-%endmacro
-
-
-%macro SSE2_GetSad4x16 0
-	movdqu xmm0,   [r2]
-	MOVDQ  xmm2,   [r0]
-	psadbw xmm0,   xmm2
-	paddw  xmm7,   xmm0
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-	movdqu xmm1,   [r2+2*r3]
-	MOVDQ  xmm2,   [r0+2*r1];[eax] must aligned 16
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-	movdqu xmm1,   [r2+r5]
-	MOVDQ  xmm2,   [r0+r4]
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-%endmacro
-
-
-%macro SSE2_GetSad8x4 0
-	movq   xmm0,   [r0]
-	movq   xmm1,   [r0+r1]
-	lea    r0,     [r0+2*r1]
-	movhps xmm0,   [r0]
-	movhps xmm1,   [r0+r1]
-
-	movq   xmm2,   [r2]
-	movq   xmm3,   [r2+r3]
-	lea    r2,     [r2+2*r3]
-	movhps xmm2,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm0,   xmm2
-	psadbw xmm1,   xmm3
-	paddw  xmm6,   xmm0
-	paddw  xmm6,   xmm1
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
-;First parameter can align to 16 bytes,
-;In wels, the third parameter can't align to 16 bytes.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x16_sse2
-align 16
-WelsSampleSad16x16_sse2:
-	;push ebx
-	;push edi
-	;push esi
-	;%define _STACK_SIZE		12
-	;mov eax, [esp+_STACK_SIZE+4 ]
-	;mov	ebx, [esp+_STACK_SIZE+8 ]
-	;mov ecx, [esp+_STACK_SIZE+12]
-	;mov edx, [esp+_STACK_SIZE+16]
-%ifdef X86_32
-	push  r4
-	push  r5
-%endif
-
-	%assign  push_num 2
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	lea r4, [3*r1]
-	lea r5, [3*r3]
-
-	pxor   xmm7,   xmm7
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	movhlps xmm0, xmm7
-	paddw xmm0, xmm7
-	movd retrd, xmm0
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
-;First parameter can align to 16 bytes,
-;In wels, the third parameter can't align to 16 bytes.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x8_sse2
-align 16
-WelsSampleSad16x8_sse2:
-	;push   ebx
-	;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    ecx,    [esp+16]
-	;mov    edx,    [esp+20]
-
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	movdqu xmm0,   [r2]
-	MOVDQ  xmm2,   [r0]
-	psadbw xmm0,   xmm2
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-
-	SSE2_GetSad2x16
-	SSE2_GetSad2x16
-	SSE2_GetSad2x16
-
-	movhlps     xmm1, xmm0
-	paddw       xmm0, xmm1
-	movd        retrd,  xmm0
-	LOAD_4_PARA_POP
-	ret
-
-
-
-WELS_EXTERN WelsSampleSad8x16_sse2
-WelsSampleSad8x16_sse2:
-	;push   ebx
-	;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    ecx,    [esp+16]
-	;mov    edx,    [esp+20]
-
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-    pxor   xmm6,   xmm6
-
-	SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-    SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-    SSE2_GetSad8x4
-
-    movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       retrd,  xmm0
-	LOAD_4_PARA_POP
-	ret
-
-
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and    %1,  0x1f|(%3>>1)
-cmp    %1,  (32-%2)|(%3>>1)
-%endmacro
-
-WELS_EXTERN WelsSampleSad8x8_sse21
-WelsSampleSad8x8_sse21:
-    ;mov    ecx,    [esp+12]
-	;mov    edx,    ecx
-    ;CACHE_SPLIT_CHECK edx, 8, 64
-	;jle    near   .pixel_sad_8x8_nsplit
-	;push   ebx
-	;push   edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-
-	%assign  push_num 0
-	mov		r2,  arg3
-	push	r2
-	CACHE_SPLIT_CHECK r2, 8, 64
-	jle    near   .pixel_sad_8x8_nsplit
-	pop		r2
-%ifdef X86_32
-	push	r3
-	push	r4
-	push	r5
-%endif
-	%assign  push_num 3
-	mov		r0,  arg1
-	mov		r1,  arg2
-	SIGN_EXTENTION r1, r1d
-    pxor   xmm7,   xmm7
-
-    ;ecx r2, edx r4, edi r5
-
-    mov    r5,    r2
-    and    r5,    0x07
-    sub    r2,    r5
-    mov    r4,    8
-    sub    r4,    r5
-
-    shl    r5,    3
-    shl    r4,    3
-    movd   xmm5,   r5d
-    movd   xmm6,   r4d
-	mov    r5,    8
-	add    r5,    r2
-    mov    r3,    arg4
-	SIGN_EXTENTION r3, r3d
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-    movhlps    xmm0, xmm7
-	paddw      xmm0, xmm7
-	movd       retrd,  xmm0
-%ifdef X86_32
-	pop	 r5
-	pop	 r4
-	pop	 r3
-%endif
-	jmp        .return
-
-.pixel_sad_8x8_nsplit:
-    ;push   ebx
-    ;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    edx,    [esp+20]
-
-	pop r2
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	pxor   xmm6,   xmm6
-	SSE2_GetSad8x4
-    lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-    SSE2_GetSad8x4
-    movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       retrd,  xmm0
-	LOAD_4_PARA_POP
-.return:
-	ret
-
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 END
-;
-;***********************************************************************
-
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-
-%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
-	psadbw %1,   %4
-	paddw  xmm5, %1
-	psadbw %4,   %3
-	paddw  xmm4, %4
-	movdqu %4,   [%5-1]
-	psadbw %4,   %2
-	paddw  xmm6, %4
-	movdqu %4,   [%5+1]
-	psadbw %4,   %2
-	paddw  xmm7, %4
-%endmacro
-WELS_EXTERN WelsSampleSadFour16x16_sse2
-WelsSampleSadFour16x16_sse2:
-	;push ebx
-	;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    ecx,    [esp+16]
-	;mov    edx,    [esp+20]
-
-	%assign  push_num 0
-	LOAD_5_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movdqa xmm0,   [r0]
-	sub    r2,    r3
-	movdqu xmm3,   [r2]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
-
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm3,   xmm1
-	paddw  xmm4,   xmm3
-
-	movdqu xmm2,   [r2+r3-1]
-	psadbw xmm2,   xmm0
-	paddw  xmm6,   xmm2
-
-	movdqu xmm3,   [r2+r3+1]
-	psadbw xmm3,   xmm0
-	paddw  xmm7,   xmm3
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r2,    [r2+2*r3]
-	movdqu xmm3,   [r2]
-	psadbw xmm2,   xmm3
-	paddw xmm5,   xmm2
-
-	movdqu xmm2,   [r2-1]
-	psadbw xmm2,   xmm0
-	paddw xmm6,   xmm2
-
-	movdqu xmm3,   [r2+1]
-	psadbw xmm3,   xmm0
-	paddw xmm7,   xmm3
-
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm0,   xmm3
-	paddw xmm5,   xmm0
-
-	;mov        ecx,  [esp+24]
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	LOAD_5_PARA_POP
-	ret
-
-
-WELS_EXTERN WelsSampleSadFour16x8_sse2
-WelsSampleSadFour16x8_sse2:
-	;push ebx
-	;push edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-	;mov    edi,    [esp+20]
-	;mov    edx,    [esp+24]
-
-	%assign  push_num 0
-	LOAD_5_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movdqa xmm0,   [r0]
-	sub    r2,    r3
-	movdqu xmm3,   [r2]
-	psadbw xmm3,   xmm0
-	paddw xmm4,   xmm3
-
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm3,   xmm1
-	paddw xmm4,   xmm3
-
-	movdqu xmm2,   [r2+r3-1]
-	psadbw xmm2,   xmm0
-	paddw xmm6,   xmm2
-
-	movdqu xmm3,   [r2+r3+1]
-	psadbw xmm3,   xmm0
-	paddw xmm7,   xmm3
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r2,    [r2+2*r3]
-	movdqu xmm3,   [r2]
-	psadbw xmm0,   xmm3
-	paddw xmm5,   xmm0
-
-	movdqu xmm0,   [r2-1]
-	psadbw xmm0,   xmm1
-	paddw xmm6,   xmm0
-
-	movdqu xmm3,   [r2+1]
-	psadbw xmm3,   xmm1
-	paddw xmm7,   xmm3
-
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm1,   xmm3
-	paddw xmm5,   xmm1
-
-	;mov        edi,  [esp+28]
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	LOAD_5_PARA_POP
-	ret
-
-WELS_EXTERN WelsSampleSadFour8x16_sse2
-WelsSampleSadFour8x16_sse2:
-	;push ebx
-	;push edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-	;mov    edi,    [esp+20]
-	;mov    edx,    [esp+24]
-
-	%assign  push_num 0
-	LOAD_5_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-	sub    r2,    r3
-	movq   xmm3,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	;mov        edi,  [esp+28]
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	LOAD_5_PARA_POP
-	ret
-
-
-WELS_EXTERN WelsSampleSadFour8x8_sse2
-WelsSampleSadFour8x8_sse2:
-	;push ebx
-	;push edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-	;mov    edi,    [esp+20]
-	;mov    edx,    [esp+24]
-
-	%assign  push_num 0
-	LOAD_5_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-	sub    r2,    r3
-	movq   xmm3,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	;mov        edi,  [esp+28]
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	LOAD_5_PARA_POP
-	ret
-
-WELS_EXTERN WelsSampleSadFour4x4_sse2
-WelsSampleSadFour4x4_sse2:
-	;push ebx
-	;push edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-	;mov    edi,    [esp+20]
-	;mov    edx,    [esp+24]
-
-	%assign  push_num 0
-	LOAD_5_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	movd   xmm0,   [r0]
-	movd   xmm1,   [r0+r1]
-	lea        r0,    [r0+2*r1]
-	movd       xmm2,   [r0]
-	movd       xmm3,   [r0+r1]
-	punpckldq  xmm0, xmm1
-	punpckldq  xmm2, xmm3
-	punpcklqdq xmm0, xmm2
-	sub        r2,  r3
-	movd       xmm1, [r2]
-	movd       xmm2, [r2+r3]
-	punpckldq  xmm1, xmm2
-	movd       xmm2, [r2+r3-1]
-	movd       xmm3, [r2+r3+1]
-
-	lea        r2,  [r2+2*r3]
-
-	movd       xmm4, [r2]
-	movd       xmm5, [r2-1]
-	punpckldq  xmm2, xmm5
-	movd       xmm5, [r2+1]
-	punpckldq  xmm3, xmm5
-
-	movd       xmm5, [r2+r3]
-	punpckldq  xmm4, xmm5
-
-	punpcklqdq xmm1, xmm4 ;-L
-
-	movd       xmm5, [r2+r3-1]
-	movd       xmm6, [r2+r3+1]
-
-	lea        r2,  [r2+2*r3]
-	movd       xmm7, [r2-1]
-	punpckldq  xmm5, xmm7
-	punpcklqdq xmm2, xmm5 ;-1
-	movd       xmm7, [r2+1]
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm3, xmm6 ;+1
-	movd       xmm6, [r2]
-	movd       xmm7, [r2+r3]
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6 ;+L
-	psadbw     xmm1, xmm0
-	psadbw     xmm2, xmm0
-	psadbw     xmm3, xmm0
-	psadbw     xmm4, xmm0
-
-	movhlps    xmm0, xmm1
-	paddw      xmm1, xmm0
-	movhlps    xmm0, xmm2
-	paddw      xmm2, xmm0
-	movhlps    xmm0, xmm3
-	paddw      xmm3, xmm0
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	;mov        edi,  [esp+28]
-	punpckldq  xmm1, xmm4
-	punpckldq  xmm2, xmm3
-	punpcklqdq xmm1, xmm2
-	movdqa     [r4],xmm1
-	LOAD_5_PARA_POP
-	ret
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 END
-;
-;***********************************************************************
-
-WELS_EXTERN WelsSampleSad4x4_mmx
-
-align 16
-;***********************************************************************
-;   int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
-;***********************************************************************
-WelsSampleSad4x4_mmx:
-    ;push    ebx
-	;%define pushsize     4
-	;%define pix1address	 esp+pushsize+4
-	;%define pix1stride   esp+pushsize+8
-	;%define pix2address  esp+pushsize+12
-	;%define pix2stride   esp+pushsize+16
-    ;mov		  eax, [pix1address]
-    ;mov		  ebx, [pix1stride ]
-    ;mov		  ecx, [pix2address]
-    ;mov		  edx, [pix2stride ]
-
-    %assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	movd	  mm0, [r0]
-	movd	  mm1, [r0+r1]
-	punpckldq mm0, mm1
-
-	movd      mm3, [r2]
-	movd      mm4, [r2+r3]
-	punpckldq mm3, mm4
-	psadbw    mm0, mm3
-
-	lea       r0, [r0+2*r1]
-	lea       r2, [r2+2*r3]
-
-	movd      mm1, [r0]
-	movd      mm2, [r0+r1]
-	punpckldq mm1, mm2
-
-	movd      mm3, [r2]
-	movd      mm4, [r2+r3]
-	punpckldq mm3, mm4
-	psadbw    mm1, mm3
-	paddw     mm0, mm1
-
-    movd      retrd, mm0
-
-	WELSEMMS
-    LOAD_4_PARA_POP
-    ret
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -41,7 +41,6 @@
 	$(ENCODER_SRCDIR)/./core/asm/intra_pred.asm\
 	$(ENCODER_SRCDIR)/./core/asm/memzero.asm\
 	$(ENCODER_SRCDIR)/./core/asm/quant.asm\
-	$(ENCODER_SRCDIR)/./core/asm/satd_sad.asm\
 	$(ENCODER_SRCDIR)/./core/asm/score.asm\
 
 ENCODER_OBJS += $(ENCODER_ASM_SRCS:.asm=.o)
--- a/codec/processing/build/win32/WelsVP_2008.vcproj
+++ b/codec/processing/build/win32/WelsVP_2008.vcproj
@@ -594,47 +594,7 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\src\asm\intra_pred.asm"
-				>
-				<FileConfiguration
-					Name="Debug|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|x64"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|x64"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\src\asm\sad.asm"
+				RelativePath="..\..\..\common\satd_sad.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
--- a/codec/processing/src/asm/intra_pred.asm
+++ /dev/null
@@ -1,1505 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  intra_pred.asm
-;*
-;*  Abstract
-;*      sse2 function for intra predict operations
-;*
-;*  History
-;*      18/09/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-align 16
-sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
-align 16
-sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8
-align 16
-sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1
-
-; for chroma plane mode
-sse2_plane_inc_c dw 1, 2, 3, 4
-sse2_plane_dec_c dw 4, 3, 2, 1
-align 16
-sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4
-
-align 16
-mmx_01bytes:		times 16	db 1
-;align 16
-;sse_0x0004bytes:	times 8		dw 4
-;ALIGN 16
-;sse_f000 db  255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-
-align 16
-mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
-
-
-;***********************************************************************
-; macros
-;***********************************************************************
-;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
-;%1 will keep the last result
-%macro SSE_DB_1_2REG 2
-      pxor %1, %1
-      pcmpeqw %2, %2
-      psubb %1, %2
-%endmacro
-
-;xmm0, xmm1, xmm2, eax, ecx
-;lower 64 bits of xmm0 save the result
-%macro SSE2_PRED_H_4X4_TWO_LINE 5
-    movd		%1,	[%4-1]
-	movdqa		%3,	%1
-	punpcklbw	%1,	%3
-	movdqa		%3,	%1
-	punpcklbw	%1,	%3
-
-	;add			%4,	%5
-	movd		%2,	[%4+%5-1]
-	movdqa		%3,	%2
-	punpcklbw	%2,	%3
-	movdqa		%3,	%2
-	punpcklbw	%2,	%3
-	punpckldq	%1,	%2
-%endmacro
-
-%macro  SUMW_HORIZON1 2
-	movdqa      %2, %1
-	psrldq      %2, 8
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 4
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 2
-	paddusw     %1, %2
-%endmacro
-
-%macro	LOAD_COLUMN 6
-		movd	%1,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %1,	%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %3,	%2
-		punpcklwd %1,	%3
-		lea		%5,	[%5+2*%6]
-		movd	%4,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %4,	%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		lea		%5,	[%5+2*%6]
-		punpcklbw %3,	%2
-		punpcklwd %4,	%3
-		punpckhdq %1,	%4
-%endmacro
-
-%macro  SUMW_HORIZON 3
-	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
-	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
-	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
-	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
-	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
-%endmacro
-
-
-%macro  COPY_16_TIMES 2
-		movdqa		%2,	[%1-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
-%endmacro
-
-%macro  COPY_16_TIMESS 3
-		movdqa		%2,	[%1+%3-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
-%endmacro
-
-%macro	LOAD_COLUMN_C 6
-		movd	%1,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %1,%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %3,	%2
-		punpckhwd %1,	%3
-		lea		%5,	[%5+2*%6]
-%endmacro
-
-%macro LOAD_2_LEFT_AND_ADD 0
-        lea         r1, [r1+2*r2]
-        movzx		r4, byte [r1-0x01]
-        add			r3, r4
-        movzx		r4, byte [r1+r2-0x01]
-        add			r3, r4
-%endmacro
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-WELS_EXTERN WelsI4x4LumaPredH_sse2
-WELS_EXTERN WelsI4x4LumaPredDDR_mmx
-WELS_EXTERN WelsI4x4LumaPredDc_sse2
-WELS_EXTERN WelsI16x16LumaPredPlane_sse2
-
-ALIGN 16
-;***********************************************************************
-;   void __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;
-;	pred must align to 16
-;***********************************************************************
-WelsI4x4LumaPredH_sse2:
-	push r3
-	%assign push_num 1
-	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
-	movzx		r3,	byte [r1-1]
-	movd		xmm0,	r3d
-	pmuludq		xmm0,	[mmx_01bytes]
-
-	movzx		r3,	byte [r1+r2-1]
-	movd		xmm1,	r3d
-	pmuludq		xmm1,	[mmx_01bytes]
-
-	unpcklps	xmm0,	xmm1
-
-	lea			r1,	[r1+r2*2]
-	movzx		r3,	byte [r1-1]
-	movd		xmm2,	r3d
-	pmuludq		xmm2,	[mmx_01bytes]
-
-	movzx		r3,	byte [r1+r2-1]
-	movd		xmm3,	r3d
-	pmuludq		xmm3,	[mmx_01bytes]
-
-	unpcklps	xmm2,	xmm3
-	unpcklpd	xmm0,	xmm2
-
-	movdqa		[r0],	xmm0
-	pop r3
-	ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WelsI16x16LumaPredPlane_sse2:
-		;%define pushsize	4
-		;push	esi
-		;mov		esi,	[esp + pushsize + 8]
-		;mov		ecx,	[esp + pushsize + 12]
-		push r3
-		push r4
-		%assign push_num 2
-		LOAD_3_PARA
-		%ifndef X86_32
-		movsx r2, r2d
-		%endif
-		sub		r1,	1
-		sub		r1,	r2
-
-		;for H
-		pxor	xmm7,	xmm7
-		movq	xmm0,	[r1]
-		movdqa	xmm5,	[sse2_plane_dec]
-		punpcklbw xmm0,	xmm7
-		pmullw	xmm0,	xmm5
-		movq	xmm1,	[r1 + 9]
-		movdqa	xmm6,	[sse2_plane_inc]
-		punpcklbw xmm1,	xmm7
-		pmullw	xmm1,	xmm6
-		psubw	xmm1,	xmm0
-
-		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    r3d,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
-		movsx	r3,	r3w
-		imul	r3,	5
-		add		r3,	32
-		sar		r3,	6			; b = (5 * H + 32) >> 6;
-		SSE2_Copy8Times	xmm1, r3d	; xmm1 = b,b,b,b,b,b,b,b
-
-		movzx	r4,	BYTE [r1+16]
-		sub	r1, 3
-		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, r1, r2
-
-		add		r1,	3
-		movzx	r3,	BYTE [r1+8*r2]
-		add		r4,	r3
-		shl		r4,	4			;	a = (left[15*stride] + top[15]) << 4;
-
-		sub	r1, 3
-		add		r1,	r2
-		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, r1, r2
-		pxor	xmm4,	xmm4
-		punpckhbw xmm0,	xmm4
-		pmullw	xmm0,	xmm5
-		punpckhbw xmm7,	xmm4
-		pmullw	xmm7,	xmm6
-		psubw	xmm7,	xmm0
-
-		SUMW_HORIZON   xmm7,xmm0,xmm2
-		movd    r3d,   xmm7			; V
-		movsx	r3,	r3w
-		imul	r3,	5
-		add		r3,	32
-		sar		r3,	6				; c = (5 * V + 32) >> 6;
-		SSE2_Copy8Times	xmm4, r3d		; xmm4 = c,c,c,c,c,c,c,c
-
-		;mov		esi,	[esp + pushsize + 4]
-		add		r4,	16
-		imul	r3,	-7
-		add		r3,	r4				; s = a + 16 + (-7)*c
-		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
-
-		xor		r3,	r3
-		movdqa	xmm5,	[sse2_plane_inc_minus]
-
-get_i16x16_luma_pred_plane_sse2_1:
-		movdqa	xmm2,	xmm1
-		pmullw	xmm2,	xmm5
-		paddw	xmm2,	xmm0
-		psraw	xmm2,	5
-		movdqa	xmm3,	xmm1
-		pmullw	xmm3,	xmm6
-		paddw	xmm3,	xmm0
-		psraw	xmm3,	5
-		packuswb xmm2,	xmm3
-		movdqa	[r0],	xmm2
-		paddw	xmm0,	xmm4
-		add		r0,	16
-		inc		r3
-		cmp		r3,	16
-		jnz get_i16x16_luma_pred_plane_sse2_1
-		pop r4
-		pop r3
-		ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-
-%macro SSE2_PRED_H_16X16_ONE_LINE 0
-	add r0, 16
-	add r1, r2
-	movzx r3, byte [r1]
-	SSE2_Copy16Times xmm0, r3d
-	movdqa [r0], xmm0
-%endmacro
-
-WELS_EXTERN WelsI16x16LumaPredH_sse2
-WelsI16x16LumaPredH_sse2:
-	push r3
-	%assign push_num 1
-	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
-	dec r1
-	movzx r3, byte [r1]
-	SSE2_Copy16Times xmm0, r3d
-	movdqa [r0], xmm0
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	pop r3
-    ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredV_sse2
-WelsI16x16LumaPredV_sse2:
-    ;mov     edx, [esp+4]    ; pred
-    ;mov     eax, [esp+8]	; pRef
-    ;mov     ecx, [esp+12]   ; stride
-    %assign push_num 0
-    LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
-    sub     r1, r2
-    movdqa  xmm0, [r1]
-
-    movdqa  [r0], xmm0
-    movdqa  [r0+10h], xmm0
-    movdqa  [r0+20h], xmm0
-    movdqa  [r0+30h], xmm0
-    movdqa  [r0+40h], xmm0
-    movdqa  [r0+50h], xmm0
-    movdqa  [r0+60h], xmm0
-    movdqa  [r0+70h], xmm0
-    movdqa  [r0+80h], xmm0
-    movdqa  [r0+90h], xmm0
-    movdqa  [r0+160], xmm0
-    movdqa  [r0+176], xmm0
-    movdqa  [r0+192], xmm0
-    movdqa  [r0+208], xmm0
-    movdqa  [r0+224], xmm0
-    movdqa  [r0+240], xmm0
-
-    ret
-
-;***********************************************************************
-; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsIChromaPredPlane_sse2
-WelsIChromaPredPlane_sse2:
-		;%define pushsize	4
-		;push	esi
-		;mov		esi,	[esp + pushsize + 8]	;pRef
-		;mov		ecx,	[esp + pushsize + 12]	;stride
-		push r3
-		push r4
-		%assign push_num 2
-		LOAD_3_PARA
-		%ifndef X86_32
-		movsx r2, r2d
-		%endif
-		sub		r1,	1
-		sub		r1,	r2
-
-		pxor	mm7,	mm7
-		movq	mm0,	[r1]
-		movq	mm5,	[sse2_plane_dec_c]
-		punpcklbw mm0,	mm7
-		pmullw	mm0,	mm5
-		movq	mm1,	[r1 + 5]
-		movq	mm6,	[sse2_plane_inc_c]
-		punpcklbw mm1,	mm7
-		pmullw	mm1,	mm6
-		psubw	mm1,	mm0
-
-		movq2dq xmm1,   mm1
-		pxor    xmm2,   xmm2
-		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    r3d,	xmm1
-		movsx	r3,	r3w
-		imul	r3,	17
-		add		r3,	16
-		sar		r3,	5			; b = (17 * H + 16) >> 5;
-		SSE2_Copy8Times	xmm1, r3d	; mm1 = b,b,b,b,b,b,b,b
-
-		movzx	r3,	BYTE [r1+8]
-		sub	r1, 3
-		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, r1, r2
-
-		add		r1,	3
-		movzx	r4,	BYTE [r1+4*r2]
-		add		r4,	r3
-		shl		r4,	4			; a = (left[7*stride] + top[7]) << 4;
-
-		sub	r1, 3
-		add		r1,	r2
-		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, r1, r2
-		pxor	mm4,	mm4
-		punpckhbw mm0,	mm4
-		pmullw	mm0,	mm5
-		punpckhbw mm7,	mm4
-		pmullw	mm7,	mm6
-		psubw	mm7,	mm0
-
-		movq2dq xmm7,   mm7
-		pxor    xmm2,   xmm2
-		SUMW_HORIZON	xmm7,xmm0,xmm2
-		movd    r3d,    xmm7			; V
-		movsx	r3,	r3w
-		imul	r3,	17
-		add		r3,	16
-		sar		r3,	5				; c = (17 * V + 16) >> 5;
-		SSE2_Copy8Times	xmm4, r3d	; mm4 = c,c,c,c,c,c,c,c
-
-		;mov		esi,	[esp + pushsize + 4]
-		add		r4,	16
-		imul	r3,	-3
-		add		r3,	r4		; s = a + 16 + (-3)*c
-		SSE2_Copy8Times	xmm0, r3d	; xmm0 = s,s,s,s,s,s,s,s
-
-		xor		r3,	r3
-		movdqa	xmm5,	[sse2_plane_mul_b_c]
-
-get_i_chroma_pred_plane_sse2_1:
-		movdqa	xmm2,	xmm1
-		pmullw	xmm2,	xmm5
-		paddw	xmm2,	xmm0
-		psraw	xmm2,	5
-		packuswb xmm2,	xmm2
-		movq	[r0],	xmm2
-		paddw	xmm0,	xmm4
-		add		r0,	8
-		inc		r3
-		cmp		r3,	8
-		jnz get_i_chroma_pred_plane_sse2_1
-		pop r4
-		pop r3
-		WELSEMMS
-		ret
-
-ALIGN 16
-;***********************************************************************
-;	0 |1 |2 |3 |4 |
-;	6 |7 |8 |9 |10|
-;	11|12|13|14|15|
-;	16|17|18|19|20|
-;	21|22|23|24|25|
-;	7 is the start pixel of current 4x4 block
-;	pred[7] = ([6]+[0]*2+[1]+2)/4
-;
-;   void __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;
-;***********************************************************************
-WelsI4x4LumaPredDDR_mmx:
-	;mov			edx,[esp+4]			;pred
-	;mov         eax,[esp+8]			;pRef
-	;mov			ecx,[esp+12]		;stride
-	%assign push_num 0
-	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
-	movq        mm1,[r1+r2-8]		;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
-	movq        mm2,[r1-8]			;get value of 6 mm2[8] = 6
-	sub		r1, r2			;mov eax to above line of current block(postion of 1)
-	punpckhbw   mm2,[r1-8]			;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
-	movd        mm3,[r1]			;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
-	punpckhwd   mm1,mm2				;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
-	psllq       mm3,18h				;mm3[5]=[1]
-	psrlq       mm1,28h				;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
-	por         mm3,mm1				;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
-	movq        mm1,mm3				;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
-	lea  	    r1,[r1+r2*2-8h]		;set eax point to 12
-	movq        mm4,[r1+r2]		;get value of 16, mm4[8]=[16]
-	psllq       mm3,8				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
-	psrlq       mm4,38h				;mm4[1]=[16]
-	por         mm3,mm4				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
-	movq        mm2,mm3				;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
-	movq        mm4,[r1+r2*2]		;mm4[8]=[21]
-	psllq       mm3,8				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
-	psrlq       mm4,38h				;mm4[1]=[21]
-	por         mm3,mm4				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
-	movq        mm4,mm3				;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
-	pavgb       mm3,mm1				;mm3=([11]+[21]+1)/2
-	pxor        mm1,mm4				;find odd value in the lowest bit of each byte
-	pand        mm1,[mmx_01bytes]	;set the odd bit
-	psubusb     mm3,mm1				;decrease 1 from odd bytes
-	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
-
-	movd        [r0+12],mm2
-	psrlq       mm2,8
-	movd        [r0+8],mm2
-	psrlq       mm2,8
-	movd        [r0+4],mm2
-	psrlq       mm2,8
-	movd        [r0],mm2
-	WELSEMMS
-	ret
-
-ALIGN 16
-;***********************************************************************
-;	0 |1 |2 |3 |4 |
-;	5 |6 |7 |8 |9 |
-;	10|11|12|13|14|
-;	15|16|17|18|19|
-;	20|21|22|23|24|
-;	6 is the start pixel of current 4x4 block
-;	pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
-;
-;   void __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;
-;***********************************************************************
-WelsI4x4LumaPredDc_sse2:
-	push r3
-	push r4
-	%assign push_num 2
-	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
-	movzx		r4,	byte [r1-1h]
-	sub			r1,	r2
-	movd		xmm0,	[r1]
-	pxor		xmm1,	xmm1
-	psadbw		xmm0,	xmm1
-	xor r3, r3
-	movd		r3d,	xmm0
-	add			r3,	r4
-	movzx		r4,	byte [r1+r2*2-1h]
-	add			r3,	r4
-
-	lea			r1,	[r1+r2*2-1]
-	movzx		r4,	byte [r1+r2]
-	add			r3,	r4
-
-	movzx		r4,	byte [r1+r2*2]
-	add			r3,	r4
-	add			r3,	4
-	sar			r3,	3
-	imul		r3,	0x01010101
-
-	movd		xmm0,	r3d
-	pshufd		xmm0,	xmm0,	0
-	movdqa		[r0],	xmm0
-	pop r4
-	pop r3
-	ret
-
-ALIGN 16
-;***********************************************************************
-;	void __cdecl WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;   copy 8 pixel of 8 line from left
-;***********************************************************************
-%macro MMX_PRED_H_8X8_ONE_LINE 4
-	movq		%1,		[%3-8]
-	psrlq		%1,		38h
-
-	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
-	pmullw		%1,		[mmx_01bytes]
-	pshufw		%1,		%1,	0
-	movq		[%4],	%1
-%endmacro
-
-%macro MMX_PRED_H_8X8_ONE_LINEE 4
-	movq		%1,		[%3+r2-8]
-	psrlq		%1,		38h
-
-	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
-	pmullw		%1,		[mmx_01bytes]
-	pshufw		%1,		%1,	0
-	movq		[%4],	%1
-%endmacro
-
-WELS_EXTERN WelsIChromaPredH_mmx
-WelsIChromaPredH_mmx:
-	;mov			edx,	[esp+4]			;pred
-	;mov         eax,	[esp+8]			;pRef
-	;mov			ecx,	[esp+12]		;stride
-	%assign push_num 0
-	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
-	movq		mm0,	[r1-8]
-	psrlq		mm0,	38h
-
-	;pmuludq		mm0,	[mmx_01bytes]		;extend to 4 bytes
-	pmullw		mm0,		[mmx_01bytes]
-	pshufw		mm0,	mm0,	0
-	movq		[r0],	mm0
-
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+8
-
-	lea			r1,[r1+r2*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+16
-
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+24
-
-	lea			r1,[r1+r2*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+32
-
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+40
-
-	lea			r1,[r1+r2*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+48
-
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+56
-	WELSEMMS
-	ret
-
-ALIGN 16
-;***********************************************************************
-;	void __cdecl WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;   copy pixels from top 4 pixels
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredV_sse2
-WelsI4x4LumaPredV_sse2:
-	%assign push_num 0
-	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
-	sub			r1,	r2
-	movd		xmm0,	[r1]
-	pshufd		xmm0,	xmm0,	0
-	movdqa		[r0],	xmm0
-	ret
-
-ALIGN 16
-;***********************************************************************
-;	void __cdecl WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;   copy 8 pixels from top 8 pixels
-;***********************************************************************
-WELS_EXTERN WelsIChromaPredV_sse2
-WelsIChromaPredV_sse2:
-	%assign push_num 0
-	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
-	sub		r1,		r2
-	movq		xmm0,		[r1]
-	movdqa		xmm1,		xmm0
-	punpcklqdq	xmm0,		xmm1
-	movdqa		[r0],		xmm0
-	movdqa		[r0+16],	xmm0
-	movdqa		[r0+32],	xmm0
-	movdqa		[r0+48],	xmm0
-	ret
-
-	ALIGN 16
-;***********************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	t3 will never been used
-;   destination:
-;	|a |b |c |d |
-;	|e |f |a |b |
-;	|g |h |e |f |
-;	|i |j |g |h |
-
-;   a = (1 + lt + l0)>>1
-;   e = (1 + l0 + l1)>>1
-;   g = (1 + l1 + l2)>>1
-;   i = (1 + l2 + l3)>>1
-
-;   d = (2 + t0 + (t1<<1) + t2)>>2
-;   c = (2 + lt + (t0<<1) + t1)>>2
-;   b = (2 + l0 + (lt<<1) + t0)>>2
-
-;   f = (2 + l1 + (l0<<1) + lt)>>2
-;   h = (2 + l2 + (l1<<1) + l0)>>2
-;   j = (2 + l3 + (l2<<1) + l1)>>2
-;   [b a f e h g j i] + [d c b a] --> mov to memory
-;
-;   void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredHD_mmx
-WelsI4x4LumaPredHD_mmx:
-	%assign push_num 0
-	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
-	sub         r1, r2
-	movd        mm0, [r1-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
-	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-
-	movd        mm1, [r1+2*r2-4]
-	punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         r1, [r1+2*r2]
-	movd        mm2, [r1+2*r2-4]
-	punpcklbw   mm2, [r1+r2-4]        ; mm2[7] = l2, mm2[6] = l3
-	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
-	psrlq       mm2, 20h
-	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-
-	movq        mm1, mm0
-	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
-	movq        mm2, mm0
-	psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
-	movq        mm3, mm2
-	movq        mm4, mm1
-	pavgb       mm1, mm0
-
-	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
-	pand        mm4, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm1, mm4				; decrease 1 from odd bytes
-
-	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
-
-	movq        mm4, mm0
-	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
-	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
-
-	psrlq       mm2, 20h
-	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
-	movq        mm4, mm3
-	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
-	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
-	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
-
-	movd        [r0], mm2
-	movd        [r0+12], mm3
-	psrlq       mm3, 10h
-	movd        [r0+8], mm3
-	psrlq       mm3, 10h
-	movd        [r0+4], mm3
-	WELSEMMS
-	ret
-
-ALIGN 16
-;***********************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	t3 will never been used
-;   destination:
-;	|a |b |c |d |
-;	|c |d |e |f |
-;	|e |f |g |g |
-;	|g |g |g |g |
-
-;   a = (1 + l0 + l1)>>1
-;   c = (1 + l1 + l2)>>1
-;   e = (1 + l2 + l3)>>1
-;   g = l3
-
-;   b = (2 + l0 + (l1<<1) + l2)>>2
-;   d = (2 + l1 + (l2<<1) + l3)>>2
-;   f = (2 + l2 + (l3<<1) + l3)>>2
-
-;   [g g f e d c b a] + [g g g g] --> mov to memory
-;
-;   void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredHU_mmx
-WelsI4x4LumaPredHU_mmx:
-	%assign push_num 0
-	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
-	movd        mm0, [r1-4]            ; mm0[3] = l0
-	punpcklbw   mm0, [r1+r2-4]        ; mm0[7] = l1, mm0[6] = l0
-	lea         r1, [r1+2*r2]
-	movd        mm2, [r1-4]            ; mm2[3] = l2
-	movd        mm4, [r1+r2-4]        ; mm4[3] = l3
-	punpcklbw   mm2, mm4
-	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-
-	psrlq       mm4, 18h
-	psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
-	psrlq       mm0, 8h
-	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-
-	movq        mm1, mm0
-	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
-	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
-	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
-
-	movq        mm2, mm0
-	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
-	movq        mm5, mm2
-	pavgb       mm2, mm0
-
-	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
-	pand        mm5, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm5				; decrease 1 from odd bytes
-
-	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
-
-	psrlq       mm2, 8h
-	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
-
-	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
-	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
-	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
-
-	psrlq       mm4, 20h
-	movd        [r0+12], mm4
-
-	movd        [r0], mm1
-	psrlq       mm1, 10h
-	movd        [r0+4], mm1
-	psrlq       mm1, 10h
-	movd        [r0+8], mm1
-	WELSEMMS
-	ret
-
-
-
-ALIGN 16
-;***********************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	l3 will never been used
-;   destination:
-;	|a |b |c |d |
-;	|e |f |g |h |
-;	|i |a |b |c |
-;	|j |e |f |g |
-
-;   a = (1 + lt + t0)>>1
-;   b = (1 + t0 + t1)>>1
-;   c = (1 + t1 + t2)>>1
-;   d = (1 + t2 + t3)>>1
-
-;   e = (2 + l0 + (lt<<1) + t0)>>2
-;   f = (2 + lt + (t0<<1) + t1)>>2
-;   g = (2 + t0 + (t1<<1) + t2)>>2
-
-;   h = (2 + t1 + (t2<<1) + t3)>>2
-;   i = (2 + lt + (l0<<1) + l1)>>2
-;   j = (2 + l0 + (l1<<1) + l2)>>2
-;
-;   void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredVR_mmx
-WelsI4x4LumaPredVR_mmx:
-	%assign push_num 0
-	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
-	sub         r1, r2
-	movq        mm0, [r1-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
-	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-
-	movd        mm1, [r1+2*r2-4]
-	punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         r1, [r1+2*r2]
-	movq        mm2, [r1+r2-8]        ; mm2[7] = l2
-	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
-	psrlq       mm2, 28h
-	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-
-	movq        mm1, mm0
-	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
-	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
-
-	movq        mm2, mm0
-	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
-	movq        mm3, mm2
-	pavgb       mm2, mm0
-
-	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
-	pand        mm3, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm3				; decrease 1 from odd bytes
-
-	movq        mm3, mm0
-	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
-	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
-	movq        mm2, mm3
-
-	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
-	movd        [r0], mm1
-
-	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
-	movd        [r0+4], mm2
-
-	movq        mm4, mm3
-	psllq       mm4, 20h
-	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
-
-	movq        mm5, mm3
-	psllq       mm5, 28h
-	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
-
-	psllq       mm1, 8h
-	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
-	movd        [r0+8], mm4
-
-	psllq       mm2, 8h
-	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
-	movd        [r0+12], mm5
-	WELSEMMS
-	ret
-
-ALIGN 16
-;***********************************************************************
-;	lt|t0|t1|t2|t3|t4|t5|t6|t7
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	lt,t0,t1,t2,t3 will never been used
-;   destination:
-;	|a |b |c |d |
-;	|b |c |d |e |
-;	|c |d |e |f |
-;	|d |e |f |g |
-
-;   a = (2 + t0 + t2 + (t1<<1))>>2
-;   b = (2 + t1 + t3 + (t2<<1))>>2
-;   c = (2 + t2 + t4 + (t3<<1))>>2
-;   d = (2 + t3 + t5 + (t4<<1))>>2
-
-;   e = (2 + t4 + t6 + (t5<<1))>>2
-;   f = (2 + t5 + t7 + (t6<<1))>>2
-;   g = (2 + t6 + t7 + (t7<<1))>>2
-
-;   [g f e d c b a] --> mov to memory
-;
-;   void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-WelsI4x4LumaPredDDL_mmx:
-	%assign push_num 0
-	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
-	sub         r1, r2
-	movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
-	movq        mm1, mm0
-	movq        mm2, mm0
-
-	movq        mm3, mm0
-	psrlq       mm3, 38h
-	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
-
-	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
-	psrlq       mm2, 8h
-	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
-
-	movq        mm3, mm1
-	pavgb       mm1, mm2
-	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
-	pand        mm3, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm1, mm3				; decrease 1 from odd bytes
-
-	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
-
-	psrlq       mm0, 8h
-	movd        [r0], mm0
-	psrlq       mm0, 8h
-	movd        [r0+4], mm0
-	psrlq       mm0, 8h
-	movd        [r0+8], mm0
-	psrlq       mm0, 8h
-	movd        [r0+12], mm0
-	WELSEMMS
-	ret
-
-
-ALIGN 16
-;***********************************************************************
-;	lt|t0|t1|t2|t3|t4|t5|t6|t7
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	lt,t0,t1,t2,t3 will never been used
-;   destination:
-;	|a |b |c |d |
-;	|e |f |g |h |
-;	|b |c |d |i |
-;	|f |g |h |j |
-
-;   a = (1 + t0 + t1)>>1
-;   b = (1 + t1 + t2)>>1
-;   c = (1 + t2 + t3)>>1
-;   d = (1 + t3 + t4)>>1
-;   i = (1 + t4 + t5)>>1
-
-;   e = (2 + t0 + (t1<<1) + t2)>>2
-;   f = (2 + t1 + (t2<<1) + t3)>>2
-;   g = (2 + t2 + (t3<<1) + t4)>>2
-;   h = (2 + t3 + (t4<<1) + t5)>>2
-;   j = (2 + t4 + (t5<<1) + t6)>>2
-
-;   [i d c b a] + [j h g f e] --> mov to memory
-;
-;   void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredVL_mmx
-WelsI4x4LumaPredVL_mmx:
-	%assign push_num 0
-	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
-	sub         r1, r2
-	movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
-	movq        mm1, mm0
-	movq        mm2, mm0
-
-	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
-	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
-
-	movq        mm3, mm1
-	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
-
-	movq        mm4, mm2
-	pavgb       mm2, mm0
-	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
-	pand        mm4, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm4				; decrease 1 from odd bytes
-
-	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
-
-	movd        [r0], mm3
-	psrlq       mm3, 8h
-	movd        [r0+8], mm3
-
-	movd        [r0+4], mm2
-	psrlq       mm2, 8h
-	movd        [r0+12], mm2
-	WELSEMMS
-	ret
-
-ALIGN 16
-;***********************************************************************
-;
-;   void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsIChromaPredDc_sse2
-WelsIChromaPredDc_sse2:
-	push r3
-	push r4
-	%assign push_num 2
-	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
-	sub         r1, r2
-	movq        mm0, [r1]
-
-	movzx		r3, byte [r1+r2-0x01] ; l1
-	lea         	r1, [r1+2*r2]
-	movzx		r4, byte [r1-0x01]     ; l2
-	add		r3, r4
-	movzx		r4, byte [r1+r2-0x01] ; l3
-	add		r3, r4
-	lea         	r1, [r1+2*r2]
-	movzx		r4, byte [r1-0x01]     ; l4
-	add		r3, r4
-	movd        	mm1, r3d                 ; mm1 = l1+l2+l3+l4
-
-	movzx		r3, byte [r1+r2-0x01] ; l5
-	lea         	r1, [r1+2*r2]
-	movzx		r4, byte [r1-0x01]     ; l6
-	add		r3, r4
-	movzx		r4, byte [r1+r2-0x01] ; l7
-	add		r3, r4
-	lea         	r1, [r1+2*r2]
-	movzx		r4, byte [r1-0x01]     ; l8
-	add		r3, r4
-	movd        	mm2, r3d                 ; mm2 = l5+l6+l7+l8
-
-	movq        mm3, mm0
-	psrlq       mm0, 0x20
-	psllq       mm3, 0x20
-	psrlq       mm3, 0x20
-	pxor		mm4, mm4
-	psadbw		mm0, mm4
-	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
-
-	paddq       mm3, mm1
-	movq        mm1, mm2
-	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-
-	movq        mm4, [mmx_0x02]
-
-	paddq       mm0, mm4
-	psrlq       mm0, 0x02
-
-	paddq       mm2, mm4
-	psrlq       mm2, 0x02
-
-	paddq       mm3, mm4
-	paddq       mm3, mm4
-	psrlq       mm3, 0x03
-
-	paddq       mm1, mm4
-	paddq       mm1, mm4
-	psrlq       mm1, 0x03
-
-	pmuludq     mm0, [mmx_01bytes]
-	pmuludq     mm3, [mmx_01bytes]
-	psllq       mm0, 0x20
-	pxor        mm0, mm3                 ; mm0 = m_up
-
-	pmuludq     mm2, [mmx_01bytes]
-	pmuludq     mm1, [mmx_01bytes]
-	psllq       mm1, 0x20
-	pxor        mm1, mm2                 ; mm2 = m_down
-
-	movq        [r0], mm0
-	movq        [r0+0x08], mm0
-	movq        [r0+0x10], mm0
-	movq        [r0+0x18], mm0
-
-	movq        [r0+0x20], mm1
-	movq        [r0+0x28], mm1
-	movq        [r0+0x30], mm1
-	movq        [r0+0x38], mm1
-
-	pop r4
-	pop r3
-	WELSEMMS
-	ret
-
-
-
-ALIGN 16
-;***********************************************************************
-;
-;   void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredDc_sse2
-WelsI16x16LumaPredDc_sse2:
-	push r3
-	push r4
-	%assign push_num 2
-	LOAD_3_PARA
-	%ifndef X86_32
-	movsx r2, r2d
-	%endif
-	sub         r1, r2
-	movdqa      xmm0, [r1]             ; read one row
-	pxor		xmm1, xmm1
-	psadbw		xmm0, xmm1
-	movdqa      xmm1, xmm0
-	psrldq      xmm1, 0x08
-	pslldq      xmm0, 0x08
-	psrldq      xmm0, 0x08
-	paddw       xmm0, xmm1
-
-	movzx		r3, byte [r1+r2-0x01]
-	movzx		r4, byte [r1+2*r2-0x01]
-	add		r3, r4
-	lea         r1, [r1+r2]
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	add         r3, 0x10
-	movd        xmm1, r3d
-	paddw       xmm0, xmm1
-	psrld       xmm0, 0x05
-	pmuludq     xmm0, [mmx_01bytes]
-	pshufd      xmm0, xmm0, 0
-
-	movdqa      [r0], xmm0
-	movdqa      [r0+0x10], xmm0
-	movdqa      [r0+0x20], xmm0
-	movdqa      [r0+0x30], xmm0
-	movdqa      [r0+0x40], xmm0
-	movdqa      [r0+0x50], xmm0
-	movdqa      [r0+0x60], xmm0
-	movdqa      [r0+0x70], xmm0
-	movdqa      [r0+0x80], xmm0
-	movdqa      [r0+0x90], xmm0
-	movdqa      [r0+0xa0], xmm0
-	movdqa      [r0+0xb0], xmm0
-	movdqa      [r0+0xc0], xmm0
-	movdqa      [r0+0xd0], xmm0
-	movdqa      [r0+0xe0], xmm0
-	movdqa      [r0+0xf0], xmm0
-
-	pop r4
-	pop r3
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
-;                             uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
-;
-;***********************************************************************
-%ifdef X86_ASM
-WELS_EXTERN WelsSmpleSatdThree4x4_sse2
-align 16
-WelsSmpleSatdThree4x4_sse2:
-	push      ebx
-	push      esi
-	push      edi
-	mov       eax,  [esp+24];p_enc
-	mov       ebx,  [esp+28];linesize_enc
-
-	; load source 4x4 samples and Hadamard transform
-    movd      xmm0, [eax]
-    movd      xmm1, [eax+ebx]
-    lea       eax , [eax+2*ebx]
-    movd      xmm2, [eax]
-    movd      xmm3, [eax+ebx]
-    punpckldq xmm0, xmm2
-    punpckldq xmm1, xmm3
-
-    pxor      xmm6, xmm6
-    punpcklbw xmm0, xmm6
-    punpcklbw xmm1, xmm6
-
-    movdqa    xmm2, xmm0
-    paddw     xmm0, xmm1
-    psubw     xmm2, xmm1
-    SSE2_XSawp  qdq, xmm0, xmm2, xmm3
-
-    movdqa    xmm4, xmm0
-    paddw     xmm0, xmm3
-    psubw     xmm4, xmm3
-
-    movdqa    xmm2, xmm0
-    punpcklwd xmm0, xmm4
-    punpckhwd xmm4, xmm2
-
-	SSE2_XSawp  dq,  xmm0, xmm4, xmm3
-	SSE2_XSawp  qdq, xmm0, xmm3, xmm5
-
-    movdqa    xmm7, xmm0
-    paddw     xmm0, xmm5
-    psubw     xmm7, xmm5
-
-	SSE2_XSawp  qdq,  xmm0, xmm7, xmm1
-
-    ; Hadamard transform results are saved in xmm0 and xmm2
-    movdqa    xmm2, xmm0
-    paddw     xmm0, xmm1
-    psubw     xmm2, xmm1
-
-	; load top boundary samples: [a b c d]
-    mov       eax,  [esp+16];p_dec
-	sub		  eax,	[esp+20];linesize_dec
-	movzx     ecx,  byte [eax]
-	movzx     edx,  byte [eax+1]
-	movzx     esi,  byte [eax+2]
-	movzx     edi,  byte [eax+3]
-
-	; get the transform results of top boundary samples: [a b c d]
-	add       edx, ecx ; edx = a + b
-	add       edi, esi ; edi = c + d
-	add       ecx, ecx ; ecx = a + a
-	add       esi, esi ; esi = c + c
-	sub       ecx, edx ; ecx = a + a - a - b = a - b
-	sub       esi, edi ; esi = c + c - c - d = c - d
-	add       edi, edx ; edi = (a + b) + (c + d)
-	add       edx, edx
-	sub       edx, edi ; edx = (a + b) - (c + d)
-	add       esi, ecx ; esi = (a - b) + (c - d)
-	add       ecx, ecx
-	sub       ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
-
-	movdqa    xmm6, xmm0
-	movdqa    xmm7, xmm2
-	movd      xmm5, edi ; store the edi for DC mode
-	pxor      xmm3, xmm3
-	pxor      xmm4, xmm4
-	pinsrw    xmm3, edi, 0
-	pinsrw    xmm3, esi, 4
-	psllw     xmm3, 2
-	pinsrw    xmm4, edx, 0
-	pinsrw    xmm4, ecx, 4
-	psllw     xmm4, 2
-
-	; get the satd of H
-	psubw     xmm0, xmm3
-	psubw     xmm2, xmm4
-
-	WELS_AbsW  xmm0, xmm1
-	WELS_AbsW  xmm2, xmm1
-    paddusw        xmm0, xmm2
-    SUMW_HORIZON1  xmm0, xmm1 ; satd of V is stored in xmm0
-
-	; load left boundary samples: [a b c d]'
-    mov       eax,  [esp+16]
-	mov       ebx,  [esp+20]
-	movzx     ecx,  byte [eax-1]
-	movzx     edx,  byte [eax+ebx-1]
-	lea       eax , [eax+2*ebx]
-	movzx     esi,  byte [eax-1]
-	movzx     edi,  byte [eax+ebx-1]
-
-	; get the transform results of left boundary samples: [a b c d]'
-	add       edx, ecx ; edx = a + b
-	add       edi, esi ; edi = c + d
-	add       ecx, ecx ; ecx = a + a
-	add       esi, esi ; esi = c + c
-	sub       ecx, edx ; ecx = a + a - a - b = a - b
-	sub       esi, edi ; esi = c + c - c - d = c - d
-	add       edi, edx ; edi = (a + b) + (c + d)
-	add       edx, edx
-	sub       edx, edi ; edx = (a + b) - (c + d)
-	add       esi, ecx ; esi = (a - b) + (c - d)
-	add       ecx, ecx
-	sub       ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
-
-	; store the transform results in xmm3
-    movd      xmm3, edi
-	pinsrw    xmm3, edx, 1
-	pinsrw    xmm3, ecx, 2
-	pinsrw    xmm3, esi, 3
-	psllw     xmm3, 2
-
-	; get the satd of V
-	movdqa    xmm2, xmm6
-	movdqa    xmm4, xmm7
-	psubw     xmm2, xmm3
-	WELS_AbsW  xmm2, xmm1
-	WELS_AbsW  xmm4, xmm1
-    paddusw        xmm2, xmm4
-    SUMW_HORIZON1  xmm2, xmm1 ; satd of H is stored in xmm2
-
-	; DC result is stored in xmm1
-	add       edi, 4
-	movd      xmm1, edi
-	paddw     xmm1, xmm5
-	psrlw     xmm1, 3
-	movdqa    xmm5, xmm1
-	psllw     xmm1, 4
-
-    ; get the satd of DC
-    psubw          xmm6, xmm1
-    WELS_AbsW  xmm6, xmm1
-	WELS_AbsW  xmm7, xmm1
-    paddusw        xmm6, xmm7
-    SUMW_HORIZON1  xmm6, xmm1 ; satd of DC is stored in xmm6
-
-    ; comparing order: DC H V
-    mov       edx, [esp+32]
-    movd      eax, xmm6
-    movd      edi, xmm2
-    movd      esi, xmm0
-    and       eax, 0xffff
-    shr       eax, 1
-    and       edi, 0xffff
-    shr       edi, 1
-    and       esi, 0xffff
-    shr       esi, 1
-    add       eax, [esp+40]
-    add       edi, [esp+44]
-    add       esi, [esp+48]
-    cmp       ax, di
-    jg near   not_dc
-    cmp       ax, si
-    jg near   not_dc_h
-
-    ; for DC mode
-    movd      ebx, xmm5
-    imul      ebx, 0x01010101
-    movd	  xmm5, ebx
-	pshufd    xmm5, xmm5, 0
-	movdqa    [edx], xmm5
-	mov       ebx, [esp+36]
-	mov       dword [ebx], 0x02
-	pop       edi
-    pop       esi
-    pop       ebx
-    ret
-
-not_dc:
-    cmp       di, si
-    jg near   not_dc_h
-
-    ; for H mode
-    SSE_DB_1_2REG  xmm6, xmm7
-    mov       eax,  [esp+16]
-	mov       ebx,  [esp+20]
-    movzx     ecx,  byte [eax-1]
-	movd      xmm0, ecx
-    pmuludq   xmm0, xmm6
-
-	movzx     ecx,  byte [eax+ebx-1]
-	movd      xmm1, ecx
-    pmuludq   xmm1, xmm6
-%if 1
-    punpckldq xmm0, xmm1
-%else
-	unpcklps  xmm0,	xmm1
-%endif
-	lea       eax,	[eax+ebx*2]
-	movzx	  ecx,	byte [eax-1]
-	movd	  xmm2,	ecx
-    pmuludq   xmm2, xmm6
-
-	movzx	  ecx,	byte [eax+ebx-1]
-	movd	  xmm3,	ecx
-    pmuludq   xmm3, xmm6
-%if 1
-    punpckldq  xmm2, xmm3
-    punpcklqdq xmm0, xmm2
-%else
-	unpcklps  xmm2,	xmm3
-	unpcklpd  xmm0,	xmm2
-%endif
-	movdqa	  [edx],xmm0
-
-	mov       eax, edi
-    mov       ebx, [esp+36]
-	mov       dword [ebx], 0x01
-
-    pop       edi
-    pop       esi
-    pop       ebx
-    ret
-not_dc_h:
-    ; for V mode
-    mov       eax,  [esp+16]
-    sub		  eax,	[esp+20]
-	movd	  xmm0,	[eax]
-	pshufd	  xmm0,	xmm0, 0
-	movdqa	  [edx],xmm0
-
-	mov       eax, esi
-    mov       ebx, [esp+36]
-	mov       dword [ebx], 0x00
-
-    pop       edi
-    pop       esi
-    pop       ebx
-    ret
-%endif
-
--- a/codec/processing/src/asm/sad.asm
+++ /dev/null
@@ -1,220 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  sad.asm
-;*
-;*  Abstract
-;*      WelsSampleSad8x8_sse21
-;*
-;*  History
-;*      8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and    %1,  0x1f|(%3>>1)
-cmp    %1,  (32-%2)|(%3>>1)
-%endmacro
-
-%macro SSE2_GetSad8x4 0
-	movq   xmm0,   [r0]
-	movq   xmm1,   [r0+r1]
-	lea    r0,     [r0+2*r1]
-	movhps xmm0,   [r0]
-	movhps xmm1,   [r0+r1]
-
-	movq   xmm2,   [r2]
-	movq   xmm3,   [r2+r3]
-	lea    r2,     [r2+2*r3]
-	movhps xmm2,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm0,   xmm2
-	psadbw xmm1,   xmm3
-	paddw  xmm6,   xmm0
-	paddw  xmm6,   xmm1
-%endmacro
-
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-WELS_EXTERN WelsSampleSad8x8_sse21
-WelsSampleSad8x8_sse21:
-    ;mov    ecx,    [esp+12]
-	;mov    edx,    ecx
-    ;CACHE_SPLIT_CHECK edx, 8, 64
-	;jle    near   .pixel_sad_8x8_nsplit
-	;push   ebx
-	;push   edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-
-	%assign  push_num 0
-	mov		r2,  arg3
-	push	r2
-	CACHE_SPLIT_CHECK r2, 8, 64
-	jle    near   .pixel_sad_8x8_nsplit
-	pop		r2
-%ifdef X86_32
-	push	r3
-	push	r4
-	push	r5
-%endif
-	%assign  push_num 3
-	mov		r0,  arg1
-	mov		r1,  arg2
-	SIGN_EXTENTION r1, r1d
-    pxor   xmm7,   xmm7
-
-    ;ecx r2, edx r4, edi r5
-
-    mov    r5,    r2
-    and    r5,    0x07
-    sub    r2,    r5
-    mov    r4,    8
-    sub    r4,    r5
-
-    shl    r5,    3
-    shl    r4,    3
-    movd   xmm5,   r5d
-    movd   xmm6,   r4d
-	mov    r5,    8
-	add    r5,    r2
-    mov    r3,    arg4
-	SIGN_EXTENTION r3, r3d
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-    movhlps    xmm0, xmm7
-	paddw      xmm0, xmm7
-	movd       retrd,  xmm0
-%ifdef X86_32
-	pop	 r5
-	pop	 r4
-	pop	 r3
-%endif
-	jmp        .return
-
-.pixel_sad_8x8_nsplit:
-    ;push   ebx
-    ;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    edx,    [esp+20]
-
-	pop r2
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	pxor   xmm6,   xmm6
-	SSE2_GetSad8x4
-    lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-    SSE2_GetSad8x4
-    movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       retrd,  xmm0
-	LOAD_4_PARA_POP
-.return:
-	ret
\ No newline at end of file
--- a/codec/processing/targets.mk
+++ b/codec/processing/targets.mk
@@ -23,8 +23,6 @@
 PROCESSING_ASM_SRCS=\
 	$(PROCESSING_SRCDIR)/./src/asm/denoisefilter.asm\
 	$(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.asm\
-	$(PROCESSING_SRCDIR)/./src/asm/intra_pred.asm\
-	$(PROCESSING_SRCDIR)/./src/asm/sad.asm\
 	$(PROCESSING_SRCDIR)/./src/asm/vaa.asm\
 
 PROCESSING_OBJS += $(PROCESSING_ASM_SRCS:.asm=.o)