shithub: openh264

Download patch

ref: 3761901ed4785ce0c85557362c857f506b4072ce
parent: ac1dbf5aa00a52df7a195e4ba0989ca7c5650355
author: Martin Storsjö <[email protected]>
date: Tue Jan 28 08:42:22 EST 2014

Remove sad.asm from the processing lib, move satd_sad from the encoder to the common lib

sad.asm as used in processing is an exact subset of the
code in satd_sad.asm in the encoder.

--- a/codec/build/win32/enc/WelsEncCore.vcproj
+++ b/codec/build/win32/enc/WelsEncCore.vcproj
@@ -2106,7 +2106,7 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\encoder\core\asm\satd_sad.asm"
+				RelativePath="..\..\..\common\satd_sad.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
--- /dev/null
+++ b/codec/common/satd_sad.asm
@@ -1,0 +1,2344 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  satd_sad.asm
+;*
+;*  Abstract
+;*      WelsSampleSatd4x4_sse2
+;*      WelsSampleSatd8x8_sse2
+;*      WelsSampleSatd16x8_sse2
+;*      WelsSampleSatd8x16_sse2
+;*      WelsSampleSatd16x16_sse2
+;*
+;*      WelsSampleSad16x8_sse2
+;*      WelsSampleSad16x16_sse2
+;*
+;*  History
+;*      8/5/2009 Created
+;*     24/9/2009 modified
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Data
+;***********************************************************************
+SECTION .rodata align=16
+
+align 16
+HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
+align 16
+HSumSubDW1:   dw 1,-1,1,-1,1,-1,1,-1
+align 16
+PDW1:  dw 1,1,1,1,1,1,1,1
+align 16
+PDQ2:  dw 2,0,0,0,2,0,0,0
+align 16
+HSwapSumSubDB1:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 BEGIN
+;
+;***********************************************************************
+;***********************************************************************
+; Helper macros shared by the SSE2 SATD routines below.
+;***********************************************************************
+
+; MMX_DW_1_2REG %1, %2: set %1 to eight words of 1 and %2 to eight words
+; of -1 (pcmpeqw makes all bits set = -1 per word; 0 - (-1) = 1).
+%macro MMX_DW_1_2REG 2
+      pxor %1, %1
+      pcmpeqw %2, %2
+      psubw %1, %2
+%endmacro
+
+; SSE2_SumWHorizon1 %1, %2: horizontally add the eight unsigned words of
+; %1 into its low word by successive shift-and-saturating-add; %2 scratch.
+%macro  SSE2_SumWHorizon1 2
+	movdqa      %2, %1
+	psrldq      %2, 8
+	paddusw     %1, %2
+	movdqa      %2, %1
+	psrldq      %2, 4
+	paddusw     %1, %2
+	movdqa      %2, %1
+	psrldq      %2, 2
+	paddusw     %1, %2
+%endmacro
+
+; Two Hadamard butterfly stages over four row registers, built from four
+; SSE2_SumSub steps (SSE2_SumSub comes from asm_inc.asm); %5 is scratch.
+%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
+   SSE2_SumSub %1, %2, %5
+   SSE2_SumSub %3, %4, %5
+   SSE2_SumSub %2, %4, %5
+   SSE2_SumSub %1, %3, %5
+%endmacro
+
+; Accumulate |%1|+|%2|+|%4|+|%5| into %7 with saturating word adds;
+; %3/%6 are scratch registers for WELS_AbsW (asm_inc.asm).
+%macro SSE2_SumAbs4 7
+	WELS_AbsW %1, %3
+	WELS_AbsW %2, %3
+	WELS_AbsW %4, %6
+	WELS_AbsW %5, %6
+	paddusw       %1, %2
+	paddusw       %4, %5
+	paddusw       %7, %1
+	paddusw       %7, %4
+%endmacro
+
+; Sum the eight words of %1 into its low dword.  %2 is scratch; %3 must
+; be an all-zero register (punpcklwd with it widens words to dwords).
+%macro  SSE2_SumWHorizon 3
+	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
+	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
+	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
+	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
+	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
+%endmacro
+
+; SATD of one 8x8 block: r0/r1 = sample1/stride1, r2/r3 = sample2/stride2.
+; Accumulates word sums into xmm6; caller must preset xmm6 (accumulator)
+; and xmm7 (zero, used by SSE2_LoadDiff8P from asm_inc.asm).  On exit
+; r0/r2 point at row 6 of the block (callers advance 2 more rows to move
+; to the next 8x8 band).
+%macro SSE2_GetSatd8x8 0
+	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+	lea                 r0, [r0+2*r1]
+	lea                 r2, [r2+2*r3]
+	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
+	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+
+	lea					r0,    [r0+2*r1]
+    lea					r2,    [r2+2*r3]
+	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+	lea                 r0, [r0+2*r1]
+	lea                 r2, [r2+2*r3]
+	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
+	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+%endmacro
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+; 4x4 SATD.  After LOAD_4_PARA (asm_inc.asm): r0 = pSample1,
+; r1 = iStride1, r2 = pSample2, r3 = iStride2 -- names inferred from the
+; prototype above; confirm against the C declaration.  Result in retrd.
+WELS_EXTERN WelsSampleSatd4x4_sse2
+align 16
+WelsSampleSatd4x4_sse2:
+	;push      ebx
+	;mov       eax,  [esp+8]
+	;mov       ebx,  [esp+12]
+	;mov       ecx,  [esp+16]
+	;mov       edx,  [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	; gather the four 4-byte rows of block 1: xmm0 = rows 0,2  xmm1 = rows 1,3
+    movd      xmm0, [r0]
+    movd      xmm1, [r0+r1]
+    lea       r0 , [r0+2*r1]
+    movd      xmm2, [r0]
+    movd      xmm3, [r0+r1]
+    punpckldq xmm0, xmm2
+    punpckldq xmm1, xmm3
+
+	; same packing for block 2 into xmm4/xmm5
+    movd      xmm4, [r2]
+    movd      xmm5, [r2+r3]
+    lea       r2 , [r2+2*r3]
+    movd      xmm6, [r2]
+    movd      xmm7, [r2+r3]
+    punpckldq xmm4, xmm6
+    punpckldq xmm5, xmm7
+
+	; widen bytes to words (xmm6 doubles as the zero register here)
+    pxor      xmm6, xmm6
+    punpcklbw xmm0, xmm6
+    punpcklbw xmm1, xmm6
+    punpcklbw xmm4, xmm6
+    punpcklbw xmm5, xmm6
+
+	; 16-bit residuals of the 4x4 block
+    psubw     xmm0, xmm4
+    psubw     xmm1, xmm5
+
+	; Hadamard transform: butterflies interleaved with SSE2_XSawp shuffles
+	; (asm_inc.asm) that reorder lanes between stages
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+    SSE2_XSawp qdq, xmm0, xmm2, xmm3
+
+    movdqa     xmm4, xmm0
+    paddw      xmm0, xmm3
+    psubw      xmm4, xmm3
+
+    movdqa         xmm2, xmm0
+    punpcklwd      xmm0, xmm4
+    punpckhwd      xmm4, xmm2
+
+	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
+	SSE2_XSawp     qdq, xmm0, xmm3, xmm5
+
+    movdqa         xmm7, xmm0
+    paddw          xmm0, xmm5
+    psubw          xmm7, xmm5
+
+	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
+
+    movdqa         xmm2, xmm0
+    paddw          xmm0, xmm1
+    psubw          xmm2, xmm1
+
+	; accumulate absolute transform coefficients into xmm6 (zeroed above)
+    WELS_AbsW  xmm0, xmm3
+    paddusw        xmm6, xmm0
+	WELS_AbsW  xmm2, xmm4
+    paddusw        xmm6, xmm2
+    SSE2_SumWHorizon1  xmm6, xmm4
+	movd           retrd,  xmm6
+    and            retrd,  0xffff       ; the horizontal sum lives in the low word
+    shr            retrd,  1            ; SATD = coefficient sum / 2
+	LOAD_4_PARA_POP
+	ret
+
+ ;***********************************************************************
+ ;
+ ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
+ ;
+ ;***********************************************************************
+ WELS_EXTERN WelsSampleSatd8x8_sse2
+align 16
+ WelsSampleSatd8x8_sse2:
+	 ;push   ebx
+	 ;mov    eax,    [esp+8]
+	 ;mov    ebx,    [esp+12]
+	 ;mov    ecx,    [esp+16]
+	 ;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm6,   xmm6
+    pxor   xmm7,   xmm7
+    SSE2_GetSatd8x8
+    psrlw   xmm6,  1
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	movd    retrd,   xmm6
+	LOAD_4_PARA_POP
+	ret
+
+ ;***********************************************************************
+ ;
+ ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
+ ;
+ ;***********************************************************************
+ WELS_EXTERN WelsSampleSatd8x16_sse2
+align 16
+ WelsSampleSatd8x16_sse2:
+	 ;push   ebx
+	 ;mov    eax,    [esp+8]
+	 ;mov    ebx,    [esp+12]
+	 ;mov    ecx,    [esp+16]
+	 ;mov    edx,    [esp+20]
+
+	 %assign  push_num 0
+	 LOAD_4_PARA
+	 SIGN_EXTENTION r1, r1d
+	 SIGN_EXTENTION r3, r3d
+	 pxor   xmm6,   xmm6
+     pxor   xmm7,   xmm7
+
+	 SSE2_GetSatd8x8
+     lea    r0,    [r0+2*r1]
+     lea    r2,    [r2+2*r3]
+	 SSE2_GetSatd8x8
+
+	 psrlw   xmm6,  1
+	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	 movd    retrd,   xmm6
+	 LOAD_4_PARA_POP
+	 ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+; 16x8 SATD: left 8x8 half, then restore the row pointers and do the
+; right 8x8 half at offset +8.
+WELS_EXTERN WelsSampleSatd16x8_sse2
+align 16
+WelsSampleSatd16x8_sse2:
+	;push   ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	push r0                 ; save block-start pointers; SSE2_GetSatd8x8 advances them
+	push r2
+	pxor   xmm6,   xmm6
+    pxor   xmm7,   xmm7
+
+	SSE2_GetSatd8x8
+
+	pop r2                  ; back to row 0, column 0
+	pop r0
+	;mov    eax,    [esp+8]
+    ;mov    ecx,    [esp+16]
+    add    r0,    8         ; right half
+    add    r2,    8
+	SSE2_GetSatd8x8
+
+	psrlw   xmm6,  1
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	movd    retrd,   xmm6
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+; 16x16 SATD: four 8x8 quadrants (left column top+bottom, then pointers
+; restored and the right column top+bottom at offset +8).
+WELS_EXTERN WelsSampleSatd16x16_sse2
+align 16
+WelsSampleSatd16x16_sse2:
+	;push   ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	push r0                 ; save block-start pointers for the right half
+	push r2
+	pxor   xmm6,   xmm6
+    pxor   xmm7,   xmm7
+
+	SSE2_GetSatd8x8
+	lea    r0,    [r0+2*r1] ; macro leaves pointers at row 6; step to row 8
+	lea    r2,    [r2+2*r3]
+	SSE2_GetSatd8x8
+
+	pop r2
+	pop r0
+	;mov    eax,    [esp+8]
+	;mov    ecx,    [esp+16]
+	add    r0,    8
+	add    r2,    8
+
+	SSE2_GetSatd8x8
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	SSE2_GetSatd8x8
+
+ ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
+    psrlw   xmm6,  1
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	movd    retrd,   xmm6
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 BEGIN
+;
+;***********************************************************************
+
+; Horizontal sum/sub transform of 16 boundary pixels (in %1) down to 8
+; words, scaled by 4 (psllw 2).  Partial DC sums accumulate into xmm4.
+; Caller preloads xmm5 = HSumSubDB1, xmm6 = HSumSubDW1, xmm7 = PDW1.
+%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
+	pmaddubsw    %1, xmm5
+	movdqa       %2, %1
+	pmaddwd      %1, xmm7
+	pmaddwd      %2, xmm6
+	movdqa       %3, %1
+	punpckldq    %1, %2
+	punpckhdq    %2, %3
+	movdqa       %3, %1
+	punpcklqdq   %1, %2
+	punpckhqdq   %3, %2
+	paddd        xmm4, %1 ;for dc
+	paddd        xmm4, %3 ;for dc
+	packssdw     %1, %3
+	psllw        %1, 2
+%endmacro
+; Chroma variant of the above: instead of folding DC into xmm4 it keeps
+; the raw dword sums in %4 so the caller can form the four chroma DC values.
+%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
+	pmaddubsw    %1, xmm5
+	movdqa       %2, %1
+	pmaddwd      %1, xmm7
+	pmaddwd      %2, xmm6
+	movdqa       %3, %1
+	punpckldq    %1, %2
+	punpckhdq    %2, %3
+	movdqa       %3, %1
+	punpcklqdq   %1, %2
+	punpckhqdq   %3, %2
+;    paddd        xmm4, %1 ;for dc
+;	 paddd        xmm4, %3 ;for dc
+	movdqa       %4, %1
+	punpcklqdq   %4, %3
+	packssdw     %1, %3
+	psllw        %1, 2
+%endmacro
+
+; Load and Hadamard-transform two 8x2 strips of reconstructed pixels at
+; eax (stride ebx); advances eax by 4 rows.  Transformed rows come out in
+; xmm7,xmm1,xmm3,xmm2 (see pOut note below); no second transpose needed.
+%macro SSE41_GetX38x4SatdDec 0
+	pxor        xmm7,   xmm7
+	movq        xmm0,   [eax]
+	movq        xmm1,   [eax+ebx]
+	lea         eax,    [eax+2*ebx]
+	movq        xmm2,   [eax]
+	movq        xmm3,   [eax+ebx]
+	lea         eax,    [eax+2*ebx]
+	punpcklbw   xmm0,   xmm7
+	punpcklbw   xmm1,   xmm7
+	punpcklbw   xmm2,   xmm7
+	punpcklbw   xmm3,   xmm7
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
+	;doesn't need another transpose
+%endmacro
+; Vertical-mode cost: compare the transformed block (xmm7/xmm1/xmm3/xmm2)
+; against the stored V-prediction transform rows at [esi+%2 ...] and
+; accumulate the absolute differences into xmm4.
+%macro SSE41_GetX38x4SatdV 2
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2],   0
+	pinsrw      xmm0,   word[esi+%2+8], 4
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2+2],  0
+	pinsrw      xmm0,   word[esi+%2+10], 4
+	psubsw      xmm0,   xmm1
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2+4],  0
+	pinsrw      xmm0,   word[esi+%2+12], 4
+	psubsw      xmm0,   xmm3
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2+6],  0
+	pinsrw      xmm0,   word[esi+%2+14], 4
+	psubsw      xmm0,   xmm2
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+%endmacro
+; Horizontal-mode cost: DC row diff against [esi+%3+8*%1] into xmm5, plus
+; the shared AC magnitudes (xmm1..xmm3), which it also leaves summed in
+; xmm2 for reuse by the DC-mode macros below.
+%macro SSE41_GetX38x4SatdH  3
+	movq        xmm0,   [esi+%3+8*%1]
+	punpcklqdq  xmm0,   xmm0
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm5,   xmm0
+	pabsw       xmm1,   xmm1
+	pabsw       xmm2,   xmm2
+	pabsw       xmm3,   xmm3
+	paddw       xmm2,   xmm1;for DC
+	paddw       xmm2,   xmm3;for DC
+	paddw       xmm5,   xmm2
+%endmacro
+; I16x16 DC-mode cost: the scaled DC value is kept in mm4 (set by the
+; caller); |DC row diff| plus the shared AC term (xmm2) goes into xmm6.
+%macro SSE41_I16X16GetX38x4SatdDC 0
+	pxor        xmm0,   xmm0
+	movq2dq     xmm0,   mm4
+	punpcklqdq  xmm0,   xmm0
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm6,   xmm0
+	paddw       xmm6,   xmm2
+%endmacro
+; Chroma DC-mode cost: DC rows were precomputed at [esi+32+16*%1].
+; NOTE: shl %1,4 permanently scales the caller's loop counter; the loop
+; bound below accounts for this.
+%macro SSE41_ChromaGetX38x4SatdDC 1
+	shl         %1,     4
+	movdqa      xmm0,   [esi+32+%1]
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm6,   xmm0
+	paddw       xmm6,   xmm2
+%endmacro
+; One 8x4 strip of the I16x16 three-mode SATD (V->xmm4, H->xmm5, DC->xmm6).
+%macro SSE41_I16x16GetX38x4Satd 2
+	SSE41_GetX38x4SatdDec
+	SSE41_GetX38x4SatdV   %1, %2
+	SSE41_GetX38x4SatdH   %1, %2, 32
+	SSE41_I16X16GetX38x4SatdDC
+%endmacro
+; One 8x4 strip of the chroma three-mode SATD.
+%macro SSE41_ChromaGetX38x4Satd 2
+	SSE41_GetX38x4SatdDec
+	SSE41_GetX38x4SatdV   %1, %2
+	SSE41_GetX38x4SatdH   %1, %2, 16
+	SSE41_ChromaGetX38x4SatdDC %1
+%endmacro
+; Horizontal sum of eight words: %1 in/out, %2 = words of 1 (pmaddwd
+; widens to dwords), %3 scratch.  Result lands in the low dword of %1.
+%macro SSE41_HSum8W 3
+	pmaddwd     %1, %2
+	movhlps     %3, %1
+	paddd       %1, %3
+	pshuflw     %3, %1,0Eh
+	paddd       %1, %3
+%endmacro
+
+
+%ifdef X86_32
+; Cost all three I16x16 luma prediction modes (V, H, DC) by SATD and pick
+; the cheapest; X86_32-only (cdecl).  Observed stack layout after the
+; three pushes:
+;   [esp+16] = reconstructed-pixel ptr (ecx), [esp+20] = its stride (edx),
+;   [esp+24] = source ptr (eax),              [esp+28] = source stride (ebx),
+;   [esp+32] = best-mode output ptr,          [esp+36] = lambda,
+;   [esp+40] = temp satd buffer (esi).
+; NOTE(review): argument names inferred from usage; confirm against the
+; C prototype.  Returns the winning cost in eax.
+WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
+WelsIntra16x16Combined3Satd_sse41:
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]
+	mov    eax,    [esp+24]
+	mov    ebx,    [esp+28]
+	mov    esi,    [esp+40] ;temp_satd
+	pxor        xmm4,   xmm4
+	movdqa      xmm5,   [HSumSubDB1]
+	movdqa      xmm6,   [HSumSubDW1]
+	movdqa      xmm7,   [PDW1]
+	; transform the 16 above-neighbour pixels -> V-prediction rows at [esi+0..31]
+	sub         ecx,    edx
+	movdqu 		xmm0,   [ecx]
+	movhlps		xmm1,   xmm0
+	punpcklqdq  xmm0,   xmm0
+	punpcklqdq  xmm1,   xmm1
+	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+	movdqa      [esi],  xmm0 ;V
+	movdqa      [esi+16], xmm1
+	; gather the 16 left-neighbour pixels (column at ecx-1) into xmm0
+	add         ecx,    edx
+	pinsrb      xmm0,   byte[ecx-1], 0
+	pinsrb      xmm0,   byte[ecx+edx-1], 1
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     2
+	pinsrb      xmm0,   byte[ecx+edx-1], 3
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     4
+	pinsrb      xmm0,   byte[ecx+edx-1], 5
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     6
+	pinsrb      xmm0,   byte[ecx+edx-1], 7
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     8
+	pinsrb      xmm0,   byte[ecx+edx-1], 9
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     10
+	pinsrb      xmm0,   byte[ecx+edx-1], 11
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     12
+	pinsrb      xmm0,   byte[ecx+edx-1], 13
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     14
+	pinsrb      xmm0,   byte[ecx+edx-1], 15
+	; transform them -> H-prediction rows at [esi+32..63]
+	movhlps		xmm1,   xmm0
+	punpcklqdq  xmm0,   xmm0
+	punpcklqdq  xmm1,   xmm1
+	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+	movdqa      [esi+32], xmm0 ;H
+	movdqa      [esi+48], xmm1
+	; DC = ((sum of 32 neighbours + 16) >> 5) << 4, kept in mm4
+	movd        ecx,    xmm4 ;dc
+	add         ecx,    16   ;(sum+16)
+	shr         ecx,    5    ;((sum+16)>>5)
+	shl         ecx,    4    ;
+	movd        mm4,    ecx  ; mm4 copy DC
+	; xmm4/xmm5/xmm6 now become the V/H/DC cost accumulators
+	pxor        xmm4,   xmm4 ;V
+	pxor        xmm5,   xmm5 ;H
+	pxor        xmm6,   xmm6 ;DC
+	mov         ecx,    0
+	mov         edi,    0
+	; ecx = 8x4 strip index (0..3), edi = 0 for the left 8 columns, 16 for the right
+.loop16x16_get_satd:
+.loopStart1:
+	SSE41_I16x16GetX38x4Satd ecx, edi
+	inc          ecx
+	cmp         ecx, 4
+	jl          .loopStart1
+	cmp         edi, 16
+	je          .loop16x16_get_satd_end
+	mov         eax, [esp+24]
+	add         eax, 8
+	mov         ecx, 0
+	add         edi, 16
+	jmp         .loop16x16_get_satd
+ .loop16x16_get_satd_end:
+	MMX_DW_1_2REG    xmm0, xmm1
+	psrlw       xmm4, 1 ;/2
+	psrlw       xmm5, 1 ;/2
+	psrlw       xmm6, 1 ;/2
+	SSE41_HSum8W     xmm4, xmm0, xmm1
+	SSE41_HSum8W     xmm5, xmm0, xmm1
+	SSE41_HSum8W     xmm6, xmm0, xmm1
+
+	; comparing order: DC H V
+	movd      ebx, xmm6 ;DC
+	movd      edi, xmm5 ;H
+	movd      ecx, xmm4 ;V
+	mov      edx, [esp+36]
+	shl       edx, 1
+	add       edi, edx      ; lambda*2 mode-bit penalty on H and DC (V is cheapest to code)
+	add       ebx, edx
+	mov       edx, [esp+32]
+	cmp       ebx, edi
+	jge near   not_dc_16x16
+	cmp        ebx, ecx
+	jge near   not_dc_h_16x16
+
+	; for DC mode
+	mov       dword[edx], 2;I16_PRED_DC
+	mov       eax, ebx
+	jmp near return_satd_intra_16x16_x3
+not_dc_16x16:
+	; for H mode
+	cmp       edi, ecx
+	jge near   not_dc_h_16x16
+	mov       dword[edx], 1;I16_PRED_H
+	mov       eax, edi
+	jmp near return_satd_intra_16x16_x3
+not_dc_h_16x16:
+	; for V mode
+	mov       dword[edx], 0;I16_PRED_V
+	mov       eax, ecx
+return_satd_intra_16x16_x3:
+	WELSEMMS
+	pop         edi
+	pop         esi
+	pop         ebx
+ret
+
+; Three-mode (V/H/DC) SATD for one 8x8 chroma plane at ecx (stride edx),
+; source at eax (stride ebx), temp buffer at esi.  Builds V rows at
+; [esi+0], H rows at [esi+16], the four chroma DC rows at [esi+32..63],
+; then accumulates costs into xmm4 (V), xmm5 (H), xmm6 (DC).
+%macro SSE41_ChromaGetX38x8Satd 0
+	movdqa      xmm5,   [HSumSubDB1]
+	movdqa      xmm6,   [HSumSubDW1]
+	movdqa      xmm7,   [PDW1]
+	; above neighbours -> V-prediction transform
+	sub         ecx,    edx
+	movq 		xmm0,   [ecx]
+	punpcklqdq  xmm0,   xmm0
+	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+	movdqa      [esi],  xmm0 ;V
+	; left neighbours (column ecx-1) -> H-prediction transform
+	add         ecx,    edx
+	pinsrb      xmm0,   byte[ecx-1], 0
+	pinsrb      xmm0,   byte[ecx+edx-1], 1
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     2
+	pinsrb      xmm0,   byte[ecx+edx-1], 3
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     4
+	pinsrb      xmm0,   byte[ecx+edx-1], 5
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     6
+	pinsrb      xmm0,   byte[ecx+edx-1], 7
+	punpcklqdq  xmm0,   xmm0
+	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+	movdqa      [esi+16], xmm0 ;H
+; chroma DC: corner blocks use (sum1+sum2+4)>>3, edge blocks (sum+2)>>2
+;(sum+2)>>2
+	movdqa      xmm6,   [PDQ2]
+	movdqa      xmm5,   xmm4
+	punpckhqdq  xmm5,   xmm1
+	paddd       xmm5,   xmm6
+	psrld       xmm5,   2
+;(sum1+sum2+4)>>3
+	paddd       xmm6,   xmm6
+	paddd       xmm4,   xmm1
+	paddd       xmm4,   xmm6
+	psrld       xmm4,   3
+;satd *16
+	pslld       xmm5,   4
+	pslld       xmm4,   4
+;temp satd
+	movdqa      xmm6,   xmm4
+	punpcklqdq  xmm4,   xmm5
+	psllq       xmm4,   32
+	psrlq       xmm4,   32
+	movdqa      [esi+32], xmm4
+	punpckhqdq  xmm5,   xmm6
+	psllq       xmm5,   32
+	psrlq       xmm5,   32
+	movdqa      [esi+48], xmm5
+
+	pxor        xmm4,   xmm4 ;V
+	pxor        xmm5,   xmm5 ;H
+	pxor        xmm6,   xmm6 ;DC
+	mov         ecx,    0
+	; two 8x4 strips; note SSE41_ChromaGetX38x4SatdDC shifts ecx left by 4,
+	; so after the first strip ecx is 0<<4 = 0, then inc makes it 1
+loop_chroma_satdx3_cb_cr:
+	SSE41_ChromaGetX38x4Satd ecx, 0
+	inc             ecx
+	cmp             ecx, 2
+	jl              loop_chroma_satdx3_cb_cr
+%endmacro
+
+; Spill an xmm register into two mmx registers (low/high qwords).
+%macro SSEReg2MMX 3
+	movdq2q     %2, %1
+	movhlps     %1, %1
+	movdq2q     %3, %1
+%endmacro
+; Rebuild an xmm register from two mmx registers; %2 is scratch.
+%macro MMXReg2SSE 4
+	movq2dq     %1, %3
+	movq2dq     %2, %4
+	punpcklqdq  %1, %2
+%endmacro
+;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
+
+; Cost the three chroma 8x8 prediction modes (DC, H, V) by SATD over both
+; chroma planes and pick the cheapest; X86_32-only (cdecl).  Plane 1 args
+; mirror WelsIntra16x16Combined3Satd_sse41; [esp+44]/[esp+48] supply the
+; second plane's recon/source pointers.  Accumulators are parked in mmx
+; registers between the two planes.  NOTE(review): argument names inferred
+; from usage; confirm against the C prototype.
+WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
+WelsIntraChroma8x8Combined3Satd_sse41:
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]
+	mov    eax,    [esp+24]
+	mov    ebx,    [esp+28]
+	mov    esi,    [esp+40] ;temp_satd
+	xor    edi,    edi      ; edi = plane counter (0 = first, 1 = second)
+loop_chroma_satdx3:
+	SSE41_ChromaGetX38x8Satd
+	cmp             edi, 1
+	je              loop_chroma_satdx3end
+	inc             edi
+	; save this plane's V/H/DC accumulators and switch to the second plane
+	SSEReg2MMX  xmm4, mm0,mm1
+	SSEReg2MMX  xmm5, mm2,mm3
+	SSEReg2MMX  xmm6, mm5,mm6
+	mov         ecx,  [esp+44]
+	mov         eax,  [esp+48]
+	jmp         loop_chroma_satdx3
+loop_chroma_satdx3end:
+	; add the first plane's accumulators back in
+	MMXReg2SSE  xmm0, xmm3, mm0, mm1
+	MMXReg2SSE  xmm1, xmm3, mm2, mm3
+	MMXReg2SSE  xmm2, xmm3, mm5, mm6
+
+	paddw       xmm4, xmm0
+	paddw       xmm5, xmm1
+	paddw       xmm6, xmm2
+
+	MMX_DW_1_2REG    xmm0, xmm1
+	psrlw       xmm4, 1 ;/2
+	psrlw       xmm5, 1 ;/2
+	psrlw       xmm6, 1 ;/2
+	SSE41_HSum8W     xmm4, xmm0, xmm1
+	SSE41_HSum8W     xmm5, xmm0, xmm1
+	SSE41_HSum8W     xmm6, xmm0, xmm1
+	; comparing order: DC H V
+	movd      ebx, xmm6 ;DC
+	movd      edi, xmm5 ;H
+	movd      ecx, xmm4 ;V
+	mov       edx, [esp+36]
+	shl       edx, 1
+	add       edi, edx      ; lambda*2 penalty on H and V (chroma DC is mode 0, cheapest)
+	add       ecx, edx
+	mov       edx, [esp+32]
+	cmp       ebx, edi
+	jge near   not_dc_8x8
+	cmp        ebx, ecx
+	jge near   not_dc_h_8x8
+
+	; for DC mode
+	mov       dword[edx], 0;I8_PRED_DC
+	mov       eax, ebx
+	jmp near return_satd_intra_8x8_x3
+not_dc_8x8:
+	; for H mode
+	cmp       edi, ecx
+	jge near   not_dc_h_8x8
+	mov       dword[edx], 1;I8_PRED_H
+	mov       eax, edi
+	jmp near return_satd_intra_8x8_x3
+not_dc_h_8x8:
+	; for V mode
+	mov       dword[edx], 2;I8_PRED_V
+	mov       eax, ecx
+return_satd_intra_8x8_x3:
+	WELSEMMS
+	pop         edi
+	pop         esi
+	pop         ebx
+ret
+
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 END
+;
+;***********************************************************************
+; One row of the three-mode SAD: %1 = temp-buffer slot holding the row's
+; left pixel (broadcast to 16 bytes via pshufb with xmm1 = 0 and written
+; back as the H-prediction row), %2 = source row.  Accumulates
+; xmm4 += SAD(vs xmm7 = DC row), xmm2 += SAD(vs xmm5 = top/V row),
+; xmm3 += SAD(vs the H row).
+%macro SSSE3_Get16BSadHVDC 2
+  movd        xmm6,%1
+  pshufb      xmm6,xmm1
+  movdqa      %1,  xmm6
+  movdqa      xmm0,%2
+  psadbw      xmm0,xmm7
+  paddw       xmm4,xmm0
+  movdqa      xmm0,%2
+  psadbw      xmm0,xmm5
+  paddw       xmm2,xmm0
+  psadbw      xmm6,%2
+  paddw       xmm3,xmm6
+%endmacro
+; Load byte %1 zero-extended into %2, store it to temp slot %3, and add
+; it to the running DC sum %4.
+%macro WelsAddDCValue 4
+    movzx   %2, byte %1
+    mov    %3, %2
+    add     %4, %2
+%endmacro
+
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 BEGIN
+;
+;***********************************************************************
+; Cost the three I16x16 modes (V, H, DC) by SAD and pick the cheapest;
+; X86_32-only (cdecl).  Stack layout matches the SATD variant above:
+; [esp+16/20] recon ptr + stride, [esp+24/28] source ptr + stride,
+; [esp+32] best-mode out ptr, [esp+36] lambda, [esp+40] temp buffer.
+; The temp buffer holds one 16-byte row slot per source row: the left
+; pixel is stored first, then overwritten with the H-prediction row; on
+; a DC or V win the chosen prediction block is written over the buffer.
+; NOTE(review): names inferred from usage; confirm against the C prototype.
+WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
+WelsIntra16x16Combined3Sad_ssse3:
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]
+	mov    edi,    [esp+40] ;temp_sad
+	; xmm5 = the 16 above-neighbour pixels (V prediction row); eax = their sum
+	sub    ecx,    edx
+    movdqa      xmm5,[ecx]
+    pxor        xmm0,xmm0
+    psadbw      xmm0,xmm5
+    movhlps     xmm1,xmm0
+    paddw       xmm0,xmm1
+    movd        eax,xmm0
+
+	; store the 16 left pixels into the temp rows and add them into eax
+    add         ecx,edx
+    lea         ebx, [edx+2*edx]
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    sub        edi, 192
+	; xmm7 = DC value ((sum+16)>>5) broadcast to all 16 bytes
+    add         eax,10h
+    shr         eax,5
+    movd        xmm7,eax
+    pxor        xmm1,xmm1
+    pshufb      xmm7,xmm1
+	; xmm4/xmm2/xmm3 accumulate DC/V/H SADs
+    pxor        xmm4,xmm4
+    pxor        xmm3,xmm3
+    pxor        xmm2,xmm2
+;sad begin
+	mov    eax,    [esp+24]
+	mov    ebx,    [esp+28]
+    lea         esi, [ebx+2*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+
+	; pack the two psadbw halves: xmm3 low dword = V sum, next dword = H sum
+    pslldq      xmm3,4
+    por         xmm3,xmm2
+    movhlps     xmm1,xmm3
+    paddw       xmm3,xmm1
+    movhlps     xmm0,xmm4
+    paddw       xmm4,xmm0
+; comparing order: DC H V
+	movd        ebx, xmm4 ;DC
+	movd        ecx, xmm3 ;V
+	psrldq      xmm3, 4
+	movd        esi, xmm3 ;H
+	mov         eax, [esp+36] ;lamda
+	shl         eax, 1
+	add         esi, eax    ; lambda*2 mode-bit penalty on H and DC
+	add         ebx, eax
+	mov         edx, [esp+32]
+	cmp         ebx, esi
+	jge near   not_dc_16x16_sad
+	cmp        ebx, ecx
+	jge near   not_dc_h_16x16_sad
+	; for DC mode
+	mov       dword[edx], 2;I16_PRED_DC
+	mov       eax, ebx
+    sub        edi, 192
+	; fill the temp buffer with the DC prediction block
+%assign x 0
+%rep 16
+    movdqa    [edi+16*x], xmm7
+%assign x x+1
+%endrep
+	jmp near return_sad_intra_16x16_x3
+not_dc_16x16_sad:
+	; for H mode
+	cmp       esi, ecx
+	jge near   not_dc_h_16x16_sad
+	mov       dword[edx], 1;I16_PRED_H
+	mov       eax, esi
+	jmp near return_sad_intra_16x16_x3
+not_dc_h_16x16_sad:
+	; for V mode
+	mov       dword[edx], 0;I16_PRED_V
+	mov       eax, ecx
+    sub       edi, 192
+	; fill the temp buffer with the V prediction block
+%assign x 0
+%rep 16
+    movdqa    [edi+16*x], xmm5
+%assign x x+1
+%endrep
+return_sad_intra_16x16_x3:
+	pop    edi
+	pop    esi
+	pop    ebx
+	ret
+%endif
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 END
+;
+;***********************************************************************
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 BEGIN
+;
+;***********************************************************************
+
+;SSE4.1
+; SATD of 4 rows x 8 columns.  Caller presets xmm7 = HSumSubDB1,
+; r4 = 3*r1, r5 = 3*r3, and xmm6 = accumulator.  pmaddubsw against
+; HSumSubDB1 performs the first sum/sub stage directly on bytes; the
+; final pmaxuw uses |x+y|+|x-y| = 2*max(|x|,|y|), so the halving of the
+; SATD total is already folded in (no psrlw needed at the end).
+%macro SSE41_GetSatd8x4 0
+	movq             xmm0, [r0]
+	punpcklqdq       xmm0, xmm0
+	pmaddubsw        xmm0, xmm7
+	movq             xmm1, [r0+r1]
+	punpcklqdq       xmm1, xmm1
+	pmaddubsw        xmm1, xmm7
+	movq             xmm2, [r2]
+	punpcklqdq       xmm2, xmm2
+	pmaddubsw        xmm2, xmm7
+	movq             xmm3, [r2+r3]
+	punpcklqdq       xmm3, xmm3
+	pmaddubsw        xmm3, xmm7
+	psubsw           xmm0, xmm2
+	psubsw           xmm1, xmm3
+	movq             xmm2, [r0+2*r1]
+	punpcklqdq       xmm2, xmm2
+	pmaddubsw        xmm2, xmm7
+	movq             xmm3, [r0+r4]
+	punpcklqdq       xmm3, xmm3
+	pmaddubsw        xmm3, xmm7
+	movq             xmm4, [r2+2*r3]
+	punpcklqdq       xmm4, xmm4
+	pmaddubsw        xmm4, xmm7
+	movq             xmm5, [r2+r5]
+	punpcklqdq       xmm5, xmm5
+	pmaddubsw        xmm5, xmm7
+	psubsw           xmm2, xmm4
+	psubsw           xmm3, xmm5
+	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
+	pabsw            xmm0, xmm0
+	pabsw            xmm2, xmm2
+	pabsw            xmm1, xmm1
+	pabsw            xmm3, xmm3
+	movdqa           xmm4, xmm3
+	pblendw          xmm3, xmm1, 0xAA
+	pslld            xmm1, 16
+	psrld            xmm4, 16
+	por              xmm1, xmm4
+	pmaxuw           xmm1, xmm3
+	paddw            xmm6, xmm1
+	movdqa           xmm4, xmm0
+	pblendw          xmm0, xmm2, 0xAA
+	pslld            xmm2, 16
+	psrld            xmm4, 16
+	por              xmm2, xmm4
+	pmaxuw           xmm0, xmm2
+	paddw            xmm6, xmm0
+%endmacro
+
+; Sum the eight words of %2 into GPR %1; %3/%4 are scratch xmm registers
+; (MMX_DW_1_2REG fills %3 with words of 1 for the pmaddwd widening).
+%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
+	MMX_DW_1_2REG    %3, %4
+	pmaddwd     %2, %3
+	movhlps     %4, %2
+	paddd       %2, %4
+	pshuflw     %4, %2,0Eh
+	paddd       %2, %4
+	movd		%1, %2
+%endmacro
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+; 4x4 SATD using pmaddubsw against HSwapSumSubDB1 for the first butterfly
+; stage and the pmaxsw identity |x+y|+|x-y| = 2*max(|x|,|y|) at the end,
+; so no final halving is needed.  Result in retrd.
+WELS_EXTERN WelsSampleSatd4x4_sse41
+WelsSampleSatd4x4_sse41:
+	;push        ebx
+	;mov         eax,[esp+8]
+	;mov         ebx,[esp+12]
+	;mov         ecx,[esp+16]
+	;mov         edx,[esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movdqa      xmm4,[HSwapSumSubDB1]
+	; replicate each 4-pixel row of block 2 across a register pair
+	movd        xmm2,[r2]
+	movd        xmm5,[r2+r3]
+	shufps      xmm2,xmm5,0
+	movd        xmm3,[r2+r3*2]
+	lea         r2, [r3*2+r2]
+	movd        xmm5,[r2+r3]
+	shufps      xmm3,xmm5,0
+	; same for block 1
+	movd        xmm0,[r0]
+	movd        xmm5,[r0+r1]
+	shufps      xmm0,xmm5,0
+	movd        xmm1,[r0+r1*2]
+	lea         r0, [r1*2+r0]
+	movd        xmm5,[r0+r1]
+	shufps      xmm1,xmm5,0
+	; first butterfly stage fused into the byte multiply-add
+	pmaddubsw   xmm0,xmm4
+	pmaddubsw   xmm1,xmm4
+	pmaddubsw   xmm2,xmm4
+	pmaddubsw   xmm3,xmm4
+	psubw       xmm0,xmm2
+	psubw       xmm1,xmm3
+	; remaining butterfly/transpose stages on the word residuals
+	movdqa      xmm2,xmm0
+	paddw       xmm0,xmm1
+	psubw       xmm1,xmm2
+	movdqa      xmm2,xmm0
+	punpcklqdq  xmm0,xmm1
+	punpckhqdq  xmm2,xmm1
+	movdqa      xmm1,xmm0
+	paddw       xmm0,xmm2
+	psubw       xmm2,xmm1
+	movdqa      xmm1,xmm0
+	pblendw     xmm0,xmm2,0AAh
+	pslld       xmm2,16
+	psrld       xmm1,16
+	por         xmm2,xmm1
+	pabsw       xmm0,xmm0
+	pabsw       xmm2,xmm2
+	pmaxsw      xmm0,xmm2
+	SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+; 8x8 SATD: two SSE41_GetSatd8x4 passes.  r4/r5 hold 3*stride for the
+; macro's fourth-row addressing; on X86_32 they alias callee-saved
+; registers and must be pushed (hence push_num 2).
+WELS_EXTERN WelsSampleSatd8x8_sse41
+align 16
+WelsSampleSatd8x8_sse41:
+	;push   ebx
+	;push   esi
+	;push   edi
+	;mov    eax,    [esp+16]
+	;mov    ebx,    [esp+20]
+	;mov    ecx,    [esp+24]
+	;mov    edx,    [esp+28]
+%ifdef X86_32
+	push  r4
+	push  r5
+%endif
+	%assign  push_num 2
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movdqa      xmm7, [HSumSubDB1]
+	lea         r4,  [r1+r1*2]
+	lea         r5,  [r3+r3*2]
+	pxor		xmm6, xmm6
+	SSE41_GetSatd8x4
+	lea			r0,	 [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	SSE41_GetSatd8x4
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+; 8x16 SATD: four SSE41_GetSatd8x4 passes driven by loop counter r6.
+WELS_EXTERN WelsSampleSatd8x16_sse41
+align 16
+WelsSampleSatd8x16_sse41:
+	;push   ebx
+	;push   esi
+	;push   edi
+	;push   ebp
+	;%define pushsize   16
+	;mov    eax,    [esp+pushsize+4]
+	;mov    ebx,    [esp+pushsize+8]
+	;mov    ecx,    [esp+pushsize+12]
+	;mov    edx,    [esp+pushsize+16]
+%ifdef X86_32
+	push  r4
+	push  r5
+	push  r6
+%endif
+	%assign  push_num 3
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movdqa      xmm7, [HSumSubDB1]
+	lea         r4,  [r1+r1*2]
+	lea         r5,  [r3+r3*2]
+	pxor        xmm6, xmm6
+	mov         r6,    0
+loop_get_satd_8x16:
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	inc         r6
+	cmp         r6,  4
+	jl          loop_get_satd_8x16
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r6
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse41
+align 16
+WelsSampleSatd16x8_sse41:
+	;push   ebx
+	;push   esi
+	;push   edi
+	;mov    eax,    [esp+16]
+	;mov    ebx,    [esp+20]
+	;mov    ecx,    [esp+24]
+	;mov    edx,    [esp+28]
+%ifdef X86_32
+	push  r4
+	push  r5
+%endif
+	%assign  push_num 2
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	push  r0
+	push  r2
+
+	movdqa      xmm7, [HSumSubDB1]
+	lea         r4,  [r1+r1*2]
+	lea         r5,  [r3+r3*2]
+	pxor		xmm6,   xmm6
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	SSE41_GetSatd8x4
+
+	pop  r2
+	pop  r0
+	;mov			eax,    [esp+16]
+	;mov			ecx,    [esp+24]
+	add			r0,    8
+	add			r2,    8
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	SSE41_GetSatd8x4
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSatd16x16_sse41
+align 16
+WelsSampleSatd16x16_sse41:
+	;push   ebx
+	;push   esi
+	;push   edi
+	;push   ebp
+	;%define pushsize   16
+	;mov    eax,    [esp+pushsize+4]
+	;mov    ebx,    [esp+pushsize+8]
+	;mov    ecx,    [esp+pushsize+12]
+	;mov    edx,    [esp+pushsize+16]
+%ifdef X86_32
+	push  r4
+	push  r5
+	push  r6
+%endif
+	%assign  push_num 3
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+
+	push  r0
+	push  r2
+
+	movdqa      xmm7, [HSumSubDB1]
+	lea         r4,  [r1+r1*2]
+	lea         r5,  [r3+r3*2]
+	pxor		xmm6,   xmm6
+	mov         r6,    0
+loop_get_satd_16x16_left:
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	inc         r6
+	cmp         r6,  4
+	jl          loop_get_satd_16x16_left
+
+	pop  r2
+	pop  r0
+	;mov			eax,    [esp+pushsize+4]
+	;mov			ecx,    [esp+pushsize+12]
+	add			r0,    8
+	add			r2,    8
+	mov         r6,    0
+loop_get_satd_16x16_right:
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	inc         r6
+	cmp         r6,  4
+	jl          loop_get_satd_16x16_right
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	;%undef pushsize
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r6
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+%macro SSE2_GetSad2x16 0
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqu xmm1,   [r2]
+	MOVDQ  xmm2,   [r0];[eax] must aligned 16
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
+	movdqu xmm1,   [r2+r3]
+	MOVDQ  xmm2,   [r0+r1]
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
+%endmacro
+
+
+%macro SSE2_GetSad4x16 0
+	movdqu xmm0,   [r2]
+	MOVDQ  xmm2,   [r0]
+	psadbw xmm0,   xmm2
+	paddw  xmm7,   xmm0
+	movdqu xmm1,   [r2+r3]
+	MOVDQ  xmm2,   [r0+r1]
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
+	movdqu xmm1,   [r2+2*r3]
+	MOVDQ  xmm2,   [r0+2*r1];[eax] must aligned 16
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
+	movdqu xmm1,   [r2+r5]
+	MOVDQ  xmm2,   [r0+r4]
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
+%endmacro
+
+
+%macro SSE2_GetSad8x4 0
+	movq   xmm0,   [r0]
+	movq   xmm1,   [r0+r1]
+	lea    r0,     [r0+2*r1]
+	movhps xmm0,   [r0]
+	movhps xmm1,   [r0+r1]
+
+	movq   xmm2,   [r2]
+	movq   xmm3,   [r2+r3]
+	lea    r2,     [r2+2*r3]
+	movhps xmm2,   [r2]
+	movhps xmm3,   [r2+r3]
+	psadbw xmm0,   xmm2
+	psadbw xmm1,   xmm3
+	paddw  xmm6,   xmm0
+	paddw  xmm6,   xmm1
+%endmacro
+
+;***********************************************************************
+;
+;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
+;First parameter can align to 16 bytes,
+;In wels, the third parameter can't align to 16 bytes.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x16_sse2
+align 16
+WelsSampleSad16x16_sse2:
+	;push ebx
+	;push edi
+	;push esi
+	;%define _STACK_SIZE		12
+	;mov eax, [esp+_STACK_SIZE+4 ]
+	;mov	ebx, [esp+_STACK_SIZE+8 ]
+	;mov ecx, [esp+_STACK_SIZE+12]
+	;mov edx, [esp+_STACK_SIZE+16]
+%ifdef X86_32
+	push  r4
+	push  r5
+%endif
+
+	%assign  push_num 2
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	lea r4, [3*r1]
+	lea r5, [3*r3]
+
+	pxor   xmm7,   xmm7
+	SSE2_GetSad4x16
+	lea	   r0,  [r0+4*r1]
+	lea	   r2,  [r2+4*r3]
+	SSE2_GetSad4x16
+	lea	   r0,  [r0+4*r1]
+	lea	   r2,  [r2+4*r3]
+	SSE2_GetSad4x16
+	lea	   r0,  [r0+4*r1]
+	lea	   r2,  [r2+4*r3]
+	SSE2_GetSad4x16
+	movhlps xmm0, xmm7
+	paddw xmm0, xmm7
+	movd retrd, xmm0
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
+;First parameter can align to 16 bytes,
+;In wels, the third parameter can't align to 16 bytes.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x8_sse2
+align 16
+WelsSampleSad16x8_sse2:
+	;push   ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movdqu xmm0,   [r2]
+	MOVDQ  xmm2,   [r0]
+	psadbw xmm0,   xmm2
+	movdqu xmm1,   [r2+r3]
+	MOVDQ  xmm2,   [r0+r1]
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
+
+	SSE2_GetSad2x16
+	SSE2_GetSad2x16
+	SSE2_GetSad2x16
+
+	movhlps     xmm1, xmm0
+	paddw       xmm0, xmm1
+	movd        retrd,  xmm0
+	LOAD_4_PARA_POP
+	ret
+
+
+
+WELS_EXTERN WelsSampleSad8x16_sse2
+WelsSampleSad8x16_sse2:
+	;push   ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+    pxor   xmm6,   xmm6
+
+	SSE2_GetSad8x4
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	SSE2_GetSad8x4
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+
+    movhlps    xmm0, xmm6
+	paddw      xmm0, xmm6
+	movd       retrd,  xmm0
+	LOAD_4_PARA_POP
+	ret
+
+
+%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
+and    %1,  0x1f|(%3>>1)
+cmp    %1,  (32-%2)|(%3>>1)
+%endmacro
+
+WELS_EXTERN WelsSampleSad8x8_sse21
+WelsSampleSad8x8_sse21:
+    ;mov    ecx,    [esp+12]
+	;mov    edx,    ecx
+    ;CACHE_SPLIT_CHECK edx, 8, 64
+	;jle    near   .pixel_sad_8x8_nsplit
+	;push   ebx
+	;push   edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+
+	%assign  push_num 0
+	mov		r2,  arg3
+	push	r2
+	CACHE_SPLIT_CHECK r2, 8, 64
+	jle    near   .pixel_sad_8x8_nsplit
+	pop		r2
+%ifdef X86_32
+	push	r3
+	push	r4
+	push	r5
+%endif
+	%assign  push_num 3
+	mov		r0,  arg1
+	mov		r1,  arg2
+	SIGN_EXTENTION r1, r1d
+    pxor   xmm7,   xmm7
+
+    ;ecx r2, edx r4, edi r5
+
+    mov    r5,    r2
+    and    r5,    0x07
+    sub    r2,    r5
+    mov    r4,    8
+    sub    r4,    r5
+
+    shl    r5,    3
+    shl    r4,    3
+    movd   xmm5,   r5d
+    movd   xmm6,   r4d
+	mov    r5,    8
+	add    r5,    r2
+    mov    r3,    arg4
+	SIGN_EXTENTION r3, r3d
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+    movhlps    xmm0, xmm7
+	paddw      xmm0, xmm7
+	movd       retrd,  xmm0
+%ifdef X86_32
+	pop	 r5
+	pop	 r4
+	pop	 r3
+%endif
+	jmp        .return
+
+.pixel_sad_8x8_nsplit:
+    ;push   ebx
+    ;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    edx,    [esp+20]
+
+	pop r2
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm6,   xmm6
+	SSE2_GetSad8x4
+    lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+    movhlps    xmm0, xmm6
+	paddw      xmm0, xmm6
+	movd       retrd,  xmm0
+	LOAD_4_PARA_POP
+.return:
+	ret
+
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 END
+;
+;***********************************************************************
+
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+
+%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
+	psadbw %1,   %4
+	paddw  xmm5, %1
+	psadbw %4,   %3
+	paddw  xmm4, %4
+	movdqu %4,   [%5-1]
+	psadbw %4,   %2
+	paddw  xmm6, %4
+	movdqu %4,   [%5+1]
+	psadbw %4,   %2
+	paddw  xmm7, %4
+%endmacro
+WELS_EXTERN WelsSampleSadFour16x16_sse2
+WelsSampleSadFour16x16_sse2:
+	;push ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movdqa xmm0,   [r0]
+	sub    r2,    r3
+	movdqu xmm3,   [r2]
+	psadbw xmm3,   xmm0
+	paddw  xmm4,   xmm3
+
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm3,   xmm1
+	paddw  xmm4,   xmm3
+
+	movdqu xmm2,   [r2+r3-1]
+	psadbw xmm2,   xmm0
+	paddw  xmm6,   xmm2
+
+	movdqu xmm3,   [r2+r3+1]
+	psadbw xmm3,   xmm0
+	paddw  xmm7,   xmm3
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm1,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+	movdqa xmm2,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm0,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm1,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+	movdqa xmm2,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm0,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r2,    [r2+2*r3]
+	movdqu xmm3,   [r2]
+	psadbw xmm2,   xmm3
+	paddw xmm5,   xmm2
+
+	movdqu xmm2,   [r2-1]
+	psadbw xmm2,   xmm0
+	paddw xmm6,   xmm2
+
+	movdqu xmm3,   [r2+1]
+	psadbw xmm3,   xmm0
+	paddw xmm7,   xmm3
+
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm0,   xmm3
+	paddw xmm5,   xmm0
+
+	;mov        ecx,  [esp+24]
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6
+	movdqa     [r4],xmm4
+	LOAD_5_PARA_POP
+	ret
+
+
+WELS_EXTERN WelsSampleSadFour16x8_sse2
+WelsSampleSadFour16x8_sse2:
+	;push ebx
+	;push edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	;mov    edi,    [esp+20]
+	;mov    edx,    [esp+24]
+
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movdqa xmm0,   [r0]
+	sub    r2,    r3
+	movdqu xmm3,   [r2]
+	psadbw xmm3,   xmm0
+	paddw xmm4,   xmm3
+
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm3,   xmm1
+	paddw xmm4,   xmm3
+
+	movdqu xmm2,   [r2+r3-1]
+	psadbw xmm2,   xmm0
+	paddw xmm6,   xmm2
+
+	movdqu xmm3,   [r2+r3+1]
+	psadbw xmm3,   xmm0
+	paddw xmm7,   xmm3
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm1,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+	movdqa xmm2,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm0,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+	lea    r2,    [r2+2*r3]
+	movdqu xmm3,   [r2]
+	psadbw xmm0,   xmm3
+	paddw xmm5,   xmm0
+
+	movdqu xmm0,   [r2-1]
+	psadbw xmm0,   xmm1
+	paddw xmm6,   xmm0
+
+	movdqu xmm3,   [r2+1]
+	psadbw xmm3,   xmm1
+	paddw xmm7,   xmm3
+
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm1,   xmm3
+	paddw xmm5,   xmm1
+
+	;mov        edi,  [esp+28]
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6
+	movdqa     [r4],xmm4
+	LOAD_5_PARA_POP
+	ret
+
+WELS_EXTERN WelsSampleSadFour8x16_sse2
+WelsSampleSadFour8x16_sse2:
+	;push ebx
+	;push edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	;mov    edi,    [esp+20]
+	;mov    edx,    [esp+24]
+
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+	sub    r2,    r3
+	movq   xmm3,   [r2]
+	movhps xmm3,   [r2+r3]
+	psadbw xmm3,   xmm0
+	paddw  xmm4,   xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	;mov        edi,  [esp+28]
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6
+	movdqa     [r4],xmm4
+	LOAD_5_PARA_POP
+	ret
+
+
+WELS_EXTERN WelsSampleSadFour8x8_sse2
+WelsSampleSadFour8x8_sse2:
+	;push ebx
+	;push edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	;mov    edi,    [esp+20]
+	;mov    edx,    [esp+24]
+
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+	sub    r2,    r3
+	movq   xmm3,   [r2]
+	movhps xmm3,   [r2+r3]
+	psadbw xmm3,   xmm0
+	paddw  xmm4,   xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	;mov        edi,  [esp+28]
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6
+	movdqa     [r4],xmm4
+	LOAD_5_PARA_POP
+	ret
+
+WELS_EXTERN WelsSampleSadFour4x4_sse2
+WelsSampleSadFour4x4_sse2:
+	;push ebx
+	;push edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	;mov    edi,    [esp+20]
+	;mov    edx,    [esp+24]
+
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movd   xmm0,   [r0]
+	movd   xmm1,   [r0+r1]
+	lea        r0,    [r0+2*r1]
+	movd       xmm2,   [r0]
+	movd       xmm3,   [r0+r1]
+	punpckldq  xmm0, xmm1
+	punpckldq  xmm2, xmm3
+	punpcklqdq xmm0, xmm2
+	sub        r2,  r3
+	movd       xmm1, [r2]
+	movd       xmm2, [r2+r3]
+	punpckldq  xmm1, xmm2
+	movd       xmm2, [r2+r3-1]
+	movd       xmm3, [r2+r3+1]
+
+	lea        r2,  [r2+2*r3]
+
+	movd       xmm4, [r2]
+	movd       xmm5, [r2-1]
+	punpckldq  xmm2, xmm5
+	movd       xmm5, [r2+1]
+	punpckldq  xmm3, xmm5
+
+	movd       xmm5, [r2+r3]
+	punpckldq  xmm4, xmm5
+
+	punpcklqdq xmm1, xmm4 ;-L
+
+	movd       xmm5, [r2+r3-1]
+	movd       xmm6, [r2+r3+1]
+
+	lea        r2,  [r2+2*r3]
+	movd       xmm7, [r2-1]
+	punpckldq  xmm5, xmm7
+	punpcklqdq xmm2, xmm5 ;-1
+	movd       xmm7, [r2+1]
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm3, xmm6 ;+1
+	movd       xmm6, [r2]
+	movd       xmm7, [r2+r3]
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6 ;+L
+	psadbw     xmm1, xmm0
+	psadbw     xmm2, xmm0
+	psadbw     xmm3, xmm0
+	psadbw     xmm4, xmm0
+
+	movhlps    xmm0, xmm1
+	paddw      xmm1, xmm0
+	movhlps    xmm0, xmm2
+	paddw      xmm2, xmm0
+	movhlps    xmm0, xmm3
+	paddw      xmm3, xmm0
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	;mov        edi,  [esp+28]
+	punpckldq  xmm1, xmm4
+	punpckldq  xmm2, xmm3
+	punpcklqdq xmm1, xmm2
+	movdqa     [r4],xmm1
+	LOAD_5_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 END
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSad4x4_mmx
+
+align 16
+;***********************************************************************
+;   int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
+;***********************************************************************
+WelsSampleSad4x4_mmx:
+    ;push    ebx
+	;%define pushsize     4
+	;%define pix1address	 esp+pushsize+4
+	;%define pix1stride   esp+pushsize+8
+	;%define pix2address  esp+pushsize+12
+	;%define pix2stride   esp+pushsize+16
+    ;mov		  eax, [pix1address]
+    ;mov		  ebx, [pix1stride ]
+    ;mov		  ecx, [pix2address]
+    ;mov		  edx, [pix2stride ]
+
+    %assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movd	  mm0, [r0]
+	movd	  mm1, [r0+r1]
+	punpckldq mm0, mm1
+
+	movd      mm3, [r2]
+	movd      mm4, [r2+r3]
+	punpckldq mm3, mm4
+	psadbw    mm0, mm3
+
+	lea       r0, [r0+2*r1]
+	lea       r2, [r2+2*r3]
+
+	movd      mm1, [r0]
+	movd      mm2, [r0+r1]
+	punpckldq mm1, mm2
+
+	movd      mm3, [r2]
+	movd      mm4, [r2+r3]
+	punpckldq mm3, mm4
+	psadbw    mm1, mm3
+	paddw     mm0, mm1
+
+    movd      retrd, mm0
+
+	WELSEMMS
+    LOAD_4_PARA_POP
+    ret
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -16,6 +16,7 @@
 	$(COMMON_SRCDIR)/./mb_copy.asm\
 	$(COMMON_SRCDIR)/./mc_chroma.asm\
 	$(COMMON_SRCDIR)/./mc_luma.asm\
+	$(COMMON_SRCDIR)/./satd_sad.asm\
 	$(COMMON_SRCDIR)/./vaa.asm\
 
 COMMON_OBJS += $(COMMON_ASM_SRCS:.asm=.o)
--- a/codec/encoder/core/asm/satd_sad.asm
+++ /dev/null
@@ -1,2344 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  satd_sad.asm
-;*
-;*  Abstract
-;*      WelsSampleSatd4x4_sse2
-;*      WelsSampleSatd8x8_sse2
-;*      WelsSampleSatd16x8_sse2
-;*      WelsSampleSatd8x16_sse2
-;*      WelsSampleSatd16x16_sse2
-;*
-;*      WelsSampleSad16x8_sse2
-;*      WelsSampleSad16x16_sse2
-;*
-;*  History
-;*      8/5/2009 Created
-;*     24/9/2009 modified
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Data
-;***********************************************************************
-SECTION .rodata align=16
-
-align 16
-HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
-align 16
-HSumSubDW1:   dw 1,-1,1,-1,1,-1,1,-1
-align 16
-PDW1:  dw 1,1,1,1,1,1,1,1
-align 16
-PDQ2:  dw 2,0,0,0,2,0,0,0
-align 16
-HSwapSumSubDB1:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 BEGIN
-;
-;***********************************************************************
-%macro MMX_DW_1_2REG 2
-      pxor %1, %1
-      pcmpeqw %2, %2
-      psubw %1, %2
-%endmacro
-
-%macro  SSE2_SumWHorizon1 2
-	movdqa      %2, %1
-	psrldq      %2, 8
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 4
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 2
-	paddusw     %1, %2
-%endmacro
-
-%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
-   SSE2_SumSub %1, %2, %5
-   SSE2_SumSub %3, %4, %5
-   SSE2_SumSub %2, %4, %5
-   SSE2_SumSub %1, %3, %5
-%endmacro
-
-%macro SSE2_SumAbs4 7
-	WELS_AbsW %1, %3
-	WELS_AbsW %2, %3
-	WELS_AbsW %4, %6
-	WELS_AbsW %5, %6
-	paddusw       %1, %2
-	paddusw       %4, %5
-	paddusw       %7, %1
-	paddusw       %7, %4
-%endmacro
-
-%macro  SSE2_SumWHorizon 3
-	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
-	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
-	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
-	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
-	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
-%endmacro
-
-%macro SSE2_GetSatd8x8 0
-	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
-	lea                 r0, [r0+2*r1]
-	lea                 r2, [r2+2*r3]
-	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
-
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-
-	lea					r0,    [r0+2*r1]
-    lea					r2,    [r2+2*r3]
-	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
-	lea                 r0, [r0+2*r1]
-	lea                 r2, [r2+2*r3]
-	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
-
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse2
-align 16
-WelsSampleSatd4x4_sse2:
-	;push      ebx
-	;mov       eax,  [esp+8]
-	;mov       ebx,  [esp+12]
-	;mov       ecx,  [esp+16]
-	;mov       edx,  [esp+20]
-
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-    movd      xmm0, [r0]
-    movd      xmm1, [r0+r1]
-    lea       r0 , [r0+2*r1]
-    movd      xmm2, [r0]
-    movd      xmm3, [r0+r1]
-    punpckldq xmm0, xmm2
-    punpckldq xmm1, xmm3
-
-    movd      xmm4, [r2]
-    movd      xmm5, [r2+r3]
-    lea       r2 , [r2+2*r3]
-    movd      xmm6, [r2]
-    movd      xmm7, [r2+r3]
-    punpckldq xmm4, xmm6
-    punpckldq xmm5, xmm7
-
-    pxor      xmm6, xmm6
-    punpcklbw xmm0, xmm6
-    punpcklbw xmm1, xmm6
-    punpcklbw xmm4, xmm6
-    punpcklbw xmm5, xmm6
-
-    psubw     xmm0, xmm4
-    psubw     xmm1, xmm5
-
-    movdqa    xmm2, xmm0
-    paddw     xmm0, xmm1
-    psubw     xmm2, xmm1
-    SSE2_XSawp qdq, xmm0, xmm2, xmm3
-
-    movdqa     xmm4, xmm0
-    paddw      xmm0, xmm3
-    psubw      xmm4, xmm3
-
-    movdqa         xmm2, xmm0
-    punpcklwd      xmm0, xmm4
-    punpckhwd      xmm4, xmm2
-
-	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
-	SSE2_XSawp     qdq, xmm0, xmm3, xmm5
-
-    movdqa         xmm7, xmm0
-    paddw          xmm0, xmm5
-    psubw          xmm7, xmm5
-
-	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
-
-    movdqa         xmm2, xmm0
-    paddw          xmm0, xmm1
-    psubw          xmm2, xmm1
-
-    WELS_AbsW  xmm0, xmm3
-    paddusw        xmm6, xmm0
-	WELS_AbsW  xmm2, xmm4
-    paddusw        xmm6, xmm2
-    SSE2_SumWHorizon1  xmm6, xmm4
-	movd           retrd,  xmm6
-    and            retrd,  0xffff
-    shr            retrd,  1
-	LOAD_4_PARA_POP
-	ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x8_sse2
-align 16
- WelsSampleSatd8x8_sse2:
-	 ;push   ebx
-	 ;mov    eax,    [esp+8]
-	 ;mov    ebx,    [esp+12]
-	 ;mov    ecx,    [esp+16]
-	 ;mov    edx,    [esp+20]
-
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	pxor   xmm6,   xmm6
-    pxor   xmm7,   xmm7
-    SSE2_GetSatd8x8
-    psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	LOAD_4_PARA_POP
-	ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x16_sse2
-align 16
- WelsSampleSatd8x16_sse2:
-	 ;push   ebx
-	 ;mov    eax,    [esp+8]
-	 ;mov    ebx,    [esp+12]
-	 ;mov    ecx,    [esp+16]
-	 ;mov    edx,    [esp+20]
-
-	 %assign  push_num 0
-	 LOAD_4_PARA
-	 SIGN_EXTENTION r1, r1d
-	 SIGN_EXTENTION r3, r3d
-	 pxor   xmm6,   xmm6
-     pxor   xmm7,   xmm7
-
-	 SSE2_GetSatd8x8
-     lea    r0,    [r0+2*r1]
-     lea    r2,    [r2+2*r3]
-	 SSE2_GetSatd8x8
-
-	 psrlw   xmm6,  1
-	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	 movd    retrd,   xmm6
-	 LOAD_4_PARA_POP
-	 ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse2
-align 16
-WelsSampleSatd16x8_sse2:
-	;push   ebx
-	;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    ecx,    [esp+16]
-	;mov    edx,    [esp+20]
-
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	push r0
-	push r2
-	pxor   xmm6,   xmm6
-    pxor   xmm7,   xmm7
-
-	SSE2_GetSatd8x8
-
-	pop r2
-	pop r0
-	;mov    eax,    [esp+8]
-    ;mov    ecx,    [esp+16]
-    add    r0,    8
-    add    r2,    8
-	SSE2_GetSatd8x8
-
-	psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	LOAD_4_PARA_POP
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x16_sse2
-align 16
-WelsSampleSatd16x16_sse2:
-	;push   ebx
-	;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    ecx,    [esp+16]
-	;mov    edx,    [esp+20]
-
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	push r0
-	push r2
-	pxor   xmm6,   xmm6
-    pxor   xmm7,   xmm7
-
-	SSE2_GetSatd8x8
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSatd8x8
-
-	pop r2
-	pop r0
-	;mov    eax,    [esp+8]
-	;mov    ecx,    [esp+16]
-	add    r0,    8
-	add    r2,    8
-
-	SSE2_GetSatd8x8
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSatd8x8
-
- ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
-    psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	LOAD_4_PARA_POP
-	ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
-	pmaddubsw    %1, xmm5
-	movdqa       %2, %1
-	pmaddwd      %1, xmm7
-	pmaddwd      %2, xmm6
-	movdqa       %3, %1
-	punpckldq    %1, %2
-	punpckhdq    %2, %3
-	movdqa       %3, %1
-	punpcklqdq   %1, %2
-	punpckhqdq   %3, %2
-	paddd        xmm4, %1 ;for dc
-	paddd        xmm4, %3 ;for dc
-	packssdw     %1, %3
-	psllw        %1, 2
-%endmacro
-%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
-	pmaddubsw    %1, xmm5
-	movdqa       %2, %1
-	pmaddwd      %1, xmm7
-	pmaddwd      %2, xmm6
-	movdqa       %3, %1
-	punpckldq    %1, %2
-	punpckhdq    %2, %3
-	movdqa       %3, %1
-	punpcklqdq   %1, %2
-	punpckhqdq   %3, %2
-;    paddd        xmm4, %1 ;for dc
-;	 paddd        xmm4, %3 ;for dc
-	movdqa       %4, %1
-	punpcklqdq   %4, %3
-	packssdw     %1, %3
-	psllw        %1, 2
-%endmacro
-
-%macro SSE41_GetX38x4SatdDec 0
-	pxor        xmm7,   xmm7
-	movq        xmm0,   [eax]
-	movq        xmm1,   [eax+ebx]
-	lea         eax,    [eax+2*ebx]
-	movq        xmm2,   [eax]
-	movq        xmm3,   [eax+ebx]
-	lea         eax,    [eax+2*ebx]
-	punpcklbw   xmm0,   xmm7
-	punpcklbw   xmm1,   xmm7
-	punpcklbw   xmm2,   xmm7
-	punpcklbw   xmm3,   xmm7
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
-	;doesn't need another transpose
-%endmacro
-%macro SSE41_GetX38x4SatdV 2
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2],   0
-	pinsrw      xmm0,   word[esi+%2+8], 4
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2+2],  0
-	pinsrw      xmm0,   word[esi+%2+10], 4
-	psubsw      xmm0,   xmm1
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2+4],  0
-	pinsrw      xmm0,   word[esi+%2+12], 4
-	psubsw      xmm0,   xmm3
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2+6],  0
-	pinsrw      xmm0,   word[esi+%2+14], 4
-	psubsw      xmm0,   xmm2
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-%endmacro
-%macro SSE41_GetX38x4SatdH  3
-	movq        xmm0,   [esi+%3+8*%1]
-	punpcklqdq  xmm0,   xmm0
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm5,   xmm0
-	pabsw       xmm1,   xmm1
-	pabsw       xmm2,   xmm2
-	pabsw       xmm3,   xmm3
-	paddw       xmm2,   xmm1;for DC
-	paddw       xmm2,   xmm3;for DC
-	paddw       xmm5,   xmm2
-%endmacro
-%macro SSE41_I16X16GetX38x4SatdDC 0
-	pxor        xmm0,   xmm0
-	movq2dq     xmm0,   mm4
-	punpcklqdq  xmm0,   xmm0
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2
-%endmacro
-%macro SSE41_ChromaGetX38x4SatdDC 1
-	shl         %1,     4
-	movdqa      xmm0,   [esi+32+%1]
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2
-%endmacro
-%macro SSE41_I16x16GetX38x4Satd 2
-	SSE41_GetX38x4SatdDec
-	SSE41_GetX38x4SatdV   %1, %2
-	SSE41_GetX38x4SatdH   %1, %2, 32
-	SSE41_I16X16GetX38x4SatdDC
-%endmacro
-%macro SSE41_ChromaGetX38x4Satd 2
-	SSE41_GetX38x4SatdDec
-	SSE41_GetX38x4SatdV   %1, %2
-	SSE41_GetX38x4SatdH   %1, %2, 16
-	SSE41_ChromaGetX38x4SatdDC %1
-%endmacro
-%macro SSE41_HSum8W 3
-	pmaddwd     %1, %2
-	movhlps     %3, %1
-	paddd       %1, %3
-	pshuflw     %3, %1,0Eh
-	paddd       %1, %3
-%endmacro
-
-
-%ifdef X86_32
-WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
-WelsIntra16x16Combined3Satd_sse41:
-	push   ebx
-	push   esi
-	push   edi
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]
-	mov    esi,    [esp+40] ;temp_satd
-	pxor        xmm4,   xmm4
-	movdqa      xmm5,   [HSumSubDB1]
-	movdqa      xmm6,   [HSumSubDW1]
-	movdqa      xmm7,   [PDW1]
-	sub         ecx,    edx
-	movdqu 		xmm0,   [ecx]
-	movhlps		xmm1,   xmm0
-	punpcklqdq  xmm0,   xmm0
-	punpcklqdq  xmm1,   xmm1
-	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
-	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
-	movdqa      [esi],  xmm0 ;V
-	movdqa      [esi+16], xmm1
-	add         ecx,    edx
-	pinsrb      xmm0,   byte[ecx-1], 0
-	pinsrb      xmm0,   byte[ecx+edx-1], 1
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     2
-	pinsrb      xmm0,   byte[ecx+edx-1], 3
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     4
-	pinsrb      xmm0,   byte[ecx+edx-1], 5
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     6
-	pinsrb      xmm0,   byte[ecx+edx-1], 7
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     8
-	pinsrb      xmm0,   byte[ecx+edx-1], 9
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     10
-	pinsrb      xmm0,   byte[ecx+edx-1], 11
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     12
-	pinsrb      xmm0,   byte[ecx+edx-1], 13
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     14
-	pinsrb      xmm0,   byte[ecx+edx-1], 15
-	movhlps		xmm1,   xmm0
-	punpcklqdq  xmm0,   xmm0
-	punpcklqdq  xmm1,   xmm1
-	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
-	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
-	movdqa      [esi+32], xmm0 ;H
-	movdqa      [esi+48], xmm1
-	movd        ecx,    xmm4 ;dc
-	add         ecx,    16   ;(sum+16)
-	shr         ecx,    5    ;((sum+16)>>5)
-	shl         ecx,    4    ;
-	movd        mm4,    ecx  ; mm4 copy DC
-	pxor        xmm4,   xmm4 ;V
-	pxor        xmm5,   xmm5 ;H
-	pxor        xmm6,   xmm6 ;DC
-	mov         ecx,    0
-	mov         edi,    0
-.loop16x16_get_satd:
-.loopStart1:
-	SSE41_I16x16GetX38x4Satd ecx, edi
-	inc          ecx
-	cmp         ecx, 4
-	jl          .loopStart1
-	cmp         edi, 16
-	je          .loop16x16_get_satd_end
-	mov         eax, [esp+24]
-	add         eax, 8
-	mov         ecx, 0
-	add         edi, 16
-	jmp         .loop16x16_get_satd
- .loop16x16_get_satd_end:
-	MMX_DW_1_2REG    xmm0, xmm1
-	psrlw       xmm4, 1 ;/2
-	psrlw       xmm5, 1 ;/2
-	psrlw       xmm6, 1 ;/2
-	SSE41_HSum8W     xmm4, xmm0, xmm1
-	SSE41_HSum8W     xmm5, xmm0, xmm1
-	SSE41_HSum8W     xmm6, xmm0, xmm1
-
-	; comparing order: DC H V
-	movd      ebx, xmm6 ;DC
-	movd      edi, xmm5 ;H
-	movd      ecx, xmm4 ;V
-	mov      edx, [esp+36]
-	shl       edx, 1
-	add       edi, edx
-	add       ebx, edx
-	mov       edx, [esp+32]
-	cmp       ebx, edi
-	jge near   not_dc_16x16
-	cmp        ebx, ecx
-	jge near   not_dc_h_16x16
-
-	; for DC mode
-	mov       dword[edx], 2;I16_PRED_DC
-	mov       eax, ebx
-	jmp near return_satd_intra_16x16_x3
-not_dc_16x16:
-	; for H mode
-	cmp       edi, ecx
-	jge near   not_dc_h_16x16
-	mov       dword[edx], 1;I16_PRED_H
-	mov       eax, edi
-	jmp near return_satd_intra_16x16_x3
-not_dc_h_16x16:
-	; for V mode
-	mov       dword[edx], 0;I16_PRED_V
-	mov       eax, ecx
-return_satd_intra_16x16_x3:
-	WELSEMMS
-	pop         edi
-	pop         esi
-	pop         ebx
-ret
-
-%macro SSE41_ChromaGetX38x8Satd 0
-	movdqa      xmm5,   [HSumSubDB1]
-	movdqa      xmm6,   [HSumSubDW1]
-	movdqa      xmm7,   [PDW1]
-	sub         ecx,    edx
-	movq 		xmm0,   [ecx]
-	punpcklqdq  xmm0,   xmm0
-	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
-	movdqa      [esi],  xmm0 ;V
-	add         ecx,    edx
-	pinsrb      xmm0,   byte[ecx-1], 0
-	pinsrb      xmm0,   byte[ecx+edx-1], 1
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     2
-	pinsrb      xmm0,   byte[ecx+edx-1], 3
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     4
-	pinsrb      xmm0,   byte[ecx+edx-1], 5
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     6
-	pinsrb      xmm0,   byte[ecx+edx-1], 7
-	punpcklqdq  xmm0,   xmm0
-	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
-	movdqa      [esi+16], xmm0 ;H
-;(sum+2)>>2
-	movdqa      xmm6,   [PDQ2]
-	movdqa      xmm5,   xmm4
-	punpckhqdq  xmm5,   xmm1
-	paddd       xmm5,   xmm6
-	psrld       xmm5,   2
-;(sum1+sum2+4)>>3
-	paddd       xmm6,   xmm6
-	paddd       xmm4,   xmm1
-	paddd       xmm4,   xmm6
-	psrld       xmm4,   3
-;satd *16
-	pslld       xmm5,   4
-	pslld       xmm4,   4
-;temp satd
-	movdqa      xmm6,   xmm4
-	punpcklqdq  xmm4,   xmm5
-	psllq       xmm4,   32
-	psrlq       xmm4,   32
-	movdqa      [esi+32], xmm4
-	punpckhqdq  xmm5,   xmm6
-	psllq       xmm5,   32
-	psrlq       xmm5,   32
-	movdqa      [esi+48], xmm5
-
-	pxor        xmm4,   xmm4 ;V
-	pxor        xmm5,   xmm5 ;H
-	pxor        xmm6,   xmm6 ;DC
-	mov         ecx,    0
-loop_chroma_satdx3_cb_cr:
-	SSE41_ChromaGetX38x4Satd ecx, 0
-	inc             ecx
-	cmp             ecx, 2
-	jl              loop_chroma_satdx3_cb_cr
-%endmacro
-
-%macro SSEReg2MMX 3
-	movdq2q     %2, %1
-	movhlps     %1, %1
-	movdq2q     %3, %1
-%endmacro
-%macro MMXReg2SSE 4
-	movq2dq     %1, %3
-	movq2dq     %2, %4
-	punpcklqdq  %1, %2
-%endmacro
-;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
-
-WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
-WelsIntraChroma8x8Combined3Satd_sse41:
-	push   ebx
-	push   esi
-	push   edi
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]
-	mov    esi,    [esp+40] ;temp_satd
-	xor    edi,    edi
-loop_chroma_satdx3:
-	SSE41_ChromaGetX38x8Satd
-	cmp             edi, 1
-	je              loop_chroma_satdx3end
-	inc             edi
-	SSEReg2MMX  xmm4, mm0,mm1
-	SSEReg2MMX  xmm5, mm2,mm3
-	SSEReg2MMX  xmm6, mm5,mm6
-	mov         ecx,  [esp+44]
-	mov         eax,  [esp+48]
-	jmp         loop_chroma_satdx3
-loop_chroma_satdx3end:
-	MMXReg2SSE  xmm0, xmm3, mm0, mm1
-	MMXReg2SSE  xmm1, xmm3, mm2, mm3
-	MMXReg2SSE  xmm2, xmm3, mm5, mm6
-
-	paddw       xmm4, xmm0
-	paddw       xmm5, xmm1
-	paddw       xmm6, xmm2
-
-	MMX_DW_1_2REG    xmm0, xmm1
-	psrlw       xmm4, 1 ;/2
-	psrlw       xmm5, 1 ;/2
-	psrlw       xmm6, 1 ;/2
-	SSE41_HSum8W     xmm4, xmm0, xmm1
-	SSE41_HSum8W     xmm5, xmm0, xmm1
-	SSE41_HSum8W     xmm6, xmm0, xmm1
-	; comparing order: DC H V
-	movd      ebx, xmm6 ;DC
-	movd      edi, xmm5 ;H
-	movd      ecx, xmm4 ;V
-	mov       edx, [esp+36]
-	shl       edx, 1
-	add       edi, edx
-	add       ecx, edx
-	mov       edx, [esp+32]
-	cmp       ebx, edi
-	jge near   not_dc_8x8
-	cmp        ebx, ecx
-	jge near   not_dc_h_8x8
-
-	; for DC mode
-	mov       dword[edx], 0;I8_PRED_DC
-	mov       eax, ebx
-	jmp near return_satd_intra_8x8_x3
-not_dc_8x8:
-	; for H mode
-	cmp       edi, ecx
-	jge near   not_dc_h_8x8
-	mov       dword[edx], 1;I8_PRED_H
-	mov       eax, edi
-	jmp near return_satd_intra_8x8_x3
-not_dc_h_8x8:
-	; for V mode
-	mov       dword[edx], 2;I8_PRED_V
-	mov       eax, ecx
-return_satd_intra_8x8_x3:
-	WELSEMMS
-	pop         edi
-	pop         esi
-	pop         ebx
-ret
-
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 END
-;
-;***********************************************************************
-%macro SSSE3_Get16BSadHVDC 2
-  movd        xmm6,%1
-  pshufb      xmm6,xmm1
-  movdqa      %1,  xmm6
-  movdqa      xmm0,%2
-  psadbw      xmm0,xmm7
-  paddw       xmm4,xmm0
-  movdqa      xmm0,%2
-  psadbw      xmm0,xmm5
-  paddw       xmm2,xmm0
-  psadbw      xmm6,%2
-  paddw       xmm3,xmm6
-%endmacro
-%macro WelsAddDCValue 4
-    movzx   %2, byte %1
-    mov    %3, %2
-    add     %4, %2
-%endmacro
-
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 BEGIN
-;
-;***********************************************************************
-WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
-WelsIntra16x16Combined3Sad_ssse3:
-	push   ebx
-	push   esi
-	push   edi
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	mov    edi,    [esp+40] ;temp_sad
-	sub    ecx,    edx
-    movdqa      xmm5,[ecx]
-    pxor        xmm0,xmm0
-    psadbw      xmm0,xmm5
-    movhlps     xmm1,xmm0
-    paddw       xmm0,xmm1
-    movd        eax,xmm0
-
-    add         ecx,edx
-    lea         ebx, [edx+2*edx]
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    lea         ecx, [ecx+4*edx]
-    add         edi, 64
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    lea         ecx, [ecx+4*edx]
-    add         edi, 64
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    lea         ecx, [ecx+4*edx]
-    add         edi, 64
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    sub        edi, 192
-    add         eax,10h
-    shr         eax,5
-    movd        xmm7,eax
-    pxor        xmm1,xmm1
-    pshufb      xmm7,xmm1
-    pxor        xmm4,xmm4
-    pxor        xmm3,xmm3
-    pxor        xmm2,xmm2
-;sad begin
-	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]
-    lea         esi, [ebx+2*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64
-    lea         eax, [eax+4*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64
-    lea         eax, [eax+4*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64
-    lea         eax, [eax+4*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-
-    pslldq      xmm3,4
-    por         xmm3,xmm2
-    movhlps     xmm1,xmm3
-    paddw       xmm3,xmm1
-    movhlps     xmm0,xmm4
-    paddw       xmm4,xmm0
-; comparing order: DC H V
-	movd        ebx, xmm4 ;DC
-	movd        ecx, xmm3 ;V
-	psrldq      xmm3, 4
-	movd        esi, xmm3 ;H
-	mov         eax, [esp+36] ;lamda
-	shl         eax, 1
-	add         esi, eax
-	add         ebx, eax
-	mov         edx, [esp+32]
-	cmp         ebx, esi
-	jge near   not_dc_16x16_sad
-	cmp        ebx, ecx
-	jge near   not_dc_h_16x16_sad
-	; for DC mode
-	mov       dword[edx], 2;I16_PRED_DC
-	mov       eax, ebx
-    sub        edi, 192
-%assign x 0
-%rep 16
-    movdqa    [edi+16*x], xmm7
-%assign x x+1
-%endrep
-	jmp near return_sad_intra_16x16_x3
-not_dc_16x16_sad:
-	; for H mode
-	cmp       esi, ecx
-	jge near   not_dc_h_16x16_sad
-	mov       dword[edx], 1;I16_PRED_H
-	mov       eax, esi
-	jmp near return_sad_intra_16x16_x3
-not_dc_h_16x16_sad:
-	; for V mode
-	mov       dword[edx], 0;I16_PRED_V
-	mov       eax, ecx
-    sub       edi, 192
-%assign x 0
-%rep 16
-    movdqa    [edi+16*x], xmm5
-%assign x x+1
-%endrep
-return_sad_intra_16x16_x3:
-	pop    edi
-	pop    esi
-	pop    ebx
-	ret
-%endif
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 END
-;
-;***********************************************************************
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 BEGIN
-;
-;***********************************************************************
-
-;SSE4.1
-%macro SSE41_GetSatd8x4 0
-	movq             xmm0, [r0]
-	punpcklqdq       xmm0, xmm0
-	pmaddubsw        xmm0, xmm7
-	movq             xmm1, [r0+r1]
-	punpcklqdq       xmm1, xmm1
-	pmaddubsw        xmm1, xmm7
-	movq             xmm2, [r2]
-	punpcklqdq       xmm2, xmm2
-	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [r2+r3]
-	punpcklqdq       xmm3, xmm3
-	pmaddubsw        xmm3, xmm7
-	psubsw           xmm0, xmm2
-	psubsw           xmm1, xmm3
-	movq             xmm2, [r0+2*r1]
-	punpcklqdq       xmm2, xmm2
-	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [r0+r4]
-	punpcklqdq       xmm3, xmm3
-	pmaddubsw        xmm3, xmm7
-	movq             xmm4, [r2+2*r3]
-	punpcklqdq       xmm4, xmm4
-	pmaddubsw        xmm4, xmm7
-	movq             xmm5, [r2+r5]
-	punpcklqdq       xmm5, xmm5
-	pmaddubsw        xmm5, xmm7
-	psubsw           xmm2, xmm4
-	psubsw           xmm3, xmm5
-	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
-	pabsw            xmm0, xmm0
-	pabsw            xmm2, xmm2
-	pabsw            xmm1, xmm1
-	pabsw            xmm3, xmm3
-	movdqa           xmm4, xmm3
-	pblendw          xmm3, xmm1, 0xAA
-	pslld            xmm1, 16
-	psrld            xmm4, 16
-	por              xmm1, xmm4
-	pmaxuw           xmm1, xmm3
-	paddw            xmm6, xmm1
-	movdqa           xmm4, xmm0
-	pblendw          xmm0, xmm2, 0xAA
-	pslld            xmm2, 16
-	psrld            xmm4, 16
-	por              xmm2, xmm4
-	pmaxuw           xmm0, xmm2
-	paddw            xmm6, xmm0
-%endmacro
-
-%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
-	MMX_DW_1_2REG    %3, %4
-	pmaddwd     %2, %3
-	movhlps     %4, %2
-	paddd       %2, %4
-	pshuflw     %4, %2,0Eh
-	paddd       %2, %4
-	movd		%1, %2
-%endmacro
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse41
-WelsSampleSatd4x4_sse41:
-	;push        ebx
-	;mov         eax,[esp+8]
-	;mov         ebx,[esp+12]
-	;mov         ecx,[esp+16]
-	;mov         edx,[esp+20]
-
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	movdqa      xmm4,[HSwapSumSubDB1]
-	movd        xmm2,[r2]
-	movd        xmm5,[r2+r3]
-	shufps      xmm2,xmm5,0
-	movd        xmm3,[r2+r3*2]
-	lea         r2, [r3*2+r2]
-	movd        xmm5,[r2+r3]
-	shufps      xmm3,xmm5,0
-	movd        xmm0,[r0]
-	movd        xmm5,[r0+r1]
-	shufps      xmm0,xmm5,0
-	movd        xmm1,[r0+r1*2]
-	lea         r0, [r1*2+r0]
-	movd        xmm5,[r0+r1]
-	shufps      xmm1,xmm5,0
-	pmaddubsw   xmm0,xmm4
-	pmaddubsw   xmm1,xmm4
-	pmaddubsw   xmm2,xmm4
-	pmaddubsw   xmm3,xmm4
-	psubw       xmm0,xmm2
-	psubw       xmm1,xmm3
-	movdqa      xmm2,xmm0
-	paddw       xmm0,xmm1
-	psubw       xmm1,xmm2
-	movdqa      xmm2,xmm0
-	punpcklqdq  xmm0,xmm1
-	punpckhqdq  xmm2,xmm1
-	movdqa      xmm1,xmm0
-	paddw       xmm0,xmm2
-	psubw       xmm2,xmm1
-	movdqa      xmm1,xmm0
-	pblendw     xmm0,xmm2,0AAh
-	pslld       xmm2,16
-	psrld       xmm1,16
-	por         xmm2,xmm1
-	pabsw       xmm0,xmm0
-	pabsw       xmm2,xmm2
-	pmaxsw      xmm0,xmm2
-	SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
-	LOAD_4_PARA_POP
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x8_sse41
-align 16
-WelsSampleSatd8x8_sse41:
-	;push   ebx
-	;push   esi
-	;push   edi
-	;mov    eax,    [esp+16]
-	;mov    ebx,    [esp+20]
-	;mov    ecx,    [esp+24]
-	;mov    edx,    [esp+28]
-%ifdef X86_32
-	push  r4
-	push  r5
-%endif
-	%assign  push_num 2
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6, xmm6
-	SSE41_GetSatd8x4
-	lea			r0,	 [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x16_sse41
-align 16
-WelsSampleSatd8x16_sse41:
-	;push   ebx
-	;push   esi
-	;push   edi
-	;push   ebp
-	;%define pushsize   16
-	;mov    eax,    [esp+pushsize+4]
-	;mov    ebx,    [esp+pushsize+8]
-	;mov    ecx,    [esp+pushsize+12]
-	;mov    edx,    [esp+pushsize+16]
-%ifdef X86_32
-	push  r4
-	push  r5
-	push  r6
-%endif
-	%assign  push_num 3
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor        xmm6, xmm6
-	mov         r6,    0
-loop_get_satd_8x16:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_8x16
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r6
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse41
-align 16
-WelsSampleSatd16x8_sse41:
-	;push   ebx
-	;push   esi
-	;push   edi
-	;mov    eax,    [esp+16]
-	;mov    ebx,    [esp+20]
-	;mov    ecx,    [esp+24]
-	;mov    edx,    [esp+28]
-%ifdef X86_32
-	push  r4
-	push  r5
-%endif
-	%assign  push_num 2
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	push  r0
-	push  r2
-
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6,   xmm6
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
-
-	pop  r2
-	pop  r0
-	;mov			eax,    [esp+16]
-	;mov			ecx,    [esp+24]
-	add			r0,    8
-	add			r2,    8
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-
-WELS_EXTERN WelsSampleSatd16x16_sse41
-align 16
-WelsSampleSatd16x16_sse41:
-	;push   ebx
-	;push   esi
-	;push   edi
-	;push   ebp
-	;%define pushsize   16
-	;mov    eax,    [esp+pushsize+4]
-	;mov    ebx,    [esp+pushsize+8]
-	;mov    ecx,    [esp+pushsize+12]
-	;mov    edx,    [esp+pushsize+16]
-%ifdef X86_32
-	push  r4
-	push  r5
-	push  r6
-%endif
-	%assign  push_num 3
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-
-	push  r0
-	push  r2
-
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6,   xmm6
-	mov         r6,    0
-loop_get_satd_16x16_left:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_16x16_left
-
-	pop  r2
-	pop  r0
-	;mov			eax,    [esp+pushsize+4]
-	;mov			ecx,    [esp+pushsize+12]
-	add			r0,    8
-	add			r2,    8
-	mov         r6,    0
-loop_get_satd_16x16_right:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_16x16_right
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	;%undef pushsize
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r6
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE2_GetSad2x16 0
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqu xmm1,   [r2]
-	MOVDQ  xmm2,   [r0];[eax] must aligned 16
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-%endmacro
-
-
-%macro SSE2_GetSad4x16 0
-	movdqu xmm0,   [r2]
-	MOVDQ  xmm2,   [r0]
-	psadbw xmm0,   xmm2
-	paddw  xmm7,   xmm0
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-	movdqu xmm1,   [r2+2*r3]
-	MOVDQ  xmm2,   [r0+2*r1];[eax] must aligned 16
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-	movdqu xmm1,   [r2+r5]
-	MOVDQ  xmm2,   [r0+r4]
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-%endmacro
-
-
-%macro SSE2_GetSad8x4 0
-	movq   xmm0,   [r0]
-	movq   xmm1,   [r0+r1]
-	lea    r0,     [r0+2*r1]
-	movhps xmm0,   [r0]
-	movhps xmm1,   [r0+r1]
-
-	movq   xmm2,   [r2]
-	movq   xmm3,   [r2+r3]
-	lea    r2,     [r2+2*r3]
-	movhps xmm2,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm0,   xmm2
-	psadbw xmm1,   xmm3
-	paddw  xmm6,   xmm0
-	paddw  xmm6,   xmm1
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
-;First parameter can align to 16 bytes,
-;In wels, the third parameter can't align to 16 bytes.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x16_sse2
-align 16
-WelsSampleSad16x16_sse2:
-	;push ebx
-	;push edi
-	;push esi
-	;%define _STACK_SIZE		12
-	;mov eax, [esp+_STACK_SIZE+4 ]
-	;mov	ebx, [esp+_STACK_SIZE+8 ]
-	;mov ecx, [esp+_STACK_SIZE+12]
-	;mov edx, [esp+_STACK_SIZE+16]
-%ifdef X86_32
-	push  r4
-	push  r5
-%endif
-
-	%assign  push_num 2
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	lea r4, [3*r1]
-	lea r5, [3*r3]
-
-	pxor   xmm7,   xmm7
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	movhlps xmm0, xmm7
-	paddw xmm0, xmm7
-	movd retrd, xmm0
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
-;First parameter can align to 16 bytes,
-;In wels, the third parameter can't align to 16 bytes.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x8_sse2
-align 16
-WelsSampleSad16x8_sse2:
-	;push   ebx
-	;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    ecx,    [esp+16]
-	;mov    edx,    [esp+20]
-
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	movdqu xmm0,   [r2]
-	MOVDQ  xmm2,   [r0]
-	psadbw xmm0,   xmm2
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-
-	SSE2_GetSad2x16
-	SSE2_GetSad2x16
-	SSE2_GetSad2x16
-
-	movhlps     xmm1, xmm0
-	paddw       xmm0, xmm1
-	movd        retrd,  xmm0
-	LOAD_4_PARA_POP
-	ret
-
-
-
-WELS_EXTERN WelsSampleSad8x16_sse2
-WelsSampleSad8x16_sse2:
-	;push   ebx
-	;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    ecx,    [esp+16]
-	;mov    edx,    [esp+20]
-
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-    pxor   xmm6,   xmm6
-
-	SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-    SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-    SSE2_GetSad8x4
-
-    movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       retrd,  xmm0
-	LOAD_4_PARA_POP
-	ret
-
-
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and    %1,  0x1f|(%3>>1)
-cmp    %1,  (32-%2)|(%3>>1)
-%endmacro
-
-WELS_EXTERN WelsSampleSad8x8_sse21
-WelsSampleSad8x8_sse21:
-    ;mov    ecx,    [esp+12]
-	;mov    edx,    ecx
-    ;CACHE_SPLIT_CHECK edx, 8, 64
-	;jle    near   .pixel_sad_8x8_nsplit
-	;push   ebx
-	;push   edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-
-	%assign  push_num 0
-	mov		r2,  arg3
-	push	r2
-	CACHE_SPLIT_CHECK r2, 8, 64
-	jle    near   .pixel_sad_8x8_nsplit
-	pop		r2
-%ifdef X86_32
-	push	r3
-	push	r4
-	push	r5
-%endif
-	%assign  push_num 3
-	mov		r0,  arg1
-	mov		r1,  arg2
-	SIGN_EXTENTION r1, r1d
-    pxor   xmm7,   xmm7
-
-    ;ecx r2, edx r4, edi r5
-
-    mov    r5,    r2
-    and    r5,    0x07
-    sub    r2,    r5
-    mov    r4,    8
-    sub    r4,    r5
-
-    shl    r5,    3
-    shl    r4,    3
-    movd   xmm5,   r5d
-    movd   xmm6,   r4d
-	mov    r5,    8
-	add    r5,    r2
-    mov    r3,    arg4
-	SIGN_EXTENTION r3, r3d
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-    movhlps    xmm0, xmm7
-	paddw      xmm0, xmm7
-	movd       retrd,  xmm0
-%ifdef X86_32
-	pop	 r5
-	pop	 r4
-	pop	 r3
-%endif
-	jmp        .return
-
-.pixel_sad_8x8_nsplit:
-    ;push   ebx
-    ;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    edx,    [esp+20]
-
-	pop r2
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	pxor   xmm6,   xmm6
-	SSE2_GetSad8x4
-    lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-    SSE2_GetSad8x4
-    movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       retrd,  xmm0
-	LOAD_4_PARA_POP
-.return:
-	ret
-
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 END
-;
-;***********************************************************************
-
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-
-%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
-	psadbw %1,   %4
-	paddw  xmm5, %1
-	psadbw %4,   %3
-	paddw  xmm4, %4
-	movdqu %4,   [%5-1]
-	psadbw %4,   %2
-	paddw  xmm6, %4
-	movdqu %4,   [%5+1]
-	psadbw %4,   %2
-	paddw  xmm7, %4
-%endmacro
-WELS_EXTERN WelsSampleSadFour16x16_sse2
-WelsSampleSadFour16x16_sse2:
-	;push ebx
-	;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    ecx,    [esp+16]
-	;mov    edx,    [esp+20]
-
-	%assign  push_num 0
-	LOAD_5_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movdqa xmm0,   [r0]
-	sub    r2,    r3
-	movdqu xmm3,   [r2]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
-
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm3,   xmm1
-	paddw  xmm4,   xmm3
-
-	movdqu xmm2,   [r2+r3-1]
-	psadbw xmm2,   xmm0
-	paddw  xmm6,   xmm2
-
-	movdqu xmm3,   [r2+r3+1]
-	psadbw xmm3,   xmm0
-	paddw  xmm7,   xmm3
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r2,    [r2+2*r3]
-	movdqu xmm3,   [r2]
-	psadbw xmm2,   xmm3
-	paddw xmm5,   xmm2
-
-	movdqu xmm2,   [r2-1]
-	psadbw xmm2,   xmm0
-	paddw xmm6,   xmm2
-
-	movdqu xmm3,   [r2+1]
-	psadbw xmm3,   xmm0
-	paddw xmm7,   xmm3
-
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm0,   xmm3
-	paddw xmm5,   xmm0
-
-	;mov        ecx,  [esp+24]
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	LOAD_5_PARA_POP
-	ret
-
-
-WELS_EXTERN WelsSampleSadFour16x8_sse2
-WelsSampleSadFour16x8_sse2:
-	;push ebx
-	;push edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-	;mov    edi,    [esp+20]
-	;mov    edx,    [esp+24]
-
-	%assign  push_num 0
-	LOAD_5_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movdqa xmm0,   [r0]
-	sub    r2,    r3
-	movdqu xmm3,   [r2]
-	psadbw xmm3,   xmm0
-	paddw xmm4,   xmm3
-
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm3,   xmm1
-	paddw xmm4,   xmm3
-
-	movdqu xmm2,   [r2+r3-1]
-	psadbw xmm2,   xmm0
-	paddw xmm6,   xmm2
-
-	movdqu xmm3,   [r2+r3+1]
-	psadbw xmm3,   xmm0
-	paddw xmm7,   xmm3
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r2,    [r2+2*r3]
-	movdqu xmm3,   [r2]
-	psadbw xmm0,   xmm3
-	paddw xmm5,   xmm0
-
-	movdqu xmm0,   [r2-1]
-	psadbw xmm0,   xmm1
-	paddw xmm6,   xmm0
-
-	movdqu xmm3,   [r2+1]
-	psadbw xmm3,   xmm1
-	paddw xmm7,   xmm3
-
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm1,   xmm3
-	paddw xmm5,   xmm1
-
-	;mov        edi,  [esp+28]
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	LOAD_5_PARA_POP
-	ret
-
-WELS_EXTERN WelsSampleSadFour8x16_sse2
-WelsSampleSadFour8x16_sse2:
-	;push ebx
-	;push edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-	;mov    edi,    [esp+20]
-	;mov    edx,    [esp+24]
-
-	%assign  push_num 0
-	LOAD_5_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-	sub    r2,    r3
-	movq   xmm3,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	;mov        edi,  [esp+28]
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	LOAD_5_PARA_POP
-	ret
-
-
-WELS_EXTERN WelsSampleSadFour8x8_sse2
-WelsSampleSadFour8x8_sse2:
-	;push ebx
-	;push edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-	;mov    edi,    [esp+20]
-	;mov    edx,    [esp+24]
-
-	%assign  push_num 0
-	LOAD_5_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-	sub    r2,    r3
-	movq   xmm3,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	;mov        edi,  [esp+28]
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	LOAD_5_PARA_POP
-	ret
-
-WELS_EXTERN WelsSampleSadFour4x4_sse2
-WelsSampleSadFour4x4_sse2:
-	;push ebx
-	;push edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-	;mov    edi,    [esp+20]
-	;mov    edx,    [esp+24]
-
-	%assign  push_num 0
-	LOAD_5_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	movd   xmm0,   [r0]
-	movd   xmm1,   [r0+r1]
-	lea        r0,    [r0+2*r1]
-	movd       xmm2,   [r0]
-	movd       xmm3,   [r0+r1]
-	punpckldq  xmm0, xmm1
-	punpckldq  xmm2, xmm3
-	punpcklqdq xmm0, xmm2
-	sub        r2,  r3
-	movd       xmm1, [r2]
-	movd       xmm2, [r2+r3]
-	punpckldq  xmm1, xmm2
-	movd       xmm2, [r2+r3-1]
-	movd       xmm3, [r2+r3+1]
-
-	lea        r2,  [r2+2*r3]
-
-	movd       xmm4, [r2]
-	movd       xmm5, [r2-1]
-	punpckldq  xmm2, xmm5
-	movd       xmm5, [r2+1]
-	punpckldq  xmm3, xmm5
-
-	movd       xmm5, [r2+r3]
-	punpckldq  xmm4, xmm5
-
-	punpcklqdq xmm1, xmm4 ;-L
-
-	movd       xmm5, [r2+r3-1]
-	movd       xmm6, [r2+r3+1]
-
-	lea        r2,  [r2+2*r3]
-	movd       xmm7, [r2-1]
-	punpckldq  xmm5, xmm7
-	punpcklqdq xmm2, xmm5 ;-1
-	movd       xmm7, [r2+1]
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm3, xmm6 ;+1
-	movd       xmm6, [r2]
-	movd       xmm7, [r2+r3]
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6 ;+L
-	psadbw     xmm1, xmm0
-	psadbw     xmm2, xmm0
-	psadbw     xmm3, xmm0
-	psadbw     xmm4, xmm0
-
-	movhlps    xmm0, xmm1
-	paddw      xmm1, xmm0
-	movhlps    xmm0, xmm2
-	paddw      xmm2, xmm0
-	movhlps    xmm0, xmm3
-	paddw      xmm3, xmm0
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	;mov        edi,  [esp+28]
-	punpckldq  xmm1, xmm4
-	punpckldq  xmm2, xmm3
-	punpcklqdq xmm1, xmm2
-	movdqa     [r4],xmm1
-	LOAD_5_PARA_POP
-	ret
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 END
-;
-;***********************************************************************
-
-WELS_EXTERN WelsSampleSad4x4_mmx
-
-align 16
-;***********************************************************************
-;   int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
-;***********************************************************************
-WelsSampleSad4x4_mmx:
-    ;push    ebx
-	;%define pushsize     4
-	;%define pix1address	 esp+pushsize+4
-	;%define pix1stride   esp+pushsize+8
-	;%define pix2address  esp+pushsize+12
-	;%define pix2stride   esp+pushsize+16
-    ;mov		  eax, [pix1address]
-    ;mov		  ebx, [pix1stride ]
-    ;mov		  ecx, [pix2address]
-    ;mov		  edx, [pix2stride ]
-
-    %assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	movd	  mm0, [r0]
-	movd	  mm1, [r0+r1]
-	punpckldq mm0, mm1
-
-	movd      mm3, [r2]
-	movd      mm4, [r2+r3]
-	punpckldq mm3, mm4
-	psadbw    mm0, mm3
-
-	lea       r0, [r0+2*r1]
-	lea       r2, [r2+2*r3]
-
-	movd      mm1, [r0]
-	movd      mm2, [r0+r1]
-	punpckldq mm1, mm2
-
-	movd      mm3, [r2]
-	movd      mm4, [r2+r3]
-	punpckldq mm3, mm4
-	psadbw    mm1, mm3
-	paddw     mm0, mm1
-
-    movd      retrd, mm0
-
-	WELSEMMS
-    LOAD_4_PARA_POP
-    ret
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -41,7 +41,6 @@
 	$(ENCODER_SRCDIR)/./core/asm/intra_pred.asm\
 	$(ENCODER_SRCDIR)/./core/asm/memzero.asm\
 	$(ENCODER_SRCDIR)/./core/asm/quant.asm\
-	$(ENCODER_SRCDIR)/./core/asm/satd_sad.asm\
 	$(ENCODER_SRCDIR)/./core/asm/score.asm\
 
 ENCODER_OBJS += $(ENCODER_ASM_SRCS:.asm=.o)
--- a/codec/processing/build/win32/WelsVP_2008.vcproj
+++ b/codec/processing/build/win32/WelsVP_2008.vcproj
@@ -634,7 +634,7 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\src\asm\sad.asm"
+				RelativePath="..\..\..\common\satd_sad.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
--- a/codec/processing/src/asm/sad.asm
+++ /dev/null
@@ -1,220 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  sad.asm
-;*
-;*  Abstract
-;*      WelsSampleSad8x8_sse21
-;*
-;*  History
-;*      8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and    %1,  0x1f|(%3>>1)
-cmp    %1,  (32-%2)|(%3>>1)
-%endmacro
-
-%macro SSE2_GetSad8x4 0
-	movq   xmm0,   [r0]
-	movq   xmm1,   [r0+r1]
-	lea    r0,     [r0+2*r1]
-	movhps xmm0,   [r0]
-	movhps xmm1,   [r0+r1]
-
-	movq   xmm2,   [r2]
-	movq   xmm3,   [r2+r3]
-	lea    r2,     [r2+2*r3]
-	movhps xmm2,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm0,   xmm2
-	psadbw xmm1,   xmm3
-	paddw  xmm6,   xmm0
-	paddw  xmm6,   xmm1
-%endmacro
-
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-WELS_EXTERN WelsSampleSad8x8_sse21
-WelsSampleSad8x8_sse21:
-    ;mov    ecx,    [esp+12]
-	;mov    edx,    ecx
-    ;CACHE_SPLIT_CHECK edx, 8, 64
-	;jle    near   .pixel_sad_8x8_nsplit
-	;push   ebx
-	;push   edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-
-	%assign  push_num 0
-	mov		r2,  arg3
-	push	r2
-	CACHE_SPLIT_CHECK r2, 8, 64
-	jle    near   .pixel_sad_8x8_nsplit
-	pop		r2
-%ifdef X86_32
-	push	r3
-	push	r4
-	push	r5
-%endif
-	%assign  push_num 3
-	mov		r0,  arg1
-	mov		r1,  arg2
-	SIGN_EXTENTION r1, r1d
-    pxor   xmm7,   xmm7
-
-    ;ecx r2, edx r4, edi r5
-
-    mov    r5,    r2
-    and    r5,    0x07
-    sub    r2,    r5
-    mov    r4,    8
-    sub    r4,    r5
-
-    shl    r5,    3
-    shl    r4,    3
-    movd   xmm5,   r5d
-    movd   xmm6,   r4d
-	mov    r5,    8
-	add    r5,    r2
-    mov    r3,    arg4
-	SIGN_EXTENTION r3, r3d
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-    movhlps    xmm0, xmm7
-	paddw      xmm0, xmm7
-	movd       retrd,  xmm0
-%ifdef X86_32
-	pop	 r5
-	pop	 r4
-	pop	 r3
-%endif
-	jmp        .return
-
-.pixel_sad_8x8_nsplit:
-    ;push   ebx
-    ;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    edx,    [esp+20]
-
-	pop r2
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	pxor   xmm6,   xmm6
-	SSE2_GetSad8x4
-    lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-    SSE2_GetSad8x4
-    movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       retrd,  xmm0
-	LOAD_4_PARA_POP
-.return:
-	ret
\ No newline at end of file
--- a/codec/processing/targets.mk
+++ b/codec/processing/targets.mk
@@ -24,7 +24,6 @@
 	$(PROCESSING_SRCDIR)/./src/asm/denoisefilter.asm\
 	$(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.asm\
 	$(PROCESSING_SRCDIR)/./src/asm/intra_pred.asm\
-	$(PROCESSING_SRCDIR)/./src/asm/sad.asm\
 	$(PROCESSING_SRCDIR)/./src/asm/vaa.asm\
 
 PROCESSING_OBJS += $(PROCESSING_ASM_SRCS:.asm=.o)