shithub: openh264

ref: 94b55520b8fd861b80e0e36ac82a553361fffdcf
dir: /codec/common/x86/satd_sad.asm/

View raw version
;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  satd_sad.asm
;*
;*  Abstract
;*      WelsSampleSatd4x4_sse2
;*      WelsSampleSatd8x8_sse2
;*      WelsSampleSatd16x8_sse2
;*      WelsSampleSatd8x16_sse2
;*      WelsSampleSatd16x16_sse2
;*
;*      WelsSampleSad16x8_sse2
;*      WelsSampleSad16x16_sse2
;*
;*  History
;*      8/5/2009 Created
;*     24/9/2009 modified
;*
;*
;*************************************************************************/

%include "asm_inc.asm"

;***********************************************************************
; Data
;***********************************************************************
SECTION .rodata align=16

align 16
; Byte multipliers: eight +1 bytes followed by four (+1,-1) pairs.
; Loaded into xmm5 and used as the pmaddubsw operand by the SSE4.1 intra SATD code.
HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
align 16
; Word multipliers: four (+1,-1) pairs. Loaded into xmm6 and used with pmaddwd.
HSumSubDW1:   dw 1,-1,1,-1,1,-1,1,-1
align 16
; Eight words of +1. Loaded into xmm7 and used with pmaddwd.
PDW1:  dw 1,1,1,1,1,1,1,1
align 16
; The value 2 in the low word of each qword; added with paddd as the rounding
; term of the "(sum+2)>>2" chroma DC step (and doubled for "(sum1+sum2+4)>>3").
PDQ2:  dw 2,0,0,0,2,0,0,0
align 16
; Byte pattern 1,1,1,1,1,-1,1,-1 repeated twice.
; NOTE(review): no user of this table is visible in the reviewed part of the file.
HSwapSumSubDB1:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1

;***********************************************************************
; Code
;***********************************************************************
SECTION .text

;***********************************************************************
;
;Pixel_satd_wxh_sse2 BEGIN
;
;***********************************************************************
; MMX_DW_1_2REG dst, ones
;   dst  <- 1 in every 16-bit lane
;   ones <- all bits set (every 16-bit lane = -1)
; The two registers are expected to be distinct.
%macro MMX_DW_1_2REG 2
    pcmpeqw     %2, %2          ; ones = 0xFFFF per word
    pxor        %1, %1          ; dst  = 0
    psubw       %1, %2          ; dst  = 0 - (-1) = 1 per word
%endmacro

; SSE2_SumWHorizon1 acc, tmp
;   Horizontally adds the eight words of acc using unsigned saturating adds.
;   acc is folded onto itself after byte shifts of 8, 4 and 2, so the total
;   ends up in word 0 of acc (the other words hold partial sums).
;   tmp is clobbered.
%macro SSE2_SumWHorizon1 2
    movdqa      %2, %1
    psrldq      %2, 8           ; bring words 4..7 down onto words 0..3
    paddusw     %1, %2
    movdqa      %2, %1
    psrldq      %2, 4           ; bring words 2..3 down onto words 0..1
    paddusw     %1, %2
    movdqa      %2, %1
    psrldq      %2, 2           ; bring word 1 down onto word 0
    paddusw     %1, %2
%endmacro

; SSE2_HDMTwo4x4 a, b, c, d, tmp
;   Applies four SSE2_SumSub butterflies: (a,b) and (c,d) first, then (b,d)
;   and (a,c), using tmp as the scratch register for each step.
;   SSE2_SumSub is defined outside this file section; which operand receives
;   the sum and which the difference is described by the in/pOut note below.
%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
    SSE2_SumSub %1, %2, %5
    SSE2_SumSub %3, %4, %5
    SSE2_SumSub %2, %4, %5
    SSE2_SumSub %1, %3, %5
%endmacro

; SSE2_SumAbs4 a, b, tmp1, c, d, tmp2, acc
;   Replaces a, b, c, d by their per-word absolute values (WELS_AbsW, with
;   tmp1 as scratch for a/b and tmp2 as scratch for c/d), then adds all four
;   into acc with unsigned saturating word adds.
;   On exit a holds |a|+|b| and c holds |c|+|d| (saturated).
%macro SSE2_SumAbs4 7
    WELS_AbsW %1, %3
    WELS_AbsW %2, %3
    WELS_AbsW %4, %6
    WELS_AbsW %5, %6
    paddusw       %1, %2
    paddusw       %4, %5
    paddusw       %7, %1
    paddusw       %7, %4
%endmacro

; SSE2_SumWHorizon acc, tmp, hi
;   Sums the eight words d0..d7 of acc; the total is left in dword 0 of acc.
;   The first fold is a 16-bit add (paddw); the partial sums are then
;   interleaved with the words of hi and added as dwords.
;   tmp is clobbered.
;   NOTE(review): callers in this file pass xmm7 as hi after clearing it with
;   pxor, which makes the interleave a zero-extension - assumes the SATD
;   macros run in between leave xmm7 zero; confirm against asm_inc.asm.
%macro SSE2_SumWHorizon 3
    movhlps     %2, %1          ; x2 = xx xx xx xx d7 d6 d5 d4
    paddw       %1, %2          ; x1 = xx xx xx xx d37 d26 d15 d04
    punpcklwd   %1, %3          ; x1 =  d37  d26 d15 d04
    movhlps     %2, %1          ; x2 = xxxx xxxx d37 d26
    paddd       %1, %2          ; x1 = xxxx xxxx d1357 d0246
    pshuflw     %2, %1, 0x4e    ; x2 = xxxx xxxx d0246 d1357
    paddd       %1, %2          ; x1 = xxxx xxxx xxxx  d01234567
%endmacro

; Helper for SSE2_GetSatd8x8: handles four consecutive 8-pixel rows.
;   Loads the row differences between [r0] (stride r1) and [r2] (stride r3)
;   into xmm0..xmm3, runs butterfly / transpose / butterfly on them and adds
;   the absolute values of the result into the word accumulator xmm6.
;   r0 and r2 are advanced by two rows. xmm7 is handed to SSE2_LoadDiff8P
;   unchanged; xmm0..xmm5 are clobbered.
%macro SSE2_GetSatd8x8_FourRows 0
    SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
    SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
    lea                 r0, [r0+2*r1]
    lea                 r2, [r2+2*r3]
    SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
    SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]

    SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
    SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
    SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
    SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
%endmacro

; SSE2_GetSatd8x8
;   Accumulates the transformed absolute differences of an 8x8 block into
;   xmm6 as two passes of four rows each. On exit r0/r2 point six rows below
;   where they started (three two-row advances in total).
%macro SSE2_GetSatd8x8 0
    SSE2_GetSatd8x8_FourRows            ; rows 0..3

    lea                 r0,    [r0+2*r1] ; step from row 2 to row 4
    lea                 r2,    [r2+2*r3]
    SSE2_GetSatd8x8_FourRows            ; rows 4..7
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; 4x4 SATD between the block at r0 (stride r1) and the block at r2 (stride r3).
; Result is returned in retrd: the saturated 16-bit sum of absolute
; transformed differences, halved.
WELS_EXTERN WelsSampleSatd4x4_sse2
    %assign  push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; fetch the four 4-byte rows of both blocks, two rows at a time
    movd      xmm0, [r0]
    movd      xmm4, [r2]
    movd      xmm1, [r0+r1]
    movd      xmm5, [r2+r3]
    lea       r0 , [r0+2*r1]
    lea       r2 , [r2+2*r3]
    movd      xmm2, [r0]
    movd      xmm6, [r2]
    movd      xmm3, [r0+r1]
    movd      xmm7, [r2+r3]

    ; pack rows 0|2 and rows 1|3 of each block into one register
    punpckldq xmm0, xmm2
    punpckldq xmm4, xmm6
    punpckldq xmm1, xmm3
    punpckldq xmm5, xmm7

    ; widen bytes to words and form the differences (xmm6 stays zero for later)
    pxor      xmm6, xmm6
    punpcklbw xmm0, xmm6
    punpcklbw xmm4, xmm6
    psubw     xmm0, xmm4
    punpcklbw xmm1, xmm6
    punpcklbw xmm5, xmm6
    psubw     xmm1, xmm5

    ; first butterfly pair
    movdqa    xmm2, xmm0
    psubw     xmm2, xmm1
    paddw     xmm0, xmm1
    SSE2_XSawp qdq, xmm0, xmm2, xmm3

    movdqa     xmm4, xmm0
    psubw      xmm4, xmm3
    paddw      xmm0, xmm3

    ; rearrange words between the two passes
    movdqa         xmm2, xmm0
    punpcklwd      xmm0, xmm4
    punpckhwd      xmm4, xmm2

    SSE2_XSawp     dq,  xmm0, xmm4, xmm3
    SSE2_XSawp     qdq, xmm0, xmm3, xmm5

    ; second butterfly pair
    movdqa         xmm7, xmm0
    psubw          xmm7, xmm5
    paddw          xmm0, xmm5

    SSE2_XSawp     qdq,  xmm0, xmm7, xmm1

    movdqa         xmm2, xmm0
    psubw          xmm2, xmm1
    paddw          xmm0, xmm1

    ; sum of absolute values, accumulated in the zeroed xmm6
    WELS_AbsW  xmm0, xmm3
    paddusw        xmm6, xmm0
    WELS_AbsW  xmm2, xmm4
    paddusw        xmm6, xmm2
    SSE2_SumWHorizon1  xmm6, xmm4
    movd           retrd,  xmm6
    and            retrd,  0xffff       ; only word 0 holds the total
    shr            retrd,  1
    POP_XMM
    LOAD_4_PARA_POP
    ret

 ;***********************************************************************
 ;
 ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
 ;
 ;***********************************************************************
; 8x8 SATD. As used below, r0/r1 are the first block's pointer and stride and
; r2/r3 the second block's. The result is returned in retrd.
WELS_EXTERN WelsSampleSatd8x8_sse2
    %assign  push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor   xmm6,   xmm6             ; xmm6 = per-column word accumulator
    pxor   xmm7,   xmm7             ; xmm7 = 0, passed to the load and horizontal-sum macros
    SSE2_GetSatd8x8
    psrlw   xmm6,  1                ; halve each column sum before the horizontal add
    SSE2_SumWHorizon   xmm6,xmm4,xmm7
    movd    retrd,   xmm6
    POP_XMM
    LOAD_4_PARA_POP
    ret

 ;***********************************************************************
 ;
 ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
 ;
 ;***********************************************************************
; 8x16 SATD: two vertically stacked 8x8 passes accumulated in xmm6.
; As used below, r0/r1 and r2/r3 are the two blocks' pointers and strides.
; The result is returned in retrd.
WELS_EXTERN WelsSampleSatd8x16_sse2
    %assign  push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor   xmm6,   xmm6             ; xmm6 = per-column word accumulator
    pxor   xmm7,   xmm7             ; xmm7 = 0, passed to the load and horizontal-sum macros

    SSE2_GetSatd8x8                 ; rows 0..7; leaves r0/r2 six rows down
    lea    r0,    [r0+2*r1]         ; move on to row 8
    lea    r2,    [r2+2*r3]
    SSE2_GetSatd8x8                 ; rows 8..15

    psrlw   xmm6,  1                ; halve each column sum before the horizontal add
    SSE2_SumWHorizon   xmm6,xmm4,xmm7
    movd    retrd,   xmm6
    POP_XMM
    LOAD_4_PARA_POP
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; 16x8 SATD: the left and right 8x8 halves are processed one after the other
; and accumulated in xmm6.
; As used below, r0/r1 and r2/r3 are the two blocks' pointers and strides.
; The result is returned in retrd.
WELS_EXTERN WelsSampleSatd16x8_sse2
    %assign  push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    push r0                         ; keep the block origins; SSE2_GetSatd8x8 advances r0/r2
    push r2
    pxor   xmm6,   xmm6             ; xmm6 = per-column word accumulator
    pxor   xmm7,   xmm7             ; xmm7 = 0, passed to the load and horizontal-sum macros

    SSE2_GetSatd8x8                 ; left 8 columns

    pop r2
    pop r0
    add    r0,    8                 ; right 8 columns
    add    r2,    8
    SSE2_GetSatd8x8

    psrlw   xmm6,  1                ; halve each column sum before the horizontal add
    SSE2_SumWHorizon   xmm6,xmm4,xmm7
    movd    retrd,   xmm6
    POP_XMM
    LOAD_4_PARA_POP
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; 16x16 SATD: four 8x8 passes (left column top/bottom, then right column
; top/bottom) accumulated in xmm6.
; As used below, r0/r1 and r2/r3 are the two blocks' pointers and strides.
; The result is returned in retrd.
WELS_EXTERN WelsSampleSatd16x16_sse2
    %assign  push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    push r0                         ; keep the block origins; SSE2_GetSatd8x8 advances r0/r2
    push r2
    pxor   xmm6,   xmm6             ; xmm6 = per-column word accumulator
    pxor   xmm7,   xmm7             ; xmm7 = 0, passed to the load and horizontal-sum macros

    SSE2_GetSatd8x8                 ; left half, rows 0..7
    lea    r0,    [r0+2*r1]         ; move on to row 8
    lea    r2,    [r2+2*r3]
    SSE2_GetSatd8x8                 ; left half, rows 8..15

    pop r2
    pop r0
    add    r0,    8                 ; right half
    add    r2,    8

    SSE2_GetSatd8x8                 ; right half, rows 0..7
    lea    r0,    [r0+2*r1]
    lea    r2,    [r2+2*r3]
    SSE2_GetSatd8x8                 ; right half, rows 8..15

 ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
    psrlw   xmm6,  1
    SSE2_SumWHorizon   xmm6,xmm4,xmm7
    movd    retrd,   xmm6
    POP_XMM
    LOAD_4_PARA_POP
    ret

;***********************************************************************
;
;Pixel_satd_wxh_sse2 END
;
;***********************************************************************

;***********************************************************************
;
;Pixel_satd_intra_sse2 BEGIN
;
;***********************************************************************


; SSE_DB_1_2REG dst, ones
;   dst  <- 1 in every byte lane
;   ones <- all bits set
; The two registers are expected to be distinct.
%macro SSE_DB_1_2REG 2
    pcmpeqw     %2, %2          ; ones = 0xFF in every byte
    pxor        %1, %1          ; dst  = 0
    psubb       %1, %2          ; dst  = 0 - (-1) = 1 per byte
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
;                             uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
;
;***********************************************************************
; WelsSampleSatdThree4x4_sse2
;   Evaluates the DC, H and V 4x4 intra predictions against the source block
;   in the transform domain and keeps the cheapest one.
;     arg1/arg2 : pDec and its stride; the row above pDec and the column to
;                 its left supply the boundary samples
;     arg3/arg4 : pEnc and its stride; the 4x4 source block
;     arg5      : pRed, receives the 16 bytes of the chosen prediction
;                 (written with movdqa, so it must be 16-byte aligned)
;     arg6      : pBestMode, receives 2 (DC), 1 (H) or 0 (V)
;     arg7..9   : added to the DC, H and V costs respectively before comparing
;   Returns the chosen cost in retrd.
;   Costs are compared as full signed 32-bit values, matching what is returned.
;   On ties the order of preference is DC, then H, then V.
WELS_EXTERN WelsSampleSatdThree4x4_sse2

%ifdef X86_32
    push r3
    push r4
    push r5
    push r6
    %assign  push_num 4
%else
    %assign  push_num 0
%endif
    PUSH_XMM 8

    mov  r2, arg3
    mov  r3, arg4
    SIGN_EXTENSION r3, r3d

    ; load source 4x4 samples and Hadamard transform
    movd      xmm0, [r2]
    movd      xmm1, [r2+r3]
    lea       r2 , [r2+2*r3]
    movd      xmm2, [r2]
    movd      xmm3, [r2+r3]
    punpckldq xmm0, xmm2
    punpckldq xmm1, xmm3

    pxor      xmm6, xmm6
    punpcklbw xmm0, xmm6
    punpcklbw xmm1, xmm6

    movdqa    xmm2, xmm0
    paddw     xmm0, xmm1
    psubw     xmm2, xmm1
    SSE2_XSawp  qdq, xmm0, xmm2, xmm3

    movdqa    xmm4, xmm0
    paddw     xmm0, xmm3
    psubw     xmm4, xmm3

    movdqa    xmm2, xmm0
    punpcklwd xmm0, xmm4
    punpckhwd xmm4, xmm2

    SSE2_XSawp  dq,  xmm0, xmm4, xmm3
    SSE2_XSawp  qdq, xmm0, xmm3, xmm5

    movdqa    xmm7, xmm0
    paddw     xmm0, xmm5
    psubw     xmm7, xmm5

    SSE2_XSawp  qdq,  xmm0, xmm7, xmm1

    ; Hadamard transform results are saved in xmm0 and xmm2
    movdqa    xmm2, xmm0
    paddw     xmm0, xmm1
    psubw     xmm2, xmm1

    ;load top boundary samples: [a b c d]
    mov r0, arg1
    mov r1, arg2
    SIGN_EXTENSION r1, r1d
    sub r0, r1
%ifdef UNIX64
    ; r4/r5 are used as scratch below; saved here and restored before arg5/arg6 are read
    push r4
    push r5
%endif

    movzx     r2d,  byte [r0]
    movzx     r3d,  byte [r0+1]
    movzx     r4d,  byte [r0+2]
    movzx     r5d,  byte [r0+3]

    ; get the transform results of top boundary samples: [a b c d]
    add       r3d, r2d ; r3d = a + b
    add       r5d, r4d ; r5d = c + d
    add       r2d, r2d ; r2d = a + a
    add       r4d, r4d ; r4d = c + c
    sub       r2d, r3d ; r2d = a + a - a - b = a - b
    sub       r4d, r5d ; r4d = c + c - c - d = c - d
    add       r5d, r3d ; r5d = (a + b) + (c + d)
    add       r3d, r3d
    sub       r3d, r5d ; r3d = (a + b) - (c + d)
    add       r4d, r2d ; r4d = (a - b) + (c - d)
    add       r2d, r2d
    sub       r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]

    movdqa    xmm6, xmm0 ; keep copies of the source transform for the H and DC passes
    movdqa    xmm7, xmm2
    movd      xmm5, r5d ; keep the top-row sum (a+b+c+d) for DC mode
    pxor      xmm3, xmm3
    pxor      xmm4, xmm4
    pinsrw    xmm3, r5d, 0
    pinsrw    xmm3, r4d, 4
    psllw     xmm3, 2
    pinsrw    xmm4, r3d, 0
    pinsrw    xmm4, r2d, 4
    psllw     xmm4, 2

    ; get the satd of V (prediction built from the top boundary row)
    psubw     xmm0, xmm3
    psubw     xmm2, xmm4

    WELS_AbsW  xmm0, xmm1
    WELS_AbsW  xmm2, xmm1
    paddusw        xmm0, xmm2
    SSE2_SumWHorizon1  xmm0, xmm1 ; satd of V is stored in xmm0

    ;load left boundary samples: [a b c d]'
    add r0, r1

    movzx     r2d,  byte [r0-1]
    movzx     r3d,  byte [r0+r1-1]
    lea       r0 , [r0+2*r1]
    movzx     r4d,  byte [r0-1]
    movzx     r5d,  byte [r0+r1-1]

    ; get the transform results of left boundary samples: [a b c d]'
    add       r3d, r2d ; r3d = a + b
    add       r5d, r4d ; r5d = c + d
    add       r2d, r2d ; r2d = a + a
    add       r4d, r4d ; r4d = c + c
    sub       r2d, r3d ; r2d = a + a - a - b = a - b
    sub       r4d, r5d ; r4d = c + c - c - d = c - d
    add       r5d, r3d ; r5d = (a + b) + (c + d)
    add       r3d, r3d
    sub       r3d, r5d ; r3d = (a + b) - (c + d)
    add       r4d, r2d ; r4d = (a - b) + (c - d)
    add       r2d, r2d
    sub       r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]

    ; store the transform results in xmm3
    movd      xmm3, r5d
    pinsrw    xmm3, r3d, 1
    pinsrw    xmm3, r2d, 2
    pinsrw    xmm3, r4d, 3
    psllw     xmm3, 2

    ; get the satd of H (prediction built from the left boundary column)
    movdqa    xmm2, xmm6
    movdqa    xmm4, xmm7
    psubw     xmm2, xmm3
    WELS_AbsW  xmm2, xmm1
    WELS_AbsW  xmm4, xmm1
    paddusw        xmm2, xmm4
    SSE2_SumWHorizon1  xmm2, xmm1 ; satd of H is stored in xmm2

    ; DC result is stored in xmm1
    add       r5d, 4            ; left-column sum + rounding
    movd      xmm1, r5d
    paddw     xmm1, xmm5        ; + top-row sum
    psrlw     xmm1, 3           ; DC = (top + left + 4) >> 3
    movdqa    xmm5, xmm1        ; keep the DC sample value for the prediction store
    psllw     xmm1, 4           ; DC * 16, subtracted from coefficient 0 below

    ; get the satd of DC
    psubw          xmm6, xmm1
    WELS_AbsW  xmm6, xmm1
    WELS_AbsW  xmm7, xmm1
    paddusw        xmm6, xmm7
    SSE2_SumWHorizon1  xmm6, xmm1 ; satd of DC is stored in xmm6
%ifdef UNIX64
    pop r5
    pop r4
%endif
    ; comparing order: DC H V

    mov  r4, arg5
    movd      r2d, xmm6
    movd      r3d, xmm2
    movd      r6d, xmm0

    ; only word 0 of each sum is meaningful; halve it, then add the per-mode cost
    and       r2d, 0xffff
    shr       r2d, 1
    and       r3d, 0xffff
    shr       r3d, 1
    and       r6d, 0xffff
    shr       r6d, 1
    add       r2d, dword arg7
    add       r3d, dword arg8
    add       r6d, dword arg9
    ; compare the full 32-bit costs (the same values that are returned)
    cmp       r2d, r3d
    jg near   not_dc
    cmp       r2d, r6d
    jg near   not_dc_h

    ; for DC mode
    movd      r3d, xmm5
    imul      r3d, 0x01010101   ; replicate the DC byte into all four bytes
    movd      xmm5, r3d
    pshufd    xmm5, xmm5, 0
    movdqa    [r4], xmm5
    mov r5, arg6
    mov       dword [r5], 0x02
    mov retrd, r2d
    POP_XMM
%ifdef X86_32
    pop r6
    pop r5
    pop r4
    pop r3
%endif
    ret

not_dc:
    cmp       r3d, r6d
    jg near   not_dc_h

    ; for H mode
    SSE_DB_1_2REG  xmm6, xmm7   ; xmm6 = 0x01 in every byte, used to replicate a byte via pmuludq
    sub        r0, r1           ; r0 was left two rows down; go back to row 0
    sub        r0, r1
    movzx      r6d,  byte [r0-1]
    movd       xmm0, r6d
    pmuludq    xmm0, xmm6

    movzx     r6d,  byte [r0+r1-1]
    movd      xmm1, r6d
    pmuludq   xmm1, xmm6
    punpckldq xmm0, xmm1

    lea       r0,   [r0+r1*2]
    movzx     r6d,  byte [r0-1]
    movd      xmm2, r6d
    pmuludq   xmm2, xmm6

    movzx     r6d,  byte [r0+r1-1]
    movd      xmm3, r6d
    pmuludq   xmm3, xmm6
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2

    movdqa    [r4],xmm0

    mov       retrd, r3d
    mov r5, arg6
    mov       dword [r5], 0x01
    POP_XMM
%ifdef X86_32
    pop r6
    pop r5
    pop r4
    pop r3
%endif
    ret
not_dc_h:
    ; for V mode: r0 was left two rows down; step back three rows to the row above the block
    sub        r0, r1
    sub        r0, r1
    sub        r0, r1
    movd      xmm0, [r0]
    pshufd    xmm0, xmm0, 0
    movdqa    [r4],xmm0
    mov       retrd, r6d
    mov r5, arg6
    mov       dword [r5], 0x00
    POP_XMM
%ifdef X86_32
    pop r6
    pop r5
    pop r4
    pop r3
%endif
    ret


; SSE41_I16x16Get8WSumSub val, tmp1, tmp2
;   Multiplies the bytes in val by the xmm5 pattern (pmaddubsw), then forms
;   dword products with xmm7 and xmm6 (pmaddwd), interleaves them, adds both
;   interleaved halves into the dword accumulator xmm4 ("for dc"), and
;   finally packs the result back to words in val and shifts it left by 2.
;   tmp1 and tmp2 are clobbered.
%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
    pmaddubsw    %1, xmm5
    movdqa       %2, %1
    pmaddwd      %1, xmm7
    pmaddwd      %2, xmm6
    movdqa       %3, %1
    punpckldq    %1, %2
    punpckhdq    %2, %3
    movdqa       %3, %1
    punpcklqdq   %1, %2
    punpckhqdq   %3, %2
    paddd        xmm4, %1 ;for dc
    paddd        xmm4, %3 ;for dc
    packssdw     %1, %3
    psllw        %1, 2
%endmacro
; SSE41_ChromaGet8WSumSub val, tmp1, tmp2, sums
;   Same multiply / interleave / pack sequence as SSE41_I16x16Get8WSumSub,
;   but instead of accumulating into xmm4 it stores the low qwords of the two
;   interleaved dword vectors in sums (the fourth argument).
;   tmp1 and tmp2 are clobbered.
%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
    pmaddubsw    %1, xmm5
    movdqa       %2, %1
    pmaddwd      %1, xmm7
    pmaddwd      %2, xmm6
    movdqa       %3, %1
    punpckldq    %1, %2
    punpckhdq    %2, %3
    movdqa       %3, %1
    punpcklqdq   %1, %2
    punpckhqdq   %3, %2
;    paddd        xmm4, %1 ;for dc
;    paddd        xmm4, %3 ;for dc
    movdqa       %4, %1
    punpcklqdq   %4, %3
    packssdw     %1, %3
    psllw        %1, 2
%endmacro

; SSE41_GetX38x4SatdDec
;   Loads four 8-byte rows from [r2] (stride r3), widens them to words and
;   runs butterfly / transpose / butterfly on them. r2 is advanced by four
;   rows. The transformed rows are left in xmm7, xmm1, xmm3, xmm2 (see the
;   pOut note below); xmm0 is clobbered.
%macro SSE41_GetX38x4SatdDec 0
    pxor        xmm7,   xmm7
    movq        xmm0,   [r2]
    movq        xmm1,   [r2+r3]
    lea         r2,    [r2+2*r3]
    movq        xmm2,   [r2]
    movq        xmm3,   [r2+r3]
    lea         r2,    [r2+2*r3]
    punpcklbw   xmm0,   xmm7
    punpcklbw   xmm1,   xmm7
    punpcklbw   xmm2,   xmm7
    punpcklbw   xmm3,   xmm7
    SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
    SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
    SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
    ;doesn't need another transpose
%endmacro

; Helper for SSE41_GetX38x4SatdV: handles one transformed row.
;   %1 = offset from r6 of the word placed in lane 0; the word placed in
;        lane 4 is read 8 bytes further on
;   %2 = register holding the transformed source row
;   Adds |table words - row| to the V accumulator xmm4. Clobbers xmm0.
%macro SSE41_GetX38x4SatdV_Row 2
    pxor        xmm0,   xmm0
    pinsrw      xmm0,   word[r6+%1],   0
    pinsrw      xmm0,   word[r6+%1+8], 4
    psubsw      xmm0,   %2
    pabsw       xmm0,   xmm0
    paddw       xmm4,   xmm0
%endmacro

; SSE41_GetX38x4SatdV idx, off
;   Accumulates the V cost of the 8x4 strip whose transformed rows are in
;   xmm7, xmm1, xmm3, xmm2. off is the offset of this strip's entries in the
;   table at r6; idx is accepted for symmetry with the H macro and not used.
%macro SSE41_GetX38x4SatdV 2
    SSE41_GetX38x4SatdV_Row %2,   xmm7
    SSE41_GetX38x4SatdV_Row %2+2, xmm1
    SSE41_GetX38x4SatdV_Row %2+4, xmm3
    SSE41_GetX38x4SatdV_Row %2+6, xmm2
%endmacro
; SSE41_GetX38x4SatdH idx, off, base
;   Accumulates the H cost of the current 8x4 strip into xmm5.
;   Only the first transformed row (xmm7) is compared against the table
;   entry at [r6 + base + 8*idx]; the other rows contribute their absolute
;   values. Their sum is left in xmm2 for the DC macro that follows.
;   off is not used. Clobbers xmm0; xmm1 and xmm3 become absolute values.
%macro SSE41_GetX38x4SatdH  3
    ; |row1| + |row2| + |row3|, shared with the DC cost
    pabsw       xmm1,   xmm1
    pabsw       xmm3,   xmm3
    pabsw       xmm2,   xmm2
    paddw       xmm2,   xmm1;for DC
    paddw       xmm2,   xmm3;for DC

    ; |table entry - row0|
    movq        xmm0,   [r6+%3+8*%1]
    punpcklqdq  xmm0,   xmm0
    psubsw      xmm0,   xmm7
    pabsw       xmm0,   xmm0

    paddw       xmm5,   xmm0
    paddw       xmm5,   xmm2
%endmacro
; SSE41_I16X16GetX38x4SatdDC
;   Accumulates the DC cost of the current 8x4 strip into xmm6: the value
;   kept in mm4 is duplicated into both qwords, compared against the first
;   transformed row (xmm7), and xmm2 (the row sum left by the H macro) is
;   added on top. Clobbers xmm0.
%macro SSE41_I16X16GetX38x4SatdDC 0
    pxor        xmm0,   xmm0
    movq2dq     xmm0,   mm4
    punpcklqdq  xmm0,   xmm0
    psubsw      xmm0,   xmm7
    pabsw       xmm0,   xmm0
    paddw       xmm6,   xmm0
    paddw       xmm6,   xmm2
%endmacro
; SSE41_ChromaGetX38x4SatdDC idx
;   Accumulates the DC cost of the current 8x4 strip into xmm6 using the
;   16-byte table entry at [r6 + 32 + 16*idx].
;   Note: idx is modified in place (shifted left by 4). Clobbers xmm0.
%macro SSE41_ChromaGetX38x4SatdDC 1
    shl         %1,     4
    movdqa      xmm0,   [r6+32+%1]
    psubsw      xmm0,   xmm7
    pabsw       xmm0,   xmm0
    paddw       xmm6,   xmm0
    paddw       xmm6,   xmm2
%endmacro
; SSE41_I16x16GetX38x4Satd idx, off
;   One 8x4 strip of the 16x16 evaluation: transform the source rows, then
;   accumulate the V (xmm4), H (xmm5) and DC (xmm6) costs.
;   The H table is addressed at r6+32.
%macro SSE41_I16x16GetX38x4Satd 2
    SSE41_GetX38x4SatdDec
    SSE41_GetX38x4SatdV   %1, %2
    SSE41_GetX38x4SatdH   %1, %2, 32
    SSE41_I16X16GetX38x4SatdDC
%endmacro
; SSE41_ChromaGetX38x4Satd idx, off
;   One 8x4 strip of the chroma 8x8 evaluation: transform the source rows,
;   then accumulate the V (xmm4), H (xmm5) and DC (xmm6) costs.
;   The H table is addressed at r6+16. idx is shifted left by 4 by the DC
;   step, so it does not keep its value across this macro.
%macro SSE41_ChromaGetX38x4Satd 2
    SSE41_GetX38x4SatdDec
    SSE41_GetX38x4SatdV   %1, %2
    SSE41_GetX38x4SatdH   %1, %2, 16
    SSE41_ChromaGetX38x4SatdDC %1
%endmacro
; SSE41_HSum8W acc, mul, tmp
;   Horizontal sum of the eight words of acc: pmaddwd with mul (callers pass
;   a register of word 1s) yields four dword pair sums, which are then
;   folded down. The total is left in dword 0 of acc. tmp is clobbered.
%macro SSE41_HSum8W 3
    pmaddwd     %1, %2
    movhlps     %3, %1
    paddd       %1, %3
    pshuflw     %3, %1,0Eh
    paddd       %1, %3
%endmacro

; WelsIntra16x16Combined3Satd_sse41
;   Evaluates the V, H and DC 16x16 intra predictions in the transform domain
;   and reports the cheapest one.
;   Register roles as used below (after LOAD_7_PARA):
;     r0/r1 : reconstructed block pointer and stride; the row above and the
;             column to the left supply the boundary samples
;     r2/r3 : source block pointer and stride
;     arg5  : receives the chosen mode: 2 (DC), 1 (H) or 0 (V)
;     r5    : cost term; twice its value is added to the H and DC costs
;     r6    : 64-byte scratch table written with movdqa (must be 16-byte
;             aligned): V entries at +0/+16, H entries at +32/+48
;   Returns the chosen cost in retrd. Uses mm4, so WELSEMMS is issued on exit.
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
    %assign  push_num 0
    LOAD_7_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    ; r12 keeps the original source pointer for the second (right-hand) pass
    push r12
    mov  r12, r2
%endif

    pxor        xmm4,   xmm4
%ifdef X86_32_PICASM
    ; build the HSumSubDB1 / HSumSubDW1 / PDW1 constants on the stack instead of
    ; reading them through absolute addresses
    push        0xff01ff01
    push        0xff01ff01
    push        0x01010101
    push        0x01010101
    movdqu      xmm5,   [esp]
    push        0xffff0001
    push        0xffff0001
    push        0xffff0001
    push        0xffff0001
    movdqu      xmm6,   [esp]
    push        0x00010001
    push        0x00010001
    push        0x00010001
    push        0x00010001
    movdqu      xmm7,   [esp]
    add         esp, 48
%else
    movdqa      xmm5,   [HSumSubDB1]
    movdqa      xmm6,   [HSumSubDW1]
    movdqa      xmm7,   [PDW1]
%endif
    ; top boundary row: 16 bytes from the row above the block
    sub         r0,    r1
    movdqu      xmm0,   [r0]
    movhlps     xmm1,   xmm0
    punpcklqdq  xmm0,   xmm0
    punpcklqdq  xmm1,   xmm1
    SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
    SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
    movdqa      [r6],  xmm0 ;V
    movdqa      [r6+16], xmm1
    ; left boundary column: gather the 16 bytes at column -1, two rows per step
    add         r0,    r1
    pinsrb      xmm0,   byte[r0-1], 0
    pinsrb      xmm0,   byte[r0+r1-1], 1
    lea         r0,    [r0+2*r1]
    pinsrb      xmm0,   byte[r0-1],     2
    pinsrb      xmm0,   byte[r0+r1-1], 3
    lea         r0,    [r0+2*r1]
    pinsrb      xmm0,   byte[r0-1],     4
    pinsrb      xmm0,   byte[r0+r1-1], 5
    lea         r0,    [r0+2*r1]
    pinsrb      xmm0,   byte[r0-1],     6
    pinsrb      xmm0,   byte[r0+r1-1], 7
    lea         r0,    [r0+2*r1]
    pinsrb      xmm0,   byte[r0-1],     8
    pinsrb      xmm0,   byte[r0+r1-1], 9
    lea         r0,    [r0+2*r1]
    pinsrb      xmm0,   byte[r0-1],     10
    pinsrb      xmm0,   byte[r0+r1-1], 11
    lea         r0,    [r0+2*r1]
    pinsrb      xmm0,   byte[r0-1],     12
    pinsrb      xmm0,   byte[r0+r1-1], 13
    lea         r0,    [r0+2*r1]
    pinsrb      xmm0,   byte[r0-1],     14
    pinsrb      xmm0,   byte[r0+r1-1], 15
    movhlps     xmm1,   xmm0
    punpcklqdq  xmm0,   xmm0
    punpcklqdq  xmm1,   xmm1
    SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
    SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
    movdqa      [r6+32], xmm0 ;H
    movdqa      [r6+48], xmm1
    ; xmm4 accumulated the boundary sums in the four SumSub calls above
    movd        r0d,    xmm4 ;dc
    add         r0d,    16   ;(sum+16)
    shr         r0d,    5    ;((sum+16)>>5)
    shl         r0d,    4    ;
    movd        mm4,    r0d  ; mm4 copy DC
    pxor        xmm4,   xmm4 ;V
    pxor        xmm5,   xmm5 ;H
    pxor        xmm6,   xmm6 ;DC
%ifdef UNIX64
    ; r4 is used as a loop offset below; saved here and restored after the loop
    push r4
%endif
    ; r0 = 8x4 strip index (0..3), r4 = offset into the V table (0, then 16)
    mov         r0,    0
    mov         r4,    0

.loop16x16_get_satd:
.loopStart1:
    SSE41_I16x16GetX38x4Satd r0, r4
    inc          r0
    cmp         r0, 4
    jl          .loopStart1
    cmp         r4, 16
    je          .loop16x16_get_satd_end
    ; second pass: restart from the original source pointer, 8 columns to the right
%ifdef X86_32
    mov r2, arg3
%else
    mov r2, r12
%endif
    add         r2, 8
    mov         r0, 0
    add         r4, 16
    jmp         .loop16x16_get_satd
 .loop16x16_get_satd_end:
    MMX_DW_1_2REG    xmm0, xmm1 ; xmm0 = word 1s for the horizontal sums
    psrlw       xmm4, 1 ;/2
    psrlw       xmm5, 1 ;/2
    psrlw       xmm6, 1 ;/2
    SSE41_HSum8W     xmm4, xmm0, xmm1
    SSE41_HSum8W     xmm5, xmm0, xmm1
    SSE41_HSum8W     xmm6, xmm0, xmm1

%ifdef UNIX64
    pop r4
%endif
    ; comparing order: DC H V
    movd      r3d, xmm6 ;DC
    movd      r1d, xmm5 ;H
    movd      r0d, xmm4 ;V
%ifndef X86_32
    pop r12
%endif
    ; add twice the r5 cost term to the H and DC costs; V is left as is
    shl       r5d, 1
    add       r1d, r5d
    add       r3d, r5d
    mov       r4, arg5
    cmp       r3d, r1d
    jge near   not_dc_16x16
    cmp        r3d, r0d
    jge near   not_dc_h_16x16

    ; for DC mode
    mov       dword[r4], 2;I16_PRED_DC
    mov       retrd, r3d
    jmp near return_satd_intra_16x16_x3
not_dc_16x16:
    ; for H mode
    cmp       r1d, r0d
    jge near   not_dc_h_16x16
    mov       dword[r4], 1;I16_PRED_H
    mov       retrd, r1d
    jmp near return_satd_intra_16x16_x3
not_dc_h_16x16:
    ; for V mode
    mov       dword[r4], 0;I16_PRED_V
    mov       retrd, r0d
return_satd_intra_16x16_x3:
    WELSEMMS
    POP_XMM
    LOAD_7_PARA_POP
ret

; SSE41_ChromaGetX38x8Satd
;   V / H / DC cost accumulation for one 8x8 chroma plane.
;   In : r0/r1 = reconstructed block pointer and stride, r2/r3 = source block
;        pointer and stride, r6 = 64-byte scratch table accessed with movdqa
;        (V at +0, H at +16, DC entries at +32 and +48).
;   Out: xmm4 = V, xmm5 = H, xmm6 = DC word accumulators.
;   r0 is reused as the strip index and r2 is advanced; xmm0..xmm3 and xmm7
;   are clobbered.
%macro SSE41_ChromaGetX38x8Satd 0
    movdqa      xmm5,   [HSumSubDB1]
    movdqa      xmm6,   [HSumSubDW1]
    movdqa      xmm7,   [PDW1]
    ; top boundary row: 8 bytes from the row above the block
    sub         r0,    r1
    movq        xmm0,   [r0]
    punpcklqdq  xmm0,   xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
    movdqa      [r6],  xmm0 ;V
    ; left boundary column: gather the 8 bytes at column -1
    add         r0,    r1
    pinsrb      xmm0,   byte[r0-1], 0
    pinsrb      xmm0,   byte[r0+r1-1], 1
    lea         r0,    [r0+2*r1]
    pinsrb      xmm0,   byte[r0-1],     2
    pinsrb      xmm0,   byte[r0+r1-1], 3
    lea         r0,    [r0+2*r1]
    pinsrb      xmm0,   byte[r0-1],     4
    pinsrb      xmm0,   byte[r0+r1-1], 5
    lea         r0,    [r0+2*r1]
    pinsrb      xmm0,   byte[r0-1],     6
    pinsrb      xmm0,   byte[r0+r1-1], 7
    punpcklqdq  xmm0,   xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
    movdqa      [r6+16], xmm0 ;H
    ; xmm4 / xmm1 now hold the boundary sums produced by the two calls above
;(sum+2)>>2
    movdqa      xmm6,   [PDQ2]
    movdqa      xmm5,   xmm4
    punpckhqdq  xmm5,   xmm1
    paddd       xmm5,   xmm6
    psrld       xmm5,   2
;(sum1+sum2+4)>>3
    paddd       xmm6,   xmm6
    paddd       xmm4,   xmm1
    paddd       xmm4,   xmm6
    psrld       xmm4,   3
;satd *16
    pslld       xmm5,   4
    pslld       xmm4,   4
;temp satd
    movdqa      xmm6,   xmm4
    punpcklqdq  xmm4,   xmm5
    psllq       xmm4,   32      ; shift left then right by 32: keep only the low dword of each qword
    psrlq       xmm4,   32
    movdqa      [r6+32], xmm4
    punpckhqdq  xmm5,   xmm6
    psllq       xmm5,   32
    psrlq       xmm5,   32
    movdqa      [r6+48], xmm5

    pxor        xmm4,   xmm4 ;V
    pxor        xmm5,   xmm5 ;H
    pxor        xmm6,   xmm6 ;DC
    ; r0 becomes the strip index; each strip macro shifts it left by 4, so after
    ; the first strip it is still 0 and the inc below makes it 1
    mov         r0,    0
    SSE41_ChromaGetX38x4Satd r0, 0
    inc             r0
    SSE41_ChromaGetX38x4Satd r0, 0
%endmacro

; SSEReg2MMX src, mmLo, mmHi
;   Copies the low qword of src to mmLo and the high qword to mmHi.
;   src is modified: its low qword is overwritten with its high qword.
%macro SSEReg2MMX 3
    movdq2q     %2, %1
    movhlps     %1, %1
    movdq2q     %3, %1
%endmacro
; MMXReg2SSE dst, tmp, mmLo, mmHi
;   Rebuilds a 128-bit value in dst from two MMX registers: mmLo becomes the
;   low qword and mmHi the high qword. tmp is clobbered.
%macro MMXReg2SSE 4
    movq2dq     %1, %3
    movq2dq     %2, %4
    punpcklqdq  %1, %2
%endmacro
;used to reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41

; WelsIntraChroma8x8Combined3Satd_sse41
;   Evaluates the DC, H and V 8x8 chroma intra predictions over two planes
;   and reports the cheapest mode.
;   Register roles as used below (after LOAD_7_PARA):
;     r0/r1 : first plane reconstructed block pointer and stride
;     r2/r3 : first plane source block pointer and stride
;     r4    : receives the chosen mode: 0 (DC), 1 (H) or 2 (V)
;     r5    : cost term; twice its value is added to the H and V costs
;     r6    : 64-byte scratch table accessed with movdqa
;     arg8/arg9 : second plane reconstructed and source block pointers
;                 (the same strides r1/r3 are used)
;   Returns the chosen cost in retrd. The first plane's partial sums are
;   parked in MMX registers, so WELSEMMS is issued on exit.
WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
    %assign  push_num 0
    LOAD_7_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
; NOTE(review): no jump to this label is visible in the reviewed part of the file.
loop_chroma_satdx3:
%ifdef X86_32_PICASM
    ; PIC build: same steps as SSE41_ChromaGetX38x8Satd, but the constants are
    ; built on a temporarily 16-byte-aligned stack instead of being loaded from
    ; .rodata; r0 holds the original esp meanwhile and is reloaded afterwards
    mov         r0, esp
    and         esp, 0xfffffff0
    push        0xff01ff01
    push        0xff01ff01
    push        0x01010101
    push        0x01010101
    movdqa      xmm5, [esp]
    push        0xffff0001
    push        0xffff0001
    push        0xffff0001
    push        0xffff0001
    movdqa      xmm6, [esp]
    push        0x00010001
    push        0x00010001
    push        0x00010001
    push        0x00010001
    movdqa      xmm7, [esp]
    mov         esp, r0
    mov         r0, [esp + push_num*4 + 4]

    sub         r0,    r1
    movq        xmm0,  [r0]
    punpcklqdq  xmm0,  xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
    movdqa      [r6],  xmm0 ;V
    add         r0,     r1
    pinsrb      xmm0,   byte[r0-1], 0
    pinsrb      xmm0,   byte[r0+r1-1], 1
    lea         r0,    [r0+2*r1]
    pinsrb      xmm0,   byte[r0-1],     2
    pinsrb      xmm0,   byte[r0+r1-1], 3
    lea         r0,    [r0+2*r1]
    pinsrb      xmm0,   byte[r0-1],     4
    pinsrb      xmm0,   byte[r0+r1-1], 5
    lea         r0,    [r0+2*r1]
    pinsrb      xmm0,   byte[r0-1],     6
    pinsrb      xmm0,   byte[r0+r1-1], 7
    punpcklqdq  xmm0,   xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
;movdqa      [r6+16], xmm0 ;H
;(sum+2)>>2
    ; build the PDQ2 rounding constant in the scratch table, load it, then
    ; overwrite that slot with the H entry
    mov        DWORD [r6+16], 0x0002
    mov        DWORD [r6+20], 0x0000
    mov        DWORD [r6+24], 0x0002
    mov        DWORD [r6+28], 0x0000
    movdqa      xmm6,   [r6+16]
    movdqa      [r6+16], xmm0 ;H

    movdqa      xmm5,   xmm4
    punpckhqdq  xmm5,   xmm1
    paddd       xmm5,   xmm6
    psrld       xmm5,   2
;(sum1+sum2+4)>>3
    paddd       xmm6,   xmm6
    paddd       xmm4,   xmm1
    paddd       xmm4,   xmm6
    psrld       xmm4,   3
;satd *16
    pslld       xmm5,   4
    pslld       xmm4,   4
;temp satd
    movdqa      xmm6,   xmm4
    punpcklqdq  xmm4,   xmm5
    psllq       xmm4,   32
    psrlq       xmm4,   32
    movdqa      [r6+32], xmm4
    punpckhqdq  xmm5,   xmm6
    psllq       xmm5,   32
    psrlq       xmm5,   32
    movdqa      [r6+48], xmm5

    pxor        xmm4,   xmm4 ;V
    pxor        xmm5,   xmm5 ;H
    pxor        xmm6,   xmm6 ;DC
    mov         r0,    0
    SSE41_ChromaGetX38x4Satd r0, 0
    inc         r0
    SSE41_ChromaGetX38x4Satd r0, 0
%else
    SSE41_ChromaGetX38x8Satd
%endif
    ; park the first plane's V / H / DC accumulators in MMX registers
    SSEReg2MMX  xmm4, mm0,mm1
    SSEReg2MMX  xmm5, mm2,mm3
    SSEReg2MMX  xmm6, mm5,mm6
    ; switch to the second plane
    mov r0,     arg8
    mov r2,     arg9

%ifdef X86_32_PICASM
    ; PIC build: same inline sequence as for the first plane
    mov         r0, esp
    and         esp, 0xfffffff0
    push        0xff01ff01
    push        0xff01ff01
    push        0x01010101
    push        0x01010101
    movdqa      xmm5,   [esp]
    push        0xffff0001
    push        0xffff0001
    push        0xffff0001
    push        0xffff0001
    movdqa      xmm6,   [esp]
    push        0x00010001
    push        0x00010001
    push        0x00010001
    push        0x00010001
    movdqa      xmm7,   [esp]
    mov         esp,    r0
    mov r0,     arg8

    sub         r0,    r1
    movq        xmm0,   [r0]
    punpcklqdq  xmm0,   xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
    movdqa      [r6],  xmm0 ;V
    add         r0,    r1
    pinsrb      xmm0,   byte[r0-1], 0
    pinsrb      xmm0,   byte[r0+r1-1], 1
    lea         r0,    [r0+2*r1]
    pinsrb      xmm0,   byte[r0-1],     2
    pinsrb      xmm0,   byte[r0+r1-1], 3
    lea         r0,    [r0+2*r1]
    pinsrb      xmm0,   byte[r0-1],     4
    pinsrb      xmm0,   byte[r0+r1-1], 5
    lea         r0,    [r0+2*r1]
    pinsrb      xmm0,   byte[r0-1],     6
    pinsrb      xmm0,   byte[r0+r1-1], 7
    punpcklqdq  xmm0,   xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
    ;movdqa      [r6+16], xmm0 ;H
;(sum+2)>>2

    mov        DWORD [r6+16], 0x0002
    mov        DWORD [r6+20], 0x0000
    mov        DWORD [r6+24], 0x0002
    mov        DWORD [r6+28], 0x0000
    movdqa      xmm6,   [r6+16]
    movdqa      [r6+16], xmm0 ;H

    movdqa      xmm5,   xmm4
    punpckhqdq  xmm5,   xmm1
    paddd       xmm5,   xmm6
    psrld       xmm5,   2
;(sum1+sum2+4)>>3
    paddd       xmm6,   xmm6
    paddd       xmm4,   xmm1
    paddd       xmm4,   xmm6
    psrld       xmm4,   3
;satd *16
    pslld       xmm5,   4
    pslld       xmm4,   4
;temp satd
    movdqa      xmm6,   xmm4
    punpcklqdq  xmm4,   xmm5
    psllq       xmm4,   32
    psrlq       xmm4,   32
    movdqa      [r6+32], xmm4
    punpckhqdq  xmm5,   xmm6
    psllq       xmm5,   32
    psrlq       xmm5,   32
    movdqa      [r6+48], xmm5

    pxor        xmm4,   xmm4 ;V
    pxor        xmm5,   xmm5 ;H
    pxor        xmm6,   xmm6 ;DC
    mov         r0,    0
    SSE41_ChromaGetX38x4Satd r0, 0
    inc         r0
    SSE41_ChromaGetX38x4Satd r0, 0
%else
    SSE41_ChromaGetX38x8Satd
%endif

    ; bring back the first plane's accumulators and add them to the second plane's
    MMXReg2SSE  xmm0, xmm3, mm0, mm1
    MMXReg2SSE  xmm1, xmm3, mm2, mm3
    MMXReg2SSE  xmm2, xmm3, mm5, mm6

    paddw       xmm4, xmm0
    paddw       xmm5, xmm1
    paddw       xmm6, xmm2

    MMX_DW_1_2REG    xmm0, xmm1 ; xmm0 = word 1s for the horizontal sums
    psrlw       xmm4, 1 ;/2
    psrlw       xmm5, 1 ;/2
    psrlw       xmm6, 1 ;/2
    SSE41_HSum8W     xmm4, xmm0, xmm1
    SSE41_HSum8W     xmm5, xmm0, xmm1
    SSE41_HSum8W     xmm6, xmm0, xmm1
    ; comparing order: DC H V
    movd      r3d, xmm6 ;DC
    movd      r1d, xmm5 ;H
    movd      r0d, xmm4 ;V


    ; add twice the r5 cost term to the H and V costs; DC is left as is
    shl       r5d, 1
    add       r1d, r5d
    add       r0d, r5d
    cmp       r3d, r1d
    jge near   not_dc_8x8
    cmp        r3d, r0d
    jge near   not_dc_h_8x8

    ; for DC mode
    mov       dword[r4], 0;I8_PRED_DC
    mov       retrd, r3d
    jmp near return_satd_intra_8x8_x3
not_dc_8x8:
    ; for H mode
    cmp       r1d, r0d
    jge near   not_dc_h_8x8
    mov       dword[r4], 1;I8_PRED_H
    mov       retrd, r1d
    jmp near return_satd_intra_8x8_x3
not_dc_h_8x8:
    ; for V mode
    mov       dword[r4], 2;I8_PRED_V
    mov       retrd, r0d
return_satd_intra_8x8_x3:
    WELSEMMS
    POP_XMM
    LOAD_7_PARA_POP
ret


;***********************************************************************
;
;Pixel_satd_intra_sse2 END
;
;***********************************************************************
; SSSE3_Get16BSadHVDC  %1, %2
;   %1 = 16-byte memory row of the prediction buffer (read, then overwritten)
;   %2 = 16-byte memory row that the candidates are compared against
;   Both operands are used with movdqa / memory-operand psadbw, so they have
;   to be 16-byte aligned.
;
;   Fixed registers (as prepared by WelsIntra16x16Combined3Sad_ssse3):
;     xmm1 = pshufb control; it is all-zero there, so byte 0 is replicated
;     xmm5 = row used for the V candidate   -> SAD accumulated in xmm2
;     xmm7 = row used for the DC candidate  -> SAD accumulated in xmm4
;     expanded %1 row (H candidate)         -> SAD accumulated in xmm3
;   Clobbers xmm0 and xmm6.
%macro SSSE3_Get16BSadHVDC 2
    movd        xmm6,%1             ; low dword of %1
    pshufb      xmm6,xmm1           ; spread it across the register per xmm1
    movdqa      %1,  xmm6           ; %1 now holds the full 16-byte row
    movdqa      xmm0,%2
    psadbw      xmm0,xmm7
    paddw       xmm4,xmm0           ; xmm4 += SAD(%2, xmm7)
    movdqa      xmm0,%2
    psadbw      xmm0,xmm5
    paddw       xmm2,xmm0           ; xmm2 += SAD(%2, xmm5)
    psadbw      xmm6,%2
    paddw       xmm3,xmm6           ; xmm3 += SAD(%2, expanded %1 row)
%endmacro
; WelsAddDCValue  %1, %2, %3, %4
;   %1 = memory operand naming a single byte
;   %2 = scratch register that receives the zero-extended byte
;   %3 = memory operand that receives a copy of %2
;   %4 = accumulator register; the byte value is added to it
%macro WelsAddDCValue 4
    movzx   %2, byte %1         ; fetch the byte, zero-extended
    add     %4, %2              ; accumulate it
    mov     %3, %2              ; and keep a copy in memory
%endmacro

;***********************************************************************
;
;Pixel_sad_intra_ssse3 BEGIN
;
;***********************************************************************
; WelsIntra16x16Combined3Sad_ssse3 (seven arguments, loaded by LOAD_7_PARA)
;   Register use after the prologue:
;     r0 / r1 : block whose top row and left column supply the predictors, and its stride
;     r2 / r3 : 16x16 block the candidates are compared against, and its stride
;     r4      : pointer to the dword that receives the chosen mode (0 = V, 1 = H, 2 = DC)
;     r5      : cost bias; twice its value is added to the DC and H SADs before comparing
;     r6      : 256-byte prediction buffer, written with 16-byte aligned stores
;   retrd returns the cost of the chosen mode. On return the buffer holds the
;   chosen prediction (the H rows are produced as a by-product of the SAD pass).
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
    %assign  push_num 0
    LOAD_7_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d

    ; r3, r4 and r5 serve as scratch in the DC pass; park the argument values
    push        r5
    push        r4
    push        r3

    ; ---- top row: kept in xmm5 (V predictor); its byte sum seeds the DC total in r5d
    sub         r0, r1
    movdqa      xmm5, [r0]
    add         r0, r1
    pxor        xmm0, xmm0
    psadbw      xmm0, xmm5
    movhlps     xmm1, xmm0
    paddw       xmm0, xmm1
    movd        r5d, xmm0

    ; ---- left column: byte k at [r0-1 + k*r1] is added to r5d and stored at [r6 + 16*k]
    lea         r3, [r1+2*r1]                       ; 3 * stride
    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
%rep 3
    lea         r0, [r0+4*r1]
    add         r6, 64
    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
%endrep
    sub         r6, 192                             ; back to the first buffer row

    ; ---- DC = (top + left + 16) >> 5, replicated into every byte of xmm7
    add         r5d, 10h
    shr         r5d, 5
    pxor        xmm1, xmm1                          ; all-zero pshufb control = replicate byte 0
    movd        xmm7, r5d
    pshufb      xmm7, xmm1
    pxor        xmm2, xmm2                          ; V accumulator
    pxor        xmm3, xmm3                          ; H accumulator
    pxor        xmm4, xmm4                          ; DC accumulator

    ; ---- SAD pass over the 16 rows, four at a time
    pop         r3                                  ; second stride argument again
    lea         r4, [r3+2*r3]                       ; 3 * stride
    SSSE3_Get16BSadHVDC [r6], [r2]
    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
%rep 3
    add         r6, 64
    lea         r2, [r2+4*r3]
    SSSE3_Get16BSadHVDC [r6], [r2]
    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
%endrep

    pop         r4                                  ; mode output pointer
    pop         r5                                  ; cost bias

    ; ---- fold the two psadbw halves: xmm3 dword0 = V, dword1 = H; xmm4 dword0 = DC
    pslldq      xmm3, 4
    por         xmm3, xmm2
    movhlps     xmm1, xmm3
    paddw       xmm3, xmm1
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    movd        r1d, xmm4                           ; DC cost
    movd        r0d, xmm3                           ; V cost
    psrldq      xmm3, 4
    movd        r2d, xmm3                           ; H cost

    ; ---- bias H and DC by 2*r5, then pick the smallest (tested in the order DC, H, V)
    shl         r5d, 1
    add         r2d, r5d
    add         r1d, r5d
    cmp         r1d, r2d
    jge near    .not_dc
    cmp         r1d, r0d
    jge near    .not_dc_h

    ; DC wins: report mode 2 and fill the buffer with the DC row
    mov         dword[r4], 2
    mov         retrd, r1d
    sub         r6, 192
%assign x 0
%rep 16
    movdqa      [r6+16*x], xmm7
%assign x x+1
%endrep
    jmp near    .done

.not_dc:
    cmp         r2d, r0d
    jge near    .not_dc_h
    ; H wins: the buffer already holds the expanded left column
    mov         dword[r4], 1
    mov         retrd, r2d
    jmp near    .done

.not_dc_h:
    ; V wins: report mode 0 and fill the buffer with the top row
    mov         dword[r4], 0
    mov         retrd, r0d
    sub         r6, 192
%assign x 0
%rep 16
    movdqa      [r6+16*x], xmm5
%assign x x+1
%endrep

.done:
    POP_XMM
    LOAD_7_PARA_POP
    ret

;***********************************************************************
;
;Pixel_sad_intra_ssse3 END
;
;***********************************************************************
;***********************************************************************
;
;Pixel_satd_wxh_sse41 BEGIN
;
;***********************************************************************

;SSE4.1
; SSE41_GetSatd8x4 -- no macro arguments; works on fixed registers.
;   Reads   : r0 / r2 = the two pixel pointers, r1 / r3 = their strides,
;             r4 / r5 = 3*r1 / 3*r3 (the callers compute these with lea),
;             xmm7    = pmaddubsw multiplier constant.
;   Updates : xmm6 += eight 16-bit partial results for this 8-wide, 4-row strip.
;   Clobbers: xmm0-xmm5. r0 and r2 are NOT advanced; callers step them by 4 rows.
%macro SSE41_GetSatd8x4 0
    ; rows 0 and 1: duplicate the 8 pixels into both halves, multiply-add with xmm7
    movq             xmm0, [r0]
    punpcklqdq       xmm0, xmm0
    pmaddubsw        xmm0, xmm7
    movq             xmm1, [r0+r1]
    punpcklqdq       xmm1, xmm1
    pmaddubsw        xmm1, xmm7
    movq             xmm2, [r2]
    punpcklqdq       xmm2, xmm2
    pmaddubsw        xmm2, xmm7
    movq             xmm3, [r2+r3]
    punpcklqdq       xmm3, xmm3
    pmaddubsw        xmm3, xmm7
    psubsw           xmm0, xmm2             ; row 0: first block minus second block
    psubsw           xmm1, xmm3             ; row 1
    ; rows 2 and 3, same treatment
    movq             xmm2, [r0+2*r1]
    punpcklqdq       xmm2, xmm2
    pmaddubsw        xmm2, xmm7
    movq             xmm3, [r0+r4]
    punpcklqdq       xmm3, xmm3
    pmaddubsw        xmm3, xmm7
    movq             xmm4, [r2+2*r3]
    punpcklqdq       xmm4, xmm4
    pmaddubsw        xmm4, xmm7
    movq             xmm5, [r2+r5]
    punpcklqdq       xmm5, xmm5
    pmaddubsw        xmm5, xmm7
    psubsw           xmm2, xmm4             ; row 2
    psubsw           xmm3, xmm5             ; row 3
    ; NOTE(review): SSE2_HDMTwo4x4 is defined outside this view; presumably the
    ; butterfly across the four row registers with xmm4 as scratch -- confirm.
    SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
    pabsw            xmm0, xmm0
    pabsw            xmm2, xmm2
    pabsw            xmm1, xmm1
    pabsw            xmm3, xmm3
    ; Pair up the even/odd words of (xmm1, xmm3) and keep the per-word maximum;
    ; same blend / shift / or / max shape as the final stage of AVX2_SatdFour4x4.
    movdqa           xmm4, xmm3
    pblendw          xmm3, xmm1, 0xAA
    pslld            xmm1, 16
    psrld            xmm4, 16
    por              xmm1, xmm4
    pmaxuw           xmm1, xmm3
    paddw            xmm6, xmm1
    ; ... and again for (xmm0, xmm2)
    movdqa           xmm4, xmm0
    pblendw          xmm0, xmm2, 0xAA
    pslld            xmm2, 16
    psrld            xmm4, 16
    por              xmm2, xmm4
    pmaxuw           xmm0, xmm2
    paddw            xmm6, xmm0
%endmacro

; SSSE3_SumWHorizon  %1, %2, %3, %4
;   %1 = 32-bit destination register for the total
;   %2 = xmm register holding eight 16-bit values to be summed (destroyed)
;   %3, %4 = xmm scratch registers (destroyed)
; NOTE(review): MMX_DW_1_2REG is defined outside this view; from its use as the
; pmaddwd multiplier it presumably fills %3 with words equal to 1 -- confirm.
%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
    MMX_DW_1_2REG    %3, %4
    pmaddwd     %2, %3              ; adjacent word pairs -> four dwords
    movhlps     %4, %2
    paddd       %2, %4              ; add the upper 64 bits onto the lower 64 bits
    pshuflw     %4, %2,0Eh          ; bring dword 1 down to dword 0
    paddd       %2, %4              ; dword 0 now holds the total
    movd        %1, %2
%endmacro
;***********************************************************************
;
;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;  After the prologue r0 / r1 are used as the first pointer and its stride,
;  r2 / r3 as the second pointer and its stride. The result is returned in retrd.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse41
    %assign  push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
%ifdef X86_32_PICASM
    ; Build the 16-byte multiplier on the stack instead of referencing a data
    ; symbol. In memory, low to high: 0x01010101, 0xff01ff01, 0x01010101, 0xff01ff01.
    push        0xff01ff01
    push        0x01010101
    push        0xff01ff01
    push        0x01010101
    movdqu      xmm4,   [esp]
    add         esp, 16
%else
    movdqa      xmm4,[HSwapSumSubDB1]
%endif
    ; Each movd/shufps pair packs two 4-pixel rows as dwords {a, a, b, b}.
    movd        xmm2,[r2]
    movd        xmm5,[r2+r3]
    shufps      xmm2,xmm5,0         ; second block, rows 0 and 1
    movd        xmm3,[r2+r3*2]
    lea         r2, [r3*2+r2]
    movd        xmm5,[r2+r3]
    shufps      xmm3,xmm5,0         ; second block, rows 2 and 3
    movd        xmm0,[r0]
    movd        xmm5,[r0+r1]
    shufps      xmm0,xmm5,0         ; first block, rows 0 and 1
    movd        xmm1,[r0+r1*2]
    lea         r0, [r1*2+r0]
    movd        xmm5,[r0+r1]
    shufps      xmm1,xmm5,0         ; first block, rows 2 and 3
    pmaddubsw   xmm0,xmm4
    pmaddubsw   xmm1,xmm4
    pmaddubsw   xmm2,xmm4
    pmaddubsw   xmm3,xmm4
    psubw       xmm0,xmm2           ; first block minus second block
    psubw       xmm1,xmm3
    ; butterfly between the two row-pair registers
    movdqa      xmm2,xmm0
    paddw       xmm0,xmm1
    psubw       xmm1,xmm2
    ; regroup by 64-bit halves and butterfly again
    movdqa      xmm2,xmm0
    punpcklqdq  xmm0,xmm1
    punpckhqdq  xmm2,xmm1
    movdqa      xmm1,xmm0
    paddw       xmm0,xmm2
    psubw       xmm2,xmm1
    ; pair even/odd words of the two registers, take absolute values and keep
    ; the per-word maximum (same shape as the last stage of AVX2_SatdFour4x4)
    movdqa      xmm1,xmm0
    pblendw     xmm0,xmm2,0AAh
    pslld       xmm2,16
    psrld       xmm1,16
    por         xmm2,xmm1
    pabsw       xmm0,xmm0
    pabsw       xmm2,xmm2
    pmaxsw      xmm0,xmm2
    SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;  Two SSE41_GetSatd8x4 strips (rows 0-3 and 4-7) accumulated in xmm6,
;  then reduced to a single value in retrd.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_sse41
%ifdef X86_32
    ; r4 / r5 are used for the 3*stride values below, so they are saved on 32-bit builds
    push  r4
    push  r5
%endif
    %assign  push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

%ifdef X86_32_PICASM
    ; Multiplier built on the stack. In memory, low to high:
    ; 0x01010101, 0x01010101, 0xff01ff01, 0xff01ff01.
    push        0xff01ff01
    push        0xff01ff01
    push        0x01010101
    push        0x01010101
    movdqu      xmm7,   [esp]
    add         esp, 16
%else
    movdqa      xmm7, [HSumSubDB1]
%endif
    lea         r4,  [r1+r1*2]      ; 3 * first stride, expected by SSE41_GetSatd8x4
    lea         r5,  [r3+r3*2]      ; 3 * second stride
    pxor        xmm6, xmm6          ; accumulator
    SSE41_GetSatd8x4
    lea         r0,  [r0+4*r1]
    lea         r2,  [r2+4*r3]
    SSE41_GetSatd8x4
    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop  r5
    pop  r4
%endif
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;  Sixteen rows handled as four SSE41_GetSatd8x4 strips; r6 counts the
;  strips, xmm6 accumulates, the total is returned in retrd.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_sse41
%ifdef X86_32
    push        r4
    push        r5
    push        r6
%endif
    %assign  push_num 3
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    lea         r4, [r1+r1*2]               ; 3 * first stride
    lea         r5, [r3+r3*2]               ; 3 * second stride
    pxor        xmm6, xmm6                  ; accumulator
%ifdef X86_32_PICASM
    ; stack-built copy of the multiplier (no data-symbol reference)
    push        0xff01ff01
    push        0xff01ff01
    push        0x01010101
    push        0x01010101
    movdqu      xmm7, [esp]
    add         esp, 16
%else
    movdqa      xmm7, [HSumSubDB1]
%endif
    xor         r6, r6                      ; strips done so far
.next_strip:
    SSE41_GetSatd8x4
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
    inc         r6
    cmp         r6, 4
    jl          .next_strip

    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop         r6
    pop         r5
    pop         r4
%endif
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;  The 16x8 block is processed as two 8x8 halves: columns 0-7 first, then
;  the saved pointers are restored and moved 8 bytes to the right.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse41
%ifdef X86_32
    push  r4
    push  r5
%endif
    %assign  push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    ; keep the block origins for the right-hand half
    push  r0
    push  r2

%ifdef X86_32_PICASM
    ; stack-built multiplier; the temporary 16 bytes are released again
    ; before the saved r0 / r2 are popped
    push        0xff01ff01
    push        0xff01ff01
    push        0x01010101
    push        0x01010101
    movdqu      xmm7,   [esp]
    add         esp, 16
%else
    movdqa      xmm7, [HSumSubDB1]
%endif
    lea         r4,  [r1+r1*2]      ; 3 * first stride
    lea         r5,  [r3+r3*2]      ; 3 * second stride
    pxor        xmm6,   xmm6        ; accumulator
    ; left half, rows 0-3 then 4-7
    SSE41_GetSatd8x4
    lea         r0,  [r0+4*r1]
    lea         r2,  [r2+4*r3]
    SSE41_GetSatd8x4

    ; right half: back to row 0, eight pixels further right
    pop  r2
    pop  r0
    add         r0,    8
    add         r2,    8
    SSE41_GetSatd8x4
    lea         r0,  [r0+4*r1]
    lea         r2,  [r2+4*r3]
    SSE41_GetSatd8x4
    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop  r5
    pop  r4
%endif
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;  The block is processed as two 8x16 halves of four SSE41_GetSatd8x4
;  strips each. r6 counts strips, xmm6 accumulates across both halves.
;
;***********************************************************************

WELS_EXTERN WelsSampleSatd16x16_sse41
%ifdef X86_32
    push        r4
    push        r5
    push        r6
%endif
    %assign  push_num 3
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; remember the block origins for the right-hand half
    push        r0
    push        r2

    lea         r4, [r1+r1*2]               ; 3 * first stride
    lea         r5, [r3+r3*2]               ; 3 * second stride
    pxor        xmm6, xmm6                  ; accumulator
%ifdef X86_32_PICASM
    ; stack-built copy of the multiplier; the 16 bytes are released right away
    push        0xff01ff01
    push        0xff01ff01
    push        0x01010101
    push        0x01010101
    movdqu      xmm7, [esp]
    add         esp, 16
%else
    movdqa      xmm7, [HSumSubDB1]
%endif

    ; ---- columns 0-7
    xor         r6, r6
.left_half:
    SSE41_GetSatd8x4
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
    inc         r6
    cmp         r6, 4
    jl          .left_half

    ; ---- columns 8-15: restart from row 0, eight pixels to the right
    pop         r2
    pop         r0
    add         r0, 8
    add         r2, 8
    xor         r6, r6
.right_half:
    SSE41_GetSatd8x4
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
    inc         r6
    cmp         r6, 4
    jl          .right_half

    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop         r6
    pop         r5
    pop         r4
%endif
    ret

;***********************************************************************
;
;Pixel_satd_wxh_sse41 END
;
;***********************************************************************

;***********************************************************************
;
;Pixel_satd_wxh_avx2 BEGIN
;
;***********************************************************************

%ifdef HAVE_AVX2
; out=%1 pSrcA=%2 pSrcB=%3 HSumSubDB1_256=%4 ymm_clobber=%5
; Loads one 16-pixel row from each source, replicates it into both 128-bit
; lanes, applies the per-lane byte multiplier %4 and leaves srcA - srcB in %1.
; %2 and %3 are address expressions (the macro adds the brackets itself).
%macro AVX2_LoadDiffSatd16x1 5
    vbroadcasti128   %1, [%2]
    vpmaddubsw       %1, %1, %4             ; hadamard neighboring horizontal sums and differences
    vbroadcasti128   %5, [%3]
    vpmaddubsw       %5, %5, %4             ; hadamard neighboring horizontal sums and differences
    vpsubw           %1, %1, %5             ; diff srcA srcB
%endmacro

; out=%1 pSrcA=%2 pSrcA+4*iStride=%3 pSrcB=%4 pSrcB+4*iStride=%5 HSumSubDB1_128x2=%6 ymm_clobber=%7,%8
; Handles two 8-pixel rows at once: the row at %2 / %4 ends up in the low
; 128-bit lane, the row at %3 / %5 in the high lane (blend mask 11110000b
; takes dwords 4-7 from the second operand). Each 8-pixel row is replicated
; within its lane by vpbroadcastq before the multiply-add with %6.
; %2-%5 are address expressions (the macro adds the brackets itself).
%macro AVX2_LoadDiffSatd8x2 8
    vpbroadcastq     %1, [%2]
    vpbroadcastq     %7, [%3]
    vpblendd         %1, %1, %7, 11110000b
    vpmaddubsw       %1, %1, %6             ; hadamard neighboring horizontal sums and differences
    vpbroadcastq     %7, [%4]
    vpbroadcastq     %8, [%5]
    vpblendd         %7, %7, %8, 11110000b
    vpmaddubsw       %7, %7, %6             ; hadamard neighboring horizontal sums and differences
    vpsubw           %1, %1, %7             ; diff srcA srcB
%endmacro

; in/out=%1,%2,%3,%4 clobber=%5
; 4-point butterfly applied element-wise across the four registers
; (x0..x3 = %1..%4 on entry). On exit: %1 = y0, %2 = y1, %3 = y2, %4 = y3
; as defined by the per-line comments below.
%macro AVX2_HDMFour4x4 5
    vpsubw           %5, %1, %4             ; s3 = x0 - x3
    vpaddw           %1, %1, %4             ; s0 = x0 + x3
    vpsubw           %4, %2, %3             ; s2 = x1 - x2
    vpaddw           %2, %2, %3             ; s1 = x1 + x2
    vpsubw           %3, %1, %2             ; y2 = s0 - s1
    vpaddw           %1, %1, %2             ; y0 = s0 + s1
    vpaddw           %2, %5, %4             ; y1 = s3 + s2
    vpsubw           %4, %5, %4             ; y3 = s3 - s2
%endmacro

; out=%1 in=%1,%2,%3,%4 clobber=%5
; Runs AVX2_HDMFour4x4 on the four inputs, takes absolute values and folds
; them into one register of 16-bit partial sums in %1. %2, %3 and %4 are
; destroyed as well as %5.
%macro AVX2_SatdFour4x4 5
    AVX2_HDMFour4x4  %1, %2, %3, %4, %5
    vpabsw           %1, %1
    vpabsw           %2, %2
    vpabsw           %3, %3
    vpabsw           %4, %4
    ; second stage of horizontal hadamard.
    ; utilizes that |a + b| + |a - b| = 2 * max(|a|, |b|)
    vpblendw         %5, %1, %2, 10101010b  ; even words from %1, odd words from %2
    vpslld           %2, %2, 16             ; even words of %2 moved to odd positions
    vpsrld           %1, %1, 16             ; odd words of %1 moved to even positions
    vpor             %2, %2, %1             ; the complementary pairing
    vpmaxuw          %2, %2, %5             ; per-word max of the two pairings
    vpblendw         %5, %3, %4, 10101010b  ; same again for %3 / %4
    vpslld           %4, %4, 16
    vpsrld           %3, %3, 16
    vpor             %4, %4, %3
    vpmaxuw          %3, %5, %4
    vpaddw           %1, %2, %3             ; combine both halves into the output
%endmacro

; out=%1 pSrcA=%2 iStrideA=%3 3*iStrideA=%4 pSrcB=%5 iStrideB=%6 3*iStrideB=%7 HSumSubDB1_256=%8 ymm_clobber=%9,%10,%11,%12
; Loads rows 0, 1, 2 and 3 (row 3 addressed through the 3*stride operands)
; of a 16-pixel-wide strip from both sources and reduces them with
; AVX2_SatdFour4x4. The pointer operands %2 / %5 are not modified.
%macro AVX2_GetSatd16x4 12
    AVX2_LoadDiffSatd16x1  %1, %2 + 0 * %3, %5 + 0 * %6, %8, %12
    AVX2_LoadDiffSatd16x1  %9, %2 + 1 * %3, %5 + 1 * %6, %8, %12
    AVX2_LoadDiffSatd16x1 %10, %2 + 2 * %3, %5 + 2 * %6, %8, %12
    AVX2_LoadDiffSatd16x1 %11, %2 + 1 * %4, %5 + 1 * %7, %8, %12
    AVX2_SatdFour4x4 %1, %9, %10, %11, %12
%endmacro

; out=%1 pSrcA=%2 iStrideA=%3 3*iStrideA=%4 pSrcB=%5 iStrideB=%6 3*iStrideB=%7 HSumSubDB1_128x2=%8 ymm_clobber=%9,%10,%11,%12,%13
; Each AVX2_LoadDiffSatd8x2 call fetches rows k and k+4 (low / high lane).
; Row pairs are loaded in the order (0,4), (2,6), then -- after stepping both
; pointers down one row -- (1,5) and (3,7).
; Side effect: %2 and %5 are left advanced by ONE stride on exit.
%macro AVX2_GetSatd8x8 13
    AVX2_LoadDiffSatd8x2  %1, %2 + 0 * %3, %2 + 4 * %3, %5 + 0 * %6, %5 + 4 * %6, %8, %12, %13
    AVX2_LoadDiffSatd8x2 %10, %2 + 2 * %3, %2 + 2 * %4, %5 + 2 * %6, %5 + 2 * %7, %8, %12, %13
    add              %2, %3
    add              %5, %6
    AVX2_LoadDiffSatd8x2  %9, %2 + 0 * %3, %2 + 4 * %3, %5 + 0 * %6, %5 + 4 * %6, %8, %12, %13
    AVX2_LoadDiffSatd8x2 %11, %2 + 2 * %3, %2 + 2 * %4, %5 + 2 * %6, %5 + 2 * %7, %8, %12, %13
    AVX2_SatdFour4x4 %1, %9, %10, %11, %12
%endmacro

; d_out=%1 mm_in=%2 mm_clobber=%3
; %2 and %3 are passed WITHOUT the leading x/y (e.g. "mm6"), so the macro can
; name both the ymm and the xmm view of the same register via y%2 / x%2.
; Sums the sixteen 16-bit values of y%2 into the 32-bit register %1.
; NOTE(review): WELS_DW1_VEX is defined outside this view; from its use as the
; vpmaddwd multiplier it presumably fills y%3 with words equal to 1 -- confirm.
%macro AVX2_SumWHorizon 3
    WELS_DW1_VEX     y%3
    vpmaddwd         y%2, y%2, y%3          ; adjacent word pairs -> eight dwords
    vextracti128     x%3, y%2, 1
    vpaddd           x%2, x%2, x%3          ; fold the high 128-bit lane onto the low one
    vpunpckhqdq      x%3, x%2, x%2
    vpaddd           x%2, x%2, x%3          ; fold the high 64 bits onto the low 64 bits
    vpsrldq          x%3, x%2, 4
    vpaddd           x%2, x%2, x%3          ; fold dword 1 onto dword 0
    vmovd            %1, x%2
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSatd8x16_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;  Entry stub: sets the 8x8-block count to 2 and continues in the shared
;  body WelsSampleSatd8x8N_avx2.
;
;***********************************************************************

WELS_EXTERN WelsSampleSatd8x16_avx2
    %assign push_num 0
%ifdef X86_32
    ; the shared body pops r4 on exit, so the stack layout here has to match
    ; the one produced by WelsSampleSatd8x8_avx2
    push r4
    %assign push_num 1
%endif
    mov r4, 2                      ; loop cnt
    jmp WelsSampleSatd8x8N_avx2

;***********************************************************************
;
;int32_t WelsSampleSatd8x8_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;  Sets the 8x8-block count to 1 and falls through into the shared body
;  WelsSampleSatd8x8N_avx2, which is also entered by WelsSampleSatd8x16_avx2.
;
;***********************************************************************

WELS_EXTERN WelsSampleSatd8x8_avx2
    %assign push_num 0
%ifdef X86_32
    push           r4
    %assign push_num 1
%endif
    mov            r4, 1           ; loop cnt
                                   ; fall through
; Shared body. Expects r4 = number of vertically stacked 8x8 blocks and, on
; 32-bit builds, r4 already pushed by the entry point (push_num == 1).
WelsSampleSatd8x8N_avx2:
%ifdef X86_32
    push           r5
    push           r6
    %assign push_num push_num+2
%endif
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

%ifdef X86_32_PICASM
    ; Build the 16-byte multiplier on a temporarily 16-byte-aligned stack.
    ; r1 holds the original esp meanwhile and is reloaded from its argument
    ; slot afterwards.
    mov         r1, esp
    and         esp, 0xfffffff0
    push        0xff01ff01
    push        0xff01ff01
    push        0x01010101
    push        0x01010101
    vbroadcasti128 ymm7, [esp]
    mov            esp, r1
    mov            r1, [esp + push_num*4 + 8]
%else
    vbroadcasti128 ymm7, [HSumSubDB1]
%endif
    lea            r5, [3 * r1]    ; 3 * first stride
    lea            r6, [3 * r3]    ; 3 * second stride
    vpxor          ymm6, ymm6, ymm6 ; accumulator
.loop:
    AVX2_GetSatd8x8 ymm0, r0, r1, r5, r2, r3, r6, ymm7, ymm1, ymm2, ymm3, ymm4, ymm5
    vpaddw         ymm6, ymm6, ymm0
    sub            r4, 1
    jbe            .loop_end       ; leave once the count reaches zero
    ; AVX2_GetSatd8x8 already advanced r0 / r2 by one row; 3 + 4 more rows
    ; bring them to the next 8x8 block
    add            r0, r5
    add            r2, r6
    lea            r0, [r0 + 4 * r1]
    lea            r2, [r2 + 4 * r3]
    jmp            .loop
.loop_end:
    AVX2_SumWHorizon retrd, mm6, mm5
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop            r6
    pop            r5
    pop            r4
%endif
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x16_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;  Entry stub: sets the 16x4-strip count to 4 and continues in the shared
;  body WelsSampleSatd16x4N_avx2.
;
;***********************************************************************

WELS_EXTERN WelsSampleSatd16x16_avx2
    %assign push_num 0
%ifdef X86_32
    ; the shared body pops r4 on exit, so the stack layout here has to match
    ; the one produced by WelsSampleSatd16x8_avx2
    push r4
    %assign push_num 1
%endif
    mov r4, 4                      ; loop cnt
    jmp WelsSampleSatd16x4N_avx2

;***********************************************************************
;
;int32_t WelsSampleSatd16x8_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;  Sets the 16x4-strip count to 2 and falls through into the shared body
;  WelsSampleSatd16x4N_avx2, which is also entered by WelsSampleSatd16x16_avx2.
;
;***********************************************************************

WELS_EXTERN WelsSampleSatd16x8_avx2
    %assign push_num 0
%ifdef X86_32
    push r4
    %assign push_num 1
%endif
    mov r4, 2                      ; loop cnt
                                   ; fall through
; Shared body. Expects r4 = number of 16x4 strips and, on 32-bit builds,
; r4 already pushed by the entry point (push_num == 1).
WelsSampleSatd16x4N_avx2:
%ifdef X86_32
    push r5
    push r6
    %assign push_num push_num+2
%endif
    LOAD_4_PARA
    PUSH_XMM 7
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

%ifdef X86_32_PICASM
    ; Build the multiplier bytes on a temporarily aligned stack. r0 holds the
    ; original esp meanwhile and is reloaded from its argument slot afterwards.
    mov         r0, esp
    and         esp, 0xfffffff0
    push        0xff01ff01
    push        0xff01ff01
    push        0x01010101
    push        0x01010101
    vpbroadcastq xmm0, [esp]
    vpbroadcastq ymm6, [esp + 8]
    mov            esp, r0
    mov            r0, [esp + push_num*4 + 4]
%else
    vpbroadcastq xmm0, [HSumSubDB1]
    vpbroadcastq ymm6, [HSumSubDB1 + 8]
%endif
    ; ymm6: low lane = first 8 constant bytes repeated, high lane = last 8
    ; constant bytes repeated
    vpblendd     ymm6, ymm0, ymm6, 11110000b
    lea          r5, [3 * r1]      ; 3 * first stride
    lea          r6, [3 * r3]      ; 3 * second stride
    vpxor        ymm5, ymm5, ymm5  ; accumulator
.loop:
    AVX2_GetSatd16x4 ymm0, r0, r1, r5, r2, r3, r6, ymm6, ymm1, ymm2, ymm3, ymm4
    vpaddw       ymm5, ymm5, ymm0
    lea          r0, [r0 + 4 * r1]
    lea          r2, [r2 + 4 * r3]
    sub          r4, 1
    ja           .loop             ; repeat while strips remain
    AVX2_SumWHorizon retrd, mm5, mm0
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop r6
    pop r5
    pop r4
%endif
    ret

%endif

;***********************************************************************
;
;Pixel_satd_wxh_avx2 END
;
;***********************************************************************

;***********************************************************************
;
;Pixel_sad_wxh_sse2 BEGIN
;
;***********************************************************************

; SSE2_GetSad2x16 -- no macro arguments; works on fixed registers.
;   First advances r0 and r2 by two rows (2*r1 / 2*r3), then adds the SAD of
;   the next two 16-byte rows to xmm0. The r2 rows are read with movdqu, the
;   r0 rows with MOVDQ. Clobbers xmm1 and xmm2.
%macro SSE2_GetSad2x16 0
    lea    r0,    [r0+2*r1]
    lea    r2,    [r2+2*r3]
    movdqu xmm1,   [r2]
    MOVDQ  xmm2,   [r0];[eax] must aligned 16
    psadbw xmm1,   xmm2
    paddw  xmm0,   xmm1
    movdqu xmm1,   [r2+r3]
    MOVDQ  xmm2,   [r0+r1]
    psadbw xmm1,   xmm2
    paddw  xmm0,   xmm1
%endmacro


; SSE2_GetSad4x16 -- no macro arguments; works on fixed registers.
;   Adds the SAD of four 16-byte rows to xmm7.
;   Reads r0 / r1 / r4 (pointer, stride, 3*stride; loaded with MOVDQ) and
;   r2 / r3 / r5 (pointer, stride, 3*stride; loaded with movdqu).
;   Clobbers xmm0-xmm2. The pointers are NOT advanced.
%macro SSE2_GetSad4x16 0
    movdqu xmm0,   [r2]
    MOVDQ  xmm2,   [r0]
    psadbw xmm0,   xmm2
    paddw  xmm7,   xmm0
    movdqu xmm1,   [r2+r3]
    MOVDQ  xmm2,   [r0+r1]
    psadbw xmm1,   xmm2
    paddw  xmm7,   xmm1
    movdqu xmm1,   [r2+2*r3]
    MOVDQ  xmm2,   [r0+2*r1];[eax] must aligned 16
    psadbw xmm1,   xmm2
    paddw  xmm7,   xmm1
    movdqu xmm1,   [r2+r5]
    MOVDQ  xmm2,   [r0+r4]
    psadbw xmm1,   xmm2
    paddw  xmm7,   xmm1
%endmacro


; SSE2_GetSad8x4 -- no macro arguments; works on fixed registers.
;   Adds the SAD of four 8-byte rows to xmm6. Two rows are packed per
;   register (movq = low half, movhps = high half): rows 0/2 and rows 1/3.
;   Side effect: r0 and r2 are left advanced by TWO rows, so callers add
;   another two rows to reach the next group of four.
;   Clobbers xmm0-xmm3.
%macro SSE2_GetSad8x4 0
    movq   xmm0,   [r0]
    movq   xmm1,   [r0+r1]
    lea    r0,     [r0+2*r1]
    movhps xmm0,   [r0]
    movhps xmm1,   [r0+r1]

    movq   xmm2,   [r2]
    movq   xmm3,   [r2+r3]
    lea    r2,     [r2+2*r3]
    movhps xmm2,   [r2]
    movhps xmm3,   [r2+r3]
    psadbw xmm0,   xmm2
    psadbw xmm1,   xmm3
    paddw  xmm6,   xmm0
    paddw  xmm6,   xmm1
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
;  Rows of the first block are fetched with MOVDQ (the first pointer can be
;  16-byte aligned); rows of the second block are fetched with movdqu
;  because, in wels, the third parameter cannot be relied on to be aligned.
;
;***********************************************************************
WELS_EXTERN WelsSampleSad16x16_sse2
%ifdef X86_32
    push        r4
    push        r5
%endif

    %assign  push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    pxor        xmm7, xmm7          ; SAD accumulator used by SSE2_GetSad4x16
    lea         r4, [3*r1]          ; 3 * first stride
    lea         r5, [3*r3]          ; 3 * second stride

    ; rows 0-3, then three more groups of four rows
    SSE2_GetSad4x16
%rep 3
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
    SSE2_GetSad4x16
%endrep

    ; add the two 64-bit psadbw halves and return the total
    movhlps     xmm0, xmm7
    paddw       xmm0, xmm7
    movd        retrd, xmm0
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop         r5
    pop         r4
%endif
    ret

;***********************************************************************
;
;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
;  Rows of the first block are fetched with MOVDQ (the first pointer can be
;  16-byte aligned); rows of the second block are fetched with movdqu
;  because, in wels, the third parameter cannot be relied on to be aligned.
;
;***********************************************************************
WELS_EXTERN WelsSampleSad16x8_sse2
    %assign  push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; rows 0 and 1 start the accumulator in xmm0
    movdqu      xmm0, [r2]
    MOVDQ       xmm2, [r0]
    psadbw      xmm0, xmm2
    movdqu      xmm1, [r2+r3]
    MOVDQ       xmm2, [r0+r1]
    psadbw      xmm1, xmm2
    paddw       xmm0, xmm1

    ; rows 2-7: each expansion steps both pointers two rows, then adds two rows
%rep 3
    SSE2_GetSad2x16
%endrep

    ; add the two 64-bit psadbw halves and return the total
    movhlps     xmm1, xmm0
    paddw       xmm0, xmm1
    movd        retrd, xmm0
    LOAD_4_PARA_POP
    ret



; WelsSampleSad8x16_sse2: SAD over 8-byte-wide rows, 16 rows in total.
;   After the prologue r0 / r1 are the first pointer and stride, r2 / r3 the
;   second pointer and stride; the total is returned in retrd.
WELS_EXTERN WelsSampleSad8x16_sse2
    %assign  push_num 0
    LOAD_4_PARA
    PUSH_XMM 7
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm6, xmm6          ; SAD accumulator used by SSE2_GetSad8x4

    ; SSE2_GetSad8x4 leaves both pointers two rows further down; the extra
    ; two-row step brings them to the next group of four rows
    SSE2_GetSad8x4
%rep 3
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    SSE2_GetSad8x4
%endrep

    ; add the two 64-bit psadbw halves and return the total
    movhlps     xmm0, xmm6
    paddw       xmm0, xmm6
    movd        retrd, xmm0
    POP_XMM
    LOAD_4_PARA_POP
    ret


; CACHE_SPLIT_CHECK  %1 = register holding an address (DESTROYED: reduced to
;                         its offset), %2 = access width in bytes,
;                    %3 = cache line size in bytes
; Leaves the flags of "offset-in-line  cmp  threshold" for the caller to
; branch on. With %3 = 64 the mask is 0x1f|0x20 = 0x3f and, for %2 = 8, the
; threshold is (32-8)|0x20 = 56, i.e. 64 - 8: a "less or equal" outcome means
; the %2-byte access stays inside one cache line.
%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
and    %1,  0x1f|(%3>>1)
cmp    %1,  (32-%2)|(%3>>1)
%endmacro

; WelsSampleSad8x8_sse21: 8x8 SAD with a special path for the case where an
; 8-byte row load from the third argument would straddle a 64-byte cache line.
;   arg1 / arg2 = first pointer and stride, arg3 / arg4 = second pointer and
;   stride; the total is returned in retrd.
;   Split path   : every row of the second block is rebuilt from two 8-byte
;                  aligned loads that are shifted and OR-ed together.
;   No-split path: two plain SSE2_GetSad8x4 groups.
WELS_EXTERN WelsSampleSad8x8_sse21
    %assign  push_num 0
    mov         r2, arg3
    push        r2                          ; CACHE_SPLIT_CHECK destroys its register
    CACHE_SPLIT_CHECK r2, 8, 64
    jle near    .pixel_sad_8x8_nsplit
    pop         r2
%ifdef X86_32
    push        r3
    push        r4
    push        r5
%endif
    %assign  push_num 3
    PUSH_XMM 8
    mov         r0, arg1
    mov         r1, arg2
    SIGN_EXTENSION r1, r1d
    pxor        xmm7, xmm7                  ; SAD accumulator

    ; r5 = misalignment of r2 within 8 bytes; r2 is rounded down to the
    ; 8-byte boundary and r4 = 8 - misalignment
    mov         r5, r2
    and         r5, 0x07
    sub         r2, r5
    mov         r4, 8
    sub         r4, r5

    ; shift counts in bits: xmm5 = 8*misalignment, xmm6 = 8*(8 - misalignment)
    shl         r5, 3
    shl         r4, 3
    movd        xmm5, r5d
    movd        xmm6, r4d
    lea         r5, [r2+8]                  ; the following aligned 8-byte word
    mov         r3, arg4
    SIGN_EXTENSION r3, r3d

    ; ---- rows 0-1, then three more row pairs
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]
    movq        xmm1, [r2]
    movq        xmm2, [r5]
    movhps      xmm1, [r2+r3]
    movhps      xmm2, [r5+r3]
    psrlq       xmm1, xmm5                  ; drop the bytes before the row start
    psllq       xmm2, xmm6                  ; move the spill-over bytes into place
    por         xmm1, xmm2
    psadbw      xmm0, xmm1
    paddw       xmm7, xmm0
%rep 3
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    lea         r5, [r5+2*r3]

    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]
    movq        xmm1, [r2]
    movq        xmm2, [r5]
    movhps      xmm1, [r2+r3]
    movhps      xmm2, [r5+r3]
    psrlq       xmm1, xmm5
    psllq       xmm2, xmm6
    por         xmm1, xmm2
    psadbw      xmm0, xmm1
    paddw       xmm7, xmm0
%endrep

    ; add the two 64-bit psadbw halves and return the total
    movhlps     xmm0, xmm7
    paddw       xmm0, xmm7
    movd        retrd, xmm0
    POP_XMM
%ifdef X86_32
    pop         r5
    pop         r4
    pop         r3
%endif
    jmp         .return

.pixel_sad_8x8_nsplit:
    ; no cache-line split: discard the saved copy and run the plain 8x8 SAD
    pop         r2
    %assign  push_num 0
    LOAD_4_PARA
    PUSH_XMM 7
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm6, xmm6                  ; SAD accumulator used by SSE2_GetSad8x4
    SSE2_GetSad8x4
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    SSE2_GetSad8x4
    movhlps     xmm0, xmm6
    paddw       xmm0, xmm6
    movd        retrd, xmm0
    POP_XMM
    LOAD_4_PARA_POP
.return:
    ret


;***********************************************************************
;
;Pixel_sad_wxh_sse2 END
;
;***********************************************************************


;***********************************************************************
;
;Pixel_sad_4_wxh_sse2 BEGIN
;
;***********************************************************************


; SSE2_Get4LW16Sad  %1, %2, %3, %4, %5
;   %1 = source row k-1 (destroyed)      %2 = source row k
;   %3 = source row k+1                  %4 = reference row k (destroyed)
;   %5 = address expression of reference row k (used as [%5-1] and [%5+1])
;   Accumulates, for one row step:
;     xmm5 += SAD(source k-1, reference k)     -- reference shifted down one row
;     xmm4 += SAD(source k+1, reference k)     -- reference shifted up one row
;     xmm6 += SAD(source k,   reference k, one byte to the left)
;     xmm7 += SAD(source k,   reference k, one byte to the right)
%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
    psadbw %1,   %4
    paddw  xmm5, %1
    psadbw %4,   %3
    paddw  xmm4, %4
    movdqu %4,   [%5-1]
    psadbw %4,   %2
    paddw  xmm6, %4
    movdqu %4,   [%5+1]
    psadbw %4,   %2
    paddw  xmm7, %4
%endmacro
; WelsSampleSadFour16x16_sse2 (five arguments, loaded by LOAD_5_PARA)
;   r0 / r1 : source block and stride (rows read with movdqa)
;   r2 / r3 : reference block and stride (rows read with movdqu)
;   r4      : output, four dwords written with one movdqa:
;             [0] SAD against the reference one row up
;             [1] SAD against the reference one row down
;             [2] SAD against the reference one byte to the left
;             [3] SAD against the reference one byte to the right
;   The three most recent source rows rotate through xmm0 / xmm1 / xmm2, so
;   the register pattern repeats every three row pairs.
WELS_EXTERN WelsSampleSadFour16x16_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm4, xmm4                  ; sad pRefMb-i_stride_ref
    pxor        xmm5, xmm5                  ; sad pRefMb+i_stride_ref
    pxor        xmm6, xmm6                  ; sad pRefMb-1
    pxor        xmm7, xmm7                  ; sad pRefMb+1

    ; ---- lead-in: source rows 0 and 1 against reference rows -1 and 0
    sub         r2, r3                      ; r2 -> reference row -1
    movdqa      xmm0, [r0]
    movdqu      xmm3, [r2]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    movdqa      xmm1, [r0+r1]
    movdqu      xmm3, [r2+r3]
    psadbw      xmm3, xmm1
    paddw       xmm4, xmm3

    movdqu      xmm2, [r2+r3-1]
    psadbw      xmm2, xmm0
    paddw       xmm6, xmm2

    movdqu      xmm3, [r2+r3+1]
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    ; ---- twelve row steps: one full register rotation (three row pairs), twice
%rep 2
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm2, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
    movdqa      xmm0, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm1, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
    movdqa      xmm2, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm0, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
    movdqa      xmm1, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
%endrep

    ; ---- two more row steps (loads the last two source rows)
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm2, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
    movdqa      xmm0, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3

    ; ---- tail: xmm2 / xmm0 hold the last two source rows
    lea         r2, [r2+2*r3]
    movdqu      xmm3, [r2]
    psadbw      xmm2, xmm3
    paddw       xmm5, xmm2

    movdqu      xmm2, [r2-1]
    psadbw      xmm2, xmm0
    paddw       xmm6, xmm2

    movdqu      xmm3, [r2+1]
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    movdqu      xmm3, [r2+r3]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0

    ; ---- add the 64-bit psadbw halves of each accumulator and pack the four totals
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    movhlps     xmm0, xmm5
    paddw       xmm5, xmm0
    movhlps     xmm0, xmm6
    paddw       xmm6, xmm0
    movhlps     xmm0, xmm7
    paddw       xmm7, xmm0
    punpckldq   xmm4, xmm5
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm4, xmm6
    movdqa      [r4], xmm4
    POP_XMM
    LOAD_5_PARA_POP
    ret


;***********************************************************************
;   WelsSampleSadFour16x8_sse2
;
;   Accumulates four sums of absolute differences (psadbw) between a
;   16-byte-wide, 8-row block and the reference displaced by one row up,
;   one row down, one byte left and one byte right.
;
;   r0 : block rows, read with movdqa (so each row must be 16-byte aligned)
;   r1 : byte stride between block rows (sign-extended from r1d)
;   r2 : reference rows at the undisplaced position, read with movdqu
;   r3 : byte stride between reference rows (sign-extended from r3d)
;   r4 : destination of four packed dwords, written with movdqa:
;        [r4+0]  ref - stride   (xmm4)
;        [r4+4]  ref + stride   (xmm5)
;        [r4+8]  ref - 1        (xmm6)
;        [r4+12] ref + 1        (xmm7)
;
;   NOTE(review): SSE2_Get4LW16Sad, LOAD_5_PARA, PUSH_XMM and friends are
;   macros defined outside this block; the macro presumably adds one
;   reference row's contribution to xmm4-xmm7 - confirm against its
;   definition before changing the call sequence below.
;***********************************************************************
WELS_EXTERN WelsSampleSadFour16x8_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    pxor        xmm4, xmm4              ; sad pRefMb-i_stride_ref
    pxor        xmm5, xmm5              ; sad pRefMb+i_stride_ref
    pxor        xmm6, xmm6              ; sad pRefMb-1
    pxor        xmm7, xmm7              ; sad pRefMb+1
    sub         r2, r3                  ; r2 -> reference row -1

    ; block row 0 against reference row -1 (up)
    movdqa      xmm0, [r0]
    movdqu      xmm3, [r2]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    ; block row 1 against reference row 0 (up)
    movdqa      xmm1, [r0+r1]
    movdqu      xmm3, [r2+r3]
    psadbw      xmm3, xmm1
    paddw       xmm4, xmm3

    ; block row 0 against reference row 0 shifted left, then right
    movdqu      xmm2, [r2+r3-1]
    psadbw      xmm2, xmm0
    paddw       xmm6, xmm2
    movdqu      xmm3, [r2+r3+1]
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    ; From here xmm0/xmm1/xmm2 rotate as a three-row window of block rows,
    ; and xmm3 carries the reference row handed to the macro.
    lea         r0, [r0+2*r1]           ; r0 -> block row 2
    lea         r2, [r2+2*r3]           ; r2 -> reference row 1
    movdqa      xmm2, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
    movdqa      xmm0, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3

    lea         r0, [r0+2*r1]           ; r0 -> block row 4
    lea         r2, [r2+2*r3]           ; r2 -> reference row 3
    movdqa      xmm1, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
    movdqa      xmm2, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3

    lea         r0, [r0+2*r1]           ; r0 -> block row 6
    lea         r2, [r2+2*r3]           ; r2 -> reference row 5
    movdqa      xmm0, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
    movdqa      xmm1, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3

    ; Tail: xmm0 = block row 6, xmm1 = block row 7.
    lea         r2, [r2+2*r3]           ; r2 -> reference row 7
    movdqu      xmm3, [r2]
    psadbw      xmm0, xmm3              ; block row 6 vs reference row 7 (down)
    paddw       xmm5, xmm0

    movdqu      xmm0, [r2-1]
    psadbw      xmm0, xmm1              ; block row 7 vs reference row 7 - 1
    paddw       xmm6, xmm0

    movdqu      xmm3, [r2+1]
    psadbw      xmm3, xmm1              ; block row 7 vs reference row 7 + 1
    paddw       xmm7, xmm3

    movdqu      xmm3, [r2+r3]
    psadbw      xmm1, xmm3              ; block row 7 vs reference row 8 (down)
    paddw       xmm5, xmm1

    ; psadbw produces one partial sum per 64-bit half: fold the high half
    ; onto the low half, then pack the four totals into one register.
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    movhlps     xmm0, xmm5
    paddw       xmm5, xmm0
    punpckldq   xmm4, xmm5              ; low qword = up | down
    movhlps     xmm0, xmm6
    paddw       xmm6, xmm0
    movhlps     xmm0, xmm7
    paddw       xmm7, xmm0
    punpckldq   xmm6, xmm7              ; low qword = left | right
    punpcklqdq  xmm4, xmm6              ; up, down, left, right
    movdqa      [r4], xmm4              ; aligned store of the four results
    POP_XMM
    LOAD_5_PARA_POP
    ret

;***********************************************************************
;   WelsSampleSadFour8x16_sse2
;
;   Accumulates four sums of absolute differences (psadbw) between an
;   8-byte-wide, 16-row block and the reference displaced by one row up,
;   one row down, one byte left and one byte right.
;
;   r0 : block rows, read 8 bytes at a time (movq / movhps)
;   r1 : byte stride between block rows (sign-extended from r1d)
;   r2 : reference rows at the undisplaced position
;   r3 : byte stride between reference rows (sign-extended from r3d)
;   r4 : destination of four packed dwords, written with movdqa:
;        [r4+0]  ref - stride   (xmm4)
;        [r4+4]  ref + stride   (xmm5)
;        [r4+8]  ref - 1        (xmm6)
;        [r4+12] ref + 1        (xmm7)
;
;   Two rows are handled per pass: the even row sits in the low half of a
;   register and the odd row in the high half.  %rep unrolls the pass at
;   assembly time, so the generated code is straight-line with no branch.
;***********************************************************************
WELS_EXTERN WelsSampleSadFour8x16_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    pxor        xmm4, xmm4              ; sad pRefMb-i_stride_ref
    pxor        xmm5, xmm5              ; sad pRefMb+i_stride_ref
    pxor        xmm6, xmm6              ; sad pRefMb-1
    pxor        xmm7, xmm7              ; sad pRefMb+1

    sub         r2, r3                  ; r2 -> reference row -1
    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]           ; xmm3 = reference rows -1 | 0

    ; Invariant at the top of pass k (k = 0..7):
    ;   r0   -> block row 2k
    ;   r2   -> reference row 2k-1
    ;   xmm3 =  reference rows 2k-1 | 2k
%rep 8
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]           ; xmm0 = block rows 2k | 2k+1
    psadbw      xmm3, xmm0              ; against the rows above
    paddw       xmm4, xmm3

    movq        xmm1, [r2+r3-1]         ; reference row 2k, one byte left
    movq        xmm3, [r2+r3+1]         ; reference row 2k, one byte right
    lea         r0, [r0+2*r1]           ; r0 -> block row 2k+2
    lea         r2, [r2+2*r3]           ; r2 -> reference row 2k+1
    movhps      xmm1, [r2-1]            ; reference row 2k+1, one byte left
    movhps      xmm3, [r2+1]            ; reference row 2k+1, one byte right
    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]           ; xmm3 = reference rows 2k+1 | 2k+2
    psadbw      xmm0, xmm3              ; against the rows below
    paddw       xmm5, xmm0
%endrep

    ; psadbw produces one partial sum per 64-bit half: fold the high half
    ; onto the low half, then pack the four totals into one register.
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    movhlps     xmm0, xmm5
    paddw       xmm5, xmm0
    punpckldq   xmm4, xmm5              ; low qword = up | down
    movhlps     xmm0, xmm6
    paddw       xmm6, xmm0
    movhlps     xmm0, xmm7
    paddw       xmm7, xmm0
    punpckldq   xmm6, xmm7              ; low qword = left | right
    punpcklqdq  xmm4, xmm6              ; up, down, left, right
    movdqa      [r4], xmm4              ; aligned store of the four results
    POP_XMM
    LOAD_5_PARA_POP
    ret


;***********************************************************************
;   WelsSampleSadFour8x8_sse2
;
;   Accumulates four sums of absolute differences (psadbw) between an
;   8-byte-wide, 8-row block and the reference displaced by one row up,
;   one row down, one byte left and one byte right.
;
;   r0 : block rows, read 8 bytes at a time (movq / movhps)
;   r1 : byte stride between block rows (sign-extended from r1d)
;   r2 : reference rows at the undisplaced position
;   r3 : byte stride between reference rows (sign-extended from r3d)
;   r4 : destination of four packed dwords, written with movdqa:
;        [r4+0]  ref - stride   (xmm4)
;        [r4+4]  ref + stride   (xmm5)
;        [r4+8]  ref - 1        (xmm6)
;        [r4+12] ref + 1        (xmm7)
;
;   Two rows are handled per pass: the even row sits in the low half of a
;   register and the odd row in the high half.  %rep unrolls the pass at
;   assembly time, so the generated code is straight-line with no branch.
;***********************************************************************
WELS_EXTERN WelsSampleSadFour8x8_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    pxor        xmm4, xmm4              ; sad pRefMb-i_stride_ref
    pxor        xmm5, xmm5              ; sad pRefMb+i_stride_ref
    pxor        xmm6, xmm6              ; sad pRefMb-1
    pxor        xmm7, xmm7              ; sad pRefMb+1

    sub         r2, r3                  ; r2 -> reference row -1
    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]           ; xmm3 = reference rows -1 | 0

    ; Invariant at the top of pass k (k = 0..3):
    ;   r0   -> block row 2k
    ;   r2   -> reference row 2k-1
    ;   xmm3 =  reference rows 2k-1 | 2k
%rep 4
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]           ; xmm0 = block rows 2k | 2k+1
    psadbw      xmm3, xmm0              ; against the rows above
    paddw       xmm4, xmm3

    movq        xmm1, [r2+r3-1]         ; reference row 2k, one byte left
    movq        xmm3, [r2+r3+1]         ; reference row 2k, one byte right
    lea         r0, [r0+2*r1]           ; r0 -> block row 2k+2
    lea         r2, [r2+2*r3]           ; r2 -> reference row 2k+1
    movhps      xmm1, [r2-1]            ; reference row 2k+1, one byte left
    movhps      xmm3, [r2+1]            ; reference row 2k+1, one byte right
    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]           ; xmm3 = reference rows 2k+1 | 2k+2
    psadbw      xmm0, xmm3              ; against the rows below
    paddw       xmm5, xmm0
%endrep

    ; psadbw produces one partial sum per 64-bit half: fold the high half
    ; onto the low half, then pack the four totals into one register.
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    movhlps     xmm0, xmm5
    paddw       xmm5, xmm0
    punpckldq   xmm4, xmm5              ; low qword = up | down
    movhlps     xmm0, xmm6
    paddw       xmm6, xmm0
    movhlps     xmm0, xmm7
    paddw       xmm7, xmm0
    punpckldq   xmm6, xmm7              ; low qword = left | right
    punpcklqdq  xmm4, xmm6              ; up, down, left, right
    movdqa      [r4], xmm4              ; aligned store of the four results
    POP_XMM
    LOAD_5_PARA_POP
    ret

;***********************************************************************
;   WelsSampleSadFour4x4_sse2
;
;   Computes four sums of absolute differences (psadbw) between a 4x4
;   block and the reference displaced by one row up, one row down, one
;   byte left and one byte right.  Each 16-byte vector holds four rows of
;   four bytes, so every displacement needs a single psadbw.
;
;   r0 : block rows, read 4 bytes at a time (movd)
;   r1 : byte stride between block rows (sign-extended from r1d)
;   r2 : reference rows at the undisplaced position
;   r3 : byte stride between reference rows (sign-extended from r3d)
;   r4 : destination of four packed dwords, written with movdqa:
;        [r4+0]  ref - stride   (xmm1)
;        [r4+4]  ref + stride   (xmm4)
;        [r4+8]  ref - 1        (xmm2)
;        [r4+12] ref + 1        (xmm3)
;***********************************************************************
WELS_EXTERN WelsSampleSadFour4x4_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; xmm0 = block rows 0 | 1 | 2 | 3
    movd        xmm0, [r0]
    movd        xmm1, [r0+r1]
    punpckldq   xmm0, xmm1
    lea         r0, [r0+2*r1]
    movd        xmm2, [r0]
    movd        xmm3, [r0+r1]
    punpckldq   xmm2, xmm3
    punpcklqdq  xmm0, xmm2

    sub         r2, r3                  ; r2 -> reference row -1
    movd        xmm1, [r2]
    movd        xmm2, [r2+r3]
    punpckldq   xmm1, xmm2              ; xmm1 = reference rows -1 | 0
    movd        xmm2, [r2+r3-1]         ; reference row 0, one byte left
    movd        xmm3, [r2+r3+1]         ; reference row 0, one byte right

    lea         r2, [r2+2*r3]           ; r2 -> reference row 1

    movd        xmm4, [r2]
    movd        xmm5, [r2-1]
    punpckldq   xmm2, xmm5              ; left:  rows 0 | 1
    movd        xmm5, [r2+1]
    punpckldq   xmm3, xmm5              ; right: rows 0 | 1

    movd        xmm5, [r2+r3]
    punpckldq   xmm4, xmm5              ; xmm4 = reference rows 1 | 2

    punpcklqdq  xmm1, xmm4              ; -L: reference rows -1 | 0 | 1 | 2

    movd        xmm5, [r2+r3-1]         ; reference row 2, one byte left
    movd        xmm6, [r2+r3+1]         ; reference row 2, one byte right

    lea         r2, [r2+2*r3]           ; r2 -> reference row 3
    movd        xmm7, [r2-1]
    punpckldq   xmm5, xmm7
    punpcklqdq  xmm2, xmm5              ; -1: rows 0..3 shifted left
    movd        xmm7, [r2+1]
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm3, xmm6              ; +1: rows 0..3 shifted right
    movd        xmm6, [r2]
    movd        xmm7, [r2+r3]
    punpckldq   xmm6, xmm7              ; xmm6 = reference rows 3 | 4
    punpcklqdq  xmm4, xmm6              ; +L: reference rows 1 | 2 | 3 | 4

    psadbw      xmm1, xmm0
    psadbw      xmm2, xmm0
    psadbw      xmm3, xmm0
    psadbw      xmm4, xmm0

    ; xmm0 is free now and serves as scratch.  psadbw produces one partial
    ; sum per 64-bit half: fold high onto low, then pack the four totals.
    movhlps     xmm0, xmm1
    paddw       xmm1, xmm0
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    punpckldq   xmm1, xmm4              ; low qword = up | down
    movhlps     xmm0, xmm2
    paddw       xmm2, xmm0
    movhlps     xmm0, xmm3
    paddw       xmm3, xmm0
    punpckldq   xmm2, xmm3              ; low qword = left | right
    punpcklqdq  xmm1, xmm2              ; up, down, left, right
    movdqa      [r4], xmm1              ; aligned store of the four results
    POP_XMM
    LOAD_5_PARA_POP
    ret

;***********************************************************************
;
;Pixel_sad_4_wxh_sse2 END
;
;***********************************************************************

;***********************************************************************
;   int32_t WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
;
;   Sum of absolute differences between two 4x4 byte blocks.
;
;   r0 : first block,  r1 : its byte stride (sign-extended from r1d)
;   r2 : second block, r3 : its byte stride (sign-extended from r3d)
;   Result is returned in retrd.
;
;   Two 4-byte rows are packed into each 64-bit MMX register so that a
;   single psadbw covers a row pair.  Note that psadbw on MMX registers
;   is an SSE-era extension rather than part of the base MMX set.
;***********************************************************************
WELS_EXTERN WelsSampleSad4x4_mmx
    %assign push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; rows 0 and 1
    movd        mm0, [r0]
    movd        mm1, [r0+r1]
    movd        mm3, [r2]
    movd        mm4, [r2+r3]
    punpckldq   mm0, mm1                ; mm0 = first block rows 0 | 1
    punpckldq   mm3, mm4                ; mm3 = second block rows 0 | 1
    psadbw      mm0, mm3

    lea         r0, [r0+2*r1]           ; step both pointers to row 2
    lea         r2, [r2+2*r3]

    ; rows 2 and 3
    movd        mm1, [r0]
    movd        mm2, [r0+r1]
    movd        mm3, [r2]
    movd        mm4, [r2+r3]
    punpckldq   mm1, mm2                ; mm1 = first block rows 2 | 3
    punpckldq   mm3, mm4                ; mm3 = second block rows 2 | 3
    psadbw      mm1, mm3
    paddw       mm0, mm1                ; total over all four rows

    movd        retrd, mm0

    WELSEMMS
    LOAD_4_PARA_POP
    ret