ref: 08638a9396dd466fdb5c68068b5aab8853def896
parent: 3c40261256f6e89bd868f9fafba8b9c4fd9ea15d
parent: 04dba61d22ab43c1f428302f2abbaecca723fe30
author: Ethan Hugg <[email protected]>
date: Tue Jan 28 06:04:12 EST 2014
Merge pull request #252 from mstorsjo/share-processing-asm Merge identical assembly code between the processing and encoder libs
--- a/codec/build/win32/enc/WelsEncCore.vcproj
+++ b/codec/build/win32/enc/WelsEncCore.vcproj
@@ -2106,7 +2106,7 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\encoder\core\asm\satd_sad.asm"
+ RelativePath="..\..\..\common\satd_sad.asm"
>
<FileConfiguration
Name="Debug|Win32"
--- /dev/null
+++ b/codec/common/satd_sad.asm
@@ -1,0 +1,2344 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* satd_sad.asm
+;*
+;* Abstract
+;* WelsSampleSatd4x4_sse2
+;* WelsSampleSatd8x8_sse2
+;* WelsSampleSatd16x8_sse2
+;* WelsSampleSatd8x16_sse2
+;* WelsSampleSatd16x16_sse2
+;*
+;* WelsSampleSad16x8_sse2
+;* WelsSampleSad16x16_sse2
+;*
+;* History
+;* 8/5/2009 Created
+;* 24/9/2009 modified
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Data
+;***********************************************************************
+SECTION .rodata align=16
+
+; Constant tables. Each one is 16-byte aligned because the code in this file
+; loads them with movdqa.
+
+; pmaddubsw multiplier: eight +1 bytes followed by four (+1,-1) byte pairs.
+; With the same 8 pixels in both halves of the other operand this yields
+; pair sums in the low four words and pair differences in the high four.
+align 16
+HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
+; pmaddwd multiplier of alternating +1/-1 words (adjacent-word differences).
+align 16
+HSumSubDW1: dw 1,-1,1,-1,1,-1,1,-1
+; pmaddwd multiplier of all-one words (adjacent-word sums).
+align 16
+PDW1: dw 1,1,1,1,1,1,1,1
+; The value 2 in the low word of each qword; used as the rounding term of
+; the "(sum+2)>>2" step in SSE41_ChromaGetX38x8Satd.
+align 16
+PDQ2: dw 2,0,0,0,2,0,0,0
+; pmaddubsw multiplier used by WelsSampleSatd4x4_sse41: per 8 bytes, two
+; (+1,+1) pairs followed by two (+1,-1) pairs.
+align 16
+HSwapSumSubDB1: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 BEGIN
+;
+;***********************************************************************
+; MMX_DW_1_2REG dst, tmp
+; Fills every 16-bit lane of dst with the value 1.
+; tmp is clobbered (left with all bits set).
+%macro MMX_DW_1_2REG 2
+ pcmpeqw %2, %2 ; tmp = 0xFFFF (-1) in every word
+ pxor %1, %1 ; dst = 0
+ psubw %1, %2 ; dst = 0 - (-1) = 1 in every word
+%endmacro
+
+; SSE2_SumWHorizon1 acc, tmp
+; Folds the eight words of acc into its lowest word using unsigned
+; saturating adds (paddusw). Only word 0 of acc holds the full sum
+; afterwards; the other words hold partial sums. tmp is scratch.
+%macro SSE2_SumWHorizon1 2
+ movdqa %2, %1
+ psrldq %2, 8
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 4
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 2
+ paddusw %1, %2
+%endmacro
+
+; SSE2_HDMTwo4x4 a, b, c, d, tmp
+; Applies four SSE2_SumSub butterflies: (a,b) and (c,d), then (b,d) and (a,c).
+; SSE2_SumSub is defined outside this file; the register order of the
+; results (see the pOut note below) follows from that macro - TODO confirm.
+; tmp is scratch.
+%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
+ SSE2_SumSub %1, %2, %5
+ SSE2_SumSub %3, %4, %5
+ SSE2_SumSub %2, %4, %5
+ SSE2_SumSub %1, %3, %5
+%endmacro
+
+; SSE2_SumAbs4 a, b, tmp1, c, d, tmp2, acc
+; Passes a, b (scratch tmp1) and c, d (scratch tmp2) through WELS_AbsW
+; (defined outside this file; presumably a per-word absolute value), then
+; adds all four into acc with unsigned saturating word adds.
+; a and c are clobbered with the partial sums a+b and c+d.
+%macro SSE2_SumAbs4 7
+ WELS_AbsW %1, %3
+ WELS_AbsW %2, %3
+ WELS_AbsW %4, %6
+ WELS_AbsW %5, %6
+ paddusw %1, %2
+ paddusw %4, %5
+ paddusw %7, %1
+ paddusw %7, %4
+%endmacro
+
+; SSE2_SumWHorizon acc, tmp, zero
+; Horizontal sum of the eight words of acc, left in the low dword of acc.
+; tmp is scratch. The third operand is interleaved with acc by punpcklwd
+; to widen words to dwords, so it is expected to hold zero.
+%macro SSE2_SumWHorizon 3
+ movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
+ paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
+ pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
+ paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
+%endmacro
+
+; SSE2_GetSatd8x8
+; Processes eight rows of two 8-pixel-wide blocks as two groups of four rows.
+; In: r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride,
+; xmm6 = running accumulator, xmm7 = third operand of SSE2_LoadDiff8P
+; (callers zero it first; presumably the zero register for unpacking).
+; Out: xmm6 += absolute values of the transformed differences (paddusw).
+; r0 and r2 are left pointing at row 6 of the block (three 2-row
+; advances), so callers add two more rows to reach the next 8x8.
+; Clobbers xmm0-xmm5.
+; SSE2_LoadDiff8P / SSE2_TransTwo4x4W are defined outside this file.
+%macro SSE2_GetSatd8x8 0
+ ; rows 0-3: load differences
+ SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+ ; butterfly, transpose, butterfly, then accumulate absolute values in xmm6
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
+ SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+
+ ; rows 4-7: same sequence
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
+ SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+%endmacro
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+; As used below: r0/r1 = first block pointer/stride, r2/r3 = second block
+; pointer/stride (loaded by LOAD_4_PARA, defined outside this file).
+; Result (retrd): ((sum of absolute transformed differences) & 0xffff) >> 1.
+; Uses xmm0-xmm7 as scratch.
+; NOTE(review): xmm6/xmm7 are callee-saved under the Microsoft x64 ABI and
+; are not saved here - confirm how Win64 builds handle this.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse2
+align 16
+WelsSampleSatd4x4_sse2:
+ ;push ebx
+ ;mov eax, [esp+8]
+ ;mov ebx, [esp+12]
+ ;mov ecx, [esp+16]
+ ;mov edx, [esp+20]
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ ; first block: rows 0 and 2 packed into xmm0, rows 1 and 3 into xmm1
+ movd xmm0, [r0]
+ movd xmm1, [r0+r1]
+ lea r0 , [r0+2*r1]
+ movd xmm2, [r0]
+ movd xmm3, [r0+r1]
+ punpckldq xmm0, xmm2
+ punpckldq xmm1, xmm3
+
+ ; second block: rows 0 and 2 packed into xmm4, rows 1 and 3 into xmm5
+ movd xmm4, [r2]
+ movd xmm5, [r2+r3]
+ lea r2 , [r2+2*r3]
+ movd xmm6, [r2]
+ movd xmm7, [r2+r3]
+ punpckldq xmm4, xmm6
+ punpckldq xmm5, xmm7
+
+ ; widen bytes to words; xmm6 stays zero and later serves as the accumulator
+ pxor xmm6, xmm6
+ punpcklbw xmm0, xmm6
+ punpcklbw xmm1, xmm6
+ punpcklbw xmm4, xmm6
+ punpcklbw xmm5, xmm6
+
+ ; pixel differences
+ psubw xmm0, xmm4
+ psubw xmm1, xmm5
+
+ ; sum/difference butterflies; SSE2_XSawp is defined outside this file
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+ SSE2_XSawp qdq, xmm0, xmm2, xmm3
+
+ movdqa xmm4, xmm0
+ paddw xmm0, xmm3
+ psubw xmm4, xmm3
+
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm4, xmm2
+
+ SSE2_XSawp dq, xmm0, xmm4, xmm3
+ SSE2_XSawp qdq, xmm0, xmm3, xmm5
+
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm5
+ psubw xmm7, xmm5
+
+ SSE2_XSawp qdq, xmm0, xmm7, xmm1
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+
+ ; absolute values accumulated into xmm6, then folded into its low word
+ WELS_AbsW xmm0, xmm3
+ paddusw xmm6, xmm0
+ WELS_AbsW xmm2, xmm4
+ paddusw xmm6, xmm2
+ SSE2_SumWHorizon1 xmm6, xmm4
+ movd retrd, xmm6
+ and retrd, 0xffff ; only word 0 holds the complete sum
+ shr retrd, 1
+ LOAD_4_PARA_POP
+ ret
+
+ ;***********************************************************************
+ ;
+ ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+ ;
+ ; As used below: r0/r1 = first block pointer/stride, r2/r3 = second block
+ ; pointer/stride. Runs SSE2_GetSatd8x8 once, halves each word of the
+ ; accumulator, and returns the horizontal sum in retrd.
+ ; NOTE(review): clobbers xmm6/xmm7, which are callee-saved under the
+ ; Microsoft x64 ABI - confirm how Win64 builds handle this.
+ ;
+ ;***********************************************************************
+ WELS_EXTERN WelsSampleSatd8x8_sse2
+align 16
+ WelsSampleSatd8x8_sse2:
+ ;push ebx
+ ;mov eax, [esp+8]
+ ;mov ebx, [esp+12]
+ ;mov ecx, [esp+16]
+ ;mov edx, [esp+20]
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm6, xmm6 ; accumulator
+ pxor xmm7, xmm7 ; zero operand for the macros below
+ SSE2_GetSatd8x8
+ psrlw xmm6, 1 ; halve each word before the horizontal sum
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ LOAD_4_PARA_POP
+ ret
+
+ ;***********************************************************************
+ ;
+ ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+ ;
+ ; As used below: r0/r1 = first block pointer/stride, r2/r3 = second block
+ ; pointer/stride. Runs SSE2_GetSatd8x8 on the upper and lower 8x8 halves
+ ; and returns the halved, horizontally summed accumulator in retrd.
+ ; NOTE(review): clobbers xmm6/xmm7, which are callee-saved under the
+ ; Microsoft x64 ABI - confirm how Win64 builds handle this.
+ ;
+ ;***********************************************************************
+ WELS_EXTERN WelsSampleSatd8x16_sse2
+align 16
+ WelsSampleSatd8x16_sse2:
+ ;push ebx
+ ;mov eax, [esp+8]
+ ;mov ebx, [esp+12]
+ ;mov ecx, [esp+16]
+ ;mov edx, [esp+20]
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm6, xmm6 ; accumulator
+ pxor xmm7, xmm7 ; zero operand for the macros below
+
+ SSE2_GetSatd8x8
+ ; the macro leaves r0/r2 at row 6; step two more rows to reach row 8
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8
+
+ psrlw xmm6, 1
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+; As used below: r0/r1 = first block pointer/stride, r2/r3 = second block
+; pointer/stride. Runs SSE2_GetSatd8x8 on the left 8 columns, then on the
+; right 8 columns (pointers + 8), and returns the halved, horizontally
+; summed accumulator in retrd.
+; NOTE(review): clobbers xmm6/xmm7, which are callee-saved under the
+; Microsoft x64 ABI - confirm how Win64 builds handle this.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse2
+align 16
+WelsSampleSatd16x8_sse2:
+ ;push ebx
+ ;mov eax, [esp+8]
+ ;mov ebx, [esp+12]
+ ;mov ecx, [esp+16]
+ ;mov edx, [esp+20]
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ ; keep the original block pointers; SSE2_GetSatd8x8 advances r0/r2
+ push r0
+ push r2
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+
+ SSE2_GetSatd8x8
+
+ pop r2
+ pop r0
+ ;mov eax, [esp+8]
+ ;mov ecx, [esp+16]
+ ; move to the right-hand 8 columns
+ add r0, 8
+ add r2, 8
+ SSE2_GetSatd8x8
+
+ psrlw xmm6, 1
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+; As used below: r0/r1 = first block pointer/stride, r2/r3 = second block
+; pointer/stride. Runs SSE2_GetSatd8x8 on the four 8x8 quadrants (left
+; column top/bottom, then right column top/bottom) and returns the halved,
+; horizontally summed accumulator in retrd.
+; NOTE(review): clobbers xmm6/xmm7, which are callee-saved under the
+; Microsoft x64 ABI - confirm how Win64 builds handle this.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x16_sse2
+align 16
+WelsSampleSatd16x16_sse2:
+ ;push ebx
+ ;mov eax, [esp+8]
+ ;mov ebx, [esp+12]
+ ;mov ecx, [esp+16]
+ ;mov edx, [esp+20]
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ ; keep the original block pointers; SSE2_GetSatd8x8 advances r0/r2
+ push r0
+ push r2
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+
+ ; left 8 columns: rows 0-7, then rows 8-15
+ SSE2_GetSatd8x8
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8
+
+ pop r2
+ pop r0
+ ;mov eax, [esp+8]
+ ;mov ecx, [esp+16]
+ ; right 8 columns: rows 0-7, then rows 8-15
+ add r0, 8
+ add r2, 8
+
+ SSE2_GetSatd8x8
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8
+
+ ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
+ psrlw xmm6, 1
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 BEGIN
+;
+;***********************************************************************
+
+; SSE41_I16x16Get8WSumSub val, tmp1, tmp2
+; In: val = 16 bytes (callers duplicate the same 8 pixels into both halves),
+; xmm5 = HSumSubDB1, xmm6 = HSumSubDW1, xmm7 = PDW1.
+; Combines the bytes with pmaddubsw/pmaddwd into dword sum/difference terms,
+; adds those dwords into xmm4 (the DC accumulator), then packs them to words
+; in val and shifts each word left by 2.
+; tmp1 and tmp2 are clobbered.
+%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
+ pmaddubsw %1, xmm5
+ movdqa %2, %1
+ pmaddwd %1, xmm7
+ pmaddwd %2, xmm6
+ movdqa %3, %1
+ punpckldq %1, %2
+ punpckhdq %2, %3
+ movdqa %3, %1
+ punpcklqdq %1, %2
+ punpckhqdq %3, %2
+ paddd xmm4, %1 ;for dc
+ paddd xmm4, %3 ;for dc
+ packssdw %1, %3
+ psllw %1, 2
+%endmacro
+; SSE41_ChromaGet8WSumSub val, tmp1, tmp2, dcOut
+; Same transform as SSE41_I16x16Get8WSumSub, except that nothing is added to
+; xmm4. Instead dcOut receives the low qwords of the two dword result
+; vectors (before packing) so the caller can derive the DC terms itself.
+; In: xmm5 = HSumSubDB1, xmm6 = HSumSubDW1, xmm7 = PDW1.
+; Out: val = packed words shifted left by 2. tmp1 and tmp2 are clobbered.
+%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
+ pmaddubsw %1, xmm5
+ movdqa %2, %1
+ pmaddwd %1, xmm7
+ pmaddwd %2, xmm6
+ movdqa %3, %1
+ punpckldq %1, %2
+ punpckhdq %2, %3
+ movdqa %3, %1
+ punpcklqdq %1, %2
+ punpckhqdq %3, %2
+; paddd xmm4, %1 ;for dc
+; paddd xmm4, %3 ;for dc
+ movdqa %4, %1
+ punpcklqdq %4, %3
+ packssdw %1, %3
+ psllw %1, 2
+%endmacro
+
+; SSE41_GetX38x4SatdDec
+; Loads four rows of 8 pixels from [eax] (stride ebx), widens them to words
+; and runs butterfly / transpose / butterfly on them.
+; Out: results in xmm7, xmm1, xmm3, xmm2 (see the pOut note below);
+; eax advanced by four rows. Clobbers xmm0.
+; SSE2_TransTwo4x4W is defined outside this file.
+%macro SSE41_GetX38x4SatdDec 0
+ pxor xmm7, xmm7
+ movq xmm0, [eax]
+ movq xmm1, [eax+ebx]
+ lea eax, [eax+2*ebx]
+ movq xmm2, [eax]
+ movq xmm3, [eax+ebx]
+ lea eax, [eax+2*ebx]
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
+ punpcklbw xmm2, xmm7
+ punpcklbw xmm3, xmm7
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
+ ;doesn't need another transpose
+%endmacro
+; SSE41_GetX38x4SatdV idx, offset
+; Accumulates the V-prediction cost for one 8x4 strip into xmm4.
+; For each of the four transformed rows (xmm7, xmm1, xmm3, xmm2) a vector
+; that is zero except for words 0 and 4 is built from the words stored at
+; [esi+offset+2k] and [esi+offset+2k+8]; the transformed row is subtracted
+; from it and the absolute values are added to xmm4.
+; The first macro argument is not referenced in the body. Clobbers xmm0.
+%macro SSE41_GetX38x4SatdV 2
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[esi+%2], 0
+ pinsrw xmm0, word[esi+%2+8], 4
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[esi+%2+2], 0
+ pinsrw xmm0, word[esi+%2+10], 4
+ psubsw xmm0, xmm1
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[esi+%2+4], 0
+ pinsrw xmm0, word[esi+%2+12], 4
+ psubsw xmm0, xmm3
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[esi+%2+6], 0
+ pinsrw xmm0, word[esi+%2+14], 4
+ psubsw xmm0, xmm2
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+%endmacro
+; SSE41_GetX38x4SatdH idx, unused, base
+; Accumulates the H-prediction cost for one 8x4 strip into xmm5.
+; The four words at [esi+base+8*idx] are duplicated into both halves of
+; xmm0 and compared against xmm7; the absolute values of xmm1, xmm2 and xmm3
+; are added as well.
+; Side effect: xmm1 and xmm3 are replaced by their absolute values and xmm2
+; by |xmm1|+|xmm2|+|xmm3|, which the following DC macro reuses.
+; The second macro argument is not referenced in the body.
+%macro SSE41_GetX38x4SatdH 3
+ movq xmm0, [esi+%3+8*%1]
+ punpcklqdq xmm0, xmm0
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm5, xmm0
+ pabsw xmm1, xmm1
+ pabsw xmm2, xmm2
+ pabsw xmm3, xmm3
+ paddw xmm2, xmm1;for DC
+ paddw xmm2, xmm3;for DC
+ paddw xmm5, xmm2
+%endmacro
+; SSE41_I16X16GetX38x4SatdDC
+; Accumulates the DC-prediction cost for one 8x4 strip into xmm6.
+; In: mm4 = DC term prepared by the caller, xmm7 = first transformed row,
+; xmm2 = |xmm1|+|xmm2|+|xmm3| as left behind by SSE41_GetX38x4SatdH.
+; Clobbers xmm0.
+%macro SSE41_I16X16GetX38x4SatdDC 0
+ ; movq2dq clears the upper qword itself and punpcklqdq then overwrites
+ ; it, so xmm0 needs no zeroing beforehand.
+ movq2dq xmm0, mm4
+ punpcklqdq xmm0, xmm0 ; same DC term in both halves
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm6, xmm0
+ paddw xmm6, xmm2
+%endmacro
+; SSE41_ChromaGetX38x4SatdDC idx
+; Accumulates the DC-prediction cost for one 8x4 chroma strip into xmm6.
+; In: idx = general-purpose register holding the strip index (a small
+; non-negative value), esi = scratch buffer whose 16-byte DC vectors
+; start at offset 32, xmm7 = first transformed row,
+; xmm2 = |xmm1|+|xmm2|+|xmm3| as left behind by SSE41_GetX38x4SatdH.
+; Clobbers xmm0 and the flags. idx is scaled by 16 for the address and then
+; restored, so a caller can keep using it as its loop counter.
+%macro SSE41_ChromaGetX38x4SatdDC 1
+ shl %1, 4 ; idx * 16 = byte offset of the DC vector
+ movdqa xmm0, [esi+32+%1]
+ shr %1, 4 ; undo the scaling: leave idx as the caller passed it
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm6, xmm0
+ paddw xmm6, xmm2
+%endmacro
+; SSE41_I16x16GetX38x4Satd idx, offset
+; One 8x4 strip of the luma 16x16 search: transforms the strip at [eax]
+; and adds its V, H and DC costs to xmm4, xmm5 and xmm6 respectively.
+; idx selects the H entry (base offset 32 in the esi buffer); offset selects
+; the V entry.
+%macro SSE41_I16x16GetX38x4Satd 2
+ SSE41_GetX38x4SatdDec
+ SSE41_GetX38x4SatdV %1, %2
+ SSE41_GetX38x4SatdH %1, %2, 32
+ SSE41_I16X16GetX38x4SatdDC
+%endmacro
+; SSE41_ChromaGetX38x4Satd idx, offset
+; One 8x4 strip of the chroma 8x8 search: transforms the strip at [eax]
+; and adds its V, H and DC costs to xmm4, xmm5 and xmm6 respectively.
+; idx selects the H entry (base offset 16 in the esi buffer) and the DC
+; vector; offset selects the V entry.
+; Note: SSE41_ChromaGetX38x4SatdDC shifts the idx register left by 4.
+%macro SSE41_ChromaGetX38x4Satd 2
+ SSE41_GetX38x4SatdDec
+ SSE41_GetX38x4SatdV %1, %2
+ SSE41_GetX38x4SatdH %1, %2, 16
+ SSE41_ChromaGetX38x4SatdDC %1
+%endmacro
+; SSE41_HSum8W acc, mul, tmp
+; Horizontal sum of the eight words of acc, left in the low dword of acc.
+; mul is the pmaddwd multiplier (callers pass a vector of word 1s, so
+; adjacent words are summed into dwords); tmp is scratch.
+%macro SSE41_HSum8W 3
+ pmaddwd %1, %2
+ movhlps %3, %1
+ paddd %1, %3
+ pshuflw %3, %1,0Eh
+ paddd %1, %3
+%endmacro
+
+
+%ifdef X86_32
+; WelsIntra16x16Combined3Satd_sse41 (x86-32 only, arguments on the stack)
+; Evaluates the V, H and DC 16x16 intra predictions with a SATD-style cost
+; and returns the cheapest one.
+; Stack arguments after the three pushes below:
+; [esp+16] ecx: pointer whose row above (ecx-edx) and column to the left
+; (ecx-1) supply the prediction pixels
+; [esp+20] edx: stride for that pointer
+; [esp+24] eax: pointer to the block being evaluated
+; [esp+28] ebx: stride for that block
+; [esp+32] : pointer receiving the chosen mode (0 = V, 1 = H, 2 = DC)
+; [esp+36] : value that is doubled and added to the H and DC costs
+; [esp+40] esi: scratch buffer (temp_satd), written with movdqa so it must
+; be 16-byte aligned; at least 64 bytes are used
+; Returns: eax = cost of the chosen mode.
+; Uses mm4 and ends with WELSEMMS (defined outside this file).
+WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
+WelsIntra16x16Combined3Satd_sse41:
+ push ebx
+ push esi
+ push edi
+ mov ecx, [esp+16]
+ mov edx, [esp+20]
+ mov eax, [esp+24]
+ mov ebx, [esp+28]
+ mov esi, [esp+40] ;temp_satd
+ pxor xmm4, xmm4
+ movdqa xmm5, [HSumSubDB1]
+ movdqa xmm6, [HSumSubDW1]
+ movdqa xmm7, [PDW1]
+ ; top neighbour row: 16 pixels, processed as two groups of 8
+ sub ecx, edx
+ movdqu xmm0, [ecx]
+ movhlps xmm1, xmm0
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+ SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+ movdqa [esi], xmm0 ;V
+ movdqa [esi+16], xmm1
+ ; left neighbour column: gather 16 pixels, one per row, into xmm0
+ add ecx, edx
+ pinsrb xmm0, byte[ecx-1], 0
+ pinsrb xmm0, byte[ecx+edx-1], 1
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 2
+ pinsrb xmm0, byte[ecx+edx-1], 3
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 4
+ pinsrb xmm0, byte[ecx+edx-1], 5
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 6
+ pinsrb xmm0, byte[ecx+edx-1], 7
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 8
+ pinsrb xmm0, byte[ecx+edx-1], 9
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 10
+ pinsrb xmm0, byte[ecx+edx-1], 11
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 12
+ pinsrb xmm0, byte[ecx+edx-1], 13
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 14
+ pinsrb xmm0, byte[ecx+edx-1], 15
+ movhlps xmm1, xmm0
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+ SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+ movdqa [esi+32], xmm0 ;H
+ movdqa [esi+48], xmm1
+ ; DC term from the sums accumulated in xmm4 by the four macro calls above
+ movd ecx, xmm4 ;dc
+ add ecx, 16 ;(sum+16)
+ shr ecx, 5 ;((sum+16)>>5)
+ shl ecx, 4 ;
+ movd mm4, ecx ; mm4 copy DC
+ ; xmm4/xmm5/xmm6 are reused as the V/H/DC cost accumulators
+ pxor xmm4, xmm4 ;V
+ pxor xmm5, xmm5 ;H
+ pxor xmm6, xmm6 ;DC
+ mov ecx, 0
+ mov edi, 0
+ ; outer pass: edi = 0 for the left 8 columns, 16 for the right 8 columns;
+ ; inner loop: ecx = 0..3 walks the four 8x4 strips of that column
+.loop16x16_get_satd:
+.loopStart1:
+ SSE41_I16x16GetX38x4Satd ecx, edi
+ inc ecx
+ cmp ecx, 4
+ jl .loopStart1
+ cmp edi, 16
+ je .loop16x16_get_satd_end
+ ; restart at the top of the block, 8 pixels to the right
+ mov eax, [esp+24]
+ add eax, 8
+ mov ecx, 0
+ add edi, 16
+ jmp .loop16x16_get_satd
+ .loop16x16_get_satd_end:
+ ; halve the per-word costs and reduce each accumulator to one dword
+ MMX_DW_1_2REG xmm0, xmm1
+ psrlw xmm4, 1 ;/2
+ psrlw xmm5, 1 ;/2
+ psrlw xmm6, 1 ;/2
+ SSE41_HSum8W xmm4, xmm0, xmm1
+ SSE41_HSum8W xmm5, xmm0, xmm1
+ SSE41_HSum8W xmm6, xmm0, xmm1
+
+ ; comparing order: DC H V
+ movd ebx, xmm6 ;DC
+ movd edi, xmm5 ;H
+ movd ecx, xmm4 ;V
+ ; add twice the [esp+36] argument to the H and DC costs (V gets no penalty)
+ mov edx, [esp+36]
+ shl edx, 1
+ add edi, edx
+ add ebx, edx
+ mov edx, [esp+32]
+ ; DC wins only if strictly cheaper than both H and V
+ cmp ebx, edi
+ jge near not_dc_16x16
+ cmp ebx, ecx
+ jge near not_dc_h_16x16
+
+ ; for DC mode
+ mov dword[edx], 2;I16_PRED_DC
+ mov eax, ebx
+ jmp near return_satd_intra_16x16_x3
+not_dc_16x16:
+ ; for H mode
+ ; H wins only if strictly cheaper than V
+ cmp edi, ecx
+ jge near not_dc_h_16x16
+ mov dword[edx], 1;I16_PRED_H
+ mov eax, edi
+ jmp near return_satd_intra_16x16_x3
+not_dc_h_16x16:
+ ; for V mode
+ mov dword[edx], 0;I16_PRED_V
+ mov eax, ecx
+return_satd_intra_16x16_x3:
+ WELSEMMS
+ pop edi
+ pop esi
+ pop ebx
+ret
+
+; SSE41_ChromaGetX38x8Satd
+; Computes the V/H/DC cost vectors for one 8x8 chroma plane.
+; In: ecx = pointer whose row above (ecx-edx) and column to the left (ecx-1)
+; supply the prediction pixels, edx = its stride,
+; eax = block being evaluated, ebx = its stride,
+; esi = 16-byte aligned scratch buffer (64 bytes are written).
+; Out: xmm4 = V costs, xmm5 = H costs, xmm6 = DC costs (per-word partial sums);
+; eax advanced by 8 rows. Clobbers ecx, xmm0-xmm3, xmm7 and the flags.
+; The loop label is macro-local (%%) so the macro can be expanded more than
+; once in a translation unit without a duplicate-label error.
+%macro SSE41_ChromaGetX38x8Satd 0
+ movdqa xmm5, [HSumSubDB1]
+ movdqa xmm6, [HSumSubDW1]
+ movdqa xmm7, [PDW1]
+ ; top neighbour row (8 pixels duplicated into both halves)
+ sub ecx, edx
+ movq xmm0, [ecx]
+ punpcklqdq xmm0, xmm0
+ SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+ movdqa [esi], xmm0 ;V
+ ; left neighbour column: gather 8 pixels, one per row
+ add ecx, edx
+ pinsrb xmm0, byte[ecx-1], 0
+ pinsrb xmm0, byte[ecx+edx-1], 1
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 2
+ pinsrb xmm0, byte[ecx+edx-1], 3
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 4
+ pinsrb xmm0, byte[ecx+edx-1], 5
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 6
+ pinsrb xmm0, byte[ecx+edx-1], 7
+ punpcklqdq xmm0, xmm0
+ SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+ movdqa [esi+16], xmm0 ;H
+;(sum+2)>>2
+ movdqa xmm6, [PDQ2]
+ movdqa xmm5, xmm4
+ punpckhqdq xmm5, xmm1
+ paddd xmm5, xmm6
+ psrld xmm5, 2
+;(sum1+sum2+4)>>3
+ paddd xmm6, xmm6
+ paddd xmm4, xmm1
+ paddd xmm4, xmm6
+ psrld xmm4, 3
+;satd *16
+ pslld xmm5, 4
+ pslld xmm4, 4
+;temp satd
+ movdqa xmm6, xmm4
+ punpcklqdq xmm4, xmm5
+ psllq xmm4, 32
+ psrlq xmm4, 32
+ movdqa [esi+32], xmm4
+ punpckhqdq xmm5, xmm6
+ psllq xmm5, 32
+ psrlq xmm5, 32
+ movdqa [esi+48], xmm5
+
+ pxor xmm4, xmm4 ;V
+ pxor xmm5, xmm5 ;H
+ pxor xmm6, xmm6 ;DC
+ xor ecx, ecx ; strip index; flags are not live here
+ ; two passes: upper and lower 8x4 strip of the plane
+%%loop_cb_cr:
+ SSE41_ChromaGetX38x4Satd ecx, 0
+ inc ecx
+ cmp ecx, 2
+ jl %%loop_cb_cr
+%endmacro
+
+; SSEReg2MMX src, mmLow, mmHigh
+; Copies the low qword of src to mmLow and the high qword to mmHigh.
+; Side effect: the low qword of src is overwritten with its high qword.
+%macro SSEReg2MMX 3
+ movdq2q %2, %1
+ movhlps %1, %1
+ movdq2q %3, %1
+%endmacro
+; MMXReg2SSE dst, tmp, mmLow, mmHigh
+; Rebuilds a 128-bit value in dst: low qword from mmLow, high qword from
+; mmHigh. tmp is clobbered (it ends up holding mmHigh in its low qword).
+%macro MMXReg2SSE 4
+ movq2dq %1, %3
+ movq2dq %2, %4
+ punpcklqdq %1, %2
+%endmacro
+;to reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
+
+; WelsIntraChroma8x8Combined3Satd_sse41 (x86-32 only, arguments on the stack)
+; Evaluates the DC, H and V 8x8 chroma intra predictions over two planes
+; with a SATD-style cost and returns the cheapest one.
+; Stack arguments after the three pushes below:
+; [esp+16] ecx: first plane, pointer whose neighbours supply the prediction
+; [esp+20] edx: stride for the prediction-side pointers
+; [esp+24] eax: first plane, block being evaluated
+; [esp+28] ebx: stride for the evaluated blocks
+; [esp+32] : pointer receiving the chosen mode (0 = DC, 1 = H, 2 = V)
+; [esp+36] : value that is doubled and added to the H and V costs
+; [esp+40] esi: scratch buffer (temp_satd), 16-byte aligned, 64 bytes used
+; [esp+44] : second plane, pointer whose neighbours supply the prediction
+; [esp+48] : second plane, block being evaluated
+; Returns: eax = cost of the chosen mode.
+; Uses mm0-mm3, mm5, mm6 and ends with WELSEMMS (defined outside this file).
+WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
+WelsIntraChroma8x8Combined3Satd_sse41:
+ push ebx
+ push esi
+ push edi
+ mov ecx, [esp+16]
+ mov edx, [esp+20]
+ mov eax, [esp+24]
+ mov ebx, [esp+28]
+ mov esi, [esp+40] ;temp_satd
+ xor edi, edi ; plane counter: 0 = first plane, 1 = second plane
+loop_chroma_satdx3:
+ SSE41_ChromaGetX38x8Satd
+ cmp edi, 1
+ je loop_chroma_satdx3end
+ inc edi
+ ; park the first plane's V/H/DC accumulators in MMX registers, because the
+ ; macro zeroes xmm4-xmm6 again on the next pass
+ SSEReg2MMX xmm4, mm0,mm1
+ SSEReg2MMX xmm5, mm2,mm3
+ SSEReg2MMX xmm6, mm5,mm6
+ mov ecx, [esp+44]
+ mov eax, [esp+48]
+ jmp loop_chroma_satdx3
+loop_chroma_satdx3end:
+ ; bring the first plane's accumulators back and add the second plane's
+ MMXReg2SSE xmm0, xmm3, mm0, mm1
+ MMXReg2SSE xmm1, xmm3, mm2, mm3
+ MMXReg2SSE xmm2, xmm3, mm5, mm6
+
+ paddw xmm4, xmm0
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+
+ ; halve the per-word costs and reduce each accumulator to one dword
+ MMX_DW_1_2REG xmm0, xmm1
+ psrlw xmm4, 1 ;/2
+ psrlw xmm5, 1 ;/2
+ psrlw xmm6, 1 ;/2
+ SSE41_HSum8W xmm4, xmm0, xmm1
+ SSE41_HSum8W xmm5, xmm0, xmm1
+ SSE41_HSum8W xmm6, xmm0, xmm1
+ ; comparing order: DC H V
+ movd ebx, xmm6 ;DC
+ movd edi, xmm5 ;H
+ movd ecx, xmm4 ;V
+ ; add twice the [esp+36] argument to the H and V costs (DC gets no penalty)
+ mov edx, [esp+36]
+ shl edx, 1
+ add edi, edx
+ add ecx, edx
+ mov edx, [esp+32]
+ ; DC wins only if strictly cheaper than both H and V
+ cmp ebx, edi
+ jge near not_dc_8x8
+ cmp ebx, ecx
+ jge near not_dc_h_8x8
+
+ ; for DC mode
+ mov dword[edx], 0;I8_PRED_DC
+ mov eax, ebx
+ jmp near return_satd_intra_8x8_x3
+not_dc_8x8:
+ ; for H mode
+ ; H wins only if strictly cheaper than V
+ cmp edi, ecx
+ jge near not_dc_h_8x8
+ mov dword[edx], 1;I8_PRED_H
+ mov eax, edi
+ jmp near return_satd_intra_8x8_x3
+not_dc_h_8x8:
+ ; for V mode
+ mov dword[edx], 2;I8_PRED_V
+ mov eax, ecx
+return_satd_intra_8x8_x3:
+ WELSEMMS
+ pop edi
+ pop esi
+ pop ebx
+ret
+
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 END
+;
+;***********************************************************************
+; SSSE3_Get16BSadHVDC tmpRow, srcRow
+; Handles one 16-pixel row of the 16x16 SAD search.
+; In: tmpRow = 16-byte aligned memory slot whose first byte holds the row's
+; left-neighbour pixel, srcRow = 16-byte aligned row being evaluated,
+; xmm1 = 0 (pshufb mask that broadcasts byte 0),
+; xmm5 = top-neighbour row (V predictor), xmm7 = DC value in every byte.
+; Out: tmpRow is overwritten with the left pixel replicated 16 times (the H
+; predictor row); psadbw results are added to xmm4 (DC), xmm2 (V) and
+; xmm3 (H). Clobbers xmm0 and xmm6.
+%macro SSSE3_Get16BSadHVDC 2
+ movd xmm6,%1
+ pshufb xmm6,xmm1
+ movdqa %1, xmm6
+ movdqa xmm0,%2
+ psadbw xmm0,xmm7
+ paddw xmm4,xmm0
+ movdqa xmm0,%2
+ psadbw xmm0,xmm5
+ paddw xmm2,xmm0
+ psadbw xmm6,%2
+ paddw xmm3,xmm6
+%endmacro
+; WelsAddDCValue srcByte, tmpReg, dstDword, sumReg
+; Reads one unsigned byte from srcByte into tmpReg (zero-extended), adds it
+; to the running total in sumReg and stores the widened value at dstDword.
+%macro WelsAddDCValue 4
+ movzx %2, byte %1 ; tmpReg = pixel, zero-extended
+ add %4, %2 ; running sum += pixel
+ mov %3, %2 ; keep the pixel for later use
+%endmacro
+
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 BEGIN
+;
+;***********************************************************************
+; WelsIntra16x16Combined3Sad_ssse3 (x86-32 only, arguments on the stack)
+; Evaluates the V, H and DC 16x16 intra predictions with SAD and returns the
+; cheapest one.
+; Stack arguments after the three pushes below:
+; [esp+16] ecx: pointer whose row above (ecx-edx, read with movdqa) and
+; column to the left (ecx-1) supply the prediction pixels
+; [esp+20] edx: stride for that pointer
+; [esp+24] eax: block being evaluated (rows read with movdqa)
+; [esp+28] ebx: stride for that block
+; [esp+32] : pointer receiving the chosen mode (0 = V, 1 = H, 2 = DC)
+; [esp+36] : lambda; doubled and added to the H and DC costs
+; [esp+40] edi: temp_sad, a 16-byte aligned 256-byte buffer that ends up
+; holding the 16x16 prediction of the chosen mode
+; Returns: eax = cost of the chosen mode.
+WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
+WelsIntra16x16Combined3Sad_ssse3:
+ push ebx
+ push esi
+ push edi
+ mov ecx, [esp+16]
+ mov edx, [esp+20]
+ mov edi, [esp+40] ;temp_sad
+ ; sum of the 16 top-neighbour pixels -> eax; xmm5 keeps the row itself
+ sub ecx, edx
+ movdqa xmm5,[ecx]
+ pxor xmm0,xmm0
+ psadbw xmm0,xmm5
+ movhlps xmm1,xmm0
+ paddw xmm0,xmm1
+ movd eax,xmm0
+
+ ; add the 16 left-neighbour pixels to eax and stash each one at the start
+ ; of its 16-byte row slot in temp_sad
+ add ecx,edx
+ lea ebx, [edx+2*edx]
+ WelsAddDCValue [ecx-1 ], esi, [edi ], eax
+ WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
+ WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+ WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
+ lea ecx, [ecx+4*edx]
+ add edi, 64
+ WelsAddDCValue [ecx-1 ], esi, [edi ], eax
+ WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
+ WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+ WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
+ lea ecx, [ecx+4*edx]
+ add edi, 64
+ WelsAddDCValue [ecx-1 ], esi, [edi ], eax
+ WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
+ WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+ WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
+ lea ecx, [ecx+4*edx]
+ add edi, 64
+ WelsAddDCValue [ecx-1 ], esi, [edi ], eax
+ WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
+ WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+ WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
+ sub edi, 192 ; back to the start of temp_sad
+ ; DC = (sum of 32 neighbours + 16) >> 5, broadcast to every byte of xmm7
+ add eax,10h
+ shr eax,5
+ movd xmm7,eax
+ pxor xmm1,xmm1
+ pshufb xmm7,xmm1
+ pxor xmm4,xmm4 ; DC cost accumulator
+ pxor xmm3,xmm3 ; H cost accumulator
+ pxor xmm2,xmm2 ; V cost accumulator
+;sad begin
+ mov eax, [esp+24]
+ mov ebx, [esp+28]
+ lea esi, [ebx+2*ebx]
+ SSSE3_Get16BSadHVDC [edi], [eax]
+ SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+ SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+ SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+ add edi, 64
+ lea eax, [eax+4*ebx]
+ SSSE3_Get16BSadHVDC [edi], [eax]
+ SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+ SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+ SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+ add edi, 64
+ lea eax, [eax+4*ebx]
+ SSSE3_Get16BSadHVDC [edi], [eax]
+ SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+ SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+ SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+ add edi, 64
+ lea eax, [eax+4*ebx]
+ SSSE3_Get16BSadHVDC [edi], [eax]
+ SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+ SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+ SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+
+ ; merge H (shifted up by 4 bytes) and V into one register, then fold the
+ ; upper qword onto the lower: dword 0 = V total, dword 1 = H total
+ pslldq xmm3,4
+ por xmm3,xmm2
+ movhlps xmm1,xmm3
+ paddw xmm3,xmm1
+ movhlps xmm0,xmm4
+ paddw xmm4,xmm0
+; comparing order: DC H V
+ movd ebx, xmm4 ;DC
+ movd ecx, xmm3 ;V
+ psrldq xmm3, 4
+ movd esi, xmm3 ;H
+ mov eax, [esp+36] ;lamda
+ shl eax, 1
+ add esi, eax
+ add ebx, eax
+ mov edx, [esp+32]
+ ; DC wins only if strictly cheaper than both H and V
+ cmp ebx, esi
+ jge near not_dc_16x16_sad
+ cmp ebx, ecx
+ jge near not_dc_h_16x16_sad
+ ; for DC mode
+ mov dword[edx], 2;I16_PRED_DC
+ mov eax, ebx
+ ; fill temp_sad with the DC prediction (16 rows of xmm7)
+ sub edi, 192
+%assign x 0
+%rep 16
+ movdqa [edi+16*x], xmm7
+%assign x x+1
+%endrep
+ jmp near return_sad_intra_16x16_x3
+not_dc_16x16_sad:
+ ; for H mode
+ ; temp_sad already holds the H prediction rows written by the SAD macro
+ cmp esi, ecx
+ jge near not_dc_h_16x16_sad
+ mov dword[edx], 1;I16_PRED_H
+ mov eax, esi
+ jmp near return_sad_intra_16x16_x3
+not_dc_h_16x16_sad:
+ ; for V mode
+ mov dword[edx], 0;I16_PRED_V
+ mov eax, ecx
+ ; fill temp_sad with the V prediction (16 copies of the top row in xmm5)
+ sub edi, 192
+%assign x 0
+%rep 16
+ movdqa [edi+16*x], xmm5
+%assign x x+1
+%endrep
+return_sad_intra_16x16_x3:
+ pop edi
+ pop esi
+ pop ebx
+ ret
+%endif
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 END
+;
+;***********************************************************************
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 BEGIN
+;
+;***********************************************************************
+
+;SSE4.1
+; SSE41_GetSatd8x4
+; Adds the cost of one 8-wide, 4-row strip to the word accumulator xmm6.
+; In: r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride,
+; r4 = 3*r1, r5 = 3*r3 (row 3 offsets), xmm7 = HSumSubDB1.
+; r0 and r2 are not modified; the caller advances them by 4 rows.
+; Clobbers xmm0-xmm5.
+; Each row is duplicated into both halves of a register so that pmaddubsw
+; yields pair sums and pair differences in one step.
+; NOTE(review): the pblendw/pmaxuw tail appears to fold the last butterfly
+; stage using max(|a+b|,|a-b|) = |a|+|b| - confirm.
+%macro SSE41_GetSatd8x4 0
+ movq xmm0, [r0]
+ punpcklqdq xmm0, xmm0
+ pmaddubsw xmm0, xmm7
+ movq xmm1, [r0+r1]
+ punpcklqdq xmm1, xmm1
+ pmaddubsw xmm1, xmm7
+ movq xmm2, [r2]
+ punpcklqdq xmm2, xmm2
+ pmaddubsw xmm2, xmm7
+ movq xmm3, [r2+r3]
+ punpcklqdq xmm3, xmm3
+ pmaddubsw xmm3, xmm7
+ psubsw xmm0, xmm2
+ psubsw xmm1, xmm3
+ movq xmm2, [r0+2*r1]
+ punpcklqdq xmm2, xmm2
+ pmaddubsw xmm2, xmm7
+ movq xmm3, [r0+r4]
+ punpcklqdq xmm3, xmm3
+ pmaddubsw xmm3, xmm7
+ movq xmm4, [r2+2*r3]
+ punpcklqdq xmm4, xmm4
+ pmaddubsw xmm4, xmm7
+ movq xmm5, [r2+r5]
+ punpcklqdq xmm5, xmm5
+ pmaddubsw xmm5, xmm7
+ psubsw xmm2, xmm4
+ psubsw xmm3, xmm5
+ SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
+ pabsw xmm0, xmm0
+ pabsw xmm2, xmm2
+ pabsw xmm1, xmm1
+ pabsw xmm3, xmm3
+ movdqa xmm4, xmm3
+ pblendw xmm3, xmm1, 0xAA
+ pslld xmm1, 16
+ psrld xmm4, 16
+ por xmm1, xmm4
+ pmaxuw xmm1, xmm3
+ paddw xmm6, xmm1
+ movdqa xmm4, xmm0
+ pblendw xmm0, xmm2, 0xAA
+ pslld xmm2, 16
+ psrld xmm4, 16
+ por xmm2, xmm4
+ pmaxuw xmm0, xmm2
+ paddw xmm6, xmm0
+%endmacro
+
+; SSSE3_SumWHorizon dstReg32, src, tmp1, tmp2
+; Horizontal sum of the eight words of src, written to the 32-bit general
+; register dstReg32. tmp1 is loaded with word 1s and used as the pmaddwd
+; multiplier (so the words are summed as signed values); tmp2 is scratch.
+; src is clobbered.
+%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
+ MMX_DW_1_2REG %3, %4
+ pmaddwd %2, %3
+ movhlps %4, %2
+ paddd %2, %4
+ pshuflw %4, %2,0Eh
+ paddd %2, %4
+ movd %1, %2
+%endmacro
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+; As used below: r0/r1 = first block pointer/stride, r2/r3 = second block
+; pointer/stride (loaded by LOAD_4_PARA, defined outside this file).
+; Result (retrd): horizontal sum produced by SSSE3_SumWHorizon.
+; Clobbers xmm0-xmm5 and xmm7.
+; NOTE(review): xmm7 is callee-saved under the Microsoft x64 ABI and is not
+; saved here - confirm how Win64 builds handle this.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse41
+align 16
+WelsSampleSatd4x4_sse41:
+ ;push ebx
+ ;mov eax,[esp+8]
+ ;mov ebx,[esp+12]
+ ;mov ecx,[esp+16]
+ ;mov edx,[esp+20]
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movdqa xmm4,[HSwapSumSubDB1]
+ ; second block: rows 0/1 replicated into xmm2, rows 2/3 into xmm3
+ movd xmm2,[r2]
+ movd xmm5,[r2+r3]
+ shufps xmm2,xmm5,0
+ movd xmm3,[r2+r3*2]
+ lea r2, [r3*2+r2]
+ movd xmm5,[r2+r3]
+ shufps xmm3,xmm5,0
+ ; first block: rows 0/1 replicated into xmm0, rows 2/3 into xmm1
+ movd xmm0,[r0]
+ movd xmm5,[r0+r1]
+ shufps xmm0,xmm5,0
+ movd xmm1,[r0+r1*2]
+ lea r0, [r1*2+r0]
+ movd xmm5,[r0+r1]
+ shufps xmm1,xmm5,0
+ ; byte pairs -> word sums/differences, selected by the +1/-1 table
+ pmaddubsw xmm0,xmm4
+ pmaddubsw xmm1,xmm4
+ pmaddubsw xmm2,xmm4
+ pmaddubsw xmm3,xmm4
+ ; block difference, followed by the butterfly stages
+ psubw xmm0,xmm2
+ psubw xmm1,xmm3
+ movdqa xmm2,xmm0
+ paddw xmm0,xmm1
+ psubw xmm1,xmm2
+ movdqa xmm2,xmm0
+ punpcklqdq xmm0,xmm1
+ punpckhqdq xmm2,xmm1
+ movdqa xmm1,xmm0
+ paddw xmm0,xmm2
+ psubw xmm2,xmm1
+ movdqa xmm1,xmm0
+ pblendw xmm0,xmm2,0AAh
+ pslld xmm2,16
+ psrld xmm1,16
+ por xmm2,xmm1
+ pabsw xmm0,xmm0
+ pabsw xmm2,xmm2
+ pmaxsw xmm0,xmm2
+ SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+; As used below: r0/r1 = first block pointer/stride, r2/r3 = second block
+; pointer/stride. Runs SSE41_GetSatd8x4 on rows 0-3 and rows 4-7 and
+; returns the horizontal sum of the accumulator in retrd.
+; NOTE(review): clobbers xmm6/xmm7, which are callee-saved under the
+; Microsoft x64 ABI - confirm how Win64 builds handle this.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x8_sse41
+align 16
+WelsSampleSatd8x8_sse41:
+ ;push ebx
+ ;push esi
+ ;push edi
+ ;mov eax, [esp+16]
+ ;mov ebx, [esp+20]
+ ;mov ecx, [esp+24]
+ ;mov edx, [esp+28]
+%ifdef X86_32
+ push r4
+ push r5
+%endif
+ ; push_num matches the two pushes above (consumed by LOAD_4_PARA)
+ %assign push_num 2
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2] ; 3 * first stride
+ lea r5, [r3+r3*2] ; 3 * second stride
+ pxor xmm6, xmm6 ; accumulator
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+; r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride.
+; Walks the 16 rows as four 8x4 strips, accumulating in xmm6, and returns
+; the horizontal sum of the accumulator in retrd.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x16_sse41
+align 16
+WelsSampleSatd8x16_sse41:
+%ifdef X86_32
+ push r4
+ push r5
+ push r6
+%endif
+ ; push_num matches the three pushes above (consumed by LOAD_4_PARA)
+ %assign push_num 3
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm6, xmm6 ; accumulator
+ movdqa xmm7, [HSumSubDB1]
+ lea r5, [r3+r3*2] ; 3 * second stride
+ lea r4, [r1+r1*2] ; 3 * first stride
+ mov r6, 4 ; strips remaining
+.next_strip:
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ dec r6
+ jnz .next_strip
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+; As used below: r0/r1 = first block pointer/stride, r2/r3 = second block
+; pointer/stride. Runs SSE41_GetSatd8x4 on two strips of the left 8 columns
+; and two strips of the right 8 columns, and returns the horizontal sum of
+; the accumulator in retrd.
+; NOTE(review): clobbers xmm6/xmm7, which are callee-saved under the
+; Microsoft x64 ABI - confirm how Win64 builds handle this.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse41
+align 16
+WelsSampleSatd16x8_sse41:
+ ;push ebx
+ ;push esi
+ ;push edi
+ ;mov eax, [esp+16]
+ ;mov ebx, [esp+20]
+ ;mov ecx, [esp+24]
+ ;mov edx, [esp+28]
+%ifdef X86_32
+ push r4
+ push r5
+%endif
+ ; push_num matches the two pushes above (consumed by LOAD_4_PARA)
+ %assign push_num 2
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ ; keep the original block pointers for the right-hand half
+ push r0
+ push r2
+
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2] ; 3 * first stride
+ lea r5, [r3+r3*2] ; 3 * second stride
+ pxor xmm6, xmm6 ; accumulator
+ ; left 8 columns
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
+
+ pop r2
+ pop r0
+ ;mov eax, [esp+16]
+ ;mov ecx, [esp+24]
+ ; right 8 columns
+ add r0, 8
+ add r2, 8
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+; r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride.
+; Walks the left 8 columns as four 8x4 strips, then the right 8 columns the
+; same way, accumulating in xmm6; returns the horizontal sum in retrd.
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSatd16x16_sse41
+align 16
+WelsSampleSatd16x16_sse41:
+%ifdef X86_32
+ push r4
+ push r5
+ push r6
+%endif
+ ; push_num matches the three pushes above (consumed by LOAD_4_PARA)
+ %assign push_num 3
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+
+ ; remember the block origins for the right-hand half
+ push r0
+ push r2
+
+ pxor xmm6, xmm6 ; accumulator
+ movdqa xmm7, [HSumSubDB1]
+ lea r5, [r3+r3*2] ; 3 * second stride
+ lea r4, [r1+r1*2] ; 3 * first stride
+ mov r6, 4 ; strips remaining in the left half
+.left_strip:
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ dec r6
+ jnz .left_strip
+
+ pop r2
+ pop r0
+ add r2, 8
+ add r0, 8
+ mov r6, 4 ; strips remaining in the right half
+.right_strip:
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ dec r6
+ jnz .right_strip
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+; SSE2_GetSad2x16
+; Advances r0 and r2 by two rows first, then adds the psadbw results of the
+; next two 16-pixel rows to xmm0.
+; In: r0/r1 = first block pointer/stride (read through MOVDQ, a macro defined
+; outside this file; the original note says this side must be 16-byte
+; aligned), r2/r3 = second block pointer/stride (read with movdqu).
+; Clobbers xmm1 and xmm2.
+%macro SSE2_GetSad2x16 0
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqu xmm1, [r2]
+ MOVDQ xmm2, [r0];[eax] must aligned 16
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
+ movdqu xmm1, [r2+r3]
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
+%endmacro
+
+
+; SSE2_GetSad4x16
+; Adds the psadbw results of four 16-pixel rows to xmm7.
+; In: r0/r1 = first block pointer/stride (read through MOVDQ, a macro defined
+; outside this file), r2/r3 = second block pointer/stride (read with
+; movdqu), r4 = 3*r1, r5 = 3*r3 (row 3 offsets).
+; r0 and r2 are not modified; the caller advances them by 4 rows.
+; Clobbers xmm0, xmm1 and xmm2.
+%macro SSE2_GetSad4x16 0
+ movdqu xmm0, [r2]
+ MOVDQ xmm2, [r0]
+ psadbw xmm0, xmm2
+ paddw xmm7, xmm0
+ movdqu xmm1, [r2+r3]
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
+ movdqu xmm1, [r2+2*r3]
+ MOVDQ xmm2, [r0+2*r1];[eax] must aligned 16
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
+ movdqu xmm1, [r2+r5]
+ MOVDQ xmm2, [r0+r4]
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
+%endmacro
+
+
+; SSE2_GetSad8x4
+; Adds the SAD of four 8-pixel rows to xmm6.
+; In: r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride.
+; Out: r0 and r2 advanced by two rows (the caller adds the other two).
+; Rows 0 and 2 share one register, rows 1 and 3 the other, so two psadbw
+; instructions cover all four rows. Clobbers xmm0-xmm3.
+%macro SSE2_GetSad8x4 0
+ ; second block: rows 0|2 -> xmm2, rows 1|3 -> xmm3
+ movq xmm2, [r2]
+ movq xmm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movhps xmm2, [r2]
+ movhps xmm3, [r2+r3]
+
+ ; first block: rows 0|2 -> xmm0, rows 1|3 -> xmm1
+ movq xmm0, [r0]
+ movq xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movhps xmm0, [r0]
+ movhps xmm1, [r0+r1]
+
+ psadbw xmm0, xmm2
+ paddw xmm6, xmm0
+ psadbw xmm1, xmm3
+ paddw xmm6, xmm1
+%endmacro
+
+;***********************************************************************
+;
+;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
+;First parameter can align to 16 bytes,
+;In wels, the third parameter can't align to 16 bytes.
+;
+; As used below: r0/r1 = first block pointer/stride, r2/r3 = second block
+; pointer/stride. Four SSE2_GetSad4x16 calls cover the 16 rows; the two
+; qword partial sums in xmm7 are then added and returned in retrd.
+; NOTE(review): clobbers xmm7, which is callee-saved under the Microsoft
+; x64 ABI - confirm how Win64 builds handle this.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x16_sse2
+align 16
+WelsSampleSad16x16_sse2:
+ ;push ebx
+ ;push edi
+ ;push esi
+ ;%define _STACK_SIZE 12
+ ;mov eax, [esp+_STACK_SIZE+4 ]
+ ;mov ebx, [esp+_STACK_SIZE+8 ]
+ ;mov ecx, [esp+_STACK_SIZE+12]
+ ;mov edx, [esp+_STACK_SIZE+16]
+%ifdef X86_32
+ push r4
+ push r5
+%endif
+
+ ; push_num matches the two pushes above (consumed by LOAD_4_PARA)
+ %assign push_num 2
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ lea r4, [3*r1] ; row 3 offset, first block
+ lea r5, [3*r3] ; row 3 offset, second block
+
+ pxor xmm7, xmm7 ; accumulator
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ ; psadbw leaves one partial sum per qword; add the two halves
+ movhlps xmm0, xmm7
+ paddw xmm0, xmm7
+ movd retrd, xmm0
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
+;The first parameter can be aligned to 16 bytes;
+;in Wels, the third parameter cannot be assumed to be 16-byte aligned.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x8_sse2
+align 16
+WelsSampleSad16x8_sse2:
+ ; Sum of absolute differences over 8 rows of 16 bytes.
+ ; First block: pointer r0, row stride r1; second block: pointer r2,
+ ; row stride r3 (eax, ebx, ecx, edx in the legacy 32-bit code).
+ ; Rows 0-1 are handled inline, rows 2-7 by three SSE2_GetSad2x16
+ ; expansions. Uses xmm0-xmm2; the total is written to retrd.
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movdqu xmm0, [r2]  ; row 0; this psadbw result also initialises the accumulator xmm0
+ MOVDQ xmm2, [r0]
+ psadbw xmm0, xmm2
+ movdqu xmm1, [r2+r3]  ; row 1
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
+
+ SSE2_GetSad2x16
+ SSE2_GetSad2x16
+ SSE2_GetSad2x16
+
+ movhlps xmm1, xmm0  ; low half of xmm1 = high half of xmm0
+ paddw xmm0, xmm1  ; add the two 64-bit half sums
+ movd retrd, xmm0
+ LOAD_4_PARA_POP
+ ret
+
+
+
+WELS_EXTERN WelsSampleSad8x16_sse2
+WelsSampleSad8x16_sse2:
+ ; Sum of absolute differences over 16 rows of 8 bytes.
+ ; First block: pointer r0, row stride r1; second block: pointer r2,
+ ; row stride r3 (eax, ebx, ecx, edx in the legacy 32-bit code).
+ ; Uses xmm0-xmm3 and xmm6; the total is written to retrd.
+ ; NOTE(review): xmm6 is callee-saved in the Microsoft x64 ABI and is not saved here - confirm.
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm6, xmm6  ; clear the accumulator
+
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1]  ; the macro already advanced two rows; add two more to reach the next group of four
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+
+ movhlps xmm0, xmm6  ; low half of xmm0 = high half of xmm6
+ paddw xmm0, xmm6  ; add the two 64-bit half sums
+ movd retrd, xmm0
+ LOAD_4_PARA_POP
+ ret
+
+
+%macro CACHE_SPLIT_CHECK 3 ; args: address (register, destroyed), access width in bytes, cacheline size; sets flags only
+and %1, 0x1f|(%3>>1)  ; keep the offset inside the cacheline (mask is 0x3f for a 64-byte line)
+cmp %1, (32-%2)|(%3>>1)  ; for width 8 and a 64-byte line this is 56, the last offset where the access still fits
+%endmacro
+
+WELS_EXTERN WelsSampleSad8x8_sse21
+WelsSampleSad8x8_sse21:
+ ; 8x8 sum of absolute differences with two code paths.
+ ; CACHE_SPLIT_CHECK is applied to a copy of the third parameter with
+ ; width 8 and cacheline 64. When the jle below is taken, control goes
+ ; to .pixel_sad_8x8_nsplit, which loads each row with one movq/movhps.
+ ; Otherwise every row of the third-parameter block is rebuilt from
+ ; two 8-byte-aligned loads that are shifted and OR-ed together.
+ ; Uses xmm0-xmm3 and xmm5-xmm7; the total is written to retrd.
+ ; NOTE(review): xmm6/xmm7 are callee-saved in the Microsoft x64 ABI and are not saved here - confirm.
+
+ %assign push_num 0
+ mov r2, arg3
+ push r2  ; keep the pointer: the check below destroys r2
+ CACHE_SPLIT_CHECK r2, 8, 64
+ jle near .pixel_sad_8x8_nsplit
+ pop r2  ; restore the pointer for the split path
+%ifdef X86_32
+ push r3
+ push r4
+ push r5
+%endif
+ %assign push_num 3
+ mov r0, arg1
+ mov r1, arg2
+ SIGN_EXTENTION r1, r1d
+ pxor xmm7, xmm7  ; clear the accumulator
+
+ ; legacy 32-bit register names: ecx is r2, edx is r4, edi is r5
+
+ mov r5, r2
+ and r5, 0x07  ; r5 = byte offset of r2 within its 8-byte group
+ sub r2, r5  ; round r2 down to an 8-byte boundary
+ mov r4, 8
+ sub r4, r5  ; r4 = 8 - offset
+
+ shl r5, 3  ; offset in bits
+ shl r4, 3  ; (8 - offset) in bits
+ movd xmm5, r5d  ; psrlq count for the quadwords loaded from r2
+ movd xmm6, r4d  ; psllq count for the quadwords loaded from r5
+ mov r5, 8
+ add r5, r2  ; r5 = r2 + 8, the next aligned 8 bytes
+ mov r3, arg4
+ SIGN_EXTENTION r3, r3d
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]  ; xmm0 = rows 0 and 1 of the first block
+
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5  ; drop the bytes in front of the wanted start
+ psllq xmm2, xmm6  ; move the continuation bytes into the upper positions
+ por xmm1, xmm2  ; xmm1 = rows 0 and 1 of the second block at the original address
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
+
+ movhlps xmm0, xmm7  ; low half of xmm0 = high half of xmm7
+ paddw xmm0, xmm7  ; add the two 64-bit half sums
+ movd retrd, xmm0
+%ifdef X86_32
+ pop r5
+ pop r4
+ pop r3
+%endif
+ ret  ; stack is balanced here, so return directly instead of jumping to .return
+
+.pixel_sad_8x8_nsplit:
+ ; Plain path: drop the saved copy of the third parameter, run
+ ; LOAD_4_PARA (defined outside this chunk) and accumulate two
+ ; SSE2_GetSad8x4 expansions (four rows each) in xmm6.
+ ; The total is written to retrd.
+
+ pop r2
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm6, xmm6  ; clear the accumulator
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1]  ; the macro advanced two rows; add two more for rows 4-7
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+ movhlps xmm0, xmm6
+ paddw xmm0, xmm6
+ movd retrd, xmm0
+ LOAD_4_PARA_POP
+.return:
+ ret
+
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 END
+;
+;***********************************************************************
+
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+
+%macro SSE2_Get4LW16Sad 5 ; args: s-1l, s, s+1l (three consecutive rows held in registers), d (row of the other block, destroyed), address d was loaded from
+ psadbw %1, %4  ; row s-1l against d, which lies one row below it
+ paddw xmm5, %1  ; -> "+stride" accumulator (the first argument is destroyed)
+ psadbw %4, %3  ; d against row s+1l, i.e. d lies one row above it
+ paddw xmm4, %4  ; -> "-stride" accumulator
+ movdqu %4, [%5-1]  ; d shifted one byte to the left
+ psadbw %4, %2  ; against row s
+ paddw xmm6, %4  ; -> "-1" accumulator
+ movdqu %4, [%5+1]  ; d shifted one byte to the right
+ psadbw %4, %2
+ paddw xmm7, %4  ; -> "+1" accumulator
+%endmacro
+WELS_EXTERN WelsSampleSadFour16x16_sse2
+WelsSampleSadFour16x16_sse2:
+ ; Computes four 16x16 SADs of the block at r0 (row stride r1) against
+ ; the block at r2 (row stride r3) displaced by one row up, one row
+ ; down, one byte left and one byte right, and stores them at [r4].
+ ; Rows at r0 are read with movdqa, so they must be 16-byte aligned.
+ ; NOTE(review): xmm6/xmm7 are callee-saved in the Microsoft x64 ABI and are not saved here - confirm.
+
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm4, xmm4 ; SAD accumulator for pRefMb - i_stride_ref (one row up)
+ pxor xmm5, xmm5 ; SAD accumulator for pRefMb + i_stride_ref (one row down)
+ pxor xmm6, xmm6 ; SAD accumulator for pRefMb - 1 (one byte left)
+ pxor xmm7, xmm7 ; SAD accumulator for pRefMb + 1 (one byte right)
+ movdqa xmm0, [r0]  ; row 0 of the r0 block
+ sub r2, r3  ; r2 now points one row above the r2 block
+ movdqu xmm3, [r2]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movdqa xmm1, [r0+r1]  ; row 1 of the r0 block
+ movdqu xmm3, [r2+r3]
+ psadbw xmm3, xmm1
+ paddw xmm4, xmm3
+
+ movdqu xmm2, [r2+r3-1]  ; row 0 of the r2 block, one byte to the left
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
+
+ movdqu xmm3, [r2+r3+1]  ; row 0 of the r2 block, one byte to the right
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r2, [r2+2*r3]  ; tail: the last two rows still lack some of their four terms
+ movdqu xmm3, [r2]
+ psadbw xmm2, xmm3  ; second-to-last row (xmm2) against the row below it
+ paddw xmm5, xmm2
+
+ movdqu xmm2, [r2-1]
+ psadbw xmm2, xmm0  ; last row (xmm0), one byte left
+ paddw xmm6, xmm2
+
+ movdqu xmm3, [r2+1]
+ psadbw xmm3, xmm0  ; last row, one byte right
+ paddw xmm7, xmm3
+
+ movdqu xmm3, [r2+r3]
+ psadbw xmm0, xmm3  ; last row against the row below it
+ paddw xmm5, xmm0
+
+ ; Fold the two 64-bit half sums of each accumulator, then pack and store (legacy code loaded the fifth parameter into ecx here).
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5  ; low dwords: -stride sum, +stride sum
+ punpckldq xmm6, xmm7  ; low dwords: -1 sum, +1 sum
+ punpcklqdq xmm4, xmm6  ; xmm4 = -stride, +stride, -1, +1
+ movdqa [r4],xmm4  ; aligned store: [r4] must be 16-byte aligned
+ LOAD_5_PARA_POP
+ ret
+
+
+WELS_EXTERN WelsSampleSadFour16x8_sse2
+WelsSampleSadFour16x8_sse2:
+ ; Computes four SADs over 8 rows of 16 bytes: the block at r0 (row
+ ; stride r1) against the block at r2 (row stride r3) displaced by
+ ; one row up, one row down, one byte left and one byte right.
+ ; The four sums are stored at [r4]. Rows at r0 are read with movdqa,
+ ; so they must be 16-byte aligned.
+ ; NOTE(review): xmm6/xmm7 are callee-saved in the Microsoft x64 ABI and are not saved here - confirm.
+
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm4, xmm4 ; SAD accumulator for pRefMb - i_stride_ref (one row up)
+ pxor xmm5, xmm5 ; SAD accumulator for pRefMb + i_stride_ref (one row down)
+ pxor xmm6, xmm6 ; SAD accumulator for pRefMb - 1 (one byte left)
+ pxor xmm7, xmm7 ; SAD accumulator for pRefMb + 1 (one byte right)
+ movdqa xmm0, [r0]  ; row 0 of the r0 block
+ sub r2, r3  ; r2 now points one row above the r2 block
+ movdqu xmm3, [r2]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movdqa xmm1, [r0+r1]  ; row 1 of the r0 block
+ movdqu xmm3, [r2+r3]
+ psadbw xmm3, xmm1
+ paddw xmm4, xmm3
+
+ movdqu xmm2, [r2+r3-1]  ; row 0 of the r2 block, one byte to the left
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
+
+ movdqu xmm3, [r2+r3+1]  ; row 0 of the r2 block, one byte to the right
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r2, [r2+2*r3]  ; tail: the last two rows still lack some of their four terms
+ movdqu xmm3, [r2]
+ psadbw xmm0, xmm3  ; second-to-last row (xmm0) against the row below it
+ paddw xmm5, xmm0
+
+ movdqu xmm0, [r2-1]
+ psadbw xmm0, xmm1  ; last row (xmm1), one byte left
+ paddw xmm6, xmm0
+
+ movdqu xmm3, [r2+1]
+ psadbw xmm3, xmm1  ; last row, one byte right
+ paddw xmm7, xmm3
+
+ movdqu xmm3, [r2+r3]
+ psadbw xmm1, xmm3  ; last row against the row below it
+ paddw xmm5, xmm1
+
+ ; Fold the two 64-bit half sums of each accumulator, then pack and store (legacy code loaded the fifth parameter into edi here).
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5  ; low dwords: -stride sum, +stride sum
+ punpckldq xmm6, xmm7  ; low dwords: -1 sum, +1 sum
+ punpcklqdq xmm4, xmm6  ; xmm4 = -stride, +stride, -1, +1
+ movdqa [r4],xmm4  ; aligned store: [r4] must be 16-byte aligned
+ LOAD_5_PARA_POP
+ ret
+
+WELS_EXTERN WelsSampleSadFour8x16_sse2
+WelsSampleSadFour8x16_sse2:
+ ; Computes four SADs over 16 rows of 8 bytes: the block at r0 (row
+ ; stride r1) against the block at r2 (row stride r3) displaced by
+ ; one row up, one row down, one byte left and one byte right.
+ ; Two rows are packed per xmm register (movq low half, movhps high
+ ; half). The four sums are stored at [r4].
+ ; NOTE(review): xmm6/xmm7 are callee-saved in the Microsoft x64 ABI and are not saved here - confirm.
+
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm4, xmm4 ; SAD accumulator for pRefMb - i_stride_ref (one row up)
+ pxor xmm5, xmm5 ; SAD accumulator for pRefMb + i_stride_ref (one row down)
+ pxor xmm6, xmm6 ; SAD accumulator for pRefMb - 1 (one byte left)
+ pxor xmm7, xmm7 ; SAD accumulator for pRefMb + 1 (one byte right)
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]  ; xmm0 = rows 0 and 1 of the r0 block
+ sub r2, r3  ; r2 now points one row above the r2 block
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]  ; xmm3 = the rows one above rows 0 and 1
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]  ; row 0, one byte left
+ movq xmm3, [r2+r3+1]  ; row 0, one byte right
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]  ; row 1, one byte left
+ movhps xmm3, [r2+1]  ; row 1, one byte right
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]  ; xmm3 = the rows one below rows 0 and 1
+ psadbw xmm0, xmm3  ; destroys xmm0 and keeps xmm3 for the next row pair
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]  ; next row pair of the r0 block
+ psadbw xmm3, xmm0  ; xmm3 still holds the rows one above this pair
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ ; Fold the two 64-bit half sums of each accumulator, then pack and store (legacy code loaded the fifth parameter into edi here).
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5  ; low dwords: -stride sum, +stride sum
+ punpckldq xmm6, xmm7  ; low dwords: -1 sum, +1 sum
+ punpcklqdq xmm4, xmm6  ; xmm4 = -stride, +stride, -1, +1
+ movdqa [r4],xmm4  ; aligned store: [r4] must be 16-byte aligned
+ LOAD_5_PARA_POP
+ ret
+
+
+WELS_EXTERN WelsSampleSadFour8x8_sse2
+WelsSampleSadFour8x8_sse2:
+ ; Computes four SADs over 8 rows of 8 bytes: the block at r0 (row
+ ; stride r1) against the block at r2 (row stride r3) displaced by
+ ; one row up, one row down, one byte left and one byte right.
+ ; Two rows are packed per xmm register (movq low half, movhps high
+ ; half). The four sums are stored at [r4].
+ ; NOTE(review): xmm6/xmm7 are callee-saved in the Microsoft x64 ABI and are not saved here - confirm.
+
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm4, xmm4 ; SAD accumulator for pRefMb - i_stride_ref (one row up)
+ pxor xmm5, xmm5 ; SAD accumulator for pRefMb + i_stride_ref (one row down)
+ pxor xmm6, xmm6 ; SAD accumulator for pRefMb - 1 (one byte left)
+ pxor xmm7, xmm7 ; SAD accumulator for pRefMb + 1 (one byte right)
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]  ; xmm0 = rows 0 and 1 of the r0 block
+ sub r2, r3  ; r2 now points one row above the r2 block
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]  ; xmm3 = the rows one above rows 0 and 1
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]  ; row 0, one byte left
+ movq xmm3, [r2+r3+1]  ; row 0, one byte right
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]  ; row 1, one byte left
+ movhps xmm3, [r2+1]  ; row 1, one byte right
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]  ; xmm3 = the rows one below rows 0 and 1
+ psadbw xmm0, xmm3  ; destroys xmm0 and keeps xmm3 for the next row pair
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]  ; next row pair of the r0 block
+ psadbw xmm3, xmm0  ; xmm3 still holds the rows one above this pair
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ ; Fold the two 64-bit half sums of each accumulator, then pack and store (legacy code loaded the fifth parameter into edi here).
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5  ; low dwords: -stride sum, +stride sum
+ punpckldq xmm6, xmm7  ; low dwords: -1 sum, +1 sum
+ punpcklqdq xmm4, xmm6  ; xmm4 = -stride, +stride, -1, +1
+ movdqa [r4],xmm4  ; aligned store: [r4] must be 16-byte aligned
+ LOAD_5_PARA_POP
+ ret
+
+WELS_EXTERN WelsSampleSadFour4x4_sse2
+WelsSampleSadFour4x4_sse2:
+ ; Computes four SADs over 4 rows of 4 bytes: the block at r0 (row
+ ; stride r1) against the block at r2 (row stride r3) displaced by
+ ; one row up, one row down, one byte left and one byte right.
+ ; Each 4x4 block is packed into a single xmm register, one row per
+ ; dword. The four sums are stored at [r4].
+ ; NOTE(review): xmm6/xmm7 are callee-saved in the Microsoft x64 ABI and are not saved here - confirm.
+
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movd xmm0, [r0]
+ movd xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movd xmm2, [r0]
+ movd xmm3, [r0+r1]
+ punpckldq xmm0, xmm1
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2  ; xmm0 = rows 0-3 of the r0 block
+ sub r2, r3  ; r2 now points one row above the r2 block
+ movd xmm1, [r2]
+ movd xmm2, [r2+r3]
+ punpckldq xmm1, xmm2  ; first two rows of the "one row up" block
+ movd xmm2, [r2+r3-1]  ; row 0, one byte left
+ movd xmm3, [r2+r3+1]  ; row 0, one byte right
+
+ lea r2, [r2+2*r3]
+
+ movd xmm4, [r2]
+ movd xmm5, [r2-1]
+ punpckldq xmm2, xmm5
+ movd xmm5, [r2+1]
+ punpckldq xmm3, xmm5
+
+ movd xmm5, [r2+r3]
+ punpckldq xmm4, xmm5
+
+ punpcklqdq xmm1, xmm4 ; complete "one row up" block (-L)
+
+ movd xmm5, [r2+r3-1]
+ movd xmm6, [r2+r3+1]
+
+ lea r2, [r2+2*r3]
+ movd xmm7, [r2-1]
+ punpckldq xmm5, xmm7
+ punpcklqdq xmm2, xmm5 ; complete "one byte left" block (-1)
+ movd xmm7, [r2+1]
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm3, xmm6 ; complete "one byte right" block (+1)
+ movd xmm6, [r2]
+ movd xmm7, [r2+r3]
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6 ; complete "one row down" block (+L)
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+
+ movhlps xmm0, xmm1  ; fold the two 64-bit half sums of each result
+ paddw xmm1, xmm0
+ movhlps xmm0, xmm2
+ paddw xmm2, xmm0
+ movhlps xmm0, xmm3
+ paddw xmm3, xmm0
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ ; Pack the four sums and store them (legacy code loaded the fifth parameter into edi here).
+ punpckldq xmm1, xmm4  ; low dwords: -L sum, +L sum
+ punpckldq xmm2, xmm3  ; low dwords: -1 sum, +1 sum
+ punpcklqdq xmm1, xmm2  ; xmm1 = -L, +L, -1, +1
+ movdqa [r4],xmm1  ; aligned store: [r4] must be 16-byte aligned
+ LOAD_5_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 END
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSad4x4_mmx
+
+align 16
+;***********************************************************************
+; int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
+;***********************************************************************
+WelsSampleSad4x4_mmx:
+ ; Sum of absolute differences over 4 rows of 4 bytes using MMX
+ ; registers, two rows per 64-bit register.
+ ; r0 = first block pointer  (pix1address, eax in the legacy code)
+ ; r1 = first block stride   (pix1stride, ebx in the legacy code)
+ ; r2 = second block pointer (pix2address, ecx in the legacy code)
+ ; r3 = second block stride  (pix2stride, edx in the legacy code)
+ ; The parameters are presumably loaded by LOAD_4_PARA, which is
+ ; defined outside this chunk.
+ ; Uses mm0-mm4; the total is written to retrd. WELSEMMS is invoked
+ ; before returning (presumably to leave MMX state - confirm).
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movd mm0, [r0]
+ movd mm1, [r0+r1]
+ punpckldq mm0, mm1  ; mm0 = rows 0 and 1 of the first block
+
+ movd mm3, [r2]
+ movd mm4, [r2+r3]
+ punpckldq mm3, mm4  ; mm3 = rows 0 and 1 of the second block
+ psadbw mm0, mm3  ; SAD of rows 0-1
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+
+ movd mm1, [r0]
+ movd mm2, [r0+r1]
+ punpckldq mm1, mm2  ; mm1 = rows 2 and 3 of the first block
+
+ movd mm3, [r2]
+ movd mm4, [r2+r3]
+ punpckldq mm3, mm4  ; mm3 = rows 2 and 3 of the second block
+ psadbw mm1, mm3  ; SAD of rows 2-3
+ paddw mm0, mm1  ; total over all four rows
+
+ movd retrd, mm0
+
+ WELSEMMS
+ LOAD_4_PARA_POP
+ ret
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -16,6 +16,7 @@
$(COMMON_SRCDIR)/./mb_copy.asm\
$(COMMON_SRCDIR)/./mc_chroma.asm\
$(COMMON_SRCDIR)/./mc_luma.asm\
+ $(COMMON_SRCDIR)/./satd_sad.asm\
$(COMMON_SRCDIR)/./vaa.asm\
COMMON_OBJS += $(COMMON_ASM_SRCS:.asm=.o)
--- a/codec/encoder/core/asm/satd_sad.asm
+++ /dev/null
@@ -1,2344 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* satd_sad.asm
-;*
-;* Abstract
-;* WelsSampleSatd4x4_sse2
-;* WelsSampleSatd8x8_sse2
-;* WelsSampleSatd16x8_sse2
-;* WelsSampleSatd8x16_sse2
-;* WelsSampleSatd16x16_sse2
-;*
-;* WelsSampleSad16x8_sse2
-;* WelsSampleSad16x16_sse2
-;*
-;* History
-;* 8/5/2009 Created
-;* 24/9/2009 modified
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Data
-;***********************************************************************
-SECTION .rodata align=16
-
-align 16
-HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
-align 16
-HSumSubDW1: dw 1,-1,1,-1,1,-1,1,-1
-align 16
-PDW1: dw 1,1,1,1,1,1,1,1
-align 16
-PDQ2: dw 2,0,0,0,2,0,0,0
-align 16
-HSwapSumSubDB1: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 BEGIN
-;
-;***********************************************************************
-%macro MMX_DW_1_2REG 2
- pxor %1, %1
- pcmpeqw %2, %2
- psubw %1, %2
-%endmacro
-
-%macro SSE2_SumWHorizon1 2
- movdqa %2, %1
- psrldq %2, 8
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 4
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 2
- paddusw %1, %2
-%endmacro
-
-%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
- SSE2_SumSub %1, %2, %5
- SSE2_SumSub %3, %4, %5
- SSE2_SumSub %2, %4, %5
- SSE2_SumSub %1, %3, %5
-%endmacro
-
-%macro SSE2_SumAbs4 7
- WELS_AbsW %1, %3
- WELS_AbsW %2, %3
- WELS_AbsW %4, %6
- WELS_AbsW %5, %6
- paddusw %1, %2
- paddusw %4, %5
- paddusw %7, %1
- paddusw %7, %4
-%endmacro
-
-%macro SSE2_SumWHorizon 3
- movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
- paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
- paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
- pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
- paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
-%endmacro
-
-%macro SSE2_GetSatd8x8 0
- SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
-
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
- SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
-
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
- SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse2
-align 16
-WelsSampleSatd4x4_sse2:
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov ecx, [esp+16]
- ;mov edx, [esp+20]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- movd xmm0, [r0]
- movd xmm1, [r0+r1]
- lea r0 , [r0+2*r1]
- movd xmm2, [r0]
- movd xmm3, [r0+r1]
- punpckldq xmm0, xmm2
- punpckldq xmm1, xmm3
-
- movd xmm4, [r2]
- movd xmm5, [r2+r3]
- lea r2 , [r2+2*r3]
- movd xmm6, [r2]
- movd xmm7, [r2+r3]
- punpckldq xmm4, xmm6
- punpckldq xmm5, xmm7
-
- pxor xmm6, xmm6
- punpcklbw xmm0, xmm6
- punpcklbw xmm1, xmm6
- punpcklbw xmm4, xmm6
- punpcklbw xmm5, xmm6
-
- psubw xmm0, xmm4
- psubw xmm1, xmm5
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
- SSE2_XSawp qdq, xmm0, xmm2, xmm3
-
- movdqa xmm4, xmm0
- paddw xmm0, xmm3
- psubw xmm4, xmm3
-
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm4
- punpckhwd xmm4, xmm2
-
- SSE2_XSawp dq, xmm0, xmm4, xmm3
- SSE2_XSawp qdq, xmm0, xmm3, xmm5
-
- movdqa xmm7, xmm0
- paddw xmm0, xmm5
- psubw xmm7, xmm5
-
- SSE2_XSawp qdq, xmm0, xmm7, xmm1
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
-
- WELS_AbsW xmm0, xmm3
- paddusw xmm6, xmm0
- WELS_AbsW xmm2, xmm4
- paddusw xmm6, xmm2
- SSE2_SumWHorizon1 xmm6, xmm4
- movd retrd, xmm6
- and retrd, 0xffff
- shr retrd, 1
- LOAD_4_PARA_POP
- ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x8_sse2
-align 16
- WelsSampleSatd8x8_sse2:
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov ecx, [esp+16]
- ;mov edx, [esp+20]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm6, xmm6
- pxor xmm7, xmm7
- SSE2_GetSatd8x8
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- LOAD_4_PARA_POP
- ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x16_sse2
-align 16
- WelsSampleSatd8x16_sse2:
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov ecx, [esp+16]
- ;mov edx, [esp+20]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- SSE2_GetSatd8x8
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSatd8x8
-
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- LOAD_4_PARA_POP
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse2
-align 16
-WelsSampleSatd16x8_sse2:
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov ecx, [esp+16]
- ;mov edx, [esp+20]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- push r0
- push r2
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- SSE2_GetSatd8x8
-
- pop r2
- pop r0
- ;mov eax, [esp+8]
- ;mov ecx, [esp+16]
- add r0, 8
- add r2, 8
- SSE2_GetSatd8x8
-
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- LOAD_4_PARA_POP
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x16_sse2
-align 16
-WelsSampleSatd16x16_sse2:
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov ecx, [esp+16]
- ;mov edx, [esp+20]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- push r0
- push r2
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- SSE2_GetSatd8x8
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSatd8x8
-
- pop r2
- pop r0
- ;mov eax, [esp+8]
- ;mov ecx, [esp+16]
- add r0, 8
- add r2, 8
-
- SSE2_GetSatd8x8
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSatd8x8
-
- ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- LOAD_4_PARA_POP
- ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
- pmaddubsw %1, xmm5
- movdqa %2, %1
- pmaddwd %1, xmm7
- pmaddwd %2, xmm6
- movdqa %3, %1
- punpckldq %1, %2
- punpckhdq %2, %3
- movdqa %3, %1
- punpcklqdq %1, %2
- punpckhqdq %3, %2
- paddd xmm4, %1 ;for dc
- paddd xmm4, %3 ;for dc
- packssdw %1, %3
- psllw %1, 2
-%endmacro
-%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
- pmaddubsw %1, xmm5
- movdqa %2, %1
- pmaddwd %1, xmm7
- pmaddwd %2, xmm6
- movdqa %3, %1
- punpckldq %1, %2
- punpckhdq %2, %3
- movdqa %3, %1
- punpcklqdq %1, %2
- punpckhqdq %3, %2
-; paddd xmm4, %1 ;for dc
-; paddd xmm4, %3 ;for dc
- movdqa %4, %1
- punpcklqdq %4, %3
- packssdw %1, %3
- psllw %1, 2
-%endmacro
-
-%macro SSE41_GetX38x4SatdDec 0
- pxor xmm7, xmm7
- movq xmm0, [eax]
- movq xmm1, [eax+ebx]
- lea eax, [eax+2*ebx]
- movq xmm2, [eax]
- movq xmm3, [eax+ebx]
- lea eax, [eax+2*ebx]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
- ;doesn't need another transpose
-%endmacro
-%macro SSE41_GetX38x4SatdV 2
- pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2], 0
- pinsrw xmm0, word[esi+%2+8], 4
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2+2], 0
- pinsrw xmm0, word[esi+%2+10], 4
- psubsw xmm0, xmm1
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2+4], 0
- pinsrw xmm0, word[esi+%2+12], 4
- psubsw xmm0, xmm3
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2+6], 0
- pinsrw xmm0, word[esi+%2+14], 4
- psubsw xmm0, xmm2
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
-%endmacro
-%macro SSE41_GetX38x4SatdH 3
- movq xmm0, [esi+%3+8*%1]
- punpcklqdq xmm0, xmm0
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm5, xmm0
- pabsw xmm1, xmm1
- pabsw xmm2, xmm2
- pabsw xmm3, xmm3
- paddw xmm2, xmm1;for DC
- paddw xmm2, xmm3;for DC
- paddw xmm5, xmm2
-%endmacro
-%macro SSE41_I16X16GetX38x4SatdDC 0
- pxor xmm0, xmm0
- movq2dq xmm0, mm4
- punpcklqdq xmm0, xmm0
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm6, xmm0
- paddw xmm6, xmm2
-%endmacro
-%macro SSE41_ChromaGetX38x4SatdDC 1
- shl %1, 4
- movdqa xmm0, [esi+32+%1]
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm6, xmm0
- paddw xmm6, xmm2
-%endmacro
-%macro SSE41_I16x16GetX38x4Satd 2
- SSE41_GetX38x4SatdDec
- SSE41_GetX38x4SatdV %1, %2
- SSE41_GetX38x4SatdH %1, %2, 32
- SSE41_I16X16GetX38x4SatdDC
-%endmacro
-%macro SSE41_ChromaGetX38x4Satd 2
- SSE41_GetX38x4SatdDec
- SSE41_GetX38x4SatdV %1, %2
- SSE41_GetX38x4SatdH %1, %2, 16
- SSE41_ChromaGetX38x4SatdDC %1
-%endmacro
-%macro SSE41_HSum8W 3
- pmaddwd %1, %2
- movhlps %3, %1
- paddd %1, %3
- pshuflw %3, %1,0Eh
- paddd %1, %3
-%endmacro
-
-
-%ifdef X86_32
-WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
-WelsIntra16x16Combined3Satd_sse41: ; evaluates the V, H and DC 16x16 intra modes, stores the cheapest mode id to [arg5] and returns its cost in eax (x86-32, stack arguments)
- push ebx
- push esi
- push edi
- mov ecx, [esp+16] ; arg1: pixel pointer; its top row and left column are read below
- mov edx, [esp+20] ; arg2: stride used with arg1
- mov eax, [esp+24] ; arg3 (presumably read by SSE41_GetX38x4SatdDec, defined outside this view - TODO confirm)
- mov ebx, [esp+28] ; arg4
- mov esi, [esp+40] ;temp_satd (arg7); written with movdqa, so it must be 16-byte aligned
- pxor xmm4, xmm4
- movdqa xmm5, [HSumSubDB1]
- movdqa xmm6, [HSumSubDW1]
- movdqa xmm7, [PDW1]
- sub ecx, edx ; step up to the row above the block
- movdqu xmm0, [ecx] ; 16 top-neighbour pixels
- movhlps xmm1, xmm0
- punpcklqdq xmm0, xmm0 ; pixels 0..7 duplicated
- punpcklqdq xmm1, xmm1 ; pixels 8..15 duplicated
- SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
- SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
- movdqa [esi], xmm0 ;V
- movdqa [esi+16], xmm1
- add ecx, edx ; back to row 0 of the block
- pinsrb xmm0, byte[ecx-1], 0 ; gather the left-neighbour column, one byte per row
- pinsrb xmm0, byte[ecx+edx-1], 1
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 2
- pinsrb xmm0, byte[ecx+edx-1], 3
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 4
- pinsrb xmm0, byte[ecx+edx-1], 5
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 6
- pinsrb xmm0, byte[ecx+edx-1], 7
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 8
- pinsrb xmm0, byte[ecx+edx-1], 9
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 10
- pinsrb xmm0, byte[ecx+edx-1], 11
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 12
- pinsrb xmm0, byte[ecx+edx-1], 13
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 14
- pinsrb xmm0, byte[ecx+edx-1], 15
- movhlps xmm1, xmm0
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
- SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
- movdqa [esi+32], xmm0 ;H
- movdqa [esi+48], xmm1
- movd ecx, xmm4 ;dc
- add ecx, 16 ;(sum+16)
- shr ecx, 5 ;((sum+16)>>5)
- shl ecx, 4 ;scaled by 16
- movd mm4, ecx ; mm4 copy DC
- pxor xmm4, xmm4 ;V
- pxor xmm5, xmm5 ;H
- pxor xmm6, xmm6 ;DC
- mov ecx, 0 ; inner counter: row group 0..3
- mov edi, 0 ; outer state: V-table offset, 0 then 16
-.loop16x16_get_satd:
-.loopStart1:
- SSE41_I16x16GetX38x4Satd ecx, edi
- inc ecx
- cmp ecx, 4
- jl .loopStart1
- cmp edi, 16 ; second pass finished?
- je .loop16x16_get_satd_end
- mov eax, [esp+24] ; reload arg3 and move 8 bytes right for the second pass
- add eax, 8
- mov ecx, 0
- add edi, 16
- jmp .loop16x16_get_satd
- .loop16x16_get_satd_end:
- MMX_DW_1_2REG xmm0, xmm1
- psrlw xmm4, 1 ;/2
- psrlw xmm5, 1 ;/2
- psrlw xmm6, 1 ;/2
- SSE41_HSum8W xmm4, xmm0, xmm1
- SSE41_HSum8W xmm5, xmm0, xmm1
- SSE41_HSum8W xmm6, xmm0, xmm1
-
- ; comparing order: DC H V
- movd ebx, xmm6 ;DC
- movd edi, xmm5 ;H
- movd ecx, xmm4 ;V
- mov edx, [esp+36] ; arg6: lambda
- shl edx, 1 ; 2*lambda is charged to H and DC, not to V
- add edi, edx
- add ebx, edx
- mov edx, [esp+32] ; arg5: where the chosen mode id is stored
- cmp ebx, edi
- jge near not_dc_16x16 ; signed compare: DC >= H
- cmp ebx, ecx
- jge near not_dc_h_16x16 ; DC >= V
-
- ; for DC mode
- mov dword[edx], 2;I16_PRED_DC
- mov eax, ebx
- jmp near return_satd_intra_16x16_x3
-not_dc_16x16:
- ; for H mode
- cmp edi, ecx
- jge near not_dc_h_16x16 ; H >= V
- mov dword[edx], 1;I16_PRED_H
- mov eax, edi
- jmp near return_satd_intra_16x16_x3
-not_dc_h_16x16:
- ; for V mode
- mov dword[edx], 0;I16_PRED_V
- mov eax, ecx
-return_satd_intra_16x16_x3:
- WELSEMMS
- pop edi
- pop esi
- pop ebx
-ret
-
-%macro SSE41_ChromaGetX38x8Satd 0 ; one 8x8 chroma plane: in ecx = pixel pointer, edx = stride, esi = 16-byte aligned temp buffer; out xmm4/xmm5/xmm6 = V/H/DC costs; defines a non-local label, so expand it only once per file
- movdqa xmm5, [HSumSubDB1]
- movdqa xmm6, [HSumSubDW1]
- movdqa xmm7, [PDW1]
- sub ecx, edx ; row above the block
- movq xmm0, [ecx] ; 8 top-neighbour pixels
- punpcklqdq xmm0, xmm0
- SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
- movdqa [esi], xmm0 ;V
- add ecx, edx ; back to row 0
- pinsrb xmm0, byte[ecx-1], 0 ; gather the 8 left-neighbour pixels
- pinsrb xmm0, byte[ecx+edx-1], 1
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 2
- pinsrb xmm0, byte[ecx+edx-1], 3
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 4
- pinsrb xmm0, byte[ecx+edx-1], 5
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 6
- pinsrb xmm0, byte[ecx+edx-1], 7
- punpcklqdq xmm0, xmm0
- SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
- movdqa [esi+16], xmm0 ;H
-;(sum+2)>>2
- movdqa xmm6, [PDQ2]
- movdqa xmm5, xmm4
- punpckhqdq xmm5, xmm1
- paddd xmm5, xmm6
- psrld xmm5, 2
-;(sum1+sum2+4)>>3
- paddd xmm6, xmm6 ; rounding constant doubled for the >>3 case
- paddd xmm4, xmm1
- paddd xmm4, xmm6
- psrld xmm4, 3
-;satd *16
- pslld xmm5, 4
- pslld xmm4, 4
-;temp satd
- movdqa xmm6, xmm4
- punpcklqdq xmm4, xmm5
- psllq xmm4, 32 ; shift pair keeps only the low dword of each qword
- psrlq xmm4, 32
- movdqa [esi+32], xmm4
- punpckhqdq xmm5, xmm6
- psllq xmm5, 32
- psrlq xmm5, 32
- movdqa [esi+48], xmm5
-
- pxor xmm4, xmm4 ;V
- pxor xmm5, xmm5 ;H
- pxor xmm6, xmm6 ;DC
- mov ecx, 0
-loop_chroma_satdx3_cb_cr:
- SSE41_ChromaGetX38x4Satd ecx, 0 ; leaves ecx shifted left by 4 (0 -> 0, 1 -> 16)
- inc ecx ; 0 -> 1 on the first pass, 16 -> 17 on the second
- cmp ecx, 2
- jl loop_chroma_satdx3_cb_cr ; so the body runs exactly twice
-%endmacro
-
-%macro SSEReg2MMX 3 ; spills xmm param 1 into two MMX registers: param 2 = low qword, param 3 = high qword; param 1 is modified
- movdq2q %2, %1 ; low 64 bits
- movhlps %1, %1 ; high qword copied over the low qword
- movdq2q %3, %1 ; former high 64 bits
-%endmacro
-%macro MMXReg2SSE 4 ; rebuilds xmm param 1 from MMX param 3 (low qword) and param 4 (high qword); param 2 is a scratch xmm
- movq2dq %1, %3
- movq2dq %2, %4
- punpcklqdq %1, %2 ; param 1 = param 4 : param 3
-%endmacro
-;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
-
-WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
-WelsIntraChroma8x8Combined3Satd_sse41: ; evaluates DC/H/V over two 8x8 chroma planes, stores the cheapest mode id to [arg5] and returns the summed cost in eax (x86-32, stack arguments)
- push ebx
- push esi
- push edi
- mov ecx, [esp+16] ; arg1: pixel pointer for the first plane
- mov edx, [esp+20] ; arg2: stride used with the pixel pointer
- mov eax, [esp+24] ; arg3 for the first plane
- mov ebx, [esp+28] ; arg4
- mov esi, [esp+40] ;temp_satd
- xor edi, edi ; edi = pass counter (0 = first plane, 1 = second plane)
-loop_chroma_satdx3:
- SSE41_ChromaGetX38x8Satd
- cmp edi, 1
- je loop_chroma_satdx3end
- inc edi
- SSEReg2MMX xmm4, mm0,mm1 ; park the first plane's V/H/DC costs in MMX registers
- SSEReg2MMX xmm5, mm2,mm3
- SSEReg2MMX xmm6, mm5,mm6
- mov ecx, [esp+44] ; arg8: pixel pointer for the second plane
- mov eax, [esp+48] ; arg9: second-plane counterpart of arg3
- jmp loop_chroma_satdx3
-loop_chroma_satdx3end:
- MMXReg2SSE xmm0, xmm3, mm0, mm1 ; recover the first plane's costs
- MMXReg2SSE xmm1, xmm3, mm2, mm3
- MMXReg2SSE xmm2, xmm3, mm5, mm6
-
- paddw xmm4, xmm0 ; V, both planes
- paddw xmm5, xmm1 ; H, both planes
- paddw xmm6, xmm2 ; DC, both planes
-
- MMX_DW_1_2REG xmm0, xmm1
- psrlw xmm4, 1 ;/2
- psrlw xmm5, 1 ;/2
- psrlw xmm6, 1 ;/2
- SSE41_HSum8W xmm4, xmm0, xmm1
- SSE41_HSum8W xmm5, xmm0, xmm1
- SSE41_HSum8W xmm6, xmm0, xmm1
- ; comparing order: DC H V
- movd ebx, xmm6 ;DC
- movd edi, xmm5 ;H
- movd ecx, xmm4 ;V
- mov edx, [esp+36] ; arg6: lambda
- shl edx, 1 ; 2*lambda is charged to H and V, not to DC
- add edi, edx
- add ecx, edx
- mov edx, [esp+32] ; arg5: where the chosen mode id is stored
- cmp ebx, edi
- jge near not_dc_8x8 ; signed compare: DC >= H
- cmp ebx, ecx
- jge near not_dc_h_8x8 ; DC >= V
-
- ; for DC mode
- mov dword[edx], 0;I8_PRED_DC
- mov eax, ebx
- jmp near return_satd_intra_8x8_x3
-not_dc_8x8:
- ; for H mode
- cmp edi, ecx
- jge near not_dc_h_8x8 ; H >= V
- mov dword[edx], 1;I8_PRED_H
- mov eax, edi
- jmp near return_satd_intra_8x8_x3
-not_dc_h_8x8:
- ; for V mode
- mov dword[edx], 2;I8_PRED_V
- mov eax, ecx
-return_satd_intra_8x8_x3:
- WELSEMMS
- pop edi
- pop esi
- pop ebx
-ret
-
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 END
-;
-;***********************************************************************
-%macro SSSE3_Get16BSadHVDC 2 ; param 1 = 16-byte temp row (dword holding the left pixel on entry), param 2 = 16-byte source row; expects xmm1 = 0, xmm5 = top row, xmm7 = DC row
- movd xmm6,%1 ; left-neighbour pixel saved earlier in the temp row
- pshufb xmm6,xmm1 ; zero shuffle mask broadcasts byte 0 to all 16 lanes
- movdqa %1, xmm6 ; temp row now holds the H prediction row
- movdqa xmm0,%2
- psadbw xmm0,xmm7
- paddw xmm4,xmm0 ; xmm4 accumulates the DC SAD
- movdqa xmm0,%2
- psadbw xmm0,xmm5
- paddw xmm2,xmm0 ; xmm2 accumulates the V SAD
- psadbw xmm6,%2
- paddw xmm3,xmm6 ; xmm3 accumulates the H SAD
-%endmacro
-%macro WelsAddDCValue 4 ; param 1 = byte address to read, param 2 = scratch register, param 3 = dword destination, param 4 = running sum register
- movzx %2, byte %1 ; zero-extended pixel
- mov %3, %2 ; save it to the destination
- add %4, %2 ; and add it to the running sum
-%endmacro
-
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 BEGIN
-;
-;***********************************************************************
-WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
-WelsIntra16x16Combined3Sad_ssse3: ; evaluates the V, H and DC 16x16 intra modes by SAD, stores the cheapest mode id to [arg5], leaves that mode's predicted rows in temp_sad and returns its cost in eax (x86-32)
- push ebx
- push esi
- push edi
- mov ecx, [esp+16] ; arg1: pixel pointer whose top row and left column are read
- mov edx, [esp+20] ; arg2: stride used with arg1
- mov edi, [esp+40] ;temp_sad
- sub ecx, edx ; row above the block
- movdqa xmm5,[ecx] ; top row = V prediction; aligned load, so this address must be 16-byte aligned
- pxor xmm0,xmm0
- psadbw xmm0,xmm5 ; SAD against zero = sum of the bytes in each half
- movhlps xmm1,xmm0
- paddw xmm0,xmm1
- movd eax,xmm0 ; eax = sum of the 16 top pixels
-
- add ecx,edx
- lea ebx, [edx+2*edx] ; ebx = 3*stride
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- lea ecx, [ecx+4*edx]
- add edi, 64
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- lea ecx, [ecx+4*edx]
- add edi, 64
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- lea ecx, [ecx+4*edx]
- add edi, 64
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- sub edi, 192 ; rewind temp_sad to its first row
- add eax,10h ; DC = (top sum + left sum + 16) >> 5
- shr eax,5
- movd xmm7,eax
- pxor xmm1,xmm1 ; zero shuffle mask, also used by SSSE3_Get16BSadHVDC
- pshufb xmm7,xmm1 ; xmm7 = DC value in all 16 bytes
- pxor xmm4,xmm4 ; DC SAD accumulator
- pxor xmm3,xmm3 ; H SAD accumulator
- pxor xmm2,xmm2 ; V SAD accumulator
-;sad begin
- mov eax, [esp+24] ; arg3: source pixels; used as movdqa memory operands, so rows must be 16-byte aligned
- mov ebx, [esp+28] ; arg4: source stride
- lea esi, [ebx+2*ebx] ; esi = 3*source stride
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
- lea eax, [eax+4*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
- lea eax, [eax+4*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
- lea eax, [eax+4*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-
- pslldq xmm3,4 ; move the H partial sums into dwords 1 and 3
- por xmm3,xmm2 ; V partial sums stay in dwords 0 and 2
- movhlps xmm1,xmm3
- paddw xmm3,xmm1 ; dword 0 = V total, dword 1 = H total
- movhlps xmm0,xmm4
- paddw xmm4,xmm0 ; low dword = DC total
-; comparing order: DC H V
- movd ebx, xmm4 ;DC
- movd ecx, xmm3 ;V
- psrldq xmm3, 4
- movd esi, xmm3 ;H
- mov eax, [esp+36] ;lambda (arg6)
- shl eax, 1 ; 2*lambda is charged to H and DC, not to V
- add esi, eax
- add ebx, eax
- mov edx, [esp+32] ; arg5: where the chosen mode id is stored
- cmp ebx, esi
- jge near not_dc_16x16_sad ; signed compare: DC >= H
- cmp ebx, ecx
- jge near not_dc_h_16x16_sad ; DC >= V
- ; for DC mode
- mov dword[edx], 2;I16_PRED_DC
- mov eax, ebx
- sub edi, 192 ; rewind and overwrite all 16 temp rows with the DC row
-%assign x 0
-%rep 16
- movdqa [edi+16*x], xmm7
-%assign x x+1
-%endrep
- jmp near return_sad_intra_16x16_x3
-not_dc_16x16_sad:
- ; for H mode
- cmp esi, ecx
- jge near not_dc_h_16x16_sad ; H >= V
- mov dword[edx], 1;I16_PRED_H
- mov eax, esi ; temp rows already hold the H prediction written by SSSE3_Get16BSadHVDC
- jmp near return_sad_intra_16x16_x3
-not_dc_h_16x16_sad:
- ; for V mode
- mov dword[edx], 0;I16_PRED_V
- mov eax, ecx
- sub edi, 192 ; rewind and overwrite all 16 temp rows with the top row
-%assign x 0
-%rep 16
- movdqa [edi+16*x], xmm5
-%assign x x+1
-%endrep
-return_sad_intra_16x16_x3:
- pop edi
- pop esi
- pop ebx
- ret
-%endif
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 END
-;
-;***********************************************************************
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 BEGIN
-;
-;***********************************************************************
-
-;SSE4.1
-%macro SSE41_GetSatd8x4 0 ; adds the cost of one 8x4 tile to xmm6; in: r0/r1 and r2/r3 = the two pixel pointers and strides, r4 = 3*r1, r5 = 3*r3, xmm7 = pmaddubsw coefficients; clobbers xmm0-xmm5
- movq xmm0, [r0] ; row 0 of the first block
- punpcklqdq xmm0, xmm0
- pmaddubsw xmm0, xmm7
- movq xmm1, [r0+r1] ; row 1
- punpcklqdq xmm1, xmm1
- pmaddubsw xmm1, xmm7
- movq xmm2, [r2] ; row 0 of the second block
- punpcklqdq xmm2, xmm2
- pmaddubsw xmm2, xmm7
- movq xmm3, [r2+r3] ; row 1
- punpcklqdq xmm3, xmm3
- pmaddubsw xmm3, xmm7
- psubsw xmm0, xmm2 ; row 0 difference
- psubsw xmm1, xmm3 ; row 1 difference
- movq xmm2, [r0+2*r1] ; row 2 of the first block
- punpcklqdq xmm2, xmm2
- pmaddubsw xmm2, xmm7
- movq xmm3, [r0+r4] ; row 3
- punpcklqdq xmm3, xmm3
- pmaddubsw xmm3, xmm7
- movq xmm4, [r2+2*r3] ; row 2 of the second block
- punpcklqdq xmm4, xmm4
- pmaddubsw xmm4, xmm7
- movq xmm5, [r2+r5] ; row 3
- punpcklqdq xmm5, xmm5
- pmaddubsw xmm5, xmm7
- psubsw xmm2, xmm4 ; row 2 difference
- psubsw xmm3, xmm5 ; row 3 difference
- SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
- pabsw xmm0, xmm0
- pabsw xmm2, xmm2
- pabsw xmm1, xmm1
- pabsw xmm3, xmm3
- movdqa xmm4, xmm3
- pblendw xmm3, xmm1, 0xAA ; odd words from xmm1, even words from xmm3
- pslld xmm1, 16
- psrld xmm4, 16
- por xmm1, xmm4 ; the complementary interleave
- pmaxuw xmm1, xmm3 ; per-word maximum of the two interleaves
- paddw xmm6, xmm1
- movdqa xmm4, xmm0
- pblendw xmm0, xmm2, 0xAA ; same max step for the xmm0/xmm2 pair
- pslld xmm2, 16
- psrld xmm4, 16
- por xmm2, xmm4
- pmaxuw xmm0, xmm2
- paddw xmm6, xmm0
-%endmacro
-
-%macro SSSE3_SumWHorizon 4 ;dest 32-bit reg, srcSSE (destroyed), tempSSE, tempSSE
- MMX_DW_1_2REG %3, %4 ; presumably fills param 3 with words of 1 (macro defined elsewhere - confirm)
- pmaddwd %2, %3 ; multiply word pairs and add -> four dwords
- movhlps %4, %2
- paddd %2, %4 ; fold the upper qword onto the lower
- pshuflw %4, %2,0Eh ; dword 1 -> dword 0 of the temp
- paddd %2, %4 ; low dword = sum of all four dwords
- movd %1, %2
-%endmacro
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse41
-WelsSampleSatd4x4_sse41: ; 4x4 SATD of the block at r0 (stride r1) against the block at r2 (stride r3); result in retrd; uses xmm0-xmm5 and xmm7
- ;push ebx
- ;mov eax,[esp+8]
- ;mov ebx,[esp+12]
- ;mov ecx,[esp+16]
- ;mov edx,[esp+20]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- movdqa xmm4,[HSwapSumSubDB1]
- movd xmm2,[r2] ; second block: rows 0/1 into xmm2, rows 2/3 into xmm3
- movd xmm5,[r2+r3]
- shufps xmm2,xmm5,0
- movd xmm3,[r2+r3*2]
- lea r2, [r3*2+r2]
- movd xmm5,[r2+r3]
- shufps xmm3,xmm5,0
- movd xmm0,[r0] ; first block: rows 0/1 into xmm0, rows 2/3 into xmm1
- movd xmm5,[r0+r1]
- shufps xmm0,xmm5,0
- movd xmm1,[r0+r1*2]
- lea r0, [r1*2+r0]
- movd xmm5,[r0+r1]
- shufps xmm1,xmm5,0
- pmaddubsw xmm0,xmm4
- pmaddubsw xmm1,xmm4
- pmaddubsw xmm2,xmm4
- pmaddubsw xmm3,xmm4
- psubw xmm0,xmm2 ; difference of the transformed rows
- psubw xmm1,xmm3
- movdqa xmm2,xmm0
- paddw xmm0,xmm1 ; butterfly: sum
- psubw xmm1,xmm2 ; butterfly: difference
- movdqa xmm2,xmm0
- punpcklqdq xmm0,xmm1
- punpckhqdq xmm2,xmm1
- movdqa xmm1,xmm0
- paddw xmm0,xmm2 ; second butterfly stage
- psubw xmm2,xmm1
- movdqa xmm1,xmm0
- pblendw xmm0,xmm2,0AAh ; odd words from xmm2, even words from xmm0
- pslld xmm2,16
- psrld xmm1,16
- por xmm2,xmm1 ; the complementary interleave
- pabsw xmm0,xmm0
- pabsw xmm2,xmm2
- pmaxsw xmm0,xmm2 ; per-word maximum of the absolute values
- SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
- LOAD_4_PARA_POP
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x8_sse41
-align 16
-WelsSampleSatd8x8_sse41: ; 8x8 SATD as two stacked 8x4 tiles; r0/r1 and r2/r3 = the two pixel pointers and strides; result in retrd
- ;push ebx
- ;push esi
- ;push edi
- ;mov eax, [esp+16]
- ;mov ebx, [esp+20]
- ;mov ecx, [esp+24]
- ;mov edx, [esp+28]
-%ifdef X86_32
- push r4
- push r5
-%endif
- %assign push_num 2
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2] ; r4 = 3*r1, required by SSE41_GetSatd8x4
- lea r5, [r3+r3*2] ; r5 = 3*r3
- pxor xmm6, xmm6 ; xmm6 = running cost
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1] ; next four rows
- lea r2, [r2+4*r3]
- SSE41_GetSatd8x4
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- LOAD_4_PARA_POP
-%ifdef X86_32
- pop r5
- pop r4
-%endif
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x16_sse41
-align 16
-WelsSampleSatd8x16_sse41: ; 8x16 SATD as four stacked 8x4 tiles; r0/r1 and r2/r3 = the two pixel pointers and strides; result in retrd
- ;push ebx
- ;push esi
- ;push edi
- ;push ebp
- ;%define pushsize 16
- ;mov eax, [esp+pushsize+4]
- ;mov ebx, [esp+pushsize+8]
- ;mov ecx, [esp+pushsize+12]
- ;mov edx, [esp+pushsize+16]
-%ifdef X86_32
- push r4
- push r5
- push r6
-%endif
- %assign push_num 3
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2] ; r4 = 3*r1, required by SSE41_GetSatd8x4
- lea r5, [r3+r3*2] ; r5 = 3*r3
- pxor xmm6, xmm6 ; xmm6 = running cost
- mov r6, 0 ; r6 = tile counter
-loop_get_satd_8x16:
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1] ; next four rows
- lea r2, [r2+4*r3]
- inc r6
- cmp r6, 4
- jl loop_get_satd_8x16
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- LOAD_4_PARA_POP
-%ifdef X86_32
- pop r6
- pop r5
- pop r4
-%endif
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse41
-align 16
-WelsSampleSatd16x8_sse41: ; 16x8 SATD as left and right 8x8 halves, two 8x4 tiles each; r0/r1 and r2/r3 = the two pixel pointers and strides; result in retrd
- ;push ebx
- ;push esi
- ;push edi
- ;mov eax, [esp+16]
- ;mov ebx, [esp+20]
- ;mov ecx, [esp+24]
- ;mov edx, [esp+28]
-%ifdef X86_32
- push r4
- push r5
-%endif
- %assign push_num 2
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- push r0 ; keep the original pointers for the right half
- push r2
-
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2] ; r4 = 3*r1, required by SSE41_GetSatd8x4
- lea r5, [r3+r3*2] ; r5 = 3*r3
- pxor xmm6, xmm6 ; xmm6 = running cost
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE41_GetSatd8x4
-
- pop r2
- pop r0
- ;mov eax, [esp+16]
- ;mov ecx, [esp+24]
- add r0, 8 ; right 8 columns
- add r2, 8
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE41_GetSatd8x4
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- LOAD_4_PARA_POP
-%ifdef X86_32
- pop r5
- pop r4
-%endif
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-
-WELS_EXTERN WelsSampleSatd16x16_sse41
-align 16
-WelsSampleSatd16x16_sse41: ; 16x16 SATD as left and right 8x16 halves, four 8x4 tiles each; r0/r1 and r2/r3 = the two pixel pointers and strides; result in retrd
- ;push ebx
- ;push esi
- ;push edi
- ;push ebp
- ;%define pushsize 16
- ;mov eax, [esp+pushsize+4]
- ;mov ebx, [esp+pushsize+8]
- ;mov ecx, [esp+pushsize+12]
- ;mov edx, [esp+pushsize+16]
-%ifdef X86_32
- push r4
- push r5
- push r6
-%endif
- %assign push_num 3
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
-
- push r0 ; keep the original pointers for the right half
- push r2
-
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2] ; r4 = 3*r1, required by SSE41_GetSatd8x4
- lea r5, [r3+r3*2] ; r5 = 3*r3
- pxor xmm6, xmm6 ; xmm6 = running cost
- mov r6, 0 ; r6 = tile counter
-loop_get_satd_16x16_left:
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- inc r6
- cmp r6, 4
- jl loop_get_satd_16x16_left
-
- pop r2
- pop r0
- ;mov eax, [esp+pushsize+4]
- ;mov ecx, [esp+pushsize+12]
- add r0, 8 ; right 8 columns
- add r2, 8
- mov r6, 0
-loop_get_satd_16x16_right:
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- inc r6
- cmp r6, 4
- jl loop_get_satd_16x16_right
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- ;%undef pushsize
- LOAD_4_PARA_POP
-%ifdef X86_32
- pop r6
- pop r5
- pop r4
-%endif
- ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE2_GetSad2x16 0 ; advances r0 and r2 by two rows, then adds the SAD of the next two 16-byte rows to xmm0; clobbers xmm1, xmm2
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqu xmm1, [r2] ; r2 side is loaded unaligned
- MOVDQ xmm2, [r0];[r0] must be 16-byte aligned
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
- movdqu xmm1, [r2+r3]
- MOVDQ xmm2, [r0+r1]
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
-%endmacro
-
-
-%macro SSE2_GetSad4x16 0 ; adds the SAD of four 16-byte rows to xmm7; needs r4 = 3*r1 and r5 = 3*r3; pointers are not advanced; clobbers xmm0-xmm2
- movdqu xmm0, [r2] ; row 0
- MOVDQ xmm2, [r0]
- psadbw xmm0, xmm2
- paddw xmm7, xmm0
- movdqu xmm1, [r2+r3] ; row 1
- MOVDQ xmm2, [r0+r1]
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
- movdqu xmm1, [r2+2*r3] ; row 2
- MOVDQ xmm2, [r0+2*r1];[r0] must be 16-byte aligned
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
- movdqu xmm1, [r2+r5] ; row 3
- MOVDQ xmm2, [r0+r4]
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
-%endmacro
-
-
-%macro SSE2_GetSad8x4 0 ; adds the SAD of four 8-byte rows to xmm6; side effect: r0 and r2 end up advanced by two rows; clobbers xmm0-xmm3
- movq xmm0, [r0] ; row 0 in the low half
- movq xmm1, [r0+r1] ; row 1 in the low half
- lea r0, [r0+2*r1]
- movhps xmm0, [r0] ; row 2 in the high half
- movhps xmm1, [r0+r1] ; row 3 in the high half
-
- movq xmm2, [r2] ; same packing for the r2 block
- movq xmm3, [r2+r3]
- lea r2, [r2+2*r3]
- movhps xmm2, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm2 ; rows 0 and 2
- psadbw xmm1, xmm3 ; rows 1 and 3
- paddw xmm6, xmm0
- paddw xmm6, xmm1
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
-;First parameter can align to 16 bytes,
-;In wels, the third parameter can't align to 16 bytes.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x16_sse2
-align 16
-WelsSampleSad16x16_sse2: ; 16x16 SAD in four groups of four rows; r0/r1 = first pointer and stride, r2/r3 = third-argument pointer (loaded unaligned) and stride; result in retrd
- ;push ebx
- ;push edi
- ;push esi
- ;%define _STACK_SIZE 12
- ;mov eax, [esp+_STACK_SIZE+4 ]
- ;mov ebx, [esp+_STACK_SIZE+8 ]
- ;mov ecx, [esp+_STACK_SIZE+12]
- ;mov edx, [esp+_STACK_SIZE+16]
-%ifdef X86_32
- push r4
- push r5
-%endif
-
- %assign push_num 2
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- lea r4, [3*r1] ; row-3 offsets required by SSE2_GetSad4x16
- lea r5, [3*r3]
-
- pxor xmm7, xmm7 ; xmm7 = running SAD (two partial sums)
- SSE2_GetSad4x16
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE2_GetSad4x16
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE2_GetSad4x16
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE2_GetSad4x16
- movhlps xmm0, xmm7 ; fold the upper partial sum onto the lower
- paddw xmm0, xmm7
- movd retrd, xmm0
- LOAD_4_PARA_POP
-%ifdef X86_32
- pop r5
- pop r4
-%endif
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
-;First parameter can align to 16 bytes,
-;In wels, the third parameter can't align to 16 bytes.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x8_sse2
-align 16
-WelsSampleSad16x8_sse2: ; 16x8 SAD: two rows inline plus three SSE2_GetSad2x16 steps; r0/r1 and r2/r3 = the two pointers and strides; result in retrd
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov ecx, [esp+16]
- ;mov edx, [esp+20]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- movdqu xmm0, [r2] ; rows 0 and 1 start the accumulator in xmm0
- MOVDQ xmm2, [r0]
- psadbw xmm0, xmm2
- movdqu xmm1, [r2+r3]
- MOVDQ xmm2, [r0+r1]
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
-
- SSE2_GetSad2x16
- SSE2_GetSad2x16
- SSE2_GetSad2x16
-
- movhlps xmm1, xmm0 ; fold the upper partial sum onto the lower
- paddw xmm0, xmm1
- movd retrd, xmm0
- LOAD_4_PARA_POP
- ret
-
-
-
-WELS_EXTERN WelsSampleSad8x16_sse2
-WelsSampleSad8x16_sse2: ; 8x16 SAD as four 8x4 steps; r0/r1 and r2/r3 = the two pointers and strides; result in retrd
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov ecx, [esp+16]
- ;mov edx, [esp+20]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm6, xmm6 ; xmm6 = running SAD (two partial sums)
-
- SSE2_GetSad8x4
- lea r0, [r0+2*r1] ; the macro already advanced two rows; add the other two
- lea r2, [r2+2*r3]
- SSE2_GetSad8x4
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSad8x4
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSad8x4
-
- movhlps xmm0, xmm6 ; fold the upper partial sum onto the lower
- paddw xmm0, xmm6
- movd retrd, xmm0
- LOAD_4_PARA_POP
- ret
-
-
-%macro CACHE_SPLIT_CHECK 3 ; address (destroyed), width, cacheline; sets flags so that "greater" means the access crosses a cache line
-and %1, 0x1f|(%3>>1) ; keep the offset bits; for cacheline 64 the mask is 0x3f
-cmp %1, (32-%2)|(%3>>1) ; for width 8 and cacheline 64 this compares the offset with 56
-%endmacro
-
-WELS_EXTERN WelsSampleSad8x8_sse21
-WelsSampleSad8x8_sse21: ; 8x8 SAD; when 8-byte loads from the arg3 pointer would cross a 64-byte line, each row is rebuilt from two 8-byte-aligned loads; result in retrd
- ;mov ecx, [esp+12]
- ;mov edx, ecx
- ;CACHE_SPLIT_CHECK edx, 8, 64
- ;jle near .pixel_sad_8x8_nsplit
- ;push ebx
- ;push edi
- ;mov eax, [esp+12]
- ;mov ebx, [esp+16]
-
- %assign push_num 0
- mov r2, arg3
- push r2 ; CACHE_SPLIT_CHECK destroys r2, so keep a copy on the stack
- CACHE_SPLIT_CHECK r2, 8, 64
- jle near .pixel_sad_8x8_nsplit ; offset within the line <= 56: plain path
- pop r2
-%ifdef X86_32
- push r3
- push r4
- push r5
-%endif
- %assign push_num 3
- mov r0, arg1
- mov r1, arg2
- SIGN_EXTENTION r1, r1d
- pxor xmm7, xmm7 ; xmm7 = running SAD
-
- ;ecx r2, edx r4, edi r5
-
- mov r5, r2
- and r5, 0x07 ; r5 = misalignment in bytes
- sub r2, r5 ; r2 rounded down to an 8-byte boundary
- mov r4, 8
- sub r4, r5 ; r4 = 8 - misalignment
-
- shl r5, 3 ; byte counts -> bit counts
- shl r4, 3
- movd xmm5, r5d ; right-shift amount for the lower load
- movd xmm6, r4d ; left-shift amount for the upper load
- mov r5, 8
- add r5, r2 ; r5 = address of the following aligned 8 bytes
- mov r3, arg4
- SIGN_EXTENTION r3, r3d
- movq xmm0, [r0] ; rows 0 and 1 of the first block
- movhps xmm0, [r0+r1]
-
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5 ; drop the bytes before the wanted start
- psllq xmm2, xmm6 ; move the remaining bytes to the top
- por xmm1, xmm2 ; the 8 wanted bytes of each of the two rows
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
-
- movq xmm0, [r0] ; rows 2 and 3
- movhps xmm0, [r0+r1]
-
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
-
- movq xmm0, [r0] ; rows 4 and 5
- movhps xmm0, [r0+r1]
-
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
-
- movq xmm0, [r0] ; rows 6 and 7
- movhps xmm0, [r0+r1]
-
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- movhlps xmm0, xmm7 ; fold the upper partial sum onto the lower
- paddw xmm0, xmm7
- movd retrd, xmm0
-%ifdef X86_32
- pop r5
- pop r4
- pop r3
-%endif
- jmp .return
-
-.pixel_sad_8x8_nsplit:
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov edx, [esp+20]
-
- pop r2 ; discard the copy pushed before the check
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm6, xmm6 ; xmm6 = running SAD
- SSE2_GetSad8x4
- lea r0, [r0+2*r1] ; the macro already advanced two rows; add the other two
- lea r2, [r2+2*r3]
- SSE2_GetSad8x4
- movhlps xmm0, xmm6
- paddw xmm0, xmm6
- movd retrd, xmm0
- LOAD_4_PARA_POP
-.return:
- ret
-
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 END
-;
-;***********************************************************************
-
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-
-%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address: source rows k-1, k, k+1, reference row k, address of reference row k; params 1 and 4 are destroyed
- psadbw %1, %4 ; source row k-1 against reference row k
- paddw xmm5, %1 ; -> SAD for the reference shifted down one row
- psadbw %4, %3 ; reference row k against source row k+1
- paddw xmm4, %4 ; -> SAD for the reference shifted up one row
- movdqu %4, [%5-1]
- psadbw %4, %2
- paddw xmm6, %4 ; -> SAD for the reference shifted left one pixel
- movdqu %4, [%5+1]
- psadbw %4, %2
- paddw xmm7, %4 ; -> SAD for the reference shifted right one pixel
-%endmacro
-WELS_EXTERN WelsSampleSadFour16x16_sse2
-WelsSampleSadFour16x16_sse2: ; 16x16 SAD of the source at r0 (stride r1, aligned loads) against the reference at r2 (stride r3) shifted by -stride, +stride, -1 and +1; the four dword results are stored in that order to [r4]
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov ecx, [esp+16]
- ;mov edx, [esp+20]
-
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movdqa xmm0, [r0] ; source row 0
- sub r2, r3 ; r2 = reference row -1
- movdqu xmm3, [r2]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movdqa xmm1, [r0+r1] ; source row 1
- movdqu xmm3, [r2+r3] ; reference row 0
- psadbw xmm3, xmm1
- paddw xmm4, xmm3
-
- movdqu xmm2, [r2+r3-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
-
- movdqu xmm3, [r2+r3+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- lea r0, [r0+2*r1] ; from here on xmm0/xmm1/xmm2 rotate as previous/current/next source row
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
- movdqa xmm2, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm0, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
- movdqa xmm2, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm0, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0] ; source row 14
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1] ; source row 15
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r2, [r2+2*r3] ; r2 = reference row 15
- movdqu xmm3, [r2]
- psadbw xmm2, xmm3 ; source row 14 against reference row 15
- paddw xmm5, xmm2
-
- movdqu xmm2, [r2-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
-
- movdqu xmm3, [r2+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movdqu xmm3, [r2+r3] ; reference row 16
- psadbw xmm0, xmm3 ; source row 15 against reference row 16
- paddw xmm5, xmm0
-
- ;mov ecx, [esp+24]
- movhlps xmm0, xmm4 ; fold each accumulator's upper partial sum onto the lower
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5 ; dwords: -stride, +stride
- punpckldq xmm6, xmm7 ; dwords: -1, +1
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4 ; aligned store: [r4] must be 16-byte aligned
- LOAD_5_PARA_POP
- ret
-
-
-WELS_EXTERN WelsSampleSadFour16x8_sse2
-WelsSampleSadFour16x8_sse2: ; 16x8 SAD of the source at r0 (stride r1, aligned loads) against the reference at r2 (stride r3) shifted by -stride, +stride, -1 and +1; the four dword results are stored in that order to [r4]
- ;push ebx
- ;push edi
- ;mov eax, [esp+12]
- ;mov ebx, [esp+16]
- ;mov edi, [esp+20]
- ;mov edx, [esp+24]
-
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movdqa xmm0, [r0] ; source row 0
- sub r2, r3 ; r2 = reference row -1
- movdqu xmm3, [r2]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movdqa xmm1, [r0+r1] ; source row 1
- movdqu xmm3, [r2+r3] ; reference row 0
- psadbw xmm3, xmm1
- paddw xmm4, xmm3
-
- movdqu xmm2, [r2+r3-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
-
- movdqu xmm3, [r2+r3+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- lea r0, [r0+2*r1] ; from here on xmm0/xmm1/xmm2 rotate as previous/current/next source row
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
- movdqa xmm2, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm0, [r0] ; source row 6
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
- movdqa xmm1, [r0+r1] ; source row 7
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
- lea r2, [r2+2*r3] ; r2 = reference row 7
- movdqu xmm3, [r2]
- psadbw xmm0, xmm3 ; source row 6 against reference row 7
- paddw xmm5, xmm0
-
- movdqu xmm0, [r2-1]
- psadbw xmm0, xmm1
- paddw xmm6, xmm0
-
- movdqu xmm3, [r2+1]
- psadbw xmm3, xmm1
- paddw xmm7, xmm3
-
- movdqu xmm3, [r2+r3] ; reference row 8
- psadbw xmm1, xmm3 ; source row 7 against reference row 8
- paddw xmm5, xmm1
-
- ;mov edi, [esp+28]
- movhlps xmm0, xmm4 ; fold each accumulator's upper partial sum onto the lower
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5 ; dwords: -stride, +stride
- punpckldq xmm6, xmm7 ; dwords: -1, +1
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4 ; aligned store: [r4] must be 16-byte aligned
- LOAD_5_PARA_POP
- ret
-
-WELS_EXTERN WelsSampleSadFour8x16_sse2
-WelsSampleSadFour8x16_sse2: ; 8x16 SAD of the source at r0 (stride r1) against the reference at r2 (stride r3) shifted by -stride, +stride, -1 and +1; two rows are packed per xmm register; the four dword results are stored in that order to [r4]
- ;push ebx
- ;push edi
- ;mov eax, [esp+12]
- ;mov ebx, [esp+16]
- ;mov edi, [esp+20]
- ;mov edx, [esp+24]
-
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movq xmm0, [r0] ; source rows 0 and 1
- movhps xmm0, [r0+r1]
- sub r2, r3 ; r2 = reference row -1
- movq xmm3, [r2] ; reference rows -1 and 0
- movhps xmm3, [r2+r3]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1] ; reference row 0, shifted left / right
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3] ; r2 = reference row 1
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2] ; reference rows 1 and 2
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3 ; against source rows 0 and 1 (+stride)
- paddw xmm5, xmm0
-
- movq xmm0, [r0] ; source rows 2 and 3; xmm3 still holds reference rows 1 and 2 (-stride)
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0] ; source rows 4 and 5
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0] ; source rows 6 and 7
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0] ; source rows 8 and 9
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0] ; source rows 10 and 11
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0] ; source rows 12 and 13
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0] ; source rows 14 and 15
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2] ; reference rows 15 and 16
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- ;mov edi, [esp+28]
- movhlps xmm0, xmm4 ; fold each accumulator's upper partial sum onto the lower
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5 ; dwords: -stride, +stride
- punpckldq xmm6, xmm7 ; dwords: -1, +1
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4 ; aligned store: [r4] must be 16-byte aligned
- LOAD_5_PARA_POP
- ret
-
-
-WELS_EXTERN WelsSampleSadFour8x8_sse2
-WelsSampleSadFour8x8_sse2: ; 8x8 SAD of the source at r0 (stride r1) against the reference at r2 (stride r3) shifted by -stride, +stride, -1 and +1; two rows are packed per xmm register; the four dword results are stored in that order to [r4]
- ;push ebx
- ;push edi
- ;mov eax, [esp+12]
- ;mov ebx, [esp+16]
- ;mov edi, [esp+20]
- ;mov edx, [esp+24]
-
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movq xmm0, [r0] ; source rows 0 and 1
- movhps xmm0, [r0+r1]
- sub r2, r3 ; r2 = reference row -1
- movq xmm3, [r2] ; reference rows -1 and 0
- movhps xmm3, [r2+r3]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1] ; reference row 0, shifted left / right
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3] ; r2 = reference row 1
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2] ; reference rows 1 and 2
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3 ; against source rows 0 and 1 (+stride)
- paddw xmm5, xmm0
-
- movq xmm0, [r0] ; source rows 2 and 3; xmm3 still holds reference rows 1 and 2 (-stride)
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0] ; source rows 4 and 5
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0] ; source rows 6 and 7
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2] ; reference rows 7 and 8
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- ;mov edi, [esp+28]
- movhlps xmm0, xmm4 ; fold each accumulator's upper partial sum onto the lower
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5 ; dwords: -stride, +stride
- punpckldq xmm6, xmm7 ; dwords: -1, +1
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4 ; aligned store: [r4] must be 16-byte aligned
- LOAD_5_PARA_POP
- ret
-
-WELS_EXTERN WelsSampleSadFour4x4_sse2
-WelsSampleSadFour4x4_sse2:
- ;push ebx
- ;push edi
- ;mov eax, [esp+12]
- ;mov ebx, [esp+16]
- ;mov edi, [esp+20]
- ;mov edx, [esp+24]
-
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- movd xmm0, [r0]
- movd xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movd xmm2, [r0]
- movd xmm3, [r0+r1]
- punpckldq xmm0, xmm1
- punpckldq xmm2, xmm3
- punpcklqdq xmm0, xmm2
- sub r2, r3
- movd xmm1, [r2]
- movd xmm2, [r2+r3]
- punpckldq xmm1, xmm2
- movd xmm2, [r2+r3-1]
- movd xmm3, [r2+r3+1]
-
- lea r2, [r2+2*r3]
-
- movd xmm4, [r2]
- movd xmm5, [r2-1]
- punpckldq xmm2, xmm5
- movd xmm5, [r2+1]
- punpckldq xmm3, xmm5
-
- movd xmm5, [r2+r3]
- punpckldq xmm4, xmm5
-
- punpcklqdq xmm1, xmm4 ;-L
-
- movd xmm5, [r2+r3-1]
- movd xmm6, [r2+r3+1]
-
- lea r2, [r2+2*r3]
- movd xmm7, [r2-1]
- punpckldq xmm5, xmm7
- punpcklqdq xmm2, xmm5 ;-1
- movd xmm7, [r2+1]
- punpckldq xmm6, xmm7
- punpcklqdq xmm3, xmm6 ;+1
- movd xmm6, [r2]
- movd xmm7, [r2+r3]
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6 ;+L
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
- psadbw xmm4, xmm0
-
- movhlps xmm0, xmm1
- paddw xmm1, xmm0
- movhlps xmm0, xmm2
- paddw xmm2, xmm0
- movhlps xmm0, xmm3
- paddw xmm3, xmm0
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- ;mov edi, [esp+28]
- punpckldq xmm1, xmm4
- punpckldq xmm2, xmm3
- punpcklqdq xmm1, xmm2
- movdqa [r4],xmm1
- LOAD_5_PARA_POP
- ret
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 END
-;
-;***********************************************************************
-
-WELS_EXTERN WelsSampleSad4x4_mmx
-
-align 16
-;***********************************************************************
-; int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
-;***********************************************************************
-WelsSampleSad4x4_mmx:
- ;push ebx
- ;%define pushsize 4
- ;%define pix1address esp+pushsize+4
- ;%define pix1stride esp+pushsize+8
- ;%define pix2address esp+pushsize+12
- ;%define pix2stride esp+pushsize+16
- ;mov eax, [pix1address]
- ;mov ebx, [pix1stride ]
- ;mov ecx, [pix2address]
- ;mov edx, [pix2stride ]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- movd mm0, [r0]
- movd mm1, [r0+r1]
- punpckldq mm0, mm1
-
- movd mm3, [r2]
- movd mm4, [r2+r3]
- punpckldq mm3, mm4
- psadbw mm0, mm3
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
-
- movd mm1, [r0]
- movd mm2, [r0+r1]
- punpckldq mm1, mm2
-
- movd mm3, [r2]
- movd mm4, [r2+r3]
- punpckldq mm3, mm4
- psadbw mm1, mm3
- paddw mm0, mm1
-
- movd retrd, mm0
-
- WELSEMMS
- LOAD_4_PARA_POP
- ret
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -41,7 +41,6 @@
$(ENCODER_SRCDIR)/./core/asm/intra_pred.asm\
$(ENCODER_SRCDIR)/./core/asm/memzero.asm\
$(ENCODER_SRCDIR)/./core/asm/quant.asm\
- $(ENCODER_SRCDIR)/./core/asm/satd_sad.asm\
$(ENCODER_SRCDIR)/./core/asm/score.asm\
ENCODER_OBJS += $(ENCODER_ASM_SRCS:.asm=.o)
--- a/codec/processing/build/win32/WelsVP_2008.vcproj
+++ b/codec/processing/build/win32/WelsVP_2008.vcproj
@@ -594,47 +594,7 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\src\asm\intra_pred.asm"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|x64"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|x64"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\src\asm\sad.asm"
+ RelativePath="..\..\..\common\satd_sad.asm"
>
<FileConfiguration
Name="Debug|Win32"
--- a/codec/processing/src/asm/intra_pred.asm
+++ /dev/null
@@ -1,1505 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* intra_pred.asm
-;*
-;* Abstract
-;* sse2 function for intra predict operations
-;*
-;* History
-;* 18/09/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-align 16
-sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
-align 16
-sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8
-align 16
-sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1
-
-; for chroma plane mode
-sse2_plane_inc_c dw 1, 2, 3, 4
-sse2_plane_dec_c dw 4, 3, 2, 1
-align 16
-sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4
-
-align 16
-mmx_01bytes: times 16 db 1
-;align 16
-;sse_0x0004bytes: times 8 dw 4
-;ALIGN 16
-;sse_f000 db 255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-
-align 16
-mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
-
-
-;***********************************************************************
-; macros
-;***********************************************************************
-;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
-;%1 will keep the last result
-%macro SSE_DB_1_2REG 2
- pxor %1, %1
- pcmpeqw %2, %2
- psubb %1, %2
-%endmacro
-
-;xmm0, xmm1, xmm2, eax, ecx
-;lower 64 bits of xmm0 save the result
-%macro SSE2_PRED_H_4X4_TWO_LINE 5
- movd %1, [%4-1]
- movdqa %3, %1
- punpcklbw %1, %3
- movdqa %3, %1
- punpcklbw %1, %3
-
- ;add %4, %5
- movd %2, [%4+%5-1]
- movdqa %3, %2
- punpcklbw %2, %3
- movdqa %3, %2
- punpcklbw %2, %3
- punpckldq %1, %2
-%endmacro
-
-%macro SUMW_HORIZON1 2
- movdqa %2, %1
- psrldq %2, 8
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 4
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 2
- paddusw %1, %2
-%endmacro
-
-%macro LOAD_COLUMN 6
- movd %1, [%5]
- movd %2, [%5+%6]
- punpcklbw %1, %2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- punpcklbw %3, %2
- punpcklwd %1, %3
- lea %5, [%5+2*%6]
- movd %4, [%5]
- movd %2, [%5+%6]
- punpcklbw %4, %2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- lea %5, [%5+2*%6]
- punpcklbw %3, %2
- punpcklwd %4, %3
- punpckhdq %1, %4
-%endmacro
-
-%macro SUMW_HORIZON 3
- movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
- paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
- paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
- pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
- paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
-%endmacro
-
-
-%macro COPY_16_TIMES 2
- movdqa %2, [%1-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
-%endmacro
-
-%macro COPY_16_TIMESS 3
- movdqa %2, [%1+%3-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
-%endmacro
-
-%macro LOAD_COLUMN_C 6
- movd %1, [%5]
- movd %2, [%5+%6]
- punpcklbw %1,%2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- punpcklbw %3, %2
- punpckhwd %1, %3
- lea %5, [%5+2*%6]
-%endmacro
-
-%macro LOAD_2_LEFT_AND_ADD 0
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01]
- add r3, r4
- movzx r4, byte [r1+r2-0x01]
- add r3, r4
-%endmacro
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-WELS_EXTERN WelsI4x4LumaPredH_sse2
-WELS_EXTERN WelsI4x4LumaPredDDR_mmx
-WELS_EXTERN WelsI4x4LumaPredDc_sse2
-WELS_EXTERN WelsI16x16LumaPredPlane_sse2
-
-ALIGN 16
-;***********************************************************************
-; void __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;
-; pred must align to 16
-;***********************************************************************
-WelsI4x4LumaPredH_sse2:
- push r3
- %assign push_num 1
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- movzx r3, byte [r1-1]
- movd xmm0, r3d
- pmuludq xmm0, [mmx_01bytes]
-
- movzx r3, byte [r1+r2-1]
- movd xmm1, r3d
- pmuludq xmm1, [mmx_01bytes]
-
- unpcklps xmm0, xmm1
-
- lea r1, [r1+r2*2]
- movzx r3, byte [r1-1]
- movd xmm2, r3d
- pmuludq xmm2, [mmx_01bytes]
-
- movzx r3, byte [r1+r2-1]
- movd xmm3, r3d
- pmuludq xmm3, [mmx_01bytes]
-
- unpcklps xmm2, xmm3
- unpcklpd xmm0, xmm2
-
- movdqa [r0], xmm0
- pop r3
- ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WelsI16x16LumaPredPlane_sse2:
- ;%define pushsize 4
- ;push esi
- ;mov esi, [esp + pushsize + 8]
- ;mov ecx, [esp + pushsize + 12]
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- sub r1, 1
- sub r1, r2
-
- ;for H
- pxor xmm7, xmm7
- movq xmm0, [r1]
- movdqa xmm5, [sse2_plane_dec]
- punpcklbw xmm0, xmm7
- pmullw xmm0, xmm5
- movq xmm1, [r1 + 9]
- movdqa xmm6, [sse2_plane_inc]
- punpcklbw xmm1, xmm7
- pmullw xmm1, xmm6
- psubw xmm1, xmm0
-
- SUMW_HORIZON xmm1,xmm0,xmm2
- movd r3d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
- movsx r3, r3w
- imul r3, 5
- add r3, 32
- sar r3, 6 ; b = (5 * H + 32) >> 6;
- SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b
-
- movzx r4, BYTE [r1+16]
- sub r1, 3
- LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r1, r2
-
- add r1, 3
- movzx r3, BYTE [r1+8*r2]
- add r4, r3
- shl r4, 4 ; a = (left[15*stride] + top[15]) << 4;
-
- sub r1, 3
- add r1, r2
- LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r1, r2
- pxor xmm4, xmm4
- punpckhbw xmm0, xmm4
- pmullw xmm0, xmm5
- punpckhbw xmm7, xmm4
- pmullw xmm7, xmm6
- psubw xmm7, xmm0
-
- SUMW_HORIZON xmm7,xmm0,xmm2
- movd r3d, xmm7 ; V
- movsx r3, r3w
- imul r3, 5
- add r3, 32
- sar r3, 6 ; c = (5 * V + 32) >> 6;
- SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c
-
- ;mov esi, [esp + pushsize + 4]
- add r4, 16
- imul r3, -7
- add r3, r4 ; s = a + 16 + (-7)*c
- SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
-
- xor r3, r3
- movdqa xmm5, [sse2_plane_inc_minus]
-
-get_i16x16_luma_pred_plane_sse2_1:
- movdqa xmm2, xmm1
- pmullw xmm2, xmm5
- paddw xmm2, xmm0
- psraw xmm2, 5
- movdqa xmm3, xmm1
- pmullw xmm3, xmm6
- paddw xmm3, xmm0
- psraw xmm3, 5
- packuswb xmm2, xmm3
- movdqa [r0], xmm2
- paddw xmm0, xmm4
- add r0, 16
- inc r3
- cmp r3, 16
- jnz get_i16x16_luma_pred_plane_sse2_1
- pop r4
- pop r3
- ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-
-%macro SSE2_PRED_H_16X16_ONE_LINE 0
- add r0, 16
- add r1, r2
- movzx r3, byte [r1]
- SSE2_Copy16Times xmm0, r3d
- movdqa [r0], xmm0
-%endmacro
-
-WELS_EXTERN WelsI16x16LumaPredH_sse2
-WelsI16x16LumaPredH_sse2:
- push r3
- %assign push_num 1
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- dec r1
- movzx r3, byte [r1]
- SSE2_Copy16Times xmm0, r3d
- movdqa [r0], xmm0
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- pop r3
- ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredV_sse2
-WelsI16x16LumaPredV_sse2:
- ;mov edx, [esp+4] ; pred
- ;mov eax, [esp+8] ; pRef
- ;mov ecx, [esp+12] ; stride
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- sub r1, r2
- movdqa xmm0, [r1]
-
- movdqa [r0], xmm0
- movdqa [r0+10h], xmm0
- movdqa [r0+20h], xmm0
- movdqa [r0+30h], xmm0
- movdqa [r0+40h], xmm0
- movdqa [r0+50h], xmm0
- movdqa [r0+60h], xmm0
- movdqa [r0+70h], xmm0
- movdqa [r0+80h], xmm0
- movdqa [r0+90h], xmm0
- movdqa [r0+160], xmm0
- movdqa [r0+176], xmm0
- movdqa [r0+192], xmm0
- movdqa [r0+208], xmm0
- movdqa [r0+224], xmm0
- movdqa [r0+240], xmm0
-
- ret
-
-;***********************************************************************
-; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsIChromaPredPlane_sse2
-WelsIChromaPredPlane_sse2:
- ;%define pushsize 4
- ;push esi
- ;mov esi, [esp + pushsize + 8] ;pRef
- ;mov ecx, [esp + pushsize + 12] ;stride
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- sub r1, 1
- sub r1, r2
-
- pxor mm7, mm7
- movq mm0, [r1]
- movq mm5, [sse2_plane_dec_c]
- punpcklbw mm0, mm7
- pmullw mm0, mm5
- movq mm1, [r1 + 5]
- movq mm6, [sse2_plane_inc_c]
- punpcklbw mm1, mm7
- pmullw mm1, mm6
- psubw mm1, mm0
-
- movq2dq xmm1, mm1
- pxor xmm2, xmm2
- SUMW_HORIZON xmm1,xmm0,xmm2
- movd r3d, xmm1
- movsx r3, r3w
- imul r3, 17
- add r3, 16
- sar r3, 5 ; b = (17 * H + 16) >> 5;
- SSE2_Copy8Times xmm1, r3d ; mm1 = b,b,b,b,b,b,b,b
-
- movzx r3, BYTE [r1+8]
- sub r1, 3
- LOAD_COLUMN_C mm0, mm2, mm3, mm4, r1, r2
-
- add r1, 3
- movzx r4, BYTE [r1+4*r2]
- add r4, r3
- shl r4, 4 ; a = (left[7*stride] + top[7]) << 4;
-
- sub r1, 3
- add r1, r2
- LOAD_COLUMN_C mm7, mm2, mm3, mm4, r1, r2
- pxor mm4, mm4
- punpckhbw mm0, mm4
- pmullw mm0, mm5
- punpckhbw mm7, mm4
- pmullw mm7, mm6
- psubw mm7, mm0
-
- movq2dq xmm7, mm7
- pxor xmm2, xmm2
- SUMW_HORIZON xmm7,xmm0,xmm2
- movd r3d, xmm7 ; V
- movsx r3, r3w
- imul r3, 17
- add r3, 16
- sar r3, 5 ; c = (17 * V + 16) >> 5;
- SSE2_Copy8Times xmm4, r3d ; mm4 = c,c,c,c,c,c,c,c
-
- ;mov esi, [esp + pushsize + 4]
- add r4, 16
- imul r3, -3
- add r3, r4 ; s = a + 16 + (-3)*c
- SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
-
- xor r3, r3
- movdqa xmm5, [sse2_plane_mul_b_c]
-
-get_i_chroma_pred_plane_sse2_1:
- movdqa xmm2, xmm1
- pmullw xmm2, xmm5
- paddw xmm2, xmm0
- psraw xmm2, 5
- packuswb xmm2, xmm2
- movq [r0], xmm2
- paddw xmm0, xmm4
- add r0, 8
- inc r3
- cmp r3, 8
- jnz get_i_chroma_pred_plane_sse2_1
- pop r4
- pop r3
- WELSEMMS
- ret
-
-ALIGN 16
-;***********************************************************************
-; 0 |1 |2 |3 |4 |
-; 6 |7 |8 |9 |10|
-; 11|12|13|14|15|
-; 16|17|18|19|20|
-; 21|22|23|24|25|
-; 7 is the start pixel of current 4x4 block
-; pred[7] = ([6]+[0]*2+[1]+2)/4
-;
-; void __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;
-;***********************************************************************
-WelsI4x4LumaPredDDR_mmx:
- ;mov edx,[esp+4] ;pred
- ;mov eax,[esp+8] ;pRef
- ;mov ecx,[esp+12] ;stride
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- movq mm1,[r1+r2-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
- movq mm2,[r1-8] ;get value of 6 mm2[8] = 6
- sub r1, r2 ;mov eax to above line of current block(postion of 1)
- punpckhbw mm2,[r1-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
- movd mm3,[r1] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
- punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
- psllq mm3,18h ;mm3[5]=[1]
- psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
- movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- lea r1,[r1+r2*2-8h] ;set eax point to 12
- movq mm4,[r1+r2] ;get value of 16, mm4[8]=[16]
- psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
- psrlq mm4,38h ;mm4[1]=[16]
- por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
- movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
- movq mm4,[r1+r2*2] ;mm4[8]=[21]
- psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
- psrlq mm4,38h ;mm4[1]=[21]
- por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
- movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
- pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
- pxor mm1,mm4 ;find odd value in the lowest bit of each byte
- pand mm1,[mmx_01bytes] ;set the odd bit
- psubusb mm3,mm1 ;decrease 1 from odd bytes
- pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
-
- movd [r0+12],mm2
- psrlq mm2,8
- movd [r0+8],mm2
- psrlq mm2,8
- movd [r0+4],mm2
- psrlq mm2,8
- movd [r0],mm2
- WELSEMMS
- ret
-
-ALIGN 16
-;***********************************************************************
-; 0 |1 |2 |3 |4 |
-; 5 |6 |7 |8 |9 |
-; 10|11|12|13|14|
-; 15|16|17|18|19|
-; 20|21|22|23|24|
-; 6 is the start pixel of current 4x4 block
-; pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
-;
-; void __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;
-;***********************************************************************
-WelsI4x4LumaPredDc_sse2:
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- movzx r4, byte [r1-1h]
- sub r1, r2
- movd xmm0, [r1]
- pxor xmm1, xmm1
- psadbw xmm0, xmm1
- xor r3, r3
- movd r3d, xmm0
- add r3, r4
- movzx r4, byte [r1+r2*2-1h]
- add r3, r4
-
- lea r1, [r1+r2*2-1]
- movzx r4, byte [r1+r2]
- add r3, r4
-
- movzx r4, byte [r1+r2*2]
- add r3, r4
- add r3, 4
- sar r3, 3
- imul r3, 0x01010101
-
- movd xmm0, r3d
- pshufd xmm0, xmm0, 0
- movdqa [r0], xmm0
- pop r4
- pop r3
- ret
-
-ALIGN 16
-;***********************************************************************
-; void __cdecl WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
-; copy 8 pixel of 8 line from left
-;***********************************************************************
-%macro MMX_PRED_H_8X8_ONE_LINE 4
- movq %1, [%3-8]
- psrlq %1, 38h
-
- ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
- pmullw %1, [mmx_01bytes]
- pshufw %1, %1, 0
- movq [%4], %1
-%endmacro
-
-%macro MMX_PRED_H_8X8_ONE_LINEE 4
- movq %1, [%3+r2-8]
- psrlq %1, 38h
-
- ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
- pmullw %1, [mmx_01bytes]
- pshufw %1, %1, 0
- movq [%4], %1
-%endmacro
-
-WELS_EXTERN WelsIChromaPredH_mmx
-WelsIChromaPredH_mmx:
- ;mov edx, [esp+4] ;pred
- ;mov eax, [esp+8] ;pRef
- ;mov ecx, [esp+12] ;stride
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- movq mm0, [r1-8]
- psrlq mm0, 38h
-
- ;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes
- pmullw mm0, [mmx_01bytes]
- pshufw mm0, mm0, 0
- movq [r0], mm0
-
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+8
-
- lea r1,[r1+r2*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+16
-
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+24
-
- lea r1,[r1+r2*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+32
-
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+40
-
- lea r1,[r1+r2*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48
-
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+56
- WELSEMMS
- ret
-
-ALIGN 16
-;***********************************************************************
-; void __cdecl WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-; copy pixels from top 4 pixels
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredV_sse2
-WelsI4x4LumaPredV_sse2:
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- sub r1, r2
- movd xmm0, [r1]
- pshufd xmm0, xmm0, 0
- movdqa [r0], xmm0
- ret
-
-ALIGN 16
-;***********************************************************************
-; void __cdecl WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-; copy 8 pixels from top 8 pixels
-;***********************************************************************
-WELS_EXTERN WelsIChromaPredV_sse2
-WelsIChromaPredV_sse2:
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- sub r1, r2
- movq xmm0, [r1]
- movdqa xmm1, xmm0
- punpcklqdq xmm0, xmm1
- movdqa [r0], xmm0
- movdqa [r0+16], xmm0
- movdqa [r0+32], xmm0
- movdqa [r0+48], xmm0
- ret
-
- ALIGN 16
-;***********************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; t3 will never been used
-; destination:
-; |a |b |c |d |
-; |e |f |a |b |
-; |g |h |e |f |
-; |i |j |g |h |
-
-; a = (1 + lt + l0)>>1
-; e = (1 + l0 + l1)>>1
-; g = (1 + l1 + l2)>>1
-; i = (1 + l2 + l3)>>1
-
-; d = (2 + t0 + (t1<<1) + t2)>>2
-; c = (2 + lt + (t0<<1) + t1)>>2
-; b = (2 + l0 + (lt<<1) + t0)>>2
-
-; f = (2 + l1 + (l0<<1) + lt)>>2
-; h = (2 + l2 + (l1<<1) + l0)>>2
-; j = (2 + l3 + (l2<<1) + l1)>>2
-; [b a f e h g j i] + [d c b a] --> mov to memory
-;
-; void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredHD_mmx
-WelsI4x4LumaPredHD_mmx:
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- sub r1, r2
- movd mm0, [r1-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
- psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-
- movd mm1, [r1+2*r2-4]
- punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
- lea r1, [r1+2*r2]
- movd mm2, [r1+2*r2-4]
- punpcklbw mm2, [r1+r2-4] ; mm2[7] = l2, mm2[6] = l3
- punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
- psrlq mm2, 20h
- pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-
- movq mm1, mm0
- psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
- movq mm2, mm0
- psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
- movq mm3, mm2
- movq mm4, mm1
- pavgb mm1, mm0
-
- pxor mm4, mm0 ; find odd value in the lowest bit of each byte
- pand mm4, [mmx_01bytes] ; set the odd bit
- psubusb mm1, mm4 ; decrease 1 from odd bytes
-
- pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
-
- movq mm4, mm0
- pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
- punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
-
- psrlq mm2, 20h
- psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
- movq mm4, mm3
- psrlq mm4, 10h ; mm4 = [0 0 b a f e h j]
- pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
- psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
-
- movd [r0], mm2
- movd [r0+12], mm3
- psrlq mm3, 10h
- movd [r0+8], mm3
- psrlq mm3, 10h
- movd [r0+4], mm3
- WELSEMMS
- ret
-
-ALIGN 16
-;***********************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; t3 will never been used
-; destination:
-; |a |b |c |d |
-; |c |d |e |f |
-; |e |f |g |g |
-; |g |g |g |g |
-
-; a = (1 + l0 + l1)>>1
-; c = (1 + l1 + l2)>>1
-; e = (1 + l2 + l3)>>1
-; g = l3
-
-; b = (2 + l0 + (l1<<1) + l2)>>2
-; d = (2 + l1 + (l2<<1) + l3)>>2
-; f = (2 + l2 + (l3<<1) + l3)>>2
-
-; [g g f e d c b a] + [g g g g] --> mov to memory
-;
-; void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredHU_mmx
-WelsI4x4LumaPredHU_mmx:
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- movd mm0, [r1-4] ; mm0[3] = l0
- punpcklbw mm0, [r1+r2-4] ; mm0[7] = l1, mm0[6] = l0
- lea r1, [r1+2*r2]
- movd mm2, [r1-4] ; mm2[3] = l2
- movd mm4, [r1+r2-4] ; mm4[3] = l3
- punpcklbw mm2, mm4
- punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-
- psrlq mm4, 18h
- psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
- psrlq mm0, 8h
- pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-
- movq mm1, mm0
- psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
- movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
- pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
-
- movq mm2, mm0
- psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
- movq mm5, mm2
- pavgb mm2, mm0
-
- pxor mm5, mm0 ; find odd value in the lowest bit of each byte
- pand mm5, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm5 ; decrease 1 from odd bytes
-
- pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
-
- psrlq mm2, 8h
- pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
-
- punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
- punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
- punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
-
- psrlq mm4, 20h
- movd [r0+12], mm4
-
- movd [r0], mm1
- psrlq mm1, 10h
- movd [r0+4], mm1
- psrlq mm1, 10h
- movd [r0+8], mm1
- WELSEMMS
- ret
-
-
-
-ALIGN 16
-;***********************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; l3 will never been used
-; destination:
-; |a |b |c |d |
-; |e |f |g |h |
-; |i |a |b |c |
-; |j |e |f |g |
-
-; a = (1 + lt + t0)>>1
-; b = (1 + t0 + t1)>>1
-; c = (1 + t1 + t2)>>1
-; d = (1 + t2 + t3)>>1
-
-; e = (2 + l0 + (lt<<1) + t0)>>2
-; f = (2 + lt + (t0<<1) + t1)>>2
-; g = (2 + t0 + (t1<<1) + t2)>>2
-
-; h = (2 + t1 + (t2<<1) + t3)>>2
-; i = (2 + lt + (l0<<1) + l1)>>2
-; j = (2 + l0 + (l1<<1) + l2)>>2
-;
-; void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredVR_mmx
-WelsI4x4LumaPredVR_mmx:
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- sub r1, r2
- movq mm0, [r1-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
- psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-
- movd mm1, [r1+2*r2-4]
- punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
- lea r1, [r1+2*r2]
- movq mm2, [r1+r2-8] ; mm2[7] = l2
- punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
- psrlq mm2, 28h
- pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-
- movq mm1, mm0
- psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
- pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
-
- movq mm2, mm0
- psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
- movq mm3, mm2
- pavgb mm2, mm0
-
- pxor mm3, mm0 ; find odd value in the lowest bit of each byte
- pand mm3, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm3 ; decrease 1 from odd bytes
-
- movq mm3, mm0
- psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
- pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
- movq mm2, mm3
-
- psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
- movd [r0], mm1
-
- psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
- movd [r0+4], mm2
-
- movq mm4, mm3
- psllq mm4, 20h
- psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
-
- movq mm5, mm3
- psllq mm5, 28h
- psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
-
- psllq mm1, 8h
- pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
- movd [r0+8], mm4
-
- psllq mm2, 8h
- pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
- movd [r0+12], mm5
- WELSEMMS
- ret
-
-ALIGN 16
-;***********************************************************************
-; lt|t0|t1|t2|t3|t4|t5|t6|t7
-; l0|
-; l1|
-; l2|
-; l3|
-; lt,l0,l1,l2,l3 are never used; only the top row t0..t7 is read
-; destination:
-; |a |b |c |d |
-; |b |c |d |e |
-; |c |d |e |f |
-; |d |e |f |g |
-
-; a = (2 + t0 + t2 + (t1<<1))>>2
-; b = (2 + t1 + t3 + (t2<<1))>>2
-; c = (2 + t2 + t4 + (t3<<1))>>2
-; d = (2 + t3 + t5 + (t4<<1))>>2
-
-; e = (2 + t4 + t6 + (t5<<1))>>2
-; f = (2 + t5 + t7 + (t6<<1))>>2
-; g = (2 + t6 + t7 + (t7<<1))>>2
-
-; [g f e d c b a] --> mov to memory
-; The four 4-byte rows are written back to back at pred+0, +4, +8 and +12.
-; void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-WelsI4x4LumaPredDDL_mmx:
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d ; sign-extend the 32-bit stride
- %endif
- sub r1, r2 ; r1 = pRef - stride, i.e. the row above the block
- movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
- movq mm1, mm0
- movq mm2, mm0
-
- movq mm3, mm0
- psrlq mm3, 38h
- psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
-
- psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
- psrlq mm2, 8h
- pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
-
- movq mm3, mm1
- pavgb mm1, mm2 ; rounded-up average of the two outer neighbours of each byte
- pxor mm3, mm2 ; find odd value in the lowest bit of each byte
- pand mm3, [mmx_01bytes] ; set the odd bit
- psubusb mm1, mm3 ; decrease 1 from odd bytes
-
- pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
-
- psrlq mm0, 8h ; low dword = [d c b a]
- movd [r0], mm0 ; row 0
- psrlq mm0, 8h ; low dword = [e d c b]
- movd [r0+4], mm0 ; row 1
- psrlq mm0, 8h ; low dword = [f e d c]
- movd [r0+8], mm0 ; row 2
- psrlq mm0, 8h ; low dword = [g f e d]
- movd [r0+12], mm0 ; row 3
- WELSEMMS
- ret
-
-
-ALIGN 16
-;***********************************************************************
-; lt|t0|t1|t2|t3|t4|t5|t6|t7
-; l0|
-; l1|
-; l2|
-; l3|
-; lt,l0,l1,l2,l3 are never used; the formulas below need only t0..t6
-; destination:
-; |a |b |c |d |
-; |e |f |g |h |
-; |b |c |d |i |
-; |f |g |h |j |
-
-; a = (1 + t0 + t1)>>1
-; b = (1 + t1 + t2)>>1
-; c = (1 + t2 + t3)>>1
-; d = (1 + t3 + t4)>>1
-; i = (1 + t4 + t5)>>1
-
-; e = (2 + t0 + (t1<<1) + t2)>>2
-; f = (2 + t1 + (t2<<1) + t3)>>2
-; g = (2 + t2 + (t3<<1) + t4)>>2
-; h = (2 + t3 + (t4<<1) + t5)>>2
-; j = (2 + t4 + (t5<<1) + t6)>>2
-
-; [i d c b a] + [j h g f e] --> mov to memory
-; The four 4-byte rows are written back to back at pred+0, +4, +8 and +12.
-; void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredVL_mmx
-WelsI4x4LumaPredVL_mmx:
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d ; sign-extend the 32-bit stride
- %endif
- sub r1, r2 ; r1 = pRef - stride, i.e. the row above the block
- movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
- movq mm1, mm0
- movq mm2, mm0
-
- psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
- psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
-
- movq mm3, mm1
- pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
-
- movq mm4, mm2
- pavgb mm2, mm0 ; rounded-up average of t(n) and t(n+2)
- pxor mm4, mm0 ; find odd value in the lowest bit of each byte
- pand mm4, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm4 ; decrease 1 from odd bytes
-
- pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
-
- movd [r0], mm3 ; row 0 = a b c d
- psrlq mm3, 8h
- movd [r0+8], mm3 ; row 2 = b c d i
-
- movd [r0+4], mm2 ; row 1 = e f g h
- psrlq mm2, 8h
- movd [r0+12], mm2 ; row 3 = f g h j
- WELSEMMS
- ret
-
-ALIGN 16
-;***********************************************************************
-; Fills pred (8 rows of 8 bytes, 64 bytes total) with four 4x4 DC values built from the 8 top and 8 left pixels.
-; void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsIChromaPredDc_sse2
-WelsIChromaPredDc_sse2:
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d ; sign-extend the 32-bit stride
- %endif
- sub r1, r2 ; r1 = pRef - stride, i.e. the row above the block
- movq mm0, [r1] ; mm0 = the eight top pixels
-
- movzx r3, byte [r1+r2-0x01] ; l1
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l2
- add r3, r4
- movzx r4, byte [r1+r2-0x01] ; l3
- add r3, r4
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l4
- add r3, r4
- movd mm1, r3d ; mm1 = l1+l2+l3+l4
-
- movzx r3, byte [r1+r2-0x01] ; l5
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l6
- add r3, r4
- movzx r4, byte [r1+r2-0x01] ; l7
- add r3, r4
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l8
- add r3, r4
- movd mm2, r3d ; mm2 = l5+l6+l7+l8
-
- movq mm3, mm0
- psrlq mm0, 0x20 ; mm0 = upper four top pixels
- psllq mm3, 0x20
- psrlq mm3, 0x20 ; mm3 = lower four top pixels
- pxor mm4, mm4
- psadbw mm0, mm4 ; sum of the upper four top pixels
- psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
-
- paddq mm3, mm1
- movq mm1, mm2
- paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-
- movq mm4, [mmx_0x02] ; rounding term; presumably the value 2 (constant defined outside this view)
-
- paddq mm0, mm4
- psrlq mm0, 0x02 ; sum2: four samples, one rounding term, >>2
-
- paddq mm2, mm4
- psrlq mm2, 0x02 ; sum3: four samples, one rounding term, >>2
-
- paddq mm3, mm4
- paddq mm3, mm4
- psrlq mm3, 0x03 ; sum1: eight samples, two rounding terms, >>3
-
- paddq mm1, mm4
- paddq mm1, mm4
- psrlq mm1, 0x03 ; sum4: eight samples, two rounding terms, >>3
-
- pmuludq mm0, [mmx_01bytes] ; presumably replicates the DC byte across the dword (constant not visible here)
- pmuludq mm3, [mmx_01bytes]
- psllq mm0, 0x20
- pxor mm0, mm3 ; mm0 = m_up
-
- pmuludq mm2, [mmx_01bytes]
- pmuludq mm1, [mmx_01bytes]
- psllq mm1, 0x20
- pxor mm1, mm2 ; mm1 = m_down
-
- movq [r0], mm0 ; rows 0-3 use m_up
- movq [r0+0x08], mm0
- movq [r0+0x10], mm0
- movq [r0+0x18], mm0
-
- movq [r0+0x20], mm1 ; rows 4-7 use m_down
- movq [r0+0x28], mm1
- movq [r0+0x30], mm1
- movq [r0+0x38], mm1
-
- pop r4
- pop r3
- WELSEMMS
- ret
-
-
-
-ALIGN 16
-;***********************************************************************
-; Fills pred (16 stores of 16 bytes, 256 bytes total) with one DC byte derived from the top row and the left column.
-; void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredDc_sse2
-WelsI16x16LumaPredDc_sse2:
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d ; sign-extend the 32-bit stride
- %endif
- sub r1, r2 ; r1 = pRef - stride, i.e. the row above the block
- movdqa xmm0, [r1] ; read one row (aligned load: the address must be 16-byte aligned)
- pxor xmm1, xmm1
- psadbw xmm0, xmm1 ; one byte sum per 64-bit half
- movdqa xmm1, xmm0
- psrldq xmm1, 0x08 ; bring the upper-half sum down
- pslldq xmm0, 0x08
- psrldq xmm0, 0x08 ; clear the upper half of xmm0
- paddw xmm0, xmm1 ; xmm0 = sum of the 16 top pixels
-
- movzx r3, byte [r1+r2-0x01] ; left pixel of row 0
- movzx r4, byte [r1+2*r2-0x01] ; left pixel of row 1
- add r3, r4
- lea r1, [r1+r2]
- LOAD_2_LEFT_AND_ADD ; macro defined outside this view; presumably adds the next two left pixels to r3
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- add r3, 0x10 ; rounding term for the >>5 below
- movd xmm1, r3d
- paddw xmm0, xmm1 ; top sum + left sum + 16
- psrld xmm0, 0x05
- pmuludq xmm0, [mmx_01bytes] ; presumably replicates the DC byte across the dword (constant not visible here)
- pshufd xmm0, xmm0, 0 ; broadcast the low dword to all four dwords
-
- movdqa [r0], xmm0 ; aligned stores: pred must be 16-byte aligned
- movdqa [r0+0x10], xmm0
- movdqa [r0+0x20], xmm0
- movdqa [r0+0x30], xmm0
- movdqa [r0+0x40], xmm0
- movdqa [r0+0x50], xmm0
- movdqa [r0+0x60], xmm0
- movdqa [r0+0x70], xmm0
- movdqa [r0+0x80], xmm0
- movdqa [r0+0x90], xmm0
- movdqa [r0+0xa0], xmm0
- movdqa [r0+0xb0], xmm0
- movdqa [r0+0xc0], xmm0
- movdqa [r0+0xd0], xmm0
- movdqa [r0+0xe0], xmm0
- movdqa [r0+0xf0], xmm0
-
- pop r4
- pop r3
- ret
-
-;***********************************************************************
-; Picks the cheapest of the V, H and DC 4x4 predictions; cost = (SATD >> 1) + the matching trailing int32_t argument.
-;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
-; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
-; Stores 16 predicted bytes at pRed, the mode (V=0, H=1, DC=2) at *pBestMode, and returns the winning cost in eax.
-;***********************************************************************
-%ifdef X86_ASM
-WELS_EXTERN WelsSmpleSatdThree4x4_sse2
-align 16
-WelsSmpleSatdThree4x4_sse2:
- push ebx
- push esi
- push edi ; three pushes: the first stack argument is now at [esp+16]
- mov eax, [esp+24];p_enc
- mov ebx, [esp+28];linesize_enc
-
- ; load source 4x4 samples and Hadamard transform
- movd xmm0, [eax]
- movd xmm1, [eax+ebx]
- lea eax , [eax+2*ebx]
- movd xmm2, [eax]
- movd xmm3, [eax+ebx]
- punpckldq xmm0, xmm2
- punpckldq xmm1, xmm3
-
- pxor xmm6, xmm6
- punpcklbw xmm0, xmm6 ; widen bytes to words
- punpcklbw xmm1, xmm6
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
- SSE2_XSawp qdq, xmm0, xmm2, xmm3
-
- movdqa xmm4, xmm0
- paddw xmm0, xmm3
- psubw xmm4, xmm3
-
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm4
- punpckhwd xmm4, xmm2
-
- SSE2_XSawp dq, xmm0, xmm4, xmm3
- SSE2_XSawp qdq, xmm0, xmm3, xmm5
-
- movdqa xmm7, xmm0
- paddw xmm0, xmm5
- psubw xmm7, xmm5
-
- SSE2_XSawp qdq, xmm0, xmm7, xmm1
-
- ; Hadamard transform results are saved in xmm0 and xmm2
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
-
- ; load top boundary samples: [a b c d]
- mov eax, [esp+16];p_dec
- sub eax, [esp+20];linesize_dec
- movzx ecx, byte [eax]
- movzx edx, byte [eax+1]
- movzx esi, byte [eax+2]
- movzx edi, byte [eax+3]
-
- ; get the transform results of top boundary samples: [a b c d]
- add edx, ecx ; edx = a + b
- add edi, esi ; edi = c + d
- add ecx, ecx ; ecx = a + a
- add esi, esi ; esi = c + c
- sub ecx, edx ; ecx = a + a - a - b = a - b
- sub esi, edi ; esi = c + c - c - d = c - d
- add edi, edx ; edi = (a + b) + (c + d)
- add edx, edx
- sub edx, edi ; edx = (a + b) - (c + d)
- add esi, ecx ; esi = (a - b) + (c - d)
- add ecx, ecx
- sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
-
- movdqa xmm6, xmm0 ; keep copies of the source transform for the H and DC passes
- movdqa xmm7, xmm2
- movd xmm5, edi ; store the edi for DC mode
- pxor xmm3, xmm3
- pxor xmm4, xmm4
- pinsrw xmm3, edi, 0
- pinsrw xmm3, esi, 4
- psllw xmm3, 2 ; scale the boundary coefficients by 4
- pinsrw xmm4, edx, 0
- pinsrw xmm4, ecx, 4
- psllw xmm4, 2
-
- ; get the satd of V (prediction built from the top boundary)
- psubw xmm0, xmm3
- psubw xmm2, xmm4
-
- WELS_AbsW xmm0, xmm1
- WELS_AbsW xmm2, xmm1
- paddusw xmm0, xmm2
- SUMW_HORIZON1 xmm0, xmm1 ; satd of V is stored in xmm0
-
- ; load left boundary samples: [a b c d]'
- mov eax, [esp+16]
- mov ebx, [esp+20]
- movzx ecx, byte [eax-1]
- movzx edx, byte [eax+ebx-1]
- lea eax , [eax+2*ebx]
- movzx esi, byte [eax-1]
- movzx edi, byte [eax+ebx-1]
-
- ; get the transform results of left boundary samples: [a b c d]'
- add edx, ecx ; edx = a + b
- add edi, esi ; edi = c + d
- add ecx, ecx ; ecx = a + a
- add esi, esi ; esi = c + c
- sub ecx, edx ; ecx = a + a - a - b = a - b
- sub esi, edi ; esi = c + c - c - d = c - d
- add edi, edx ; edi = (a + b) + (c + d)
- add edx, edx
- sub edx, edi ; edx = (a + b) - (c + d)
- add esi, ecx ; esi = (a - b) + (c - d)
- add ecx, ecx
- sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
-
- ; store the transform results in xmm3
- movd xmm3, edi
- pinsrw xmm3, edx, 1
- pinsrw xmm3, ecx, 2
- pinsrw xmm3, esi, 3
- psllw xmm3, 2 ; scale the boundary coefficients by 4
-
- ; get the satd of H (prediction built from the left boundary)
- movdqa xmm2, xmm6
- movdqa xmm4, xmm7
- psubw xmm2, xmm3
- WELS_AbsW xmm2, xmm1
- WELS_AbsW xmm4, xmm1
- paddusw xmm2, xmm4
- SUMW_HORIZON1 xmm2, xmm1 ; satd of H is stored in xmm2
-
- ; DC result is stored in xmm1
- add edi, 4 ; edi = sum of the four left samples + rounding
- movd xmm1, edi
- paddw xmm1, xmm5 ; add the sum of the four top samples
- psrlw xmm1, 3 ; DC = (left sum + top sum + 4) >> 3
- movdqa xmm5, xmm1 ; keep the DC value for the prediction output
- psllw xmm1, 4 ; DC * 16, subtracted from the first source coefficient below
-
- ; get the satd of DC
- psubw xmm6, xmm1
- WELS_AbsW xmm6, xmm1
- WELS_AbsW xmm7, xmm1
- paddusw xmm6, xmm7
- SUMW_HORIZON1 xmm6, xmm1 ; satd of DC is stored in xmm6
-
- ; comparing order: DC H V
- mov edx, [esp+32] ; edx = pRed
- movd eax, xmm6
- movd edi, xmm2
- movd esi, xmm0
- and eax, 0xffff ; keep the low 16-bit sum, then halve it
- shr eax, 1
- and edi, 0xffff
- shr edi, 1
- and esi, 0xffff
- shr esi, 1
- add eax, [esp+40] ; DC cost += 7th argument
- add edi, [esp+44] ; H cost += 8th argument
- add esi, [esp+48] ; V cost += 9th argument
- cmp ax, di ; signed 16-bit compare: DC vs H
- jg near not_dc
- cmp ax, si ; DC vs V
- jg near not_dc_h
-
- ; for DC mode
- movd ebx, xmm5
- imul ebx, 0x01010101 ; replicate the DC byte into all four bytes
- movd xmm5, ebx
- pshufd xmm5, xmm5, 0
- movdqa [edx], xmm5 ; aligned store: pRed must be 16-byte aligned
- mov ebx, [esp+36] ; ebx = pBestMode
- mov dword [ebx], 0x02
- pop edi ; eax still holds the DC cost (return value)
- pop esi
- pop ebx
- ret
-
-not_dc:
- cmp di, si ; H vs V
- jg near not_dc_h
-
- ; for H mode
- SSE_DB_1_2REG xmm6, xmm7
- mov eax, [esp+16]
- mov ebx, [esp+20]
- movzx ecx, byte [eax-1]
- movd xmm0, ecx
- pmuludq xmm0, xmm6 ; multiply the left pixel by the constant set up by SSE_DB_1_2REG
-
- movzx ecx, byte [eax+ebx-1]
- movd xmm1, ecx
- pmuludq xmm1, xmm6
-%if 1
- punpckldq xmm0, xmm1
-%else
- unpcklps xmm0, xmm1
-%endif
- lea eax, [eax+ebx*2]
- movzx ecx, byte [eax-1]
- movd xmm2, ecx
- pmuludq xmm2, xmm6
-
- movzx ecx, byte [eax+ebx-1]
- movd xmm3, ecx
- pmuludq xmm3, xmm6
-%if 1
- punpckldq xmm2, xmm3
- punpcklqdq xmm0, xmm2
-%else
- unpcklps xmm2, xmm3
- unpcklpd xmm0, xmm2
-%endif
- movdqa [edx],xmm0
-
- mov eax, edi ; return the H cost
- mov ebx, [esp+36] ; ebx = pBestMode
- mov dword [ebx], 0x01
-
- pop edi
- pop esi
- pop ebx
- ret
-not_dc_h:
- ; for V mode
- mov eax, [esp+16]
- sub eax, [esp+20] ; eax = pDec - iLineSizeDec (row above)
- movd xmm0, [eax]
- pshufd xmm0, xmm0, 0 ; repeat the four top pixels for all four rows
- movdqa [edx],xmm0
-
- mov eax, esi ; return the V cost
- mov ebx, [esp+36] ; ebx = pBestMode
- mov dword [ebx], 0x00
-
- pop edi
- pop esi
- pop ebx
- ret
-%endif
-
--- a/codec/processing/src/asm/sad.asm
+++ /dev/null
@@ -1,220 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* sad.asm
-;*
-;* Abstract
-;* WelsSampleSad8x8_sse21
-;*
-;* History
-;* 8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and %1, 0x1f|(%3>>1) ; destroys %1: keeps only the low address bits (mask 0x3f when cacheline is 64)
-cmp %1, (32-%2)|(%3>>1) ; sets flags against the limit (56 for width 8, cacheline 64); the caller branches on them
-%endmacro
-
-%macro SSE2_GetSad8x4 0 ; adds the SAD of four 8-pixel rows ([r0]/[r2], strides r1/r3) into xmm4
- movq xmm0, [r0]
- movq xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movhps xmm0, [r0]
- movhps xmm1, [r0+r1]
-
- movq xmm2, [r2]
- movq xmm3, [r2+r3]
- lea r2, [r2+2*r3]
- movhps xmm2, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm2
- psadbw xmm1, xmm3
- paddw xmm4, xmm0 ; accumulator is xmm4 (volatile), not xmm6
- paddw xmm4, xmm1
-%endmacro
-
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-WELS_EXTERN WelsSampleSad8x8_sse21
-WelsSampleSad8x8_sse21:
- ; SAD of an 8x8 block: arg1/arg2 = first block pointer/stride, arg3/arg4 = second
- ; block pointer/stride (strides are sign-extended from 32 bits); result in retrd.
- ; When the 8 bytes at arg3 would straddle a 64-byte cache line, each row is
- ; rebuilt from two 8-byte-aligned loads that are shifted and OR-ed together;
- ; otherwise the plain SSE2_GetSad8x4 path is used.
- ; Only xmm0-xmm5 are touched: xmm6 and above are callee-saved in the Microsoft
- ; x64 calling convention, and this routine has no save/restore code for them.
- ; Split path registers: xmm3 = right-shift bits, xmm4 = left-shift bits, xmm5 = sum.
-
- %assign push_num 0
- mov r2, arg3
- push r2 ; CACHE_SPLIT_CHECK destroys r2, keep a copy
- CACHE_SPLIT_CHECK r2, 8, 64
- jle near .pixel_sad_8x8_nsplit
- pop r2 ; r2 = arg3 again
-%ifdef X86_32
- push r3
- push r4
- push r5
-%endif
- %assign push_num 3
- mov r0, arg1
- mov r1, arg2
- SIGN_EXTENTION r1, r1d
- pxor xmm5, xmm5 ; xmm5 = running SAD
-
- ; r5 = offset of arg3 within its 8-byte word, r4 = 8 - r5, r2 = arg3 rounded down
-
- mov r5, r2
- and r5, 0x07
- sub r2, r5
- mov r4, 8
- sub r4, r5
-
- shl r5, 3 ; bytes -> bits
- shl r4, 3
- movd xmm3, r5d ; right-shift count for the low aligned word
- movd xmm4, r4d ; left-shift count for the high aligned word
- mov r5, 8
- add r5, r2 ; r5 = address of the following aligned word
- mov r3, arg4
- SIGN_EXTENTION r3, r3d
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
-
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm3
- psllq xmm2, xmm4
- por xmm1, xmm2 ; two reconstructed unaligned rows
-
- psadbw xmm0, xmm1
- paddw xmm5, xmm0
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
-
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm3
- psllq xmm2, xmm4
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm5, xmm0
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
-
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm3
- psllq xmm2, xmm4
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm5, xmm0
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
-
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm3
- psllq xmm2, xmm4
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm5, xmm0
-
- movhlps xmm0, xmm5 ; fold the high-quadword partial sum onto the low one
- paddw xmm0, xmm5
- movd retrd, xmm0
-%ifdef X86_32
- pop r5
- pop r4
- pop r3
-%endif
- jmp .return
-
-.pixel_sad_8x8_nsplit:
- ; No cache-line split: restore the arg3 pointer that CACHE_SPLIT_CHECK masked,
- ; reload the four parameters with LOAD_4_PARA, then accumulate two 8x4 SADs
- ; in xmm4. The two psadbw partial sums sit in the low and high quadwords of
- ; xmm4 and are folded together before being returned.
-
- pop r2
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm4, xmm4 ; xmm4 = running SAD used by SSE2_GetSad8x4
- SSE2_GetSad8x4 ; rows 0-3 (leaves r0/r2 two rows further down)
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSad8x4 ; rows 4-7
- movhlps xmm0, xmm4
- paddw xmm0, xmm4
- movd retrd, xmm0
- LOAD_4_PARA_POP
-.return:
- ret
\ No newline at end of file
--- a/codec/processing/targets.mk
+++ b/codec/processing/targets.mk
@@ -23,8 +23,6 @@
PROCESSING_ASM_SRCS=\
$(PROCESSING_SRCDIR)/./src/asm/denoisefilter.asm\
$(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.asm\
- $(PROCESSING_SRCDIR)/./src/asm/intra_pred.asm\
- $(PROCESSING_SRCDIR)/./src/asm/sad.asm\
$(PROCESSING_SRCDIR)/./src/asm/vaa.asm\
PROCESSING_OBJS += $(PROCESSING_ASM_SRCS:.asm=.o)