ref: f421aab6a3bc615d3d1d3e496aeccde7b947f86b
dir: /codec/common/x86/mc_chroma.asm/
;*! ;* \copy ;* Copyright (c) 2004-2013, Cisco Systems ;* All rights reserved. ;* ;* Redistribution and use in source and binary forms, with or without ;* modification, are permitted provided that the following conditions ;* are met: ;* ;* * Redistributions of source code must retain the above copyright ;* notice, this list of conditions and the following disclaimer. ;* ;* * Redistributions in binary form must reproduce the above copyright ;* notice, this list of conditions and the following disclaimer in ;* the documentation and/or other materials provided with the ;* distribution. ;* ;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ;* POSSIBILITY OF SUCH DAMAGE. 
;*
;*
;*  mc_chroma.asm
;*
;*  Abstract
;*      mmx / sse2 / ssse3 motion compensation for chroma
;*
;*  History
;*      10/13/2004 Created
;*
;*
;*************************************************************************/

%include "asm_inc.asm"

;***********************************************************************
; Local Data (Read Only)
;***********************************************************************

SECTION .rodata align=16

;***********************************************************************
; Rounding constants: 32 in every 16-bit lane. They are added to the
; weighted sum right before the ">> 6" in the kernels below.
;***********************************************************************

ALIGN 16
h264_d0x20_sse2:
    dw 32,32,32,32,32,32,32,32

ALIGN 16
h264_d0x20_mmx:
    dw 32,32,32,32


;=============================================================================
; Code
;=============================================================================

SECTION .text

;*******************************************************************************
; void McChromaWidthEq4_mmx( const uint8_t *src,
;                            int32_t iSrcStride,
;                            uint8_t *pDst,
;                            int32_t iDstStride,
;                            const uint8_t *pABCD,
;                            int32_t iHeigh );
;
; Produces iHeigh rows of 4 bytes. With {A,B,C,D} = pABCD[0..3]:
;   pDst[x] = ( A*src[x]              + B*src[x+1]
;             + C*src[x+iSrcStride]   + D*src[x+iSrcStride+1] + 32 ) >> 6
; The result is packed to bytes with unsigned saturation.
;
; Register use (after LOAD_6_PARA, as the code below uses them):
;   r0 = src, r1 = iSrcStride, r2 = pDst, r3 = iDstStride,
;   r4 = pABCD (then reused as "next source row" pointer), r5 = iHeigh
;   mm3/mm5/mm4/mm6 = A/B/C/D broadcast to four word lanes, mm7 = zero
;
; The loop is bottom-tested (dec / jnz): iHeigh must be >= 1.
; NOTE(review): the sum is computed in 16-bit lanes with pmullw/paddw, so it
; is only exact while it stays below 65536; weights summing to 64 are
; presumably what callers pass - confirm against the C caller.
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
    %assign  push_num 0
    LOAD_6_PARA
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d

    ; Broadcast the four weight bytes into word lanes.
    movd        mm3, [r4]               ; mm3 = A B C D (bytes)
    WELS_Zero   mm7                     ; mm7 is the zero used for byte->word unpacking
    punpcklbw   mm3, mm3                ; A A B B C C D D
    movq        mm4, mm3
    punpcklwd   mm3, mm3                ; A A A A B B B B
    punpckhwd   mm4, mm4                ; C C C C D D D D

    movq        mm5, mm3
    punpcklbw   mm3, mm7                ; mm3 = A (x4 words)
    punpckhbw   mm5, mm7                ; mm5 = B (x4 words)

    movq        mm6, mm4
    punpcklbw   mm4, mm7                ; mm4 = C (x4 words)
    punpckhbw   mm6, mm7                ; mm6 = D (x4 words)

    lea         r4, [r0 + r1]           ; r4 = src + iSrcStride (row below)

    ; Prime the pipeline with the first source row.
    movd        mm0, [r0]
    movd        mm1, [r0+1]
    punpcklbw   mm0, mm7                ; mm0 = row[x]   as words
    punpcklbw   mm1, mm7                ; mm1 = row[x+1] as words
.xloop:
    pmullw      mm0, mm3                ; A * cur[x]
    pmullw      mm1, mm5                ; B * cur[x+1]
    paddw       mm0, mm1

    movd        mm1, [r4]
    punpcklbw   mm1, mm7
    movq        mm2, mm1                ; keep next[x] for the following iteration
    pmullw      mm1, mm4                ; C * next[x]
    paddw       mm0, mm1

    movd        mm1, [r4+1]
    punpcklbw   mm1, mm7
    movq        mm7, mm1                ; mm7 temporarily holds next[x+1] ...
    pmullw      mm1, mm6                ; D * next[x+1]
    paddw       mm0, mm1
    movq        mm1, mm7                ; ... restored here as cur[x+1] of the next iteration

%ifdef X86_32_PICASM
    ; Build 32 in every word lane without a data reference:
    ; all-ones >> 15 = 1, then << 5 = 32.
    pcmpeqw     mm7, mm7
    psrlw       mm7, 15
    psllw       mm7, 5
    paddw       mm0, mm7
%else
    paddw       mm0, [h264_d0x20_mmx]
%endif
    psrlw       mm0, 6

    WELS_Zero   mm7                     ; mm7 was used as scratch above; make it zero again
    packuswb    mm0, mm7
    movd        [r2], mm0               ; store 4 result bytes

    movq        mm0, mm2                ; next row becomes the current row

    lea         r2, [r2 + r3]
    lea         r4, [r4 + r1]

    dec         r5
    jnz         near .xloop
    WELSEMMS
    LOAD_6_PARA_POP
    ret


;*******************************************************************************
; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
;                             int32_t iSrcStride,
;                             uint8_t *pDst,
;                             int32_t iDstStride,
;                             const uint8_t *pABCD,
;                             int32_t iheigh );
;
; Same computation as McChromaWidthEq4_mmx, on 8 pixels per row:
;   pDst[x] = ( A*pSrc[x]            + B*pSrc[x+1]
;             + C*pSrc[x+iSrcStride] + D*pSrc[x+iSrcStride+1] + 32 ) >> 6
;
; Register use (after LOAD_6_PARA, as the code below uses them):
;   r0 = pSrc, r1 = iSrcStride, r2 = pDst, r3 = iDstStride,
;   r4 = pABCD (then reused as "next source row" pointer), r5 = iheigh
;   xmm3/xmm5/xmm4/xmm6 = A/B/C/D broadcast to eight word lanes, xmm7 = zero
;
; The loop is bottom-tested (dec / jnz): iheigh must be >= 1.
; NOTE(review): 16-bit lane arithmetic; exact only while the weighted sum
; stays below 65536 (weights presumably sum to 64 - confirm).
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d

    ; Broadcast the four weight bytes into word lanes.
    movd        xmm3, [r4]              ; A B C D (bytes)
    WELS_Zero   xmm7
    punpcklbw   xmm3, xmm3              ; A A B B C C D D
    punpcklwd   xmm3, xmm3              ; Ax4 Bx4 Cx4 Dx4 (bytes)
    movdqa      xmm4, xmm3
    punpckldq   xmm3, xmm3              ; Ax8 Bx8 (bytes)
    punpckhdq   xmm4, xmm4              ; Cx8 Dx8 (bytes)
    movdqa      xmm5, xmm3
    movdqa      xmm6, xmm4

    punpcklbw   xmm3, xmm7              ; xmm3 = A (x8 words)
    punpckhbw   xmm5, xmm7              ; xmm5 = B (x8 words)
    punpcklbw   xmm4, xmm7              ; xmm4 = C (x8 words)
    punpckhbw   xmm6, xmm7              ; xmm6 = D (x8 words)

    lea         r4, [r0 + r1]           ; r4 = pSrc + iSrcStride (row below)

    ; Prime the pipeline with the first source row.
    movq        xmm0, [r0]
    movq        xmm1, [r0+1]
    punpcklbw   xmm0, xmm7              ; xmm0 = row[x]   as words
    punpcklbw   xmm1, xmm7              ; xmm1 = row[x+1] as words
.xloop:
    pmullw      xmm0, xmm3              ; A * cur[x]
    pmullw      xmm1, xmm5              ; B * cur[x+1]
    paddw       xmm0, xmm1

    movq        xmm1, [r4]
    punpcklbw   xmm1, xmm7
    movdqa      xmm2, xmm1              ; keep next[x] for the following iteration
    pmullw      xmm1, xmm4              ; C * next[x]
    paddw       xmm0, xmm1

    movq        xmm1, [r4+1]
    punpcklbw   xmm1, xmm7
    movdqa      xmm7, xmm1              ; xmm7 temporarily holds next[x+1] ...
    pmullw      xmm1, xmm6              ; D * next[x+1]
    paddw       xmm0, xmm1
    movdqa      xmm1, xmm7              ; ... restored here as cur[x+1] of the next iteration

%ifdef X86_32_PICASM
    ; Build 32 in every word lane without a data reference:
    ; all-ones >> 15 = 1, then << 5 = 32.
    pcmpeqw     xmm7, xmm7
    psrlw       xmm7, 15
    psllw       xmm7, 5
    paddw       xmm0, xmm7
%else
    paddw       xmm0, [h264_d0x20_sse2]
%endif
    psrlw       xmm0, 6

    WELS_Zero   xmm7                    ; xmm7 was used as scratch above; make it zero again
    packuswb    xmm0, xmm7
    movq        [r2], xmm0              ; store 8 result bytes

    movdqa      xmm0, xmm2              ; next row becomes the current row

    lea         r2, [r2 + r3]
    lea         r4, [r4 + r1]

    dec         r5
    jnz         near .xloop

    POP_XMM
    LOAD_6_PARA_POP

    ret


;***********************************************************************
; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
;                              int32_t iSrcStride,
;                              uint8_t *pDst,
;                              int32_t iDstStride,
;                              const uint8_t *pABCD,
;                              int32_t iHeigh);
;
; Same result as McChromaWidthEq8_sse2, computed with pmaddubsw and two
; output rows per loop iteration:
;   pDst[x] = ( A*pSrc[x]            + B*pSrc[x+1]
;             + C*pSrc[x+iSrcStride] + D*pSrc[x+iSrcStride+1] + 32 ) >> 6
;
; Register use (after LOAD_6_PARA, as the code below uses them):
;   r0 = pSrc, r1 = iSrcStride, r2 = pDst, r3 = iDstStride,
;   r4 = pABCD, r5 = iHeigh
;   xmm5 = (A,B) byte pair x8, xmm6 = (C,D) byte pair x8, xmm7 = 32 x8 words
;
; Each source row is turned into interleaved byte pairs
;   s[0] s[1] s[1] s[2] ... s[7] s[8]
; so that one pmaddubsw against (A,B) or (C,D) gives the horizontal part of
; the sum for all 8 pixels. Rows are fetched with two 8-byte loads (offsets
; 0 and 1), so exactly 9 source bytes are read per row.
;
; The loop ends only when "sub r5, 2" reaches exactly zero: iHeigh must be
; a positive even number.
; NOTE(review): pmaddubsw reads the weight bytes as signed and saturates
; each pair sum to signed 16 bit, so this variant presumably relies on
; small weights (summing to 64) - confirm against the C caller.
;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d

    ; Broadcast the weight byte pairs.
    movd        xmm5, [r4]              ; A B C D (bytes)
    punpcklwd   xmm5, xmm5              ; AB AB CD CD
    punpckldq   xmm5, xmm5              ; ABAB ABAB CDCD CDCD
    movdqa      xmm6, xmm5
    punpcklqdq  xmm5, xmm5              ; xmm5 = (A,B) x8
    punpckhqdq  xmm6, xmm6              ; xmm6 = (C,D) x8

    ; Pre-bias the destination by two rows; the loop adds them back first thing.
    sub         r2, r3
    sub         r2, r3

    ; xmm7 = rounding constant, 32 in every word lane.
%ifdef X86_32_PICASM
    pcmpeqw     xmm7, xmm7              ; all-ones >> 15 = 1, then << 5 = 32
    psrlw       xmm7, 15
    psllw       xmm7, 5
%else
    movdqa      xmm7, [h264_d0x20_sse2]
%endif

    ; Prime the pipeline with the first source row as interleaved pairs.
    movq        xmm0, [r0]
    movq        xmm1, [r0+1]
    punpcklbw   xmm0, xmm1
.hloop_chroma:
    lea         r2, [r2+2*r3]           ; r2 = first of the two output rows

    ; Row n+1: bottom taps for output row n, top taps for output row n+1.
    movq        xmm2, [r0+r1]
    movq        xmm3, [r0+r1+1]
    punpcklbw   xmm2, xmm3
    movdqa      xmm4, xmm2              ; keep an unweighted copy for the second output row

    pmaddubsw   xmm0, xmm5              ; A*s[x] + B*s[x+1]      (row n)
    pmaddubsw   xmm2, xmm6              ; C*s[x] + D*s[x+1]      (row n+1)
    paddw       xmm0, xmm2
    paddw       xmm0, xmm7
    psrlw       xmm0, 6
    packuswb    xmm0, xmm0
    movq        [r2], xmm0              ; store output row n

    ; Row n+2: bottom taps for output row n+1, and current row of the next iteration.
    lea         r0, [r0+2*r1]
    movq        xmm2, [r0]
    movq        xmm3, [r0+1]
    punpcklbw   xmm2, xmm3
    movdqa      xmm0, xmm2              ; becomes "row n" of the next iteration

    pmaddubsw   xmm4, xmm5              ; A*s[x] + B*s[x+1]      (row n+1)
    pmaddubsw   xmm2, xmm6              ; C*s[x] + D*s[x+1]      (row n+2)
    paddw       xmm4, xmm2
    paddw       xmm4, xmm7
    psrlw       xmm4, 6
    packuswb    xmm4, xmm4
    movq        [r2+r3], xmm4           ; store output row n+1

    sub         r5, 2
    jnz         .hloop_chroma

    POP_XMM
    LOAD_6_PARA_POP

    ret