shithub: openh264

ref: c5a09faf24190d66fbc9d97f6374dbb4dc203b4c
dir: /codec/common/x86/mc_chroma.asm/

View raw version
;*!
;* \copy
;*     Copyright (c)  2004-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  mc_chroma.asm
;*
;*  Abstract
;*      mmx motion compensation for chroma
;*
;*  History
;*      10/13/2004 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"

;***********************************************************************
; Local Data (Read Only)
;***********************************************************************

SECTION .rodata align=16

;***********************************************************************
; Various memory constants (trigonometric values or rounding values)
;***********************************************************************

ALIGN 16
h264_d0x20_sse2:
    dw 32,32,32,32,32,32,32,32
ALIGN 16
h264_d0x20_mmx:
    dw 32,32,32,32


;=============================================================================
; Code
;=============================================================================

SECTION .text

;*******************************************************************************
; void McChromaWidthEq4_mmx( const uint8_t *src,
;                           int32_t iSrcStride,
;                           uint8_t *pDst,
;                           int32_t iDstStride,
;                           const uint8_t *pABCD,
;                           int32_t iHeigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
    %assign  push_num 0
    LOAD_6_PARA
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d

    movd mm3, [r4]; [eax]
    WELS_Zero mm7
    punpcklbw mm3, mm3
    movq      mm4, mm3
    punpcklwd mm3, mm3
    punpckhwd mm4, mm4

    movq      mm5, mm3
    punpcklbw mm3, mm7
    punpckhbw mm5, mm7

    movq      mm6, mm4
    punpcklbw mm4, mm7
    punpckhbw mm6, mm7

    lea r4, [r0 + r1] ;lea ebx, [esi + eax]
    movd mm0, [r0]
    movd mm1, [r0+1]
    punpcklbw mm0, mm7
    punpcklbw mm1, mm7
.xloop:

    pmullw mm0, mm3
    pmullw mm1, mm5
    paddw  mm0, mm1

    movd  mm1, [r4]
    punpcklbw mm1, mm7
    movq mm2, mm1
    pmullw mm1, mm4
    paddw mm0, mm1

    movd mm1, [r4+1]
    punpcklbw mm1, mm7
    movq mm7, mm1
    pmullw mm1,mm6
    paddw mm0, mm1
    movq mm1,mm7

%ifdef X86_32_PICASM
    pcmpeqw mm7, mm7
    psrlw   mm7, 15
    psllw   mm7, 5
    paddw   mm0, mm7
%else
    paddw mm0, [h264_d0x20_mmx]
%endif
    psrlw mm0, 6

    WELS_Zero mm7
    packuswb mm0, mm7
    movd [r2], mm0

    movq mm0, mm2

    lea r2, [r2 + r3]
    lea r4, [r4 + r1]

    dec r5
    jnz near .xloop
    WELSEMMS
    LOAD_6_PARA_POP
    ret


;*******************************************************************************
; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride,
;                       const uint8_t *pABCD,
;                       int32_t iheigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d

    movd xmm3, [r4]
    WELS_Zero xmm7
    punpcklbw  xmm3, xmm3
    punpcklwd  xmm3, xmm3

    movdqa     xmm4, xmm3
    punpckldq  xmm3, xmm3
    punpckhdq  xmm4, xmm4
    movdqa     xmm5, xmm3
    movdqa     xmm6, xmm4

    punpcklbw  xmm3, xmm7
    punpckhbw  xmm5, xmm7
    punpcklbw  xmm4, xmm7
    punpckhbw  xmm6, xmm7

    lea r4, [r0 + r1] ;lea ebx, [esi + eax]
    movq xmm0, [r0]
    movq xmm1, [r0+1]
    punpcklbw xmm0, xmm7
    punpcklbw xmm1, xmm7
.xloop:

    pmullw xmm0, xmm3
    pmullw xmm1, xmm5
    paddw  xmm0, xmm1

    movq  xmm1, [r4]
    punpcklbw xmm1, xmm7
    movdqa xmm2, xmm1
    pmullw xmm1, xmm4
    paddw xmm0, xmm1

    movq xmm1, [r4+1]
    punpcklbw xmm1, xmm7
    movdqa xmm7, xmm1
    pmullw xmm1, xmm6
    paddw xmm0, xmm1
    movdqa xmm1,xmm7

%ifdef X86_32_PICASM
    pcmpeqw xmm7, xmm7
    psrlw   xmm7, 15
    psllw   xmm7, 5
    paddw   xmm0, xmm7
%else
    paddw xmm0, [h264_d0x20_sse2]
%endif
    psrlw xmm0, 6

    WELS_Zero xmm7
    packuswb xmm0, xmm7
    movq [r2], xmm0

    movdqa xmm0, xmm2

    lea r2, [r2 + r3]
    lea r4, [r4 + r1]

    dec r5
    jnz near .xloop

    POP_XMM
    LOAD_6_PARA_POP

    ret




;***********************************************************************
; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
;                        int32_t iSrcStride,
;                        uint8_t *pDst,
;                        int32_t iDstStride,
;                        const uint8_t *pABCD,
;                        int32_t iHeigh);
;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d

    pxor      xmm7, xmm7
    movd   xmm5, [r4]
    punpcklwd xmm5, xmm5
    punpckldq xmm5, xmm5
    movdqa    xmm6, xmm5
    punpcklqdq xmm5, xmm5
    punpckhqdq xmm6, xmm6

    sub r2, r3 ;sub esi, edi
    sub r2, r3
%ifdef X86_32_PICASM
    pcmpeqw xmm7, xmm7
    psrlw   xmm7, 15
    psllw   xmm7, 5
%else
    movdqa xmm7, [h264_d0x20_sse2]
%endif

    movdqu xmm0, [r0]
    movdqa xmm1, xmm0
    psrldq xmm1, 1
    punpcklbw xmm0, xmm1

.hloop_chroma:
    lea r2, [r2+2*r3]

    movdqu xmm2, [r0+r1]
    movdqa xmm3, xmm2
    psrldq xmm3, 1
    punpcklbw xmm2, xmm3
    movdqa      xmm4, xmm2

    pmaddubsw  xmm0, xmm5
    pmaddubsw  xmm2, xmm6
    paddw      xmm0, xmm2
    paddw      xmm0, xmm7
    psrlw      xmm0, 6
    packuswb   xmm0, xmm0
    movq       [r2],xmm0

    lea r0, [r0+2*r1]
    movdqu xmm2, [r0]
    movdqa xmm3, xmm2
    psrldq xmm3, 1
    punpcklbw xmm2, xmm3
    movdqa      xmm0, xmm2

    pmaddubsw  xmm4, xmm5
    pmaddubsw  xmm2, xmm6
    paddw      xmm4, xmm2
    paddw      xmm4, xmm7
    psrlw      xmm4, 6
    packuswb   xmm4, xmm4
    movq       [r2+r3],xmm4

    sub r5, 2
    jnz .hloop_chroma

    POP_XMM
    LOAD_6_PARA_POP

    ret