shithub: openh264

ref: fe23d53acc637eac7e54cd566a1a22141792c8da
dir: /codec/decoder/core/x86/intra_pred.asm/

View raw version
;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  intra_pred.asm
;*
;*  Abstract
;*      sse2 and mmx function for intra predict operations(decoder)
;*
;*  History
;*      18/09/2009 Created
;*		19/11/2010 Added
;*					WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2,
;*					WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2
;*					and WelsDecoderIChromaPredDcNA_mmx
;*
;*
;*************************************************************************/

%include "asm_inc.asm"
;*******************************************************************************
; Local Data (Read Only)
;*******************************************************************************

SECTION .rodata align=16
; WELSEMMS expands to `emms` while the `%if 1` branch is selected. The routines
; in this file that touch MMX registers invoke it right before `ret`.
%if 1
	%define WELSEMMS	emms
%else
	%define WELSEMMS
%endif

; Word tables for the 16x16 luma plane predictor.
; sse2_plane_inc_minus is multiplied by the gradient b for the first 8 pixels of a row.
align 16
sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
; sse2_plane_inc weights the samples top[8..15]/left[8..15] and is multiplied
; by b for the last 8 pixels of a row.
align 16
sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8
; sse2_plane_dec weights the mirrored samples top[-1..6]/left[-1..6].
align 16
sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1

; for chroma plane mode
; The two 4-word tables below are read with 64-bit movq loads.
sse2_plane_inc_c dw 1, 2, 3, 4
sse2_plane_dec_c dw 4, 3, 2, 1
; Multiplied by the gradient b for the 8 pixels of a chroma row.
align 16
sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4

; Sixteen 0x01 bytes. Used both as a per-byte low-bit mask (pand) and as the
; multiplier that replicates a single byte value across a word/dword.
align 16
mmx_01bytes:		times 16	db 1

; 64-bit constant 2: rounding term for the chroma DC sums.
align 16
mmx_0x02: dw 0x02, 0x00, 0x00, 0x00

; Sixteen 0x80 bytes: the fill value stored by WelsDecoderI16x16LumaPredDcNA_sse2.
align 16
sse2_dc_0x80: times 16 db 0x80
; Eight words of 2 (not referenced in the part of the file shown here).
align 16
sse2_wd_0x02: times 8 dw 0x02

;*******************************************************************************
; macros
;*******************************************************************************
; SSE2_PRED_H_4X4_TWO_LINE dst, row1, scratch, ptr, stride
;   Typical arguments: xmm0, xmm1, xmm2, <pointer reg>, <stride reg>
;   %1  out: dword 0 = byte [%4-1] repeated 4 times,
;            dword 1 = byte [%4+%5-1] repeated 4 times
;   %2  clobbered (working register for the second row)
;   %3  clobbered (scratch used to duplicate bytes)
;   %4, %5 are only read.
%macro SSE2_PRED_H_4X4_TWO_LINE 5
    ; row 0: b -> b,b -> b,b,b,b in the low dword of %1
    movd        %1, [%4-1]
    movdqa      %3, %1
    punpcklbw   %1, %3
    movdqa      %3, %1
    punpcklbw   %1, %3

    ; row 1: same byte duplication, source is one stride further down
    movd        %2, [%4+%5-1]
    movdqa      %3, %2
    punpcklbw   %2, %3
    movdqa      %3, %2
    punpcklbw   %2, %3

    ; low 64 bits of %1 = row 0 dword, row 1 dword
    punpckldq   %1, %2
%endmacro


; LOAD_COLUMN dst, tmp0, tmp1, tmp2, ptr, stride
;   Loads one dword from each of 8 consecutive rows starting at [%5] and
;   interleaves them so that the upper 8 bytes of %1 contain byte 3 of
;   rows 0..7 (row 0 in the lowest of those bytes).
;   %2, %3 and %4 are clobbered; %5 ends up advanced by 8 * %6.
%macro LOAD_COLUMN 6
    ; rows 0 and 1, byte-interleaved into %1
    movd        %1, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %1, %2
    lea         %5, [%5+2*%6]

    ; rows 2 and 3 into %3, then word-interleave with rows 0/1
    movd        %3, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %3, %2
    punpcklwd   %1, %3
    lea         %5, [%5+2*%6]

    ; rows 4 and 5, byte-interleaved into %4
    movd        %4, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %4, %2
    lea         %5, [%5+2*%6]

    ; rows 6 and 7 into %3, then word-interleave with rows 4/5
    movd        %3, [%5]
    movd        %2, [%5+%6]
    lea         %5, [%5+2*%6]
    punpcklbw   %3, %2
    punpcklwd   %4, %3

    ; gather the byte-3 column of all 8 rows in the high qword of %1
    punpckhdq   %1, %4
%endmacro

; SUMW_HORIZON acc, tmp, hi
;   Horizontal add of the eight words d0..d7 held in %1.
;   On exit the low 16 bits of dword 0 of %1 equal d0+...+d7 modulo 2^16.
;   The words of %3 are interleaved above the partial sums, so the upper half
;   of that dword is only meaningful when %3 is zero. %2 is clobbered.
%macro SUMW_HORIZON 3
    movhlps     %2, %1              ; %2 low qword = d7 d6 d5 d4
    paddw       %1, %2              ; low 4 words  = d3+d7 d2+d6 d1+d5 d0+d4
    punpcklwd   %1, %3              ; spread those 4 words into 4 dwords (%3 on top)
    movhlps     %2, %1              ; %2 low qword = dwords 3 and 2
    paddd       %1, %2              ; low 2 dwords = odd-pair sum, even-pair sum
    pshuflw     %2, %1, 0x4e        ; swap the two low dwords
    paddd       %1, %2              ; dword 0 = total
%endmacro

; COPY_16_TIMES ptr, dst
;   %2 = the byte at [%1-1] replicated into all 16 byte lanes.
;   The source is fetched with movdqa from [%1-16], so that address has to be
;   16-byte aligned.
%macro COPY_16_TIMES 2
    movdqa      %2, [%1-16]         ; byte 15 of %2 = [%1-1]
    psrldq      %2, 15              ; move it down to byte 0, zero the rest
    pmuludq     %2, [mmx_01bytes]   ; b * 0x01010101 -> b,b,b,b in dword 0
    pshufd      %2, %2, 0           ; broadcast dword 0 to all four dwords
%endmacro

; COPY_16_TIMESS ptr, dst, offset
;   Same as COPY_16_TIMES but the source byte is [%1+%3-1]:
;   %2 = that byte replicated into all 16 byte lanes.
;   The movdqa load from [%1+%3-16] needs a 16-byte aligned address.
%macro COPY_16_TIMESS 3
    movdqa      %2, [%1+%3-16]      ; byte 15 of %2 = [%1+%3-1]
    psrldq      %2, 15              ; move it down to byte 0, zero the rest
    pmuludq     %2, [mmx_01bytes]   ; b * 0x01010101 -> b,b,b,b in dword 0
    pshufd      %2, %2, 0           ; broadcast dword 0 to all four dwords
%endmacro

; LOAD_COLUMN_C dst, tmp0, tmp1, unused, ptr, stride
;   Four-row variant of LOAD_COLUMN: loads one dword from each of 4
;   consecutive rows starting at [%5]. With 64-bit MMX operands the upper
;   4 bytes of %1 end up holding byte 3 of rows 0..3 (row 0 lowest).
;   %2 and %3 are clobbered, %4 is not referenced, %5 is advanced by 4 * %6.
%macro LOAD_COLUMN_C 6
    ; rows 0 and 1, byte-interleaved into %1
    movd        %1, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %1, %2
    lea         %5, [%5+2*%6]

    ; rows 2 and 3 into %3, then merge the high words of both pairs
    movd        %3, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %3, %2
    punpckhwd   %1, %3
    lea         %5, [%5+2*%6]
%endmacro

; LOAD_2_LEFT_AND_ADD
;   Works on fixed registers: steps r0 down two rows (r1 = stride), then adds
;   the bytes at [r0-1] and [r0+r1-1] to the running sum in r2.
;   r3 is clobbered.
%macro LOAD_2_LEFT_AND_ADD 0
    lea         r0, [r0+2*r1]           ; advance two rows
    movzx       r3, byte [r0-0x01]      ; left neighbour of the new row
    add         r2, r3
    movzx       r3, byte [r0+r1-0x01]   ; left neighbour of the row after it
    add         r2, r3
%endmacro

;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text


;*******************************************************************************
;   void WelsDecoderI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
;
;   4x4 horizontal prediction: each of the four rows is filled with the byte
;   directly to its left, i.e. pPred[row * kiStride - 1].
;
;	pPred must align to 16
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    ; collect the four left-neighbour bytes, one per xmm register
    movzx       r2, byte [r0-1]             ; row 0
    movd        xmm0, r2d
    movzx       r2, byte [r0+r1-1]          ; row 1
    movd        xmm1, r2d
    lea         r0, [r0+r1]                 ; r0 -> row 1
    movzx       r2, byte [r0+r1-1]          ; row 2
    movd        xmm2, r2d
    movzx       r2, byte [r0+2*r1-1]        ; row 3
    movd        xmm3, r2d
    sub         r0, r1                      ; r0 -> row 0 again

    ; b * 0x01010101 replicates the byte across the low dword
    pmuludq     xmm0, [mmx_01bytes]
    pmuludq     xmm1, [mmx_01bytes]
    pmuludq     xmm2, [mmx_01bytes]
    pmuludq     xmm3, [mmx_01bytes]

    ; write the four 4-byte rows
    movd        [r0], xmm0
    movd        [r0+r1], xmm1
    lea         r0, [r0+2*r1]
    movd        [r0], xmm2
    movd        [r0+r1], xmm3

    ret

;*******************************************************************************
; void WelsDecoderI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;
; 16x16 luma plane prediction, written in place over the block at pPred.
; With top[k] = pPred[k - kiStride], left[k] = pPred[k*kiStride - 1] and
; index -1 denoting the top-left corner:
;   H = sum_{i=0..7} (i+1) * (top[8+i]  - top[6-i]),   b = (5*H + 32) >> 6
;   V = sum_{i=0..7} (i+1) * (left[8+i] - left[6-i]),  c = (5*V + 32) >> 6
;   a = (left[15] + top[15]) << 4
;   pixel(x, y) = sat_u8((a + 16 + b*(x-7) + c*(y-7)) >> 5)
; Rows are stored with movdqa, so each row address must be 16-byte aligned.
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
		push r3
		push r4
		%assign push_num 2
		LOAD_2_PARA
		PUSH_XMM 8
		SIGN_EXTENSION r1, r1d
		mov r4, r0 ; save r0 in r4
		sub		r0,	1
		sub		r0,	r1			; r0 -> top-left corner (pPred - kiStride - 1)

		;for H
		pxor	xmm7,	xmm7
		movq	xmm0,	[r0]		; top[-1..6]
		movdqa	xmm5,	[sse2_plane_dec]
		punpcklbw xmm0,	xmm7
		pmullw	xmm0,	xmm5		; weights 8..1
		movq	xmm1,	[r0 + 9]	; top[8..15]
		movdqa	xmm6,	[sse2_plane_inc]
		punpcklbw xmm1,	xmm7
		pmullw	xmm1,	xmm6		; weights 1..8
		psubw	xmm1,	xmm0

		; xmm2 is not cleared here; only the low 16 bits of the sum are used below
		SUMW_HORIZON	xmm1,xmm0,xmm2
		movd    r2d,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
		movsx	r2,	r2w
		imul	r2,	5
		add		r2,	32
		sar		r2,	6			; b = (5 * H + 32) >> 6;
		SSE2_Copy8Times	xmm1, r2d	; xmm1 = b,b,b,b,b,b,b,b

		movzx	r3,	BYTE [r0+16]	; top[15]
		sub	r0, 3				; so that byte 3 of each loaded dword is the left column
		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, r0, r1	; left[-1..6]; r0 moves down 8 rows

		add		r0,	3
		movzx	r2,	BYTE [r0+8*r1]	; left[15]
		add		r3,	r2
		shl		r3,	4			;	a = (left[15*kiStride] + top[15]) << 4;

		sub	r0, 3
		add		r0,	r1			; skip left[7]
		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, r0, r1	; left[8..15]
		pxor	xmm4,	xmm4
		punpckhbw xmm0,	xmm4
		pmullw	xmm0,	xmm5		; weights 8..1
		punpckhbw xmm7,	xmm4
		pmullw	xmm7,	xmm6		; weights 1..8
		psubw	xmm7,	xmm0

		SUMW_HORIZON   xmm7,xmm0,xmm2
		movd    r2d,   xmm7			; V
		movsx	r2,	r2w

		imul	r2,	5
		add		r2,	32
		sar		r2,	6				; c = (5 * V + 32) >> 6;
		SSE2_Copy8Times	xmm4, r2d		; xmm4 = c,c,c,c,c,c,c,c

		mov r0, r4				; back to the first output row
		add		r3,	16
		imul	r2,	-7
		add		r3,	r2		; s = a + 16 + (-7)*c
		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s

		xor		r2,	r2			; row counter
		movdqa	xmm5,	[sse2_plane_inc_minus]

; One iteration per output row: pixels 0..7 use b*(x-7) from xmm5, pixels 8..15
; use b*(1..8) from xmm6; xmm0 (s) grows by c after each row.
get_i16x16_luma_pred_plane_sse2_1:
		movdqa	xmm2,	xmm1
		pmullw	xmm2,	xmm5
		paddw	xmm2,	xmm0
		psraw	xmm2,	5
		movdqa	xmm3,	xmm1
		pmullw	xmm3,	xmm6
		paddw	xmm3,	xmm0
		psraw	xmm3,	5
		packuswb xmm2,	xmm3		; saturate to unsigned bytes
		movdqa	[r0],	xmm2
		paddw	xmm0,	xmm4
		add		r0,	r1
		inc		r2
		cmp		r2,	16
		jnz get_i16x16_luma_pred_plane_sse2_1

		POP_XMM
		pop r4
		pop r3
		ret



;*******************************************************************************
; void WelsDecoderI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************

; SSE2_PRED_H_16X16_TWO_LINE_DEC ptr, stride
;   Advances %1 by two rows, then fills the 16-byte rows at [%1] and [%1+%2]
;   with their respective left-neighbour byte. xmm0 is clobbered.
%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 2
    lea             %1, [%1+%2*2]

    COPY_16_TIMES   %1, xmm0            ; xmm0 = 16 x byte [%1-1]
    movdqa          [%1], xmm0
    COPY_16_TIMESS  %1, xmm0, %2        ; xmm0 = 16 x byte [%1+%2-1]
    movdqa          [%1+%2], xmm0
%endmacro

; 16x16 horizontal prediction: every row is filled with the byte to its left.
; Rows 0 and 1 are handled inline, the remaining 14 rows two at a time.
WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    COPY_16_TIMES   r0, xmm0            ; row 0
    movdqa          [r0], xmm0
    COPY_16_TIMESS  r0, xmm0, r1        ; row 1
    movdqa          [r0+r1], xmm0

%rep 7
    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
%endrep

    ret

;*******************************************************************************
; void WelsDecoderI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
;
; 16x16 vertical prediction: the 16 bytes of the row above the block
; (pPred - kiStride) are copied into each of the 16 rows. All accesses use
; movdqa and therefore need 16-byte aligned row addresses.
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredV_sse2
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    sub         r0, r1                  ; r0 -> row above the block
    movdqa      xmm0, [r0]

    movdqa      [r0+r1], xmm0           ; row 0
%rep 7
    lea         r0, [r0+2*r1]           ; rows 1..14, two per step
    movdqa      [r0], xmm0
    movdqa      [r0+r1], xmm0
%endrep
    lea         r0, [r0+2*r1]
    movdqa      [r0], xmm0              ; row 15

    ret

;*******************************************************************************
; void WelsDecoderIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;
; 8x8 chroma plane prediction, written in place over the block at pPred.
; With top[k] = pPred[k - kiStride], left[k] = pPred[k*kiStride - 1] and
; index -1 denoting the top-left corner:
;   H = sum_{i=0..3} (i+1) * (top[4+i]  - top[2-i]),   b = (17*H + 16) >> 5
;   V = sum_{i=0..3} (i+1) * (left[4+i] - left[2-i]),  c = (17*V + 16) >> 5
;   a = (left[7] + top[7]) << 4
;   pixel(x, y) = sat_u8((a + 16 + b*(x-3) + c*(y-3)) >> 5)
; The gradient sums are formed in MMX registers and moved to XMM registers
; for the horizontal add and the row loop.
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredPlane_sse2
		push r3
		push r4
		%assign push_num 2
		LOAD_2_PARA
		PUSH_XMM 8
		SIGN_EXTENSION r1, r1d
		mov r4, r0
		sub		r0,	1
		sub		r0,	r1			; r0 -> top-left corner (pPred - kiStride - 1)

		pxor	mm7,	mm7
		movq	mm0,	[r0]		; low 4 bytes = top[-1..2]
		movq	mm5,	[sse2_plane_dec_c]
		punpcklbw mm0,	mm7
		pmullw	mm0,	mm5		; weights 4..1
		movq	mm1,	[r0 + 5]	; low 4 bytes = top[4..7]
		movq	mm6,	[sse2_plane_inc_c]
		punpcklbw mm1,	mm7
		pmullw	mm1,	mm6		; weights 1..4
		psubw	mm1,	mm0

		movq2dq xmm1,   mm1
		pxor    xmm2,   xmm2
		SUMW_HORIZON	xmm1,xmm0,xmm2
		movd    r2d,	xmm1		; H
		movsx	r2,	r2w
		imul	r2,	17
		add		r2,	16
		sar		r2,	5			; b = (17 * H + 16) >> 5;
		SSE2_Copy8Times	xmm1, r2d	; xmm1 = b,b,b,b,b,b,b,b

		movzx	r3,	BYTE [r0+8]	; top[7]
		sub	r0, 3				; so that byte 3 of each loaded dword is the left column
		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, r0, r1	; left[-1..2]; r0 moves down 4 rows

		add		r0,	3
		movzx	r2,	BYTE [r0+4*r1]	; left[7]
		add		r3,	r2
		shl		r3,	4			; a = (left[7*kiStride] + top[7]) << 4;

		sub	r0, 3
		add		r0,	r1			; skip left[3]
		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, r0, r1	; left[4..7]
		pxor	mm4,	mm4
		punpckhbw mm0,	mm4
		pmullw	mm0,	mm5		; weights 4..1
		punpckhbw mm7,	mm4
		pmullw	mm7,	mm6		; weights 1..4
		psubw	mm7,	mm0

		movq2dq xmm7,   mm7
		pxor    xmm2,   xmm2
		SUMW_HORIZON	xmm7,xmm0,xmm2
		movd    r2d,    xmm7			; V
		movsx	r2,	r2w

		imul	r2,	17
		add		r2,	16
		sar		r2,	5				; c = (17 * V + 16) >> 5;
		SSE2_Copy8Times	xmm4, r2d		; xmm4 = c,c,c,c,c,c,c,c

		mov 	r0, r4				; back to the first output row
		add		r3,	16
		imul	r2,	-3
		add		r3,	r2				; s = a + 16 + (-3)*c
		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s

		xor		r2,	r2			; row counter
		movdqa	xmm5,	[sse2_plane_mul_b_c]

; One iteration per output row: 8 pixels = sat_u8((b*(x-3) + s) >> 5);
; xmm0 (s) grows by c after each row.
get_i_chroma_pred_plane_sse2_1:
		movdqa	xmm2,	xmm1
		pmullw	xmm2,	xmm5
		paddw	xmm2,	xmm0
		psraw	xmm2,	5
		packuswb xmm2,	xmm2
		movq	[r0],	xmm2
		paddw	xmm0,	xmm4
		add		r0,	r1
		inc		r2
		cmp		r2,	8
		jnz get_i_chroma_pred_plane_sse2_1

		POP_XMM
		pop r4
		pop r3
		WELSEMMS
		ret

;*******************************************************************************
;	0 |1 |2 |3 |4 |
;	6 |7 |8 |9 |10|
;	11|12|13|14|15|
;	16|17|18|19|20|
;	21|22|23|24|25|
;	7 is the start pixel of current 4x4 block
;	pPred[7] = ([6]+[0]*2+[1]+2)/4
;
;   void WelsDecoderI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
;
;   4x4 diagonal-down-right prediction. The neighbours [21],[16],[11],[6],[0],
;   [1],[2],[3] are packed into one MMX register, filtered with the 3-tap
;   (x + 2*y + z + 2) >> 2 kernel, and the four rows are stored as successive
;   one-byte shifts of the filtered vector.
;   In the comments below, mmN[k] means the k-th byte of mmN, counted from 1.
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
	%assign push_num 0
	LOAD_2_PARA
	SIGN_EXTENSION r1, r1d
	mov r2, r0

	movq        mm1,[r2+r1-8]		;get value of 11; the 8-byte load ending at the pixel puts it in the top byte: mm1[8] = 11
	movq        mm2,[r2-8]			;get value of 6 mm2[8] = 6
	sub		r2, r1			;move r2 to the row above the current block (position of 1)
	punpckhbw   mm2,[r2-8]			;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
	movd        mm3,[r2]			;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
	punpckhwd   mm1,mm2				;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
	psllq       mm3,18h				;mm3[4]=[1]
	psrlq       mm1,28h				;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
	por         mm3,mm1				;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
	movq        mm1,mm3				;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
	lea 		r2,[r2+r1*2-8h]		;r2 = address of 12, minus 8
	movq        mm4,[r2+r1]		;get value of 16, mm4[8]=[16]
	psllq       mm3,8				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
	psrlq       mm4,38h				;mm4[1]=[16]
	por         mm3,mm4				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
	movq        mm2,mm3				;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
	movq        mm4,[r2+r1*2]		;mm4[8]=[21]
	psllq       mm3,8				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
	psrlq       mm4,38h				;mm4[1]=[21]
	por         mm3,mm4				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
	movq        mm4,mm3				;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
	pavgb       mm3,mm1				;mm3=([11]+[21]+1)/2
	pxor        mm1,mm4				;find odd value in the lowest bit of each byte
	pand        mm1,[mmx_01bytes]	;set the odd bit
	psubusb     mm3,mm1				;decrease 1 from odd bytes
	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2

	; store the rows bottom-up: each row is the filtered vector shifted one byte further
	lea         r0,[r0+r1]
	movd        [r0+2*r1],mm2		;row 3
	sub         r0,r1
	psrlq       mm2,8
	movd        [r0+2*r1],mm2		;row 2
	psrlq       mm2,8
	movd        [r0+r1],mm2			;row 1
	psrlq       mm2,8
	movd        [r0],mm2			;row 0
	WELSEMMS
	ret


;*******************************************************************************
;	void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
;   copy 8 pixel of 8 line from left
;*******************************************************************************
; MMX_PRED_H_8X8_ONE_LINE reg, unused, src, dst
;   Fills the 8 bytes at [%4] with the byte located at [%3-1].
;   %1 is the MMX working register; %2 is not referenced.
%macro MMX_PRED_H_8X8_ONE_LINE 4
    movq        %1, [%3-8]              ; top byte of %1 = [%3-1]
    psrlq       %1, 38h                 ; bring it down to byte 0

    pmullw      %1, [mmx_01bytes]       ; b * 0x0101 -> b,b in word 0
    pshufw      %1, %1, 0               ; broadcast word 0 to all four words
    movq        [%4], %1
%endmacro

; MMX_PRED_H_8X8_ONE_LINEE reg, unused, src, dst
;   Like MMX_PRED_H_8X8_ONE_LINE, but the source byte is one row below %3:
;   fills the 8 bytes at [%4] with the byte at [%3+r1-1].
;   Note that r1 is used directly rather than passed as an argument.
;   %1 is the MMX working register; %2 is not referenced.
%macro MMX_PRED_H_8X8_ONE_LINEE 4
    movq        %1, [%3+r1-8]           ; top byte of %1 = [%3+r1-1]
    psrlq       %1, 38h                 ; bring it down to byte 0

    pmullw      %1, [mmx_01bytes]       ; b * 0x0101 -> b,b in word 0
    pshufw      %1, %1, 0               ; broadcast word 0 to all four words
    movq        [%4], %1
%endmacro

; 8x8 horizontal prediction: each of the 8 rows is filled with the byte to its
; left. r2 walks the source rows and r0 the destination rows, both two at a time.
WELS_EXTERN WelsDecoderIChromaPredH_mmx
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    mov         r2, r0

    MMX_PRED_H_8X8_ONE_LINE  mm0, mm1, r2, r0          ; row 0
    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1       ; row 1

%rep 3
    lea         r2, [r2+r1*2]
    MMX_PRED_H_8X8_ONE_LINE  mm0, mm1, r2, r0+2*r1     ; even row (2, 4, 6)
    lea         r0, [r0+2*r1]
    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1       ; odd row (3, 5, 7)
%endrep

    WELSEMMS
    ret


;*******************************************************************************
;	void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
;
;   8x8 vertical prediction: the 8 bytes of the row above the block
;   (pPred - kiStride) are copied into each of the 8 rows.
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredV_mmx
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    sub         r0, r1                  ; r0 -> row above the block
    movq        mm0, [r0]

    movq        [r0+r1], mm0            ; row 0
    movq        [r0+2*r1], mm0          ; row 1
%rep 3
    lea         r0, [r0+2*r1]           ; rows 2..7, two per step
    movq        [r0+r1], mm0
    movq        [r0+2*r1], mm0
%endrep

    WELSEMMS
    ret


;*******************************************************************************
;	lt|t0|t1|t2|t3|
;	l0|
;	l1|
;	l2|
;	l3|
;	t3 is never used
;   destination:
;	|a |b |c |d |
;	|e |f |a |b |
;	|g |h |e |f |
;	|i |j |g |h |

;   a = (1 + lt + l0)>>1
;   e = (1 + l0 + l1)>>1
;   g = (1 + l1 + l2)>>1
;   i = (1 + l2 + l3)>>1

;   d = (2 + t0 + (t1<<1) + t2)>>2
;   c = (2 + lt + (t0<<1) + t1)>>2
;   b = (2 + l0 + (lt<<1) + t0)>>2

;   f = (2 + l1 + (l0<<1) + lt)>>2
;   h = (2 + l2 + (l1<<1) + l0)>>2
;   j = (2 + l3 + (l2<<1) + l1)>>2
;   [b a f e h g j i] + [d c b a] --> mov to memory
;
;   void WelsDecoderI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
;
;   4x4 horizontal-down prediction. Register layouts in the comments below are
;   written most-significant byte first.
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx
	%assign push_num 0
	LOAD_2_PARA
	SIGN_EXTENSION r1, r1d
	mov r2, r0
	sub         r2, r1                  ; r2 -> row above the block
	movd        mm0, [r2-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]

	movd        mm1, [r2+2*r1-4]
	punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
	lea         r2, [r2+2*r1]
	movd        mm2, [r2+2*r1-4]
	punpcklbw   mm2, [r2+r1-4]        ; mm2[7] = l2, mm2[6] = l3
	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
	psrlq       mm2, 20h
	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]

	movq        mm1, mm0
	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
	movq        mm2, mm0
	psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
	movq        mm3, mm2
	movq        mm4, mm1
	pavgb       mm1, mm0

	; pavgb rounds up; subtracting the xor of the low bits turns the first
	; average into a truncating one so the 3-tap result rounds only once
	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
	pand        mm4, [mmx_01bytes]	    ; set the odd bit
	psubusb     mm1, mm4				; decrease 1 from odd bytes

	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]

	movq        mm4, mm0
	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]

	psrlq       mm2, 20h
	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
	movq        mm4, mm3
	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  g]
	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]

	movd        [r0], mm2               ; row 0
	lea         r0, [r0+r1]
	movd        [r0+2*r1], mm3          ; row 3
	sub         r0, r1
	psrlq       mm3, 10h
	movd        [r0+2*r1], mm3          ; row 2
	psrlq       mm3, 10h
	movd        [r0+r1], mm3            ; row 1
	WELSEMMS
	ret



;*******************************************************************************
;	lt|t0|t1|t2|t3|
;	l0|
;	l1|
;	l2|
;	l3|
;	only l0..l3 are read by this routine
;   destination:
;	|a |b |c |d |
;	|c |d |e |f |
;	|e |f |g |g |
;	|g |g |g |g |

;   a = (1 + l0 + l1)>>1
;   c = (1 + l1 + l2)>>1
;   e = (1 + l2 + l3)>>1
;   g = l3

;   b = (2 + l0 + (l1<<1) + l2)>>2
;   d = (2 + l1 + (l2<<1) + l3)>>2
;   f = (2 + l2 + (l3<<1) + l3)>>2

;   [g g f e d c b a] + [g g g g] --> mov to memory
;
;   void WelsDecoderI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
;
;   4x4 horizontal-up prediction. Register layouts in the comments below are
;   written most-significant byte first.
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx
	%assign push_num 0
	LOAD_2_PARA
	SIGN_EXTENSION r1, r1d
	mov r2, r0

	movd        mm0, [r2-4]            ; mm0[3] = l0
	punpcklbw   mm0, [r2+r1-4]        ; mm0[7] = l1, mm0[6] = l0
	lea         r2, [r2+2*r1]
	movd        mm2, [r2-4]            ; mm2[3] = l2
	movd        mm4, [r2+r1-4]        ; mm4[3] = l3
	punpcklbw   mm2, mm4
	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]

	psrlq       mm4, 18h
	psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
	psrlq       mm0, 8h
	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]

	movq        mm1, mm0
	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]

	movq        mm2, mm0
	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
	movq        mm5, mm2
	pavgb       mm2, mm0

	; pavgb rounds up; subtracting the xor of the low bits turns the first
	; average into a truncating one so the 3-tap result rounds only once
	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
	pand        mm5, [mmx_01bytes]	    ; set the odd bit
	psubusb     mm2, mm5				; decrease 1 from odd bytes

	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]

	psrlq       mm2, 8h
	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]

	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]

	psrlq       mm4, 20h
	lea         r0, [r0+r1]
	movd        [r0+2*r1], mm4          ; row 3 = g g g g

	sub         r0, r1
	movd        [r0], mm1               ; row 0
	psrlq       mm1, 10h
	movd        [r0+r1], mm1            ; row 1
	psrlq       mm1, 10h
	movd        [r0+2*r1], mm1          ; row 2
	WELSEMMS
	ret



;*******************************************************************************
;	lt|t0|t1|t2|t3|
;	l0|
;	l1|
;	l2|
;	l3|
;	l3 is never used
;   destination:
;	|a |b |c |d |
;	|e |f |g |h |
;	|i |a |b |c |
;	|j |e |f |g |

;   a = (1 + lt + t0)>>1
;   b = (1 + t0 + t1)>>1
;   c = (1 + t1 + t2)>>1
;   d = (1 + t2 + t3)>>1

;   e = (2 + l0 + (lt<<1) + t0)>>2
;   f = (2 + lt + (t0<<1) + t1)>>2
;   g = (2 + t0 + (t1<<1) + t2)>>2

;   h = (2 + t1 + (t2<<1) + t3)>>2
;   i = (2 + lt + (l0<<1) + l1)>>2
;   j = (2 + l0 + (l1<<1) + l2)>>2
;
;   void WelsDecoderI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
;
;   4x4 vertical-right prediction. Register layouts in the comments below are
;   written most-significant byte first.
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx
	%assign push_num 0
	LOAD_2_PARA
	SIGN_EXTENSION r1, r1d
	mov r2, r0
	sub         r2, r1                  ; r2 -> row above the block
	movq        mm0, [r2-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]

	movd        mm1, [r2+2*r1-4]
	punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
	lea         r2, [r2+2*r1]
	movq        mm2, [r2+r1-8]        ; mm2[7] = l2
	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
	psrlq       mm2, 28h
	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]

	movq        mm1, mm0
	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]

	movq        mm2, mm0
	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
	movq        mm3, mm2
	pavgb       mm2, mm0

	; pavgb rounds up; subtracting the xor of the low bits turns the first
	; average into a truncating one so the 3-tap result rounds only once
	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
	pand        mm3, [mmx_01bytes]	    ; set the odd bit
	psubusb     mm2, mm3				; decrease 1 from odd bytes

	movq        mm3, mm0
	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
	movq        mm2, mm3

	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
	movd        [r0], mm1               ; row 0

	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
	movd        [r0+r1], mm2            ; row 1

	movq        mm4, mm3
	psllq       mm4, 20h
	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]

	movq        mm5, mm3
	psllq       mm5, 28h
	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]

	psllq       mm1, 8h
	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
	movd        [r0+2*r1], mm4          ; row 2

	psllq       mm2, 8h
	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
	lea         r0, [r0+2*r1]
	movd        [r0+r1], mm5            ; row 3
	WELSEMMS
	ret

;*******************************************************************************
;	lt|t0|t1|t2|t3|t4|t5|t6|t7
;	l0|
;	l1|
;	l2|
;	l3|
;	only t0..t7 are read; lt and l0..l3 are never used
;   destination:
;	|a |b |c |d |
;	|b |c |d |e |
;	|c |d |e |f |
;	|d |e |f |g |

;   a = (2 + t0 + t2 + (t1<<1))>>2
;   b = (2 + t1 + t3 + (t2<<1))>>2
;   c = (2 + t2 + t4 + (t3<<1))>>2
;   d = (2 + t3 + t5 + (t4<<1))>>2

;   e = (2 + t4 + t6 + (t5<<1))>>2
;   f = (2 + t5 + t7 + (t6<<1))>>2
;   g = (2 + t6 + t7 + (t7<<1))>>2

;   [g f e d c b a] --> mov to memory
;
;   void WelsDecoderI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
;
;   4x4 diagonal-down-left prediction. Register layouts in the comments below
;   are written most-significant byte first.
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx
	%assign push_num 0
	LOAD_2_PARA
	SIGN_EXTENSION r1, r1d
	mov r2, r0
	sub         r2, r1                  ; r2 -> row above the block
	movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
	movq        mm1, mm0
	movq        mm2, mm0

	movq        mm3, mm0
	psrlq       mm3, 38h
	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]

	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
	psrlq       mm2, 8h
	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]

	; pavgb rounds up; subtracting the xor of the low bits turns the first
	; average into a truncating one so the 3-tap result rounds only once
	movq        mm3, mm1
	pavgb       mm1, mm2
	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
	pand        mm3, [mmx_01bytes]	    ; set the odd bit
	psubusb     mm1, mm3				; decrease 1 from odd bytes

	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]

	; each row is the filtered vector shifted one byte further
	psrlq       mm0, 8h
	movd        [r0], mm0               ; row 0 = a b c d
	psrlq       mm0, 8h
	movd        [r0+r1], mm0            ; row 1 = b c d e
	psrlq       mm0, 8h
	movd        [r0+2*r1], mm0          ; row 2 = c d e f
	psrlq       mm0, 8h
	lea         r0, [r0+2*r1]
	movd        [r0+r1], mm0            ; row 3 = d e f g
	WELSEMMS
	ret


;*******************************************************************************
;	lt|t0|t1|t2|t3|t4|t5|t6|t7
;	l0|
;	l1|
;	l2|
;	l3|
;	only the top row is read; lt and l0..l3 are never used
;   destination:
;	|a |b |c |d |
;	|e |f |g |h |
;	|b |c |d |i |
;	|f |g |h |j |

;   a = (1 + t0 + t1)>>1
;   b = (1 + t1 + t2)>>1
;   c = (1 + t2 + t3)>>1
;   d = (1 + t3 + t4)>>1
;   i = (1 + t4 + t5)>>1

;   e = (2 + t0 + (t1<<1) + t2)>>2
;   f = (2 + t1 + (t2<<1) + t3)>>2
;   g = (2 + t2 + (t3<<1) + t4)>>2
;   h = (2 + t3 + (t4<<1) + t5)>>2
;   j = (2 + t4 + (t5<<1) + t6)>>2

;   [i d c b a] + [j h g f e] --> mov to memory
;
;   void WelsDecoderI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
;
;   4x4 vertical-left prediction. Register layouts in the comments below are
;   written most-significant byte first.
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx
	%assign push_num 0
	LOAD_2_PARA
	SIGN_EXTENSION r1, r1d
	mov r2, r0

	sub         r2, r1                  ; r2 -> row above the block
	movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
	movq        mm1, mm0
	movq        mm2, mm0

	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]

	movq        mm3, mm1
	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]

	; pavgb rounds up; subtracting the xor of the low bits turns the first
	; average into a truncating one so the 3-tap result rounds only once
	movq        mm4, mm2
	pavgb       mm2, mm0
	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
	pand        mm4, [mmx_01bytes]	    ; set the odd bit
	psubusb     mm2, mm4				; decrease 1 from odd bytes

	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]

	movd        [r0], mm3               ; row 0 = a b c d
	psrlq       mm3, 8h
	movd        [r0+2*r1], mm3          ; row 2 = b c d i

	movd        [r0+r1], mm2            ; row 1 = e f g h
	psrlq       mm2, 8h
	lea         r0, [r0+2*r1]
	movd        [r0+r1], mm2            ; row 3 = f g h j
	WELSEMMS
	ret

;*******************************************************************************
;
;   void WelsDecoderIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;
;   8x8 chroma DC prediction, computed per 4x4 quadrant from the 8 top and
;   8 left neighbours (T_lo/T_hi = sums of top[0..3]/top[4..7],
;   L_up/L_dn = sums of left[0..3]/left[4..7]):
;     top-left     = (T_lo + L_up + 4) >> 3
;     top-right    = (T_hi + 2) >> 2
;     bottom-left  = (L_dn + 2) >> 2
;     bottom-right = (T_hi + L_dn + 4) >> 3
;   All vector work is done in MMX registers.
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDc_sse2
	push 	r3
	push 	r4
	%assign push_num 2
	LOAD_2_PARA
	SIGN_EXTENSION r1, r1d
	mov r4, r0                           ; keep pPred for the stores

	sub         r0, r1                   ; r0 -> row above the block
	movq        mm0, [r0]                ; 8 top neighbours

	movzx		r2, byte [r0+r1-0x01] ; l1
	lea         r0, [r0+2*r1]
	movzx		r3, byte [r0-0x01]     ; l2
	add			r2, r3
	movzx		r3, byte [r0+r1-0x01] ; l3
	add			r2, r3
	lea         r0, [r0+2*r1]
	movzx		r3, byte [r0-0x01]     ; l4
	add			r2, r3
	movd        mm1, r2d                 ; mm1 = l1+l2+l3+l4

	movzx		r2, byte [r0+r1-0x01] ; l5
	lea         r0, [r0+2*r1]
	movzx		r3, byte [r0-0x01]     ; l6
	add			r2, r3
	movzx		r3, byte [r0+r1-0x01] ; l7
	add			r2, r3
	lea         r0, [r0+2*r1]
	movzx		r3, byte [r0-0x01]     ; l8
	add			r2, r3
	movd        mm2, r2d                 ; mm2 = l5+l6+l7+l8

	movq        mm3, mm0
	psrlq       mm0, 0x20                ; mm0 = top[4..7]
	psllq       mm3, 0x20
	psrlq       mm3, 0x20                ; mm3 = top[0..3]
	pxor		mm4, mm4
	psadbw		mm0, mm4                 ; mm0 = T_hi
	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2

	paddq       mm3, mm1
	movq        mm1, mm2
	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1

	movq        mm4, [mmx_0x02]

	paddq       mm0, mm4
	psrlq       mm0, 0x02                ; top-right DC = (sum2 + 2) >> 2

	paddq       mm2, mm4
	psrlq       mm2, 0x02                ; bottom-left DC = (sum3 + 2) >> 2

	paddq       mm3, mm4
	paddq       mm3, mm4
	psrlq       mm3, 0x03                ; top-left DC = (sum1 + 4) >> 3

	paddq       mm1, mm4
	paddq       mm1, mm4
	psrlq       mm1, 0x03                ; bottom-right DC = (sum4 + 4) >> 3

	; multiplying by 0x01010101 replicates each DC byte across a dword
	pmuludq     mm0, [mmx_01bytes]
	pmuludq     mm3, [mmx_01bytes]
	psllq       mm0, 0x20
	pxor        mm0, mm3                 ; mm0 = m_up

	pmuludq     mm2, [mmx_01bytes]
	pmuludq     mm1, [mmx_01bytes]
	psllq       mm1, 0x20
	pxor        mm1, mm2                 ; mm1 = m_down

	; rows 0..3 receive m_up, rows 4..7 receive m_down
	movq        [r4],       mm0
	movq        [r4+r1],   mm0
	movq        [r4+2*r1], mm0
	lea         r4, [r4+2*r1]
	movq        [r4+r1],   mm0

	movq        [r4+2*r1], mm1
	lea         r4, [r4+2*r1]
	movq        [r4+r1],   mm1
	movq        [r4+2*r1], mm1
	lea         r4, [r4+2*r1]
	movq        [r4+r1],   mm1

	pop r4
	pop r3
	WELSEMMS
	ret



;*******************************************************************************
;
;   void WelsDecoderI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;
;   16x16 DC prediction: every pixel of the block is set to
;   (sum of the 16 top neighbours + sum of the 16 left neighbours + 16) >> 5.
;   The top row is read and all rows are written with movdqa, so those
;   addresses must be 16-byte aligned.
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2
    push        r3
    push        r4
    %assign push_num 2
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    mov         r4, r0                      ; keep pPred for the stores

    ; --- sum of the 16 top neighbours -> low word of xmm0 ---
    sub         r0, r1
    movdqa      xmm0, [r0]                  ; read one row
    pxor        xmm1, xmm1
    psadbw      xmm0, xmm1                  ; two partial sums, one per qword
    movdqa      xmm1, xmm0
    psrldq      xmm1, 0x08                  ; upper partial sum moved down
    pslldq      xmm0, 0x08
    psrldq      xmm0, 0x08                  ; upper qword of xmm0 cleared
    paddw       xmm0, xmm1

    ; --- sum of the 16 left neighbours -> r2 ---
    movzx       r2, byte [r0+r1-0x01]       ; row 0
    movzx       r3, byte [r0+2*r1-0x01]     ; row 1
    add         r2, r3
    lea         r0, [r0+r1]                 ; r0 -> row 0
%rep 7
    LOAD_2_LEFT_AND_ADD                     ; rows 2..15, two per step
%endrep

    ; --- DC = (top + left + 16) >> 5, broadcast to 16 bytes ---
    add         r2, 0x10
    movd        xmm1, r2d
    paddw       xmm0, xmm1
    psrld       xmm0, 0x05
    pmuludq     xmm0, [mmx_01bytes]         ; dc * 0x01010101
    pshufd      xmm0, xmm0, 0

    ; --- fill the 16 rows ---
    movdqa      [r4], xmm0                  ; row 0
%rep 7
    movdqa      [r4+r1], xmm0               ; rows 1..14, two per step
    movdqa      [r4+2*r1], xmm0
    lea         r4, [r4+2*r1]
%endrep
    movdqa      [r4+r1], xmm0               ; row 15

    pop         r4
    pop         r3

    ret

;*******************************************************************************
; for intra prediction as follows, 11/19/2010
;*******************************************************************************

;*******************************************************************************
;	void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
;
;	16x16 DC prediction from the top neighbours only: every pixel of the block
;	is set to (sum of the 16 bytes at pPred - kiStride + 8) >> 4.
;	The top row is read and all rows are written with movdqa, so those
;	addresses must be 16-byte aligned.
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
	%assign push_num 0
	LOAD_2_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	mov r2, r0
	sub r2, r1
	movdqa xmm0, [r2]		; pPred-kiStride, top line
	pxor xmm7, xmm7
	psadbw xmm0, xmm7		; two partial byte sums, one per qword
	movdqa xmm1, xmm0
	psrldq xmm1, 8
	paddw  xmm0, xmm1		; low word = sum of all 16 top bytes
	movd r2d, xmm0			; replaces r2d entirely, so r2 needs no clearing first

	add r2, 8
	sar r2, 4				; dc = (sum + 8) >> 4
	SSE2_Copy16Times xmm1, r2d
	movdqa xmm0, xmm1		; xmm0 and xmm1 both hold the fill pattern
	lea r2, [2*r1+r1]		; 3*kiStride

	; rows 0..11, four per step
%rep 3
	movdqa [r0], xmm0
	movdqa [r0+r1], xmm1
	movdqa [r0+2*r1], xmm0
	movdqa [r0+r2], xmm1
	lea r0, [r0+4*r1]
%endrep

	; rows 12..15
	movdqa [r0], xmm0
	movdqa [r0+r1], xmm1
	movdqa [r0+2*r1], xmm0
	movdqa [r0+r2], xmm1

	POP_XMM
	ret

;*******************************************************************************
;	void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2
	; DC prediction for a 16x16 block with no neighbour available: every one
	; of the 16 rows at pPred receives the 16-byte constant sse2_dc_0x80
	; (defined outside this section; presumably sixteen 0x80 bytes - confirm).
	;
	; r0 = pPred (advanced four rows at a time), r1 = kiStride, r2 = 3*kiStride
	; movdqa is used for every access, so the rows must be 16-byte aligned.
	%assign push_num 0
	LOAD_2_PARA
	SIGN_EXTENSION r1, r1d

	movdqa		xmm0, [sse2_dc_0x80]
	movdqa		xmm1, xmm0			; second copy, stores alternate between the two
	lea		r2, [r1+2*r1]			; 3*kiStride

	; rows 0..11 in three groups of four
	%rep 3
	movdqa		[r0], xmm0
	movdqa		[r0+r1], xmm1
	movdqa		[r0+2*r1], xmm0
	movdqa		[r0+r2], xmm1
	lea		r0, [r0+4*r1]
	%endrep

	; rows 12..15
	movdqa		[r0], xmm0
	movdqa		[r0+r1], xmm1
	movdqa		[r0+2*r1], xmm0
	movdqa		[r0+r2], xmm1

	ret

;*******************************************************************************
;	void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx
	; DC prediction for an 8x8 block from the left neighbour column only.
	; The column is split in two halves of four bytes each:
	;   dcUp   = (left[0] + left[1] + left[2] + left[3] + 2) >> 2  -> rows 0..3
	;   dcDown = (left[4] + left[5] + left[6] + left[7] + 2) >> 2  -> rows 4..7
	; where left[i] is the byte at pPred + i*kiStride - 1. Each row is 8 bytes.
	;
	; r0 = walking pointer down the left column
	; r1 = kiStride
	; r2 = accumulator / dc value, later 3*kiStride
	; r3 = scratch
	; r4 = store pointer (pPred)
	push		r3
	push		r4
	%assign push_num 2
	LOAD_2_PARA
	SIGN_EXTENSION r1, r1d
	mov		r4, r0
	dec		r0				; r0 -> left neighbour of row 0

	; ---- upper half: left bytes of rows 0..3 ----
	movzx		r2, byte [r0]			; movzx zero-extends, no pre-clearing needed
	movzx		r3, byte [r0+r1]
	add		r2, r3
	lea		r0, [r0+2*r1]
	movzx		r3, byte [r0]
	add		r2, r3
	movzx		r3, byte [r0+r1]
	add		r2, r3
	add		r2, 02h				; rounding term
	sar		r2, 02h				; divide by 4
	mov		r3, r2
	sal		r3, 8
	or		r2, r3				; low word = dc:dc
	movd		mm0, r2d
	pshufw		mm0, mm0, 00h			; broadcast the low word: 8 x dcUp

	; ---- lower half: left bytes of rows 4..7 ----
	lea		r0, [r0+2*r1]
	movzx		r2, byte [r0]
	movzx		r3, byte [r0+r1]
	add		r2, r3
	lea		r0, [r0+2*r1]
	movzx		r3, byte [r0]
	add		r2, r3
	movzx		r3, byte [r0+r1]
	add		r2, r3
	add		r2, 02h
	sar		r2, 02h
	mov		r3, r2
	sal		r3, 8
	or		r2, r3
	movd		mm1, r2d
	pshufw		mm1, mm1, 00h			; 8 x dcDown

	; ---- store 8 rows of 8 bytes ----
	lea		r2, [2*r1+r1]			; 3*kiStride
	movq		[r4], mm0
	movq		[r4+r1], mm0
	movq		[r4+2*r1], mm0
	movq		[r4+r2], mm0
	lea		r4, [r4+4*r1]
	movq		[r4], mm1
	movq		[r4+r1], mm1
	movq		[r4+2*r1], mm1
	movq		[r4+r2], mm1

	pop		r4
	pop		r3
	emms						; leave MMX state before returning
	ret

;*******************************************************************************
;	void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
	; DC prediction for an 8x8 block from the top neighbour row only.
	; With t0..t7 the 8 bytes at pPred - kiStride:
	;   dc0 = (t0 + t1 + t2 + t3 + 2) >> 2   -> columns 0..3 of every row
	;   dc1 = (t4 + t5 + t6 + t7 + 2) >> 2   -> columns 4..7 of every row
	;
	; r0 = pPred (advanced by four rows half-way), r1 = kiStride,
	; r2 = top-row pointer, then 3*kiStride
	; Lane comments list 16-bit words from high to low; wN is tN widened to a word.
	; PUSH_XMM / POP_XMM are defined outside this section; presumably they
	; preserve the callee-saved XMM registers (xmm7 is written here) - confirm.
	%assign push_num 0
	LOAD_2_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d

	pxor		xmm7, xmm7			; all-zero helper
	mov		r2, r0
	sub		r2, r1
	movq		xmm0, [r2]			; the 8 top bytes
	lea		r2, [r1+2*r1]			; 3*kiStride, used by the stores
	punpcklbw	xmm0, xmm7			; w7 w6 w5 w4 w3 w2 w1 w0

	; first step: add each dword to its neighbour inside the same 64-bit half
	pshufd		xmm1, xmm0, 0B1h		; w5 w4 w7 w6 w1 w0 w3 w2
	paddw		xmm0, xmm1			; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2

	; second step: swap adjacent words, low half via xmm2, high half via xmm3
	movdqa		xmm1, xmm0
	pshuflw		xmm2, xmm0, 0B1h		; low four words pair-swapped, high four copied
	pshufhw		xmm3, xmm1, 0B1h		; high four words pair-swapped, low four copied
	paddw		xmm0, xmm2			; low four words = w0+w1+w2+w3 each
	paddw		xmm1, xmm3			; high four words = w4+w5+w6+w7 each

	; merge the two valid halves: sum1 x4 in the high half, sum0 x4 in the low half
	punpckhqdq	xmm1, xmm7			; move the high-half sums down, zero above
	punpcklqdq	xmm0, xmm1

	paddw		xmm0, [sse2_wd_0x02]		; rounding; constant defined elsewhere, presumably words of 2 - confirm
	psraw		xmm0, 2				; divide by 4
	packuswb	xmm0, xmm7			; low 8 bytes: dc1 x4 (high), dc0 x4 (low)

	; store the same 8 bytes to all 8 rows
	movq		[r0], xmm0
	movq		[r0+r1], xmm0
	movq		[r0+2*r1], xmm0
	movq		[r0+r2], xmm0
	lea		r0, [r0+4*r1]
	movq		[r0], xmm0
	movq		[r0+r1], xmm0
	movq		[r0+2*r1], xmm0
	movq		[r0+r2], xmm0

	POP_XMM
	ret

;*******************************************************************************
;	void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx
	; DC prediction for an 8x8 block with no neighbour available: each of the
	; 8 rows at pPred receives the first 8 bytes of sse2_dc_0x80 (defined
	; outside this section; presumably all 0x80 - confirm).
	;
	; r0 = pPred (advanced by four rows half-way), r1 = kiStride, r2 = 3*kiStride
	%assign push_num 0
	LOAD_2_PARA
	SIGN_EXTENSION r1, r1d

	movq		mm0, [sse2_dc_0x80]
	movq		mm1, mm0			; second copy, stores alternate between the two
	lea		r2, [r1+2*r1]			; 3*kiStride

	; rows 0..3
	movq		[r0], mm0
	movq		[r0+r1], mm1
	movq		[r0+2*r1], mm0
	movq		[r0+r2], mm1

	; rows 4..7
	lea		r0, [r0+4*r1]
	movq		[r0], mm0
	movq		[r0+r1], mm1
	movq		[r0+2*r1], mm0
	movq		[r0+r2], mm1

	emms						; leave MMX state before returning
	ret