ref: d7570bfa52f9ea383680ea9eb7b4d864ecd15e4f
parent: a8ab4afe5badef3fe336c9ad2abf181113bc92f9
parent: b6c4a5447c6ec67daf2a394b7927ea15bedbc5f7
author: zhilwang <[email protected]>
date: Fri Mar 18 04:50:33 EDT 2016
Merge pull request #2401 from saamas/decoder-use-encoder-x86-idct-routines

[Decoder] Use encoder x86 IDCT routines
--- a/codec/build/win32/dec/WelsDecCore.vcproj
+++ b/codec/build/win32/dec/WelsDecCore.vcproj
@@ -356,6 +356,46 @@
</FileConfiguration>
</File>
<File
+ RelativePath="..\..\..\common\x86\dct.asm"
+ >
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName)_common.obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName)_common.obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -DWIN64 -o $(IntDir)\$(InputName)_common.obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName)_common.obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName)_common.obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName)_common.obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -DWIN64 -o $(IntDir)\$(InputName)_common.obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName)_common.obj"
+ />
+ </FileConfiguration>
+ </File>
+ <File
RelativePath="..\..\..\decoder\core\x86\dct.asm"
>
<FileConfiguration
--- a/codec/build/win32/enc/WelsEncCore.vcproj
+++ b/codec/build/win32/enc/WelsEncCore.vcproj
@@ -828,6 +828,46 @@
</FileConfiguration>
</File>
<File
+ RelativePath="..\..\..\common\x86\dct.asm"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName)_common.obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName)_common.obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -DWIN64 -o $(IntDir)\$(InputName)_common.obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName)_common.obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName)_common.obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName)_common.obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -DWIN64 -o $(IntDir)\$(InputName)_common.obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName)_common.obj"
+ />
+ </FileConfiguration>
+ </File>
+ <File
RelativePath="..\..\..\encoder\core\x86\dct.asm"
>
<FileConfiguration
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -21,6 +21,7 @@
COMMON_ASM_SRCS=\
$(COMMON_SRCDIR)/x86/cpuid.asm\
+ $(COMMON_SRCDIR)/x86/dct.asm\
$(COMMON_SRCDIR)/x86/deblock.asm\
$(COMMON_SRCDIR)/x86/expand_picture.asm\
$(COMMON_SRCDIR)/x86/intra_pred_com.asm\
--- /dev/null
+++ b/codec/common/x86/dct.asm
@@ -1,0 +1,1013 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;*     * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;*     * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* dct.asm
+;*
+;* History
+;* 8/4/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
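+; Adapt a 3-argument IdctResAddPred-style call (pPred, iStride, pRs) to the
+; 5-argument WelsIDctT4Rec-style layout (pRec, iStride, pPred, iPredStride,
+; pDct): pPred and iStride are reused as both destination and prediction.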
+%macro LOAD_3_PARA_TO_5_PARA_IDCT 0
+%ifdef X86_32
+ push r3
+ push r4
+ %assign push_num push_num+2
+ mov r0, [esp + push_num*4 + 4]
+ mov r1, [esp + push_num*4 + 8]
+ mov r4, [esp + push_num*4 + 12]
+%else
+ mov r4, r2
+%endif
+ mov r2, r0
+ mov r3, r1
+%endmacro
+
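+; prefixed(a) prepends the underscore expected for global symbol names when
+; PREFIX is defined (cf. the -DPREFIX win32 nasm flags), so the jmp targets
+; below resolve to the decorated labels.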
+%ifdef PREFIX
+ %define prefixed(a) _ %+ a
+%else
+ %define prefixed(a) a
+%endif
+
+SECTION .rodata align=32
+
+;***********************************************************************
+; Constant
+;***********************************************************************
+
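+; Naming: p/m give the sign of each word, e.g. wels_p1m1p1m1w_128 is a
+; 128-bit vector of words (1,-1,1,-1,...); _256 denotes 256-bit constants.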
+align 32
+wels_shufb0312_movzxw_128:
+ db 0, 80h, 3, 80h, 1, 80h, 2, 80h, 4, 80h, 7, 80h, 5, 80h, 6, 80h
+wels_shufb2301_128:
+ db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+wels_shufb0231_128:
+ db 0, 2, 3, 1, 4, 6, 7, 5, 8, 10, 11, 9, 12, 14, 15, 13
+wels_dw32_128:
+ times 8 dw 32
+wels_p1m1p1m1w_256:
+ times 8 dw 1, -1
+wels_p1p2m1m2w_256:
+ times 4 dw 1, 2, -1, -2
+wels_p1p1m1m1w_256:
+ times 4 dw 1, 1, -1, -1
+wels_8xp1w_8xm1w:
+ times 8 dw 1
+ times 8 dw -1
+wels_4xp1w_4xm1w_256:
+ times 4 dw 1
+ times 4 dw -1
+ times 4 dw 1
+ times 4 dw -1
+wels_4xp1w_4xp2w_4xm1w_4xm2w:
+ times 4 dw 1
+ times 4 dw 2
+ times 4 dw -1
+ times 4 dw -2
+
+align 16
+wels_p1m1p1m1w_128:
+ times 4 dw 1, -1
+wels_p1p2p1p2w_128:
+ times 4 dw 1, 2
+wels_p1m1m1p1w_128:
+ times 2 dw 1, -1, -1, 1
+wels_p0m8000p0m8000w_128:
+ times 4 dw 0, -8000h
+wels_p1p1m1m1w_128:
+ times 2 dw 1, 1, -1, -1
+wels_4xp1w_4xp2w:
+ times 4 dw 1
+ times 4 dw 2
+wels_4xp0w_4xm8000w:
+ times 4 dw 0
+ times 4 dw -8000h
+
+SECTION .text
+
+;***********************************************************************
+; MMX functions
+;***********************************************************************
+
+%macro MMX_LoadDiff4P 5
+ movd %1, [%3]
+ movd %2, [%4]
+ punpcklbw %1, %5
+ punpcklbw %2, %5
+ psubw %1, %2
+%endmacro
+
+%macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
+ MMX_LoadDiff4P %1, %9, %5, %7, %10
+ MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
+ lea %5, [%5+2*%6]
+ lea %7, [%7+2*%8]
+ MMX_LoadDiff4P %3, %9, %5, %7, %10
+ MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
+%endmacro
+
+%macro MMX_SumSubMul2 3
+ movq %3, %1
+ psllw %1, $01
+ paddw %1, %2
+ psllw %2, $01
+ psubw %3, %2
+%endmacro
+
+%macro MMX_SumSubDiv2 3
+ movq %3, %2
+ psraw %3, $01
+ paddw %3, %1
+ psraw %1, $01
+ psubw %1, %2
+%endmacro
+
+%macro MMX_SumSub 3
+ movq %3, %2
+ psubw %2, %1
+ paddw %1, %3
+%endmacro
+
+%macro MMX_DCT 6
+ MMX_SumSub %4, %1, %6
+ MMX_SumSub %3, %2, %6
+ MMX_SumSub %3, %4, %6
+ MMX_SumSubMul2 %1, %2, %5
+%endmacro
+
+%macro MMX_IDCT 6
+ MMX_SumSub %4, %5, %6
+ MMX_SumSubDiv2 %3, %2, %1
+ MMX_SumSub %1, %4, %6
+ MMX_SumSub %3, %5, %6
+%endmacro
+
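+; Reconstruct 4 pixels: compute (residual + 32) >> 6, add it to the 4
+; predictor bytes, clamp to [0,255] and store.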
+%macro MMX_StoreDiff4P 6
+ movd %2, %6
+ punpcklbw %2, %4
+ paddw %1, %3
+ psraw %1, $06
+ paddsw %1, %2
+ packuswb %1, %2
+ movd %5, %1
+%endmacro
+
+;***********************************************************************
+; void WelsDctT4_mmx( int16_t *pDct[4], uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
+;***********************************************************************
+WELS_EXTERN WelsDctT4_mmx
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r4, r4d
+ WELS_Zero mm7
+
+ MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7
+
+ MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
+ MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2
+
+ MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
+ MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5
+
+ movq [r0+ 0], mm2
+ movq [r0+ 8], mm1
+ movq [r0+16], mm5
+ movq [r0+24], mm4
+ WELSEMMS
+ LOAD_5_PARA_POP
+ ret
+
+;***********************************************************************
+; void IdctResAddPred_mmx(uint8_t* pPred, int32_t iStride, int16_t* pDct);
+;***********************************************************************
+WELS_EXTERN IdctResAddPred_mmx
+ %assign push_num 0
+ LOAD_3_PARA_TO_5_PARA_IDCT
+ jmp prefixed(WelsIDctT4Rec_mmx.begin)
+
+;***********************************************************************
+; void WelsIDctT4Rec_mmx(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs)
+;***********************************************************************
+WELS_EXTERN WelsIDctT4Rec_mmx
+ %assign push_num 0
+ LOAD_5_PARA
+.begin:
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movq mm0, [r4+ 0]
+ movq mm1, [r4+ 8]
+ movq mm2, [r4+16]
+ movq mm3, [r4+24]
+
+ MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
+ MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
+ MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
+ MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
+
+ WELS_Zero mm7
+ WELS_DW32 mm6
+
+ MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2]
+ MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2]
+ MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
+
+ WELSEMMS
+ LOAD_5_PARA_POP
+ ret
+
+
+;***********************************************************************
+; SSE2 functions
+;***********************************************************************
+
+%macro SSE2_Store4x8p 6
+ movlps [%1+0x00], %2
+ movhps [%1+0x20], %2
+ movlps [%1+0x08], %3
+ movhps [%1+0x28], %3
+ movlps [%1+0x10], %4
+ movhps [%1+0x30], %4
+ movlps [%1+0x18], %5
+ movhps [%1+0x38], %5
+%endmacro
+
+%macro SSE2_Load4x8p 6
+ MOVDQ %2, [%1+0x00]
+ MOVDQ %4, [%1+0x10]
+ MOVDQ %6, [%1+0x20]
+ MOVDQ %3, [%1+0x30]
+ SSE2_XSawp qdq, %4, %3, %5
+ SSE2_XSawp qdq, %2, %6, %3
+%endmacro
+
+%macro SSE2_SumSubMul2 3
+ movdqa %3, %1
+ psllw %1, 1
+ paddw %1, %2
+ psllw %2, 1
+ psubw %3, %2
+%endmacro
+
+%macro SSE2_SumSubDiv2 4
+ movdqa %4, %1
+ movdqa %3, %2
+ psraw %2, $01
+ psraw %4, $01
+ paddw %1, %2
+ psubw %4, %3
+%endmacro
+
+%macro SSE2_StoreDiff16p 9
+ paddw %1, %4
+ psraw %1, $06
+ movq %3, %7
+ punpcklbw %3, %5
+ paddsw %1, %3
+ paddw %2, %4
+ psraw %2, $06
+ movq %3, %9
+ punpcklbw %3, %5
+ paddsw %2, %3
+ packuswb %1, %2
+ movlps %6, %1
+ movhps %8, %1
+%endmacro
+
+%macro SSE2_StoreDiff8p 5
+ movq %2, %5
+ punpcklbw %2, %3
+ paddsw %2, %1
+ packuswb %2, %2
+ movq %4, %2
+%endmacro
+
+%macro SSE2_Load2x4P 2
+ MOVDQ %1, [%2]
+%endmacro
+
+%macro SSE2_Store2x4P 2
+ MOVDQ [%1], %2
+%endmacro
+
+; out=%1 pPixel1Line1=%2 pPixel1Line2=%3 pPixel2Line1=%4 pPixel2Line2=%5 zero=%6 clobber=%7,%8
+%macro SSE2_LoadDiff2x4P 8
+ movd %1, [%2]
+ movd %7, [%3]
+ punpckldq %1, %7
+ punpcklbw %1, %6
+ movd %7, [%4]
+ movd %8, [%5]
+ punpckldq %7, %8
+ punpcklbw %7, %6
+ psubw %1, %7
+%endmacro
+
+; pRec1=%1 pRec2=%2 data=%3 pPred1=%4 pPred2=%5 dw32=%6 zero=%7 clobber=%8,%9
+%macro SSE2_StoreDiff2x4P 9
+ paddw %3, %6
+ psraw %3, 6
+ movd %8, [%4]
+ movd %9, [%5]
+ punpckldq %8, %9
+ punpcklbw %8, %7
+ paddsw %3, %8
+ packuswb %3, %3
+ movd [%1], %3
+ psrlq %3, 32
+ movd [%2], %3
+%endmacro
+
+%macro SSE2_Load8DC 6
+ movdqa %1, %6 ; %1 = dc0 dc1
+ paddw %1, %5
+ psraw %1, $06 ; (dc + 32) >> 6
+
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklwd %2, %2
+ punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
+
+ movdqa %3, %1
+ psrldq %3, 8
+ punpcklwd %3, %3
+ punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
+
+ movdqa %4, %1
+ psrldq %4, 12
+ punpcklwd %4, %4
+ punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
+
+ punpcklwd %1, %1
+ punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
+%endmacro
+
+%macro SSE2_DCT 6
+ SSE2_SumSub %6, %3, %5
+ SSE2_SumSub %1, %2, %5
+ SSE2_SumSub %3, %2, %5
+ SSE2_SumSubMul2 %6, %1, %4
+%endmacro
+
+%macro SSE2_IDCT 7
+ SSE2_SumSub %7, %2, %6
+ SSE2_SumSubDiv2 %1, %3, %5, %4
+ SSE2_SumSub %2, %1, %5
+ SSE2_SumSub %7, %4, %5
+%endmacro
+
+; Do 2 horizontal 4-pt DCTs in parallel packed as 8 words in an xmm register.
+; out=%1 in=%1 clobber=%2
+%macro SSE2_DCT_HORIZONTAL 2
+ pshuflw %2, %1, 1bh ; [x[3],x[2],x[1],x[0]] low qw
+ pmullw %1, [wels_p1m1p1m1w_128] ; [x[0],-x[1],x[2],-x[3], ...]
+ pshufhw %2, %2, 1bh ; [x[3],x[2],x[1],x[0]] high qw
+ paddw %1, %2 ; s = [x[0]+x[3],-x[1]+x[2],x[2]+x[1],-x[3]+x[0], ...]
+ pshufd %2, %1, 0b1h ; [s[2],s[3],s[0],s[1], ...]
+ pmullw %1, [wels_p1m1m1p1w_128] ; [s[0],-s[1],-s[2],s[3], ...]
+    pmullw %2, [wels_p1p2p1p2w_128] ; [s[2],2*s[3],s[0],2*s[1], ...]
+ paddw %1, %2 ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
+%endmacro
+
+; Do 2 horizontal 4-pt IDCTs in parallel packed as 8 words in an xmm register.
+;
+; Use a multiply by reciprocal to get -x>>1, and x+=-x>>1 to get x>>1, which
+; avoids a cumbersome blend with SSE2 to get a vector with right-shifted odd
+; elements.
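+; (pmulhw with -8000h gives (x * -8000h) >> 16 = floor(-x/2) per word, and
+; x + floor(-x/2) = x >> 1 exactly for every int16 x.)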
+;
+; out=%1 in=%1 wels_p1m1m1p1w_128=%2 clobber=%3,%4
+%macro SSE2_IDCT_HORIZONTAL 4
+ movdqa %3, [wels_p0m8000p0m8000w_128]
+ pmulhw %3, %1 ; x[0:7] * [0,-8000h,0,-8000h, ...] >> 16
+ pshufd %4, %1, 0b1h ; [x[2],x[3],x[0],x[1], ...]
+ pmullw %4, %2 ; [x[2],-x[3],-x[0],x[1], ...]
+ paddw %1, %3 ; [x[0]+0,x[1]+(-x[1]>>1),x[2]+0,x[3]+(-x[3]>>1), ...]
+ paddw %1, %4 ; s = [x[0]+x[2],(x[1]>>1)-x[3],x[2]-x[0],(x[3]>>1)+x[1], ...]
+ pshuflw %3, %1, 1bh ; [s[3],s[2],s[1],s[0]] low qw
+ pmullw %1, [wels_p1p1m1m1w_128] ; [s[0],s[1],-s[2],-s[3], ...]
+ pshufhw %3, %3, 1bh ; [s[3],s[2],s[1],s[0]] high qw
+ pmullw %3, %2 ; [s[3],-s[2],-s[1],s[0], ...]
+ paddw %1, %3 ; y = [s[0]+s[3],s[1]-s[2],-s[2]-s[1],-s[3]+s[0], ...]
+%endmacro
+
+; Do 4 vertical 4-pt DCTs in parallel packed as 16 words in 2 xmm registers.
+; Uses scrambled input to save a negation.
+; [y0,y1]=%1 [y2,y3]=%2 [x1,x0]=%1 [x2,x3]=%2 clobber=%3
+%macro SSE2_DCT_4x4P 3
+ movdqa %3, %1
+ psubw %1, %2 ; [x1-x2,x0-x3]
+ paddw %2, %3 ; [x1+x2,x0+x3]
+ movdqa %3, %2
+ punpckhqdq %2, %1 ; s03 = [x0+x3,x0-x3]
+ punpcklqdq %3, %1 ; s12 = [x1+x2,x1-x2]
+ movdqa %1, %2
+ pmullw %1, [wels_4xp1w_4xp2w] ; [s03[0],2*s03[1]]
+ paddw %1, %3 ; [y0,y1] = [s03[0]+s12[0],2*s03[1]+s12[1]]
+ pmullw %3, [wels_4xp1w_4xp2w] ; [s12[0],2*s12[1]]
+ psubw %2, %3 ; [y2,y3] = [s03[0]-s12[0],s03[1]-2*s12[1]]
+%endmacro
+
+; Do 4 vertical 4-pt IDCTs in parallel packed as 16 words in 2 xmm registers.
+; Output is scrambled to save a negation.
+; [y1,y0]=%1 [y2,y3]=%2 [x0,x1]=%1 [x2,x3]=%2 clobber=%3,%4
+%macro SSE2_IDCT_4x4P 4
+ movdqa %4, [wels_4xp0w_4xm8000w]
+ movdqa %3, %1
+ pmulhw %3, %4 ; x[0:1] * [0,-8000h] >> 16
+ pmulhw %4, %2 ; x[2:3] * [0,-8000h] >> 16
+ paddw %3, %1 ; [x[0],x[1]>>1]
+ paddw %4, %2 ; [x[2],x[3]>>1]
+ psubw %3, %2 ; [x[0]-x[2],(x[1]>>1)-x[3]]
+ paddw %1, %4 ; [x[2]+x[0],(x[3]>>1)+x[1]]
+ movdqa %2, %3
+ punpckhqdq %3, %1 ; s13 = [(x[1]>>1)-x[3],(x[3]>>1)+x[1]]
+ punpcklqdq %2, %1 ; s02 = [x[0]-x[2], x[2]+x[0]]
+ movdqa %1, %2
+ paddw %1, %3 ; [y1,y0] = [s02[0]+s13[0],s02[1]+s13[1]]
+ psubw %2, %3 ; [y2,y3] = [s02[0]-s13[0],s02[1]-s13[1]]
+%endmacro
+
+;***********************************************************************
+; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
+;***********************************************************************
+WELS_EXTERN WelsDctFourT4_sse2
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r4, r4d
+ pxor xmm7, xmm7
+ ;Load 4x8
+ SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3]
+ SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
+ lea r1, [r1 + 2 * r2]
+ lea r3, [r3 + 2 * r4]
+ SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
+ SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
+
+ SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
+ SSE2_DCT_HORIZONTAL xmm2, xmm5
+ SSE2_DCT_HORIZONTAL xmm0, xmm5
+ SSE2_DCT_HORIZONTAL xmm3, xmm5
+ SSE2_DCT_HORIZONTAL xmm4, xmm5
+
+ SSE2_Store4x8p r0, xmm2, xmm0, xmm3, xmm4, xmm1
+
+ lea r1, [r1 + 2 * r2]
+ lea r3, [r3 + 2 * r4]
+
+ ;Load 4x8
+ SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ]
+ SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4]
+ lea r1, [r1 + 2 * r2]
+ lea r3, [r3 + 2 * r4]
+ SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
+ SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
+
+ SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
+ SSE2_DCT_HORIZONTAL xmm2, xmm5
+ SSE2_DCT_HORIZONTAL xmm0, xmm5
+ SSE2_DCT_HORIZONTAL xmm3, xmm5
+ SSE2_DCT_HORIZONTAL xmm4, xmm5
+
+ SSE2_Store4x8p r0+64, xmm2, xmm0, xmm3, xmm4, xmm1
+
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+;***********************************************************************
+; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
+;***********************************************************************
+WELS_EXTERN WelsIDctFourT4Rec_sse2
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ ;Load 4x8
+ SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
+
+ movdqa xmm7, [wels_p1m1m1p1w_128]
+ SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6
+ SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6
+ SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6
+ SSE2_IDCT_HORIZONTAL xmm2, xmm7, xmm5, xmm6
+ SSE2_IDCT xmm1, xmm4, xmm2, xmm3, xmm5, xmm6, xmm0
+
+ WELS_Zero xmm7
+ WELS_DW32 xmm6
+
+ SSE2_StoreDiff16p xmm1, xmm3, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff16p xmm0, xmm4, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_Load4x8p r4+64, xmm0, xmm1, xmm4, xmm2, xmm5
+
+ movdqa xmm7, [wels_p1m1m1p1w_128]
+ SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6
+ SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6
+ SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6
+ SSE2_IDCT_HORIZONTAL xmm2, xmm7, xmm5, xmm6
+ SSE2_IDCT xmm1, xmm4, xmm2, xmm3, xmm5, xmm6, xmm0
+
+ WELS_Zero xmm7
+ WELS_DW32 xmm6
+
+ SSE2_StoreDiff16p xmm1, xmm3, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff16p xmm0, xmm4, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+;***********************************************************************
+; void WelsDctT4_sse2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
+;***********************************************************************
+WELS_EXTERN WelsDctT4_sse2
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 5
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r4, r4d
+
+ WELS_Zero xmm2
+ SSE2_LoadDiff2x4P xmm0, r1+r2, r1, r3+r4, r3, xmm2, xmm3, xmm4
+ add r1, r2
+ add r3, r4
+ SSE2_LoadDiff2x4P xmm1, r1+r2, r1+2*r2, r3+r4, r3+2*r4, xmm2, xmm3, xmm4
+ SSE2_DCT_HORIZONTAL xmm0, xmm3
+ SSE2_DCT_HORIZONTAL xmm1, xmm3
+ SSE2_DCT_4x4P xmm0, xmm1, xmm3
+ SSE2_Store2x4P r0, xmm0
+ SSE2_Store2x4P r0+16, xmm1
+
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+;***********************************************************************
+; void IdctResAddPred_sse2(uint8_t* pPred, int32_t iStride, int16_t* pDct);
+;***********************************************************************
+WELS_EXTERN IdctResAddPred_sse2
+ %assign push_num 0
+ LOAD_3_PARA_TO_5_PARA_IDCT
+ jmp prefixed(WelsIDctT4Rec_sse2.begin)
+
+;***********************************************************************
+; void WelsIDctT4Rec_sse2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);
+;***********************************************************************
+WELS_EXTERN WelsIDctT4Rec_sse2
+ %assign push_num 0
+ LOAD_5_PARA
+.begin:
+ PUSH_XMM 6
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+
+ SSE2_Load2x4P xmm0, r4
+ SSE2_Load2x4P xmm1, r4+16
+ movdqa xmm4, [wels_p1m1m1p1w_128]
+ SSE2_IDCT_HORIZONTAL xmm0, xmm4, xmm2, xmm3
+ SSE2_IDCT_HORIZONTAL xmm1, xmm4, xmm2, xmm3
+ SSE2_IDCT_4x4P xmm0, xmm1, xmm2, xmm3
+ WELS_Zero xmm4
+ WELS_DW32 xmm5
+ SSE2_StoreDiff2x4P r0+r1, r0, xmm0, r2+r3, r2, xmm5, xmm4, xmm2, xmm3
+ add r0, r1
+ add r2, r3
+ SSE2_StoreDiff2x4P r0+r1, r0+2*r1, xmm1, r2+r3, r2+2*r3, xmm5, xmm4, xmm2, xmm3
+
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+%macro SSE2_StoreDiff4x8p 8
+ SSE2_StoreDiff8p %1, %3, %4, [%5], [%6]
+ SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]
+ SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8]
+ SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
+%endmacro
+
+ ;***********************************************************************
+; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
+;***********************************************************************
+WELS_EXTERN WelsIDctRecI16x16Dc_sse2
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm7, xmm7
+ WELS_DW32 xmm6
+
+ SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+
+ SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+
+;***********************************************************************
+; AVX2 functions
+;***********************************************************************
+
+; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 wels_shufb0312_movzxw=%6 clobber=%7,%8
+%macro AVX2_LoadDiff16P 8
+ vmovq x%1, [%2 ]
+ vpbroadcastq y%7, [%2 + 4 * %3]
+ vpblendd y%1, y%1, y%7, 11110000b
+ vpshufb y%1, y%1, y%6
+ vmovq x%7, [%4 ]
+ vpbroadcastq y%8, [%4 + 4 * %5]
+ vpblendd y%7, y%7, y%8, 11110000b
+ vpshufb y%7, y%7, y%6
+ vpsubw y%1, y%1, y%7
+%endmacro
+
+; pRec=%1 iStride=%2 data=%3,%4 pPred=%5 iPredStride=%6 dw32=%7 wels_shufb0312_movzxw=%8 clobber=%9,%10
+%macro AVX2_StoreDiff32P 10
+ vpaddw y%3, y%3, y%7
+ vpsraw y%3, y%3, 6
+ vmovq x%9, [%5 ]
+ vpbroadcastq y%10, [%5 + 4 * %6]
+ add %5, %6
+ vpblendd y%9, y%9, y%10, 11110000b
+ vpshufb y%9, y%9, y%8
+ vpaddsw y%3, y%3, y%9
+ vpaddw y%4, y%4, y%7
+ vpsraw y%4, y%4, 6
+ vmovq x%9, [%5 ]
+ vpbroadcastq y%10, [%5 + 4 * %6]
+ vpblendd y%9, y%9, y%10, 11110000b
+ vpshufb y%9, y%9, y%8
+ vpaddsw y%4, y%4, y%9
+ vpackuswb y%3, y%3, y%4
+ vbroadcasti128 y%4, [wels_shufb0231_128]
+ vpshufb y%3, y%3, y%4
+ vextracti128 x%4, y%3, 1
+ vmovlps [%1 ], x%3
+ vmovlps [%1 + 4 * %2], x%4
+ add %1, %2
+ vmovhps [%1 ], x%3
+ vmovhps [%1 + 4 * %2], x%4
+%endmacro
+
+; out=%1,%2,%3,%4 pDct=%5 clobber=%6
+%macro AVX2_Load4x16P 6
+ vmovdqa x%2, [%5+0x00]
+ vinserti128 y%2, y%2, [%5+0x40], 1
+ vmovdqa x%6, [%5+0x20]
+ vinserti128 y%6, y%6, [%5+0x60], 1
+ vpunpcklqdq y%1, y%2, y%6
+ vpunpckhqdq y%2, y%2, y%6
+ vmovdqa x%4, [%5+0x10]
+ vinserti128 y%4, y%4, [%5+0x50], 1
+ vmovdqa x%6, [%5+0x30]
+ vinserti128 y%6, y%6, [%5+0x70], 1
+ vpunpcklqdq y%3, y%4, y%6
+ vpunpckhqdq y%4, y%4, y%6
+%endmacro
+
+; pDct=%1 data=%1,%2,%3,%4 clobber=%5
+%macro AVX2_Store4x16P 6
+ vpunpcklqdq y%6, y%2, y%3
+ vmovdqa [%1+0x00], x%6
+ vextracti128 [%1+0x40], y%6, 1
+ vpunpckhqdq y%6, y%2, y%3
+ vmovdqa [%1+0x20], x%6
+ vextracti128 [%1+0x60], y%6, 1
+ vpunpcklqdq y%6, y%4, y%5
+ vmovdqa [%1+0x10], x%6
+ vextracti128 [%1+0x50], y%6, 1
+ vpunpckhqdq y%6, y%4, y%5
+ vmovdqa [%1+0x30], x%6
+ vextracti128 [%1+0x70], y%6, 1
+%endmacro
+
+%macro AVX2_Load4x4P 2
+ vmovdqu y%1, [%2]
+%endmacro
+
+%macro AVX2_Store4x4P 2
+ vmovdqu [%1], y%2
+%endmacro
+
+; Load 4 lines of 4 pixels, shuffle and zero extend to 16-bit.
+; out=%1 pPixel=%2 iStride=%3 [wels_shufb0312_movzxw]=%4 clobber=%5,%6
+%macro AVX2_Loadzx4x4P 6
+ vmovd x%1, [%2 ]
+ add %2, %3
+ vpbroadcastd x%5, [%2 + 2 * %3]
+ vpblendd x%1, x%1, x%5, 1010b
+ vpbroadcastd y%5, [%2 ]
+ vpbroadcastd y%6, [%2 + %3]
+ vpblendd y%5, y%5, y%6, 10101010b
+ vpblendd y%1, y%1, y%5, 11110000b
+ vpshufb y%1, y%1, %4
+%endmacro
+
+; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 wels_shufb0312_movzxw=%6 clobber=%7,%8,%9
+%macro AVX2_LoadDiff4x4P 9
+ AVX2_Loadzx4x4P %1, %2, %3, y%6, %7, %8
+ AVX2_Loadzx4x4P %7, %4, %5, y%6, %8, %9
+ vpsubw y%1, y%1, y%7
+%endmacro
+
+; pRec=%1 iStride=%2 data=%3 pPred=%4 iPredStride=%5 dw32=%6 wels_shufb0312_movzxw=%7 clobber=%8,%9,%10
+%macro AVX2_StoreDiff4x4P 10
+ vpaddw y%3, y%3, y%6
+ vpsraw y%3, y%3, 6
+ AVX2_Loadzx4x4P %8, %4, %5, y%7, %9, %10
+ vpaddsw y%3, y%3, y%8
+ vpackuswb y%3, y%3, y%3
+ vbroadcasti128 y%8, [wels_shufb0231_128]
+ vpshufb y%3, y%3, y%8
+ vextracti128 x%8, y%3, 1
+ vmovd [%1 ], x%3
+ add %1, %2
+ vmovd [%1 ], x%8
+ vpsrlq x%8, x%8, 32
+ vmovd [%1 + %2], x%8
+ vpsrlq x%3, x%3, 32
+ vmovd [%1 + 2 * %2], x%3
+%endmacro
+
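+; The 4-pt butterflies below (and the horizontal/vertical variants) implement
+; the H.264 4x4 integer core transform pair: forward rows (1,1,1,1),
+; (2,1,-1,-2), (1,-1,-1,1), (1,-2,2,-1); the inverse applies >>1 to x1 and x3
+; instead of a multiply by 1/2.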
+; 4-pt DCT
+; out=%1,%2,%3,%4 in=%1,%2,%3,%4 clobber=%5
+%macro AVX2_DCT 5
+ vpsubw %5, %1, %4 ; s3 = x0 - x3
+ vpaddw %1, %1, %4 ; s0 = x0 + x3
+ vpsubw %4, %2, %3 ; s2 = x1 - x2
+ vpaddw %2, %2, %3 ; s1 = x1 + x2
+ vpsubw %3, %1, %2 ; y2 = s0 - s1
+ vpaddw %1, %1, %2 ; y0 = s0 + s1
+ vpsllw %2, %5, 1
+ vpaddw %2, %2, %4 ; y1 = 2 * s3 + s2
+ vpsllw %4, %4, 1
+ vpsubw %4, %5, %4 ; y3 = s3 - 2 * s2
+%endmacro
+
+; 4-pt IDCT
+; out=%1,%2,%3,%4 in=%1,%2,%3,%4 clobber=%5
+%macro AVX2_IDCT 5
+ vpsraw %5, %2, 1
+ vpsubw %5, %5, %4 ; t3 = (x1 >> 1) - x3
+ vpsraw %4, %4, 1
+ vpaddw %4, %2, %4 ; t2 = x1 + (x3 >> 1)
+ vpaddw %2, %1, %3 ; t0 = x0 + x2
+ vpsubw %3, %1, %3 ; t1 = x0 - x2
+ vpaddw %1, %2, %4 ; y0 = t0 + t2
+ vpsubw %4, %2, %4 ; y3 = t0 - t2
+ vpaddw %2, %3, %5 ; y1 = t1 + t3
+ vpsubw %3, %3, %5 ; y2 = t1 - t3
+%endmacro
+
+; Do 4 horizontal 4-pt DCTs in parallel packed as 16 words in a ymm register.
+; Uses scrambled input to save a negation.
+; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 wels_shufb2301=%2 clobber=%3
+%macro AVX2_DCT_HORIZONTAL 3
+ vpsignw %3, %1, [wels_p1m1p1m1w_256] ; [x0,-x3,x1,-x2]
+ vpshufb %1, %1, %2 ; [x3,x0,x2,x1]
+ vpaddw %1, %1, %3 ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1]
+ vpmullw %3, %1, [wels_p1p2m1m2w_256] ; [s[0],2*s[1],-s[2],-2*s[3], ...]
+ vpshufd %1, %1, 0b1h ; [s[2],s[3],s[0],s[1], ...]
+ vpaddw %1, %1, %3 ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1], ...]
+%endmacro
+
+; Do 4 horizontal 4-pt IDCTs in parallel packed as 16 words in a ymm register.
+; Output is scrambled to save a negation.
+; [y0,y3,y1,y2]=%1 [x0,x1,x2,x3]=%1 wels_shufb2301=%2 clobber=%3
+%macro AVX2_IDCT_HORIZONTAL 3
+ vpsraw %3, %1, 1 ; [x0>>1,x1>>1,x2>>1,x3>>1]
+ vpblendw %3, %1, %3, 10101010b ; [x0,x1>>1,x2,x3>>1]
+ vpsignw %1, %1, [wels_p1p1m1m1w_256] ; [x0,x1,-x2,-x3]
+ vpshufd %3, %3, 0b1h ; [x2,x3>>1,x0,x1>>1]
+ vpaddw %1, %3, %1 ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3]
+ vpshufb %3, %1, %2 ; [s[1],s[0],s[3],s[2], ...]
+ vpsignw %1, %1, [wels_p1m1p1m1w_256] ; [s[0],-s[1],s[2],-s[3], ...]
+ vpaddw %1, %1, %3 ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2], ...]
+%endmacro
+
+; Do 4 vertical 4-pt DCTs in parallel packed as 16 words in a ymm register.
+; Uses scrambled input to save a negation.
+; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 clobber=%2
+%macro AVX2_DCT_4x4P 2
+ vpsignw %2, %1, [wels_4xp1w_4xm1w_256] ; [x0,-x3,x1,-x2]
+ vpshufd %1, %1, 4eh ; [x3,x0,x2,x1]
+ vpaddw %1, %1, %2 ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1]
+ vpmullw %2, %1, [wels_4xp1w_4xp2w_4xm1w_4xm2w] ; [s[0],2*s[1],-s[2],-2*s[3]]
+ vpermq %1, %1, 4eh ; [s[2],s[3],s[0],s[1]]
+ vpaddw %1, %1, %2 ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1]]
+%endmacro
+
+; Do 4 vertical 4-pt IDCTs in parallel packed as 16 words in a ymm register.
+; Output is scrambled to save a negation.
+; [y0,y3,y1,y2]=%1 [x0,x1,x2,x3]=%1 clobber=%2
+%macro AVX2_IDCT_4x4P 2
+ vpsraw %2, %1, 1 ; [x0>>1,x1>>1,x2>>1,x3>>1]
+ vpblendw %2, %1, %2, 11110000b ; [x0,x1>>1,x2,x3>>1]
+ vpsignw %1, %1, [wels_8xp1w_8xm1w] ; [x0,x1,-x2,-x3]
+ vpermq %2, %2, 4eh ; [x2,x3>>1,x0,x1>>1]
+ vpaddw %1, %2, %1 ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3]
+ vpshufd %2, %1, 4eh ; [s[1],s[0],s[3],s[2]]
+ vpmullw %1, %1, [wels_4xp1w_4xm1w_256] ; [s[0],-s[1],s[2],-s[3], ...]
+ vpaddw %1, %1, %2 ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2]]
+%endmacro
+
+;***********************************************************************
+; void WelsDctFourT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
+;***********************************************************************
+WELS_EXTERN WelsDctFourT4_avx2
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 7
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r4, r4d
+
+ vbroadcasti128 ymm6, [wels_shufb0312_movzxw_128]
+
+ ;Load 4x16
+ AVX2_LoadDiff16P mm0, r1, r2, r3, r4, mm6, mm4, mm5
+ add r1, r2
+ add r3, r4
+ AVX2_LoadDiff16P mm1, r1, r2, r3, r4, mm6, mm4, mm5
+ add r1, r2
+ add r3, r4
+ AVX2_LoadDiff16P mm2, r1, r2, r3, r4, mm6, mm4, mm5
+ add r1, r2
+ add r3, r4
+ AVX2_LoadDiff16P mm3, r1, r2, r3, r4, mm6, mm4, mm5
+
+ AVX2_DCT ymm0, ymm1, ymm2, ymm3, ymm5
+ vbroadcasti128 ymm6, [wels_shufb2301_128]
+ AVX2_DCT_HORIZONTAL ymm0, ymm6, ymm5
+ AVX2_DCT_HORIZONTAL ymm1, ymm6, ymm5
+ AVX2_DCT_HORIZONTAL ymm2, ymm6, ymm5
+ AVX2_DCT_HORIZONTAL ymm3, ymm6, ymm5
+
+ AVX2_Store4x16P r0, mm0, mm1, mm2, mm3, mm5
+ vzeroupper
+
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+;***********************************************************************
+; void IdctFourResAddPred_avx2(uint8_t* pPred, int32_t iStride, const int16_t* pDct, const int8_t* pNzc);
+;***********************************************************************
+WELS_EXTERN IdctFourResAddPred_avx2
+ %assign push_num 0
+ LOAD_3_PARA_TO_5_PARA_IDCT
+ jmp prefixed(WelsIDctFourT4Rec_avx2.begin)
+
+;***********************************************************************
+; void WelsIDctFourT4Rec_avx2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);
+;***********************************************************************
+WELS_EXTERN WelsIDctFourT4Rec_avx2
+ %assign push_num 0
+ LOAD_5_PARA
+.begin:
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+
+ AVX2_Load4x16P mm0, mm1, mm2, mm3, r4, mm5
+ vbroadcasti128 ymm6, [wels_shufb2301_128]
+ AVX2_IDCT_HORIZONTAL ymm0, ymm6, ymm5
+ AVX2_IDCT_HORIZONTAL ymm1, ymm6, ymm5
+ AVX2_IDCT_HORIZONTAL ymm2, ymm6, ymm5
+ AVX2_IDCT_HORIZONTAL ymm3, ymm6, ymm5
+ AVX2_IDCT ymm0, ymm1, ymm2, ymm3, ymm5
+
+ vbroadcasti128 ymm6, [wels_shufb0312_movzxw_128]
+ vbroadcasti128 ymm7, [wels_dw32_128]
+ AVX2_StoreDiff32P r0, r1, mm0, mm1, r2, r3, mm7, mm6, mm5, mm4
+ add r2, r3
+ add r0, r1
+ AVX2_StoreDiff32P r0, r1, mm2, mm3, r2, r3, mm7, mm6, mm5, mm4
+ vzeroupper
+
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+;***********************************************************************
+; void WelsDctT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
+;***********************************************************************
+WELS_EXTERN WelsDctT4_avx2
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 5
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r4, r4d
+
+ vbroadcasti128 ymm1, [wels_shufb0312_movzxw_128]
+ AVX2_LoadDiff4x4P mm0, r1, r2, r3, r4, mm1, mm2, mm3, mm4
+ AVX2_DCT_4x4P ymm0, ymm2
+ vbroadcasti128 ymm1, [wels_shufb2301_128]
+ AVX2_DCT_HORIZONTAL ymm0, ymm1, ymm2
+ AVX2_Store4x4P r0, mm0
+ vzeroupper
+
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+;***********************************************************************
+; void IdctResAddPred_avx2(uint8_t* pPred, int32_t iStride, int16_t* pDct);
+;***********************************************************************
+WELS_EXTERN IdctResAddPred_avx2
+ %assign push_num 0
+ LOAD_3_PARA_TO_5_PARA_IDCT
+ jmp prefixed(WelsIDctT4Rec_avx2.begin)
+
+;***********************************************************************
+; void WelsIDctT4Rec_avx2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);
+;***********************************************************************
+WELS_EXTERN WelsIDctT4Rec_avx2
+ %assign push_num 0
+ LOAD_5_PARA
+.begin:
+ PUSH_XMM 6
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+
+ AVX2_Load4x4P mm0, r4
+ vbroadcasti128 ymm4, [wels_shufb2301_128]
+ AVX2_IDCT_HORIZONTAL ymm0, ymm4, ymm1
+ AVX2_IDCT_4x4P ymm0, ymm1
+ vbroadcasti128 ymm4, [wels_shufb0312_movzxw_128]
+ vbroadcasti128 ymm5, [wels_dw32_128]
+ AVX2_StoreDiff4x4P r0, r1, mm0, r2, r3, mm5, mm4, mm1, mm2, mm3
+ vzeroupper
+
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
--- a/codec/decoder/core/inc/decode_mb_aux.h
+++ b/codec/decoder/core/inc/decode_mb_aux.h
@@ -47,6 +47,9 @@
#if defined(X86_ASM)
void IdctResAddPred_mmx (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
+void IdctResAddPred_sse2 (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
+void IdctResAddPred_avx2 (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
+void IdctFourResAddPred_avx2 (uint8_t* pPred, int32_t iStride, int16_t* pRs, const int8_t* pNzc);
#endif//X86_ASM
#if defined(HAVE_NEON)
--- a/codec/decoder/core/inc/decoder_context.h
+++ b/codec/decoder/core/inc/decoder_context.h
@@ -136,6 +136,7 @@
/*typedef for get intra predictor func pointer*/
typedef void (*PGetIntraPredFunc) (uint8_t* pPred, const int32_t kiLumaStride);
typedef void (*PIdctResAddPredFunc) (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
+typedef void (*PIdctFourResAddPredFunc) (uint8_t* pPred, int32_t iStride, int16_t* pRs, const int8_t* pNzc);
typedef void (*PExpandPictureFunc) (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicWidth,
const int32_t kiPicHeight);
@@ -389,6 +390,7 @@
PGetIntraPredFunc pGetI4x4LumaPredFunc[14]; // h264_predict_4x4_t
PGetIntraPredFunc pGetIChromaPredFunc[7]; // h264_predict_8x8_t
PIdctResAddPredFunc pIdctResAddPredFunc;
+ PIdctFourResAddPredFunc pIdctFourResAddPredFunc;
SMcFunc sMcFunc;
//Transform8x8
PGetIntraPred8x8Func pGetI8x8LumaPredFunc[14];
--- a/codec/decoder/core/src/decode_slice.cpp
+++ b/codec/decoder/core/src/decode_slice.cpp
@@ -168,28 +168,21 @@
}
}
} else {
- for (i = 0; i < 16; i++) { //luma
- iIndex = g_kuiMbCountScan4Idx[i];
- if (pCurLayer->pNzc[iMbXy][iIndex]) {
- iOffset = ((iIndex >> 2) << 2) * iStrideL + ((iIndex % 4) << 2);
- pCtx->pIdctResAddPredFunc (pDstY + iOffset, iStrideL, pCurLayer->pScaledTCoeff[iMbXy] + (i << 4));
- }
- }
+ // luma.
+ const int8_t* pNzc = pCurLayer->pNzc[iMbXy];
+ int16_t* pScaledTCoeff = pCurLayer->pScaledTCoeff[iMbXy];
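+      // Each call covers an 8x8 quadrant: 4 blocks of 16 coefficients (64
+      // int16) and an Nzc table laid out with a stride of 4 per block row.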
+ pCtx->pIdctFourResAddPredFunc (pDstY + 0 * iStrideL + 0, iStrideL, pScaledTCoeff + 0 * 64, pNzc + 0);
+ pCtx->pIdctFourResAddPredFunc (pDstY + 0 * iStrideL + 8, iStrideL, pScaledTCoeff + 1 * 64, pNzc + 2);
+ pCtx->pIdctFourResAddPredFunc (pDstY + 8 * iStrideL + 0, iStrideL, pScaledTCoeff + 2 * 64, pNzc + 8);
+ pCtx->pIdctFourResAddPredFunc (pDstY + 8 * iStrideL + 8, iStrideL, pScaledTCoeff + 3 * 64, pNzc + 10);
}
- for (i = 0; i < 4; i++) { //chroma
- iIndex = g_kuiMbCountScan4Idx[i + 16]; //Cb
- if (pCurLayer->pNzc[iMbXy][iIndex] || * (pCurLayer->pScaledTCoeff[iMbXy] + ((i + 16) << 4))) {
- iOffset = (((iIndex - 16) >> 2) << 2) * iStrideC + (((iIndex - 16) % 4) << 2);
- pCtx->pIdctResAddPredFunc (pDstU + iOffset, iStrideC, pCurLayer->pScaledTCoeff[iMbXy] + ((i + 16) << 4));
- }
-
- iIndex = g_kuiMbCountScan4Idx[i + 20]; //Cr
- if (pCurLayer->pNzc[iMbXy][iIndex] || * (pCurLayer->pScaledTCoeff[iMbXy] + ((i + 20) << 4))) {
- iOffset = (((iIndex - 18) >> 2) << 2) * iStrideC + (((iIndex - 18) % 4) << 2);
- pCtx->pIdctResAddPredFunc (pDstV + iOffset, iStrideC , pCurLayer->pScaledTCoeff[iMbXy] + ((i + 20) << 4));
- }
- }
+ const int8_t* pNzc = pCurLayer->pNzc[iMbXy];
+ int16_t* pScaledTCoeff = pCurLayer->pScaledTCoeff[iMbXy];
+ // Cb.
+ pCtx->pIdctFourResAddPredFunc (pDstU, iStrideC, pScaledTCoeff + 4 * 64, pNzc + 16);
+ // Cr.
+ pCtx->pIdctFourResAddPredFunc (pDstV, iStrideC, pScaledTCoeff + 5 * 64, pNzc + 18);
return ERR_NONE;
}
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -848,6 +848,22 @@
DeblockingInit (&pCtx->sDeblockingFunc, uiCpuFlag);
}
+namespace {
+
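+// Expand a four-block IDCT call into four single-block calls over an 8x8
+// region, skipping blocks whose Nzc entry and DC coefficient are both zero.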
+template<void pfIdctResAddPred (uint8_t* pPred, int32_t iStride, int16_t* pRs)>
+void IdctFourResAddPred_ (uint8_t* pPred, int32_t iStride, int16_t* pRs, const int8_t* pNzc) {
+ if (pNzc[0] || pRs[0 * 16])
+ pfIdctResAddPred (pPred + 0 * iStride + 0, iStride, pRs + 0 * 16);
+ if (pNzc[1] || pRs[1 * 16])
+ pfIdctResAddPred (pPred + 0 * iStride + 4, iStride, pRs + 1 * 16);
+ if (pNzc[4] || pRs[2 * 16])
+ pfIdctResAddPred (pPred + 4 * iStride + 0, iStride, pRs + 2 * 16);
+ if (pNzc[5] || pRs[3 * 16])
+ pfIdctResAddPred (pPred + 4 * iStride + 4, iStride, pRs + 3 * 16);
+}
+
+} // anon ns
+
void InitPredFunc (PWelsDecoderContext pCtx, uint32_t uiCpuFlag) {
pCtx->pGetI16x16LumaPredFunc[I16_PRED_V ] = WelsI16x16LumaPredV_c;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_H ] = WelsI16x16LumaPredH_c;
@@ -896,6 +912,7 @@
pCtx->pGetIChromaPredFunc[C_PRED_DC_128] = WelsIChromaPredDcNA_c;
pCtx->pIdctResAddPredFunc = IdctResAddPred_c;
+ pCtx->pIdctFourResAddPredFunc = IdctFourResAddPred_<IdctResAddPred_c>;
pCtx->pIdctResAddPredFunc8x8 = IdctResAddPred8x8_c;
@@ -902,6 +919,7 @@
#if defined(HAVE_NEON)
if (uiCpuFlag & WELS_CPU_NEON) {
pCtx->pIdctResAddPredFunc = IdctResAddPred_neon;
+ pCtx->pIdctFourResAddPredFunc = IdctFourResAddPred_<IdctResAddPred_neon>;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_neon;
@@ -927,6 +945,7 @@
#if defined(HAVE_NEON_AARCH64)
if (uiCpuFlag & WELS_CPU_NEON) {
pCtx->pIdctResAddPredFunc = IdctResAddPred_AArch64_neon;
+ pCtx->pIdctFourResAddPredFunc = IdctFourResAddPred_<IdctResAddPred_AArch64_neon>;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_AArch64_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_AArch64_neon;
@@ -957,6 +976,7 @@
#if defined(X86_ASM)
if (uiCpuFlag & WELS_CPU_MMXEXT) {
pCtx->pIdctResAddPredFunc = IdctResAddPred_mmx;
+ pCtx->pIdctFourResAddPredFunc = IdctFourResAddPred_<IdctResAddPred_mmx>;
///////mmx code opt---
pCtx->pGetIChromaPredFunc[C_PRED_H] = WelsDecoderIChromaPredH_mmx;
@@ -971,7 +991,9 @@
pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL ] = WelsDecoderI4x4LumaPredVL_mmx;
}
if (uiCpuFlag & WELS_CPU_SSE2) {
- /////////sse2 code opt---
+ pCtx->pIdctResAddPredFunc = IdctResAddPred_sse2;
+ pCtx->pIdctFourResAddPredFunc = IdctFourResAddPred_<IdctResAddPred_sse2>;
+
pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_sse2;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_sse2;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_H] = WelsDecoderI16x16LumaPredH_sse2;
@@ -982,6 +1004,10 @@
pCtx->pGetIChromaPredFunc[C_PRED_DC] = WelsDecoderIChromaPredDc_sse2;
pCtx->pGetIChromaPredFunc[C_PRED_DC_T] = WelsDecoderIChromaPredDcTop_sse2;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_H] = WelsDecoderI4x4LumaPredH_sse2;
+ }
+ if (uiCpuFlag & WELS_CPU_AVX2) {
+ pCtx->pIdctResAddPredFunc = IdctResAddPred_avx2;
+ pCtx->pIdctFourResAddPredFunc = IdctFourResAddPred_avx2;
}
#endif
}
--- a/codec/decoder/core/src/rec_mb.cpp
+++ b/codec/decoder/core/src/rec_mb.cpp
@@ -186,28 +186,22 @@
/*common use by decoder&encoder*/
int32_t iYStride = pDqLayer->iLumaStride;
- int32_t* pBlockOffset = pCtx->iDecBlockOffsetArray;
int16_t* pRS = pScoeffLevel;
uint8_t* pPred = pDqLayer->pPred[0];
- PIdctResAddPredFunc pIdctResAddPredFunc = pCtx->pIdctResAddPredFunc;
+ PIdctFourResAddPredFunc pIdctFourResAddPredFunc = pCtx->pIdctFourResAddPredFunc;
- uint8_t i = 0;
-
/*decode i16x16 y*/
pGetI16x16LumaPredFunc[iI16x16PredMode] (pPred, iYStride);
/*1 mb is divided 16 4x4_block to idct*/
- for (i = 0; i < 16; i++) {
- int16_t* pRSI4x4 = pRS + (i << 4);
- uint8_t* pPredI4x4 = pPred + pBlockOffset[i];
+ const int8_t* pNzc = pDqLayer->pNzc[iMBXY];
+ pIdctFourResAddPredFunc (pPred + 0 * iYStride + 0, iYStride, pRS + 0 * 64, pNzc + 0);
+ pIdctFourResAddPredFunc (pPred + 0 * iYStride + 8, iYStride, pRS + 1 * 64, pNzc + 2);
+ pIdctFourResAddPredFunc (pPred + 8 * iYStride + 0, iYStride, pRS + 2 * 64, pNzc + 8);
+ pIdctFourResAddPredFunc (pPred + 8 * iYStride + 8, iYStride, pRS + 3 * 64, pNzc + 10);
- if (pDqLayer->pNzc[iMBXY][g_kuiMbCountScan4Idx[i]] || pRSI4x4[0]) {
- pIdctResAddPredFunc (pPredI4x4, iYStride, pRSI4x4);
- }
- }
-
/*decode intra mb cb&cr*/
pPred = pDqLayer->pPred[1];
pGetIChromaPredFunc[iChromaPredMode] (pPred, iUVStride);
@@ -541,9 +535,9 @@
int32_t RecChroma (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer) {
int32_t iChromaStride = pCtx->pCurDqLayer->pDec->iLinesize[1];
- PIdctResAddPredFunc pIdctResAddPredFunc = pCtx->pIdctResAddPredFunc;
+ PIdctFourResAddPredFunc pIdctFourResAddPredFunc = pCtx->pIdctFourResAddPredFunc;
- uint8_t i = 0, j = 0;
+ uint8_t i = 0;
uint8_t uiCbpC = pDqLayer->pCbp[iMBXY] >> 4;
if (1 == uiCbpC || 2 == uiCbpC) {
@@ -552,17 +546,10 @@
for (i = 0; i < 2; i++) {
int16_t* pRS = pScoeffLevel + 256 + (i << 6);
uint8_t* pPred = pDqLayer->pPred[i + 1];
- int32_t* pBlockOffset = i == 0 ? &pCtx->iDecBlockOffsetArray[16] : &pCtx->iDecBlockOffsetArray[20];
+ const int8_t* pNzc = pDqLayer->pNzc[iMBXY] + 16 + 2 * i;
/*1 chroma is divided 4 4x4_block to idct*/
- for (j = 0; j < 4; j++) {
- int16_t* pRSI4x4 = &pRS[j << 4];
- uint8_t* pPredI4x4 = pPred + pBlockOffset[j];
-
- if (pDqLayer->pNzc[iMBXY][g_kuiMbCountScan4Idx[16 + (i << 2) + j]] || pRSI4x4[0]) {
- pIdctResAddPredFunc (pPredI4x4, iChromaStride, pRSI4x4);
- }
- }
+ pIdctFourResAddPredFunc (pPred, iChromaStride, pRS, pNzc);
}
}
--- a/codec/decoder/core/x86/dct.asm
+++ b/codec/decoder/core/x86/dct.asm
@@ -42,77 +42,7 @@
%include "asm_inc.asm"
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-%macro MMX_SumSubDiv2 3
- movq %3, %2
- psraw %3, $01
- paddw %3, %1
- psraw %1, $01
- psubw %1, %2
-%endmacro
-
-%macro MMX_SumSub 3
- movq %3, %2
- psubw %2, %1
- paddw %1, %3
-%endmacro
-
-%macro MMX_IDCT 6
- MMX_SumSub %4, %5, %6
- MMX_SumSubDiv2 %3, %2, %1
- MMX_SumSub %1, %4, %6
- MMX_SumSub %3, %5, %6
-%endmacro
-
-
-%macro MMX_StoreDiff4P 5
- movd %2, %5
- punpcklbw %2, %4
- paddw %1, %3
- psraw %1, $06
- paddsw %1, %2
- packuswb %1, %2
- movd %5, %1
-%endmacro
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
SECTION .text
-
-;*******************************************************************************
-; void IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
-;*******************************************************************************
-
-WELS_EXTERN IdctResAddPred_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r1, r1d
- movq mm0, [r2+ 0]
- movq mm1, [r2+ 8]
- movq mm2, [r2+16]
- movq mm3, [r2+24]
-
- MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
- MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
- MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
- MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
-
- WELS_Zero mm7
- WELS_DW32 mm6
-
- MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0]
- MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1]
- lea r0, [r0+2*r1]
- MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0]
- MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1]
-
-
- emms
- ret
;void WelsBlockZero16x16_sse2(int16_t * block, int32_t stride);
WELS_EXTERN WelsBlockZero16x16_sse2
--- a/codec/encoder/core/x86/dct.asm
+++ b/codec/encoder/core/x86/dct.asm
@@ -31,9 +31,6 @@
;*
;* dct.asm
;*
-;* Abstract
-;* WelsDctFourT4_sse2
-;*
;* History
;* 8/4/2009 Created
;*
@@ -42,618 +39,12 @@
%include "asm_inc.asm"
-SECTION .rodata align=32
-
-;***********************************************************************
-; Constant
-;***********************************************************************
-
-align 32
-wels_shufb0312_movzxw_128:
- db 0, 80h, 3, 80h, 1, 80h, 2, 80h, 4, 80h, 7, 80h, 5, 80h, 6, 80h
-wels_shufb2301_128:
- db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
-wels_shufb0231_128:
- db 0, 2, 3, 1, 4, 6, 7, 5, 8, 10, 11, 9, 12, 14, 15, 13
-wels_dw32_128:
- times 8 dw 32
-wels_p1m1p1m1w_256:
- times 8 dw 1, -1
-wels_p1p2m1m2w_256:
- times 4 dw 1, 2, -1, -2
-wels_p1p1m1m1w_256:
- times 4 dw 1, 1, -1, -1
-wels_8xp1w_8xm1w:
- times 8 dw 1
- times 8 dw -1
-wels_4xp1w_4xm1w_256:
- times 4 dw 1
- times 4 dw -1
- times 4 dw 1
- times 4 dw -1
-wels_4xp1w_4xp2w_4xm1w_4xm2w:
- times 4 dw 1
- times 4 dw 2
- times 4 dw -1
- times 4 dw -2
-
-align 16
-wels_p1m1p1m1w_128:
- times 4 dw 1, -1
-wels_p1p2p1p2w_128:
- times 4 dw 1, 2
-wels_p1m1m1p1w_128:
- times 2 dw 1, -1, -1, 1
-wels_p0m8000p0m8000w_128:
- times 4 dw 0, -8000h
-wels_p1p1m1m1w_128:
- times 2 dw 1, 1, -1, -1
-wels_4xp1w_4xp2w:
- times 4 dw 1
- times 4 dw 2
-wels_4xp0w_4xm8000w:
- times 4 dw 0
- times 4 dw -8000h
-
-align 16
-SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
- dw 10, 13, 10, 13, 13, 16, 13, 16,
- dw 11, 14, 11, 14, 14, 18, 14, 18,
- dw 11, 14, 11, 14, 14, 18, 14, 18,
- dw 13, 16, 13, 16, 16, 20, 16, 20,
- dw 13, 16, 13, 16, 16, 20, 16, 20,
- dw 14, 18, 14, 18, 18, 23, 18, 23,
- dw 14, 18, 14, 18, 18, 23, 18, 23,
- dw 16, 20, 16, 20, 20, 25, 20, 25,
- dw 16, 20, 16, 20, 20, 25, 20, 25,
- dw 18, 23, 18, 23, 23, 29, 23, 29,
- dw 18, 23, 18, 23, 23, 29, 23, 29
-
-;***********************************************************************
-; MMX functions
-;***********************************************************************
-
-%macro MMX_LoadDiff4P 5
- movd %1, [%3]
- movd %2, [%4]
- punpcklbw %1, %5
- punpcklbw %2, %5
- psubw %1, %2
-%endmacro
-
-%macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
- MMX_LoadDiff4P %1, %9, %5, %7, %10
- MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
- lea %5, [%5+2*%6]
- lea %7, [%7+2*%8]
- MMX_LoadDiff4P %3, %9, %5, %7, %10
- MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
-%endmacro
-
-%macro MMX_SumSubMul2 3
- movq %3, %1
- psllw %1, $01
- paddw %1, %2
- psllw %2, $01
- psubw %3, %2
-%endmacro
-
-%macro MMX_SumSubDiv2 3
- movq %3, %2
- psraw %3, $01
- paddw %3, %1
- psraw %1, $01
- psubw %1, %2
-%endmacro
-
-%macro MMX_SumSub 3
- movq %3, %2
- psubw %2, %1
- paddw %1, %3
-%endmacro
-
-%macro MMX_DCT 6
- MMX_SumSub %4, %1, %6
- MMX_SumSub %3, %2, %6
- MMX_SumSub %3, %4, %6
- MMX_SumSubMul2 %1, %2, %5
-%endmacro
-
-%macro MMX_IDCT 6
- MMX_SumSub %4, %5, %6
- MMX_SumSubDiv2 %3, %2, %1
- MMX_SumSub %1, %4, %6
- MMX_SumSub %3, %5, %6
-%endmacro
-
-%macro MMX_StoreDiff4P 6
- movd %2, %6
- punpcklbw %2, %4
- paddw %1, %3
- psraw %1, $06
- paddsw %1, %2
- packuswb %1, %2
- movd %5, %1
-%endmacro
SECTION .text
-;***********************************************************************
-; void WelsDctT4_mmx( int16_t *pDct[4], uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
-;***********************************************************************
-WELS_EXTERN WelsDctT4_mmx
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENSION r2, r2d
- SIGN_EXTENSION r4, r4d
- WELS_Zero mm7
- MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7
-
- MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
- MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2
-
- MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
- MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5
-
- movq [r0+ 0], mm2
- movq [r0+ 8], mm1
- movq [r0+16], mm5
- movq [r0+24], mm4
- WELSEMMS
- LOAD_5_PARA_POP
- ret
-
-
;***********************************************************************
-; void WelsIDctT4Rec_mmx(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs)
-;***********************************************************************
-WELS_EXTERN WelsIDctT4Rec_mmx
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movq mm0, [r4+ 0]
- movq mm1, [r4+ 8]
- movq mm2, [r4+16]
- movq mm3, [r4+24]
-
- MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
- MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
- MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
- MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
-
- WELS_Zero mm7
- WELS_DW32 mm6
-
- MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2]
- MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2]
- MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
-
- WELSEMMS
- LOAD_5_PARA_POP
- ret
-
-
-;***********************************************************************
; SSE2 functions
;***********************************************************************
-%macro SSE2_Store4x8p 6
- movlps [%1+0x00], %2
- movhps [%1+0x20], %2
- movlps [%1+0x08], %3
- movhps [%1+0x28], %3
- movlps [%1+0x10], %4
- movhps [%1+0x30], %4
- movlps [%1+0x18], %5
- movhps [%1+0x38], %5
-%endmacro
-%macro SSE2_Load4x8p 6
- MOVDQ %2, [%1+0x00]
- MOVDQ %4, [%1+0x10]
- MOVDQ %6, [%1+0x20]
- MOVDQ %3, [%1+0x30]
- SSE2_XSawp qdq, %4, %3, %5
- SSE2_XSawp qdq, %2, %6, %3
-%endmacro
-
-%macro SSE2_SumSubMul2 3
- movdqa %3, %1
- psllw %1, 1
- paddw %1, %2
- psllw %2, 1
- psubw %3, %2
-%endmacro
-
-%macro SSE2_SumSubDiv2 4
- movdqa %4, %1
- movdqa %3, %2
- psraw %2, $01
- psraw %4, $01
- paddw %1, %2
- psubw %4, %3
-%endmacro
-
-%macro SSE2_StoreDiff16p 9
- paddw %1, %4
- psraw %1, $06
- movq %3, %7
- punpcklbw %3, %5
- paddsw %1, %3
- paddw %2, %4
- psraw %2, $06
- movq %3, %9
- punpcklbw %3, %5
- paddsw %2, %3
- packuswb %1, %2
- movlps %6, %1
- movhps %8, %1
-%endmacro
-
-%macro SSE2_StoreDiff8p 5
- movq %2, %5
- punpcklbw %2, %3
- paddsw %2, %1
- packuswb %2, %2
- movq %4, %2
-%endmacro
-
-%macro SSE2_Load2x4P 2
- MOVDQ %1, [%2]
-%endmacro
-
-%macro SSE2_Store2x4P 2
- MOVDQ [%1], %2
-%endmacro
-
-; out=%1 pPixel1Line1=%2 pPixel1Line2=%3 pPixel2Line1=%4 pPixel2Line2=%5 zero=%6 clobber=%7,%8
-%macro SSE2_LoadDiff2x4P 8
- movd %1, [%2]
- movd %7, [%3]
- punpckldq %1, %7
- punpcklbw %1, %6
- movd %7, [%4]
- movd %8, [%5]
- punpckldq %7, %8
- punpcklbw %7, %6
- psubw %1, %7
-%endmacro
-
-; pRec1=%1 pRec2=%2 data=%3 pPred1=%4 pPred2=%5 dw32=%6 zero=%7 clobber=%8,%9
-%macro SSE2_StoreDiff2x4P 9
- paddw %3, %6
- psraw %3, 6
- movd %8, [%4]
- movd %9, [%5]
- punpckldq %8, %9
- punpcklbw %8, %7
- paddsw %3, %8
- packuswb %3, %3
- movd [%1], %3
- psrlq %3, 32
- movd [%2], %3
-%endmacro
-
-%macro SSE2_Load8DC 6
- movdqa %1, %6 ; %1 = dc0 dc1
- paddw %1, %5
- psraw %1, $06 ; (dc + 32) >> 6
-
- movdqa %2, %1
- psrldq %2, 4
- punpcklwd %2, %2
- punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
-
- movdqa %3, %1
- psrldq %3, 8
- punpcklwd %3, %3
- punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
-
- movdqa %4, %1
- psrldq %4, 12
- punpcklwd %4, %4
- punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
-
- punpcklwd %1, %1
- punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
-%endmacro
-
-%macro SSE2_DCT 6
- SSE2_SumSub %6, %3, %5
- SSE2_SumSub %1, %2, %5
- SSE2_SumSub %3, %2, %5
- SSE2_SumSubMul2 %6, %1, %4
-%endmacro
-
-%macro SSE2_IDCT 7
- SSE2_SumSub %7, %2, %6
- SSE2_SumSubDiv2 %1, %3, %5, %4
- SSE2_SumSub %2, %1, %5
- SSE2_SumSub %7, %4, %5
-%endmacro
-
-; Do 2 horizontal 4-pt DCTs in parallel packed as 8 words in an xmm register.
-; out=%1 in=%1 clobber=%2
-%macro SSE2_DCT_HORIZONTAL 2
- pshuflw %2, %1, 1bh ; [x[3],x[2],x[1],x[0]] low qw
- pmullw %1, [wels_p1m1p1m1w_128] ; [x[0],-x[1],x[2],-x[3], ...]
- pshufhw %2, %2, 1bh ; [x[3],x[2],x[1],x[0]] high qw
- paddw %1, %2 ; s = [x[0]+x[3],-x[1]+x[2],x[2]+x[1],-x[3]+x[0], ...]
- pshufd %2, %1, 0b1h ; [s[2],s[3],s[0],s[1], ...]
- pmullw %1, [wels_p1m1m1p1w_128] ; [s[0],-s[1],-s[2],s[3], ...]
- pmullw %2, [wels_p1p2p1p2w_128] ; [s[2],2*s[3],s[0],2*s[1], ...]]
- paddw %1, %2 ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
-%endmacro
-
-; Do 2 horizontal 4-pt IDCTs in parallel packed as 8 words in an xmm register.
-;
-; Use a multiply by reciprocal to get -x>>1, and x+=-x>>1 to get x>>1, which
-; avoids a cumbersome blend with SSE2 to get a vector with right-shifted odd
-; elements.
-;
-; out=%1 in=%1 wels_p1m1m1p1w_128=%2 clobber=%3,%4
-%macro SSE2_IDCT_HORIZONTAL 4
- movdqa %3, [wels_p0m8000p0m8000w_128]
- pmulhw %3, %1 ; x[0:7] * [0,-8000h,0,-8000h, ...] >> 16
- pshufd %4, %1, 0b1h ; [x[2],x[3],x[0],x[1], ...]
- pmullw %4, %2 ; [x[2],-x[3],-x[0],x[1], ...]
- paddw %1, %3 ; [x[0]+0,x[1]+(-x[1]>>1),x[2]+0,x[3]+(-x[3]>>1), ...]
- paddw %1, %4 ; s = [x[0]+x[2],(x[1]>>1)-x[3],x[2]-x[0],(x[3]>>1)+x[1], ...]
- pshuflw %3, %1, 1bh ; [s[3],s[2],s[1],s[0]] low qw
- pmullw %1, [wels_p1p1m1m1w_128] ; [s[0],s[1],-s[2],-s[3], ...]
- pshufhw %3, %3, 1bh ; [s[3],s[2],s[1],s[0]] high qw
- pmullw %3, %2 ; [s[3],-s[2],-s[1],s[0], ...]
- paddw %1, %3 ; y = [s[0]+s[3],s[1]-s[2],-s[2]-s[1],-s[3]+s[0], ...]
-%endmacro
-
-; Do 4 vertical 4-pt DCTs in parallel packed as 16 words in 2 xmm registers.
-; Uses scrambled input to save a negation.
-; [y0,y1]=%1 [y2,y3]=%2 [x1,x0]=%1 [x2,x3]=%2 clobber=%3
-%macro SSE2_DCT_4x4P 3
- movdqa %3, %1
- psubw %1, %2 ; [x1-x2,x0-x3]
- paddw %2, %3 ; [x1+x2,x0+x3]
- movdqa %3, %2
- punpckhqdq %2, %1 ; s03 = [x0+x3,x0-x3]
- punpcklqdq %3, %1 ; s12 = [x1+x2,x1-x2]
- movdqa %1, %2
- pmullw %1, [wels_4xp1w_4xp2w] ; [s03[0],2*s03[1]]
- paddw %1, %3 ; [y0,y1] = [s03[0]+s12[0],2*s03[1]+s12[1]]
- pmullw %3, [wels_4xp1w_4xp2w] ; [s12[0],2*s12[1]]
- psubw %2, %3 ; [y2,y3] = [s03[0]-s12[0],s03[1]-2*s12[1]]
-%endmacro
-
-; Do 4 vertical 4-pt IDCTs in parallel packed as 16 words in 2 xmm registers.
-; Output is scrambled to save a negation.
-; [y1,y0]=%1 [y2,y3]=%2 [x0,x1]=%1 [x2,x3]=%2 clobber=%3,%4
-%macro SSE2_IDCT_4x4P 4
- movdqa %4, [wels_4xp0w_4xm8000w]
- movdqa %3, %1
- pmulhw %3, %4 ; x[0:1] * [0,-8000h] >> 16
- pmulhw %4, %2 ; x[2:3] * [0,-8000h] >> 16
- paddw %3, %1 ; [x[0],x[1]>>1]
- paddw %4, %2 ; [x[2],x[3]>>1]
- psubw %3, %2 ; [x[0]-x[2],(x[1]>>1)-x[3]]
- paddw %1, %4 ; [x[2]+x[0],(x[3]>>1)+x[1]]
- movdqa %2, %3
- punpckhqdq %3, %1 ; s13 = [(x[1]>>1)-x[3],(x[3]>>1)+x[1]]
- punpcklqdq %2, %1 ; s02 = [x[0]-x[2], x[2]+x[0]]
- movdqa %1, %2
- paddw %1, %3 ; [y1,y0] = [s02[0]+s13[0],s02[1]+s13[1]]
- psubw %2, %3 ; [y2,y3] = [s02[0]-s13[0],s02[1]-s13[1]]
-%endmacro
-
-;***********************************************************************
-; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
-;***********************************************************************
-WELS_EXTERN WelsDctFourT4_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r2, r2d
- SIGN_EXTENSION r4, r4d
- pxor xmm7, xmm7
- ;Load 4x8
- SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3]
- SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
- lea r1, [r1 + 2 * r2]
- lea r3, [r3 + 2 * r4]
- SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
- SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
-
- SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
- SSE2_DCT_HORIZONTAL xmm2, xmm5
- SSE2_DCT_HORIZONTAL xmm0, xmm5
- SSE2_DCT_HORIZONTAL xmm3, xmm5
- SSE2_DCT_HORIZONTAL xmm4, xmm5
-
- SSE2_Store4x8p r0, xmm2, xmm0, xmm3, xmm4, xmm1
-
- lea r1, [r1 + 2 * r2]
- lea r3, [r3 + 2 * r4]
-
- ;Load 4x8
- SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ]
- SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4]
- lea r1, [r1 + 2 * r2]
- lea r3, [r3 + 2 * r4]
- SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
- SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
-
- SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
- SSE2_DCT_HORIZONTAL xmm2, xmm5
- SSE2_DCT_HORIZONTAL xmm0, xmm5
- SSE2_DCT_HORIZONTAL xmm3, xmm5
- SSE2_DCT_HORIZONTAL xmm4, xmm5
-
- SSE2_Store4x8p r0+64, xmm2, xmm0, xmm3, xmm4, xmm1
-
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-
-;***********************************************************************
-; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
-;***********************************************************************
-WELS_EXTERN WelsIDctFourT4Rec_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- ;Load 4x8
- SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
-
- movdqa xmm7, [wels_p1m1m1p1w_128]
- SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6
- SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6
- SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6
- SSE2_IDCT_HORIZONTAL xmm2, xmm7, xmm5, xmm6
- SSE2_IDCT xmm1, xmm4, xmm2, xmm3, xmm5, xmm6, xmm0
-
- WELS_Zero xmm7
- WELS_DW32 xmm6
-
- SSE2_StoreDiff16p xmm1, xmm3, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff16p xmm0, xmm4, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_Load4x8p r4+64, xmm0, xmm1, xmm4, xmm2, xmm5
-
- movdqa xmm7, [wels_p1m1m1p1w_128]
- SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6
- SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6
- SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6
- SSE2_IDCT_HORIZONTAL xmm2, xmm7, xmm5, xmm6
- SSE2_IDCT xmm1, xmm4, xmm2, xmm3, xmm5, xmm6, xmm0
-
- WELS_Zero xmm7
- WELS_DW32 xmm6
-
- SSE2_StoreDiff16p xmm1, xmm3, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff16p xmm0, xmm4, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3]
- POP_XMM
- LOAD_5_PARA_POP
- ; pop esi
- ; pop ebx
- ret
-
-;***********************************************************************
-; void WelsDctT4_sse2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
-;***********************************************************************
-WELS_EXTERN WelsDctT4_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 5
- SIGN_EXTENSION r2, r2d
- SIGN_EXTENSION r4, r4d
-
- WELS_Zero xmm2
- SSE2_LoadDiff2x4P xmm0, r1+r2, r1, r3+r4, r3, xmm2, xmm3, xmm4
- add r1, r2
- add r3, r4
- SSE2_LoadDiff2x4P xmm1, r1+r2, r1+2*r2, r3+r4, r3+2*r4, xmm2, xmm3, xmm4
- SSE2_DCT_HORIZONTAL xmm0, xmm3
- SSE2_DCT_HORIZONTAL xmm1, xmm3
- SSE2_DCT_4x4P xmm0, xmm1, xmm3
- SSE2_Store2x4P r0, xmm0
- SSE2_Store2x4P r0+16, xmm1
-
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-;***********************************************************************
-; void WelsIDctT4Rec_sse2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);
-;***********************************************************************
-WELS_EXTERN WelsIDctT4Rec_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 6
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
-
- SSE2_Load2x4P xmm0, r4
- SSE2_Load2x4P xmm1, r4+16
- movdqa xmm4, [wels_p1m1m1p1w_128]
- SSE2_IDCT_HORIZONTAL xmm0, xmm4, xmm2, xmm3
- SSE2_IDCT_HORIZONTAL xmm1, xmm4, xmm2, xmm3
- SSE2_IDCT_4x4P xmm0, xmm1, xmm2, xmm3
- WELS_Zero xmm4
- WELS_DW32 xmm5
- SSE2_StoreDiff2x4P r0+r1, r0, xmm0, r2+r3, r2, xmm5, xmm4, xmm2, xmm3
- add r0, r1
- add r2, r3
- SSE2_StoreDiff2x4P r0+r1, r0+2*r1, xmm1, r2+r3, r2+2*r3, xmm5, xmm4, xmm2, xmm3
-
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-%macro SSE2_StoreDiff4x8p 8
- SSE2_StoreDiff8p %1, %3, %4, [%5], [%6]
- SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]
- SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8]
- SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
-%endmacro
-
-;***********************************************************************
-; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
-;***********************************************************************
-WELS_EXTERN WelsIDctRecI16x16Dc_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm7, xmm7
- WELS_DW32 xmm6
-
- SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
-
- SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-
-
%macro SSE2_SumSubD 3
movdqa %3, %2
paddd %2, %1
@@ -714,324 +105,4 @@
movdqa [r0+16], xmm2
POP_XMM
- ret
-
-;***********************************************************************
-; AVX2 functions
-;***********************************************************************
-
-; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 wels_shufb0312_movzxw=%6 clobber=%7,%8
-%macro AVX2_LoadDiff16P 8
- vmovq x%1, [%2 ]
- vpbroadcastq y%7, [%2 + 4 * %3]
- vpblendd y%1, y%1, y%7, 11110000b
- vpshufb y%1, y%1, y%6
- vmovq x%7, [%4 ]
- vpbroadcastq y%8, [%4 + 4 * %5]
- vpblendd y%7, y%7, y%8, 11110000b
- vpshufb y%7, y%7, y%6
- vpsubw y%1, y%1, y%7
-%endmacro
-
-; pRec=%1 iStride=%2 data=%3,%4 pPred=%5 iPredStride=%6 dw32=%7 wels_shufb0312_movzxw=%8 clobber=%9,%10
-%macro AVX2_StoreDiff32P 10
- vpaddw y%3, y%3, y%7
- vpsraw y%3, y%3, 6
- vmovq x%9, [%5 ]
- vpbroadcastq y%10, [%5 + 4 * %6]
- add %5, %6
- vpblendd y%9, y%9, y%10, 11110000b
- vpshufb y%9, y%9, y%8
- vpaddsw y%3, y%3, y%9
- vpaddw y%4, y%4, y%7
- vpsraw y%4, y%4, 6
- vmovq x%9, [%5 ]
- vpbroadcastq y%10, [%5 + 4 * %6]
- vpblendd y%9, y%9, y%10, 11110000b
- vpshufb y%9, y%9, y%8
- vpaddsw y%4, y%4, y%9
- vpackuswb y%3, y%3, y%4
- vbroadcasti128 y%4, [wels_shufb0231_128]
- vpshufb y%3, y%3, y%4
- vextracti128 x%4, y%3, 1
- vmovlps [%1 ], x%3
- vmovlps [%1 + 4 * %2], x%4
- add %1, %2
- vmovhps [%1 ], x%3
- vmovhps [%1 + 4 * %2], x%4
-%endmacro
-
-; out=%1,%2,%3,%4 pDct=%5 clobber=%6
-%macro AVX2_Load4x16P 6
- vmovdqa x%2, [%5+0x00]
- vinserti128 y%2, y%2, [%5+0x40], 1
- vmovdqa x%6, [%5+0x20]
- vinserti128 y%6, y%6, [%5+0x60], 1
- vpunpcklqdq y%1, y%2, y%6
- vpunpckhqdq y%2, y%2, y%6
- vmovdqa x%4, [%5+0x10]
- vinserti128 y%4, y%4, [%5+0x50], 1
- vmovdqa x%6, [%5+0x30]
- vinserti128 y%6, y%6, [%5+0x70], 1
- vpunpcklqdq y%3, y%4, y%6
- vpunpckhqdq y%4, y%4, y%6
-%endmacro
-
-; pDct=%1 data=%1,%2,%3,%4 clobber=%5
-%macro AVX2_Store4x16P 6
- vpunpcklqdq y%6, y%2, y%3
- vmovdqa [%1+0x00], x%6
- vextracti128 [%1+0x40], y%6, 1
- vpunpckhqdq y%6, y%2, y%3
- vmovdqa [%1+0x20], x%6
- vextracti128 [%1+0x60], y%6, 1
- vpunpcklqdq y%6, y%4, y%5
- vmovdqa [%1+0x10], x%6
- vextracti128 [%1+0x50], y%6, 1
- vpunpckhqdq y%6, y%4, y%5
- vmovdqa [%1+0x30], x%6
- vextracti128 [%1+0x70], y%6, 1
-%endmacro
-
-%macro AVX2_Load4x4P 2
- vmovdqu y%1, [%2]
-%endmacro
-
-%macro AVX2_Store4x4P 2
- vmovdqu [%1], y%2
-%endmacro
-
-; Load 4 lines of 4 pixels, shuffle and zero extend to 16-bit.
-; out=%1 pPixel=%2 iStride=%3 [wels_shufb0312_movzxw]=%4 clobber=%5,%6
-%macro AVX2_Loadzx4x4P 6
- vmovd x%1, [%2 ]
- add %2, %3
- vpbroadcastd x%5, [%2 + 2 * %3]
- vpblendd x%1, x%1, x%5, 1010b
- vpbroadcastd y%5, [%2 ]
- vpbroadcastd y%6, [%2 + %3]
- vpblendd y%5, y%5, y%6, 10101010b
- vpblendd y%1, y%1, y%5, 11110000b
- vpshufb y%1, y%1, %4
-%endmacro
-
-; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 wels_shufb0312_movzxw=%6 clobber=%7,%8,%9
-%macro AVX2_LoadDiff4x4P 9
- AVX2_Loadzx4x4P %1, %2, %3, y%6, %7, %8
- AVX2_Loadzx4x4P %7, %4, %5, y%6, %8, %9
- vpsubw y%1, y%1, y%7
-%endmacro
-
-; pRec=%1 iStride=%2 data=%3 pPred=%4 iPredStride=%5 dw32=%6 wels_shufb0312_movzxw=%7 clobber=%8,%9,%10
-%macro AVX2_StoreDiff4x4P 10
- vpaddw y%3, y%3, y%6
- vpsraw y%3, y%3, 6
- AVX2_Loadzx4x4P %8, %4, %5, y%7, %9, %10
- vpaddsw y%3, y%3, y%8
- vpackuswb y%3, y%3, y%3
- vbroadcasti128 y%8, [wels_shufb0231_128]
- vpshufb y%3, y%3, y%8
- vextracti128 x%8, y%3, 1
- vmovd [%1 ], x%3
- add %1, %2
- vmovd [%1 ], x%8
- vpsrlq x%8, x%8, 32
- vmovd [%1 + %2], x%8
- vpsrlq x%3, x%3, 32
- vmovd [%1 + 2 * %2], x%3
-%endmacro
-
-; 4-pt DCT
-; out=%1,%2,%3,%4 in=%1,%2,%3,%4 clobber=%5
-%macro AVX2_DCT 5
- vpsubw %5, %1, %4 ; s3 = x0 - x3
- vpaddw %1, %1, %4 ; s0 = x0 + x3
- vpsubw %4, %2, %3 ; s2 = x1 - x2
- vpaddw %2, %2, %3 ; s1 = x1 + x2
- vpsubw %3, %1, %2 ; y2 = s0 - s1
- vpaddw %1, %1, %2 ; y0 = s0 + s1
- vpsllw %2, %5, 1
- vpaddw %2, %2, %4 ; y1 = 2 * s3 + s2
- vpsllw %4, %4, 1
- vpsubw %4, %5, %4 ; y3 = s3 - 2 * s2
-%endmacro
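-; Scalar equivalent per lane (restating the comments above):
-;   s0 = x0 + x3;  s3 = x0 - x3;  s1 = x1 + x2;  s2 = x1 - x2
-;   y0 = s0 + s1;  y2 = s0 - s1;  y1 = 2 * s3 + s2;  y3 = s3 - 2 * s2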
-
-; 4-pt IDCT
-; out=%1,%2,%3,%4 in=%1,%2,%3,%4 clobber=%5
-%macro AVX2_IDCT 5
- vpsraw %5, %2, 1
- vpsubw %5, %5, %4 ; t3 = (x1 >> 1) - x3
- vpsraw %4, %4, 1
- vpaddw %4, %2, %4 ; t2 = x1 + (x3 >> 1)
- vpaddw %2, %1, %3 ; t0 = x0 + x2
- vpsubw %3, %1, %3 ; t1 = x0 - x2
- vpaddw %1, %2, %4 ; y0 = t0 + t2
- vpsubw %4, %2, %4 ; y3 = t0 - t2
- vpaddw %2, %3, %5 ; y1 = t1 + t3
- vpsubw %3, %3, %5 ; y2 = t1 - t3
-%endmacro
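-; Scalar equivalent per lane (restating the comments above):
-;   t0 = x0 + x2;  t1 = x0 - x2;  t2 = x1 + (x3 >> 1);  t3 = (x1 >> 1) - x3
-;   y0 = t0 + t2;  y1 = t1 + t3;  y2 = t1 - t3;  y3 = t0 - t2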
-
-; Do 4 horizontal 4-pt DCTs in parallel packed as 16 words in a ymm register.
-; Uses scrambled input to save a negation.
-; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 wels_shufb2301=%2 clobber=%3
-%macro AVX2_DCT_HORIZONTAL 3
- vpsignw %3, %1, [wels_p1m1p1m1w_256] ; [x0,-x3,x1,-x2]
- vpshufb %1, %1, %2 ; [x3,x0,x2,x1]
- vpaddw %1, %1, %3 ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1]
- vpmullw %3, %1, [wels_p1p2m1m2w_256] ; [s[0],2*s[1],-s[2],-2*s[3], ...]
- vpshufd %1, %1, 0b1h ; [s[2],s[3],s[0],s[1], ...]
- vpaddw %1, %1, %3 ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1], ...]
-%endmacro
-
-; Do 4 horizontal 4-pt IDCTs in parallel packed as 16 words in a ymm register.
-; Output is scrambled to save a negation.
-; [y0,y3,y1,y2]=%1 [x0,x1,x2,x3]=%1 wels_shufb2301=%2 clobber=%3
-%macro AVX2_IDCT_HORIZONTAL 3
- vpsraw %3, %1, 1 ; [x0>>1,x1>>1,x2>>1,x3>>1]
- vpblendw %3, %1, %3, 10101010b ; [x0,x1>>1,x2,x3>>1]
- vpsignw %1, %1, [wels_p1p1m1m1w_256] ; [x0,x1,-x2,-x3]
- vpshufd %3, %3, 0b1h ; [x2,x3>>1,x0,x1>>1]
- vpaddw %1, %3, %1 ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3]
- vpshufb %3, %1, %2 ; [s[1],s[0],s[3],s[2], ...]
- vpsignw %1, %1, [wels_p1m1p1m1w_256] ; [s[0],-s[1],s[2],-s[3], ...]
- vpaddw %1, %1, %3 ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2], ...]
-%endmacro
-
-; Do 4 vertical 4-pt DCTs in parallel packed as 16 words in a ymm register.
-; Uses scrambled input to save a negation.
-; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 clobber=%2
-%macro AVX2_DCT_4x4P 2
- vpsignw %2, %1, [wels_4xp1w_4xm1w_256] ; [x0,-x3,x1,-x2]
- vpshufd %1, %1, 4eh ; [x3,x0,x2,x1]
- vpaddw %1, %1, %2 ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1]
- vpmullw %2, %1, [wels_4xp1w_4xp2w_4xm1w_4xm2w] ; [s[0],2*s[1],-s[2],-2*s[3]]
- vpermq %1, %1, 4eh ; [s[2],s[3],s[0],s[1]]
- vpaddw %1, %1, %2 ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1]]
-%endmacro
-
-; Do 4 vertical 4-pt IDCTs in parallel packed as 16 words in a ymm register.
-; Output is scrambled to save a negation.
-; [y0,y3,y1,y2]=%1 [x0,x1,x2,x3]=%1 clobber=%2
-%macro AVX2_IDCT_4x4P 2
- vpsraw %2, %1, 1 ; [x0>>1,x1>>1,x2>>1,x3>>1]
- vpblendw %2, %1, %2, 11110000b ; [x0,x1>>1,x2,x3>>1]
- vpsignw %1, %1, [wels_8xp1w_8xm1w] ; [x0,x1,-x2,-x3]
- vpermq %2, %2, 4eh ; [x2,x3>>1,x0,x1>>1]
- vpaddw %1, %2, %1 ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3]
- vpshufd %2, %1, 4eh ; [s[1],s[0],s[3],s[2]]
- vpmullw %1, %1, [wels_4xp1w_4xm1w_256] ; [s[0],-s[1],s[2],-s[3], ...]
- vpaddw %1, %1, %2 ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2]]
-%endmacro
-
-;***********************************************************************
-; void WelsDctFourT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
-;***********************************************************************
-WELS_EXTERN WelsDctFourT4_avx2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 7
- SIGN_EXTENSION r2, r2d
- SIGN_EXTENSION r4, r4d
-
- vbroadcasti128 ymm6, [wels_shufb0312_movzxw_128]
-
- ;Load 4x16
- AVX2_LoadDiff16P mm0, r1, r2, r3, r4, mm6, mm4, mm5
- add r1, r2
- add r3, r4
- AVX2_LoadDiff16P mm1, r1, r2, r3, r4, mm6, mm4, mm5
- add r1, r2
- add r3, r4
- AVX2_LoadDiff16P mm2, r1, r2, r3, r4, mm6, mm4, mm5
- add r1, r2
- add r3, r4
- AVX2_LoadDiff16P mm3, r1, r2, r3, r4, mm6, mm4, mm5
-
- AVX2_DCT ymm0, ymm1, ymm2, ymm3, ymm5
- vbroadcasti128 ymm6, [wels_shufb2301_128]
- AVX2_DCT_HORIZONTAL ymm0, ymm6, ymm5
- AVX2_DCT_HORIZONTAL ymm1, ymm6, ymm5
- AVX2_DCT_HORIZONTAL ymm2, ymm6, ymm5
- AVX2_DCT_HORIZONTAL ymm3, ymm6, ymm5
-
- AVX2_Store4x16P r0, mm0, mm1, mm2, mm3, mm5
- vzeroupper
-
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-;***********************************************************************
-; void WelsIDctFourT4Rec_avx2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);
-;***********************************************************************
-WELS_EXTERN WelsIDctFourT4Rec_avx2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
-
- AVX2_Load4x16P mm0, mm1, mm2, mm3, r4, mm5
- vbroadcasti128 ymm6, [wels_shufb2301_128]
- AVX2_IDCT_HORIZONTAL ymm0, ymm6, ymm5
- AVX2_IDCT_HORIZONTAL ymm1, ymm6, ymm5
- AVX2_IDCT_HORIZONTAL ymm2, ymm6, ymm5
- AVX2_IDCT_HORIZONTAL ymm3, ymm6, ymm5
- AVX2_IDCT ymm0, ymm1, ymm2, ymm3, ymm5
-
- vbroadcasti128 ymm6, [wels_shufb0312_movzxw_128]
- vbroadcasti128 ymm7, [wels_dw32_128]
- AVX2_StoreDiff32P r0, r1, mm0, mm1, r2, r3, mm7, mm6, mm5, mm4
- add r2, r3
- add r0, r1
- AVX2_StoreDiff32P r0, r1, mm2, mm3, r2, r3, mm7, mm6, mm5, mm4
- vzeroupper
-
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-;***********************************************************************
-; void WelsDctT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
-;***********************************************************************
-WELS_EXTERN WelsDctT4_avx2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 5
- SIGN_EXTENSION r2, r2d
- SIGN_EXTENSION r4, r4d
-
- vbroadcasti128 ymm1, [wels_shufb0312_movzxw_128]
- AVX2_LoadDiff4x4P mm0, r1, r2, r3, r4, mm1, mm2, mm3, mm4
- AVX2_DCT_4x4P ymm0, ymm2
- vbroadcasti128 ymm1, [wels_shufb2301_128]
- AVX2_DCT_HORIZONTAL ymm0, ymm1, ymm2
- AVX2_Store4x4P r0, mm0
- vzeroupper
-
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-;***********************************************************************
-; void WelsIDctT4Rec_avx2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);
-;***********************************************************************
-WELS_EXTERN WelsIDctT4Rec_avx2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 6
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
-
- AVX2_Load4x4P mm0, r4
- vbroadcasti128 ymm4, [wels_shufb2301_128]
- AVX2_IDCT_HORIZONTAL ymm0, ymm4, ymm1
- AVX2_IDCT_4x4P ymm0, ymm1
- vbroadcasti128 ymm4, [wels_shufb0312_movzxw_128]
- vbroadcasti128 ymm5, [wels_dw32_128]
- AVX2_StoreDiff4x4P r0, r1, mm0, r2, r3, mm5, mm4, mm1, mm2, mm3
- vzeroupper
-
- POP_XMM
- LOAD_5_PARA_POP
ret
--- a/test/decoder/DecUT_IdctResAddPred.cpp
+++ b/test/decoder/DecUT_IdctResAddPred.cpp
@@ -4,6 +4,9 @@
#include "deblocking.h"
#include "cpu.h"
using namespace WelsDec;
+
+namespace {
+
void IdctResAddPred_ref (uint8_t* pPred, const int32_t kiStride, int16_t* pRs) {
int16_t iSrc[16];
@@ -49,6 +52,17 @@
}
}
+#if defined(X86_ASM)
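+// Reference for the four-block routines: apply the single-block reference IDCT
+// to each 4x4 quadrant of an 8x8 area; pRs holds four consecutive blocks of 16
+// coefficients.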
+void IdctFourResAddPred_ref (uint8_t* pPred, int32_t iStride, int16_t* pRs) {
+ IdctResAddPred_ref (pPred + 0 * iStride + 0, iStride, pRs + 0 * 16);
+ IdctResAddPred_ref (pPred + 0 * iStride + 4, iStride, pRs + 1 * 16);
+ IdctResAddPred_ref (pPred + 4 * iStride + 0, iStride, pRs + 2 * 16);
+ IdctResAddPred_ref (pPred + 4 * iStride + 4, iStride, pRs + 3 * 16);
+}
+#endif
+
+} // anonymous namespace
+
#define GENERATE_IDCTRESADDPRED(pred, flag) \
TEST(DecoderDecodeMbAux, pred) {\
const int32_t kiStride = 32;\
@@ -55,8 +69,8 @@
const int iBits = 12;\
const int iMask = (1 << iBits) - 1;\
const int iOffset = 1 << (iBits - 1);\
- int16_t iRS[16];\
- uint8_t uiPred[16*kiStride];\
+ ENFORCE_STACK_ALIGN_1D (int16_t, iRS, 16, 16);\
+ ENFORCE_STACK_ALIGN_1D (uint8_t, uiPred, 16 * kiStride, 16);\
int16_t iRefRS[16];\
uint8_t uiRefPred[16*kiStride];\
int32_t iRunTimes = 1000;\
@@ -84,9 +98,48 @@
}\
}
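+// Generates a test that fills four 4x4 residual blocks and an 8x8 prediction
+// area with random data, builds the per-block non-zero coefficient counts, runs
+// the four-block IDCT-and-add routine under test, and checks the reconstruction
+// against four applications of the scalar reference.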
+#define GENERATE_IDCTFOURRESADDPRED(pred, flag) \
+TEST(DecoderDecodeMbAux, pred) {\
+ const int32_t kiStride = 32;\
+ const int iBits = 12;\
+ const int iMask = (1 << iBits) - 1;\
+ const int iOffset = 1 << (iBits - 1);\
+ ENFORCE_STACK_ALIGN_1D (int16_t, iRS, 4 * 16, 16);\
+ ENFORCE_STACK_ALIGN_1D (uint8_t, uiPred, 4 * 16 * kiStride, 16);\
+ int16_t iRefRS[4 * 16];\
+ uint8_t uiRefPred[4 * 16 * kiStride];\
+ int8_t iNzc[6] = { 0 };\
+ int32_t iRunTimes = 1000;\
+ uint32_t uiCPUFlags = WelsCPUFeatureDetect(0); \
+ if ((uiCPUFlags & flag) == 0 && flag != 0) \
+ return; \
+ while (iRunTimes--) {\
+ for (int i = 0; i < 4; i++)\
+ for (int j = 0; j < 16; j++)\
+ iNzc[i / 2 * 4 + i % 2] += !!(iRefRS[16 * i + j] = iRS[16 * i + j] = (rand() & iMask) - iOffset);\
+ for (int i = 0; i < 8; i++)\
+ for (int j = 0; j < 8; j++)\
+ uiRefPred[i * kiStride + j] = uiPred[i * kiStride + j] = rand() & 255;\
+ pred (uiPred, kiStride, iRS, iNzc);\
+ IdctFourResAddPred_ref (uiRefPred, kiStride, iRefRS);\
+ bool ok = true;\
+ for (int i = 0; i < 8; i++)\
+ for (int j = 0; j < 8; j++)\
+ if (uiRefPred[i * kiStride + j] != uiPred[i * kiStride + j]) {\
+ ok = false;\
+ goto next;\
+ }\
+ next:\
+ EXPECT_EQ(ok, true);\
+ }\
+}
+
GENERATE_IDCTRESADDPRED (IdctResAddPred_c, 0)
#if defined(X86_ASM)
GENERATE_IDCTRESADDPRED (IdctResAddPred_mmx, WELS_CPU_MMXEXT)
+GENERATE_IDCTRESADDPRED (IdctResAddPred_sse2, WELS_CPU_SSE2)
+GENERATE_IDCTRESADDPRED (IdctResAddPred_avx2, WELS_CPU_AVX2)
+GENERATE_IDCTFOURRESADDPRED (IdctFourResAddPred_avx2, WELS_CPU_AVX2)
#endif
#if defined(HAVE_NEON)