shithub: openh264

Download patch

ref: 3088d96978667117f07891f9e5749c218de7f2d2
parent: b267163f103132497b17ae3fee5249e362c75b2e
author: Sindre Aamås <[email protected]>
date: Mon Jan 18 15:46:20 EST 2016

[Encoder] Add an AVX2 4x4 IDCT implementation

~2.03x faster on Haswell as compared to the SSE2 version.

--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -605,8 +605,8 @@
     packuswb %1,%1
 %endmacro
 
-
-
-
-
-
+%macro WELS_DW32_VEX 1
+    vpcmpeqw %1, %1, %1    ; a register always equals itself: every word becomes 0xFFFF
+    vpsrlw   %1, %1, 15    ; logical shift leaves only the top bit: every word becomes 1
+    vpsllw   %1, %1,  5    ; 1 << 5: every word becomes 32
+%endmacro
--- a/codec/encoder/core/inc/decode_mb_aux.h
+++ b/codec/encoder/core/inc/decode_mb_aux.h
@@ -68,6 +68,7 @@
 void WelsIDctFourT4Rec_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
 void WelsIDctRecI16x16Dc_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride,
                                int16_t* pDctDc);
+void WelsIDctFourT4Rec_avx2 (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
 #endif//X86_ASM
 
 #ifdef HAVE_NEON
--- a/codec/encoder/core/src/decode_mb_aux.cpp
+++ b/codec/encoder/core/src/decode_mb_aux.cpp
@@ -269,6 +269,9 @@
     pFuncList->pfIDctFourT4     = WelsIDctFourT4Rec_sse2;
     pFuncList->pfIDctI16x16Dc   = WelsIDctRecI16x16Dc_sse2;
   }
+  if (uiCpuFlag & WELS_CPU_AVX2) {
+    pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_avx2;
+  }
 #endif//X86_ASM
 
 #if defined(HAVE_NEON)
--- a/codec/encoder/core/x86/dct.asm
+++ b/codec/encoder/core/x86/dct.asm
@@ -57,6 +57,8 @@
     times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
 wels_p1m1m1p1w_256:
     times 4 dw 1, -1, -1, 1
+wels_p1p1m1m1w_256:
+    times 4 dw 1, 1, -1, -1    ; 16 words (32 bytes): the sign pattern +1,+1,-1,-1 repeated four times
 
 align 16
 SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16,
@@ -529,6 +531,48 @@
     vpsubw        y%1, y%1, y%7
 %endmacro
 
+; pRec=%1 iStride=%2 data=%3,%4 pPred=%5 iPredStride=%6 dw32=%7 zero=%8 clobber=%9,%10; data regs are overwritten, pRec and pPred each end one stride further on
+%macro AVX2_StoreDiff32P 10
+    vpaddw        y%3, y%3, y%7                 ; first data register: add the dw32 rounding term
+    vpsraw        y%3, y%3, 6                   ; arithmetic shift right by 6
+    vmovq         x%9,  [%5         ]           ; low lane: 8 prediction bytes of the current row
+    vpbroadcastq  y%10, [%5 + 4 * %6]           ; 8 prediction bytes of the row 4 strides below, copied to every qword
+    add           %5, %6                        ; pPred -> next row
+    vpblendd      y%9, y%9, y%10, 11110000b     ; keep the low lane, take the high lane from the broadcast
+    vpunpcklbw    y%9, y%9, y%8                 ; interleave with the zero register: bytes -> words in each lane
+    vpaddsw       y%3, y%3, y%9                 ; residual + prediction (signed saturating add)
+    vpaddw        y%4, y%4, y%7                 ; second data register: same steps, one row further down
+    vpsraw        y%4, y%4, 6
+    vmovq         x%9,  [%5         ]           ; low lane: prediction bytes of the row pPred now points at
+    vpbroadcastq  y%10, [%5 + 4 * %6]           ; and of the row 4 strides below it
+    vpblendd      y%9, y%9, y%10, 11110000b
+    vpunpcklbw    y%9, y%9, y%8
+    vpaddsw       y%4, y%4, y%9
+    vpackuswb     y%3, y%3, y%4                 ; words -> bytes with unsigned saturation; per lane: low 8 bytes from the first register, high 8 from the second
+    vextracti128  x%4, y%3, 1                   ; second data register now holds the high lane (the rows reached via + 4 strides)
+    vmovlps       [%1         ], x%3            ; low lane, low 8 bytes  -> current pRec row
+    vmovlps       [%1 + 4 * %2], x%4            ; high lane, low 8 bytes -> row 4 strides below
+    add           %1, %2                        ; pRec -> next row
+    vmovhps       [%1         ], x%3            ; low lane, high 8 bytes  -> that next row
+    vmovhps       [%1 + 4 * %2], x%4            ; high lane, high 8 bytes -> row 4 strides below it
+%endmacro
+
+; out=%1,%2,%3,%4 pDct=%5 clobber=%6; reads the 128 bytes at pDct+0x00..0x7f, pDct must be 16-byte aligned (vmovdqa)
+%macro AVX2_Load4x16P 6
+    vmovdqa       x%2, [%5+0x00]                ; low lane  = bytes 0x00..0x0f
+    vinserti128   y%2, [%5+0x40], 1             ; high lane = bytes 0x40..0x4f
+    vmovdqa       x%6, [%5+0x20]                ; low lane  = bytes 0x20..0x2f
+    vinserti128   y%6, [%5+0x60], 1             ; high lane = bytes 0x60..0x6f
+    vpunpcklqdq   y%1, y%2, y%6                 ; first output:  low qwords of both loads, per lane
+    vpunpckhqdq   y%2, y%2, y%6                 ; second output: high qwords of both loads, per lane
+    vmovdqa       x%4, [%5+0x10]                ; same pattern again, shifted by 0x10 bytes
+    vinserti128   y%4, [%5+0x50], 1
+    vmovdqa       x%6, [%5+0x30]
+    vinserti128   y%6, [%5+0x70], 1
+    vpunpcklqdq   y%3, y%4, y%6                 ; third output:  low qwords, per lane
+    vpunpckhqdq   y%4, y%4, y%6                 ; fourth output: high qwords, per lane
+%endmacro
+
 ; pDct=%1 data=%1,%2,%3,%4 clobber=%5
 %macro AVX2_Store4x16P 6
     vpunpcklqdq   y%6, y%2,  y%3
@@ -560,6 +604,21 @@
     vpsubw        %4, %5, %4  ; y3 = s3 - 2 * s2
 %endmacro
 
+; 4-pt IDCT across four registers of packed words: word i of each output is computed from word i of the four inputs
+; out=%1,%2,%3,%4 in=%1,%2,%3,%4 (x0..x3 in the comments below) clobber=%5
+%macro AVX2_IDCT 5
+    vpsraw        %5, %2, 1   ; x1 >> 1
+    vpsubw        %5, %5, %4  ; t3 = (x1 >> 1) - x3
+    vpsraw        %4, %4, 1   ; x3 >> 1 (x3 itself is no longer needed)
+    vpaddw        %4, %2, %4  ; t2 = x1 + (x3 >> 1)
+    vpaddw        %2, %1, %3  ; t0 = x0 + x2
+    vpsubw        %3, %1, %3  ; t1 = x0 - x2
+    vpaddw        %1, %2, %4  ; y0 = t0 + t2
+    vpsubw        %4, %2, %4  ; y3 = t0 - t2
+    vpaddw        %2, %3, %5  ; y1 = t1 + t3
+    vpsubw        %3, %3, %5  ; y2 = t1 - t3
+%endmacro
+
 ; Do 4 horizontal 4-pt DCTs in parallel packed as 16 words in a ymm register.
 ; out=%1 in=%1 wels_rev64w_256=%2 clobber=%3
 %macro AVX2_DCT_HORIZONTAL 3
@@ -572,6 +631,20 @@
     vpaddw        %1, %1, %3                    ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
 %endmacro
 
+; Do 4 horizontal 4-pt IDCTs in parallel packed as 16 words in a ymm register (each group of 4 consecutive words is one transform).
+; out=%1 in=%1 wels_rev64w_256=%2 clobber=%3; %2 is the register used as the vpshufb mask, the sign patterns are read from memory
+%macro AVX2_IDCT_HORIZONTAL 3
+    vpsraw        %3, %1, 1                     ; [x[0]>>1,x[1]>>1,x[2]>>1,x[3]>>1, ...]
+    vpblendw      %3, %1, %3, 10101010b         ; [x[0],x[1]>>1,x[2],x[3]>>1, ...]
+    vpshufd       %1, %1, 0b1h                  ; [x[2],x[3],x[0],x[1], ...]
+    vpsignw       %1, %1, [wels_p1m1m1p1w_256]  ; [x[2],-x[3],-x[0],x[1], ...]
+    vpaddw        %1, %3, %1                    ; s = [x[0]+x[2],(x[1]>>1)-x[3],x[2]-x[0],(x[3]>>1)+x[1], ...]
+    vpshufb       %3, %1, %2                    ; [s[3],s[2],s[1],s[0], ...]
+    vpmullw       %1, %1, [wels_p1p1m1m1w_256]  ; [s[0],s[1],-s[2],-s[3], ...]
+    vpmullw       %3, %3, [wels_p1m1m1p1w_256]  ; [s[3],-s[2],-s[1],s[0], ...]
+    vpaddw        %1, %1, %3                    ; y = [s[0]+s[3],s[1]-s[2],-s[2]-s[1],-s[3]+s[0], ...]
+%endmacro
+
 ;***********************************************************************
 ; void WelsDctFourT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
 ;***********************************************************************
@@ -604,6 +677,36 @@
     AVX2_DCT_HORIZONTAL ymm3, ymm6, ymm5
 
     AVX2_Store4x16P r0, mm0, mm1, mm2, mm3, mm5
+    vzeroupper
+
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
+
+;***********************************************************************
+; void WelsIDctFourT4Rec_avx2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct);
+;***********************************************************************
+WELS_EXTERN WelsIDctFourT4Rec_avx2
+    %assign push_num 0
+    LOAD_5_PARA                                 ; NOTE(review): presumably maps the five C arguments onto r0..r4 (macro defined elsewhere) - confirm
+    PUSH_XMM 8                                  ; NOTE(review): presumably preserves callee-saved vector registers where the ABI requires it - confirm
+    SIGN_EXTENSION r1, r1d                      ; iStride is declared int32_t; presumably widened here for use in address arithmetic
+    SIGN_EXTENSION r3, r3d                      ; same for iPredStride
+
+    AVX2_Load4x16P mm0, mm1, mm2, mm3, r4, mm5  ; ymm0..ymm3 = the 128 bytes of coefficients at pDct, ymm5 is scratch
+    vmovdqa ymm6, [wels_rev64w_256]             ; shuffle mask consumed by AVX2_IDCT_HORIZONTAL
+    AVX2_IDCT_HORIZONTAL ymm0, ymm6, ymm5       ; 4-pt IDCT within each group of 4 words, one register at a time
+    AVX2_IDCT_HORIZONTAL ymm1, ymm6, ymm5
+    AVX2_IDCT_HORIZONTAL ymm2, ymm6, ymm5
+    AVX2_IDCT_HORIZONTAL ymm3, ymm6, ymm5
+    AVX2_IDCT ymm0, ymm1, ymm2, ymm3, ymm5      ; 4-pt IDCT across the four registers
+
+    vpxor ymm6, ymm6, ymm6                      ; ymm6 is reused as the zero register for byte -> word unpacking
+    WELS_DW32_VEX ymm7                          ; ymm7 = 32 in every word, the rounding term added before the >> 6
+    AVX2_StoreDiff32P r0, r1, mm0, mm1, r2, r3, mm7, mm6, mm5, mm4 ; rows 0, 1, 4 and 5; leaves r0 and r2 one row further on
+    add r2, r3                                  ; pPred -> row 2
+    add r0, r1                                  ; pRec  -> row 2
+    AVX2_StoreDiff32P r0, r1, mm2, mm3, r2, r3, mm7, mm6, mm5, mm4 ; rows 2, 3, 6 and 7
     vzeroupper
 
     POP_XMM
--- a/test/encoder/EncUT_DecodeMbAux.cpp
+++ b/test/encoder/EncUT_DecodeMbAux.cpp
@@ -354,6 +354,15 @@
 TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_sse2) {
   TestIDctFourT4Rec<int16_t> (WelsIDctFourT4Rec_sse2);
 }
+TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_avx2) {
+  // Only exercise the AVX2 routine when the CPU reports AVX2 support; calling
+  // it unconditionally would execute unsupported instructions on older hardware.
+  int32_t iCpuCores = 0;
+  uint32_t uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
+  if (uiCpuFeatureFlag & WELS_CPU_AVX2) {
+    TestIDctFourT4Rec<int16_t> (WelsIDctFourT4Rec_avx2);
+  }
+}
 TEST (DecodeMbAuxTest, WelsIDctRecI16x16Dc_sse2) {
   int32_t iCpuCores = 0;
   uint32_t uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);