shithub: openh264

Download patch

ref: 991e344d8c62fc6523d150488b53dea5cdb6456c
parent: 3088d96978667117f07891f9e5749c218de7f2d2
author: Sindre Aamås <[email protected]>
date: Mon Jan 18 15:49:17 EST 2016

[Encoder] SSE2 4x4 DCT optimizations

Use a combination of instruction types that distributes more
evenly across execution ports on common architectures.

Do the horizontal DCT without transposing back and forth.

Minor tweaks.

~1.54x faster on Haswell. Should be faster on other architectures
as well.

--- a/codec/encoder/core/x86/dct.asm
+++ b/codec/encoder/core/x86/dct.asm
@@ -61,6 +61,14 @@
     times 4 dw 1, 1, -1, -1
 
 align 16
+wels_p1m1p1m1w_128:
+    times 4 dw 1, -1
+wels_p1p2p1p2w_128:
+    times 4 dw 1, 2
+wels_p1m1m1p1w_128:
+    times 2 dw 1, -1, -1, 1
+
+align 16
 SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16,
             dw  10, 13, 10, 13, 13, 16, 13, 16,
             dw  11, 14, 11, 14, 14, 18, 14, 18,
@@ -205,12 +213,14 @@
 ; SSE2 functions
 ;***********************************************************************
 %macro SSE2_Store4x8p 6
-    SSE2_XSawp qdq, %2, %3, %6
-    SSE2_XSawp qdq, %4, %5, %3
-    MOVDQ    [%1+0x00], %2
-    MOVDQ    [%1+0x10], %4
-    MOVDQ    [%1+0x20], %6
-    MOVDQ    [%1+0x30], %3
+    movlps   [%1+0x00], %2
+    movhps   [%1+0x20], %2
+    movlps   [%1+0x08], %3
+    movhps   [%1+0x28], %3
+    movlps   [%1+0x10], %4
+    movhps   [%1+0x30], %4
+    movlps   [%1+0x18], %5
+    movhps   [%1+0x38], %5
 %endmacro
 
 %macro SSE2_Load4x8p 6
@@ -224,10 +234,10 @@
 
 %macro SSE2_SumSubMul2 3
     movdqa  %3, %1
-    paddw   %1, %1
+    psllw   %1, 1
     paddw   %1, %2
+    psllw   %2, 1
     psubw   %3, %2
-    psubw   %3, %2
 %endmacro
 
 %macro SSE2_SumSubDiv2 4
@@ -295,6 +305,19 @@
     SSE2_SumSub      %7, %4, %5
 %endmacro
 
+; Do 2 horizontal 4-pt DCTs in parallel packed as 8 words in an xmm register.
+; out=%1 in=%1 clobber=%2
+%macro SSE2_DCT_HORIZONTAL 2
+    pshuflw       %2, %1, 1bh               ; [x[3],x[2],x[1],x[0]] low qw
+    pmullw        %1, [wels_p1m1p1m1w_128]  ; [x[0],-x[1],x[2],-x[3], ...]
+    pshufhw       %2, %2, 1bh               ; [x[3],x[2],x[1],x[0]] high qw
+    paddw         %1, %2                    ; s = [x[0]+x[3],-x[1]+x[2],x[2]+x[1],-x[3]+x[0], ...]
+    pshufd        %2, %1, 0b1h              ; [s[2],s[3],s[0],s[1], ...]
+    pmullw        %1, [wels_p1m1m1p1w_128]  ; [s[0],-s[1],-s[2],s[3], ...]
+    pmullw        %2, [wels_p1p2p1p2w_128]  ; [s[2],2*s[3],s[0],2*s[1], ...]]
+    paddw         %1, %2                    ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
+%endmacro
+
 ;***********************************************************************
 ; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
 ;***********************************************************************
@@ -314,11 +337,12 @@
     SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
 
     SSE2_DCT            xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
-    SSE2_TransTwo4x4W   xmm2, xmm0, xmm3, xmm4, xmm1
-    SSE2_DCT            xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
-    SSE2_TransTwo4x4W   xmm4, xmm2, xmm1, xmm3, xmm0
+    SSE2_DCT_HORIZONTAL xmm2, xmm5
+    SSE2_DCT_HORIZONTAL xmm0, xmm5
+    SSE2_DCT_HORIZONTAL xmm3, xmm5
+    SSE2_DCT_HORIZONTAL xmm4, xmm5
 
-    SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
+    SSE2_Store4x8p r0, xmm2, xmm0, xmm3, xmm4, xmm1
 
     lea     r1, [r1 + 2 * r2]
     lea     r3, [r3 + 2 * r4]
@@ -332,12 +356,12 @@
     SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
 
     SSE2_DCT            xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
-    SSE2_TransTwo4x4W   xmm2, xmm0, xmm3, xmm4, xmm1
-    SSE2_DCT            xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
-    SSE2_TransTwo4x4W   xmm4, xmm2, xmm1, xmm3, xmm0
+    SSE2_DCT_HORIZONTAL xmm2, xmm5
+    SSE2_DCT_HORIZONTAL xmm0, xmm5
+    SSE2_DCT_HORIZONTAL xmm3, xmm5
+    SSE2_DCT_HORIZONTAL xmm4, xmm5
 
-    lea     r0, [r0+64]
-    SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
+    SSE2_Store4x8p r0+64, xmm2, xmm0, xmm3, xmm4, xmm1
 
     POP_XMM
     LOAD_5_PARA_POP