shithub: openh264

Download patch

ref: 80721234bef52905fe23d7e4acc87bae29f0639c
parent: a5b53a690fc9d10a3a454ef6f5ea6b9826874ec6
author: Sindre Aamås <[email protected]>
date: Wed Jul 20 05:28:30 EDT 2016

[Common/x86] Tweak McCopyWidthEq8_mmx

~2x speedup on Haswell.

--- a/codec/common/inc/mc.h
+++ b/codec/common/inc/mc.h
@@ -252,8 +252,6 @@
                              int32_t iHeight);
 void McChromaWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                            const uint8_t* kpABCD, int32_t iHeight);
-void McCopyWidthEq8_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                         int32_t iHeight);
 void PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
                            const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
 void PixelAvgWidthEq8_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
@@ -264,6 +262,8 @@
 //***************************************************************************//
 void McChromaWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                             const uint8_t* kpABCD, int32_t iHeight);
+void McCopyWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                          int32_t iHeight);
 void McCopyWidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                            int32_t iHeight);
 void McHorVer20WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
--- a/codec/common/src/mc.cpp
+++ b/codec/common/src/mc.cpp
@@ -440,7 +440,7 @@
   if (iWidth == 16)
     McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else if (iWidth == 8)
-    McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McCopyWidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else if (iWidth == 4)
     McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else
--- a/codec/common/x86/mb_copy.asm
+++ b/codec/common/x86/mb_copy.asm
@@ -44,6 +44,10 @@
 ;*********************************************************************************************/
 %include "asm_inc.asm"
 
+%ifdef __NASM_VER__
+    %use smartalign
+%endif
+
 ;***********************************************************************
 ; Macros and other preprocessor constants
 ;***********************************************************************
@@ -502,12 +506,37 @@
     LOAD_7_PARA_POP
     ret
 
+; Copy cnt rows, four rows per loop pass. load_instr=%1 store_instr=%2 p_dst=%3 i_dststride=%4 p_src=%5 i_srcstride=%6 cnt=%7 r_tmp=%8,%9 mm_tmp=%10,%11. Modifies p_dst, p_src, cnt, both r_tmp, both mm_tmp and the flags.
+%macro CopyStrided4N 11
+    lea             %8, [3 * %6]           ; first r_tmp = 3 * source stride (offset of row 3)
+    lea             %9, [3 * %4]           ; second r_tmp = 3 * destination stride
+ALIGN 32
+%%loop:
+    %1              %10, [%5]              ; load source rows 0 and 1
+    %1              %11, [%5 + %6]
+    %2              [%3], %10              ; store them to destination rows 0 and 1
+    %2              [%3 + %4], %11
+    %1              %10, [%5 + 2 * %6]     ; load source rows 2 and 3
+    %1              %11, [%5 + %8]
+    %2              [%3 + 2 * %4], %10     ; store them to destination rows 2 and 3
+    %2              [%3 + %9], %11
+    lea             %5, [%5 + 4 * %6]      ; advance source pointer by four rows
+    lea             %3, [%3 + 4 * %4]      ; advance destination pointer by four rows
+    sub             %7, 4                  ; four rows done
+    jg              %%loop                 ; repeat while count > 0 (signed). NOTE(review): body always runs once and copies 4 rows per pass, so count is presumably a positive multiple of 4 - confirm against callers
+%endmacro
+
 ;*******************************************************************************
-;   void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
-;                           uint8_t *pDst, int iDstStride, int iHeight )
+;   void McCopyWidthEq8_sse2( uint8_t *pSrc, int iSrcStride,
+;                             uint8_t *pDst, int iDstStride, int iHeight )
 ;*******************************************************************************
-WELS_EXTERN McCopyWidthEq8_mmx
+WELS_EXTERN McCopyWidthEq8_sse2
     %assign  push_num 0
+%ifdef X86_32
+    push            r5                     ; r5 and r6 are the two scratch registers handed to CopyStrided4N; saved on 32-bit builds only
+    push            r6                     ; two pushes so far, hence push_num 2 below (presumably used by LOAD_5_PARA to locate stack arguments - confirm in asm_inc.asm)
+    %assign  push_num 2
+%endif
     LOAD_5_PARA
 
     SIGN_EXTENSION  r1, r1d
@@ -514,17 +543,13 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
 
-ALIGN 4
-.height_loop:
-    movq mm0, [r0]
-    movq [r2], mm0
-    add r0, r1
-    add r2, r3
-    dec r4
-    jnz .height_loop
+    CopyStrided4N   movsd, movsd, r2, r3, r0, r1, r4, r5, r6, xmm0, xmm1 ; movsd load/store; dst = r2 with stride r3, src = r0 with stride r1, row count = r4, scratch = r5, r6, xmm0, xmm1
 
-    WELSEMMS
     LOAD_5_PARA_POP
+%ifdef X86_32
+    pop             r6                     ; restore in reverse order of the pushes at function entry
+    pop             r5
+%endif
     ret