shithub: openh264

--- a/codec/common/inc/mc.h

+++ b/codec/common/inc/mc.h

@@ -305,11 +305,32 @@

         int32_t iWidth, int32_t iHeight);

 //***************************************************************************//

-//                       SSSE3 definition                                    //

+//                       SSE3 definition                                     //

 //***************************************************************************//

+void McCopyWidthEq16_sse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                           int32_t iHeight); // strided copy written in assembly, see codec/common/x86/mb_copy.asm

+//***************************************************************************//

+//                       SSSE3 definition                                    //

+//***************************************************************************//

 void McChromaWidthEq8_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

                              const uint8_t* kpABCD, int32_t iHeight);

+void McHorVer02_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight); // vertical 6-tap filter, see codec/common/x86/mc_luma.asm

+void McHorVer02Width4S16ToU8_ssse3 (const int16_t* pSrc, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); // int16_t input without a source stride argument

+void McHorVer02Width5S16ToU8_ssse3 (const int16_t* pSrc, int32_t iSrcStride,

+                                    uint8_t* pDst, int32_t iDstStride, int32_t iHeight); // mc.cpp passes iSrcStride as sizeof of one int16_t row (bytes)

+void McHorVer02WidthGe8S16ToU8_ssse3 (const int16_t* pSrc, int32_t iSrcStride,

+                                      uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight); // mc.cpp passes iSrcStride as sizeof of one int16_t row (bytes)

+void McHorVer20_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight); // horizontal 6-tap filter, see codec/common/x86/mc_luma.asm

+void McHorVer20Width4U8ToS16_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, int16_t* pDst, int32_t iHeight); // int16_t output without a destination stride argument

+void McHorVer20Width5Or9Or17_ssse3 (const uint8_t* pSrc, int32_t iSrcStride,

+                                    uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight); // installed as pfLumaHalfpelHor by InitMcFunc

+void McHorVer20Width8U8ToS16_ssse3 (const uint8_t* pSrc, int32_t iSrcStride,

+                                    int16_t* pDst, int32_t iDstStride, int32_t iHeight); // mc.cpp passes iDstStride as sizeof of one int16_t row (bytes)

+void McHorVer20Width9Or17U8ToS16_ssse3 (const uint8_t* pSrc, int32_t iSrcStride,

+                                        int16_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight); // mc.cpp passes iDstStride as sizeof of one int16_t row (bytes)

 #endif //X86_ASM

--- a/codec/common/src/mc.cpp

+++ b/codec/common/src/mc.cpp

@@ -44,6 +44,8 @@

 #include "ls_defines.h"

 #include "macros.h"

+namespace {

 typedef void (*PMcChromaWidthExtFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

                                        const uint8_t* kpABCD, int32_t iHeight);

 typedef void (*PWelsSampleWidthAveragingFunc) (uint8_t*, int32_t, const uint8_t*, int32_t, const uint8_t*,

@@ -51,8 +53,6 @@

 typedef void (*PWelsMcWidthHeightFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

                                         int32_t iWidth, int32_t iHeight);

-namespace WelsCommon {

 /*------------------weight for chroma fraction pixel interpolation------------------*/

 //iA = (8 - dx) * (8 - dy);

 //iB = dx * (8 - dy);

@@ -710,6 +710,183 @@

     McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);

+//***************************************************************************//

+//                          SSSE3 implementation                             //

+//***************************************************************************//

+void PixelAvgWidth4Or8Or16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,

+                                 const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) { // forwards to the PixelAvg routine chosen by iWidth

+  if (iWidth < 8) { // every width below 8 is handled by the 4-wide routine

+    PixelAvgWidthEq4_mmx   (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);

+  } else if (iWidth == 8) {

+    PixelAvgWidthEq8_mmx   (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);

+  } else { // every width above 8 is handled by the 16-wide routine

+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);

+  }

+}

+void McCopy_sse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                  int32_t iWidth, int32_t iHeight) { // block copy dispatched on iWidth; entry [0][0] of the McLuma_ssse3 table

+  switch (iWidth) {

+  case 16: return McCopyWidthEq16_sse3 (pSrc, iSrcStride, pDst, iDstStride, iHeight);

+  case 8:  return McCopyWidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);

+  case 4:  return McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);

+  }

+  return McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); // any width other than 16, 8 or 4 ends up in the 2-wide C routine

+}

+void McHorVer22_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // two passes: McHorVer20*U8ToS16 into an int16_t scratch, then McHorVer02*S16ToU8 from the scratch into pDst

+  ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 16 + 5, 8, 16); // int16_t scratch used as pTmp[row][col]; presumably (16 + 5) rows of 8, 16-byte aligned -- confirm against the macro

+  if (iWidth < 8) { // 4-wide pair of routines; they take no scratch stride

+    McHorVer20Width4U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5); // first pass produces 5 more rows than the output height

+    McHorVer02Width4S16ToU8_ssse3 (&pTmp[0][0], pDst, iDstStride, iHeight);

+  } else if (iWidth == 8) {

+    McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5); // scratch stride is given in bytes (size of one row)

+    McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);

+  } else { // wider blocks: run the 8-wide pair twice, columns 0..7 then 8..15, reusing the same scratch

+    McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);

+    McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, 8, iHeight);

+    McHorVer20Width8U8ToS16_ssse3 (pSrc + 8, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);

+    McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst + 8, iDstStride, 8, iHeight);

+  }

+}

+void McHorVer01_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // pDst = average of pSrc and the vertical-filter (McHorVer02_ssse3) result

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // scratch for the filtered block; row size is passed on as sizeof *pTmp

+  McHorVer02_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride,

+                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);

+}

+void McHorVer03_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // pDst = average of the next source row and the vertical-filter (McHorVer02_ssse3) result

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // scratch for the filtered block; row size is passed on as sizeof *pTmp

+  McHorVer02_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, // pSrc + iSrcStride: one row further down than McHorVer01_ssse3 uses

+                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);

+}

+void McHorVer10_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // pDst = average of pSrc and the horizontal-filter (McHorVer20_ssse3) result

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // scratch for the filtered block; row size is passed on as sizeof *pTmp

+  McHorVer20_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride,

+                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);

+}

+void McHorVer11_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // pDst = average of the horizontal-filter and vertical-filter results, both taken at pSrc

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20_ssse3 output

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02_ssse3 output

+  McHorVer20_ssse3 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);

+  McHorVer02_ssse3 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,

+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+}

+void McHorVer12_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // pDst = average of the vertical-filter and two-pass (McHorVer22_ssse3) results, both taken at pSrc

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02_ssse3 output

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // receives the McHorVer22_ssse3 output

+  McHorVer02_ssse3 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+  McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,

+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+}

+void McHorVer13_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // pDst = average of the horizontal filter on the next row and the vertical filter at pSrc

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20_ssse3 output

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02_ssse3 output

+  McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // source shifted down by one row

+  McHorVer02_ssse3 (pSrc,              iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,

+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+}

+void McHorVer21_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // pDst = average of the horizontal-filter and two-pass (McHorVer22_ssse3) results, both taken at pSrc

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20_ssse3 output

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // receives the McHorVer22_ssse3 output

+  McHorVer20_ssse3 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);

+  McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,

+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+}

+void McHorVer23_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // pDst = average of the horizontal filter on the next row and the two-pass (McHorVer22_ssse3) result at pSrc

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20_ssse3 output

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // receives the McHorVer22_ssse3 output

+  McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // source shifted down by one row

+  McHorVer22_ssse3 (pSrc,              iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,

+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+}

+void McHorVer30_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // pDst = average of the source one column to the right and the horizontal-filter result

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // scratch for the filtered block; row size is passed on as sizeof *pTmp

+  McHorVer20_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // pSrc + 1: one column further right than McHorVer10_ssse3 uses

+}

+void McHorVer31_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // pDst = average of the horizontal filter at pSrc and the vertical filter one column to the right

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20_ssse3 output

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02_ssse3 output

+  McHorVer20_ssse3 (pSrc,     iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);

+  McHorVer02_ssse3 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // source shifted right by one column

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,

+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+}

+void McHorVer32_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // pDst = average of the vertical filter one column to the right and the two-pass (McHorVer22_ssse3) result at pSrc

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02_ssse3 output

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // receives the McHorVer22_ssse3 output

+  McHorVer02_ssse3 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // source shifted right by one column

+  McHorVer22_ssse3 (pSrc,     iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,

+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+}

+void McHorVer33_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // pDst = average of the horizontal filter on the next row and the vertical filter one column to the right

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20_ssse3 output

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02_ssse3 output

+  McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // source shifted down by one row

+  McHorVer02_ssse3 (pSrc + 1,          iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // source shifted right by one column

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,

+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+}

+void McHorVer22Width5Or9Or17_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                                    int32_t iWidth, int32_t iHeight) { // two-pass filter like McHorVer22_ssse3, for the odd widths; installed as pfLumaHalfpelCen by InitMcFunc

+  ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, WELS_ALIGN(17, 16 / sizeof (int16_t)), 16); // int16_t scratch; row length is 17 passed through WELS_ALIGN; terminated with ';' like every other use in this file

+  if (iWidth > 5) { // every width above 5 takes the 9-or-17 first pass and the >= 8 second pass

+    McHorVer20Width9Or17U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight + 5); // first pass produces 5 more rows than the output height

+    McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);

+  } else { // otherwise: 8-wide first pass feeding the 5-wide second pass

+    McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);

+    McHorVer02Width5S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iHeight);

+  }

+}

+void McLuma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { // picks one of 16 routines from the low two bits of iMvX and iMvY

+  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { // first index: iMvX & 3, second index: iMvY & 3; names follow McHorVer<first><second>

+    {McCopy_sse3,      McHorVer01_ssse3, McHorVer02_ssse3, McHorVer03_ssse3}, // [0][0] is the plain copy

+    {McHorVer10_ssse3, McHorVer11_ssse3, McHorVer12_ssse3, McHorVer13_ssse3},

+    {McHorVer20_ssse3, McHorVer21_ssse3, McHorVer22_ssse3, McHorVer23_ssse3},

+    {McHorVer30_ssse3, McHorVer31_ssse3, McHorVer32_ssse3, McHorVer33_ssse3},

+  };

+  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); // the mask keeps both indices within 0..3

+}

 void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

                      int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {

   static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {

@@ -1319,7 +1496,9 @@

 #endif

-void InitMcFunc (SMcFunc* pMcFuncs, uint32_t uiCpuFlag) {

+} // anon ns.

+void WelsCommon::InitMcFunc (SMcFunc* pMcFuncs, uint32_t uiCpuFlag) {

   pMcFuncs->pfLumaHalfpelHor  = McHorVer20_c;

   pMcFuncs->pfLumaHalfpelVer  = McHorVer02_c;

   pMcFuncs->pfLumaHalfpelCen  = McHorVer22_c;

@@ -1338,7 +1517,11 @@

   if (uiCpuFlag & WELS_CPU_SSSE3) {

+    pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_ssse3;

+    pMcFuncs->pfLumaHalfpelVer  = McHorVer02_ssse3;

+    pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17_ssse3;

     pMcFuncs->pMcChromaFunc = McChroma_ssse3;

+    pMcFuncs->pMcLumaFunc   = McLuma_ssse3;

 #endif //(X86_ASM)

@@ -1363,4 +1546,3 @@

 #endif

-} // namespace WelsCommon

--- a/codec/common/x86/mb_copy.asm

+++ b/codec/common/x86/mb_copy.asm

@@ -587,3 +587,28 @@

     LOAD_5_PARA_POP

ret

+;*******************************************************************************

+;   void McCopyWidthEq16_sse3( const uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )

+;*******************************************************************************

+WELS_EXTERN McCopyWidthEq16_sse3 ; strided copy built on the CopyStrided4N macro with lddqu loads and MOVDQ stores

+    %assign push_num 0

+%ifdef X86_32

+    push            r5 ; r5 and r6 are handed to CopyStrided4N below; saved here on 32-bit builds only

+    push            r6

+    %assign push_num 2

+%endif

+    LOAD_5_PARA

+    SIGN_EXTENSION  r1, r1d

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    CopyStrided4N   lddqu, MOVDQ, r2, r3, r0, r1, r4, r5, r6, xmm0, xmm1

+    LOAD_5_PARA_POP

+%ifdef X86_32

+    pop             r6 ; restored in the reverse order of the pushes above

+    pop             r5

+%endif

+    ret

--- a/codec/common/x86/mc_luma.asm

+++ b/codec/common/x86/mc_luma.asm

@@ -51,9 +51,28 @@

 ;*******************************************************************************

 ALIGN 16

-h264_w0x10:

-    dw 16, 16, 16, 16

-ALIGN 16

+shufb_32435465768798A9: ; pshufb control: byte pairs (3,2),(4,3),...,(10,9); like every table below it is exactly 16 bytes, so each label keeps the 16-byte alignment set by the ALIGN 16 above

+    db 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9

+shufb_011267784556ABBC: ; pshufb control: byte pairs (0,1),(1,2),(6,7),(7,8),(4,5),(5,6),(10,11),(11,12)

+    db 0, 1, 1, 2, 6, 7, 7, 8, 4, 5, 5, 6, 0Ah, 0Bh, 0Bh, 0Ch

+maddubsw_p1m5_p1m5_m5p1_m5p1_128: ; pmaddubsw weights (1,-5),(1,-5),(-5,1),(-5,1), stored twice

+    times 2 db 1, -5, 1, -5, -5, 1, -5, 1

+maddubsw_m2p10_m40m40_p10m2_p0p0_128: ; pmaddubsw weights (-2,10),(-40,-40),(10,-2),(0,0), stored twice; used by SSSE3_FilterHorizontalbw_2px

+    times 2 db -2, 10, -40, -40, 10, -2, 0, 0

+dwm1024_128: ; eight words of -1024; pmaddwd factor passed by SSSE3_FilterHorizontal_2px

+    times 8 dw -1024

+dd32768_128: ; four dwords of 32768; added by SSSE3_FilterHorizontal_2px

+    times 4 dd 32768

+maddubsw_p1m5_128: ; pmaddubsw weights (1,-5) in every byte pair

+    times 8 db 1, -5

+maddubsw_m5p1_128: ; pmaddubsw weights (-5,1) in every byte pair

+    times 8 db -5, 1

+db20_128: ; 20 in every byte

+    times 16 db 20

+maddubsw_m5p20_128: ; pmaddubsw weights (-5,20) in every byte pair

+    times 8 db -5, 20

+maddubsw_p20m5_128: ; pmaddubsw weights (20,-5) in every byte pair

+    times 8 db 20, -5

 h264_w0x10_1:

     dw 16, 16, 16, 16, 16, 16, 16, 16

 ALIGN 16

@@ -85,7 +104,7 @@

     sub r0, 2

     WELS_Zero mm7

-    movq mm6, [h264_w0x10]

+    movq mm6, [h264_w0x10_1]

 .height_loop:

     movd mm0, [r0]

     punpcklbw mm0, mm7

@@ -1746,3 +1765,1112 @@

 LOAD_6_PARA_POP

ret

+; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7 -- leaves (ab*w_ab + cd*w_cd + ef*w_ef + 16) >> 5 in %1 as 8 words; %2 and %3 are preserved, %7 is clobbered

+%macro SSSE3_FilterVertical_8px 7

+    pmaddubsw       %1, %4 ; weighted sum of each byte pair of px_ab -> 8 words

+    movdqa          %7, %2 ; work on a copy so px_cd survives

+    pmaddubsw       %7, %5

+    paddw           %1, %7

+    movdqa          %7, %3 ; work on a copy so px_ef survives

+    pmaddubsw       %7, %6

+    paddw           %1, %7

+    paddw           %1, [h264_w0x10_1] ; add 16 to every word

+    psraw           %1, 5 ; arithmetic shift right by 5

+%endmacro

+; px_a=%1 px_f=%2 px_bc=%3 px_de=%4 maddubsw_bc=%5 maddubsw_de=%6 tmp=%7,%8 -- leaves (a + f + bc*w_bc + de*w_de + 16) >> 5 in %1 as 8 words; %2, %3 and %4 are preserved, %7 and %8 are clobbered

+%macro SSSE3_FilterVertical2_8px 8

+    movdqa          %8, %2 ; copy px_f so the caller's register is untouched

+    pxor            %7, %7 ; zero, used to widen bytes to words

+    punpcklbw       %1, %7 ; low 8 bytes of px_a -> 8 words

+    punpcklbw       %8, %7 ; low 8 bytes of px_f -> 8 words

+    paddw           %1, %8 ; a + f (both carry weight 1)

+    movdqa          %7, %3

+    pmaddubsw       %7, %5

+    paddw           %1, %7

+    movdqa          %7, %4

+    pmaddubsw       %7, %6

+    paddw           %1, %7

+    paddw           %1, [h264_w0x10_1] ; add 16 to every word

+    psraw           %1, 5 ; arithmetic shift right by 5

+%endmacro

+; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6 -- leaves 8 unrounded word sums p[i] - 5*p[i+1] + 20*p[i+2] + 20*p[i+3] - 5*p[i+4] + p[i+5] (i = 0..7) in %1

+%macro SSSE3_FilterHorizontalbw_8px 6

+    movdqa          %5, %1 ; second copy of the 16 input bytes for the outer-tap shuffle

+    pshufb          %1, %2 ; byte pairs (p[i+3], p[i+2]): the two centre taps of output i

+    pshufb          %5, %3 ; byte pairs holding one half of the outer taps of each output

+    pshufd          %6, %1, 10110001b ; swap adjacent dwords of the centre pairs; these supply the other half of the outer taps

+    pmaddubsw       %1, [db20_128] ; 20 * (p[i+2] + p[i+3])

+    pmaddubsw       %5, %4 ; (1,-5) / (-5,1) weighted pair sums

+    pmaddubsw       %6, %4 ; same weights on the swapped pairs

+    paddw           %1, %5

+    paddw           %1, %6

+%endmacro

+; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6 -- SSSE3_FilterHorizontalbw_8px followed by (+16) >> 5 on every word

+%macro SSSE3_FilterHorizontal_8px 6

+    SSSE3_FilterHorizontalbw_8px %1, %2, %3, %4, %5, %6

+    paddw           %1, [h264_w0x10_1] ; add 16 to every word

+    psraw           %1, 5 ; arithmetic shift right by 5

+%endmacro

+; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7 -- same unrounded sums as SSSE3_FilterHorizontalbw_8px, but 4 outputs from px0 (low 4 words of %1) and 4 from px1 (high 4 words); %2 is clobbered

+%macro SSSE3_FilterHorizontalbw_2x4px 7

+    movdqa          %6, %1 ; copies of both inputs for the outer-tap shuffle

+    movdqa          %7, %2

+    pshufb          %1, %3 ; centre-tap byte pairs of px0

+    pshufb          %2, %3 ; centre-tap byte pairs of px1

+    punpcklqdq      %1, %2 ; low qword from px0, high qword from px1

+    pshufb          %6, %4 ; outer-tap byte pairs of px0

+    pshufb          %7, %4 ; outer-tap byte pairs of px1

+    punpcklqdq      %6, %7 ; low qword from px0, high qword from px1

+    pshufd          %7, %1, 10110001b ; swap adjacent dwords of the combined centre pairs

+    pmaddubsw       %1, [db20_128] ; centre taps weighted by 20

+    pmaddubsw       %6, %5

+    pmaddubsw       %7, %5

+    paddw           %1, %6

+    paddw           %1, %7

+%endmacro

+; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7 -- SSSE3_FilterHorizontalbw_2x4px followed by (+16) >> 5 on every word

+%macro SSSE3_FilterHorizontal_2x4px 7

+    SSSE3_FilterHorizontalbw_2x4px %1, %2, %3, %4, %5, %6, %7

+    paddw           %1, [h264_w0x10_1] ; add 16 to every word

+    psraw           %1, 5 ; arithmetic shift right by 5

+%endmacro

+; pixels=%1 -32768>>scale=%2 tmp=%3 -- result is left as dwords in %1; %3 is clobbered

+%macro SSSE3_FilterHorizontalbw_2px 3

+    pmaddubsw       %1, [maddubsw_m2p10_m40m40_p10m2_p0p0_128] ; byte pairs weighted (-2,10),(-40,-40),(10,-2),(0,0) -> words

+    pmaddwd         %1, %2 ; multiply the words by the caller's factor and add adjacent pairs -> dwords

+    pshufd          %3, %1, 10110001b ; swap adjacent dwords

+    paddd           %1, %3 ; each dword becomes the sum of itself and its neighbour

+%endmacro

+; pixels=%1 tmp=%2 -- SSSE3_FilterHorizontalbw_2px with the -1024 factor, then 32768 added to every dword

+%macro SSSE3_FilterHorizontal_2px 2

+    SSSE3_FilterHorizontalbw_2px %1, [dwm1024_128], %2

+    paddd           %1, [dd32768_128] ; add 32768 to every dword

+%endmacro

+; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7 -- word inputs; result in %1, %2..%6 are preserved, %7 is clobbered

+%macro SSE2_FilterVerticalw_8px 7

+    paddw           %1, %6 ; px0 + px5

+    movdqa          %7, %2

+    paddw           %7, %5 ; px1 + px4

+    psubw           %1, %7

+    psraw           %1, 2 ; the sum is shifted down in stages between the additions rather than once at the end

+    psubw           %1, %7

+    movdqa          %7, %3

+    paddw           %7, %4 ; px2 + px3

+    paddw           %1, %7

+    psraw           %1, 2

+    paddw           %7, [h264_mc_hc_32] ; constant defined outside this view; presumably the rounding term -- confirm

+    paddw           %1, %7

+    psraw           %1, 6

+%endmacro

+;***********************************************************************

+; void McHorVer02_ssse3(const uint8_t *pSrc,

+;                       int32_t iSrcStride,

+;                       uint8_t *pDst,

+;                       int32_t iDstStride,

+;                       int32_t iWidth,

+;                       int32_t iHeight)

+;***********************************************************************

+WELS_EXTERN McHorVer02_ssse3 ; vertical 6-tap filter (weights 1,-5,20,20,-5,1; +16, >>5, packed to bytes with unsigned saturation) down each column

+%define p_src         r0

+%define i_srcstride   r1

+%define p_dst         r2

+%define i_dststride   r3

+%define i_width       r4

+%define i_height      r5

+%define i_srcstride3  r6

+    %assign push_num 0

+%ifdef X86_32

+    push            r6 ; r6 (i_srcstride3) is saved on 32-bit builds only

+    %assign push_num 1

+%endif

+    LOAD_6_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r1, r1d

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    SIGN_EXTENSION  r5, r5d

+    sub             p_src, i_srcstride

+    sub             p_src, i_srcstride ; p_src now points two rows above the first output row

+    lea             i_srcstride3, [3 * i_srcstride]

+    cmp             i_width, 4

+    jg              .width8or16 ; any width above 4 uses the 8-column loop below

+    movd            xmm0, [p_src] ; width <= 4 path: 4 bytes per source row

+    movd            xmm4, [p_src + i_srcstride]

+    punpcklbw       xmm0, xmm4 ; interleave rows 0 and 1

+    movd            xmm1, [p_src + 2 * i_srcstride]

+    punpcklbw       xmm4, xmm1 ; interleave rows 1 and 2

+    punpcklqdq      xmm0, xmm4 ; low qword: rows (0,1), high qword: rows (1,2) -> taps a,b of two output rows

+    movd            xmm4, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    punpcklbw       xmm1, xmm4

+    movd            xmm2, [p_src]

+    punpcklbw       xmm4, xmm2

+    punpcklqdq      xmm1, xmm4 ; rows (2,3) | (3,4) -> taps c,d

+    movd            xmm4, [p_src + i_srcstride]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    punpcklbw       xmm2, xmm4

+    movd            xmm3, [p_src]

+    punpcklbw       xmm4, xmm3

+    punpcklqdq      xmm2, xmm4 ; rows (4,5) | (5,6) -> taps e,f

+    movdqa          xmm5, [db20_128] ; centre-tap weight kept in a register for all filter calls on this path

+    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4

+    packuswb        xmm0, xmm0

+    movd            [p_dst], xmm0 ; first output row (4 bytes)

+    psrlq           xmm0, 32

+    movd            [p_dst + i_dststride], xmm0 ; second output row

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movd            xmm4, [p_src + i_srcstride]

+    punpcklbw       xmm3, xmm4

+    movd            xmm0, [p_src + 2 * i_srcstride]

+    punpcklbw       xmm4, xmm0

+    punpcklqdq      xmm3, xmm4 ; rows (6,7) | (7,8)

+    SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4

+    packuswb        xmm1, xmm1

+    movd            [p_dst], xmm1 ; third output row

+    psrlq           xmm1, 32

+    movd            [p_dst + i_dststride], xmm1 ; fourth output row

+    cmp             i_height, 5

+    jl              .width4_height_le5_done ; height below 5: the four rows already written are all of it

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movd            xmm4, [p_src + i_srcstride3]

+    punpcklbw       xmm0, xmm4 ; rows (8,9) in the low qword

+    jg              .width4_height_ge8 ; still the flags of "cmp i_height, 5" (lea/movd/punpcklbw do not modify them): height above 5

+    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4

+    packuswb        xmm2, xmm2

+    movd            [p_dst], xmm2 ; height == 5: a single fifth output row

+.width4_height_le5_done:

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop             r6

+%endif

+    ret

+.width4_height_ge8:

+    lea             p_src, [p_src + 4 * i_srcstride]

+    movd            xmm1, [p_src]

+    punpcklbw       xmm4, xmm1

+    punpcklqdq      xmm0, xmm4 ; rows (8,9) | (9,10)

+    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4

+    packuswb        xmm2, xmm2

+    movd            [p_dst], xmm2 ; output rows 5 and 6

+    psrlq           xmm2, 32

+    movd            [p_dst + i_dststride], xmm2

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movd            xmm4, [p_src + i_srcstride]

+    punpcklbw       xmm1, xmm4

+    movd            xmm2, [p_src + 2 * i_srcstride]

+    punpcklbw       xmm4, xmm2

+    punpcklqdq      xmm1, xmm4 ; rows (10,11) | (11,12)

+    SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4

+    packuswb        xmm3, xmm3

+    movd            [p_dst], xmm3 ; output rows 7 and 8

+    psrlq           xmm3, 32

+    movd            [p_dst + i_dststride], xmm3

+    cmp             i_height, 9

+    jl              .width4_height_ge8_done ; height below 9: done after eight rows

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movd            xmm4, [p_src + i_srcstride3]

+    punpcklbw       xmm2, xmm4 ; rows (12,13) in the low qword

+    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4

+    packuswb        xmm0, xmm0

+    movd            [p_dst], xmm0 ; one more (ninth) output row

+.width4_height_ge8_done:

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop             r6

+%endif

+    ret

+.width8or16:

+    sub             i_height, 1

+    push            i_height ; keep height - 1 on the stack; from here on i_height reads that slot via [r7] (r7 presumably aliases the stack pointer -- confirm) and i_ycnt names the register

+%xdefine i_ycnt i_height

+%define i_height [r7]

+.xloop: ; one pass per 8-column strip

+    push            p_src ; remember the top of this strip

+    push            p_dst

+    test            i_ycnt, 1

+    jnz             .yloop_begin_even ; height - 1 odd, i.e. an even number of output rows

+    movq            xmm0, [p_src] ; odd number of output rows: emit one row on its own first

+    movq            xmm1, [p_src + i_srcstride]

+    punpcklbw       xmm0, xmm1

+    movq            xmm2, [p_src + 2 * i_srcstride]

+    movq            xmm3, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    punpcklbw       xmm2, xmm3

+    movq            xmm4, [p_src]

+    movq            xmm5, [p_src + i_srcstride]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    punpcklbw       xmm4, xmm5

+    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm7

+    packuswb        xmm0, xmm0

+    movlps          [p_dst], xmm0 ; 8 output bytes

+    add             p_dst, i_dststride

+    jmp             .yloop

+.yloop_begin_even: ; preload five rows in the register layout .yloop expects

+    movq            xmm1, [p_src]

+    movq            xmm2, [p_src + i_srcstride]

+    movq            xmm3, [p_src + 2 * i_srcstride]

+    add             p_src, i_srcstride3

+    punpcklbw       xmm2, xmm3

+    movq            xmm4, [p_src]

+    movq            xmm5, [p_src + i_srcstride]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    punpcklbw       xmm4, xmm5

+.yloop: ; each pair of filter calls below yields two output rows: FilterVertical2 for the first, FilterVertical for the second

+    movq            xmm6, [p_src]

+    SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm0, xmm7

+    movq            xmm7, [p_src + i_srcstride]

+    punpcklbw       xmm6, xmm7

+    SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm0

+    packuswb        xmm1, xmm2

+    movlps          [p_dst], xmm1 ; low 8 bytes: first row of the pair

+    movhps          [p_dst + i_dststride], xmm1 ; high 8 bytes: second row of the pair

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movq            xmm0, [p_src + 2 * i_srcstride]

+    SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm2, xmm1

+    movq            xmm1, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    punpcklbw       xmm0, xmm1

+    SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm2

+    packuswb        xmm3, xmm4

+    movlps          [p_dst], xmm3

+    movhps          [p_dst + i_dststride], xmm3

+    cmp             i_ycnt, 4

+    jle             .yloop_exit ; stop after four rows of this pass when the counter is 4 or less

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movq            xmm2, [p_src]

+    SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm4, xmm3

+    movq            xmm3, [p_src + i_srcstride]

+    punpcklbw       xmm2, xmm3

+    SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm4

+    packuswb        xmm5, xmm6

+    movlps          [p_dst], xmm5

+    movhps          [p_dst + i_dststride], xmm5

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movq            xmm4, [p_src + 2 * i_srcstride]

+    SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm6, xmm5

+    movq            xmm5, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    punpcklbw       xmm4, xmm5

+    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm6

+    packuswb        xmm7, xmm0

+    movlps          [p_dst], xmm7

+    movhps          [p_dst + i_dststride], xmm7

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    sub             i_ycnt, 8 ; eight rows were written in this pass

+    jg              .yloop

+.yloop_exit:

+    pop             p_dst ; back to the top of the strip

+    pop             p_src

+    sub             i_width, 8

+    jle             .width8or16_done ; no columns left

+    add             p_src, 8 ; move to the next 8-column strip

+    add             p_dst, 8

+    mov             i_ycnt, i_height ; reload height - 1 from the saved slot (read after the two pops above)

+    jmp             .xloop

+.width8or16_done:

+    pop             i_ycnt ; discard the saved height - 1

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop             r6

+%endif

+    ret

+%undef p_src

+%undef i_srcstride

+%undef i_srcstride3

+%undef p_dst

+%undef i_dststride

+%undef i_width

+%undef i_height

+%undef i_ycnt

+;*******************************************************************************

+; void McHorVer20_ssse3(const uint8_t *pSrc,

+;                       int iSrcStride,

+;                       uint8_t *pDst,

+;                       int iDstStride,

+;                       int iWidth,

+;                       int iHeight);

+;*******************************************************************************

+WELS_EXTERN McHorVer20_ssse3 ; horizontal 6-tap filter (weights 1,-5,20,20,-5,1; +16, >>5, packed to bytes with unsigned saturation) along each row

+%define p_src        r0

+%define i_srcstride  r1

+%define p_dst        r2

+%define i_dststride  r3

+%define i_width      r4

+%define i_height     r5

+    %assign  push_num 0

+    LOAD_6_PARA

+    PUSH_XMM 7

+    SIGN_EXTENSION  r1, r1d

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    SIGN_EXTENSION  r5, r5d

+    movdqa          xmm4, [shufb_32435465768798A9] ; shuffle controls and weights stay in xmm4..xmm6 for all three loops

+    movdqa          xmm5, [shufb_011267784556ABBC]

+    movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]

+    cmp             i_width, 8

+    je              .width8_yloop

+    jg              .width16_yloop ; any width above 8; everything below 8 falls into the 4-wide loop

+.width4_yloop: ; two rows per iteration, 4 output pixels each

+    movdqu          xmm0, [p_src - 2] ; 16 unaligned bytes starting two pixels left of the first output pixel

+    movdqu          xmm1, [p_src + i_srcstride - 2]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    SSSE3_FilterHorizontal_2x4px xmm0, xmm1, xmm4, xmm5, xmm6, xmm2, xmm3

+    packuswb        xmm0, xmm0

+    movd            [p_dst], xmm0 ; first row (4 bytes)

+    psrlq           xmm0, 32

+    movd            [p_dst + i_dststride], xmm0 ; second row

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    sub             i_height, 2

+    jg              .width4_yloop

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+.width8_yloop: ; two rows per iteration, 8 output pixels each

+    movdqu          xmm0, [p_src - 2]

+    movdqu          xmm1, [p_src + i_srcstride - 2]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    SSSE3_FilterHorizontal_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3

+    SSSE3_FilterHorizontal_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3

+    packuswb        xmm0, xmm1

+    movlps          [p_dst], xmm0 ; low 8 bytes: first row

+    movhps          [p_dst + i_dststride], xmm0 ; high 8 bytes: second row

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    sub             i_height, 2

+    jg              .width8_yloop

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+.width16_yloop: ; one row per iteration, filtered as two 8-pixel halves

+    movdqu          xmm0, [p_src - 2] ; input for output pixels 0..7

+    movdqu          xmm1, [p_src + 6] ; input for output pixels 8..15 (8 bytes further along)

+    add             p_src, i_srcstride

+    SSSE3_FilterHorizontal_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3

+    SSSE3_FilterHorizontal_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3

+    packuswb        xmm0, xmm1

+    MOVDQ           [p_dst], xmm0

+    add             p_dst, i_dststride

+    sub             i_height, 1

+    jg              .width16_yloop

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_width

+%undef i_height

+;***********************************************************************

+; void McHorVer20Width5Or9Or17_ssse3(const uint8_t *pSrc,    // p_src (r0): every row is read starting at pSrc - 2

+;                                    int32_t iSrcStride,     // r1: byte step between source rows

+;                                    uint8_t *pDst,          // r2: 8-bit output

+;                                    int32_t iDstStride,     // r3: byte step between output rows

+;                                    int32_t iWidth,         // r4: 9 and above-9 have their own paths, anything else takes the 5-wide path

+;                                    int32_t iHeight);       // r5: row count; the 9- and 17-wide paths consume rows in pairs

+;***********************************************************************

+WELS_EXTERN McHorVer20Width5Or9Or17_ssse3 ; runs the SSSE3_FilterHorizontal_* macros over each row and stores 5, 9 or 17 result bytes per row

+%define p_src        r0

+%define i_srcstride  r1

+%define p_dst        r2

+%define i_dststride  r3

+%define i_width      r4

+%define i_height     r5

+    %assign  push_num 0

+    LOAD_6_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r1, r1d                                 ; presumably widens the 32-bit int arguments to full register width (macro defined elsewhere)

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    SIGN_EXTENSION  r5, r5d

+    movdqa          xmm5, [shufb_32435465768798A9]          ; xmm5..xmm7: constants handed unchanged to every SSSE3_FilterHorizontal_8px call below

+    movdqa          xmm6, [shufb_011267784556ABBC]

+    movdqa          xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]

+    cmp             i_width, 9

+    je              .width9_yloop                           ; exactly 9

+    jg              .width17_yloop                          ; above 9; smaller widths fall through to the 5-wide path

+.width5_yloop:                                              ; one row per iteration

+    movdqu          xmm0, [p_src - 2]                       ; 16 unaligned bytes starting 2 bytes left of the row

+    add             p_src, i_srcstride

+    SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2

+    packuswb        xmm0, xmm0                              ; saturate the word results to unsigned bytes

+    movdqa          xmm1, xmm0

+    psrlq           xmm1, 8                                 ; drop byte 0 so bytes 1..4 sit in the low dword

+    movd            [p_dst], xmm0                           ; bytes 0..3

+    movd            [p_dst + 1], xmm1                       ; bytes 1..4; the two overlapping dword stores cover 5 bytes

+    add             p_dst, i_dststride

+    sub             i_height, 1

+    jg              .width5_yloop

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+.width9_yloop:                                              ; two rows per iteration

+    movdqu          xmm0, [p_src - 2]                       ; row 0

+    movdqu          xmm4, [p_src + i_srcstride - 2]         ; row 1

+    lea             p_src, [p_src + 2 * i_srcstride]

+    movdqa          xmm3, xmm0

+    punpckhqdq      xmm3, xmm4                              ; xmm3 = high 8 bytes of row 0 : high 8 bytes of row 1

+    SSSE3_FilterHorizontal_2px xmm3, xmm2                   ; presumably yields the 9th pixel of both rows (macro defined elsewhere)

+    SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2

+    packuswb        xmm3, xmm0                              ; low qword: packed 2px result, high qword: row 0 pixels 0..7

+    movd            [p_dst + 5], xmm3                       ; writes bytes 5..8; only byte 8 survives the next store

+    movhps          [p_dst], xmm3                           ; row 0 bytes 0..7

+    add             p_dst, i_dststride

+    SSSE3_FilterHorizontal_8px xmm4, xmm5, xmm6, xmm7, xmm1, xmm2

+    packuswb        xmm4, xmm4

+    psrldq          xmm3, 4                                 ; bring the next dword of the 2px result down for row 1

+    movd            [p_dst + 5], xmm3                       ; same overlap trick as row 0

+    movlps          [p_dst], xmm4                           ; row 1 bytes 0..7

+    add             p_dst, i_dststride

+    sub             i_height, 2

+    jg              .width9_yloop

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+.width17_yloop:                                             ; two rows per iteration, each row read as two 16-byte loads

+    movdqu          xmm0, [p_src - 2]                       ; row 0, left load

+    movdqu          xmm3, [p_src + 6]                       ; row 0, right load (8 bytes further on)

+    add             p_src, i_srcstride

+    movdqa          xmm4, xmm3                              ; keep row 0's raw right load for the 2px filter

+    SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2

+    SSSE3_FilterHorizontal_8px xmm3, xmm5, xmm6, xmm7, xmm1, xmm2

+    packuswb        xmm0, xmm3                              ; row 0 pixels 0..15

+    movdqu          xmm1, [p_src - 2]                       ; row 1 (p_src was already advanced)

+    movdqu          xmm3, [p_src + 6]

+    add             p_src, i_srcstride

+    punpckhqdq      xmm4, xmm3                              ; high 8 bytes of both rows' right loads

+    SSSE3_FilterHorizontal_2px xmm4, xmm2                   ; presumably yields the 17th pixel of both rows

+    packuswb        xmm4, xmm4

+    movd            [p_dst + 13], xmm4                      ; writes bytes 13..16; MOVDQ below (presumably a 16-byte store) rewrites 13..15

+    MOVDQ           [p_dst], xmm0

+    add             p_dst, i_dststride

+    psrldq          xmm4, 4                                 ; next dword of the 2px result, for row 1

+    movd            [p_dst + 13], xmm4

+    SSSE3_FilterHorizontal_8px xmm1, xmm5, xmm6, xmm7, xmm0, xmm2

+    SSSE3_FilterHorizontal_8px xmm3, xmm5, xmm6, xmm7, xmm0, xmm2

+    packuswb        xmm1, xmm3                              ; row 1 pixels 0..15

+    MOVDQ           [p_dst], xmm1

+    add             p_dst, i_dststride

+    sub             i_height, 2

+    jg              .width17_yloop

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_width

+%undef i_height

+;*******************************************************************************

+; void McHorVer20Width4U8ToS16_ssse3(const uint8_t *pSrc,    // p_src (r0): reading starts two rows above pSrc, at column -2

+;                                    int32_t iSrcStride,     // r1: byte step between source rows

+;                                    int16_t *pDst,          // r2: 8 bytes written per row, rows packed back to back; needs 16-byte alignment (movdqa store)

+;                                    int32_t iHeight);       // r3: row count; see the note on the trailing row below

+;*******************************************************************************

+WELS_EXTERN McHorVer20Width4U8ToS16_ssse3 ; runs the SSSE3_FilterHorizontalbw_* macros over each row and stores the results without packing to bytes

+%define p_src        r0

+%define i_srcstride  r1

+%define p_dst        r2

+%define i_height     r3

+    %assign  push_num 0

+    LOAD_4_PARA

+    PUSH_XMM 7

+    SIGN_EXTENSION  r1, r1d                                 ; presumably widens the 32-bit int arguments to full register width (macro defined elsewhere)

+    SIGN_EXTENSION  r3, r3d

+    sub             p_src, i_srcstride                      ; together with the next line: start two rows above pSrc

+    sub             p_src, i_srcstride

+    movdqa          xmm4, [shufb_32435465768798A9]          ; xmm4..xmm6: constants handed unchanged to the filter macros below

+    movdqa          xmm5, [shufb_011267784556ABBC]

+    movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]

+    sub             i_height, 1                             ; one row is reserved for the code after the loop

+.yloop:                                                     ; two rows per iteration

+    movdqu          xmm0, [p_src - 2]

+    movdqu          xmm1, [p_src + i_srcstride - 2]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    SSSE3_FilterHorizontalbw_2x4px xmm0, xmm1, xmm4, xmm5, xmm6, xmm2, xmm3

+    movdqa          [p_dst], xmm0                           ; aligned 16-byte store holding both rows

+    add             p_dst, 16                               ; 8 bytes per row; there is no destination stride

+    sub             i_height, 2

+    jg              .yloop

+    ; Trailing row. NOTE(review): this runs unconditionally, so an even iHeight would write one extra row - confirm callers pass odd heights.

+    movdqu          xmm0, [p_src - 2]

+    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3

+    movlps          [p_dst], xmm0                           ; only the low 8 bytes are kept

+    POP_XMM

+    LOAD_4_PARA_POP

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_height

+;***********************************************************************

+; void McHorVer02Width4S16ToU8_ssse3(const int16_t *pSrc,    // p_src (r0): rows are read at a fixed pitch of 8 bytes (i_srcstride below); needs 16-byte alignment (movdqa loads)

+;                                    uint8_t *pDst,          // r1: 4 bytes written per output row

+;                                    int32_t iDstStride,     // r2: byte step between output rows

+;                                    int32_t iHeight);       // r3: 4 rows are written when iHeight is 4 or less, otherwise 8

+;***********************************************************************

+WELS_EXTERN McHorVer02Width4S16ToU8_ssse3 ; feeds 16-bit rows to SSE2_FilterVerticalw_8px and stores the results packed to unsigned bytes

+%define p_src        r0

+%define p_dst        r1

+%define i_dststride  r2

+%define i_height     r3

+%define i_srcstride  8

+    %assign  push_num 0

+    LOAD_4_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r2, r2d                                 ; presumably widens the 32-bit int arguments to full register width (macro defined elsewhere)

+    SIGN_EXTENSION  r3, r3d

+    movdqa          xmm0, [p_src +  0 * i_srcstride]        ; every 16-byte load spans two 8-byte source rows (k and k+1)

+    movdqu          xmm1, [p_src +  1 * i_srcstride]        ; odd multiples of 8 are not 16-byte aligned, hence movdqu

+    movdqa          xmm2, [p_src +  2 * i_srcstride]

+    movdqu          xmm3, [p_src +  3 * i_srcstride]

+    movdqa          xmm4, [p_src +  4 * i_srcstride]

+    movdqu          xmm5, [p_src +  5 * i_srcstride]

+    movdqa          xmm6, [p_src +  6 * i_srcstride]

+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7 ; six row registers in, result read back from the first; last operand presumably scratch

+    packuswb        xmm0, xmm0                              ; saturate to unsigned bytes

+    movd            [p_dst], xmm0                           ; output row 0

+    psrlq           xmm0, 32

+    movd            [p_dst + i_dststride], xmm0             ; output row 1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movdqu          xmm7, [p_src +  7 * i_srcstride]

+    movdqa          xmm0, [p_src +  8 * i_srcstride]        ; NOTE(review): only consumed after the height check below, yet loaded unconditionally

+    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm1

+    packuswb        xmm2, xmm2

+    movd            [p_dst], xmm2                           ; output row 2

+    psrlq           xmm2, 32

+    movd            [p_dst + i_dststride], xmm2             ; output row 3

+    cmp             i_height, 4

+    jle             .done                                   ; four rows were all that was asked for

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movdqu          xmm1, [p_src +  9 * i_srcstride]

+    movdqa          xmm2, [p_src + 10 * i_srcstride]

+    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm3

+    packuswb        xmm4, xmm4

+    movd            [p_dst], xmm4                           ; output row 4

+    psrlq           xmm4, 32

+    movd            [p_dst + i_dststride], xmm4             ; output row 5

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movdqu          xmm3, [p_src + 11 * i_srcstride]

+    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm5

+    packuswb        xmm6, xmm6

+    movd            [p_dst], xmm6                           ; output row 6

+    psrlq           xmm6, 32

+    movd            [p_dst + i_dststride], xmm6             ; output row 7

+.done:

+    POP_XMM

+    LOAD_4_PARA_POP

+    ret

+%undef p_src

+%undef p_dst

+%undef i_dststride

+%undef i_height

+%undef i_srcstride

+;***********************************************************************

+; void McHorVer20Width8U8ToS16_ssse3(const uint8_t *pSrc,    // p_src (r0): reading starts two rows above pSrc, at column -2

+;                                    int32_t iSrcStride,     // r1: byte step between source rows (sign-extended from 32 bits below)

+;                                    int16_t *pDst,          // r2: one MOVDQ store per row

+;                                    int32_t iDstStride,     // r3: added to the pDst address as-is, i.e. a byte count

+;                                    int32_t iHeight);       // r4: row count, odd or even

+;***********************************************************************

+WELS_EXTERN McHorVer20Width8U8ToS16_ssse3 ; runs SSSE3_FilterHorizontalbw_8px over each row and stores the results without packing to bytes

+%define p_src        r0

+%define i_srcstride  r1

+%define p_dst        r2

+%define i_dststride  r3

+%define i_height     r4

+    %assign  push_num 0

+    LOAD_5_PARA

+    PUSH_XMM 7

+    SIGN_EXTENSION  r1, r1d                                 ; presumably widens the 32-bit int arguments to full register width (macro defined elsewhere)

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    sub             p_src, i_srcstride                      ; together with the next line: start two rows above pSrc

+    sub             p_src, i_srcstride

+    movdqa          xmm4, [shufb_32435465768798A9]          ; xmm4..xmm6: constants handed unchanged to the filter macro below

+    movdqa          xmm5, [shufb_011267784556ABBC]

+    movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]

+    sub             i_height, 1                             ; bias so the flags after the loop tell odd heights from even ones

+.yloop:                                                     ; two rows per iteration

+    movdqu          xmm0, [p_src - 2]

+    movdqu          xmm1, [p_src + i_srcstride - 2]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3

+    MOVDQ           [p_dst], xmm0                           ; MOVDQ is defined elsewhere; presumably a 16-byte store

+    add             p_dst, i_dststride

+    SSSE3_FilterHorizontalbw_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3

+    MOVDQ           [p_dst], xmm1

+    add             p_dst, i_dststride

+    sub             i_height, 2

+    jg              .yloop

+    jl              .done                                   ; counter went negative: iHeight was even, nothing is left

+    movdqu          xmm0, [p_src - 2]                       ; counter hit zero: one last row for an odd iHeight

+    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3

+    MOVDQ           [p_dst], xmm0

+.done:

+    POP_XMM

+    LOAD_5_PARA_POP

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_height

+;***********************************************************************

+; void McHorVer02Width5S16ToU8_ssse3(const int16_t *pSrc,    // p_src (r0): each row is fetched with movdqa, so rows need 16-byte alignment

+;                                    int32_t iSrcStride,     // r1: added to the pSrc address as-is, i.e. a byte count

+;                                    uint8_t *pDst,          // r2: 5 bytes written per output row

+;                                    int32_t iDstStride,     // r3: byte step between output rows

+;                                    int32_t iHeight);       // r4: 5 rows are written when iHeight is 5 or less, otherwise 9

+;***********************************************************************

+WELS_EXTERN McHorVer02Width5S16ToU8_ssse3 ; fully unrolled: slides a six-row window through SSE2_FilterVerticalw_8px and stores results packed to bytes

+%define p_src        r0

+%define i_srcstride  r1

+%define p_dst        r2

+%define i_dststride  r3

+%define i_height     r4

+%define i_srcstride3 r5

+    %assign  push_num 0

+%ifdef X86_32

+    push            r5                                      ; r5 serves as i_srcstride3; saved by hand here and restored before ret

+    %assign  push_num 1

+%endif

+    LOAD_5_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r1, r1d                                 ; presumably widens the 32-bit int arguments to full register width (macro defined elsewhere)

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    lea             i_srcstride3, [3 * i_srcstride]

+    movdqa          xmm0, [p_src]                           ; source row 0

+    movdqa          xmm1, [p_src + i_srcstride]

+    movdqa          xmm2, [p_src + 2 * i_srcstride]

+    movdqa          xmm3, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    movdqa          xmm4, [p_src]

+    movdqa          xmm5, [p_src + i_srcstride]             ; source row 5

+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 ; six rows in, result read back from the first operand; last operand presumably scratch

+    movdqa          xmm6, [p_src + 2 * i_srcstride]         ; source row 6

+    packuswb        xmm0, xmm0                              ; saturate to unsigned bytes

+    movdqa          xmm7, xmm0

+    psrlq           xmm7, 8                                 ; bytes 1..4 moved into the low dword

+    movd            [p_dst + 1], xmm7                       ; with the next store: two overlapping dwords cover 5 bytes

+    movd            [p_dst], xmm0                           ; output row 0

+    add             p_dst, i_dststride

+    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7

+    movdqa          xmm7, [p_src + i_srcstride3]            ; source row 7

+    lea             p_src, [p_src + 4 * i_srcstride]

+    packuswb        xmm1, xmm1

+    movdqa          xmm0, xmm1

+    psrlq           xmm0, 8

+    movd            [p_dst + 1], xmm0

+    movd            [p_dst], xmm1                           ; output row 1

+    add             p_dst, i_dststride

+    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0

+    movdqa          xmm0, [p_src]                           ; source row 8

+    packuswb        xmm2, xmm2

+    movdqa          xmm1, xmm2

+    psrlq           xmm1, 8

+    movd            [p_dst + 1], xmm1

+    movd            [p_dst], xmm2                           ; output row 2

+    add             p_dst, i_dststride

+    SSE2_FilterVerticalw_8px xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1

+    packuswb        xmm3, xmm3

+    movdqa          xmm2, xmm3

+    psrlq           xmm2, 8

+    movd            [p_dst + 1], xmm2

+    movd            [p_dst], xmm3                           ; output row 3

+    add             p_dst, i_dststride

+    movdqa          xmm1, [p_src + i_srcstride]             ; source row 9

+    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2

+    packuswb        xmm4, xmm4

+    movdqa          xmm3, xmm4

+    psrlq           xmm3, 8

+    movd            [p_dst + 1], xmm3

+    movd            [p_dst], xmm4                           ; output row 4

+    cmp             i_height, 5

+    jle             .done                                   ; five rows were all that was asked for; otherwise four more follow

+    add             p_dst, i_dststride

+    movdqa          xmm2, [p_src + 2 * i_srcstride]         ; source row 10

+    SSE2_FilterVerticalw_8px xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3

+    movdqa          xmm3, [p_src + i_srcstride3]            ; source row 11

+    lea             p_src, [p_src + 4 * i_srcstride]

+    packuswb        xmm5, xmm5

+    movdqa          xmm4, xmm5

+    psrlq           xmm4, 8

+    movd            [p_dst + 1], xmm4

+    movd            [p_dst], xmm5                           ; output row 5

+    add             p_dst, i_dststride

+    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4

+    movdqa          xmm4, [p_src]                           ; source row 12

+    packuswb        xmm6, xmm6

+    movdqa          xmm5, xmm6

+    psrlq           xmm5, 8

+    movd            [p_dst + 1], xmm5

+    movd            [p_dst], xmm6                           ; output row 6

+    add             p_dst, i_dststride

+    SSE2_FilterVerticalw_8px xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5

+    packuswb        xmm7, xmm7

+    movdqa          xmm6, xmm7

+    psrlq           xmm6, 8

+    movd            [p_dst + 1], xmm6

+    movd            [p_dst], xmm7                           ; output row 7

+    add             p_dst, i_dststride

+    movdqa          xmm5, [p_src + i_srcstride]             ; source row 13

+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6

+    packuswb        xmm0, xmm0

+    movdqa          xmm7, xmm0

+    psrlq           xmm7, 8

+    movd            [p_dst + 1], xmm7

+    movd            [p_dst], xmm0                           ; output row 8

+.done:

+    POP_XMM

+    LOAD_5_PARA_POP

+%ifdef X86_32

+    pop             r5                                      ; matches the push at entry

+%endif

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_height

+%undef i_srcstride3

+;***********************************************************************

+; void McHorVer20Width9Or17U8ToS16_ssse3(const uint8_t *pSrc,    // p_src (r0): reading starts two rows above pSrc, at column -2

+;                                        int32_t iSrcStride,     // r1: byte step between source rows

+;                                        int16_t *pDst,          // r2: 9 or 17 int16 results written per row

+;                                        int32_t iDstStride,     // r3: added to the pDst address as-is, i.e. a byte count

+;                                        int32_t iWidth,         // r4: 9 selects the 9-wide path, any other value the 17-wide path

+;                                        int32_t iHeight);       // r5: row count; both paths consume rows in pairs

+;***********************************************************************

+WELS_EXTERN McHorVer20Width9Or17U8ToS16_ssse3 ; runs the SSSE3_FilterHorizontalbw_* macros over each row and stores the results without packing to bytes

+%define p_src       r0

+%define i_srcstride r1

+%define p_dst       r2

+%define i_dststride r3

+%define i_width     r4

+%define i_height    r5

+    %assign  push_num 0

+    LOAD_6_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r1, r1d                                 ; presumably widens the 32-bit int arguments to full register width (macro defined elsewhere)

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    SIGN_EXTENSION  r5, r5d

+    sub             p_src, i_srcstride                      ; together with the next line: start two rows above pSrc

+    sub             p_src, i_srcstride

+    pcmpeqw         xmm4, xmm4                              ; all bits set

+    psllw           xmm4, 15                                ; dw -32768 in every word; handed to SSSE3_FilterHorizontalbw_2px below

+    movdqa          xmm5, [shufb_32435465768798A9]          ; xmm5..xmm7: constants handed unchanged to every SSSE3_FilterHorizontalbw_8px call

+    movdqa          xmm6, [shufb_011267784556ABBC]

+    movdqa          xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]

+    cmp             i_width, 9

+    jne             .width17_yloop                          ; every width other than 9

+.width9_yloop:                                              ; two rows per iteration

+    movdqu          xmm0, [p_src - 2]                       ; row 0

+    movdqa          xmm3, xmm0                              ; keep the raw row 0 bytes for the 2px filter

+    SSSE3_FilterHorizontalbw_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2

+    movdqu          xmm2, [p_src + i_srcstride - 2]         ; row 1

+    lea             p_src, [p_src + 2 * i_srcstride]

+    punpckhqdq      xmm3, xmm2                              ; xmm3 = high 8 bytes of row 0 : high 8 bytes of row 1

+    SSSE3_FilterHorizontalbw_2px xmm3, xmm4, xmm1           ; presumably yields the 9th result of both rows (macro defined elsewhere)

+    movlps          [p_dst + 10], xmm3                      ; writes bytes 10..17; MOVDQ below (presumably a 16-byte store) rewrites 10..15, leaving int16 #8

+    MOVDQ           [p_dst], xmm0                           ; row 0, int16 #0..7

+    add             p_dst, i_dststride

+    movhps          [p_dst + 10], xmm3                      ; same overlap trick for row 1, using the high qword

+    SSSE3_FilterHorizontalbw_8px xmm2, xmm5, xmm6, xmm7, xmm1, xmm0

+    MOVDQ           [p_dst], xmm2                           ; row 1, int16 #0..7

+    add             p_dst, i_dststride

+    sub             i_height, 2

+    jg              .width9_yloop

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+.width17_yloop:                                             ; two rows per iteration, each row read as two 16-byte loads

+    movdqu          xmm0, [p_src - 2]                       ; row 0, left load

+    movdqu          xmm3, [p_src + 6]                       ; row 0, right load (8 bytes further on)

+    add             p_src, i_srcstride

+    SSSE3_FilterHorizontalbw_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2

+    MOVDQ           [p_dst], xmm0                           ; row 0, int16 #0..7

+    movdqa          xmm0, xmm3                              ; keep row 0's raw right load for the 2px filter

+    SSSE3_FilterHorizontalbw_8px xmm3, xmm5, xmm6, xmm7, xmm1, xmm2

+    movdqu          xmm2, [p_src + 6]                       ; row 1, right load (p_src was already advanced)

+    punpckhqdq      xmm0, xmm2                              ; high 8 bytes of both rows' right loads

+    SSSE3_FilterHorizontalbw_2px xmm0, xmm4, xmm1           ; presumably yields the 17th result of both rows

+    movdqu          xmm1, [p_src - 2]                       ; row 1, left load

+    add             p_src, i_srcstride

+    movlps          [p_dst + 26], xmm0                      ; writes bytes 26..33; only int16 #16 (bytes 32..33) survives the next store

+    MOVDQ           [p_dst + 16], xmm3                      ; row 0, int16 #8..15

+    add             p_dst, i_dststride

+    movhps          [p_dst + 26], xmm0                      ; same overlap trick for row 1, using the high qword

+    SSSE3_FilterHorizontalbw_8px xmm1, xmm5, xmm6, xmm7, xmm0, xmm3

+    MOVDQ           [p_dst], xmm1                           ; row 1, int16 #0..7

+    SSSE3_FilterHorizontalbw_8px xmm2, xmm5, xmm6, xmm7, xmm0, xmm3

+    MOVDQ           [p_dst + 16], xmm2                      ; row 1, int16 #8..15

+    add             p_dst, i_dststride

+    sub             i_height, 2

+    jg              .width17_yloop

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_width

+%undef i_height

+;***********************************************************************

+; void McHorVer02WidthGe8S16ToU8_ssse3(const int16_t *pSrc,    // p_src (r0): .width_loop fetches rows with movdqa, so rows need 16-byte alignment

+;                                      int32_t iSrcStride,     // r1: added to the pSrc address as-is, i.e. a byte count

+;                                      uint8_t *pDst,          // r2: one byte written per result

+;                                      int32_t iDstStride,     // r3: byte step between output rows

+;                                      int32_t iWidth,         // r4: an odd width has its last column done first; the rest goes 8 columns at a time

+;                                      int32_t iHeight);       // r5: row count

+;***********************************************************************

+WELS_EXTERN McHorVer02WidthGe8S16ToU8_ssse3 ; feeds 16-bit rows to SSE2_FilterVerticalw_8px and stores the results packed to unsigned bytes

+%define p_src        r0

+%define i_srcstride  r1

+%define p_dst        r2

+%define i_dststride  r3

+%define i_width      r4

+%define i_height     r5

+%define i_srcstride3 r6

+    %assign  push_num 0

+%ifdef X86_32

+    push            r6                                      ; r6 serves as i_srcstride3; saved by hand here and restored before ret

+    %assign  push_num 1

+%endif

+    LOAD_6_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r1, r1d                                 ; presumably widens the 32-bit int arguments to full register width (macro defined elsewhere)

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    SIGN_EXTENSION  r5, r5d

+    sub             i_height, 1                             ; all row counting below works on iHeight - 1

+    push            i_height                                ; parked on the stack so every column pass can reload it through [r7]

+    lea             i_srcstride3, [3 * i_srcstride]

+    test            i_width, 1

+    jz              .width_loop                             ; even width: no stray last column

+    push            p_src

+    push            p_dst

+    lea             p_src, [p_src + 2 * i_width - 2]        ; address of the last int16 column

+    add             p_dst, i_width                          ; one past the last output column; the wanted byte of each store below lands at p_dst - 1

+    movd            xmm0, [p_src]                           ; from here to punpcklqdq: gather this column from rows 0..7 into the words of xmm0

+    punpcklwd       xmm0, [p_src + i_srcstride]             ; NOTE(review): memory-operand form reads 16 bytes and, non-VEX, expects a 16-byte aligned address - confirm for all odd widths

+    movd            xmm1, [p_src + 2 * i_srcstride]

+    add             p_src, i_srcstride3

+    punpcklwd       xmm1, [p_src]

+    punpckldq       xmm0, xmm1                              ; words 0..3 = rows 0..3

+    movd            xmm1, [p_src + i_srcstride]             ; row 4

+    cmp             i_height, 4                             ; i_height already holds iHeight - 1

+    je              .filter5_unalign                        ; iHeight == 5 takes the short path

+    punpcklwd       xmm1, [p_src + 2 * i_srcstride]

+    movd            xmm2, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    punpcklwd       xmm2, [p_src]

+    punpckldq       xmm1, xmm2

+    punpcklqdq      xmm0, xmm1                              ; xmm0 words 0..7 = this column in rows 0..7

+.height_loop_unalign:                                       ; eight output rows of the single column per iteration

+    movd            xmm1, [p_src + i_srcstride]             ; next row's sample

+    palignr         xmm1, xmm0, 2                           ; xmm1 = xmm0 moved down one word with the new sample appended on top

+    movd            xmm2, [p_src + 2 * i_srcstride]

+    palignr         xmm2, xmm1, 2

+    movd            xmm3, [p_src + i_srcstride3]

+    palignr         xmm3, xmm2, 2

+    lea             p_src, [p_src + 4 * i_srcstride]

+    movd            xmm4, [p_src]

+    palignr         xmm4, xmm3, 2

+    movd            xmm5, [p_src + i_srcstride]

+    palignr         xmm5, xmm4, 2                           ; xmm0..xmm5 now hold the column at row offsets 0..5

+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7 ; result read back from xmm0; last operand presumably scratch

+    packuswb        xmm0, xmm0                              ; byte k = result for row k of this group

+    movdqa          xmm6, xmm0

+    pslld           xmm6, 24                                ; byte 0 of each dword moved to byte 3, the last byte a dword store writes

+    movd            [p_dst - 4], xmm6                       ; row +0 lands at p_dst - 1; the filler bytes before it fall on lower columns that .width_loop stores afterwards

+    movlps          [p_dst + 4 * i_dststride - 8], xmm6     ; byte 7 of the qword -> row +4

+    add             p_dst, i_dststride

+    movdqa          xmm6, xmm0

+    pslld           xmm6, 16                                ; byte 1 of each dword moved to byte 3

+    movd            [p_dst - 4], xmm6                       ; row +1

+    movlps          [p_dst + 4 * i_dststride - 8], xmm6     ; row +5

+    add             p_dst, i_dststride

+    movdqa          xmm6, xmm0

+    pslld           xmm6, 8                                 ; byte 2 of each dword moved to byte 3

+    movd            [p_dst - 4], xmm6                       ; row +2

+    movd            [p_dst + i_dststride - 4], xmm0         ; row +3 (byte 3 is already in place)

+    lea             p_dst, [p_dst + 4 * i_dststride]

+    movlps          [p_dst - 8], xmm6                       ; row +6

+    movlps          [p_dst + i_dststride - 8], xmm0         ; row +7

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    sub             i_height, 8                             ; eight rows were written

+    jle             .height_loop_unalign_exit

+    movd            xmm1, [p_src + 2 * i_srcstride]         ; from here to the jmp: rebuild xmm0 for the next eight rows

+    palignr         xmm1, xmm5, 2

+    movd            xmm0, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    punpcklwd       xmm0, [p_src]

+    palignr         xmm0, xmm1, 4

+    jmp             .height_loop_unalign

+.height_loop_unalign_exit:                                  ; one more output row after the last group of eight

+    movddup         xmm6, [p_src + 2 * i_srcstride - 6]

+    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7

+    packuswb        xmm1, xmm1

+    movlps          [p_dst - 8], xmm1                       ; the wanted result is byte 7, landing at p_dst - 1

+    jmp             .unalign_done

+.filter5_unalign:                                           ; iHeight == 5: five output rows from a single filter call

+    pslldq          xmm0, 8                                 ; rows 0..3 moved to the high qword so the palignr chain shifts them down

+    palignr         xmm1, xmm0, 2

+    movd            xmm2, [p_src + 2 * i_srcstride]

+    palignr         xmm2, xmm1, 2

+    movd            xmm3, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    palignr         xmm3, xmm2, 2

+    movd            xmm4, [p_src]

+    palignr         xmm4, xmm3, 2

+    movd            xmm5, [p_src + i_srcstride]

+    palignr         xmm5, xmm4, 2

+    movd            xmm6, [p_src + 2 * i_srcstride]

+    palignr         xmm6, xmm5, 2

+    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7

+    packuswb        xmm1, xmm1

+    movdqa          xmm0, xmm1

+    psrlq           xmm1,  8                                ; each shift brings the next row's byte into byte 3

+    movdqa          xmm2, xmm0

+    psrlq           xmm2, 16

+    movdqa          xmm3, xmm0

+    psrlq           xmm3, 24

+    movd            [p_dst - 4], xmm0                       ; row 0 (byte 3 lands at p_dst - 1)

+    movd            [p_dst + i_dststride - 4], xmm1         ; row 1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movd            [p_dst - 4], xmm2                       ; row 2

+    movd            [p_dst + i_dststride - 4], xmm3         ; row 3

+    movlps          [p_dst + 2 * i_dststride - 8], xmm0     ; row 4 (byte 7 of the qword)

+.unalign_done:

+    pop             p_dst

+    pop             p_src

+    mov             i_height, [r7]                          ; reload the iHeight - 1 pushed at entry (r7 presumably aliases the stack pointer)

+    sub             i_width, 1                              ; the remaining width is even

+.width_loop:                                                ; one pass per group of 8 columns

+    push            p_src

+    push            p_dst

+    movdqa          xmm0, [p_src]

+    movdqa          xmm1, [p_src + i_srcstride]

+    movdqa          xmm2, [p_src + 2 * i_srcstride]

+    movdqa          xmm3, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    movdqa          xmm4, [p_src]

+.height_loop:                                               ; up to eight output rows per iteration, stored two rows at a time

+    movdqa          xmm5, [p_src + i_srcstride]

+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6

+    movdqa          xmm6, [p_src + 2 * i_srcstride]

+    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7

+    movdqa          xmm7, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    packuswb        xmm0, xmm1                              ; low qword: one output row, high qword: the next

+    movlps          [p_dst], xmm0

+    movhps          [p_dst + i_dststride], xmm0

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0

+    movdqa          xmm0, [p_src]

+    SSE2_FilterVerticalw_8px xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1

+    packuswb        xmm2, xmm3

+    movlps          [p_dst], xmm2

+    movhps          [p_dst + i_dststride], xmm2

+    cmp             i_height, 4

+    jl              .x_loop_dec                             ; the four rows just stored were the last ones of this column group

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movdqa          xmm1, [p_src + i_srcstride]

+    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2

+    je              .store_xmm4_exit                        ; still testing the cmp above; relies on nothing in between touching the flags

+    movdqa          xmm2, [p_src + 2 * i_srcstride]

+    SSE2_FilterVerticalw_8px xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3

+    movdqa          xmm3, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    packuswb        xmm4, xmm5

+    movlps          [p_dst], xmm4

+    movhps          [p_dst + i_dststride], xmm4

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4

+    movdqa          xmm4, [p_src]

+    SSE2_FilterVerticalw_8px xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5

+    packuswb        xmm6, xmm7

+    movlps          [p_dst], xmm6

+    movhps          [p_dst + i_dststride], xmm6

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    sub             i_height, 8                             ; eight rows were written

+    jg              .height_loop

+    jl              .x_loop_dec                             ; counter went negative: no trailing row

+    movdqa          xmm5, [p_src + i_srcstride]             ; counter hit exactly zero: one trailing row

+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6

+    packuswb        xmm0, xmm0

+    movlps          [p_dst], xmm0

+.x_loop_dec:

+    pop             p_dst                                   ; back to the top row of this column group

+    pop             p_src

+    sub             i_width, 8

+    jle             .done                                   ; no columns left

+    mov             i_height, [r7]                          ; fresh row counter for the next column group

+    add             p_src, 16                               ; next 8 int16 columns

+    add             p_dst, 8                                ; next 8 output bytes

+    jmp             .width_loop

+.store_xmm4_exit:                                           ; fifth row, then leaves without looping over further column groups

+    packuswb        xmm4, xmm4

+    movlps          [p_dst], xmm4

+    pop             p_dst

+    pop             p_src

+.done:

+    pop             i_height                                ; drops the row count pushed at entry

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop             r6                                      ; matches the push at entry

+%endif

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_width

+%undef i_height

+%undef i_srcstride3

--- a/test/encoder/EncUT_MotionCompensation.cpp

+++ b/test/encoder/EncUT_MotionCompensation.cpp

@@ -168,8 +168,8 @@

 DEF_MCCOPYTEST (8, 16)

 DEF_MCCOPYTEST (16, 16)

-#define DEF_LUMA_MCTEST(iW,iH) \

-TEST(McHorVer,iW##x##iH)  \

+#define DEF_LUMA_MCTEST(iW, iH, cpu_flags, name_suffix) \

+TEST(McHorVer, iW##x##iH##_##name_suffix) \

 {                       \

     for (int32_t a = 0; a < 4; a++) { \

     for (int32_t b = 0; b < 4; b++) { \

@@ -191,43 +191,38 @@

         uSrcAnchor[0][j][i] = uSrcTest[j][i] = rand()%256;  \

}\

}\

-    int32_t iCpuCores = 1; \

-    uint32_t uiCpuFlag;\

-    for(int32_t k =0; k<2; k++)\

-    {\

-      if(k==0)\

-      {\

-        uiCpuFlag = 0;\

-      }else \

-      {\

-        uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores); \

-      }\

-      InitMcFunc(&sMcFunc,uiCpuFlag);\

-      memset(uDstAnchor,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \

-      memset(uDstTest,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \

-      MCHalfPelFilterAnchor(uSrcInputAnchor[1],uSrcInputAnchor[2],uSrcInputAnchor[3],uSrcInputAnchor[0],MC_BUFF_SRC_STRIDE,iW+1,iH+1,pBuf+4); \

-      MCLumaAnchor(uDstAnchor[0],MC_BUFF_DST_STRIDE,uSrcInputAnchor,MC_BUFF_SRC_STRIDE,a,b,iW,iH); \

-      sMcFunc.pMcLumaFunc(&uSrcTest[4][4],MC_BUFF_SRC_STRIDE,uDstTest[0],MC_BUFF_DST_STRIDE,a,b,iW,iH);\

-      for(int32_t j=0;j<MC_BUFF_HEIGHT;j++)   \

-      {                                                                             \

-          for(int32_t i=0;i<MC_BUFF_DST_STRIDE;i++)                                  \

-          {                                                                           \

-              ASSERT_EQ(uDstAnchor[j][i],uDstTest[j][i]);                              \

-          }                                                                             \

-      }                                                                                \

-    }\

+    InitMcFunc(&sMcFunc, WelsCPUFeatureDetect (0) & (cpu_flags)); \

+    memset(uDstAnchor,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \

+    memset(uDstTest,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \

+    MCHalfPelFilterAnchor(uSrcInputAnchor[1],uSrcInputAnchor[2],uSrcInputAnchor[3],uSrcInputAnchor[0],MC_BUFF_SRC_STRIDE,iW+1,iH+1,pBuf+4); \

+    MCLumaAnchor(uDstAnchor[0],MC_BUFF_DST_STRIDE,uSrcInputAnchor,MC_BUFF_SRC_STRIDE,a,b,iW,iH); \

+    sMcFunc.pMcLumaFunc(&uSrcTest[4][4],MC_BUFF_SRC_STRIDE,uDstTest[0],MC_BUFF_DST_STRIDE,a,b,iW,iH);\

+    for(int32_t j=0;j<MC_BUFF_HEIGHT;j++)   \

+    {                                                                             \

+        for(int32_t i=0;i<MC_BUFF_DST_STRIDE;i++)                                  \

+        {                                                                           \

+            ASSERT_EQ(uDstAnchor[j][i],uDstTest[j][i]);                              \

+        }                                                                             \

+    }                                                                                \

}\

}\

+#define DEF_LUMA_MCTESTS(cpu_flags, name_suffix) /* Expands DEF_LUMA_MCTEST once per block size listed below, forwarding the same cpu_flags mask and test-name suffix. */ \

+    DEF_LUMA_MCTEST ( 4,  4, cpu_flags, name_suffix) \

+    DEF_LUMA_MCTEST ( 4,  8, cpu_flags, name_suffix) \

+    DEF_LUMA_MCTEST ( 8,  4, cpu_flags, name_suffix) \

+    DEF_LUMA_MCTEST ( 8,  8, cpu_flags, name_suffix) \

+    DEF_LUMA_MCTEST (16,  8, cpu_flags, name_suffix) \

+    DEF_LUMA_MCTEST ( 8, 16, cpu_flags, name_suffix) \

+    DEF_LUMA_MCTEST (16, 16, cpu_flags, name_suffix)

-DEF_LUMA_MCTEST (4, 4)

-DEF_LUMA_MCTEST (4, 8)

-DEF_LUMA_MCTEST (8, 4)

-DEF_LUMA_MCTEST (8, 8)

-DEF_LUMA_MCTEST (16, 8)

-DEF_LUMA_MCTEST (8, 16)

-DEF_LUMA_MCTEST (16, 16)

+DEF_LUMA_MCTESTS(0, c) // mask 0: InitMcFunc receives no CPU feature flags

+DEF_LUMA_MCTESTS(~0, native) // mask ~0: every feature WelsCPUFeatureDetect reports is passed through

+#ifdef X86_ASM

+DEF_LUMA_MCTESTS(WELS_CPU_SSE2, sse2) // at most the SSE2 flag reaches InitMcFunc, and only if detected

+DEF_LUMA_MCTESTS(WELS_CPU_SSE2 | WELS_CPU_SSSE3, ssse3) // at most SSE2 and SSSE3, each only if detected

+#endif

 #define DEF_CHROMA_MCTEST(iW,iH) \

 TEST(McChroma,iW##x##iH)  \

@@ -315,81 +310,86 @@

-#define DEF_HALFPEL_MCTEST(iW,iH) \

-TEST (EncMcHalfpel, iW##x##iH) { \

+#define DEF_HALFPEL_MCTEST(iW, iH, cpu_flags, name_suffix) /* Defines one gtest case that checks sMcFunc.pfLumaHalfpelHor/Ver/Cen against the output of MCHalfPelFilterAnchor. */ \

+TEST (EncMcHalfpel, iW##x##iH##_##name_suffix) { /* test name is pasted together as <iW>x<iH>_<name_suffix> */ \

     SMcFunc sMcFunc; \

-    for (int32_t k = 0; k < 2; k++) { \

-        for (int32_t w = 0; w < 2; w++) { \

-            int32_t width = iW ; \

-            int32_t height = iH; \

-            uint8_t uAnchor[4][MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \

-            uint8_t uSrcTest[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \

-            uint8_t uRand[MC_BUFF_HEIGHT][MC_BUFF_DST_STRIDE]; \

-            ENFORCE_STACK_ALIGN_2D (uint8_t, uDstTest, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \

-            uint8_t* uAnchors[4]; \

-            int16_t pBuf[MC_BUFF_DST_STRIDE]; \

-            uAnchors[0] = &uAnchor[0][4][4]; \

-            uAnchors[1] = &uAnchor[1][4][4]; \

-            uAnchors[2] = &uAnchor[2][4][4]; \

-            uAnchors[3] = &uAnchor[3][4][4]; \

-             \

-            memset (uAnchor, 0, 4 * sizeof (uint8_t)*MC_BUFF_HEIGHT * MC_BUFF_SRC_STRIDE); \

-            memset (uDstTest, 0, sizeof (uint8_t)*MC_BUFF_HEIGHT * MC_BUFF_DST_STRIDE); \

-            for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \

-                for (int32_t i = 0; i < MC_BUFF_SRC_STRIDE; i++) { \

-                    uAnchor[0][j][i] = uSrcTest[j][i] = rand() % 256; \

-                    uRand[j][i] = rand() % 256; \

-                } \

+    for (int32_t w = 0; w < 2; w++) { /* two passes; w is only a counter, each pass draws fresh random input */ \

+        int32_t width = iW ; \

+        int32_t height = iH; \

+        uint8_t uAnchor[4][MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \

+        uint8_t uSrcTest[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \

+        uint8_t uRand[MC_BUFF_HEIGHT][MC_BUFF_DST_STRIDE]; \

+        ENFORCE_STACK_ALIGN_2D (uint8_t, uDstTest, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \

+        uint8_t* uAnchors[4]; \

+        int16_t pBuf[MC_BUFF_DST_STRIDE]; \

+        uAnchors[0] = &uAnchor[0][4][4]; /* each uAnchors[n] points at row 4, column 4 of plane n, matching the &uSrcTest[4][4] origin used for the functions under test */ \

+        uAnchors[1] = &uAnchor[1][4][4]; \

+        uAnchors[2] = &uAnchor[2][4][4]; \

+        uAnchors[3] = &uAnchor[3][4][4]; \

+         \

+        memset (uAnchor, 0, 4 * sizeof (uint8_t)*MC_BUFF_HEIGHT * MC_BUFF_SRC_STRIDE); \

+        memset (uDstTest, 0, sizeof (uint8_t)*MC_BUFF_HEIGHT * MC_BUFF_DST_STRIDE); \

+        for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \

+            for (int32_t i = 0; i < MC_BUFF_SRC_STRIDE; i++) { \

+                uAnchor[0][j][i] = uSrcTest[j][i] = rand() % 256; /* anchor plane 0 and the test source get identical random bytes */ \

+                uRand[j][i] = rand() % 256; /* NOTE(review): uRand is declared with MC_BUFF_DST_STRIDE columns but i is bounded by MC_BUFF_SRC_STRIDE - in bounds only if DST stride >= SRC stride; confirm */ \

} \

-             \

-            uint32_t uiCpuFlag = k == 0 ? 0 : WelsCPUFeatureDetect (NULL); \

-            InitMcFunc (&sMcFunc, uiCpuFlag); \

-             \

-            MCHalfPelFilterAnchor (uAnchors[1], uAnchors[2], uAnchors[3], uAnchors[0], MC_BUFF_SRC_STRIDE, width + 1, height + 1, pBuf + 4); \

-            memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); \

-            sMcFunc.pfLumaHalfpelHor (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height); \

-            for (int32_t j = 0; j < height; j++) { \

-                for (int32_t i = 0; i < width + 1; i++) { \

-                    ASSERT_EQ (uAnchor[1][4 + j][4 + i], uDstTest[j][i]); \

-                } \

+        } \

+         \

+        InitMcFunc (&sMcFunc, WelsCPUFeatureDetect (0) & (cpu_flags)); /* InitMcFunc receives only the detected CPU feature bits that are also set in cpu_flags */ \

+         \

+        MCHalfPelFilterAnchor (uAnchors[1], uAnchors[2], uAnchors[3], uAnchors[0], MC_BUFF_SRC_STRIDE, width + 1, height + 1, pBuf + 4); /* presumably fills planes 1..3 with reference results computed from plane 0 - helper is defined outside this hunk, confirm */ \

+        memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); /* seed the destination with random bytes so writes outside the block are caught below */ \

+        sMcFunc.pfLumaHalfpelHor (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height); \

+        for (int32_t j = 0; j < height; j++) { \

+            for (int32_t i = 0; i < width + 1; i++) { \

+                ASSERT_EQ (uAnchor[1][4 + j][4 + i], uDstTest[j][i]); /* pfLumaHalfpelHor output vs. anchor plane 1 */ \

} \

-            for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \

-                for (int32_t i = j < height ? width + 1 : 0; i < MC_BUFF_DST_STRIDE; i++) { \

-                    ASSERT_EQ (uRand[j][i], uDstTest[j][i]); \

-                } \

+        } \

+        for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \

+            for (int32_t i = j < height ? width + 1 : 0; i < MC_BUFF_DST_STRIDE; i++) { /* skip the (width + 1) x height region just checked; everything else must still equal uRand */ \

+                ASSERT_EQ (uRand[j][i], uDstTest[j][i]); \

} \

-            memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); \

-            sMcFunc.pfLumaHalfpelVer (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width, height + 1); \

-            for (int32_t j = 0; j < height + 1; j++) { \

-                for (int32_t i = 0; i < width; i++) { \

-                    ASSERT_EQ (uAnchor[2][4 + j][4 + i], uDstTest[j][i]); \

-                } \

+        } \

+        memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); /* re-seed the destination before the next function under test */ \

+        sMcFunc.pfLumaHalfpelVer (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width, height + 1); \

+        for (int32_t j = 0; j < height + 1; j++) { \

+            for (int32_t i = 0; i < width; i++) { \

+                ASSERT_EQ (uAnchor[2][4 + j][4 + i], uDstTest[j][i]); /* pfLumaHalfpelVer output vs. anchor plane 2 */ \

} \

-            for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \

-                for (int32_t i = j < height + 1 ? width : 0; i < MC_BUFF_DST_STRIDE; i++) { \

-                    ASSERT_EQ (uRand[j][i], uDstTest[j][i]); \

-                } \

+        } \

+        for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \

+            for (int32_t i = j < height + 1 ? width : 0; i < MC_BUFF_DST_STRIDE; i++) { /* outside the width x (height + 1) region the destination must be untouched */ \

+                ASSERT_EQ (uRand[j][i], uDstTest[j][i]); \

} \

-            memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); \

-            sMcFunc.pfLumaHalfpelCen (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height + 1); \

-            for (int32_t j = 0; j < height + 1; j++) { \

-                for (int32_t i = 0; i < width + 1; i++) { \

-                    ASSERT_EQ (uAnchor[3][4 + j][4 + i], uDstTest[j][i]); \

-                } \

+        } \

+        memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); /* re-seed the destination before the next function under test */ \

+        sMcFunc.pfLumaHalfpelCen (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height + 1); \

+        for (int32_t j = 0; j < height + 1; j++) { \

+            for (int32_t i = 0; i < width + 1; i++) { \

+                ASSERT_EQ (uAnchor[3][4 + j][4 + i], uDstTest[j][i]); /* pfLumaHalfpelCen output vs. anchor plane 3 */ \

} \

-            for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \

-                for (int32_t i = j < height + 1 ? width + 1 : 0; i < MC_BUFF_DST_STRIDE; i++) { \

-                    ASSERT_EQ (uRand[j][i], uDstTest[j][i]); \

-                } \

+        } \

+        for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \

+            for (int32_t i = j < height + 1 ? width + 1 : 0; i < MC_BUFF_DST_STRIDE; i++) { /* outside the (width + 1) x (height + 1) region the destination must be untouched */ \

+                ASSERT_EQ (uRand[j][i], uDstTest[j][i]); \

} \

} \

} \

-DEF_HALFPEL_MCTEST(4,4)

-DEF_HALFPEL_MCTEST(4,8)

-DEF_HALFPEL_MCTEST(8,4)

-DEF_HALFPEL_MCTEST(8,8)

-DEF_HALFPEL_MCTEST(8,16)

-DEF_HALFPEL_MCTEST(16,8)

-DEF_HALFPEL_MCTEST(16,16)

+#define DEF_HALFPEL_MCTESTS(cpu_flags, name_suffix) /* Expands DEF_HALFPEL_MCTEST once per iW, iH pair listed below, forwarding cpu_flags and name_suffix unchanged. */ \

+    DEF_HALFPEL_MCTEST( 4 , 4, cpu_flags, name_suffix) \

+    DEF_HALFPEL_MCTEST( 4,  8, cpu_flags, name_suffix) \

+    DEF_HALFPEL_MCTEST( 8,  4, cpu_flags, name_suffix) \

+    DEF_HALFPEL_MCTEST( 8,  8, cpu_flags, name_suffix) \

+    DEF_HALFPEL_MCTEST( 8, 16, cpu_flags, name_suffix) \

+    DEF_HALFPEL_MCTEST(16,  8, cpu_flags, name_suffix) \

+    DEF_HALFPEL_MCTEST(16, 16, cpu_flags, name_suffix) /* last entry: no trailing backslash, the macro ends here */

+DEF_HALFPEL_MCTESTS(0, c) /* mask 0: detected-features & 0 leaves no CPU feature bits for InitMcFunc */

+DEF_HALFPEL_MCTESTS(~0, native) /* all-ones mask: every detected CPU feature bit is kept */

+#ifdef X86_ASM /* the feature-specific variants below are only compiled when X86_ASM is defined */

+DEF_HALFPEL_MCTESTS(WELS_CPU_SSE2, sse2) /* keeps only the SSE2 bit, and only if it was detected */

+DEF_HALFPEL_MCTESTS(WELS_CPU_SSE2 | WELS_CPU_SSSE3, ssse3) /* keeps only the SSE2 and SSSE3 bits, and only those detected */

+#endif /* X86_ASM */