shithub: openh264

--- a/codec/common/inc/mc.h

+++ b/codec/common/inc/mc.h

@@ -252,8 +252,6 @@

                              int32_t iHeight);

 void McChromaWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

                            const uint8_t* kpABCD, int32_t iHeight);

-void McCopyWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

-                         int32_t iHeight);

 void McCopyWidthEq8_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

                          int32_t iHeight);

 void PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,

@@ -307,11 +305,54 @@

         int32_t iWidth, int32_t iHeight);

 //***************************************************************************//

-//                       SSSE3 definition                                    //

+//                       SSE3 definition                                     //

 //***************************************************************************//

+void McCopyWidthEq16_sse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                           int32_t iHeight); // 16-byte-wide strided copy; implemented in codec/common/x86/mb_copy.asm, which copies rows in groups of four

+//***************************************************************************//

+//                       SSSE3 definition                                    //

+//***************************************************************************//

 void McChromaWidthEq8_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

                              const uint8_t* kpABCD, int32_t iHeight);

+void McHorVer02_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight); // installed as pfLumaHalfpelVer by InitMcFunc when WELS_CPU_SSSE3 is set

+void McHorVer02Width4S16ToU8_ssse3 (const int16_t* pSrc, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); // int16_t in, uint8_t out; takes no source stride

+void McHorVer02Width5S16ToU8_ssse3 (const int16_t* pSrc, int32_t iSrcStride,

+                                    uint8_t* pDst, int32_t iDstStride, int32_t iHeight); // int16_t in, uint8_t out; mc.cpp passes sizeof of one scratch row as iSrcStride

+void McHorVer02WidthGe8S16ToU8_ssse3 (const int16_t* pSrc, int32_t iSrcStride,

+                                      uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight); // mc.cpp calls this with iWidth of 8, 9 or 17

+void McHorVer20_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight); // [2][0] entry of the McLuma_ssse3 dispatch table

+void McHorVer20Width4U8ToS16_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, int16_t* pDst, int32_t iHeight); // uint8_t in, int16_t out; takes no destination stride

+void McHorVer20Width5Or9Or17_ssse3 (const uint8_t* pSrc, int32_t iSrcStride,

+                                    uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight); // installed as pfLumaHalfpelHor by InitMcFunc when WELS_CPU_SSSE3 is set

+void McHorVer20Width8U8ToS16_ssse3 (const uint8_t* pSrc, int32_t iSrcStride,

+                                    int16_t* pDst, int32_t iDstStride, int32_t iHeight); // uint8_t in, int16_t out; mc.cpp passes sizeof of one scratch row as iDstStride

+void McHorVer20Width9Or17U8ToS16_ssse3 (const uint8_t* pSrc, int32_t iSrcStride,

+                                        int16_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight); // first pass of McHorVer22Width5Or9Or17_ssse3 when iWidth > 5

+//***************************************************************************//

+//                       AVX2 definition                                     //

+//***************************************************************************//

+#ifdef HAVE_AVX2

+void McHorVer02_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                      int32_t iWidth, int32_t iHeight); // installed as pfLumaHalfpelVer by InitMcFunc when WELS_CPU_AVX2 is set

+void McHorVer02Width4S16ToU8_avx2 (const int16_t* pSrc, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); // int16_t in, uint8_t out; takes no source stride

+void McHorVer02Width5S16ToU8_avx2 (const int16_t* pSrc, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); // unlike the SSSE3 Width5 variant, takes no source stride

+void McHorVer02Width8S16ToU8_avx2 (const int16_t* pSrc, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); // int16_t in, uint8_t out; takes no source stride

+void McHorVer02Width9S16ToU8_avx2 (const int16_t* pSrc, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); // int16_t in, uint8_t out; takes no source stride

+void McHorVer02Width16Or17S16ToU8_avx2 (const int16_t* pSrc, int32_t iSrcStride,

+                                        uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight); // the only AVX2 S16ToU8 helper that takes a source stride and a width

+void McHorVer20_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                      int32_t iWidth, int32_t iHeight); // [2][0] entry of the McLuma_avx2 dispatch table

+void McHorVer20Width5Or9Or17_avx2 (const uint8_t* pSrc, int32_t iSrcStride,

+                                   uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight); // installed as pfLumaHalfpelHor by InitMcFunc when WELS_CPU_AVX2 is set

+void McHorVer20Width4U8ToS16_avx2 (const uint8_t* pSrc, int32_t iSrcStride, int16_t* pDst, int32_t iHeight); // uint8_t in, int16_t out; takes no destination stride

+void McHorVer20Width8U8ToS16_avx2 (const uint8_t* pSrc, int32_t iSrcStride, int16_t* pDst, int32_t iHeight); // unlike the SSSE3 Width8 variant, takes no destination stride

+void McHorVer20Width16U8ToS16_avx2 (const uint8_t* pSrc, int32_t iSrcStride, int16_t* pDst, int32_t iHeight); // uint8_t in, int16_t out; takes no destination stride

+void McHorVer20Width17U8ToS16_avx2 (const uint8_t* pSrc, int32_t iSrcStride, int16_t* pDst, int32_t iHeight); // uint8_t in, int16_t out; takes no destination stride

+#endif //HAVE_AVX2

 #endif //X86_ASM

--- a/codec/common/src/mc.cpp

+++ b/codec/common/src/mc.cpp

@@ -44,6 +44,8 @@

 #include "ls_defines.h"

 #include "macros.h"

+namespace {

 typedef void (*PMcChromaWidthExtFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

                                        const uint8_t* kpABCD, int32_t iHeight);

 typedef void (*PWelsSampleWidthAveragingFunc) (uint8_t*, int32_t, const uint8_t*, int32_t, const uint8_t*,

@@ -51,8 +53,6 @@

 typedef void (*PWelsMcWidthHeightFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

                                         int32_t iWidth, int32_t iHeight);

-namespace WelsCommon {

 /*------------------weight for chroma fraction pixel interpolation------------------*/

 //iA = (8 - dx) * (8 - dy);

 //iB = dx * (8 - dy);

@@ -442,7 +442,7 @@

   else if (iWidth == 8)

     McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);

   else if (iWidth == 4)

-    McCopyWidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);

+    McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);

   else

     McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);

@@ -710,6 +710,183 @@

     McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);

+//***************************************************************************//

+//                          SSSE3 implementation                             //

+//***************************************************************************//

+void PixelAvgWidth4Or8Or16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,

+                                 const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) { // picks one of the fixed-width PixelAvg kernels from iWidth

+  if (iWidth < 8) { // every width below 8 goes to the 4-wide MMX kernel

+    PixelAvgWidthEq4_mmx   (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);

+  } else if (iWidth == 8) {

+    PixelAvgWidthEq8_mmx   (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);

+  } else { // every width above 8 goes to the 16-wide SSE2 kernel

+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);

+  }

+}

+void McCopy_sse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                  int32_t iWidth, int32_t iHeight) { // [0][0] entry of both McLuma tables: plain block copy chosen by width

+  switch (iWidth) { // no default label: unmatched widths fall out to the 2-wide copy below

+  case 16: return McCopyWidthEq16_sse3 (pSrc, iSrcStride, pDst, iDstStride, iHeight);

+  case 8:  return McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);

+  case 4:  return McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); // C version; this patch deletes McCopyWidthEq4_mmx

+  }

+  return McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); // reached for any width other than 16, 8 or 4

+}

+void McHorVer22_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // two passes: a U8ToS16 pass into pTmp, then an S16ToU8 pass into pDst

+  ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 16 + 5, 8, 16); // int16_t scratch shared by all branches; the macro is defined outside this file section

+  if (iWidth < 8) { // 4-wide path: these helpers take no stride for the scratch buffer

+    McHorVer20Width4U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5); // first pass runs over 5 more rows than the output height

+    McHorVer02Width4S16ToU8_ssse3 (&pTmp[0][0], pDst, iDstStride, iHeight);

+  } else if (iWidth == 8) {

+    McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5); // sizeof *pTmp (one scratch row) is the stride argument

+    McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);

+  } else { // wider blocks are done as two 8-wide halves that reuse the same scratch

+    McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);

+    McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, 8, iHeight); // left half; must finish before pTmp is overwritten

+    McHorVer20Width8U8ToS16_ssse3 (pSrc + 8, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);

+    McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst + 8, iDstStride, 8, iHeight); // right half, 8 columns further in source and destination

+  }

+}

+void McHorVer01_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // [0][1] entry of the McLuma_ssse3 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // scratch plane; sizeof *pTmp is passed below as its stride

+  McHorVer02_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // vertical half-pel plane into pTmp

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride, // PixelAvg of the unfiltered source and the half-pel plane

+                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);

+}

+void McHorVer03_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // [0][3] entry of the McLuma_ssse3 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // scratch plane; sizeof *pTmp is passed below as its stride

+  McHorVer02_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // vertical half-pel plane into pTmp

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, // same as McHorVer01_ssse3 but the unfiltered source starts one row down

+                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);

+}

+void McHorVer10_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // [1][0] entry of the McLuma_ssse3 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // scratch plane; sizeof *pTmp is passed below as its stride

+  McHorVer20_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // horizontal half-pel plane into pTmp

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride, // PixelAvg of the unfiltered source and the half-pel plane

+                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);

+}

+void McHorVer11_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // [1][1] entry of the McLuma_ssse3 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // scratch for the horizontal half-pel plane

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // scratch for the vertical half-pel plane

+  McHorVer20_ssse3 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);

+  McHorVer02_ssse3 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp, // output is the PixelAvg of the two half-pel planes

+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+}

+void McHorVer12_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // [1][2] entry of the McLuma_ssse3 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // scratch for the vertical half-pel plane

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // scratch for the center (McHorVer22) plane

+  McHorVer02_ssse3 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+  McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp, // output is the PixelAvg of the vertical and center planes

+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+}

+void McHorVer13_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // [1][3] entry of the McLuma_ssse3 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // scratch for the horizontal half-pel plane

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // scratch for the vertical half-pel plane

+  McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // horizontal pass starts one row down

+  McHorVer02_ssse3 (pSrc,              iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp, // output is the PixelAvg of the two half-pel planes

+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+}

+void McHorVer21_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // [2][1] entry of the McLuma_ssse3 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // scratch for the horizontal half-pel plane

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // scratch for the center (McHorVer22) plane

+  McHorVer20_ssse3 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);

+  McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp, // output is the PixelAvg of the horizontal and center planes

+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+}

+void McHorVer23_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // [2][3] entry of the McLuma_ssse3 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // scratch for the horizontal half-pel plane

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // scratch for the center (McHorVer22) plane

+  McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // horizontal pass starts one row down

+  McHorVer22_ssse3 (pSrc,              iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp, // output is the PixelAvg of the horizontal and center planes

+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+}

+void McHorVer30_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // [3][0] entry of the McLuma_ssse3 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // scratch plane; sizeof *pTmp is passed below as its stride

+  McHorVer20_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // horizontal half-pel plane into pTmp

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // same as McHorVer10_ssse3 but the unfiltered source starts one pixel to the right

+}

+void McHorVer31_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // [3][1] entry of the McLuma_ssse3 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // scratch for the horizontal half-pel plane

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // scratch for the vertical half-pel plane

+  McHorVer20_ssse3 (pSrc,     iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);

+  McHorVer02_ssse3 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // vertical pass starts one pixel to the right

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp, // output is the PixelAvg of the two half-pel planes

+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+}

+void McHorVer32_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // [3][2] entry of the McLuma_ssse3 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // scratch for the vertical half-pel plane

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // scratch for the center (McHorVer22) plane

+  McHorVer02_ssse3 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // vertical pass starts one pixel to the right

+  McHorVer22_ssse3 (pSrc,     iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp, // output is the PixelAvg of the vertical and center planes

+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+}

+void McHorVer33_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                       int32_t iWidth, int32_t iHeight) { // [3][3] entry of the McLuma_ssse3 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // scratch for the horizontal half-pel plane

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // scratch for the vertical half-pel plane

+  McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // horizontal pass starts one row down

+  McHorVer02_ssse3 (pSrc + 1,          iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // vertical pass starts one pixel to the right

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp, // output is the PixelAvg of the two half-pel planes

+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+}

+void McHorVer22Width5Or9Or17_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                                    int32_t iWidth, int32_t iHeight) { // installed as pfLumaHalfpelCen by InitMcFunc when WELS_CPU_SSSE3 is set

+  ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, WELS_ALIGN(17, 16 / sizeof (int16_t)), 16) // int16_t scratch sized from the widest case (17) plus 5 extra rows; no trailing semicolon here, unlike the other uses in this file

+  if (iWidth > 5) { // widths 9 and 17 share one pair of helpers that take iWidth

+    McHorVer20Width9Or17U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight + 5); // first pass runs over 5 more rows than the output height

+    McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);

+  } else { // width 5: the first pass uses the 8-wide helper, the second pass the 5-wide one

+    McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);

+    McHorVer02Width5S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iHeight);

+  }

+}

+void McLuma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { // installed as pMcLumaFunc by InitMcFunc when WELS_CPU_SSSE3 is set

+  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { // entry [x][y] is McHorVer<x><y>; [0][0] is the plain copy

+    {McCopy_sse3,      McHorVer01_ssse3, McHorVer02_ssse3, McHorVer03_ssse3},

+    {McHorVer10_ssse3, McHorVer11_ssse3, McHorVer12_ssse3, McHorVer13_ssse3},

+    {McHorVer20_ssse3, McHorVer21_ssse3, McHorVer22_ssse3, McHorVer23_ssse3},

+    {McHorVer30_ssse3, McHorVer31_ssse3, McHorVer32_ssse3, McHorVer33_ssse3},

+  };

+  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); // the low two bits of each motion-vector component select the entry

+}

 void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

                      int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {

   static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {

@@ -728,6 +905,169 @@

     McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);

+//***************************************************************************//

+//                          AVX2 implementation                              //

+//***************************************************************************//

+#ifdef HAVE_AVX2

+void McHorVer22_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                      int32_t iWidth, int32_t iHeight) { // two passes: a U8ToS16 pass into pTmp, then an S16ToU8 pass into pDst

+  ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 16 + 5, 16, 32); // int16_t scratch shared by all branches; last macro argument is 32 here versus 16 in McHorVer22_ssse3

+  if (iWidth < 8) {

+    McHorVer20Width4U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5); // first pass runs over 5 more rows than the output height

+    McHorVer02Width4S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);

+  } else if (iWidth == 8) {

+    McHorVer20Width8U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5); // no scratch stride is passed to the AVX2 4- and 8-wide helpers

+    McHorVer02Width8S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);

+  } else { // wider blocks are done in one 16-wide pass, not as two 8-wide halves like the SSSE3 version

+    McHorVer20Width16U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);

+    McHorVer02Width16Or17S16ToU8_avx2 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight); // only this helper receives the scratch row size as a stride

+  }

+}

+void McHorVer01_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                      int32_t iWidth, int32_t iHeight) { // [0][1] entry of the McLuma_avx2 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // scratch plane; sizeof *pTmp is passed below as its stride

+  McHorVer02_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // vertical half-pel plane into pTmp

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride, // PixelAvg of the unfiltered source and the half-pel plane; this step reuses the SSE2 helper

+                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);

+}

+void McHorVer03_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                      int32_t iWidth, int32_t iHeight) { // [0][3] entry of the McLuma_avx2 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // scratch plane; sizeof *pTmp is passed below as its stride

+  McHorVer02_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // vertical half-pel plane into pTmp

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, // same as McHorVer01_avx2 but the unfiltered source starts one row down

+                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);

+}

+void McHorVer10_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                      int32_t iWidth, int32_t iHeight) { // [1][0] entry of the McLuma_avx2 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // scratch plane; sizeof *pTmp is passed below as its stride

+  McHorVer20_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // horizontal half-pel plane into pTmp

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride, // PixelAvg of the unfiltered source and the half-pel plane; this step reuses the SSE2 helper

+                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);

+}

+void McHorVer11_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                      int32_t iWidth, int32_t iHeight) { // [1][1] entry of the McLuma_avx2 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // scratch for the horizontal half-pel plane

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // scratch for the vertical half-pel plane

+  McHorVer20_avx2 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);

+  McHorVer02_avx2 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp, // output is the PixelAvg of the two half-pel planes

+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+}

+void McHorVer12_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                      int32_t iWidth, int32_t iHeight) { // [1][2] entry of the McLuma_avx2 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // scratch for the vertical half-pel plane

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // scratch for the center (McHorVer22) plane

+  McHorVer02_avx2 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+  McHorVer22_avx2 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp, // output is the PixelAvg of the vertical and center planes

+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+}

+void McHorVer13_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                      int32_t iWidth, int32_t iHeight) { // [1][3] entry of the McLuma_avx2 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // scratch for the horizontal half-pel plane

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // scratch for the vertical half-pel plane

+  McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // horizontal pass starts one row down

+  McHorVer02_avx2 (pSrc,              iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp, // output is the PixelAvg of the two half-pel planes

+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+}

+void McHorVer21_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                      int32_t iWidth, int32_t iHeight) { // [2][1] entry of the McLuma_avx2 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // scratch for the horizontal half-pel plane

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // scratch for the center (McHorVer22) plane

+  McHorVer20_avx2 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);

+  McHorVer22_avx2 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp, // output is the PixelAvg of the horizontal and center planes

+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+}

+void McHorVer23_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                      int32_t iWidth, int32_t iHeight) { // [2][3] entry of the McLuma_avx2 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // scratch for the horizontal half-pel plane

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // scratch for the center (McHorVer22) plane

+  McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // horizontal pass starts one row down

+  McHorVer22_avx2 (pSrc,              iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp, // output is the PixelAvg of the horizontal and center planes

+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+}

+void McHorVer30_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                      int32_t iWidth, int32_t iHeight) { // [3][0] entry of the McLuma_avx2 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // scratch plane; sizeof *pTmp is passed below as its stride

+  McHorVer20_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // horizontal half-pel plane into pTmp

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // same as McHorVer10_avx2 but the unfiltered source starts one pixel to the right

+}

+void McHorVer31_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                      int32_t iWidth, int32_t iHeight) { // [3][1] entry of the McLuma_avx2 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // scratch for the horizontal half-pel plane

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // scratch for the vertical half-pel plane

+  McHorVer20_avx2 (pSrc,     iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);

+  McHorVer02_avx2 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // vertical pass starts one pixel to the right

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp, // output is the PixelAvg of the two half-pel planes

+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+}

+void McHorVer32_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                      int32_t iWidth, int32_t iHeight) { // [3][2] entry of the McLuma_avx2 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // scratch for the vertical half-pel plane

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // scratch for the center (McHorVer22) plane

+  McHorVer02_avx2 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // vertical pass starts one pixel to the right

+  McHorVer22_avx2 (pSrc,     iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp, // output is the PixelAvg of the vertical and center planes

+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);

+}

+void McHorVer33_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                      int32_t iWidth, int32_t iHeight) { // [3][3] entry of the McLuma_avx2 table

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // scratch for the horizontal half-pel plane

+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // scratch for the vertical half-pel plane

+  McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // horizontal pass starts one row down

+  McHorVer02_avx2 (pSrc + 1,          iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // vertical pass starts one pixel to the right

+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp, // output is the PixelAvg of the two half-pel planes

+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);

+}

+void McHorVer22Width5Or9Or17_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                                   int32_t iWidth, int32_t iHeight) { // installed as pfLumaHalfpelCen by InitMcFunc when WELS_CPU_AVX2 is set

+  if (iWidth < 9) { // width 5: the first pass uses the 8-wide helper, the second pass the 5-wide one

+    ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 9 + 5, WELS_ALIGN(5, 16 / sizeof (int16_t)), 16) // each branch declares its own scratch with its own size and alignment arguments

+    McHorVer20Width8U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5); // first pass runs over 5 more rows than the output height

+    McHorVer02Width5S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);

+  } else if (iWidth == 9) { // width 9: the first pass uses the 16-wide helper, the second pass the 9-wide one

+    ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, 16, 32)

+    McHorVer20Width16U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);

+    McHorVer02Width9S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);

+  } else { // remaining case (width 17 per the function name)

+    ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, WELS_ALIGN(17, 32 / sizeof (int16_t)), 32)

+    McHorVer20Width17U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);

+    McHorVer02Width16Or17S16ToU8_avx2 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight); // only this helper receives the scratch row size as a stride

+  }

+}

+void McLuma_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,

+                  int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { // installed as pMcLumaFunc by InitMcFunc when WELS_CPU_AVX2 is set

+  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { // entry [x][y] is McHorVer<x><y>; [0][0] reuses the SSE3 copy

+    {McCopy_sse3,     McHorVer01_avx2, McHorVer02_avx2, McHorVer03_avx2},

+    {McHorVer10_avx2, McHorVer11_avx2, McHorVer12_avx2, McHorVer13_avx2},

+    {McHorVer20_avx2, McHorVer21_avx2, McHorVer22_avx2, McHorVer23_avx2},

+    {McHorVer30_avx2, McHorVer31_avx2, McHorVer32_avx2, McHorVer33_avx2},

+  };

+  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); // the low two bits of each motion-vector component select the entry

+}

+#endif //HAVE_AVX2

 void PixelAvg_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,

                     const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {

   static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {

@@ -1319,7 +1659,9 @@

 #endif

-void InitMcFunc (SMcFunc* pMcFuncs, uint32_t uiCpuFlag) {

+} // anon ns.

+void WelsCommon::InitMcFunc (SMcFunc* pMcFuncs, uint32_t uiCpuFlag) {

   pMcFuncs->pfLumaHalfpelHor  = McHorVer20_c;

   pMcFuncs->pfLumaHalfpelVer  = McHorVer02_c;

   pMcFuncs->pfLumaHalfpelCen  = McHorVer22_c;

@@ -1338,8 +1680,20 @@

   if (uiCpuFlag & WELS_CPU_SSSE3) {

+    pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_ssse3;

+    pMcFuncs->pfLumaHalfpelVer  = McHorVer02_ssse3;

+    pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17_ssse3;

     pMcFuncs->pMcChromaFunc = McChroma_ssse3;

+    pMcFuncs->pMcLumaFunc   = McLuma_ssse3;

+#ifdef HAVE_AVX2

+  if (uiCpuFlag & WELS_CPU_AVX2) {

+    pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_avx2;

+    pMcFuncs->pfLumaHalfpelVer  = McHorVer02_avx2;

+    pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17_avx2;

+    pMcFuncs->pMcLumaFunc       = McLuma_avx2;

+  }

+#endif

 #endif //(X86_ASM)

 #if defined(HAVE_NEON)

@@ -1363,4 +1717,3 @@

 #endif

-} // namespace WelsCommon

--- a/codec/common/x86/mb_copy.asm

+++ b/codec/common/x86/mb_copy.asm

@@ -44,6 +44,10 @@

 ;*********************************************************************************************/

 %include "asm_inc.asm"

+%ifdef __NASM_VER__

+    %use smartalign

+%endif

 ;***********************************************************************

 ; Macros and other preprocessor constants

 ;***********************************************************************

@@ -502,39 +506,37 @@

     LOAD_7_PARA_POP

ret

-;*******************************************************************************

-;  void McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,

-;                          uint8_t *pDst, int iDstStride, int iHeight )

-;*******************************************************************************

-WELS_EXTERN McCopyWidthEq4_mmx

-    push    r5

-    %assign  push_num 1

-    LOAD_5_PARA

+; load_instr=%1 store_instr=%2 p_dst=%3 i_dststride=%4 p_src=%5 i_srcstride=%6 cnt=%7 r_tmp=%8,%9 mm_tmp=%10,%11

+%macro CopyStrided4N 11 ; strided row copy that moves four rows per loop iteration

+    lea             %8, [3 * %6] ; %8 = 3 * i_srcstride, the offset of the 4th source row

+    lea             %9, [3 * %4] ; %9 = 3 * i_dststride, the offset of the 4th destination row

+ALIGN 32

+%%loop:

+    %1              %10, [%5] ; rows 0 and 1: load both, then store both

+    %1              %11, [%5 + %6]

+    %2              [%3], %10

+    %2              [%3 + %4], %11

+    %1              %10, [%5 + 2 * %6] ; rows 2 and 3, reusing the same two temporaries

+    %1              %11, [%5 + %8]

+    %2              [%3 + 2 * %4], %10

+    %2              [%3 + %9], %11

+    lea             %5, [%5 + 4 * %6] ; advance source and destination by four rows

+    lea             %3, [%3 + 4 * %4]

+    sub             %7, 4

+    jg              %%loop ; loop while cnt > 0: the body always runs at least once, and a cnt that is not a multiple of 4 is effectively rounded up

+%endmacro

-    SIGN_EXTENSION  r1, r1d

-    SIGN_EXTENSION  r3, r3d

-    SIGN_EXTENSION  r4, r4d

-ALIGN 4

-.height_loop:

-    mov r5d, [r0]

-    mov [r2], r5d

-    add r0, r1

-    add r2, r3

-    dec r4

-    jnz .height_loop

-    WELSEMMS

-    LOAD_5_PARA_POP

-    pop    r5

-    ret

 ;*******************************************************************************

 ;   void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,

-;                           uint8_t *pDst, int iDstStride, int iHeight )

+;                            uint8_t *pDst, int iDstStride, int iHeight )

 ;*******************************************************************************

 WELS_EXTERN McCopyWidthEq8_mmx

     %assign  push_num 0

+%ifdef X86_32

+    push            r5

+    push            r6

+    %assign  push_num 2

+%endif

     LOAD_5_PARA

     SIGN_EXTENSION  r1, r1d

@@ -541,17 +543,14 @@

     SIGN_EXTENSION  r3, r3d

     SIGN_EXTENSION  r4, r4d

-ALIGN 4

-.height_loop:

-    movq mm0, [r0]

-    movq [r2], mm0

-    add r0, r1

-    add r2, r3

-    dec r4

-    jnz .height_loop

+    CopyStrided4N   movq, movq, r2, r3, r0, r1, r4, r5, r6, mm0, mm1

     WELSEMMS

     LOAD_5_PARA_POP

+%ifdef X86_32

+    pop             r6

+    pop             r5

+%endif

ret

@@ -588,4 +587,29 @@

     jnz     .height_loop

     LOAD_5_PARA_POP

+    ret

+;*******************************************************************************

+;   void McCopyWidthEq16_sse3( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )

+;*******************************************************************************

+WELS_EXTERN McCopyWidthEq16_sse3 ; copies iHeight rows through xmm registers using CopyStrided4N (four rows per iteration)

+    %assign push_num 0

+%ifdef X86_32

+    push            r5 ; r5 and r6 are only saved on X86_32; they serve as the macro's two scratch registers

+    push            r6

+    %assign push_num 2

+%endif

+    LOAD_5_PARA

+    SIGN_EXTENSION  r1, r1d ; iSrcStride

+    SIGN_EXTENSION  r3, r3d ; iDstStride

+    SIGN_EXTENSION  r4, r4d ; iHeight, used as the row counter

+    CopyStrided4N   lddqu, MOVDQ, r2, r3, r0, r1, r4, r5, r6, xmm0, xmm1 ; dst=r2/r3, src=r0/r1; loads use lddqu, stores use the MOVDQ macro (defined elsewhere)

+    LOAD_5_PARA_POP

+%ifdef X86_32

+    pop             r6 ; restore in reverse push order

+    pop             r5

+%endif

ret

--- a/codec/common/x86/mc_luma.asm

+++ b/codec/common/x86/mc_luma.asm

@@ -44,16 +44,61 @@

 ;*******************************************************************************

 ; Local Data (Read Only)

 ;*******************************************************************************

-SECTION .rodata align=16

+SECTION .rodata align=32

 ;*******************************************************************************

 ; Various memory constants (trigonometric values or rounding values)

 ;*******************************************************************************

+%ifdef HAVE_AVX2

+ALIGN 32    ; the *_256 tables below are 32 bytes each and are only assembled when HAVE_AVX2 is defined

+dwm32768_256:

+    times 16 dw -32768

+maddubsw_m2p10_m40m40_p10m2_p0p0_256:

+    times 4 db -2, 10, -40, -40, 10, -2, 0, 0

+dwm1024_256:

+    times 16 dw -1024

+dd32768_256:

+    times 8 dd 32768

+maddubsw_p1m5_256:

+    times 16 db 1, -5

+maddubsw_m5p1_256:

+    times 16 db -5, 1

+db20_256:

+    times 32 db 20

+maddubsw_m5p20_256:

+    times 16 db -5, 20

+maddubsw_p20m5_256:

+    times 16 db 20, -5

+h264_w0x10_256:

+    times 16 dw 16

+dw32_256:

+    times 16 dw 32

+%endif ; HAVE_AVX2

 ALIGN 16

-h264_w0x10:

-    dw 16, 16, 16, 16

-ALIGN 16

+shufb_32435465768798A9:

+    db 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9    ; pshufb control: byte pairs (3,2),(4,3)..(10,9); the horizontal macros multiply these pairs by db20_128

+shufb_011267784556ABBC:

+    db 0, 1, 1, 2, 6, 7, 7, 8, 4, 5, 5, 6, 0Ah, 0Bh, 0Bh, 0Ch    ; pshufb control: byte pairs later multiplied by the (1,-5)/(-5,1) pattern below

+maddubsw_p1m5_p1m5_m5p1_m5p1_128:

+    times 2 db 1, -5, 1, -5, -5, 1, -5, 1    ; pmaddubsw multipliers: two (1,-5) pairs then two (-5,1) pairs, repeated twice

+maddubsw_m2p10_m40m40_p10m2_p0p0_128:

+    times 2 db -2, 10, -40, -40, 10, -2, 0, 0    ; multipliers used by SSSE3_FilterHorizontalbw_2px

+dwm1024_128:

+    times 8 dw -1024    ; pmaddwd factor passed in by SSSE3_FilterHorizontal_2px

+dd32768_128:

+    times 4 dd 32768    ; dword constant added by SSSE3_FilterHorizontal_2px

+maddubsw_p1m5_128:

+    times 8 db 1, -5

+maddubsw_m5p1_128:

+    times 8 db -5, 1

+db20_128:

+    times 16 db 20    ; multiplier 20 in every byte

+maddubsw_m5p20_128:

+    times 8 db -5, 20

+maddubsw_p20m5_128:

+    times 8 db 20, -5

 h264_w0x10_1:

     dw 16, 16, 16, 16, 16, 16, 16, 16

 ALIGN 16

@@ -85,7 +130,7 @@

     sub r0, 2

     WELS_Zero mm7

-    movq mm6, [h264_w0x10]

+    movq mm6, [h264_w0x10_1]

 .height_loop:

     movd mm0, [r0]

     punpcklbw mm0, mm7

@@ -1746,3 +1791,2559 @@

 LOAD_6_PARA_POP

ret

+; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7 ; result (8 words) is left in %1

+%macro SSSE3_FilterVertical_8px 7

+    pmaddubsw       %1, %4    ; %1 is overwritten and accumulates the sum

+    movdqa          %7, %2    ; multiply a copy so %2 itself is left unchanged

+    pmaddubsw       %7, %5

+    paddw           %1, %7

+    movdqa          %7, %3    ; likewise %3 is left unchanged

+    pmaddubsw       %7, %6

+    paddw           %1, %7

+    paddw           %1, [h264_w0x10_1]    ; add 16 to every word before the shift

+    psraw           %1, 5    ; arithmetic shift right by 5; values stay 16-bit

+%endmacro

+; px_a=%1 px_f=%2 px_bc=%3 px_de=%4 maddubsw_bc=%5 maddubsw_de=%6 tmp=%7,%8 ; result (8 words) is left in %1

+%macro SSSE3_FilterVertical2_8px 8

+    movdqa          %8, %2    ; widen a copy so %2 keeps its byte data

+    pxor            %7, %7    ; zero register for the byte -> word unpack

+    punpcklbw       %1, %7    ; low 8 bytes of %1 zero-extended to words

+    punpcklbw       %8, %7

+    paddw           %1, %8    ; a + f (both with weight 1)

+    movdqa          %7, %3

+    pmaddubsw       %7, %5

+    paddw           %1, %7

+    movdqa          %7, %4

+    pmaddubsw       %7, %6

+    paddw           %1, %7

+    paddw           %1, [h264_w0x10_1]    ; add 16 to every word before the shift

+    psraw           %1, 5    ; arithmetic shift right by 5; values stay 16-bit

+%endmacro

+; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6 ; leaves 8 unrounded word sums in %1

+%macro SSSE3_FilterHorizontalbw_8px 6

+    movdqa          %5, %1    ; second copy of the source bytes for the other shuffle

+    pshufb          %1, %2

+    pshufb          %5, %3

+    pshufd          %6, %1, 10110001b    ; swap the two dwords inside each 64-bit half

+    pmaddubsw       %1, [db20_128]    ; both bytes of every pair weighted by 20

+    pmaddubsw       %5, %4

+    pmaddubsw       %6, %4

+    paddw           %1, %5

+    paddw           %1, %6    ; no rounding or shift is applied here

+%endmacro

+; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6 ; rounded result (8 words) in %1

+%macro SSSE3_FilterHorizontal_8px 6

+    SSSE3_FilterHorizontalbw_8px %1, %2, %3, %4, %5, %6

+    paddw           %1, [h264_w0x10_1]    ; add 16 to every word before the shift

+    psraw           %1, 5    ; arithmetic shift right by 5

+%endmacro

+; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7 ; %1 = 4 unrounded words from px0 followed by 4 from px1; %2 is clobbered

+%macro SSSE3_FilterHorizontalbw_2x4px 7

+    movdqa          %6, %1

+    movdqa          %7, %2

+    pshufb          %1, %3

+    pshufb          %2, %3

+    punpcklqdq      %1, %2    ; low half from px0, high half from px1

+    pshufb          %6, %4

+    pshufb          %7, %4

+    punpcklqdq      %6, %7    ; same low/high packing for the second shuffle

+    pshufd          %7, %1, 10110001b    ; swap the two dwords inside each 64-bit half

+    pmaddubsw       %1, [db20_128]    ; both bytes of every pair weighted by 20

+    pmaddubsw       %6, %5

+    pmaddubsw       %7, %5

+    paddw           %1, %6

+    paddw           %1, %7    ; no rounding or shift is applied here

+%endmacro

+; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7 ; rounded result (2 x 4 words) in %1

+%macro SSSE3_FilterHorizontal_2x4px 7

+    SSSE3_FilterHorizontalbw_2x4px %1, %2, %3, %4, %5, %6, %7

+    paddw           %1, [h264_w0x10_1]    ; add 16 to every word before the shift

+    psraw           %1, 5    ; arithmetic shift right by 5

+%endmacro

+; pixels=%1 -32768>>scale=%2 tmp=%3 ; result is left in %1 as dwords

+%macro SSSE3_FilterHorizontalbw_2px 3

+    pmaddubsw       %1, [maddubsw_m2p10_m40m40_p10m2_p0p0_128]    ; byte pairs -> signed word partial sums

+    pmaddwd         %1, %2    ; multiply words by %2 and add adjacent pairs into dwords

+    pshufd          %3, %1, 10110001b    ; swap the two dwords inside each 64-bit half

+    paddd           %1, %3    ; each dword of a 64-bit half now holds the sum of both

+%endmacro

+; pixels=%1 tmp=%2 ; result is left in %1 as dwords

+%macro SSSE3_FilterHorizontal_2px 2

+    SSSE3_FilterHorizontalbw_2px %1, [dwm1024_128], %2    ; word factor is -1024 here

+    paddd           %1, [dd32768_128]    ; add 32768 to every dword

+%endmacro

+; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7 ; word inputs; result in %1, %2..%6 are left unchanged

+%macro SSE2_FilterVerticalw_8px 7

+    paddw           %1, %6    ; px0 + px5

+    movdqa          %7, %2

+    paddw           %7, %5    ; px1 + px4

+    psubw           %1, %7

+    psraw           %1, 2    ; shifting in stages instead of forming the full sum first

+    psubw           %1, %7    ; second subtraction of (px1 + px4)

+    movdqa          %7, %3

+    paddw           %7, %4    ; px2 + px3

+    paddw           %1, %7

+    psraw           %1, 2

+    paddw           %7, [h264_mc_hc_32]    ; constant defined outside this excerpt; presumably words of 32 (rounding) - TODO confirm

+    paddw           %1, %7

+    psraw           %1, 6    ; final arithmetic shift right by 6

+%endmacro

+;***********************************************************************

+; void McHorVer02_ssse3(const uint8_t *pSrc,

+;                       int32_t iSrcStride,

+;                       uint8_t *pDst,

+;                       int32_t iDstStride,

+;                       int32_t iWidth,

+;                       int32_t iHeight)

+;***********************************************************************

+WELS_EXTERN McHorVer02_ssse3

+%define p_src         r0

+%define i_srcstride   r1

+%define p_dst         r2

+%define i_dststride   r3

+%define i_width       r4

+%define i_height      r5

+%define i_srcstride3  r6

+    %assign push_num 0

+%ifdef X86_32

+    push            r6    ; r6 is used below as 3 * i_srcstride; saved/restored only on X86_32

+    %assign push_num 1

+%endif

+    LOAD_6_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r1, r1d

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    SIGN_EXTENSION  r5, r5d

+    sub             p_src, i_srcstride    ; start reading two rows above pSrc

+    sub             p_src, i_srcstride

+    lea             i_srcstride3, [3 * i_srcstride]

+    cmp             i_width, 4

+    jg              .width8or16    ; widths above 4 use the 8-pixel column loop

+    movd            xmm0, [p_src]    ; width <= 4 path: 4 bytes per row, adjacent rows byte-interleaved

+    movd            xmm4, [p_src + i_srcstride]

+    punpcklbw       xmm0, xmm4

+    movd            xmm1, [p_src + 2 * i_srcstride]

+    punpcklbw       xmm4, xmm1

+    punpcklqdq      xmm0, xmm4    ; xmm0 = [rows 0,1 interleaved | rows 1,2 interleaved]

+    movd            xmm4, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    punpcklbw       xmm1, xmm4

+    movd            xmm2, [p_src]

+    punpcklbw       xmm4, xmm2

+    punpcklqdq      xmm1, xmm4    ; xmm1 = [rows 2,3 | rows 3,4]

+    movd            xmm4, [p_src + i_srcstride]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    punpcklbw       xmm2, xmm4

+    movd            xmm3, [p_src]

+    punpcklbw       xmm4, xmm3

+    punpcklqdq      xmm2, xmm4    ; xmm2 = [rows 4,5 | rows 5,6]

+    movdqa          xmm5, [db20_128]

+    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4

+    packuswb        xmm0, xmm0

+    movd            [p_dst], xmm0    ; bytes 0..3 -> first output row

+    psrlq           xmm0, 32

+    movd            [p_dst + i_dststride], xmm0    ; bytes 4..7 -> second output row

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movd            xmm4, [p_src + i_srcstride]

+    punpcklbw       xmm3, xmm4

+    movd            xmm0, [p_src + 2 * i_srcstride]

+    punpcklbw       xmm4, xmm0

+    punpcklqdq      xmm3, xmm4

+    SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4

+    packuswb        xmm1, xmm1

+    movd            [p_dst], xmm1

+    psrlq           xmm1, 32

+    movd            [p_dst + i_dststride], xmm1

+    cmp             i_height, 5

+    jl              .width4_height_le5_done    ; height < 5: the four rows written so far are all

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movd            xmm4, [p_src + i_srcstride3]

+    punpcklbw       xmm0, xmm4

+    jg              .width4_height_ge8    ; still the flags of "cmp i_height, 5" (lea/movd/punpcklbw do not modify flags): height > 5

+    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4

+    packuswb        xmm2, xmm2

+    movd            [p_dst], xmm2    ; height == 5: only one more row is stored

+.width4_height_le5_done:

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop             r6

+%endif

+    ret

+.width4_height_ge8:

+    lea             p_src, [p_src + 4 * i_srcstride]

+    movd            xmm1, [p_src]

+    punpcklbw       xmm4, xmm1

+    punpcklqdq      xmm0, xmm4

+    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4

+    packuswb        xmm2, xmm2

+    movd            [p_dst], xmm2

+    psrlq           xmm2, 32

+    movd            [p_dst + i_dststride], xmm2

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movd            xmm4, [p_src + i_srcstride]

+    punpcklbw       xmm1, xmm4

+    movd            xmm2, [p_src + 2 * i_srcstride]

+    punpcklbw       xmm4, xmm2

+    punpcklqdq      xmm1, xmm4

+    SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4

+    packuswb        xmm3, xmm3

+    movd            [p_dst], xmm3

+    psrlq           xmm3, 32

+    movd            [p_dst + i_dststride], xmm3

+    cmp             i_height, 9

+    jl              .width4_height_ge8_done    ; height < 9: done after eight rows

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movd            xmm4, [p_src + i_srcstride3]

+    punpcklbw       xmm2, xmm4

+    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4

+    packuswb        xmm0, xmm0

+    movd            [p_dst], xmm0    ; ninth row

+.width4_height_ge8_done:

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop             r6

+%endif

+    ret

+.width8or16:

+    sub             i_height, 1    ; the row counter is kept as iHeight - 1

+    push            i_height    ; saved copy; re-read through [r7] for each further 8-px column (presumably r7 is the stack pointer - confirm in asm_inc.asm)

+%xdefine i_ycnt i_height

+%define i_height [r7]

+.xloop:

+    push            p_src    ; column start pointers, restored at .yloop_exit

+    push            p_dst

+    test            i_ycnt, 1

+    jnz             .yloop_begin_even    ; (iHeight - 1) odd, i.e. even height: no single leading row

+    movq            xmm0, [p_src]

+    movq            xmm1, [p_src + i_srcstride]

+    punpcklbw       xmm0, xmm1

+    movq            xmm2, [p_src + 2 * i_srcstride]

+    movq            xmm3, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    punpcklbw       xmm2, xmm3

+    movq            xmm4, [p_src]

+    movq            xmm5, [p_src + i_srcstride]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    punpcklbw       xmm4, xmm5

+    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm7

+    packuswb        xmm0, xmm0

+    movlps          [p_dst], xmm0    ; one leading output row; .yloop then emits rows in pairs

+    add             p_dst, i_dststride

+    jmp             .yloop

+.yloop_begin_even:

+    movq            xmm1, [p_src]

+    movq            xmm2, [p_src + i_srcstride]

+    movq            xmm3, [p_src + 2 * i_srcstride]

+    add             p_src, i_srcstride3

+    punpcklbw       xmm2, xmm3

+    movq            xmm4, [p_src]

+    movq            xmm5, [p_src + i_srcstride]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    punpcklbw       xmm4, xmm5

+.yloop:

+    movq            xmm6, [p_src]

+    SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm0, xmm7

+    movq            xmm7, [p_src + i_srcstride]

+    punpcklbw       xmm6, xmm7

+    SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm0

+    packuswb        xmm1, xmm2    ; two output rows in one register: low half / high half

+    movlps          [p_dst], xmm1

+    movhps          [p_dst + i_dststride], xmm1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movq            xmm0, [p_src + 2 * i_srcstride]

+    SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm2, xmm1

+    movq            xmm1, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    punpcklbw       xmm0, xmm1

+    SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm2

+    packuswb        xmm3, xmm4

+    movlps          [p_dst], xmm3

+    movhps          [p_dst + i_dststride], xmm3

+    cmp             i_ycnt, 4

+    jle             .yloop_exit    ; counter <= 4: the four rows just written were the last ones

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movq            xmm2, [p_src]

+    SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm4, xmm3

+    movq            xmm3, [p_src + i_srcstride]

+    punpcklbw       xmm2, xmm3

+    SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm4

+    packuswb        xmm5, xmm6

+    movlps          [p_dst], xmm5

+    movhps          [p_dst + i_dststride], xmm5

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movq            xmm4, [p_src + 2 * i_srcstride]

+    SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm6, xmm5

+    movq            xmm5, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    punpcklbw       xmm4, xmm5

+    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm6

+    packuswb        xmm7, xmm0

+    movlps          [p_dst], xmm7

+    movhps          [p_dst + i_dststride], xmm7

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    sub             i_ycnt, 8    ; a full pass through .yloop writes eight rows

+    jg              .yloop

+.yloop_exit:

+    pop             p_dst    ; back to the top of the current column

+    pop             p_src

+    sub             i_width, 8

+    jle             .width8or16_done

+    add             p_src, 8    ; advance to the next 8-pixel column

+    add             p_dst, 8

+    mov             i_ycnt, i_height    ; reload the saved iHeight - 1 from its stack slot

+    jmp             .xloop

+.width8or16_done:

+    pop             i_ycnt    ; drop the saved row count pushed at .width8or16

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop             r6

+%endif

+    ret

+%undef p_src

+%undef i_srcstride

+%undef i_srcstride3

+%undef p_dst

+%undef i_dststride

+%undef i_width

+%undef i_height

+%undef i_ycnt

+;*******************************************************************************

+; void McHorVer20_ssse3(const uint8_t *pSrc,

+;                       int iSrcStride,

+;                       uint8_t *pDst,

+;                       int iDstStride,

+;                       int iWidth,

+;                       int iHeight);

+;*******************************************************************************

+WELS_EXTERN McHorVer20_ssse3

+%define p_src        r0

+%define i_srcstride  r1

+%define p_dst        r2

+%define i_dststride  r3

+%define i_width      r4

+%define i_height     r5

+    %assign  push_num 0

+    LOAD_6_PARA

+    PUSH_XMM 7

+    SIGN_EXTENSION  r1, r1d

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    SIGN_EXTENSION  r5, r5d

+    movdqa          xmm4, [shufb_32435465768798A9]    ; xmm4..xmm6 hold the shuffle/multiplier constants for all three loops

+    movdqa          xmm5, [shufb_011267784556ABBC]

+    movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]

+    cmp             i_width, 8

+    je              .width8_yloop

+    jg              .width16_yloop    ; width < 8 falls through to the 4-pixel loop

+.width4_yloop:

+    movdqu          xmm0, [p_src - 2]    ; 16 source bytes starting 2 px left of the first output pixel

+    movdqu          xmm1, [p_src + i_srcstride - 2]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    SSSE3_FilterHorizontal_2x4px xmm0, xmm1, xmm4, xmm5, xmm6, xmm2, xmm3

+    packuswb        xmm0, xmm0

+    movd            [p_dst], xmm0    ; bytes 0..3 -> first row

+    psrlq           xmm0, 32

+    movd            [p_dst + i_dststride], xmm0    ; bytes 4..7 -> second row

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    sub             i_height, 2    ; two rows per iteration

+    jg              .width4_yloop

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+.width8_yloop:

+    movdqu          xmm0, [p_src - 2]

+    movdqu          xmm1, [p_src + i_srcstride - 2]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    SSSE3_FilterHorizontal_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3

+    SSSE3_FilterHorizontal_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3

+    packuswb        xmm0, xmm1    ; low half = first row, high half = second row

+    movlps          [p_dst], xmm0

+    movhps          [p_dst + i_dststride], xmm0

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    sub             i_height, 2    ; two rows per iteration

+    jg              .width8_yloop

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+.width16_yloop:

+    movdqu          xmm0, [p_src - 2]

+    movdqu          xmm1, [p_src + 6]    ; source window for output pixels 8..15 of the same row

+    add             p_src, i_srcstride

+    SSSE3_FilterHorizontal_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3

+    SSSE3_FilterHorizontal_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3

+    packuswb        xmm0, xmm1

+    MOVDQ           [p_dst], xmm0

+    add             p_dst, i_dststride

+    sub             i_height, 1    ; one 16-pixel row per iteration

+    jg              .width16_yloop

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_width

+%undef i_height

+;***********************************************************************

+; void McHorVer20Width5Or9Or17_ssse3(const uint8_t *pSrc,

+;                                    int32_t iSrcStride,

+;                                    uint8_t *pDst,

+;                                    int32_t iDstStride,

+;                                    int32_t iWidth,

+;                                    int32_t iHeight);

+;***********************************************************************

+WELS_EXTERN McHorVer20Width5Or9Or17_ssse3

+%define p_src        r0

+%define i_srcstride  r1

+%define p_dst        r2

+%define i_dststride  r3

+%define i_width      r4

+%define i_height     r5

+    %assign  push_num 0

+    LOAD_6_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r1, r1d

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    SIGN_EXTENSION  r5, r5d

+    movdqa          xmm5, [shufb_32435465768798A9]    ; xmm5..xmm7 hold the shuffle/multiplier constants for all three loops

+    movdqa          xmm6, [shufb_011267784556ABBC]

+    movdqa          xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]

+    cmp             i_width, 9

+    je              .width9_yloop

+    jg              .width17_yloop    ; width < 9 falls through to the 5-pixel loop

+.width5_yloop:

+    movdqu          xmm0, [p_src - 2]

+    add             p_src, i_srcstride

+    SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2

+    packuswb        xmm0, xmm0

+    movdqa          xmm1, xmm0

+    psrlq           xmm1, 8    ; drop the first byte so the second store starts at pixel 1

+    movd            [p_dst], xmm0    ; pixels 0..3

+    movd            [p_dst + 1], xmm1    ; pixels 1..4: two overlapping 4-byte stores cover the 5 pixels

+    add             p_dst, i_dststride

+    sub             i_height, 1

+    jg              .width5_yloop

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+.width9_yloop:

+    movdqu          xmm0, [p_src - 2]

+    movdqu          xmm4, [p_src + i_srcstride - 2]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    movdqa          xmm3, xmm0

+    punpckhqdq      xmm3, xmm4    ; xmm3 = [upper 8 loaded bytes of row 0 | upper 8 loaded bytes of row 1]

+    SSSE3_FilterHorizontal_2px xmm3, xmm2

+    SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2

+    packuswb        xmm3, xmm0    ; low half: packed 2px results of both rows, high half: row 0 pixels 0..7

+    movd            [p_dst + 5], xmm3    ; writes bytes 5..8; the movhps below rewrites 0..7, so this store must come first

+    movhps          [p_dst], xmm3

+    add             p_dst, i_dststride

+    SSSE3_FilterHorizontal_8px xmm4, xmm5, xmm6, xmm7, xmm1, xmm2

+    packuswb        xmm4, xmm4

+    psrldq          xmm3, 4    ; move the bytes that came from the row 1 half to the bottom

+    movd            [p_dst + 5], xmm3    ; same overlap trick for the second row

+    movlps          [p_dst], xmm4

+    add             p_dst, i_dststride

+    sub             i_height, 2    ; two rows per iteration

+    jg              .width9_yloop

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+.width17_yloop:

+    movdqu          xmm0, [p_src - 2]

+    movdqu          xmm3, [p_src + 6]

+    add             p_src, i_srcstride

+    movdqa          xmm4, xmm3    ; keep the raw bytes of the first row for the 2px filter below

+    SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2

+    SSSE3_FilterHorizontal_8px xmm3, xmm5, xmm6, xmm7, xmm1, xmm2

+    packuswb        xmm0, xmm3

+    movdqu          xmm1, [p_src - 2]    ; second row of the pair (p_src was already advanced)

+    movdqu          xmm3, [p_src + 6]

+    add             p_src, i_srcstride

+    punpckhqdq      xmm4, xmm3    ; xmm4 = [upper 8 bytes of the first row's second load | same for the second row]

+    SSSE3_FilterHorizontal_2px xmm4, xmm2

+    packuswb        xmm4, xmm4

+    movd            [p_dst + 13], xmm4    ; writes bytes 13..16; the 16-byte store below rewrites 0..15, so this store must come first

+    MOVDQ           [p_dst], xmm0

+    add             p_dst, i_dststride

+    psrldq          xmm4, 4

+    movd            [p_dst + 13], xmm4    ; second row: again stored before the full-row store further down

+    SSSE3_FilterHorizontal_8px xmm1, xmm5, xmm6, xmm7, xmm0, xmm2

+    SSSE3_FilterHorizontal_8px xmm3, xmm5, xmm6, xmm7, xmm0, xmm2

+    packuswb        xmm1, xmm3

+    MOVDQ           [p_dst], xmm1

+    add             p_dst, i_dststride

+    sub             i_height, 2    ; two rows per iteration

+    jg              .width17_yloop

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_width

+%undef i_height

+;*******************************************************************************

+; void McHorVer20Width4U8ToS16_ssse3(const uint8_t *pSrc,

+;                                    int iSrcStride,

+;                                    int16_t *pDst,

+;                                    int iHeight);

+;*******************************************************************************

+WELS_EXTERN McHorVer20Width4U8ToS16_ssse3

+%define p_src        r0

+%define i_srcstride  r1

+%define p_dst        r2

+%define i_height     r3

+    %assign  push_num 0

+    LOAD_4_PARA

+    PUSH_XMM 7

+    SIGN_EXTENSION  r1, r1d

+    SIGN_EXTENSION  r3, r3d

+    sub             p_src, i_srcstride    ; start reading two rows above pSrc

+    sub             p_src, i_srcstride

+    movdqa          xmm4, [shufb_32435465768798A9]

+    movdqa          xmm5, [shufb_011267784556ABBC]

+    movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]

+    sub             i_height, 1    ; the last row is handled separately after the loop

+.yloop:

+    movdqu          xmm0, [p_src - 2]

+    movdqu          xmm1, [p_src + i_srcstride - 2]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    SSSE3_FilterHorizontalbw_2x4px xmm0, xmm1, xmm4, xmm5, xmm6, xmm2, xmm3

+    movdqa          [p_dst], xmm0    ; two rows of 4 unrounded words (8 bytes per row); aligned store, so p_dst must be 16-byte aligned

+    add             p_dst, 16

+    sub             i_height, 2

+    jg              .yloop

+    ; Height % 2 remainder. NOTE(review): reached unconditionally (there is no "jl .done" skip as in McHorVer20Width8U8ToS16_ssse3), so an odd iHeight appears to be assumed - confirm against callers.

+    movdqu          xmm0, [p_src - 2]

+    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3

+    movlps          [p_dst], xmm0    ; only the low 4 words (one row) are stored

+    POP_XMM

+    LOAD_4_PARA_POP

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_height

+;***********************************************************************

+; void McHorVer02Width4S16ToU8_ssse3(const int16_t *pSrc,

+;                                    uint8_t *pDst,

+;                                    int32_t iDstStride,

+;                                    int32_t iHeight);

+;***********************************************************************

+WELS_EXTERN McHorVer02Width4S16ToU8_ssse3

+%define p_src        r0

+%define p_dst        r1

+%define i_dststride  r2

+%define i_height     r3

+%define i_srcstride  8

+    %assign  push_num 0

+    LOAD_4_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r2, r2d

+    SIGN_EXTENSION  r3, r3d

+    movdqa          xmm0, [p_src +  0 * i_srcstride]    ; source pitch is the constant 8 bytes, so each 16-byte load spans two rows

+    movdqu          xmm1, [p_src +  1 * i_srcstride]    ; odd multiples of 8 use unaligned loads; the movdqa loads require p_src to be 16-byte aligned

+    movdqa          xmm2, [p_src +  2 * i_srcstride]

+    movdqu          xmm3, [p_src +  3 * i_srcstride]

+    movdqa          xmm4, [p_src +  4 * i_srcstride]

+    movdqu          xmm5, [p_src +  5 * i_srcstride]

+    movdqa          xmm6, [p_src +  6 * i_srcstride]

+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7

+    packuswb        xmm0, xmm0

+    movd            [p_dst], xmm0    ; bytes 0..3 -> output row 0

+    psrlq           xmm0, 32

+    movd            [p_dst + i_dststride], xmm0    ; bytes 4..7 -> output row 1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movdqu          xmm7, [p_src +  7 * i_srcstride]

+    movdqa          xmm0, [p_src +  8 * i_srcstride]

+    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm1

+    packuswb        xmm2, xmm2

+    movd            [p_dst], xmm2

+    psrlq           xmm2, 32

+    movd            [p_dst + i_dststride], xmm2

+    cmp             i_height, 4

+    jle             .done    ; height <= 4: stop after four output rows

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movdqu          xmm1, [p_src +  9 * i_srcstride]

+    movdqa          xmm2, [p_src + 10 * i_srcstride]

+    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm3

+    packuswb        xmm4, xmm4

+    movd            [p_dst], xmm4

+    psrlq           xmm4, 32

+    movd            [p_dst + i_dststride], xmm4

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movdqu          xmm3, [p_src + 11 * i_srcstride]

+    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm5

+    packuswb        xmm6, xmm6

+    movd            [p_dst], xmm6

+    psrlq           xmm6, 32

+    movd            [p_dst + i_dststride], xmm6    ; eighth and last output row of this path

+.done:

+    POP_XMM

+    LOAD_4_PARA_POP

+    ret

+%undef p_src

+%undef p_dst

+%undef i_dststride

+%undef i_height

+%undef i_srcstride

+;***********************************************************************

+; void McHorVer20Width8U8ToS16_ssse3(const uint8_t *pSrc,

+;                                    int32_t iSrcStride,

+;                                    int16_t *pDst,

+;                                    int32_t iDstStride,

+;                                    int32_t iHeight);

+;***********************************************************************

+WELS_EXTERN McHorVer20Width8U8ToS16_ssse3

+%define p_src        r0

+%define i_srcstride  r1

+%define p_dst        r2

+%define i_dststride  r3

+%define i_height     r4

+    %assign  push_num 0

+    LOAD_5_PARA

+    PUSH_XMM 7

+    SIGN_EXTENSION  r1, r1d

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    sub             p_src, i_srcstride    ; start reading two rows above pSrc

+    sub             p_src, i_srcstride

+    movdqa          xmm4, [shufb_32435465768798A9]

+    movdqa          xmm5, [shufb_011267784556ABBC]

+    movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]

+    sub             i_height, 1    ; counter = iHeight - 1 so the flags after the loop tell whether one row is left

+.yloop:

+    movdqu          xmm0, [p_src - 2]

+    movdqu          xmm1, [p_src + i_srcstride - 2]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3

+    MOVDQ           [p_dst], xmm0    ; 8 unrounded words per row; i_dststride is added to the byte pointer as-is

+    add             p_dst, i_dststride

+    SSSE3_FilterHorizontalbw_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3

+    MOVDQ           [p_dst], xmm1

+    add             p_dst, i_dststride

+    sub             i_height, 2    ; two rows per iteration

+    jg              .yloop

+    jl              .done    ; counter went negative: the row count was even, nothing left

+    movdqu          xmm0, [p_src - 2]    ; counter hit exactly 0: one remaining row

+    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3

+    MOVDQ           [p_dst], xmm0

+.done:

+    POP_XMM

+    LOAD_5_PARA_POP

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_height

+;***********************************************************************

+; void McHorVer02Width5S16ToU8_ssse3(const int16_t *pSrc,

+;                                    int32_t iSrcStride,

+;                                    uint8_t *pDst,

+;                                    int32_t iDstStride,

+;                                    int32_t iHeight);

+;***********************************************************************

+WELS_EXTERN McHorVer02Width5S16ToU8_ssse3

+%define p_src        r0

+%define i_srcstride  r1

+%define p_dst        r2

+%define i_dststride  r3

+%define i_height     r4

+%define i_srcstride3 r5

+    %assign  push_num 0

+%ifdef X86_32

+    push            r5    ; r5 is used below as 3 * i_srcstride; saved/restored only on X86_32

+    %assign  push_num 1

+%endif

+    LOAD_5_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r1, r1d

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    lea             i_srcstride3, [3 * i_srcstride]

+    movdqa          xmm0, [p_src]    ; all row loads are aligned (movdqa): p_src and i_srcstride must keep rows 16-byte aligned

+    movdqa          xmm1, [p_src + i_srcstride]

+    movdqa          xmm2, [p_src + 2 * i_srcstride]

+    movdqa          xmm3, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    movdqa          xmm4, [p_src]

+    movdqa          xmm5, [p_src + i_srcstride]

+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6

+    movdqa          xmm6, [p_src + 2 * i_srcstride]    ; xmm0..xmm7 rotate as a sliding window of six source rows

+    packuswb        xmm0, xmm0

+    movdqa          xmm7, xmm0

+    psrlq           xmm7, 8    ; drop the first byte so this copy starts at pixel 1

+    movd            [p_dst + 1], xmm7    ; pixels 1..4

+    movd            [p_dst], xmm0    ; pixels 0..3: two overlapping 4-byte stores cover the 5 pixels

+    add             p_dst, i_dststride

+    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7

+    movdqa          xmm7, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    packuswb        xmm1, xmm1

+    movdqa          xmm0, xmm1

+    psrlq           xmm0, 8

+    movd            [p_dst + 1], xmm0

+    movd            [p_dst], xmm1

+    add             p_dst, i_dststride

+    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0

+    movdqa          xmm0, [p_src]

+    packuswb        xmm2, xmm2

+    movdqa          xmm1, xmm2

+    psrlq           xmm1, 8

+    movd            [p_dst + 1], xmm1

+    movd            [p_dst], xmm2

+    add             p_dst, i_dststride

+    SSE2_FilterVerticalw_8px xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1

+    packuswb        xmm3, xmm3

+    movdqa          xmm2, xmm3

+    psrlq           xmm2, 8

+    movd            [p_dst + 1], xmm2

+    movd            [p_dst], xmm3

+    add             p_dst, i_dststride

+    movdqa          xmm1, [p_src + i_srcstride]

+    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2

+    packuswb        xmm4, xmm4

+    movdqa          xmm3, xmm4

+    psrlq           xmm3, 8

+    movd            [p_dst + 1], xmm3

+    movd            [p_dst], xmm4    ; fifth output row

+    cmp             i_height, 5

+    jle             .done    ; height <= 5: stop after five output rows

+    add             p_dst, i_dststride

+    movdqa          xmm2, [p_src + 2 * i_srcstride]

+    SSE2_FilterVerticalw_8px xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3

+    movdqa          xmm3, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    packuswb        xmm5, xmm5

+    movdqa          xmm4, xmm5

+    psrlq           xmm4, 8

+    movd            [p_dst + 1], xmm4

+    movd            [p_dst], xmm5

+    add             p_dst, i_dststride

+    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4

+    movdqa          xmm4, [p_src]

+    packuswb        xmm6, xmm6

+    movdqa          xmm5, xmm6

+    psrlq           xmm5, 8

+    movd            [p_dst + 1], xmm5

+    movd            [p_dst], xmm6

+    add             p_dst, i_dststride

+    SSE2_FilterVerticalw_8px xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5

+    packuswb        xmm7, xmm7

+    movdqa          xmm6, xmm7

+    psrlq           xmm6, 8

+    movd            [p_dst + 1], xmm6

+    movd            [p_dst], xmm7

+    add             p_dst, i_dststride

+    movdqa          xmm5, [p_src + i_srcstride]

+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6

+    packuswb        xmm0, xmm0

+    movdqa          xmm7, xmm0

+    psrlq           xmm7, 8

+    movd            [p_dst + 1], xmm7

+    movd            [p_dst], xmm0    ; ninth and last output row of this path

+.done:

+    POP_XMM

+    LOAD_5_PARA_POP

+%ifdef X86_32

+    pop             r5

+%endif

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_height

+%undef i_srcstride3

+;***********************************************************************

+; void McHorVer20Width9Or17U8ToS16_ssse3(const uint8_t *pSrc,

+;                                        int32_t iSrcStride,

+;                                        int16_t *pDst,

+;                                        int32_t iDstStride,   -- added to pDst as a raw byte offset per row

+;                                        int32_t iWidth,       -- 9 selects the width-9 loop; any other value takes the width-17 loop

+;                                        int32_t iHeight);     -- rows are produced two per loop iteration

+;***********************************************************************

+WELS_EXTERN McHorVer20Width9Or17U8ToS16_ssse3  ; horizontal filter pass: uint8 rows in, int16 rows out (filter macros are defined elsewhere)

+%define p_src       r0

+%define i_srcstride r1

+%define p_dst       r2

+%define i_dststride r3

+%define i_width     r4

+%define i_height    r5

+    %assign  push_num 0

+    LOAD_6_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r1, r1d

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    SIGN_EXTENSION  r5, r5d

+    sub             p_src, i_srcstride  ; step the source pointer up one row ...

+    sub             p_src, i_srcstride  ; ... and one more: processing starts two rows above pSrc

+    pcmpeqw         xmm4, xmm4

+    psllw           xmm4, 15                                ; dw -32768

+    movdqa          xmm5, [shufb_32435465768798A9]  ; xmm5..xmm7: constants handed to the 8px filter macro

+    movdqa          xmm6, [shufb_011267784556ABBC]

+    movdqa          xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]

+    cmp             i_width, 9

+    jne             .width17_yloop  ; every width other than 9 is treated as 17

+.width9_yloop:

+    movdqu          xmm0, [p_src - 2]  ; row 0: 16 bytes starting 2 pixels left of p_src

+    movdqa          xmm3, xmm0  ; keep the raw row 0 bytes for the tail filter below

+    SSSE3_FilterHorizontalbw_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2  ; xmm0 <- filtered row 0 (presumably 8 int16 results; xmm1/xmm2 are scratch)

+    movdqu          xmm2, [p_src + i_srcstride - 2]  ; row 1 raw bytes

+    lea             p_src, [p_src + 2 * i_srcstride]  ; advance two source rows

+    punpckhqdq      xmm3, xmm2  ; xmm3 = {high 8 bytes of row 0, high 8 bytes of row 1}

+    SSSE3_FilterHorizontalbw_2px xmm3, xmm4, xmm1  ; tail results for both rows in one call

+    movlps          [p_dst + 10], xmm3  ; row 0 tail (low qword) -> bytes 10..17; done before the [p_dst] store, which presumably rewrites bytes 10..15

+    MOVDQ           [p_dst], xmm0  ; row 0, first 8 results (MOVDQ is defined elsewhere; presumably a 16-byte store)

+    add             p_dst, i_dststride

+    movhps          [p_dst + 10], xmm3  ; row 1 tail (high qword)

+    SSSE3_FilterHorizontalbw_8px xmm2, xmm5, xmm6, xmm7, xmm1, xmm0  ; xmm2 <- filtered row 1

+    MOVDQ           [p_dst], xmm2

+    add             p_dst, i_dststride

+    sub             i_height, 2  ; two rows finished per iteration

+    jg              .width9_yloop

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+.width17_yloop:

+    movdqu          xmm0, [p_src - 2]  ; row 0, bytes feeding the first 8 results

+    movdqu          xmm3, [p_src + 6]  ; row 0, bytes feeding the next 8 results and the tail

+    add             p_src, i_srcstride  ; p_src now addresses row 1

+    SSSE3_FilterHorizontalbw_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2

+    MOVDQ           [p_dst], xmm0

+    movdqa          xmm0, xmm3  ; raw copy kept for the tail filter

+    SSSE3_FilterHorizontalbw_8px xmm3, xmm5, xmm6, xmm7, xmm1, xmm2

+    movdqu          xmm2, [p_src + 6]  ; row 1, second 16-byte window

+    punpckhqdq      xmm0, xmm2  ; xmm0 = {high 8 bytes of row 0 window, high 8 bytes of row 1 window}

+    SSSE3_FilterHorizontalbw_2px xmm0, xmm4, xmm1  ; tail results for both rows

+    movdqu          xmm1, [p_src - 2]  ; row 1, first 16-byte window

+    add             p_src, i_srcstride

+    movlps          [p_dst + 26], xmm0  ; row 0 tail -> bytes 26..33; done before the [p_dst + 16] store, which presumably rewrites bytes 26..31

+    MOVDQ           [p_dst + 16], xmm3  ; row 0, second group of 8 results

+    add             p_dst, i_dststride

+    movhps          [p_dst + 26], xmm0  ; row 1 tail

+    SSSE3_FilterHorizontalbw_8px xmm1, xmm5, xmm6, xmm7, xmm0, xmm3

+    MOVDQ           [p_dst], xmm1

+    SSSE3_FilterHorizontalbw_8px xmm2, xmm5, xmm6, xmm7, xmm0, xmm3

+    MOVDQ           [p_dst + 16], xmm2

+    add             p_dst, i_dststride

+    sub             i_height, 2  ; two rows finished per iteration

+    jg              .width17_yloop

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_width

+%undef i_height

+;***********************************************************************

+; void McHorVer02WidthGe8S16ToU8_ssse3(const int16_t *pSrc,     -- loaded with movdqa in the 8-column passes, so those addresses must be 16-byte aligned

+;                                      int32_t iSrcStride,      -- added to pSrc as a raw byte offset per row

+;                                      uint8_t *pDst,

+;                                      int32_t iDstStride,

+;                                      int32_t iWidth,          -- an odd width has its last column processed separately, first

+;                                      int32_t iHeight);

+;***********************************************************************

+WELS_EXTERN McHorVer02WidthGe8S16ToU8_ssse3  ; vertical filter pass: int16 rows in, uint8 rows out (SSE2_FilterVerticalw_8px is defined elsewhere)

+%define p_src        r0

+%define i_srcstride  r1

+%define p_dst        r2

+%define i_dststride  r3

+%define i_width      r4

+%define i_height     r5

+%define i_srcstride3 r6

+    %assign  push_num 0

+%ifdef X86_32

+    push            r6

+    %assign  push_num 1

+%endif

+    LOAD_6_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r1, r1d

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    SIGN_EXTENSION  r5, r5d

+    sub             i_height, 1  ; all loops below count with iHeight - 1

+    push            i_height  ; saved so each column pass can reload it (see the mov i_height, [r7] lines)

+    lea             i_srcstride3, [3 * i_srcstride]

+    test            i_width, 1

+    jz              .width_loop  ; even width: only the 8-column passes are needed

+    push            p_src

+    push            p_dst

+    lea             p_src, [p_src + 2 * i_width - 2]  ; address of the last int16 column (index iWidth - 1)

+    add             p_dst, i_width  ; one past the last output byte of the row; the stores below use negative offsets

+    movd            xmm0, [p_src]  ; row 0 sample in word 0

+    punpcklwd       xmm0, [p_src + i_srcstride]  ; words 0,1 = rows 0,1 (16-byte memory operand, must be aligned)

+    movd            xmm1, [p_src + 2 * i_srcstride]

+    add             p_src, i_srcstride3

+    punpcklwd       xmm1, [p_src]  ; words 0,1 = rows 2,3

+    punpckldq       xmm0, xmm1  ; low qword = rows 0..3 of the last column

+    movd            xmm1, [p_src + i_srcstride]  ; row 4

+    cmp             i_height, 4

+    je              .filter5_unalign  ; iHeight == 5 gets a dedicated single-shot path

+    punpcklwd       xmm1, [p_src + 2 * i_srcstride]

+    movd            xmm2, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    punpcklwd       xmm2, [p_src]

+    punpckldq       xmm1, xmm2

+    punpcklqdq      xmm0, xmm1  ; xmm0 = rows 0..7 of the last column, one row per word lane

+.height_loop_unalign:

+    movd            xmm1, [p_src + i_srcstride]

+    palignr         xmm1, xmm0, 2  ; xmm1 = xmm0 shifted down one lane with the next row appended

+    movd            xmm2, [p_src + 2 * i_srcstride]

+    palignr         xmm2, xmm1, 2

+    movd            xmm3, [p_src + i_srcstride3]

+    palignr         xmm3, xmm2, 2

+    lea             p_src, [p_src + 4 * i_srcstride]

+    movd            xmm4, [p_src]

+    palignr         xmm4, xmm3, 2

+    movd            xmm5, [p_src + i_srcstride]

+    palignr         xmm5, xmm4, 2  ; xmm0..xmm5 now hold six row windows, each offset by one row

+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7  ; xmm0 <- results for 8 consecutive output rows (xmm7 is scratch)

+    packuswb        xmm0, xmm0  ; saturate to bytes: byte k = output row k of this group

+    movdqa          xmm6, xmm0

+    pslld           xmm6, 24  ; move rows 0 and 4 into the top byte of dwords 0 and 1

+    movd            [p_dst - 4], xmm6  ; row 0 result lands at p_dst - 1; the 3 bytes before it get zeros (presumably rewritten by the 8-column passes)

+    movlps          [p_dst + 4 * i_dststride - 8], xmm6  ; row 4 result lands at byte p_dst - 1 of that row

+    add             p_dst, i_dststride

+    movdqa          xmm6, xmm0

+    pslld           xmm6, 16  ; rows 1 and 5 into the top bytes

+    movd            [p_dst - 4], xmm6

+    movlps          [p_dst + 4 * i_dststride - 8], xmm6

+    add             p_dst, i_dststride

+    movdqa          xmm6, xmm0

+    pslld           xmm6, 8  ; rows 2 and 6 into the top bytes

+    movd            [p_dst - 4], xmm6

+    movd            [p_dst + i_dststride - 4], xmm0  ; row 3 is already in the top byte of dword 0

+    lea             p_dst, [p_dst + 4 * i_dststride]

+    movlps          [p_dst - 8], xmm6  ; row 6

+    movlps          [p_dst + i_dststride - 8], xmm0  ; row 7 is already in the top byte of dword 1

+    lea             p_dst, [p_dst + 2 * i_dststride]  ; p_dst now addresses the row after this group of 8

+    sub             i_height, 8

+    jle             .height_loop_unalign_exit

+    movd            xmm1, [p_src + 2 * i_srcstride]  ; rebuild xmm0 as the next 8-row window from xmm5 plus three new rows

+    palignr         xmm1, xmm5, 2

+    movd            xmm0, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    punpcklwd       xmm0, [p_src]

+    palignr         xmm0, xmm1, 4

+    jmp             .height_loop_unalign

+.height_loop_unalign_exit:

+    movddup         xmm6, [p_src + 2 * i_srcstride - 6]  ; the -6 offset puts the column sample in word 3, which movddup mirrors into lane 7

+    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7  ; only lane 7 is used: the one remaining output row

+    packuswb        xmm1, xmm1

+    movlps          [p_dst - 8], xmm1  ; byte 7 (lane 7) lands at p_dst - 1

+    jmp             .unalign_done

+.filter5_unalign:

+    pslldq          xmm0, 8  ; move rows 0..3 into word lanes 4..7

+    palignr         xmm1, xmm0, 2  ; lanes 3..7 = rows 0..4

+    movd            xmm2, [p_src + 2 * i_srcstride]

+    palignr         xmm2, xmm1, 2

+    movd            xmm3, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    palignr         xmm3, xmm2, 2

+    movd            xmm4, [p_src]

+    palignr         xmm4, xmm3, 2

+    movd            xmm5, [p_src + i_srcstride]

+    palignr         xmm5, xmm4, 2

+    movd            xmm6, [p_src + 2 * i_srcstride]

+    palignr         xmm6, xmm5, 2

+    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7  ; lanes 3..7 <- the five output rows

+    packuswb        xmm1, xmm1  ; bytes 3..7 = output rows 0..4

+    movdqa          xmm0, xmm1

+    psrlq           xmm1,  8  ; byte 3 = row 1

+    movdqa          xmm2, xmm0

+    psrlq           xmm2, 16  ; byte 3 = row 2

+    movdqa          xmm3, xmm0

+    psrlq           xmm3, 24  ; byte 3 = row 3

+    movd            [p_dst - 4], xmm0  ; each 4-byte store puts its byte 3 at p_dst - 1 of that row

+    movd            [p_dst + i_dststride - 4], xmm1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movd            [p_dst - 4], xmm2

+    movd            [p_dst + i_dststride - 4], xmm3

+    movlps          [p_dst + 2 * i_dststride - 8], xmm0  ; row 4: byte 7 lands at p_dst - 1

+.unalign_done:

+    pop             p_dst

+    pop             p_src

+    mov             i_height, [r7]  ; reload the iHeight - 1 pushed at entry (r7 is presumably the stack-pointer alias)

+    sub             i_width, 1  ; the remaining width is handled 8 columns at a time

+.width_loop:

+    push            p_src

+    push            p_dst

+    movdqa          xmm0, [p_src]  ; preload rows 0..4 of this 8-column strip

+    movdqa          xmm1, [p_src + i_srcstride]

+    movdqa          xmm2, [p_src + 2 * i_srcstride]

+    movdqa          xmm3, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    movdqa          xmm4, [p_src]

+.height_loop:

+    movdqa          xmm5, [p_src + i_srcstride]

+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6  ; registers rotate: each call consumes six rows and overwrites the oldest

+    movdqa          xmm6, [p_src + 2 * i_srcstride]

+    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7

+    movdqa          xmm7, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    packuswb        xmm0, xmm1  ; two output rows packed into one register

+    movlps          [p_dst], xmm0

+    movhps          [p_dst + i_dststride], xmm0

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0

+    movdqa          xmm0, [p_src]

+    SSE2_FilterVerticalw_8px xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1

+    packuswb        xmm2, xmm3

+    movlps          [p_dst], xmm2

+    movhps          [p_dst + i_dststride], xmm2

+    cmp             i_height, 4

+    jl              .x_loop_dec  ; fewer than 4 rows left in the count: this strip is complete

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    movdqa          xmm1, [p_src + i_srcstride]

+    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2

+    je              .store_xmm4_exit  ; still the cmp i_height, 4 flags; relies on the macro leaving EFLAGS untouched

+    movdqa          xmm2, [p_src + 2 * i_srcstride]

+    SSE2_FilterVerticalw_8px xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3

+    movdqa          xmm3, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    packuswb        xmm4, xmm5

+    movlps          [p_dst], xmm4

+    movhps          [p_dst + i_dststride], xmm4

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4

+    movdqa          xmm4, [p_src]

+    SSE2_FilterVerticalw_8px xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5

+    packuswb        xmm6, xmm7

+    movlps          [p_dst], xmm6

+    movhps          [p_dst + i_dststride], xmm6

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    sub             i_height, 8  ; eight rows written per full iteration

+    jg              .height_loop

+    jl              .x_loop_dec

+    movdqa          xmm5, [p_src + i_srcstride]  ; count hit exactly zero: one final row remains

+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6

+    packuswb        xmm0, xmm0

+    movlps          [p_dst], xmm0

+.x_loop_dec:

+    pop             p_dst

+    pop             p_src

+    sub             i_width, 8

+    jle             .done

+    mov             i_height, [r7]  ; reload iHeight - 1 for the next strip

+    add             p_src, 16  ; next 8 int16 columns

+    add             p_dst, 8  ; next 8 output bytes

+    jmp             .width_loop

+.store_xmm4_exit:

+    packuswb        xmm4, xmm4

+    movlps          [p_dst], xmm4  ; fifth and last row of the iHeight == 5 case

+    pop             p_dst

+    pop             p_src  ; NOTE(review): falls into .done without the i_width check, so iHeight == 5 appears to assume a single 8-column strip - confirm against callers

+.done:

+    pop             i_height  ; drop the saved iHeight - 1

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop             r6

+%endif

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_width

+%undef i_height

+%undef i_srcstride3

+%ifdef HAVE_AVX2

+; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6 -- result (16-bit lanes, no rounding or shift applied) is left in %1

+%macro AVX2_FilterHorizontalbw_16px 6

+    vpshufb         %5, %1, %3  ; tmp %5 = pixels rearranged with mask %3

+    vpshufb         %1, %1, %2  ; %1 = pixels rearranged with mask %2

+    vpshufd         %6, %1, 10110001b  ; tmp %6 = %1 with the two dwords of every qword swapped

+    vpmaddubsw      %1, %1, [db20_256]  ; byte-pair multiply-add against the db20_256 table (defined elsewhere)

+    vpmaddubsw      %5, %5, %4

+    vpmaddubsw      %6, %6, %4

+    vpaddw          %1, %1, %5  ; accumulate the three partial products into %1

+    vpaddw          %1, %1, %6

+%endmacro

+; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 (forwarded as %4 of AVX2_FilterHorizontalbw_16px) tmp=%5,%6

+%macro AVX2_FilterHorizontal_16px 6

+    AVX2_FilterHorizontalbw_16px %1, %2, %3, %4, %5, %6  ; unrounded 16-bit filter sums in %1

+    vpaddw          %1, %1, [h264_w0x10_256]  ; add the h264_w0x10_256 constant (presumably the rounding term 16 per word)

+    vpsraw          %1, %1, 5  ; arithmetic shift right by 5; result stays in 16-bit lanes

+%endmacro

+; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7 -- result (16-bit lanes, no rounding or shift applied) is left in %1; %2 is clobbered

+%macro AVX2_FilterHorizontalbw_4x4px 7

+    vpshufb         %6, %1, %4  ; both inputs rearranged with mask %4 ...

+    vpshufb         %7, %2, %4

+    vpshufb         %1, %1, %3  ; ... and with mask %3

+    vpshufb         %2, %2, %3

+    vpunpcklqdq     %1, %1, %2  ; per 128-bit lane: low qword from px0, high qword from px1

+    vpunpcklqdq     %6, %6, %7

+    vpshufd         %7, %1, 10110001b  ; tmp %7 = %1 with the two dwords of every qword swapped

+    vpmaddubsw      %1, %1, [db20_256]  ; byte-pair multiply-add against the db20_256 table (defined elsewhere)

+    vpmaddubsw      %6, %6, %5

+    vpmaddubsw      %7, %7, %5

+    vpaddw          %1, %1, %6  ; accumulate the three partial products into %1

+    vpaddw          %1, %1, %7

+%endmacro

+; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 (forwarded as %5 of AVX2_FilterHorizontalbw_4x4px) tmp=%6,%7

+%macro AVX2_FilterHorizontal_4x4px 7

+    AVX2_FilterHorizontalbw_4x4px %1, %2, %3, %4, %5, %6, %7  ; unrounded 16-bit filter sums in %1

+    vpaddw          %1, %1, [h264_w0x10_256]  ; add the h264_w0x10_256 constant (presumably the rounding term 16 per word)

+    vpsraw          %1, %1, 5  ; arithmetic shift right by 5; result stays in 16-bit lanes

+%endmacro

+; pixels=%1 -32768>>scale=%2 tmp=%3 -- result (32-bit lanes) is left in %1

+%macro AVX2_FilterHorizontalbw_4px 3

+    vpmaddubsw      %1, %1, [maddubsw_m2p10_m40m40_p10m2_p0p0_256]  ; byte-pair multiply-add against the coefficient table (defined elsewhere)

+    vpmaddwd        %1, %1, %2  ; word-pair multiply-add with the scale operand, widening to dwords

+    vpshufd         %3, %1, 10110001b  ; tmp = %1 with the two dwords of every qword swapped

+    vpaddd          %1, %1, %3  ; each dword now holds the sum of its qword's two dwords

+%endmacro

+; pixels=%1 tmp=%2 -- result (32-bit lanes) is left in %1

+%macro AVX2_FilterHorizontal_4px 2

+    AVX2_FilterHorizontalbw_4px %1, [dwm1024_256], %2  ; uses the dwm1024_256 table (defined elsewhere) as the scale operand

+    vpaddd          %1, %1, [dd32768_256]  ; add the dd32768_256 constant to every dword; no shift is applied here

+%endmacro

+; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7 -- result (16-bit lanes) is left in %1; %2 and %3 are not modified

+%macro AVX2_FilterVertical_16px 7

+    vpmaddubsw      %1, %1, %4  ; weighted sum of the a/b byte pairs

+    vpmaddubsw      %7, %2, %5  ; weighted sum of the c/d byte pairs

+    vpaddw          %1, %1, %7

+    vpmaddubsw      %7, %3, %6  ; weighted sum of the e/f byte pairs

+    vpaddw          %1, %1, %7

+    vpaddw          %1, %1, [h264_w0x10_256]  ; add the h264_w0x10_256 constant (presumably the rounding term 16 per word)

+    vpsraw          %1, %1, 5  ; arithmetic shift right by 5

+%endmacro

+; px_a=%1 px_f=%2 px_bc=%3 px_de=%4 maddubsw_bc=%5 maddubsw_de=%6 tmp=%7,%8 -- result (16-bit lanes) is left in %1; %2, %3 and %4 are not modified

+%macro AVX2_FilterVertical2_16px 8

+    vpxor           %7, %7, %7  ; zero register used for byte -> word widening

+    vpunpcklbw      %1, %1, %7  ; zero-extend the low 8 bytes of each 128-bit lane of px_a

+    vpunpcklbw      %8, %2, %7  ; same for px_f, into tmp %8

+    vpaddw          %1, %1, %8  ; a + f (unweighted)

+    vpmaddubsw      %7, %3, %5  ; weighted sum of the b/c byte pairs

+    vpaddw          %1, %1, %7

+    vpmaddubsw      %7, %4, %6  ; weighted sum of the d/e byte pairs

+    vpaddw          %1, %1, %7

+    vpaddw          %1, %1, [h264_w0x10_256]  ; add the h264_w0x10_256 constant (presumably the rounding term 16 per word)

+    vpsraw          %1, %1, 5  ; arithmetic shift right by 5

+%endmacro

+; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7 -- all operands are 16-bit lanes; result is left in %1 and %2..%6 are not modified

+%macro AVX2_FilterVerticalw_16px 7

+    vpaddw          %1, %1, %6  ; px0 + px5

+    vpaddw          %7, %2, %5  ; tmp = px1 + px4

+    vpsubw          %1, %1, %7  ; (px0 + px5) - (px1 + px4)

+    vpsraw          %1, %1, 2  ; first of three staged shifts: 2 + 2 + 6 bits in total

+    vpsubw          %1, %1, %7  ; subtract (px1 + px4) once more

+    vpaddw          %7, %3, %4  ; tmp = px2 + px3

+    vpaddw          %1, %1, %7

+    vpsraw          %1, %1, 2

+    vpaddw          %7, %7, [dw32_256]  ; (px2 + px3) plus the dw32_256 constant (presumably the rounding term 32 per word)

+    vpaddw          %1, %1, %7

+    vpsraw          %1, %1, 6  ; final arithmetic shift right by 6

+%endmacro

+;***********************************************************************

+; void McHorVer02_avx2(const uint8_t *pSrc,

+;                      int32_t iSrcStride,

+;                      uint8_t *pDst,      -- the width-16 path stores with vmovdqa, so pDst rows must be 16-byte aligned there

+;                      int32_t iDstStride,

+;                      int32_t iWidth,     -- below 8 takes the width-4 path, 8 the width-8 path, above 8 the width-16 path

+;                      int32_t iHeight)

+;***********************************************************************

+WELS_EXTERN McHorVer02_avx2  ; vertical filter pass over uint8 rows, reading from two rows above pSrc

+%define p_src         r0

+%define i_srcstride   r1

+%define p_dst         r2

+%define i_dststride   r3

+%define i_width       r4

+%define i_height      r5

+%define i_srcstride3  r6

+    %assign push_num 0

+%ifdef X86_32

+    push            r6

+    %assign push_num 1

+%endif

+    LOAD_6_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r1, r1d

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    SIGN_EXTENSION  r5, r5d

+    sub             p_src, i_srcstride  ; step up one row ...

+    sub             p_src, i_srcstride  ; ... and one more; "row n" in the comments below counts from this adjusted pointer

+    lea             i_srcstride3, [3 * i_srcstride]

+    cmp             i_width, 8

+    je              .width8

+    jg              .width16  ; any width above 8 is handled as 16

+; .width4:

+    vmovd           xmm0, [p_src]  ; row 0 (4 pixels)

+    vpbroadcastd    xmm5, [p_src + i_srcstride]  ; row 1 replicated into every dword

+    vpunpcklbw      xmm0, xmm0, xmm5  ; low qword = rows 0/1 byte-interleaved

+    vpbroadcastd    ymm1, [p_src + 2 * i_srcstride]  ; row 2

+    vpunpcklbw      xmm5, xmm5, xmm1  ; rows 1/2 byte-interleaved

+    vpblendd        xmm0, xmm0, xmm5, 1100b  ; xmm0 = {rows 0/1, rows 1/2}

+    vpbroadcastd    ymm5, [p_src + i_srcstride3]  ; row 3

+    lea             p_src, [p_src + 4 * i_srcstride]

+    vpunpcklbw      ymm1, ymm1, ymm5  ; rows 2/3 in every qword

+    vpbroadcastd    ymm2, [p_src]  ; row 4

+    vpunpcklbw      ymm5, ymm5, ymm2  ; rows 3/4 in every qword

+    vpblendd        ymm1, ymm1, ymm5, 11001100b  ; each lane = {rows 2/3, rows 3/4}

+    vpblendd        ymm0, ymm0, ymm1, 11110000b  ; ymm0 = row pairs (0,1) (1,2) (2,3) (3,4), one pair per qword

+    vpbroadcastd    ymm5, [p_src + i_srcstride]  ; row 5

+    lea             p_src, [p_src + 2 * i_srcstride]

+    vpunpcklbw      ymm2, ymm2, ymm5

+    vpbroadcastd    ymm3, [p_src]  ; row 6

+    vpunpcklbw      ymm5, ymm5, ymm3

+    vpblendd        ymm2, ymm2, ymm5, 11001100b  ; each lane = {rows 4/5, rows 5/6}

+    vpblendd        ymm1, ymm1, ymm2, 11110000b  ; ymm1 = row pairs (2,3) (3,4) (4,5) (5,6)

+    vpbroadcastd    ymm5, [p_src + i_srcstride]  ; row 7

+    vpunpcklbw      ymm3, ymm3, ymm5

+    vpbroadcastd    ymm4, [p_src + 2 * i_srcstride]  ; row 8

+    vpunpcklbw      ymm5, ymm5, ymm4

+    vpblendd        ymm3, ymm3, ymm5, 11001100b  ; each lane = {rows 6/7, rows 7/8}

+    vpblendd        ymm2, ymm2, ymm3, 11110000b  ; ymm2 = row pairs (4,5) (5,6) (6,7) (7,8)

+    vbroadcasti128  ymm6, [db20_128]  ; middle-pair coefficients, kept in ymm6 for the rest of this path

+    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [maddubsw_p1m5_256], ymm6, [maddubsw_m5p1_256], ymm5  ; ymm0 <- output rows 0..3, 4 pixels each, as words

+    vpackuswb       ymm0, ymm0, ymm0  ; saturate to bytes

+    vmovd           [p_dst], xmm0  ; output row 0

+    vpsrlq          xmm5, xmm0, 32

+    vmovd           [p_dst + i_dststride], xmm5  ; output row 1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    vextracti128    xmm0, ymm0, 1

+    vmovd           [p_dst], xmm0  ; output row 2

+    vpsrlq          xmm5, xmm0, 32

+    vmovd           [p_dst + i_dststride], xmm5  ; output row 3

+    cmp             i_height, 5

+    jl              .width4_done  ; iHeight below 5: the four rows above are everything

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    vpbroadcastd    ymm5, [p_src + i_srcstride3]  ; row 9

+    vpunpcklbw      ymm4, ymm4, ymm5  ; rows 8/9 in every qword

+    jg              .width4_height_ge8  ; still the cmp i_height, 5 flags (lea and the vector ops do not write EFLAGS)

+    AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [maddubsw_p1m5_256], xmm6, [maddubsw_m5p1_256], xmm5  ; iHeight == 5: low qword <- output row 4

+    vpackuswb       xmm2, xmm2, xmm2

+    vmovd           [p_dst], xmm2  ; output row 4

+    jmp             .width4_done

+.width4_height_ge8:

+    lea             p_src, [p_src + 4 * i_srcstride]

+    vpbroadcastd    ymm1, [p_src]  ; row 10

+    vpunpcklbw      ymm5, ymm5, ymm1

+    vpblendd        ymm4, ymm4, ymm5, 11001100b  ; each lane = {rows 8/9, rows 9/10}

+    vpblendd        ymm3, ymm3, ymm4, 11110000b  ; ymm3 = row pairs (6,7) (7,8) (8,9) (9,10)

+    vpbroadcastd    ymm5, [p_src + i_srcstride]  ; row 11

+    vpunpcklbw      ymm1, ymm5  ; two-operand form; NASM's shorthand for vpunpcklbw ymm1, ymm1, ymm5

+    vpbroadcastd    ymm0, [p_src + 2 * i_srcstride]  ; row 12

+    vpunpcklbw      ymm5, ymm5, ymm0

+    vpblendd        ymm1, ymm1, ymm5, 11001100b  ; each lane = {rows 10/11, rows 11/12}

+    vpblendd        ymm4, ymm4, ymm1, 11110000b  ; ymm4 = row pairs (8,9) (9,10) (10,11) (11,12)

+    AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [maddubsw_p1m5_256], ymm6, [maddubsw_m5p1_256], ymm5  ; ymm2 <- output rows 4..7

+    vpackuswb       ymm2, ymm2, ymm2

+    vmovd           [p_dst], xmm2  ; output row 4

+    vpsrlq          xmm5, xmm2, 32

+    vmovd           [p_dst + i_dststride], xmm5  ; output row 5

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    vextracti128    xmm2, ymm2, 1

+    vmovd           [p_dst], xmm2  ; output row 6

+    vpsrlq          xmm5, xmm2, 32

+    vmovd           [p_dst + i_dststride], xmm5  ; output row 7

+    cmp             i_height, 9

+    jl              .width4_done  ; iHeight below 9: done after eight rows

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    vmovd           xmm5, [p_src + i_srcstride3]  ; row 13

+    vpunpcklbw      xmm0, xmm0, xmm5  ; rows 12/13 interleaved

+    AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [maddubsw_p1m5_256], xmm6, [maddubsw_m5p1_256], xmm5  ; low qword <- output row 8

+    vpackuswb       xmm4, xmm4, xmm4

+    vmovd           [p_dst], xmm4  ; output row 8

+.width4_done:

+    vzeroupper

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop             r6

+%endif

+    ret

+.width8:

+    sub             i_height, 1  ; this path counts with iHeight - 1

+    vmovq           xmm0, [p_src]  ; row 0 (8 pixels)

+    vmovq           xmm4, [p_src + i_srcstride]  ; row 1

+    vpunpcklbw      xmm0, xmm0, xmm4  ; rows 0/1 byte-interleaved

+    vmovq           xmm1, [p_src + 2 * i_srcstride]  ; row 2

+    vpunpcklbw      xmm4, xmm4, xmm1  ; rows 1/2

+    vinserti128     ymm0, ymm0, xmm4, 1  ; ymm0 = {rows 0/1, rows 1/2}, one pair per 128-bit lane

+    vmovq           xmm4, [p_src + i_srcstride3]  ; row 3

+    lea             p_src, [p_src + 4 * i_srcstride]

+    vpunpcklbw      xmm1, xmm1, xmm4

+    vmovq           xmm6, [p_src]  ; row 4, kept raw in xmm6 for the next pairing

+    vpunpcklbw      xmm4, xmm4, xmm6

+    vinserti128     ymm1, ymm1, xmm4, 1  ; ymm1 = {rows 2/3, rows 3/4}

+.width8_yloop:

+    vmovq           xmm4, [p_src + i_srcstride]

+    vpunpcklbw      xmm2, xmm6, xmm4

+    vmovq           xmm3, [p_src + 2 * i_srcstride]

+    vpunpcklbw      xmm4, xmm4, xmm3

+    vinserti128     ymm2, ymm2, xmm4, 1  ; ymm2 = the next two overlapping row pairs

+    vbroadcasti128  ymm5, [db20_128]  ; middle-pair coefficients

+    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4  ; ymm0 <- two output rows (one per lane)

+    vmovq           xmm4, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    vpunpcklbw      xmm3, xmm3, xmm4

+    vmovq           xmm6, [p_src]

+    vpunpcklbw      xmm4, xmm4, xmm6

+    vinserti128     ymm3, ymm3, xmm4, 1

+    AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4  ; ymm1 <- the following two output rows

+    vpackuswb       ymm0, ymm0, ymm1  ; lane 0 = {row k, row k+2}, lane 1 = {row k+1, row k+3}

+    vmovlps         [p_dst], xmm0

+    vextracti128    xmm1, ymm0, 1

+    vmovlps         [p_dst + i_dststride], xmm1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    vmovhps         [p_dst], xmm0

+    vmovhps         [p_dst + i_dststride], xmm1

+    cmp             i_height, 4

+    jl              .width8_done  ; fewer than 4 left in the count: finished

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    vmovq           xmm4, [p_src + i_srcstride]

+    vpunpcklbw      xmm0, xmm6, xmm4

+    jg              .width8_height_ge8  ; still the cmp i_height, 4 flags (lea and the vector ops do not write EFLAGS)

+    AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [maddubsw_p1m5_256], xmm5, [maddubsw_m5p1_256], xmm4  ; count == 4: one final row

+    vpackuswb       xmm2, xmm2, xmm2

+    vmovlps         [p_dst], xmm2

+    jmp             .width8_done

+.width8_height_ge8:

+    vmovq           xmm1, [p_src + 2 * i_srcstride]

+    vpunpcklbw      xmm4, xmm4, xmm1

+    vinserti128     ymm0, ymm0, xmm4, 1

+    AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4

+    vmovq           xmm4, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    vpunpcklbw      xmm1, xmm1, xmm4

+    vmovq           xmm6, [p_src]

+    vpunpcklbw      xmm4, xmm4, xmm6

+    vinserti128     ymm1, ymm1, xmm4, 1

+    AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4

+    vpackuswb       ymm2, ymm2, ymm3  ; same lane layout as the first four rows

+    vmovlps         [p_dst], xmm2

+    vextracti128    xmm3, ymm2, 1

+    vmovlps         [p_dst + i_dststride], xmm3

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    vmovhps         [p_dst], xmm2

+    vmovhps         [p_dst + i_dststride], xmm3

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    sub             i_height, 8  ; eight rows written per full iteration

+    jg              .width8_yloop

+    jl              .width8_done

+    vmovq           xmm4, [p_src + i_srcstride]  ; count hit exactly zero: one final row remains

+    vpunpcklbw      xmm2, xmm6, xmm4

+    AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [maddubsw_p1m5_256], xmm5, [maddubsw_m5p1_256], xmm4

+    vpackuswb       xmm0, xmm0, xmm0

+    vmovlps         [p_dst], xmm0

+.width8_done:

+    vzeroupper

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop             r6

+%endif

+    ret

+.width16:

+    sub             i_height, 1  ; this path counts with iHeight - 1

+    test            i_height, 1

+    jnz             .width16_yloop_begin_even  ; iHeight - 1 odd, i.e. iHeight even: no leading single row

+    vmovq           xmm0, [p_src]  ; row 0, pixels 0..7 into lane 0 ...

+    vpbroadcastq    ymm1, [p_src + 8]

+    vpblendd        ymm0, ymm0, ymm1, 11110000b  ; ... and pixels 8..15 into lane 1

+    vmovq           xmm1, [p_src + i_srcstride]

+    vpbroadcastq    ymm2, [p_src + i_srcstride + 8]

+    vpblendd        ymm1, ymm1, ymm2, 11110000b

+    vpunpcklbw      ymm0, ymm0, ymm1  ; rows 0/1 byte-interleaved

+    vmovq           xmm2, [p_src + 2 * i_srcstride]

+    vpbroadcastq    ymm3, [p_src + 2 * i_srcstride + 8]

+    vpblendd        ymm2, ymm2, ymm3, 11110000b

+    vmovq           xmm3, [p_src + i_srcstride3]

+    vpbroadcastq    ymm4, [p_src + i_srcstride3 + 8]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    vpblendd        ymm3, ymm3, ymm4, 11110000b

+    vpunpcklbw      ymm2, ymm2, ymm3  ; rows 2/3 interleaved

+    vmovq           xmm4, [p_src]

+    vpbroadcastq    ymm5, [p_src + 8]

+    vpblendd        ymm4, ymm4, ymm5, 11110000b

+    vmovq           xmm5, [p_src + i_srcstride]

+    vpbroadcastq    ymm6, [p_src + i_srcstride + 8]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    vpblendd        ymm5, ymm5, ymm6, 11110000b

+    vpunpcklbw      ymm4, ymm4, ymm5  ; rows 4/5 interleaved

+    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm7  ; ymm0 <- the leading single output row

+    vpackuswb       ymm0, ymm0, ymm0

+    vpermq          ymm0, ymm0, 1000b  ; gather qwords 0 and 2 so xmm0 holds pixels 0..15

+    vmovdqa         [p_dst], xmm0  ; aligned 16-byte store

+    add             p_dst, i_dststride

+    jmp             .width16_yloop

+.width16_yloop_begin_even:

+    vmovq           xmm1, [p_src]  ; row 0, kept un-interleaved in ymm1 for the first Vertical2 call

+    vpbroadcastq    ymm2, [p_src + 8]

+    vpblendd        ymm1, ymm1, ymm2, 11110000b

+    vmovq           xmm2, [p_src + i_srcstride]

+    vpbroadcastq    ymm3, [p_src + i_srcstride + 8]

+    vpblendd        ymm2, ymm2, ymm3, 11110000b

+    vmovq           xmm3, [p_src + 2 * i_srcstride]

+    vpbroadcastq    ymm4, [p_src + 2 * i_srcstride + 8]

+    add             p_src, i_srcstride3

+    vpblendd        ymm3, ymm3, ymm4, 11110000b

+    vpunpcklbw      ymm2, ymm2, ymm3  ; rows 1/2 interleaved

+    vmovq           xmm4, [p_src]

+    vpbroadcastq    ymm5, [p_src + 8]

+    vpblendd        ymm4, ymm4, ymm5, 11110000b

+    vmovq           xmm5, [p_src + i_srcstride]

+    vpbroadcastq    ymm6, [p_src + i_srcstride + 8]

+    lea             p_src, [p_src + 2 * i_srcstride]

+    vpblendd        ymm5, ymm5, ymm6, 11110000b

+    vpunpcklbw      ymm4, ymm4, ymm5  ; rows 3/4 interleaved

+.width16_yloop:

+    vmovq           xmm6, [p_src]  ; every iteration writes 8 rows (4 pairs); NOTE(review): this appears to assume iHeight is a multiple of 8, or one more - confirm against callers

+    vpbroadcastq    ymm7, [p_src + 8]

+    vpblendd        ymm6, ymm6, ymm7, 11110000b

+    AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm0, ymm7  ; first row of the pair: outer taps raw, inner taps interleaved

+    vmovq           xmm7, [p_src + i_srcstride]

+    vpbroadcastq    ymm0, [p_src + i_srcstride + 8]

+    vpblendd        ymm7, ymm7, ymm0, 11110000b

+    vpunpcklbw      ymm6, ymm6, ymm7

+    AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm0  ; second row of the pair: three interleaved row pairs

+    vpackuswb       ymm1, ymm1, ymm2

+    vpermq          ymm1, ymm1, 11011000b  ; swap the middle qwords so each 128-bit lane holds one full 16-pixel row

+    vmovdqa         [p_dst], xmm1  ; aligned 16-byte store

+    vextracti128    [p_dst + i_dststride], ymm1, 1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    vmovq           xmm0, [p_src + 2 * i_srcstride]

+    vpbroadcastq    ymm1, [p_src + 2 * i_srcstride + 8]

+    vpblendd        ymm0, ymm0, ymm1, 11110000b

+    AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm2, ymm1  ; second pair; register roles rotate from here on

+    vmovq           xmm1, [p_src + i_srcstride3]

+    vpbroadcastq    ymm2, [p_src + i_srcstride3 + 8]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    vpblendd        ymm1, ymm1, ymm2, 11110000b

+    vpunpcklbw      ymm0, ymm0, ymm1

+    AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm2

+    vpackuswb       ymm3, ymm3, ymm4

+    vpermq          ymm3, ymm3, 11011000b

+    vmovdqa         [p_dst], xmm3

+    vextracti128    [p_dst + i_dststride], ymm3, 1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    vmovq           xmm2, [p_src]

+    vpbroadcastq    ymm3, [p_src + 8]

+    vpblendd        ymm2, ymm2, ymm3, 11110000b

+    AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm4, ymm3  ; third pair

+    vmovq           xmm3, [p_src + i_srcstride]

+    vpbroadcastq    ymm4, [p_src + i_srcstride + 8]

+    vpblendd        ymm3, ymm3, ymm4, 11110000b

+    vpunpcklbw      ymm2, ymm2, ymm3

+    AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm4

+    vpackuswb       ymm5, ymm5, ymm6

+    vpermq          ymm5, ymm5, 11011000b

+    vmovdqa         [p_dst], xmm5

+    vextracti128    [p_dst + i_dststride], ymm5, 1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    vmovq           xmm4, [p_src + 2 * i_srcstride]

+    vpbroadcastq    ymm5, [p_src + 2 * i_srcstride + 8]

+    vpblendd        ymm4, ymm4, ymm5, 11110000b

+    AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm6, ymm5  ; fourth pair

+    vmovq           xmm5, [p_src + i_srcstride3]

+    vpbroadcastq    ymm6, [p_src + i_srcstride3 + 8]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    vpblendd        ymm5, ymm5, ymm6, 11110000b

+    vpunpcklbw      ymm4, ymm4, ymm5

+    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm6

+    vpackuswb       ymm7, ymm7, ymm0

+    vpermq          ymm7, ymm7, 11011000b

+    vmovdqa         [p_dst], xmm7

+    vextracti128    [p_dst + i_dststride], ymm7, 1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    sub             i_height, 8  ; eight rows written per iteration

+    jg              .width16_yloop

+    vzeroupper

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop             r6

+%endif

+    ret

+%undef p_src

+%undef i_srcstride

+%undef i_srcstride3

+%undef p_dst

+%undef i_dststride

+%undef i_width

+%undef i_height

+%undef i_ycnt

+;*******************************************************************************

+; void McHorVer20_avx2(const uint8_t *pSrc,

+;                      int iSrcStride,

+;                      uint8_t *pDst,      -- the width-16 path stores with vmovdqa, so pDst rows must be 16-byte aligned there

+;                      int iDstStride,

+;                      int iWidth,         -- below 8 takes the width-4 loop, 8 the width-8 loop, above 8 the width-16 loop

+;                      int iHeight);       -- width-4 and width-8 loops write 4 rows per iteration, the width-16 loop 2

+;*******************************************************************************

+WELS_EXTERN McHorVer20_avx2  ; horizontal filter pass: uint8 rows in, uint8 rows out

+%define p_src        r0

+%define i_srcstride  r1

+%define p_dst        r2

+%define i_dststride  r3

+%define i_width      r4

+%define i_height     r5

+    %assign  push_num 0

+    LOAD_6_PARA

+    PUSH_XMM 7

+    SIGN_EXTENSION  r1, r1d

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    SIGN_EXTENSION  r5, r5d

+    vbroadcasti128  ymm4, [shufb_32435465768798A9]  ; ymm4..ymm6: shuffle masks and coefficients passed to the filter macros

+    vbroadcasti128  ymm5, [shufb_011267784556ABBC]

+    vbroadcasti128  ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]

+    cmp             i_width, 8

+    je              .width8

+    jg              .width16_yloop  ; any width above 8 is handled as 16

+%xdefine i_srcstride3 i_width

+%undef i_width

+    lea             i_srcstride3, [3 * i_srcstride]  ; i_srcstride3 reuses the register that held i_width (see the %xdefine above)

+.width4_yloop:

+    vmovdqu         xmm0, [p_src - 2]  ; row 0, starting 2 pixels left of p_src

+    vmovdqu         xmm1, [p_src + i_srcstride - 2]  ; row 1

+    vinserti128     ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1  ; row 2 into the upper lane

+    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1  ; row 3 into the upper lane

+    lea             p_src, [p_src + 4 * i_srcstride]

+    AVX2_FilterHorizontal_4x4px ymm0, ymm1, ymm4, ymm5, ymm6, ymm2, ymm3  ; ymm0 <- 4 rows x 4 pixels as words: lane 0 = rows 0,1; lane 1 = rows 2,3

+    vpackuswb       ymm0, ymm0, ymm0  ; saturate to bytes

+    vmovd           [p_dst], xmm0  ; row 0

+    vpsrlq          xmm1, xmm0, 32

+    vmovd           [p_dst + i_dststride], xmm1  ; row 1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    vextracti128    xmm0, ymm0, 1

+    vmovd           [p_dst], xmm0  ; row 2

+    vpsrlq          xmm1, xmm0, 32

+    vmovd           [p_dst + i_dststride], xmm1  ; row 3

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    sub             i_height, 4  ; four rows per iteration

+    jg              .width4_yloop

+    vzeroupper

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+.width8:

+    lea             i_srcstride3, [3 * i_srcstride]  ; same register reuse as the width-4 path

+.width8_yloop:

+    vmovdqu         xmm0, [p_src - 2]  ; ymm0 = {row 0, row 2}

+    vmovdqu         xmm1, [p_src + i_srcstride - 2]  ; ymm1 = {row 1, row 3}

+    vinserti128     ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1

+    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1

+    lea             p_src, [p_src + 4 * i_srcstride]

+    AVX2_FilterHorizontal_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3

+    AVX2_FilterHorizontal_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3

+    vpackuswb       ymm0, ymm0, ymm1  ; lane 0 = {row 0, row 1}, lane 1 = {row 2, row 3}

+    vmovlps         [p_dst], xmm0  ; row 0

+    vmovhps         [p_dst + i_dststride], xmm0  ; row 1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    vextracti128    xmm0, ymm0, 1

+    vmovlps         [p_dst], xmm0  ; row 2

+    vmovhps         [p_dst + i_dststride], xmm0  ; row 3

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    sub             i_height, 4  ; four rows per iteration

+    jg              .width8_yloop

+    vzeroupper

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+%undef i_srcstride3

+.width16_yloop:

+    vmovdqu         xmm0, [p_src - 2]  ; ymm0 = source windows for pixels 0..7 of rows 0 and 1

+    vmovdqu         xmm1, [p_src + 6]  ; ymm1 = source windows for pixels 8..15 of rows 0 and 1

+    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1

+    vinserti128     ymm1, ymm1, [p_src + i_srcstride + 6], 1

+    lea             p_src, [p_src + 2 * i_srcstride]

+    AVX2_FilterHorizontal_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3

+    AVX2_FilterHorizontal_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3

+    vpackuswb       ymm0, ymm0, ymm1  ; lane 0 = row 0 (16 pixels), lane 1 = row 1

+    vmovdqa         [p_dst], xmm0  ; aligned 16-byte store

+    vextracti128    [p_dst + i_dststride], ymm0, 1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    sub             i_height, 2  ; two rows per iteration

+    jg              .width16_yloop

+    vzeroupper

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_width

+%undef i_height

+;***********************************************************************

+; void McHorVer20Width5Or9Or17_avx2(const uint8_t *pSrc,

+;                                   int32_t iSrcStride,

+;                                   uint8_t *pDst,

+;                                   int32_t iDstStride,

+;                                   int32_t iWidth,

+;                                   int32_t iHeight);

+;***********************************************************************

+WELS_EXTERN McHorVer20Width5Or9Or17_avx2  ; horizontally filters rows and writes 5-, 9- or 17-byte-wide output rows; the filter macros are defined outside this excerpt (presumably the H.264 6-tap half-pel filter -- confirm)

+%define p_src        r0  ; arg 0: source; every row is loaded starting at p_src - 2

+%define i_srcstride  r1  ; arg 1: bytes between source rows

+%define p_dst        r2  ; arg 2: destination

+%define i_dststride  r3  ; arg 3: bytes between destination rows

+%define i_width      r4  ; arg 4: only used to pick a path (< 9, == 9, > 9)

+%define i_height     r5  ; arg 5: remaining-row counter

+    %assign  push_num 0

+    LOAD_6_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r1, r1d  ; presumably widens the int32 arguments to register width (macro not shown)

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    SIGN_EXTENSION  r5, r5d

+    vbroadcasti128  ymm5, [shufb_32435465768798A9]  ; the three constants are replicated to both lanes and passed to the 16px filter macro

+    vbroadcasti128  ymm6, [shufb_011267784556ABBC]

+    vbroadcasti128  ymm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]

+    cmp             i_width, 9

+    je              .width9

+    jg              .width17  ; anything below 9 falls through to the 5-wide path

+.width5_yloop:  ; two rows per iteration: row 0 in the low lane, row 1 in the high lane

+    vmovdqu         xmm0, [p_src - 2]

+    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1

+    lea             p_src, [p_src + 2 * i_srcstride]

+    AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2  ; the result is read back from ymm0; ymm1/ymm2 are presumably scratch

+    vpackuswb       ymm0, ymm0, ymm0  ; words -> bytes with unsigned saturation

+    vpsrlq          xmm1, xmm0, 8  ; shift right by one byte so bytes 1..4 sit in the low dword

+    vmovd           [p_dst + 1], xmm1  ; columns 1..4

+    vmovd           [p_dst], xmm0  ; columns 0..3; the two overlapping 4-byte stores cover 5 columns

+    add             p_dst, i_dststride

+    vextracti128    xmm0, ymm0, 1  ; second row of the pair

+    vpsrlq          xmm1, xmm0, 8

+    vmovd           [p_dst + 1], xmm1

+    vmovd           [p_dst], xmm0

+    add             p_dst, i_dststride

+    sub             i_height, 2

+    jg              .width5_yloop  ; NOTE(review): rows are always written in pairs, so an odd height writes one extra row -- confirm callers

+    vzeroupper

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+.width9:

+%xdefine i_srcstride3 i_width  ; i_width is not needed after the dispatch, so its register is reused for 3 * stride

+%undef i_width

+    lea             i_srcstride3, [3 * i_srcstride]

+.width9_yloop:  ; four rows per iteration: ymm0 = rows 0|2, ymm4 = rows 1|3 (low|high lane)

+    vmovdqu         xmm0, [p_src - 2]

+    vmovdqu         xmm4, [p_src + i_srcstride - 2]

+    vinserti128     ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1

+    vinserti128     ymm4, ymm4, [p_src + i_srcstride3 - 2], 1

+    lea             p_src, [p_src + 4 * i_srcstride]

+    vpunpckhqdq     ymm3, ymm0, ymm4  ; per lane: source bytes +6..+13 of rows 0,1 (low lane) and rows 2,3 (high lane)

+    AVX2_FilterHorizontal_4px ymm3, ymm2  ; right-edge columns of all four rows (macro not shown)

+    AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2

+    vpackuswb       ymm3, ymm3, ymm0  ; per lane: low qword = right-edge bytes, high qword = 8 bytes from ymm0

+    vmovd           [p_dst + 5], xmm3  ; row 0, columns 5..8

+    vmovhps         [p_dst], xmm3  ; row 0, columns 0..7; column 8 keeps the byte stored just above

+    add             p_dst, i_dststride

+    AVX2_FilterHorizontal_16px ymm4, ymm5, ymm6, ymm7, ymm1, ymm2

+    vpackuswb       ymm4, ymm4, ymm4

+    vpsrlq          xmm2, xmm3, 32  ; second dword of the low qword: row 1's right-edge bytes

+    vmovd           [p_dst + 5], xmm2

+    vmovlps         [p_dst], xmm4  ; row 1, columns 0..7

+    add             p_dst, i_dststride

+    vextracti128    xmm3, ymm3, 1  ; high lanes hold rows 2 and 3; the stores below mirror rows 0 and 1

+    vextracti128    xmm4, ymm4, 1

+    vmovd           [p_dst + 5], xmm3

+    vmovhps         [p_dst], xmm3

+    add             p_dst, i_dststride

+    vpsrlq          xmm2, xmm3, 32

+    vmovd           [p_dst + 5], xmm2

+    vmovlps         [p_dst], xmm4

+    add             p_dst, i_dststride

+    sub             i_height, 4

+    jg              .width9_yloop  ; NOTE(review): rows are written four at a time -- presumably height is a multiple of 4 here; confirm

+    vzeroupper

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+.width17:

+    lea             i_srcstride3, [3 * i_srcstride]  ; i_srcstride3 is still the alias set up textually under .width9

+.width17_yloop:  ; four rows per iteration, handled as two pairs

+    vmovdqu         xmm0, [p_src - 2]  ; row 0, source bytes -2..13

+    vmovdqu         xmm3, [p_src + 6]  ; row 0, source bytes 6..21

+    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1  ; row 1 goes to the high lanes

+    vinserti128     ymm3, ymm3, [p_src + i_srcstride + 6], 1

+    vmovdqa         ymm4, ymm3  ; keep an unfiltered copy before ymm3 is handed to the filter

+    AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2

+    AVX2_FilterHorizontal_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2

+    vpackuswb       ymm0, ymm0, ymm3  ; 16 bytes per row: row 0 in the low lane, row 1 in the high lane

+    vmovdqu         xmm1, [p_src + 2 * i_srcstride - 2]  ; rows 2 and 3, loaded the same way

+    vmovdqu         xmm3, [p_src + 2 * i_srcstride + 6]

+    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1

+    vinserti128     ymm3, ymm3, [p_src + i_srcstride3 + 6], 1

+    lea             p_src, [p_src + 4 * i_srcstride]

+    vpunpckhqdq     ymm4, ymm4, ymm3  ; per lane: source bytes 14..21 of rows 0,2 (low lane) and rows 1,3 (high lane)

+    AVX2_FilterHorizontal_4px ymm4, ymm2  ; right-edge columns of all four rows (macro not shown)

+    vpackuswb       ymm4, ymm4, ymm4

+    vmovd           [p_dst + 13], xmm4  ; row 0, columns 13..16

+    vmovdqa         [p_dst], xmm0  ; row 0, columns 0..15 (16-byte aligned store); column 16 keeps the byte stored just above

+    add             p_dst, i_dststride

+    vextracti128    xmm2, ymm4, 1  ; right-edge bytes of rows 1 and 3

+    vmovd           [p_dst + 13], xmm2

+    vextracti128    [p_dst], ymm0, 1  ; row 1, columns 0..15

+    add             p_dst, i_dststride

+    vpsrlq          xmm4, xmm4, 32  ; row 2's right-edge bytes

+    vmovd           [p_dst + 13], xmm4

+    AVX2_FilterHorizontal_16px ymm1, ymm5, ymm6, ymm7, ymm0, ymm4  ; ymm0/ymm4 are free by now and serve as the scratch operands

+    AVX2_FilterHorizontal_16px ymm3, ymm5, ymm6, ymm7, ymm0, ymm4

+    vpackuswb       ymm1, ymm1, ymm3

+    vmovdqa         [p_dst], xmm1  ; row 2, columns 0..15

+    add             p_dst, i_dststride

+    vpsrlq          xmm2, xmm2, 32  ; row 3's right-edge bytes

+    vmovd           [p_dst + 13], xmm2

+    vextracti128    [p_dst], ymm1, 1  ; row 3, columns 0..15

+    add             p_dst, i_dststride

+    sub             i_height, 4

+    jg              .width17_yloop

+    vzeroupper

+    POP_XMM

+    LOAD_6_PARA_POP

+    ret

+%undef i_srcstride3

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_width

+%undef i_height

+;*******************************************************************************

+; void McHorVer20Width4U8ToS16_avx2(const uint8_t *pSrc,

+;                                   int iSrcStride,

+;                                   int16_t *pDst,

+;                                   int iHeight);

+;*******************************************************************************

+WELS_EXTERN McHorVer20Width4U8ToS16_avx2  ; horizontally filters 4-pixel-wide rows into a packed int16 buffer (8 bytes per row); filter macros are defined outside this excerpt

+%define p_src        r0  ; arg 0: source; reading starts two rows above this pointer

+%define i_srcstride  r1  ; arg 1: bytes between source rows

+%define p_dst        r2  ; arg 2: destination; written with an aligned 32-byte store in the loop

+%define i_height     r3  ; arg 3: remaining-row counter

+%define i_srcstride3 r4  ; scratch register holding 3 * stride

+%define i_dststride   8  ; destination rows are a fixed 8 bytes apart

+    %assign  push_num 0

+%ifdef X86_32

+    push            r4  ; r4 is used as scratch: saved here and restored before ret on 32-bit builds

+    %assign  push_num 1

+%endif

+    LOAD_4_PARA

+    PUSH_XMM 7

+    SIGN_EXTENSION  r1, r1d  ; presumably widens the int32 arguments to register width (macro not shown)

+    SIGN_EXTENSION  r3, r3d

+    sub             p_src, i_srcstride  ; step back two rows

+    sub             p_src, i_srcstride

+    lea             i_srcstride3, [3 * i_srcstride]

+    vbroadcasti128  ymm4, [shufb_32435465768798A9]

+    vbroadcasti128  ymm5, [shufb_011267784556ABBC]

+    vbroadcasti128  ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]

+    sub             i_height, 3  ; bias the counter: the tail below always handles one row after the 4-row loop

+.yloop:  ; four rows per iteration: ymm0 = rows 0|2, ymm1 = rows 1|3 (low|high lane)

+    vmovdqu         xmm0, [p_src - 2]

+    vmovdqu         xmm1, [p_src + i_srcstride - 2]

+    vinserti128     ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1

+    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1

+    lea             p_src, [p_src + 4 * i_srcstride]

+    AVX2_FilterHorizontalbw_4x4px ymm0, ymm1, ymm4, ymm5, ymm6, ymm2, ymm3  ; the result is read back from ymm0 (macro not shown)

+    vmovdqa         [p_dst], ymm0  ; 32 bytes = four 8-byte output rows; needs a 32-byte aligned p_dst

+    add             p_dst, 4 * i_dststride

+    sub             i_height, 4

+    jg              .yloop

+    ; Height % 4 remaining single.

+    vmovdqu         xmm0, [p_src - 2]  ; NOTE(review): this tail is unconditional -- presumably height is always 4k + 1; confirm callers

+    AVX2_FilterHorizontalbw_16px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3

+    vmovlps         [p_dst], xmm0  ; low 8 bytes = one output row

+    vzeroupper

+    POP_XMM

+    LOAD_4_PARA_POP

+%ifdef X86_32

+    pop             r4

+%endif

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_height

+%undef i_srcstride3

+%undef i_dststride

+;***********************************************************************

+; void McHorVer02Width4S16ToU8_avx2(const int16_t *pSrc,

+;                                   uint8_t *pDst,

+;                                   int32_t iDstStride,

+;                                   int32_t iHeight);

+;***********************************************************************

+WELS_EXTERN McHorVer02Width4S16ToU8_avx2  ; vertically filters a packed int16 buffer (8 bytes per row) into 4-byte-wide uint8 rows; writes 4 rows, or 8 when iHeight > 4

+%define p_src        r0  ; arg 0: int16 source rows, 8 bytes apart

+%define p_dst        r1  ; arg 1: destination

+%define i_dststride  r2  ; arg 2: bytes between destination rows

+%define i_height     r3  ; arg 3: only compared against 4

+%define i_dststride3 r4  ; scratch register holding 3 * stride

+%define i_srcstride  8  ; fixed source row pitch in bytes

+    %assign  push_num 0

+%ifdef X86_32

+    push            r4  ; r4 is used as scratch: saved here and restored before ret on 32-bit builds

+    %assign  push_num 1

+%endif

+    LOAD_4_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r2, r2d  ; presumably widens the int32 arguments to register width (macro not shown)

+    SIGN_EXTENSION  r3, r3d

+    lea             i_dststride3, [3 * i_dststride]

+    vmovdqu         ymm0, [p_src +  0 * i_srcstride]  ; each 32-byte load spans four 8-byte rows: ymm0 = rows 0..3

+    vmovdqu         ymm1, [p_src +  1 * i_srcstride]  ; rows 1..4

+    vmovdqu         ymm2, [p_src +  2 * i_srcstride]  ; rows 2..5

+    vmovdqu         ymm3, [p_src +  3 * i_srcstride]  ; rows 3..6

+    vmovdqu         ymm4, [p_src +  4 * i_srcstride]  ; rows 4..7

+    vmovdqu         ymm5, [p_src +  5 * i_srcstride]  ; rows 5..8

+    vmovdqu         ymm6, [p_src +  6 * i_srcstride]  ; rows 6..9: only needed by the second half, but loaded unconditionally

+    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7  ; six row-shifted inputs; the result is read back from ymm0 (macro not shown)

+    vpackuswb       ymm0, ymm0, ymm0  ; words -> bytes with unsigned saturation

+    vmovd           [p_dst], xmm0  ; output row 0

+    vpsrlq          xmm7, xmm0, 32

+    vmovd           [p_dst + i_dststride], xmm7  ; output row 1

+    vextracti128    xmm0, ymm0, 1

+    vmovd           [p_dst + 2 * i_dststride], xmm0  ; output row 2

+    vpsrlq          xmm7, xmm0, 32

+    vmovd           [p_dst + i_dststride3], xmm7  ; output row 3

+    cmp             i_height, 4

+    jle             .done  ; nothing more to do for heights up to 4

+    lea             p_dst, [p_dst + 4 * i_dststride]

+    vmovdqu         ymm7, [p_src +  7 * i_srcstride]  ; rows 7..10

+    vmovdqu         ymm0, [p_src +  8 * i_srcstride]  ; rows 8..11

+    vmovdqu         ymm1, [p_src +  9 * i_srcstride]  ; rows 9..12

+    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3  ; same filter on inputs starting four rows lower; result in ymm4

+    vpackuswb       ymm4, ymm4, ymm4

+    vmovd           [p_dst], xmm4  ; output rows 4..7, stored like rows 0..3

+    vpsrlq          xmm3, xmm4, 32

+    vmovd           [p_dst + i_dststride], xmm3

+    vextracti128    xmm4, ymm4, 1

+    vmovd           [p_dst + 2 * i_dststride], xmm4

+    vpsrlq          xmm3, xmm4, 32

+    vmovd           [p_dst + i_dststride3], xmm3

+.done:

+    vzeroupper

+    POP_XMM

+    LOAD_4_PARA_POP

+%ifdef X86_32

+    pop             r4

+%endif

+    ret

+%undef p_src

+%undef p_dst

+%undef i_dststride

+%undef i_height

+%undef i_srcstride

+%undef i_dststride3

+;*******************************************************************************

+; void McHorVer20Width8U8ToS16_avx2(const uint8_t *pSrc,

+;                                   int iSrcStride,

+;                                   int16_t *pDst,

+;                                   int iHeight);

+;*******************************************************************************

+WELS_EXTERN McHorVer20Width8U8ToS16_avx2  ; horizontally filters 8-pixel-wide rows into a packed int16 buffer (16 bytes per row); filter macros are defined outside this excerpt

+%define p_src        r0  ; arg 0: source; reading starts two rows above this pointer

+%define i_srcstride  r1  ; arg 1: bytes between source rows

+%define p_dst        r2  ; arg 2: destination

+%define i_height     r3  ; arg 3: remaining-row counter

+%define i_dststride  16  ; destination rows are a fixed 16 bytes apart

+    %assign  push_num 0

+    LOAD_4_PARA

+    PUSH_XMM 6

+    SIGN_EXTENSION  r1, r1d  ; presumably widens the int32 arguments to register width (macro not shown)

+    SIGN_EXTENSION  r3, r3d

+    sub             p_src, i_srcstride  ; step back two rows

+    sub             p_src, i_srcstride

+    vbroadcasti128  ymm3, [shufb_32435465768798A9]

+    vbroadcasti128  ymm4, [shufb_011267784556ABBC]

+    vbroadcasti128  ymm5, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]

+    sub             i_height, 1  ; bias so the loop ends on exactly 0 when one odd row is left over

+.yloop:  ; two rows per iteration: row 0 in the low lane, row 1 in the high lane

+    vmovdqu         xmm0, [p_src - 2]

+    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1

+    lea             p_src, [p_src + 2 * i_srcstride]

+    AVX2_FilterHorizontalbw_16px ymm0, ymm3, ymm4, ymm5, ymm1, ymm2  ; the result is read back from ymm0 (macro not shown)

+    vmovdqu         [p_dst], ymm0  ; two 16-byte output rows with one unaligned store

+    add             p_dst, 2 * i_dststride

+    sub             i_height, 2

+    jg              .yloop

+    jl              .done  ; counter went negative: height was even, nothing left

+    vmovdqu         xmm0, [p_src - 2]  ; counter hit zero: one last row remains

+    AVX2_FilterHorizontalbw_16px xmm0, xmm3, xmm4, xmm5, xmm1, xmm2

+    vmovdqa         [p_dst], xmm0  ; aligned 16-byte store of the final row

+.done:

+    vzeroupper

+    POP_XMM

+    LOAD_4_PARA_POP

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_height

+%undef i_dststride

+;***********************************************************************

+; void McHorVer02Width5S16ToU8_avx2(const int16_t *pSrc,

+;                                   uint8_t *pDst,

+;                                   int32_t iDstStride,

+;                                   int32_t iHeight);

+;***********************************************************************

+WELS_EXTERN McHorVer02Width5S16ToU8_avx2  ; vertically filters a packed int16 buffer (16 bytes per row) into 5-byte-wide uint8 rows; writes 5 rows, or 9 when iHeight > 5

+%define p_src        r0  ; arg 0: int16 source rows, 16 bytes apart

+%define p_dst        r1  ; arg 1: destination

+%define i_dststride  r2  ; arg 2: bytes between destination rows

+%define i_height     r3  ; arg 3: only compared against 5

+%define i_srcstride  16  ; fixed source row pitch in bytes

+    %assign  push_num 0

+    LOAD_4_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r2, r2d  ; presumably widens the int32 arguments to register width (macro not shown)

+    SIGN_EXTENSION  r3, r3d

+    vmovdqu         ymm0, [p_src +  0 * i_srcstride]  ; each 32-byte load spans two rows: ymm0 = rows 0,1

+    vmovdqu         ymm2, [p_src +  2 * i_srcstride]  ; rows 2,3

+    vmovdqu         ymm4, [p_src +  4 * i_srcstride]  ; rows 4,5

+    vmovdqu         ymm6, [p_src +  6 * i_srcstride]  ; rows 6,7

+    vperm2i128      ymm1, ymm0, ymm2, 00100001b  ; high lane of the first source | low lane of the second: rows 1,2

+    vperm2i128      ymm3, ymm2, ymm4, 00100001b  ; rows 3,4

+    vperm2i128      ymm5, ymm4, ymm6, 00100001b  ; rows 5,6

+    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7  ; six row-shifted inputs; the result is read back from ymm0 (macro not shown)

+    vpackuswb       ymm0, ymm0, ymm0  ; words -> bytes with unsigned saturation

+    vpsrlq          xmm7, xmm0, 8  ; shift right by one byte so bytes 1..4 sit in the low dword

+    vmovd           [p_dst + 1], xmm7  ; output row 0, columns 1..4

+    vmovd           [p_dst], xmm0  ; columns 0..3; the two overlapping 4-byte stores cover 5 columns

+    add             p_dst, i_dststride

+    vextracti128    xmm0, ymm0, 1  ; output row 1

+    vpsrlq          xmm7, xmm0, 8

+    vmovd           [p_dst + 1], xmm7

+    vmovd           [p_dst], xmm0

+    add             p_dst, i_dststride

+    vmovdqu         ymm7, [p_src +  7 * i_srcstride]  ; rows 7,8

+    vmovdqu         ymm0, [p_src +  8 * i_srcstride]  ; rows 8,9

+    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm1  ; inputs start two rows lower -> output rows 2,3 in ymm2

+    vpackuswb       ymm2, ymm2, ymm2

+    vpsrlq          xmm1, xmm2, 8

+    vmovd           [p_dst + 1], xmm1

+    vmovd           [p_dst], xmm2

+    add             p_dst, i_dststride

+    vextracti128    xmm2, ymm2, 1

+    vpsrlq          xmm1, xmm2, 8

+    vmovd           [p_dst + 1], xmm1

+    vmovd           [p_dst], xmm2

+    add             p_dst, i_dststride

+    vmovdqu         ymm1, [p_src +  9 * i_srcstride]  ; rows 9,10

+    vmovdqu         ymm2, [p_src + 10 * i_srcstride]  ; rows 10,11

+    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3  ; output rows 4,5 in ymm4

+    vpackuswb       ymm4, ymm4, ymm4

+    vpsrlq          xmm3, xmm4, 8

+    vmovd           [p_dst + 1], xmm3

+    vmovd           [p_dst], xmm4  ; output row 4

+    cmp             i_height, 5

+    jle             .done  ; heights up to 5 stop after the fifth row

+    add             p_dst, i_dststride

+    vextracti128    xmm4, ymm4, 1  ; output row 5

+    vpsrlq          xmm3, xmm4, 8

+    vmovd           [p_dst + 1], xmm3

+    vmovd           [p_dst], xmm4

+    add             p_dst, i_dststride

+    vmovdqu         ymm3, [p_src + 11 * i_srcstride]  ; rows 11,12

+    vmovdqu         xmm4, [p_src + 12 * i_srcstride]  ; row 12 only (16-byte load)

+    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm5  ; output rows 6,7 in ymm6

+    vpackuswb       ymm6, ymm6, ymm6

+    vpsrlq          xmm5, xmm6, 8

+    vmovd           [p_dst + 1], xmm5

+    vmovd           [p_dst], xmm6

+    add             p_dst, i_dststride

+    vextracti128    xmm6, ymm6, 1

+    vpsrlq          xmm5, xmm6, 8

+    vmovd           [p_dst + 1], xmm5

+    vmovd           [p_dst], xmm6

+    add             p_dst, i_dststride

+    vmovdqu         xmm5, [p_src + 13 * i_srcstride]  ; row 13

+    AVX2_FilterVerticalw_16px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7  ; 128-bit call: the low lanes hold rows 8..13 -> output row 8

+    vpackuswb       xmm0, xmm0, xmm0

+    vpsrlq          xmm7, xmm0, 8

+    vmovd           [p_dst + 1], xmm7

+    vmovd           [p_dst], xmm0

+.done:

+    vzeroupper

+    POP_XMM

+    LOAD_4_PARA_POP

+    ret

+%undef p_src

+%undef p_dst

+%undef i_dststride

+%undef i_height

+%undef i_srcstride

+;***********************************************************************

+; void McHorVer02Width8S16ToU8_avx2(const int16_t *pSrc,

+;                                   uint8_t *pDst,

+;                                   int32_t iDstStride,

+;                                   int32_t iHeight);

+;***********************************************************************

+WELS_EXTERN McHorVer02Width8S16ToU8_avx2  ; vertically filters a packed int16 buffer (16 bytes per row) into 8-byte-wide uint8 rows, eight rows per loop pass with an early exit after four

+%define p_src        r0  ; arg 0: int16 source rows, 16 bytes apart; even-row pairs are read with aligned 32-byte loads

+%define p_dst        r1  ; arg 1: destination

+%define i_dststride  r2  ; arg 2: bytes between destination rows

+%define i_height     r3  ; arg 3: remaining-row counter

+%define i_dststride3 r4  ; scratch register holding 3 * stride

+%define i_srcstride  16  ; fixed source row pitch in bytes

+    %assign  push_num 0

+%ifdef X86_32

+    push            r4  ; r4 is used as scratch: saved here and restored before ret on 32-bit builds

+    %assign  push_num 1

+%endif

+    LOAD_4_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r2, r2d  ; presumably widens the int32 arguments to register width (macro not shown)

+    SIGN_EXTENSION  r3, r3d

+    lea             i_dststride3, [3 * i_dststride]

+    vmovdqa         ymm0, [p_src +  0 * i_srcstride]  ; each 32-byte load spans two rows: ymm0 = rows 0,1

+    vmovdqa         ymm2, [p_src +  2 * i_srcstride]  ; rows 2,3

+    vmovdqa         ymm4, [p_src +  4 * i_srcstride]  ; rows 4,5

+    vperm2i128      ymm1, ymm0, ymm2, 00100001b  ; high lane of the first source | low lane of the second: rows 1,2

+    vperm2i128      ymm3, ymm2, ymm4, 00100001b  ; rows 3,4

+.yloop:  ; on entry ymm0..ymm4 hold row pairs (0,1) (1,2) (2,3) (3,4) (4,5) relative to p_src

+    vmovdqa         ymm6, [p_src +  6 * i_srcstride]  ; rows 6,7

+    vperm2i128      ymm5, ymm4, ymm6, 00100001b  ; rows 5,6

+    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7  ; output rows 0,1 in ymm0 (macro not shown)

+    vmovdqu         ymm7, [p_src +  7 * i_srcstride]  ; rows 7,8 (odd offset, hence the unaligned load)

+    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm1  ; output rows 2,3 in ymm2

+    vpackuswb       ymm1, ymm0, ymm2  ; low lane: row 0 | row 2, high lane: row 1 | row 3 (8 bytes each)

+    vmovdqa         ymm0, [p_src +  8 * i_srcstride]  ; rows 8,9

+    vextracti128    xmm2, ymm1, 1

+    vmovlps         [p_dst], xmm1  ; row 0

+    vmovlps         [p_dst + i_dststride], xmm2  ; row 1

+    vmovhps         [p_dst + 2 * i_dststride], xmm1  ; row 2

+    vmovhps         [p_dst + i_dststride3], xmm2  ; row 3

+    cmp             i_height, 4

+    jle             .done  ; four or fewer rows left: stop after the first half

+    lea             p_dst, [p_dst + 4 * i_dststride]

+    vmovdqu         ymm1, [p_src +  9 * i_srcstride]  ; rows 9,10

+    vmovdqa         ymm2, [p_src + 10 * i_srcstride]  ; rows 10,11

+    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3  ; output rows 4,5 in ymm4

+    vmovdqu         ymm3, [p_src + 11 * i_srcstride]  ; rows 11,12

+    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm5  ; output rows 6,7 in ymm6

+    vpackuswb       ymm5, ymm4, ymm6

+    vmovdqa         ymm4, [p_src + 12 * i_srcstride]  ; rows 12,13

+    add             p_src, 8 * i_srcstride  ; after this, ymm0..ymm4 again hold pairs (0,1)..(4,5) relative to the new p_src

+    vextracti128    xmm6, ymm5, 1

+    vmovlps         [p_dst], xmm5  ; rows 4..7, stored like rows 0..3

+    vmovlps         [p_dst + i_dststride], xmm6

+    vmovhps         [p_dst + 2 * i_dststride], xmm5

+    vmovhps         [p_dst + i_dststride3], xmm6

+    lea             p_dst, [p_dst + 4 * i_dststride]

+    sub             i_height, 8

+    jg              .yloop

+.done:

+    vzeroupper

+    POP_XMM

+    LOAD_4_PARA_POP

+%ifdef X86_32

+    pop             r4

+%endif

+    ret

+%undef p_src

+%undef p_dst

+%undef i_dststride

+%undef i_height

+%undef i_dststride3

+%undef i_srcstride

+;*******************************************************************************

+; void McHorVer20Width16U8ToS16_avx2(const uint8_t *pSrc,

+;                                    int32_t iSrcStride,

+;                                    int16_t *pDst,

+;                                    int32_t iHeight);

+;*******************************************************************************

+WELS_EXTERN McHorVer20Width16U8ToS16_avx2  ; horizontally filters 16-pixel-wide rows into a packed int16 buffer (32 bytes per row); filter macros are defined outside this excerpt

+%define p_src        r0  ; arg 0: source; reading starts two rows above this pointer

+%define i_srcstride  r1  ; arg 1: bytes between source rows

+%define p_dst        r2  ; arg 2: destination; written with aligned 32-byte stores

+%define i_height     r3  ; arg 3: remaining-row counter

+%define i_dststride  32  ; destination rows are a fixed 32 bytes apart

+    %assign  push_num 0

+    LOAD_4_PARA

+    PUSH_XMM 7

+    SIGN_EXTENSION  r1, r1d  ; presumably widens the int32 arguments to register width (macro not shown)

+    SIGN_EXTENSION  r3, r3d

+    sub             p_src, i_srcstride  ; step back two rows

+    sub             p_src, i_srcstride

+    vbroadcasti128  ymm4, [shufb_32435465768798A9]

+    vbroadcasti128  ymm5, [shufb_011267784556ABBC]

+    vbroadcasti128  ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]

+    sub             i_height, 1  ; bias so the loop ends on exactly 0 when one odd row is left over

+.yloop:  ; two rows per iteration, one ymm register per row

+    vmovdqu         xmm0, [p_src - 2]  ; row 0, source bytes -2..13 -> low lane

+    vinserti128     ymm0, ymm0, [p_src + 6], 1  ; row 0, source bytes 6..21 -> high lane

+    vmovdqu         xmm1, [p_src + i_srcstride - 2]  ; row 1, same split

+    vinserti128     ymm1, ymm1, [p_src + i_srcstride + 6], 1

+    lea             p_src, [p_src + 2 * i_srcstride]

+    AVX2_FilterHorizontalbw_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3  ; the result is read back from the first operand (macro not shown)

+    vmovdqa         [p_dst], ymm0  ; row 0: 32 bytes

+    AVX2_FilterHorizontalbw_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3

+    vmovdqa         [p_dst + i_dststride], ymm1  ; row 1

+    add             p_dst, 2 * i_dststride

+    sub             i_height, 2

+    jg              .yloop

+    jl              .done  ; counter went negative: height was even, nothing left

+    vmovdqu         xmm0, [p_src - 2]  ; counter hit zero: one last row remains

+    vinserti128     ymm0, ymm0, [p_src + 6], 1

+    AVX2_FilterHorizontalbw_16px ymm0, ymm4, ymm5, ymm6, ymm1, ymm2

+    vmovdqa         [p_dst], ymm0

+.done:

+    vzeroupper

+    POP_XMM

+    LOAD_4_PARA_POP

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_height

+%undef i_dststride

+;***********************************************************************

+; void McHorVer02Width9S16ToU8_avx2(const int16_t *pSrc,

+;                                   uint8_t *pDst,

+;                                   int32_t iDstStride,

+;                                   int32_t iHeight);

+;***********************************************************************

+WELS_EXTERN McHorVer02Width9S16ToU8_avx2  ; vertically filters a packed int16 buffer (32 bytes per row) into 9-byte-wide uint8 rows; six source rows rotate through ymm0..ymm7

+%define p_src        r0  ; arg 0: int16 source rows, 32 bytes apart, read with aligned loads

+%define p_dst        r1  ; arg 1: destination

+%define i_dststride  r2  ; arg 2: bytes between destination rows

+%define i_height     r3  ; arg 3: remaining-row counter

+%define i_srcstride  32  ; fixed source row pitch in bytes

+    %assign  push_num 0

+    LOAD_4_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r2, r2d  ; presumably widens the int32 arguments to register width (macro not shown)

+    SIGN_EXTENSION  r3, r3d

+    vmovdqa         ymm0, [p_src + 0 * i_srcstride]  ; one source row per register: rows 0..4

+    vmovdqa         ymm1, [p_src + 1 * i_srcstride]

+    vmovdqa         ymm2, [p_src + 2 * i_srcstride]

+    vmovdqa         ymm3, [p_src + 3 * i_srcstride]

+    vmovdqa         ymm4, [p_src + 4 * i_srcstride]

+    sub             i_height, 1  ; bias: the code after the loop always emits one final row

+.height_loop:  ; up to eight output rows per pass; ymm0..ymm4 hold rows 0..4 relative to p_src on entry

+    vmovdqa         ymm5, [p_src + 5 * i_srcstride]

+    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6  ; output row 0 in ymm0 (macro not shown)

+    vmovdqa         ymm6, [p_src + 6 * i_srcstride]

+    AVX2_FilterVerticalw_16px ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7  ; output row 1 in ymm1

+    vmovdqa         ymm7, [p_src + 7 * i_srcstride]

+    vpackuswb       ymm0, ymm0, ymm1  ; low lane: columns 0..7 of row 0 | row 1, high lane: columns 8..15 of row 0 | row 1

+    vextracti128    xmm1, ymm0, 1

+    vpsllq          xmm1, xmm1, 56  ; move column 8 into the top byte of each qword

+    vmovlps         [p_dst + 1], xmm1  ; its top byte lands on column 8 of row 0

+    vmovlps         [p_dst], xmm0  ; row 0, columns 0..7 (overwrites the filler bytes stored just above)

+    add             p_dst, i_dststride

+    vmovhps         [p_dst + 1], xmm1  ; same pair of stores for row 1

+    vmovhps         [p_dst], xmm0

+    add             p_dst, i_dststride

+    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm0  ; output row 2

+    vmovdqa         ymm0, [p_src + 8 * i_srcstride]

+    AVX2_FilterVerticalw_16px ymm3, ymm4, ymm5, ymm6, ymm7, ymm0, ymm1  ; output row 3

+    vpackuswb       ymm2, ymm2, ymm3

+    vextracti128    xmm3, ymm2, 1

+    vpsllq          xmm3, xmm3, 56

+    vmovlps         [p_dst + 1], xmm3

+    vmovlps         [p_dst], xmm2

+    add             p_dst, i_dststride

+    vmovhps         [p_dst + 1], xmm3

+    vmovhps         [p_dst], xmm2

+    add             p_dst, i_dststride

+    vmovdqa         ymm1, [p_src + 9 * i_srcstride]

+    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm2  ; output row 4

+    vmovdqa         ymm2, [p_src + 10 * i_srcstride]

+    AVX2_FilterVerticalw_16px ymm5, ymm6, ymm7, ymm0, ymm1, ymm2, ymm3  ; output row 5

+    vmovdqa         ymm3, [p_src + 11 * i_srcstride]

+    vpackuswb       ymm4, ymm4, ymm5

+    vextracti128    xmm5, ymm4, 1

+    vpsllq          xmm5, xmm5, 56

+    vmovlps         [p_dst + 1], xmm5

+    vmovlps         [p_dst], xmm4  ; row 4

+    cmp             i_height, 4

+    jle             .done  ; biased counter <= 4: row 4 was the last one

+    add             p_dst, i_dststride

+    vmovhps         [p_dst + 1], xmm5  ; row 5

+    vmovhps         [p_dst], xmm4

+    add             p_dst, i_dststride

+    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm4  ; output row 6

+    vmovdqa         ymm4, [p_src + 12 * i_srcstride]

+    add             p_src, 8 * i_srcstride  ; ymm0..ymm4 now hold rows 0..4 relative to the new p_src

+    AVX2_FilterVerticalw_16px ymm7, ymm0, ymm1, ymm2, ymm3, ymm4, ymm5  ; output row 7

+    vpackuswb       ymm6, ymm6, ymm7

+    vextracti128    xmm7, ymm6, 1

+    vpsllq          xmm7, xmm7, 56

+    vmovlps         [p_dst + 1], xmm7

+    vmovlps         [p_dst], xmm6

+    add             p_dst, i_dststride

+    vmovhps         [p_dst + 1], xmm7

+    vmovhps         [p_dst], xmm6

+    add             p_dst, i_dststride

+    sub             i_height, 8

+    jg              .height_loop

+    vmovdqa         ymm5, [p_src + 5 * i_srcstride]  ; final single row after the last full pass

+    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6

+    vpackuswb       ymm0, ymm0, ymm0

+    vextracti128    xmm1, ymm0, 1

+    vpsllq          xmm1, xmm1, 56

+    vmovlps         [p_dst + 1], xmm1

+    vmovlps         [p_dst], xmm0

+.done:

+    vzeroupper

+    POP_XMM

+    LOAD_4_PARA_POP

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_height

+;*******************************************************************************

+; void McHorVer20Width17U8ToS16_avx2(const uint8_t *pSrc,

+;                                    int32_t iSrcStride,

+;                                    int16_t *pDst,

+;                                    int32_t iHeight);

+;*******************************************************************************

+WELS_EXTERN McHorVer20Width17U8ToS16_avx2  ; horizontally filters 17-pixel-wide rows into an int16 buffer with a 64-byte row pitch; filter macros are defined outside this excerpt

+%define p_src        r0  ; arg 0: source; reading starts two rows above this pointer

+%define i_srcstride  r1  ; arg 1: bytes between source rows

+%define p_dst        r2  ; arg 2: destination; written with aligned stores

+%define i_height     r3  ; arg 3: remaining-row counter

+%define i_srcstride3 r4  ; scratch register holding 3 * stride

+%define i_dststride  64  ; destination rows are a fixed 64 bytes apart

+    %assign  push_num 0

+%ifdef X86_32

+    push            r4  ; r4 is used as scratch: saved here and restored before ret on 32-bit builds

+    %assign  push_num 1

+%endif

+    LOAD_4_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r1, r1d  ; presumably widens the int32 arguments to register width (macro not shown)

+    SIGN_EXTENSION  r3, r3d

+    sub             p_src, i_srcstride  ; step back two rows

+    sub             p_src, i_srcstride

+    lea             i_srcstride3, [3 * i_srcstride]

+    vbroadcasti128  ymm5, [shufb_32435465768798A9]

+    vbroadcasti128  ymm6, [shufb_011267784556ABBC]

+    vbroadcasti128  ymm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]

+    sub             i_height, 3  ; bias: the code after the loop always handles two more rows

+.yloop:  ; four rows per iteration, handled as two pairs (rows 0,1 then rows 2,3)

+    vmovdqu         xmm0, [p_src - 2]  ; row 0, source bytes -2..13

+    vmovdqu         xmm3, [p_src + 6]  ; row 0, source bytes 6..21

+    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1  ; row 1 goes to the high lanes

+    vinserti128     ymm3, ymm3, [p_src + i_srcstride + 6], 1

+    vmovdqa         ymm4, ymm3  ; keep an unfiltered copy before ymm3 is handed to the filter

+    AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2  ; the result is read back from the first operand (macro not shown)

+    vmovdqa         [p_dst], xmm0  ; row 0, first 16 bytes

+    vextracti128    [p_dst + i_dststride], ymm0, 1  ; row 1, first 16 bytes

+    AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2

+    vmovdqu         xmm1, [p_src + 2 * i_srcstride - 2]  ; rows 2 and 3, loaded the same way

+    vmovdqu         xmm0, [p_src + 2 * i_srcstride + 6]

+    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1

+    vinserti128     ymm0, ymm0, [p_src + i_srcstride3 + 6], 1

+    lea             p_src, [p_src + 4 * i_srcstride]

+    vpunpckhqdq     ymm4, ymm4, ymm0  ; per lane: source bytes 14..21 of rows 0,2 (low lane) and rows 1,3 (high lane)

+    AVX2_FilterHorizontalbw_4px ymm4, [dwm32768_256], ymm2  ; right-edge columns of all four rows (macro not shown)

+    vmovlps         [p_dst + 26], xmm4  ; row 0, bytes 26..33

+    vmovdqa         [p_dst + 16], xmm3  ; row 0, bytes 16..31; bytes 32..33 keep the value stored just above

+    vextracti128    xmm2, ymm4, 1  ; right-edge results of rows 1 and 3

+    vmovlps         [p_dst + i_dststride + 26], xmm2

+    vextracti128    [p_dst + i_dststride + 16], ymm3, 1

+    vmovhps         [p_dst + 2 * i_dststride + 26], xmm4  ; row 2's right edge, stored before its main bytes for the same overlap reason

+    AVX2_FilterHorizontalbw_16px ymm1, ymm5, ymm6, ymm7, ymm3, ymm4  ; ymm3/ymm4 are free by now and serve as the scratch operands

+    vmovdqa         [p_dst + 2 * i_dststride], xmm1

+    AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm3, ymm4

+    vmovdqa         [p_dst + 2 * i_dststride + 16], xmm0

+    vextracti128    [p_dst + 3 * i_dststride], ymm1, 1

+    vmovhps         [p_dst + 3 * i_dststride + 26], xmm2

+    vextracti128    [p_dst + 3 * i_dststride + 16], ymm0, 1

+    add             p_dst, 4 * i_dststride

+    sub             i_height, 4

+    jg              .yloop

+    ; Handle remaining 2 lines after 4x unrolled loop.

+    vmovdqu         xmm0, [p_src - 2]  ; NOTE(review): this tail is unconditional -- presumably height is always 4k + 2; confirm callers

+    vinserti128     ymm0, ymm0, [p_src + 6], 1  ; here each register holds one whole row (bytes -2..13 | 6..21)

+    vmovdqu         xmm3, [p_src + i_srcstride - 2]

+    vinserti128     ymm3, ymm3, [p_src + i_srcstride + 6], 1

+    vpunpckhqdq     ymm4, ymm0, ymm3  ; high lane: source bytes 14..21 of row 0 | row 1

+    AVX2_FilterHorizontalbw_4px ymm4, [dwm32768_256], ymm2

+    AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2

+    AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2

+    vextracti128    xmm4, ymm4, 1  ; only the high lane carries the right-edge results

+    vmovlps         [p_dst + 26], xmm4

+    vmovdqa         [p_dst], ymm0  ; bytes 0..31 of the row; bytes 32..33 keep the value stored just above

+    vmovhps         [p_dst + i_dststride + 26], xmm4

+    vmovdqa         [p_dst + i_dststride], ymm3

+    vzeroupper

+    POP_XMM

+    LOAD_4_PARA_POP

+%ifdef X86_32

+    pop             r4

+%endif

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_height

+%undef i_srcstride3

+;***********************************************************************

+; void McHorVer02Width16Or17S16ToU8_avx2(const int16_t *pSrc,

+;                                        int32_t iSrcStride,

+;                                        uint8_t *pDst,

+;                                        int32_t iDstStride,

+;                                        int32_t iWidth,

+;                                        int32_t iHeight);

+;***********************************************************************

+WELS_EXTERN McHorVer02Width16Or17S16ToU8_avx2  ; vertically filters int16 rows into uint8 rows: an optional pass for the last column when iWidth is odd, then a 16-column aligned pass

+%define p_src        r0  ; arg 0: int16 source; the 16-column pass reads rows with aligned 32-byte loads

+%define i_srcstride  r1  ; arg 1: bytes between source rows

+%define p_dst        r2  ; arg 2: destination; the 16-column pass uses aligned 16-byte stores

+%define i_dststride  r3  ; arg 3: bytes between destination rows

+%define i_width      r4  ; arg 4: only bit 0 and the column offset of the last pixel are used

+%define i_height     r5  ; arg 5: remaining-row counter

+%define i_srcstride3 r6  ; scratch register holding 3 * stride

+    %assign  push_num 0

+%ifdef X86_32

+    push            r6  ; r6 is used as scratch: saved here and restored before ret on 32-bit builds

+    %assign  push_num 1

+%endif

+    LOAD_6_PARA

+    PUSH_XMM 8

+    SIGN_EXTENSION  r1, r1d  ; presumably widens the int32 arguments to register width (macro not shown)

+    SIGN_EXTENSION  r3, r3d

+    SIGN_EXTENSION  r4, r4d

+    SIGN_EXTENSION  r5, r5d

+    sub             i_height, 1  ; bias: both passes finish with a single-row tail

+    lea             i_srcstride3, [3 * i_srcstride]

+    test            i_width, 1

+    jz              .align_begin  ; even width: no extra column to handle

+    push            i_height  ; the odd-column pass advances these three; they are restored before the aligned pass

+    push            p_src

+    push            p_dst

+    lea             p_src, [p_src + 2 * i_width - 2]  ; address of the last int16 column

+    add             p_dst, i_width  ; one past the last output column, so stores ending at p_dst - 1 hit it

+    vmovd           xmm0, [p_src]  ; the unpacks below gather the last-column word of rows 0..7 into xmm0

+    vpunpcklwd      xmm0, xmm0, [p_src + i_srcstride]

+    vmovd           xmm1, [p_src + 2 * i_srcstride]

+    add             p_src, i_srcstride3

+    vpunpcklwd      xmm1, xmm1, [p_src]

+    vpunpckldq      xmm0, xmm0, xmm1

+    vmovd           xmm1, [p_src + i_srcstride]

+    vpunpcklwd      xmm1, xmm1, [p_src + 2 * i_srcstride]

+    vmovd           xmm2, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]  ; p_src now points at row 7

+    vpunpcklwd      xmm2, xmm2, [p_src]

+    vpunpckldq      xmm1, xmm1, xmm2

+    vpunpcklqdq     xmm0, xmm0, xmm1

+.height_loop_unalign:  ; xmm0 holds the column for rows 0..7; each vpalignr slides the window down by one row

+    vmovd           xmm1, [p_src + i_srcstride]

+    vpalignr        xmm1, xmm1, xmm0, 2  ; rows 1..8

+    vmovd           xmm2, [p_src + 2 * i_srcstride]

+    vpalignr        xmm2, xmm2, xmm1, 2  ; rows 2..9

+    vmovd           xmm3, [p_src + i_srcstride3]

+    vpalignr        xmm3, xmm3, xmm2, 2  ; rows 3..10

+    lea             p_src, [p_src + 4 * i_srcstride]

+    vmovd           xmm4, [p_src]

+    vpalignr        xmm4, xmm4, xmm3, 2  ; rows 4..11

+    vmovd           xmm5, [p_src + i_srcstride]

+    vpalignr        xmm5, xmm5, xmm4, 2  ; rows 5..12

+    AVX2_FilterVerticalw_16px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7  ; eight outputs, one per word of xmm0 (macro not shown)

+    vpackuswb       xmm0, xmm0, xmm0  ; byte i = output for row i

+    vpslld          xmm6, xmm0, 24  ; top byte of dword 0 = row 0, of dword 1 = row 4

+    vmovd           [p_dst - 4], xmm6  ; row 0: the wanted byte lands on p_dst - 1; for iWidth = 17 the bytes left of it are rewritten by the aligned pass

+    vmovlps         [p_dst + 4 * i_dststride - 8], xmm6  ; row 4

+    add             p_dst, i_dststride

+    vpslld          xmm6, xmm0, 16  ; rows 1 and 5

+    vmovd           [p_dst - 4], xmm6

+    vmovlps         [p_dst + 4 * i_dststride - 8], xmm6

+    add             p_dst, i_dststride

+    vpslld          xmm6, xmm0, 8  ; rows 2 and 6

+    vmovd           [p_dst - 4], xmm6

+    vmovd           [p_dst + i_dststride - 4], xmm0  ; row 3: byte 3 is already the top byte of dword 0

+    lea             p_dst, [p_dst + 4 * i_dststride]

+    vmovlps         [p_dst - 8], xmm6  ; row 6

+    vmovlps         [p_dst + i_dststride - 8], xmm0  ; row 7

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    sub             i_height, 8

+    jle             .height_loop_unalign_exit

+    vmovd           xmm1, [p_src + 2 * i_srcstride]  ; rebuild xmm0 with the column for the next eight rows

+    vpalignr        xmm1, xmm1, xmm5, 2

+    vmovd           xmm0, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    vpunpcklwd      xmm0, xmm0, [p_src]

+    vpalignr        xmm0, xmm0, xmm1, 4

+    jmp             .height_loop_unalign

+.height_loop_unalign_exit:  ; one more output row; only the top word of each register matters here

+    vpbroadcastq    xmm6, [p_src + 2 * i_srcstride - 6]  ; puts the word at p_src + 2 * stride into word 7

+    AVX2_FilterVerticalw_16px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7

+    vpackuswb       xmm1, xmm1, xmm1

+    vmovlps         [p_dst - 8], xmm1  ; byte 7 lands on p_dst - 1

+    pop             p_dst

+    pop             p_src

+    pop             i_height

+.align_begin:  ; 16-column pass: one 32-byte source row per register, rotating through ymm0..ymm7

+    vmovdqa         ymm0, [p_src]

+    vmovdqa         ymm1, [p_src + i_srcstride]

+    vmovdqa         ymm2, [p_src + 2 * i_srcstride]

+    vmovdqa         ymm3, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    vmovdqa         ymm4, [p_src]

+.height_loop:  ; eight output rows per pass; ymm0..ymm4 hold the five newest source rows on entry

+    vmovdqa         ymm5, [p_src + i_srcstride]

+    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6  ; the result is read back from the first operand (macro not shown)

+    vmovdqa         ymm6, [p_src + 2 * i_srcstride]

+    AVX2_FilterVerticalw_16px ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7

+    vmovdqa         ymm7, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    vpackuswb       ymm0, ymm0, ymm1  ; per lane: 8 bytes of the first row | 8 bytes of the second row

+    vpermq          ymm0, ymm0, 11011000b  ; swap the middle qwords: low lane = first row, high lane = second row

+    vmovdqa         [p_dst], xmm0

+    vextracti128    [p_dst + i_dststride], ymm0, 1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm0

+    vmovdqa         ymm0, [p_src]

+    AVX2_FilterVerticalw_16px ymm3, ymm4, ymm5, ymm6, ymm7, ymm0, ymm1

+    vpackuswb       ymm2, ymm2, ymm3

+    vpermq          ymm2, ymm2, 11011000b

+    vmovdqa         [p_dst], xmm2

+    vextracti128    [p_dst + i_dststride], ymm2, 1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    vmovdqa         ymm1, [p_src + i_srcstride]

+    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm2

+    vmovdqa         ymm2, [p_src + 2 * i_srcstride]

+    AVX2_FilterVerticalw_16px ymm5, ymm6, ymm7, ymm0, ymm1, ymm2, ymm3

+    vmovdqa         ymm3, [p_src + i_srcstride3]

+    lea             p_src, [p_src + 4 * i_srcstride]

+    vpackuswb       ymm4, ymm4, ymm5

+    vpermq          ymm4, ymm4, 11011000b

+    vmovdqa        [p_dst], xmm4

+    vextracti128   [p_dst + i_dststride], ymm4, 1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm4

+    vmovdqa         ymm4, [p_src]

+    AVX2_FilterVerticalw_16px ymm7, ymm0, ymm1, ymm2, ymm3, ymm4, ymm5

+    vpackuswb       ymm6, ymm6, ymm7

+    vpermq          ymm6, ymm6, 11011000b

+    vmovdqa         [p_dst], xmm6

+    vextracti128    [p_dst + i_dststride], ymm6, 1

+    lea             p_dst, [p_dst + 2 * i_dststride]

+    sub             i_height, 8

+    jg              .height_loop

+    jl              .done  ; counter went negative: no leftover row

+    vmovdqa         ymm5, [p_src + i_srcstride]  ; counter hit zero: one last row remains

+    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6

+    vpackuswb       ymm0, ymm0, ymm0

+    vpermq          ymm0, ymm0, 11011000b

+    vmovdqa         [p_dst], xmm0

+.done:

+    vzeroupper

+    POP_XMM

+    LOAD_6_PARA_POP

+%ifdef X86_32

+    pop             r6

+%endif

+    ret

+%undef p_src

+%undef i_srcstride

+%undef p_dst

+%undef i_dststride

+%undef i_width

+%undef i_height

+%undef i_srcstride3

+%endif ; HAVE_AVX2

--- a/test/encoder/EncUT_MotionCompensation.cpp

+++ b/test/encoder/EncUT_MotionCompensation.cpp

@@ -168,8 +168,8 @@

 DEF_MCCOPYTEST (8, 16)

 DEF_MCCOPYTEST (16, 16)

-#define DEF_LUMA_MCTEST(iW,iH) \

-TEST(McHorVer,iW##x##iH)  \

+#define DEF_LUMA_MCTEST(iW, iH, cpu_flags, name_suffix) /* Defines gtest case McHorVer.<iW>x<iH>_<name_suffix>; cpu_flags masks the CPU features handed to InitMcFunc. */ \

+TEST(McHorVer, iW##x##iH##_##name_suffix) \

 {                       \

     for (int32_t a = 0; a < 4; a++) { \

     for (int32_t b = 0; b < 4; b++) { \

@@ -191,43 +191,41 @@

         uSrcAnchor[0][j][i] = uSrcTest[j][i] = rand()%256;  \

}\

}\

-    int32_t iCpuCores = 1; \

-    uint32_t uiCpuFlag;\

-    for(int32_t k =0; k<2; k++)\

-    {\

-      if(k==0)\

-      {\

-        uiCpuFlag = 0;\

-      }else \

-      {\

-        uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores); \

-      }\

-      InitMcFunc(&sMcFunc,uiCpuFlag);\

-      memset(uDstAnchor,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \

-      memset(uDstTest,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \

-      MCHalfPelFilterAnchor(uSrcInputAnchor[1],uSrcInputAnchor[2],uSrcInputAnchor[3],uSrcInputAnchor[0],MC_BUFF_SRC_STRIDE,iW+1,iH+1,pBuf+4); \

-      MCLumaAnchor(uDstAnchor[0],MC_BUFF_DST_STRIDE,uSrcInputAnchor,MC_BUFF_SRC_STRIDE,a,b,iW,iH); \

-      sMcFunc.pMcLumaFunc(&uSrcTest[4][4],MC_BUFF_SRC_STRIDE,uDstTest[0],MC_BUFF_DST_STRIDE,a,b,iW,iH);\

-      for(int32_t j=0;j<MC_BUFF_HEIGHT;j++)   \

-      {                                                                             \

-          for(int32_t i=0;i<MC_BUFF_DST_STRIDE;i++)                                  \

-          {                                                                           \

-              ASSERT_EQ(uDstAnchor[j][i],uDstTest[j][i]);                              \

-          }                                                                             \

-      }                                                                                \

-    }\

+    InitMcFunc(&sMcFunc, WelsCPUFeatureDetect (0) & (cpu_flags)); /* only detected features that are also set in cpu_flags are passed on */ \

+    memset(uDstAnchor,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); /* zero both outputs; the compare below walks the whole buffer */ \

+    memset(uDstTest,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \

+    MCHalfPelFilterAnchor(uSrcInputAnchor[1],uSrcInputAnchor[2],uSrcInputAnchor[3],uSrcInputAnchor[0],MC_BUFF_SRC_STRIDE,iW+1,iH+1,pBuf+4); /* NOTE(review): presumably fills anchor planes 1..3 from plane 0 - confirm against the helper */ \

+    MCLumaAnchor(uDstAnchor[0],MC_BUFF_DST_STRIDE,uSrcInputAnchor,MC_BUFF_SRC_STRIDE,a,b,iW,iH); /* reference result */ \

+    sMcFunc.pMcLumaFunc(&uSrcTest[4][4],MC_BUFF_SRC_STRIDE,uDstTest[0],MC_BUFF_DST_STRIDE,a,b,iW,iH); /* function chosen by InitMcFunc, called with the same a, b, iW, iH as the anchor */\

+    for(int32_t j=0;j<MC_BUFF_HEIGHT;j++)   \

+    {                                                                             \

+        for(int32_t i=0;i<MC_BUFF_DST_STRIDE;i++)                                  \

+        {                                                                           \

+            ASSERT_EQ(uDstAnchor[j][i],uDstTest[j][i]);                              \

+        }                                                                             \

+    }                                                                                \

}\

}\

+#define DEF_LUMA_MCTESTS(cpu_flags, name_suffix) /* Expands DEF_LUMA_MCTEST once per block size listed below, forwarding cpu_flags and name_suffix unchanged. */ \

+    DEF_LUMA_MCTEST ( 4,  4, cpu_flags, name_suffix) \

+    DEF_LUMA_MCTEST ( 4,  8, cpu_flags, name_suffix) \

+    DEF_LUMA_MCTEST ( 8,  4, cpu_flags, name_suffix) \

+    DEF_LUMA_MCTEST ( 8,  8, cpu_flags, name_suffix) \

+    DEF_LUMA_MCTEST (16,  8, cpu_flags, name_suffix) \

+    DEF_LUMA_MCTEST ( 8, 16, cpu_flags, name_suffix) \

+    DEF_LUMA_MCTEST (16, 16, cpu_flags, name_suffix)

-DEF_LUMA_MCTEST (4, 4)

-DEF_LUMA_MCTEST (4, 8)

-DEF_LUMA_MCTEST (8, 4)

-DEF_LUMA_MCTEST (8, 8)

-DEF_LUMA_MCTEST (16, 8)

-DEF_LUMA_MCTEST (8, 16)

-DEF_LUMA_MCTEST (16, 16)

+DEF_LUMA_MCTESTS(0, c) // mask 0: InitMcFunc receives no CPU feature flags

+DEF_LUMA_MCTESTS(~0, native) // mask ~0: every flag reported by WelsCPUFeatureDetect is passed through

+#ifdef X86_ASM

+DEF_LUMA_MCTESTS(WELS_CPU_SSE2, sse2) // at most the SSE2 flag reaches InitMcFunc

+DEF_LUMA_MCTESTS(WELS_CPU_SSE2 | WELS_CPU_SSSE3, ssse3) // at most the SSE2 and SSSE3 flags

+#ifdef HAVE_AVX2

+DEF_LUMA_MCTESTS(WELS_CPU_SSE2 | WELS_CPU_SSSE3 | WELS_CPU_AVX2, avx2) // at most the SSE2, SSSE3 and AVX2 flags

+#endif // HAVE_AVX2

+#endif // X86_ASM

 #define DEF_CHROMA_MCTEST(iW,iH) \

 TEST(McChroma,iW##x##iH)  \

@@ -315,61 +313,89 @@

-#define DEF_HALFPEL_MCTEST(iW,iH) \

-TEST (EncMcHalfpel, iW##x##iH) { \

+#define DEF_HALFPEL_MCTEST(iW, iH, cpu_flags, name_suffix) /* Defines gtest case EncMcHalfpel.<iW>x<iH>_<name_suffix>; cpu_flags masks the CPU features handed to InitMcFunc. */ \

+TEST (EncMcHalfpel, iW##x##iH##_##name_suffix) { \

     SMcFunc sMcFunc; \

-    for (int32_t k = 0; k < 2; k++) { \

-        for (int32_t w = 0; w < 2; w++) { \

-            int32_t width = iW ; \

-            int32_t height = iH; \

-            uint8_t uAnchor[4][MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \

-            uint8_t uSrcTest[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \

-            ENFORCE_STACK_ALIGN_2D (uint8_t, uDstTest, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \

-            uint8_t* uAnchors[4]; \

-            int16_t pBuf[MC_BUFF_DST_STRIDE]; \

-            uAnchors[0] = &uAnchor[0][4][4]; \

-            uAnchors[1] = &uAnchor[1][4][4]; \

-            uAnchors[2] = &uAnchor[2][4][4]; \

-            uAnchors[3] = &uAnchor[3][4][4]; \

-             \

-            memset (uAnchor, 0, 4 * sizeof (uint8_t)*MC_BUFF_HEIGHT * MC_BUFF_SRC_STRIDE); \

-            memset (uDstTest, 0, sizeof (uint8_t)*MC_BUFF_HEIGHT * MC_BUFF_DST_STRIDE); \

-            for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \

-                for (int32_t i = 0; i < MC_BUFF_SRC_STRIDE; i++) { \

-                    uAnchor[0][j][i] = uSrcTest[j][i] = rand() % 256; \

-                } \

+    for (int32_t w = 0; w < 2; w++) { \

+        int32_t width = iW ; \

+        int32_t height = iH; \

+        uint8_t uAnchor[4][MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \

+        uint8_t uSrcTest[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \

+        uint8_t uRand[MC_BUFF_HEIGHT][MC_BUFF_DST_STRIDE]; /* random pattern copied into uDstTest before each call under test */ \

+        ENFORCE_STACK_ALIGN_2D (uint8_t, uDstTest, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \

+        uint8_t* uAnchors[4]; \

+        int16_t pBuf[MC_BUFF_DST_STRIDE]; \

+        uAnchors[0] = &uAnchor[0][4][4]; \

+        uAnchors[1] = &uAnchor[1][4][4]; \

+        uAnchors[2] = &uAnchor[2][4][4]; \

+        uAnchors[3] = &uAnchor[3][4][4]; \

+         \

+        memset (uAnchor, 0, 4 * sizeof (uint8_t)*MC_BUFF_HEIGHT * MC_BUFF_SRC_STRIDE); \

+        memset (uDstTest, 0, sizeof (uint8_t)*MC_BUFF_HEIGHT * MC_BUFF_DST_STRIDE); \

+        for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \

+            for (int32_t i = 0; i < MC_BUFF_SRC_STRIDE; i++) { \

+                uAnchor[0][j][i] = uSrcTest[j][i] = rand() % 256; \

+                uRand[j][i] = rand() % 256; /* NOTE(review): i is bounded by MC_BUFF_SRC_STRIDE but uRand has MC_BUFF_DST_STRIDE columns; safe only if the two are equal - confirm */ \

} \

-             \

-            uint32_t uiCpuFlag = k == 0 ? 0 : WelsCPUFeatureDetect (NULL); \

-            InitMcFunc (&sMcFunc, uiCpuFlag); \

-             \

-            MCHalfPelFilterAnchor (uAnchors[1], uAnchors[2], uAnchors[3], uAnchors[0], MC_BUFF_SRC_STRIDE, width + 1, height + 1, pBuf + 4); \

-            sMcFunc.pfLumaHalfpelHor (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height); \

-            for (int32_t j = 0; j < height; j++) { \

-                for (int32_t i = 0; i < width + 1; i++) { \

-                    ASSERT_EQ (uAnchor[1][4 + j][4 + i], uDstTest[j][i]); \

-                } \

+        } \

+         \

+        InitMcFunc (&sMcFunc, WelsCPUFeatureDetect (0) & (cpu_flags)); /* only detected features that are also set in cpu_flags are passed on */ \

+         \

+        MCHalfPelFilterAnchor (uAnchors[1], uAnchors[2], uAnchors[3], uAnchors[0], MC_BUFF_SRC_STRIDE, width + 1, height + 1, pBuf + 4); \

+        memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); /* preload the destination with the random pattern */ \

+        sMcFunc.pfLumaHalfpelHor (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height); \

+        for (int32_t j = 0; j < height; j++) { \

+            for (int32_t i = 0; i < width + 1; i++) { \

+                ASSERT_EQ (uAnchor[1][4 + j][4 + i], uDstTest[j][i]); \

} \

-            sMcFunc.pfLumaHalfpelVer (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width, height + 1); \

-            for (int32_t j = 0; j < height + 1; j++) { \

-                for (int32_t i = 0; i < width; i++) { \

-                    ASSERT_EQ (uAnchor[2][4 + j][4 + i], uDstTest[j][i]); \

-                } \

+        } \

+        for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { /* bytes outside the (width + 1) x height output must still equal uRand */ \

+            for (int32_t i = j < height ? width + 1 : 0; i < MC_BUFF_DST_STRIDE; i++) { \

+                ASSERT_EQ (uRand[j][i], uDstTest[j][i]); \

} \

-            sMcFunc.pfLumaHalfpelCen (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height + 1); \

-            for (int32_t j = 0; j < height + 1; j++) { \

-                for (int32_t i = 0; i < width + 1; i++) { \

-                    ASSERT_EQ (uAnchor[3][4 + j][4 + i], uDstTest[j][i]); \

-                } \

+        } \

+        memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); /* reset the destination to the random pattern */ \

+        sMcFunc.pfLumaHalfpelVer (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width, height + 1); \

+        for (int32_t j = 0; j < height + 1; j++) { \

+            for (int32_t i = 0; i < width; i++) { \

+                ASSERT_EQ (uAnchor[2][4 + j][4 + i], uDstTest[j][i]); \

} \

} \

+        for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { /* bytes outside the width x (height + 1) output must still equal uRand */ \

+            for (int32_t i = j < height + 1 ? width : 0; i < MC_BUFF_DST_STRIDE; i++) { \

+                ASSERT_EQ (uRand[j][i], uDstTest[j][i]); \

+            } \

+        } \

+        memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); /* reset the destination to the random pattern */ \

+        sMcFunc.pfLumaHalfpelCen (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height + 1); \

+        for (int32_t j = 0; j < height + 1; j++) { \

+            for (int32_t i = 0; i < width + 1; i++) { \

+                ASSERT_EQ (uAnchor[3][4 + j][4 + i], uDstTest[j][i]); \

+            } \

+        } \

+        for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { /* bytes outside the (width + 1) x (height + 1) output must still equal uRand */ \

+            for (int32_t i = j < height + 1 ? width + 1 : 0; i < MC_BUFF_DST_STRIDE; i++) { \

+                ASSERT_EQ (uRand[j][i], uDstTest[j][i]); \

+            } \

+        } \

} \

-DEF_HALFPEL_MCTEST(4,4)

-DEF_HALFPEL_MCTEST(4,8)

-DEF_HALFPEL_MCTEST(8,4)

-DEF_HALFPEL_MCTEST(8,8)

-DEF_HALFPEL_MCTEST(8,16)

-DEF_HALFPEL_MCTEST(16,8)

-DEF_HALFPEL_MCTEST(16,16)

+#define DEF_HALFPEL_MCTESTS(cpu_flags, name_suffix) /* Expands DEF_HALFPEL_MCTEST once per block size listed below, forwarding cpu_flags and name_suffix unchanged. */ \

+    DEF_HALFPEL_MCTEST( 4 , 4, cpu_flags, name_suffix) \

+    DEF_HALFPEL_MCTEST( 4,  8, cpu_flags, name_suffix) \

+    DEF_HALFPEL_MCTEST( 8,  4, cpu_flags, name_suffix) \

+    DEF_HALFPEL_MCTEST( 8,  8, cpu_flags, name_suffix) \

+    DEF_HALFPEL_MCTEST( 8, 16, cpu_flags, name_suffix) \

+    DEF_HALFPEL_MCTEST(16,  8, cpu_flags, name_suffix) \

+    DEF_HALFPEL_MCTEST(16, 16, cpu_flags, name_suffix)

+DEF_HALFPEL_MCTESTS(0, c) // mask 0: InitMcFunc receives no CPU feature flags

+DEF_HALFPEL_MCTESTS(~0, native) // mask ~0: every flag reported by WelsCPUFeatureDetect is passed through

+#ifdef X86_ASM

+DEF_HALFPEL_MCTESTS(WELS_CPU_SSE2, sse2) // at most the SSE2 flag reaches InitMcFunc

+DEF_HALFPEL_MCTESTS(WELS_CPU_SSE2 | WELS_CPU_SSSE3, ssse3) // at most the SSE2 and SSSE3 flags

+#ifdef HAVE_AVX2

+DEF_HALFPEL_MCTESTS(WELS_CPU_SSE2 | WELS_CPU_SSSE3 | WELS_CPU_AVX2, avx2) // at most the SSE2, SSSE3 and AVX2 flags

+#endif // HAVE_AVX2

+#endif // X86_ASM