shithub: openh264

Download patch

ref: e5609bc0fe7a580b8805d57b0878721b48be1b13
parent: 130c682c29d6a5767770322f2d958357eb19380e
author: Martin Storsjö <[email protected]>
date: Sun Jun 29 13:43:28 EDT 2014

Remove global function pointers in the encoder MC code

This is similar to how this is done in the decoder part of MC.

--- a/codec/encoder/core/src/mc.cpp
+++ b/codec/encoder/core/src/mc.cpp
@@ -81,18 +81,6 @@
     {4, 4, 28, 28}, {3, 5, 21, 35}, {2, 6, 14, 42}, {1, 7, 7, 49}
   }
 };
-typedef void (*WelsMcFunc0) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                             int32_t iHeight);
-typedef void (*WelsMcFunc1) (uint8_t* pDst, int32_t iDstStride, const uint8_t* psrcA, int32_t iSrcAStride,
-                             const uint8_t* pSrcB,
-                             int32_t iSrcBStride, int32_t iHeight);
-WelsMcFunc0 McCopyWidthEq16 = NULL;
-WelsMcFunc0 McCopyWidthEq8 = NULL;
-WelsMcFunc0 McCopyWidthEq4 = NULL;
-WelsMcFunc0 pfMcHorVer02WidthEq16 = NULL;
-WelsMcFunc1 pfPixelAvgWidthEq16  = NULL;
-WelsMcFunc0 pfMcHorVer20WidthEq16 = NULL;
-WelsMcFunc0 pfMcHorVer22WidthEq16 = NULL;
 
 //***************************************************************************//
 //                          C code implementation                            //
@@ -224,97 +212,97 @@
 
 /////////////////////luma MC//////////////////////////
 
-static inline void McHorVer01WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
+static inline void McHorVer01WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                          int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
 
-  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  McHorVer02WidthEq16_c (pSrc, iSrcStride, pTmp, 16, iHeight);
+  PixelAvgWidthEq16_c (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
 }
-static inline void McHorVer03WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
+static inline void McHorVer03WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                          int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
 
-  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+  McHorVer02WidthEq16_c (pSrc, iSrcStride, pTmp, 16, iHeight);
+  PixelAvgWidthEq16_c (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
 }
-static inline void McHorVer10WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
+static inline void McHorVer10WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                          int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
 
-  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  McHorVer20WidthEq16_c (pSrc, iSrcStride, pTmp, 16, iHeight);
+  PixelAvgWidthEq16_c (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
 }
-static inline void McHorVer11WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
+static inline void McHorVer11WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                          int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
 
-  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
+  McHorVer20WidthEq16_c (pSrc, iSrcStride, pTmp, 16, iHeight);
+  McHorVer02WidthEq16_c (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+  PixelAvgWidthEq16_c (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
 }
-static inline void McHorVer12WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
+static inline void McHorVer12WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                          int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
 
-  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
+  McHorVer02WidthEq16_c (pSrc, iSrcStride, pTmp, 16, iHeight);
+  McHorVer22WidthEq16_c (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+  PixelAvgWidthEq16_c (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
 }
-static inline void McHorVer13WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
+static inline void McHorVer13WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                          int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
 
-  pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
+  McHorVer20WidthEq16_c (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+  McHorVer02WidthEq16_c (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+  PixelAvgWidthEq16_c (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
 }
-static inline void McHorVer21WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
+static inline void McHorVer21WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                          int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
 
-  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
+  McHorVer20WidthEq16_c (pSrc, iSrcStride, pTmp, 16, iHeight);
+  McHorVer22WidthEq16_c (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+  PixelAvgWidthEq16_c (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
 }
-static inline void McHorVer23WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
+static inline void McHorVer23WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                          int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
 
-  pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
+  McHorVer20WidthEq16_c (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+  McHorVer22WidthEq16_c (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+  PixelAvgWidthEq16_c (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
 }
-static inline void McHorVer30WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
+static inline void McHorVer30WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                          int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
 
-  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc + 1, iSrcStride, pTmp, 16, iHeight);
+  McHorVer20WidthEq16_c (pSrc, iSrcStride, pTmp, 16, iHeight);
+  PixelAvgWidthEq16_c (pDst, iDstStride, pSrc + 1, iSrcStride, pTmp, 16, iHeight);
 }
-static inline void McHorVer31WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
+static inline void McHorVer31WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                          int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
 
-  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
+  McHorVer20WidthEq16_c (pSrc, iSrcStride, pTmp, 16, iHeight);
+  McHorVer02WidthEq16_c (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
+  PixelAvgWidthEq16_c (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
 }
-static inline void McHorVer32WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
+static inline void McHorVer32WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                          int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
 
-  pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, pTmp, 16, iHeight);
-  pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
+  McHorVer02WidthEq16_c (pSrc + 1, iSrcStride, pTmp, 16, iHeight);
+  McHorVer22WidthEq16_c (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+  PixelAvgWidthEq16_c (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
 }
-static inline void McHorVer33WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
+static inline void McHorVer33WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                          int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
 
-  pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
+  McHorVer20WidthEq16_c (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+  McHorVer02WidthEq16_c (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
+  PixelAvgWidthEq16_c (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
 }
 
 static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
@@ -360,15 +348,15 @@
     pDst += iDstStride;
   }
 }
-static inline void McCopy (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                           int32_t iHeight) {
+static inline void McCopy_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                             int32_t iHeight) {
   int32_t i;
-  if (iWidth == 16 && McCopyWidthEq16 != NULL)
-    McCopyWidthEq16 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8 && McCopyWidthEq8 != NULL)
-    McCopyWidthEq8 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4 && McCopyWidthEq4 != NULL)
-    McCopyWidthEq4 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  if (iWidth == 16)
+    McCopyWidthEq16_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McCopyWidthEq8_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else {
     for (i = 0; i < iHeight; i++) {
       memcpy (pDst, pSrc, iWidth);	// confirmed_safe_unsafe_usage
@@ -386,7 +374,7 @@
   const int32_t kiDy = mv.iMvY & 0x07;
 
   if (0 == kiDx && 0 == kiDy) {
-    McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+    McCopy_c (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
   } else {
     const int32_t kiDA = g_kuiABCD[kiDy][kiDx][0];
     const int32_t kiDB = g_kuiABCD[kiDy][kiDx][1];
@@ -441,6 +429,117 @@
   McHorVer22Width8VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1,  48, pDst + iWidth - 8, iDstStride, 8, iHeight);
 }
 
+static inline void McHorVer01WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                             int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
+
+  McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+}
+static inline void McHorVer03WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                             int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
+
+  McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+}
+static inline void McHorVer10WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                             int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
+
+  McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+}
+static inline void McHorVer11WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                             int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
+
+  McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+  McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
+}
+static inline void McHorVer12WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                             int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
+
+  McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+  McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
+}
+static inline void McHorVer13WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                             int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
+
+  McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+  McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
+}
+static inline void McHorVer21WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                             int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
+
+  McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+  McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
+}
+static inline void McHorVer23WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                             int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
+
+  McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+  McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
+}
+static inline void McHorVer30WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                             int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
+
+  McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, pTmp, 16, iHeight);
+}
+static inline void McHorVer31WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                             int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
+
+  McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+  McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
+  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
+}
+static inline void McHorVer32WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                             int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
+
+  McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pTmp, 16, iHeight);
+  McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
+}
+static inline void McHorVer33WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                             int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
+
+  McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+  McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
+  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
+}
+
+static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                int32_t iWidth, int32_t iHeight) {
+  int32_t i;
+  if (iWidth == 16)
+    McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McCopyWidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else {
+    for (i = 0; i < iHeight; i++) {
+      memcpy (pDst, pSrc, iWidth);	// confirmed_safe_unsafe_usage
+      pDst += iDstStride;
+      pSrc += iSrcStride;
+    }
+  }
+}
+
 typedef void (*McChromaWidthEqx) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                                   const uint8_t* pABCD, int32_t iHeigh);
 void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
@@ -453,7 +552,7 @@
   };
 
   if (0 == kiD8x && 0 == kiD8y) {
-    McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+    McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
   } else {
     kpfFuncs[ (iWidth >> 3)] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
   }
@@ -469,7 +568,7 @@
     McChromaWidthEq8_ssse3
   };
   if (0 == kiD8x && 0 == kiD8y) {
-    McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+    McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
   } else {
     kpfFuncs[ (iWidth >> 3)] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
   }
@@ -671,17 +770,17 @@
   static const PixelAvgFunc pfPixAvgFunc[2] = {PixelAvgWidthEq8_c, PixelAvgWidthEq16_c};
 
   static const PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16[16] = { //[y*4+x]
-    McCopyWidthEq16_c,  McHorVer10WidthEq16, McHorVer20WidthEq16_c,     McHorVer30WidthEq16,
-    McHorVer01WidthEq16, McHorVer11WidthEq16, McHorVer21WidthEq16, McHorVer31WidthEq16,
-    McHorVer02WidthEq16_c,     McHorVer12WidthEq16, McHorVer22WidthEq16_c,    McHorVer32WidthEq16,
-    McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16
+    McCopyWidthEq16_c,     McHorVer10WidthEq16_c, McHorVer20WidthEq16_c, McHorVer30WidthEq16_c,
+    McHorVer01WidthEq16_c, McHorVer11WidthEq16_c, McHorVer21WidthEq16_c, McHorVer31WidthEq16_c,
+    McHorVer02WidthEq16_c, McHorVer12WidthEq16_c, McHorVer22WidthEq16_c, McHorVer32WidthEq16_c,
+    McHorVer03WidthEq16_c, McHorVer13WidthEq16_c, McHorVer23WidthEq16_c, McHorVer33WidthEq16_c
   };
 #if defined (X86_ASM)
   static const PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_sse2[16] = {
-    McCopyWidthEq16_sse2,  McHorVer10WidthEq16, McHorVer20WidthEq16_sse2,     McHorVer30WidthEq16,
-    McHorVer01WidthEq16, McHorVer11WidthEq16, McHorVer21WidthEq16, McHorVer31WidthEq16,
-    McHorVer02WidthEq16_sse2,     McHorVer12WidthEq16, McHorVer22WidthEq16_sse2,    McHorVer32WidthEq16,
-    McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16
+    McCopyWidthEq16_sse2,     McHorVer10WidthEq16_sse2, McHorVer20WidthEq16_sse2, McHorVer30WidthEq16_sse2,
+    McHorVer01WidthEq16_sse2, McHorVer11WidthEq16_sse2, McHorVer21WidthEq16_sse2, McHorVer31WidthEq16_sse2,
+    McHorVer02WidthEq16_sse2, McHorVer12WidthEq16_sse2, McHorVer22WidthEq16_sse2, McHorVer32WidthEq16_sse2,
+    McHorVer03WidthEq16_sse2, McHorVer13WidthEq16_sse2, McHorVer23WidthEq16_sse2, McHorVer33WidthEq16_sse2
   };
 #endif
 #if defined(HAVE_NEON)
@@ -705,13 +804,6 @@
   pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22_c;
   memcpy (pFuncList->sMcFuncs.pfSampleAveraging, pfPixAvgFunc, sizeof (pfPixAvgFunc));
   pFuncList->sMcFuncs.pfChromaMc	= McChroma_c;
-  McCopyWidthEq4 = McCopyWidthEq4_c;
-  McCopyWidthEq8 = McCopyWidthEq8_c;
-  McCopyWidthEq16 = McCopyWidthEq16_c;
-  pfPixelAvgWidthEq16 = PixelAvgWidthEq16_c;
-  pfMcHorVer02WidthEq16 = McHorVer02WidthEq16_c;
-  pfMcHorVer20WidthEq16 = McHorVer20WidthEq16_c;
-  pfMcHorVer22WidthEq16 = McHorVer22WidthEq16_c;
   memcpy (pFuncList->sMcFuncs.pfLumaQuarpelMc, pWelsMcFuncWidthEq16, sizeof (pWelsMcFuncWidthEq16));
 #if defined (X86_ASM)
   if (uiCpuFlag & WELS_CPU_SSE2) {
@@ -721,13 +813,6 @@
     pFuncList->sMcFuncs.pfSampleAveraging[0] = PixelAvgWidthEq8_mmx;
     pFuncList->sMcFuncs.pfSampleAveraging[1] = PixelAvgWidthEq16_sse2;
     pFuncList->sMcFuncs.pfChromaMc = McChroma_sse2;
-    McCopyWidthEq4 = McCopyWidthEq4_mmx;
-    McCopyWidthEq8 = McCopyWidthEq8_mmx;
-    McCopyWidthEq16 = McCopyWidthEq16_sse2;
-    pfPixelAvgWidthEq16 = PixelAvgWidthEq16_sse2;
-    pfMcHorVer02WidthEq16 = McHorVer02WidthEq16_sse2;
-    pfMcHorVer20WidthEq16 = McHorVer20WidthEq16_sse2;
-    pfMcHorVer22WidthEq16 = McHorVer22WidthEq16_sse2;
     memcpy (pFuncList->sMcFuncs.pfLumaQuarpelMc, pWelsMcFuncWidthEq16_sse2, sizeof (pWelsMcFuncWidthEq16_sse2));
   }