shithub: openh264

Download patch

ref: 021fff491b161c59be8bb8ab793d7a14a3a406c3
parent: b6a765ad71c5ea861694e16adc01f33b1c77b869
parent: a842f14a3c30a81f9454aff675d7f1c9734c47c6
author: Licai Guo <[email protected]>
date: Wed Apr 23 11:19:15 EDT 2014

Merge pull request #735 from mstorsjo/cleanup-mess

Clean up the mess left by merging the motion compensation arm64 neon code

--- a/codec/common/inc/mc_common.h.orig
+++ /dev/null
@@ -1,204 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef MC_COMMON_H
-#define MC_COMMON_H
-
-#include "typedefs.h"
-
-#if defined(__cplusplus)
-extern "C" {
-#endif//__cplusplus
-
-#if defined(HAVE_NEON)
-void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
-
-void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
-
-void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
-void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
-void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
-
-void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-    //horizontal filter to gain half sample, that is (2, 0) location in quarter sample
-void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-    //vertical filter to gain half sample, that is (0, 2) location in quarter sample
-void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-    //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
-void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void PixStrideAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
-void PixStrideAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
-
-void McHorVer20Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
-void McHorVer20Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
-
-void McHorVer02Height17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
-void McHorVer02Height9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
-
-void McHorVer22Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
-void McHorVer22Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
-#endif
-
-#if defined(HAVE_NEON_AARCH64)
-void McCopyWidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McCopyWidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McCopyWidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McChromaWidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
-void McChromaWidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
-void PixelAvgWidthEq16_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-void PixelAvgWidthEq8_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-void PixelAvgWidthEq4_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-void McHorVer01WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer01WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer01WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer03WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer03WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer03WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer10WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer10WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer10WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer30WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer30WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer30WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-    //horizontal filter to gain half sample, that is (2, 0) location in quarter sample
-void McHorVer20WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer20WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer20WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-    //vertical filter to gain half sample, that is (0, 2) location in quarter sample
-void McHorVer02WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer02WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer02WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-    //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
-void McHorVer22WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer22WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer22WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void PixStrideAvgWidthEq16_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
-void PixStrideAvgWidthEq8_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
-void McHorVer20Width17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
-void McHorVer20Width9_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
-void McHorVer02Height17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
-void McHorVer02Height9_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
-void McHorVer22Width17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
-void McHorVer22Width9_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
-#endif
-    
-#if defined(X86_ASM)
-//***************************************************************************//
-//                       MMXEXT definition                                   //
-//***************************************************************************//
-void McHorVer20WidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                             int32_t iHeight);
-void McChromaWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                           const uint8_t* kpABCD, int32_t iHeight);
-void McCopyWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                         int32_t iHeight);
-void McCopyWidthEq8_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                         int32_t iHeight);
-void PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                           const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-void PixelAvgWidthEq8_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                           const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-
-//***************************************************************************//
-//                       SSE2 definition                                     //
-//***************************************************************************//
-void McChromaWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                            const uint8_t* kpABCD, int32_t iHeight);
-void McCopyWidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                           int32_t iHeight);
-void McHorVer20WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-void McHorVer20WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                               int32_t iHeight);
-void McHorVer02WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-void McHorVer22Width8HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iHeight);
-void McHorVer22Width8VerLastAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iWidth, int32_t iHeight);
-void McHorVer22Width8VerLastUnAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
-                                         int32_t iWidth, int32_t iHeight);
-
-void PixelAvgWidthEq16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                             const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-
-void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                int32_t iHeight);
-
-void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                 int32_t iHeight);
-
-void McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride, int32_t iWidth,
-                              int32_t iHeight);
-
-//***************************************************************************//
-//                       SSSE3 definition                                    //
-//***************************************************************************//
-
-void McChromaWidthEq8_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                             const uint8_t* kpABCD, int32_t iHeight);
-
-#endif //X86_ASM
-
-#if defined(__cplusplus)
-}
-#endif//__cplusplus
-
-#endif//MC_COMMON_H
--- a/codec/decoder/core/src/mc.cpp.orig
+++ /dev/null
@@ -1,1305 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	mc.c
- *
- * \brief	Interfaces implementation for motion compensation
- *
- * \date	03/17/2009 Created
- *
- *************************************************************************************
- */
-
-#include "mc.h"
-
-#include "cpu_core.h"
-
-namespace WelsDec {
-
-/*------------------weight for chroma fraction pixel interpolation------------------*/
-//iA = (8 - dx) * (8 - dy);
-//iB = dx * (8 - dy);
-//iC = (8 - dx) * dy;
-//iD = dx * dy
-static const uint8_t g_kuiABCD[8][8][4] = {	//g_kA[dy][dx], g_kB[dy][dx], g_kC[dy][dx], g_kD[dy][dx]
-  {
-    {64, 0, 0, 0}, {56, 8, 0, 0}, {48, 16, 0, 0}, {40, 24, 0, 0},
-    {32, 32, 0, 0}, {24, 40, 0, 0}, {16, 48, 0, 0}, {8, 56, 0, 0}
-  },
-  {
-    {56, 0, 8, 0}, {49, 7, 7, 1}, {42, 14, 6, 2}, {35, 21, 5, 3},
-    {28, 28, 4, 4}, {21, 35, 3, 5}, {14, 42, 2, 6}, {7, 49, 1, 7}
-  },
-  {
-    {48, 0, 16, 0}, {42, 6, 14, 2}, {36, 12, 12, 4}, {30, 18, 10, 6},
-    {24, 24, 8, 8}, {18, 30, 6, 10}, {12, 36, 4, 12}, {6, 42, 2, 14}
-  },
-  {
-    {40, 0, 24, 0}, {35, 5, 21, 3}, {30, 10, 18, 6}, {25, 15, 15, 9},
-    {20, 20, 12, 12}, {15, 25, 9, 15}, {10, 30, 6, 18}, {5, 35, 3, 21}
-  },
-  {
-    {32, 0, 32, 0}, {28, 4, 28, 4}, {24, 8, 24, 8}, {20, 12, 20, 12},
-    {16, 16, 16, 16}, {12, 20, 12, 20}, {8, 24, 8, 24}, {4, 28, 4, 28}
-  },
-  {
-    {24, 0, 40, 0}, {21, 3, 35, 5}, {18, 6, 30, 10}, {15, 9, 25, 15},
-    {12, 12, 20, 20}, {9, 15, 15, 25}, {6, 18, 10, 30}, {3, 21, 5, 35}
-  },
-  {
-    {16, 0, 48, 0}, {14, 2, 42, 6}, {12, 4, 36, 12}, {10, 6, 30, 18},
-    {8, 8, 24, 24}, {6, 10, 18, 30}, {4, 12, 12, 36}, {2, 14, 6, 42}
-  },
-  {
-    {8, 0, 56, 0}, {7, 1, 49, 7}, {6, 2, 42, 14}, {5, 3, 35, 21},
-    {4, 4, 28, 28}, {3, 5, 21, 35}, {2, 6, 14, 42}, {1, 7, 7, 49}
-  }
-};
-
-typedef void (*PWelsMcWidthHeightFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iWidth, int32_t iHeight);
-
-//***************************************************************************//
-//                          C code implementation                            //
-//***************************************************************************//
-static inline void McCopyWidthEq2_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                       int32_t iHeight) {
-  int32_t i;
-  for (i = 0; i < iHeight; i++) { // iWidth == 2 only for chroma
-    ST16A2 (pDst, LD16 (pSrc));
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-
-static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                       int32_t iHeight) {
-  int32_t i;
-  for (i = 0; i < iHeight; i++) {
-    ST32A4 (pDst, LD32 (pSrc));
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-
-static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                       int32_t iHeight) {
-  int32_t i;
-  for (i = 0; i < iHeight; i++) {
-    ST64A8 (pDst, LD64 (pSrc));
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-
-static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
-  int32_t i;
-  for (i = 0; i < iHeight; i++) {
-    ST64A8 (pDst  , LD64 (pSrc));
-    ST64A8 (pDst + 8, LD64 (pSrc + 8));
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-
-//--------------------Luma sample MC------------------//
-
-static inline int32_t HorFilterInput16bit_c (int16_t* pSrc) {
-  int32_t iPix05 = pSrc[-2] + pSrc[3];
-  int32_t iPix14 = pSrc[-1] + pSrc[2];
-  int32_t iPix23 = pSrc[ 0] + pSrc[1];
-
-  return (iPix05 - (iPix14 * 5)+ (iPix23 * 20));
-}
-// h: iOffset=1 / v: iOffset=iSrcStride
-static inline int32_t FilterInput8bitWithStride_c (const uint8_t* pSrc, const int32_t kiOffset) {
-  const int32_t kiOffset1 = kiOffset;
-  const int32_t kiOffset2 = (kiOffset << 1);
-  const int32_t kiOffset3 = kiOffset + kiOffset2;
-  const uint32_t kuiPix05   = * (pSrc - kiOffset2) + * (pSrc + kiOffset3);
-  const uint32_t kuiPix14   = * (pSrc - kiOffset1) + * (pSrc + kiOffset2);
-  const uint32_t kuiPix23   = * (pSrc) + * (pSrc + kiOffset1);
-
-  return (kuiPix05 - ((kuiPix14 << 2) + kuiPix14) + (kuiPix23 << 4) + (kuiPix23 << 2));
-}
-
-static inline void PixelAvg_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                                 const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < iWidth; j++) {
-      pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1;
-    }
-    pDst  += iDstStride;
-    pSrcA += iSrcAStride;
-    pSrcB += iSrcBStride;
-  }
-}
-static inline void McCopy_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                               int32_t iHeight) {
-  if (iWidth == 16)
-    McCopyWidthEq16_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McCopyWidthEq8_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else //here iWidth == 2
-    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-
-static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < iWidth; j++) {
-      pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, 1) + 16) >> 5);
-    }
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-
-static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < iWidth; j++) {
-      pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, iSrcStride) + 16) >> 5);
-    }
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-
-static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
-  int16_t iTmp[16 + 5]; //16
-  int32_t i, j, k;
-
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < iWidth + 5; j++) {
-      iTmp[j] = FilterInput8bitWithStride_c (pSrc - 2 + j, iSrcStride);
-    }
-    for (k = 0; k < iWidth; k++) {
-      pDst[k] = WelsClip1 ((HorFilterInput16bit_c (&iTmp[2 + k]) + 512) >> 10);
-    }
-    pSrc += iSrcStride;
-    pDst += iDstStride;
-  }
-}
-
-/////////////////////luma MC//////////////////////////
-static inline void McHorVer01_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
-  uint8_t uiTmp[256];
-  McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer03_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
-  uint8_t uiTmp[256];
-  McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, uiTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer10_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
-  uint8_t uiTmp[256];
-  McHorVer20_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer11_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
-  uint8_t uiHorTmp[256];
-  uint8_t uiVerTmp[256];
-  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-  McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer12_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
-  uint8_t uiVerTmp[256];
-  uint8_t uiCtrTmp[256];
-  McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer13_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
-  uint8_t uiHorTmp[256];
-  uint8_t uiVerTmp[256];
-  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-  McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer21_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
-  uint8_t uiHorTmp[256];
-  uint8_t uiCtrTmp[256];
-  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer23_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
-  uint8_t uiHorTmp[256];
-  uint8_t uiCtrTmp[256];
-  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer30_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
-  uint8_t uiHorTmp[256];
-  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, pSrc + 1, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer31_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
-  uint8_t uiHorTmp[256];
-  uint8_t uiVerTmp[256];
-  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-  McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer32_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
-  uint8_t uiVerTmp[256];
-  uint8_t uiCtrTmp[256];
-  McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer33_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
-  uint8_t uiHorTmp[256];
-  uint8_t uiVerTmp[256];
-  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-  McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
-}
-
-void McLuma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
-//pSrc has been added the offset of mv
-{
-  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
-    {McCopy_c,      McHorVer01_c, McHorVer02_c, McHorVer03_c},
-    {McHorVer10_c,  McHorVer11_c, McHorVer12_c, McHorVer13_c},
-    {McHorVer20_c,  McHorVer21_c, McHorVer22_c, McHorVer23_c},
-    {McHorVer30_c,  McHorVer31_c, McHorVer32_c, McHorVer33_c},
-  };
-
-  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-}
-
-static inline void McChromaWithFragMv_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
-  int32_t i, j;
-  int32_t iA, iB, iC, iD;
-  const uint8_t* pSrcNext = pSrc + iSrcStride;
-  const uint8_t *pABCD = g_kuiABCD[iMvY & 0x07][iMvX & 0x07];
-  iA = pABCD[0];
-  iB = pABCD[1];
-  iC = pABCD[2];
-  iD = pABCD[3];
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < iWidth; j++) {
-      pDst[j] = (iA * pSrc[j] + iB * pSrc[j + 1] + iC * pSrcNext[j] + iD * pSrcNext[j + 1] + 32) >> 6;
-    }
-    pDst     += iDstStride;
-    pSrc      = pSrcNext;
-    pSrcNext += iSrcStride;
-  }
-}
-
-void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
-//pSrc has been added the offset of mv
-{
-  const int32_t kiD8x = iMvX & 0x07;
-  const int32_t kiD8y = iMvY & 0x07;
-  if (0 == kiD8x && 0 == kiD8y)
-    McCopy_c (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-  else
-    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
-}
-
-#if defined(X86_ASM)
-//***************************************************************************//
-//                       SSE2 implement                          //
-//***************************************************************************//
-static inline void McHorVer22WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16)
-  McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5);
-  McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight);
-}
-
-static inline void McHorVer02WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  McHorVer02WidthEq8_sse2 (pSrc,     iSrcStride, pDst,     iDstStride, iHeight);
-  McHorVer02WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
-}
-
-static inline void McHorVer22WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  McHorVer22WidthEq8_sse2 (pSrc,     iSrcStride, pDst,     iDstStride, iHeight);
-  McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
-}
-
-static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                  int32_t iHeight) {
-  if (iWidth == 16)
-    McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McCopyWidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else
-    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-
-static inline void McHorVer20_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else
-    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-
-static inline void McHorVer02_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else
-    McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
-}
-
-static inline void McHorVer22_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else
-    McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
-}
-
-static inline void McHorVer01_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-  } else {
-    McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer03_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  } else {
-    McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer10_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-  } else {
-    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pTmp, 16, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer11_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else {
-    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer12_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  } else {
-    McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
-    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer13_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq16_sse2 (pSrc,            iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq8_sse2 (pSrc,            iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else {
-    McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02_c (pSrc,            iSrcStride, pVerTmp, 16, 4 , iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer21_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  } else {
-    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer23_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq16_sse2 (pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq8_sse2 (pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  } else {
-    McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22_c (pSrc,            iSrcStride, pCtrTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer30_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
-  } else {
-    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer31_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_sse2 (pSrc,   iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else {
-    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer32_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq16_sse2 (pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq8_sse2 (pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  } else {
-    McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
-    McHorVer22_c (pSrc,   iSrcStride, pCtrTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer33_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq16_sse2 (pSrc + 1,          iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq8_sse2 (pSrc + 1,          iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else {
-    McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02_c (pSrc + 1,          iSrcStride, pVerTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  }
-}
-
-void McLuma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
-//pSrc has been added the offset of mv
-{
-  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
-    {McCopy_sse2,     McHorVer01_sse2, McHorVer02_sse2, McHorVer03_sse2},
-    {McHorVer10_sse2, McHorVer11_sse2, McHorVer12_sse2, McHorVer13_sse2},
-    {McHorVer20_sse2, McHorVer21_sse2, McHorVer22_sse2, McHorVer23_sse2},
-    {McHorVer30_sse2, McHorVer31_sse2, McHorVer32_sse2, McHorVer33_sse2},
-  };
-
-  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-}
-
-void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
-  static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
-				McChromaWidthEq4_mmx,
-    McChromaWidthEq8_sse2
-  };
-  const int32_t kiD8x = iMvX & 0x07;
-  const int32_t kiD8y = iMvY & 0x07;
-  if (kiD8x == 0 && kiD8y == 0) {
-    McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-    return;
-  }
-  if (iWidth != 2) {
-    kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
-  } else
-    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
-}
-
-#endif //X86_ASM
-//***************************************************************************//
-//                       NEON implementation                      //
-//***************************************************************************//
-#if defined(HAVE_NEON)
-void McCopy_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																						int32_t iWidth, int32_t iHeight)
-{
-  if (16 == iWidth)
-				McCopyWidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if(8 == iWidth)
-				McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else
-				McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		}
-void McHorVer20_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																				int32_t iWidth, int32_t iHeight)
-{
-  if (iWidth == 16)
-	   McHorVer20WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 8)
-				McHorVer20WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 4)
-				McHorVer20WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer02_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																				int32_t iWidth, int32_t iHeight)
-{
-		if (iWidth == 16)
-				McHorVer02WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 8)
-				McHorVer02WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 4)
-				McHorVer02WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer22_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																				int32_t iWidth, int32_t iHeight)
-{
-		if (iWidth == 16)
-    McHorVer22WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 8)
-				McHorVer22WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 4)
-				McHorVer22WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-
-void McHorVer01_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  if (iWidth == 16)
-				McHorVer01WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-				McHorVer01WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-				McHorVer01WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer03_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  if (iWidth == 16)
-				McHorVer03WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 8)
-				McHorVer03WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 4)
-				McHorVer03WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer10_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  if (iWidth == 16)
-				McHorVer10WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 8)
-				McHorVer10WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 4)
-				McHorVer10WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-  if (iWidth == 16)
-		{
-				McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-		else if (iWidth == 8)
-		{
-				McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-  }
-		else if (iWidth == 4)
-		{
-				McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-}
-void McHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
-  if (iWidth == 16)
-		{
-				McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq16_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
-  }
-		else if (iWidth == 8)
-		{
-				McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq8_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
-  }
-		else if (iWidth == 4)
-		{
-				McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq4_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
-  }
-}
-void McHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-  if (iWidth == 16)
-  {
-				McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-		else if (iWidth == 8)
-		{
-				McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-		else if (iWidth == 4)
-		{
-				McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-}
-void McHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
-  if (iWidth == 16)
-		{
-				McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-		}
-		else if (iWidth == 8)
-		{
-				McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-		}
-		else if (iWidth == 4)
-		{
-				McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-		}
-}
-void McHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
-  if (iWidth == 16)
-		{
-				McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-		}
-  else if (iWidth == 8)
-		{
-				McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-		}
-		else if (iWidth == 4)
-		{
-				McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-		}
-}
-void McHorVer30_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  if (iWidth == 16)
-				McHorVer30WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 8)
-				McHorVer30WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 4)
-				McHorVer30WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-  if (iWidth == 16) {
-				McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-		else if (iWidth == 8){
-				McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-		else if (iWidth == 4)
-		{
-				McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-}
-void McHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
-  if (iWidth == 16)
-		{
-				McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq16_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
-		}
-		else if (iWidth == 8)
-		{
-				McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq8_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
-		}
-		else if (iWidth == 4)
-		{
-				McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq4_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
-		}
-}
-void McHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-  if (iWidth == 16)
-		{
-				McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-		else if (iWidth == 8)
-		{
-				McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-		else if (iWidth == 4)
-		{
-				McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-}
-
-void McLuma_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-											int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
-{
-  static PWelsMcWidthHeightFunc pWelsMcFunc[4][4] =  //[x][y]
-  {
-				{McCopy_neon,  McHorVer01_neon, McHorVer02_neon,    McHorVer03_neon},
-				{McHorVer10_neon, McHorVer11_neon, McHorVer12_neon, McHorVer13_neon},
-				{McHorVer20_neon,    McHorVer21_neon, McHorVer22_neon,    McHorVer23_neon},
-				{McHorVer30_neon, McHorVer31_neon, McHorVer32_neon, McHorVer33_neon},
-		};
-  //	pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2);
-  pWelsMcFunc[iMvX&0x03][iMvY&0x03](pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-}
-void McChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
-												int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
-{
-  if (0 == iMvX && 0 == iMvY)
-		{
-				if(8 == iWidth)
-				  McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-				else if(iWidth == 4)
-				  McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-				else //here iWidth == 2
-				  McCopyWidthEq2_c(pSrc,iSrcStride,pDst,iDstStride,iHeight);
-		}
-		else
-		{
-				const int32_t kiD8x = iMvX & 0x07;
-				const int32_t kiD8y = iMvY & 0x07;
-				if(8 == iWidth)
-				  McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
-				else if(4 == iWidth)
-				  McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
-				else //here iWidth == 2
-				  McChromaWithFragMv_c(pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
-		}
-}
-#endif
-#if defined(HAVE_NEON_AARCH64)
-void McCopy_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                 int32_t iWidth, int32_t iHeight)
-{
-    if (16 == iWidth)
-        McCopyWidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if(8 == iWidth)
-        McCopyWidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else
-        McCopyWidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer20_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     int32_t iWidth, int32_t iHeight)
-{
-    if (iWidth == 16)
-        McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if (iWidth == 8)
-        McHorVer20WidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if (iWidth == 4)
-        McHorVer20WidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer02_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     int32_t iWidth, int32_t iHeight)
-{
-    if (iWidth == 16)
-        McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if (iWidth == 8)
-        McHorVer02WidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if (iWidth == 4)
-        McHorVer02WidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer22_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     int32_t iWidth, int32_t iHeight)
-{
-    if (iWidth == 16)
-        McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if (iWidth == 8)
-        McHorVer22WidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if (iWidth == 4)
-        McHorVer22WidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-
-void McHorVer01_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     int32_t iWidth, int32_t iHeight)
-{
-    if (iWidth == 16)
-        McHorVer01WidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if (iWidth == 8)
-        McHorVer01WidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if (iWidth == 4)
-        McHorVer01WidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer03_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     int32_t iWidth, int32_t iHeight)
-{
-    if (iWidth == 16)
-        McHorVer03WidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if (iWidth == 8)
-        McHorVer03WidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if (iWidth == 4)
-        McHorVer03WidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer10_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     int32_t iWidth, int32_t iHeight)
-{
-    if (iWidth == 16)
-        McHorVer10WidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if (iWidth == 8)
-        McHorVer10WidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if (iWidth == 4)
-        McHorVer10WidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer11_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     int32_t iWidth, int32_t iHeight)
-{
-    ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-    ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-    if (iWidth == 16)
-    {
-        McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-        PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-    }
-    else if (iWidth == 8)
-    {
-        McHorVer20WidthEq8_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer02WidthEq8_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-        PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-    }
-    else if (iWidth == 4)
-    {
-        McHorVer20WidthEq4_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer02WidthEq4_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-        PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-    }
-}
-void McHorVer12_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     int32_t iWidth, int32_t iHeight)
-{
-    ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-    ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
-    if (iWidth == 16)
-    {
-        McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-        McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-        PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-    }
-    else if (iWidth == 8)
-    {
-        McHorVer02WidthEq8_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-        McHorVer22WidthEq8_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-        PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-    }
-    else if (iWidth == 4)
-    {
-        McHorVer02WidthEq4_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-        McHorVer22WidthEq4_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-        PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-    }
-}
-void McHorVer13_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     int32_t iWidth, int32_t iHeight)
-{
-    ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-    ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-    if (iWidth == 16)
-    {
-        McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-        PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-    }
-    else if (iWidth == 8)
-    {
-        McHorVer20WidthEq8_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer02WidthEq8_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-        PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-    }
-    else if (iWidth == 4)
-    {
-        McHorVer20WidthEq4_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer02WidthEq4_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-        PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-    }
-}
-void McHorVer21_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     int32_t iWidth, int32_t iHeight)
-{
-    ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-    ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
-    if (iWidth == 16)
-    {
-        McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-        PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-    }
-    else if (iWidth == 8)
-    {
-        McHorVer20WidthEq8_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer22WidthEq8_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-        PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-    }
-    else if (iWidth == 4)
-    {
-        McHorVer20WidthEq4_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer22WidthEq4_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-        PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-    }
-}
-void McHorVer23_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     int32_t iWidth, int32_t iHeight)
-{
-    ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-    ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
-    if (iWidth == 16)
-    {
-        McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-        PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-    }
-    else if (iWidth == 8)
-    {
-        McHorVer20WidthEq8_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer22WidthEq8_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-        PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-    }
-    else if (iWidth == 4)
-    {
-        McHorVer20WidthEq4_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer22WidthEq4_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-        PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-    }
-}
-void McHorVer30_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     int32_t iWidth, int32_t iHeight)
-{
-    if (iWidth == 16)
-        McHorVer30WidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if (iWidth == 8)
-        McHorVer30WidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if (iWidth == 4)
-        McHorVer30WidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer31_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     int32_t iWidth, int32_t iHeight)
-{
-    ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-    ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-    if (iWidth == 16) {
-        McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-        PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-    }
-    else if (iWidth == 8){
-        McHorVer20WidthEq8_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer02WidthEq8_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-        PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-    }
-    else if (iWidth == 4)
-    {
-        McHorVer20WidthEq4_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer02WidthEq4_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-        PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-    }
-}
-void McHorVer32_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     int32_t iWidth, int32_t iHeight)
-{
-    ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-    ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
-    if (iWidth == 16)
-    {
-        McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-        McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-        PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-    }
-    else if (iWidth == 8)
-    {
-        McHorVer02WidthEq8_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-        McHorVer22WidthEq8_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-        PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-    }
-    else if (iWidth == 4)
-    {
-        McHorVer02WidthEq4_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-        McHorVer22WidthEq4_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-        PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-    }
-}
-void McHorVer33_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     int32_t iWidth, int32_t iHeight)
-{
-    ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-    ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-    if (iWidth == 16)
-    {
-        McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-        PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-    }
-    else if (iWidth == 8)
-    {
-        McHorVer20WidthEq8_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer02WidthEq8_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-        PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-    }
-    else if (iWidth == 4)
-    {
-        McHorVer20WidthEq4_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-        McHorVer02WidthEq4_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-        PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-    }
-}
-
-void McLuma_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
-{
-    static PWelsMcWidthHeightFunc pWelsMcFunc[4][4] =  //[x][y]
-    {
-        {McCopy_AArch64_neon,  McHorVer01_AArch64_neon, McHorVer02_AArch64_neon,    McHorVer03_AArch64_neon},
-        {McHorVer10_AArch64_neon, McHorVer11_AArch64_neon, McHorVer12_AArch64_neon, McHorVer13_AArch64_neon},
-        {McHorVer20_AArch64_neon,    McHorVer21_AArch64_neon, McHorVer22_AArch64_neon,    McHorVer23_AArch64_neon},
-        {McHorVer30_AArch64_neon, McHorVer31_AArch64_neon, McHorVer32_AArch64_neon, McHorVer33_AArch64_neon},
-    };
-    //	pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2);
-    pWelsMcFunc[iMvX&0x03][iMvY&0x03](pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-}
-void McChroma_AArch64_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
-                   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
-{
-    if (0 == iMvX && 0 == iMvY)
-    {
-        if(8 == iWidth)
-            McCopyWidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-        else if(iWidth == 4)
-            McCopyWidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-        else //here iWidth == 2
-            McCopyWidthEq2_c(pSrc,iSrcStride,pDst,iDstStride,iHeight);
-    }
-    else
-    {
-        const int32_t kiD8x = iMvX & 0x07;
-        const int32_t kiD8y = iMvY & 0x07;
-        if(8 == iWidth)
-            McChromaWidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
-        else if(4 == iWidth)
-            McChromaWidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
-        else //here iWidth == 2
-            McChromaWithFragMv_c(pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
-    }
-}
-#endif
-
-void InitMcFunc (SMcFunc* pMcFunc, int32_t iCpu) {
-  pMcFunc->pMcLumaFunc   = McLuma_c;
-  pMcFunc->pMcChromaFunc = McChroma_c;
-
-#ifdef	HAVE_NEON
-  if ( iCpu & WELS_CPU_NEON ) {
-	   pMcFunc->pMcLumaFunc	  = McLuma_neon;
-	   pMcFunc->pMcChromaFunc  = McChroma_neon;
-		}
-#endif
-#ifdef	HAVE_NEON_AARCH64
-    if ( iCpu & WELS_CPU_NEON ) {
-        pMcFunc->pMcLumaFunc	  = McLuma_AArch64_neon;
-        pMcFunc->pMcChromaFunc  = McChroma_AArch64_neon;
-    }
-#endif
-#if defined (X86_ASM)
-  if (iCpu & WELS_CPU_SSE2) {
-  pMcFunc->pMcLumaFunc   = McLuma_sse2;
-  pMcFunc->pMcChromaFunc = McChroma_sse2;
-  }
-#endif //(X86_ASM)
-}
-
-} // namespace WelsDec
--- a/codec/encoder/core/src/mc.cpp.orig
+++ /dev/null
@@ -1,762 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	mc.c
- *
- * \brief	Interfaces implementation for motion compensation
- *
- * \date	03/17/2009 Created
- *
- *************************************************************************************
- */
-
-#include "mc.h"
-#include "cpu_core.h"
-
-namespace WelsSVCEnc {
-/*------------------weight for chroma fraction pixel interpolation------------------*/
-//kuiA = (8 - dx) * (8 - dy);
-//kuiB = dx * (8 - dy);
-//kuiC = (8 - dx) * dy;
-//kuiD = dx * dy
-static const uint8_t g_kuiABCD[8][8][4] = { ////g_kuiA[dy][dx], g_kuiB[dy][dx], g_kuiC[dy][dx], g_kuiD[dy][dx]
-  {
-    {64, 0, 0, 0}, {56, 8, 0, 0}, {48, 16, 0, 0}, {40, 24, 0, 0},
-    {32, 32, 0, 0}, {24, 40, 0, 0}, {16, 48, 0, 0}, {8, 56, 0, 0}
-  },
-  {
-    {56, 0, 8, 0}, {49, 7, 7, 1}, {42, 14, 6, 2}, {35, 21, 5, 3},
-    {28, 28, 4, 4}, {21, 35, 3, 5}, {14, 42, 2, 6}, {7, 49, 1, 7}
-  },
-  {
-    {48, 0, 16, 0}, {42, 6, 14, 2}, {36, 12, 12, 4}, {30, 18, 10, 6},
-    {24, 24, 8, 8}, {18, 30, 6, 10}, {12, 36, 4, 12}, {6, 42, 2, 14}
-  },
-  {
-    {40, 0, 24, 0}, {35, 5, 21, 3}, {30, 10, 18, 6}, {25, 15, 15, 9},
-    {20, 20, 12, 12}, {15, 25, 9, 15}, {10, 30, 6, 18}, {5, 35, 3, 21}
-  },
-  {
-    {32, 0, 32, 0}, {28, 4, 28, 4}, {24, 8, 24, 8}, {20, 12, 20, 12},
-    {16, 16, 16, 16}, {12, 20, 12, 20}, {8, 24, 8, 24}, {4, 28, 4, 28}
-  },
-  {
-    {24, 0, 40, 0}, {21, 3, 35, 5}, {18, 6, 30, 10}, {15, 9, 25, 15},
-    {12, 12, 20, 20}, {9, 15, 15, 25}, {6, 18, 10, 30}, {3, 21, 5, 35}
-  },
-  {
-    {16, 0, 48, 0}, {14, 2, 42, 6}, {12, 4, 36, 12}, {10, 6, 30, 18},
-    {8, 8, 24, 24}, {6, 10, 18, 30}, {4, 12, 12, 36}, {2, 14, 6, 42}
-  },
-  {
-    {8, 0, 56, 0}, {7, 1, 49, 7}, {6, 2, 42, 14}, {5, 3, 35, 21},
-    {4, 4, 28, 28}, {3, 5, 21, 35}, {2, 6, 14, 42}, {1, 7, 7, 49}
-  }
-};
-typedef int32_t (*VerFilterFunc) (const uint8_t* pSrc, const int32_t kiSrcStride);
-typedef int32_t (*HorFilterFunc) (const uint8_t* pSrc);
-typedef int32_t (*HorFilterFuncInput16Bits) (int16_t* pSrc);
-
-VerFilterFunc fpVerFilter			= NULL;
-HorFilterFunc fpHorFilter			= NULL;
-HorFilterFuncInput16Bits fpHorFilterInput16Bits = NULL;
-
-typedef void (*WelsMcFunc0) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-typedef void (*WelsMcFunc1) (uint8_t* pDst, int32_t iDstStride, const uint8_t* psrcA, int32_t iSrcAStride,  const uint8_t* pSrcB,
-                             int32_t iSrcBStride, int32_t iHeight);
-WelsMcFunc0 McCopyWidthEq16 = NULL;
-WelsMcFunc0 McCopyWidthEq8 = NULL;
-WelsMcFunc0 McCopyWidthEq4 = NULL;
-WelsMcFunc0 pfMcHorVer02WidthEq16 = NULL;
-WelsMcFunc1 pfPixelAvgWidthEq16  = NULL;
-WelsMcFunc0 pfMcHorVer20WidthEq16 = NULL;
-WelsMcFunc0 pfMcHorVer22WidthEq16 = NULL;
-
-//***************************************************************************//
-//                          C code implementation                            //
-//***************************************************************************//
-static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                     int32_t iHeight) {
-  int32_t i;
-  for (i = 0; i < iHeight; i++) {
-    memcpy (pDst, pSrc, 4);	// confirmed_safe_unsafe_usage
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-
-static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                     int32_t iHeight)
-
-{
-  int32_t i;
-  for (i = 0; i < iHeight; i++) {
-    memcpy (pDst, pSrc, 8);	// confirmed_safe_unsafe_usage
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight) {
-  int32_t i;
-  for (i = 0; i < iHeight; i++) {
-    memcpy (pDst, pSrc, 16);	// confirmed_safe_unsafe_usage
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-
-//--------------------Luma sample MC------------------//
-static inline int32_t HorFilter_c (const uint8_t* pSrc) {
-  int32_t iPix05 = pSrc[-2] + pSrc[3];
-  int32_t iPix14 = pSrc[-1] + pSrc[2];
-  int32_t iPix23 = pSrc[ 0] + pSrc[1];
-
-  return (iPix05 - ((iPix14 << 2) + iPix14) + (iPix23 << 4) + (iPix23 << 2));
-}
-
-static inline int32_t HorFilterInput16bit1_c (int16_t* pSrc) {
-  int32_t iPix05 = pSrc[-2] + pSrc[3];
-  int32_t iPix14 = pSrc[-1] + pSrc[2];
-  int32_t iPix23 = pSrc[ 0] + pSrc[1];
-
-  return (iPix05 - ((iPix14 << 2) + iPix14) + (iPix23 << 4) + (iPix23 << 2));
-}
-static inline int32_t VerFilter_c (const uint8_t* pSrc, const int32_t kiSrcStride) {
-  const int32_t kiLine1	= kiSrcStride;
-  const int32_t kiLine2	= (kiSrcStride << 1);
-  const int32_t kiLine3 = kiLine1 + kiLine2;
-  const uint32_t kuiPix05 = * (pSrc - kiLine2) + * (pSrc + kiLine3);
-  const uint32_t kuiPix14 = * (pSrc - kiLine1) + * (pSrc + kiLine2);
-  const uint32_t kuiPix23 = * (pSrc) + * (pSrc + kiLine1);
-
-  return (kuiPix05 - ((kuiPix14 << 2) + kuiPix14) + (kuiPix23 << 4) + (kuiPix23 << 2));
-}
-
-static inline void PixelAvgWidthEq8_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                                       const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < 8; j++) {
-      pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1;
-    }
-    pDst  += iDstStride;
-    pSrcA += iSrcAStride;
-    pSrcB += iSrcBStride;
-  }
-}
-static inline void PixelAvgWidthEq16_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                                        const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < 16; j++) {
-      pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1;
-    }
-    pDst  += iDstStride;
-    pSrcA += iSrcAStride;
-    pSrcB += iSrcBStride;
-  }
-}
-
-//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
-static inline void McHorVer20WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < 16; j++) {
-      pDst[j] = WelsClip1 ((fpHorFilter (pSrc + j) + 16) >> 5);
-    }
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-//vertical filter to gain half sample, that is (0, 2) location in quarter sample
-static inline void McHorVer02WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < 16; j++) {
-      pDst[j] = WelsClip1 ((fpVerFilter (pSrc + j, iSrcStride) + 16) >> 5);
-    }
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
-static inline void McHorVer22WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  int16_t pTmp[16 + 5] = {0}; //16
-  int32_t i, j, k;
-
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < 16 + 5; j++) {
-      pTmp[j] = fpVerFilter (pSrc - 2 + j, iSrcStride);
-    }
-    for (k = 0; k < 16; k++) {
-      pDst[k] = WelsClip1 ((fpHorFilterInput16Bits (&pTmp[2 + k]) + 512) >> 10);
-    }
-    pSrc += iSrcStride;
-    pDst += iDstStride;
-  }
-}
-
-/////////////////////luma MC//////////////////////////
-
-static inline void McHorVer01WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
-
-  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-}
-static inline void McHorVer03WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
-
-  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-}
-static inline void McHorVer10WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
-
-  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-}
-static inline void McHorVer11WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer12WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer13WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer21WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer23WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer30WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
-
-  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc + 1, iSrcStride, pTmp, 16, iHeight);
-}
-static inline void McHorVer31WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer32WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, pTmp, 16, iHeight);
-  pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer33WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
-  pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-
-static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                 int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < iWidth; j++) {
-      pDst[j] = WelsClip1 ((fpHorFilter (pSrc + j) + 16) >> 5);
-    }
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-//vertical filter to gain half sample, that is (0, 2) location in quarter sample
-static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                 int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < iWidth; j++) {
-      pDst[j] = WelsClip1 ((fpVerFilter (pSrc + j, iSrcStride) + 16) >> 5);
-    }
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
-static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                 int32_t iHeight) {
-  int16_t pTmp[17 + 5] = {0}; //w+1
-  int32_t i, j, k;
-
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < iWidth + 5; j++) {
-      pTmp[j] = fpVerFilter (pSrc - 2 + j, iSrcStride);
-    }
-    for (k = 0; k < iWidth; k++) {
-      pDst[k] = WelsClip1 ((fpHorFilterInput16Bits (&pTmp[2 + k]) + 512) >> 10);
-    }
-    pSrc += iSrcStride;
-    pDst += iDstStride;
-  }
-}
-static inline void McCopy (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                           int32_t iHeight) {
-  int32_t i;
-  if (iWidth == 16 && McCopyWidthEq16 != NULL)
-    McCopyWidthEq16 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8 && McCopyWidthEq8 != NULL)
-    McCopyWidthEq8 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4 && McCopyWidthEq4 != NULL)
-    McCopyWidthEq4 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else {
-    for (i = 0; i < iHeight; i++) {
-      memcpy (pDst, pSrc, iWidth);	// confirmed_safe_unsafe_usage
-      pDst += iDstStride;
-      pSrc += iSrcStride;
-    }
-  }
-}
-
-void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                 SMVUnitXY mv, int32_t iWidth, int32_t iHeight)
-//pSrc has been added the offset of mv
-{
-  const int32_t kiDx = mv.iMvX & 0x07;
-  const int32_t kiDy = mv.iMvY & 0x07;
-
-  if (0 == kiDx && 0 == kiDy) {
-    McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-  } else {
-    const int32_t kiDA = g_kuiABCD[kiDy][kiDx][0];
-    const int32_t kiDB = g_kuiABCD[kiDy][kiDx][1];
-    const int32_t kiDC = g_kuiABCD[kiDy][kiDx][2];
-    const int32_t kiDD = g_kuiABCD[kiDy][kiDx][3];
-
-    int32_t i, j;
-
-    const uint8_t* pSrcNext = pSrc + iSrcStride;
-
-    for (i = 0; i < iHeight; i++) {
-      for (j = 0; j < iWidth; j++) {
-        pDst[j] = (kiDA * pSrc[j] + kiDB * pSrc[j + 1] + kiDC * pSrcNext[j] + kiDD * pSrcNext[j + 1] + 32) >> 6;
-      }
-      pDst += iDstStride;
-      pSrc = pSrcNext;
-      pSrcNext += iSrcStride;
-    }
-  }
-}
-//***************************************************************************//
-//                       MMXEXT and SSE2 implementation                      //
-//***************************************************************************//
-#if defined(X86_ASM)
-
-static inline void McHorVer22WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 21, 8, 16)
-  McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 16, iHeight + 5);
-  McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap, 16, pDst, iDstStride, 8, iHeight);
-}
-
-//2010.2.5
-
-static inline void McHorVer02WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* PDst, int32_t iDstStride,
-    int32_t iHeight) {
-  McHorVer02WidthEq8_sse2 (pSrc,     iSrcStride, PDst,     iDstStride, iHeight);
-  McHorVer02WidthEq8_sse2 (&pSrc[8], iSrcStride, &PDst[8], iDstStride, iHeight);
-}
-static inline void McHorVer22WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  McHorVer22WidthEq8_sse2 (pSrc,     iSrcStride, pDst,     iDstStride, iHeight);
-  McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
-}
-void McHorVer22Width9Or17Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iWidth,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16)
-  int32_t tmp1 = 2 * (iWidth - 8);
-  McHorVer22HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
-  McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap,  48, pDst, iDstStride, iWidth - 1, iHeight);
-  McHorVer22Width8VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1,  48, pDst + iWidth - 8, iDstStride, 8, iHeight);
-}
-
-typedef void (*McChromaWidthEqx) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                  const uint8_t* pABCD, int32_t iHeigh);
-void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                    SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
-  const int32_t kiD8x = sMv.iMvX & 0x07;
-  const int32_t kiD8y = sMv.iMvY & 0x07;
-  static const McChromaWidthEqx kpfFuncs[2] = {
-    McChromaWidthEq4_mmx,
-    McChromaWidthEq8_sse2
-  };
-
-  if (0 == kiD8x && 0 == kiD8y) {
-    McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-  } else {
-    kpfFuncs[ (iWidth >> 3)] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
-  }
-}
-
-void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
-  const int32_t kiD8x = sMv.iMvX & 0x07;
-  const int32_t kiD8y = sMv.iMvY & 0x07;
-
-  static const McChromaWidthEqx kpfFuncs[2] = {
-    McChromaWidthEq4_mmx,
-    McChromaWidthEq8_ssse3
-  };
-  if (0 == kiD8x && 0 == kiD8y) {
-    McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-  } else {
-    kpfFuncs[ (iWidth >> 3)] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
-  }
-
-}
-
-#endif //X86_ASM
-
-    //***************************************************************************//
-    //                       NEON implementation                      //
-    //***************************************************************************//
-#if defined(HAVE_NEON)
-void McHorVer20Width9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                   int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 17)
-    McHorVer20Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else //if (iWidth == 9)
-    McHorVer20Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer02Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight){
-  if (iWidth == 16)
-    McHorVer02Height17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else //if (iWidth == 8)
-    McHorVer02Height9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer22Width9Or17Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                              int32_t iWidth, int32_t iHeight){
-  if (iWidth == 17)
-    McHorVer22Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else //if (iWidth == 9)
-    McHorVer22Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void EncMcHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
-}
-void EncMcHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-  McHorVer02WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
-}
-void EncMcHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
-}
-void EncMcHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
-}
-void EncMcHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
-}
-void EncMcHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
-}
-void EncMcHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-  McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
-}
-void EncMcHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
-}
-void EncMcChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
-                          SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
-  const int32_t kiD8x = sMv.iMvX&0x07;
-  const int32_t kiD8y = sMv.iMvY&0x07;
-  if (0 == kiD8x && 0 == kiD8y) {
-    if(8 == iWidth)
-      McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else // iWidth == 4
-      McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  }
-  else {
-    if(8 == iWidth)
-      McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
-    else //if(4 == iWidth)
-      McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
-  }
-}
-#endif
-
-#if defined(HAVE_NEON_AARCH64)
-void McHorVer20Width9Or17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                               int32_t iWidth, int32_t iHeight) {
-    if (iWidth == 17)
-        McHorVer20Width17_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else //if (iWidth == 9)
-        McHorVer20Width9_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer02Height9Or17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                int32_t iWidth, int32_t iHeight){
-    if (iWidth == 16)
-        McHorVer02Height17_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else //if (iWidth == 8)
-        McHorVer02Height9_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer22Width9Or17Height9Or17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                          int32_t iWidth, int32_t iHeight){
-    if (iWidth == 17)
-        McHorVer22Width17_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else //if (iWidth == 9)
-        McHorVer22Width9_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void EncMcHorVer11_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-    ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-    McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
-    McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-    PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16,iHeight);
-}
-void EncMcHorVer12_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-    ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-    McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
-    McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-    PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-void EncMcHorVer13_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-    ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-    McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
-    McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-    PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-void EncMcHorVer21_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-    ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-    McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
-    McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-    PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-void EncMcHorVer23_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-    ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-    McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
-    McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-    PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-void EncMcHorVer31_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-    ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-    McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
-    McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight);
-    PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-void EncMcHorVer32_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-    ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-    McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, pTmp, 16, iHeight);
-    McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-    PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-void EncMcHorVer33_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-    ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-    McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
-    McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight);
-    PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-void EncMcChroma_AArch64_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
-                      SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
-    const int32_t kiD8x = sMv.iMvX&0x07;
-    const int32_t kiD8y = sMv.iMvY&0x07;
-    if (0 == kiD8x && 0 == kiD8y) {
-        if(8 == iWidth)
-            McCopyWidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-        else // iWidth == 4
-            McCopyWidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    }
-    else {
-        if(8 == iWidth)
-            McChromaWidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
-        else //if(4 == iWidth)
-            McChromaWidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
-    }
-}
-#endif
-
-typedef void (*PixelAvgFunc) (uint8_t*, int32_t, const uint8_t*, int32_t, const uint8_t*, int32_t, int32_t);
-void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
-  static PixelAvgFunc pfPixAvgFunc[2] = {PixelAvgWidthEq8_c, PixelAvgWidthEq16_c};
-
-  static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16[16] = { //[y*4+x]
-    McCopyWidthEq16_c,  McHorVer10WidthEq16, McHorVer20WidthEq16_c,     McHorVer30WidthEq16,
-    McHorVer01WidthEq16, McHorVer11WidthEq16, McHorVer21WidthEq16, McHorVer31WidthEq16,
-    McHorVer02WidthEq16_c,     McHorVer12WidthEq16, McHorVer22WidthEq16_c,    McHorVer32WidthEq16,
-    McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16
-  };
-#if defined (X86_ASM)
-  static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_sse2[16] = {
-    McCopyWidthEq16_sse2,  McHorVer10WidthEq16, McHorVer20WidthEq16_sse2,     McHorVer30WidthEq16,
-    McHorVer01WidthEq16, McHorVer11WidthEq16, McHorVer21WidthEq16, McHorVer31WidthEq16,
-    McHorVer02WidthEq16_sse2,     McHorVer12WidthEq16, McHorVer22WidthEq16_sse2,    McHorVer32WidthEq16,
-    McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16
-  };
-#endif
-#if defined(HAVE_NEON)
-  static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_neon[16] = { //[x][y]
-    McCopyWidthEq16_neon,        McHorVer10WidthEq16_neon,   McHorVer20WidthEq16_neon,    McHorVer30WidthEq16_neon,
-    McHorVer01WidthEq16_neon,    EncMcHorVer11_neon,         EncMcHorVer21_neon,          EncMcHorVer31_neon,
-    McHorVer02WidthEq16_neon,    EncMcHorVer12_neon,         McHorVer22WidthEq16_neon,    EncMcHorVer32_neon,
-    McHorVer03WidthEq16_neon,    EncMcHorVer13_neon,         EncMcHorVer23_neon,          EncMcHorVer33_neon
-  };
-#endif
-#if defined(HAVE_NEON_AARCH64)
-    static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_AArch64_neon[16] = { //[x][y]
-        McCopyWidthEq16_AArch64_neon,        McHorVer10WidthEq16_AArch64_neon,   McHorVer20WidthEq16_AArch64_neon,    McHorVer30WidthEq16_AArch64_neon,
-        McHorVer01WidthEq16_AArch64_neon,    EncMcHorVer11_AArch64_neon,         EncMcHorVer21_AArch64_neon,          EncMcHorVer31_AArch64_neon,
-        McHorVer02WidthEq16_AArch64_neon,    EncMcHorVer12_AArch64_neon,         McHorVer22WidthEq16_AArch64_neon,    EncMcHorVer32_AArch64_neon,
-        McHorVer03WidthEq16_AArch64_neon,    EncMcHorVer13_AArch64_neon,         EncMcHorVer23_AArch64_neon,          EncMcHorVer33_AArch64_neon
-    };
-#endif
-  pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_c;
-  pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_c;
-  pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22_c;
-  pFuncList->sMcFuncs.pfSampleAveraging = pfPixAvgFunc;
-  pFuncList->sMcFuncs.pfChromaMc	= McChroma_c;
-  fpVerFilter				= VerFilter_c;
-  fpHorFilter				= HorFilter_c;
-  fpHorFilterInput16Bits			= HorFilterInput16bit1_c;
-  McCopyWidthEq4 = McCopyWidthEq4_c;
-  McCopyWidthEq8 = McCopyWidthEq8_c;
-  McCopyWidthEq16 = McCopyWidthEq16_c;
-  pfPixelAvgWidthEq16 = PixelAvgWidthEq16_c;
-  pfMcHorVer02WidthEq16 = McHorVer02WidthEq16_c;
-  pfMcHorVer20WidthEq16 = McHorVer20WidthEq16_c;
-  pfMcHorVer22WidthEq16 = McHorVer22WidthEq16_c;
-  pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16;
-#if defined (X86_ASM)
-  if (uiCpuFlag & WELS_CPU_SSE2) {
-    pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_sse2;
-    pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_sse2;
-    pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_sse2;
-    pFuncList->sMcFuncs.pfSampleAveraging[0] = PixelAvgWidthEq8_mmx;
-    pFuncList->sMcFuncs.pfSampleAveraging[1] = PixelAvgWidthEq16_sse2;
-    pFuncList->sMcFuncs.pfChromaMc = McChroma_sse2;
-    McCopyWidthEq4 = McCopyWidthEq4_mmx;
-    McCopyWidthEq8 = McCopyWidthEq8_mmx;
-    McCopyWidthEq16 = McCopyWidthEq16_sse2;
-    pfPixelAvgWidthEq16 = PixelAvgWidthEq16_sse2;
-    pfMcHorVer02WidthEq16 = McHorVer02WidthEq16_sse2;
-    pfMcHorVer20WidthEq16 = McHorVer20WidthEq16_sse2;
-    pfMcHorVer22WidthEq16 = McHorVer22WidthEq16_sse2;
-    pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_sse2;
-  }
-
-  if (uiCpuFlag & WELS_CPU_SSSE3) {
-    pFuncList->sMcFuncs.pfChromaMc = McChroma_ssse3;
-  }
-
-#endif //(X86_ASM)
-
-#if defined(HAVE_NEON)
-  if (uiCpuFlag & WELS_CPU_NEON) {
-    pFuncList->sMcFuncs.pfLumaQuarpelMc	= pWelsMcFuncWidthEq16_neon;
-    pFuncList->sMcFuncs.pfChromaMc	= EncMcChroma_neon;
-    pFuncList->sMcFuncs.pfSampleAveraging[0] = PixStrideAvgWidthEq8_neon;
-    pFuncList->sMcFuncs.pfSampleAveraging[1] = PixStrideAvgWidthEq16_neon;
-    pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_neon;//iWidth+1:8/16
-    pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_neon;//heigh+1:8/16
-    pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_neon;//iWidth+1/heigh+1
-  }
-#endif
-#if defined(HAVE_NEON_AARCH64)
-    if (uiCpuFlag & WELS_CPU_NEON) {
-        pFuncList->sMcFuncs.pfLumaQuarpelMc	= pWelsMcFuncWidthEq16_AArch64_neon;
-        pFuncList->sMcFuncs.pfChromaMc	= EncMcChroma_AArch64_neon;
-        pFuncList->sMcFuncs.pfSampleAveraging[0] = PixStrideAvgWidthEq8_AArch64_neon;
-        pFuncList->sMcFuncs.pfSampleAveraging[1] = PixStrideAvgWidthEq16_AArch64_neon;
-        pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_AArch64_neon;//iWidth+1:8/16
-        pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_AArch64_neon;//heigh+1:8/16
-        pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_AArch64_neon;//iWidth+1/heigh+1
-    }
-#endif
-}
-}