shithub: openh264

Download patch

ref: 228ea1a668e99c7a0ce11518dea9b1d67c4fd125
parent: ce740ee19e5b49afeae5c041d230563d5a869338
parent: 249b8a0aa6b9e8fd7d0a1327d9b6dcd35af0900c
author: zhilwang <[email protected]>
date: Wed Jul 9 05:31:31 EDT 2014

Merge pull request #1101 from dongzha/AddArm64MBCopy

add arm64 MB COPY code and UT

--- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
@@ -24,6 +24,7 @@
 		F556A8251906673900E156A8 /* expand_picture_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */; };
 		F5AC94FF193EB7D800F58154 /* deblocking_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */; };
 		F5B8D82D190757290037849A /* mc_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5B8D82C190757290037849A /* mc_aarch64_neon.S */; };
+		F5BB0BB8196BB5960072D50D /* copy_mb_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5BB0BB7196BB5960072D50D /* copy_mb_aarch64_neon.S */; };
 		FAABAA1818E9354A00D4186F /* sad_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FAABAA1718E9354A00D4186F /* sad_common.cpp */; };
 /* End PBXBuildFile section */
 
@@ -72,6 +73,7 @@
 		F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = expand_picture_aarch64_neon.S; path = arm64/expand_picture_aarch64_neon.S; sourceTree = "<group>"; };
 		F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = deblocking_aarch64_neon.S; path = arm64/deblocking_aarch64_neon.S; sourceTree = "<group>"; };
 		F5B8D82C190757290037849A /* mc_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = mc_aarch64_neon.S; path = arm64/mc_aarch64_neon.S; sourceTree = "<group>"; };
+		F5BB0BB7196BB5960072D50D /* copy_mb_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = copy_mb_aarch64_neon.S; path = arm64/copy_mb_aarch64_neon.S; sourceTree = "<group>"; };
 		FAABAA1618E9353F00D4186F /* sad_common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sad_common.h; sourceTree = "<group>"; };
 		FAABAA1718E9354A00D4186F /* sad_common.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sad_common.cpp; sourceTree = "<group>"; };
 /* End PBXFileReference section */
@@ -177,6 +179,7 @@
 		F556A81D1906669F00E156A8 /* arm64 */ = {
 			isa = PBXGroup;
 			children = (
+				F5BB0BB7196BB5960072D50D /* copy_mb_aarch64_neon.S */,
 				F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */,
 				F5B8D82C190757290037849A /* mc_aarch64_neon.S */,
 				F556A8221906673900E156A8 /* arm_arch64_common_macro.S */,
@@ -245,6 +248,7 @@
 				5BA8F2C019603F5F00011CE4 /* common_tables.cpp in Sources */,
 				4C3406D118D96EA600DFA14A /* WelsThreadLib.cpp in Sources */,
 				4C3406CC18D96EA600DFA14A /* mc_neon.S in Sources */,
+				F5BB0BB8196BB5960072D50D /* copy_mb_aarch64_neon.S in Sources */,
 				4C3406CB18D96EA600DFA14A /* expand_picture_neon.S in Sources */,
 				4CC61F0918FF6B4B00E56EAB /* copy_mb_neon.S in Sources */,
 				53C1C9BC193F0FB000404D8F /* expand_pic.cpp in Sources */,
--- /dev/null
+++ b/codec/common/arm64/copy_mb_aarch64_neon.S
@@ -1,0 +1,274 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef  HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+#ifdef __APPLE__
+.macro LOAD_ALIGNED_DATA_WITH_STRIDE
+//  {   //  input: $0~$3 = vector regs that each receive one 8-byte row in lane d[0], $4 = src pointer (post-incremented by $5 after every row), $5 = src stride register
+    ld1 {$0.d}[0], [$4], $5
+    ld1 {$1.d}[0], [$4], $5
+    ld1 {$2.d}[0], [$4], $5
+    ld1 {$3.d}[0], [$4], $5
+//  }
+.endm
+
+.macro STORE_ALIGNED_DATA_WITH_STRIDE
+//  {   //  input: $0~$3 = vector regs whose lane d[0] holds one 8-byte row each, $4 = dst pointer (post-incremented by $5 after every row), $5 = dst stride register
+    st1 {$0.d}[0], [$4], $5
+    st1 {$1.d}[0], [$4], $5
+    st1 {$2.d}[0], [$4], $5
+    st1 {$3.d}[0], [$4], $5
+//  }
+.endm
+
+.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
+//  {   //  input: $0~$3 = vector regs that each receive one 8-byte row (.8b, byte elements), $4 = src pointer (post-incremented by $5 after every row), $5 = src stride register
+    ld1 {$0.8b}, [$4], $5
+    ld1 {$1.8b}, [$4], $5
+    ld1 {$2.8b}, [$4], $5
+    ld1 {$3.8b}, [$4], $5
+//  }
+.endm
+
+.macro STORE_UNALIGNED_DATA_WITH_STRIDE
+//  {   //  input: $0~$3 = vector regs holding one 8-byte row each (.8b, byte elements), $4 = dst pointer (post-incremented by $5 after every row), $5 = dst stride register
+    st1 {$0.8b}, [$4], $5
+    st1 {$1.8b}, [$4], $5
+    st1 {$2.8b}, [$4], $5
+    st1 {$3.8b}, [$4], $5
+//  }
+.endm
+
+.macro LOAD16_ALIGNED_DATA_WITH_STRIDE
+//  {   //  input: $0~$3 = vector regs that each receive one 16-byte row (.2d, 64-bit elements), $4 = src pointer (post-incremented by $5 after every row), $5 = src stride register
+    ld1 {$0.2d}, [$4], $5
+    ld1 {$1.2d}, [$4], $5
+    ld1 {$2.2d}, [$4], $5
+    ld1 {$3.2d}, [$4], $5
+//  }
+.endm
+
+.macro STORE16_ALIGNED_DATA_WITH_STRIDE
+//  {   //  input: $0~$3 = vector regs holding one 16-byte row each (.2d, 64-bit elements), $4 = dst pointer (post-incremented by $5 after every row), $5 = dst stride register
+    st1 {$0.2d}, [$4], $5
+    st1 {$1.2d}, [$4], $5
+    st1 {$2.2d}, [$4], $5
+    st1 {$3.2d}, [$4], $5
+//  }
+.endm
+
+.macro LOAD16_UNALIGNED_DATA_WITH_STRIDE
+//  {   //  input: $0~$3 = vector regs that each receive one 16-byte row (.16b, byte elements), $4 = src pointer (post-incremented by $5 after every row), $5 = src stride register
+    ld1 {$0.16b}, [$4], $5
+    ld1 {$1.16b}, [$4], $5
+    ld1 {$2.16b}, [$4], $5
+    ld1 {$3.16b}, [$4], $5
+//  }
+.endm
+
+.macro STORE16_UNALIGNED_DATA_WITH_STRIDE
+//  {   //  input: $0~$3 = vector regs holding one 16-byte row each (.16b, byte elements), $4 = dst pointer (post-incremented by $5 after every row), $5 = dst stride register
+    st1 {$0.16b}, [$4], $5
+    st1 {$1.16b}, [$4], $5
+    st1 {$2.16b}, [$4], $5
+    st1 {$3.16b}, [$4], $5
+//  }
+.endm
+
+#else
+.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+//  {   //  input: arg0~arg3 = vector regs that each receive one 8-byte row in lane d[0], arg4 = src pointer (post-incremented by arg5 after every row), arg5 = src stride register
+    ld1 {\arg0\().d}[0], [\arg4], \arg5
+    ld1 {\arg1\().d}[0], [\arg4], \arg5
+    ld1 {\arg2\().d}[0], [\arg4], \arg5
+    ld1 {\arg3\().d}[0], [\arg4], \arg5
+//  }
+.endm
+
+.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+//  {   //  input: arg0~arg3 = vector regs whose lane d[0] holds one 8-byte row each, arg4 = dst pointer (post-incremented by arg5 after every row), arg5 = dst stride register
+    st1 {\arg0\().d}[0], [\arg4], \arg5
+    st1 {\arg1\().d}[0], [\arg4], \arg5
+    st1 {\arg2\().d}[0], [\arg4], \arg5
+    st1 {\arg3\().d}[0], [\arg4], \arg5
+//  }
+.endm
+
+.macro LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+//  {   //  input: arg0~arg3 = vector regs that each receive one 8-byte row (.8b, byte elements), arg4 = src pointer (post-incremented by arg5 after every row), arg5 = src stride register
+    ld1 {\arg0\().8b}, [\arg4], \arg5
+    ld1 {\arg1\().8b}, [\arg4], \arg5
+    ld1 {\arg2\().8b}, [\arg4], \arg5
+    ld1 {\arg3\().8b}, [\arg4], \arg5
+//  }
+.endm
+
+.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+//  {   //  input: arg0~arg3 = vector regs holding one 8-byte row each (.8b, byte elements), arg4 = dst pointer (post-incremented by arg5 after every row), arg5 = dst stride register
+    st1 {\arg0\().8b}, [\arg4], \arg5
+    st1 {\arg1\().8b}, [\arg4], \arg5
+    st1 {\arg2\().8b}, [\arg4], \arg5
+    st1 {\arg3\().8b}, [\arg4], \arg5
+//  }
+.endm
+
+.macro LOAD16_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+//  {   //  input: arg0~arg3 = vector regs that each receive one 16-byte row (.2d, 64-bit elements), arg4 = src pointer (post-incremented by arg5 after every row), arg5 = src stride register
+    ld1 {\arg0\().2d}, [\arg4], \arg5
+    ld1 {\arg1\().2d}, [\arg4], \arg5
+    ld1 {\arg2\().2d}, [\arg4], \arg5
+    ld1 {\arg3\().2d}, [\arg4], \arg5
+//  }
+.endm
+
+.macro STORE16_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+//  {   //  input: arg0~arg3 = vector regs holding one 16-byte row each (.2d, 64-bit elements), arg4 = dst pointer (post-incremented by arg5 after every row), arg5 = dst stride register
+    st1 {\arg0\().2d}, [\arg4], \arg5
+    st1 {\arg1\().2d}, [\arg4], \arg5
+    st1 {\arg2\().2d}, [\arg4], \arg5
+    st1 {\arg3\().2d}, [\arg4], \arg5
+//  }
+.endm
+
+.macro LOAD16_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+//  {   //  input: arg0~arg3 = vector regs that each receive one 16-byte row (.16b, byte elements), arg4 = src pointer (post-incremented by arg5 after every row), arg5 = src stride register
+    ld1 {\arg0\().16b}, [\arg4], \arg5
+    ld1 {\arg1\().16b}, [\arg4], \arg5
+    ld1 {\arg2\().16b}, [\arg4], \arg5
+    ld1 {\arg3\().16b}, [\arg4], \arg5
+//  }
+.endm
+
+.macro STORE16_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+//  {   //  input: arg0~arg3 = vector regs holding one 16-byte row each (.16b, byte elements), arg4 = dst pointer (post-incremented by arg5 after every row), arg5 = dst stride register
+    st1 {\arg0\().16b}, [\arg4], \arg5
+    st1 {\arg1\().16b}, [\arg4], \arg5
+    st1 {\arg2\().16b}, [\arg4], \arg5
+    st1 {\arg3\().16b}, [\arg4], \arg5
+//  }
+.endm
+
+#endif
+
+
+WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy8x8_AArch64_neon
+//  Copy an 8x8 byte block. x0 = pDst, w1 = iStrideD, x2 = pSrc, w3 = iStrideS.
+//  The strides are int32_t, so bits 63:32 of x1 and x3 are unspecified on entry.
+//  Sign-extend them before they serve as 64-bit post-index offsets.
+    sxtw    x1, w1
+    sxtw    x3, w3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
+    STORE_UNALIGNED_DATA_WITH_STRIDE    v0, v1, v2, v3, x0, x1
+    LOAD_UNALIGNED_DATA_WITH_STRIDE v4, v5, v6, v7, x2, x3
+    STORE_UNALIGNED_DATA_WITH_STRIDE    v4, v5, v6, v7, x0, x1
+WELS_ASM_AARCH64_FUNC_END
+
+
+WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy16x16_AArch64_neon
+//  Copy a 16x16 byte block, four rows per load/store group.
+//  x0 = pDst, w1 = iStrideD, x2 = pSrc, w3 = iStrideS.
+//  The strides are int32_t, so bits 63:32 of x1 and x3 are unspecified on entry.
+//  Sign-extend them before they serve as 64-bit post-index offsets.
+    sxtw    x1, w1
+    sxtw    x3, w3
+    LOAD16_ALIGNED_DATA_WITH_STRIDE   v0, v1, v2, v3, x2, x3
+    STORE16_ALIGNED_DATA_WITH_STRIDE  v0, v1, v2, v3, x0, x1
+
+    LOAD16_ALIGNED_DATA_WITH_STRIDE   v16, v17, v18, v19, x2, x3
+    STORE16_ALIGNED_DATA_WITH_STRIDE  v16, v17, v18, v19, x0, x1
+
+    LOAD16_ALIGNED_DATA_WITH_STRIDE   v0, v1, v2, v3, x2, x3
+    STORE16_ALIGNED_DATA_WITH_STRIDE  v0, v1, v2, v3, x0, x1
+
+    LOAD16_ALIGNED_DATA_WITH_STRIDE   v16, v17, v18, v19, x2, x3
+    STORE16_ALIGNED_DATA_WITH_STRIDE  v16, v17, v18, v19, x0, x1
+WELS_ASM_AARCH64_FUNC_END
+
+
+WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy16x16NotAligned_AArch64_neon
+//  Copy a 16x16 byte block with byte-element loads/stores, four rows per group.
+//  x0 = pDst, w1 = iStrideD, x2 = pSrc, w3 = iStrideS.
+//  The strides are int32_t, so bits 63:32 of x1 and x3 are unspecified on entry.
+//  Sign-extend them before they serve as 64-bit post-index offsets.
+    sxtw    x1, w1
+    sxtw    x3, w3
+    LOAD16_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
+    STORE16_UNALIGNED_DATA_WITH_STRIDE    v0, v1, v2, v3, x0, x1
+
+    LOAD16_UNALIGNED_DATA_WITH_STRIDE v16, v17, v18, v19, x2, x3
+    STORE16_UNALIGNED_DATA_WITH_STRIDE    v16, v17, v18, v19, x0, x1
+
+    LOAD16_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
+    STORE16_UNALIGNED_DATA_WITH_STRIDE    v0, v1, v2, v3, x0, x1
+
+    LOAD16_UNALIGNED_DATA_WITH_STRIDE v16, v17, v18, v19, x2, x3
+    STORE16_UNALIGNED_DATA_WITH_STRIDE    v16, v17, v18, v19, x0, x1
+WELS_ASM_AARCH64_FUNC_END
+
+
+WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy16x8NotAligned_AArch64_neon
+//  Copy 8 rows of 16 bytes. x0 = pDst, w1 = iStrideD, x2 = pSrc, w3 = iStrideS.
+//  The strides are int32_t, so bits 63:32 of x1 and x3 are unspecified on entry.
+//  Sign-extend them before they serve as 64-bit post-index offsets.
+    sxtw    x1, w1
+    sxtw    x3, w3
+    LOAD16_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
+    STORE16_UNALIGNED_DATA_WITH_STRIDE    v0, v1, v2, v3, x0, x1
+    LOAD16_UNALIGNED_DATA_WITH_STRIDE v16, v17, v18, v19, x2, x3
+    STORE16_UNALIGNED_DATA_WITH_STRIDE    v16, v17, v18, v19, x0, x1
+WELS_ASM_AARCH64_FUNC_END
+
+
+WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy8x16_AArch64_neon
+//  Copy 16 rows of 8 bytes, four rows per load/store group.
+//  x0 = pDst, w1 = iStrideD, x2 = pSrc, w3 = iStrideS.
+//  The strides are int32_t, so bits 63:32 of x1 and x3 are unspecified on entry.
+//  Sign-extend them before they serve as 64-bit post-index offsets.
+    sxtw    x1, w1
+    sxtw    x3, w3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
+    STORE_UNALIGNED_DATA_WITH_STRIDE    v0, v1, v2, v3, x0, x1
+
+    LOAD_UNALIGNED_DATA_WITH_STRIDE v4, v5, v6, v7, x2, x3
+    STORE_UNALIGNED_DATA_WITH_STRIDE    v4, v5, v6, v7, x0, x1
+
+    LOAD_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
+    STORE_UNALIGNED_DATA_WITH_STRIDE    v0, v1, v2, v3, x0, x1
+
+    LOAD_UNALIGNED_DATA_WITH_STRIDE v4, v5, v6, v7, x2, x3
+    STORE_UNALIGNED_DATA_WITH_STRIDE    v4, v5, v6, v7, x0, x1
+WELS_ASM_AARCH64_FUNC_END
+
+#endif
--- a/codec/common/inc/copy_mb.h
+++ b/codec/common/inc/copy_mb.h
@@ -65,6 +65,14 @@
 void WelsCopy8x16_neon (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
 #endif
 
+#if defined (HAVE_NEON_AARCH64) // AArch64 NEON block copies, implemented in codec/common/arm64/copy_mb_aarch64_neon.S
+void WelsCopy8x8_AArch64_neon (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS); // 8 rows of 8 bytes
+void WelsCopy16x16_AArch64_neon (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS); // 16 rows of 16 bytes
+void WelsCopy16x16NotAligned_AArch64_neon (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS); // 16 rows of 16 bytes, byte-element loads/stores
+void WelsCopy16x8NotAligned_AArch64_neon (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS); // 8 rows of 16 bytes, byte-element loads/stores
+void WelsCopy8x16_AArch64_neon (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS); // 16 rows of 8 bytes
+#endif // HAVE_NEON_AARCH64
+
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -39,6 +39,7 @@
 
 ifeq ($(ASM_ARCH), arm64)
 COMMON_ASM_ARM64_SRCS=\
+	$(COMMON_SRCDIR)/arm64/copy_mb_aarch64_neon.S\
 	$(COMMON_SRCDIR)/arm64/deblocking_aarch64_neon.S\
 	$(COMMON_SRCDIR)/arm64/expand_picture_aarch64_neon.S\
 	$(COMMON_SRCDIR)/arm64/mc_aarch64_neon.S\
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -553,8 +553,8 @@
     pFuncList->pfQuantizationHadamard2x2		= WelsHadamardQuant2x2_AArch64_neon;
     pFuncList->pfQuantizationHadamard2x2Skip	= WelsHadamardQuant2x2Skip_AArch64_neon;
     pFuncList->pfDctT4					= WelsDctT4_AArch64_neon;
-    //pFuncList->pfCopy8x8Aligned			= WelsCopy8x8_AArch64_neon; // will enable in next update
-    //pFuncList->pfCopy8x16Aligned		= WelsCopy8x16_AArch64_neon; // will enable in next update
+    pFuncList->pfCopy8x8Aligned			= WelsCopy8x8_AArch64_neon;
+    pFuncList->pfCopy8x16Aligned		= WelsCopy8x16_AArch64_neon;
 
     pFuncList->pfGetNoneZeroCount		= WelsGetNoneZeroCount_AArch64_neon;
     pFuncList->pfTransformHadamard4x4Dc	= WelsHadamardT4Dc_AArch64_neon;
@@ -564,9 +564,9 @@
     pFuncList->pfQuantizationFour4x4	= WelsQuantFour4x4_AArch64_neon;
     pFuncList->pfQuantizationFour4x4Max	= WelsQuantFour4x4Max_AArch64_neon;
 
-    //pFuncList->pfCopy16x16Aligned		= WelsCopy16x16_AArch64_neon; // will enable in next update
-    //pFuncList->pfCopy16x16NotAligned	= WelsCopy16x16NotAligned_AArch64_neon; // will enable in next update
-    //pFuncList->pfCopy16x8NotAligned		= WelsCopy16x8NotAligned_AArch64_neon; // will enable in next update
+    pFuncList->pfCopy16x16Aligned		= WelsCopy16x16_AArch64_neon;
+    pFuncList->pfCopy16x16NotAligned	= WelsCopy16x16NotAligned_AArch64_neon;
+    pFuncList->pfCopy16x8NotAligned		= WelsCopy16x8NotAligned_AArch64_neon;
     pFuncList->pfDctFourT4				= WelsDctFourT4_AArch64_neon;
   }
 #endif
--- /dev/null
+++ b/test/encoder/EncUT_MBCopy.cpp
@@ -1,0 +1,140 @@
+#include <gtest/gtest.h>
+#include <math.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "cpu_core.h"
+#include "cpu.h"
+#include "macros.h"
+#include "encode_mb_aux.h"
+#include "wels_func_ptr_def.h"
+#include "copy_mb.h"
+
+using namespace WelsSVCEnc;
+#define MBCOPYTEST_NUM 1000
+static void FillWithRandomData (uint8_t* p, int32_t Len) {
+  // Store rand() % 256 into p[0] .. p[Len - 1] in ascending order (no-op when Len <= 0).
+  int32_t iIdx = 0;
+  while (iIdx < Len) { p[iIdx] = rand() % 256; ++iIdx; }
+}
+
+
+TEST (MBCopyFunTest, pfCopy8x8Aligned) {
+  // Checks the dispatched pfCopy8x8Aligned against WelsCopy8x8_c on random source data.
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pSrcAlign, 16 * 64 + 1, 16)
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pDstAlign, 2, 16 * 32 + 16, 16)
+  const int32_t kiSrcStride = 64, kiDstStride = 32;
+  const int32_t kiSrcLen = 16 * kiSrcStride + 1, kiDstLen = 16 * kiDstStride + 1;
+  int32_t iCoreCount = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t uiCpuFlags = WelsCPUFeatureDetect (&iCoreCount);
+  WelsInitEncodingFuncs (&sFuncs, uiCpuFlags);
+
+  for (int32_t iRound = 0; iRound < MBCOPYTEST_NUM; ++iRound) {
+    FillWithRandomData ((uint8_t*)pSrcAlign, kiSrcLen);
+    memset (pDstAlign[0], 0, kiDstLen);  // reference output
+    memset (pDstAlign[1], 0, kiDstLen);  // output of the function under test
+    WelsCopy8x8_c (pDstAlign[0], kiDstStride, pSrcAlign, kiSrcStride);
+    sFuncs.pfCopy8x8Aligned (pDstAlign[1], kiDstStride, pSrcAlign, kiSrcStride);
+    // Compare the entire zero-initialised window, not only the copied block.
+    for (int32_t iByte = 0; iByte < kiDstLen; ++iByte) {
+      ASSERT_EQ (pDstAlign[0][iByte], pDstAlign[1][iByte]);
+    }
+  }
+}
+
+TEST (MBCopyFunTest, pfCopy8x16Aligned) {
+  // Checks the dispatched pfCopy8x16Aligned against WelsCopy8x16_c on random source data.
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pSrcAlign, 16 * 64 + 1, 16)
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pDstAlign, 2, 16 * 32 + 16, 16)
+  const int32_t kiSrcStride = 64, kiDstStride = 32;
+  const int32_t kiSrcLen = 16 * kiSrcStride + 1, kiDstLen = 16 * kiDstStride + 1;
+  int32_t iCoreCount = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t uiCpuFlags = WelsCPUFeatureDetect (&iCoreCount);
+  WelsInitEncodingFuncs (&sFuncs, uiCpuFlags);
+
+  for (int32_t iRound = 0; iRound < MBCOPYTEST_NUM; ++iRound) {
+    FillWithRandomData ((uint8_t*)pSrcAlign, kiSrcLen);
+    memset (pDstAlign[0], 0, kiDstLen);  // reference output
+    memset (pDstAlign[1], 0, kiDstLen);  // output of the function under test
+    WelsCopy8x16_c (pDstAlign[0], kiDstStride, pSrcAlign, kiSrcStride);
+    sFuncs.pfCopy8x16Aligned (pDstAlign[1], kiDstStride, pSrcAlign, kiSrcStride);
+    // Compare the entire zero-initialised window, not only the copied block.
+    for (int32_t iByte = 0; iByte < kiDstLen; ++iByte) {
+      ASSERT_EQ (pDstAlign[0][iByte], pDstAlign[1][iByte]);
+    }
+  }
+}
+
+TEST (MBCopyFunTest, pfCopy16x16Aligned) {
+  // Checks the dispatched pfCopy16x16Aligned against WelsCopy16x16_c on random source data.
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pSrcAlign, 16 * 64 + 1, 16)
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pDstAlign, 2, 16 * 32 + 16, 16)
+  const int32_t kiSrcStride = 64, kiDstStride = 32;
+  const int32_t kiSrcLen = 16 * kiSrcStride + 1, kiDstLen = 16 * kiDstStride + 1;
+  int32_t iCoreCount = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t uiCpuFlags = WelsCPUFeatureDetect (&iCoreCount);
+  WelsInitEncodingFuncs (&sFuncs, uiCpuFlags);
+
+  for (int32_t iRound = 0; iRound < MBCOPYTEST_NUM; ++iRound) {
+    FillWithRandomData ((uint8_t*)pSrcAlign, kiSrcLen);
+    memset (pDstAlign[0], 0, kiDstLen);  // reference output
+    memset (pDstAlign[1], 0, kiDstLen);  // output of the function under test
+    WelsCopy16x16_c (pDstAlign[0], kiDstStride, pSrcAlign, kiSrcStride);
+    sFuncs.pfCopy16x16Aligned (pDstAlign[1], kiDstStride, pSrcAlign, kiSrcStride);
+    // Compare the entire zero-initialised window, not only the copied block.
+    for (int32_t iByte = 0; iByte < kiDstLen; ++iByte) {
+      ASSERT_EQ (pDstAlign[0][iByte], pDstAlign[1][iByte]);
+    }
+  }
+}
+
+TEST (MBCopyFunTest, pfCopy16x8NotAligned) {
+  // Checks the dispatched pfCopy16x8NotAligned against WelsCopy16x8_c, reading from pSrcAlign + 1.
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pSrcAlign, 16 * 64 + 1, 16)
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pDstAlign, 2, 16 * 32 + 16, 16)
+  const int32_t kiSrcStride = 64, kiDstStride = 32;
+  const int32_t kiSrcLen = 16 * kiSrcStride + 1, kiDstLen = 16 * kiDstStride + 1;
+  int32_t iCoreCount = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t uiCpuFlags = WelsCPUFeatureDetect (&iCoreCount);
+  WelsInitEncodingFuncs (&sFuncs, uiCpuFlags);
+
+  for (int32_t iRound = 0; iRound < MBCOPYTEST_NUM; ++iRound) {
+    FillWithRandomData ((uint8_t*)pSrcAlign, kiSrcLen);
+    memset (pDstAlign[0], 0, kiDstLen);  // reference output
+    memset (pDstAlign[1], 0, kiDstLen);  // output of the function under test
+    WelsCopy16x8_c (pDstAlign[0], kiDstStride, pSrcAlign + 1, kiSrcStride);
+    sFuncs.pfCopy16x8NotAligned (pDstAlign[1], kiDstStride, pSrcAlign + 1, kiSrcStride);
+    // Compare the entire zero-initialised window, not only the copied block.
+    for (int32_t iByte = 0; iByte < kiDstLen; ++iByte) {
+      ASSERT_EQ (pDstAlign[0][iByte], pDstAlign[1][iByte]);
+    }
+  }
+}
+
+TEST (MBCopyFunTest, pfCopy16x16NotAligned) {
+  // Checks the dispatched pfCopy16x16NotAligned against WelsCopy16x16_c, reading from pSrcAlign + 1.
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pSrcAlign, 16 * 64 + 1, 16)
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pDstAlign, 2, 16 * 32 + 16, 16)
+  const int32_t kiSrcStride = 64, kiDstStride = 32;
+  const int32_t kiSrcLen = 16 * kiSrcStride + 1, kiDstLen = 16 * kiDstStride + 1;
+  int32_t iCoreCount = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t uiCpuFlags = WelsCPUFeatureDetect (&iCoreCount);
+  WelsInitEncodingFuncs (&sFuncs, uiCpuFlags);
+
+  for (int32_t iRound = 0; iRound < MBCOPYTEST_NUM; ++iRound) {
+    FillWithRandomData ((uint8_t*)pSrcAlign, kiSrcLen);
+    memset (pDstAlign[0], 0, kiDstLen);  // reference output
+    memset (pDstAlign[1], 0, kiDstLen);  // output of the function under test
+    WelsCopy16x16_c (pDstAlign[0], kiDstStride, pSrcAlign + 1, kiSrcStride);
+    sFuncs.pfCopy16x16NotAligned (pDstAlign[1], kiDstStride, pSrcAlign + 1, kiSrcStride);
+    // Compare the entire zero-initialised window, not only the copied block.
+    for (int32_t iByte = 0; iByte < kiDstLen; ++iByte) {
+      ASSERT_EQ (pDstAlign[0][iByte], pDstAlign[1][iByte]);
+    }
+  }
+}
--- a/test/encoder/targets.mk
+++ b/test/encoder/targets.mk
@@ -6,6 +6,7 @@
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderMbAux.cpp\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_ExpGolomb.cpp\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_GetIntraPredictor.cpp\
+	$(ENCODER_UNITTEST_SRCDIR)/EncUT_MBCopy.cpp\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_MemoryAlloc.cpp\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_MemoryZero.cpp\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_MotionEstimate.cpp\