shithub: openh264

Download patch

ref: 80fdf09b260a900c9ce6bba59315db832c12de17
parent: 7de6eb2bad6b76ba7d6795a4dc74b8735948651f
parent: 53c8af4566c435960263f66d53550411ae1a7827
author: dongzha <[email protected]>
date: Fri May 30 05:26:04 EDT 2014

Merge pull request #903 from zhilwang/arm64-sad

Add Arm64 sad code

--- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
@@ -12,6 +12,7 @@
 		4C34067018C57D0400DFA14A /* memory_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066918C57D0400DFA14A /* memory_neon.S */; };
 		4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066A18C57D0400DFA14A /* pixel_neon.S */; };
 		4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066B18C57D0400DFA14A /* reconstruct_neon.S */; };
+		4CB8F2B419235FC5005D6386 /* pixel_neon_aarch64.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CB8F2B319235FC5005D6386 /* pixel_neon_aarch64.S */; };
 		4CE4431518B6FFA00017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4431418B6FFA00017DF25 /* Foundation.framework */; };
 		4CE4432318B6FFA00017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4432218B6FFA00017DF25 /* XCTest.framework */; };
 		4CE4432418B6FFA00017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4431418B6FFA00017DF25 /* Foundation.framework */; };
@@ -81,6 +82,7 @@
 		4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
 		4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = "<group>"; };
 		4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = "<group>"; };
+		4CB8F2B319235FC5005D6386 /* pixel_neon_aarch64.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = pixel_neon_aarch64.S; path = arm64/pixel_neon_aarch64.S; sourceTree = "<group>"; };
 		4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_transpose_matrix.h; sourceTree = "<group>"; };
 		4CE4431118B6FFA00017DF25 /* libwelsenc.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsenc.a; sourceTree = BUILT_PRODUCTS_DIR; };
 		4CE4431418B6FFA00017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
@@ -210,6 +212,14 @@
 			path = arm;
 			sourceTree = "<group>";
 		};
+		4CB8F2B219235FAC005D6386 /* arm64 */ = {
+			isa = PBXGroup;
+			children = (
+				4CB8F2B319235FC5005D6386 /* pixel_neon_aarch64.S */,
+			);
+			name = arm64;
+			sourceTree = "<group>";
+		};
 		4CE4430818B6FFA00017DF25 = {
 			isa = PBXGroup;
 			children = (
@@ -270,6 +280,7 @@
 		4CE446A118BC605B0017DF25 /* core */ = {
 			isa = PBXGroup;
 			children = (
+				4CB8F2B219235FAC005D6386 /* arm64 */,
 				4C34066418C57D0400DFA14A /* arm */,
 				4CE446A918BC605C0017DF25 /* inc */,
 				4CE446DC18BC605C0017DF25 /* src */,
@@ -507,6 +518,7 @@
 				4CE4471918BC605C0017DF25 /* memory_align.cpp in Sources */,
 				4CE4472418BC605C0017DF25 /* svc_enc_slice_segment.cpp in Sources */,
 				4CE4472318BC605C0017DF25 /* svc_base_layer_md.cpp in Sources */,
+				4CB8F2B419235FC5005D6386 /* pixel_neon_aarch64.S in Sources */,
 				4CE4471E18BC605C0017DF25 /* ratectl.cpp in Sources */,
 				4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */,
 				4CE4471C18BC605C0017DF25 /* picture_handle.cpp in Sources */,
@@ -648,6 +660,7 @@
 					"$(SRCROOT)/../../../../processing/interface",
 					"$(SRCROOT)/../../../../api/svc",
 					"$(SRCROOT)/../../../../common/arm",
+					"$(SRCROOT)/../../../../common/arm64",
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 6.1;
 				ONLY_ACTIVE_ARCH = NO;
@@ -686,6 +699,7 @@
 					"$(SRCROOT)/../../../../processing/interface",
 					"$(SRCROOT)/../../../../api/svc",
 					"$(SRCROOT)/../../../../common/arm",
+					"$(SRCROOT)/../../../../common/arm64",
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 6.1;
 				OTHER_LDFLAGS = "-ObjC";
--- a/codec/common/inc/sad_common.h
+++ b/codec/common/inc/sad_common.h
@@ -89,6 +89,19 @@
 
 #endif
 
+#if defined (HAVE_NEON_AARCH64)
+int32_t WelsSampleSad4x4_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);  // args: block A pointer, block A stride, block B pointer, block B stride; returns the sum of absolute differences
+int32_t WelsSampleSad16x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);  // same argument order, 16 bytes wide by 16 rows
+int32_t WelsSampleSad16x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);  // 16 bytes wide by 8 rows
+int32_t WelsSampleSad8x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);  // 8 bytes wide by 16 rows
+int32_t WelsSampleSad8x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);  // 8 bytes wide by 8 rows
+
+void WelsSampleSadFour16x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);  // last arg receives four SADs: block B moved one row up, one row down, one byte left, one byte right
+void WelsSampleSadFour16x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);  // same output layout, 16 bytes wide by 8 rows
+void WelsSampleSadFour8x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);  // same output layout, 8 bytes wide by 16 rows
+void WelsSampleSadFour8x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);  // same output layout, 8 bytes wide by 8 rows
+void WelsSampleSadFour4x4_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);  // same output layout, 4 bytes wide by 4 rows
+#endif
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- /dev/null
+++ b/codec/encoder/core/arm64/pixel_neon_aarch64.S
@@ -1,0 +1,477 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+.macro CALC_AND_STORE_SAD  // Reduce the eight halfword partial sums in v2 to one total and leave it in w0
+    saddlv  s2, v2.8h  // s2 = sum of all lanes; callers in this file add at most 32 byte differences per lane (max 8160), so the signed widening is exact
+    fmov    w0, s2  // copy the 32-bit total into the integer result register
+.endm
+
+.macro CALC_AND_STORE_SAD_FOUR  // Reduce accumulators v28..v31 to four scalar totals and store them as four consecutive 32-bit words at the address in x4
+    saddlv  s28, v28.8h  // each reduction overwrites its own accumulator with the lane total
+    saddlv  s29, v29.8h
+    saddlv  s30, v30.8h
+    saddlv  s31, v31.8h
+    st4     {v28.s, v29.s, v30.s, v31.s}[0], [x4]  // stores lane 0 of v28, v29, v30, v31 in that order; x4 is not advanced
+.endm
+
+.macro LOAD_8X8_1  // Load eight rows of 8 bytes through x0 into v0..v7
+    ld1     {v0.8b}, [x0], x1  // post-index: x0 advances by the stride in x1 after every row
+    ld1     {v1.8b}, [x0], x1
+    ld1     {v2.8b}, [x0], x1
+    ld1     {v3.8b}, [x0], x1
+    ld1     {v4.8b}, [x0], x1
+    ld1     {v5.8b}, [x0], x1
+    ld1     {v6.8b}, [x0], x1
+    ld1     {v7.8b}, [x0], x1  // x0 ends eight rows further on, ready for a second call
+.endm
+
+.macro LOAD_16X8_1  // Load eight rows of 16 bytes through x0 into v0..v7
+    ld1     {v0.16b}, [x0], x1  // post-index: x0 advances by the stride in x1 after every row
+    ld1     {v1.16b}, [x0], x1
+    ld1     {v2.16b}, [x0], x1
+    ld1     {v3.16b}, [x0], x1
+    ld1     {v4.16b}, [x0], x1
+    ld1     {v5.16b}, [x0], x1
+    ld1     {v6.16b}, [x0], x1
+    ld1     {v7.16b}, [x0], x1  // x0 ends eight rows further on, ready for a second call
+.endm
+
+#ifdef __APPLE__
+.macro LOAD_8X8_2  // Apple-syntax variant: load eight rows of 8 bytes into v16..v23 through the address register given as the first argument
+    ld1     {v16.8b}, [$0], x3  // the address register advances by the stride in x3 after every row
+    ld1     {v17.8b}, [$0], x3
+    ld1     {v18.8b}, [$0], x3
+    ld1     {v19.8b}, [$0], x3
+    ld1     {v20.8b}, [$0], x3
+    ld1     {v21.8b}, [$0], x3
+    ld1     {v22.8b}, [$0], x3
+    ld1     {v23.8b}, [$0], x3  // the register is left pointing at the row after the eighth
+.endm
+
+.macro CALC_ABS_8X8_1  // Apple-syntax variant: sum absolute differences of v0..v7 against v16..v23 into the accumulator named by the first argument
+    uab$1l  $0, v0.8b, v16.8b  // second argument picks the opcode: d gives uabdl (start a fresh sum), a gives uabal (add to the existing sum)
+    uabal   $0, v1.8b, v17.8b  // remaining rows always accumulate
+    uabal   $0, v2.8b, v18.8b
+    uabal   $0, v3.8b, v19.8b
+    uabal   $0, v4.8b, v20.8b
+    uabal   $0, v5.8b, v21.8b
+    uabal   $0, v6.8b, v22.8b
+    uabal   $0, v7.8b, v23.8b
+.endm
+
+.macro CALC_ABS_8X8_2  // Apple-syntax variant: sum absolute differences of v0..v7 against v18..v25 (two registers later than CALC_ABS_8X8_1) into v29
+    uab$0l  v29.8h, v0.8b, v18.8b  // argument picks the opcode: d gives uabdl (start a fresh sum), a gives uabal (add to the existing sum)
+    uabal   v29.8h, v1.8b, v19.8b
+    uabal   v29.8h, v2.8b, v20.8b
+    uabal   v29.8h, v3.8b, v21.8b
+    uabal   v29.8h, v4.8b, v22.8b
+    uabal   v29.8h, v5.8b, v23.8b
+    uabal   v29.8h, v6.8b, v24.8b  // v24 and v25 must be loaded by the caller; LOAD_8X8_2 only fills v16..v23
+    uabal   v29.8h, v7.8b, v25.8b
+.endm
+
+.macro LOAD_16X8_2  // Apple-syntax variant: load eight rows of 16 bytes into v16..v23 through the address register given as the first argument
+    ld1     {v16.16b}, [$0], x3  // the address register advances by the stride in x3 after every row
+    ld1     {v17.16b}, [$0], x3
+    ld1     {v18.16b}, [$0], x3
+    ld1     {v19.16b}, [$0], x3
+    ld1     {v20.16b}, [$0], x3
+    ld1     {v21.16b}, [$0], x3
+    ld1     {v22.16b}, [$0], x3
+    ld1     {v23.16b}, [$0], x3  // the register is left pointing at the row after the eighth
+.endm
+
+.macro CALC_ABS_16X8_1  // Apple-syntax variant: sum absolute differences of 16-byte rows v0..v7 against v16..v23 into the accumulator named by the first argument
+    uab$1l  $0, v0.8b, v16.8b  // low 8 bytes of row 0; second argument d gives uabdl (fresh sum), a gives uabal (keep adding)
+    uabal2  $0, v0.16b,v16.16b  // high 8 bytes of the same row are folded into the same eight lanes
+    uabal   $0, v1.8b, v17.8b
+    uabal2  $0, v1.16b,v17.16b
+    uabal   $0, v2.8b, v18.8b
+    uabal2  $0, v2.16b,v18.16b
+    uabal   $0, v3.8b, v19.8b
+    uabal2  $0, v3.16b,v19.16b
+    uabal   $0, v4.8b, v20.8b
+    uabal2  $0, v4.16b,v20.16b
+    uabal   $0, v5.8b, v21.8b
+    uabal2  $0, v5.16b,v21.16b
+    uabal   $0, v6.8b, v22.8b
+    uabal2  $0, v6.16b,v22.16b
+    uabal   $0, v7.8b, v23.8b
+    uabal2  $0, v7.16b,v23.16b
+.endm
+
+.macro CALC_ABS_16X8_2  // Apple-syntax variant: sum absolute differences of 16-byte rows v0..v7 against v18..v25 (two registers later than CALC_ABS_16X8_1) into v29
+    uab$0l  v29.8h, v0.8b, v18.8b  // low 8 bytes of row 0; argument d gives uabdl (fresh sum), a gives uabal (keep adding)
+    uabal2  v29.8h, v0.16b,v18.16b  // high 8 bytes of the same row
+    uabal   v29.8h, v1.8b, v19.8b
+    uabal2  v29.8h, v1.16b,v19.16b
+    uabal   v29.8h, v2.8b, v20.8b
+    uabal2  v29.8h, v2.16b,v20.16b
+    uabal   v29.8h, v3.8b, v21.8b
+    uabal2  v29.8h, v3.16b,v21.16b
+    uabal   v29.8h, v4.8b, v22.8b
+    uabal2  v29.8h, v4.16b,v22.16b
+    uabal   v29.8h, v5.8b, v23.8b
+    uabal2  v29.8h, v5.16b,v23.16b
+    uabal   v29.8h, v6.8b, v24.8b  // v24 and v25 must be loaded by the caller; LOAD_16X8_2 only fills v16..v23
+    uabal2  v29.8h, v6.16b,v24.16b
+    uabal   v29.8h, v7.8b, v25.8b
+    uabal2  v29.8h, v7.16b,v25.16b
+.endm
+#else
+.macro LOAD_8X8_2 arg0  // GNU-syntax variant: load eight rows of 8 bytes into v16..v23 through the address register passed as arg0
+    ld1     {v16.8b}, [\arg0], x3  // the address register advances by the stride in x3 after every row
+    ld1     {v17.8b}, [\arg0], x3
+    ld1     {v18.8b}, [\arg0], x3
+    ld1     {v19.8b}, [\arg0], x3
+    ld1     {v20.8b}, [\arg0], x3
+    ld1     {v21.8b}, [\arg0], x3
+    ld1     {v22.8b}, [\arg0], x3
+    ld1     {v23.8b}, [\arg0], x3  // the register is left pointing at the row after the eighth
+.endm
+
+.macro CALC_ABS_8X8_1 arg0, arg1  // GNU-syntax variant: sum absolute differences of v0..v7 against v16..v23 into accumulator arg0
+    uab\arg1\()l    \arg0, v0.8b, v16.8b  // arg1 picks the opcode: d gives uabdl (start a fresh sum), a gives uabal (add to the existing sum)
+    uabal   \arg0, v1.8b, v17.8b  // remaining rows always accumulate
+    uabal   \arg0, v2.8b, v18.8b
+    uabal   \arg0, v3.8b, v19.8b
+    uabal   \arg0, v4.8b, v20.8b
+    uabal   \arg0, v5.8b, v21.8b
+    uabal   \arg0, v6.8b, v22.8b
+    uabal   \arg0, v7.8b, v23.8b
+.endm
+
+.macro CALC_ABS_8X8_2 arg0  // GNU-syntax variant: sum absolute differences of v0..v7 against v18..v25 (two registers later than CALC_ABS_8X8_1) into v29
+    uab\arg0\()l    v29.8h, v0.8b, v18.8b  // arg0 picks the opcode: d gives uabdl (start a fresh sum), a gives uabal (add to the existing sum)
+    uabal   v29.8h, v1.8b, v19.8b
+    uabal   v29.8h, v2.8b, v20.8b
+    uabal   v29.8h, v3.8b, v21.8b
+    uabal   v29.8h, v4.8b, v22.8b
+    uabal   v29.8h, v5.8b, v23.8b
+    uabal   v29.8h, v6.8b, v24.8b  // v24 and v25 must be loaded by the caller; LOAD_8X8_2 only fills v16..v23
+    uabal   v29.8h, v7.8b, v25.8b
+.endm
+
+.macro LOAD_16X8_2 arg0  // GNU-syntax variant: load eight rows of 16 bytes into v16..v23 through the address register passed as arg0
+    ld1     {v16.16b}, [\arg0], x3  // the address register advances by the stride in x3 after every row
+    ld1     {v17.16b}, [\arg0], x3
+    ld1     {v18.16b}, [\arg0], x3
+    ld1     {v19.16b}, [\arg0], x3
+    ld1     {v20.16b}, [\arg0], x3
+    ld1     {v21.16b}, [\arg0], x3
+    ld1     {v22.16b}, [\arg0], x3
+    ld1     {v23.16b}, [\arg0], x3  // the register is left pointing at the row after the eighth
+.endm
+
+.macro CALC_ABS_16X8_1 arg0, arg1  // GNU-syntax variant: sum absolute differences of 16-byte rows v0..v7 against v16..v23 into accumulator arg0
+    uab\arg1\()l  \arg0, v0.8b, v16.8b  // low 8 bytes of row 0; arg1 d gives uabdl (fresh sum), a gives uabal (keep adding)
+    uabal2  \arg0, v0.16b,v16.16b  // high 8 bytes of the same row are folded into the same eight lanes
+    uabal   \arg0, v1.8b, v17.8b
+    uabal2  \arg0, v1.16b,v17.16b
+    uabal   \arg0, v2.8b, v18.8b
+    uabal2  \arg0, v2.16b,v18.16b
+    uabal   \arg0, v3.8b, v19.8b
+    uabal2  \arg0, v3.16b,v19.16b
+    uabal   \arg0, v4.8b, v20.8b
+    uabal2  \arg0, v4.16b,v20.16b
+    uabal   \arg0, v5.8b, v21.8b
+    uabal2  \arg0, v5.16b,v21.16b
+    uabal   \arg0, v6.8b, v22.8b
+    uabal2  \arg0, v6.16b,v22.16b
+    uabal   \arg0, v7.8b, v23.8b
+    uabal2  \arg0, v7.16b,v23.16b
+.endm
+
+.macro CALC_ABS_16X8_2 arg0  // GNU-syntax variant: sum absolute differences of 16-byte rows v0..v7 against v18..v25 (two registers later than CALC_ABS_16X8_1) into v29
+    uab\arg0\()l  v29.8h, v0.8b, v18.8b  // low 8 bytes of row 0; arg0 d gives uabdl (fresh sum), a gives uabal (keep adding)
+    uabal2  v29.8h, v0.16b,v18.16b  // high 8 bytes of the same row
+    uabal   v29.8h, v1.8b, v19.8b
+    uabal2  v29.8h, v1.16b,v19.16b
+    uabal   v29.8h, v2.8b, v20.8b
+    uabal2  v29.8h, v2.16b,v20.16b
+    uabal   v29.8h, v3.8b, v21.8b
+    uabal2  v29.8h, v3.16b,v21.16b
+    uabal   v29.8h, v4.8b, v22.8b
+    uabal2  v29.8h, v4.16b,v22.16b
+    uabal   v29.8h, v5.8b, v23.8b
+    uabal2  v29.8h, v5.16b,v23.16b
+    uabal   v29.8h, v6.8b, v24.8b  // v24 and v25 must be loaded by the caller; LOAD_16X8_2 only fills v16..v23
+    uabal2  v29.8h, v6.16b,v24.16b
+    uabal   v29.8h, v7.8b, v25.8b
+    uabal2  v29.8h, v7.16b,v25.16b
+.endm
+#endif
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad4x4_AArch64_neon  // SAD of two 4x4 byte blocks: x0/w1 and x2/w3 are pointer/stride pairs, total returned in w0
+    sxtw    x1, w1  // sign-extend the 32-bit strides so they can be used as 64-bit post-index registers
+    sxtw    x3, w3
+    ld1     {v0.s}[0], [x0], x1  // 4 bytes of row 0 go into lane s[0]; the other lanes keep whatever they held
+    ld1     {v1.s}[0], [x2], x3
+    uabdl   v2.8h, v0.8b, v1.8b  // first row initialises the accumulator; only halfword lanes 0..3 come from loaded data
+.rept 3
+    ld1     {v0.s}[0], [x0], x1
+    ld1     {v1.s}[0], [x2], x3
+    uabal   v2.8h, v0.8b, v1.8b  // rows 1..3 are accumulated on top
+.endr
+    saddlv  s2, v2.4h  // reduce the low four lanes only, ignoring the lanes that were never loaded
+    fmov    w0, s2  // copy the total into the integer result register
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad8x8_AArch64_neon  // SAD of two 8x8 byte blocks: x0/w1 and x2/w3 are pointer/stride pairs, total returned in w0
+    sxtw    x1, w1  // sign-extend the 32-bit strides so they can be used as 64-bit post-index registers
+    sxtw    x3, w3
+    ld1     {v0.8b}, [x0], x1  // row 0 of each block
+    ld1     {v1.8b}, [x2], x3
+    uabdl   v2.8h, v0.8b, v1.8b  // first row initialises the eight halfword accumulators
+.rept 7
+    ld1     {v0.8b}, [x0], x1
+    ld1     {v1.8b}, [x2], x3
+    uabal   v2.8h, v0.8b, v1.8b  // rows 1..7 are accumulated on top
+.endr
+    CALC_AND_STORE_SAD
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad8x16_AArch64_neon  // SAD of two blocks 8 bytes wide and 16 rows tall: x0/w1 and x2/w3 are pointer/stride pairs, total returned in w0
+    sxtw    x1, w1  // sign-extend the 32-bit strides so they can be used as 64-bit post-index registers
+    sxtw    x3, w3
+    ld1     {v0.8b}, [x0], x1  // row 0 of each block
+    ld1     {v1.8b}, [x2], x3
+    uabdl   v2.8h, v0.8b, v1.8b  // first row initialises the eight halfword accumulators
+.rept 15
+    ld1     {v0.8b}, [x0], x1
+    ld1     {v1.8b}, [x2], x3
+    uabal   v2.8h, v0.8b, v1.8b  // rows 1..15 are accumulated on top
+.endr
+    CALC_AND_STORE_SAD
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad16x8_AArch64_neon  // SAD of two blocks 16 bytes wide and 8 rows tall: x0/w1 and x2/w3 are pointer/stride pairs, total returned in w0
+    sxtw    x1, w1  // sign-extend the 32-bit strides so they can be used as 64-bit post-index registers
+    sxtw    x3, w3
+    ld1     {v0.16b}, [x0], x1  // row 0 of each block
+    ld1     {v1.16b}, [x2], x3
+    uabdl   v2.8h, v0.8b, v1.8b  // low 8 bytes initialise the eight halfword accumulators
+    uabal2  v2.8h, v0.16b, v1.16b  // high 8 bytes are folded into the same lanes
+.rept 7
+    ld1     {v0.16b}, [x0], x1
+    ld1     {v1.16b}, [x2], x3
+    uabal   v2.8h, v0.8b, v1.8b  // rows 1..7, low half
+    uabal2  v2.8h, v0.16b, v1.16b  // rows 1..7, high half
+.endr
+    CALC_AND_STORE_SAD
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad16x16_AArch64_neon  // SAD of two 16x16 byte blocks: x0/w1 and x2/w3 are pointer/stride pairs, total returned in w0
+    sxtw    x1, w1  // sign-extend the 32-bit strides so they can be used as 64-bit post-index registers
+    sxtw    x3, w3
+    ld1     {v0.16b}, [x0], x1  // row 0 of each block
+    ld1     {v1.16b}, [x2], x3
+    uabdl   v2.8h, v0.8b, v1.8b  // low 8 bytes initialise the eight halfword accumulators
+    uabal2  v2.8h, v0.16b, v1.16b  // high 8 bytes are folded into the same lanes
+.rept 15
+    ld1     {v0.16b}, [x0], x1
+    ld1     {v1.16b}, [x2], x3
+    uabal   v2.8h, v0.8b, v1.8b  // rows 1..15, low half
+    uabal2  v2.8h, v0.16b, v1.16b  // rows 1..15, high half; 32 additions per lane in total, at most 8160
+.endr
+    CALC_AND_STORE_SAD
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour4x4_AArch64_neon  // Four SADs of the 4x4 block at x0 (stride w1) against the block at x2 (stride w3) moved one row up, one row down, one byte left, one byte right; results stored in that order at x4
+    sxtw    x1, w1  // sign-extend the 32-bit strides so they can be used as 64-bit post-index registers
+    sxtw    x3, w3
+    ld1     {v0.s}[0], [x0], x1  // v0 = rows 0 and 1 of the first block packed into one 8-byte half
+    ld1     {v0.s}[1], [x0], x1
+    ld1     {v1.s}[0], [x0], x1  // v1 = rows 2 and 3
+    ld1     {v1.s}[1], [x0]
+    sub     x0, x2, x3  // x0 is reused as a cursor, starting one row above x2
+    ld1     {v2.s}[0], [x0], x3  // v2 = rows -1 and 0 relative to x2
+    ld1     {v2.s}[1], [x0], x3
+    ld1     {v3.s}[0], [x0], x3  // v3 = rows 1 and 2
+    ld1     {v3.s}[1], [x0], x3
+    ld1     {v4.s}[0], [x0], x3  // v4 = rows 3 and 4
+    ld1     {v4.s}[1], [x0]  // last row of this pass: no post-index because x0 is recomputed before its next use
+
+    uabdl   v28.8h, v0.8b, v2.8b  // v28 = rows 0..3 against rows -1..2 (block shifted up)
+    uabal   v28.8h, v1.8b, v3.8b
+
+    uabdl   v29.8h, v0.8b, v3.8b  // v29 = rows 0..3 against rows 1..4 (block shifted down)
+    uabal   v29.8h, v1.8b, v4.8b
+
+    sub     x0, x2, #1  // same rows as x2, one byte to the left
+    ld1     {v2.s}[0], [x0], x3
+    ld1     {v2.s}[1], [x0], x3
+    ld1     {v3.s}[0], [x0], x3
+    ld1     {v3.s}[1], [x0]
+    uabdl   v30.8h, v0.8b, v2.8b  // v30 = block shifted left
+    uabal   v30.8h, v1.8b, v3.8b
+
+    add     x0, x2, #1  // same rows as x2, one byte to the right
+    ld1     {v2.s}[0], [x0], x3
+    ld1     {v2.s}[1], [x0], x3
+    ld1     {v3.s}[0], [x0], x3
+    ld1     {v3.s}[1], [x0]
+    uabdl   v31.8h, v0.8b, v2.8b  // v31 = block shifted right
+    uabal   v31.8h, v1.8b, v3.8b
+
+    CALC_AND_STORE_SAD_FOUR
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour8x8_AArch64_neon  // Four SADs of the 8x8 block at x0 (stride w1) against the block at x2 (stride w3) moved one row up, one row down, one byte left, one byte right; results stored in that order at x4
+    sxtw    x1, w1  // sign-extend the 32-bit strides so they can be used as 64-bit post-index registers
+    sxtw    x3, w3
+    LOAD_8X8_1  // v0..v7 = the eight rows read through x0
+    sub     x0, x2, x3  // x0 is reused as a cursor, starting one row above x2
+    LOAD_8X8_2 x0  // v16..v23 = rows -1..6 relative to x2
+    ld1     {v24.8b}, [x0], x3  // row 7
+    ld1     {v25.8b}, [x0]  // row 8
+
+    CALC_ABS_8X8_1 v28.8h, d  // v28 = differences against rows -1..6 (block shifted up)
+    CALC_ABS_8X8_2 d  // v29 = differences against rows 1..8 (block shifted down)
+
+    sub     x0, x2, #1  // same rows as x2, one byte to the left
+    LOAD_8X8_2 x0
+    CALC_ABS_8X8_1 v30.8h, d  // v30 = block shifted left
+
+    add     x0, x2, #1  // same rows as x2, one byte to the right
+    LOAD_8X8_2 x0
+    CALC_ABS_8X8_1 v31.8h, d  // v31 = block shifted right
+
+    CALC_AND_STORE_SAD_FOUR
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour8x16_AArch64_neon  // Four SADs of the 8-wide, 16-row block at x0 (stride w1) against the block at x2 (stride w3) moved one row up, one row down, one byte left, one byte right; results stored in that order at x4
+    sxtw    x1, w1  // sign-extend the 32-bit strides so they can be used as 64-bit post-index registers
+    sxtw    x3, w3
+    LOAD_8X8_1  // v0..v7 = rows 0..7 read through x0
+    sub     x5, x2, x3  // x5 = cursor for the vertical neighbours, starting one row above x2
+    LOAD_8X8_2 x5  // v16..v23 = rows -1..6 relative to x2
+    ld1     {v24.8b}, [x5], x3  // row 7
+    ld1     {v25.8b}, [x5], x3  // row 8; x5 now points at row 9
+
+    CALC_ABS_8X8_1 v28.8h, d  // v28 starts with rows 0..7 against rows -1..6 (block shifted up)
+    CALC_ABS_8X8_2 d  // v29 starts with rows 0..7 against rows 1..8 (block shifted down)
+
+    sub     x6, x2, #1  // x6 = cursor one byte to the left; left at row 8 after the load
+    LOAD_8X8_2 x6
+    CALC_ABS_8X8_1 v30.8h, d  // v30 starts the shifted-left sum
+
+    add     x7, x2, #1  // x7 = cursor one byte to the right; left at row 8 after the load
+    LOAD_8X8_2 x7
+    CALC_ABS_8X8_1 v31.8h, d  // v31 starts the shifted-right sum
+
+    LOAD_8X8_1  // v0..v7 = rows 8..15 read through x0
+    sub     x5, x5, x3  // step x5 back two rows, from row 9 to row 7
+    sub     x5, x5, x3
+    LOAD_8X8_2 x5  // v16..v23 = rows 7..14 relative to x2
+    ld1     {v24.8b}, [x5], x3  // row 15
+    ld1     {v25.8b}, [x5]  // row 16
+
+    CALC_ABS_8X8_1 v28.8h, a  // second half is accumulated onto the first-half sums
+    CALC_ABS_8X8_2 a
+
+    LOAD_8X8_2 x6  // rows 8..15, one byte to the left
+    CALC_ABS_8X8_1 v30.8h, a
+
+    LOAD_8X8_2 x7  // rows 8..15, one byte to the right
+    CALC_ABS_8X8_1 v31.8h, a
+
+    CALC_AND_STORE_SAD_FOUR
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour16x8_AArch64_neon  // Four SADs of the 16-wide, 8-row block at x0 (stride w1) against the block at x2 (stride w3) moved one row up, one row down, one byte left, one byte right; results stored in that order at x4
+    sxtw    x1, w1  // sign-extend the 32-bit strides so they can be used as 64-bit post-index registers
+    sxtw    x3, w3
+    LOAD_16X8_1  // v0..v7 = the eight rows read through x0
+    sub     x0, x2, x3  // x0 is reused as a cursor, starting one row above x2
+    LOAD_16X8_2 x0  // v16..v23 = rows -1..6 relative to x2
+    ld1     {v24.16b}, [x0], x3  // row 7
+    ld1     {v25.16b}, [x0]  // row 8
+
+    CALC_ABS_16X8_1 v28.8h, d  // v28 = differences against rows -1..6 (block shifted up)
+    CALC_ABS_16X8_2 d  // v29 = differences against rows 1..8 (block shifted down)
+
+    sub     x0, x2, #1  // same rows as x2, one byte to the left
+    LOAD_16X8_2 x0
+    CALC_ABS_16X8_1 v30.8h, d  // v30 = block shifted left
+
+    add     x0, x2, #1  // same rows as x2, one byte to the right
+    LOAD_16X8_2 x0
+    CALC_ABS_16X8_1 v31.8h, d  // v31 = block shifted right
+
+    CALC_AND_STORE_SAD_FOUR
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour16x16_AArch64_neon  // Four SADs of the 16x16 block at x0 (stride w1) against the block at x2 (stride w3) moved one row up, one row down, one byte left, one byte right; results stored in that order at x4
+    sxtw    x1, w1  // sign-extend the 32-bit strides so they can be used as 64-bit post-index registers
+    sxtw    x3, w3
+
+    LOAD_16X8_1  // v0..v7 = rows 0..7 read through x0
+    sub     x5, x2, x3  // x5 = cursor for the vertical neighbours, starting one row above x2
+    LOAD_16X8_2 x5  // v16..v23 = rows -1..6 relative to x2
+    ld1     {v24.16b}, [x5], x3  // row 7
+    ld1     {v25.16b}, [x5], x3  // row 8; x5 now points at row 9
+
+    CALC_ABS_16X8_1 v28.8h, d  // v28 starts with rows 0..7 against rows -1..6 (block shifted up)
+    CALC_ABS_16X8_2 d  // v29 starts with rows 0..7 against rows 1..8 (block shifted down)
+
+    sub     x6, x2, #1  // x6 = cursor one byte to the left; left at row 8 after the load
+    LOAD_16X8_2 x6
+    CALC_ABS_16X8_1 v30.8h, d  // v30 starts the shifted-left sum
+
+    add     x7, x2, #1  // x7 = cursor one byte to the right; left at row 8 after the load
+    LOAD_16X8_2 x7
+    CALC_ABS_16X8_1 v31.8h, d  // v31 starts the shifted-right sum
+
+    LOAD_16X8_1  // v0..v7 = rows 8..15 read through x0
+    sub     x5, x5, x3  // step x5 back two rows, from row 9 to row 7
+    sub     x5, x5, x3
+    LOAD_16X8_2 x5  // v16..v23 = rows 7..14 relative to x2
+    ld1     {v24.16b}, [x5], x3  // row 15
+    ld1     {v25.16b}, [x5]  // row 16
+
+    CALC_ABS_16X8_1 v28.8h, a  // second half is accumulated onto the first-half sums; 32 additions per lane in total, at most 8160
+    CALC_ABS_16X8_2 a
+
+    LOAD_16X8_2 x6  // rows 8..15, one byte to the left
+    CALC_ABS_16X8_1 v30.8h, a
+
+    LOAD_16X8_2 x7  // rows 8..15, one byte to the right
+    CALC_ABS_16X8_1 v31.8h, a
+
+    CALC_AND_STORE_SAD_FOUR
+WELS_ASM_ARCH64_FUNC_END
+#endif
\ No newline at end of file
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -413,6 +413,23 @@
     pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad  = WelsIntra16x16Combined3Sad_neon;
   }
 #endif
+
+#if defined (HAVE_NEON_AARCH64)
+  if (uiCpuFlag & WELS_CPU_NEON) {
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4  ] = WelsSampleSad4x4_AArch64_neon;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_AArch64_neon;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_AArch64_neon;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_AArch64_neon;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_AArch64_neon;
+
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_AArch64_neon;
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_AArch64_neon;
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_AArch64_neon;
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_AArch64_neon;
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_AArch64_neon;
+
+  }
+#endif
 }
 
 } // namespace WelsSVCEnc
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -60,6 +60,13 @@
 ENCODER_OBJS += $(ENCODER_ASM_ARM_SRCS:.S=.$(OBJ))
 endif
 
+ifeq ($(ASM_ARCH), arm64)
+ENCODER_ASM_ARM64_SRCS=\
+	$(ENCODER_SRCDIR)/core/arm64/pixel_neon_aarch64.S\
+
+ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ))
+endif
+
 OBJS += $(ENCODER_OBJS)
 $(ENCODER_SRCDIR)/%.$(OBJ): $(ENCODER_SRCDIR)/%.cpp
 	$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c $(CXX_O) $<