ref: 80fdf09b260a900c9ce6bba59315db832c12de17
parent: 7de6eb2bad6b76ba7d6795a4dc74b8735948651f
parent: 53c8af4566c435960263f66d53550411ae1a7827
author: dongzha <[email protected]>
date: Fri May 30 05:26:04 EDT 2014
Merge pull request #903 from zhilwang/arm64-sad Add Arm64 sad code
--- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
@@ -12,6 +12,7 @@
4C34067018C57D0400DFA14A /* memory_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066918C57D0400DFA14A /* memory_neon.S */; };
4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066A18C57D0400DFA14A /* pixel_neon.S */; };
4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066B18C57D0400DFA14A /* reconstruct_neon.S */; };
+ 4CB8F2B419235FC5005D6386 /* pixel_neon_aarch64.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CB8F2B319235FC5005D6386 /* pixel_neon_aarch64.S */; };
4CE4431518B6FFA00017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4431418B6FFA00017DF25 /* Foundation.framework */; };
4CE4432318B6FFA00017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4432218B6FFA00017DF25 /* XCTest.framework */; };
4CE4432418B6FFA00017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4431418B6FFA00017DF25 /* Foundation.framework */; };
@@ -81,6 +82,7 @@
4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = "<group>"; };
4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = "<group>"; };
+ 4CB8F2B319235FC5005D6386 /* pixel_neon_aarch64.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = pixel_neon_aarch64.S; path = arm64/pixel_neon_aarch64.S; sourceTree = "<group>"; };
4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_transpose_matrix.h; sourceTree = "<group>"; };
4CE4431118B6FFA00017DF25 /* libwelsenc.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsenc.a; sourceTree = BUILT_PRODUCTS_DIR; };
4CE4431418B6FFA00017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
@@ -210,6 +212,14 @@
path = arm;
sourceTree = "<group>";
};
+ 4CB8F2B219235FAC005D6386 /* arm64 */ = {
+ isa = PBXGroup;
+ children = (
+ 4CB8F2B319235FC5005D6386 /* pixel_neon_aarch64.S */,
+ );
+ name = arm64;
+ sourceTree = "<group>";
+ };
4CE4430818B6FFA00017DF25 = {
isa = PBXGroup;
children = (
@@ -270,6 +280,7 @@
4CE446A118BC605B0017DF25 /* core */ = {
isa = PBXGroup;
children = (
+ 4CB8F2B219235FAC005D6386 /* arm64 */,
4C34066418C57D0400DFA14A /* arm */,
4CE446A918BC605C0017DF25 /* inc */,
4CE446DC18BC605C0017DF25 /* src */,
@@ -507,6 +518,7 @@
4CE4471918BC605C0017DF25 /* memory_align.cpp in Sources */,
4CE4472418BC605C0017DF25 /* svc_enc_slice_segment.cpp in Sources */,
4CE4472318BC605C0017DF25 /* svc_base_layer_md.cpp in Sources */,
+ 4CB8F2B419235FC5005D6386 /* pixel_neon_aarch64.S in Sources */,
4CE4471E18BC605C0017DF25 /* ratectl.cpp in Sources */,
4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */,
4CE4471C18BC605C0017DF25 /* picture_handle.cpp in Sources */,
@@ -648,6 +660,7 @@
"$(SRCROOT)/../../../../processing/interface",
"$(SRCROOT)/../../../../api/svc",
"$(SRCROOT)/../../../../common/arm",
+ "$(SRCROOT)/../../../../common/arm64",
);
IPHONEOS_DEPLOYMENT_TARGET = 6.1;
ONLY_ACTIVE_ARCH = NO;
@@ -686,6 +699,7 @@
"$(SRCROOT)/../../../../processing/interface",
"$(SRCROOT)/../../../../api/svc",
"$(SRCROOT)/../../../../common/arm",
+ "$(SRCROOT)/../../../../common/arm64",
);
IPHONEOS_DEPLOYMENT_TARGET = 6.1;
OTHER_LDFLAGS = "-ObjC";
--- a/codec/common/inc/sad_common.h
+++ b/codec/common/inc/sad_common.h
@@ -89,6 +89,19 @@
#endif
+#if defined (HAVE_NEON_AARCH64) /* AArch64 NEON SAD (sum of absolute differences) routines defined in pixel_neon_aarch64.S */
+int32_t WelsSampleSad4x4_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t); /* (block A, stride A, block B, stride B); returns the SAD over 4 bytes x 4 rows */
+int32_t WelsSampleSad16x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t); /* same arguments; 16 bytes x 16 rows */
+int32_t WelsSampleSad16x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t); /* same arguments; 16 bytes x 8 rows */
+int32_t WelsSampleSad8x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t); /* same arguments; 8 bytes x 16 rows */
+int32_t WelsSampleSad8x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t); /* same arguments; 8 bytes x 8 rows */
+
+void WelsSampleSadFour16x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*); /* (block, stride, ref, ref stride, out); out[0..3] = SAD vs ref - stride, ref + stride, ref - 1, ref + 1 */
+void WelsSampleSadFour16x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*); /* same contract; 16 bytes x 8 rows */
+void WelsSampleSadFour8x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*); /* same contract; 8 bytes x 16 rows */
+void WelsSampleSadFour8x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*); /* same contract; 8 bytes x 8 rows */
+void WelsSampleSadFour4x4_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*); /* same contract; 4 bytes x 4 rows */
+#endif
#if defined(__cplusplus)
}
#endif//__cplusplus
--- /dev/null
+++ b/codec/encoder/core/arm64/pixel_neon_aarch64.S
@@ -1,0 +1,477 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+.macro CALC_AND_STORE_SAD /* Reduce the eight 16-bit partial sums in v2 to one total and leave it in w0 */
+ saddlv s2, v2.8h /* add-long across all eight lanes; the 32-bit sum lands in s2 */
+ fmov w0, s2 /* copy the sum from the SIMD register to w0 */
+.endm
+
+.macro CALC_AND_STORE_SAD_FOUR /* Reduce the four accumulators v28-v31 and write the four 32-bit totals to [x4] */
+ saddlv s28, v28.8h /* each saddlv collapses eight 16-bit lanes into one 32-bit sum */
+ saddlv s29, v29.8h
+ saddlv s30, v30.8h
+ saddlv s31, v31.8h
+ st4 {v28.s, v29.s, v30.s, v31.s}[0], [x4] /* lane 0 of v28, v29, v30, v31 stored as four consecutive words */
+.endm
+
+.macro LOAD_8X8_1 /* Load eight 8-byte rows from [x0] into v0-v7 */
+ ld1 {v0.8b}, [x0], x1 /* post-increment: x0 advances by the stride in x1 after every row */
+ ld1 {v1.8b}, [x0], x1
+ ld1 {v2.8b}, [x0], x1
+ ld1 {v3.8b}, [x0], x1
+ ld1 {v4.8b}, [x0], x1
+ ld1 {v5.8b}, [x0], x1
+ ld1 {v6.8b}, [x0], x1
+ ld1 {v7.8b}, [x0], x1 /* x0 is left pointing at the row after the eighth */
+.endm
+
+.macro LOAD_16X8_1 /* Load eight 16-byte rows from [x0] into v0-v7 */
+ ld1 {v0.16b}, [x0], x1 /* post-increment: x0 advances by the stride in x1 after every row */
+ ld1 {v1.16b}, [x0], x1
+ ld1 {v2.16b}, [x0], x1
+ ld1 {v3.16b}, [x0], x1
+ ld1 {v4.16b}, [x0], x1
+ ld1 {v5.16b}, [x0], x1
+ ld1 {v6.16b}, [x0], x1
+ ld1 {v7.16b}, [x0], x1 /* x0 is left pointing at the row after the eighth */
+.endm
+
+#ifdef __APPLE__
+.macro LOAD_8X8_2 /* __APPLE__ variant (positional macro arguments): load eight 8-byte rows into v16-v23 from the pointer register given as argument 0 */
+ ld1 {v16.8b}, [$0], x3 /* the pointer register is post-incremented by the stride in x3 after every row */
+ ld1 {v17.8b}, [$0], x3
+ ld1 {v18.8b}, [$0], x3
+ ld1 {v19.8b}, [$0], x3
+ ld1 {v20.8b}, [$0], x3
+ ld1 {v21.8b}, [$0], x3
+ ld1 {v22.8b}, [$0], x3
+ ld1 {v23.8b}, [$0], x3
+.endm
+
+.macro CALC_ABS_8X8_1 /* __APPLE__ variant: add |v0..v7 - v16..v23| (8 bytes per row) into the .8h accumulator given as argument 0 */
+ uab$1l $0, v0.8b, v16.8b /* argument 1 is d or a: uabdl overwrites the accumulator, uabal adds to what it already holds */
+ uabal $0, v1.8b, v17.8b
+ uabal $0, v2.8b, v18.8b
+ uabal $0, v3.8b, v19.8b
+ uabal $0, v4.8b, v20.8b
+ uabal $0, v5.8b, v21.8b
+ uabal $0, v6.8b, v22.8b
+ uabal $0, v7.8b, v23.8b
+.endm
+
+.macro CALC_ABS_8X8_2 /* __APPLE__ variant: add |v0..v7 - v18..v25| into v29, i.e. the second operand is taken two registers further on than in CALC_ABS_8X8_1 */
+ uab$0l v29.8h, v0.8b, v18.8b /* argument 0 is d or a: uabdl overwrites v29, uabal adds to what it already holds */
+ uabal v29.8h, v1.8b, v19.8b
+ uabal v29.8h, v2.8b, v20.8b
+ uabal v29.8h, v3.8b, v21.8b
+ uabal v29.8h, v4.8b, v22.8b
+ uabal v29.8h, v5.8b, v23.8b
+ uabal v29.8h, v6.8b, v24.8b /* v24 and v25 are not loaded here; the caller fills them before invoking the macro */
+ uabal v29.8h, v7.8b, v25.8b
+.endm
+
+.macro LOAD_16X8_2 /* __APPLE__ variant: load eight 16-byte rows into v16-v23 from the pointer register given as argument 0 */
+ ld1 {v16.16b}, [$0], x3 /* the pointer register is post-incremented by the stride in x3 after every row */
+ ld1 {v17.16b}, [$0], x3
+ ld1 {v18.16b}, [$0], x3
+ ld1 {v19.16b}, [$0], x3
+ ld1 {v20.16b}, [$0], x3
+ ld1 {v21.16b}, [$0], x3
+ ld1 {v22.16b}, [$0], x3
+ ld1 {v23.16b}, [$0], x3
+.endm
+
+.macro CALC_ABS_16X8_1 /* __APPLE__ variant: add |v0..v7 - v16..v23| (16 bytes per row) into the .8h accumulator given as argument 0 */
+ uab$1l $0, v0.8b, v16.8b /* argument 1 is d or a: uabdl overwrites the accumulator, uabal adds to it; this covers bytes 0-7 of the row */
+ uabal2 $0, v0.16b,v16.16b /* uabal2 adds the differences of bytes 8-15 of the same row */
+ uabal $0, v1.8b, v17.8b
+ uabal2 $0, v1.16b,v17.16b
+ uabal $0, v2.8b, v18.8b
+ uabal2 $0, v2.16b,v18.16b
+ uabal $0, v3.8b, v19.8b
+ uabal2 $0, v3.16b,v19.16b
+ uabal $0, v4.8b, v20.8b
+ uabal2 $0, v4.16b,v20.16b
+ uabal $0, v5.8b, v21.8b
+ uabal2 $0, v5.16b,v21.16b
+ uabal $0, v6.8b, v22.8b
+ uabal2 $0, v6.16b,v22.16b
+ uabal $0, v7.8b, v23.8b
+ uabal2 $0, v7.16b,v23.16b
+.endm
+
+.macro CALC_ABS_16X8_2 /* __APPLE__ variant: add |v0..v7 - v18..v25| (16 bytes per row) into v29; second operand is two registers further on than in CALC_ABS_16X8_1 */
+ uab$0l v29.8h, v0.8b, v18.8b /* argument 0 is d or a: uabdl overwrites v29, uabal adds to it; this covers bytes 0-7 of the row */
+ uabal2 v29.8h, v0.16b,v18.16b /* uabal2 adds the differences of bytes 8-15 of the same row */
+ uabal v29.8h, v1.8b, v19.8b
+ uabal2 v29.8h, v1.16b,v19.16b
+ uabal v29.8h, v2.8b, v20.8b
+ uabal2 v29.8h, v2.16b,v20.16b
+ uabal v29.8h, v3.8b, v21.8b
+ uabal2 v29.8h, v3.16b,v21.16b
+ uabal v29.8h, v4.8b, v22.8b
+ uabal2 v29.8h, v4.16b,v22.16b
+ uabal v29.8h, v5.8b, v23.8b
+ uabal2 v29.8h, v5.16b,v23.16b
+ uabal v29.8h, v6.8b, v24.8b /* v24 and v25 are not loaded here; the caller fills them before invoking the macro */
+ uabal2 v29.8h, v6.16b,v24.16b
+ uabal v29.8h, v7.8b, v25.8b
+ uabal2 v29.8h, v7.16b,v25.16b
+.endm
+#else
+.macro LOAD_8X8_2 arg0 /* Non-Apple variant (named macro arguments): load eight 8-byte rows into v16-v23 from the pointer register arg0 */
+ ld1 {v16.8b}, [\arg0], x3 /* the pointer register is post-incremented by the stride in x3 after every row */
+ ld1 {v17.8b}, [\arg0], x3
+ ld1 {v18.8b}, [\arg0], x3
+ ld1 {v19.8b}, [\arg0], x3
+ ld1 {v20.8b}, [\arg0], x3
+ ld1 {v21.8b}, [\arg0], x3
+ ld1 {v22.8b}, [\arg0], x3
+ ld1 {v23.8b}, [\arg0], x3
+.endm
+
+.macro CALC_ABS_8X8_1 arg0, arg1 /* Non-Apple variant: add |v0..v7 - v16..v23| (8 bytes per row) into the .8h accumulator arg0 */
+ uab\arg1\()l \arg0, v0.8b, v16.8b /* arg1 is d or a: uabdl overwrites the accumulator, uabal adds to what it already holds */
+ uabal \arg0, v1.8b, v17.8b
+ uabal \arg0, v2.8b, v18.8b
+ uabal \arg0, v3.8b, v19.8b
+ uabal \arg0, v4.8b, v20.8b
+ uabal \arg0, v5.8b, v21.8b
+ uabal \arg0, v6.8b, v22.8b
+ uabal \arg0, v7.8b, v23.8b
+.endm
+
+.macro CALC_ABS_8X8_2 arg0 /* Non-Apple variant: add |v0..v7 - v18..v25| into v29, i.e. the second operand is taken two registers further on than in CALC_ABS_8X8_1 */
+ uab\arg0\()l v29.8h, v0.8b, v18.8b /* arg0 is d or a: uabdl overwrites v29, uabal adds to what it already holds */
+ uabal v29.8h, v1.8b, v19.8b
+ uabal v29.8h, v2.8b, v20.8b
+ uabal v29.8h, v3.8b, v21.8b
+ uabal v29.8h, v4.8b, v22.8b
+ uabal v29.8h, v5.8b, v23.8b
+ uabal v29.8h, v6.8b, v24.8b /* v24 and v25 are not loaded here; the caller fills them before invoking the macro */
+ uabal v29.8h, v7.8b, v25.8b
+.endm
+
+.macro LOAD_16X8_2 arg0 /* Non-Apple variant: load eight 16-byte rows into v16-v23 from the pointer register arg0 */
+ ld1 {v16.16b}, [\arg0], x3 /* the pointer register is post-incremented by the stride in x3 after every row */
+ ld1 {v17.16b}, [\arg0], x3
+ ld1 {v18.16b}, [\arg0], x3
+ ld1 {v19.16b}, [\arg0], x3
+ ld1 {v20.16b}, [\arg0], x3
+ ld1 {v21.16b}, [\arg0], x3
+ ld1 {v22.16b}, [\arg0], x3
+ ld1 {v23.16b}, [\arg0], x3
+.endm
+
+.macro CALC_ABS_16X8_1 arg0, arg1 /* Non-Apple variant: add |v0..v7 - v16..v23| (16 bytes per row) into the .8h accumulator arg0 */
+ uab\arg1\()l \arg0, v0.8b, v16.8b /* arg1 is d or a: uabdl overwrites the accumulator, uabal adds to it; this covers bytes 0-7 of the row */
+ uabal2 \arg0, v0.16b,v16.16b /* uabal2 adds the differences of bytes 8-15 of the same row */
+ uabal \arg0, v1.8b, v17.8b
+ uabal2 \arg0, v1.16b,v17.16b
+ uabal \arg0, v2.8b, v18.8b
+ uabal2 \arg0, v2.16b,v18.16b
+ uabal \arg0, v3.8b, v19.8b
+ uabal2 \arg0, v3.16b,v19.16b
+ uabal \arg0, v4.8b, v20.8b
+ uabal2 \arg0, v4.16b,v20.16b
+ uabal \arg0, v5.8b, v21.8b
+ uabal2 \arg0, v5.16b,v21.16b
+ uabal \arg0, v6.8b, v22.8b
+ uabal2 \arg0, v6.16b,v22.16b
+ uabal \arg0, v7.8b, v23.8b
+ uabal2 \arg0, v7.16b,v23.16b
+.endm
+
+.macro CALC_ABS_16X8_2 arg0 /* Non-Apple variant: add |v0..v7 - v18..v25| (16 bytes per row) into v29; second operand is two registers further on than in CALC_ABS_16X8_1 */
+ uab\arg0\()l v29.8h, v0.8b, v18.8b /* arg0 is d or a: uabdl overwrites v29, uabal adds to it; this covers bytes 0-7 of the row */
+ uabal2 v29.8h, v0.16b,v18.16b /* uabal2 adds the differences of bytes 8-15 of the same row */
+ uabal v29.8h, v1.8b, v19.8b
+ uabal2 v29.8h, v1.16b,v19.16b
+ uabal v29.8h, v2.8b, v20.8b
+ uabal2 v29.8h, v2.16b,v20.16b
+ uabal v29.8h, v3.8b, v21.8b
+ uabal2 v29.8h, v3.16b,v21.16b
+ uabal v29.8h, v4.8b, v22.8b
+ uabal2 v29.8h, v4.16b,v22.16b
+ uabal v29.8h, v5.8b, v23.8b
+ uabal2 v29.8h, v5.16b,v23.16b
+ uabal v29.8h, v6.8b, v24.8b /* v24 and v25 are not loaded here; the caller fills them before invoking the macro */
+ uabal2 v29.8h, v6.16b,v24.16b
+ uabal v29.8h, v7.8b, v25.8b
+ uabal2 v29.8h, v7.16b,v25.16b
+.endm
+#endif
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad4x4_AArch64_neon /* w0 = SAD of two 4x4 byte blocks; x0/w1 = first block and its stride, x2/w3 = second block and its stride */
+ sxtw x1, w1 /* strides arrive as 32-bit ints; sign-extend them for 64-bit address arithmetic */
+ sxtw x3, w3
+ ld1 {v0.s}[0], [x0], x1 /* row 0: four bytes of each block into lane 0, then step to the next row */
+ ld1 {v1.s}[0], [x2], x3
+ uabdl v2.8h, v0.8b, v1.8b /* start the 16-bit accumulator with the absolute differences of row 0 */
+.rept 3 /* rows 1-3 */
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v1.s}[0], [x2], x3
+ uabal v2.8h, v0.8b, v1.8b /* accumulate the absolute differences of this row */
+.endr
+ saddlv s2, v2.4h /* only lanes 0-3 come from loaded pixels, so just the low four lanes are summed */
+ fmov w0, s2 /* result to w0 */
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad8x8_AArch64_neon /* w0 = SAD of two blocks of 8 bytes x 8 rows; x0/w1 = first block and stride, x2/w3 = second block and stride */
+ sxtw x1, w1 /* sign-extend the 32-bit strides for 64-bit address arithmetic */
+ sxtw x3, w3
+ ld1 {v0.8b}, [x0], x1 /* row 0 of each block, post-incrementing to the next row */
+ ld1 {v1.8b}, [x2], x3
+ uabdl v2.8h, v0.8b, v1.8b /* start the 16-bit accumulator with row 0 */
+.rept 7 /* rows 1-7 */
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x2], x3
+ uabal v2.8h, v0.8b, v1.8b /* accumulate the absolute differences of this row */
+.endr
+ CALC_AND_STORE_SAD /* sum the eight lanes of v2 into w0 */
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad8x16_AArch64_neon /* w0 = SAD of two blocks of 8 bytes x 16 rows; x0/w1 = first block and stride, x2/w3 = second block and stride */
+ sxtw x1, w1 /* sign-extend the 32-bit strides for 64-bit address arithmetic */
+ sxtw x3, w3
+ ld1 {v0.8b}, [x0], x1 /* row 0 of each block, post-incrementing to the next row */
+ ld1 {v1.8b}, [x2], x3
+ uabdl v2.8h, v0.8b, v1.8b /* start the 16-bit accumulator with row 0 */
+.rept 15 /* rows 1-15 */
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x2], x3
+ uabal v2.8h, v0.8b, v1.8b /* accumulate the absolute differences of this row */
+.endr
+ CALC_AND_STORE_SAD /* sum the eight lanes of v2 into w0 */
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad16x8_AArch64_neon /* w0 = SAD of two blocks of 16 bytes x 8 rows; x0/w1 = first block and stride, x2/w3 = second block and stride */
+ sxtw x1, w1 /* sign-extend the 32-bit strides for 64-bit address arithmetic */
+ sxtw x3, w3
+ ld1 {v0.16b}, [x0], x1 /* row 0 of each block, post-incrementing to the next row */
+ ld1 {v1.16b}, [x2], x3
+ uabdl v2.8h, v0.8b, v1.8b /* bytes 0-7 of row 0 start the 16-bit accumulator */
+ uabal2 v2.8h, v0.16b, v1.16b /* bytes 8-15 of row 0 are added into the same lanes */
+.rept 7 /* rows 1-7 */
+ ld1 {v0.16b}, [x0], x1
+ ld1 {v1.16b}, [x2], x3
+ uabal v2.8h, v0.8b, v1.8b
+ uabal2 v2.8h, v0.16b, v1.16b
+.endr
+ CALC_AND_STORE_SAD /* sum the eight lanes of v2 into w0 */
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad16x16_AArch64_neon /* w0 = SAD of two blocks of 16 bytes x 16 rows; x0/w1 = first block and stride, x2/w3 = second block and stride */
+ sxtw x1, w1 /* sign-extend the 32-bit strides for 64-bit address arithmetic */
+ sxtw x3, w3
+ ld1 {v0.16b}, [x0], x1 /* row 0 of each block, post-incrementing to the next row */
+ ld1 {v1.16b}, [x2], x3
+ uabdl v2.8h, v0.8b, v1.8b /* bytes 0-7 of row 0 start the 16-bit accumulator */
+ uabal2 v2.8h, v0.16b, v1.16b /* bytes 8-15 of row 0 are added into the same lanes */
+.rept 15 /* rows 1-15; each lane ends up with 32 byte differences, at most 32*255, which fits in 16 bits */
+ ld1 {v0.16b}, [x0], x1
+ ld1 {v1.16b}, [x2], x3
+ uabal v2.8h, v0.8b, v1.8b
+ uabal2 v2.8h, v0.16b, v1.16b
+.endr
+ CALC_AND_STORE_SAD /* sum the eight lanes of v2 into w0 */
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour4x4_AArch64_neon /* Four 4x4 SADs of the block at x0 (stride w1) against x2 - stride, x2 + stride, x2 - 1, x2 + 1 (stride w3); four 32-bit results stored at [x4] in that order */
+ sxtw x1, w1 /* sign-extend the 32-bit strides for 64-bit address arithmetic */
+ sxtw x3, w3
+ ld1 {v0.s}[0], [x0], x1 /* source rows 0 and 1 packed into the two 32-bit lanes of v0 */
+ ld1 {v0.s}[1], [x0], x1
+ ld1 {v1.s}[0], [x0], x1 /* source rows 2 and 3 packed into v1 */
+ ld1 {v1.s}[1], [x0]
+ sub x0, x2, x3 /* x0 is reused as the reference pointer, starting one row above x2 */
+ ld1 {v2.s}[0], [x0], x3 /* v2 = reference rows -1, 0 */
+ ld1 {v2.s}[1], [x0], x3
+ ld1 {v3.s}[0], [x0], x3 /* v3 = reference rows 1, 2 */
+ ld1 {v3.s}[1], [x0], x3
+ ld1 {v4.s}[0], [x0], x3 /* v4 = reference rows 3, 4 */
+ ld1 {v4.s}[1], [x0], x3
+
+ uabdl v28.8h, v0.8b, v2.8b /* v28: source rows 0-3 against reference rows -1..2 (x2 - stride) */
+ uabal v28.8h, v1.8b, v3.8b
+
+ uabdl v29.8h, v0.8b, v3.8b /* v29: source rows 0-3 against reference rows 1..4 (x2 + stride) */
+ uabal v29.8h, v1.8b, v4.8b
+
+ sub x0, x2, #1 /* reference moved one byte back */
+ ld1 {v2.s}[0], [x0], x3
+ ld1 {v2.s}[1], [x0], x3
+ ld1 {v3.s}[0], [x0], x3
+ ld1 {v3.s}[1], [x0]
+ uabdl v30.8h, v0.8b, v2.8b /* v30: SAD against x2 - 1 */
+ uabal v30.8h, v1.8b, v3.8b
+
+ add x0, x2, #1 /* reference moved one byte forward */
+ ld1 {v2.s}[0], [x0], x3
+ ld1 {v2.s}[1], [x0], x3
+ ld1 {v3.s}[0], [x0], x3
+ ld1 {v3.s}[1], [x0]
+ uabdl v31.8h, v0.8b, v2.8b /* v31: SAD against x2 + 1 */
+ uabal v31.8h, v1.8b, v3.8b
+
+ CALC_AND_STORE_SAD_FOUR /* reduce v28-v31 and store the four totals at [x4] */
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour8x8_AArch64_neon /* Four SADs (8 bytes x 8 rows) of the block at x0 (stride w1) against x2 - stride, x2 + stride, x2 - 1, x2 + 1 (stride w3); four 32-bit results stored at [x4] in that order */
+ sxtw x1, w1 /* sign-extend the 32-bit strides for 64-bit address arithmetic */
+ sxtw x3, w3
+ LOAD_8X8_1 /* v0-v7 = the eight source rows */
+ sub x0, x2, x3 /* x0 is reused as the reference pointer, starting one row above x2 */
+ LOAD_8X8_2 x0 /* v16-v23 = reference rows -1..6 */
+ ld1 {v24.8b}, [x0], x3 /* v24 = reference row 7 */
+ ld1 {v25.8b}, [x0] /* v25 = reference row 8 */
+
+ CALC_ABS_8X8_1 v28.8h, d /* v28: source against reference rows -1..6 (x2 - stride) */
+ CALC_ABS_8X8_2 d /* v29: source against reference rows 1..8 held in v18-v25 (x2 + stride) */
+
+ sub x0, x2, #1 /* reference moved one byte back */
+ LOAD_8X8_2 x0
+ CALC_ABS_8X8_1 v30.8h, d /* v30: SAD against x2 - 1 */
+
+ add x0, x2, #1 /* reference moved one byte forward */
+ LOAD_8X8_2 x0
+ CALC_ABS_8X8_1 v31.8h, d /* v31: SAD against x2 + 1 */
+
+ CALC_AND_STORE_SAD_FOUR /* reduce v28-v31 and store the four totals at [x4] */
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour8x16_AArch64_neon /* Four SADs (8 bytes x 16 rows) of the block at x0 (stride w1) against x2 - stride, x2 + stride, x2 - 1, x2 + 1 (stride w3); results stored at [x4]; done as two 8-row halves */
+ sxtw x1, w1 /* sign-extend the 32-bit strides for 64-bit address arithmetic */
+ sxtw x3, w3
+ LOAD_8X8_1 /* v0-v7 = source rows 0-7; x0 keeps advancing for the second half */
+ sub x5, x2, x3 /* x5 walks the reference starting one row above x2 */
+ LOAD_8X8_2 x5 /* v16-v23 = reference rows -1..6 */
+ ld1 {v24.8b}, [x5], x3 /* v24 = reference row 7 */
+ ld1 {v25.8b}, [x5], x3 /* v25 = reference row 8; x5 now points at row 9 */
+
+ CALC_ABS_8X8_1 v28.8h, d /* v28 starts with the first half against x2 - stride */
+ CALC_ABS_8X8_2 d /* v29 starts with the first half against x2 + stride */
+
+ sub x6, x2, #1 /* x6 walks the reference moved one byte back */
+ LOAD_8X8_2 x6
+ CALC_ABS_8X8_1 v30.8h, d /* v30 starts with the first half against x2 - 1 */
+
+ add x7, x2, #1 /* x7 walks the reference moved one byte forward */
+ LOAD_8X8_2 x7
+ CALC_ABS_8X8_1 v31.8h, d /* v31 starts with the first half against x2 + 1 */
+
+ LOAD_8X8_1 /* v0-v7 = source rows 8-15 */
+ sub x5, x5, x3 /* rewind x5 by two rows, from row 9 back to row 7 */
+ sub x5, x5, x3
+ LOAD_8X8_2 x5 /* v16-v23 = reference rows 7..14 */
+ ld1 {v24.8b}, [x5], x3 /* v24 = reference row 15 */
+ ld1 {v25.8b}, [x5] /* v25 = reference row 16 */
+
+ CALC_ABS_8X8_1 v28.8h, a /* a = accumulate onto the first-half sums */
+ CALC_ABS_8X8_2 a
+
+ LOAD_8X8_2 x6 /* x6 was left at row 8 of the x2 - 1 column by the first pass */
+ CALC_ABS_8X8_1 v30.8h, a
+
+ LOAD_8X8_2 x7 /* x7 was left at row 8 of the x2 + 1 column by the first pass */
+ CALC_ABS_8X8_1 v31.8h, a
+
+ CALC_AND_STORE_SAD_FOUR /* reduce v28-v31 and store the four totals at [x4] */
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour16x8_AArch64_neon /* Four SADs (16 bytes x 8 rows) of the block at x0 (stride w1) against x2 - stride, x2 + stride, x2 - 1, x2 + 1 (stride w3); four 32-bit results stored at [x4] in that order */
+ sxtw x1, w1 /* sign-extend the 32-bit strides for 64-bit address arithmetic */
+ sxtw x3, w3
+ LOAD_16X8_1 /* v0-v7 = the eight source rows */
+ sub x0, x2, x3 /* x0 is reused as the reference pointer, starting one row above x2 */
+ LOAD_16X8_2 x0 /* v16-v23 = reference rows -1..6 */
+ ld1 {v24.16b}, [x0], x3 /* v24 = reference row 7 */
+ ld1 {v25.16b}, [x0] /* v25 = reference row 8 */
+
+ CALC_ABS_16X8_1 v28.8h, d /* v28: source against reference rows -1..6 (x2 - stride) */
+ CALC_ABS_16X8_2 d /* v29: source against reference rows 1..8 held in v18-v25 (x2 + stride) */
+
+ sub x0, x2, #1 /* reference moved one byte back */
+ LOAD_16X8_2 x0
+ CALC_ABS_16X8_1 v30.8h, d /* v30: SAD against x2 - 1 */
+
+ add x0, x2, #1 /* reference moved one byte forward */
+ LOAD_16X8_2 x0
+ CALC_ABS_16X8_1 v31.8h, d /* v31: SAD against x2 + 1 */
+
+ CALC_AND_STORE_SAD_FOUR /* reduce v28-v31 and store the four totals at [x4] */
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour16x16_AArch64_neon /* Four SADs (16 bytes x 16 rows) of the block at x0 (stride w1) against x2 - stride, x2 + stride, x2 - 1, x2 + 1 (stride w3); results stored at [x4]; done as two 8-row halves */
+ sxtw x1, w1 /* sign-extend the 32-bit strides for 64-bit address arithmetic */
+ sxtw x3, w3
+
+ LOAD_16X8_1 /* v0-v7 = source rows 0-7; x0 keeps advancing for the second half */
+ sub x5, x2, x3 /* x5 walks the reference starting one row above x2 */
+ LOAD_16X8_2 x5 /* v16-v23 = reference rows -1..6 */
+ ld1 {v24.16b}, [x5], x3 /* v24 = reference row 7 */
+ ld1 {v25.16b}, [x5], x3 /* v25 = reference row 8; x5 now points at row 9 */
+
+ CALC_ABS_16X8_1 v28.8h, d /* v28 starts with the first half against x2 - stride */
+ CALC_ABS_16X8_2 d /* v29 starts with the first half against x2 + stride */
+
+ sub x6, x2, #1 /* x6 walks the reference moved one byte back */
+ LOAD_16X8_2 x6
+ CALC_ABS_16X8_1 v30.8h, d /* v30 starts with the first half against x2 - 1 */
+
+ add x7, x2, #1 /* x7 walks the reference moved one byte forward */
+ LOAD_16X8_2 x7
+ CALC_ABS_16X8_1 v31.8h, d /* v31 starts with the first half against x2 + 1 */
+
+ LOAD_16X8_1 /* v0-v7 = source rows 8-15 */
+ sub x5, x5, x3 /* rewind x5 by two rows, from row 9 back to row 7 */
+ sub x5, x5, x3
+ LOAD_16X8_2 x5 /* v16-v23 = reference rows 7..14 */
+ ld1 {v24.16b}, [x5], x3 /* v24 = reference row 15 */
+ ld1 {v25.16b}, [x5] /* v25 = reference row 16 */
+
+ CALC_ABS_16X8_1 v28.8h, a /* a = accumulate onto the first-half sums */
+ CALC_ABS_16X8_2 a
+
+ LOAD_16X8_2 x6 /* x6 was left at row 8 of the x2 - 1 column by the first pass */
+ CALC_ABS_16X8_1 v30.8h, a
+
+ LOAD_16X8_2 x7 /* x7 was left at row 8 of the x2 + 1 column by the first pass */
+ CALC_ABS_16X8_1 v31.8h, a
+
+ CALC_AND_STORE_SAD_FOUR /* reduce v28-v31 and store the four totals at [x4] */
+WELS_ASM_ARCH64_FUNC_END
+#endif
\ No newline at end of file
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -413,6 +413,23 @@
pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_neon;
}
#endif
+
+#if defined (HAVE_NEON_AARCH64)
+ if (uiCpuFlag & WELS_CPU_NEON) {
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4 ] = WelsSampleSad4x4_AArch64_neon;
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_AArch64_neon;
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_AArch64_neon;
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_AArch64_neon;
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_AArch64_neon;
+
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_AArch64_neon;
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_AArch64_neon;
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_AArch64_neon;
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_AArch64_neon;
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_AArch64_neon;
+
+ }
+#endif
}
} // namespace WelsSVCEnc
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -60,6 +60,13 @@
ENCODER_OBJS += $(ENCODER_ASM_ARM_SRCS:.S=.$(OBJ))
endif
+ifeq ($(ASM_ARCH), arm64) # when ASM_ARCH is arm64, add the AArch64 assembly sources to the encoder objects
+ENCODER_ASM_ARM64_SRCS=\
+ $(ENCODER_SRCDIR)/core/arm64/pixel_neon_aarch64.S\
+
+ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ))
+endif # ASM_ARCH is arm64
+
OBJS += $(ENCODER_OBJS)
$(ENCODER_SRCDIR)/%.$(OBJ): $(ENCODER_SRCDIR)/%.cpp
$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c $(CXX_O) $<