ref: 300bbfb67b745a35bfff0e5819e9dde34c8061ab
parent: 775b507941d40561a0fdfb4544ab1df3119d5167
parent: e389cf4348c51b76f4883ceed171e60f53a5a6a8
author: dongzha <[email protected]>
date: Fri Jun 27 11:55:33 EDT 2014
Merge pull request #1014 from zhilwang/arm64-intraSad

Add arm64 neon code for intraSad&Satd
--- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
@@ -7,6 +7,7 @@
objects = {
/* Begin PBXBuildFile section */
+ 4C23BC60195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S */; };
4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066618C57D0400DFA14A /* intra_pred_neon.S */; };
4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */; };
4C34067018C57D0400DFA14A /* memory_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066918C57D0400DFA14A /* memory_neon.S */; };
@@ -61,6 +62,7 @@
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
+ 4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = intra_pred_sad_3_opt_aarch64_neon.S; path = arm64/intra_pred_sad_3_opt_aarch64_neon.S; sourceTree = "<group>"; };
4C34066618C57D0400DFA14A /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_sad_3_opt_neon.S; sourceTree = "<group>"; };
4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
@@ -182,6 +184,7 @@
4CB8F2B219235FAC005D6386 /* arm64 */ = {
isa = PBXGroup;
children = (
+ 4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S */,
4CBC1B82194ACBB400214D9E /* intra_pred_aarch64_neon.S */,
4CB8F2B319235FC5005D6386 /* pixel_aarch64_neon.S */,
);
@@ -422,6 +425,7 @@
4CE4472518BC605C0017DF25 /* svc_encode_mb.cpp in Sources */,
4CE4471A18BC605C0017DF25 /* mv_pred.cpp in Sources */,
4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */,
+ 4C23BC60195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S in Sources */,
4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */,
4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */,
4CBC1B83194ACBB400214D9E /* intra_pred_aarch64_neon.S in Sources */,
--- /dev/null
+++ b/codec/encoder/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S
@@ -1,0 +1,665 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
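+//AArch64 NEON implementations of the combined intra-mode cost functions
+//(SAD and SATD of a block against its V, H and DC intra predictions) used
+//by the encoder's intra mode decision.
+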
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
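+//Load the 16 top neighboring pixels of the current luma MB into v0 and the
+//16 left neighboring pixels (one per row, stride x1) into v1.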
+.macro LOAD_LUMA_DATA
+ sub x7, x0, x1
+ ld1 {v0.16b}, [x7] //top
+ sub x7, x0, #1
+ ld1 {v1.b}[0], [x7], x1
+ ld1 {v1.b}[1], [x7], x1
+ ld1 {v1.b}[2], [x7], x1
+ ld1 {v1.b}[3], [x7], x1
+ ld1 {v1.b}[4], [x7], x1
+ ld1 {v1.b}[5], [x7], x1
+ ld1 {v1.b}[6], [x7], x1
+ ld1 {v1.b}[7], [x7], x1
+ ld1 {v1.b}[8], [x7], x1
+ ld1 {v1.b}[9], [x7], x1
+ ld1 {v1.b}[10], [x7], x1
+ ld1 {v1.b}[11], [x7], x1
+ ld1 {v1.b}[12], [x7], x1
+ ld1 {v1.b}[13], [x7], x1
+ ld1 {v1.b}[14], [x7], x1
+ ld1 {v1.b}[15], [x7] //left
+.endm
+
+.macro LOAD_16X4_DATA
+ //Load four 16-byte rows of p_enc and interleave 4-pixel groups into v22 ~ v25 (16x4 bytes)
+ ld1 {v0.16b}, [x2], x3
+ ld1 {v1.16b}, [x2], x3
+ ld1 {v20.16b}, [x2], x3
+ ld1 {v21.16b}, [x2], x3
+ trn1 v22.4s, v0.4s, v1.4s
+ trn2 v23.4s, v0.4s, v1.4s
+ trn1 v24.4s, v20.4s, v21.4s
+ trn2 v25.4s, v20.4s, v21.4s
+.endm
+
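+//Butterfly network computing the 4-point Hadamard transform of each group
+//of four reference samples in v4/v5; the vertical-mode coefficients end up
+//interleaved in v6/v7 (lane order noted in the comment below).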
+.macro GET_16X16_V_SATD
+ trn1 v6.4s, v4.4s, v5.4s
+ trn2 v7.4s, v4.4s, v5.4s
+ add v4.8h, v6.8h, v7.8h
+ sub v5.8h, v6.8h, v7.8h
+ trn1 v6.8h, v4.8h, v5.8h
+ trn2 v7.8h, v4.8h, v5.8h
+ add v4.8h, v6.8h, v7.8h
+ sub v5.8h, v6.8h, v7.8h
+ trn1 v6.4s, v4.4s, v5.4s
+ trn2 v7.4s, v4.4s, v5.4s //{0,1,3,2, 4,5,7,6} v6 {8,9,11,10, 12,13,15,14} v7
+.endm
+
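+//Same butterfly, applied to the left-column reference; the horizontal-mode
+//coefficients end up in v16/v17.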
+.macro GET_16X16_H_SATD
+ trn1 v16.4s, v4.4s, v5.4s
+ trn2 v17.4s, v4.4s, v5.4s
+ add v4.8h, v16.8h, v17.8h
+ sub v5.8h, v16.8h, v17.8h
+ trn1 v16.8h, v4.8h, v5.8h
+ trn2 v17.8h, v4.8h, v5.8h
+ add v4.8h, v16.8h, v17.8h
+ sub v5.8h, v16.8h, v17.8h
+ trn1 v16.4s, v4.4s, v5.4s
+ trn2 v17.4s, v4.4s, v5.4s //{0,1,3,2, 4,5,7,6} v16 {8,9,11,10, 12,13,15,14} v17
+.endm
+
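+//The macros below are defined twice: Apple's assembler spells macro
+//arguments $0, $1, ... while GNU as spells them \arg0, \arg1, ... (see the
+//#else branch). SELECT_BEST_COST keeps the minimum of {$0, w1, w2} in $0
+//and records the index of the winning mode (0, 1 or 2) in w7.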
+#ifdef __APPLE__
+.macro SELECT_BEST_COST
+ cmp w1, $0
+ csel $0, $0, w1, hs
+ cset w7, lo
+ cmp w2, $0
+ mov w6, #2
+ csel $0, $0, w2, hs
+ csel w7, w7, w6, hs
+.endm
+
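+//Load the 8 top neighboring pixels of a chroma block into byte lanes 0-7
+//and the 8 left neighboring pixels into byte lanes 8-15 of the same vector.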
+.macro LOAD_CHROMA_DATA
+ sub x9, $0, x1
+ ld1 {$1}, [x9] //top_cb
+ sub x9, $0, #1
+ ld1 {$2}[8], [x9], x1
+ ld1 {$2}[9], [x9], x1
+ ld1 {$2}[10], [x9], x1
+ ld1 {$2}[11], [x9], x1
+ ld1 {$2}[12], [x9], x1
+ ld1 {$2}[13], [x9], x1
+ ld1 {$2}[14], [x9], x1
+ ld1 {$2}[15], [x9], x1 //left_cb
+.endm
+
+.macro LOAD_8X4_DATA
+ //Load four 8-byte rows of p_enc and interleave them into v20 ~ v21 (8x4 bytes)
+ ld1 {v0.8b}, [$0], x3
+ ld1 {v1.8b}, [$0], x3
+ ld1 {v0.d}[1], [$0], x3
+ ld1 {v1.d}[1], [$0], x3
+ trn1 v2.4s, v0.4s, v1.4s
+ trn2 v1.4s, v0.4s, v1.4s
+ trn1 v20.2d, v2.2d, v1.2d
+ trn2 v21.2d, v2.2d, v1.2d
+.endm
+
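+//Hadamard-transform one 4x4 source block (rows 0-1 in $0, rows 2-3 in $1)
+//and accumulate the absolute transformed differences against the vertical
+//($2), horizontal ($3) and DC ($4) reference coefficients into the cost
+//accumulators $5, $6 and $7; $8 is a zero register and $9 selects the
+//l/l2 (low/high half) widening variant.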
+.macro HDM_TRANSFORM_4X4_L0
+ //Do the vertical transform
+ uadd$9 v0.8h, $0, $1
+ usub$9 v1.8h, $0, $1
+ trn1 v3.2d, v0.2d, v1.2d
+ trn2 v1.2d, v0.2d, v1.2d
+ add v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
+ sub v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}
+
+ //Do the horizontal transform
+ trn1 v0.4s, v4.4s, v5.4s
+ trn2 v1.4s, v4.4s, v5.4s
+ add v4.8h, v0.8h, v1.8h
+ sub v5.8h, v0.8h, v1.8h
+ trn1 v0.8h, v4.8h, v5.8h
+ trn2 v1.8h, v4.8h, v5.8h
+ add v4.8h, v0.8h, v1.8h
+ sub v5.8h, v0.8h, v1.8h
+
+ //16x16_v
+ trn1 v0.2s, v4.2s, v5.2s
+ trn2 v1.2s, v4.2s, v5.2s
+ sabal $5, v0.4h, $2
+ sabal $5, v1.4h, $8.4h
+ sabal2 $5, v4.8h, $8.8h
+ sabal2 $5, v5.8h, $8.8h
+
+ //16x16_h
+ ins v3.d[0], v4.d[1]
+ trn1 v0.4h, v4.4h, v3.4h
+ trn2 v1.4h, v4.4h, v3.4h
+ sabal $6, v0.4h, $3
+ sabdl v4.4s, v1.4h, $8.4h
+ sabal v4.4s, v5.4h, $8.4h
+ sabal2 v4.4s, v5.8h, $8.8h
+ add $6, $6, v4.4s
+
+ //16x16_dc_both
+ sabal $7, v0.4h, $4
+ add $7, $7, v4.4s
+.endm
+#else
+.macro SELECT_BEST_COST arg0
+ cmp w1, \arg0
+ csel \arg0, \arg0, w1, hs
+ cset w7, lo
+ cmp w2, \arg0
+ mov w6, #2
+ csel \arg0, \arg0, w2, hs
+ csel w7, w7, w6, hs
+.endm
+
+.macro LOAD_CHROMA_DATA arg0, arg1, arg2
+ sub x9, \arg0, x1
+ ld1 {\arg1}, [x9] //top_cb
+ sub x9, \arg0, #1
+ ld1 {\arg2}[8], [x9], x1
+ ld1 {\arg2}[9], [x9], x1
+ ld1 {\arg2}[10], [x9], x1
+ ld1 {\arg2}[11], [x9], x1
+ ld1 {\arg2}[12], [x9], x1
+ ld1 {\arg2}[13], [x9], x1
+ ld1 {\arg2}[14], [x9], x1
+ ld1 {\arg2}[15], [x9], x1 //left_cb
+.endm
+
+.macro LOAD_8X4_DATA arg0
+ //Load four 8-byte rows of p_enc and interleave them into v20 ~ v21 (8x4 bytes)
+ ld1 {v0.8b}, [\arg0], x3
+ ld1 {v1.8b}, [\arg0], x3
+ ld1 {v0.d}[1], [\arg0], x3
+ ld1 {v1.d}[1], [\arg0], x3
+ trn1 v2.4s, v0.4s, v1.4s
+ trn2 v1.4s, v0.4s, v1.4s
+ trn1 v20.2d, v2.2d, v1.2d
+ trn2 v21.2d, v2.2d, v1.2d
+.endm
+
+.macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+ //Do the vertical transform
+ uadd\arg9\() v0.8h, \arg0, \arg1
+ usub\arg9\() v1.8h, \arg0, \arg1
+ trn1 v3.2d, v0.2d, v1.2d
+ trn2 v1.2d, v0.2d, v1.2d
+ add v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
+ sub v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}
+
+ //Do the horizontal transform
+ trn1 v0.4s, v4.4s, v5.4s
+ trn2 v1.4s, v4.4s, v5.4s
+ add v4.8h, v0.8h, v1.8h
+ sub v5.8h, v0.8h, v1.8h
+ trn1 v0.8h, v4.8h, v5.8h
+ trn2 v1.8h, v4.8h, v5.8h
+ add v4.8h, v0.8h, v1.8h
+ sub v5.8h, v0.8h, v1.8h
+
+ //16x16_v
+ trn1 v0.2s, v4.2s, v5.2s
+ trn2 v1.2s, v4.2s, v5.2s
+ sabal \arg5, v0.4h, \arg2
+ sabal \arg5, v1.4h, \arg8\().4h
+ sabal2 \arg5, v4.8h, \arg8\().8h
+ sabal2 \arg5, v5.8h, \arg8\().8h
+
+ //16x16_h
+ ins v3.d[0], v4.d[1]
+ trn1 v0.4h, v4.4h, v3.4h
+ trn2 v1.4h, v4.4h, v3.4h
+ sabal \arg6, v0.4h, \arg3
+ sabdl v4.4s, v1.4h, \arg8\().4h
+ sabal v4.4s, v5.4h, \arg8\().4h
+ sabal2 v4.4s, v5.8h, \arg8\().8h
+ add \arg6, \arg6, v4.4s
+
+ //16x16_dc_both
+ sabal \arg7, v0.4h, \arg4
+ add \arg7, \arg7, v4.4s
+.endm
+#endif
+
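+//Chroma 8x8 mode decision by SAD: build the DC predictors for Cb (x0) and
+//Cr (x7), then accumulate the SAD of the encoded Cb/Cr blocks (x2, [sp,#0])
+//against the top (V), left (H) and DC predictions row by row. H and V costs
+//are penalized by 2*lambda (w5); the minimum cost is returned in w0 and the
+//winning mode index is stored through x4.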
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon
+ ldr x11, [sp, #0]
+
+ LOAD_CHROMA_DATA x0, v0.8b, v0.b
+
+ uaddlp v1.8h, v0.16b
+ uaddlp v2.4s, v1.8h
+ ins v3.d[0], v2.d[1]
+ add v3.2s, v2.2s, v3.2s
+ urshr v2.4s, v2.4s, #2
+ urshr v3.2s, v3.2s, #3
+
+ dup v20.8b, v3.b[0]
+ dup v21.8b, v2.b[4]
+ dup v22.8b, v2.b[12]
+ dup v23.8b, v3.b[4]
+ ins v20.s[1], v21.s[0]
+ ins v22.s[1], v23.s[0]
+
+ LOAD_CHROMA_DATA x7, v4.8b, v4.b
+
+ uaddlp v5.8h, v4.16b
+ uaddlp v6.4s, v5.8h
+ ins v7.d[0], v6.d[1]
+ add v7.2s, v6.2s, v7.2s
+ urshr v6.4s, v6.4s, #2
+ urshr v7.2s, v7.2s, #3
+
+ dup v24.8b, v7.b[0]
+ dup v25.8b, v6.b[4]
+ dup v26.8b, v6.b[12]
+ dup v27.8b, v7.b[4]
+ ins v24.s[1], v25.s[0]
+ ins v26.s[1], v27.s[0]
+
+ sub x9, x0, #1
+ sub x10, x7, #1
+
+ ld1 {v3.8b}, [x2], x3
+ ld1 {v5.8b}, [x11], x3
+
+ ld1r {v6.8b}, [x9], x1
+ ld1r {v7.8b}, [x10], x1
+
+ uabdl v29.8h, v0.8b, v3.8b
+ uabal v29.8h, v4.8b, v5.8b //top
+
+ uabdl v30.8h, v6.8b, v3.8b
+ uabal v30.8h, v7.8b, v5.8b //left
+
+ uabdl v31.8h, v20.8b, v3.8b
+ uabal v31.8h, v24.8b, v5.8b //Dc
+.rept 3
+ ld1 {v3.8b}, [x2], x3
+ ld1 {v5.8b}, [x11], x3
+
+ ld1r {v6.8b}, [x9], x1
+ ld1r {v7.8b}, [x10], x1
+
+ uabal v29.8h, v0.8b, v3.8b
+ uabal v29.8h, v4.8b, v5.8b //top
+
+ uabal v30.8h, v6.8b, v3.8b
+ uabal v30.8h, v7.8b, v5.8b //left
+
+ uabal v31.8h, v20.8b, v3.8b
+ uabal v31.8h, v24.8b, v5.8b //Dc
+.endr
+
+.rept 4
+ ld1 {v3.8b}, [x2], x3
+ ld1 {v5.8b}, [x11], x3
+
+ ld1r {v6.8b}, [x9], x1
+ ld1r {v7.8b}, [x10], x1
+
+ uabal v29.8h, v0.8b, v3.8b
+ uabal v29.8h, v4.8b, v5.8b //top
+
+ uabal v30.8h, v6.8b, v3.8b
+ uabal v30.8h, v7.8b, v5.8b //left
+
+ uabal v31.8h, v22.8b, v3.8b
+ uabal v31.8h, v26.8b, v5.8b //Dc
+.endr
+
+ saddlv s29, v29.8h
+ fmov w2, s29
+ add w2, w2, w5, lsl #1
+ saddlv s30, v30.8h
+ fmov w1, s30
+ add w1, w1, w5, lsl #1
+ saddlv s31, v31.8h
+ fmov w0, s31
+
+ SELECT_BEST_COST w0
+
+ str w7, [x4]
+WELS_ASM_ARCH64_FUNC_END
+
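+//Luma 16x16 mode decision by SAD: build the DC predictor from the 32
+//neighboring pixels, then accumulate SAD against the top (V), left (H) and
+//DC predictions row by row. Here H and DC carry the 2*lambda penalty; the
+//minimum cost is returned in w0 and the mode index is stored through x4.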
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra16x16Combined3Sad_AArch64_neon
+
+ LOAD_LUMA_DATA
+
+ uaddlv h2, v0.16b
+ uaddlv h3, v1.16b
+ add v2.8h, v2.8h, v3.8h
+ uqrshrn b2, h2, #5
+ dup v2.16b, v2.b[0] //Dc
+
+ sub x7, x0, #1
+ ld1 {v3.16b}, [x2], x3
+ ld1r {v4.16b}, [x7], x1
+
+ uabdl v29.8h, v0.8b, v3.8b
+ uabal2 v29.8h, v0.16b, v3.16b //top
+
+ uabdl v30.8h, v4.8b, v3.8b
+ uabal2 v30.8h, v4.16b, v3.16b //left
+
+ uabdl v31.8h, v2.8b, v3.8b
+ uabal2 v31.8h, v2.16b, v3.16b //Dc
+ mov x6, #15
+sad_intra_16x16_x3_opt_loop0:
+ ld1 {v3.16b}, [x2], x3
+ ld1r {v4.16b}, [x7], x1
+
+ uabal v29.8h, v0.8b, v3.8b
+ uabal2 v29.8h, v0.16b, v3.16b //top
+
+ uabal v30.8h, v4.8b, v3.8b
+ uabal2 v30.8h, v4.16b, v3.16b //left
+
+ uabal v31.8h, v2.8b, v3.8b
+ uabal2 v31.8h, v2.16b, v3.16b //Dc
+ sub x6, x6, #1
+ cbnz x6, sad_intra_16x16_x3_opt_loop0
+
+ saddlv s29, v29.8h
+ fmov w0, s29
+ saddlv s30, v30.8h
+ fmov w1, s30
+ add w1, w1, w5, lsl #1
+ saddlv s31, v31.8h
+ fmov w2, s31
+ add w2, w2, w5, lsl #1
+
+ SELECT_BEST_COST w0
+
+ str w7, [x4]
+WELS_ASM_ARCH64_FUNC_END
+
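+//4x4 mode decision by SATD: Hadamard-transform the source block and compare
+//it against the pre-transformed V, H and DC references, adding the per-mode
+//cost arguments ([sp,#0] for V, w7 for H, w6 for DC). The winning 4x4
+//prediction block is written to x4, the mode index to x5, and the best cost
+//is returned in w0.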
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra4x4Combined3Satd_AArch64_neon
+ sub x9, x0, x1
+ ld1 {v16.s}[0], [x9] //top
+ sub x9, x0, #1
+ ld1 {v16.b}[4], [x9], x1
+ ld1 {v16.b}[5], [x9], x1
+ ld1 {v16.b}[6], [x9], x1
+ ld1 {v16.b}[7], [x9], x1
+
+ uaddlv h2, v16.8b
+ uqrshrn b17, h2, #3
+ urshr v2.4h, v2.4h, #3
+ shl v2.4h, v2.4h, #4
+
+ //Calculate the 4x4_v 4x4_h mode SATD and save to "v6, v7"
+ ushll v4.8h, v16.8b, #2
+ ins v5.d[0], v4.d[1]
+ trn1 v6.2s, v4.2s, v5.2s
+ trn2 v7.2s, v4.2s, v5.2s
+
+ add v4.4h, v6.4h, v7.4h
+ sub v5.4h, v6.4h, v7.4h
+ trn1 v6.4h, v4.4h, v5.4h
+ trn2 v7.4h, v4.4h, v5.4h
+ add v4.4h, v6.4h, v7.4h
+ sub v5.4h, v6.4h, v7.4h
+ trn1 v6.2s, v4.2s, v5.2s
+ trn2 v7.2s, v4.2s, v5.2s //{0,1,3,2,top} v6 {0,1,3,2,left} v7
+
+ eor v31.16b, v31.16b, v31.16b //Save the SATD of DC_BOTH
+ eor v30.16b, v30.16b, v30.16b //Save the SATD of H
+ eor v29.16b, v29.16b, v29.16b //Save the SATD of V
+ eor v28.16b, v28.16b, v28.16b //For zero register
+
+ //Load the p_enc data and save to "v22 ~ v23"--- 16X4 bytes
+ ld1 {v22.s}[0], [x2], x3
+ ld1 {v22.s}[1], [x2], x3
+ ld1 {v23.s}[0], [x2], x3
+ ld1 {v23.s}[1], [x2], x3
+
+ HDM_TRANSFORM_4X4_L0 v22.8b, v23.8b, v6.4h, v7.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+
+ ldr x11, [sp, #0]
+ urshr v29.4s, v29.4s, #1
+ addv s29, v29.4s
+ fmov w0, s29
+ add w0, w0, w11
+
+ urshr v30.4s, v30.4s, #1
+ addv s30, v30.4s
+ fmov w1, s30
+ add w1, w1, w7
+
+ urshr v31.4s, v31.4s, #1
+ addv s31, v31.4s
+ fmov w2, s31
+ add w2, w2, w6
+
+ mov w10, w0
+ SELECT_BEST_COST w10
+
+ str w7, [x5]
+
+ sub w9, w10, w2
+ cbnz w9, satd_intra_4x4_x3_opt_jump0
+ dup v0.16b, v17.b[0]
+ st1 {v0.16b}, [x4]
+ b satd_intra_4x4_x3_opt_end
+
+satd_intra_4x4_x3_opt_jump0:
+ sub w8, w10, w1
+ cbnz w8, satd_intra_4x4_x3_opt_jump1
+ dup v0.16b, v16.b[4]
+ dup v1.16b, v16.b[5]
+ dup v2.16b, v16.b[6]
+ dup v3.16b, v16.b[7]
+ st4 {v0.s,v1.s,v2.s,v3.s}[0], [x4]
+ b satd_intra_4x4_x3_opt_end
+
+satd_intra_4x4_x3_opt_jump1:
+ st1 {v16.s}[0], [x4], #4
+ st1 {v16.s}[0], [x4], #4
+ st1 {v16.s}[0], [x4], #4
+ st1 {v16.s}[0], [x4]
+satd_intra_4x4_x3_opt_end:
+ mov w0, w10
+
+WELS_ASM_ARCH64_FUNC_END
+
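+//Chroma 8x8 mode decision by SATD: derive the four 4x4 DC predictors for
+//Cb and Cr, pre-transform the V and H references, then accumulate Hadamard
+//SATD over all eight 4x4 sub-blocks. H and V costs are penalized by
+//2*lambda (w5); the best cost is returned in w0 and the mode index stored
+//through x4.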
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra8x8Combined3Satd_AArch64_neon
+ ldr x11, [sp, #0]
+
+ LOAD_CHROMA_DATA x0, v0.8b, v0.b
+
+ LOAD_CHROMA_DATA x7, v1.8b, v1.b
+
+ //Calculate the 16x16_v mode SATD and save to "v6, v7"
+ ushll v4.8h, v0.8b, #2
+ ushll v5.8h, v1.8b, #2
+ GET_16X16_V_SATD
+
+ //Calculate the 16x16_h mode SATD and save to "v16, v17"
+ ushll2 v4.8h, v0.16b, #2
+ ushll2 v5.8h, v1.16b, #2
+ GET_16X16_H_SATD
+
+ uaddlp v0.8h, v0.16b
+ uaddlp v2.4s, v0.8h
+ ins v3.d[0], v2.d[1]
+ add v3.2s, v2.2s, v3.2s
+
+ uaddlp v1.8h, v1.16b
+ uaddlp v4.4s, v1.8h
+ ins v5.d[0], v4.d[1]
+ add v5.2s, v4.2s, v5.2s
+
+ trn2 v0.4s, v2.4s, v4.4s
+ urshr v0.4s, v0.4s, #2
+ urshr v3.2s, v3.2s, #3
+ urshr v5.2s, v5.2s, #3
+
+ ushll v22.2d, v0.2s, #4 //{1cb, 1cr}
+ ushll2 v23.2d, v0.4s, #4 //{2cb, 2cr}
+ ushll v24.2d, v3.2s, #4 //{0cb, 3cb}
+ ushll v25.2d, v5.2s, #4 //{0cr, 3cr}
+
+ eor v31.16b, v31.16b, v31.16b //Save the SATD of DC_BOTH
+ eor v30.16b, v30.16b, v30.16b //Save the SATD of H
+ eor v29.16b, v29.16b, v29.16b //Save the SATD of V
+ eor v28.16b, v28.16b, v28.16b //For zero register
+
+ ins v18.d[0], v6.d[1]
+ ins v19.d[0], v7.d[1]
+ ins v26.d[0], v16.d[1]
+ ins v27.d[0], v17.d[1]
+
+ LOAD_8X4_DATA x2
+
+ HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v16.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v16.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+ LOAD_8X4_DATA x11
+
+ ins v22.d[0], v22.d[1]
+ HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v17.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v17.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+ LOAD_8X4_DATA x2
+
+ ins v24.d[0], v24.d[1]
+ HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v26.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v26.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+ LOAD_8X4_DATA x11
+
+ ins v23.d[0], v23.d[1]
+ ins v25.d[0], v25.d[1]
+ HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v27.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v27.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+ urshr v29.4s, v29.4s, #1
+ addv s29, v29.4s
+ fmov w2, s29
+ add w2, w2, w5, lsl #1
+
+ urshr v30.4s, v30.4s, #1
+ addv s30, v30.4s
+ fmov w1, s30
+ add w1, w1, w5, lsl #1
+
+ urshr v31.4s, v31.4s, #1
+ addv s31, v31.4s
+ fmov w0, s31
+
+ SELECT_BEST_COST w0
+
+ str w7, [x4]
+WELS_ASM_ARCH64_FUNC_END
+
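+//Luma 16x16 mode decision by SATD: same selection as the SAD routine above
+//but on Hadamard-transformed costs; H and DC carry the 2*lambda penalty and
+//the mode index is stored through x4.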
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra16x16Combined3Satd_AArch64_neon
+ LOAD_LUMA_DATA
+
+ uaddlv h2, v0.16b
+ uaddlv h3, v1.16b
+ add v2.8h, v2.8h, v3.8h
+ urshr v2.4h, v2.4h, #5
+ shl v2.4h, v2.4h, #4
+
+ //Calculate the 16x16_v mode SATD and save to "v6, v7"
+ ushll v4.8h, v0.8b, #2
+ ushll2 v5.8h, v0.16b, #2
+ GET_16X16_V_SATD
+
+ //Calculate the 16x16_h mode SATD and save to "v16, v17"
+ ushll v4.8h, v1.8b, #2
+ ushll2 v5.8h, v1.16b, #2
+ GET_16X16_H_SATD
+
+ eor v31.16b, v31.16b, v31.16b //Save the SATD of DC_BOTH
+ eor v30.16b, v30.16b, v30.16b //Save the SATD of H
+ eor v29.16b, v29.16b, v29.16b //Save the SATD of V
+ eor v28.16b, v28.16b, v28.16b //For zero register
+
+ ins v18.d[0], v6.d[1]
+ ins v19.d[0], v7.d[1]
+ ins v26.d[0], v16.d[1]
+ ins v27.d[0], v17.d[1]
+
+ LOAD_16X4_DATA
+
+ HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+ HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+ LOAD_16X4_DATA
+
+ HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+ HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+ LOAD_16X4_DATA
+
+ HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+ HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+ LOAD_16X4_DATA
+
+ HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+ HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+ urshr v29.4s, v29.4s, #1
+ addv s29, v29.4s
+ fmov w0, s29
+
+ urshr v30.4s, v30.4s, #1
+ addv s30, v30.4s
+ fmov w1, s30
+ add w1, w1, w5, lsl #1
+
+ urshr v31.4s, v31.4s, #1
+ addv s31, v31.4s
+ fmov w2, s31
+ add w2, w2, w5, lsl #1
+
+ SELECT_BEST_COST w0
+
+ str w7, [x4]
+
+WELS_ASM_ARCH64_FUNC_END
+
+#endif
\ No newline at end of file
--- a/codec/encoder/core/inc/sample.h
+++ b/codec/encoder/core/inc/sample.h
@@ -108,6 +108,14 @@
int32_t WelsSampleSatd16x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSatd8x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSatd8x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsIntra16x16Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
+int32_t WelsIntra16x16Combined3Sad_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
+int32_t WelsIntra8x8Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*,
+ uint8_t*);
+int32_t WelsIntra8x8Combined3Sad_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*,
+ uint8_t*);
+int32_t WelsIntra4x4Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t,
+ int32_t);
#endif
#if defined(__cplusplus)
}
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -433,6 +433,12 @@
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_AArch64_neon;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_AArch64_neon;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_AArch64_neon;
+
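+ // Hook up the AArch64 NEON combined intra-mode cost routines added above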
+ pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsIntra4x4Combined3Satd_AArch64_neon;
+ pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntra8x8Combined3Satd_AArch64_neon;
+ pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad = WelsIntra8x8Combined3Sad_AArch64_neon;
+ pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_AArch64_neon;
+ pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_AArch64_neon;
}
#endif
}
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -60,6 +60,7 @@
ifeq ($(ASM_ARCH), arm64)
ENCODER_ASM_ARM64_SRCS=\
$(ENCODER_SRCDIR)/core/arm64/intra_pred_aarch64_neon.S\
+ $(ENCODER_SRCDIR)/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S\
$(ENCODER_SRCDIR)/core/arm64/pixel_aarch64_neon.S\

ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ))