ref: e389cf4348c51b76f4883ceed171e60f53a5a6a8
parent: f0ec323e2c9497adf52a980219025aaa5ec4404f
author: zhiliang wang <[email protected]>
date: Fri Jun 27 10:11:04 EDT 2014
Rename intra_pred_sad_3_opt_neon_aarch64.S to intra_pred_sad_3_opt_aarch64_neon.S so it matches the *_aarch64_neon.S naming of the other arm64 encoder sources.
--- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
@@ -7,7 +7,7 @@
objects = {
/* Begin PBXBuildFile section */
- 4C23BC60195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S */; };
+ 4C23BC60195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S */; };
4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066618C57D0400DFA14A /* intra_pred_neon.S */; };
4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */; };
4C34067018C57D0400DFA14A /* memory_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066918C57D0400DFA14A /* memory_neon.S */; };
@@ -62,7 +62,7 @@
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
- 4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = intra_pred_sad_3_opt_neon_aarch64.S; path = arm64/intra_pred_sad_3_opt_neon_aarch64.S; sourceTree = "<group>"; };
+ 4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = intra_pred_sad_3_opt_aarch64_neon.S; path = arm64/intra_pred_sad_3_opt_aarch64_neon.S; sourceTree = "<group>"; };
4C34066618C57D0400DFA14A /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_sad_3_opt_neon.S; sourceTree = "<group>"; };
4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
@@ -184,7 +184,7 @@
4CB8F2B219235FAC005D6386 /* arm64 */ = {
isa = PBXGroup;
children = (
- 4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S */,
+ 4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S */,
4CBC1B82194ACBB400214D9E /* intra_pred_aarch64_neon.S */,
4CB8F2B319235FC5005D6386 /* pixel_aarch64_neon.S */,
);
@@ -425,7 +425,7 @@
4CE4472518BC605C0017DF25 /* svc_encode_mb.cpp in Sources */,
4CE4471A18BC605C0017DF25 /* mv_pred.cpp in Sources */,
4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */,
- 4C23BC60195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S in Sources */,
+ 4C23BC60195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S in Sources */,
4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */,
4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */,
4CBC1B83194ACBB400214D9E /* intra_pred_aarch64_neon.S in Sources */,
--- /dev/null
+++ b/codec/encoder/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S
@@ -1,0 +1,665 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+.macro LOAD_LUMA_DATA
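+    //Load the 16 top neighbors of the current block into v0 and the 16 left neighbors into v1 (x0 = dec sample pointer, x1 = dec stride)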
+ sub x7, x0, x1
+ ld1 {v0.16b}, [x7] //top
+ sub x7, x0, #1
+ ld1 {v1.b}[0], [x7], x1
+ ld1 {v1.b}[1], [x7], x1
+ ld1 {v1.b}[2], [x7], x1
+ ld1 {v1.b}[3], [x7], x1
+ ld1 {v1.b}[4], [x7], x1
+ ld1 {v1.b}[5], [x7], x1
+ ld1 {v1.b}[6], [x7], x1
+ ld1 {v1.b}[7], [x7], x1
+ ld1 {v1.b}[8], [x7], x1
+ ld1 {v1.b}[9], [x7], x1
+ ld1 {v1.b}[10], [x7], x1
+ ld1 {v1.b}[11], [x7], x1
+ ld1 {v1.b}[12], [x7], x1
+ ld1 {v1.b}[13], [x7], x1
+ ld1 {v1.b}[14], [x7], x1
+ ld1 {v1.b}[15], [x7] //left
+.endm
+
+.macro LOAD_16X4_DATA
+ //Load the p_enc data and save to "v22 ~ v25"--- 16X4 bytes
+ ld1 {v0.16b}, [x2], x3
+ ld1 {v1.16b}, [x2], x3
+ ld1 {v20.16b}, [x2], x3
+ ld1 {v21.16b}, [x2], x3
+ trn1 v22.4s, v0.4s, v1.4s
+ trn2 v23.4s, v0.4s, v1.4s
+ trn1 v24.4s, v20.4s, v21.4s
+ trn2 v25.4s, v20.4s, v21.4s
+.endm
+
+.macro GET_16X16_V_SATD
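+    //Butterfly (add/sub + transpose) stages of a 4x4 Hadamard transform on v4/v5; the results stay in v6/v7 and are reused as the V-mode reference terms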
+ trn1 v6.4s, v4.4s, v5.4s
+ trn2 v7.4s, v4.4s, v5.4s
+ add v4.8h, v6.8h, v7.8h
+ sub v5.8h, v6.8h, v7.8h
+ trn1 v6.8h, v4.8h, v5.8h
+ trn2 v7.8h, v4.8h, v5.8h
+ add v4.8h, v6.8h, v7.8h
+ sub v5.8h, v6.8h, v7.8h
+ trn1 v6.4s, v4.4s, v5.4s
+ trn2 v7.4s, v4.4s, v5.4s //{0,1,3,2, 4,5,7,6} v6 {8,9,11,10, 12,13,15,14} v7
+.endm
+
+.macro GET_16X16_H_SATD
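+    //Same butterfly stages, with the H-mode reference terms kept in v16/v17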
+ trn1 v16.4s, v4.4s, v5.4s
+ trn2 v17.4s, v4.4s, v5.4s
+ add v4.8h, v16.8h, v17.8h
+ sub v5.8h, v16.8h, v17.8h
+ trn1 v16.8h, v4.8h, v5.8h
+ trn2 v17.8h, v4.8h, v5.8h
+ add v4.8h, v16.8h, v17.8h
+ sub v5.8h, v16.8h, v17.8h
+ trn1 v16.4s, v4.4s, v5.4s
+ trn2 v17.4s, v4.4s, v5.4s //{0,1,3,2, 4,5,7,6} v16 {8,9,11,10, 12,13,15,14} v17
+.endm
+
+#ifdef __APPLE__
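+//Apple's assembler addresses macro parameters as $0..$9 while GNU as uses named \arg parameters, hence the duplicated macro definitions below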
+.macro SELECT_BEST_COST
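+    //Leave min($0, w1, w2) in $0 and set w7 to the index (0, 1 or 2) of the winning cost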
+ cmp w1, $0
+ csel $0, $0, w1, hs
+ cset w7, lo
+ cmp w2, $0
+ mov w6, #2
+ csel $0, $0, w2, hs
+ csel w7, w7, w6, hs
+.endm
+
+.macro LOAD_CHROMA_DATA
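+    //Load the 8 top neighbors of a chroma block into the low half of the vector and the 8 left neighbors into lanes 8-15 ($0 = chroma dec pointer)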
+ sub x9, $0, x1
+ ld1 {$1}, [x9] //top_cb
+ sub x9, $0, #1
+ ld1 {$2}[8], [x9], x1
+ ld1 {$2}[9], [x9], x1
+ ld1 {$2}[10], [x9], x1
+ ld1 {$2}[11], [x9], x1
+ ld1 {$2}[12], [x9], x1
+ ld1 {$2}[13], [x9], x1
+ ld1 {$2}[14], [x9], x1
+ ld1 {$2}[15], [x9], x1 //left_cb
+.endm
+
+.macro LOAD_8X4_DATA
+ //Load the p_enc data and save to "v20 ~ v21"--- 8X4 bytes
+ ld1 {v0.8b}, [$0], x3
+ ld1 {v1.8b}, [$0], x3
+ ld1 {v0.d}[1], [$0], x3
+ ld1 {v1.d}[1], [$0], x3
+ trn1 v2.4s, v0.4s, v1.4s
+ trn2 v1.4s, v0.4s, v1.4s
+ trn1 v20.2d, v2.2d, v1.2d
+ trn2 v21.2d, v2.2d, v1.2d
+.endm
+
+.macro HDM_TRANSFORM_4X4_L0
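+    //4x4 Hadamard transform of two encoded 4x4 blocks ($0, $1; the $9 suffix l/l2 selects the low or high halves), accumulating |coeff - ref| into the V/H/DC SATD sums $5/$6/$7 against the reference terms $2/$3/$4; $8 is a zero register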
+ //Do the vertical transform
+ uadd$9 v0.8h, $0, $1
+ usub$9 v1.8h, $0, $1
+ trn1 v3.2d, v0.2d, v1.2d
+ trn2 v1.2d, v0.2d, v1.2d
+ add v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
+ sub v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}
+
+ //Do the horizontal transform
+ trn1 v0.4s, v4.4s, v5.4s
+ trn2 v1.4s, v4.4s, v5.4s
+ add v4.8h, v0.8h, v1.8h
+ sub v5.8h, v0.8h, v1.8h
+ trn1 v0.8h, v4.8h, v5.8h
+ trn2 v1.8h, v4.8h, v5.8h
+ add v4.8h, v0.8h, v1.8h
+ sub v5.8h, v0.8h, v1.8h
+
+ //16x16_v
+ trn1 v0.2s, v4.2s, v5.2s
+ trn2 v1.2s, v4.2s, v5.2s
+ sabal $5, v0.4h, $2
+ sabal $5, v1.4h, $8.4h
+ sabal2 $5, v4.8h, $8.8h
+ sabal2 $5, v5.8h, $8.8h
+
+ //16x16_h
+ ins v3.d[0], v4.d[1]
+ trn1 v0.4h, v4.4h, v3.4h
+ trn2 v1.4h, v4.4h, v3.4h
+ sabal $6, v0.4h, $3
+ sabdl v4.4s, v1.4h, $8.4h
+ sabal v4.4s, v5.4h, $8.4h
+ sabal2 v4.4s, v5.8h, $8.8h
+ add $6, $6, v4.4s
+
+ //16x16_dc_both
+ sabal $7, v0.4h, $4
+ add $7, $7, v4.4s
+.endm
+#else
+.macro SELECT_BEST_COST arg0
+ cmp w1, \arg0
+ csel \arg0, \arg0, w1, hs
+ cset w7, lo
+ cmp w2, \arg0
+ mov w6, #2
+ csel \arg0, \arg0, w2, hs
+ csel w7, w7, w6, hs
+.endm
+
+.macro LOAD_CHROMA_DATA arg0, arg1, arg2
+ sub x9, \arg0, x1
+ ld1 {\arg1}, [x9] //top_cb
+    sub x9, \arg0, #1
+ ld1 {\arg2}[8], [x9], x1
+ ld1 {\arg2}[9], [x9], x1
+ ld1 {\arg2}[10], [x9], x1
+ ld1 {\arg2}[11], [x9], x1
+ ld1 {\arg2}[12], [x9], x1
+ ld1 {\arg2}[13], [x9], x1
+ ld1 {\arg2}[14], [x9], x1
+ ld1 {\arg2}[15], [x9], x1 //left_cb
+.endm
+
+.macro LOAD_8X4_DATA arg0
+ //Load the p_enc data and save to "v20 ~ v21"--- 8X4 bytes
+ ld1 {v0.8b}, [\arg0], x3
+ ld1 {v1.8b}, [\arg0], x3
+ ld1 {v0.d}[1], [\arg0], x3
+ ld1 {v1.d}[1], [\arg0], x3
+ trn1 v2.4s, v0.4s, v1.4s
+ trn2 v1.4s, v0.4s, v1.4s
+ trn1 v20.2d, v2.2d, v1.2d
+ trn2 v21.2d, v2.2d, v1.2d
+.endm
+
+.macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+ //Do the vertical transform
+ uadd\arg9\() v0.8h, \arg0, \arg1
+ usub\arg9\() v1.8h, \arg0, \arg1
+ trn1 v3.2d, v0.2d, v1.2d
+ trn2 v1.2d, v0.2d, v1.2d
+ add v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
+ sub v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}
+
+ //Do the horizontal transform
+ trn1 v0.4s, v4.4s, v5.4s
+ trn2 v1.4s, v4.4s, v5.4s
+ add v4.8h, v0.8h, v1.8h
+ sub v5.8h, v0.8h, v1.8h
+ trn1 v0.8h, v4.8h, v5.8h
+ trn2 v1.8h, v4.8h, v5.8h
+ add v4.8h, v0.8h, v1.8h
+ sub v5.8h, v0.8h, v1.8h
+
+ //16x16_v
+ trn1 v0.2s, v4.2s, v5.2s
+ trn2 v1.2s, v4.2s, v5.2s
+ sabal \arg5, v0.4h, \arg2
+ sabal \arg5, v1.4h, \arg8\().4h
+ sabal2 \arg5, v4.8h, \arg8\().8h
+ sabal2 \arg5, v5.8h, \arg8\().8h
+
+ //16x16_h
+ ins v3.d[0], v4.d[1]
+ trn1 v0.4h, v4.4h, v3.4h
+ trn2 v1.4h, v4.4h, v3.4h
+ sabal \arg6, v0.4h, \arg3
+ sabdl v4.4s, v1.4h, \arg8\().4h
+ sabal v4.4s, v5.4h, \arg8\().4h
+ sabal2 v4.4s, v5.8h, \arg8\().8h
+ add \arg6, \arg6, v4.4s
+
+ //16x16_dc_both
+ sabal \arg7, v0.4h, \arg4
+ add \arg7, \arg7, v4.4s
+.endm
+#endif
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon
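+    //Register usage: x0/x1 = Cb dec pointer/stride, x2/x3 = Cb enc pointer/stride, x4 = best-mode output, w5 = cost bias (presumably lambda), x7 = Cr dec pointer, [sp] = Cr enc pointer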
+ ldr x11, [sp, #0]
+
+ LOAD_CHROMA_DATA x0, v0.8b, v0.b
+
+ uaddlp v1.8h, v0.16b
+ uaddlp v2.4s, v1.8h
+ ins v3.d[0], v2.d[1]
+ add v3.2s, v2.2s, v3.2s
+ urshr v2.4s, v2.4s, #2
+ urshr v3.2s, v3.2s, #3
+
+ dup v20.8b, v3.b[0]
+ dup v21.8b, v2.b[4]
+ dup v22.8b, v2.b[12]
+ dup v23.8b, v3.b[4]
+ ins v20.s[1], v21.s[0]
+ ins v22.s[1], v23.s[0]
+
+ LOAD_CHROMA_DATA x7, v4.8b, v4.b
+
+ uaddlp v5.8h, v4.16b
+ uaddlp v6.4s, v5.8h
+ ins v7.d[0], v6.d[1]
+ add v7.2s, v6.2s, v7.2s
+ urshr v6.4s, v6.4s, #2
+ urshr v7.2s, v7.2s, #3
+
+ dup v24.8b, v7.b[0]
+ dup v25.8b, v6.b[4]
+ dup v26.8b, v6.b[12]
+ dup v27.8b, v7.b[4]
+ ins v24.s[1], v25.s[0]
+ ins v26.s[1], v27.s[0]
+
+ sub x9, x0, #1
+ sub x10, x7, #1
+
+ ld1 {v3.8b}, [x2], x3
+ ld1 {v5.8b}, [x11], x3
+
+ ld1r {v6.8b}, [x9], x1
+ ld1r {v7.8b}, [x10], x1
+
+ uabdl v29.8h, v0.8b, v3.8b
+ uabal v29.8h, v4.8b, v5.8b //top
+
+ uabdl v30.8h, v6.8b, v3.8b
+ uabal v30.8h, v7.8b, v5.8b //left
+
+ uabdl v31.8h, v20.8b, v3.8b
+ uabal v31.8h, v24.8b, v5.8b //Dc
+.rept 3
+ ld1 {v3.8b}, [x2], x3
+ ld1 {v5.8b}, [x11], x3
+
+ ld1r {v6.8b}, [x9], x1
+ ld1r {v7.8b}, [x10], x1
+
+ uabal v29.8h, v0.8b, v3.8b
+ uabal v29.8h, v4.8b, v5.8b //top
+
+ uabal v30.8h, v6.8b, v3.8b
+ uabal v30.8h, v7.8b, v5.8b //left
+
+ uabal v31.8h, v20.8b, v3.8b
+ uabal v31.8h, v24.8b, v5.8b //Dc
+.endr
+
+.rept 4
+ ld1 {v3.8b}, [x2], x3
+ ld1 {v5.8b}, [x11], x3
+
+ ld1r {v6.8b}, [x9], x1
+ ld1r {v7.8b}, [x10], x1
+
+ uabal v29.8h, v0.8b, v3.8b
+ uabal v29.8h, v4.8b, v5.8b //top
+
+ uabal v30.8h, v6.8b, v3.8b
+ uabal v30.8h, v7.8b, v5.8b //left
+
+ uabal v31.8h, v22.8b, v3.8b
+ uabal v31.8h, v26.8b, v5.8b //Dc
+.endr
+
+ saddlv s29, v29.8h
+ fmov w2, s29
+ add w2, w2, w5, lsl #1
+ saddlv s30, v30.8h
+ fmov w1, s30
+ add w1, w1, w5, lsl #1
+ saddlv s31, v31.8h
+ fmov w0, s31
+
+ SELECT_BEST_COST w0
+
+ str w7, [x4]
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra16x16Combined3Sad_AArch64_neon
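+    //SAD of the 16x16 luma block against the V (top), H (left) and DC predictions; the H and DC costs are biased by 2*w5, the best cost is returned in w0 and the winning mode index is stored to [x4]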
+
+ LOAD_LUMA_DATA
+
+ uaddlv h2, v0.16b
+ uaddlv h3, v1.16b
+ add v2.8h, v2.8h, v3.8h
+ uqrshrn b2, h2, #5
+ dup v2.16b, v2.b[0] //Dc
+
+ sub x7, x0, #1
+ ld1 {v3.16b}, [x2], x3
+ ld1r {v4.16b}, [x7], x1
+
+ uabdl v29.8h, v0.8b, v3.8b
+ uabal2 v29.8h, v0.16b,v3.16b //top
+
+ uabdl v30.8h, v4.8b, v3.8b
+ uabal2 v30.8h, v4.16b,v3.16b //left
+
+ uabdl v31.8h, v2.8b, v3.8b
+ uabal2 v31.8h, v2.16b,v3.16b //Dc
+ mov x6, #15
+sad_intra_16x16_x3_opt_loop0:
+ ld1 {v3.16b}, [x2], x3
+ ld1r {v4.16b}, [x7], x1
+
+ uabal v29.8h, v0.8b, v3.8b
+ uabal2 v29.8h, v0.16b,v3.16b //top
+
+ uabal v30.8h, v4.8b, v3.8b
+ uabal2 v30.8h, v4.16b,v3.16b //left
+
+ uabal v31.8h, v2.8b, v3.8b
+ uabal2 v31.8h, v2.16b,v3.16b //Dc
+ sub x6, x6, #1
+ cbnz x6, sad_intra_16x16_x3_opt_loop0
+
+ saddlv s29, v29.8h
+ fmov w0, s29
+ saddlv s30, v30.8h
+ fmov w1, s30
+ add w1, w1, w5, lsl #1
+ saddlv s31, v31.8h
+ fmov w2, s31
+ add w2, w2, w5, lsl #1
+
+ SELECT_BEST_COST w0
+
+ str w7, [x4]
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra4x4Combined3Satd_AArch64_neon
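+    //4x4 V/H/DC SATD costs, biased by w11 ([sp]), w7 and w6 respectively; the winning mode index is stored to [x5], its prediction pixels are written to [x4] and the best cost is returned in w0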
+ sub x9, x0, x1
+ ld1 {v16.s}[0], [x9] //top
+ sub x9, x0, #1
+ ld1 {v16.b}[4], [x9], x1
+ ld1 {v16.b}[5], [x9], x1
+ ld1 {v16.b}[6], [x9], x1
+ ld1 {v16.b}[7], [x9], x1
+
+
+ uaddlv h2, v16.8b
+ uqrshrn b17, h2, #3
+ urshr v2.4h, v2.4h, #3
+ shl v2.4h, v2.4h, #4
+
+ //Calculate the 4x4_v 4x4_h mode SATD and save to "v6, v7"
+ ushll v4.8h, v16.8b, #2
+ ins v5.d[0], v4.d[1]
+ trn1 v6.2s, v4.2s, v5.2s
+ trn2 v7.2s, v4.2s, v5.2s
+
+ add v4.4h, v6.4h, v7.4h
+ sub v5.4h, v6.4h, v7.4h
+ trn1 v6.4h, v4.4h, v5.4h
+ trn2 v7.4h, v4.4h, v5.4h
+ add v4.4h, v6.4h, v7.4h
+ sub v5.4h, v6.4h, v7.4h
+ trn1 v6.2s, v4.2s, v5.2s
+ trn2 v7.2s, v4.2s, v5.2s //{0,1,3,2,top} v6 {0,1,3,2,left} v7
+
+ eor v31.16b, v31.16b, v31.16b //Save the SATD of DC_BOTH
+ eor v30.16b, v30.16b, v30.16b //Save the SATD of H
+ eor v29.16b, v29.16b, v29.16b //Save the SATD of V
+ eor v28.16b, v28.16b, v28.16b //For zero register
+
+ //Load the p_enc data and save to "v22 ~ v23"--- 16X4 bytes
+ ld1 {v22.s}[0], [x2], x3
+ ld1 {v22.s}[1], [x2], x3
+ ld1 {v23.s}[0], [x2], x3
+ ld1 {v23.s}[1], [x2], x3
+
+ HDM_TRANSFORM_4X4_L0 v22.8b, v23.8b, v6.4h, v7.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+
+ ldr x11, [sp, #0]
+ urshr v29.4s, v29.4s, #1
+ addv s29, v29.4s
+ fmov w0, s29
+ add w0, w0, w11
+
+ urshr v30.4s, v30.4s, #1
+ addv s30, v30.4s
+ fmov w1, s30
+ add w1, w1, w7
+
+ urshr v31.4s, v31.4s, #1
+ addv s31, v31.4s
+ fmov w2, s31
+ add w2, w2, w6
+
+ mov w10, w0
+ SELECT_BEST_COST w10
+
+ str w7, [x5]
+
+ sub w9, w10, w2
+ cbnz w9, satd_intra_4x4_x3_opt_jump0
+ dup v0.16b, v17.b[0]
+ st1 {v0.16b}, [x4]
+ b satd_intra_4x4_x3_opt_end
+
+satd_intra_4x4_x3_opt_jump0:
+ sub w8, w10, w1
+ cbnz w8, satd_intra_4x4_x3_opt_jump1
+ dup v0.16b, v16.b[4]
+ dup v1.16b, v16.b[5]
+ dup v2.16b, v16.b[6]
+ dup v3.16b, v16.b[7]
+ st4 {v0.s,v1.s,v2.s,v3.s}[0], [x4]
+ b satd_intra_4x4_x3_opt_end
+
+satd_intra_4x4_x3_opt_jump1:
+    st1 {v16.s}[0], [x4], #4
+    st1 {v16.s}[0], [x4], #4
+    st1 {v16.s}[0], [x4], #4
+    st1 {v16.s}[0], [x4]
+satd_intra_4x4_x3_opt_end:
+ mov w0, w10
+
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra8x8Combined3Satd_AArch64_neon
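+    //Same register layout as WelsIntra8x8Combined3Sad_AArch64_neon above, but scoring the chroma V/H/DC modes by SATD through HDM_TRANSFORM_4X4_L0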
+ ldr x11, [sp, #0]
+
+ LOAD_CHROMA_DATA x0, v0.8b, v0.b
+
+ LOAD_CHROMA_DATA x7, v1.8b, v1.b
+
+    //Calculate the V mode SATD terms (GET_16X16_V_SATD reused for the chroma 8x8 block) and save to "v6, v7"
+ ushll v4.8h, v0.8b, #2
+ ushll v5.8h, v1.8b, #2
+ GET_16X16_V_SATD
+
+    //Calculate the H mode SATD terms (GET_16X16_H_SATD reused for the chroma 8x8 block) and save to "v16, v17"
+ ushll2 v4.8h, v0.16b, #2
+ ushll2 v5.8h, v1.16b, #2
+ GET_16X16_H_SATD
+
+ uaddlp v0.8h, v0.16b
+ uaddlp v2.4s, v0.8h
+ ins v3.d[0], v2.d[1]
+ add v3.2s, v2.2s, v3.2s
+
+ uaddlp v1.8h, v1.16b
+ uaddlp v4.4s, v1.8h
+ ins v5.d[0], v4.d[1]
+ add v5.2s, v4.2s, v5.2s
+
+ trn2 v0.4s, v2.4s, v4.4s
+ urshr v0.4s, v0.4s, #2
+ urshr v3.2s, v3.2s, #3
+ urshr v5.2s, v5.2s, #3
+
+ ushll v22.2d, v0.2s, #4 //{1cb, 1cr}
+ ushll2 v23.2d, v0.4s, #4 //{2cb, 2cr}
+ ushll v24.2d, v3.2s, #4 //{0cb, 3cb}
+ ushll v25.2d, v5.2s, #4 //{0cr, 3cr}
+
+ eor v31.16b, v31.16b, v31.16b //Save the SATD of DC_BOTH
+ eor v30.16b, v30.16b, v30.16b //Save the SATD of H
+ eor v29.16b, v29.16b, v29.16b //Save the SATD of V
+ eor v28.16b, v28.16b, v28.16b //For zero register
+
+ ins v18.d[0], v6.d[1]
+ ins v19.d[0], v7.d[1]
+ ins v26.d[0], v16.d[1]
+ ins v27.d[0], v17.d[1]
+
+ LOAD_8X4_DATA x2
+
+ HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v16.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v16.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+ LOAD_8X4_DATA x11
+
+ ins v22.d[0], v22.d[1]
+ HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v17.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v17.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+ LOAD_8X4_DATA x2
+
+ ins v24.d[0], v24.d[1]
+ HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v26.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v26.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+ LOAD_8X4_DATA x11
+
+ ins v23.d[0], v23.d[1]
+ ins v25.d[0], v25.d[1]
+ HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v27.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v27.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+ urshr v29.4s, v29.4s, #1
+ addv s29, v29.4s
+ fmov w2, s29
+ add w2, w2, w5, lsl #1
+
+ urshr v30.4s, v30.4s, #1
+ addv s30, v30.4s
+ fmov w1, s30
+ add w1, w1, w5, lsl #1
+
+ urshr v31.4s, v31.4s, #1
+ addv s31, v31.4s
+ fmov w0, s31
+
+ SELECT_BEST_COST w0
+
+ str w7, [x4]
+WELS_ASM_ARCH64_FUNC_END
+
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra16x16Combined3Satd_AArch64_neon
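+    //SATD counterpart of WelsIntra16x16Combined3Sad_AArch64_neon with the same arguments; each 16x4 strip is scored by four HDM_TRANSFORM_4X4_L0 calls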
+ LOAD_LUMA_DATA
+
+ uaddlv h2, v0.16b
+ uaddlv h3, v1.16b
+ add v2.8h, v2.8h, v3.8h
+ urshr v2.4h, v2.4h, #5
+ shl v2.4h, v2.4h, #4
+
+ //Calculate the 16x16_v mode SATD and save to "v6, v7"
+ ushll v4.8h, v0.8b, #2
+ ushll2 v5.8h, v0.16b, #2
+ GET_16X16_V_SATD
+
+ //Calculate the 16x16_h mode SATD and save to "v16, v17"
+ ushll v4.8h, v1.8b, #2
+ ushll2 v5.8h, v1.16b, #2
+ GET_16X16_H_SATD
+
+ eor v31.16b, v31.16b, v31.16b //Save the SATD of DC_BOTH
+ eor v30.16b, v30.16b, v30.16b //Save the SATD of H
+ eor v29.16b, v29.16b, v29.16b //Save the SATD of V
+ eor v28.16b, v28.16b, v28.16b //For zero register
+
+ ins v18.d[0], v6.d[1]
+ ins v19.d[0], v7.d[1]
+ ins v26.d[0], v16.d[1]
+ ins v27.d[0], v17.d[1]
+
+ LOAD_16X4_DATA
+
+ HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+ HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+ LOAD_16X4_DATA
+
+ HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+ HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+ LOAD_16X4_DATA
+
+ HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+ HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+ LOAD_16X4_DATA
+
+ HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+ HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+ HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+ urshr v29.4s, v29.4s, #1
+ addv s29, v29.4s
+ fmov w0, s29
+
+ urshr v30.4s, v30.4s, #1
+ addv s30, v30.4s
+ fmov w1, s30
+ add w1, w1, w5, lsl #1
+
+ urshr v31.4s, v31.4s, #1
+ addv s31, v31.4s
+ fmov w2, s31
+ add w2, w2, w5, lsl #1
+
+ SELECT_BEST_COST w0
+
+ str w7, [x4]
+
+WELS_ASM_ARCH64_FUNC_END
+
+#endif
\ No newline at end of file
--- a/codec/encoder/core/arm64/intra_pred_sad_3_opt_neon_aarch64.S
+++ /dev/null
@@ -1,665 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef HAVE_NEON_AARCH64
-.text
-#include "arm_arch64_common_macro.S"
-
-.macro LOAD_LUMA_DATA
- sub x7, x0, x1
- ld1 {v0.16b}, [x7] //top
- sub x7, x0, #1
- ld1 {v1.b}[0], [x7], x1
- ld1 {v1.b}[1], [x7], x1
- ld1 {v1.b}[2], [x7], x1
- ld1 {v1.b}[3], [x7], x1
- ld1 {v1.b}[4], [x7], x1
- ld1 {v1.b}[5], [x7], x1
- ld1 {v1.b}[6], [x7], x1
- ld1 {v1.b}[7], [x7], x1
- ld1 {v1.b}[8], [x7], x1
- ld1 {v1.b}[9], [x7], x1
- ld1 {v1.b}[10], [x7], x1
- ld1 {v1.b}[11], [x7], x1
- ld1 {v1.b}[12], [x7], x1
- ld1 {v1.b}[13], [x7], x1
- ld1 {v1.b}[14], [x7], x1
- ld1 {v1.b}[15], [x7] //left
-.endm
-
-.macro LOAD_16X4_DATA
- //Load the p_enc data and save to "v22 ~ v25"--- 16X4 bytes
- ld1 {v0.16b}, [x2], x3
- ld1 {v1.16b}, [x2], x3
- ld1 {v20.16b}, [x2], x3
- ld1 {v21.16b}, [x2], x3
- trn1 v22.4s, v0.4s, v1.4s
- trn2 v23.4s, v0.4s, v1.4s
- trn1 v24.4s, v20.4s, v21.4s
- trn2 v25.4s, v20.4s, v21.4s
-.endm
-
-.macro GET_16X16_V_SATD
- trn1 v6.4s, v4.4s, v5.4s
- trn2 v7.4s, v4.4s, v5.4s
- add v4.8h, v6.8h, v7.8h
- sub v5.8h, v6.8h, v7.8h
- trn1 v6.8h, v4.8h, v5.8h
- trn2 v7.8h, v4.8h, v5.8h
- add v4.8h, v6.8h, v7.8h
- sub v5.8h, v6.8h, v7.8h
- trn1 v6.4s, v4.4s, v5.4s
- trn2 v7.4s, v4.4s, v5.4s //{0,1,3,2, 4,5,7,6} v6 {8,9,11,10, 12,13,15,14} v7
-.endm
-
-.macro GET_16X16_H_SATD
- trn1 v16.4s, v4.4s, v5.4s
- trn2 v17.4s, v4.4s, v5.4s
- add v4.8h, v16.8h, v17.8h
- sub v5.8h, v16.8h, v17.8h
- trn1 v16.8h, v4.8h, v5.8h
- trn2 v17.8h, v4.8h, v5.8h
- add v4.8h, v16.8h, v17.8h
- sub v5.8h, v16.8h, v17.8h
- trn1 v16.4s, v4.4s, v5.4s
- trn2 v17.4s, v4.4s, v5.4s //{0,1,3,2, 4,5,7,6} v16 {8,9,11,10, 12,13,15,14} v17
-.endm
-
-#ifdef __APPLE__
-.macro SELECT_BEST_COST
- cmp w1, $0
- csel $0, $0, w1, hs
- cset w7, lo
- cmp w2, $0
- mov w6, #2
- csel $0, $0, w2, hs
- csel w7, w7, w6, hs
-.endm
-
-.macro LOAD_CHROMA_DATA
- sub x9, $0, x1
- ld1 {$1}, [x9] //top_cb
- sub x9, $0, #1
- ld1 {$2}[8], [x9], x1
- ld1 {$2}[9], [x9], x1
- ld1 {$2}[10], [x9], x1
- ld1 {$2}[11], [x9], x1
- ld1 {$2}[12], [x9], x1
- ld1 {$2}[13], [x9], x1
- ld1 {$2}[14], [x9], x1
- ld1 {$2}[15], [x9], x1 //left_cb
-.endm
-
-.macro LOAD_8X4_DATA
- //Load the p_enc data and save to "v20 ~ v21"--- 8X4 bytes
- ld1 {v0.8b}, [$0], x3
- ld1 {v1.8b}, [$0], x3
- ld1 {v0.d}[1], [$0], x3
- ld1 {v1.d}[1], [$0], x3
- trn1 v2.4s, v0.4s, v1.4s
- trn2 v1.4s, v0.4s, v1.4s
- trn1 v20.2d, v2.2d, v1.2d
- trn2 v21.2d, v2.2d, v1.2d
-.endm
-
-.macro HDM_TRANSFORM_4X4_L0
- //Do the vertical transform
- uadd$9 v0.8h, $0, $1
- usub$9 v1.8h, $0, $1
- trn1 v3.2d, v0.2d, v1.2d
- trn2 v1.2d, v0.2d, v1.2d
- add v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
- sub v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}
-
- //Do the horizontal transform
- trn1 v0.4s, v4.4s, v5.4s
- trn2 v1.4s, v4.4s, v5.4s
- add v4.8h, v0.8h, v1.8h
- sub v5.8h, v0.8h, v1.8h
- trn1 v0.8h, v4.8h, v5.8h
- trn2 v1.8h, v4.8h, v5.8h
- add v4.8h, v0.8h, v1.8h
- sub v5.8h, v0.8h, v1.8h
-
- //16x16_v
- trn1 v0.2s, v4.2s, v5.2s
- trn2 v1.2s, v4.2s, v5.2s
- sabal $5, v0.4h, $2
- sabal $5, v1.4h, $8.4h
- sabal2 $5, v4.8h, $8.8h
- sabal2 $5, v5.8h, $8.8h
-
- //16x16_h
- ins v3.d[0], v4.d[1]
- trn1 v0.4h, v4.4h, v3.4h
- trn2 v1.4h, v4.4h, v3.4h
- sabal $6, v0.4h, $3
- sabdl v4.4s, v1.4h, $8.4h
- sabal v4.4s, v5.4h, $8.4h
- sabal2 v4.4s, v5.8h, $8.8h
- add $6, $6, v4.4s
-
- //16x16_dc_both
- sabal $7, v0.4h, $4
- add $7, $7, v4.4s
-.endm
-#else
-.macro SELECT_BEST_COST arg0
- cmp w1, \arg0
- csel \arg0, \arg0, w1, hs
- cset w7, lo
- cmp w2, \arg0
- mov w6, #2
- csel \arg0, \arg0, w2, hs
- csel w7, w7, w6, hs
-.endm
-
-.macro LOAD_CHROMA_DATA arg0, arg1, arg2
- sub x9, \arg0, x1
- ld1 {\arg1}, [x9] //top_cb
- sub x9, $0, #1
- ld1 {\arg2}[8], [x9], x1
- ld1 {\arg2}[9], [x9], x1
- ld1 {\arg2}[10], [x9], x1
- ld1 {\arg2}[11], [x9], x1
- ld1 {\arg2}[12], [x9], x1
- ld1 {\arg2}[13], [x9], x1
- ld1 {\arg2}[14], [x9], x1
- ld1 {\arg2}[15], [x9], x1 //left_cb
-.endm
-
-.macro LOAD_8X4_DATA arg0
- //Load the p_enc data and save to "v20 ~ v21"--- 8X4 bytes
- ld1 {v0.8b}, [\arg0], x3
- ld1 {v1.8b}, [\arg0], x3
- ld1 {v0.d}[1], [\arg0], x3
- ld1 {v1.d}[1], [\arg0], x3
- trn1 v2.4s, v0.4s, v1.4s
- trn2 v1.4s, v0.4s, v1.4s
- trn1 v20.2d, v2.2d, v1.2d
- trn2 v21.2d, v2.2d, v1.2d
-.endm
-
-.macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
- //Do the vertical transform
- uadd\arg9\() v0.8h, \arg0, \arg1
- usub\arg9\() v1.8h, \arg0, \arg1
- trn1 v3.2d, v0.2d, v1.2d
- trn2 v1.2d, v0.2d, v1.2d
- add v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
- sub v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}
-
- //Do the horizontal transform
- trn1 v0.4s, v4.4s, v5.4s
- trn2 v1.4s, v4.4s, v5.4s
- add v4.8h, v0.8h, v1.8h
- sub v5.8h, v0.8h, v1.8h
- trn1 v0.8h, v4.8h, v5.8h
- trn2 v1.8h, v4.8h, v5.8h
- add v4.8h, v0.8h, v1.8h
- sub v5.8h, v0.8h, v1.8h
-
- //16x16_v
- trn1 v0.2s, v4.2s, v5.2s
- trn2 v1.2s, v4.2s, v5.2s
- sabal \arg5, v0.4h, \arg2
- sabal \arg5, v1.4h, \arg8\().4h
- sabal2 \arg5, v4.8h, \arg8\().8h
- sabal2 \arg5, v5.8h, \arg8\().8h
-
- //16x16_h
- ins v3.d[0], v4.d[1]
- trn1 v0.4h, v4.4h, v3.4h
- trn2 v1.4h, v4.4h, v3.4h
- sabal \arg6, v0.4h, \arg3
- sabdl v4.4s, v1.4h, \arg8\().4h
- sabal v4.4s, v5.4h, \arg8\().4h
- sabal2 v4.4s, v5.8h, \arg8\().8h
- add \arg6, \arg6, v4.4s
-
- //16x16_dc_both
- sabal \arg7, v0.4h, \arg4
- add \arg7, \arg7, v4.4s
-.endm
-#endif
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon
- ldr x11, [sp, #0]
-
- LOAD_CHROMA_DATA x0, v0.8b, v0.b
-
- uaddlp v1.8h, v0.16b
- uaddlp v2.4s, v1.8h
- ins v3.d[0], v2.d[1]
- add v3.2s, v2.2s, v3.2s
- urshr v2.4s, v2.4s, #2
- urshr v3.2s, v3.2s, #3
-
- dup v20.8b, v3.b[0]
- dup v21.8b, v2.b[4]
- dup v22.8b, v2.b[12]
- dup v23.8b, v3.b[4]
- ins v20.s[1], v21.s[0]
- ins v22.s[1], v23.s[0]
-
- LOAD_CHROMA_DATA x7, v4.8b, v4.b
-
- uaddlp v5.8h, v4.16b
- uaddlp v6.4s, v5.8h
- ins v7.d[0], v6.d[1]
- add v7.2s, v6.2s, v7.2s
- urshr v6.4s, v6.4s, #2
- urshr v7.2s, v7.2s, #3
-
- dup v24.8b, v7.b[0]
- dup v25.8b, v6.b[4]
- dup v26.8b, v6.b[12]
- dup v27.8b, v7.b[4]
- ins v24.s[1], v25.s[0]
- ins v26.s[1], v27.s[0]
-
- sub x9, x0, #1
- sub x10, x7, #1
-
- ld1 {v3.8b}, [x2], x3
- ld1 {v5.8b}, [x11], x3
-
- ld1r {v6.8b}, [x9], x1
- ld1r {v7.8b}, [x10], x1
-
- uabdl v29.8h, v0.8b, v3.8b
- uabal v29.8h, v4.8b, v5.8b //top
-
- uabdl v30.8h, v6.8b, v3.8b
- uabal v30.8h, v7.8b, v5.8b //left
-
- uabdl v31.8h, v20.8b, v3.8b
- uabal v31.8h, v24.8b, v5.8b //Dc
-.rept 3
- ld1 {v3.8b}, [x2], x3
- ld1 {v5.8b}, [x11], x3
-
- ld1r {v6.8b}, [x9], x1
- ld1r {v7.8b}, [x10], x1
-
- uabal v29.8h, v0.8b, v3.8b
- uabal v29.8h, v4.8b, v5.8b //top
-
- uabal v30.8h, v6.8b, v3.8b
- uabal v30.8h, v7.8b, v5.8b //left
-
- uabal v31.8h, v20.8b, v3.8b
- uabal v31.8h, v24.8b, v5.8b //Dc
-.endr
-
-.rept 4
- ld1 {v3.8b}, [x2], x3
- ld1 {v5.8b}, [x11], x3
-
- ld1r {v6.8b}, [x9], x1
- ld1r {v7.8b}, [x10], x1
-
- uabal v29.8h, v0.8b, v3.8b
- uabal v29.8h, v4.8b, v5.8b //top
-
- uabal v30.8h, v6.8b, v3.8b
- uabal v30.8h, v7.8b, v5.8b //left
-
- uabal v31.8h, v22.8b, v3.8b
- uabal v31.8h, v26.8b, v5.8b //Dc
-.endr
-
- saddlv s29, v29.8h
- fmov w2, s29
- add w2, w2, w5, lsl #1
- saddlv s30, v30.8h
- fmov w1, s30
- add w1, w1, w5, lsl #1
- saddlv s31, v31.8h
- fmov w0, s31
-
- SELECT_BEST_COST w0
-
- str w7, [x4]
-WELS_ASM_ARCH64_FUNC_END
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra16x16Combined3Sad_AArch64_neon
-
- LOAD_LUMA_DATA
-
- uaddlv h2, v0.16b
- uaddlv h3, v1.16b
- add v2.8h, v2.8h, v3.8h
- uqrshrn b2, h2, #5
- dup v2.16b, v2.b[0] //Dc
-
- sub x7, x0, #1
- ld1 {v3.16b}, [x2], x3
- ld1r {v4.16b}, [x7], x1
-
- uabdl v29.8h, v0.8b, v3.8b
- uabal2 v29.8h, v0.16b,v3.16b //top
-
- uabdl v30.8h, v4.8b, v3.8b
- uabal2 v30.8h, v4.16b,v3.16b //left
-
- uabdl v31.8h, v2.8b, v3.8b
- uabal2 v31.8h, v2.16b,v3.16b //Dc
- mov x6, #15
-sad_intra_16x16_x3_opt_loop0:
- ld1 {v3.16b}, [x2], x3
- ld1r {v4.16b}, [x7], x1
-
- uabal v29.8h, v0.8b, v3.8b
- uabal2 v29.8h, v0.16b,v3.16b //top
-
- uabal v30.8h, v4.8b, v3.8b
- uabal2 v30.8h, v4.16b,v3.16b //left
-
- uabal v31.8h, v2.8b, v3.8b
- uabal2 v31.8h, v2.16b,v3.16b //Dc
- sub x6, x6, #1
- cbnz x6, sad_intra_16x16_x3_opt_loop0
-
- saddlv s29, v29.8h
- fmov w0, s29
- saddlv s30, v30.8h
- fmov w1, s30
- add w1, w1, w5, lsl #1
- saddlv s31, v31.8h
- fmov w2, s31
- add w2, w2, w5, lsl #1
-
- SELECT_BEST_COST w0
-
- str w7, [x4]
-WELS_ASM_ARCH64_FUNC_END
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra4x4Combined3Satd_AArch64_neon
- sub x9, x0, x1
- ld1 {v16.s}[0], [x9] //top
- sub x9, x0, #1
- ld1 {v16.b}[4], [x9], x1
- ld1 {v16.b}[5], [x9], x1
- ld1 {v16.b}[6], [x9], x1
- ld1 {v16.b}[7], [x9], x1
-
-
- uaddlv h2, v16.8b
- uqrshrn b17, h2, #3
- urshr v2.4h, v2.4h, #3
- shl v2.4h, v2.4h, #4
-
- //Calculate the 4x4_v 4x4_h mode SATD and save to "v6, v7"
- ushll v4.8h, v16.8b, #2
- ins v5.d[0], v4.d[1]
- trn1 v6.2s, v4.2s, v5.2s
- trn2 v7.2s, v4.2s, v5.2s
-
- add v4.4h, v6.4h, v7.4h
- sub v5.4h, v6.4h, v7.4h
- trn1 v6.4h, v4.4h, v5.4h
- trn2 v7.4h, v4.4h, v5.4h
- add v4.4h, v6.4h, v7.4h
- sub v5.4h, v6.4h, v7.4h
- trn1 v6.2s, v4.2s, v5.2s
- trn2 v7.2s, v4.2s, v5.2s //{0,1,3,2,top} v6 {0,1,3,2,left} v7
-
- eor v31.16b, v31.16b, v31.16b //Save the SATD of DC_BOTH
- eor v30.16b, v30.16b, v30.16b //Save the SATD of H
- eor v29.16b, v29.16b, v29.16b //Save the SATD of V
- eor v28.16b, v28.16b, v28.16b //For zero register
-
- //Load the p_enc data and save to "v22 ~ v23"--- 16X4 bytes
- ld1 {v22.s}[0], [x2], x3
- ld1 {v22.s}[1], [x2], x3
- ld1 {v23.s}[0], [x2], x3
- ld1 {v23.s}[1], [x2], x3
-
- HDM_TRANSFORM_4X4_L0 v22.8b, v23.8b, v6.4h, v7.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
-
- ldr x11, [sp, #0]
- urshr v29.4s, v29.4s, #1
- addv s29, v29.4s
- fmov w0, s29
- add w0, w0, w11
-
- urshr v30.4s, v30.4s, #1
- addv s30, v30.4s
- fmov w1, s30
- add w1, w1, w7
-
- urshr v31.4s, v31.4s, #1
- addv s31, v31.4s
- fmov w2, s31
- add w2, w2, w6
-
- mov w10, w0
- SELECT_BEST_COST w10
-
- str w7, [x5]
-
- sub w9, w10, w2
- cbnz w9, satd_intra_4x4_x3_opt_jump0
- dup v0.16b, v17.b[0]
- st1 {v0.16b}, [x4]
- b satd_intra_4x4_x3_opt_end
-
-satd_intra_4x4_x3_opt_jump0:
- sub w8, w10, w1
- cbnz w8, satd_intra_4x4_x3_opt_jump1
- dup v0.16b, v16.b[4]
- dup v1.16b, v16.b[5]
- dup v2.16b, v16.b[6]
- dup v3.16b, v16.b[7]
- st4 {v0.s,v1.s,v2.s,v3.s}[0], [x4]
- b satd_intra_4x4_x3_opt_end
-
-satd_intra_4x4_x3_opt_jump1:
- st1 {v16.S}[0], [x4], #4
- st1 {v16.S}[0], [x4], #4
- st1 {v16.S}[0], [x4], #4
- st1 {v16.S}[0], [x4]
-satd_intra_4x4_x3_opt_end:
- mov w0, w10
-
-WELS_ASM_ARCH64_FUNC_END
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra8x8Combined3Satd_AArch64_neon
- ldr x11, [sp, #0]
-
- LOAD_CHROMA_DATA x0, v0.8b, v0.b
-
- LOAD_CHROMA_DATA x7, v1.8b, v1.b
-
- //Calculate the 16x16_v mode SATD and save to "v6, v7"
- ushll v4.8h, v0.8b, #2
- ushll v5.8h, v1.8b, #2
- GET_16X16_V_SATD
-
- //Calculate the 16x16_h mode SATD and save to "v16, v17"
- ushll2 v4.8h, v0.16b, #2
- ushll2 v5.8h, v1.16b, #2
- GET_16X16_H_SATD
-
- uaddlp v0.8h, v0.16b
- uaddlp v2.4s, v0.8h
- ins v3.d[0], v2.d[1]
- add v3.2s, v2.2s, v3.2s
-
- uaddlp v1.8h, v1.16b
- uaddlp v4.4s, v1.8h
- ins v5.d[0], v4.d[1]
- add v5.2s, v4.2s, v5.2s
-
- trn2 v0.4s, v2.4s, v4.4s
- urshr v0.4s, v0.4s, #2
- urshr v3.2s, v3.2s, #3
- urshr v5.2s, v5.2s, #3
-
- ushll v22.2d, v0.2s, #4 //{1cb, 1cr}
- ushll2 v23.2d, v0.4s, #4 //{2cb, 2cr}
- ushll v24.2d, v3.2s, #4 //{0cb, 3cb}
- ushll v25.2d, v5.2s, #4 //{0cr, 3cr}
-
- eor v31.16b, v31.16b, v31.16b //Save the SATD of DC_BOTH
- eor v30.16b, v30.16b, v30.16b //Save the SATD of H
- eor v29.16b, v29.16b, v29.16b //Save the SATD of V
- eor v28.16b, v28.16b, v28.16b //For zero register
-
- ins v18.d[0], v6.d[1]
- ins v19.d[0], v7.d[1]
- ins v26.d[0], v16.d[1]
- ins v27.d[0], v17.d[1]
-
- LOAD_8X4_DATA x2
-
- HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v16.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l
- HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v16.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2
-
- LOAD_8X4_DATA x11
-
- ins v22.d[0], v22.d[1]
- HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v17.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l
- HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v17.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2
-
- LOAD_8X4_DATA x2
-
- ins v24.d[0], v24.d[1]
- HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v26.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l
- HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v26.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l2
-
- LOAD_8X4_DATA x11
-
- ins v23.d[0], v23.d[1]
- ins v25.d[0], v25.d[1]
- HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v27.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l
- HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v27.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l2
-
- urshr v29.4s, v29.4s, #1
- addv s29, v29.4s
- fmov w2, s29
- add w2, w2, w5, lsl #1
-
- urshr v30.4s, v30.4s, #1
- addv s30, v30.4s
- fmov w1, s30
- add w1, w1, w5, lsl #1
-
- urshr v31.4s, v31.4s, #1
- addv s31, v31.4s
- fmov w0, s31
-
- SELECT_BEST_COST w0
-
- str w7, [x4]
-WELS_ASM_ARCH64_FUNC_END
-
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra16x16Combined3Satd_AArch64_neon
- LOAD_LUMA_DATA
-
- uaddlv h2, v0.16b
- uaddlv h3, v1.16b
- add v2.8h, v2.8h, v3.8h
- urshr v2.4h, v2.4h, #5
- shl v2.4h, v2.4h, #4
-
- //Calculate the 16x16_v mode SATD and save to "v6, v7"
- ushll v4.8h, v0.8b, #2
- ushll2 v5.8h, v0.16b, #2
- GET_16X16_V_SATD
-
- //Calculate the 16x16_h mode SATD and save to "v16, v17"
- ushll v4.8h, v1.8b, #2
- ushll2 v5.8h, v1.16b, #2
- GET_16X16_H_SATD
-
- eor v31.16b, v31.16b, v31.16b //Save the SATD of DC_BOTH
- eor v30.16b, v30.16b, v30.16b //Save the SATD of H
- eor v29.16b, v29.16b, v29.16b //Save the SATD of V
- eor v28.16b, v28.16b, v28.16b //For zero register
-
- ins v18.d[0], v6.d[1]
- ins v19.d[0], v7.d[1]
- ins v26.d[0], v16.d[1]
- ins v27.d[0], v17.d[1]
-
- LOAD_16X4_DATA
-
- HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
- HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
- HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
- HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
-
- LOAD_16X4_DATA
-
- HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
- HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
- HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
- HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
-
- LOAD_16X4_DATA
-
- HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
- HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
- HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
- HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
-
- LOAD_16X4_DATA
-
- HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
- HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
- HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
- HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
-
- urshr v29.4s, v29.4s, #1
- addv s29, v29.4s
- fmov w0, s29
-
- urshr v30.4s, v30.4s, #1
- addv s30, v30.4s
- fmov w1, s30
- add w1, w1, w5, lsl #1
-
- urshr v31.4s, v31.4s, #1
- addv s31, v31.4s
- fmov w2, s31
- add w2, w2, w5, lsl #1
-
- SELECT_BEST_COST w0
-
- str w7, [x4]
-
-WELS_ASM_ARCH64_FUNC_END
-
-#endif
\ No newline at end of file
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -60,7 +60,7 @@
ifeq ($(ASM_ARCH), arm64)
ENCODER_ASM_ARM64_SRCS=\
$(ENCODER_SRCDIR)/core/arm64/intra_pred_aarch64_neon.S\
- $(ENCODER_SRCDIR)/core/arm64/intra_pred_sad_3_opt_neon_aarch64.S\
+ $(ENCODER_SRCDIR)/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S\
$(ENCODER_SRCDIR)/core/arm64/pixel_aarch64_neon.S\
ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ))