shithub: openh264

ref: d8202cf38f2145a39e22021f70e033cd5dc6401d
parent: cdce1b73ca1a39289e75f19e3ab97539e90e2436
author: Martin Storsjö <[email protected]>
date: Fri Mar 27 06:54:14 EDT 2015

Remove Apple-specific versions of arm64 macros with arguments

The Apple assembler for arm64 can handle the GNU binutils style
macros just fine, so there is no need to duplicate all of these
macros in two syntaxes when one syntax works in all cases.

We already require a new enough assembler to support the GNU binutils
style features, since we use the .rept directive in a few places.
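To illustrate the difference (a minimal sketch, not taken from the patch):
the removed Apple-only variants referred to macro parameters positionally as
$0, $1, ..., while the GNU binutils style kept below names the parameters and
expands them with \name\() so that a register arrangement suffix such as .16b
can follow directly. The COPY_ROW macro here is hypothetical and exists only
for illustration.

    // GNU binutils style, the form kept by this patch (illustrative only)
    .macro COPY_ROW dst, src, stride
        ld1     {\dst\().16b}, [\src], \stride
    .endm

    // Equivalent Apple-only form removed by this patch:
    // .macro COPY_ROW
    //     ld1     {$0.16b}, [$1], $2
    // .endm

    // The .rept directive mentioned above already requires a binutils-style
    // assembler; it simply repeats a block a fixed number of times:
    .rept 4
        ld1     {v0.16b}, [x0], x1
    .endr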

--- a/codec/common/arm64/copy_mb_aarch64_neon.S
+++ b/codec/common/arm64/copy_mb_aarch64_neon.S
@@ -33,80 +33,6 @@
 #ifdef  HAVE_NEON_AARCH64
 #include "arm_arch64_common_macro.S"
 
-#ifdef __APPLE__
-.macro LOAD_ALIGNED_DATA_WITH_STRIDE
-//  {   //  input: $0~$3, src*, src_stride
-    ld1 {$0.d}[0], [$4], $5
-    ld1 {$1.d}[0], [$4], $5
-    ld1 {$2.d}[0], [$4], $5
-    ld1 {$3.d}[0], [$4], $5
-//  }
-.endm
-
-.macro STORE_ALIGNED_DATA_WITH_STRIDE
-//  {   //  input: $0~$3, dst*, dst_stride
-    st1 {$0.d}[0], [$4], $5
-    st1 {$1.d}[0], [$4], $5
-    st1 {$2.d}[0], [$4], $5
-    st1 {$3.d}[0], [$4], $5
-//  }
-.endm
-
-.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
-//  {   //  input: $0~$3, src*, src_stride
-    ld1 {$0.8b}, [$4], $5
-    ld1 {$1.8b}, [$4], $5
-    ld1 {$2.8b}, [$4], $5
-    ld1 {$3.8b}, [$4], $5
-//  }
-.endm
-
-.macro STORE_UNALIGNED_DATA_WITH_STRIDE
-//  {   //  input: $0~$3, dst*, dst_stride
-    st1 {$0.8b}, [$4], $5
-    st1 {$1.8b}, [$4], $5
-    st1 {$2.8b}, [$4], $5
-    st1 {$3.8b}, [$4], $5
-//  }
-.endm
-
-.macro LOAD16_ALIGNED_DATA_WITH_STRIDE
-//  {   //  input: $0~$3, src*, src_stride
-    ld1 {$0.2d}, [$4], $5
-    ld1 {$1.2d}, [$4], $5
-    ld1 {$2.2d}, [$4], $5
-    ld1 {$3.2d}, [$4], $5
-//  }
-.endm
-
-.macro STORE16_ALIGNED_DATA_WITH_STRIDE
-//  {   //  input: $0~$3, dst*, dst_stride
-    st1 {$0.2d}, [$4], $5
-    st1 {$1.2d}, [$4], $5
-    st1 {$2.2d}, [$4], $5
-    st1 {$3.2d}, [$4], $5
-//  }
-.endm
-
-.macro LOAD16_UNALIGNED_DATA_WITH_STRIDE
-//  {   //  input: $0~$3, src*, src_stride
-    ld1 {$0.16b}, [$4], $5
-    ld1 {$1.16b}, [$4], $5
-    ld1 {$2.16b}, [$4], $5
-    ld1 {$3.16b}, [$4], $5
-//  }
-.endm
-
-.macro STORE16_UNALIGNED_DATA_WITH_STRIDE
-//  {   //  input: $0~$3, dst*, dst_stride
-    st1 {$0.16b}, [$4], $5
-    st1 {$1.16b}, [$4], $5
-    st1 {$2.16b}, [$4], $5
-    st1 {$3.16b}, [$4], $5
-//  }
-.endm
-
-#else
 .macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
 //  {   //  input: $0~$3, src*, src_stride
     ld1 {\arg0\().d}[0], [\arg4], \arg5
@@ -178,8 +104,6 @@
     st1 {\arg3\().16b}, [\arg4], \arg5
 //  }
 .endm
-
-#endif
 
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy8x8_AArch64_neon
--- a/codec/common/arm64/deblocking_aarch64_neon.S
+++ b/codec/common/arm64/deblocking_aarch64_neon.S
@@ -33,266 +33,7 @@
 #ifdef HAVE_NEON_AARCH64
 
 #include "arm_arch64_common_macro.S"
-#ifdef __APPLE__
 
-.macro MASK_MATRIX
-    uabd    $6.16b, $1.16b, $2.16b
-    cmhi    $6.16b, $4.16b, $6.16b
-
-    uabd    $4.16b, $0.16b, $1.16b
-    cmhi    $4.16b, $5.16b, $4.16b
-    and     $6.16b, $6.16b, $4.16b
-
-    uabd    $4.16b, $3.16b, $2.16b
-    cmhi    $4.16b, $5.16b, $4.16b
-    and     $6.16b, $6.16b, $4.16b
-.endm
-
-.macro DIFF_LUMA_LT4_P1_Q1 //(Use Tmp v23, v24)
-    //v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
-    urhadd    $8.16b, $2.16b, $3.16b
-    uhadd   $8.16b, $0.16b, $8.16b
-    usubl   $9.8h, $8.8b, $1.8b
-    sqxtn   $9.8b, $9.8h
-    usubl2  $8.8h, $8.16b, $1.16b
-    sqxtn2  $9.16b, $8.8h
-    smax    $8.16b, $9.16b, $5.16b
-//
-    smin  $8.16b, $8.16b, $6.16b
-    uabd  $9.16b, $0.16b, $2.16b
-    cmhi  $9.16b, $4.16b, $9.16b
-    and     $8.16b, $8.16b, $9.16b
-    and     $8.16b, $8.16b, $7.16b
-    add     $8.16b, $1.16b, $8.16b
-    abs     $9.16b, $9.16b
-.endm
-
-.macro DIFF_LUMA_LT4_P0_Q0_1
-    usubl $5.8h, $0.8b, $3.8b
-    usubl $6.8h, $2.8b, $1.8b
-    shl     $6.8h, $6.8h, #2
-    add     $5.8h, $5.8h, $6.8h
-    sqrshrn  $4.8b, $5.8h, #3
-.endm
-
-.macro DIFF_LUMA_LT4_P0_Q0_2
-    usubl2    $5.8h, $0.16b, $3.16b
-    usubl2    $6.8h, $2.16b, $1.16b
-    shl     $6.8h, $6.8h, #2
-    add     $5.8h, $5.8h, $6.8h
-    sqrshrn2  $4.16b, $5.8h, #3
-.endm
-
-.macro EXTRACT_DELTA_INTO_TWO_PART
-    cmge  $1.16b, $0.16b, #0
-    and     $1.16b, $0.16b, $1.16b
-    sub     $0.16b, $1.16b, $0.16b
-.endm
-
-.macro DIFF_LUMA_EQ4_P2P1P0_1
-    uaddl $8.8h, $1.8b, $2.8b
-    uaddl $9.8h, $3.8b, $4.8b
-    add   $9.8h, $9.8h, $8.8h
-
-    uaddl $8.8h, $0.8b, $1.8b
-    shl   $8.8h, $8.8h, #1
-    add   $8.8h, $9.8h, $8.8h
-
-    rshrn $0.8b, $9.8h, #2
-    rshrn $7.8b, $8.8h, #3
-    shl     $9.8h, $9.8h, #1
-    usubl   $8.8h, $5.8b, $1.8b
-    add     $9.8h, $8.8h, $9.8h
-
-    uaddl $8.8h, $2.8b, $5.8b
-    uaddw $8.8h, $8.8h, $2.8b
-    uaddw $8.8h, $8.8h, $3.8b
-
-    rshrn $9.8b, $9.8h, #3
-    rshrn $8.8b, $8.8h, #2
-    bsl       $6.8b, $9.8b, $8.8b
-.endm
-
-.macro DIFF_LUMA_EQ4_P2P1P0_2
-    uaddl2 $8.8h, $1.16b, $2.16b
-    uaddl2 $9.8h, $3.16b, $4.16b
-    add   $9.8h, $9.8h, $8.8h
-
-    uaddl2 $8.8h, $0.16b, $1.16b
-    shl   $8.8h, $8.8h, #1
-    add   $8.8h, $9.8h, $8.8h
-
-    rshrn2    $0.16b, $9.8h, #2
-    rshrn2    $7.16b, $8.8h, #3
-    shl     $9.8h, $9.8h, #1
-    usubl2   $8.8h, $5.16b, $1.16b
-    add     $9.8h, $8.8h, $9.8h
-
-    uaddl2    $8.8h, $2.16b, $5.16b
-    uaddw2    $8.8h, $8.8h, $2.16b
-    uaddw2    $8.8h, $8.8h, $3.16b
-
-    rshrn2    $9.16b, $9.8h, #3
-    rshrn2    $8.16b, $8.8h, #2
-    bsl       $6.16b, $9.16b, $8.16b
-.endm
-
-
-.macro DIFF_CHROMA_EQ4_P0Q0_1
-    uaddl $4.8h, $0.8b, $3.8b
-    shl   $4.8h, $4.8h, #1
-    usubl $5.8h, $1.8b, $3.8b
-    add   $5.8h, $5.8h, $4.8h
-    rshrn $6.8b, $5.8h, #2
-    usubl $5.8h, $2.8b, $0.8b
-    add   $5.8h, $5.8h, $4.8h
-    rshrn $7.8b, $5.8h, #2
-.endm
-
-.macro DIFF_CHROMA_EQ4_P0Q0_2
-    uaddl2 $4.8h, $0.16b, $3.16b
-    shl   $4.8h, $4.8h, #1
-    usubl2 $5.8h, $1.16b, $3.16b
-    add   $5.8h, $5.8h, $4.8h
-    rshrn2 $6.16b, $5.8h, #2
-    usubl2 $5.8h, $2.16b, $0.16b
-    add   $5.8h, $5.8h, $4.8h
-    rshrn2 $7.16b, $5.8h, #2
-.endm
-
-.macro DIFF_LUMA_EQ4_MASK
-    mov   $3.16b, $2.16b
-    bsl   $3.16b, $0.16b, $1.16b
-.endm
-
-.macro LOAD_LUMA_DATA_3
-    ld3   {$0.b, $1.b, $2.b} [$6], [x2], x1
-    ld3   {$3.b, $4.b, $5.b} [$6], [x0], x1
-.endm
-
-.macro LOAD_LUMA_DATA_4
-    ld4   {$0.b, $1.b, $2.b, $3.b} [$8], [x3], x1
-    ld4   {$4.b, $5.b, $6.b, $7.b} [$8], [x0], x1
-.endm
-
-.macro STORE_LUMA_DATA_4
-    st4   {$0.b, $1.b, $2.b, $3.b} [$4], [x0], x1
-    st4   {$0.b, $1.b, $2.b, $3.b} [$5], [x2], x1
-.endm
-
-.macro STORE_LUMA_DATA_3
-    st3 {$0.b, $1.b, $2.b} [$6], [x3], x1
-    st3   {$3.b, $4.b, $5.b} [$6], [x0], x1
-.endm
-
-.macro LOAD_CHROMA_DATA_4
-    ld4   {$0.b, $1.b, $2.b, $3.b} [$5], [$4], x2
-.endm
-
-.macro STORE_CHROMA_DATA_2
-    st2   {$0.b, $1.b} [$3], [$2], x2
-.endm
-
-.macro ZERO_JUMP_END
-    mov $1, $0.d[0]
-    mov $2, $0.d[1]
-    orr $1, $1, $2
-    cbz $1, $3
-.endm
-
-.macro BS_NZC_CHECK
-    ld1 {v0.16b}, [$0]
-    //Arrange the input data --- TOP
-    ands     x6, $1, #2
-    cbz      x6, bs_nzc_check_jump0
-    sub      x6, $0, $2, lsl #4
-    sub      x6, x6, $2, lsl #3
-    add      x6, x6, #12
-    ld1      {v1.s} [3], [x6]
-
-    bs_nzc_check_jump0:
-    ext      v1.16b, v1.16b, v0.16b, #12
-    add      $3.16b, v0.16b, v1.16b
-
-    // Arrange the input data --- LEFT
-    ands     x6, $1, #1
-    cbz      x6, bs_nzc_check_jump1
-
-    sub      x6, $0, #21
-    add      x7, x6, #4
-    ld1      {v1.b} [12], [x6]
-    add      x6, x7, #4
-    ld1      {v1.b} [13], [x7]
-    add      x7, x6, #4
-    ld1      {v1.b} [14], [x6]
-    ld1      {v1.b} [15], [x7]
-
-bs_nzc_check_jump1:
-    ins      v2.d[0], v0.d[1]
-    zip1     v0.16b, v0.16b, v2.16b
-    ins      v2.d[0], v0.d[1]
-    zip1     v0.16b, v0.16b, v2.16b
-    ext      v1.16b, v1.16b, v0.16b, #12
-    add      $4.16b, v0.16b, v1.16b
-.endm
-
-.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5
-    mov   w6, #4
-    sabd  v20.8h, $0.8h, $1.8h
-    sabd  v21.8h, $1.8h, $2.8h
-    dup   $0.8h, w6
-    sabd  v22.8h, $2.8h, $3.8h
-    sabd  v23.8h, $3.8h, $4.8h
-
-    cmge  v20.8h, v20.8h, $0.8h
-    cmge  v21.8h, v21.8h, $0.8h
-    cmge  v22.8h, v22.8h, $0.8h
-    cmge  v23.8h, v23.8h, $0.8h
-
-    addp v20.8h, v20.8h, v21.8h
-    addp v21.8h, v22.8h, v23.8h
-
-    addhn  $5.8b, v20.8h, v20.8h
-    addhn2  $5.16b, v21.8h, v21.8h
-.endm
-
-.macro BS_MV_CHECK
-    ldp q0, q1, [$0], #32
-    ldp q2, q3, [$0]
-    sub $0, $0, #32
-    // Arrenge the input data --- TOP
-    ands     x6, $1, #2
-    cbz     x6, bs_mv_check_jump0
-    sub      x6, $0, $2, lsl #6
-    add      x6, x6, #48
-    ld1      {v4.16b}, [x6]
-bs_mv_check_jump0:
-    BS_COMPARE_MV  v4, v0, v1, v2, v3, $3
-    // Arrange the input data --- LEFT
-    ands     x6, $1, #1
-    cbz      x6, bs_mv_check_jump1
-    sub      x6, $0, #52
-    add      x7, x6, #16
-    ld1      {v4.s} [0], [x6]
-    add      x6, x7, #16
-    ld1      {v4.s} [1], [x7]
-    add      x7, x6, #16
-    ld1      {v4.s} [2], [x6]
-    ld1      {v4.s} [3], [x7]
-bs_mv_check_jump1:
-    zip1  $5.4s, v0.4s, v2.4s
-    zip2  $6.4s, v0.4s, v2.4s
-    zip1  v0.4s, v1.4s, v3.4s
-    zip2  v2.4s, v1.4s, v3.4s
-    zip2  v1.4s, $5.4s, v0.4s
-    zip1  v0.4s, $5.4s, v0.4s
-    zip2  v3.4s, $6.4s, v2.4s
-    zip1  v2.4s, $6.4s, v2.4s
-    BS_COMPARE_MV  v4, v0, v1, v2, v3, $4
-.endm
-
-#else
-
 .macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
     uabd    \arg6\().16b, \arg1\().16b, \arg2\().16b
     cmhi    \arg6\().16b, \arg4\().16b, \arg6\().16b
@@ -549,7 +290,6 @@
     zip1  v2.4s, \arg6\().4s, v2.4s
     BS_COMPARE_MV  v4, v0, v1, v2, v3, \arg4
 .endm
-#endif
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon
     mov w1, #1
--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -35,181 +35,6 @@
 .align 4
 filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 
-#ifdef __APPLE__
-
-.macro FILTER_6TAG_8BITS1
-//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
-    uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
-    uaddl v19.8h, $2.8b, $3.8b  //src[0]+src[1]
-    mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
-    uaddl v19.8h, $1.8b, $4.8b  //src[-1]+src[2]
-    mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
-    sqrshrun $6.8b, v18.8h, #5
-//  }
-.endm
-
-.macro FILTER_6TAG_8BITS2
-//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
-    uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
-    uaddl2 v19.8h, $2.16b, $3.16b   //src[0]+src[1]
-    mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
-    uaddl2 v19.8h, $1.16b, $4.16b  //src[-1]+src[2]
-    mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
-    sqrshrun2 $6.16b, v18.8h, #5
-//  }
-.endm
-
-.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0
-//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
-    uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
-    uaddl v19.8h, $2.8b, $3.8b  //src[0]+src[1]
-    mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
-    uaddl v19.8h, $1.8b, $4.8b  //src[-1]+src[2]
-    mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
-    sqrshrun $6.8b, v18.8h, #5
-    uaddl  v19.8h, $2.8b, $6.8b
-    rshrn $6.8b, v19.8h, #1
-//  }
-.endm
-
-.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0
-//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
-    uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
-    uaddl2 v19.8h, $2.16b, $3.16b   //src[0]+src[1]
-    mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
-    uaddl2 v19.8h, $1.16b, $4.16b  //src[-1]+src[2]
-    mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
-    sqrshrun2 $6.16b, v18.8h, #5
-    uaddl2  v19.8h, $2.16b, $6.16b
-    rshrn2 $6.16b, v19.8h, #1
-//  }
-.endm
-
-.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1
-//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
-    uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
-    uaddl v19.8h, $2.8b, $3.8b  //src[0]+src[1]
-    mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
-    uaddl v19.8h, $1.8b, $4.8b  //src[-1]+src[2]
-    mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
-    sqrshrun $6.8b, v18.8h, #5
-    uaddl  v19.8h, $3.8b, $6.8b
-    rshrn $6.8b, v19.8h, #1
-//  }
-.endm
-
-.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1
-//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
-    uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
-    uaddl2 v19.8h, $2.16b, $3.16b   //src[0]+src[1]
-    mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
-    uaddl2 v19.8h, $1.16b, $4.16b  //src[-1]+src[2]
-    mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
-    sqrshrun2 $6.16b, v18.8h, #5
-    uaddl2  v19.8h, $3.16b, $6.16b
-    rshrn2 $6.16b, v19.8h, #1
-//  }
-.endm
-
-.macro FILTER_6TAG_8BITS_TO_16BITS1
-//  {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
-    uaddl   $6.8h, $0.8b, $5.8b     //dst_q=src[-2]+src[3]
-    uaddl   v31.8h, $2.8b, $3.8b    //src[0]+src[1]
-    mla $6.8h, v31.8h, $7.8h    //dst_q += 20*(src[0]+src[1]), 2 cycles
-    uaddl   v31.8h, $1.8b, $4.8b    //src[-1]+src[2]
-    mls $6.8h, v31.8h, $8.8h    //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//  }
-.endm
-
-.macro FILTER_6TAG_8BITS_TO_16BITS2
-//  {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
-    uaddl2  $6.8h, $0.16b, $5.16b       //dst_q=src[-2]+src[3]
-    uaddl2  v31.8h, $2.16b, $3.16b  //src[0]+src[1]
-    mla $6.8h, v31.8h, $7.8h    //dst_q += 20*(src[0]+src[1]), 2 cycles
-    uaddl2  v31.8h, $1.16b, $4.16b  //src[-1]+src[2]
-    mls $6.8h, v31.8h, $8.8h    //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//  }
-.endm
-
-.macro FILTER_3_IN_16BITS_TO_8BITS1
-//  {   // input:a, b, c, dst_d;
-    sub $0.8h, $0.8h, $1.8h         //a-b
-    sshr    $0.8h, $0.8h, #2            //(a-b)/4
-    sub $0.8h, $0.8h, $1.8h         //(a-b)/4-b
-    add $0.8h, $0.8h, $2.8h         //(a-b)/4-b+c
-    sshr    $0.8h, $0.8h, #2            //((a-b)/4-b+c)/4
-    add $0.8h, $0.8h, $2.8h         //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    sqrshrun    $3.8b, $0.8h, #6        //(+32)>>6
-//  }
-.endm
-
-.macro FILTER_3_IN_16BITS_TO_8BITS2
-//  {   // input:a, b, c, dst_d;
-    sub $0.8h, $0.8h, $1.8h         //a-b
-    sshr    $0.8h, $0.8h, #2            //(a-b)/4
-    sub $0.8h, $0.8h, $1.8h         //(a-b)/4-b
-    add $0.8h, $0.8h, $2.8h         //(a-b)/4-b+c
-    sshr    $0.8h, $0.8h, #2            //((a-b)/4-b+c)/4
-    add $0.8h, $0.8h, $2.8h         //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    sqrshrun2   $3.16b, $0.8h, #6       //(+32)>>6
-//  }
-.endm
-
-.macro UNPACK_2_16BITS_TO_ABC
-//  {   // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
-    ext $4.16b, $0.16b, $1.16b, #4      //src[0]
-    ext $3.16b, $0.16b, $1.16b, #6      //src[1]
-    add $4.8h, $4.8h, $3.8h                 //c=src[0]+src[1]
-
-    ext $3.16b, $0.16b, $1.16b, #2      //src[-1]
-    ext $2.16b, $0.16b, $1.16b, #8      //src[2]
-    add $3.8h, $3.8h, $2.8h                 //b=src[-1]+src[2]
-
-    ext $2.16b, $0.16b, $1.16b, #10     //src[3]
-    add $2.8h, $2.8h, $0.8h                 //a=src[-2]+src[3]
-//  }
-.endm
-
-.macro AVERAGE_TWO_8BITS1
-//  {   // input:dst_d, src_d A and B; working: v5
-    uaddl   v30.8h, $2.8b, $1.8b
-    rshrn   $0.8b, v30.8h, #1
-//  }
-.endm
-
-.macro AVERAGE_TWO_8BITS2
-//  {   // input:dst_d, src_d A and B; working: v5
-    uaddl2  v30.8h, $2.16b, $1.16b
-    rshrn2  $0.16b, v30.8h, #1
-//  }
-.endm
-
-.macro FILTER_SINGLE_TAG_8BITS      // when width=17/9, used
-//  {   // input: src_d{Y[0][1][2][3][4][5]X},
-    rev64   $2.8b, $0.8b                // X[5][4][3][2][1][0]O
-    uaddl   $2.8h, $0.8b, $2.8b         // each 16bits, *[50][41][32][23][14][05]*
-    mul $2.4h, $2.4h, $1.4h         // 0+1*[50]-5*[41]+20[32]
-    addv $3, $2.4h
-    sqrshrun $0.8b, $0.8h, #5
-//  }
-.endm
-
-.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
-//  {   // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
-    ext $3.16b, $1.16b, $1.16b, #14       // X[0][1][2][3][4][5]O
-    ext $4.16b, $3.16b, $3.16b, #8      // [3][4][5]OX[0][1][2]
-    rev64  $4.8h, $4.8h         // X[5][4][3][2][1][0]O
-    add   $3.8h, $3.8h, $4.8h    // each 16bits, *[50][41][32][23][14][05]*
-    smull $3.4s, $3.4h, $2.4h           // 0+1*[50]-5*[41]+20[32]
-    saddlv $5, $3.4s
-    //sshr $0.2d, $0.2d, #4
-    sqrshrun $0.2s, $0.2d, #10
-    uqxtn $0.4h, $0.4s
-    uqxtn $0.8b, $0.8h
-   //   }
-.endm
-
-#else
 .macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 //  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
@@ -382,7 +207,6 @@
     uqxtn \arg0\().8b, \arg0\().8h
    //   }
 .endm
-#endif
 
 //(const uint8_t* pSrc {x0}, int32_t iSrcStride{x1}, uint8_t* pDst{x2}, int32_t iDstStride{x3}, int32_t iHeight{x4})
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq16_AArch64_neon
--- a/codec/decoder/core/arm64/block_add_aarch64_neon.S
+++ b/codec/decoder/core/arm64/block_add_aarch64_neon.S
@@ -32,40 +32,7 @@
 
 #ifdef HAVE_NEON_AARCH64
 #include "arm_arch64_common_macro.S"
-#ifdef __APPLE__
-.macro ROW_TRANSFORM_1_STEP
-//  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
 
-    saddl       $4.4s, $0.4h, $2.4h          //int32 e[i][0] = src[0] + src[2];
-    ssubl       $5.4s, $0.4h, $2.4h          //int32 e[i][1] = src[0] - src[2];
-    sshr        $8.4h, $1.4h, #1
-    sshr        $9.4h, $3.4h, #1
-    ssubl       $6.4s, $8.4h, $3.4h          //int32 e[i][2] = (src[1]>>1)-src[3];
-    saddl       $7.4s, $1.4h, $9.4h          //int32 e[i][3] = src[1] + (src[3]>>1);
-//  }
-.endm
-
-.macro TRANSFORM_4BYTES // both row & col transform used
-//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
-    add       $0.4s, $4.4s, $7.4s          //int16 f[i][0] = e[i][0] + e[i][3];
-    add       $1.4s, $5.4s, $6.4s          //int16 f[i][1] = e[i][1] + e[i][2];
-    sub       $2.4s, $5.4s, $6.4s          //int16 f[i][2] = e[i][1] - e[i][2];
-    sub       $3.4s, $4.4s, $7.4s          //int16 f[i][3] = e[i][0] - e[i][3];
-//  }
-.endm
-
-.macro COL_TRANSFORM_1_STEP
-//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
-    add        $4.4s, $0.4s, $2.4s          //int32 e[0][j] = f[0][j] + f[2][j];
-    sub        $5.4s, $0.4s, $2.4s          //int32 e[1][j] = f[0][j] - f[2][j];
-    sshr        $6.4s, $1.4s, #1
-    sshr        $7.4s, $3.4s, #1
-    sub        $6.4s, $6.4s, $3.4s          //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    add        $7.4s, $1.4s, $7.4s          //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//  }
-.endm
-
-#else
 .macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
 //  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8\() \arg9\()
 
@@ -98,7 +65,6 @@
     add        \arg7\().4s, \arg1\().4s, \arg7\().4s          //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
 //  }
 .endm
-#endif
 
 //  uint8_t *pred, const int32_t stride, int16_t *rs
 WELS_ASM_AARCH64_FUNC_BEGIN IdctResAddPred_AArch64_neon
--- a/codec/encoder/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S
+++ b/codec/encoder/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S
@@ -93,93 +93,6 @@
     trn2    v17.4s, v4.4s, v5.4s    //{0,1,3,2, 4,5,7,6} v16 {8,9,11,10, 12,13,15,14} v17
 .endm
 
-#ifdef __APPLE__
-.macro SELECT_BEST_COST
-    cmp     w1, $0
-    csel    $0, $0, w1, $2
-    cset    w7, $1
-    cmp     w2, $0
-    mov     w6, #2
-    csel    $0, $0, w2, $2
-    csel    w7, w7, w6, $2
-.endm
-
-.macro SELECT_BEST_COST_PREFER_HIGHER arg0
-    SELECT_BEST_COST \arg0, ls, hi
-.endm
-
-.macro SELECT_BEST_COST_PREFER_LOWER arg0
-    SELECT_BEST_COST \arg0, lo, hs
-.endm
-
-.macro LOAD_CHROMA_DATA
-    sub     x9, $0, x1
-    ld1     {$1}, [x9]      //top_cb
-    sub     x9, $0, #1
-    ld1     {$2}[8], [x9], x1
-    ld1     {$2}[9], [x9], x1
-    ld1     {$2}[10], [x9], x1
-    ld1     {$2}[11], [x9], x1
-    ld1     {$2}[12], [x9], x1
-    ld1     {$2}[13], [x9], x1
-    ld1     {$2}[14], [x9], x1
-    ld1     {$2}[15], [x9], x1 //left_cb
-.endm
-
-.macro LOAD_8X4_DATA
-    //Load the p_enc data and save to "v20 ~ v21"--- 8X4 bytes
-    ld1     {v0.8b}, [$0], x3
-    ld1     {v1.8b}, [$0], x3
-    ld1     {v0.d}[1], [$0], x3
-    ld1     {v1.d}[1], [$0], x3
-    trn1    v2.4s, v0.4s, v1.4s
-    trn2    v1.4s, v0.4s, v1.4s
-    trn1    v20.2d, v2.2d, v1.2d
-    trn2    v21.2d, v2.2d, v1.2d
-.endm
-
-.macro HDM_TRANSFORM_4X4_L0
-    //Do the vertical transform
-    uadd$9   v0.8h, $0, $1
-    usub$9   v1.8h, $0, $1
-    trn1    v3.2d, v0.2d, v1.2d
-    trn2    v1.2d, v0.2d, v1.2d
-    add     v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
-    sub     v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}
-
-    //Do the horizontal transform
-    trn1    v0.4s, v4.4s, v5.4s
-    trn2    v1.4s, v4.4s, v5.4s
-    add     v4.8h, v0.8h, v1.8h
-    sub     v5.8h, v0.8h, v1.8h
-    trn1    v0.8h, v4.8h, v5.8h
-    trn2    v1.8h, v4.8h, v5.8h
-    add     v4.8h, v0.8h, v1.8h
-    sub     v5.8h, v0.8h, v1.8h
-
-    //16x16_v
-    trn1    v0.2s, v4.2s, v5.2s
-    trn2    v1.2s, v4.2s, v5.2s
-    sabal   $5, v0.4h, $2
-    sabal   $5, v1.4h, $8.4h
-    sabal2  $5, v4.8h, $8.8h
-    sabal2  $5, v5.8h, $8.8h
-
-    //16x16_h
-    ins     v3.d[0], v4.d[1]
-    trn1    v0.4h, v4.4h, v3.4h
-    trn2    v1.4h, v4.4h, v3.4h
-    sabal   $6, v0.4h, $3
-    sabdl   v4.4s, v1.4h, $8.4h
-    sabal   v4.4s, v5.4h, $8.4h
-    sabal2  v4.4s, v5.8h, $8.8h
-    add     $6, $6, v4.4s
-
-    //16x16_dc_both
-    sabal   $7, v0.4h, $4
-    add     $7, $7, v4.4s
-.endm
-#else
 .macro SELECT_BEST_COST arg0, arg1, arg2
     cmp     w1, \arg0
     csel    \arg0, \arg0, w1, \arg2
@@ -265,7 +178,6 @@
     sabal   \arg7, v0.4h, \arg4
     add     \arg7, \arg7, v4.4s
 .endm
-#endif
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon
     ldr     x11, [sp, #0]
--- a/codec/encoder/core/arm64/pixel_aarch64_neon.S
+++ b/codec/encoder/core/arm64/pixel_aarch64_neon.S
@@ -68,89 +68,6 @@
     ld1     {v7.16b}, [x0], x1
 .endm
 
-#ifdef __APPLE__
-.macro LOAD_8X8_2
-    ld1     {v16.8b}, [$0], x3
-    ld1     {v17.8b}, [$0], x3
-    ld1     {v18.8b}, [$0], x3
-    ld1     {v19.8b}, [$0], x3
-    ld1     {v20.8b}, [$0], x3
-    ld1     {v21.8b}, [$0], x3
-    ld1     {v22.8b}, [$0], x3
-    ld1     {v23.8b}, [$0], x3
-.endm
-
-.macro CALC_ABS_8X8_1
-    uab$1l  $0, v0.8b, v16.8b
-    uabal   $0, v1.8b, v17.8b
-    uabal   $0, v2.8b, v18.8b
-    uabal   $0, v3.8b, v19.8b
-    uabal   $0, v4.8b, v20.8b
-    uabal   $0, v5.8b, v21.8b
-    uabal   $0, v6.8b, v22.8b
-    uabal   $0, v7.8b, v23.8b
-.endm
-
-.macro CALC_ABS_8X8_2
-    uab$0l  v29.8h, v0.8b, v18.8b
-    uabal   v29.8h, v1.8b, v19.8b
-    uabal   v29.8h, v2.8b, v20.8b
-    uabal   v29.8h, v3.8b, v21.8b
-    uabal   v29.8h, v4.8b, v22.8b
-    uabal   v29.8h, v5.8b, v23.8b
-    uabal   v29.8h, v6.8b, v24.8b
-    uabal   v29.8h, v7.8b, v25.8b
-.endm
-
-.macro LOAD_16X8_2
-    ld1     {v16.16b}, [$0], x3
-    ld1     {v17.16b}, [$0], x3
-    ld1     {v18.16b}, [$0], x3
-    ld1     {v19.16b}, [$0], x3
-    ld1     {v20.16b}, [$0], x3
-    ld1     {v21.16b}, [$0], x3
-    ld1     {v22.16b}, [$0], x3
-    ld1     {v23.16b}, [$0], x3
-.endm
-
-.macro CALC_ABS_16X8_1
-    uab$1l  $0, v0.8b, v16.8b
-    uabal2  $0, v0.16b,v16.16b
-    uabal   $0, v1.8b, v17.8b
-    uabal2  $0, v1.16b,v17.16b
-    uabal   $0, v2.8b, v18.8b
-    uabal2  $0, v2.16b,v18.16b
-    uabal   $0, v3.8b, v19.8b
-    uabal2  $0, v3.16b,v19.16b
-    uabal   $0, v4.8b, v20.8b
-    uabal2  $0, v4.16b,v20.16b
-    uabal   $0, v5.8b, v21.8b
-    uabal2  $0, v5.16b,v21.16b
-    uabal   $0, v6.8b, v22.8b
-    uabal2  $0, v6.16b,v22.16b
-    uabal   $0, v7.8b, v23.8b
-    uabal2  $0, v7.16b,v23.16b
-.endm
-
-.macro CALC_ABS_16X8_2
-    uab$0l  v29.8h, v0.8b, v18.8b
-    uabal2  v29.8h, v0.16b,v18.16b
-    uabal   v29.8h, v1.8b, v19.8b
-    uabal2  v29.8h, v1.16b,v19.16b
-    uabal   v29.8h, v2.8b, v20.8b
-    uabal2  v29.8h, v2.16b,v20.16b
-    uabal   v29.8h, v3.8b, v21.8b
-    uabal2  v29.8h, v3.16b,v21.16b
-    uabal   v29.8h, v4.8b, v22.8b
-    uabal2  v29.8h, v4.16b,v22.16b
-    uabal   v29.8h, v5.8b, v23.8b
-    uabal2  v29.8h, v5.16b,v23.16b
-    uabal   v29.8h, v6.8b, v24.8b
-    uabal2  v29.8h, v6.16b,v24.16b
-    uabal   v29.8h, v7.8b, v25.8b
-    uabal2  v29.8h, v7.16b,v25.16b
-.endm
-#else
 .macro LOAD_8X8_2 arg0
     ld1     {v16.8b}, [\arg0], x3
     ld1     {v17.8b}, [\arg0], x3
@@ -232,7 +149,6 @@
     uabal   v29.8h, v7.8b, v25.8b
     uabal2  v29.8h, v7.16b,v25.16b
 .endm
-#endif
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad4x4_AArch64_neon
     sxtw    x1, w1
--- a/codec/encoder/core/arm64/reconstruct_aarch64_neon.S
+++ b/codec/encoder/core/arm64/reconstruct_aarch64_neon.S
@@ -33,247 +33,6 @@
 #ifdef  HAVE_NEON_AARCH64
 #include "arm_arch64_common_macro.S"
 
-#ifdef __APPLE__
-.macro ZERO_COUNT_IN_2_QUARWORD
-//  {   //  input:  coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q
-    cmeq    $0.8h, $0.8h, #0
-    cmeq    $1.8h, $1.8h, #0
-    uzp1    $0.16b, $0.16b, $1.16b
-    ushr    $0.16b, $0.16b, 7
-    addv    $2, $0.16b
-//  }
-.endm
-
-.macro NEWQUANT_COEF_EACH_16BITS    // if coef <= 0, - coef; else , coef;
-//  {   //  input:  coef, ff (dst), mf
-    eor     $3.16b, $3.16b, $3.16b          // init 0 , and keep 0;
-    saba    $1.8h, $0.8h, $3.8h      // f + abs(coef - 0)
-    smull   $4.4s, $1.4h, $2.4h
-    smull2  $5.4s, $1.8h, $2.8h
-    shrn    $1.4h, $4.4s, #16
-    shrn2   $1.8h, $5.4s, #16
-
-    cmgt    $4.8h, $0.8h, #0      // if true, location of coef == 11111111
-    bif     $3.16b, $1.16b, $4.16b      // if (x<0) reserved part; else keep 0 untouched
-    shl     $3.8h, $3.8h, #1
-    sub     $1.8h, $1.8h, $3.8h      // if x > 0, -= 0; else x-= 2x
-//  }
-.endm
-
-.macro NEWQUANT_COEF_EACH_16BITS_MAX    // if coef <= 0, - coef; else , coef;
-//  {   //  input:  coef, ff (dst), mf
-    eor     $3.16b, $3.16b, $3.16b          // init 0 , and keep 0;
-    saba    $1.8h, $0.8h, $3.8h      // f + abs(coef - 0)
-    smull   $4.4s, $1.4h, $2.4h
-    smull2  $5.4s, $1.8h, $2.8h
-    shrn    $1.4h, $4.4s, #16
-    shrn2   $1.8h, $5.4s, #16
-
-    cmgt    $4.8h, $0.8h, #0      // if true, location of coef == 11111111
-    bif     $3.16b, $1.16b, $4.16b      // if (x<0) reserved part; else keep 0 untouched
-    shl     $3.8h, $3.8h, #1
-    mov     $6.16b, $1.16b
-    sub     $1.8h, $1.8h, $3.8h      // if x > 0, -= 0; else x-= 2x
-//  }
-.endm
-
-.macro QUANT_DUALWORD_COEF_EACH_16BITS  // if coef <= 0, - coef; else , coef;
-//  {   //  input:  coef, ff (dst), mf
-    saba    $1.8h, $0.8h, $3.8h      // f + abs(coef - 0)
-    smull   $4.4s, $1.4h, $2.4h
-    shrn    $1.4h, $4.4s, #16
-
-    cmgt    $4.8h, $0.8h, #0      // if true, location of coef == 11111111
-    bif     $3.16b, $1.16b, $4.16b      // if (x<0) reserved part; else keep 0 untouched
-    shl     $3.8h, $3.8h, #1
-    sub     $1.8h, $1.8h, $3.8h      // if x > 0, -= 0; else x-= 2x
-//  }
-.endm
-
-.macro SELECT_MAX_IN_ABS_COEF
-//  {   //  input:  coef_0, coef_1, coef_2, coef_3, max_q (identy to follow two)
-    umax    $0.8h, $0.8h, $1.8h
-    umaxv   $4, $0.8h
-    umax    $2.8h, $2.8h, $3.8h
-    umaxv   $5, $2.8h
-//  }
-.endm
-
-.macro HDM_QUANT_2x2_TOTAL_16BITS
-//  {   //  input: src_d[0][16][32][48], dst_d[0][16][32][48], working
-    sshr  $1.2d, $0.2d, #32
-    add   $2.4h, $0.4h, $1.4h      // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
-    sub   $1.4h, $0.4h, $1.4h      // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
-    zip1  $1.4h, $2.4h, $1.4h
-//  }
-.endm
-
-
-.macro DC_ZERO_COUNT_IN_DUALWORD
-//  {   //  input:  coef, dst_d, working_d (all 0x01)
-    cmeq    $0.4h, $0.4h, #0
-    and     $0.8b, $0.8b, $2.8b
-    addv    $1, $0.4h
-//  }
-.endm
-
-.macro IHDM_4x4_TOTAL_16BITS
-//  {   //  input: each src_d[0]~[3](dst), working_q0, working_q1
-    uzp2  $1.4s, $0.4s, $0.4s
-    uzp1  $0.4s, $0.4s, $0.4s
-    add   $2.8h, $0.8h, $1.8h      // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
-    sub   $1.8h, $0.8h, $1.8h      // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
-    zip1  $2.8h, $2.8h, $1.8h      // [0] = rs[0] + rs[2]; [1] = rs[0] - rs[2]; ... [2]; [3]
-
-    uzp2  $1.4s, $2.4s, $2.4s
-    uzp1  $0.4s, $2.4s, $2.4s
-    add   $2.8h, $0.8h, $1.8h      // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
-    sub   $1.8h, $0.8h, $1.8h      // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
-    rev32 $1.4h, $1.4h             // [0] = rs[1] - rs[3];[1] = rs[0] - rs[2];[2] = rs[5] - rs[7];[3] = rs[4] - rs[6];
-    zip1  $0.4s, $2.4s, $1.4s
-//  }
-.endm
-
-.macro MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2
-//  {   //  input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
-    uzp1 $2.4s, $0.4s, $1.4s   //[0 1 4 5]+[8 9 12 13]
-    uzp2 $3.4s, $0.4s, $1.4s   //[2 3 6 7]+[10 11 14 15]
-
-    uzp1 $0.8h, $2.8h, $3.8h   //[0 4 8 12]+[2 6 10 14]
-    uzp2 $2.8h, $2.8h, $3.8h   //[1 5 9 13]+[3 7 11 15]
-    zip2 $1.2d, $0.2d, $2.2d   //[2 6 10 14]+[3 7 11 15]
-    zip1 $0.2d, $0.2d, $2.2d   //[0 4 8 12]+[1 5 9 13]
-//  }
-.endm
-
-.macro MATRIX_TRANSFORM_EACH_16BITS_OUT4
-//  {   //  input & output: src_d[0]~[3];[0 4 8 12],[1 5 9 13],[2 6 10 14],[3 7 11 15]
-    trn1 $4.8h, v0.8h, v1.8h
-    trn2 $5.8h, v0.8h, v1.8h
-    trn1 $6.8h, v2.8h, v3.8h
-    trn2 $7.8h, v2.8h, v3.8h
-
-    trn1 $0.4s, v4.4s, v6.4s
-    trn2 $2.4s, v4.4s, v6.4s
-    trn1 $1.4s, v5.4s, v7.4s
-    trn2 $3.4s, v5.4s, v7.4s
-//  }
-.endm
-
-.macro MATRIX_TRANSFORM_EACH_16BITS_4x4_OUT2
-//  {   //  input & output: src_d[0]~[3];[0 1 2 3],[4 5 6 7],[8 9 10 11],[12 13 14 15]
-    mov  $0.d[1], $1.d[0]  //[0 1 2 3]+[4 5 6 7]
-    mov  $2.d[1], $3.d[0]  //[8 9 10 11]+[12 13 14 15]
-    uzp1 $1.4s, $0.4s, $2.4s   //[0 1 4 5]+[8 9 12 13]
-    uzp2 $3.4s, $0.4s, $2.4s   //[2 3 6 7]+[10 11 14 15]
-
-    uzp1 $0.8h, $1.8h, $3.8h   //[0 4 8 12]+[2 6 10 14]
-    uzp2 $2.8h, $1.8h, $3.8h   //[1 5 9 13]+[3 7 11 15]
-    zip2 $1.2d, $0.2d, $2.2d   //[2 6 10 14]+[3 7 11 15]
-    zip1 $0.2d, $0.2d, $2.2d   //[0 4 8 12]+[1 5 9 13]
-//  }
-.endm
-
-.macro LOAD_4x4_DATA_FOR_DCT
-    ld1   {$0.s}[0], [$2], $3
-    ld1   {$0.s}[1], [$2], $3
-    ld1   {$0.s}[2], [$2], $3
-    ld1   {$0.s}[3], [$2]
-
-    ld1   {$1.s}[0], [$4], $5
-    ld1   {$1.s}[1], [$4], $5
-    ld1   {$1.s}[2], [$4], $5
-    ld1   {$1.s}[3], [$4]
-.endm
-
-.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
-//  {   //  input: src_d[0]~[3], working: [4]~[7]
-    add     $4.8h, $0.8h, $3.8h   //int16 s[0] = data[i] + data[i3];
-    sub     $7.8h, $0.8h, $3.8h   //int16 s[3] = data[i] - data[i3];
-    add     $5.8h, $1.8h, $2.8h   //int16 s[1] = data[i1] + data[i2];
-    sub     $6.8h, $1.8h, $2.8h   //int16 s[2] = data[i1] - data[i2];
-
-    add     $0.8h, $4.8h, $5.8h   //int16 dct[i ] = s[0] + s[1];
-    sub     $2.8h, $4.8h, $5.8h   //int16 dct[i2] = s[0] - s[1];
-    shl     $1.8h, $7.8h, #1
-    shl     $3.8h, $6.8h, #1
-    add     $1.8h, $1.8h, $6.8h   //int16 dct[i1] = (s[3] << 1) + s[2];
-    sub     $3.8h, $7.8h, $3.8h   //int16 dct[i3] = s[3] - (s[2] << 1);
-//  }
-.endm
-
-.macro LOAD_8x4_DATA_FOR_DCT
-//  {   //  input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
-    ld1   {$0.d}[0], [$8], x2
-    ld1   {$1.d}[0], [$8], x2
-    ld1   {$2.d}[0], [$8], x2
-    ld1   {$3.d}[0], [$8], x2
-
-    ld1   {$4.d}[0], [$9], x4
-    ld1   {$5.d}[0], [$9], x4
-    ld1   {$6.d}[0], [$9], x4
-    ld1   {$7.d}[0], [$9], x4
-//  }
-.endm
-
-.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
-//  {   //  input: src_d[0]~[3], output: e_d[0]~[3];
-    add   $4.8h, $0.8h, $2.8h          //int16 e[i][0] = src[0] + src[2];
-    sub   $5.8h, $0.8h, $2.8h          //int16 e[i][1] = src[0] - src[2];
-    sshr  $6.8h, $1.8h, #1
-    sshr  $7.8h, $3.8h, #1
-    sub   $6.8h, $6.8h, $3.8h          //int16 e[i][2] = (src[1]>>1)-src[3];
-    add   $7.8h, $1.8h, $7.8h          //int16 e[i][3] = src[1] + (src[3]>>1);
-//  }
-.endm
-
-.macro TRANSFORM_TOTAL_16BITS   // both row & col transform used
-//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
-    add   $0.8h, $4.8h, $7.8h          //int16 f[i][0] = e[i][0] + e[i][3];
-    add   $1.8h, $5.8h, $6.8h          //int16 f[i][1] = e[i][1] + e[i][2];
-    sub   $2.8h, $5.8h, $6.8h          //int16 f[i][2] = e[i][1] - e[i][2];
-    sub   $3.8h, $4.8h, $7.8h          //int16 f[i][3] = e[i][0] - e[i][3];
-//  }
-.endm
-
-.macro ROW_TRANSFORM_0_STEP
-//  {   //  input: src_d[0]~[3], output: e_q[0]~[3];
-    saddl   $4.4s, $0.4h, $2.4h          //int32 e[i][0] = src[0] + src[2];
-    ssubl   $5.4s, $0.4h, $2.4h          //int32 e[i][1] = src[0] - src[2];
-    ssubl   $6.4s, $1.4h, $3.4h          //int32 e[i][2] = src[1] - src[3];
-    saddl   $7.4s, $1.4h, $3.4h          //int32 e[i][3] = src[1] + src[3];
-//  }
-.endm
-
-.macro COL_TRANSFORM_0_STEP
-//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
-    add     $4.4s, $0.4s, $2.4s          //int32 e[0][j] = f[0][j] + f[2][j];
-    sub     $5.4s, $0.4s, $2.4s          //int32 e[1][j] = f[0][j] - f[2][j];
-    sub     $6.4s, $1.4s, $3.4s          //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    add     $7.4s, $1.4s, $3.4s          //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//  }
-.endm
-
-.macro TRANSFORM_4BYTES // both row & col transform used
-//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
-    add     $0.4s, $4.4s, $7.4s          //int16 f[i][0] = e[i][0] + e[i][3];
-    add     $1.4s, $5.4s, $6.4s          //int16 f[i][1] = e[i][1] + e[i][2];
-    sub     $2.4s, $5.4s, $6.4s          //int16 f[i][2] = e[i][1] - e[i][2];
-    sub     $3.4s, $4.4s, $7.4s          //int16 f[i][3] = e[i][0] - e[i][3];
-//  }
-.endm
-
-.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
-//  {   //  input: pred_d[0](output), dct_q0/1, working_q0/1;
-    uxtl      $3.8h, $0.8b
-    uxtl2     $4.8h, $0.16b
-    add       $3.8h, $3.8h, $1.8h
-    add       $4.8h, $4.8h, $2.8h
-    sqxtun    $0.8b, $3.8h
-    sqxtun2   $0.16b,$4.8h
-//  }
-.endm
-#else
 .macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2
 //  {   //  input:  coef_0 (identy to \arg3\() \arg4\()), coef_1(identy to \arg5\() \arg6\()), mask_q
     cmeq    \arg0\().8h, \arg0\().8h, #0
@@ -518,7 +277,6 @@
     sqxtun2   \arg0\().16b,\arg4\().8h
 //  }
 .endm
-#endif
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsGetNoneZeroCount_AArch64_neon
     ld1     {v0.8h, v1.8h}, [x0]
--- a/codec/processing/src/arm64/vaa_calc_aarch64_neon.S
+++ b/codec/processing/src/arm64/vaa_calc_aarch64_neon.S
@@ -33,29 +33,6 @@
 #ifdef HAVE_NEON_AARCH64
 #include "arm_arch64_common_macro.S"
 
-#ifdef __APPLE__
-.macro ABS_SUB_SUM_16BYTES
-    ld1     {v0.16b}, [x0], x4
-    ld1     {v1.16b}, [x1], x4
-    uabal   $0, v0.8b, v1.8b
-    uabal2  $1, v0.16b,v1.16b
-.endm
-
-.macro ABS_SUB_SUM_8x16BYTES
-    ld1     {v0.16b}, [x0], x4
-    ld1     {v1.16b}, [x1], x4
-    uabdl   $0, v0.8b, v1.8b
-    uabdl2  $1, v0.16b,v1.16b
-
-    ABS_SUB_SUM_16BYTES $0, $1
-    ABS_SUB_SUM_16BYTES $0, $1
-    ABS_SUB_SUM_16BYTES $0, $1
-    ABS_SUB_SUM_16BYTES $0, $1
-    ABS_SUB_SUM_16BYTES $0, $1
-    ABS_SUB_SUM_16BYTES $0, $1
-    ABS_SUB_SUM_16BYTES $0, $1
-.endm
-#else
 .macro ABS_SUB_SUM_16BYTES arg0, arg1
     ld1     {v0.16b}, [x0], x4
     ld1     {v1.16b}, [x1], x4
@@ -77,7 +54,6 @@
     ABS_SUB_SUM_16BYTES \arg0, \arg1
     ABS_SUB_SUM_16BYTES \arg0, \arg1
 .endm
-#endif
 
 /*
  * void vaa_calc_sad_neon(uint8_t *cur_data, uint8_t *ref_data, int32_t pic_width, int32_t pic_height, int32_t pic_stride,