shithub: openh264

ref: 3956bccc3de5255859033daa5aebfb3cbe70963f
parent: b66703133e92cde6b62e436be1ac8e022766d923
author: Martin Storsjö <[email protected]>
date: Wed Apr 23 07:25:40 EDT 2014

Use the correct, official syntax for aarch64 mov instructions

Previously this used unofficial, Apple-specific syntax (with fallback
macros for GNU binutils), since Xcode 5.x didn't support the official
syntax of these instructions. Since Xcode 6 has been out for quite a
number of months already, it should be safe to require this (for
building 64-bit binaries for iOS; armv7 builds can still be built
with older Xcode versions).

This clarifies the code by avoiding Apple-specific syntax in the
assembler instructions.
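
For illustration (not part of the patch), the two spellings differ only in
where the vector arrangement qualifier is written: the Apple assembler puts
it on the mnemonic, while the official ARMv8-A syntax puts it on each
register operand. The register names below are arbitrary examples:

    // Apple-specific spelling, previously emulated on GNU binutils via the
    // removed mov.16b/mov.8b fallback macros:
    mov.16b v23, v21

    // Official syntax: MOV Vd.16B, Vn.16B (an alias of ORR Vd.16B, Vn.16B, Vn.16B):
    mov     v23.16b, v21.16b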

--- a/codec/common/arm64/arm_arch64_common_macro.S
+++ b/codec/common/arm64/arm_arch64_common_macro.S
@@ -59,12 +59,4 @@
 .endfunc
 .endm
 
-.macro mov.16b arg0, arg1
-    mov \arg0\().16b, \arg1\().16b
-.endm
-
-.macro mov.8b arg0, arg1
-    mov \arg0\().8b, \arg1\().8b
-.endm
-
 #endif
--- a/codec/common/arm64/deblocking_aarch64_neon.S
+++ b/codec/common/arm64/deblocking_aarch64_neon.S
@@ -162,7 +162,7 @@
 .endm
 
 .macro DIFF_LUMA_EQ4_MASK
-    mov.16b   $3, $2
+    mov   $3.16b, $2.16b
     bsl   $3.16b, $0.16b, $1.16b
 .endm
 
@@ -420,7 +420,7 @@
 .endm
 
 .macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
-    mov.16b   \arg3, \arg2
+    mov   \arg3\().16b, \arg2\().16b
     bsl   \arg3\().16b, \arg0\().16b, \arg1\().16b
 .endm
 
@@ -652,10 +652,10 @@
     and     v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0
     and     v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2))
 
-    mov.16b v23, v21
-    mov.16b v24, v21
+    mov v23.16b, v21.16b
+    mov v24.16b, v21.16b
 
-    mov.16b v25, v0
+    mov v25.16b, v0.16b
     DIFF_LUMA_EQ4_P2P1P0_1        v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
     DIFF_LUMA_EQ4_P2P1P0_2        v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
     ins v0.d[1], v25.d[1]
@@ -669,9 +669,9 @@
     st1   {v17.16b}, [x3], x1
 
 
-    mov.16b v23, v22
-    mov.16b v24, v22
-    mov.16b v25, v7
+    mov v23.16b, v22.16b
+    mov v24.16b, v22.16b
+    mov v25.16b, v7.16b
     DIFF_LUMA_EQ4_P2P1P0_1        v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
     DIFF_LUMA_EQ4_P2P1P0_2        v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
     ins v7.d[1], v25.d[1]
@@ -728,7 +728,7 @@
     sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333
 
     DIFF_LUMA_LT4_P1_Q1   v0, v1, v2, v3, v17, v18, v6, v7, v19, v20 //Use Tmp v23,v24
-    mov.16b v25, v19
+    mov v25.16b, v19.16b
 
     DIFF_LUMA_LT4_P1_Q1   v5, v4, v3, v2, v17, v18, v6, v7, v21, v22 //Use Tmp v23,v24
 
@@ -749,11 +749,11 @@
     EXTRACT_DELTA_INTO_TWO_PART   v19, v20
     uqadd v2.16b, v2.16b, v20.16b
     uqsub v2.16b, v2.16b, v19.16b
-    mov.16b v26, v2
+    mov v26.16b, v2.16b
     uqsub v3.16b, v3.16b, v20.16b
     uqadd v3.16b, v3.16b, v19.16b
-    mov.16b v27, v3
-    mov.16b v28, v21
+    mov v27.16b, v3.16b
+    mov v28.16b, v21.16b
 
     sub   x0, x0, #2
     add   x2, x0, x1
@@ -816,10 +816,10 @@
     and     v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0
     and     v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2))
 
-    mov.16b v23, v21
-    mov.16b v24, v21
+    mov v23.16b, v21.16b
+    mov v24.16b, v21.16b
 
-    mov.16b v25, v0
+    mov v25.16b, v0.16b
     DIFF_LUMA_EQ4_P2P1P0_1        v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
     DIFF_LUMA_EQ4_P2P1P0_2        v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
     ins v0.d[1], v25.d[1]
@@ -826,16 +826,16 @@
     ins v23.d[1], v24.d[1]
     and   v21.16b, v20.16b, v21.16b
     DIFF_LUMA_EQ4_MASK    v19, v1, v21, v17
-    mov.16b v26, v17
+    mov v26.16b, v17.16b
     DIFF_LUMA_EQ4_MASK    v0, v2, v21, v17
-    mov.16b v27, v17
+    mov v27.16b, v17.16b
     DIFF_LUMA_EQ4_MASK    v23, v3, v18, v17
-    mov.16b v28, v17
+    mov v28.16b, v17.16b
 
 
-    mov.16b v23, v22
-    mov.16b v24, v22
-    mov.16b v25, v7
+    mov v23.16b, v22.16b
+    mov v24.16b, v22.16b
+    mov v25.16b, v7.16b
     DIFF_LUMA_EQ4_P2P1P0_1        v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
     DIFF_LUMA_EQ4_P2P1P0_2        v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
     ins v7.d[1], v25.d[1]
@@ -842,11 +842,11 @@
     ins v23.d[1], v24.d[1]
     and   v22.16b, v20.16b, v22.16b
     DIFF_LUMA_EQ4_MASK    v23, v4, v18, v17
-    mov.16b v29, v17
+    mov v29.16b, v17.16b
     DIFF_LUMA_EQ4_MASK    v7, v5, v22, v17
-    mov.16b v30, v17
+    mov v30.16b, v17.16b
     DIFF_LUMA_EQ4_MASK    v19, v6, v22, v17
-    mov.16b v31, v17
+    mov v31.16b, v17.16b
 
     STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 0
     STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 1
@@ -1013,7 +1013,7 @@
     DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
     DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21
 
-    mov.16b v6, v7
+    mov v6.16b, v7.16b
     bsl v6.16b, v20.16b, v1.16b
     bsl v7.16b, v21.16b, v2.16b
 
@@ -1059,7 +1059,7 @@
     DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
     DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21
 
-    mov.16b v6, v7
+    mov v6.16b, v7.16b
     bsl v6.16b, v20.16b, v1.16b
     bsl v7.16b, v21.16b, v2.16b
 
--- a/codec/common/arm64/expand_picture_aarch64_neon.S
+++ b/codec/common/arm64/expand_picture_aarch64_neon.S
@@ -46,8 +46,8 @@
     add x6, x4, #1
     ld1r {v0.16b}, [x7], x1
     ld1r {v2.16b}, [x4], x1
-    mov.16b v1, v0
-    mov.16b v3, v2
+    mov v1.16b, v0.16b
+    mov v3.16b, v2.16b
     st2 {v0.16b, v1.16b}, [x5]
     st2 {v2.16b, v3.16b}, [x6]
     sub x8, x8, #1
--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -696,12 +696,12 @@
     FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
     st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line
 
-    mov.16b v3, v5
-    mov.16b v5, v7
-    mov.16b v7, v2
-    mov.16b v2, v4
-    mov.16b v4, v6
-    mov.16b v6, v7
+    mov v3.16b, v5.16b
+    mov v5.16b, v7.16b
+    mov v7.16b, v2.16b
+    mov v2.16b, v4.16b
+    mov v4.16b, v6.16b
+    mov v6.16b, v7.16b
     sub x4, x4, #8
     cbnz x4, w16_xy_01_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
@@ -746,12 +746,12 @@
     FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v5, v6, v7, v2, v3, v4, v20, v0, v1
     st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line
 
-    mov.16b v5, v3
-    mov.16b v3, v7
-    mov.16b v7, v2
-    mov.16b v2, v6
-    mov.16b v6, v4
-    mov.16b v4, v7
+    mov v5.16b, v3.16b
+    mov v3.16b, v7.16b
+    mov v7.16b, v2.16b
+    mov v2.16b, v6.16b
+    mov v6.16b, v4.16b
+    mov v4.16b, v7.16b
     sub x4, x4, #4
     cbnz x4, w8_xy_01_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
@@ -799,14 +799,14 @@
     st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line
     mov v4.s[0], v3.s[1]
 
-    mov.8b v21, v6
-    mov.8b v6, v4
-    mov.8b v4, v2
-    mov.8b v2, v21
-    mov.8b v21, v3
-    mov.8b v3, v7
-    mov.8b v7, v5
-    mov.8b v5, v21
+    mov v21.8b, v6.8b
+    mov v6.8b, v4.8b
+    mov v4.8b, v2.8b
+    mov v2.8b, v21.8b
+    mov v21.8b, v3.8b
+    mov v3.8b, v7.8b
+    mov v7.8b, v5.8b
+    mov v5.8b, v21.8b
 
     sub x4, x4, #4
     cbnz x4, w4_xy_01_mc_luma_loop
@@ -885,12 +885,12 @@
     FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1
     st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line
 
-    mov.16b v3, v5
-    mov.16b v5, v7
-    mov.16b v7, v2
-    mov.16b v2, v4
-    mov.16b v4, v6
-    mov.16b v6, v7
+    mov v3.16b, v5.16b
+    mov v5.16b, v7.16b
+    mov v7.16b, v2.16b
+    mov v2.16b, v4.16b
+    mov v4.16b, v6.16b
+    mov v6.16b, v7.16b
     sub x4, x4, #8
     cbnz x4, w16_xy_03_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
@@ -935,12 +935,12 @@
     FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v5, v6, v7, v2, v3, v4, v20, v0, v1
     st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line
 
-    mov.16b v5, v3
-    mov.16b v3, v7
-    mov.16b v7, v2
-    mov.16b v2, v6
-    mov.16b v6, v4
-    mov.16b v4, v7
+    mov v5.16b, v3.16b
+    mov v3.16b, v7.16b
+    mov v7.16b, v2.16b
+    mov v2.16b, v6.16b
+    mov v6.16b, v4.16b
+    mov v4.16b, v7.16b
     sub x4, x4, #4
     cbnz x4, w8_xy_03_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
@@ -988,14 +988,14 @@
     st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line
     mov v4.s[0], v3.s[1]
 
-    mov.8b v21, v6
-    mov.8b v6, v4
-    mov.8b v4, v2
-    mov.8b v2, v21
-    mov.8b v21, v3
-    mov.8b v3, v7
-    mov.8b v7, v5
-    mov.8b v5, v21
+    mov v21.8b, v6.8b
+    mov v6.8b, v4.8b
+    mov v4.8b, v2.8b
+    mov v2.8b, v21.8b
+    mov v21.8b, v3.8b
+    mov v3.8b, v7.8b
+    mov v7.8b, v5.8b
+    mov v5.8b, v21.8b
 
     sub x4, x4, #4
     cbnz x4, w4_xy_03_mc_luma_loop
@@ -1074,12 +1074,12 @@
     FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1
     st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line
 
-    mov.16b v3, v5
-    mov.16b v5, v7
-    mov.16b v7, v2
-    mov.16b v2, v4
-    mov.16b v4, v6
-    mov.16b v6, v7
+    mov v3.16b, v5.16b
+    mov v5.16b, v7.16b
+    mov v7.16b, v2.16b
+    mov v2.16b, v4.16b
+    mov v4.16b, v6.16b
+    mov v6.16b, v7.16b
     sub x4, x4, #8
     cbnz x4, w16_xy_02_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
@@ -1124,12 +1124,12 @@
     FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
     st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line
 
-    mov.16b v5, v3
-    mov.16b v3, v7
-    mov.16b v7, v2
-    mov.16b v2, v6
-    mov.16b v6, v4
-    mov.16b v4, v7
+    mov v5.16b, v3.16b
+    mov v3.16b, v7.16b
+    mov v7.16b, v2.16b
+    mov v2.16b, v6.16b
+    mov v6.16b, v4.16b
+    mov v4.16b, v7.16b
     sub x4, x4, #4
     cbnz x4, w8_xy_02_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
@@ -1177,14 +1177,14 @@
     st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line
     mov v4.s[0], v3.s[1]
 
-    mov.8b v21, v6
-    mov.8b v6, v4
-    mov.8b v4, v2
-    mov.8b v2, v21
-    mov.8b v21, v3
-    mov.8b v3, v7
-    mov.8b v7, v5
-    mov.8b v5, v21
+    mov v21.8b, v6.8b
+    mov v6.8b, v4.8b
+    mov v4.8b, v2.8b
+    mov v2.8b, v21.8b
+    mov v21.8b, v3.8b
+    mov v3.8b, v7.8b
+    mov v7.8b, v5.8b
+    mov v5.8b, v21.8b
 
     sub x4, x4, #4
     cbnz x4, w4_xy_02_mc_luma_loop
@@ -1326,26 +1326,26 @@
     FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 7 line
 
-    mov.16b v5, v11
-    mov.16b v11, v17
-    mov.16b v30, v2
-    mov.16b v2, v8
-    mov.16b v8, v14
-    mov.16b v14, v30
+    mov v5.16b, v11.16b
+    mov v11.16b, v17.16b
+    mov v30.16b, v2.16b
+    mov v2.16b, v8.16b
+    mov v8.16b, v14.16b
+    mov v14.16b, v30.16b
 
-    mov.16b v6, v12
-    mov.16b v12, v18
-    mov.16b v30, v3
-    mov.16b v3, v9
-    mov.16b v9, v15
-    mov.16b v15, v30
+    mov v6.16b, v12.16b
+    mov v12.16b, v18.16b
+    mov v30.16b, v3.16b
+    mov v3.16b, v9.16b
+    mov v9.16b, v15.16b
+    mov v15.16b, v30.16b
 
-    mov.16b v7, v13
-    mov.16b v13, v19
-    mov.16b v30, v4
-    mov.16b v4, v10
-    mov.16b v10, v16
-    mov.16b v16, v30
+    mov v7.16b, v13.16b
+    mov v13.16b, v19.16b
+    mov v30.16b, v4.16b
+    mov v4.16b, v10.16b
+    mov v10.16b, v16.16b
+    mov v16.16b, v30.16b
 
     sub x4, x4, #8
     cbnz x4, w16_hv_mc_luma_loop
@@ -1416,12 +1416,12 @@
     st1 {v26.8b}, [x2], x3 //write 8Byte : 3 line
 
 
-    mov.16b v5, v3
-    mov.16b v3, v7
-    mov.16b v30, v2
-    mov.16b v2, v6
-    mov.16b v6, v4
-    mov.16b v4, v30
+    mov v5.16b, v3.16b
+    mov v3.16b, v7.16b
+    mov v30.16b, v2.16b
+    mov v2.16b, v6.16b
+    mov v6.16b, v4.16b
+    mov v4.16b, v30.16b
 
     sub x4, x4, #4
     cbnz x4, w8_hv_mc_luma_loop
@@ -1487,12 +1487,12 @@
     st1 {v27.s}[0], [x2], x3 //write 4Byte : 2 line
     st1 {v27.s}[1], [x2], x3 //write 4Byte : 3 line
 
-    mov.16b v5, v3
-    mov.16b v3, v7
-    mov.16b v30, v2
-    mov.16b v2, v6
-    mov.16b v6, v4
-    mov.16b v4, v30
+    mov v5.16b, v3.16b
+    mov v3.16b, v7.16b
+    mov v30.16b, v2.16b
+    mov v2.16b, v6.16b
+    mov v6.16b, v4.16b
+    mov v4.16b, v30.16b
 
     sub x4, x4, #4
     cbnz x4, w4_hv_mc_luma_loop
@@ -1735,8 +1735,8 @@
     rshrn v17.8b, v16.8h, #6
     st1 {v17.8b}, [x2], x3
 
-    mov.16b v0, v18
-    mov.16b v1, v19
+    mov v0.16b, v18.16b
+    mov v1.16b, v19.16b
     sub x5, x5, #2
     cbnz x5, w8_mc_chroma_loop
 WELS_ASM_AARCH64_FUNC_END
@@ -1764,8 +1764,8 @@
     st1 {v17.s}[0], [x2], x3
     st1 {v17.s}[1], [x2], x3
 
-    mov.8b v0, v18
-    mov.8b v1, v19
+    mov v0.8b, v18.8b
+    mov v1.8b, v19.8b
     sub x5, x5, #2
     cbnz x5, w4_mc_chroma_loop
 WELS_ASM_AARCH64_FUNC_END
@@ -1985,26 +1985,26 @@
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 7 line
 
-    mov.16b v5, v11
-    mov.16b v11, v17
-    mov.16b v30, v2
-    mov.16b v2, v8
-    mov.16b v8, v14
-    mov.16b v14, v30
+    mov v5.16b, v11.16b
+    mov v11.16b, v17.16b
+    mov v30.16b, v2.16b
+    mov v2.16b, v8.16b
+    mov v8.16b, v14.16b
+    mov v14.16b, v30.16b
 
-    mov.16b v6, v12
-    mov.16b v12, v18
-    mov.16b v30, v3
-    mov.16b v3, v9
-    mov.16b v9, v15
-    mov.16b v15, v30
+    mov v6.16b, v12.16b
+    mov v12.16b, v18.16b
+    mov v30.16b, v3.16b
+    mov v3.16b, v9.16b
+    mov v9.16b, v15.16b
+    mov v15.16b, v30.16b
 
-    mov.16b v7, v13
-    mov.16b v13, v19
-    mov.16b v30, v4
-    mov.16b v4, v10
-    mov.16b v10, v16
-    mov.16b v16, v30
+    mov v7.16b, v13.16b
+    mov v13.16b, v19.16b
+    mov v30.16b, v4.16b
+    mov v4.16b, v10.16b
+    mov v10.16b, v16.16b
+    mov v16.16b, v30.16b
 
     sub x4, x4, #8
     cbnz x4, w17_hv_mc_luma_loop
@@ -2104,12 +2104,12 @@
     st1 {v26.b}[0], [x2], x3 //write 8th Byte : 3 line
 
 
-    mov.16b v5, v3
-    mov.16b v3, v7
-    mov.16b v30, v2
-    mov.16b v2, v6
-    mov.16b v6, v4
-    mov.16b v4, v30
+    mov v5.16b, v3.16b
+    mov v3.16b, v7.16b
+    mov v30.16b, v2.16b
+    mov v2.16b, v6.16b
+    mov v6.16b, v4.16b
+    mov v4.16b, v30.16b
 
     sub x4, x4, #4
     cbnz x4, w9_hv_mc_luma_loop
@@ -2200,12 +2200,12 @@
     FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1
     st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line
 
-    mov.16b v3, v5
-    mov.16b v5, v7
-    mov.16b v7, v2
-    mov.16b v2, v4
-    mov.16b v4, v6
-    mov.16b v6, v7
+    mov v3.16b, v5.16b
+    mov v5.16b, v7.16b
+    mov v7.16b, v2.16b
+    mov v2.16b, v4.16b
+    mov v4.16b, v6.16b
+    mov v6.16b, v7.16b
     sub x4, x4, #8
     cbnz x4, w17_v_mc_luma_loop
 
@@ -2255,12 +2255,12 @@
     FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
     st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line
 
-    mov.16b v5, v3
-    mov.16b v3, v7
-    mov.16b v7, v2
-    mov.16b v2, v6
-    mov.16b v6, v4
-    mov.16b v4, v7
+    mov v5.16b, v3.16b
+    mov v3.16b, v7.16b
+    mov v7.16b, v2.16b
+    mov v2.16b, v6.16b
+    mov v6.16b, v4.16b
+    mov v4.16b, v7.16b
     sub x4, x4, #4
     cbnz x4, w9_v_mc_luma_loop
 
--- a/codec/encoder/core/arm64/reconstruct_aarch64_neon.S
+++ b/codec/encoder/core/arm64/reconstruct_aarch64_neon.S
@@ -73,7 +73,7 @@
     cmgt    $4.8h, $0.8h, #0      // if true, location of coef == 11111111
     bif     $3.16b, $1.16b, $4.16b      // if (x<0) reserved part; else keep 0 untouched
     shl     $3.8h, $3.8h, #1
-    mov.16b $6, $1
+    mov     $6.16b, $1.16b
     sub     $1.8h, $1.8h, $3.8h      // if x > 0, -= 0; else x-= 2x
 //  }
 .endm
@@ -533,7 +533,7 @@
     ld1     {v2.8h}, [x1]
     ld1     {v0.8h, v1.8h}, [x0]
     ld1     {v3.8h}, [x2]
-    mov.16b v4, v2
+    mov     v4.16b, v2.16b
     NEWQUANT_COEF_EACH_16BITS   v0, v2, v3, v5, v6, v7
     st1     {v2.8h}, [x0], #16
     NEWQUANT_COEF_EACH_16BITS   v1, v4, v3, v5, v6, v7
@@ -545,7 +545,7 @@
     ld1     {v0.8h, v1.8h}, [x0]
     dup     v2.8h, w1      // even ff range [0, 768]
     dup     v3.8h, w2
-    mov.16b v4, v2
+    mov     v4.16b, v2.16b
     NEWQUANT_COEF_EACH_16BITS   v0, v2, v3, v5, v6, v7
     st1     {v2.8h}, [x0], #16
     NEWQUANT_COEF_EACH_16BITS   v1, v4, v3, v5, v6, v7
@@ -559,10 +559,10 @@
 
 .rept 4
     ld1     {v0.8h, v1.8h}, [x0], #32
-    mov.16b v4, v2
+    mov     v4.16b, v2.16b
     NEWQUANT_COEF_EACH_16BITS   v0, v4, v3, v5, v6, v7
     st1     {v4.8h}, [x1], #16
-    mov.16b v4, v2
+    mov     v4.16b, v2.16b
     NEWQUANT_COEF_EACH_16BITS   v1, v4, v3, v5, v6, v7
     st1     {v4.8h}, [x1], #16
 .endr
@@ -575,18 +575,18 @@
     mov     x1, x0
 
     ld1     {v0.8h, v1.8h}, [x0], #32
-    mov.16b v4, v2
+    mov     v4.16b, v2.16b
     NEWQUANT_COEF_EACH_16BITS_MAX   v0, v4, v3, v5, v6, v7, v16
     st1     {v4.8h}, [x1], #16
-    mov.16b v4, v2
+    mov     v4.16b, v2.16b
     NEWQUANT_COEF_EACH_16BITS_MAX   v1, v4, v3, v5, v6, v7, v17
     st1     {v4.8h}, [x1], #16   // then 1st 16 elem in v16  & v17
 
     ld1     {v0.8h, v1.8h}, [x0], #32
-    mov.16b v4, v2
+    mov     v4.16b, v2.16b
     NEWQUANT_COEF_EACH_16BITS_MAX   v0, v4, v3, v5, v6, v7, v18
     st1     {v4.8h}, [x1], #16
-    mov.16b v4, v2
+    mov     v4.16b, v2.16b
     NEWQUANT_COEF_EACH_16BITS_MAX   v1, v4, v3, v5, v6, v7, v19
     st1     {v4.8h}, [x1], #16   // then 2st 16 elem in v18 & v19
 
@@ -593,18 +593,18 @@
     SELECT_MAX_IN_ABS_COEF  v16, v17, v18, v19, h20, h21
 
     ld1     {v0.8h, v1.8h}, [x0], #32
-    mov.16b v4, v2
+    mov     v4.16b, v2.16b
     NEWQUANT_COEF_EACH_16BITS_MAX   v0, v4, v3, v5, v6, v7, v16
     st1     {v4.8h}, [x1], #16
-    mov.16b v4, v2
+    mov     v4.16b, v2.16b
     NEWQUANT_COEF_EACH_16BITS_MAX   v1, v4, v3, v5, v6, v7, v17
     st1     {v4.8h}, [x1], #16   // then 1st 16 elem in v16  & v17
 
     ld1     {v0.8h, v1.8h}, [x0], #32
-    mov.16b v4, v2
+    mov     v4.16b, v2.16b
     NEWQUANT_COEF_EACH_16BITS_MAX   v0, v4, v3, v5, v6, v7, v18
     st1     {v4.8h}, [x1], #16
-    mov.16b v4, v2
+    mov     v4.16b, v2.16b
     NEWQUANT_COEF_EACH_16BITS_MAX   v1, v4, v3, v5, v6, v7, v19
     st1     {v4.8h}, [x1], #16   // then 2st 16 elem in v18 & v19
 
--- a/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
+++ b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
@@ -288,7 +288,7 @@
 
 _hash_height_loop:
     mov x7, x1
-    mov.16b v2, v5 //mx_x_offset_x4
+    mov v2.16b, v5.16b //mx_x_offset_x4
 
 _hash_width_loop:
     ld1 {v0.d}[0], [x0], #8