ref: 3956bccc3de5255859033daa5aebfb3cbe70963f
parent: b66703133e92cde6b62e436be1ac8e022766d923
author: Martin Storsjö <[email protected]>
date: Wed Apr 23 07:25:40 EDT 2014
Use the correct, official syntax for aarch64 mov instructions

Previously this used unofficial, Apple-specific syntax (with fallback macros for GNU binutils), since Xcode 5.x didn't support the official syntax of these instructions. Since Xcode 6 has been out for quite a number of months already, it should be safe to require this (for building 64-bit binaries for iOS; armv7 builds can still be built with older Xcode versions). This clarifies the code by avoiding Apple-specific syntax in the assembler instructions.
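For illustration, a minimal before/after sketch of the syntax change (registers taken from the diff below; the old spelling relied on the fallback macros removed in arm_arch64_common_macro.S when assembling with GNU binutils):

    // Old: Apple-style spelling, emulated on GNU binutils via the removed mov.16b macro
    mov.16b v23, v21
    // New: official ARMv8-A syntax, accepted by both Xcode 6 clang and GNU binutils
    mov v23.16b, v21.16b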
--- a/codec/common/arm64/arm_arch64_common_macro.S
+++ b/codec/common/arm64/arm_arch64_common_macro.S
@@ -59,12 +59,4 @@
.endfunc
.endm
-.macro mov.16b arg0, arg1
- mov \arg0\().16b, \arg1\().16b
-.endm
-
-.macro mov.8b arg0, arg1
- mov \arg0\().8b, \arg1\().8b
-.endm
-
#endif
--- a/codec/common/arm64/deblocking_aarch64_neon.S
+++ b/codec/common/arm64/deblocking_aarch64_neon.S
@@ -162,7 +162,7 @@
.endm
.macro DIFF_LUMA_EQ4_MASK
- mov.16b $3, $2
+ mov $3.16b, $2.16b
bsl $3.16b, $0.16b, $1.16b
.endm
@@ -420,7 +420,7 @@
.endm
.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
- mov.16b \arg3, \arg2
+ mov \arg3\().16b, \arg2\().16b
bsl \arg3\().16b, \arg0\().16b, \arg1\().16b
.endm
@@ -652,10 +652,10 @@
and v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0
and v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2))
- mov.16b v23, v21
- mov.16b v24, v21
+ mov v23.16b, v21.16b
+ mov v24.16b, v21.16b
- mov.16b v25, v0
+ mov v25.16b, v0.16b
DIFF_LUMA_EQ4_P2P1P0_1 v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
DIFF_LUMA_EQ4_P2P1P0_2 v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
ins v0.d[1], v25.d[1]
@@ -669,9 +669,9 @@
st1 {v17.16b}, [x3], x1
- mov.16b v23, v22
- mov.16b v24, v22
- mov.16b v25, v7
+ mov v23.16b, v22.16b
+ mov v24.16b, v22.16b
+ mov v25.16b, v7.16b
DIFF_LUMA_EQ4_P2P1P0_1 v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
DIFF_LUMA_EQ4_P2P1P0_2 v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
ins v7.d[1], v25.d[1]
@@ -728,7 +728,7 @@
sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333
DIFF_LUMA_LT4_P1_Q1 v0, v1, v2, v3, v17, v18, v6, v7, v19, v20 //Use Tmp v23,v24
- mov.16b v25, v19
+ mov v25.16b, v19.16b
DIFF_LUMA_LT4_P1_Q1 v5, v4, v3, v2, v17, v18, v6, v7, v21, v22 //Use Tmp v23,v24
@@ -749,11 +749,11 @@
EXTRACT_DELTA_INTO_TWO_PART v19, v20
uqadd v2.16b, v2.16b, v20.16b
uqsub v2.16b, v2.16b, v19.16b
- mov.16b v26, v2
+ mov v26.16b, v2.16b
uqsub v3.16b, v3.16b, v20.16b
uqadd v3.16b, v3.16b, v19.16b
- mov.16b v27, v3
- mov.16b v28, v21
+ mov v27.16b, v3.16b
+ mov v28.16b, v21.16b
sub x0, x0, #2
add x2, x0, x1
@@ -816,10 +816,10 @@
and v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0
and v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2))
- mov.16b v23, v21
- mov.16b v24, v21
+ mov v23.16b, v21.16b
+ mov v24.16b, v21.16b
- mov.16b v25, v0
+ mov v25.16b, v0.16b
DIFF_LUMA_EQ4_P2P1P0_1 v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
DIFF_LUMA_EQ4_P2P1P0_2 v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
ins v0.d[1], v25.d[1]
@@ -826,16 +826,16 @@
ins v23.d[1], v24.d[1]
and v21.16b, v20.16b, v21.16b
DIFF_LUMA_EQ4_MASK v19, v1, v21, v17
- mov.16b v26, v17
+ mov v26.16b, v17.16b
DIFF_LUMA_EQ4_MASK v0, v2, v21, v17
- mov.16b v27, v17
+ mov v27.16b, v17.16b
DIFF_LUMA_EQ4_MASK v23, v3, v18, v17
- mov.16b v28, v17
+ mov v28.16b, v17.16b
- mov.16b v23, v22
- mov.16b v24, v22
- mov.16b v25, v7
+ mov v23.16b, v22.16b
+ mov v24.16b, v22.16b
+ mov v25.16b, v7.16b
DIFF_LUMA_EQ4_P2P1P0_1 v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
DIFF_LUMA_EQ4_P2P1P0_2 v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
ins v7.d[1], v25.d[1]
@@ -842,11 +842,11 @@
ins v23.d[1], v24.d[1]
and v22.16b, v20.16b, v22.16b
DIFF_LUMA_EQ4_MASK v23, v4, v18, v17
- mov.16b v29, v17
+ mov v29.16b, v17.16b
DIFF_LUMA_EQ4_MASK v7, v5, v22, v17
- mov.16b v30, v17
+ mov v30.16b, v17.16b
DIFF_LUMA_EQ4_MASK v19, v6, v22, v17
- mov.16b v31, v17
+ mov v31.16b, v17.16b
STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 0
STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 1
@@ -1013,7 +1013,7 @@
DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21
- mov.16b v6, v7
+ mov v6.16b, v7.16b
bsl v6.16b, v20.16b, v1.16b
bsl v7.16b, v21.16b, v2.16b
@@ -1059,7 +1059,7 @@
DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21
- mov.16b v6, v7
+ mov v6.16b, v7.16b
bsl v6.16b, v20.16b, v1.16b
bsl v7.16b, v21.16b, v2.16b
--- a/codec/common/arm64/expand_picture_aarch64_neon.S
+++ b/codec/common/arm64/expand_picture_aarch64_neon.S
@@ -46,8 +46,8 @@
add x6, x4, #1
ld1r {v0.16b}, [x7], x1
ld1r {v2.16b}, [x4], x1
- mov.16b v1, v0
- mov.16b v3, v2
+ mov v1.16b, v0.16b
+ mov v3.16b, v2.16b
st2 {v0.16b, v1.16b}, [x5]
st2 {v2.16b, v3.16b}, [x6]
sub x8, x8, #1
--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -696,12 +696,12 @@
FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line
- mov.16b v3, v5
- mov.16b v5, v7
- mov.16b v7, v2
- mov.16b v2, v4
- mov.16b v4, v6
- mov.16b v6, v7
+ mov v3.16b, v5.16b
+ mov v5.16b, v7.16b
+ mov v7.16b, v2.16b
+ mov v2.16b, v4.16b
+ mov v4.16b, v6.16b
+ mov v6.16b, v7.16b
sub x4, x4, #8
cbnz x4, w16_xy_01_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END
@@ -746,12 +746,12 @@
FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v5, v6, v7, v2, v3, v4, v20, v0, v1
st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line
- mov.16b v5, v3
- mov.16b v3, v7
- mov.16b v7, v2
- mov.16b v2, v6
- mov.16b v6, v4
- mov.16b v4, v7
+ mov v5.16b, v3.16b
+ mov v3.16b, v7.16b
+ mov v7.16b, v2.16b
+ mov v2.16b, v6.16b
+ mov v6.16b, v4.16b
+ mov v4.16b, v7.16b
sub x4, x4, #4
cbnz x4, w8_xy_01_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END
@@ -799,14 +799,14 @@
st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line
mov v4.s[0], v3.s[1]
- mov.8b v21, v6
- mov.8b v6, v4
- mov.8b v4, v2
- mov.8b v2, v21
- mov.8b v21, v3
- mov.8b v3, v7
- mov.8b v7, v5
- mov.8b v5, v21
+ mov v21.8b, v6.8b
+ mov v6.8b, v4.8b
+ mov v4.8b, v2.8b
+ mov v2.8b, v21.8b
+ mov v21.8b, v3.8b
+ mov v3.8b, v7.8b
+ mov v7.8b, v5.8b
+ mov v5.8b, v21.8b
sub x4, x4, #4
cbnz x4, w4_xy_01_mc_luma_loop
@@ -885,12 +885,12 @@
FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1
st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line
- mov.16b v3, v5
- mov.16b v5, v7
- mov.16b v7, v2
- mov.16b v2, v4
- mov.16b v4, v6
- mov.16b v6, v7
+ mov v3.16b, v5.16b
+ mov v5.16b, v7.16b
+ mov v7.16b, v2.16b
+ mov v2.16b, v4.16b
+ mov v4.16b, v6.16b
+ mov v6.16b, v7.16b
sub x4, x4, #8
cbnz x4, w16_xy_03_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END
@@ -935,12 +935,12 @@
FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v5, v6, v7, v2, v3, v4, v20, v0, v1
st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line
- mov.16b v5, v3
- mov.16b v3, v7
- mov.16b v7, v2
- mov.16b v2, v6
- mov.16b v6, v4
- mov.16b v4, v7
+ mov v5.16b, v3.16b
+ mov v3.16b, v7.16b
+ mov v7.16b, v2.16b
+ mov v2.16b, v6.16b
+ mov v6.16b, v4.16b
+ mov v4.16b, v7.16b
sub x4, x4, #4
cbnz x4, w8_xy_03_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END
@@ -988,14 +988,14 @@
st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line
mov v4.s[0], v3.s[1]
- mov.8b v21, v6
- mov.8b v6, v4
- mov.8b v4, v2
- mov.8b v2, v21
- mov.8b v21, v3
- mov.8b v3, v7
- mov.8b v7, v5
- mov.8b v5, v21
+ mov v21.8b, v6.8b
+ mov v6.8b, v4.8b
+ mov v4.8b, v2.8b
+ mov v2.8b, v21.8b
+ mov v21.8b, v3.8b
+ mov v3.8b, v7.8b
+ mov v7.8b, v5.8b
+ mov v5.8b, v21.8b
sub x4, x4, #4
cbnz x4, w4_xy_03_mc_luma_loop
@@ -1074,12 +1074,12 @@
FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1
st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line
- mov.16b v3, v5
- mov.16b v5, v7
- mov.16b v7, v2
- mov.16b v2, v4
- mov.16b v4, v6
- mov.16b v6, v7
+ mov v3.16b, v5.16b
+ mov v5.16b, v7.16b
+ mov v7.16b, v2.16b
+ mov v2.16b, v4.16b
+ mov v4.16b, v6.16b
+ mov v6.16b, v7.16b
sub x4, x4, #8
cbnz x4, w16_xy_02_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END
@@ -1124,12 +1124,12 @@
FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line
- mov.16b v5, v3
- mov.16b v3, v7
- mov.16b v7, v2
- mov.16b v2, v6
- mov.16b v6, v4
- mov.16b v4, v7
+ mov v5.16b, v3.16b
+ mov v3.16b, v7.16b
+ mov v7.16b, v2.16b
+ mov v2.16b, v6.16b
+ mov v6.16b, v4.16b
+ mov v4.16b, v7.16b
sub x4, x4, #4
cbnz x4, w8_xy_02_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END
@@ -1177,14 +1177,14 @@
st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line
mov v4.s[0], v3.s[1]
- mov.8b v21, v6
- mov.8b v6, v4
- mov.8b v4, v2
- mov.8b v2, v21
- mov.8b v21, v3
- mov.8b v3, v7
- mov.8b v7, v5
- mov.8b v5, v21
+ mov v21.8b, v6.8b
+ mov v6.8b, v4.8b
+ mov v4.8b, v2.8b
+ mov v2.8b, v21.8b
+ mov v21.8b, v3.8b
+ mov v3.8b, v7.8b
+ mov v7.8b, v5.8b
+ mov v5.8b, v21.8b
sub x4, x4, #4
cbnz x4, w4_xy_02_mc_luma_loop
@@ -1326,26 +1326,26 @@
FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 7 line
- mov.16b v5, v11
- mov.16b v11, v17
- mov.16b v30, v2
- mov.16b v2, v8
- mov.16b v8, v14
- mov.16b v14, v30
+ mov v5.16b, v11.16b
+ mov v11.16b, v17.16b
+ mov v30.16b, v2.16b
+ mov v2.16b, v8.16b
+ mov v8.16b, v14.16b
+ mov v14.16b, v30.16b
- mov.16b v6, v12
- mov.16b v12, v18
- mov.16b v30, v3
- mov.16b v3, v9
- mov.16b v9, v15
- mov.16b v15, v30
+ mov v6.16b, v12.16b
+ mov v12.16b, v18.16b
+ mov v30.16b, v3.16b
+ mov v3.16b, v9.16b
+ mov v9.16b, v15.16b
+ mov v15.16b, v30.16b
- mov.16b v7, v13
- mov.16b v13, v19
- mov.16b v30, v4
- mov.16b v4, v10
- mov.16b v10, v16
- mov.16b v16, v30
+ mov v7.16b, v13.16b
+ mov v13.16b, v19.16b
+ mov v30.16b, v4.16b
+ mov v4.16b, v10.16b
+ mov v10.16b, v16.16b
+ mov v16.16b, v30.16b
sub x4, x4, #8
cbnz x4, w16_hv_mc_luma_loop
@@ -1416,12 +1416,12 @@
st1 {v26.8b}, [x2], x3 //write 8Byte : 3 line
- mov.16b v5, v3
- mov.16b v3, v7
- mov.16b v30, v2
- mov.16b v2, v6
- mov.16b v6, v4
- mov.16b v4, v30
+ mov v5.16b, v3.16b
+ mov v3.16b, v7.16b
+ mov v30.16b, v2.16b
+ mov v2.16b, v6.16b
+ mov v6.16b, v4.16b
+ mov v4.16b, v30.16b
sub x4, x4, #4
cbnz x4, w8_hv_mc_luma_loop
@@ -1487,12 +1487,12 @@
st1 {v27.s}[0], [x2], x3 //write 4Byte : 2 line
st1 {v27.s}[1], [x2], x3 //write 4Byte : 3 line
- mov.16b v5, v3
- mov.16b v3, v7
- mov.16b v30, v2
- mov.16b v2, v6
- mov.16b v6, v4
- mov.16b v4, v30
+ mov v5.16b, v3.16b
+ mov v3.16b, v7.16b
+ mov v30.16b, v2.16b
+ mov v2.16b, v6.16b
+ mov v6.16b, v4.16b
+ mov v4.16b, v30.16b
sub x4, x4, #4
cbnz x4, w4_hv_mc_luma_loop
@@ -1735,8 +1735,8 @@
rshrn v17.8b, v16.8h, #6
st1 {v17.8b}, [x2], x3
- mov.16b v0, v18
- mov.16b v1, v19
+ mov v0.16b, v18.16b
+ mov v1.16b, v19.16b
sub x5, x5, #2
cbnz x5, w8_mc_chroma_loop
WELS_ASM_AARCH64_FUNC_END
@@ -1764,8 +1764,8 @@
st1 {v17.s}[0], [x2], x3
st1 {v17.s}[1], [x2], x3
- mov.8b v0, v18
- mov.8b v1, v19
+ mov v0.8b, v18.8b
+ mov v1.8b, v19.8b
sub x5, x5, #2
cbnz x5, w4_mc_chroma_loop
WELS_ASM_AARCH64_FUNC_END
@@ -1985,26 +1985,26 @@
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 7 line
- mov.16b v5, v11
- mov.16b v11, v17
- mov.16b v30, v2
- mov.16b v2, v8
- mov.16b v8, v14
- mov.16b v14, v30
+ mov v5.16b, v11.16b
+ mov v11.16b, v17.16b
+ mov v30.16b, v2.16b
+ mov v2.16b, v8.16b
+ mov v8.16b, v14.16b
+ mov v14.16b, v30.16b
- mov.16b v6, v12
- mov.16b v12, v18
- mov.16b v30, v3
- mov.16b v3, v9
- mov.16b v9, v15
- mov.16b v15, v30
+ mov v6.16b, v12.16b
+ mov v12.16b, v18.16b
+ mov v30.16b, v3.16b
+ mov v3.16b, v9.16b
+ mov v9.16b, v15.16b
+ mov v15.16b, v30.16b
- mov.16b v7, v13
- mov.16b v13, v19
- mov.16b v30, v4
- mov.16b v4, v10
- mov.16b v10, v16
- mov.16b v16, v30
+ mov v7.16b, v13.16b
+ mov v13.16b, v19.16b
+ mov v30.16b, v4.16b
+ mov v4.16b, v10.16b
+ mov v10.16b, v16.16b
+ mov v16.16b, v30.16b
sub x4, x4, #8
cbnz x4, w17_hv_mc_luma_loop
@@ -2104,12 +2104,12 @@
st1 {v26.b}[0], [x2], x3 //write 8th Byte : 3 line
- mov.16b v5, v3
- mov.16b v3, v7
- mov.16b v30, v2
- mov.16b v2, v6
- mov.16b v6, v4
- mov.16b v4, v30
+ mov v5.16b, v3.16b
+ mov v3.16b, v7.16b
+ mov v30.16b, v2.16b
+ mov v2.16b, v6.16b
+ mov v6.16b, v4.16b
+ mov v4.16b, v30.16b
sub x4, x4, #4
cbnz x4, w9_hv_mc_luma_loop
@@ -2200,12 +2200,12 @@
FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1
st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line
- mov.16b v3, v5
- mov.16b v5, v7
- mov.16b v7, v2
- mov.16b v2, v4
- mov.16b v4, v6
- mov.16b v6, v7
+ mov v3.16b, v5.16b
+ mov v5.16b, v7.16b
+ mov v7.16b, v2.16b
+ mov v2.16b, v4.16b
+ mov v4.16b, v6.16b
+ mov v6.16b, v7.16b
sub x4, x4, #8
cbnz x4, w17_v_mc_luma_loop
@@ -2255,12 +2255,12 @@
FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line
- mov.16b v5, v3
- mov.16b v3, v7
- mov.16b v7, v2
- mov.16b v2, v6
- mov.16b v6, v4
- mov.16b v4, v7
+ mov v5.16b, v3.16b
+ mov v3.16b, v7.16b
+ mov v7.16b, v2.16b
+ mov v2.16b, v6.16b
+ mov v6.16b, v4.16b
+ mov v4.16b, v7.16b
sub x4, x4, #4
cbnz x4, w9_v_mc_luma_loop
--- a/codec/encoder/core/arm64/reconstruct_aarch64_neon.S
+++ b/codec/encoder/core/arm64/reconstruct_aarch64_neon.S
@@ -73,7 +73,7 @@
cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
bif $3.16b, $1.16b, $4.16b // if (x<0) reserved part; else keep 0 untouched
shl $3.8h, $3.8h, #1
- mov.16b $6, $1
+ mov $6.16b, $1.16b
sub $1.8h, $1.8h, $3.8h // if x > 0, -= 0; else x-= 2x
// }
.endm
@@ -533,7 +533,7 @@
ld1 {v2.8h}, [x1]
ld1 {v0.8h, v1.8h}, [x0]
ld1 {v3.8h}, [x2]
- mov.16b v4, v2
+ mov v4.16b, v2.16b
NEWQUANT_COEF_EACH_16BITS v0, v2, v3, v5, v6, v7
st1 {v2.8h}, [x0], #16
NEWQUANT_COEF_EACH_16BITS v1, v4, v3, v5, v6, v7
@@ -545,7 +545,7 @@
ld1 {v0.8h, v1.8h}, [x0]
dup v2.8h, w1 // even ff range [0, 768]
dup v3.8h, w2
- mov.16b v4, v2
+ mov v4.16b, v2.16b
NEWQUANT_COEF_EACH_16BITS v0, v2, v3, v5, v6, v7
st1 {v2.8h}, [x0], #16
NEWQUANT_COEF_EACH_16BITS v1, v4, v3, v5, v6, v7
@@ -559,10 +559,10 @@
.rept 4
ld1 {v0.8h, v1.8h}, [x0], #32
- mov.16b v4, v2
+ mov v4.16b, v2.16b
NEWQUANT_COEF_EACH_16BITS v0, v4, v3, v5, v6, v7
st1 {v4.8h}, [x1], #16
- mov.16b v4, v2
+ mov v4.16b, v2.16b
NEWQUANT_COEF_EACH_16BITS v1, v4, v3, v5, v6, v7
st1 {v4.8h}, [x1], #16
.endr
@@ -575,18 +575,18 @@
mov x1, x0
ld1 {v0.8h, v1.8h}, [x0], #32
- mov.16b v4, v2
+ mov v4.16b, v2.16b
NEWQUANT_COEF_EACH_16BITS_MAX v0, v4, v3, v5, v6, v7, v16
st1 {v4.8h}, [x1], #16
- mov.16b v4, v2
+ mov v4.16b, v2.16b
NEWQUANT_COEF_EACH_16BITS_MAX v1, v4, v3, v5, v6, v7, v17
st1 {v4.8h}, [x1], #16 // then 1st 16 elem in v16 & v17
ld1 {v0.8h, v1.8h}, [x0], #32
- mov.16b v4, v2
+ mov v4.16b, v2.16b
NEWQUANT_COEF_EACH_16BITS_MAX v0, v4, v3, v5, v6, v7, v18
st1 {v4.8h}, [x1], #16
- mov.16b v4, v2
+ mov v4.16b, v2.16b
NEWQUANT_COEF_EACH_16BITS_MAX v1, v4, v3, v5, v6, v7, v19
st1 {v4.8h}, [x1], #16 // then 2st 16 elem in v18 & v19
@@ -593,18 +593,18 @@
SELECT_MAX_IN_ABS_COEF v16, v17, v18, v19, h20, h21
ld1 {v0.8h, v1.8h}, [x0], #32
- mov.16b v4, v2
+ mov v4.16b, v2.16b
NEWQUANT_COEF_EACH_16BITS_MAX v0, v4, v3, v5, v6, v7, v16
st1 {v4.8h}, [x1], #16
- mov.16b v4, v2
+ mov v4.16b, v2.16b
NEWQUANT_COEF_EACH_16BITS_MAX v1, v4, v3, v5, v6, v7, v17
st1 {v4.8h}, [x1], #16 // then 1st 16 elem in v16 & v17
ld1 {v0.8h, v1.8h}, [x0], #32
- mov.16b v4, v2
+ mov v4.16b, v2.16b
NEWQUANT_COEF_EACH_16BITS_MAX v0, v4, v3, v5, v6, v7, v18
st1 {v4.8h}, [x1], #16
- mov.16b v4, v2
+ mov v4.16b, v2.16b
NEWQUANT_COEF_EACH_16BITS_MAX v1, v4, v3, v5, v6, v7, v19
st1 {v4.8h}, [x1], #16 // then 2st 16 elem in v18 & v19
--- a/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
+++ b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
@@ -288,7 +288,7 @@
_hash_height_loop:
mov x7, x1
- mov.16b v2, v5 //mx_x_offset_x4
+ mov v2.16b, v5.16b //mx_x_offset_x4
_hash_width_loop:
ld1 {v0.d}[0], [x0], #8