ref: d5c71dbe2f5d5aaffcd12db70d81fb19efa73ace
parent: 019fb9e20e4cb567d8739a36cb003181c3b0c833
author: Martin Storsjö <[email protected]>
date: Wed Apr 23 07:25:40 EDT 2014
Use the correct syntax for the aarch64 ext instructions. Since Xcode 5.1, the Apple tools actually support using the official, correct syntax for the ext instructions. This syntax is already used in a number of places - use it consistently, and get rid of the compatibility hacks.
--- a/codec/common/arm64/arm_arch64_common_macro.S
+++ b/codec/common/arm64/arm_arch64_common_macro.S
@@ -67,11 +67,4 @@
mov \arg0\().8b, \arg1\().8b
.endm
-.macro ext.16b arg0, arg1, arg2, arg3
- ext \arg0\().16b, \arg1\().16b, \arg2\().16b, \arg3
-.endm
-
-.macro ext.8b arg0, arg1, arg2, arg3
- ext \arg0\().8b, \arg1\().8b, \arg2\().8b, \arg3
-.endm
#endif
--- a/codec/common/arm64/deblocking_aarch64_neon.S
+++ b/codec/common/arm64/deblocking_aarch64_neon.S
@@ -212,7 +212,7 @@
ld1 {v1.s} [3], [x6]
bs_nzc_check_jump0:
- ext.16b v1, v1, v0, #12
+ ext v1.16b, v1.16b, v0.16b, #12
add $3.16b, v0.16b, v1.16b
// Arrange the input data --- LEFT
@@ -233,7 +233,7 @@
zip1 v0.16b, v0.16b, v2.16b
ins v2.d[0], v0.d[1]
zip1 v0.16b, v0.16b, v2.16b
- ext.16b v1, v1, v0, #12
+ ext v1.16b, v1.16b, v0.16b, #12
add $4.16b, v0.16b, v1.16b
.endm
@@ -470,7 +470,7 @@
ld1 {v1.s} [3], [x6]
bs_nzc_check_jump0:
- ext.16b v1, v1, v0, #12
+ ext v1.16b, v1.16b, v0.16b, #12
add \arg3\().16b, v0.16b, v1.16b
// Arrange the input data --- LEFT
@@ -491,7 +491,7 @@
zip1 v0.16b, v0.16b, v2.16b
ins v2.d[0], v0.d[1]
zip1 v0.16b, v0.16b, v2.16b
- ext.16b v1, v1, v0, #12
+ ext v1.16b, v1.16b, v0.16b, #12
add \arg4\().16b, v0.16b, v1.16b
.endm
--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -197,8 +197,8 @@
.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
- ext.16b $3, $1, $1, #14 // X[0][1][2][3][4][5]O
- ext.16b $4, $3, $3, #8 // [3][4][5]OX[0][1][2]
+ ext $3.16b, $1.16b, $1.16b, #14 // X[0][1][2][3][4][5]O
+ ext $4.16b, $3.16b, $3.16b, #8 // [3][4][5]OX[0][1][2]
rev64 $4.8h, $4.8h // X[5][4][3][2][1][0]O
add $3.8h, $3.8h, $4.8h // each 16bits, *[50][41][32][23][14][05]*
smull $3.4s, $3.4h, $2.4h // 0+1*[50]-5*[41]+20[32]
@@ -1713,12 +1713,12 @@
WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon
ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D
ld1 {v0.16b}, [x0], x1 // src[x]
- ext.16b v1, v0, v0, #1 // src[x+1]
+ ext v1.16b, v0.16b, v0.16b, #1 // src[x+1]
w8_mc_chroma_loop:
ld1 {v2.16b}, [x0], x1 // src[x+stride]
- ext.16b v3, v2, v2, #1 // src[x+stride+1]
+ ext v3.16b, v2.16b, v2.16b, #1 // src[x+stride+1]
ld1 {v18.16b}, [x0], x1 // src[x+2*stride]
- ext.16b v19, v18, v18, #1 // src[x+2*stride+1]
+ ext v19.16b, v18.16b, v18.16b, #1 // src[x+2*stride+1]
umull v16.8h, v0.8b, v4.8b
umlal v16.8h, v1.8b, v5.8b
@@ -1744,12 +1744,12 @@
WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon
ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D
ld1 {v0.8b}, [x0], x1 // src[x]
- ext.8b v1, v0, v0, #1 // src[x+1]
+ ext v1.8b, v0.8b, v0.8b, #1 // src[x+1]
w4_mc_chroma_loop:
ld1 {v2.8b}, [x0], x1 // src[x+stride]
- ext.8b v3, v2, v2, #1 // src[x+stride+1]
+ ext v3.8b, v2.8b, v2.8b, #1 // src[x+stride+1]
ld1 {v18.8b}, [x0], x1 // src[x+2*stride]
- ext.8b v19, v18, v18, #1 // src[x+2*stride+1]
+ ext v19.8b, v18.8b, v18.8b, #1 // src[x+2*stride+1]
zip1 v0.4s, v0.4s, v2.4s
zip1 v1.4s, v1.4s, v3.4s
@@ -1792,7 +1792,7 @@
FILTER_6TAG_8BITS2 v2, v5, v6, v7, v16, v17, v20, v0, v1
st1 {v20.16b}, [x2], x5 //write 16Byte
- ext.8b v21, v3, v3, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
+ ext v21.8b, v3.8b, v3.8b, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
st1 {v21.b}[0], [x2], x3 //write 16th Byte
@@ -1820,7 +1820,7 @@
FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
st1 {v20.8b}, [x2], x5 //write 8Byte
- ext.8b v21, v3, v3, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
+ ext v21.8b, v3.8b, v3.8b, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
st1 {v21.b}[0], [x2], x3 //write 9th Byte