ref: ce740ee19e5b49afeae5c041d230563d5a869338
parent: 1ae709dde0a246b94af107091b2f5ecfb739c236
parent: cc8a2bd07eb88e8c93eae25692a477117d8779ec
author: zhilwang <[email protected]>
date: Wed Jul 9 05:04:22 EDT 2014
Merge pull request #1113 from mstorsjo/asm-indent: Fix indentation of macros in reconstruct_aarch64_neon.S
--- a/codec/encoder/core/arm64/reconstruct_aarch64_neon.S
+++ b/codec/encoder/core/arm64/reconstruct_aarch64_neon.S
@@ -270,18 +270,18 @@
uxtl2 $4.8h, $0.16b
add $3.8h, $3.8h, $1.8h
add $4.8h, $4.8h, $2.8h
- sqxtun $0.8b, $3.8h
- sqxtun2 $0.16b,$4.8h
+ sqxtun $0.8b, $3.8h
+ sqxtun2 $0.16b,$4.8h
// }
.endm
#else
.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2
// { // input: coef_0 (identical to \arg3\() \arg4\()), coef_1 (identical to \arg5\() \arg6\()), mask_q
-cmeq \arg0\().8h, \arg0\().8h, #0
-cmeq \arg1\().8h, \arg1\().8h, #0
-uzp1 \arg0\().16b, \arg0\().16b, \arg1\().16b
-ushr \arg0\().16b, \arg0\().16b, 7
-addv \arg2\(), \arg0\().16b
+ cmeq \arg0\().8h, \arg0\().8h, #0
+ cmeq \arg1\().8h, \arg1\().8h, #0
+ uzp1 \arg0\().16b, \arg0\().16b, \arg1\().16b
+ ushr \arg0\().16b, \arg0\().16b, 7
+ addv \arg2\(), \arg0\().16b
// }
.endm
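For reference, ZERO_COUNT_IN_2_QUARWORD counts how many of the 16 coefficients held in its two 8x16-bit inputs are zero: cmeq turns zero lanes into all-ones, uzp1 narrows the two vectors into one 16-byte vector, ushr #7 reduces each byte to 0 or 1, and addv sums the lanes. A minimal scalar sketch of the same result (count_zero_coeffs_16 is an illustrative name, not from the source):

    #include <stdint.h>

    /* Scalar sketch of ZERO_COUNT_IN_2_QUARWORD: count zero values among
     * 16 int16_t coefficients. */
    static int count_zero_coeffs_16(const int16_t coef[16]) {
        int zeros = 0;
        for (int i = 0; i < 16; i++)
            zeros += (coef[i] == 0);
        return zeros;
    }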
@@ -288,17 +288,17 @@
.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5
// if coef <= 0, - coef; else , coef;
// { // input: coef, ff (dst), mf
-eor \arg3\().16b, \arg3\().16b, \arg3\().16b // init 0 , and keep 0;
-saba \arg1\().8h, \arg0\().8h, \arg3\().8h // f + abs(coef - 0)
-smull \arg4\().4s, \arg1\().4h, \arg2\().4h
-smull2 \arg5\().4s, \arg1\().8h, \arg2\().8h
-shrn \arg1\().4h, \arg4\().4s, #16
-shrn2 \arg1\().8h, \arg5\().4s, #16
+ eor \arg3\().16b, \arg3\().16b, \arg3\().16b // init 0 , and keep 0;
+ saba \arg1\().8h, \arg0\().8h, \arg3\().8h // f + abs(coef - 0)
+ smull \arg4\().4s, \arg1\().4h, \arg2\().4h
+ smull2 \arg5\().4s, \arg1\().8h, \arg2\().8h
+ shrn \arg1\().4h, \arg4\().4s, #16
+ shrn2 \arg1\().8h, \arg5\().4s, #16
-cmgt \arg4\().8h, \arg0\().8h, #0 // if true, location of coef == 11111111
-bif \arg3\().16b, \arg1\().16b, \arg4\().16b // if (x<0) reserved part; else keep 0 untouched
-shl \arg3\().8h, \arg3\().8h, #1
-sub \arg1\().8h, \arg1\().8h, \arg3\().8h // if x > 0, -= 0; else x-= 2x
+ cmgt \arg4\().8h, \arg0\().8h, #0 // if true, location of coef == 11111111
+ bif \arg3\().16b, \arg1\().16b, \arg4\().16b // if (x<0) reserved part; else keep 0 untouched
+ shl \arg3\().8h, \arg3\().8h, #1
+ sub \arg1\().8h, \arg1\().8h, \arg3\().8h // if x > 0, -= 0; else x-= 2x
// }
.endm
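The quantization macros here all follow the same pattern: saba forms ff + |coef|, smull/shrn compute ((ff + |coef|) * mf) >> 16, and the cmgt/bif/shl/sub tail re-applies the sign of the original coefficient. A per-coefficient scalar sketch, assuming ff is the rounding offset and mf the multiplication factor (quantize_coef is an illustrative name):

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar sketch of NEWQUANT_COEF_EACH_16BITS for one lane:
     * q = ((ff + |coef|) * mf) >> 16, then restore the sign of coef. */
    static int16_t quantize_coef(int16_t coef, int16_t ff, int16_t mf) {
        int32_t q = ((int32_t)(ff + abs(coef)) * mf) >> 16;
        return (int16_t)(coef > 0 ? q : -q);
    }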
@@ -305,18 +305,18 @@
.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6
// if coef <= 0, - coef; else , coef;
// { // input: coef, ff (dst), mf
-eor \arg3\().16b, \arg3\().16b, \arg3\().16b // init 0 , and keep 0;
-saba \arg1\().8h, \arg0\().8h, \arg3\().8h // f + abs(coef - 0)
-smull \arg4\().4s, \arg1\().4h, \arg2\().4h
-smull2 \arg5\().4s, \arg1\().8h, \arg2\().8h
-shrn \arg1\().4h, \arg4\().4s, #16
-shrn2 \arg1\().8h, \arg5\().4s, #16
+ eor \arg3\().16b, \arg3\().16b, \arg3\().16b // init 0 , and keep 0;
+ saba \arg1\().8h, \arg0\().8h, \arg3\().8h // f + abs(coef - 0)
+ smull \arg4\().4s, \arg1\().4h, \arg2\().4h
+ smull2 \arg5\().4s, \arg1\().8h, \arg2\().8h
+ shrn \arg1\().4h, \arg4\().4s, #16
+ shrn2 \arg1\().8h, \arg5\().4s, #16
-cmgt \arg4\().8h, \arg0\().8h, #0 // if true, location of coef == 11111111
-bif \arg3\().16b, \arg1\().16b, \arg4\().16b // if (x<0) reserved part; else keep 0 untouched
-shl \arg3\().8h, \arg3\().8h, #1
-mov \arg6\().16b, \arg1\().16b
-sub \arg1\().8h, \arg1\().8h, \arg3\().8h // if x > 0, -= 0; else x-= 2x
+ cmgt \arg4\().8h, \arg0\().8h, #0 // if true, location of coef == 11111111
+ bif \arg3\().16b, \arg1\().16b, \arg4\().16b // if (x<0) reserved part; else keep 0 untouched
+ shl \arg3\().8h, \arg3\().8h, #1
+ mov \arg6\().16b, \arg1\().16b
+ sub \arg1\().8h, \arg1\().8h, \arg3\().8h // if x > 0, -= 0; else x-= 2x
// }
.endm
@@ -323,32 +323,32 @@
.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
// if coef <= 0, - coef; else , coef;
// { // input: coef, ff (dst), mf
-saba \arg1\().8h, \arg0\().8h, \arg3\().8h // f + abs(coef - 0)
-smull \arg4\().4s, \arg1\().4h, \arg2\().4h
-shrn \arg1\().4h, \arg4\().4s, #16
+ saba \arg1\().8h, \arg0\().8h, \arg3\().8h // f + abs(coef - 0)
+ smull \arg4\().4s, \arg1\().4h, \arg2\().4h
+ shrn \arg1\().4h, \arg4\().4s, #16
-cmgt \arg4\().8h, \arg0\().8h, #0 // if true, location of coef == 11111111
-bif \arg3\().16b, \arg1\().16b, \arg4\().16b // if (x<0) reserved part; else keep 0 untouched
-shl \arg3\().8h, \arg3\().8h, #1
-sub \arg1\().8h, \arg1\().8h, \arg3\().8h // if x > 0, -= 0; else x-= 2x
+ cmgt \arg4\().8h, \arg0\().8h, #0 // if true, location of coef == 11111111
+ bif \arg3\().16b, \arg1\().16b, \arg4\().16b // if (x<0) reserved part; else keep 0 untouched
+ shl \arg3\().8h, \arg3\().8h, #1
+ sub \arg1\().8h, \arg1\().8h, \arg3\().8h // if x > 0, -= 0; else x-= 2x
// }
.endm
.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4, arg5
// { // input: coef_0, coef_1, coef_2, coef_3, max_q (identy to follow two)
-umax \arg0\().8h, \arg0\().8h, \arg1\().8h
-umaxv \arg4\(), \arg0\().8h
-umax \arg2\().8h, \arg2\().8h, \arg3\().8h
-umaxv \arg5\(), \arg2\().8h
+ umax \arg0\().8h, \arg0\().8h, \arg1\().8h
+ umaxv \arg4\(), \arg0\().8h
+ umax \arg2\().8h, \arg2\().8h, \arg3\().8h
+ umaxv \arg5\(), \arg2\().8h
// }
.endm
.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
// { // input: src_d[0][16][32][48], dst_d[0][16][32][48], working
-sshr \arg1\().2d, \arg0\().2d, #32
-add \arg2\().4h, \arg0\().4h, \arg1\().4h // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
-sub \arg1\().4h, \arg0\().4h, \arg1\().4h // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
-zip1 \arg1\().4h, \arg2\().4h, \arg1\().4h
+ sshr \arg1\().2d, \arg0\().2d, #32
+ add \arg2\().4h, \arg0\().4h, \arg1\().4h // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
+ sub \arg1\().4h, \arg0\().4h, \arg1\().4h // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
+ zip1 \arg1\().4h, \arg2\().4h, \arg1\().4h
// }
.endm
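HDM_QUANT_2x2_TOTAL_16BITS is one butterfly stage of the 2x2 Hadamard over the four DC terms rs[0], rs[16], rs[32], rs[48] named in the comments, leaving sums and differences interleaved by the final zip1. A scalar sketch of that stage (hadamard_2x2_stage and its parameter names are illustrative):

    #include <stdint.h>

    /* Scalar sketch of HDM_QUANT_2x2_TOTAL_16BITS: sum/difference butterfly
     * over the four DC coefficients, in the interleaved order zip1 produces. */
    static void hadamard_2x2_stage(int16_t r0, int16_t r16, int16_t r32, int16_t r48,
                                   int16_t dst[4]) {
        dst[0] = r0  + r32;
        dst[1] = r0  - r32;
        dst[2] = r16 + r48;
        dst[3] = r16 - r48;
    }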
@@ -355,119 +355,119 @@
.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
// { // input: coef, dst_d, working_d (all 0x01)
-cmeq \arg0\().4h, \arg0\().4h, #0
-and \arg0\().8b, \arg0\().8b, \arg2\().8b
-addv \arg1\(), \arg0\().4h
+ cmeq \arg0\().4h, \arg0\().4h, #0
+ and \arg0\().8b, \arg0\().8b, \arg2\().8b
+ addv \arg1\(), \arg0\().4h
// }
.endm
.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
// { // input: each src_d[0]~[3](dst), working_q0, working_q1
-uzp2 \arg1\().4s, \arg0\().4s, \arg0\().4s
-uzp1 \arg0\().4s, \arg0\().4s, \arg0\().4s
-add \arg2\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
-sub \arg1\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
-zip1 \arg2\().8h, \arg2\().8h, \arg1\().8h // [0] = rs[0] + rs[2]; [1] = rs[0] - rs[2]; ... [2]; [3]
+ uzp2 \arg1\().4s, \arg0\().4s, \arg0\().4s
+ uzp1 \arg0\().4s, \arg0\().4s, \arg0\().4s
+ add \arg2\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
+ sub \arg1\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
+ zip1 \arg2\().8h, \arg2\().8h, \arg1\().8h // [0] = rs[0] + rs[2]; [1] = rs[0] - rs[2]; ... [2]; [3]
-uzp2 \arg1\().4s, \arg2\().4s, \arg2\().4s
-uzp1 \arg0\().4s, \arg2\().4s, \arg2\().4s
-add \arg2\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
-sub \arg1\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
-rev32 \arg1\().4h, \arg1\().4h // [0] = rs[1] - rs[3];[1] = rs[0] - rs[2];[2] = rs[5] - rs[7];[3] = rs[4] - rs[6];
-zip1 \arg0\().4s, \arg2\().4s, \arg1\().4s
-// }
+ uzp2 \arg1\().4s, \arg2\().4s, \arg2\().4s
+ uzp1 \arg0\().4s, \arg2\().4s, \arg2\().4s
+ add \arg2\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
+ sub \arg1\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
+ rev32 \arg1\().4h, \arg1\().4h // [0] = rs[1] - rs[3];[1] = rs[0] - rs[2];[2] = rs[5] - rs[7];[3] = rs[4] - rs[6];
+ zip1 \arg0\().4s, \arg2\().4s, \arg1\().4s
+ // }
.endm
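Despite the lane shuffling, IHDM_4x4_TOTAL_16BITS amounts to a 2x2 inverse Hadamard applied to each group of four DC values in its input; the uzp/zip/rev32 steps only handle the interleaving. A per-group scalar sketch (ihdm_2x2 is an illustrative name):

    #include <stdint.h>

    /* Scalar sketch of the per-group effect of IHDM_4x4_TOTAL_16BITS:
     * 2x2 (inverse) Hadamard on four DC values s[0..3]. */
    static void ihdm_2x2(const int16_t s[4], int16_t out[4]) {
        out[0] = s[0] + s[1] + s[2] + s[3];
        out[1] = s[0] + s[1] - s[2] - s[3];
        out[2] = s[0] - s[1] - s[2] + s[3];
        out[3] = s[0] - s[1] + s[2] - s[3];
    }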
.macro MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2 arg0, arg1, arg2, arg3
// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
-uzp1 \arg2\().4s, \arg0\().4s, \arg1\().4s //[0 1 4 5]+[8 9 12 13]
-uzp2 \arg3\().4s, \arg0\().4s, \arg1\().4s //[2 3 6 7]+[10 11 14 15]
+ uzp1 \arg2\().4s, \arg0\().4s, \arg1\().4s //[0 1 4 5]+[8 9 12 13]
+ uzp2 \arg3\().4s, \arg0\().4s, \arg1\().4s //[2 3 6 7]+[10 11 14 15]
-uzp1 \arg0\().8h, \arg2\().8h, \arg3\().8h //[0 4 8 12]+[2 6 10 14]
-uzp2 \arg2\().8h, \arg2\().8h, \arg3\().8h //[1 5 9 13]+[3 7 11 15]
-zip2 \arg1\().2d, \arg0\().2d, \arg2\().2d //[2 6 10 14]+[3 7 11 15]
-zip1 \arg0\().2d, \arg0\().2d, \arg2\().2d //[0 4 8 12]+[1 5 9 13]
+ uzp1 \arg0\().8h, \arg2\().8h, \arg3\().8h //[0 4 8 12]+[2 6 10 14]
+ uzp2 \arg2\().8h, \arg2\().8h, \arg3\().8h //[1 5 9 13]+[3 7 11 15]
+ zip2 \arg1\().2d, \arg0\().2d, \arg2\().2d //[2 6 10 14]+[3 7 11 15]
+ zip1 \arg0\().2d, \arg0\().2d, \arg2\().2d //[0 4 8 12]+[1 5 9 13]
// }
.endm
.macro MATRIX_TRANSFORM_EACH_16BITS_OUT4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input & output: src_d[0]~[3];[0 4 8 12],[1 5 9 13],[2 6 10 14],[3 7 11 15]
-trn1 \arg4\().8h, v0.8h, v1.8h
-trn2 \arg5\().8h, v0.8h, v1.8h
-trn1 \arg6\().8h, v2.8h, v3.8h
-trn2 \arg7\().8h, v2.8h, v3.8h
+ trn1 \arg4\().8h, v0.8h, v1.8h
+ trn2 \arg5\().8h, v0.8h, v1.8h
+ trn1 \arg6\().8h, v2.8h, v3.8h
+ trn2 \arg7\().8h, v2.8h, v3.8h
-trn1 \arg0\().4s, v4.4s, v6.4s
-trn2 \arg2\().4s, v4.4s, v6.4s
-trn1 \arg1\().4s, v5.4s, v7.4s
-trn2 \arg3\().4s, v5.4s, v7.4s
+ trn1 \arg0\().4s, v4.4s, v6.4s
+ trn2 \arg2\().4s, v4.4s, v6.4s
+ trn1 \arg1\().4s, v5.4s, v7.4s
+ trn2 \arg3\().4s, v5.4s, v7.4s
// }
.endm
.macro MATRIX_TRANSFORM_EACH_16BITS_4x4_OUT2 arg0, arg1, arg2, arg3
// { // input & output: src_d[0]~[3];[0 1 2 3],[4 5 6 7],[8 9 10 11],[12 13 14 15]
-mov \arg0\().d[1], \arg1\().d[0] //[0 1 2 3]+[4 5 6 7]
-mov \arg2\().d[1], \arg3\().d[0] //[8 9 10 11]+[12 13 14 15]
-uzp1 \arg1\().4s, \arg0\().4s, \arg2\().4s //[0 1 4 5]+[8 9 12 13]
-uzp2 \arg3\().4s, \arg0\().4s, \arg2\().4s //[2 3 6 7]+[10 11 14 15]
+ mov \arg0\().d[1], \arg1\().d[0] //[0 1 2 3]+[4 5 6 7]
+ mov \arg2\().d[1], \arg3\().d[0] //[8 9 10 11]+[12 13 14 15]
+ uzp1 \arg1\().4s, \arg0\().4s, \arg2\().4s //[0 1 4 5]+[8 9 12 13]
+ uzp2 \arg3\().4s, \arg0\().4s, \arg2\().4s //[2 3 6 7]+[10 11 14 15]
-uzp1 \arg0\().8h, \arg1\().8h, \arg3\().8h //[0 4 8 12]+[2 6 10 14]
-uzp2 \arg2\().8h, \arg1\().8h, \arg3\().8h //[1 5 9 13]+[3 7 11 15]
-zip2 \arg1\().2d, \arg0\().2d, \arg2\().2d //[2 6 10 14]+[3 7 11 15]
-zip1 \arg0\().2d, \arg0\().2d, \arg2\().2d //[0 4 8 12]+[1 5 9 13]
+ uzp1 \arg0\().8h, \arg1\().8h, \arg3\().8h //[0 4 8 12]+[2 6 10 14]
+ uzp2 \arg2\().8h, \arg1\().8h, \arg3\().8h //[1 5 9 13]+[3 7 11 15]
+ zip2 \arg1\().2d, \arg0\().2d, \arg2\().2d //[2 6 10 14]+[3 7 11 15]
+ zip1 \arg0\().2d, \arg0\().2d, \arg2\().2d //[0 4 8 12]+[1 5 9 13]
// }
.endm
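The three MATRIX_TRANSFORM_* macros above are 4x4 transposes of 16-bit elements built from uzp/zip/trn permutes, differing only in how the rows are packed into registers. The scalar meaning is simply a matrix transpose (transpose_4x4_s16 is an illustrative name):

    #include <stdint.h>

    /* Scalar sketch of the transpose performed by the MATRIX_TRANSFORM_* macros. */
    static void transpose_4x4_s16(const int16_t in[4][4], int16_t out[4][4]) {
        for (int r = 0; r < 4; r++)
            for (int c = 0; c < 4; c++)
                out[c][r] = in[r][c];
    }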
.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5
-ld1 {\arg0\().s}[0], [\arg2\()], \arg3\()
-ld1 {\arg0\().s}[1], [\arg2\()], \arg3\()
-ld1 {\arg0\().s}[2], [\arg2\()], \arg3\()
-ld1 {\arg0\().s}[3], [\arg2\()]
+ ld1 {\arg0\().s}[0], [\arg2\()], \arg3\()
+ ld1 {\arg0\().s}[1], [\arg2\()], \arg3\()
+ ld1 {\arg0\().s}[2], [\arg2\()], \arg3\()
+ ld1 {\arg0\().s}[3], [\arg2\()]
-ld1 {\arg1\().s}[0], [\arg4\()], \arg5\()
-ld1 {\arg1\().s}[1], [\arg4\()], \arg5\()
-ld1 {\arg1\().s}[2], [\arg4\()], \arg5\()
-ld1 {\arg1\().s}[3], [\arg4\()]
+ ld1 {\arg1\().s}[0], [\arg4\()], \arg5\()
+ ld1 {\arg1\().s}[1], [\arg4\()], \arg5\()
+ ld1 {\arg1\().s}[2], [\arg4\()], \arg5\()
+ ld1 {\arg1\().s}[3], [\arg4\()]
.endm
.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_d[0]~[3], working: [4]~[7]
-add \arg4\().8h, \arg0\().8h, \arg3\().8h //int16 s[0] = data[i] + data[i3];
-sub \arg7\().8h, \arg0\().8h, \arg3\().8h //int16 s[3] = data[i] - data[i3];
-add \arg5\().8h, \arg1\().8h, \arg2\().8h //int16 s[1] = data[i1] + data[i2];
-sub \arg6\().8h, \arg1\().8h, \arg2\().8h //int16 s[2] = data[i1] - data[i2];
+ add \arg4\().8h, \arg0\().8h, \arg3\().8h //int16 s[0] = data[i] + data[i3];
+ sub \arg7\().8h, \arg0\().8h, \arg3\().8h //int16 s[3] = data[i] - data[i3];
+ add \arg5\().8h, \arg1\().8h, \arg2\().8h //int16 s[1] = data[i1] + data[i2];
+ sub \arg6\().8h, \arg1\().8h, \arg2\().8h //int16 s[2] = data[i1] - data[i2];
-add \arg0\().8h, \arg4\().8h, \arg5\().8h //int16 dct[i ] = s[0] + s[1];
-sub \arg2\().8h, \arg4\().8h, \arg5\().8h //int16 dct[i2] = s[0] - s[1];
-shl \arg1\().8h, \arg7\().8h, #1
-shl \arg3\().8h, \arg6\().8h, #1
-add \arg1\().8h, \arg1\().8h, \arg6\().8h //int16 dct[i1] = (s[3] << 1) + s[2];
-sub \arg3\().8h, \arg7\().8h, \arg3\().8h //int16 dct[i3] = s[3] - (s[2] << 1);
+ add \arg0\().8h, \arg4\().8h, \arg5\().8h //int16 dct[i ] = s[0] + s[1];
+ sub \arg2\().8h, \arg4\().8h, \arg5\().8h //int16 dct[i2] = s[0] - s[1];
+ shl \arg1\().8h, \arg7\().8h, #1
+ shl \arg3\().8h, \arg6\().8h, #1
+ add \arg1\().8h, \arg1\().8h, \arg6\().8h //int16 dct[i1] = (s[3] << 1) + s[2];
+ sub \arg3\().8h, \arg7\().8h, \arg3\().8h //int16 dct[i3] = s[3] - (s[2] << 1);
// }
.endm
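DCT_ROW_TRANSFORM_TOTAL_16BITS is the 4-point forward transform butterfly spelled out in its comments, computed lane-wise so that eight independent transforms run in parallel across the 16-bit lanes. A single-transform scalar sketch (dct4_row is an illustrative name):

    #include <stdint.h>

    /* Scalar sketch of one 4-point transform from DCT_ROW_TRANSFORM_TOTAL_16BITS. */
    static void dct4_row(const int16_t d[4], int16_t dct[4]) {
        int16_t s0 = d[0] + d[3], s3 = d[0] - d[3];
        int16_t s1 = d[1] + d[2], s2 = d[1] - d[2];
        dct[0] = (int16_t)(s0 + s1);
        dct[1] = (int16_t)((s3 << 1) + s2);
        dct[2] = (int16_t)(s0 - s1);
        dct[3] = (int16_t)(s3 - (s2 << 1));
    }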
.macro LOAD_8x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: \arg0\()~\arg3\(), src1*, src2*; untouched r2:src1_stride &r4:src2_stride
-ld1 {\arg0\().d}[0], [\arg8\()], x2
-ld1 {\arg1\().d}[0], [\arg8\()], x2
-ld1 {\arg2\().d}[0], [\arg8\()], x2
-ld1 {\arg3\().d}[0], [\arg8\()], x2
+ ld1 {\arg0\().d}[0], [\arg8\()], x2
+ ld1 {\arg1\().d}[0], [\arg8\()], x2
+ ld1 {\arg2\().d}[0], [\arg8\()], x2
+ ld1 {\arg3\().d}[0], [\arg8\()], x2
-ld1 {\arg4\().d}[0], [\arg9\()], x4
-ld1 {\arg5\().d}[0], [\arg9\()], x4
-ld1 {\arg6\().d}[0], [\arg9\()], x4
-ld1 {\arg7\().d}[0], [\arg9\()], x4
+ ld1 {\arg4\().d}[0], [\arg9\()], x4
+ ld1 {\arg5\().d}[0], [\arg9\()], x4
+ ld1 {\arg6\().d}[0], [\arg9\()], x4
+ ld1 {\arg7\().d}[0], [\arg9\()], x4
// }
.endm
.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_d[0]~[3], output: e_d[0]~[3];
-add \arg4\().8h, \arg0\().8h, \arg2\().8h //int16 e[i][0] = src[0] + src[2];
-sub \arg5\().8h, \arg0\().8h, \arg2\().8h //int16 e[i][1] = src[0] - src[2];
-sshr \arg6\().8h, \arg1\().8h, #1
-sshr \arg7\().8h, \arg3\().8h, #1
-sub \arg6\().8h, \arg6\().8h, \arg3\().8h //int16 e[i][2] = (src[1]>>1)-src[3];
-add \arg7\().8h, \arg1\().8h, \arg7\().8h //int16 e[i][3] = src[1] + (src[3]>>1);
+ add \arg4\().8h, \arg0\().8h, \arg2\().8h //int16 e[i][0] = src[0] + src[2];
+ sub \arg5\().8h, \arg0\().8h, \arg2\().8h //int16 e[i][1] = src[0] - src[2];
+ sshr \arg6\().8h, \arg1\().8h, #1
+ sshr \arg7\().8h, \arg3\().8h, #1
+ sub \arg6\().8h, \arg6\().8h, \arg3\().8h //int16 e[i][2] = (src[1]>>1)-src[3];
+ add \arg7\().8h, \arg1\().8h, \arg7\().8h //int16 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
@@ -474,28 +474,28 @@
.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
-add \arg0\().8h, \arg4\().8h, \arg7\().8h //int16 f[i][0] = e[i][0] + e[i][3];
-add \arg1\().8h, \arg5\().8h, \arg6\().8h //int16 f[i][1] = e[i][1] + e[i][2];
-sub \arg2\().8h, \arg5\().8h, \arg6\().8h //int16 f[i][2] = e[i][1] - e[i][2];
-sub \arg3\().8h, \arg4\().8h, \arg7\().8h //int16 f[i][3] = e[i][0] - e[i][3];
+ add \arg0\().8h, \arg4\().8h, \arg7\().8h //int16 f[i][0] = e[i][0] + e[i][3];
+ add \arg1\().8h, \arg5\().8h, \arg6\().8h //int16 f[i][1] = e[i][1] + e[i][2];
+ sub \arg2\().8h, \arg5\().8h, \arg6\().8h //int16 f[i][2] = e[i][1] - e[i][2];
+ sub \arg3\().8h, \arg4\().8h, \arg7\().8h //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
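ROW_TRANSFORM_1_STEP_TOTAL_16BITS and TRANSFORM_TOTAL_16BITS above together form one pass of the 4-point inverse transform described in their comments. A single-row scalar sketch combining the two steps (idct4_row is an illustrative name):

    #include <stdint.h>

    /* Scalar sketch of ROW_TRANSFORM_1_STEP_TOTAL_16BITS followed by
     * TRANSFORM_TOTAL_16BITS for one row. */
    static void idct4_row(const int16_t src[4], int16_t f[4]) {
        int16_t e0 = src[0] + src[2];
        int16_t e1 = src[0] - src[2];
        int16_t e2 = (int16_t)((src[1] >> 1) - src[3]);
        int16_t e3 = (int16_t)(src[1] + (src[3] >> 1));
        f[0] = (int16_t)(e0 + e3);
        f[1] = (int16_t)(e1 + e2);
        f[2] = (int16_t)(e1 - e2);
        f[3] = (int16_t)(e0 - e3);
    }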
.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_d[0]~[3], output: e_q[0]~[3];
-saddl \arg4\().4s, \arg0\().4h, \arg2\().4h //int32 e[i][0] = src[0] + src[2];
-ssubl \arg5\().4s, \arg0\().4h, \arg2\().4h //int32 e[i][1] = src[0] - src[2];
-ssubl \arg6\().4s, \arg1\().4h, \arg3\().4h //int32 e[i][2] = src[1] - src[3];
-saddl \arg7\().4s, \arg1\().4h, \arg3\().4h //int32 e[i][3] = src[1] + src[3];
+ saddl \arg4\().4s, \arg0\().4h, \arg2\().4h //int32 e[i][0] = src[0] + src[2];
+ ssubl \arg5\().4s, \arg0\().4h, \arg2\().4h //int32 e[i][1] = src[0] - src[2];
+ ssubl \arg6\().4s, \arg1\().4h, \arg3\().4h //int32 e[i][2] = src[1] - src[3];
+ saddl \arg7\().4s, \arg1\().4h, \arg3\().4h //int32 e[i][3] = src[1] + src[3];
// }
.endm
.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_q[0]~[3], output: e_q[0]~[3];
-add \arg4\().4s, \arg0\().4s, \arg2\().4s //int32 e[0][j] = f[0][j] + f[2][j];
-sub \arg5\().4s, \arg0\().4s, \arg2\().4s //int32 e[1][j] = f[0][j] - f[2][j];
-sub \arg6\().4s, \arg1\().4s, \arg3\().4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-add \arg7\().4s, \arg1\().4s, \arg3\().4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+ add \arg4\().4s, \arg0\().4s, \arg2\().4s //int32 e[0][j] = f[0][j] + f[2][j];
+ sub \arg5\().4s, \arg0\().4s, \arg2\().4s //int32 e[1][j] = f[0][j] - f[2][j];
+ sub \arg6\().4s, \arg1\().4s, \arg3\().4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ add \arg7\().4s, \arg1\().4s, \arg3\().4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
@@ -502,21 +502,21 @@
.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
-add \arg0\().4s, \arg4\().4s, \arg7\().4s //int16 f[i][0] = e[i][0] + e[i][3];
-add \arg1\().4s, \arg5\().4s, \arg6\().4s //int16 f[i][1] = e[i][1] + e[i][2];
-sub \arg2\().4s, \arg5\().4s, \arg6\().4s //int16 f[i][2] = e[i][1] - e[i][2];
-sub \arg3\().4s, \arg4\().4s, \arg7\().4s //int16 f[i][3] = e[i][0] - e[i][3];
+ add \arg0\().4s, \arg4\().4s, \arg7\().4s //int16 f[i][0] = e[i][0] + e[i][3];
+ add \arg1\().4s, \arg5\().4s, \arg6\().4s //int16 f[i][1] = e[i][1] + e[i][2];
+ sub \arg2\().4s, \arg5\().4s, \arg6\().4s //int16 f[i][2] = e[i][1] - e[i][2];
+ sub \arg3\().4s, \arg4\().4s, \arg7\().4s //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4
// { // input: pred_d[0](output), dct_q0/1, working_q0/1;
-uxtl \arg3\().8h, \arg0\().8b
-uxtl2 \arg4\().8h, \arg0\().16b
-add \arg3\().8h, \arg3\().8h, \arg1\().8h
-add \arg4\().8h, \arg4\().8h, \arg2\().8h
-sqxtun \arg0\().8b, \arg3\().8h
-sqxtun2 \arg0\().16b,\arg4\().8h
+ uxtl \arg3\().8h, \arg0\().8b
+ uxtl2 \arg4\().8h, \arg0\().16b
+ add \arg3\().8h, \arg3\().8h, \arg1\().8h
+ add \arg4\().8h, \arg4\().8h, \arg2\().8h
+ sqxtun \arg0\().8b, \arg3\().8h
+ sqxtun2 \arg0\().16b,\arg4\().8h
// }
.endm
#endif
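Finally, MB_PRED_8BITS_ADD_DCT_16BITS_CLIP (and the matching macro in the first hunk) widens the 8-bit prediction, adds the 16-bit residual, and relies on sqxtun/sqxtun2 to saturate the result back into [0, 255]. A per-pixel scalar sketch (reconstruct_pixel is an illustrative name):

    #include <stdint.h>

    /* Scalar sketch of the per-pixel effect of MB_PRED_8BITS_ADD_DCT_16BITS_CLIP:
     * pred + residual, clipped to the unsigned 8-bit range. */
    static uint8_t reconstruct_pixel(uint8_t pred, int16_t residual) {
        int32_t v = (int32_t)pred + residual;
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        return (uint8_t)v;
    }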