ref: 57f6bcc4b0da529101c25fd97349e9e55a6a5cee
parent: faaf62afadeedc01a89a482ab56ec23027b6c3ba
author: Martin Storsjö <[email protected]>
date: Sat May 31 10:13:34 EDT 2014
Convert all tabs to spaces in assembly sources, unify indentation

Previously the assembly sources had mixed indentation consisting of both spaces and tabs, making it quite hard to read unless the right tab size was used in the editor. Tabs have been interpreted as 4 spaces in most cases, matching the surrounding code.
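For illustration only, a minimal Python sketch of the kind of conversion the message describes — expanding tabs to 4 columns in the .S files. This is an assumption about how such a pass could be done, not the actual tooling used for this commit; the tab width and file arguments simply mirror the commit message.

```python
# Hypothetical sketch: expand tabs to spaces in assembly sources,
# treating a tab stop as 4 columns (as stated in the commit message).
import sys

TAB_WIDTH = 4  # assumption: tabs interpreted as 4 spaces


def expand_tabs(line: str, tab_width: int = TAB_WIDTH) -> str:
    """Expand tabs column-aware, so alignment after the tab is preserved."""
    out = []
    col = 0
    for ch in line:
        if ch == "\t":
            pad = tab_width - (col % tab_width)  # distance to next tab stop
            out.append(" " * pad)
            col += pad
        else:
            out.append(ch)
            col += 1
    return "".join(out)


if __name__ == "__main__":
    # Usage (hypothetical): python expand_tabs.py codec/common/arm/*.S
    for path in sys.argv[1:]:
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
        converted = "\n".join(expand_tabs(l) for l in text.split("\n"))
        with open(path, "w", encoding="utf-8") as f:
            f.write(converted)
```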
--- a/codec/common/arm/copy_mb_neon.S
+++ b/codec/common/arm/copy_mb_neon.S
@@ -36,75 +36,75 @@
#ifdef __APPLE__
.macro LOAD_ALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, src*, src_stride
- vld1.64 {$0}, [$4,:128], $5
- vld1.64 {$1}, [$4,:128], $5
- vld1.64 {$2}, [$4,:128], $5
- vld1.64 {$3}, [$4,:128], $5
-// }
+// { // input: $0~$3, src*, src_stride
+ vld1.64 {$0}, [$4,:128], $5
+ vld1.64 {$1}, [$4,:128], $5
+ vld1.64 {$2}, [$4,:128], $5
+ vld1.64 {$3}, [$4,:128], $5
+// }
.endm
.macro STORE_ALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, dst*, dst_stride
- vst1.64 {$0}, [$4,:128], $5
- vst1.64 {$1}, [$4,:128], $5
- vst1.64 {$2}, [$4,:128], $5
- vst1.64 {$3}, [$4,:128], $5
-// }
+// { // input: $0~$3, dst*, dst_stride
+ vst1.64 {$0}, [$4,:128], $5
+ vst1.64 {$1}, [$4,:128], $5
+ vst1.64 {$2}, [$4,:128], $5
+ vst1.64 {$3}, [$4,:128], $5
+// }
.endm
.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, src*, src_stride
- vld1.64 {$0}, [$4], $5
- vld1.64 {$1}, [$4], $5
- vld1.64 {$2}, [$4], $5
- vld1.64 {$3}, [$4], $5
-// }
+// { // input: $0~$3, src*, src_stride
+ vld1.64 {$0}, [$4], $5
+ vld1.64 {$1}, [$4], $5
+ vld1.64 {$2}, [$4], $5
+ vld1.64 {$3}, [$4], $5
+// }
.endm
.macro STORE_UNALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, dst*, dst_stride
- vst1.64 {$0}, [$4], $5
- vst1.64 {$1}, [$4], $5
- vst1.64 {$2}, [$4], $5
- vst1.64 {$3}, [$4], $5
-// }
+// { // input: $0~$3, dst*, dst_stride
+ vst1.64 {$0}, [$4], $5
+ vst1.64 {$1}, [$4], $5
+ vst1.64 {$2}, [$4], $5
+ vst1.64 {$3}, [$4], $5
+// }
.endm
#else
.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-// { // input: \arg0~\arg3, src*, src_stride
- vld1.64 {\arg0}, [\arg4,:128], \arg5
- vld1.64 {\arg1}, [\arg4,:128], \arg5
- vld1.64 {\arg2}, [\arg4,:128], \arg5
- vld1.64 {\arg3}, [\arg4,:128], \arg5
-// }
+// { // input: \arg0~\arg3, src*, src_stride
+ vld1.64 {\arg0}, [\arg4,:128], \arg5
+ vld1.64 {\arg1}, [\arg4,:128], \arg5
+ vld1.64 {\arg2}, [\arg4,:128], \arg5
+ vld1.64 {\arg3}, [\arg4,:128], \arg5
+// }
.endm
.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-// { // input: \arg0~\arg3, dst*, dst_stride
- vst1.64 {\arg0}, [\arg4,:128], \arg5
- vst1.64 {\arg1}, [\arg4,:128], \arg5
- vst1.64 {\arg2}, [\arg4,:128], \arg5
- vst1.64 {\arg3}, [\arg4,:128], \arg5
-// }
+// { // input: \arg0~\arg3, dst*, dst_stride
+ vst1.64 {\arg0}, [\arg4,:128], \arg5
+ vst1.64 {\arg1}, [\arg4,:128], \arg5
+ vst1.64 {\arg2}, [\arg4,:128], \arg5
+ vst1.64 {\arg3}, [\arg4,:128], \arg5
+// }
.endm
.macro LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-// { // input: \arg0~\arg3, src*, src_stride
- vld1.64 {\arg0}, [\arg4], \arg5
- vld1.64 {\arg1}, [\arg4], \arg5
- vld1.64 {\arg2}, [\arg4], \arg5
- vld1.64 {\arg3}, [\arg4], \arg5
-// }
+// { // input: \arg0~\arg3, src*, src_stride
+ vld1.64 {\arg0}, [\arg4], \arg5
+ vld1.64 {\arg1}, [\arg4], \arg5
+ vld1.64 {\arg2}, [\arg4], \arg5
+ vld1.64 {\arg3}, [\arg4], \arg5
+// }
.endm
.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-// { // input: \arg0~\arg3, dst*, dst_stride
- vst1.64 {\arg0}, [\arg4], \arg5
- vst1.64 {\arg1}, [\arg4], \arg5
- vst1.64 {\arg2}, [\arg4], \arg5
- vst1.64 {\arg3}, [\arg4], \arg5
-// }
+// { // input: \arg0~\arg3, dst*, dst_stride
+ vst1.64 {\arg0}, [\arg4], \arg5
+ vst1.64 {\arg1}, [\arg4], \arg5
+ vst1.64 {\arg2}, [\arg4], \arg5
+ vst1.64 {\arg3}, [\arg4], \arg5
+// }
.endm
#endif
@@ -112,13 +112,13 @@
WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon
- LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
- LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
WELS_ASM_FUNC_END
@@ -125,21 +125,21 @@
WELS_ASM_FUNC_BEGIN WelsCopy16x16_neon
- LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
+ LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
- STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
+ STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
- LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
+ LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
- STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
+ STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
- LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
+ LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
- STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
+ STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
- LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
+ LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
- STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
+ STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
WELS_ASM_FUNC_END
@@ -146,21 +146,21 @@
WELS_ASM_FUNC_BEGIN WelsCopy16x16NotAligned_neon
- LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
- LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
- LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
- LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
WELS_ASM_FUNC_END
@@ -167,13 +167,13 @@
WELS_ASM_FUNC_BEGIN WelsCopy16x8NotAligned_neon
- LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
- LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
WELS_ASM_FUNC_END
@@ -180,21 +180,21 @@
WELS_ASM_FUNC_BEGIN WelsCopy8x16_neon
- LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
- LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
- LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
- LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
WELS_ASM_FUNC_END
--- a/codec/common/arm/deblocking_neon.S
+++ b/codec/common/arm/deblocking_neon.S
@@ -37,814 +37,814 @@
#ifdef __APPLE__
.macro JMP_IF_128BITS_IS_ZERO
- vorr.s16 $2, $0, $1
- vmov r3, r2, $2
- orr r3, r3, r2
- cmp r3, #0
+ vorr.s16 $2, $0, $1
+ vmov r3, r2, $2
+ orr r3, r3, r2
+ cmp r3, #0
.endm
.macro MASK_MATRIX
- vabd.u8 $6, $1, $2
- vcgt.u8 $6, $4, $6
+ vabd.u8 $6, $1, $2
+ vcgt.u8 $6, $4, $6
- vabd.u8 $4, $0, $1
- vclt.u8 $4, $4, $5
- vand.u8 $6, $6, $4
+ vabd.u8 $4, $0, $1
+ vclt.u8 $4, $4, $5
+ vand.u8 $6, $6, $4
- vabd.u8 $4, $3, $2
- vclt.u8 $4, $4, $5
- vand.u8 $6, $6, $4
+ vabd.u8 $4, $3, $2
+ vclt.u8 $4, $4, $5
+ vand.u8 $6, $6, $4
.endm
.macro DIFF_LUMA_LT4_P1_Q1
vmov.i8 $9, #128
- vrhadd.u8 $8, $2, $3
- vhadd.u8 $8, $0, $8
- vsub.s8 $8, $8, $9
- vsub.s8 $9, $1, $9
- vqsub.s8 $8, $8, $9
- vmax.s8 $8, $8, $5
- vmin.s8 $8, $8, $6
- vabd.u8 $9, $0, $2
- vclt.u8 $9, $9, $4
- vand.s8 $8, $8, $9
- vand.s8 $8, $8, $7
- vadd.u8 $8, $1, $8
- vabs.s8 $9, $9
+ vrhadd.u8 $8, $2, $3
+ vhadd.u8 $8, $0, $8
+ vsub.s8 $8, $8, $9
+ vsub.s8 $9, $1, $9
+ vqsub.s8 $8, $8, $9
+ vmax.s8 $8, $8, $5
+ vmin.s8 $8, $8, $6
+ vabd.u8 $9, $0, $2
+ vclt.u8 $9, $9, $4
+ vand.s8 $8, $8, $9
+ vand.s8 $8, $8, $7
+ vadd.u8 $8, $1, $8
+ vabs.s8 $9, $9
.endm
.macro DIFF_LUMA_LT4_P0_Q0
- vsubl.u8 $5, $0, $3
- vsubl.u8 $6, $2, $1
- vshl.s16 $6, $6, #2
- vadd.s16 $5, $5, $6
- vqrshrn.s16 $4, $5, #3
+ vsubl.u8 $5, $0, $3
+ vsubl.u8 $6, $2, $1
+ vshl.s16 $6, $6, #2
+ vadd.s16 $5, $5, $6
+ vqrshrn.s16 $4, $5, #3
.endm
.macro DIFF_LUMA_EQ4_P2P1P0
- vaddl.u8 q4, $1, $2
- vaddl.u8 q5, $3, $4
- vadd.u16 q5, q4, q5
+ vaddl.u8 q4, $1, $2
+ vaddl.u8 q5, $3, $4
+ vadd.u16 q5, q4, q5
- vaddl.u8 q4, $0, $1
- vshl.u16 q4, q4, #1
- vadd.u16 q4, q5, q4
+ vaddl.u8 q4, $0, $1
+ vshl.u16 q4, q4, #1
+ vadd.u16 q4, q5, q4
- vrshrn.u16 $0, q5, #2
- vrshrn.u16 $7, q4, #3
+ vrshrn.u16 $0, q5, #2
+ vrshrn.u16 $7, q4, #3
- vshl.u16 q5, q5, #1
- vsubl.u8 q4, $5, $1
- vadd.u16 q5, q4,q5
+ vshl.u16 q5, q5, #1
+ vsubl.u8 q4, $5, $1
+ vadd.u16 q5, q4,q5
- vaddl.u8 q4, $2, $5
- vaddw.u8 q4, q4, $2
- vaddw.u8 q4, q4, $3
+ vaddl.u8 q4, $2, $5
+ vaddw.u8 q4, q4, $2
+ vaddw.u8 q4, q4, $3
- vrshrn.u16 d10,q5, #3
- vrshrn.u16 d8, q4, #2
- vbsl.u8 $6, d10, d8
+ vrshrn.u16 d10,q5, #3
+ vrshrn.u16 d8, q4, #2
+ vbsl.u8 $6, d10, d8
.endm
.macro DIFF_LUMA_EQ4_MASK
- vmov $3, $2
- vbsl.u8 $3, $0, $1
+ vmov $3, $2
+ vbsl.u8 $3, $0, $1
.endm
.macro DIFF_CHROMA_EQ4_P0Q0
- vaddl.u8 $4, $0, $3
- vaddw.u8 $5, $4, $1
- vaddw.u8 $6, $4, $2
- vaddw.u8 $5, $5, $0
+ vaddl.u8 $4, $0, $3
+ vaddw.u8 $5, $4, $1
+ vaddw.u8 $6, $4, $2
+ vaddw.u8 $5, $5, $0
- vaddw.u8 $6, $6, $3
- vrshrn.u16 $7, $5, #2
- vrshrn.u16 $8, $6, #2
+ vaddw.u8 $6, $6, $3
+ vrshrn.u16 $7, $5, #2
+ vrshrn.u16 $8, $6, #2
.endm
.macro LOAD_CHROMA_DATA_4
- vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
- vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
+ vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
+ vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
.endm
.macro STORE_CHROMA_DATA_4
- vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
- vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
+ vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
+ vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
.endm
.macro LOAD_LUMA_DATA_3
- vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1
- vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
+ vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1
+ vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
.endm
.macro STORE_LUMA_DATA_4
- vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
- vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
+ vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
+ vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
.endm
.macro STORE_LUMA_DATA_3
- vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1
- vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
+ vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1
+ vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
.endm
.macro EXTRACT_DELTA_INTO_TWO_PART
- vcge.s8 $1, $0, #0
- vand $1, $0, $1
- vsub.s8 $0, $1, $0
+ vcge.s8 $1, $0, #0
+ vand $1, $0, $1
+ vsub.s8 $0, $1, $0
.endm
#else
.macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
- vorr.s16 \arg2, \arg0, \arg1
- vmov r3, r2, \arg2
- orr r3, r3, r2
- cmp r3, #0
+ vorr.s16 \arg2, \arg0, \arg1
+ vmov r3, r2, \arg2
+ orr r3, r3, r2
+ cmp r3, #0
.endm
.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
- vabd.u8 \arg6, \arg1, \arg2
- vcgt.u8 \arg6, \arg4, \arg6
+ vabd.u8 \arg6, \arg1, \arg2
+ vcgt.u8 \arg6, \arg4, \arg6
- vabd.u8 \arg4, \arg0, \arg1
- vclt.u8 \arg4, \arg4, \arg5
- vand.u8 \arg6, \arg6, \arg4
+ vabd.u8 \arg4, \arg0, \arg1
+ vclt.u8 \arg4, \arg4, \arg5
+ vand.u8 \arg6, \arg6, \arg4
- vabd.u8 \arg4, \arg3, \arg2
- vclt.u8 \arg4, \arg4, \arg5
- vand.u8 \arg6, \arg6, \arg4
+ vabd.u8 \arg4, \arg3, \arg2
+ vclt.u8 \arg4, \arg4, \arg5
+ vand.u8 \arg6, \arg6, \arg4
.endm
.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
vmov.i8 \arg9, #128
- vrhadd.u8 \arg8, \arg2, \arg3
- vhadd.u8 \arg8, \arg0, \arg8
- vsub.s8 \arg8, \arg8, \arg9
- vsub.s8 \arg9, \arg1, \arg9
+ vrhadd.u8 \arg8, \arg2, \arg3
+ vhadd.u8 \arg8, \arg0, \arg8
+ vsub.s8 \arg8, \arg8, \arg9
+ vsub.s8 \arg9, \arg1, \arg9
vqsub.s8 \arg8, \arg8, \arg9
- vmax.s8 \arg8, \arg8, \arg5
- vmin.s8 \arg8, \arg8, \arg6
- vabd.u8 \arg9, \arg0, \arg2
- vclt.u8 \arg9, \arg9, \arg4
- vand.s8 \arg8, \arg8, \arg9
- vand.s8 \arg8, \arg8, \arg7
- vadd.u8 \arg8, \arg1, \arg8
- vabs.s8 \arg9, \arg9
+ vmax.s8 \arg8, \arg8, \arg5
+ vmin.s8 \arg8, \arg8, \arg6
+ vabd.u8 \arg9, \arg0, \arg2
+ vclt.u8 \arg9, \arg9, \arg4
+ vand.s8 \arg8, \arg8, \arg9
+ vand.s8 \arg8, \arg8, \arg7
+ vadd.u8 \arg8, \arg1, \arg8
+ vabs.s8 \arg9, \arg9
.endm
.macro DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- vsubl.u8 \arg5, \arg0, \arg3
- vsubl.u8 \arg6, \arg2, \arg1
- vshl.s16 \arg6, \arg6, #2
- vadd.s16 \arg5, \arg5, \arg6
- vqrshrn.s16 \arg4, \arg5, #3
+ vsubl.u8 \arg5, \arg0, \arg3
+ vsubl.u8 \arg6, \arg2, \arg1
+ vshl.s16 \arg6, \arg6, #2
+ vadd.s16 \arg5, \arg5, \arg6
+ vqrshrn.s16 \arg4, \arg5, #3
.endm
.macro DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
- vaddl.u8 q4, \arg1, \arg2
- vaddl.u8 q5, \arg3, \arg4
- vadd.u16 q5, q4, q5
+ vaddl.u8 q4, \arg1, \arg2
+ vaddl.u8 q5, \arg3, \arg4
+ vadd.u16 q5, q4, q5
- vaddl.u8 q4, \arg0, \arg1
- vshl.u16 q4, q4, #1
- vadd.u16 q4, q5, q4
+ vaddl.u8 q4, \arg0, \arg1
+ vshl.u16 q4, q4, #1
+ vadd.u16 q4, q5, q4
- vrshrn.u16 \arg0, q5, #2
- vrshrn.u16 \arg7, q4, #3
+ vrshrn.u16 \arg0, q5, #2
+ vrshrn.u16 \arg7, q4, #3
- vshl.u16 q5, q5, #1
- vsubl.u8 q4, \arg5, \arg1
- vadd.u16 q5, q4,q5
+ vshl.u16 q5, q5, #1
+ vsubl.u8 q4, \arg5, \arg1
+ vadd.u16 q5, q4,q5
- vaddl.u8 q4, \arg2, \arg5
- vaddw.u8 q4, q4, \arg2
- vaddw.u8 q4, q4, \arg3
+ vaddl.u8 q4, \arg2, \arg5
+ vaddw.u8 q4, q4, \arg2
+ vaddw.u8 q4, q4, \arg3
- vrshrn.u16 d10,q5, #3
- vrshrn.u16 d8, q4, #2
- vbsl.u8 \arg6, d10, d8
+ vrshrn.u16 d10,q5, #3
+ vrshrn.u16 d8, q4, #2
+ vbsl.u8 \arg6, d10, d8
.endm
.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
- vmov \arg3, \arg2
- vbsl.u8 \arg3, \arg0, \arg1
+ vmov \arg3, \arg2
+ vbsl.u8 \arg3, \arg0, \arg1
.endm
.macro DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
- vaddl.u8 \arg4, \arg0, \arg3
- vaddw.u8 \arg5, \arg4, \arg1
- vaddw.u8 \arg6, \arg4, \arg2
- vaddw.u8 \arg5, \arg5, \arg0
- vaddw.u8 \arg6, \arg6, \arg3
- vrshrn.u16 \arg7, \arg5, #2
- vrshrn.u16 \arg8, \arg6, #2
+ vaddl.u8 \arg4, \arg0, \arg3
+ vaddw.u8 \arg5, \arg4, \arg1
+ vaddw.u8 \arg6, \arg4, \arg2
+ vaddw.u8 \arg5, \arg5, \arg0
+ vaddw.u8 \arg6, \arg6, \arg3
+ vrshrn.u16 \arg7, \arg5, #2
+ vrshrn.u16 \arg8, \arg6, #2
.endm
.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
- vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
- vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
+ vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
+ vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
.endm
.macro STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
- vst4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
- vst4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
+ vst4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
+ vst4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
.endm
.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- vld3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1
- vld3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
+ vld3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1
+ vld3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
.endm
.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
- vst4.u8 {\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1
- vst4.u8 {\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
+ vst4.u8 {\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1
+ vst4.u8 {\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
.endm
.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- vst3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1
- vst3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
+ vst3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1
+ vst3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
.endm
.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
- vcge.s8 \arg1, \arg0, #0
- vand \arg1, \arg0, \arg1
- vsub.s8 \arg0, \arg1, \arg0
+ vcge.s8 \arg1, \arg0, #0
+ vand \arg1, \arg0, \arg1
+ vsub.s8 \arg0, \arg1, \arg0
.endm
#endif
WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon
- vpush {q4-q7}
- vdup.u8 q11, r2
- vdup.u8 q9, r3
+ vpush {q4-q7}
+ vdup.u8 q11, r2
+ vdup.u8 q9, r3
- add r2, r1, r1, lsl #1
- sub r2, r0, r2
- vld1.u8 {q0}, [r2], r1
- vld1.u8 {q3}, [r0], r1
- vld1.u8 {q1}, [r2], r1
- vld1.u8 {q4}, [r0], r1
- vld1.u8 {q2}, [r2]
- vld1.u8 {q5}, [r0]
- sub r2, r2, r1
+ add r2, r1, r1, lsl #1
+ sub r2, r0, r2
+ vld1.u8 {q0}, [r2], r1
+ vld1.u8 {q3}, [r0], r1
+ vld1.u8 {q1}, [r2], r1
+ vld1.u8 {q4}, [r0], r1
+ vld1.u8 {q2}, [r2]
+ vld1.u8 {q5}, [r0]
+ sub r2, r2, r1
- ldr r3, [sp, #64]
- vld1.s8 {d31}, [r3]
- vdup.s8 d28, d31[0]
- vdup.s8 d30, d31[1]
- vdup.s8 d29, d31[2]
- vdup.s8 d31, d31[3]
- vtrn.32 d28, d30
- vtrn.32 d29, d31
- vcge.s8 q10, q14, #0
+ ldr r3, [sp, #64]
+ vld1.s8 {d31}, [r3]
+ vdup.s8 d28, d31[0]
+ vdup.s8 d30, d31[1]
+ vdup.s8 d29, d31[2]
+ vdup.s8 d31, d31[3]
+ vtrn.32 d28, d30
+ vtrn.32 d29, d31
+ vcge.s8 q10, q14, #0
- MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
- vand.u8 q10, q10, q15
+ MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
+ vand.u8 q10, q10, q15
- veor q15, q15
- vsub.i8 q15,q15,q14
+ veor q15, q15
+ vsub.i8 q15,q15,q14
- DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
- vst1.u8 {q6}, [r2], r1
+ DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
+ vst1.u8 {q6}, [r2], r1
- DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
+ DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
- vabs.s8 q12, q12
- vabs.s8 q13, q13
- vadd.u8 q14,q14,q12
- vadd.u8 q14,q14,q13
- veor q15, q15
- vsub.i8 q15,q15,q14
+ vabs.s8 q12, q12
+ vabs.s8 q13, q13
+ vadd.u8 q14,q14,q12
+ vadd.u8 q14,q14,q13
+ veor q15, q15
+ vsub.i8 q15,q15,q14
- DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
- DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
- vmax.s8 q8, q8, q15
- vmin.s8 q8, q8, q14
- vand.s8 q8, q8, q10
- EXTRACT_DELTA_INTO_TWO_PART q8, q9
- vqadd.u8 q2, q2, q9
- vqsub.u8 q2, q2, q8
- vst1.u8 {q2}, [r2], r1
- vqsub.u8 q3, q3, q9
- vqadd.u8 q3, q3, q8
- vst1.u8 {q3}, [r2] , r1
- vst1.u8 {q7}, [r2]
+ DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
+ DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
+ vmax.s8 q8, q8, q15
+ vmin.s8 q8, q8, q14
+ vand.s8 q8, q8, q10
+ EXTRACT_DELTA_INTO_TWO_PART q8, q9
+ vqadd.u8 q2, q2, q9
+ vqsub.u8 q2, q2, q8
+ vst1.u8 {q2}, [r2], r1
+ vqsub.u8 q3, q3, q9
+ vqadd.u8 q3, q3, q8
+ vst1.u8 {q3}, [r2] , r1
+ vst1.u8 {q7}, [r2]
- vpop {q4-q7}
+ vpop {q4-q7}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN DeblockLumaEq4V_neon
- vpush {q4-q7}
+ vpush {q4-q7}
- vdup.u8 q5, r2
- vdup.u8 q4, r3
+ vdup.u8 q5, r2
+ vdup.u8 q4, r3
- sub r3, r0, r1, lsl #2
- vld1.u8 {q8}, [r3], r1
- vld1.u8 {q12}, [r0], r1
- vld1.u8 {q9}, [r3], r1
- vld1.u8 {q13}, [r0], r1
- vld1.u8 {q10}, [r3], r1
- vld1.u8 {q14}, [r0], r1
- vld1.u8 {q11}, [r3]
- vld1.u8 {q15}, [r0]
- sub r3, r3, r1 , lsl #1
+ sub r3, r0, r1, lsl #2
+ vld1.u8 {q8}, [r3], r1
+ vld1.u8 {q12}, [r0], r1
+ vld1.u8 {q9}, [r3], r1
+ vld1.u8 {q13}, [r0], r1
+ vld1.u8 {q10}, [r3], r1
+ vld1.u8 {q14}, [r0], r1
+ vld1.u8 {q11}, [r3]
+ vld1.u8 {q15}, [r0]
+ sub r3, r3, r1 , lsl #1
- MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
+ MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
- mov r2, r2, lsr #2
- add r2, r2, #2
- vdup.u8 q5, r2
- vabd.u8 q0, q11, q12
- vclt.u8 q7, q0, q5
+ mov r2, r2, lsr #2
+ add r2, r2, #2
+ vdup.u8 q5, r2
+ vabd.u8 q0, q11, q12
+ vclt.u8 q7, q0, q5
- vabd.u8 q1, q9, q11
- vclt.u8 q1, q1, q4
- vand.s8 q1, q1, q7
+ vabd.u8 q1, q9, q11
+ vclt.u8 q1, q1, q4
+ vand.s8 q1, q1, q7
- vabd.u8 q2, q14,q12
- vclt.u8 q2, q2, q4
- vand.s8 q2, q2, q7
- vand.u8 q7, q7, q6
+ vabd.u8 q2, q14,q12
+ vclt.u8 q2, q2, q4
+ vand.s8 q2, q2, q7
+ vand.u8 q7, q7, q6
- vmov q3, q1
+ vmov q3, q1
- DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
- DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
+ DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
+ DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
- vand.u8 q3, q7, q3
- DIFF_LUMA_EQ4_MASK q0, q9, q3, q4
- vst1.u8 {q4}, [r3], r1
- DIFF_LUMA_EQ4_MASK q8,q10, q3, q4
- vst1.u8 {q4}, [r3], r1
- DIFF_LUMA_EQ4_MASK q1,q11, q6, q4
- vst1.u8 {q4}, [r3], r1
+ vand.u8 q3, q7, q3
+ DIFF_LUMA_EQ4_MASK q0, q9, q3, q4
+ vst1.u8 {q4}, [r3], r1
+ DIFF_LUMA_EQ4_MASK q8,q10, q3, q4
+ vst1.u8 {q4}, [r3], r1
+ DIFF_LUMA_EQ4_MASK q1,q11, q6, q4
+ vst1.u8 {q4}, [r3], r1
- vmov q0, q2
- DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d6
- DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d7
+ vmov q0, q2
+ DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d6
+ DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d7
- vand.u8 q0, q7, q0
- DIFF_LUMA_EQ4_MASK q2, q12, q6, q4
- vst1.u8 {q4}, [r3], r1
- DIFF_LUMA_EQ4_MASK q15, q13, q0, q4
- vst1.u8 {q4}, [r3], r1
- DIFF_LUMA_EQ4_MASK q3, q14, q0, q4
- vst1.u8 {q4}, [r3], r1
+ vand.u8 q0, q7, q0
+ DIFF_LUMA_EQ4_MASK q2, q12, q6, q4
+ vst1.u8 {q4}, [r3], r1
+ DIFF_LUMA_EQ4_MASK q15, q13, q0, q4
+ vst1.u8 {q4}, [r3], r1
+ DIFF_LUMA_EQ4_MASK q3, q14, q0, q4
+ vst1.u8 {q4}, [r3], r1
- vpop {q4-q7}
+ vpop {q4-q7}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN DeblockLumaLt4H_neon
- vpush {q4-q7}
+ vpush {q4-q7}
- vdup.u8 q11, r2
- vdup.u8 q9, r3
+ vdup.u8 q11, r2
+ vdup.u8 q9, r3
- sub r2, r0, #3
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 0
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 1
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 2
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 3
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 4
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 5
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 6
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 7
+ sub r2, r0, #3
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 0
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 1
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 2
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 3
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 4
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 5
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 6
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 7
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 0
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 1
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 2
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 3
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 4
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 5
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 6
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 7
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 0
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 1
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 2
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 3
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 4
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 5
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 6
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 7
- vswp d1, d2
- vswp d3, d4
- vswp d1, d4
- vswp d7, d8
- vswp d9, d10
- vswp d7, d10
+ vswp d1, d2
+ vswp d3, d4
+ vswp d1, d4
+ vswp d7, d8
+ vswp d9, d10
+ vswp d7, d10
- sub r0, r0, r1, lsl #4
+ sub r0, r0, r1, lsl #4
- ldr r3, [sp, #64]
- vld1.s8 {d31}, [r3]
- vdup.s8 d28, d31[0]
- vdup.s8 d30, d31[1]
- vdup.s8 d29, d31[2]
- vdup.s8 d31, d31[3]
- vtrn.32 d28, d30
- vtrn.32 d29, d31
- vcge.s8 q10, q14, #0
+ ldr r3, [sp, #64]
+ vld1.s8 {d31}, [r3]
+ vdup.s8 d28, d31[0]
+ vdup.s8 d30, d31[1]
+ vdup.s8 d29, d31[2]
+ vdup.s8 d31, d31[3]
+ vtrn.32 d28, d30
+ vtrn.32 d29, d31
+ vcge.s8 q10, q14, #0
- MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
- vand.u8 q10, q10, q15
+ MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
+ vand.u8 q10, q10, q15
- veor q15, q15
- vsub.i8 q15,q15,q14
+ veor q15, q15
+ vsub.i8 q15,q15,q14
- DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
- DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
+ DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
+ DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
- vabs.s8 q12, q12
- vabs.s8 q13, q13
- vadd.u8 q14,q14,q12
- vadd.u8 q14,q14,q13
- veor q15, q15
- vsub.i8 q15,q15,q14
+ vabs.s8 q12, q12
+ vabs.s8 q13, q13
+ vadd.u8 q14,q14,q12
+ vadd.u8 q14,q14,q13
+ veor q15, q15
+ vsub.i8 q15,q15,q14
- DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
- DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
- vmax.s8 q8, q8, q15
- vmin.s8 q8, q8, q14
- vand.s8 q8, q8, q10
- EXTRACT_DELTA_INTO_TWO_PART q8, q9
- vqadd.u8 q2, q2, q9
- vqsub.u8 q2, q2, q8
+ DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
+ DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
+ vmax.s8 q8, q8, q15
+ vmin.s8 q8, q8, q14
+ vand.s8 q8, q8, q10
+ EXTRACT_DELTA_INTO_TWO_PART q8, q9
+ vqadd.u8 q2, q2, q9
+ vqsub.u8 q2, q2, q8
- vqsub.u8 q3, q3, q9
- vqadd.u8 q3, q3, q8
+ vqsub.u8 q3, q3, q9
+ vqadd.u8 q3, q3, q8
- sub r0, #2
- add r2, r0, r1
- lsl r1, #1
+ sub r0, #2
+ add r2, r0, r1
+ lsl r1, #1
- vmov q1, q6
- vmov q4, q7
+ vmov q1, q6
+ vmov q4, q7
- vswp q2, q3
- vswp d3, d6
- vswp d5, d8
+ vswp q2, q3
+ vswp d3, d6
+ vswp d5, d8
- STORE_LUMA_DATA_4 d2, d3, d4, d5, 0, 1
- STORE_LUMA_DATA_4 d2, d3, d4, d5, 2, 3
- STORE_LUMA_DATA_4 d2, d3, d4, d5, 4, 5
- STORE_LUMA_DATA_4 d2, d3, d4, d5, 6, 7
+ STORE_LUMA_DATA_4 d2, d3, d4, d5, 0, 1
+ STORE_LUMA_DATA_4 d2, d3, d4, d5, 2, 3
+ STORE_LUMA_DATA_4 d2, d3, d4, d5, 4, 5
+ STORE_LUMA_DATA_4 d2, d3, d4, d5, 6, 7
- STORE_LUMA_DATA_4 d6, d7, d8, d9, 0, 1
- STORE_LUMA_DATA_4 d6, d7, d8, d9, 2, 3
- STORE_LUMA_DATA_4 d6, d7, d8, d9, 4, 5
- STORE_LUMA_DATA_4 d6, d7, d8, d9, 6, 7
+ STORE_LUMA_DATA_4 d6, d7, d8, d9, 0, 1
+ STORE_LUMA_DATA_4 d6, d7, d8, d9, 2, 3
+ STORE_LUMA_DATA_4 d6, d7, d8, d9, 4, 5
+ STORE_LUMA_DATA_4 d6, d7, d8, d9, 6, 7
- vpop {q4-q7}
+ vpop {q4-q7}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN DeblockLumaEq4H_neon
- vpush {q4-q7}
- vdup.u8 q5, r2
- vdup.u8 q4, r3
+ vpush {q4-q7}
+ vdup.u8 q5, r2
+ vdup.u8 q4, r3
- sub r3, r0, #4 // pix -= 4
+ sub r3, r0, #4 // pix -= 4
- vld1.u8 {d16}, [r3], r1
- vld1.u8 {d17}, [r3], r1
- vld1.u8 {d18}, [r3], r1
- vld1.u8 {d19}, [r3], r1
- vld1.u8 {d20}, [r3], r1
- vld1.u8 {d21}, [r3], r1
- vld1.u8 {d22}, [r3], r1
- vld1.u8 {d23}, [r3], r1
- vld1.u8 {d24}, [r3], r1
- vld1.u8 {d25}, [r3], r1
- vld1.u8 {d26}, [r3], r1
- vld1.u8 {d27}, [r3], r1
- vld1.u8 {d28}, [r3], r1
- vld1.u8 {d29}, [r3], r1
- vld1.u8 {d30}, [r3], r1
- vld1.u8 {d31}, [r3], r1
+ vld1.u8 {d16}, [r3], r1
+ vld1.u8 {d17}, [r3], r1
+ vld1.u8 {d18}, [r3], r1
+ vld1.u8 {d19}, [r3], r1
+ vld1.u8 {d20}, [r3], r1
+ vld1.u8 {d21}, [r3], r1
+ vld1.u8 {d22}, [r3], r1
+ vld1.u8 {d23}, [r3], r1
+ vld1.u8 {d24}, [r3], r1
+ vld1.u8 {d25}, [r3], r1
+ vld1.u8 {d26}, [r3], r1
+ vld1.u8 {d27}, [r3], r1
+ vld1.u8 {d28}, [r3], r1
+ vld1.u8 {d29}, [r3], r1
+ vld1.u8 {d30}, [r3], r1
+ vld1.u8 {d31}, [r3], r1
- vtrn.u32 d16, d20
- vtrn.u32 d17, d21
- vtrn.u32 d18, d22
- vtrn.u32 d19, d23
- vtrn.u32 d24, d28
- vtrn.u32 d25, d29
- vtrn.u32 d26, d30
- vtrn.u32 d27, d31
+ vtrn.u32 d16, d20
+ vtrn.u32 d17, d21
+ vtrn.u32 d18, d22
+ vtrn.u32 d19, d23
+ vtrn.u32 d24, d28
+ vtrn.u32 d25, d29
+ vtrn.u32 d26, d30
+ vtrn.u32 d27, d31
- vtrn.u16 d16, d18
- vtrn.u16 d17, d19
- vtrn.u16 d20, d22
- vtrn.u16 d21, d23
- vtrn.u16 d24, d26
- vtrn.u16 d25, d27
- vtrn.u16 d28, d30
- vtrn.u16 d29, d31
+ vtrn.u16 d16, d18
+ vtrn.u16 d17, d19
+ vtrn.u16 d20, d22
+ vtrn.u16 d21, d23
+ vtrn.u16 d24, d26
+ vtrn.u16 d25, d27
+ vtrn.u16 d28, d30
+ vtrn.u16 d29, d31
- vtrn.u8 d16, d17
- vtrn.u8 d18, d19
- vtrn.u8 d20, d21
- vtrn.u8 d22, d23
- vtrn.u8 d24, d25
- vtrn.u8 d26, d27
- vtrn.u8 d28, d29
- vtrn.u8 d30, d31
+ vtrn.u8 d16, d17
+ vtrn.u8 d18, d19
+ vtrn.u8 d20, d21
+ vtrn.u8 d22, d23
+ vtrn.u8 d24, d25
+ vtrn.u8 d26, d27
+ vtrn.u8 d28, d29
+ vtrn.u8 d30, d31
- vswp d17, d24
- vswp d19, d26
- vswp d21, d28
- vswp d23, d30
+ vswp d17, d24
+ vswp d19, d26
+ vswp d21, d28
+ vswp d23, d30
- vswp q12, q9
- vswp q14, q11
+ vswp q12, q9
+ vswp q14, q11
- vswp q12, q10
- vswp q13, q11
+ vswp q12, q10
+ vswp q13, q11
- MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
+ MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
- mov r2, r2, lsr #2
- add r2, r2, #2
- vdup.u8 q5, r2
- vabd.u8 q0, q11, q12
- vclt.u8 q7, q0, q5
+ mov r2, r2, lsr #2
+ add r2, r2, #2
+ vdup.u8 q5, r2
+ vabd.u8 q0, q11, q12
+ vclt.u8 q7, q0, q5
- vabd.u8 q1, q9, q11
- vclt.u8 q1, q1, q4
- vand.s8 q1, q1, q7
+ vabd.u8 q1, q9, q11
+ vclt.u8 q1, q1, q4
+ vand.s8 q1, q1, q7
- vabd.u8 q2, q14,q12
- vclt.u8 q2, q2, q4
- vand.s8 q2, q2, q7
- vand.u8 q7, q7, q6
+ vabd.u8 q2, q14,q12
+ vclt.u8 q2, q2, q4
+ vand.s8 q2, q2, q7
+ vand.u8 q7, q7, q6
- vmov q3, q1
+ vmov q3, q1
- DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
- DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
+ DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
+ DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
- vand.u8 q3, q7, q3
- DIFF_LUMA_EQ4_MASK q0, q9, q3, q4
- vmov q9, q4
- vbsl.u8 q3, q8, q10
- DIFF_LUMA_EQ4_MASK q1,q11, q6, q8
+ vand.u8 q3, q7, q3
+ DIFF_LUMA_EQ4_MASK q0, q9, q3, q4
+ vmov q9, q4
+ vbsl.u8 q3, q8, q10
+ DIFF_LUMA_EQ4_MASK q1,q11, q6, q8
- vand.u8 q7, q7, q2
+ vand.u8 q7, q7, q2
- DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d0
- DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d1
+ DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d0
+ DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d1
- vbsl.u8 q6, q2, q12
- DIFF_LUMA_EQ4_MASK q15, q13, q7, q4
+ vbsl.u8 q6, q2, q12
+ DIFF_LUMA_EQ4_MASK q15, q13, q7, q4
- vbsl.u8 q7, q0, q14
+ vbsl.u8 q7, q0, q14
- vmov q5, q6
- vmov q2, q9
- vmov q6, q4
- vmov q4, q8
+ vmov q5, q6
+ vmov q2, q9
+ vmov q6, q4
+ vmov q4, q8
- vswp d8, d6
- vswp d5, d7
- vswp d5, d8
- vswp d14, d12
- vswp d11, d13
- vswp d11, d14
+ vswp d8, d6
+ vswp d5, d7
+ vswp d5, d8
+ vswp d14, d12
+ vswp d11, d13
+ vswp d11, d14
- sub r3, r0, #3
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,0
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,1
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,2
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,3
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,4
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,5
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,6
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,7
+ sub r3, r0, #3
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,0
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,1
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,2
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,3
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,4
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,5
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,6
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,7
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,0
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,1
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,2
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,3
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,4
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,5
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,6
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,7
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,0
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,1
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,2
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,3
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,4
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,5
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,6
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,7
- vpop {q4-q7}
+ vpop {q4-q7}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN DeblockChromaLt4V_neon
- vdup.u8 q11, r3
- ldr r3, [sp, #0]
+ vdup.u8 q11, r3
+ ldr r3, [sp, #0]
- sub r0, r0, r2 , lsl #1
- sub r1, r1, r2, lsl #1
- vdup.u8 q9, r3
- ldr r3, [sp, #4]
+ sub r0, r0, r2 , lsl #1
+ sub r1, r1, r2, lsl #1
+ vdup.u8 q9, r3
+ ldr r3, [sp, #4]
- vld1.u8 {d0}, [r0], r2
- vld1.u8 {d1}, [r1], r2
- vld1.u8 {d2}, [r0], r2
- vld1.u8 {d3}, [r1], r2
- vld1.u8 {d4}, [r0], r2
- vld1.u8 {d5}, [r1], r2
- vld1.u8 {d6}, [r0]
- vld1.u8 {d7}, [r1]
+ vld1.u8 {d0}, [r0], r2
+ vld1.u8 {d1}, [r1], r2
+ vld1.u8 {d2}, [r0], r2
+ vld1.u8 {d3}, [r1], r2
+ vld1.u8 {d4}, [r0], r2
+ vld1.u8 {d5}, [r1], r2
+ vld1.u8 {d6}, [r0]
+ vld1.u8 {d7}, [r1]
- sub r0, r0, r2, lsl #1
- sub r1, r1, r2, lsl #1
+ sub r0, r0, r2, lsl #1
+ sub r1, r1, r2, lsl #1
- vld1.s8 {d31}, [r3]
- vmovl.u8 q14,d31
- vshl.u64 d29,d28,#8
- vorr d28,d29
- vmov d29, d28
- veor q15, q15
- vsub.i8 q15,q15,q14
+ vld1.s8 {d31}, [r3]
+ vmovl.u8 q14,d31
+ vshl.u64 d29,d28,#8
+ vorr d28,d29
+ vmov d29, d28
+ veor q15, q15
+ vsub.i8 q15,q15,q14
- MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
+ MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
- DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
- DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
- vmax.s8 q8, q8, q15
- vmin.s8 q8, q8, q14
+ DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
+ DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
+ vmax.s8 q8, q8, q15
+ vmin.s8 q8, q8, q14
- vand.s8 q8, q8, q10
- vcge.s8 q14, q14, #0
- vand.s8 q8, q8, q14
- EXTRACT_DELTA_INTO_TWO_PART q8, q10
- vqadd.u8 q1, q1, q10
- vqsub.u8 q1, q1, q8
- vst1.u8 {d2}, [r0], r2
- vst1.u8 {d3}, [r1], r2
- vqsub.u8 q2, q2, q10
- vqadd.u8 q2, q2, q8
- vst1.u8 {d4}, [r0]
- vst1.u8 {d5}, [r1]
+ vand.s8 q8, q8, q10
+ vcge.s8 q14, q14, #0
+ vand.s8 q8, q8, q14
+ EXTRACT_DELTA_INTO_TWO_PART q8, q10
+ vqadd.u8 q1, q1, q10
+ vqsub.u8 q1, q1, q8
+ vst1.u8 {d2}, [r0], r2
+ vst1.u8 {d3}, [r1], r2
+ vqsub.u8 q2, q2, q10
+ vqadd.u8 q2, q2, q8
+ vst1.u8 {d4}, [r0]
+ vst1.u8 {d5}, [r1]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN DeblockChromaEq4V_neon
- vpush {q4-q5}
+ vpush {q4-q5}
- vdup.u8 q11, r3
- ldr r3, [sp, #32]
+ vdup.u8 q11, r3
+ ldr r3, [sp, #32]
- sub r0, r0, r2 , lsl #1
- sub r1, r1, r2, lsl #1
- vdup.u8 q9, r3
- vld1.u8 {d0}, [r0], r2 // q0::p1
- vld1.u8 {d1}, [r1], r2
- vld1.u8 {d2}, [r0], r2 // q1::p0
- vld1.u8 {d3}, [r1], r2
- vld1.u8 {d4}, [r0], r2 // q2::q0
- vld1.u8 {d5}, [r1], r2
- vld1.u8 {d6}, [r0] // q3::q1
- vld1.u8 {d7}, [r1]
+ sub r0, r0, r2 , lsl #1
+ sub r1, r1, r2, lsl #1
+ vdup.u8 q9, r3
+ vld1.u8 {d0}, [r0], r2 // q0::p1
+ vld1.u8 {d1}, [r1], r2
+ vld1.u8 {d2}, [r0], r2 // q1::p0
+ vld1.u8 {d3}, [r1], r2
+ vld1.u8 {d4}, [r0], r2 // q2::q0
+ vld1.u8 {d5}, [r1], r2
+ vld1.u8 {d6}, [r0] // q3::q1
+ vld1.u8 {d7}, [r1]
- sub r0, r0, r2, lsl #1 // pix = [-1*src_stride]
- sub r1, r1, r2, lsl #1
+ sub r0, r0, r2, lsl #1 // pix = [-1*src_stride]
+ sub r1, r1, r2, lsl #1
- MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
+ MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
- vmov q11, q10
+ vmov q11, q10
- DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q4, q5, q8, d30, d0 // Cb::p0' q0'
- DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q12, q13, q14, d31, d1 // Cr::p0' q0'
+ DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q4, q5, q8, d30, d0 // Cb::p0' q0'
+ DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q12, q13, q14, d31, d1 // Cr::p0' q0'
- vbsl.u8 q10, q15, q1
- vst1.u8 {d20}, [r0], r2
- vst1.u8 {d21}, [r1], r2
+ vbsl.u8 q10, q15, q1
+ vst1.u8 {d20}, [r0], r2
+ vst1.u8 {d21}, [r1], r2
- vbsl.u8 q11, q0, q2
- vst1.u8 {d22}, [r0]
- vst1.u8 {d23}, [r1]
+ vbsl.u8 q11, q0, q2
+ vst1.u8 {d22}, [r0]
+ vst1.u8 {d23}, [r1]
- vpop {q4-q5}
+ vpop {q4-q5}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN DeblockChromaLt4H_neon
- vdup.u8 q11, r3
- ldr r3, [sp, #0]
+ vdup.u8 q11, r3
+ ldr r3, [sp, #0]
- sub r0, r0, #2
- vdup.u8 q9, r3
- ldr r3, [sp, #4]
- sub r1, r1, #2
- vld1.s8 {d31}, [r3]
+ sub r0, r0, #2
+ vdup.u8 q9, r3
+ ldr r3, [sp, #4]
+ sub r1, r1, #2
+ vld1.s8 {d31}, [r3]
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
- vswp q1, q2
- vswp d1, d2
- vswp d6, d5
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
+ vswp q1, q2
+ vswp d1, d2
+ vswp d6, d5
- vmovl.u8 q14, d31
- vshl.u64 d29,d28,#8
- vorr d28,d29
- vmov d29, d28
- veor q15, q15
- vsub.i8 q15,q15,q14
+ vmovl.u8 q14, d31
+ vshl.u64 d29,d28,#8
+ vorr d28,d29
+ vmov d29, d28
+ veor q15, q15
+ vsub.i8 q15,q15,q14
- MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
+ MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
- DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
- DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
- vmax.s8 q8, q8, q15
- vmin.s8 q8, q8, q14
+ DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
+ DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
+ vmax.s8 q8, q8, q15
+ vmin.s8 q8, q8, q14
- vand.s8 q8, q8, q10
- vcge.s8 q14, q14, #0
- vand.s8 q8, q8, q14
- EXTRACT_DELTA_INTO_TWO_PART q8, q10
- vqadd.u8 q1, q1, q10
- vqsub.u8 q1, q1, q8
- vqsub.u8 q2, q2, q10
- vqadd.u8 q2, q2, q8
+ vand.s8 q8, q8, q10
+ vcge.s8 q14, q14, #0
+ vand.s8 q8, q8, q14
+ EXTRACT_DELTA_INTO_TWO_PART q8, q10
+ vqadd.u8 q1, q1, q10
+ vqsub.u8 q1, q1, q8
+ vqsub.u8 q2, q2, q10
+ vqadd.u8 q2, q2, q8
- sub r0, r0, r2, lsl #3
- sub r1, r1, r2, lsl #3
- vswp d1, d2
- vswp d6, d5
- vswp q1, q2
+ sub r0, r0, r2, lsl #3
+ sub r1, r1, r2, lsl #3
+ vswp d1, d2
+ vswp d6, d5
+ vswp q1, q2
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon
- vpush {q4-q5}
- vdup.u8 q11, r3
- ldr r3, [sp, #32]
+ vpush {q4-q5}
+ vdup.u8 q11, r3
+ ldr r3, [sp, #32]
- sub r0, r0, #2
- sub r1, r1, #2
+ sub r0, r0, #2
+ sub r1, r1, #2
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
- vswp q1, q2
- vswp d1, d2
- vswp d6, d5
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
+ vswp q1, q2
+ vswp d1, d2
+ vswp d6, d5
- vdup.u8 q9, r3
- MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
- vmov q11, q10
+ vdup.u8 q9, r3
+ MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
+ vmov q11, q10
- DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q8, q9, q12, d8, d10
- DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q13, q14, q15, d9, d11
+ DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q8, q9, q12, d8, d10
+ DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q13, q14, q15, d9, d11
- vbsl.u8 q10, q4, q1
- vbsl.u8 q11, q5, q2
- sub r0, r0, r2, lsl #3 // pix: 0th row [-2]
- sub r1, r1, r2, lsl #3
+ vbsl.u8 q10, q4, q1
+ vbsl.u8 q11, q5, q2
+ sub r0, r0, r2, lsl #3 // pix: 0th row [-2]
+ sub r1, r1, r2, lsl #3
- vmov q1, q10
- vmov q2, q11
- vswp d1, d2
- vswp d6, d5
- vswp q1, q2
- // Cb:d0d1d2d3, Cr:d4d5d6d7
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
+ vmov q1, q10
+ vmov q2, q11
+ vswp d1, d2
+ vswp d6, d5
+ vswp q1, q2
+ // Cb:d0d1d2d3, Cr:d4d5d6d7
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
- vpop {q4-q5}
+ vpop {q4-q5}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
- vld1.64 {d0-d2}, [r0]
+ vld1.64 {d0-d2}, [r0]
- vceq.s8 q0, q0, #0
- vceq.s8 d2, d2, #0
- vmvn q0, q0
- vmvn d2, d2
- vabs.s8 q0, q0
- vabs.s8 d2, d2
+ vceq.s8 q0, q0, #0
+ vceq.s8 d2, d2, #0
+ vmvn q0, q0
+ vmvn d2, d2
+ vabs.s8 q0, q0
+ vabs.s8 d2, d2
- vst1.64 {d0-d2}, [r0]
+ vst1.64 {d0-d2}, [r0]
WELS_ASM_FUNC_END
#ifdef __APPLE__
@@ -851,37 +851,37 @@
.macro BS_NZC_CHECK
vld1.8 {d0,d1}, [$0]
/* Arrenge the input data --- TOP */
- ands r6, $1, #2
- beq bs_nzc_check_jump0
+ ands r6, $1, #2
+ beq bs_nzc_check_jump0
sub r6, $0, $2, lsl #4
- sub r6, $2, lsl #3
+ sub r6, $2, lsl #3
add r6, #12
vld1.32 d3[1], [r6]
bs_nzc_check_jump0:
vext.8 q1, q1, q0, #12
- vadd.u8 $3, q0, q1
+ vadd.u8 $3, q0, q1
/* Arrenge the input data --- LEFT */
- ands r6, $1, #1
- beq bs_nzc_check_jump1
+ ands r6, $1, #1
+ beq bs_nzc_check_jump1
sub r6, $0, #21
- add r7, r6, #4
+ add r7, r6, #4
vld1.8 d3[4], [r6]
- add r6, r7, #4
+ add r6, r7, #4
vld1.8 d3[5], [r7]
- add r7, r6, #4
+ add r7, r6, #4
vld1.8 d3[6], [r6]
vld1.8 d3[7], [r7]
bs_nzc_check_jump1:
- vzip.8 d0, d1
- vzip.8 d0, d1
+ vzip.8 d0, d1
+ vzip.8 d0, d1
vext.8 q1, q1, q0, #12
- vadd.u8 $4, q0, q1
+ vadd.u8 $4, q0, q1
.endm
.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
@@ -888,7 +888,7 @@
mov r6, #4
vabd.s16 q8, $0, $1
vabd.s16 q9, $1, $2
- vdup.s16 $0, r6
+ vdup.s16 $0, r6
vabd.s16 q10, $2, $3
vabd.s16 q11, $3, $4
@@ -897,7 +897,7 @@
vcge.s16 q10, $0
vcge.s16 q11, $0
- vpadd.i16 d16, d16, d17
+ vpadd.i16 d16, d16, d17
vpadd.i16 d17, d18, d19
vpadd.i16 d18, d20, d21
vpadd.i16 d19, d22, d23
@@ -910,8 +910,8 @@
vldm $0, {q0,q1,q2,q3}
/* Arrenge the input data --- TOP */
- ands r6, $1, #2
- beq bs_mv_check_jump0
+ ands r6, $1, #2
+ beq bs_mv_check_jump0
sub r6, $0, $2, lsl #6
add r6, #48
@@ -921,22 +921,22 @@
BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4
/* Arrenge the input data --- LEFT */
- ands r6, $1, #1
- beq bs_mv_check_jump1
+ ands r6, $1, #1
+ beq bs_mv_check_jump1
sub r6, $0, #52
add r7, r6, #16
- vld1.32 d8[0], [r6]
- add r6, r7, #16
+ vld1.32 d8[0], [r6]
+ add r6, r7, #16
vld1.32 d8[1], [r7]
- add r7, r6, #16
+ add r7, r6, #16
vld1.32 d9[0], [r6]
vld1.32 d9[1], [r7]
bs_mv_check_jump1:
- vzip.32 q0, q2
- vzip.32 q1, q3
- vzip.32 q0, q1
+ vzip.32 q0, q2
+ vzip.32 q1, q3
+ vzip.32 q0, q1
vzip.32 q2, q3
BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6
.endm
@@ -1038,41 +1038,41 @@
WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
- stmdb sp!, {r5-r7}
- vpush {q4}
+ stmdb sp!, {r5-r7}
+ vpush {q4}
- ldr r5, [sp, #28] //Save BS to r5
+ ldr r5, [sp, #28] //Save BS to r5
- /* Checking the nzc status */
- BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
+ /* Checking the nzc status */
+ BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
- /* For checking bS[I] = 2 */
- mov r6, #2
- vcgt.s8 q14, q14, #0
- vdup.u8 q0, r6
- vcgt.s8 q15, q15, #0
+ /* For checking bS[I] = 2 */
+ mov r6, #2
+ vcgt.s8 q14, q14, #0
+ vdup.u8 q0, r6
+ vcgt.s8 q15, q15, #0
- vand.u8 q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
- vand.u8 q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
+ vand.u8 q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
+ vand.u8 q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
- /* Checking the mv status*/
- BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
+ /* Checking the mv status*/
+ BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
- /* For checking bS[I] = 1 */
+ /* For checking bS[I] = 1 */
mov r6, #1
- vdup.u8 q0, r6
+ vdup.u8 q0, r6
- vand.u8 q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
- vand.u8 q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
+ vand.u8 q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
+ vand.u8 q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
- /* Check bS[I] is '1' or '2' */
- vmax.u8 q1, q12, q14
- vmax.u8 q0, q13, q15
+ /* Check bS[I] is '1' or '2' */
+ vmax.u8 q1, q12, q14
+ vmax.u8 q0, q13, q15
- //vstm r5, {q0, q1}
+ //vstm r5, {q0, q1}
vst1.32 {q0, q1}, [r5]
- vpop {q4}
- ldmia sp!, {r5-r7}
+ vpop {q4}
+ ldmia sp!, {r5-r7}
WELS_ASM_FUNC_END
#endif
--- a/codec/common/arm/expand_picture_neon.S
+++ b/codec/common/arm/expand_picture_neon.S
@@ -37,119 +37,119 @@
WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
stmdb sp!, {r4-r8}
- //Save the dst
- mov r7, r0
- mov r8, r3
+ //Save the dst
+ mov r7, r0
+ mov r8, r3
- add r4, r7, r2
- sub r4, #1
+ add r4, r7, r2
+ sub r4, #1
//For the left and right expand
_expand_picture_luma_loop2:
- sub r5, r7, #32
- add r6, r4, #1
+ sub r5, r7, #32
+ add r6, r4, #1
- vld1.8 {d0[], d1[]}, [r7], r1
- vld1.8 {d2[], d3[]}, [r4], r1
+ vld1.8 {d0[], d1[]}, [r7], r1
+ vld1.8 {d2[], d3[]}, [r4], r1
- vst1.8 {q0}, [r5]!
- vst1.8 {q0}, [r5]
- vst1.8 {q1}, [r6]!
- vst1.8 {q1}, [r6]
- subs r8, #1
- bne _expand_picture_luma_loop2
+ vst1.8 {q0}, [r5]!
+ vst1.8 {q0}, [r5]
+ vst1.8 {q1}, [r6]!
+ vst1.8 {q1}, [r6]
+ subs r8, #1
+ bne _expand_picture_luma_loop2
- //for the top and bottom expand
- add r2, #64
- sub r0, #32
- mla r4, r1, r3, r0
- sub r4, r1
+ //for the top and bottom expand
+ add r2, #64
+ sub r0, #32
+ mla r4, r1, r3, r0
+ sub r4, r1
_expand_picture_luma_loop0:
- mov r5, #32
+ mov r5, #32
mls r5, r5, r1, r0
- add r6, r4, r1
- vld1.8 {q0}, [r0]!
- vld1.8 {q1}, [r4]!
+ add r6, r4, r1
+ vld1.8 {q0}, [r0]!
+ vld1.8 {q1}, [r4]!
- mov r8, #32
+ mov r8, #32
_expand_picture_luma_loop1:
- vst1.8 {q0}, [r5], r1
- vst1.8 {q1}, [r6], r1
- subs r8, #1
+ vst1.8 {q0}, [r5], r1
+ vst1.8 {q1}, [r6], r1
+ subs r8, #1
bne _expand_picture_luma_loop1
- subs r2, #16
- bne _expand_picture_luma_loop0
+ subs r2, #16
+ bne _expand_picture_luma_loop0
//vldreq.32 d0, [r0]
- ldmia sp!, {r4-r8}
+ ldmia sp!, {r4-r8}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
stmdb sp!, {r4-r9}
- //Save the dst
- mov r7, r0
- mov r8, r3
+ //Save the dst
+ mov r7, r0
+ mov r8, r3
- add r4, r7, r2
- sub r4, #1
+ add r4, r7, r2
+ sub r4, #1
//For the left and right expand
_expand_picture_chroma_loop2:
- sub r5, r7, #16
- add r6, r4, #1
+ sub r5, r7, #16
+ add r6, r4, #1
- vld1.8 {d0[], d1[]}, [r7], r1
- vld1.8 {d2[], d3[]}, [r4], r1
+ vld1.8 {d0[], d1[]}, [r7], r1
+ vld1.8 {d2[], d3[]}, [r4], r1
- vst1.8 {q0}, [r5]
- vst1.8 {q1}, [r6]
- subs r8, #1
- bne _expand_picture_chroma_loop2
+ vst1.8 {q0}, [r5]
+ vst1.8 {q1}, [r6]
+ subs r8, #1
+ bne _expand_picture_chroma_loop2
- //for the top and bottom expand
- add r2, #32
- mov r9, r2
- bic r2, #15
- sub r0, #16
- mla r4, r1, r3, r0
- sub r4, r1
+ //for the top and bottom expand
+ add r2, #32
+ mov r9, r2
+ bic r2, #15
+ sub r0, #16
+ mla r4, r1, r3, r0
+ sub r4, r1
_expand_picture_chroma_loop0:
- mov r5, #16
- mls r5, r5, r1, r0
- add r6, r4, r1
- vld1.8 {q0}, [r0]!
- vld1.8 {q1}, [r4]!
+ mov r5, #16
+ mls r5, r5, r1, r0
+ add r6, r4, r1
+ vld1.8 {q0}, [r0]!
+ vld1.8 {q1}, [r4]!
- mov r8, #16
+ mov r8, #16
_expand_picture_chroma_loop1:
- vst1.8 {q0}, [r5], r1
- vst1.8 {q1}, [r6], r1
- subs r8, #1
- bne _expand_picture_chroma_loop1
+ vst1.8 {q0}, [r5], r1
+ vst1.8 {q1}, [r6], r1
+ subs r8, #1
+ bne _expand_picture_chroma_loop1
- subs r2, #16
- bne _expand_picture_chroma_loop0
+ subs r2, #16
+ bne _expand_picture_chroma_loop0
//vldreq.32 d0, [r0]
- and r9, #15
- cmp r9, #8
- bne _expand_picture_chroma_end
- mov r5, #16
- mls r5, r5, r1, r0
- add r6, r4, r1
- vld1.8 {d0}, [r0]!
- vld1.8 {d2}, [r4]!
- mov r8, #16
+ and r9, #15
+ cmp r9, #8
+ bne _expand_picture_chroma_end
+ mov r5, #16
+ mls r5, r5, r1, r0
+ add r6, r4, r1
+ vld1.8 {d0}, [r0]!
+ vld1.8 {d2}, [r4]!
+ mov r8, #16
_expand_picture_chroma_loop3:
- vst1.8 {d0}, [r5], r1
- vst1.8 {d2}, [r6], r1
- subs r8, #1
- bne _expand_picture_chroma_loop3
+ vst1.8 {d0}, [r5], r1
+ vst1.8 {d2}, [r6], r1
+ subs r8, #1
+ bne _expand_picture_chroma_loop3
_expand_picture_chroma_end:
- ldmia sp!, {r4-r9}
+ ldmia sp!, {r4-r9}
WELS_ASM_FUNC_END
#endif
--- a/codec/common/arm/mc_neon.S
+++ b/codec/common/arm/mc_neon.S
@@ -36,2175 +36,2175 @@
#ifdef __APPLE__
.macro AVERAGE_TWO_8BITS
-// { // input:dst_d, src_d A and B; working: q13
- vaddl.u8 q13, $2, $1
- vrshrn.u16 $0, q13, #1
-// }
+// { // input:dst_d, src_d A and B; working: q13
+ vaddl.u8 q13, $2, $1
+ vrshrn.u16 $0, q13, #1
+// }
.endm
.macro FILTER_6TAG_8BITS
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
-// }
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 $6, q12, #5
+// }
.endm
-.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
-// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
- vrev64.8 $2, $0 // X[5][4][3][2][1][0]O
- vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]*
- vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32]
- vpadd.s16 $0, $0, $0
- vpadd.s16 $0, $0, $0
- vqrshrun.s16 $0, $4, #5
-// }
+.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
+// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
+ vrev64.8 $2, $0 // X[5][4][3][2][1][0]O
+ vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]*
+ vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32]
+ vpadd.s16 $0, $0, $0
+ vpadd.s16 $0, $0, $0
+ vqrshrun.s16 $0, $4, #5
+// }
.endm
.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
- vaddl.u8 q13, $2, $6
- vrshrn.u16 $6, q13, #1
-// }
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 $6, q12, #5
+ vaddl.u8 q13, $2, $6
+ vrshrn.u16 $6, q13, #1
+// }
.endm
.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
- vaddl.u8 q13, $3, $6
- vrshrn.u16 $6, q13, #1
-// }
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 $6, q12, #5
+ vaddl.u8 q13, $3, $6
+ vrshrn.u16 $6, q13, #1
+// }
.endm
.macro FILTER_6TAG_8BITS_TO_16BITS
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
- vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
+ vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
.endm
.macro FILTER_3_IN_16BITS_TO_8BITS
-// { // input:a, b, c, dst_d;
- vsub.s16 $0, $0, $1 //a-b
- vshr.s16 $0, $0, #2 //(a-b)/4
- vsub.s16 $0, $0, $1 //(a-b)/4-b
- vadd.s16 $0, $0, $2 //(a-b)/4-b+c
- vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
- vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 $3, $0, #6 //(+32)>>6
-// }
+// { // input:a, b, c, dst_d;
+ vsub.s16 $0, $0, $1 //a-b
+ vshr.s16 $0, $0, #2 //(a-b)/4
+ vsub.s16 $0, $0, $1 //(a-b)/4-b
+ vadd.s16 $0, $0, $2 //(a-b)/4-b+c
+ vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 $3, $0, #6 //(+32)>>6
+// }
.endm
.macro UNPACK_2_16BITS_TO_ABC
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
- vext.16 $4, $0, $1, #2 //src[0]
- vext.16 $3, $0, $1, #3 //src[1]
- vadd.s16 $4, $3 //c=src[0]+src[1]
+// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+ vext.16 $4, $0, $1, #2 //src[0]
+ vext.16 $3, $0, $1, #3 //src[1]
+ vadd.s16 $4, $3 //c=src[0]+src[1]
- vext.16 $3, $0, $1, #1 //src[-1]
- vext.16 $2, $0, $1, #4 //src[2]
- vadd.s16 $3, $2 //b=src[-1]+src[2]
+ vext.16 $3, $0, $1, #1 //src[-1]
+ vext.16 $2, $0, $1, #4 //src[2]
+ vadd.s16 $3, $2 //b=src[-1]+src[2]
- vext.16 $2, $0, $1, #5 //src[3]
- vadd.s16 $2, $0 //a=src[-2]+src[3]
-// }
+ vext.16 $2, $0, $1, #5 //src[3]
+ vadd.s16 $2, $0 //a=src[-2]+src[3]
+// }
.endm
.macro UNPACK_1_IN_8x16BITS_TO_8BITS
-// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
- vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5],
- vrev64.16 $1, $1
- vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5],
- vshr.s64 $1, $2, #16
- vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0
+// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+ vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5],
+ vrev64.16 $1, $1
+ vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5],
+ vshr.s64 $1, $2, #16
+ vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0
- vsub.s16 $0, $0, $1 //a-b
- vshr.s16 $0, $0, #2 //(a-b)/4
- vsub.s16 $0, $0, $1 //(a-b)/4-b
- vadd.s16 $0, $0, $2 //(a-b)/4-b+c
- vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
- vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 $0, $3, #6 //(+32)>>6
-// }
+ vsub.s16 $0, $0, $1 //a-b
+ vshr.s16 $0, $0, #2 //(a-b)/4
+ vsub.s16 $0, $0, $1 //(a-b)/4-b
+ vadd.s16 $0, $0, $2 //(a-b)/4-b+c
+ vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 $0, $3, #6 //(+32)>>6
+// }
.endm
#else
.macro AVERAGE_TWO_8BITS arg0, arg1, arg2
-// { // input:dst_d, src_d A and B; working: q13
- vaddl.u8 q13, \arg2, \arg1
- vrshrn.u16 \arg0, q13, #1
-// }
+// { // input:dst_d, src_d A and B; working: q13
+ vaddl.u8 q13, \arg2, \arg1
+ vrshrn.u16 \arg0, q13, #1
+// }
.endm
.macro FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 \arg6, q12, #5
-// }
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 \arg6, q12, #5
+// }
.endm
-.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used
-// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
- vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O
- vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]*
- vmul.s16 \arg0, \arg2, \arg1 // 0+1*[50]-5*[41]+20[32]
- vpadd.s16 \arg0, \arg0, \arg0
- vpadd.s16 \arg0, \arg0, \arg0
- vqrshrun.s16 \arg0, \arg4, #5
-// }
+.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used
+// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
+ vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O
+ vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]*
+ vmul.s16 \arg0, \arg2, \arg1 // 0+1*[50]-5*[41]+20[32]
+ vpadd.s16 \arg0, \arg0, \arg0
+ vpadd.s16 \arg0, \arg0, \arg0
+ vqrshrun.s16 \arg0, \arg4, #5
+// }
.endm
.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 \arg6, q12, #5
- vaddl.u8 q13, \arg2, \arg6
- vrshrn.u16 \arg6, q13, #1
-// }
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 \arg6, q12, #5
+ vaddl.u8 q13, \arg2, \arg6
+ vrshrn.u16 \arg6, q13, #1
+// }
.endm
.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 \arg6, q12, #5
- vaddl.u8 q13, \arg3, \arg6
- vrshrn.u16 \arg6, q13, #1
-// }
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 \arg6, q12, #5
+ vaddl.u8 q13, \arg3, \arg6
+ vrshrn.u16 \arg6, q13, #1
+// }
.endm
.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
- vaddl.u8 \arg6, \arg0, \arg5 //dst_q=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 \arg6, q13, \arg7 //dst_q += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 \arg6, q13, \arg8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
+ vaddl.u8 \arg6, \arg0, \arg5 //dst_q=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 \arg6, q13, \arg7 //dst_q += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 \arg6, q13, \arg8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
.endm
.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3
-// { // input:a, b, c, dst_d;
- vsub.s16 \arg0, \arg0, \arg1 //a-b
- vshr.s16 \arg0, \arg0, #2 //(a-b)/4
- vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
- vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
- vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
- vadd.s16 \arg0, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 \arg3, \arg0, #6 //(+32)>>6
-// }
+// { // input:a, b, c, dst_d;
+ vsub.s16 \arg0, \arg0, \arg1 //a-b
+ vshr.s16 \arg0, \arg0, #2 //(a-b)/4
+ vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
+ vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
+ vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 \arg0, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 \arg3, \arg0, #6 //(+32)>>6
+// }
.endm
.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
- vext.16 \arg4, \arg0, \arg1, #2 //src[0]
- vext.16 \arg3, \arg0, \arg1, #3 //src[1]
- vadd.s16 \arg4, \arg3 //c=src[0]+src[1]
+// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+ vext.16 \arg4, \arg0, \arg1, #2 //src[0]
+ vext.16 \arg3, \arg0, \arg1, #3 //src[1]
+ vadd.s16 \arg4, \arg3 //c=src[0]+src[1]
- vext.16 \arg3, \arg0, \arg1, #1 //src[-1]
- vext.16 \arg2, \arg0, \arg1, #4 //src[2]
- vadd.s16 \arg3,\arg2 //b=src[-1]+src[2]
+ vext.16 \arg3, \arg0, \arg1, #1 //src[-1]
+ vext.16 \arg2, \arg0, \arg1, #4 //src[2]
+ vadd.s16 \arg3,\arg2 //b=src[-1]+src[2]
- vext.16 \arg2, \arg0, \arg1, #5 //src[3]
- vadd.s16 \arg2, \arg0 //a=src[-2]+src[3]
-// }
+ vext.16 \arg2, \arg0, \arg1, #5 //src[3]
+ vadd.s16 \arg2, \arg0 //a=src[-2]+src[3]
+// }
.endm
.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
-// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
- vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5]
- vrev64.16 \arg1, \arg1
- vadd.u16 \arg2, \arg1 // C[2+3],B[1+4],A[0+5]
- vshr.s64 \arg1, \arg2, #16
- vshr.s64 \arg0, \arg2, #32 // Output: C \arg2, B \arg1, A \arg0
+// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+ vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5]
+ vrev64.16 \arg1, \arg1
+ vadd.u16 \arg2, \arg1 // C[2+3],B[1+4],A[0+5]
+ vshr.s64 \arg1, \arg2, #16
+ vshr.s64 \arg0, \arg2, #32 // Output: C \arg2, B \arg1, A \arg0
- vsub.s16 \arg0, \arg0, \arg1 //a-b
- vshr.s16 \arg0, \arg0, #2 //(a-b)/4
- vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
- vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
- vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
- vadd.s16 \arg1, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6
-// }
+ vsub.s16 \arg0, \arg0, \arg1 //a-b
+ vshr.s16 \arg0, \arg0, #2 //(a-b)/4
+ vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
+ vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
+ vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 \arg1, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6
+// }
.endm
#endif
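All of the function bodies below reduce to the same per-pixel arithmetic, the six-tap (1, -5, 20, 20, -5, 1) luma interpolation kernel that the macros above vectorize. For reference, a minimal scalar sketch of that arithmetic follows; clip255, hpel_h and hpel_v16 are hypothetical helper names, not part of this file:

    #include <stdint.h>

    /* Saturate to [0, 255], as the vqrshrun narrowing instructions do. */
    static inline uint8_t clip255(int v) {
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* One horizontal half-pel output, as FILTER_6TAG_8BITS computes per lane:
     * a = src[-2]+src[3], b = src[-1]+src[2], c = src[0]+src[1]. */
    static uint8_t hpel_h(const uint8_t* src) {
        int a = src[-2] + src[3];
        int b = src[-1] + src[2];
        int c = src[ 0] + src[1];
        return clip255((a - 5 * b + 20 * c + 16) >> 5);    /* vqrshrun #5 */
    }

    /* One unrounded 16-bit vertical intermediate, as kept by
     * FILTER_6TAG_8BITS_TO_16BITS for the 2D (hv) paths. */
    static int16_t hpel_v16(const uint8_t* src, int stride) {
        int a = src[-2 * stride] + src[3 * stride];
        int b = src[-1 * stride] + src[2 * stride];
        int c = src[0]           + src[1 * stride];
        return (int16_t)(a - 5 * b + 20 * c);
    }

FILTER_6TAG_8BITS applies hpel_h to eight or sixteen adjacent lanes at once; the _TO_16BITS variant defers the rounding shift so the two-dimensional paths can filter the intermediates a second time.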
WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w16_h_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
- pld [r0]
- pld [r0, #16]
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
+ pld [r0]
+ pld [r0, #16]
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q8, q0, q1, #3 //q8=src[1]
- vext.8 q9, q0, q1, #4 //q9=src[2]
- vext.8 q10, q0, q1, #5 //q10=src[3]
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q8, q0, q1, #3 //q8=src[1]
+ vext.8 q9, q0, q1, #4 //q9=src[2]
+ vext.8 q10, q0, q1, #5 //q10=src[3]
- FILTER_6TAG_8BITS d0, d4, d6, d16, d18, d20, d2, q14, q15
+ FILTER_6TAG_8BITS d0, d4, d6, d16, d18, d20, d2, q14, q15
- FILTER_6TAG_8BITS d1, d5, d7, d17, d19, d21, d3, q14, q15
+ FILTER_6TAG_8BITS d1, d5, d7, d17, d19, d21, d3, q14, q15
- sub r4, #1
- vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
+ sub r4, #1
+ vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
- cmp r4, #0
- bne w16_h_mc_luma_loop
- pop {r4}
+ cmp r4, #0
+ bne w16_h_mc_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer20WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w8_h_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
- pld [r0]
+ vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
+ pld [r0]
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
- FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d1, q14, q15
+ FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d1, q14, q15
- sub r4, #1
- vst1.u8 {d1}, [r2], r3
+ sub r4, #1
+ vst1.u8 {d1}, [r2], r3
- cmp r4, #0
- bne w8_h_mc_luma_loop
- pop {r4}
+ cmp r4, #0
+ bne w8_h_mc_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer20WidthEq4_neon
- push {r4, r5, r6}
- ldr r6, [sp, #12]
+ push {r4, r5, r6}
+ ldr r6, [sp, #12]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w4_h_mc_luma_loop:
- vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
- pld [r0]
- vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
- pld [r0]
+ vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
+ pld [r0]
+ vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
+ pld [r0]
- vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
- vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
- vext.8 q3, q2, q2, #1 //src[0:6 *]
- vext.8 q8, q2, q2, #2 //src[1:6 * *]
+ vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
+ vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
+ vext.8 q3, q2, q2, #1 //src[0:6 *]
+ vext.8 q8, q2, q2, #2 //src[1:6 * *]
- vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
- vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
- vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
- vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
+ vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+ vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
+ vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
+ vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
- FILTER_6TAG_8BITS d0, d4, d6, d7, d2, d5, d1, q14, q15
+ FILTER_6TAG_8BITS d0, d4, d6, d7, d2, d5, d1, q14, q15
- vmov r4, r5, d1
- str r4, [r2], r3
- str r5, [r2], r3
+ vmov r4, r5, d1
+ str r4, [r2], r3
+ str r5, [r2], r3
- sub r6, #2
- cmp r6, #0
- bne w4_h_mc_luma_loop
+ sub r6, #2
+ cmp r6, #0
+ bne w4_h_mc_luma_loop
- pop {r4, r5, r6}
+ pop {r4, r5, r6}
WELS_ASM_FUNC_END
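The McHorVer10 and McHorVer30 functions that follow reuse the same horizontal filter and then round-average the half-pel result with the nearer integer sample (src[0] for the 1/4-pel position, src[1] for the 3/4-pel position); that is exactly the extra vaddl.u8 / vrshrn.u16 #1 step in FILTER_6TAG_8BITS_AVERAGE_WITH_0 and _WITH_1. A scalar sketch, reusing the hypothetical hpel_h helper from the sketch above:

    /* Horizontal quarter-pel sample: rounding average of the half-pel value
     * with the nearer full pixel (use_right = 0 -> x = 1/4, 1 -> x = 3/4). */
    static uint8_t qpel_h(const uint8_t* src, int use_right) {
        int h = hpel_h(src);                  /* half-pel between src[0] and src[1] */
        int p = src[use_right];               /* nearer integer sample */
        return (uint8_t)((h + p + 1) >> 1);   /* vrshrn.u16 #1 */
    }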
WELS_ASM_FUNC_BEGIN McHorVer10WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w16_xy_10_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
- pld [r0]
- pld [r0, #16]
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
+ pld [r0]
+ pld [r0, #16]
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q8, q0, q1, #3 //q8=src[1]
- vext.8 q9, q0, q1, #4 //q9=src[2]
- vext.8 q10, q0, q1, #5 //q10=src[3]
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q8, q0, q1, #3 //q8=src[1]
+ vext.8 q9, q0, q1, #4 //q9=src[2]
+ vext.8 q10, q0, q1, #5 //q10=src[3]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d16, d18, d20, d2, q14, q15
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d16, d18, d20, d2, q14, q15
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d5, d7, d17, d19, d21, d3, q14, q15
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d5, d7, d17, d19, d21, d3, q14, q15
- sub r4, #1
- vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
+ sub r4, #1
+ vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
- cmp r4, #0
- bne w16_xy_10_mc_luma_loop
- pop {r4}
+ cmp r4, #0
+ bne w16_xy_10_mc_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer10WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w8_xy_10_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
- pld [r0]
+ vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
+ pld [r0]
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d3, d4, d5, d6, d1, q14, q15
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d3, d4, d5, d6, d1, q14, q15
- sub r4, #1
- vst1.u8 {d1}, [r2], r3
+ sub r4, #1
+ vst1.u8 {d1}, [r2], r3
- cmp r4, #0
- bne w8_xy_10_mc_luma_loop
- pop {r4}
+ cmp r4, #0
+ bne w8_xy_10_mc_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer10WidthEq4_neon
- push {r4, r5, r6}
- ldr r6, [sp, #12]
+ push {r4, r5, r6}
+ ldr r6, [sp, #12]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w4_xy_10_mc_luma_loop:
- vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
- pld [r0]
- vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
- pld [r0]
+ vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
+ pld [r0]
+ vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
+ pld [r0]
- vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
- vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
- vext.8 q3, q2, q2, #1 //src[0:6 *]
- vext.8 q8, q2, q2, #2 //src[1:6 * *]
+ vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
+ vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
+ vext.8 q3, q2, q2, #1 //src[0:6 *]
+ vext.8 q8, q2, q2, #2 //src[1:6 * *]
- vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
- vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
- vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
- vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
+ vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+ vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
+ vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
+ vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d7, d2, d5, d1, q14, q15
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d7, d2, d5, d1, q14, q15
- vmov r4, r5, d1
- str r4, [r2], r3
- str r5, [r2], r3
+ vmov r4, r5, d1
+ str r4, [r2], r3
+ str r5, [r2], r3
- sub r6, #2
- cmp r6, #0
- bne w4_xy_10_mc_luma_loop
+ sub r6, #2
+ cmp r6, #0
+ bne w4_xy_10_mc_luma_loop
- pop {r4, r5, r6}
+ pop {r4, r5, r6}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer30WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w16_xy_30_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
- pld [r0]
- pld [r0, #16]
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
+ pld [r0]
+ pld [r0, #16]
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q8, q0, q1, #3 //q8=src[1]
- vext.8 q9, q0, q1, #4 //q9=src[2]
- vext.8 q10, q0, q1, #5 //q10=src[3]
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q8, q0, q1, #3 //q8=src[1]
+ vext.8 q9, q0, q1, #4 //q9=src[2]
+ vext.8 q10, q0, q1, #5 //q10=src[3]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d16, d18, d20, d2, q14, q15
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d16, d18, d20, d2, q14, q15
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d5, d7, d17, d19, d21, d3, q14, q15
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d5, d7, d17, d19, d21, d3, q14, q15
- sub r4, #1
- vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
+ sub r4, #1
+ vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
- cmp r4, #0
- bne w16_xy_30_mc_luma_loop
- pop {r4}
+ cmp r4, #0
+ bne w16_xy_30_mc_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer30WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w8_xy_30_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
- pld [r0]
+ vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
+ pld [r0]
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d3, d4, d5, d6, d1, q14, q15
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d3, d4, d5, d6, d1, q14, q15
- sub r4, #1
- vst1.u8 {d1}, [r2], r3
+ sub r4, #1
+ vst1.u8 {d1}, [r2], r3
- cmp r4, #0
- bne w8_xy_30_mc_luma_loop
- pop {r4}
+ cmp r4, #0
+ bne w8_xy_30_mc_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer30WidthEq4_neon
- push {r4, r5, r6}
- ldr r6, [sp, #12]
+ push {r4, r5, r6}
+ ldr r6, [sp, #12]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w4_xy_30_mc_luma_loop:
- vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
- pld [r0]
- vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
- pld [r0]
+ vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
+ pld [r0]
+ vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
+ pld [r0]
- vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
- vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
- vext.8 q3, q2, q2, #1 //src[0:6 *]
- vext.8 q8, q2, q2, #2 //src[1:6 * *]
+ vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
+ vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
+ vext.8 q3, q2, q2, #1 //src[0:6 *]
+ vext.8 q8, q2, q2, #2 //src[1:6 * *]
- vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
- vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
- vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
- vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
+ vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+ vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
+ vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
+ vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d7, d2, d5, d1, q14, q15
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d7, d2, d5, d1, q14, q15
- vmov r4, r5, d1
- str r4, [r2], r3
- str r5, [r2], r3
+ vmov r4, r5, d1
+ str r4, [r2], r3
+ str r5, [r2], r3
- sub r6, #2
- cmp r6, #0
- bne w4_xy_30_mc_luma_loop
+ sub r6, #2
+ cmp r6, #0
+ bne w4_xy_30_mc_luma_loop
- pop {r4, r5, r6}
+ pop {r4, r5, r6}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer01WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q8}, [r0], r1 //q8=src[2]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q8}, [r0], r1 //q8=src[2]
w16_xy_01_luma_loop:
- vld1.u8 {q9}, [r0], r1 //q9=src[3]
+ vld1.u8 {q9}, [r0], r1 //q9=src[3]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d17, d19, d1, d21, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d4, d6, d16, d18, d0, d2, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d5, d7, d17, d19, d1, d3, d21, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d4, d6, d16, d18, d0, d2, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d5, d7, d17, d19, d1, d3, d21, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d6, d16, d18, d0, d2, d4, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d7, d17, d19, d1, d3, d5, d21, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d6, d16, d18, d0, d2, d4, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d7, d17, d19, d1, d3, d5, d21, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d16, d18, d0, d2, d4, d6, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d17, d19, d1, d3, d5, d7, d21, q14, q15
- vld1.u8 {q8}, [r0], r1 //read 6th row
- vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d16, d18, d0, d2, d4, d6, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d17, d19, d1, d3, d5, d7, d21, q14, q15
+ vld1.u8 {q8}, [r0], r1 //read 6th row
+ vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d18, d0, d2, d4, d6, d16, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d19, d1, d3, d5, d7, d17, d21, q14, q15
- vld1.u8 {q9}, [r0], r1 //read 7th row
- vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d18, d0, d2, d4, d6, d16, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d19, d1, d3, d5, d7, d17, d21, q14, q15
+ vld1.u8 {q9}, [r0], r1 //read 7th row
+ vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d17, d19, d1, d21, q14, q15
- vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
- //q2, q3, q4, q5, q0 --> q0~q4
- vswp q0, q8
- vswp q0, q2
- vmov q1, q3
- vmov q3, q9 //q0~q4
+ //q2, q3, q4, q5, q0 --> q0~q4
+ vswp q0, q8
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q9 //q0~q4
- sub r4, #8
- cmp r4, #0
- bne w16_xy_01_luma_loop
- pop {r4}
+ sub r4, #8
+ cmp r4, #0
+ bne w16_xy_01_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
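The purely vertical loops keep six source rows live in registers and emit several output rows per iteration by sliding that window one row at a time; the vswp/vmov block at the bottom of each loop rotates the registers so the newest five rows become rows [-2..2] of the next pass. The scalar equivalent is a plain sliding window (hypothetical helper, reusing clip255 from the sketch above; the 01/03 variants additionally apply the rounding average from the previous note against the row 0 or row 1 sample):

    /* Vertical half-pel filter for one column of pixels: hold the six rows
     * currently needed and slide the window one row per output row, the way
     * the NEON loops rotate q0..q3, q8 and q9. */
    static void vpel_column(const uint8_t* src, int src_stride,
                            uint8_t* dst, int dst_stride, int height) {
        int w[6];
        for (int i = 0; i < 6; i++)
            w[i] = src[(i - 2) * src_stride];          /* rows -2 .. 3 */
        for (int y = 0; y < height; y++) {
            int a = w[0] + w[5], b = w[1] + w[4], c = w[2] + w[3];
            dst[y * dst_stride] = clip255((a - 5 * b + 20 * c + 16) >> 5);
            for (int i = 0; i < 5; i++)                /* slide the window down */
                w[i] = w[i + 1];
            w[5] = src[(y + 4) * src_stride];          /* fetch the next row */
        }
    }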
WELS_ASM_FUNC_BEGIN McHorVer01WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
w8_xy_01_mc_luma_loop:
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d16, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d2, d3, d4, d5, d0, d16, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d2, d3, d4, d5, d0, d16, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d16, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d4, d5, d0, d1, d2, d16, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d4, d5, d0, d1, d2, d16, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
- sub r4, #4
- cmp r4, #0
- bne w8_xy_01_mc_luma_loop
+ sub r4, #4
+ cmp r4, #0
+ bne w8_xy_01_mc_luma_loop
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer01WidthEq4_neon
- push {r4, r5, r6, r7}
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- ldr r4, [r0], r1 //r4=src[-2]
- ldr r5, [r0], r1 //r5=src[-1]
+ push {r4, r5, r6, r7}
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ ldr r4, [r0], r1 //r4=src[-2]
+ ldr r5, [r0], r1 //r5=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- ldr r6, [r0], r1 //r6=src[0]
- ldr r7, [r0], r1 //r7=src[1]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ ldr r6, [r0], r1 //r6=src[0]
+ ldr r7, [r0], r1 //r7=src[1]
- vmov d0, r4, r5
- vmov d1, r5, r6
- vmov d2, r6, r7
+ vmov d0, r4, r5
+ vmov d1, r5, r6
+ vmov d2, r6, r7
- ldr r4, [r0], r1 //r4=src[2]
- vmov d3, r7, r4
- ldr r7, [sp, #16]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d3, r7, r4
+ ldr r7, [sp, #16]
w4_xy_01_mc_luma_loop:
-// pld [r0]
- //using reserving r4
- ldr r5, [r0], r1 //r5=src[3]
- ldr r6, [r0], r1 //r6=src[0]
- vmov d4, r4, r5
- vmov d5, r5, r6 //reserved r6
+// pld [r0]
+ //using reserving r4
+ ldr r5, [r0], r1 //r5=src[3]
+ ldr r6, [r0], r1 //r6=src[0]
+ vmov d4, r4, r5
+ vmov d5, r5, r6 //reserved r6
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d16, q14, q15
- vmov r4, r5, d16
- str r4, [r2], r3 //write 1st 4Byte
- str r5, [r2], r3 //write 2nd 4Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vmov r4, r5, d16
+ str r4, [r2], r3 //write 1st 4Byte
+ str r5, [r2], r3 //write 2nd 4Byte
- ldr r5, [r0], r1 //r5=src[1]
- ldr r4, [r0], r1 //r4=src[2]
- vmov d0, r6, r5
- vmov d1, r5, r4 //reserved r4
+ ldr r5, [r0], r1 //r5=src[1]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d0, r6, r5
+ vmov d1, r5, r4 //reserved r4
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d16, q14, q15
- vmov r5, r6, d16
- str r5, [r2], r3 //write 3rd 4Byte
- str r6, [r2], r3 //write 4th 4Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vmov r5, r6, d16
+ str r5, [r2], r3 //write 3rd 4Byte
+ str r6, [r2], r3 //write 4th 4Byte
- //d4, d5, d0, d1 --> d0, d1, d2, d3
- vmov q1, q0
- vmov q0, q2
+ //d4, d5, d0, d1 --> d0, d1, d2, d3
+ vmov q1, q0
+ vmov q0, q2
- sub r7, #4
- cmp r7, #0
- bne w4_xy_01_mc_luma_loop
+ sub r7, #4
+ cmp r7, #0
+ bne w4_xy_01_mc_luma_loop
- pop {r4, r5, r6, r7}
+ pop {r4, r5, r6, r7}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer03WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q8}, [r0], r1 //q8=src[2]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q8}, [r0], r1 //q8=src[2]
w16_xy_03_luma_loop:
- vld1.u8 {q9}, [r0], r1 //q9=src[3]
+ vld1.u8 {q9}, [r0], r1 //q9=src[3]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d17, d19, d1, d21, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d4, d6, d16, d18, d0, d2, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d5, d7, d17, d19, d1, d3, d21, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d4, d6, d16, d18, d0, d2, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d5, d7, d17, d19, d1, d3, d21, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d6, d16, d18, d0, d2, d4, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d7, d17, d19, d1, d3, d5, d21, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d6, d16, d18, d0, d2, d4, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d7, d17, d19, d1, d3, d5, d21, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d16, d18, d0, d2, d4, d6, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d17, d19, d1, d3, d5, d7, d21, q14, q15
- vld1.u8 {q8}, [r0], r1 //read 6th row
- vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d16, d18, d0, d2, d4, d6, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d17, d19, d1, d3, d5, d7, d21, q14, q15
+ vld1.u8 {q8}, [r0], r1 //read 6th row
+ vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d18, d0, d2, d4, d6, d16, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d19, d1, d3, d5, d7, d17, d21, q14, q15
- vld1.u8 {q9}, [r0], r1 //read 7th row
- vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d18, d0, d2, d4, d6, d16, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d19, d1, d3, d5, d7, d17, d21, q14, q15
+ vld1.u8 {q9}, [r0], r1 //read 7th row
+ vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d17, d19, d1, d21, q14, q15
- vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
- //q2, q3, q8, q9, q0 --> q0~q8
- vswp q0, q8
- vswp q0, q2
- vmov q1, q3
- vmov q3, q9 //q0~q8
+ //q2, q3, q8, q9, q0 --> q0~q8
+ vswp q0, q8
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q9 //q0~q8
- sub r4, #8
- cmp r4, #0
- bne w16_xy_03_luma_loop
- pop {r4}
+ sub r4, #8
+ cmp r4, #0
+ bne w16_xy_03_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer03WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
w8_xy_03_mc_luma_loop:
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d16, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d2, d3, d4, d5, d0, d16, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d2, d3, d4, d5, d0, d16, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d16, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d4, d5, d0, d1, d2, d16, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d4, d5, d0, d1, d2, d16, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
- sub r4, #4
- cmp r4, #0
- bne w8_xy_03_mc_luma_loop
+ sub r4, #4
+ cmp r4, #0
+ bne w8_xy_03_mc_luma_loop
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer03WidthEq4_neon
- push {r4, r5, r6, r7}
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- ldr r4, [r0], r1 //r4=src[-2]
- ldr r5, [r0], r1 //r5=src[-1]
+ push {r4, r5, r6, r7}
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ ldr r4, [r0], r1 //r4=src[-2]
+ ldr r5, [r0], r1 //r5=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- ldr r6, [r0], r1 //r6=src[0]
- ldr r7, [r0], r1 //r7=src[1]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ ldr r6, [r0], r1 //r6=src[0]
+ ldr r7, [r0], r1 //r7=src[1]
- vmov d0, r4, r5
- vmov d1, r5, r6
- vmov d2, r6, r7
+ vmov d0, r4, r5
+ vmov d1, r5, r6
+ vmov d2, r6, r7
- ldr r4, [r0], r1 //r4=src[2]
- vmov d3, r7, r4
- ldr r7, [sp, #16]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d3, r7, r4
+ ldr r7, [sp, #16]
w4_xy_03_mc_luma_loop:
-// pld [r0]
- //using reserving r4
- ldr r5, [r0], r1 //r5=src[3]
- ldr r6, [r0], r1 //r6=src[0]
- vmov d4, r4, r5
- vmov d5, r5, r6 //reserved r6
+// pld [r0]
+ //using reserving r4
+ ldr r5, [r0], r1 //r5=src[3]
+ ldr r6, [r0], r1 //r6=src[0]
+ vmov d4, r4, r5
+ vmov d5, r5, r6 //reserved r6
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d16, q14, q15
- vmov r4, r5, d16
- str r4, [r2], r3 //write 1st 4Byte
- str r5, [r2], r3 //write 2nd 4Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vmov r4, r5, d16
+ str r4, [r2], r3 //write 1st 4Byte
+ str r5, [r2], r3 //write 2nd 4Byte
- ldr r5, [r0], r1 //r5=src[1]
- ldr r4, [r0], r1 //r4=src[2]
- vmov d0, r6, r5
- vmov d1, r5, r4 //reserved r4
+ ldr r5, [r0], r1 //r5=src[1]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d0, r6, r5
+ vmov d1, r5, r4 //reserved r4
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d16, q14, q15
- vmov r5, r6, d16
- str r5, [r2], r3 //write 3rd 4Byte
- str r6, [r2], r3 //write 4th 4Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vmov r5, r6, d16
+ str r5, [r2], r3 //write 3rd 4Byte
+ str r6, [r2], r3 //write 4th 4Byte
- //d4, d5, d0, d1 --> d0, d1, d2, d3
- vmov q1, q0
- vmov q0, q2
+ //d4, d5, d0, d1 --> d0, d1, d2, d3
+ vmov q1, q0
+ vmov q0, q2
- sub r7, #4
- cmp r7, #0
- bne w4_xy_03_mc_luma_loop
+ sub r7, #4
+ cmp r7, #0
+ bne w4_xy_03_mc_luma_loop
- pop {r4, r5, r6, r7}
+ pop {r4, r5, r6, r7}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer02WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q8}, [r0], r1 //q8=src[2]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q8}, [r0], r1 //q8=src[2]
w16_v_mc_luma_loop:
- vld1.u8 {q9}, [r0], r1 //q9=src[3]
+ vld1.u8 {q9}, [r0], r1 //q9=src[3]
- FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
- FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
+ FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
- FILTER_6TAG_8BITS d4, d6, d16, d18, d0, d2, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d5, d7, d17, d19, d1, d3, d21, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
+ FILTER_6TAG_8BITS d4, d6, d16, d18, d0, d2, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d5, d7, d17, d19, d1, d3, d21, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
- FILTER_6TAG_8BITS d6, d16, d18, d0, d2, d4, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d7, d17, d19, d1, d3, d5, d21, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
+ FILTER_6TAG_8BITS d6, d16, d18, d0, d2, d4, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d7, d17, d19, d1, d3, d5, d21, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
- FILTER_6TAG_8BITS d16, d18, d0, d2, d4, d6, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d17, d19, d1, d3, d5, d7, d21, q14, q15
- vld1.u8 {q8}, [r0], r1 //read 6th row
- vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
+ FILTER_6TAG_8BITS d16, d18, d0, d2, d4, d6, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d17, d19, d1, d3, d5, d7, d21, q14, q15
+ vld1.u8 {q8}, [r0], r1 //read 6th row
+ vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
- FILTER_6TAG_8BITS d18, d0, d2, d4, d6, d16, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d19, d1, d3, d5, d7, d17, d21, q14, q15
- vld1.u8 {q9}, [r0], r1 //read 7th row
- vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
+ FILTER_6TAG_8BITS d18, d0, d2, d4, d6, d16, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d19, d1, d3, d5, d7, d17, d21, q14, q15
+ vld1.u8 {q9}, [r0], r1 //read 7th row
+ vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
- FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
- FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
- vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
+ FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
- //q2, q3, q8, q9, q0 --> q0~q8
- vswp q0, q8
- vswp q0, q2
- vmov q1, q3
- vmov q3, q9 //q0~q8
+ //q2, q3, q8, q9, q0 --> q0~q8
+ vswp q0, q8
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q9 //q0~q8
- sub r4, #8
- cmp r4, #0
- bne w16_v_mc_luma_loop
- pop {r4}
+ sub r4, #8
+ cmp r4, #0
+ bne w16_v_mc_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer02WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
w8_v_mc_luma_loop:
- pld [r0]
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
- pld [r0]
- FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
- pld [r0]
- FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
- pld [r0]
- FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
- sub r4, #4
- cmp r4, #0
- bne w8_v_mc_luma_loop
+ sub r4, #4
+ cmp r4, #0
+ bne w8_v_mc_luma_loop
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer02WidthEq4_neon
- push {r4, r5, r6, r7}
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- ldr r4, [r0], r1 //r4=src[-2]
- ldr r5, [r0], r1 //r5=src[-1]
+ push {r4, r5, r6, r7}
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ ldr r4, [r0], r1 //r4=src[-2]
+ ldr r5, [r0], r1 //r5=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- ldr r6, [r0], r1 //r6=src[0]
- ldr r7, [r0], r1 //r7=src[1]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ ldr r6, [r0], r1 //r6=src[0]
+ ldr r7, [r0], r1 //r7=src[1]
- vmov d0, r4, r5
- vmov d1, r5, r6
- vmov d2, r6, r7
+ vmov d0, r4, r5
+ vmov d1, r5, r6
+ vmov d2, r6, r7
- ldr r4, [r0], r1 //r4=src[2]
- vmov d3, r7, r4
- ldr r7, [sp, #16]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d3, r7, r4
+ ldr r7, [sp, #16]
w4_v_mc_luma_loop:
-// pld [r0]
- //using reserving r4
- ldr r5, [r0], r1 //r5=src[3]
- ldr r6, [r0], r1 //r6=src[0]
- vmov d4, r4, r5
- vmov d5, r5, r6 //reserved r6
+// pld [r0]
+ //using reserving r4
+ ldr r5, [r0], r1 //r5=src[3]
+ ldr r6, [r0], r1 //r6=src[0]
+ vmov d4, r4, r5
+ vmov d5, r5, r6 //reserved r6
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
- vmov r4, r5, d16
- str r4, [r2], r3 //write 1st 4Byte
- str r5, [r2], r3 //write 2nd 4Byte
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vmov r4, r5, d16
+ str r4, [r2], r3 //write 1st 4Byte
+ str r5, [r2], r3 //write 2nd 4Byte
- ldr r5, [r0], r1 //r5=src[1]
- ldr r4, [r0], r1 //r4=src[2]
- vmov d0, r6, r5
- vmov d1, r5, r4 //reserved r4
+ ldr r5, [r0], r1 //r5=src[1]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d0, r6, r5
+ vmov d1, r5, r4 //reserved r4
- FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
- vmov r5, r6, d16
- str r5, [r2], r3 //write 3rd 4Byte
- str r6, [r2], r3 //write 4th 4Byte
+ FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vmov r5, r6, d16
+ str r5, [r2], r3 //write 3rd 4Byte
+ str r6, [r2], r3 //write 4th 4Byte
- //d4, d5, d0, d1 --> d0, d1, d2, d3
- vmov q1, q0
- vmov q0, q2
+ //d4, d5, d0, d1 --> d0, d1, d2, d3
+ vmov q1, q0
+ vmov q0, q2
- sub r7, #4
- cmp r7, #0
- bne w4_v_mc_luma_loop
+ sub r7, #4
+ cmp r7, #0
+ bne w4_v_mc_luma_loop
- pop {r4, r5, r6, r7}
+ pop {r4, r5, r6, r7}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer22WidthEq16_neon
- push {r4}
- vpush {q4-q7}
- ldr r4, [sp, #68]
+ push {r4}
+ vpush {q4-q7}
+ ldr r4, [sp, #68]
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0-d2}, [r0], r1 //use 21(16+5), =src[-2]
- vld1.u8 {d3-d5}, [r0], r1 //use 21(16+5), =src[-1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0-d2}, [r0], r1 //use 21(16+5), =src[-2]
+ vld1.u8 {d3-d5}, [r0], r1 //use 21(16+5), =src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d6-d8}, [r0], r1 //use 21(16+5), =src[0]
- vld1.u8 {d9-d11}, [r0], r1 //use 21(16+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2]
+ vld1.u8 {d6-d8}, [r0], r1 //use 21(16+5), =src[0]
+ vld1.u8 {d9-d11}, [r0], r1 //use 21(16+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2]
w16_hv_mc_luma_loop:
- vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
+ vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
- vst1.u8 {q0}, [r2], r3 //write 16Byte
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
+ vst1.u8 {q0}, [r2], r3 //write 16Byte
- vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
+ vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
- vst1.u8 {d3, d4}, [r2], r3 //write 16Byte
+ vst1.u8 {d3, d4}, [r2], r3 //write 16Byte
- vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
+ vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
- vst1.u8 {d6, d7}, [r2], r3 //write 16Byte
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
+ vst1.u8 {d6, d7}, [r2], r3 //write 16Byte
- vld1.u8 {d6-d8}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
- vst1.u8 {d9, d10}, [r2], r3 //write 16Byte
+ vld1.u8 {d6-d8}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
+ vst1.u8 {d9, d10}, [r2], r3 //write 16Byte
- //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
- vswp q0, q6
- vswp q6, q3
- vmov q5, q2
- vmov q2, q8
+ //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+ vswp q0, q6
+ vswp q6, q3
+ vmov q5, q2
+ vmov q2, q8
- vmov d20,d8
- vmov q4, q1
- vmov q1, q7
- vmov d14,d20
+ vmov d20,d8
+ vmov q4, q1
+ vmov q1, q7
+ vmov d14,d20
- sub r4, #4
- cmp r4, #0
- bne w16_hv_mc_luma_loop
- vpop {q4-q7}
- pop {r4}
+ sub r4, #4
+ cmp r4, #0
+ bne w16_hv_mc_luma_loop
+ vpop {q4-q7}
+ pop {r4}
WELS_ASM_FUNC_END
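The w16_hv_mc_luma_loop above (and the w8/w4 variants that follow) implements the two-stage half-pel "hv" interpolation: FILTER_6TAG_8BITS_TO_16BITS applies the (1,-5,20,20,-5,1) filter vertically into 16-bit intermediates, UNPACK_2_16BITS_TO_ABC regroups those intermediates into the three pairwise sums of the horizontal filter, and FILTER_3_IN_16BITS_TO_8BITS folds them back to 8 bits. A minimal scalar sketch of the same operation, assuming the standard H.264 (+512)>>10 rounding for the combined pass and a source with at least a 2-pixel margin above/left and a 3-pixel margin below/right of the block; function and variable names are illustrative, not taken from the codec:

    #include <stdint.h>

    static inline uint8_t clip255(int v) {
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* Half-pel in both directions ("hv"/22), scalar reference sketch.
     * iWidth/iHeight would be 16/8/4 for the NEON paths in this file. */
    static void McHorVer22_ref(const uint8_t* pSrc, int iSrcStride,
                               uint8_t* pDst, int iDstStride,
                               int iWidth, int iHeight) {
        for (int y = 0; y < iHeight; y++) {
            for (int x = 0; x < iWidth; x++) {
                int32_t v[6];
                /* vertical 6-tap into intermediates at columns x-2 .. x+3 */
                for (int i = 0; i < 6; i++) {
                    const uint8_t* p = pSrc + (x + i - 2);
                    v[i] = p[-2 * iSrcStride] + p[3 * iSrcStride]
                         - 5 * (p[-1 * iSrcStride] + p[2 * iSrcStride])
                         + 20 * (p[0] + p[1 * iSrcStride]);
                }
                /* horizontal 6-tap on the intermediates, then round and clamp */
                int32_t sum = v[0] + v[5] - 5 * (v[1] + v[4]) + 20 * (v[2] + v[3]);
                pDst[x] = clip255((sum + 512) >> 10);
            }
            pSrc += iSrcStride;
            pDst += iDstStride;
        }
    }

The NEON version never materializes the full intermediate plane: each loop iteration keeps six source rows in registers, produces four output rows, and then rotates the registers (the vswp/vmov block) so only the rows it has not seen yet are loaded.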
WELS_ASM_FUNC_BEGIN McHorVer22WidthEq8_neon
- push {r4}
- vpush {q4}
- ldr r4, [sp, #20]
+ push {r4}
+ vpush {q4}
+ ldr r4, [sp, #20]
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //use 13(8+5), =src[-2]
- vld1.u8 {q1}, [r0], r1 //use 13(8+5), =src[-1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //use 13(8+5), =src[-2]
+ vld1.u8 {q1}, [r0], r1 //use 13(8+5), =src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //use 13(8+5), =src[0]
- vld1.u8 {q3}, [r0], r1 //use 13(8+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {q4}, [r0], r1 //use 13(8+5), =src[2]
+ vld1.u8 {q2}, [r0], r1 //use 13(8+5), =src[0]
+ vld1.u8 {q3}, [r0], r1 //use 13(8+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {q4}, [r0], r1 //use 13(8+5), =src[2]
w8_hv_mc_luma_loop:
- vld1.u8 {q8}, [r0], r1 //use 13(8+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2], r3 //write 8Byte
+ vld1.u8 {q8}, [r0], r1 //use 13(8+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2], r3 //write 8Byte
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, d0, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15 // 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2], r3 //write 8Byte
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, d0, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15 // 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2], r3 //write 8Byte
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15 // 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2], r3 //write 8Byte
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15 // 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2], r3 //write 8Byte
- vld1.u8 {q2}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15 // 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2], r3 //write 8Byte
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15 // 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2], r3 //write 8Byte
- //q4~q5, q0~q2, --> q0~q4
- vswp q0, q4
- vswp q2, q4
- vmov q3, q1
- vmov q1, q8
+ //q4~q5, q0~q2, --> q0~q4
+ vswp q0, q4
+ vswp q2, q4
+ vmov q3, q1
+ vmov q1, q8
- sub r4, #4
- cmp r4, #0
- bne w8_hv_mc_luma_loop
- vpop {q4}
- pop {r4}
+ sub r4, #4
+ cmp r4, #0
+ bne w8_hv_mc_luma_loop
+ vpop {q4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer22WidthEq4_neon
- push {r4 ,r5, r6}
- vpush {q4-q7}
- ldr r6, [sp, #76]
+ push {r4 ,r5, r6}
+ vpush {q4-q7}
+ ldr r6, [sp, #76]
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[-2]
- vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[-1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[-2]
+ vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //use 9(4+5), =src[0]
- vld1.u8 {q3}, [r0], r1 //use 9(4+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {q4}, [r0], r1 //use 9(4+5), =src[2]
+ vld1.u8 {q2}, [r0], r1 //use 9(4+5), =src[0]
+ vld1.u8 {q3}, [r0], r1 //use 9(4+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {q4}, [r0], r1 //use 9(4+5), =src[2]
w4_hv_mc_luma_loop:
- vld1.u8 {q5}, [r0], r1 //use 9(4+5), =src[3]
- vld1.u8 {q6}, [r0], r1 //use 9(4+5), =src[4]
+ vld1.u8 {q5}, [r0], r1 //use 9(4+5), =src[3]
+ vld1.u8 {q6}, [r0], r1 //use 9(4+5), =src[4]
- //the 1st&2nd row
- pld [r0]
- pld [r0, r1]
- // vertical filtered
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q7, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q8, q14, q15 // 1 avail
+ //the 1st&2nd row
+ pld [r0]
+ pld [r0, r1]
+ // vertical filtered
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q7, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q8, q14, q15 // 1 avail
- FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8,d10, d12, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9,d11, d13,q10, q14, q15 // 1 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
- UNPACK_2_16BITS_TO_ABC q9,q10, q0, q7, q8 //4 avail
+ FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8,d10, d12, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9,d11, d13,q10, q14, q15 // 1 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
+ UNPACK_2_16BITS_TO_ABC q9,q10, q0, q7, q8 //4 avail
- vmov d23, d0
- vmov d25, d14
- vmov d27, d16
+ vmov d23, d0
+ vmov d25, d14
+ vmov d27, d16
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
- vmov r4 ,r5, d22
- str r4, [r2], r3 //write 4Byte
- str r5, [r2], r3 //write 4Byte
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
+ vmov r4 ,r5, d22
+ str r4, [r2], r3 //write 4Byte
+ str r5, [r2], r3 //write 4Byte
- //the 3rd&4th row
- vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[3]
- vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[4]
- pld [r0]
- pld [r0, r1]
- // vertical filtered
- FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d12, d0, q7, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d13, d1, q8, q14, q15 // 1 avail
+ //the 3rd&4th row
+ vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[3]
+ vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[4]
+ pld [r0]
+ pld [r0, r1]
+ // vertical filtered
+ FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d12, d0, q7, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d13, d1, q8, q14, q15 // 1 avail
- FILTER_6TAG_8BITS_TO_16BITS d6, d8,d10, d12, d0, d2, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7, d9,d11, d13, d1, d3,q10, q14, q15 // 1 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
- UNPACK_2_16BITS_TO_ABC q9,q10, q2, q7, q8 //4 avail
+ FILTER_6TAG_8BITS_TO_16BITS d6, d8,d10, d12, d0, d2, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7, d9,d11, d13, d1, d3,q10, q14, q15 // 1 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
+ UNPACK_2_16BITS_TO_ABC q9,q10, q2, q7, q8 //4 avail
- vmov d23, d4
- vmov d25, d14
- vmov d27, d16
+ vmov d23, d4
+ vmov d25, d14
+ vmov d27, d16
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
- vmov r4 ,r5, d22
- str r4, [r2], r3 //write 4Byte
- str r5, [r2], r3 //write 4Byte
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
+ vmov r4 ,r5, d22
+ str r4, [r2], r3 //write 4Byte
+ str r5, [r2], r3 //write 4Byte
- //q4~q6, q0~q1, --> q0~q4
- vswp q4, q0
- vmov q3, q4
- vmov q4, q1
- vmov q1, q5
- vmov q2, q6
+ //q4~q6, q0~q1, --> q0~q4
+ vswp q4, q0
+ vmov q3, q4
+ vmov q4, q1
+ vmov q1, q5
+ vmov q2, q6
- sub r6, #4
- cmp r6, #0
- bne w4_hv_mc_luma_loop
+ sub r6, #4
+ cmp r6, #0
+ bne w4_hv_mc_luma_loop
- vpop {q4-q7}
- pop {r4, r5, r6}
+ vpop {q4-q7}
+ pop {r4, r5, r6}
WELS_ASM_FUNC_END
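The WidthEq4 variant just above computes two 4-pixel output rows per step, packs them into one 64-bit d register, moves that register to two GPRs with vmov, and writes each half with a 32-bit str. A small sketch of that store pattern, with illustrative names and assuming the 8 filtered bytes are already laid out as row0|row1:

    #include <stdint.h>
    #include <string.h>

    /* Store pattern used by the WidthEq4 loops: one 8-byte result register
     * holds two 4-pixel rows; each half is written as a 4-byte store. */
    static void StoreTwoRows4(uint8_t* pDst, int iDstStride,
                              const uint8_t pTwoRows[8]) {
        memcpy(pDst, pTwoRows, 4);                  /* low half  -> row y   */
        memcpy(pDst + iDstStride, pTwoRows + 4, 4); /* high half -> row y+1 */
    }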
WELS_ASM_FUNC_BEGIN McCopyWidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
w16_copy_loop:
- vld1.u8 {q0}, [r0], r1
- sub r4, #2
- vld1.u8 {q1}, [r0], r1
- vst1.u8 {q0}, [r2], r3
- cmp r4, #0
- vst1.u8 {q1}, [r2], r3
- bne w16_copy_loop
+ vld1.u8 {q0}, [r0], r1
+ sub r4, #2
+ vld1.u8 {q1}, [r0], r1
+ vst1.u8 {q0}, [r2], r3
+ cmp r4, #0
+ vst1.u8 {q1}, [r2], r3
+ bne w16_copy_loop
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McCopyWidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
w8_copy_loop:
- vld1.u8 {d0}, [r0], r1
- vld1.u8 {d1}, [r0], r1
- vst1.u8 {d0}, [r2], r3
- vst1.u8 {d1}, [r2], r3
- sub r4, #2
- cmp r4, #0
- bne w8_copy_loop
+ vld1.u8 {d0}, [r0], r1
+ vld1.u8 {d1}, [r0], r1
+ vst1.u8 {d0}, [r2], r3
+ vst1.u8 {d1}, [r2], r3
+ sub r4, #2
+ cmp r4, #0
+ bne w8_copy_loop
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McCopyWidthEq4_neon
- push {r4, r5, r6}
- ldr r4, [sp, #12]
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
w4_copy_loop:
- ldr r5, [r0], r1
- ldr r6, [r0], r1
- str r5, [r2], r3
- str r6, [r2], r3
+ ldr r5, [r0], r1
+ ldr r6, [r0], r1
+ str r5, [r2], r3
+ str r6, [r2], r3
- sub r4, #2
- cmp r4, #0
- bne w4_copy_loop
+ sub r4, #2
+ cmp r4, #0
+ bne w4_copy_loop
- pop {r4, r5, r6}
+ pop {r4, r5, r6}
WELS_ASM_FUNC_END
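McCopyWidthEq16/8/4 are plain strided block copies for full-pel motion; the Eq4 version goes through GPRs (ldr/str) instead of NEON registers. A scalar sketch under the same src/src_stride/dst/dst_stride/height convention, names illustrative:

    #include <stdint.h>
    #include <string.h>

    static void McCopy_ref(const uint8_t* pSrc, int iSrcStride,
                           uint8_t* pDst, int iDstStride,
                           int iWidth, int iHeight) {
        for (int y = 0; y < iHeight; y++) {
            memcpy(pDst, pSrc, (size_t)iWidth);  /* iWidth is 16, 8 or 4 here */
            pSrc += iSrcStride;
            pDst += iDstStride;
        }
    }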
WELS_ASM_FUNC_BEGIN PixelAvgWidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
w16_pix_avg_loop:
- vld1.u8 {q0}, [r2]!
- vld1.u8 {q1}, [r3]!
- vld1.u8 {q2}, [r2]!
- vld1.u8 {q3}, [r3]!
+ vld1.u8 {q0}, [r2]!
+ vld1.u8 {q1}, [r3]!
+ vld1.u8 {q2}, [r2]!
+ vld1.u8 {q3}, [r3]!
- vld1.u8 {q8}, [r2]!
- vld1.u8 {q9}, [r3]!
- vld1.u8 {q10}, [r2]!
- vld1.u8 {q11}, [r3]!
+ vld1.u8 {q8}, [r2]!
+ vld1.u8 {q9}, [r3]!
+ vld1.u8 {q10}, [r2]!
+ vld1.u8 {q11}, [r3]!
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {q0}, [r0], r1
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {q0}, [r0], r1
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {q2}, [r0], r1
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {q2}, [r0], r1
- AVERAGE_TWO_8BITS d16, d16, d18
- AVERAGE_TWO_8BITS d17, d17, d19
- vst1.u8 {q8}, [r0], r1
+ AVERAGE_TWO_8BITS d16, d16, d18
+ AVERAGE_TWO_8BITS d17, d17, d19
+ vst1.u8 {q8}, [r0], r1
- AVERAGE_TWO_8BITS d20, d20, d22
- AVERAGE_TWO_8BITS d21, d21, d23
- vst1.u8 {q10}, [r0], r1
+ AVERAGE_TWO_8BITS d20, d20, d22
+ AVERAGE_TWO_8BITS d21, d21, d23
+ vst1.u8 {q10}, [r0], r1
- sub r4, #4
- cmp r4, #0
- bne w16_pix_avg_loop
+ sub r4, #4
+ cmp r4, #0
+ bne w16_pix_avg_loop
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN PixelAvgWidthEq8_neon
- push {r4, r5}
- ldr r4, [sp, #8]
- mov r5, #16
+ push {r4, r5}
+ ldr r4, [sp, #8]
+ mov r5, #16
w8_pix_avg_loop:
- vld1.u8 {d0}, [r2], r5
- vld1.u8 {d2}, [r3], r5
- vld1.u8 {d1}, [r2], r5
- vld1.u8 {d3}, [r3], r5
+ vld1.u8 {d0}, [r2], r5
+ vld1.u8 {d2}, [r3], r5
+ vld1.u8 {d1}, [r2], r5
+ vld1.u8 {d3}, [r3], r5
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {d0}, [r0], r1
- vst1.u8 {d1}, [r0], r1
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {d0}, [r0], r1
+ vst1.u8 {d1}, [r0], r1
- vld1.u8 {d4}, [r2], r5
- vld1.u8 {d6}, [r3], r5
- vld1.u8 {d5}, [r2], r5
- vld1.u8 {d7}, [r3], r5
+ vld1.u8 {d4}, [r2], r5
+ vld1.u8 {d6}, [r3], r5
+ vld1.u8 {d5}, [r2], r5
+ vld1.u8 {d7}, [r3], r5
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {d4}, [r0], r1
- vst1.u8 {d5}, [r0], r1
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {d4}, [r0], r1
+ vst1.u8 {d5}, [r0], r1
- sub r4, #4
- cmp r4, #0
- bne w8_pix_avg_loop
+ sub r4, #4
+ cmp r4, #0
+ bne w8_pix_avg_loop
- pop {r4, r5}
+ pop {r4, r5}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN PixelAvgWidthEq4_neon
- push {r4-r8}
- ldr r4, [sp, #20]
+ push {r4-r8}
+ ldr r4, [sp, #20]
w4_pix_avg_loop:
- ldr r5, [r2]
- ldr r6, [r2, #16]
- ldr r7, [r3]
- ldr r8, [r3, #16]
- add r2, #32
- add r3, #32
+ ldr r5, [r2]
+ ldr r6, [r2, #16]
+ ldr r7, [r3]
+ ldr r8, [r3, #16]
+ add r2, #32
+ add r3, #32
- vmov d0, r5, r6
- vmov d1, r7, r8
- AVERAGE_TWO_8BITS d0, d0, d1
- vmov r5, r6, d0
+ vmov d0, r5, r6
+ vmov d1, r7, r8
+ AVERAGE_TWO_8BITS d0, d0, d1
+ vmov r5, r6, d0
- str r5, [r0], r1
- str r6, [r0], r1
+ str r5, [r0], r1
+ str r6, [r0], r1
- sub r4, #2
- cmp r4, #0
- bne w4_pix_avg_loop
+ sub r4, #2
+ cmp r4, #0
+ bne w4_pix_avg_loop
- pop {r4-r8}
+ pop {r4-r8}
WELS_ASM_FUNC_END
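The three PixelAvgWidthEqN routines average two prediction blocks sample by sample; in all of them the two sources are read with a fixed 16-byte row stride (temporary buffers laid out 16 bytes per row), while the destination uses the caller's stride. A scalar sketch, assuming AVERAGE_TWO_8BITS performs the usual (a+b+1)>>1 rounding average; names illustrative:

    #include <stdint.h>

    static void PixelAvg_ref(uint8_t* pDst, int iDstStride,
                             const uint8_t* pSrcA, int iSrcAStride,
                             const uint8_t* pSrcB, int iSrcBStride,
                             int iWidth, int iHeight) {
        for (int y = 0; y < iHeight; y++) {
            for (int x = 0; x < iWidth; x++)
                pDst[x] = (uint8_t)((pSrcA[x] + pSrcB[x] + 1) >> 1);
            pDst  += iDstStride;
            pSrcA += iSrcAStride;
            pSrcB += iSrcBStride;
        }
    }

The PixStrideAvgWidthEq16/8 functions near the end of this file perform the same operation but take explicit strides for both sources.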
WELS_ASM_FUNC_BEGIN McChromaWidthEq8_neon
- push {r4, r5}
- ldr r4, [sp, #8]
- ldr r5, [sp, #12]
-// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
-// we can opti it by adding vert only/ hori only cases, to be continue
- vld1.u8 {d31}, [r4] //load A/B/C/D
- vld1.u8 {q0}, [r0], r1 //src[x]
+ push {r4, r5}
+ ldr r4, [sp, #8]
+ ldr r5, [sp, #12]
+// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
+// we can opti it by adding vert only/ hori only cases, to be continue
+ vld1.u8 {d31}, [r4] //load A/B/C/D
+ vld1.u8 {q0}, [r0], r1 //src[x]
- vdup.u8 d28, d31[0] //A
- vdup.u8 d29, d31[1] //B
- vdup.u8 d30, d31[2] //C
- vdup.u8 d31, d31[3] //D
+ vdup.u8 d28, d31[0] //A
+ vdup.u8 d29, d31[1] //B
+ vdup.u8 d30, d31[2] //C
+ vdup.u8 d31, d31[3] //D
- vext.u8 d1, d0, d1, #1 //src[x+1]
+ vext.u8 d1, d0, d1, #1 //src[x+1]
-w8_mc_chroma_loop: // each two pxl row
- vld1.u8 {q1}, [r0], r1 //src[x+stride]
- vld1.u8 {q2}, [r0], r1 //src[x+2*stride]
- vext.u8 d3, d2, d3, #1 //src[x+stride+1]
- vext.u8 d5, d4, d5, #1 //src[x+2*stride+1]
+w8_mc_chroma_loop: // each two pxl row
+ vld1.u8 {q1}, [r0], r1 //src[x+stride]
+ vld1.u8 {q2}, [r0], r1 //src[x+2*stride]
+ vext.u8 d3, d2, d3, #1 //src[x+stride+1]
+ vext.u8 d5, d4, d5, #1 //src[x+2*stride+1]
- vmull.u8 q3, d0, d28 //(src[x] * A)
- vmlal.u8 q3, d1, d29 //+=(src[x+1] * B)
- vmlal.u8 q3, d2, d30 //+=(src[x+stride] * C)
- vmlal.u8 q3, d3, d31 //+=(src[x+stride+1] * D)
- vrshrn.u16 d6, q3, #6
- vst1.u8 d6, [r2], r3
+ vmull.u8 q3, d0, d28 //(src[x] * A)
+ vmlal.u8 q3, d1, d29 //+=(src[x+1] * B)
+ vmlal.u8 q3, d2, d30 //+=(src[x+stride] * C)
+ vmlal.u8 q3, d3, d31 //+=(src[x+stride+1] * D)
+ vrshrn.u16 d6, q3, #6
+ vst1.u8 d6, [r2], r3
- vmull.u8 q3, d2, d28 //(src[x] * A)
- vmlal.u8 q3, d3, d29 //+=(src[x+1] * B)
- vmlal.u8 q3, d4, d30 //+=(src[x+stride] * C)
- vmlal.u8 q3, d5, d31 //+=(src[x+stride+1] * D)
- vrshrn.u16 d6, q3, #6
- vst1.u8 d6, [r2], r3
+ vmull.u8 q3, d2, d28 //(src[x] * A)
+ vmlal.u8 q3, d3, d29 //+=(src[x+1] * B)
+ vmlal.u8 q3, d4, d30 //+=(src[x+stride] * C)
+ vmlal.u8 q3, d5, d31 //+=(src[x+stride+1] * D)
+ vrshrn.u16 d6, q3, #6
+ vst1.u8 d6, [r2], r3
- vmov q0, q2
- sub r5, #2
- cmp r5, #0
- bne w8_mc_chroma_loop
+ vmov q0, q2
+ sub r5, #2
+ cmp r5, #0
+ bne w8_mc_chroma_loop
- pop {r4, r5}
+ pop {r4, r5}
WELS_ASM_FUNC_END
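McChromaWidthEq8 is the chroma interpolation described by the comment at the top of the function: a 2x2 weighted average with weights A/B/C/D loaded from the table at [r4], followed by a rounding shift by 6 (vrshrn #6). A scalar sketch, assuming the standard H.264 chroma weights A=(8-dx)(8-dy), B=dx(8-dy), C=(8-dx)dy, D=dx*dy, so that A+B+C+D=64; names illustrative:

    #include <stdint.h>

    static void McChroma_ref(const uint8_t* pSrc, int iSrcStride,
                             uint8_t* pDst, int iDstStride,
                             const uint8_t pWeights[4],   /* A, B, C, D */
                             int iWidth, int iHeight) {
        const int A = pWeights[0], B = pWeights[1];
        const int C = pWeights[2], D = pWeights[3];
        for (int y = 0; y < iHeight; y++) {
            for (int x = 0; x < iWidth; x++) {
                int sum = A * pSrc[x]              + B * pSrc[x + 1]
                        + C * pSrc[x + iSrcStride] + D * pSrc[x + iSrcStride + 1];
                pDst[x] = (uint8_t)((sum + 32) >> 6);
            }
            pSrc += iSrcStride;
            pDst += iDstStride;
        }
    }

Note how the NEON loop reuses the bottom source row of one step as the top row of the next (vmov q0, q2), so only two new rows are loaded per iteration.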
WELS_ASM_FUNC_BEGIN McChromaWidthEq4_neon
- push {r4, r5, r6}
- ldr r4, [sp, #12]
- ldr r6, [sp, #16]
-// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
-// we can opti it by adding vert only/ hori only cases, to be continue
- vld1.u8 {d31}, [r4] //load A/B/C/D
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+ ldr r6, [sp, #16]
+// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
+// we can opti it by adding vert only/ hori only cases, to be continue
+ vld1.u8 {d31}, [r4] //load A/B/C/D
- vdup.u8 d28, d31[0] //A
- vdup.u8 d29, d31[1] //B
- vdup.u8 d30, d31[2] //C
- vdup.u8 d31, d31[3] //D
+ vdup.u8 d28, d31[0] //A
+ vdup.u8 d29, d31[1] //B
+ vdup.u8 d30, d31[2] //C
+ vdup.u8 d31, d31[3] //D
-w4_mc_chroma_loop: // each two pxl row
- vld1.u8 {d0}, [r0], r1 //a::src[x]
- vld1.u8 {d2}, [r0], r1 //b::src[x+stride]
- vld1.u8 {d4}, [r0] //c::src[x+2*stride]
+w4_mc_chroma_loop: // each two pxl row
+ vld1.u8 {d0}, [r0], r1 //a::src[x]
+ vld1.u8 {d2}, [r0], r1 //b::src[x+stride]
+ vld1.u8 {d4}, [r0] //c::src[x+2*stride]
- vshr.u64 d1, d0, #8
- vshr.u64 d3, d2, #8
- vshr.u64 d5, d4, #8
+ vshr.u64 d1, d0, #8
+ vshr.u64 d3, d2, #8
+ vshr.u64 d5, d4, #8
- vmov q3, q1 //b::[0:7]+b::[1~8]
- vtrn.32 q0, q1 //d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
- vtrn.32 q3, q2 //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
+ vmov q3, q1 //b::[0:7]+b::[1~8]
+ vtrn.32 q0, q1 //d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
+ vtrn.32 q3, q2 //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
- vmull.u8 q1, d0, d28 //(src[x] * A)
- vmlal.u8 q1, d1, d29 //+=(src[x+1] * B)
- vmlal.u8 q1, d6, d30 //+=(src[x+stride] * C)
- vmlal.u8 q1, d7, d31 //+=(src[x+stride+1] * D)
+ vmull.u8 q1, d0, d28 //(src[x] * A)
+ vmlal.u8 q1, d1, d29 //+=(src[x+1] * B)
+ vmlal.u8 q1, d6, d30 //+=(src[x+stride] * C)
+ vmlal.u8 q1, d7, d31 //+=(src[x+stride+1] * D)
- vrshrn.u16 d2, q1, #6
- vmov r4, r5, d2
- str r4, [r2], r3
- str r5, [r2], r3
+ vrshrn.u16 d2, q1, #6
+ vmov r4, r5, d2
+ str r4, [r2], r3
+ str r5, [r2], r3
- sub r6, #2
- cmp r6, #0
- bne w4_mc_chroma_loop
+ sub r6, #2
+ cmp r6, #0
+ bne w4_mc_chroma_loop
- pop {r4, r5, r6}
+ pop {r4, r5, r6}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer20Width17_neon
- push {r4-r5}
- mov r4, #20
- mov r5, #1
- sub r4, r4, r4, lsl #(16-2)
- lsl r5, #16
- ror r4, #16
- vmov d3, r5, r4 // 0x0014FFFB00010000
+ push {r4-r5}
+ mov r4, #20
+ mov r5, #1
+ sub r4, r4, r4, lsl #(16-2)
+ lsl r5, #16
+ ror r4, #16
+ vmov d3, r5, r4 // 0x0014FFFB00010000
- sub r3, #16
- ldr r4, [sp, #8]
+ sub r3, #16
+ ldr r4, [sp, #8]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w17_h_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 22(17+5); q0=src[-2]
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 22(17+5); q0=src[-2]
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q8, q0, q1, #3 //q8=src[1]
- vext.8 q9, q0, q1, #4 //q9=src[2]
- vext.8 q10, q0, q1, #5 //q10=src[3]
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q8, q0, q1, #3 //q8=src[1]
+ vext.8 q9, q0, q1, #4 //q9=src[2]
+ vext.8 q10, q0, q1, #5 //q10=src[3]
- FILTER_6TAG_8BITS d0, d4, d6, d16, d18, d20, d22, q14, q15
+ FILTER_6TAG_8BITS d0, d4, d6, d16, d18, d20, d22, q14, q15
- FILTER_6TAG_8BITS d1, d5, d7, d17, d19, d21, d23, q14, q15
+ FILTER_6TAG_8BITS d1, d5, d7, d17, d19, d21, d23, q14, q15
- vst1.u8 {d22, d23}, [r2]! //write [0:15] Byte
+ vst1.u8 {d22, d23}, [r2]! //write [0:15] Byte
- vsli.64 d2, d2, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
- FILTER_SINGLE_TAG_8BITS d2, d3, d22, q11, q1
+ vsli.64 d2, d2, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+ FILTER_SINGLE_TAG_8BITS d2, d3, d22, q11, q1
- vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
+ vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
- sub r4, #1
- cmp r4, #0
- bne w17_h_mc_luma_loop
- pop {r4-r5}
+ sub r4, #1
+ cmp r4, #0
+ bne w17_h_mc_luma_loop
+ pop {r4-r5}
WELS_ASM_FUNC_END
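McHorVer20Width17 applies the horizontal-only half-pel filter to a 17-pixel-wide block: the first 16 columns go through the vector FILTER_6TAG_8BITS macro, and the 17th column is produced separately by FILTER_SINGLE_TAG_8BITS. A scalar sketch of the per-sample 6-tap filter, assuming the standard H.264 (+16)>>5 rounding and a source with 2 readable pixels to the left and 3 to the right of each output column; names illustrative:

    #include <stdint.h>

    static inline uint8_t Clip255(int v) {
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* 1-D half-pel 6-tap filter, horizontal direction. */
    static void McHorVer20_ref(const uint8_t* pSrc, int iSrcStride,
                               uint8_t* pDst, int iDstStride,
                               int iWidth, int iHeight) {
        for (int y = 0; y < iHeight; y++) {
            for (int x = 0; x < iWidth; x++) {
                int sum = pSrc[x - 2] + pSrc[x + 3]
                        - 5 * (pSrc[x - 1] + pSrc[x + 2])
                        + 20 * (pSrc[x] + pSrc[x + 1]);
                pDst[x] = Clip255((sum + 16) >> 5);
            }
            pSrc += iSrcStride;
            pDst += iDstStride;
        }
    }

The mov/sub/lsl/ror/vmov sequence at the top only materializes the 64-bit constant 0x0014FFFB00010000: r4 = 20 - (20 << 14) = 0xFFFB0014, rotated by 16 to 0x0014FFFB, and r5 = 1 << 16; packed into d3 these are the signed 16-bit lanes {0, 1, -5, 20} that FILTER_SINGLE_TAG_8BITS uses for the final odd column.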
WELS_ASM_FUNC_BEGIN McHorVer20Width9_neon
- push {r4-r5}
- mov r4, #20
- mov r5, #1
- sub r4, r4, r4, lsl #(16-2)
- lsl r5, #16
- ror r4, #16
- vmov d7, r5, r4 // 0x0014FFFB00010000
+ push {r4-r5}
+ mov r4, #20
+ mov r5, #1
+ sub r4, r4, r4, lsl #(16-2)
+ lsl r5, #16
+ ror r4, #16
+ vmov d7, r5, r4 // 0x0014FFFB00010000
- sub r3, #8
- ldr r4, [sp, #8]
+ sub r3, #8
+ ldr r4, [sp, #8]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w9_h_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 14(9+5); q0=src[-2]
- pld [r0]
+ vld1.u8 {d0,d1}, [r0], r1 //only use 14(9+5); q0=src[-2]
+ pld [r0]
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
- FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d16, q14, q15
+ FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d16, q14, q15
- sub r4, #1
- vst1.u8 {d16}, [r2]! //write [0:7] Byte
+ sub r4, #1
+ vst1.u8 {d16}, [r2]! //write [0:7] Byte
- vsli.64 d2, d1, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
- FILTER_SINGLE_TAG_8BITS d2, d7, d18, q9, q1
- vst1.u8 {d2[0]}, [r2], r3 //write 8th Byte
+ vsli.64 d2, d1, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+ FILTER_SINGLE_TAG_8BITS d2, d7, d18, q9, q1
+ vst1.u8 {d2[0]}, [r2], r3 //write 8th Byte
- cmp r4, #0
- bne w9_h_mc_luma_loop
- pop {r4-r5}
+ cmp r4, #0
+ bne w9_h_mc_luma_loop
+ pop {r4-r5}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer02Height17_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q8}, [r0], r1 //q8=src[2]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q8}, [r0], r1 //q8=src[2]
w17_v_mc_luma_loop:
- vld1.u8 {q9}, [r0], r1 //q9=src[3]
+ vld1.u8 {q9}, [r0], r1 //q9=src[3]
- FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
- FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
+ FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
- FILTER_6TAG_8BITS d4, d6, d16, d18, d0, d2, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d5, d7, d17, d19, d1, d3, d21, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
+ FILTER_6TAG_8BITS d4, d6, d16, d18, d0, d2, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d5, d7, d17, d19, d1, d3, d21, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
- FILTER_6TAG_8BITS d6, d16, d18, d0, d2, d4, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d7, d17, d19, d1, d3, d5, d21, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
+ FILTER_6TAG_8BITS d6, d16, d18, d0, d2, d4, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d7, d17, d19, d1, d3, d5, d21, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
- FILTER_6TAG_8BITS d16, d18, d0, d2, d4, d6, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d17, d19, d1, d3, d5, d7, d21, q14, q15
- vld1.u8 {q8}, [r0], r1 //read 6th row
- vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
+ FILTER_6TAG_8BITS d16, d18, d0, d2, d4, d6, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d17, d19, d1, d3, d5, d7, d21, q14, q15
+ vld1.u8 {q8}, [r0], r1 //read 6th row
+ vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
- FILTER_6TAG_8BITS d18, d0, d2, d4, d6, d16, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d19, d1, d3, d5, d7, d17, d21, q14, q15
- vld1.u8 {q9}, [r0], r1 //read 7th row
- vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
+ FILTER_6TAG_8BITS d18, d0, d2, d4, d6, d16, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d19, d1, d3, d5, d7, d17, d21, q14, q15
+ vld1.u8 {q9}, [r0], r1 //read 7th row
+ vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
- FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
- FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
- vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
+ FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
- //q2, q3, q8, q9, q0 --> q0~q8
- vswp q0, q8
- vswp q0, q2
- vmov q1, q3
- vmov q3, q9 //q0~q8
+ //q2, q3, q8, q9, q0 --> q0~q8
+ vswp q0, q8
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q9 //q0~q8
- sub r4, #8
- cmp r4, #1
- bne w17_v_mc_luma_loop
- // the last 16Bytes
- vld1.u8 {q9}, [r0], r1 //q9=src[3]
- FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
- FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
- vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
+ sub r4, #8
+ cmp r4, #1
+ bne w17_v_mc_luma_loop
+ // the last 16Bytes
+ vld1.u8 {q9}, [r0], r1 //q9=src[3]
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
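McHorVer02Height17 is the vertical-only half-pel case: a sliding window of six source rows is kept in registers, eight output rows are produced per iteration, and the vswp/vmov block rotates the window so only unseen rows are loaded; the odd height of 17 is handled by looping until exactly one row remains and filtering that row after the loop. A scalar sketch of the per-sample vertical filter, under the same rounding assumption as the horizontal sketch above; names illustrative:

    #include <stdint.h>

    static inline uint8_t Clip255v(int v) {
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* 1-D half-pel 6-tap filter, vertical direction. */
    static void McHorVer02_ref(const uint8_t* pSrc, int iSrcStride,
                               uint8_t* pDst, int iDstStride,
                               int iWidth, int iHeight) {
        for (int y = 0; y < iHeight; y++) {
            const uint8_t* p = pSrc + y * iSrcStride;
            for (int x = 0; x < iWidth; x++) {
                int sum = p[x - 2 * iSrcStride] + p[x + 3 * iSrcStride]
                        - 5 * (p[x - iSrcStride] + p[x + 2 * iSrcStride])
                        + 20 * (p[x] + p[x + iSrcStride]);
                pDst[y * iDstStride + x] = Clip255v((sum + 16) >> 5);
            }
        }
    }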
WELS_ASM_FUNC_BEGIN McHorVer02Height9_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
w9_v_mc_luma_loop:
- pld [r0]
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
- pld [r0]
- FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
- pld [r0]
- FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
- pld [r0]
- FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
- sub r4, #4
- cmp r4, #1
- bne w9_v_mc_luma_loop
+ sub r4, #4
+ cmp r4, #1
+ bne w9_v_mc_luma_loop
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
- vst1.u8 {d16}, [r2], r3 //write last 8Byte
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vst1.u8 {d16}, [r2], r3 //write last 8Byte
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer22Width17_neon
- push {r4}
- vpush {q4-q7}
- ldr r4, [sp, #68]
+ push {r4}
+ vpush {q4-q7}
+ ldr r4, [sp, #68]
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0-d2}, [r0], r1 //use 21(17+5), =src[-2]
- vld1.u8 {d3-d5}, [r0], r1 //use 21(17+5), =src[-1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0-d2}, [r0], r1 //use 21(17+5), =src[-2]
+ vld1.u8 {d3-d5}, [r0], r1 //use 21(17+5), =src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d6-d8}, [r0], r1 //use 21(17+5), =src[0]
- vld1.u8 {d9-d11}, [r0], r1 //use 21(17+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {d12-d14}, [r0], r1 //use 21(17+5), =src[2]
- sub r3, #16
+ vld1.u8 {d6-d8}, [r0], r1 //use 21(17+5), =src[0]
+ vld1.u8 {d9-d11}, [r0], r1 //use 21(17+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {d12-d14}, [r0], r1 //use 21(17+5), =src[2]
+ sub r3, #16
w17_hv_mc_luma_loop:
- vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
- vst1.u8 {d0, d1}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
- vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
+ vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
+ vst1.u8 {d0, d1}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
+ vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
- vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
- vst1.u8 {d3, d4}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d5, d22, d23, q11 //output to d5[0]
- vst1.u8 {d5[0]}, [r2], r3 //write 16th Byte
+ vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
+ vst1.u8 {d3, d4}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d5, d22, d23, q11 //output to d5[0]
+ vst1.u8 {d5[0]}, [r2], r3 //write 16th Byte
- vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
- vst1.u8 {d6, d7}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d8, d22, d23, q11 //output to d8[0]
- vst1.u8 {d8[0]}, [r2], r3 //write 16th Byte
+ vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
+ vst1.u8 {d6, d7}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d8, d22, d23, q11 //output to d8[0]
+ vst1.u8 {d8[0]}, [r2], r3 //write 16th Byte
- vld1.u8 {d6-d8}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
- vst1.u8 {d9, d10}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d11, d22, d23, q11 //output to d11[0]
- vst1.u8 {d11[0]}, [r2], r3 //write 16th Byte
+ vld1.u8 {d6-d8}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
+ vst1.u8 {d9, d10}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d11, d22, d23, q11 //output to d11[0]
+ vst1.u8 {d11[0]}, [r2], r3 //write 16th Byte
- //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
- vswp q0, q6
- vswp q6, q3
- vmov q5, q2
- vmov q2, q8
+ //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+ vswp q0, q6
+ vswp q6, q3
+ vmov q5, q2
+ vmov q2, q8
- vmov d20,d8
- vmov q4, q1
- vmov q1, q7
- vmov d14,d20
+ vmov d20,d8
+ vmov q4, q1
+ vmov q1, q7
+ vmov d14,d20
- sub r4, #4
- cmp r4, #1
- bne w17_hv_mc_luma_loop
- //the last row
- vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
- vst1.u8 {q0}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
- vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
+ sub r4, #4
+ cmp r4, #1
+ bne w17_hv_mc_luma_loop
+ //the last row
+ vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
+ vst1.u8 {q0}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
+ vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
- vpop {q4-q7}
- pop {r4}
+ vpop {q4-q7}
+ pop {r4}
WELS_ASM_FUNC_END
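McHorVer22Width17 extends the hv path to a 17-wide block: the 17th column of each row is carried in the third source d register, filtered vertically, reduced by UNPACK_1_IN_8x16BITS_TO_8BITS, and stored as the final byte of the row. The 17-wide/17-tall variants presumably exist because a quarter-pel sample is the rounding average of two neighbouring half-pel or integer samples, so building a 16x16 quarter-pel block from a shifted grid needs one extra column and row of intermediate data. A hypothetical illustration of that final averaging step (not code from this file):

    #include <stdint.h>

    static void QuarterFromHalf_ref(const uint8_t* pHalf, int iStride,
                                    uint8_t* pDst, int iDstStride,
                                    int iWidth, int iHeight) {
        for (int y = 0; y < iHeight; y++) {
            for (int x = 0; x < iWidth; x++)
                pDst[x] = (uint8_t)((pHalf[x] + pHalf[x + 1] + 1) >> 1);
            pHalf += iStride;
            pDst  += iDstStride;
        }
    }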
WELS_ASM_FUNC_BEGIN McHorVer22Width9_neon
- push {r4}
- vpush {q4}
- ldr r4, [sp, #20]
+ push {r4}
+ vpush {q4}
+ ldr r4, [sp, #20]
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //use 14(9+5), =src[-2]
- vld1.u8 {q1}, [r0], r1 //use 14(9+5), =src[-1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //use 14(9+5), =src[-2]
+ vld1.u8 {q1}, [r0], r1 //use 14(9+5), =src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //use 14(9+5), =src[0]
- vld1.u8 {q3}, [r0], r1 //use 14(9+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {q4}, [r0], r1 //use 14(9+5), =src[2]
- sub r3, #8
+ vld1.u8 {q2}, [r0], r1 //use 14(9+5), =src[0]
+ vld1.u8 {q3}, [r0], r1 //use 14(9+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {q4}, [r0], r1 //use 14(9+5), =src[2]
+ sub r3, #8
w9_hv_mc_luma_loop:
- vld1.u8 {q8}, [r0], r1 //use 14(9+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
- vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
+ vld1.u8 {q8}, [r0], r1 //use 14(9+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
+ vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, d0, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
- vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, d0, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
+ vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
- vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
+ vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
- vld1.u8 {q2}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
- vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
+ vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
- //q4~q8, q0~q2, --> q0~q4
- vswp q0, q4
- vswp q2, q4
- vmov q3, q1
- vmov q1, q8
+ //q4~q8, q0~q2, --> q0~q4
+ vswp q0, q4
+ vswp q2, q4
+ vmov q3, q1
+ vmov q1, q8
- sub r4, #4
- cmp r4, #1
- bne w9_hv_mc_luma_loop
- //the last row
- vld1.u8 {q8}, [r0], r1 //use 14(9+5), =src[3]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
- vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
- vpop {q4}
- pop {r4}
+ sub r4, #4
+ cmp r4, #1
+ bne w9_hv_mc_luma_loop
+ //the last row
+ vld1.u8 {q8}, [r0], r1 //use 14(9+5), =src[3]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
+ vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
+ vpop {q4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq16_neon
- push {r4, r5, r6}
- ldr r4, [sp, #12]
- ldr r5, [sp, #16]
- ldr r6, [sp, #20]
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #16]
+ ldr r6, [sp, #20]
enc_w16_pix_avg_loop:
- vld1.u8 {q0}, [r2], r3
- vld1.u8 {q1}, [r4], r5
- vld1.u8 {q2}, [r2], r3
- vld1.u8 {q3}, [r4], r5
+ vld1.u8 {q0}, [r2], r3
+ vld1.u8 {q1}, [r4], r5
+ vld1.u8 {q2}, [r2], r3
+ vld1.u8 {q3}, [r4], r5
- vld1.u8 {q8}, [r2], r3
- vld1.u8 {q9}, [r4], r5
- vld1.u8 {q10}, [r2], r3
- vld1.u8 {q11}, [r4], r5
+ vld1.u8 {q8}, [r2], r3
+ vld1.u8 {q9}, [r4], r5
+ vld1.u8 {q10}, [r2], r3
+ vld1.u8 {q11}, [r4], r5
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {q0}, [r0], r1
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {q0}, [r0], r1
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {q2}, [r0], r1
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {q2}, [r0], r1
- AVERAGE_TWO_8BITS d16, d16, d18
- AVERAGE_TWO_8BITS d17, d17, d19
- vst1.u8 {q8}, [r0], r1
+ AVERAGE_TWO_8BITS d16, d16, d18
+ AVERAGE_TWO_8BITS d17, d17, d19
+ vst1.u8 {q8}, [r0], r1
- AVERAGE_TWO_8BITS d20, d20, d22
- AVERAGE_TWO_8BITS d21, d21, d23
- vst1.u8 {q10}, [r0], r1
+ AVERAGE_TWO_8BITS d20, d20, d22
+ AVERAGE_TWO_8BITS d21, d21, d23
+ vst1.u8 {q10}, [r0], r1
- sub r6, #4
- cmp r6, #0
- bne enc_w16_pix_avg_loop
+ sub r6, #4
+ cmp r6, #0
+ bne enc_w16_pix_avg_loop
- pop {r4, r5, r6}
+ pop {r4, r5, r6}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq8_neon
- push {r4, r5, r6}
- ldr r4, [sp, #12]
- ldr r5, [sp, #16]
- ldr r6, [sp, #20]
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #16]
+ ldr r6, [sp, #20]
enc_w8_pix_avg_loop:
- vld1.u8 {d0}, [r2], r3
- vld1.u8 {d2}, [r4], r5
- vld1.u8 {d1}, [r2], r3
- vld1.u8 {d3}, [r4], r5
+ vld1.u8 {d0}, [r2], r3
+ vld1.u8 {d2}, [r4], r5
+ vld1.u8 {d1}, [r2], r3
+ vld1.u8 {d3}, [r4], r5
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {d0}, [r0], r1
- vst1.u8 {d1}, [r0], r1
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {d0}, [r0], r1
+ vst1.u8 {d1}, [r0], r1
- vld1.u8 {d4}, [r2], r3
- vld1.u8 {d6}, [r4], r5
- vld1.u8 {d5}, [r2], r3
- vld1.u8 {d7}, [r4], r5
+ vld1.u8 {d4}, [r2], r3
+ vld1.u8 {d6}, [r4], r5
+ vld1.u8 {d5}, [r2], r3
+ vld1.u8 {d7}, [r4], r5
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {d4}, [r0], r1
- vst1.u8 {d5}, [r0], r1
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {d4}, [r0], r1
+ vst1.u8 {d5}, [r0], r1
- sub r6, #4
- cmp r6, #0
- bne enc_w8_pix_avg_loop
+ sub r6, #4
+ cmp r6, #0
+ bne enc_w8_pix_avg_loop
- pop {r4, r5, r6}
+ pop {r4, r5, r6}
WELS_ASM_FUNC_END
#endif
--- a/codec/common/arm64/expand_picture_aarch64_neon.S
+++ b/codec/common/arm64/expand_picture_aarch64_neon.S
@@ -53,88 +53,88 @@
sub x8, x8, #1
cbnz x8, _expand_picture_luma_loop2
//for the top and bottom expand
- add x2, x2, #64
- sub x0, x0, #32
+ add x2, x2, #64
+ sub x0, x0, #32
madd x4, x1, x3, x0
sub x4, x4, x1
_expand_picture_luma_loop0:
- mov x5, #32
+ mov x5, #32
msub x5, x5, x1, x0
- add x6, x4, x1
+ add x6, x4, x1
ld1 {v0.16b}, [x0], x10
ld1 {v1.16b}, [x4], x10
- mov x8, #32
+ mov x8, #32
_expand_picture_luma_loop1:
- st1 {v0.16b}, [x5], x1
- st1 {v1.16b}, [x6], x1
- sub x8, x8, #1
+ st1 {v0.16b}, [x5], x1
+ st1 {v1.16b}, [x6], x1
+ sub x8, x8, #1
cbnz x8, _expand_picture_luma_loop1
- sub x2, x2, #16
- cbnz x2, _expand_picture_luma_loop0
+ sub x2, x2, #16
+ cbnz x2, _expand_picture_luma_loop0
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN ExpandPictureChroma_AArch64_neon
- //Save the dst
- mov x7, x0
- mov x8, x3
+ //Save the dst
+ mov x7, x0
+ mov x8, x3
mov x10, #16
- add x4, x7, x2
- sub x4, x4, #1
+ add x4, x7, x2
+ sub x4, x4, #1
//For the left and right expand
_expand_picture_chroma_loop2:
- sub x5, x7, #16
- add x6, x4, #1
+ sub x5, x7, #16
+ add x6, x4, #1
- ld1r {v0.16b}, [x7], x1
- ld1r {v1.16b}, [x4], x1
+ ld1r {v0.16b}, [x7], x1
+ ld1r {v1.16b}, [x4], x1
- st1 {v0.16b}, [x5]
- st1 {v1.16b}, [x6]
- sub x8, x8, #1
- cbnz x8, _expand_picture_chroma_loop2
+ st1 {v0.16b}, [x5]
+ st1 {v1.16b}, [x6]
+ sub x8, x8, #1
+ cbnz x8, _expand_picture_chroma_loop2
- //for the top and bottom expand
- add x2, x2, #32
+ //for the top and bottom expand
+ add x2, x2, #32
//
mov x9, x2
mov x11, #15
bic x2, x2, x11
//
- sub x0, x0, #16
- madd x4, x1, x3, x0
- sub x4, x4, x1
+ sub x0, x0, #16
+ madd x4, x1, x3, x0
+ sub x4, x4, x1
_expand_picture_chroma_loop0:
- mov x5, #16
+ mov x5, #16
msub x5, x5, x1, x0
- add x6, x4, x1
- ld1 {v0.16b}, [x0], x10
- ld1 {v1.16b}, [x4], x10
+ add x6, x4, x1
+ ld1 {v0.16b}, [x0], x10
+ ld1 {v1.16b}, [x4], x10
- mov x8, #16
+ mov x8, #16
_expand_picture_chroma_loop1:
- st1 {v0.16b}, [x5], x1
- st1 {v1.16b}, [x6], x1
- sub x8, x8, #1
+ st1 {v0.16b}, [x5], x1
+ st1 {v1.16b}, [x6], x1
+ sub x8, x8, #1
cbnz x8, _expand_picture_chroma_loop1
- sub x2, x2, #16
- cbnz x2, _expand_picture_chroma_loop0
+ sub x2, x2, #16
+ cbnz x2, _expand_picture_chroma_loop0
and x9, x9, #15
sub x9, x9, #8
cbnz x9, _expand_picture_chroma_end
- mov x5, #16
+ mov x5, #16
msub x5, x5, x1, x0
- add x6, x4, x1
- ld1 {v0.8b}, [x0]
- ld1 {v1.8b}, [x4]
+ add x6, x4, x1
+ ld1 {v0.8b}, [x0]
+ ld1 {v1.8b}, [x4]
- mov x8, #16
+ mov x8, #16
_expand_picture_chroma_loop3:
- st1 {v0.8b}, [x5], x1
- st1 {v1.8b}, [x6], x1
- sub x8, x8, #1
+ st1 {v0.8b}, [x5], x1
+ st1 {v1.8b}, [x6], x1
+ sub x8, x8, #1
cbnz x8, _expand_picture_chroma_loop3
_expand_picture_chroma_end:
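The AArch64 expand routines pad a reconstructed picture outwards so later motion compensation can read beyond the frame edges: ld1r broadcasts each row's first and last pixel into the left/right margins, then the completed top and bottom rows are copied upwards and downwards (32 pixels of padding for luma, 16 for chroma in this code). A scalar sketch, assuming the buffer has iPadding pixels of margin on every side; names illustrative:

    #include <stdint.h>
    #include <string.h>

    static void ExpandPicture_ref(uint8_t* pPic, int iStride,
                                  int iWidth, int iHeight, int iPadding) {
        /* left/right: replicate the first and last pixel of every row */
        for (int y = 0; y < iHeight; y++) {
            uint8_t* pRow = pPic + y * iStride;
            memset(pRow - iPadding, pRow[0], (size_t)iPadding);
            memset(pRow + iWidth, pRow[iWidth - 1], (size_t)iPadding);
        }
        /* top/bottom: replicate the (already padded) first and last rows */
        uint8_t* pTop    = pPic - iPadding;
        uint8_t* pBottom = pPic + (iHeight - 1) * iStride - iPadding;
        for (int y = 1; y <= iPadding; y++) {
            memcpy(pTop - y * iStride, pTop, (size_t)(iWidth + 2 * iPadding));
            memcpy(pBottom + y * iStride, pBottom, (size_t)(iWidth + 2 * iPadding));
        }
    }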
--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -39,31 +39,31 @@
#ifdef __APPLE__
.macro FILTER_6TAG_8BITS1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
- uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
+ uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun $6.8b, v18.8h, #5
-// }
+// }
.endm
.macro FILTER_6TAG_8BITS2
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
+ uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun2 $6.16b, v18.8h, #5
-// }
+// }
.endm
.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
- uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
+ uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -70,13 +70,13 @@
sqrshrun $6.8b, v18.8h, #5
uaddl v19.8h, $2.8b, $6.8b
rshrn $6.8b, v19.8h, #1
-// }
+// }
.endm
.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
+ uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -83,13 +83,13 @@
sqrshrun2 $6.16b, v18.8h, #5
uaddl2 v19.8h, $2.16b, $6.16b
rshrn2 $6.16b, v19.8h, #1
-// }
+// }
.endm
.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
- uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
+ uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -96,13 +96,13 @@
sqrshrun $6.8b, v18.8h, #5
uaddl v19.8h, $3.8b, $6.8b
rshrn $6.8b, v19.8h, #1
-// }
+// }
.endm
.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
+ uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -109,134 +109,134 @@
sqrshrun2 $6.16b, v18.8h, #5
uaddl2 v19.8h, $3.16b, $6.16b
rshrn2 $6.16b, v19.8h, #1
-// }
+// }
.endm
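The _AVERAGE_WITH_0 and _AVERAGE_WITH_1 variants append a rounding average of the half-pel result with the nearest integer sample (src[0] or src[1] respectively), which is how the quarter-pel positions are formed. In scalar terms (sketch, same assumptions as the snippet above):

/* uaddl followed by rshrn #1 is a rounding halving add */
static uint8_t quarter_pel(uint8_t half_pel, uint8_t full_pel) {
    return (uint8_t)((half_pel + full_pel + 1) >> 1);
}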
.macro FILTER_6TAG_8BITS_TO_16BITS1
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
- uaddl $6.8h, $0.8b, $5.8b //dst_q=src[-2]+src[3]
- uaddl v31.8h, $2.8b, $3.8b //src[0]+src[1]
- mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
- uaddl v31.8h, $1.8b, $4.8b //src[-1]+src[2]
- mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+ uaddl $6.8h, $0.8b, $5.8b //dst_q=src[-2]+src[3]
+ uaddl v31.8h, $2.8b, $3.8b //src[0]+src[1]
+ mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
+ uaddl v31.8h, $1.8b, $4.8b //src[-1]+src[2]
+ mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
.endm
.macro FILTER_6TAG_8BITS_TO_16BITS2
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
- uaddl2 $6.8h, $0.16b, $5.16b //dst_q=src[-2]+src[3]
- uaddl2 v31.8h, $2.16b, $3.16b //src[0]+src[1]
- mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
- uaddl2 v31.8h, $1.16b, $4.16b //src[-1]+src[2]
- mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+ uaddl2 $6.8h, $0.16b, $5.16b //dst_q=src[-2]+src[3]
+ uaddl2 v31.8h, $2.16b, $3.16b //src[0]+src[1]
+ mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
+ uaddl2 v31.8h, $1.16b, $4.16b //src[-1]+src[2]
+ mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
.endm
.macro FILTER_3_IN_16BITS_TO_8BITS1
-// { // input:a, b, c, dst_d;
- sub $0.8h, $0.8h, $1.8h //a-b
- sshr $0.8h, $0.8h, #2 //(a-b)/4
- sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
- add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
- sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
- add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- sqrshrun $3.8b, $0.8h, #6 //(+32)>>6
-// }
+// { // input:a, b, c, dst_d;
+ sub $0.8h, $0.8h, $1.8h //a-b
+ sshr $0.8h, $0.8h, #2 //(a-b)/4
+ sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
+ add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
+ sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
+ add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ sqrshrun $3.8b, $0.8h, #6 //(+32)>>6
+// }
.endm
.macro FILTER_3_IN_16BITS_TO_8BITS2
-// { // input:a, b, c, dst_d;
- sub $0.8h, $0.8h, $1.8h //a-b
- sshr $0.8h, $0.8h, #2 //(a-b)/4
- sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
- add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
- sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
- add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- sqrshrun2 $3.16b, $0.8h, #6 //(+32)>>6
-// }
+// { // input:a, b, c, dst_d;
+ sub $0.8h, $0.8h, $1.8h //a-b
+ sshr $0.8h, $0.8h, #2 //(a-b)/4
+ sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
+ add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
+ sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
+ add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ sqrshrun2 $3.16b, $0.8h, #6 //(+32)>>6
+// }
.endm
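FILTER_6TAG_8BITS_TO_16BITS keeps the first filtering pass at 16-bit precision, and FILTER_3_IN_16BITS_TO_8BITS then reduces the repacked pair sums a = src[-2]+src[3], b = src[-1]+src[2], c = src[0]+src[1] with shifts and adds only, relying on ((a-b)/4 - b + c)/4 + c = (a - 5*b + 20*c)/16 (exact up to the truncation of the arithmetic shifts); the trailing sqrshrun #6 supplies the remaining rounding and narrows to bytes. A scalar sketch of that decomposition (illustrative only; an arithmetic right shift is assumed, as on the target):

#include <stdint.h>

/* builds roughly (a - 5*b + 20*c)/16 from two shift-by-2 steps, mirroring the macro */
static int16_t second_pass(int16_t a, int16_t b, int16_t c) {
    int16_t t = (int16_t)((a - b) >> 2);        /* (a-b)/4              */
    t = (int16_t)((t - b + c) >> 2);            /* ((a-b)/4 - b + c)/4  */
    return (int16_t)(t + c);                    /* ~ (a - 5b + 20c)/16  */
}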
.macro UNPACK_2_16BITS_TO_ABC
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
- ext $4.16b, $0.16b, $1.16b, #4 //src[0]
- ext $3.16b, $0.16b, $1.16b, #6 //src[1]
- add $4.8h, $4.8h, $3.8h //c=src[0]+src[1]
+// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+ ext $4.16b, $0.16b, $1.16b, #4 //src[0]
+ ext $3.16b, $0.16b, $1.16b, #6 //src[1]
+ add $4.8h, $4.8h, $3.8h //c=src[0]+src[1]
- ext $3.16b, $0.16b, $1.16b, #2 //src[-1]
- ext $2.16b, $0.16b, $1.16b, #8 //src[2]
- add $3.8h, $3.8h, $2.8h //b=src[-1]+src[2]
+ ext $3.16b, $0.16b, $1.16b, #2 //src[-1]
+ ext $2.16b, $0.16b, $1.16b, #8 //src[2]
+ add $3.8h, $3.8h, $2.8h //b=src[-1]+src[2]
- ext $2.16b, $0.16b, $1.16b, #10 //src[3]
- add $2.8h, $2.8h, $0.8h //a=src[-2]+src[3]
-// }
+ ext $2.16b, $0.16b, $1.16b, #10 //src[3]
+ add $2.8h, $2.8h, $0.8h //a=src[-2]+src[3]
+// }
.endm
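UNPACK_2_16BITS_TO_ABC slides a window over two adjacent vectors of 16-bit first-pass results with ext, so that for each output column the three pair sums line up: a from offsets 0 and 5, b from 1 and 4, c from 2 and 3. A scalar sketch of the same gathering step (illustrative only; t[0] corresponds to the column at src[-2]):

#include <stdint.h>

static void unpack_abc(const int16_t *t, int16_t *a, int16_t *b, int16_t *c) {
    for (int i = 0; i < 8; ++i) {
        a[i] = (int16_t)(t[i]     + t[i + 5]);  /* src[-2] + src[3] */
        b[i] = (int16_t)(t[i + 1] + t[i + 4]);  /* src[-1] + src[2] */
        c[i] = (int16_t)(t[i + 2] + t[i + 3]);  /* src[0]  + src[1] */
    }
}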
.macro AVERAGE_TWO_8BITS1
-// { // input:dst_d, src_d A and B; working: v5
- uaddl v30.8h, $2.8b, $1.8b
- rshrn $0.8b, v30.8h, #1
-// }
+// { // input:dst_d, src_d A and B; working: v5
+ uaddl v30.8h, $2.8b, $1.8b
+ rshrn $0.8b, v30.8h, #1
+// }
.endm
.macro AVERAGE_TWO_8BITS2
-// { // input:dst_d, src_d A and B; working: v5
- uaddl2 v30.8h, $2.16b, $1.16b
- rshrn2 $0.16b, v30.8h, #1
-// }
+// { // input:dst_d, src_d A and B; working: v5
+ uaddl2 v30.8h, $2.16b, $1.16b
+ rshrn2 $0.16b, v30.8h, #1
+// }
.endm
-.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
-// { // input: src_d{Y[0][1][2][3][4][5]X},
- rev64 $2.8b, $0.8b // X[5][4][3][2][1][0]O
- uaddl $2.8h, $0.8b, $2.8b // each 16bits, *[50][41][32][23][14][05]*
- mul $2.4h, $2.4h, $1.4h // 0+1*[50]-5*[41]+20[32]
+.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
+// { // input: src_d{Y[0][1][2][3][4][5]X},
+ rev64 $2.8b, $0.8b // X[5][4][3][2][1][0]O
+ uaddl $2.8h, $0.8b, $2.8b // each 16bits, *[50][41][32][23][14][05]*
+ mul $2.4h, $2.4h, $1.4h // 0+1*[50]-5*[41]+20[32]
addv $3, $2.4h
sqrshrun $0.8b, $0.8h, #5
-// }
+// }
.endm
.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
-// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
+// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
ext.16b $3, $1, $1, #14 // X[0][1][2][3][4][5]O
ext.16b $4, $3, $3, #8 // [3][4][5]OX[0][1][2]
- rev64 $4.8h, $4.8h // X[5][4][3][2][1][0]O
+ rev64 $4.8h, $4.8h // X[5][4][3][2][1][0]O
add $3.8h, $3.8h, $4.8h // each 16bits, *[50][41][32][23][14][05]*
- smull $3.4s, $3.4h, $2.4h // 0+1*[50]-5*[41]+20[32]
+ smull $3.4s, $3.4h, $2.4h // 0+1*[50]-5*[41]+20[32]
saddlv $5, $3.4s
//sshr $0.2d, $0.2d, #4
sqrshrun $0.2s, $0.2d, #10
uqxtn $0.4h, $0.4s
uqxtn $0.8b, $0.8h
- // }
+ // }
.endm
#else
.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
- uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
+ uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun \arg6\().8b, v18.8h, #5
-// }
+// }
.endm
.macro FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
+ uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun2 \arg6\().16b, v18.8h, #5
-// }
+// }
.endm
.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
- uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
+ uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -243,13 +243,13 @@
sqrshrun \arg6\().8b, v18.8h, #5
uaddl v19.8h, \arg2\().8b, \arg6\().8b
rshrn \arg6\().8b, v19.8h, #1
-// }
+// }
.endm
.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
+ uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -256,13 +256,13 @@
sqrshrun2 \arg6\().16b, v18.8h, #5
uaddl2 v19.8h, \arg2\().16b, \arg6\().16b
rshrn2 \arg6\().16b, v19.8h, #1
-// }
+// }
.endm
.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
- uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
+ uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -269,13 +269,13 @@
sqrshrun \arg6\().8b, v18.8h, #5
uaddl v19.8h, \arg3\().8b, \arg6\().8b
rshrn \arg6\().8b, v19.8h, #1
-// }
+// }
.endm
.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
+ uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -282,106 +282,106 @@
sqrshrun2 \arg6\().16b, v18.8h, #5
uaddl2 v19.8h, \arg3\().16b, \arg6\().16b
rshrn2 \arg6\().16b, v19.8h, #1
-// }
+// }
.endm
.macro FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
- uaddl \arg6\().8h, \arg0\().8b, \arg5\().8b //dst_q=src[-2]+src[3]
- uaddl v31.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
- mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles
- uaddl v31.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
- mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+ uaddl \arg6\().8h, \arg0\().8b, \arg5\().8b //dst_q=src[-2]+src[3]
+ uaddl v31.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
+ mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles
+ uaddl v31.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
+ mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
.endm
.macro FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
- uaddl2 \arg6\().8h, \arg0\().16b, \arg5\().16b //dst_q=src[-2]+src[3]
- uaddl2 v31.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
- mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles
- uaddl2 v31.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
- mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+ uaddl2 \arg6\().8h, \arg0\().16b, \arg5\().16b //dst_q=src[-2]+src[3]
+ uaddl2 v31.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
+ mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles
+ uaddl2 v31.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
+ mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
.endm
.macro FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3
-// { // input:a, b, c, dst_d;
- sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b
- sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
- sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b
- add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c
- sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4
- add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- sqrshrun \arg3\().8b, \arg0\().8h, #6 //(+32)>>6
-// }
+// { // input:a, b, c, dst_d;
+ sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b
+ sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
+ sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b
+ add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c
+ sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4
+ add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ sqrshrun \arg3\().8b, \arg0\().8h, #6 //(+32)>>6
+// }
.endm
.macro FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3
-// { // input:a, b, c, dst_d;
- sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b
- sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
- sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b
- add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c
- sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4
- add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- sqrshrun2 \arg3\().16b, \arg0\().8h, #6 //(+32)>>6
-// }
+// { // input:a, b, c, dst_d;
+ sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b
+ sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
+ sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b
+ add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c
+ sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4
+ add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ sqrshrun2 \arg3\().16b, \arg0\().8h, #6 //(+32)>>6
+// }
.endm
.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
- ext \arg4\().16b, \arg0\().16b, \arg1\().16b, #4 //src[0]
- ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #6 //src[1]
- add \arg4\().8h, \arg4\().8h, \arg3\().8h //c=src[0]+src[1]
+// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+ ext \arg4\().16b, \arg0\().16b, \arg1\().16b, #4 //src[0]
+ ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #6 //src[1]
+ add \arg4\().8h, \arg4\().8h, \arg3\().8h //c=src[0]+src[1]
- ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #2 //src[-1]
- ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #8 //src[2]
- add \arg3\().8h, \arg3\().8h, \arg2\().8h //b=src[-1]+src[2]
+ ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #2 //src[-1]
+ ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #8 //src[2]
+ add \arg3\().8h, \arg3\().8h, \arg2\().8h //b=src[-1]+src[2]
- ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #10 //src[3]
- add \arg2\().8h, \arg2\().8h, \arg0\().8h //a=src[-2]+src[3]
-// }
+ ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #10 //src[3]
+ add \arg2\().8h, \arg2\().8h, \arg0\().8h //a=src[-2]+src[3]
+// }
.endm
.macro AVERAGE_TWO_8BITS1 arg0, arg1, arg2
-// { // input:dst_d, src_d A and B; working: v5
- uaddl v30.8h, \arg2\().8b, \arg1\().8b
- rshrn \arg0\().8b, v30.8h, #1
-// }
+// { // input:dst_d, src_d A and B; working: v5
+ uaddl v30.8h, \arg2\().8b, \arg1\().8b
+ rshrn \arg0\().8b, v30.8h, #1
+// }
.endm
.macro AVERAGE_TWO_8BITS2 arg0, arg1, arg2
-// { // input:dst_d, src_d A and B; working: v5
- uaddl2 v30.8h, \arg2\().16b, \arg1\().16b
- rshrn2 \arg0\().16b, v30.8h, #1
-// }
+// { // input:dst_d, src_d A and B; working: v5
+ uaddl2 v30.8h, \arg2\().16b, \arg1\().16b
+ rshrn2 \arg0\().16b, v30.8h, #1
+// }
.endm
.macro FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3
// when width=17/9, used
-// { // input: src_d{Y[0][1][2][3][4][5]X},
- rev64 \arg2\().8b, \arg0\().8b // X[5][4][3][2][1][0]O
- uaddl \arg2\().8h, \arg0\().8b, \arg2\().8b // each 16bits, *[50][41][32][23][14][05]*
- mul \arg2\().4h, \arg2\().4h, \arg1\().4h // 0+1*[50]-5*[41]+20[32]
+// { // input: src_d{Y[0][1][2][3][4][5]X},
+ rev64 \arg2\().8b, \arg0\().8b // X[5][4][3][2][1][0]O
+ uaddl \arg2\().8h, \arg0\().8b, \arg2\().8b // each 16bits, *[50][41][32][23][14][05]*
+ mul \arg2\().4h, \arg2\().4h, \arg1\().4h // 0+1*[50]-5*[41]+20[32]
addv \arg3, \arg2\().4h
sqrshrun \arg0\().8b, \arg0\().8h, #5
-// }
+// }
.endm
.macro UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5
-// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
+// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
ext \arg3\().16b, \arg1\().16b, \arg1\().16b, #14 // X[0][1][2][3][4][5]O
ext \arg4\().16b, \arg3\().16b, \arg3\().16b, #8 // [3][4][5]OX[0][1][2]
- rev64 \arg4\().8h, \arg4\().8h // X[5][4][3][2][1][0]O
+ rev64 \arg4\().8h, \arg4\().8h // X[5][4][3][2][1][0]O
add \arg3\().8h, \arg3\().8h, \arg4\().8h // each 16bits, *[50][41][32][23][14][05]*
- smull \arg3\().4s, \arg3\().4h, \arg2\().4h // 0+1*[50]-5*[41]+20[32]
+ smull \arg3\().4s, \arg3\().4h, \arg2\().4h // 0+1*[50]-5*[41]+20[32]
saddlv \arg5, \arg3\().4s
//sshr \arg0\().2d, \arg0\().2d, #4
sqrshrun \arg0\().2s, \arg0\().2d, #10
uqxtn \arg0\().4h, \arg0\().4s
uqxtn \arg0\().8b, \arg0\().8h
- // }
+ // }
.endm
#endif
@@ -405,7 +405,7 @@
sub x4, x4, #1
st1 {v20.16b}, [x2], x3 //write 16Byte
- cbnz x4, w16_h_mc_luma_loop
+ cbnz x4, w16_h_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq8_AArch64_neon
@@ -426,7 +426,7 @@
sub x4, x4, #1
st1 {v20.8b}, [x2], x3 //write 8Byte
- cbnz x4, w8_h_mc_luma_loop
+ cbnz x4, w8_h_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq4_AArch64_neon
@@ -461,7 +461,7 @@
st1 {v20.s}[0], [x2], x3 //write 4Byte
st1 {v20.s}[1], [x2], x3 //write 4Byte
sub x4, x4, #1
- cbnz x4, w4_h_mc_luma_loop
+ cbnz x4, w4_h_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq16_AArch64_neon
@@ -483,7 +483,7 @@
sub x4, x4, #1
st1 {v20.16b}, [x2], x3 //write 16Byte
- cbnz x4, w16_xy_10_mc_luma_loop
+ cbnz x4, w16_xy_10_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -505,7 +505,7 @@
sub x4, x4, #1
st1 {v20.8b}, [x2], x3 //write 8Byte
- cbnz x4, w8_xy_10_mc_luma_loop
+ cbnz x4, w8_xy_10_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq4_AArch64_neon
@@ -540,7 +540,7 @@
st1 {v20.s}[0], [x2], x3 //write 4Byte
st1 {v20.s}[1], [x2], x3 //write 4Byte
sub x4, x4, #1
- cbnz x4, w4_xy_10_mc_luma_loop
+ cbnz x4, w4_xy_10_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -563,7 +563,7 @@
sub x4, x4, #1
st1 {v20.16b}, [x2], x3 //write 16Byte
- cbnz x4, w16_xy_30_mc_luma_loop
+ cbnz x4, w16_xy_30_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -585,7 +585,7 @@
sub x4, x4, #1
st1 {v20.8b}, [x2], x3 //write 8Byte
- cbnz x4, w8_xy_30_mc_luma_loop
+ cbnz x4, w8_xy_30_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McHorVer30WidthEq4_AArch64_neon
@@ -620,7 +620,7 @@
st1 {v20.s}[0], [x2], x3 //write 4Byte
st1 {v20.s}[1], [x2], x3 //write 4Byte
sub x4, x4, #1
- cbnz x4, w4_xy_30_mc_luma_loop
+ cbnz x4, w4_xy_30_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -703,7 +703,7 @@
mov.16b v4, v6
mov.16b v6, v7
sub x4, x4, #8
- cbnz x4, w16_xy_01_mc_luma_loop
+ cbnz x4, w16_xy_01_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -753,7 +753,7 @@
mov.16b v6, v4
mov.16b v4, v7
sub x4, x4, #4
- cbnz x4, w8_xy_01_mc_luma_loop
+ cbnz x4, w8_xy_01_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -809,7 +809,7 @@
mov.8b v5, v21
sub x4, x4, #4
- cbnz x4, w4_xy_01_mc_luma_loop
+ cbnz x4, w4_xy_01_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -892,7 +892,7 @@
mov.16b v4, v6
mov.16b v6, v7
sub x4, x4, #8
- cbnz x4, w16_xy_03_mc_luma_loop
+ cbnz x4, w16_xy_03_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -942,7 +942,7 @@
mov.16b v6, v4
mov.16b v4, v7
sub x4, x4, #4
- cbnz x4, w8_xy_03_mc_luma_loop
+ cbnz x4, w8_xy_03_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -998,7 +998,7 @@
mov.8b v5, v21
sub x4, x4, #4
- cbnz x4, w4_xy_03_mc_luma_loop
+ cbnz x4, w4_xy_03_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -1081,7 +1081,7 @@
mov.16b v4, v6
mov.16b v6, v7
sub x4, x4, #8
- cbnz x4, w16_xy_02_mc_luma_loop
+ cbnz x4, w16_xy_02_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -1131,7 +1131,7 @@
mov.16b v6, v4
mov.16b v4, v7
sub x4, x4, #4
- cbnz x4, w8_xy_02_mc_luma_loop
+ cbnz x4, w8_xy_02_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -1187,7 +1187,7 @@
mov.8b v5, v21
sub x4, x4, #4
- cbnz x4, w4_xy_02_mc_luma_loop
+ cbnz x4, w4_xy_02_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -1220,12 +1220,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 0 line
//prfm pldl1strm, [x0, x1]
@@ -1234,12 +1234,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 1 line
//prfm pldl1strm, [x0, x1]
@@ -1248,12 +1248,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 2 line
//prfm pldl1strm, [x0, x1]
@@ -1262,12 +1262,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 3 line
//prfm pldl1strm, [x0, x1]
@@ -1276,12 +1276,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 4 line
//prfm pldl1strm, [x0, x1]
@@ -1290,12 +1290,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 5 line
//prfm pldl1strm, [x0, x1]
@@ -1304,12 +1304,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 6 line
//prfm pldl1strm, [x0, x1]
@@ -1318,12 +1318,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 7 line
mov.16b v5, v11
@@ -1348,7 +1348,7 @@
mov.16b v16, v30
sub x4, x4, #8
- cbnz x4, w16_hv_mc_luma_loop
+ cbnz x4, w16_hv_mc_luma_loop
ldp d14, d15, [sp], #16
ldp d12, d13, [sp], #16
@@ -1381,8 +1381,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x3 //write 8Byte : 0 line
//prfm pldl1strm, [x0, x1]
@@ -1391,8 +1391,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x3 //write 8Byte : 1 line
//prfm pldl1strm, [x0, x1]
@@ -1401,8 +1401,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x3 //write 8Byte : 2 line
//prfm pldl1strm, [x0, x1]
@@ -1411,8 +1411,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x3 //write 8Byte : 3 line
@@ -1424,7 +1424,7 @@
mov.16b v4, v30
sub x4, x4, #4
- cbnz x4, w8_hv_mc_luma_loop
+ cbnz x4, w8_hv_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -1458,12 +1458,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v22, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v23, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26
- UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30
+ UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26
+ UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30
zip1 v24.2d, v24.2d, v28.2d
zip1 v25.2d, v25.2d, v29.2d
zip1 v26.2d, v26.2d, v30.2d
- FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
+ FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
st1 {v27.s}[0], [x2], x3 //write 4Byte : 0 line
st1 {v27.s}[1], [x2], x3 //write 4Byte : 1 line
@@ -1478,12 +1478,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v22, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v23, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26
- UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30
+ UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26
+ UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30
zip1 v24.2d, v24.2d, v28.2d
zip1 v25.2d, v25.2d, v29.2d
zip1 v26.2d, v26.2d, v30.2d
- FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
+ FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
st1 {v27.s}[0], [x2], x3 //write 4Byte : 2 line
st1 {v27.s}[1], [x2], x3 //write 4Byte : 3 line
@@ -1495,7 +1495,7 @@
mov.16b v4, v30
sub x4, x4, #4
- cbnz x4, w4_hv_mc_luma_loop
+ cbnz x4, w4_hv_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq16_AArch64_neon
@@ -1509,7 +1509,7 @@
st1 {v1.16b}, [x2], x3 //write 16Byte : 1 line
sub x4, x4, #2
- cbnz x4, w16_copy_loop
+ cbnz x4, w16_copy_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq8_AArch64_neon
@@ -1523,7 +1523,7 @@
st1 {v1.8b}, [x2], x3 //write 16Byte : 1 line
sub x4, x4, #2
- cbnz x4, w8_copy_loop
+ cbnz x4, w8_copy_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq4_AArch64_neon
@@ -1537,7 +1537,7 @@
st1 {v1.s}[0], [x2], x3 //write 16Byte : 1 line
sub x4, x4, #2
- cbnz x4, w4_copy_loop
+ cbnz x4, w4_copy_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN PixStrideAvgWidthEq16_AArch64_neon
@@ -1570,7 +1570,7 @@
st1 {v16.16b}, [x0], x1 //write 16Byte : 3 line
sub x6, x6, #4
- cbnz x6, enc_w16_pix_avg_loop
+ cbnz x6, enc_w16_pix_avg_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN PixStrideAvgWidthEq8_AArch64_neon
@@ -1607,7 +1607,7 @@
st1 {v16.8b}, [x0], x1 //write 8Byte : 3 line
sub x6, x6, #4
- cbnz x6, enc_w8_pix_avg_loop
+ cbnz x6, enc_w8_pix_avg_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq16_AArch64_neon
@@ -1649,7 +1649,7 @@
st1 {v16.16b}, [x0], x1 //write 16Byte : 3 line
sub x6, x6, #4
- cbnz x6, w16_pix_avg_loop
+ cbnz x6, w16_pix_avg_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq8_AArch64_neon
@@ -1686,7 +1686,7 @@
st1 {v16.8b}, [x0], x1 //write 8Byte : 3 line
sub x6, x6, #4
- cbnz x6, w8_pix_avg_loop
+ cbnz x6, w8_pix_avg_loop
WELS_ASM_ARCH64_FUNC_END
@@ -1707,7 +1707,7 @@
st1 {v2.s}[1], [x0], x1 //write 4Byte : 1 line
sub x6, x6, #2
- cbnz x6, w4_pix_avg_loop
+ cbnz x6, w4_pix_avg_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon
@@ -1738,7 +1738,7 @@
mov.16b v0, v18
mov.16b v1, v19
sub x5, x5, #2
- cbnz x5, w8_mc_chroma_loop
+ cbnz x5, w8_mc_chroma_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon
@@ -1767,7 +1767,7 @@
mov.8b v0, v18
mov.8b v1, v19
sub x5, x5, #2
- cbnz x5, w4_mc_chroma_loop
+ cbnz x5, w4_mc_chroma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -1793,11 +1793,11 @@
st1 {v20.16b}, [x2], x5 //write 16Byte
ext.8b v21, v4, v4, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
- FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
- st1 {v21.b}[0], [x2], x3 //write 16th Byte
+ FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
+ st1 {v21.b}[0], [x2], x3 //write 16th Byte
sub x4, x4, #1
- cbnz x4, w17_h_mc_luma_loop
+ cbnz x4, w17_h_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20Width9_AArch64_neon
@@ -1821,11 +1821,11 @@
st1 {v20.8b}, [x2], x5 //write 8Byte
ext.8b v21, v3, v3, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
- FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
- st1 {v21.b}[0], [x2], x3 //write 9th Byte
+ FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
+ st1 {v21.b}[0], [x2], x3 //write 9th Byte
sub x4, x4, #1
- cbnz x4, w9_h_mc_luma_loop
+ cbnz x4, w9_h_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -1863,12 +1863,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15 Byte : 0 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 0 line
@@ -1879,12 +1879,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15Byte : 1 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 1 line
@@ -1895,12 +1895,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15Byte : 2 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 2 line
@@ -1911,12 +1911,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15Byte : 3 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 3 line
@@ -1927,12 +1927,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15Byte : 4 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 4 line
@@ -1943,12 +1943,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15Byte : 5 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 5 line
@@ -1959,12 +1959,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15Byte : 6 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 6 line
@@ -1975,12 +1975,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15Byte : 7 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 7 line
@@ -2007,7 +2007,7 @@
mov.16b v16, v30
sub x4, x4, #8
- cbnz x4, w17_hv_mc_luma_loop
+ cbnz x4, w17_hv_mc_luma_loop
//prfm pldl1strm, [x0, x1]
ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17=src[3*stride]
@@ -2015,12 +2015,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15 Byte : 0 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 0 line
@@ -2061,8 +2061,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x5 //write 0:7Byte : 0 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 8th Byte : 0 line
@@ -2073,8 +2073,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x5 //write 0:7Byte : 1 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 8th Byte : 1 line
@@ -2085,8 +2085,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x5 //write 0:7Byte : 2 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 8th Byte : 2 line
@@ -2097,8 +2097,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x5 //write 0:7Byte : 3 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 8th Byte : 3 line
@@ -2112,7 +2112,7 @@
mov.16b v4, v30
sub x4, x4, #4
- cbnz x4, w9_hv_mc_luma_loop
+ cbnz x4, w9_hv_mc_luma_loop
//prfm pldl1strm, [x0, x1]
ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
@@ -2120,8 +2120,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x5 //write 0:7Byte : 0 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 8th Byte : 0 line
@@ -2207,7 +2207,7 @@
mov.16b v4, v6
mov.16b v6, v7
sub x4, x4, #8
- cbnz x4, w17_v_mc_luma_loop
+ cbnz x4, w17_v_mc_luma_loop
//prfm pldl1strm, [x0, x1]
ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
@@ -2262,7 +2262,7 @@
mov.16b v6, v4
mov.16b v4, v7
sub x4, x4, #4
- cbnz x4, w9_v_mc_luma_loop
+ cbnz x4, w9_v_mc_luma_loop
//prfm pldl1strm, [x0, x1]
ld1 {v7.8b}, [x0], x1 // v7=src[3*stride]
--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -44,15 +44,15 @@
;***********************************************************************
%if 1
- %define MOVDQ movdqa
+ %define MOVDQ movdqa
%else
- %define MOVDQ movdqu
+ %define MOVDQ movdqu
%endif
%if 1
- %define WELSEMMS emms
+ %define WELSEMMS emms
%else
- %define WELSEMMS
+ %define WELSEMMS
%endif
@@ -220,7 +220,7 @@
%macro LOAD_1_PARA 0
%ifdef X86_32
- mov r0, [esp + push_num*4 + 4]
+ mov r0, [esp + push_num*4 + 4]
%endif
%endmacro
@@ -234,8 +234,8 @@
%macro LOAD_3_PARA 0
%ifdef X86_32
mov r0, [esp + push_num*4 + 4]
- mov r1, [esp + push_num*4 + 8]
- mov r2, [esp + push_num*4 + 12]
+ mov r1, [esp + push_num*4 + 8]
+ mov r2, [esp + push_num*4 + 12]
%endif
%endmacro
@@ -267,7 +267,7 @@
%macro LOAD_6_PARA 0
%ifdef X86_32
- push r3
+ push r3
push r4
push r5
%assign push_num push_num+3
@@ -310,7 +310,7 @@
%macro LOAD_4_PARA_POP 0
%ifdef X86_32
- pop r3
+ pop r3
%endif
%endmacro
@@ -317,7 +317,7 @@
%macro LOAD_5_PARA_POP 0
%ifdef X86_32
pop r4
- pop r3
+ pop r3
%endif
%endmacro
@@ -324,8 +324,8 @@
%macro LOAD_6_PARA_POP 0
%ifdef X86_32
pop r5
- pop r4
- pop r3
+ pop r4
+ pop r3
%endif
%endmacro
@@ -416,13 +416,13 @@
%macro SIGN_EXTENSION 2
%ifndef X86_32
- movsxd %1, %2
+ movsxd %1, %2
%endif
%endmacro
%macro SIGN_EXTENSIONW 2
%ifndef X86_32
- movsx %1, %2
+ movsx %1, %2
%endif
%endmacro
@@ -438,13 +438,13 @@
%endmacro
%macro WELS_AbsW 2
- pxor %2, %2
+ pxor %2, %2
psubw %2, %1
pmaxsw %1, %2
%endmacro
%macro MMX_XSwap 4
- movq %4, %2
+ movq %4, %2
punpckh%1 %4, %3
punpckl%1 %2, %3
%endmacro
@@ -485,35 +485,35 @@
;in: m1, m2, m3, m4, m5, m6, m7, m8
;pOut: m5, m3, m4, m8, m6, m2, m7, m1
%macro SSE2_TransTwo8x8B 9
- movdqa %9, %8
- SSE2_XSawp bw, %1, %2, %8
- SSE2_XSawp bw, %3, %4, %2
- SSE2_XSawp bw, %5, %6, %4
- movdqa %6, %9
- movdqa %9, %4
- SSE2_XSawp bw, %7, %6, %4
+ movdqa %9, %8
+ SSE2_XSawp bw, %1, %2, %8
+ SSE2_XSawp bw, %3, %4, %2
+ SSE2_XSawp bw, %5, %6, %4
+ movdqa %6, %9
+ movdqa %9, %4
+ SSE2_XSawp bw, %7, %6, %4
- SSE2_XSawp wd, %1, %3, %6
- SSE2_XSawp wd, %8, %2, %3
- SSE2_XSawp wd, %5, %7, %2
- movdqa %7, %9
- movdqa %9, %3
- SSE2_XSawp wd, %7, %4, %3
+ SSE2_XSawp wd, %1, %3, %6
+ SSE2_XSawp wd, %8, %2, %3
+ SSE2_XSawp wd, %5, %7, %2
+ movdqa %7, %9
+ movdqa %9, %3
+ SSE2_XSawp wd, %7, %4, %3
- SSE2_XSawp dq, %1, %5, %4
- SSE2_XSawp dq, %6, %2, %5
- SSE2_XSawp dq, %8, %7, %2
- movdqa %7, %9
- movdqa %9, %5
- SSE2_XSawp dq, %7, %3, %5
+ SSE2_XSawp dq, %1, %5, %4
+ SSE2_XSawp dq, %6, %2, %5
+ SSE2_XSawp dq, %8, %7, %2
+ movdqa %7, %9
+ movdqa %9, %5
+ SSE2_XSawp dq, %7, %3, %5
- SSE2_XSawp qdq, %1, %8, %3
- SSE2_XSawp qdq, %4, %2, %8
- SSE2_XSawp qdq, %6, %7, %2
- movdqa %7, %9
- movdqa %9, %1
- SSE2_XSawp qdq, %7, %5, %1
- movdqa %5, %9
+ SSE2_XSawp qdq, %1, %8, %3
+ SSE2_XSawp qdq, %4, %2, %8
+ SSE2_XSawp qdq, %6, %7, %2
+ movdqa %7, %9
+ movdqa %9, %1
+ SSE2_XSawp qdq, %7, %5, %1
+ movdqa %5, %9
%endmacro
;xmm0, xmm6, xmm7, [eax], [ecx]
@@ -528,32 +528,32 @@
; m2 = m1 + m2, m1 = m1 - m2
%macro SSE2_SumSub 3
- movdqa %3, %2
+ movdqa %3, %2
paddw %2, %1
psubw %1, %3
%endmacro
-%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
- mov %3h, %3l
- movd %1, e%3x ; i.e, 1% = eax (=b0)
- pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
- pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
+ mov %3h, %3l
+ movd %1, e%3x ; i.e, 1% = eax (=b0)
+ pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
+ pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
%endmacro
;copy a dw into a xmm for 8 times
%macro SSE2_Copy8Times 2
- movd %1, %2
- punpcklwd %1, %1
- pshufd %1, %1, 0
+ movd %1, %2
+ punpcklwd %1, %1
+ pshufd %1, %1, 0
%endmacro
;copy a db into a xmm for 16 times
%macro SSE2_Copy16Times 2
- movd %1, %2
- pshuflw %1, %1, 0
- punpcklqdq %1, %1
- packuswb %1, %1
+ movd %1, %2
+ pshuflw %1, %1, 0
+ punpcklqdq %1, %1
+ packuswb %1, %1
%endmacro
@@ -564,35 +564,35 @@
;dw 32,32,32,32,32,32,32,32 for xmm
;dw 32,32,32,32 for mm
%macro WELS_DW32 1
- pcmpeqw %1,%1
- psrlw %1,15
- psllw %1,5
+ pcmpeqw %1,%1
+ psrlw %1,15
+ psllw %1,5
%endmacro
;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
;dw 1, 1, 1, 1 for mm
%macro WELS_DW1 1
- pcmpeqw %1,%1
- psrlw %1,15
+ pcmpeqw %1,%1
+ psrlw %1,15
%endmacro
;all 0 for xmm and mm
%macro WELS_Zero 1
- pxor %1, %1
+ pxor %1, %1
%endmacro
;dd 1, 1, 1, 1 for xmm
;dd 1, 1 for mm
%macro WELS_DD1 1
- pcmpeqw %1,%1
- psrld %1,31
+ pcmpeqw %1,%1
+ psrld %1,31
%endmacro
;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
%macro WELS_DB1 1
- pcmpeqw %1,%1
- psrlw %1,15
- packuswb %1,%1
+ pcmpeqw %1,%1
+ psrlw %1,15
+ packuswb %1,%1
%endmacro
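The WELS_DW32/WELS_DW1/WELS_Zero/WELS_DD1/WELS_DB1 helpers above synthesize small per-lane constants without touching memory: pcmpeqw fills every lane with ones, a logical right shift leaves 1, and an optional left shift or pack turns that into 32 or a byte pattern. Per 16-bit lane the WELS_DW32 case is simply (illustrative C, not part of the patch):

#include <assert.h>
#include <stdint.h>

int main(void) {
    uint16_t lane = 0xFFFF;  /* pcmpeqw %1,%1 : lane = all ones        */
    lane >>= 15;             /* psrlw %1,15   : lane = 1  (WELS_DW1)   */
    lane <<= 5;              /* psllw %1,5    : lane = 32 (WELS_DW32)  */
    assert(lane == 32);
    return 0;
}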
--- a/codec/common/x86/cpuid.asm
+++ b/codec/common/x86/cpuid.asm
@@ -29,13 +29,13 @@
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
-;* cpu_mmx.asm
+;* cpu_mmx.asm
;*
;* Abstract
-;* verify cpuid feature support and cpuid detection
+;* verify cpuid feature support and cpuid detection
;*
;* History
-;* 04/29/2009 Created
+;* 04/29/2009 Created
;*
;*************************************************************************/
@@ -115,13 +115,13 @@
%elifdef X86_32
WELS_EXTERN WelsCPUId
- push ebx
- push edi
+ push ebx
+ push edi
- mov eax, [esp+12] ; operating index
+ mov eax, [esp+12] ; operating index
mov edi, [esp+24]
mov ecx, [edi]
- cpuid ; cpuid
+ cpuid ; cpuid
; processing various information return
mov edi, [esp+16]
@@ -133,7 +133,7 @@
mov edi, [esp+28]
mov [edi], edx
- pop edi
+ pop edi
pop ebx
ret
@@ -145,31 +145,31 @@
;****************************************************************************************************
WELS_EXTERN WelsCPUSupportAVX
%ifdef WIN64
- mov eax, ecx
- mov ecx, edx
+ mov eax, ecx
+ mov ecx, edx
%elifdef UNIX64
- mov eax, edi
- mov ecx, esi
+ mov eax, edi
+ mov ecx, esi
%else
- mov eax, [esp+4]
- mov ecx, [esp+8]
+ mov eax, [esp+4]
+ mov ecx, [esp+8]
%endif
- ; refer to detection of AVX addressed in INTEL AVX manual document
- and ecx, 018000000H
- cmp ecx, 018000000H ; check both OSXSAVE and AVX feature flags
- jne avx_not_supported
- ; processor supports AVX instructions and XGETBV is enabled by OS
- mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
- XGETBV ; result in EDX:EAX
- and eax, 06H
- cmp eax, 06H ; check OS has enabled both XMM and YMM state support
- jne avx_not_supported
- mov eax, 1
- ret
+ ; refer to detection of AVX addressed in INTEL AVX manual document
+ and ecx, 018000000H
+ cmp ecx, 018000000H ; check both OSXSAVE and AVX feature flags
+ jne avx_not_supported
+ ; processor supports AVX instructions and XGETBV is enabled by OS
+ mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
+ XGETBV ; result in EDX:EAX
+ and eax, 06H
+ cmp eax, 06H ; check OS has enabled both XMM and YMM state support
+ jne avx_not_supported
+ mov eax, 1
+ ret
avx_not_supported:
- mov eax, 0
- ret
+ mov eax, 0
+ ret
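WelsCPUSupportAVX receives the EAX/ECX output of CPUID leaf 1, requires both OSXSAVE (ECX bit 27) and AVX (ECX bit 28), i.e. the 018000000H mask, and then reads XCR0 with XGETBV to confirm the OS preserves both XMM and YMM state (bits 1 and 2, the 06H mask). A hedged C sketch of the same check, assuming a GCC/Clang-style toolchain where __get_cpuid comes from <cpuid.h> and _xgetbv from <immintrin.h> (compiled with -mxsave):

#include <cpuid.h>
#include <immintrin.h>

int cpu_supports_avx(void) {
    unsigned eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 0;
    if ((ecx & 0x18000000u) != 0x18000000u)     /* OSXSAVE (bit 27) + AVX (bit 28) */
        return 0;
    return (_xgetbv(0) & 0x6u) == 0x6u;         /* XCR0: XMM (bit 1) + YMM (bit 2) */
}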
; need call after cpuid=1 and eax, ecx flag got then
@@ -178,35 +178,35 @@
;****************************************************************************************************
WELS_EXTERN WelsCPUSupportFMA
%ifdef WIN64
- mov eax, ecx
- mov ecx, edx
+ mov eax, ecx
+ mov ecx, edx
%elifdef UNIX64
- mov eax, edi
- mov ecx, esi
+ mov eax, edi
+ mov ecx, esi
%else
- mov eax, [esp+4]
- mov ecx, [esp+8]
+ mov eax, [esp+4]
+ mov ecx, [esp+8]
%endif
- ; refer to detection of FMA addressed in INTEL AVX manual document
- and ecx, 018001000H
- cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
- jne fma_not_supported
- ; processor supports AVX,FMA instructions and XGETBV is enabled by OS
- mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
- XGETBV ; result in EDX:EAX
- and eax, 06H
- cmp eax, 06H ; check OS has enabled both XMM and YMM state support
- jne fma_not_supported
- mov eax, 1
- ret
+ ; refer to detection of FMA addressed in INTEL AVX manual document
+ and ecx, 018001000H
+ cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
+ jne fma_not_supported
+ ; processor supports AVX,FMA instructions and XGETBV is enabled by OS
+ mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
+ XGETBV ; result in EDX:EAX
+ and eax, 06H
+ cmp eax, 06H ; check OS has enabled both XMM and YMM state support
+ jne fma_not_supported
+ mov eax, 1
+ ret
fma_not_supported:
- mov eax, 0
- ret
+ mov eax, 0
+ ret
;******************************************************************************************
; void WelsEmms()
;******************************************************************************************
WELS_EXTERN WelsEmms
- emms ; empty mmx technology states
- ret
+ emms ; empty mmx technology states
+ ret
--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -57,1032 +57,1032 @@
WELS_EXTERN DeblockLumaLt4V_ssse3
- push rbp
- mov r11,[rsp + 16 + 20h] ; pTC
- PUSH_XMM 16
- sub rsp,1B0h
- lea rbp,[rsp+20h]
- movd xmm4,r8d
- movd xmm2,r9d
- mov qword [rbp+180h],r12
- mov r10,rcx
- movsxd r12,edx
- add edx,edx
- movsxd rdx,edx
- sub r10,r12
- movsx r8d,byte [r11]
- pxor xmm3,xmm3
- punpcklwd xmm2,xmm2
- movaps [rbp+50h],xmm14
- lea rax,[r12+r12*2]
- movdqa xmm14,[rdx+rcx]
- neg rax
- pshufd xmm0,xmm2,0
- movd xmm2,r8d
- movsx edx,byte [r11+1]
- movsx r8d,byte [r11+2]
- movsx r11d,byte [r11+3]
- movaps [rbp+70h],xmm12
- movd xmm1,edx
- movaps [rbp+80h],xmm11
- movd xmm12,r8d
- movd xmm11,r11d
- movdqa xmm5, [rax+rcx]
- lea rax,[r12+r12]
- punpcklwd xmm12,xmm12
- neg rax
- punpcklwd xmm11,xmm11
- movaps [rbp],xmm8
- movdqa xmm8, [r10]
- punpcklwd xmm2,xmm2
- punpcklwd xmm1,xmm1
- punpcklqdq xmm12,xmm12
- punpcklqdq xmm11,xmm11
- punpcklqdq xmm2,xmm2
- punpcklqdq xmm1,xmm1
- shufps xmm12,xmm11,88h
- movdqa xmm11,xmm8
- movaps [rbp+30h],xmm9
- movdqa xmm9,[rcx]
- shufps xmm2,xmm1,88h
- movdqa xmm1,xmm5
- punpcklbw xmm11,xmm3
- movaps [rbp+20h],xmm6
- movaps [rbp+60h],xmm13
- movdqa xmm13,xmm11
- movaps [rbp+90h],xmm10
- movdqa xmm10,xmm9
- movdqa xmm6,[rax+rcx]
- punpcklbw xmm1,xmm3
- movaps [rbp+0A0h],xmm12
- psubw xmm13,xmm1
- movaps [rbp+40h],xmm15
- movdqa xmm15,xmm14
- movaps [rbp+10h],xmm7
- movdqa xmm7,xmm6
- punpcklbw xmm10,xmm3
- movdqa xmm12,[r12+rcx]
- punpcklbw xmm7,xmm3
- punpcklbw xmm12,xmm3
- punpcklbw xmm15,xmm3
- pabsw xmm3,xmm13
- movdqa xmm13,xmm10
- psubw xmm13,xmm15
- movdqa [rbp+0F0h],xmm15
- pabsw xmm15,xmm13
- movdqa xmm13,xmm11
- movdqa [rbp+0B0h],xmm1
- movdqa xmm1,xmm0
- pavgw xmm13,xmm10
- pcmpgtw xmm1,xmm3
- movdqa [rbp+120h],xmm13
- movaps xmm13,xmm2
- punpcklwd xmm4,xmm4
- movdqa xmm3,xmm0
- movdqa [rbp+100h],xmm1
- psubw xmm13,xmm1
- movdqa xmm1,xmm10
- pcmpgtw xmm3,xmm15
- pshufd xmm4,xmm4,0
- psubw xmm1,xmm11
- movdqa [rbp+0D0h],xmm10
- psubw xmm13,xmm3
- movdqa [rbp+110h],xmm3
- pabsw xmm15,xmm1
- movdqa xmm3,xmm4
- psubw xmm10,xmm12
- pcmpgtw xmm3,xmm15
- pabsw xmm15,xmm10
- movdqa xmm10,xmm0
- psllw xmm1,2
- movdqa [rbp+0C0h],xmm11
- psubw xmm11,xmm7
- pcmpgtw xmm10,xmm15
- pabsw xmm11,xmm11
- movdqa xmm15,xmm0
- pand xmm3,xmm10
- pcmpgtw xmm15,xmm11
- movaps xmm11,xmm2
- pxor xmm10,xmm10
- pand xmm3,xmm15
- pcmpgtw xmm11,xmm10
- pcmpeqw xmm10,xmm2
- por xmm11,xmm10
- pand xmm3,xmm11
- movdqa xmm11,xmm7
- psubw xmm11,xmm12
- pxor xmm15,xmm15
- paddw xmm11,xmm1
- psubw xmm15,xmm13
- movdqa [rbp+0E0h],xmm12
- paddw xmm11,[FOUR_16B_SSE2]
- pxor xmm12,xmm12
- psraw xmm11,3
- punpckhbw xmm8,xmm12
- pmaxsw xmm15,xmm11
- punpckhbw xmm5,xmm12
- movdqa xmm11,xmm8
- pminsw xmm13,xmm15
- psubw xmm11,xmm5
- punpckhbw xmm9,xmm12
- pand xmm13,xmm3
- movdqa [rbp+130h],xmm13
- pabsw xmm13,xmm11
- punpckhbw xmm14,xmm12
- movdqa xmm11,xmm9
- psubw xmm11,xmm14
- movdqa xmm15,xmm0
- movdqa [rbp+140h],xmm14
- pabsw xmm14,xmm11
- movdqa xmm11,xmm8
- pcmpgtw xmm15,xmm14
- movdqa xmm1,[r12+rcx]
- pavgw xmm11,xmm9
- movdqa [rbp+170h],xmm11
- movdqa xmm10,xmm9
- punpckhbw xmm6,xmm12
- psubw xmm10,xmm8
- punpckhbw xmm1,xmm12
- movdqa xmm12,xmm0
- movaps xmm11,[rbp+0A0h]
- pcmpgtw xmm12,xmm13
- movaps xmm13,xmm11
- psubw xmm13,xmm12
- movdqa [rbp+160h],xmm15
- psubw xmm13,xmm15
- movdqa xmm15,xmm9
- psubw xmm15,xmm1
- movdqa [rbp+150h],xmm12
- pabsw xmm12,xmm10
- pabsw xmm14,xmm15
- movdqa xmm15,xmm8
- pcmpgtw xmm4,xmm12
- movdqa xmm12,xmm0
- psubw xmm15,xmm6
- pcmpgtw xmm12,xmm14
- pabsw xmm14,xmm15
- psllw xmm10,2
- pcmpgtw xmm0,xmm14
- movdqa xmm14,xmm6
- psubw xmm14,xmm1
- pand xmm4,xmm12
- paddw xmm14,xmm10
- pand xmm4,xmm0
- paddw xmm14,[FOUR_16B_SSE2]
- pxor xmm15,xmm15
- movaps xmm12,xmm11
- psubw xmm15,xmm13
- pxor xmm0,xmm0
- psraw xmm14,3
- pcmpgtw xmm12,xmm0
- pcmpeqw xmm0,xmm11
- pmaxsw xmm15,xmm14
- por xmm12,xmm0
- movdqa xmm0,[rbp+120h]
- pminsw xmm13,xmm15
- movdqa xmm15,[rbp+0B0h]
- movdqa xmm10,xmm7
- pand xmm4,xmm12
- paddw xmm15,xmm0
- pxor xmm12,xmm12
- paddw xmm10,xmm7
- movdqa xmm14,xmm12
- psubw xmm15,xmm10
- psubw xmm14,xmm2
- psraw xmm15,1
- pmaxsw xmm15,xmm14
- movdqa xmm10,xmm6
- pminsw xmm15,xmm2
- paddw xmm10,xmm6
- pand xmm15,xmm3
- psubw xmm12,xmm11
- pand xmm15,[rbp+100h]
- pand xmm13,xmm4
- paddw xmm7,xmm15
- paddw xmm8,xmm13
- movdqa xmm15,[rbp+170h]
- psubw xmm9,xmm13
- paddw xmm5,xmm15
- psubw xmm5,xmm10
- psraw xmm5,1
- pmaxsw xmm5,xmm12
- pminsw xmm5,xmm11
- pand xmm5,xmm4
- pand xmm5,[rbp+150h]
- paddw xmm6,xmm5
- movdqa xmm5,[rbp+0C0h]
- packuswb xmm7,xmm6
- movdqa xmm6,[rbp+130h]
- paddw xmm5,xmm6
- packuswb xmm5,xmm8
- movdqa xmm8,[rbp+0D0h]
- psubw xmm8,xmm6
- movdqa xmm6,[rbp+0F0h]
- paddw xmm6,xmm0
- movdqa xmm0,[rbp+0E0h]
- packuswb xmm8,xmm9
- movdqa xmm9,xmm0
- paddw xmm9,xmm0
- psubw xmm6,xmm9
- psraw xmm6,1
- pmaxsw xmm14,xmm6
- pminsw xmm2,xmm14
- pand xmm2,xmm3
- pand xmm2,[rbp+110h]
- paddw xmm0,xmm2
- movdqa xmm2,[rbp+140h]
- paddw xmm2,xmm15
- movdqa xmm15,xmm1
- paddw xmm15,xmm1
- psubw xmm2,xmm15
- psraw xmm2,1
- pmaxsw xmm12,xmm2
- pminsw xmm11,xmm12
- pand xmm11,xmm4
- pand xmm11,[rbp+160h]
- paddw xmm1,xmm11
- movdqa [rax+rcx],xmm7
- movdqa [r10],xmm5
- packuswb xmm0,xmm1
- movdqa [rcx],xmm8
- movdqa [r12+rcx],xmm0
- mov r12,qword [rbp+180h]
- lea rsp,[rbp+190h]
- POP_XMM
- pop rbp
- ret
+ push rbp
+ mov r11,[rsp + 16 + 20h] ; pTC
+ PUSH_XMM 16
+ sub rsp,1B0h
+ lea rbp,[rsp+20h]
+ movd xmm4,r8d
+ movd xmm2,r9d
+ mov qword [rbp+180h],r12
+ mov r10,rcx
+ movsxd r12,edx
+ add edx,edx
+ movsxd rdx,edx
+ sub r10,r12
+ movsx r8d,byte [r11]
+ pxor xmm3,xmm3
+ punpcklwd xmm2,xmm2
+ movaps [rbp+50h],xmm14
+ lea rax,[r12+r12*2]
+ movdqa xmm14,[rdx+rcx]
+ neg rax
+ pshufd xmm0,xmm2,0
+ movd xmm2,r8d
+ movsx edx,byte [r11+1]
+ movsx r8d,byte [r11+2]
+ movsx r11d,byte [r11+3]
+ movaps [rbp+70h],xmm12
+ movd xmm1,edx
+ movaps [rbp+80h],xmm11
+ movd xmm12,r8d
+ movd xmm11,r11d
+ movdqa xmm5, [rax+rcx]
+ lea rax,[r12+r12]
+ punpcklwd xmm12,xmm12
+ neg rax
+ punpcklwd xmm11,xmm11
+ movaps [rbp],xmm8
+ movdqa xmm8, [r10]
+ punpcklwd xmm2,xmm2
+ punpcklwd xmm1,xmm1
+ punpcklqdq xmm12,xmm12
+ punpcklqdq xmm11,xmm11
+ punpcklqdq xmm2,xmm2
+ punpcklqdq xmm1,xmm1
+ shufps xmm12,xmm11,88h
+ movdqa xmm11,xmm8
+ movaps [rbp+30h],xmm9
+ movdqa xmm9,[rcx]
+ shufps xmm2,xmm1,88h
+ movdqa xmm1,xmm5
+ punpcklbw xmm11,xmm3
+ movaps [rbp+20h],xmm6
+ movaps [rbp+60h],xmm13
+ movdqa xmm13,xmm11
+ movaps [rbp+90h],xmm10
+ movdqa xmm10,xmm9
+ movdqa xmm6,[rax+rcx]
+ punpcklbw xmm1,xmm3
+ movaps [rbp+0A0h],xmm12
+ psubw xmm13,xmm1
+ movaps [rbp+40h],xmm15
+ movdqa xmm15,xmm14
+ movaps [rbp+10h],xmm7
+ movdqa xmm7,xmm6
+ punpcklbw xmm10,xmm3
+ movdqa xmm12,[r12+rcx]
+ punpcklbw xmm7,xmm3
+ punpcklbw xmm12,xmm3
+ punpcklbw xmm15,xmm3
+ pabsw xmm3,xmm13
+ movdqa xmm13,xmm10
+ psubw xmm13,xmm15
+ movdqa [rbp+0F0h],xmm15
+ pabsw xmm15,xmm13
+ movdqa xmm13,xmm11
+ movdqa [rbp+0B0h],xmm1
+ movdqa xmm1,xmm0
+ pavgw xmm13,xmm10
+ pcmpgtw xmm1,xmm3
+ movdqa [rbp+120h],xmm13
+ movaps xmm13,xmm2
+ punpcklwd xmm4,xmm4
+ movdqa xmm3,xmm0
+ movdqa [rbp+100h],xmm1
+ psubw xmm13,xmm1
+ movdqa xmm1,xmm10
+ pcmpgtw xmm3,xmm15
+ pshufd xmm4,xmm4,0
+ psubw xmm1,xmm11
+ movdqa [rbp+0D0h],xmm10
+ psubw xmm13,xmm3
+ movdqa [rbp+110h],xmm3
+ pabsw xmm15,xmm1
+ movdqa xmm3,xmm4
+ psubw xmm10,xmm12
+ pcmpgtw xmm3,xmm15
+ pabsw xmm15,xmm10
+ movdqa xmm10,xmm0
+ psllw xmm1,2
+ movdqa [rbp+0C0h],xmm11
+ psubw xmm11,xmm7
+ pcmpgtw xmm10,xmm15
+ pabsw xmm11,xmm11
+ movdqa xmm15,xmm0
+ pand xmm3,xmm10
+ pcmpgtw xmm15,xmm11
+ movaps xmm11,xmm2
+ pxor xmm10,xmm10
+ pand xmm3,xmm15
+ pcmpgtw xmm11,xmm10
+ pcmpeqw xmm10,xmm2
+ por xmm11,xmm10
+ pand xmm3,xmm11
+ movdqa xmm11,xmm7
+ psubw xmm11,xmm12
+ pxor xmm15,xmm15
+ paddw xmm11,xmm1
+ psubw xmm15,xmm13
+ movdqa [rbp+0E0h],xmm12
+ paddw xmm11,[FOUR_16B_SSE2]
+ pxor xmm12,xmm12
+ psraw xmm11,3
+ punpckhbw xmm8,xmm12
+ pmaxsw xmm15,xmm11
+ punpckhbw xmm5,xmm12
+ movdqa xmm11,xmm8
+ pminsw xmm13,xmm15
+ psubw xmm11,xmm5
+ punpckhbw xmm9,xmm12
+ pand xmm13,xmm3
+ movdqa [rbp+130h],xmm13
+ pabsw xmm13,xmm11
+ punpckhbw xmm14,xmm12
+ movdqa xmm11,xmm9
+ psubw xmm11,xmm14
+ movdqa xmm15,xmm0
+ movdqa [rbp+140h],xmm14
+ pabsw xmm14,xmm11
+ movdqa xmm11,xmm8
+ pcmpgtw xmm15,xmm14
+ movdqa xmm1,[r12+rcx]
+ pavgw xmm11,xmm9
+ movdqa [rbp+170h],xmm11
+ movdqa xmm10,xmm9
+ punpckhbw xmm6,xmm12
+ psubw xmm10,xmm8
+ punpckhbw xmm1,xmm12
+ movdqa xmm12,xmm0
+ movaps xmm11,[rbp+0A0h]
+ pcmpgtw xmm12,xmm13
+ movaps xmm13,xmm11
+ psubw xmm13,xmm12
+ movdqa [rbp+160h],xmm15
+ psubw xmm13,xmm15
+ movdqa xmm15,xmm9
+ psubw xmm15,xmm1
+ movdqa [rbp+150h],xmm12
+ pabsw xmm12,xmm10
+ pabsw xmm14,xmm15
+ movdqa xmm15,xmm8
+ pcmpgtw xmm4,xmm12
+ movdqa xmm12,xmm0
+ psubw xmm15,xmm6
+ pcmpgtw xmm12,xmm14
+ pabsw xmm14,xmm15
+ psllw xmm10,2
+ pcmpgtw xmm0,xmm14
+ movdqa xmm14,xmm6
+ psubw xmm14,xmm1
+ pand xmm4,xmm12
+ paddw xmm14,xmm10
+ pand xmm4,xmm0
+ paddw xmm14,[FOUR_16B_SSE2]
+ pxor xmm15,xmm15
+ movaps xmm12,xmm11
+ psubw xmm15,xmm13
+ pxor xmm0,xmm0
+ psraw xmm14,3
+ pcmpgtw xmm12,xmm0
+ pcmpeqw xmm0,xmm11
+ pmaxsw xmm15,xmm14
+ por xmm12,xmm0
+ movdqa xmm0,[rbp+120h]
+ pminsw xmm13,xmm15
+ movdqa xmm15,[rbp+0B0h]
+ movdqa xmm10,xmm7
+ pand xmm4,xmm12
+ paddw xmm15,xmm0
+ pxor xmm12,xmm12
+ paddw xmm10,xmm7
+ movdqa xmm14,xmm12
+ psubw xmm15,xmm10
+ psubw xmm14,xmm2
+ psraw xmm15,1
+ pmaxsw xmm15,xmm14
+ movdqa xmm10,xmm6
+ pminsw xmm15,xmm2
+ paddw xmm10,xmm6
+ pand xmm15,xmm3
+ psubw xmm12,xmm11
+ pand xmm15,[rbp+100h]
+ pand xmm13,xmm4
+ paddw xmm7,xmm15
+ paddw xmm8,xmm13
+ movdqa xmm15,[rbp+170h]
+ psubw xmm9,xmm13
+ paddw xmm5,xmm15
+ psubw xmm5,xmm10
+ psraw xmm5,1
+ pmaxsw xmm5,xmm12
+ pminsw xmm5,xmm11
+ pand xmm5,xmm4
+ pand xmm5,[rbp+150h]
+ paddw xmm6,xmm5
+ movdqa xmm5,[rbp+0C0h]
+ packuswb xmm7,xmm6
+ movdqa xmm6,[rbp+130h]
+ paddw xmm5,xmm6
+ packuswb xmm5,xmm8
+ movdqa xmm8,[rbp+0D0h]
+ psubw xmm8,xmm6
+ movdqa xmm6,[rbp+0F0h]
+ paddw xmm6,xmm0
+ movdqa xmm0,[rbp+0E0h]
+ packuswb xmm8,xmm9
+ movdqa xmm9,xmm0
+ paddw xmm9,xmm0
+ psubw xmm6,xmm9
+ psraw xmm6,1
+ pmaxsw xmm14,xmm6
+ pminsw xmm2,xmm14
+ pand xmm2,xmm3
+ pand xmm2,[rbp+110h]
+ paddw xmm0,xmm2
+ movdqa xmm2,[rbp+140h]
+ paddw xmm2,xmm15
+ movdqa xmm15,xmm1
+ paddw xmm15,xmm1
+ psubw xmm2,xmm15
+ psraw xmm2,1
+ pmaxsw xmm12,xmm2
+ pminsw xmm11,xmm12
+ pand xmm11,xmm4
+ pand xmm11,[rbp+160h]
+ paddw xmm1,xmm11
+ movdqa [rax+rcx],xmm7
+ movdqa [r10],xmm5
+ packuswb xmm0,xmm1
+ movdqa [rcx],xmm8
+ movdqa [r12+rcx],xmm0
+ mov r12,qword [rbp+180h]
+ lea rsp,[rbp+190h]
+ POP_XMM
+ pop rbp
+ ret
WELS_EXTERN DeblockLumaEq4V_ssse3
- mov rax,rsp
- push rbx
- push rbp
- push rsi
- push rdi
- sub rsp,1D8h
- movaps [rax-38h],xmm6
- movaps [rax-48h],xmm7
- movaps [rax-58h],xmm8
- pxor xmm1,xmm1
- movsxd r10,edx
- mov rbp,rcx
- mov r11d,r8d
- mov rdx,rcx
- mov rdi,rbp
- mov rbx,rbp
- movdqa xmm5,[rbp]
- movaps [rax-68h],xmm9
- movaps [rax-78h],xmm10
- punpcklbw xmm5,xmm1
- movaps [rax-88h],xmm11
- movaps [rax-98h],xmm12
- movaps [rax-0A8h],xmm13
- movaps [rax-0B8h],xmm14
- movdqa xmm14,[r10+rbp]
- movaps [rax-0C8h],xmm15
- lea eax,[r10*4]
- movsxd r8,eax
- lea eax,[r10+r10*2]
- movsxd rcx,eax
- lea eax,[r10+r10]
- sub rdx,r8
- punpcklbw xmm14,xmm1
- movdqa [rsp+90h],xmm5
- movdqa [rsp+30h],xmm14
- movsxd rsi,eax
- movsx eax,r11w
- sub rdi,rcx
- sub rbx,rsi
- mov r8,rbp
- sub r8,r10
- movd xmm0,eax
- movsx eax,r9w
- movdqa xmm12,[rdi]
- movdqa xmm6, [rsi+rbp]
- movdqa xmm13,[rbx]
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- punpcklbw xmm13,xmm1
- punpcklbw xmm6,xmm1
- movdqa xmm8,[r8]
- movd xmm0,eax
- movdqa xmm10,xmm11
- mov eax,2
- punpcklbw xmm8,xmm1
- punpcklbw xmm12,xmm1
- cwde
- punpcklwd xmm0,xmm0
- psraw xmm10,2
- movdqa xmm1,xmm8
- movdqa [rsp+0F0h],xmm13
- movdqa [rsp+0B0h],xmm8
- pshufd xmm7,xmm0,0
- psubw xmm1,xmm13
- movdqa xmm0,xmm5
- movdqa xmm4,xmm7
- movdqa xmm2,xmm7
- psubw xmm0,xmm8
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm5
- movdqa [rsp+40h],xmm7
- movdqa [rsp+60h],xmm6
- pcmpgtw xmm4,xmm0
- psubw xmm1,xmm14
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm4,xmm2
- movdqa xmm0,xmm11
- pcmpgtw xmm0,xmm3
- pand xmm4,xmm0
- movd xmm0,eax
- movdqa [rsp+20h],xmm4
- punpcklwd xmm0,xmm0
- pshufd xmm2,xmm0,0
- paddw xmm10,xmm2
- movdqa [rsp+0A0h],xmm2
- movdqa xmm15,xmm7
- pxor xmm4,xmm4
- movdqa xmm0,xmm8
- psubw xmm0,xmm12
- mov eax,4
- pabsw xmm0,xmm0
- movdqa xmm1,xmm10
- cwde
- pcmpgtw xmm15,xmm0
- pcmpgtw xmm1,xmm3
- movdqa xmm3,xmm7
- movdqa xmm7,[rdx]
- movdqa xmm0,xmm5
- psubw xmm0,xmm6
- pand xmm15,xmm1
- punpcklbw xmm7,xmm4
- movdqa xmm9,xmm15
- pabsw xmm0,xmm0
- psllw xmm7,1
- pandn xmm9,xmm12
- pcmpgtw xmm3,xmm0
- paddw xmm7,xmm12
- movd xmm0,eax
- pand xmm3,xmm1
- paddw xmm7,xmm12
- punpcklwd xmm0,xmm0
- paddw xmm7,xmm12
- pshufd xmm1,xmm0,0
- paddw xmm7,xmm13
- movdqa xmm0,xmm3
- pandn xmm0,xmm6
- paddw xmm7,xmm8
- movdqa [rsp+70h],xmm1
- paddw xmm7,xmm5
- movdqa [rsp+120h],xmm0
- movdqa xmm0,[rcx+rbp]
- punpcklbw xmm0,xmm4
- paddw xmm7,xmm1
- movdqa xmm4,xmm15
- psllw xmm0,1
- psraw xmm7,3
- paddw xmm0,xmm6
- pand xmm7,xmm15
- paddw xmm0,xmm6
- paddw xmm0,xmm6
- paddw xmm0,xmm14
- movdqa xmm6,xmm15
- paddw xmm0,xmm5
- pandn xmm6,xmm13
- paddw xmm0,xmm8
- paddw xmm0,xmm1
- psraw xmm0,3
- movdqa xmm1,xmm12
- paddw xmm1,xmm13
- pand xmm0,xmm3
- movdqa [rsp+100h],xmm0
- movdqa xmm0,xmm8
- paddw xmm0,xmm5
- paddw xmm1,xmm0
- movdqa xmm0,xmm3
- paddw xmm1,xmm2
- psraw xmm1,2
- pandn xmm0,xmm14
- pand xmm4,xmm1
- movdqa [rsp+0E0h],xmm0
- movdqa xmm0,xmm5
- paddw xmm0,xmm8
- movdqa xmm1,[rsp+60h]
- paddw xmm1,xmm14
- movdqa xmm14,xmm3
- paddw xmm1,xmm0
- movdqa xmm0,xmm8
- paddw xmm0,[rsp+30h]
- paddw xmm1,xmm2
- psraw xmm1,2
- pand xmm14,xmm1
- movdqa xmm1,xmm13
- paddw xmm1,xmm13
- paddw xmm1,xmm0
- paddw xmm1,xmm2
- psraw xmm1,2
- movdqa xmm0,[rsp+30h]
- movdqa xmm2,xmm13
- movdqa xmm5,xmm15
- paddw xmm0,[rsp+70h]
- pandn xmm5,xmm1
- paddw xmm2,xmm8
- movdqa xmm8,[rsp+90h]
- movdqa xmm1,xmm12
- paddw xmm2,xmm8
- psllw xmm2,1
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,xmm8
- movdqa xmm8,xmm3
- movdqa xmm2,[rsp+30h]
- paddw xmm0,xmm13
- psraw xmm1,3
- pand xmm15,xmm1
- movdqa xmm1,xmm2
- paddw xmm1,xmm2
- paddw xmm2,[rsp+90h]
- paddw xmm2,[rsp+0B0h]
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- movdqa xmm13,[r8]
- paddw xmm0, [rsp+70h]
- paddw xmm1, [rsp+0A0h]
- psllw xmm2,1
- paddw xmm2,xmm0
- psraw xmm1,2
- movdqa xmm0, [rdi]
- pandn xmm8,xmm1
- movdqa xmm1, [rsp+60h]
- paddw xmm1,xmm2
- movdqa xmm2, [rbx]
- psraw xmm1,3
- pand xmm3,xmm1
- movdqa xmm1, [rbp]
- movdqa [rsp+0D0h],xmm3
- pxor xmm3,xmm3
- punpckhbw xmm0,xmm3
- punpckhbw xmm1,xmm3
- punpckhbw xmm13,xmm3
- movdqa [rsp+0C0h],xmm0
- movdqa xmm0,[r10+rbp]
- movdqa [rsp],xmm1
- punpckhbw xmm0,xmm3
- punpckhbw xmm2,xmm3
- movdqa [rsp+80h],xmm0
- movdqa xmm0,[rsi+rbp]
- movdqa [rsp+10h],xmm13
- punpckhbw xmm0,xmm3
- movdqa [rsp+50h],xmm0
- movdqa xmm0,xmm1
- movdqa xmm1,xmm13
- psubw xmm0,xmm13
- psubw xmm1,xmm2
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,[rsp]
- movdqa xmm13,[rsp+40h]
- movdqa [rsp+110h],xmm2
- psubw xmm1, [rsp+80h]
- pcmpgtw xmm13,xmm0
- pcmpgtw xmm11,xmm3
- pabsw xmm0,xmm1
- pcmpgtw xmm10,xmm3
- movdqa xmm1, [rsp+40h]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1
- pcmpgtw xmm2,xmm0
- movdqa xmm0, [rsp+10h]
- pand xmm13,xmm2
- pand xmm13,xmm11
- movdqa xmm11,[rsp+0C0h]
- psubw xmm0,xmm11
- pabsw xmm0,xmm0
- pcmpgtw xmm3,xmm0
- pand xmm3,xmm10
- movdqa xmm0,[rsp]
- psubw xmm0,[rsp+50h]
- movdqa xmm2,[rdx]
- pabsw xmm0,xmm0
- por xmm7,xmm9
- movdqa xmm9,[rsp+20h]
- pcmpgtw xmm1,xmm0
- pand xmm9,xmm7
- movdqa xmm7,[rsp+20h]
- movdqa xmm0,xmm7
- pandn xmm0,xmm12
- movdqa xmm12,[rsp+110h]
- pand xmm1,xmm10
- movdqa xmm10,[rsp+70h]
- movdqa [rsp+40h],xmm1
- movdqa xmm1,xmm13
- por xmm9,xmm0
- pxor xmm0,xmm0
- por xmm4,xmm6
- movdqa xmm6,xmm7
- punpckhbw xmm2,xmm0
- por xmm15,xmm5
- movdqa xmm5,[rsp+20h]
- movdqa xmm0,xmm3
- psllw xmm2,1
- pandn xmm0,xmm11
- pand xmm6,xmm4
- movdqa xmm4,[rsp]
- paddw xmm2,xmm11
- pand xmm5,xmm15
- movdqa xmm15,[rsp+20h]
- paddw xmm2,xmm11
- paddw xmm2,xmm11
- paddw xmm2,xmm12
- paddw xmm2,[rsp+10h]
- paddw xmm2,[rsp]
- paddw xmm2,xmm10
- psraw xmm2,3
- pand xmm2,xmm3
- por xmm2,xmm0
- pand xmm1,xmm2
- movdqa xmm0,xmm13
- movdqa xmm2,xmm11
- pandn xmm0,xmm11
- paddw xmm2,xmm12
- por xmm1,xmm0
- packuswb xmm9,xmm1
- movdqa xmm0,xmm7
- movdqa xmm7,[rsp+0A0h]
- pandn xmm0,[rsp+0F0h]
- movdqa xmm1,xmm3
- por xmm6,xmm0
- movdqa xmm0,[rsp+10h]
- paddw xmm0,xmm4
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm3
- pandn xmm0,xmm12
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- pandn xmm0,xmm12
- movdqa xmm1,xmm12
- paddw xmm1,[rsp+10h]
- por xmm2,xmm0
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+0B0h]
- paddw xmm1,xmm4
- packuswb xmm6,xmm2
- movdqa xmm2,xmm3
- psllw xmm1,1
- por xmm5,xmm0
- movdqa xmm0,[rsp+80h]
- paddw xmm0,xmm10
- paddw xmm1,xmm0
- paddw xmm11,xmm1
- psraw xmm11,3
- movdqa xmm1,xmm12
- pand xmm2,xmm11
- paddw xmm1,xmm12
- movdqa xmm11,[rsp+80h]
- movdqa xmm0, [rsp+10h]
- por xmm14,[rsp+0E0h]
- paddw xmm0,xmm11
- movdqa xmm4,xmm15
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- paddw xmm1,xmm7
- psraw xmm1,2
- pandn xmm3,xmm1
- por xmm2,xmm3
- movdqa xmm1,xmm13
- movdqa xmm3,[rsp+10h]
- pandn xmm0,xmm3
- pand xmm1,xmm2
- movdqa xmm2,xmm11
- paddw xmm2,[rsp]
- por xmm1,xmm0
- movdqa xmm0,[rsp+0D0h]
- por xmm0,xmm8
- paddw xmm2,xmm3
- packuswb xmm5,xmm1
- movdqa xmm8,[rsp+40h]
- movdqa xmm1,[rsp+50h]
- movdqa xmm3,xmm8
- pand xmm4,xmm0
- psllw xmm2,1
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+90h]
- por xmm4,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm10
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,[rsp]
- movdqa xmm2,xmm11
- paddw xmm0,xmm12
- movdqa xmm12,[rsp]
- paddw xmm2,xmm11
- paddw xmm2,xmm0
- psraw xmm1,3
- movdqa xmm0,xmm8
- pand xmm3,xmm1
- paddw xmm2,xmm7
- movdqa xmm1,xmm13
- psraw xmm2,2
- pandn xmm0,xmm2
- por xmm3,xmm0
- movdqa xmm2,[rsp+50h]
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm3
- paddw xmm2,xmm11
- movdqa xmm3,xmm15
- por xmm1,xmm0
- pand xmm3,xmm14
- movdqa xmm14,[rsp+10h]
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+30h]
- packuswb xmm4,xmm1
- movdqa xmm1,xmm8
- por xmm3,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm14
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm8
- pandn xmm0,xmm11
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm11
- pand xmm2,xmm1
- movdqa xmm1,xmm15
- por xmm2,xmm0
- packuswb xmm3,xmm2
- movdqa xmm0,[rsp+100h]
- por xmm0,[rsp+120h]
- pand xmm1,xmm0
- movdqa xmm2,[rcx+rbp]
- movdqa xmm7,[rsp+50h]
- pandn xmm15,[rsp+60h]
- lea r11,[rsp+1D8h]
- pxor xmm0,xmm0
- por xmm1,xmm15
- movaps xmm15,[r11-0A8h]
- movdqa [rdi],xmm9
- movaps xmm9,[r11-48h]
- punpckhbw xmm2,xmm0
- psllw xmm2,1
- paddw xmm2,xmm7
- paddw xmm2,xmm7
- movdqa [rbx],xmm6
- movaps xmm6,[r11-18h]
- paddw xmm2,xmm7
- paddw xmm2,xmm11
- movaps xmm11,[r11-68h]
- paddw xmm2,xmm12
- movaps xmm12,[r11-78h]
- paddw xmm2,xmm14
- paddw xmm2,xmm10
- psraw xmm2,3
- movaps xmm10,[r11-58h]
- movaps xmm14,[r11-98h]
- movdqa xmm0,xmm13
- pand xmm2,xmm8
- pandn xmm8,xmm7
- pandn xmm13,xmm7
- por xmm2,xmm8
- movaps xmm7,[r11-28h]
- movaps xmm8,[r11-38h]
- movdqa [r8],xmm5
- pand xmm0,xmm2
- por xmm0,xmm13
- packuswb xmm1,xmm0
- movaps xmm13,[r11-88h]
- movdqa [rbp],xmm4
- movdqa [r10+rbp],xmm3
- movdqa [rsi+rbp],xmm1
- mov rsp,r11
- pop rdi
- pop rsi
- pop rbp
- pop rbx
- ret
+ mov rax,rsp
+ push rbx
+ push rbp
+ push rsi
+ push rdi
+ sub rsp,1D8h
+ movaps [rax-38h],xmm6
+ movaps [rax-48h],xmm7
+ movaps [rax-58h],xmm8
+ pxor xmm1,xmm1
+ movsxd r10,edx
+ mov rbp,rcx
+ mov r11d,r8d
+ mov rdx,rcx
+ mov rdi,rbp
+ mov rbx,rbp
+ movdqa xmm5,[rbp]
+ movaps [rax-68h],xmm9
+ movaps [rax-78h],xmm10
+ punpcklbw xmm5,xmm1
+ movaps [rax-88h],xmm11
+ movaps [rax-98h],xmm12
+ movaps [rax-0A8h],xmm13
+ movaps [rax-0B8h],xmm14
+ movdqa xmm14,[r10+rbp]
+ movaps [rax-0C8h],xmm15
+ lea eax,[r10*4]
+ movsxd r8,eax
+ lea eax,[r10+r10*2]
+ movsxd rcx,eax
+ lea eax,[r10+r10]
+ sub rdx,r8
+ punpcklbw xmm14,xmm1
+ movdqa [rsp+90h],xmm5
+ movdqa [rsp+30h],xmm14
+ movsxd rsi,eax
+ movsx eax,r11w
+ sub rdi,rcx
+ sub rbx,rsi
+ mov r8,rbp
+ sub r8,r10
+ movd xmm0,eax
+ movsx eax,r9w
+ movdqa xmm12,[rdi]
+ movdqa xmm6, [rsi+rbp]
+ movdqa xmm13,[rbx]
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm6,xmm1
+ movdqa xmm8,[r8]
+ movd xmm0,eax
+ movdqa xmm10,xmm11
+ mov eax,2
+ punpcklbw xmm8,xmm1
+ punpcklbw xmm12,xmm1
+ cwde
+ punpcklwd xmm0,xmm0
+ psraw xmm10,2
+ movdqa xmm1,xmm8
+ movdqa [rsp+0F0h],xmm13
+ movdqa [rsp+0B0h],xmm8
+ pshufd xmm7,xmm0,0
+ psubw xmm1,xmm13
+ movdqa xmm0,xmm5
+ movdqa xmm4,xmm7
+ movdqa xmm2,xmm7
+ psubw xmm0,xmm8
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm5
+ movdqa [rsp+40h],xmm7
+ movdqa [rsp+60h],xmm6
+ pcmpgtw xmm4,xmm0
+ psubw xmm1,xmm14
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm4,xmm2
+ movdqa xmm0,xmm11
+ pcmpgtw xmm0,xmm3
+ pand xmm4,xmm0
+ movd xmm0,eax
+ movdqa [rsp+20h],xmm4
+ punpcklwd xmm0,xmm0
+ pshufd xmm2,xmm0,0
+ paddw xmm10,xmm2
+ movdqa [rsp+0A0h],xmm2
+ movdqa xmm15,xmm7
+ pxor xmm4,xmm4
+ movdqa xmm0,xmm8
+ psubw xmm0,xmm12
+ mov eax,4
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm10
+ cwde
+ pcmpgtw xmm15,xmm0
+ pcmpgtw xmm1,xmm3
+ movdqa xmm3,xmm7
+ movdqa xmm7,[rdx]
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm6
+ pand xmm15,xmm1
+ punpcklbw xmm7,xmm4
+ movdqa xmm9,xmm15
+ pabsw xmm0,xmm0
+ psllw xmm7,1
+ pandn xmm9,xmm12
+ pcmpgtw xmm3,xmm0
+ paddw xmm7,xmm12
+ movd xmm0,eax
+ pand xmm3,xmm1
+ paddw xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ paddw xmm7,xmm12
+ pshufd xmm1,xmm0,0
+ paddw xmm7,xmm13
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm6
+ paddw xmm7,xmm8
+ movdqa [rsp+70h],xmm1
+ paddw xmm7,xmm5
+ movdqa [rsp+120h],xmm0
+ movdqa xmm0,[rcx+rbp]
+ punpcklbw xmm0,xmm4
+ paddw xmm7,xmm1
+ movdqa xmm4,xmm15
+ psllw xmm0,1
+ psraw xmm7,3
+ paddw xmm0,xmm6
+ pand xmm7,xmm15
+ paddw xmm0,xmm6
+ paddw xmm0,xmm6
+ paddw xmm0,xmm14
+ movdqa xmm6,xmm15
+ paddw xmm0,xmm5
+ pandn xmm6,xmm13
+ paddw xmm0,xmm8
+ paddw xmm0,xmm1
+ psraw xmm0,3
+ movdqa xmm1,xmm12
+ paddw xmm1,xmm13
+ pand xmm0,xmm3
+ movdqa [rsp+100h],xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,xmm5
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm3
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pandn xmm0,xmm14
+ pand xmm4,xmm1
+ movdqa [rsp+0E0h],xmm0
+ movdqa xmm0,xmm5
+ paddw xmm0,xmm8
+ movdqa xmm1,[rsp+60h]
+ paddw xmm1,xmm14
+ movdqa xmm14,xmm3
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,[rsp+30h]
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pand xmm14,xmm1
+ movdqa xmm1,xmm13
+ paddw xmm1,xmm13
+ paddw xmm1,xmm0
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ movdqa xmm0,[rsp+30h]
+ movdqa xmm2,xmm13
+ movdqa xmm5,xmm15
+ paddw xmm0,[rsp+70h]
+ pandn xmm5,xmm1
+ paddw xmm2,xmm8
+ movdqa xmm8,[rsp+90h]
+ movdqa xmm1,xmm12
+ paddw xmm2,xmm8
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,xmm8
+ movdqa xmm8,xmm3
+ movdqa xmm2,[rsp+30h]
+ paddw xmm0,xmm13
+ psraw xmm1,3
+ pand xmm15,xmm1
+ movdqa xmm1,xmm2
+ paddw xmm1,xmm2
+ paddw xmm2,[rsp+90h]
+ paddw xmm2,[rsp+0B0h]
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ movdqa xmm13,[r8]
+ paddw xmm0, [rsp+70h]
+ paddw xmm1, [rsp+0A0h]
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ psraw xmm1,2
+ movdqa xmm0, [rdi]
+ pandn xmm8,xmm1
+ movdqa xmm1, [rsp+60h]
+ paddw xmm1,xmm2
+ movdqa xmm2, [rbx]
+ psraw xmm1,3
+ pand xmm3,xmm1
+ movdqa xmm1, [rbp]
+ movdqa [rsp+0D0h],xmm3
+ pxor xmm3,xmm3
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm1,xmm3
+ punpckhbw xmm13,xmm3
+ movdqa [rsp+0C0h],xmm0
+ movdqa xmm0,[r10+rbp]
+ movdqa [rsp],xmm1
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm2,xmm3
+ movdqa [rsp+80h],xmm0
+ movdqa xmm0,[rsi+rbp]
+ movdqa [rsp+10h],xmm13
+ punpckhbw xmm0,xmm3
+ movdqa [rsp+50h],xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm1,xmm13
+ psubw xmm0,xmm13
+ psubw xmm1,xmm2
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,[rsp]
+ movdqa xmm13,[rsp+40h]
+ movdqa [rsp+110h],xmm2
+ psubw xmm1, [rsp+80h]
+ pcmpgtw xmm13,xmm0
+ pcmpgtw xmm11,xmm3
+ pabsw xmm0,xmm1
+ pcmpgtw xmm10,xmm3
+ movdqa xmm1, [rsp+40h]
+ movdqa xmm2,xmm1
+ movdqa xmm3,xmm1
+ pcmpgtw xmm2,xmm0
+ movdqa xmm0, [rsp+10h]
+ pand xmm13,xmm2
+ pand xmm13,xmm11
+ movdqa xmm11,[rsp+0C0h]
+ psubw xmm0,xmm11
+ pabsw xmm0,xmm0
+ pcmpgtw xmm3,xmm0
+ pand xmm3,xmm10
+ movdqa xmm0,[rsp]
+ psubw xmm0,[rsp+50h]
+ movdqa xmm2,[rdx]
+ pabsw xmm0,xmm0
+ por xmm7,xmm9
+ movdqa xmm9,[rsp+20h]
+ pcmpgtw xmm1,xmm0
+ pand xmm9,xmm7
+ movdqa xmm7,[rsp+20h]
+ movdqa xmm0,xmm7
+ pandn xmm0,xmm12
+ movdqa xmm12,[rsp+110h]
+ pand xmm1,xmm10
+ movdqa xmm10,[rsp+70h]
+ movdqa [rsp+40h],xmm1
+ movdqa xmm1,xmm13
+ por xmm9,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm6
+ movdqa xmm6,xmm7
+ punpckhbw xmm2,xmm0
+ por xmm15,xmm5
+ movdqa xmm5,[rsp+20h]
+ movdqa xmm0,xmm3
+ psllw xmm2,1
+ pandn xmm0,xmm11
+ pand xmm6,xmm4
+ movdqa xmm4,[rsp]
+ paddw xmm2,xmm11
+ pand xmm5,xmm15
+ movdqa xmm15,[rsp+20h]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm11
+ paddw xmm2,xmm12
+ paddw xmm2,[rsp+10h]
+ paddw xmm2,[rsp]
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ pand xmm2,xmm3
+ por xmm2,xmm0
+ pand xmm1,xmm2
+ movdqa xmm0,xmm13
+ movdqa xmm2,xmm11
+ pandn xmm0,xmm11
+ paddw xmm2,xmm12
+ por xmm1,xmm0
+ packuswb xmm9,xmm1
+ movdqa xmm0,xmm7
+ movdqa xmm7,[rsp+0A0h]
+ pandn xmm0,[rsp+0F0h]
+ movdqa xmm1,xmm3
+ por xmm6,xmm0
+ movdqa xmm0,[rsp+10h]
+ paddw xmm0,xmm4
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm12
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ pandn xmm0,xmm12
+ movdqa xmm1,xmm12
+ paddw xmm1,[rsp+10h]
+ por xmm2,xmm0
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+0B0h]
+ paddw xmm1,xmm4
+ packuswb xmm6,xmm2
+ movdqa xmm2,xmm3
+ psllw xmm1,1
+ por xmm5,xmm0
+ movdqa xmm0,[rsp+80h]
+ paddw xmm0,xmm10
+ paddw xmm1,xmm0
+ paddw xmm11,xmm1
+ psraw xmm11,3
+ movdqa xmm1,xmm12
+ pand xmm2,xmm11
+ paddw xmm1,xmm12
+ movdqa xmm11,[rsp+80h]
+ movdqa xmm0, [rsp+10h]
+ por xmm14,[rsp+0E0h]
+ paddw xmm0,xmm11
+ movdqa xmm4,xmm15
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ paddw xmm1,xmm7
+ psraw xmm1,2
+ pandn xmm3,xmm1
+ por xmm2,xmm3
+ movdqa xmm1,xmm13
+ movdqa xmm3,[rsp+10h]
+ pandn xmm0,xmm3
+ pand xmm1,xmm2
+ movdqa xmm2,xmm11
+ paddw xmm2,[rsp]
+ por xmm1,xmm0
+ movdqa xmm0,[rsp+0D0h]
+ por xmm0,xmm8
+ paddw xmm2,xmm3
+ packuswb xmm5,xmm1
+ movdqa xmm8,[rsp+40h]
+ movdqa xmm1,[rsp+50h]
+ movdqa xmm3,xmm8
+ pand xmm4,xmm0
+ psllw xmm2,1
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+90h]
+ por xmm4,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm10
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,[rsp]
+ movdqa xmm2,xmm11
+ paddw xmm0,xmm12
+ movdqa xmm12,[rsp]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm0
+ psraw xmm1,3
+ movdqa xmm0,xmm8
+ pand xmm3,xmm1
+ paddw xmm2,xmm7
+ movdqa xmm1,xmm13
+ psraw xmm2,2
+ pandn xmm0,xmm2
+ por xmm3,xmm0
+ movdqa xmm2,[rsp+50h]
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm3
+ paddw xmm2,xmm11
+ movdqa xmm3,xmm15
+ por xmm1,xmm0
+ pand xmm3,xmm14
+ movdqa xmm14,[rsp+10h]
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+30h]
+ packuswb xmm4,xmm1
+ movdqa xmm1,xmm8
+ por xmm3,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm14
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm8
+ pandn xmm0,xmm11
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm11
+ pand xmm2,xmm1
+ movdqa xmm1,xmm15
+ por xmm2,xmm0
+ packuswb xmm3,xmm2
+ movdqa xmm0,[rsp+100h]
+ por xmm0,[rsp+120h]
+ pand xmm1,xmm0
+ movdqa xmm2,[rcx+rbp]
+ movdqa xmm7,[rsp+50h]
+ pandn xmm15,[rsp+60h]
+ lea r11,[rsp+1D8h]
+ pxor xmm0,xmm0
+ por xmm1,xmm15
+ movaps xmm15,[r11-0A8h]
+ movdqa [rdi],xmm9
+ movaps xmm9,[r11-48h]
+ punpckhbw xmm2,xmm0
+ psllw xmm2,1
+ paddw xmm2,xmm7
+ paddw xmm2,xmm7
+ movdqa [rbx],xmm6
+ movaps xmm6,[r11-18h]
+ paddw xmm2,xmm7
+ paddw xmm2,xmm11
+ movaps xmm11,[r11-68h]
+ paddw xmm2,xmm12
+ movaps xmm12,[r11-78h]
+ paddw xmm2,xmm14
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ movaps xmm10,[r11-58h]
+ movaps xmm14,[r11-98h]
+ movdqa xmm0,xmm13
+ pand xmm2,xmm8
+ pandn xmm8,xmm7
+ pandn xmm13,xmm7
+ por xmm2,xmm8
+ movaps xmm7,[r11-28h]
+ movaps xmm8,[r11-38h]
+ movdqa [r8],xmm5
+ pand xmm0,xmm2
+ por xmm0,xmm13
+ packuswb xmm1,xmm0
+ movaps xmm13,[r11-88h]
+ movdqa [rbp],xmm4
+ movdqa [r10+rbp],xmm3
+ movdqa [rsi+rbp],xmm1
+ mov rsp,r11
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
+ ret
WELS_EXTERN DeblockChromaLt4V_ssse3
- mov rax,rsp
- push rbx
- push rdi
- PUSH_XMM 16
- sub rsp,0C8h
- mov r10,qword [rax + 30h] ; pTC
- pxor xmm1,xmm1
- mov rbx,rcx
- movsxd r11,r8d
- movsx ecx,byte [r10]
- movsx r8d,byte [r10+2]
- mov rdi,rdx
- movq xmm2,[rbx]
- movq xmm9,[r11+rbx]
- movsx edx,byte [r10+1]
- mov word [rsp+2],cx
- mov word [rsp],cx
- movsx eax,byte [r10+3]
- mov word [rsp+6],dx
- mov word [rsp+4],dx
- movdqa xmm11,xmm1
- mov word [rsp+0Eh],ax
- mov word [rsp+0Ch],ax
- lea eax,[r11+r11]
- movsxd rcx,eax
- mov rax,rbx
- mov rdx,rdi
- sub rax,rcx
- mov word [rsp+0Ah],r8w
- mov word [rsp+8],r8w
- movdqa xmm6,[rsp]
- movdqa xmm7,xmm6
- movq xmm13, [rax]
- mov rax,rdi
- sub rax,rcx
- mov rcx,rbx
- pcmpgtw xmm7,xmm1
- psubw xmm11,xmm6
- sub rcx,r11
- sub rdx,r11
- movq xmm0,[rax]
- movsx eax,r9w
- movq xmm15,[rcx]
- punpcklqdq xmm13,xmm0
- movq xmm0, [rdx]
- movdqa xmm4,xmm13
- punpcklqdq xmm15,xmm0
- movq xmm0, [rdi]
- punpcklbw xmm4,xmm1
- movdqa xmm12,xmm15
- punpcklqdq xmm2,xmm0
- movq xmm0, [r11+rdi]
- punpcklbw xmm12,xmm1
- movdqa xmm14,xmm2
- punpcklqdq xmm9,xmm0
- punpckhbw xmm2,xmm1
- punpcklbw xmm14,xmm1
- movd xmm0,eax
- movsx eax,word [rsp + 0C8h + 38h + 160] ; iBeta
- punpckhbw xmm13,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm9
- movdqa [rsp+10h],xmm2
- punpcklwd xmm0,xmm0
- punpckhbw xmm9,xmm1
- punpcklbw xmm3,xmm1
- movdqa xmm1,xmm14
- pshufd xmm10,xmm0,0
- movd xmm0,eax
- mov eax,4
- cwde
- punpcklwd xmm0,xmm0
- pshufd xmm8,xmm0,0
- movd xmm0,eax
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- psubw xmm1,xmm12
- movdqa xmm2,xmm10
- lea r11,[rsp+0C8h]
- psllw xmm1,2
- movdqa xmm0,xmm4
- psubw xmm4,xmm12
- psubw xmm0,xmm3
- psubw xmm3,xmm14
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm11
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm12
- psubw xmm0,xmm14
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- movdqa xmm3,[rsp]
- pand xmm2,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- psubw xmm0,xmm9
- psubw xmm13,xmm15
- pand xmm2,xmm7
- pand xmm6,xmm2
- paddw xmm12,xmm6
- psubw xmm14,xmm6
- movdqa xmm2,[rsp+10h]
- movaps xmm6,[r11-18h]
- movdqa xmm1,xmm2
- psubw xmm1,xmm15
- psubw xmm9,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm15
- psubw xmm0,xmm2
- psraw xmm1,3
- pmaxsw xmm11,xmm1
- pabsw xmm0,xmm0
- movdqa xmm1,xmm8
- pcmpgtw xmm10,xmm0
- pabsw xmm0,xmm13
- pminsw xmm3,xmm11
- movaps xmm11,[r11-68h]
- movaps xmm13,[rsp+40h]
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm9
- movaps xmm9, [r11-48h]
- pand xmm10,xmm1
- pcmpgtw xmm8,xmm0
- pand xmm10,xmm8
- pand xmm10,xmm7
- movaps xmm8,[r11-38h]
- movaps xmm7,[r11-28h]
- pand xmm3,xmm10
- paddw xmm15,xmm3
- psubw xmm2,xmm3
- movaps xmm10,[r11-58h]
- packuswb xmm12,xmm15
- movaps xmm15,[rsp+20h]
- packuswb xmm14,xmm2
- movq [rcx],xmm12
- movq [rbx],xmm14
- psrldq xmm12,8
- psrldq xmm14,8
- movq [rdx],xmm12
- movaps xmm12,[r11-78h]
- movq [rdi],xmm14
- movaps xmm14,[rsp+30h]
- mov rsp,r11
- POP_XMM
- pop rdi
- pop rbx
- ret
+ mov rax,rsp
+ push rbx
+ push rdi
+ PUSH_XMM 16
+ sub rsp,0C8h
+ mov r10,qword [rax + 30h] ; pTC
+ pxor xmm1,xmm1
+ mov rbx,rcx
+ movsxd r11,r8d
+ movsx ecx,byte [r10]
+ movsx r8d,byte [r10+2]
+ mov rdi,rdx
+ movq xmm2,[rbx]
+ movq xmm9,[r11+rbx]
+ movsx edx,byte [r10+1]
+ mov word [rsp+2],cx
+ mov word [rsp],cx
+ movsx eax,byte [r10+3]
+ mov word [rsp+6],dx
+ mov word [rsp+4],dx
+ movdqa xmm11,xmm1
+ mov word [rsp+0Eh],ax
+ mov word [rsp+0Ch],ax
+ lea eax,[r11+r11]
+ movsxd rcx,eax
+ mov rax,rbx
+ mov rdx,rdi
+ sub rax,rcx
+ mov word [rsp+0Ah],r8w
+ mov word [rsp+8],r8w
+ movdqa xmm6,[rsp]
+ movdqa xmm7,xmm6
+ movq xmm13, [rax]
+ mov rax,rdi
+ sub rax,rcx
+ mov rcx,rbx
+ pcmpgtw xmm7,xmm1
+ psubw xmm11,xmm6
+ sub rcx,r11
+ sub rdx,r11
+ movq xmm0,[rax]
+ movsx eax,r9w
+ movq xmm15,[rcx]
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rdx]
+ movdqa xmm4,xmm13
+ punpcklqdq xmm15,xmm0
+ movq xmm0, [rdi]
+ punpcklbw xmm4,xmm1
+ movdqa xmm12,xmm15
+ punpcklqdq xmm2,xmm0
+ movq xmm0, [r11+rdi]
+ punpcklbw xmm12,xmm1
+ movdqa xmm14,xmm2
+ punpcklqdq xmm9,xmm0
+ punpckhbw xmm2,xmm1
+ punpcklbw xmm14,xmm1
+ movd xmm0,eax
+ movsx eax,word [rsp + 0C8h + 38h + 160] ; iBeta
+ punpckhbw xmm13,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm9
+ movdqa [rsp+10h],xmm2
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm9,xmm1
+ punpcklbw xmm3,xmm1
+ movdqa xmm1,xmm14
+ pshufd xmm10,xmm0,0
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ punpcklwd xmm0,xmm0
+ pshufd xmm8,xmm0,0
+ movd xmm0,eax
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ psubw xmm1,xmm12
+ movdqa xmm2,xmm10
+ lea r11,[rsp+0C8h]
+ psllw xmm1,2
+ movdqa xmm0,xmm4
+ psubw xmm4,xmm12
+ psubw xmm0,xmm3
+ psubw xmm3,xmm14
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm11
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm12
+ psubw xmm0,xmm14
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ movdqa xmm3,[rsp]
+ pand xmm2,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ psubw xmm0,xmm9
+ psubw xmm13,xmm15
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ paddw xmm12,xmm6
+ psubw xmm14,xmm6
+ movdqa xmm2,[rsp+10h]
+ movaps xmm6,[r11-18h]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm15
+ psubw xmm9,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm15
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ pmaxsw xmm11,xmm1
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm8
+ pcmpgtw xmm10,xmm0
+ pabsw xmm0,xmm13
+ pminsw xmm3,xmm11
+ movaps xmm11,[r11-68h]
+ movaps xmm13,[rsp+40h]
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm9
+ movaps xmm9, [r11-48h]
+ pand xmm10,xmm1
+ pcmpgtw xmm8,xmm0
+ pand xmm10,xmm8
+ pand xmm10,xmm7
+ movaps xmm8,[r11-38h]
+ movaps xmm7,[r11-28h]
+ pand xmm3,xmm10
+ paddw xmm15,xmm3
+ psubw xmm2,xmm3
+ movaps xmm10,[r11-58h]
+ packuswb xmm12,xmm15
+ movaps xmm15,[rsp+20h]
+ packuswb xmm14,xmm2
+ movq [rcx],xmm12
+ movq [rbx],xmm14
+ psrldq xmm12,8
+ psrldq xmm14,8
+ movq [rdx],xmm12
+ movaps xmm12,[r11-78h]
+ movq [rdi],xmm14
+ movaps xmm14,[rsp+30h]
+ mov rsp,r11
+ POP_XMM
+ pop rdi
+ pop rbx
+ ret
WELS_EXTERN DeblockChromaEq4V_ssse3
- mov rax,rsp
- push rbx
- PUSH_XMM 15
- sub rsp,90h
- pxor xmm1,xmm1
- mov r11,rcx
- mov rbx,rdx
- mov r10d,r9d
- movq xmm13,[r11]
- lea eax,[r8+r8]
- movsxd r9,eax
- mov rax,rcx
- sub rax,r9
- movq xmm14,[rax]
- mov rax,rdx
- sub rax,r9
- movq xmm0,[rax]
- movsxd rax,r8d
- sub rcx,rax
- sub rdx,rax
- movq xmm12,[rax+r11]
- movq xmm10,[rcx]
- punpcklqdq xmm14,xmm0
- movdqa xmm8,xmm14
- movq xmm0,[rdx]
- punpcklbw xmm8,xmm1
- punpckhbw xmm14,xmm1
- punpcklqdq xmm10,xmm0
- movq xmm0,[rbx]
- movdqa xmm5,xmm10
- punpcklqdq xmm13,xmm0
- movq xmm0, [rax+rbx]
- punpcklbw xmm5,xmm1
- movsx eax,r10w
- movdqa xmm9,xmm13
- punpcklqdq xmm12,xmm0
- punpcklbw xmm9,xmm1
- punpckhbw xmm10,xmm1
- movd xmm0,eax
- movsx eax,word [rsp + 90h + 8h + 28h + 144] ; iBeta
- punpckhbw xmm13,xmm1
- movdqa xmm7,xmm12
- punpcklwd xmm0,xmm0
- punpckhbw xmm12,xmm1
- pshufd xmm11,xmm0,0
- punpcklbw xmm7,xmm1
- movd xmm0,eax
- movdqa xmm1,xmm8
- psubw xmm1,xmm5
- punpcklwd xmm0,xmm0
- movdqa xmm6,xmm11
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm5
- psubw xmm0,xmm9
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm10
- movdqa xmm1,xmm14
- psubw xmm0,xmm13
- psubw xmm1,xmm10
- pabsw xmm0,xmm0
- pcmpgtw xmm11,xmm0
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm11,xmm2
- movdqa xmm0,xmm12
- movdqa xmm4,xmm6
- movdqa xmm1,xmm8
- mov eax,2
- cwde
- paddw xmm1,xmm8
- psubw xmm0,xmm13
- paddw xmm1,xmm5
- pabsw xmm0,xmm0
- movdqa xmm2,xmm14
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm14
- movd xmm0,eax
- pand xmm11,xmm3
- paddw xmm7,xmm7
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- paddw xmm2,xmm12
- paddw xmm12,xmm12
- pshufd xmm3,xmm0,0
- paddw xmm7,xmm9
- paddw xmm12,xmm13
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm5
- paddw xmm7,xmm8
- psraw xmm1,2
- paddw xmm12,xmm14
- paddw xmm7,xmm3
- movaps xmm14,[rsp]
- pand xmm4,xmm1
- paddw xmm12,xmm3
- psraw xmm7,2
- movdqa xmm1,xmm11
- por xmm4,xmm0
- psraw xmm12,2
- paddw xmm2,xmm3
- movdqa xmm0,xmm11
- pandn xmm0,xmm10
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- packuswb xmm4,xmm1
- movdqa xmm0,xmm11
- movdqa xmm1,xmm6
- pand xmm1,xmm7
- movaps xmm7,[rsp+70h]
- movq [rcx],xmm4
- pandn xmm6,xmm9
- pandn xmm11,xmm13
- pand xmm0,xmm12
- por xmm1,xmm6
- por xmm0,xmm11
- psrldq xmm4,8
- packuswb xmm1,xmm0
- movq [r11],xmm1
- psrldq xmm1,8
- movq [rdx],xmm4
- lea r11,[rsp+90h]
- movaps xmm6,[r11-10h]
- movaps xmm8,[r11-30h]
- movaps xmm9,[r11-40h]
- movq [rbx],xmm1
- movaps xmm10,[r11-50h]
- movaps xmm11,[r11-60h]
- movaps xmm12,[r11-70h]
- movaps xmm13,[r11-80h]
- mov rsp,r11
- POP_XMM
- pop rbx
- ret
+ mov rax,rsp
+ push rbx
+ PUSH_XMM 15
+ sub rsp,90h
+ pxor xmm1,xmm1
+ mov r11,rcx
+ mov rbx,rdx
+ mov r10d,r9d
+ movq xmm13,[r11]
+ lea eax,[r8+r8]
+ movsxd r9,eax
+ mov rax,rcx
+ sub rax,r9
+ movq xmm14,[rax]
+ mov rax,rdx
+ sub rax,r9
+ movq xmm0,[rax]
+ movsxd rax,r8d
+ sub rcx,rax
+ sub rdx,rax
+ movq xmm12,[rax+r11]
+ movq xmm10,[rcx]
+ punpcklqdq xmm14,xmm0
+ movdqa xmm8,xmm14
+ movq xmm0,[rdx]
+ punpcklbw xmm8,xmm1
+ punpckhbw xmm14,xmm1
+ punpcklqdq xmm10,xmm0
+ movq xmm0,[rbx]
+ movdqa xmm5,xmm10
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rax+rbx]
+ punpcklbw xmm5,xmm1
+ movsx eax,r10w
+ movdqa xmm9,xmm13
+ punpcklqdq xmm12,xmm0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm10,xmm1
+ movd xmm0,eax
+ movsx eax,word [rsp + 90h + 8h + 28h + 144] ; iBeta
+ punpckhbw xmm13,xmm1
+ movdqa xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm12,xmm1
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm7,xmm1
+ movd xmm0,eax
+ movdqa xmm1,xmm8
+ psubw xmm1,xmm5
+ punpcklwd xmm0,xmm0
+ movdqa xmm6,xmm11
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm9
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm10
+ movdqa xmm1,xmm14
+ psubw xmm0,xmm13
+ psubw xmm1,xmm10
+ pabsw xmm0,xmm0
+ pcmpgtw xmm11,xmm0
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm11,xmm2
+ movdqa xmm0,xmm12
+ movdqa xmm4,xmm6
+ movdqa xmm1,xmm8
+ mov eax,2
+ cwde
+ paddw xmm1,xmm8
+ psubw xmm0,xmm13
+ paddw xmm1,xmm5
+ pabsw xmm0,xmm0
+ movdqa xmm2,xmm14
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm14
+ movd xmm0,eax
+ pand xmm11,xmm3
+ paddw xmm7,xmm7
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ paddw xmm2,xmm12
+ paddw xmm12,xmm12
+ pshufd xmm3,xmm0,0
+ paddw xmm7,xmm9
+ paddw xmm12,xmm13
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm5
+ paddw xmm7,xmm8
+ psraw xmm1,2
+ paddw xmm12,xmm14
+ paddw xmm7,xmm3
+ movaps xmm14,[rsp]
+ pand xmm4,xmm1
+ paddw xmm12,xmm3
+ psraw xmm7,2
+ movdqa xmm1,xmm11
+ por xmm4,xmm0
+ psraw xmm12,2
+ paddw xmm2,xmm3
+ movdqa xmm0,xmm11
+ pandn xmm0,xmm10
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ packuswb xmm4,xmm1
+ movdqa xmm0,xmm11
+ movdqa xmm1,xmm6
+ pand xmm1,xmm7
+ movaps xmm7,[rsp+70h]
+ movq [rcx],xmm4
+ pandn xmm6,xmm9
+ pandn xmm11,xmm13
+ pand xmm0,xmm12
+ por xmm1,xmm6
+ por xmm0,xmm11
+ psrldq xmm4,8
+ packuswb xmm1,xmm0
+ movq [r11],xmm1
+ psrldq xmm1,8
+ movq [rdx],xmm4
+ lea r11,[rsp+90h]
+ movaps xmm6,[r11-10h]
+ movaps xmm8,[r11-30h]
+ movaps xmm9,[r11-40h]
+ movq [rbx],xmm1
+ movaps xmm10,[r11-50h]
+ movaps xmm11,[r11-60h]
+ movaps xmm12,[r11-70h]
+ movaps xmm13,[r11-80h]
+ mov rsp,r11
+ POP_XMM
+ pop rbx
+ ret
@@ -1089,548 +1089,548 @@
WELS_EXTERN DeblockChromaEq4H_ssse3
- mov rax,rsp
- mov [rax+20h],rbx
- push rdi
- PUSH_XMM 16
- sub rsp,140h
- mov rdi,rdx
- lea eax,[r8*4]
- movsxd r10,eax
- mov eax,[rcx-2]
- mov [rsp+10h],eax
- lea rbx,[r10+rdx-2]
- lea r11,[r10+rcx-2]
- movdqa xmm5,[rsp+10h]
- movsxd r10,r8d
- mov eax,[r10+rcx-2]
- lea rdx,[r10+r10*2]
- mov [rsp+20h],eax
- mov eax,[rcx+r10*2-2]
- mov [rsp+30h],eax
- mov eax,[rdx+rcx-2]
- movdqa xmm2,[rsp+20h]
- mov [rsp+40h],eax
- mov eax, [rdi-2]
- movdqa xmm4,[rsp+30h]
- mov [rsp+50h],eax
- mov eax,[r10+rdi-2]
- movdqa xmm3,[rsp+40h]
- mov [rsp+60h],eax
- mov eax,[rdi+r10*2-2]
- punpckldq xmm5,[rsp+50h]
- mov [rsp+70h],eax
- mov eax, [rdx+rdi-2]
- punpckldq xmm2, [rsp+60h]
- mov [rsp+80h],eax
- mov eax,[r11]
- punpckldq xmm4, [rsp+70h]
- mov [rsp+50h],eax
- mov eax,[rbx]
- punpckldq xmm3,[rsp+80h]
- mov [rsp+60h],eax
- mov eax,[r10+r11]
- movdqa xmm0, [rsp+50h]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[r10+rbx]
- movdqa xmm0,[rsp+50h]
- movdqa xmm1,xmm5
- mov [rsp+60h],eax
- mov eax,[r11+r10*2]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[rbx+r10*2]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- mov eax, [rdx+r11]
- movdqa xmm15,xmm1
- punpckldq xmm0,[rsp+60h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax, [rdx+rbx]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm15,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm12,xmm15
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm12,xmm0
- punpckhdq xmm15,xmm0
- movdqa xmm0,xmm1
- movdqa xmm11,xmm12
- punpckldq xmm0,xmm5
- punpckhdq xmm1,xmm5
- punpcklqdq xmm11,xmm0
- punpckhqdq xmm12,xmm0
- movsx eax,r9w
- movdqa xmm14,xmm15
- punpcklqdq xmm14,xmm1
- punpckhqdq xmm15,xmm1
- pxor xmm1,xmm1
- movd xmm0,eax
- movdqa xmm4,xmm12
- movdqa xmm8,xmm11
- movsx eax,word [rsp+170h + 160] ; iBeta
- punpcklwd xmm0,xmm0
- punpcklbw xmm4,xmm1
- punpckhbw xmm12,xmm1
- movdqa xmm9,xmm14
- movdqa xmm7,xmm15
- movdqa xmm10,xmm15
- pshufd xmm13,xmm0,0
- punpcklbw xmm9,xmm1
- punpckhbw xmm14,xmm1
- movdqa xmm6,xmm13
- movd xmm0,eax
- movdqa [rsp],xmm11
- mov eax,2
- cwde
- punpckhbw xmm11,xmm1
- punpckhbw xmm10,xmm1
- punpcklbw xmm7,xmm1
- punpcklwd xmm0,xmm0
- punpcklbw xmm8,xmm1
- pshufd xmm3,xmm0,0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm4
- psubw xmm0,xmm9
- psubw xmm1,xmm4
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm12
- movdqa xmm1,xmm11
- psubw xmm0,xmm14
- psubw xmm1,xmm12
- movdqa xmm5,xmm6
- pabsw xmm0,xmm0
- pcmpgtw xmm13,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm2,xmm0
- paddw xmm1,xmm8
- movdqa xmm0,xmm10
- pand xmm13,xmm2
- psubw xmm0,xmm14
- paddw xmm1,xmm4
- movdqa xmm2,xmm11
- pabsw xmm0,xmm0
- paddw xmm2,xmm11
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm12
- movd xmm0,eax
- pand xmm13,xmm3
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm4
- paddw xmm2,xmm3
- psraw xmm1,2
- pand xmm5,xmm1
- por xmm5,xmm0
- paddw xmm7,xmm7
- paddw xmm10,xmm10
- psraw xmm2,2
- movdqa xmm1,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm2
- paddw xmm7,xmm9
- por xmm1,xmm0
- paddw xmm10,xmm14
- paddw xmm7,xmm8
- movdqa xmm0,xmm13
- packuswb xmm5,xmm1
- paddw xmm7,xmm3
- paddw xmm10,xmm11
- movdqa xmm1,xmm6
- paddw xmm10,xmm3
- pandn xmm6,xmm9
- psraw xmm7,2
- pand xmm1,xmm7
- psraw xmm10,2
- pandn xmm13,xmm14
- pand xmm0,xmm10
- por xmm1,xmm6
- movdqa xmm6,[rsp]
- movdqa xmm4,xmm6
- por xmm0,xmm13
- punpcklbw xmm4,xmm5
- punpckhbw xmm6,xmm5
- movdqa xmm3,xmm4
- packuswb xmm1,xmm0
- movdqa xmm0,xmm1
- punpckhbw xmm1,xmm15
- punpcklbw xmm0,xmm15
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm6
- movdqa xmm2,xmm3
- punpcklwd xmm0,xmm1
- punpckhwd xmm6,xmm1
- movdqa xmm1,xmm4
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm6
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm6
- punpckhqdq xmm2,xmm1
- movdqa [rsp+10h],xmm0
- movdqa [rsp+60h],xmm2
- movdqa xmm0,xmm3
- mov eax,[rsp+10h]
- mov [rcx-2],eax
- mov eax,[rsp+60h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [r10+rcx-2],eax
- movdqa [rsp+20h],xmm0
- mov eax, [rsp+20h]
- movdqa [rsp+70h],xmm3
- mov [rcx+r10*2-2],eax
- mov eax,[rsp+70h]
- mov [rdx+rcx-2],eax
- mov eax,[rsp+18h]
- mov [r11],eax
- mov eax,[rsp+68h]
- mov [r10+r11],eax
- mov eax,[rsp+28h]
- mov [r11+r10*2],eax
- mov eax,[rsp+78h]
- mov [rdx+r11],eax
- mov eax,[rsp+14h]
- mov [rdi-2],eax
- mov eax,[rsp+64h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+24h]
- mov [rdi+r10*2-2],eax
- mov eax, [rsp+74h]
- mov [rdx+rdi-2],eax
- mov eax, [rsp+1Ch]
- mov [rbx],eax
- mov eax, [rsp+6Ch]
- mov [r10+rbx],eax
- mov eax,[rsp+2Ch]
- mov [rbx+r10*2],eax
- mov eax,[rsp+7Ch]
- mov [rdx+rbx],eax
- lea rsp,[rsp+140h]
- POP_XMM
- mov rbx, [rsp+28h]
- pop rdi
- ret
+ mov rax,rsp
+ mov [rax+20h],rbx
+ push rdi
+ PUSH_XMM 16
+ sub rsp,140h
+ mov rdi,rdx
+ lea eax,[r8*4]
+ movsxd r10,eax
+ mov eax,[rcx-2]
+ mov [rsp+10h],eax
+ lea rbx,[r10+rdx-2]
+ lea r11,[r10+rcx-2]
+ movdqa xmm5,[rsp+10h]
+ movsxd r10,r8d
+ mov eax,[r10+rcx-2]
+ lea rdx,[r10+r10*2]
+ mov [rsp+20h],eax
+ mov eax,[rcx+r10*2-2]
+ mov [rsp+30h],eax
+ mov eax,[rdx+rcx-2]
+ movdqa xmm2,[rsp+20h]
+ mov [rsp+40h],eax
+ mov eax, [rdi-2]
+ movdqa xmm4,[rsp+30h]
+ mov [rsp+50h],eax
+ mov eax,[r10+rdi-2]
+ movdqa xmm3,[rsp+40h]
+ mov [rsp+60h],eax
+ mov eax,[rdi+r10*2-2]
+ punpckldq xmm5,[rsp+50h]
+ mov [rsp+70h],eax
+ mov eax, [rdx+rdi-2]
+ punpckldq xmm2, [rsp+60h]
+ mov [rsp+80h],eax
+ mov eax,[r11]
+ punpckldq xmm4, [rsp+70h]
+ mov [rsp+50h],eax
+ mov eax,[rbx]
+ punpckldq xmm3,[rsp+80h]
+ mov [rsp+60h],eax
+ mov eax,[r10+r11]
+ movdqa xmm0, [rsp+50h]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm0,[rsp+50h]
+ movdqa xmm1,xmm5
+ mov [rsp+60h],eax
+ mov eax,[r11+r10*2]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[rbx+r10*2]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ mov eax, [rdx+r11]
+ movdqa xmm15,xmm1
+ punpckldq xmm0,[rsp+60h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax, [rdx+rbx]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm15,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm12,xmm15
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm12,xmm0
+ punpckhdq xmm15,xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm11,xmm12
+ punpckldq xmm0,xmm5
+ punpckhdq xmm1,xmm5
+ punpcklqdq xmm11,xmm0
+ punpckhqdq xmm12,xmm0
+ movsx eax,r9w
+ movdqa xmm14,xmm15
+ punpcklqdq xmm14,xmm1
+ punpckhqdq xmm15,xmm1
+ pxor xmm1,xmm1
+ movd xmm0,eax
+ movdqa xmm4,xmm12
+ movdqa xmm8,xmm11
+ movsx eax,word [rsp+170h + 160] ; iBeta
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm4,xmm1
+ punpckhbw xmm12,xmm1
+ movdqa xmm9,xmm14
+ movdqa xmm7,xmm15
+ movdqa xmm10,xmm15
+ pshufd xmm13,xmm0,0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm14,xmm1
+ movdqa xmm6,xmm13
+ movd xmm0,eax
+ movdqa [rsp],xmm11
+ mov eax,2
+ cwde
+ punpckhbw xmm11,xmm1
+ punpckhbw xmm10,xmm1
+ punpcklbw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm8,xmm1
+ pshufd xmm3,xmm0,0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm4
+ psubw xmm0,xmm9
+ psubw xmm1,xmm4
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm12
+ movdqa xmm1,xmm11
+ psubw xmm0,xmm14
+ psubw xmm1,xmm12
+ movdqa xmm5,xmm6
+ pabsw xmm0,xmm0
+ pcmpgtw xmm13,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm2,xmm0
+ paddw xmm1,xmm8
+ movdqa xmm0,xmm10
+ pand xmm13,xmm2
+ psubw xmm0,xmm14
+ paddw xmm1,xmm4
+ movdqa xmm2,xmm11
+ pabsw xmm0,xmm0
+ paddw xmm2,xmm11
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm12
+ movd xmm0,eax
+ pand xmm13,xmm3
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm4
+ paddw xmm2,xmm3
+ psraw xmm1,2
+ pand xmm5,xmm1
+ por xmm5,xmm0
+ paddw xmm7,xmm7
+ paddw xmm10,xmm10
+ psraw xmm2,2
+ movdqa xmm1,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm2
+ paddw xmm7,xmm9
+ por xmm1,xmm0
+ paddw xmm10,xmm14
+ paddw xmm7,xmm8
+ movdqa xmm0,xmm13
+ packuswb xmm5,xmm1
+ paddw xmm7,xmm3
+ paddw xmm10,xmm11
+ movdqa xmm1,xmm6
+ paddw xmm10,xmm3
+ pandn xmm6,xmm9
+ psraw xmm7,2
+ pand xmm1,xmm7
+ psraw xmm10,2
+ pandn xmm13,xmm14
+ pand xmm0,xmm10
+ por xmm1,xmm6
+ movdqa xmm6,[rsp]
+ movdqa xmm4,xmm6
+ por xmm0,xmm13
+ punpcklbw xmm4,xmm5
+ punpckhbw xmm6,xmm5
+ movdqa xmm3,xmm4
+ packuswb xmm1,xmm0
+ movdqa xmm0,xmm1
+ punpckhbw xmm1,xmm15
+ punpcklbw xmm0,xmm15
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm6
+ movdqa xmm2,xmm3
+ punpcklwd xmm0,xmm1
+ punpckhwd xmm6,xmm1
+ movdqa xmm1,xmm4
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm6
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm6
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+10h],xmm0
+ movdqa [rsp+60h],xmm2
+ movdqa xmm0,xmm3
+ mov eax,[rsp+10h]
+ mov [rcx-2],eax
+ mov eax,[rsp+60h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [r10+rcx-2],eax
+ movdqa [rsp+20h],xmm0
+ mov eax, [rsp+20h]
+ movdqa [rsp+70h],xmm3
+ mov [rcx+r10*2-2],eax
+ mov eax,[rsp+70h]
+ mov [rdx+rcx-2],eax
+ mov eax,[rsp+18h]
+ mov [r11],eax
+ mov eax,[rsp+68h]
+ mov [r10+r11],eax
+ mov eax,[rsp+28h]
+ mov [r11+r10*2],eax
+ mov eax,[rsp+78h]
+ mov [rdx+r11],eax
+ mov eax,[rsp+14h]
+ mov [rdi-2],eax
+ mov eax,[rsp+64h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+24h]
+ mov [rdi+r10*2-2],eax
+ mov eax, [rsp+74h]
+ mov [rdx+rdi-2],eax
+ mov eax, [rsp+1Ch]
+ mov [rbx],eax
+ mov eax, [rsp+6Ch]
+ mov [r10+rbx],eax
+ mov eax,[rsp+2Ch]
+ mov [rbx+r10*2],eax
+ mov eax,[rsp+7Ch]
+ mov [rdx+rbx],eax
+ lea rsp,[rsp+140h]
+ POP_XMM
+ mov rbx, [rsp+28h]
+ pop rdi
+ ret
WELS_EXTERN DeblockChromaLt4H_ssse3
- mov rax,rsp
- push rbx
- push rbp
- push rsi
- push rdi
- push r12
- PUSH_XMM 16
- sub rsp,170h
+ mov rax,rsp
+ push rbx
+ push rbp
+ push rsi
+ push rdi
+ push r12
+ PUSH_XMM 16
+ sub rsp,170h
- movsxd rsi,r8d
- lea eax,[r8*4]
- mov r11d,r9d
- movsxd r10,eax
- mov eax, [rcx-2]
- mov r12,rdx
- mov [rsp+40h],eax
- mov eax, [rsi+rcx-2]
- lea rbx,[r10+rcx-2]
- movdqa xmm5,[rsp+40h]
- mov [rsp+50h],eax
- mov eax, [rcx+rsi*2-2]
- lea rbp,[r10+rdx-2]
- movdqa xmm2, [rsp+50h]
- mov [rsp+60h],eax
- lea r10,[rsi+rsi*2]
- mov rdi,rcx
- mov eax,[r10+rcx-2]
- movdqa xmm4,[rsp+60h]
- mov [rsp+70h],eax
- mov eax,[rdx-2]
- mov [rsp+80h],eax
- mov eax, [rsi+rdx-2]
- movdqa xmm3,[rsp+70h]
- mov [rsp+90h],eax
- mov eax,[rdx+rsi*2-2]
- punpckldq xmm5,[rsp+80h]
- mov [rsp+0A0h],eax
- mov eax, [r10+rdx-2]
- punpckldq xmm2,[rsp+90h]
- mov [rsp+0B0h],eax
- mov eax, [rbx]
- punpckldq xmm4,[rsp+0A0h]
- mov [rsp+80h],eax
- mov eax,[rbp]
- punpckldq xmm3,[rsp+0B0h]
- mov [rsp+90h],eax
- mov eax,[rsi+rbx]
- movdqa xmm0,[rsp+80h]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rsi+rbp]
- movdqa xmm0,[rsp+80h]
- movdqa xmm1,xmm5
- mov [rsp+90h],eax
- mov eax,[rbx+rsi*2]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rbp+rsi*2]
- movdqa xmm0, [rsp+80h]
- mov [rsp+90h],eax
- mov eax,[r10+rbx]
- movdqa xmm7,xmm1
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax, [r10+rbp]
- movdqa xmm0,[rsp+80h]
- mov [rsp+90h],eax
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm7,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm6,xmm7
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm6,xmm0
- punpckhdq xmm7,xmm0
- movdqa xmm0,xmm1
- punpckldq xmm0,xmm5
- mov rax, [rsp+1C8h+160] ; pTC
- punpckhdq xmm1,xmm5
- movdqa xmm9,xmm6
- punpckhqdq xmm6,xmm0
- punpcklqdq xmm9,xmm0
- movdqa xmm2,xmm7
- movdqa xmm13,xmm6
- movdqa xmm4,xmm9
- movdqa [rsp+10h],xmm9
- punpcklqdq xmm2,xmm1
- punpckhqdq xmm7,xmm1
- pxor xmm1,xmm1
- movsx ecx,byte [rax+3]
- movsx edx,byte [rax+2]
- movsx r8d,byte [rax+1]
- movsx r9d,byte [rax]
- movdqa xmm10,xmm1
- movdqa xmm15,xmm2
- punpckhbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm4,xmm1
- movsx eax,r11w
- mov word [rsp+0Eh],cx
- mov word [rsp+0Ch],cx
- movdqa xmm3,xmm7
- movdqa xmm8,xmm7
- movdqa [rsp+20h],xmm7
- punpcklbw xmm15,xmm1
- punpcklbw xmm13,xmm1
- punpcklbw xmm3,xmm1
- mov word [rsp+0Ah],dx
- mov word [rsp+8],dx
- mov word [rsp+6],r8w
- movd xmm0,eax
- movdqa [rsp+30h],xmm6
- punpckhbw xmm9,xmm1
- punpckhbw xmm8,xmm1
- punpcklwd xmm0,xmm0
- movsx eax,word [rsp+1C0h+160] ; iBeta
- mov word [rsp+4],r8w
- mov word [rsp+2],r9w
- pshufd xmm12,xmm0,0
- mov word [rsp],r9w
- movd xmm0,eax
- mov eax,4
- cwde
- movdqa xmm14, [rsp]
- movdqa [rsp],xmm2
- movdqa xmm2,xmm12
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- psubw xmm10,xmm14
- movd xmm0,eax
- movdqa xmm7,xmm14
- movdqa xmm6,xmm14
- pcmpgtw xmm7,xmm1
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- movdqa xmm0,xmm4
- movdqa xmm1,xmm15
- psubw xmm4,xmm13
- psubw xmm0,xmm3
- psubw xmm1,xmm13
- psubw xmm3,xmm15
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm10
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm11
- movdqa xmm0,xmm13
- psubw xmm0,xmm15
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- pand xmm2,xmm1
- movdqa xmm1,xmm11
- movdqa xmm3,[rsp+30h]
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm9
- pand xmm2,xmm1
- psubw xmm0,xmm8
- psubw xmm9,xmm3
- pand xmm2,xmm7
- pand xmm6,xmm2
- psubw xmm15,xmm6
- paddw xmm13,xmm6
- movdqa xmm2,[rsp]
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- psubw xmm8,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm3
- movdqa xmm5,[rsp+10h]
- psubw xmm0,xmm2
- psraw xmm1,3
- movdqa xmm4,xmm5
- pabsw xmm0,xmm0
- pmaxsw xmm10,xmm1
- movdqa xmm1,xmm11
- pcmpgtw xmm12,xmm0
- pabsw xmm0,xmm9
- pminsw xmm14,xmm10
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm8
- pcmpgtw xmm11,xmm0
- pand xmm12,xmm1
- movdqa xmm1,[rsp+20h]
- pand xmm12,xmm11
- pand xmm12,xmm7
- pand xmm14,xmm12
- paddw xmm3,xmm14
- psubw xmm2,xmm14
- packuswb xmm13,xmm3
- packuswb xmm15,xmm2
- punpcklbw xmm4,xmm13
- punpckhbw xmm5,xmm13
- movdqa xmm0,xmm15
- punpcklbw xmm0,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm4
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm5
- movdqa xmm2,xmm3
- movdqa xmm1,xmm4
- punpcklwd xmm0,xmm15
- punpckhwd xmm5,xmm15
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm5
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm5
- punpckhqdq xmm2,xmm1
- movdqa [rsp+40h],xmm0
- movdqa xmm0,xmm3
- movdqa [rsp+90h],xmm2
- mov eax,[rsp+40h]
- mov [rdi-2],eax
- mov eax, [rsp+90h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [rsi+rdi-2],eax
- movdqa [rsp+50h],xmm0
- mov eax,[rsp+50h]
- movdqa [rsp+0A0h],xmm3
- mov [rdi+rsi*2-2],eax
- mov eax,[rsp+0A0h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+48h]
- mov [rbx],eax
- mov eax,[rsp+98h]
- mov [rsi+rbx],eax
- mov eax,[rsp+58h]
- mov [rbx+rsi*2],eax
- mov eax, [rsp+0A8h]
- mov [r10+rbx],eax
- mov eax, [rsp+44h]
- mov [r12-2],eax
- mov eax,[rsp+94h]
- mov [rsi+r12-2],eax
- mov eax,[rsp+54h]
- mov [r12+rsi*2-2],eax
- mov eax, [rsp+0A4h]
- mov [r10+r12-2],eax
- mov eax,[rsp+4Ch]
- mov [rbp],eax
- mov eax,[rsp+9Ch]
- mov [rsi+rbp],eax
- mov eax, [rsp+5Ch]
- mov [rbp+rsi*2],eax
- mov eax,[rsp+0ACh]
- mov [r10+rbp],eax
- lea r11,[rsp+170h]
- mov rsp,r11
- POP_XMM
- pop r12
- pop rdi
- pop rsi
- pop rbp
- pop rbx
- ret
+ movsxd rsi,r8d
+ lea eax,[r8*4]
+ mov r11d,r9d
+ movsxd r10,eax
+ mov eax, [rcx-2]
+ mov r12,rdx
+ mov [rsp+40h],eax
+ mov eax, [rsi+rcx-2]
+ lea rbx,[r10+rcx-2]
+ movdqa xmm5,[rsp+40h]
+ mov [rsp+50h],eax
+ mov eax, [rcx+rsi*2-2]
+ lea rbp,[r10+rdx-2]
+ movdqa xmm2, [rsp+50h]
+ mov [rsp+60h],eax
+ lea r10,[rsi+rsi*2]
+ mov rdi,rcx
+ mov eax,[r10+rcx-2]
+ movdqa xmm4,[rsp+60h]
+ mov [rsp+70h],eax
+ mov eax,[rdx-2]
+ mov [rsp+80h],eax
+ mov eax, [rsi+rdx-2]
+ movdqa xmm3,[rsp+70h]
+ mov [rsp+90h],eax
+ mov eax,[rdx+rsi*2-2]
+ punpckldq xmm5,[rsp+80h]
+ mov [rsp+0A0h],eax
+ mov eax, [r10+rdx-2]
+ punpckldq xmm2,[rsp+90h]
+ mov [rsp+0B0h],eax
+ mov eax, [rbx]
+ punpckldq xmm4,[rsp+0A0h]
+ mov [rsp+80h],eax
+ mov eax,[rbp]
+ punpckldq xmm3,[rsp+0B0h]
+ mov [rsp+90h],eax
+ mov eax,[rsi+rbx]
+ movdqa xmm0,[rsp+80h]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rsi+rbp]
+ movdqa xmm0,[rsp+80h]
+ movdqa xmm1,xmm5
+ mov [rsp+90h],eax
+ mov eax,[rbx+rsi*2]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rbp+rsi*2]
+ movdqa xmm0, [rsp+80h]
+ mov [rsp+90h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm7,xmm1
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax, [r10+rbp]
+ movdqa xmm0,[rsp+80h]
+ mov [rsp+90h],eax
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm7,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm6,xmm7
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm6,xmm0
+ punpckhdq xmm7,xmm0
+ movdqa xmm0,xmm1
+ punpckldq xmm0,xmm5
+ mov rax, [rsp+1C8h+160] ; pTC
+ punpckhdq xmm1,xmm5
+ movdqa xmm9,xmm6
+ punpckhqdq xmm6,xmm0
+ punpcklqdq xmm9,xmm0
+ movdqa xmm2,xmm7
+ movdqa xmm13,xmm6
+ movdqa xmm4,xmm9
+ movdqa [rsp+10h],xmm9
+ punpcklqdq xmm2,xmm1
+ punpckhqdq xmm7,xmm1
+ pxor xmm1,xmm1
+ movsx ecx,byte [rax+3]
+ movsx edx,byte [rax+2]
+ movsx r8d,byte [rax+1]
+ movsx r9d,byte [rax]
+ movdqa xmm10,xmm1
+ movdqa xmm15,xmm2
+ punpckhbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm4,xmm1
+ movsx eax,r11w
+ mov word [rsp+0Eh],cx
+ mov word [rsp+0Ch],cx
+ movdqa xmm3,xmm7
+ movdqa xmm8,xmm7
+ movdqa [rsp+20h],xmm7
+ punpcklbw xmm15,xmm1
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm3,xmm1
+ mov word [rsp+0Ah],dx
+ mov word [rsp+8],dx
+ mov word [rsp+6],r8w
+ movd xmm0,eax
+ movdqa [rsp+30h],xmm6
+ punpckhbw xmm9,xmm1
+ punpckhbw xmm8,xmm1
+ punpcklwd xmm0,xmm0
+ movsx eax,word [rsp+1C0h+160] ; iBeta
+ mov word [rsp+4],r8w
+ mov word [rsp+2],r9w
+ pshufd xmm12,xmm0,0
+ mov word [rsp],r9w
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ movdqa xmm14, [rsp]
+ movdqa [rsp],xmm2
+ movdqa xmm2,xmm12
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ psubw xmm10,xmm14
+ movd xmm0,eax
+ movdqa xmm7,xmm14
+ movdqa xmm6,xmm14
+ pcmpgtw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm15
+ psubw xmm4,xmm13
+ psubw xmm0,xmm3
+ psubw xmm1,xmm13
+ psubw xmm3,xmm15
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm10
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm11
+ movdqa xmm0,xmm13
+ psubw xmm0,xmm15
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ pand xmm2,xmm1
+ movdqa xmm1,xmm11
+ movdqa xmm3,[rsp+30h]
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm9
+ pand xmm2,xmm1
+ psubw xmm0,xmm8
+ psubw xmm9,xmm3
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ psubw xmm15,xmm6
+ paddw xmm13,xmm6
+ movdqa xmm2,[rsp]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ psubw xmm8,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm3
+ movdqa xmm5,[rsp+10h]
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ movdqa xmm4,xmm5
+ pabsw xmm0,xmm0
+ pmaxsw xmm10,xmm1
+ movdqa xmm1,xmm11
+ pcmpgtw xmm12,xmm0
+ pabsw xmm0,xmm9
+ pminsw xmm14,xmm10
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm8
+ pcmpgtw xmm11,xmm0
+ pand xmm12,xmm1
+ movdqa xmm1,[rsp+20h]
+ pand xmm12,xmm11
+ pand xmm12,xmm7
+ pand xmm14,xmm12
+ paddw xmm3,xmm14
+ psubw xmm2,xmm14
+ packuswb xmm13,xmm3
+ packuswb xmm15,xmm2
+ punpcklbw xmm4,xmm13
+ punpckhbw xmm5,xmm13
+ movdqa xmm0,xmm15
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm4
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm2,xmm3
+ movdqa xmm1,xmm4
+ punpcklwd xmm0,xmm15
+ punpckhwd xmm5,xmm15
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm5
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm5
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+40h],xmm0
+ movdqa xmm0,xmm3
+ movdqa [rsp+90h],xmm2
+ mov eax,[rsp+40h]
+ mov [rdi-2],eax
+ mov eax, [rsp+90h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [rsi+rdi-2],eax
+ movdqa [rsp+50h],xmm0
+ mov eax,[rsp+50h]
+ movdqa [rsp+0A0h],xmm3
+ mov [rdi+rsi*2-2],eax
+ mov eax,[rsp+0A0h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+48h]
+ mov [rbx],eax
+ mov eax,[rsp+98h]
+ mov [rsi+rbx],eax
+ mov eax,[rsp+58h]
+ mov [rbx+rsi*2],eax
+ mov eax, [rsp+0A8h]
+ mov [r10+rbx],eax
+ mov eax, [rsp+44h]
+ mov [r12-2],eax
+ mov eax,[rsp+94h]
+ mov [rsi+r12-2],eax
+ mov eax,[rsp+54h]
+ mov [r12+rsi*2-2],eax
+ mov eax, [rsp+0A4h]
+ mov [r10+r12-2],eax
+ mov eax,[rsp+4Ch]
+ mov [rbp],eax
+ mov eax,[rsp+9Ch]
+ mov [rsi+rbp],eax
+ mov eax, [rsp+5Ch]
+ mov [rbp+rsi*2],eax
+ mov eax,[rsp+0ACh]
+ mov [r10+rbp],eax
+ lea r11,[rsp+170h]
+ mov rsp,r11
+ POP_XMM
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
+ ret
@@ -1638,1591 +1638,1591 @@
WELS_EXTERN DeblockLumaLt4V_ssse3
- push rbp
- mov r11,r8 ; pTC
- sub rsp,1B0h
- lea rbp,[rsp+20h]
- movd xmm4,edx
- movd xmm2,ecx
- mov qword [rbp+180h],r12
- mov r10,rdi
- movsxd r12,esi
- add rsi,rsi
- movsxd rdx,esi
- sub r10,r12
- movsx r8d,byte [r11]
- pxor xmm3,xmm3
- punpcklwd xmm2,xmm2
- movaps [rbp+50h],xmm14
- lea rax,[r12+r12*2]
- movdqa xmm14,[rdx+rdi]
- neg rax
- pshufd xmm0,xmm2,0
- movd xmm2,r8d
- movsx rsi,byte [r11+1]
- movsx r8d,byte [r11+2]
- movsx r11d,byte [r11+3]
- movaps [rbp+70h],xmm12
- movd xmm1,esi
- movaps [rbp+80h],xmm11
- movd xmm12,r8d
- movd xmm11,r11d
- movdqa xmm5, [rax+rdi]
- lea rax,[r12+r12]
- punpcklwd xmm12,xmm12
- neg rax
- punpcklwd xmm11,xmm11
- movaps [rbp],xmm8
- movdqa xmm8, [r10]
- punpcklwd xmm2,xmm2
- punpcklwd xmm1,xmm1
- punpcklqdq xmm12,xmm12
- punpcklqdq xmm11,xmm11
- punpcklqdq xmm2,xmm2
- punpcklqdq xmm1,xmm1
- shufps xmm12,xmm11,88h
- movdqa xmm11,xmm8
- movaps [rbp+30h],xmm9
- movdqa xmm9,[rdi]
- shufps xmm2,xmm1,88h
- movdqa xmm1,xmm5
- punpcklbw xmm11,xmm3
- movaps [rbp+20h],xmm6
- movaps [rbp+60h],xmm13
- movdqa xmm13,xmm11
- movaps [rbp+90h],xmm10
- movdqa xmm10,xmm9
- movdqa xmm6,[rax+rdi]
- punpcklbw xmm1,xmm3
- movaps [rbp+0A0h],xmm12
- psubw xmm13,xmm1
- movaps [rbp+40h],xmm15
- movdqa xmm15,xmm14
- movaps [rbp+10h],xmm7
- movdqa xmm7,xmm6
- punpcklbw xmm10,xmm3
- movdqa xmm12,[r12+rdi]
- punpcklbw xmm7,xmm3
- punpcklbw xmm12,xmm3
- punpcklbw xmm15,xmm3
- pabsw xmm3,xmm13
- movdqa xmm13,xmm10
- psubw xmm13,xmm15
- movdqa [rbp+0F0h],xmm15
- pabsw xmm15,xmm13
- movdqa xmm13,xmm11
- movdqa [rbp+0B0h],xmm1
- movdqa xmm1,xmm0
- pavgw xmm13,xmm10
- pcmpgtw xmm1,xmm3
- movdqa [rbp+120h],xmm13
- movaps xmm13,xmm2
- punpcklwd xmm4,xmm4
- movdqa xmm3,xmm0
- movdqa [rbp+100h],xmm1
- psubw xmm13,xmm1
- movdqa xmm1,xmm10
- pcmpgtw xmm3,xmm15
- pshufd xmm4,xmm4,0
- psubw xmm1,xmm11
- movdqa [rbp+0D0h],xmm10
- psubw xmm13,xmm3
- movdqa [rbp+110h],xmm3
- pabsw xmm15,xmm1
- movdqa xmm3,xmm4
- psubw xmm10,xmm12
- pcmpgtw xmm3,xmm15
- pabsw xmm15,xmm10
- movdqa xmm10,xmm0
- psllw xmm1,2
- movdqa [rbp+0C0h],xmm11
- psubw xmm11,xmm7
- pcmpgtw xmm10,xmm15
- pabsw xmm11,xmm11
- movdqa xmm15,xmm0
- pand xmm3,xmm10
- pcmpgtw xmm15,xmm11
- movaps xmm11,xmm2
- pxor xmm10,xmm10
- pand xmm3,xmm15
- pcmpgtw xmm11,xmm10
- pcmpeqw xmm10,xmm2
- por xmm11,xmm10
- pand xmm3,xmm11
- movdqa xmm11,xmm7
- psubw xmm11,xmm12
- pxor xmm15,xmm15
- paddw xmm11,xmm1
- psubw xmm15,xmm13
- movdqa [rbp+0E0h],xmm12
- paddw xmm11,[FOUR_16B_SSE2]
- pxor xmm12,xmm12
- psraw xmm11,3
- punpckhbw xmm8,xmm12
- pmaxsw xmm15,xmm11
- punpckhbw xmm5,xmm12
- movdqa xmm11,xmm8
- pminsw xmm13,xmm15
- psubw xmm11,xmm5
- punpckhbw xmm9,xmm12
- pand xmm13,xmm3
- movdqa [rbp+130h],xmm13
- pabsw xmm13,xmm11
- punpckhbw xmm14,xmm12
- movdqa xmm11,xmm9
- psubw xmm11,xmm14
- movdqa xmm15,xmm0
- movdqa [rbp+140h],xmm14
- pabsw xmm14,xmm11
- movdqa xmm11,xmm8
- pcmpgtw xmm15,xmm14
- movdqa xmm1,[r12+rdi]
- pavgw xmm11,xmm9
- movdqa [rbp+170h],xmm11
- movdqa xmm10,xmm9
- punpckhbw xmm6,xmm12
- psubw xmm10,xmm8
- punpckhbw xmm1,xmm12
- movdqa xmm12,xmm0
- movaps xmm11,[rbp+0A0h]
- pcmpgtw xmm12,xmm13
- movaps xmm13,xmm11
- psubw xmm13,xmm12
- movdqa [rbp+160h],xmm15
- psubw xmm13,xmm15
- movdqa xmm15,xmm9
- psubw xmm15,xmm1
- movdqa [rbp+150h],xmm12
- pabsw xmm12,xmm10
- pabsw xmm14,xmm15
- movdqa xmm15,xmm8
- pcmpgtw xmm4,xmm12
- movdqa xmm12,xmm0
- psubw xmm15,xmm6
- pcmpgtw xmm12,xmm14
- pabsw xmm14,xmm15
- psllw xmm10,2
- pcmpgtw xmm0,xmm14
- movdqa xmm14,xmm6
- psubw xmm14,xmm1
- pand xmm4,xmm12
- paddw xmm14,xmm10
- pand xmm4,xmm0
- paddw xmm14,[FOUR_16B_SSE2]
- pxor xmm15,xmm15
- movaps xmm12,xmm11
- psubw xmm15,xmm13
- pxor xmm0,xmm0
- psraw xmm14,3
- pcmpgtw xmm12,xmm0
- pcmpeqw xmm0,xmm11
- pmaxsw xmm15,xmm14
- por xmm12,xmm0
- movdqa xmm0,[rbp+120h]
- pminsw xmm13,xmm15
- movdqa xmm15,[rbp+0B0h]
- movdqa xmm10,xmm7
- pand xmm4,xmm12
- paddw xmm15,xmm0
- pxor xmm12,xmm12
- paddw xmm10,xmm7
- movdqa xmm14,xmm12
- psubw xmm15,xmm10
- psubw xmm14,xmm2
- psraw xmm15,1
- pmaxsw xmm15,xmm14
- movdqa xmm10,xmm6
- pminsw xmm15,xmm2
- paddw xmm10,xmm6
- pand xmm15,xmm3
- psubw xmm12,xmm11
- pand xmm15,[rbp+100h]
- pand xmm13,xmm4
- paddw xmm7,xmm15
- paddw xmm8,xmm13
- movdqa xmm15,[rbp+170h]
- psubw xmm9,xmm13
- paddw xmm5,xmm15
- psubw xmm5,xmm10
- psraw xmm5,1
- pmaxsw xmm5,xmm12
- pminsw xmm5,xmm11
- pand xmm5,xmm4
- pand xmm5,[rbp+150h]
- paddw xmm6,xmm5
- movdqa xmm5,[rbp+0C0h]
- packuswb xmm7,xmm6
- movdqa xmm6,[rbp+130h]
- paddw xmm5,xmm6
- packuswb xmm5,xmm8
- movdqa xmm8,[rbp+0D0h]
- psubw xmm8,xmm6
- movdqa xmm6,[rbp+0F0h]
- paddw xmm6,xmm0
- movdqa xmm0,[rbp+0E0h]
- packuswb xmm8,xmm9
- movdqa xmm9,xmm0
- paddw xmm9,xmm0
- psubw xmm6,xmm9
- psraw xmm6,1
- pmaxsw xmm14,xmm6
- pminsw xmm2,xmm14
- pand xmm2,xmm3
- pand xmm2,[rbp+110h]
- paddw xmm0,xmm2
- movdqa xmm2,[rbp+140h]
- paddw xmm2,xmm15
- movdqa xmm15,xmm1
- paddw xmm15,xmm1
- psubw xmm2,xmm15
- psraw xmm2,1
- pmaxsw xmm12,xmm2
- pminsw xmm11,xmm12
- pand xmm11,xmm4
- pand xmm11,[rbp+160h]
- paddw xmm1,xmm11
- movdqa [rax+rdi],xmm7
- movdqa [r10],xmm5
- packuswb xmm0,xmm1
- movdqa [rdi],xmm8
- movdqa [r12+rdi],xmm0
- mov r12,qword [rbp+180h]
- lea rsp,[rbp+190h]
- pop rbp
- ret
+ push rbp
+ mov r11,r8 ; pTC
+ sub rsp,1B0h
+ lea rbp,[rsp+20h]
+ movd xmm4,edx
+ movd xmm2,ecx
+ mov qword [rbp+180h],r12
+ mov r10,rdi
+ movsxd r12,esi
+ add rsi,rsi
+ movsxd rdx,esi
+ sub r10,r12
+ movsx r8d,byte [r11]
+ pxor xmm3,xmm3
+ punpcklwd xmm2,xmm2
+ movaps [rbp+50h],xmm14
+ lea rax,[r12+r12*2]
+ movdqa xmm14,[rdx+rdi]
+ neg rax
+ pshufd xmm0,xmm2,0
+ movd xmm2,r8d
+ movsx rsi,byte [r11+1]
+ movsx r8d,byte [r11+2]
+ movsx r11d,byte [r11+3]
+ movaps [rbp+70h],xmm12
+ movd xmm1,esi
+ movaps [rbp+80h],xmm11
+ movd xmm12,r8d
+ movd xmm11,r11d
+ movdqa xmm5, [rax+rdi]
+ lea rax,[r12+r12]
+ punpcklwd xmm12,xmm12
+ neg rax
+ punpcklwd xmm11,xmm11
+ movaps [rbp],xmm8
+ movdqa xmm8, [r10]
+ punpcklwd xmm2,xmm2
+ punpcklwd xmm1,xmm1
+ punpcklqdq xmm12,xmm12
+ punpcklqdq xmm11,xmm11
+ punpcklqdq xmm2,xmm2
+ punpcklqdq xmm1,xmm1
+ shufps xmm12,xmm11,88h
+ movdqa xmm11,xmm8
+ movaps [rbp+30h],xmm9
+ movdqa xmm9,[rdi]
+ shufps xmm2,xmm1,88h
+ movdqa xmm1,xmm5
+ punpcklbw xmm11,xmm3
+ movaps [rbp+20h],xmm6
+ movaps [rbp+60h],xmm13
+ movdqa xmm13,xmm11
+ movaps [rbp+90h],xmm10
+ movdqa xmm10,xmm9
+ movdqa xmm6,[rax+rdi]
+ punpcklbw xmm1,xmm3
+ movaps [rbp+0A0h],xmm12
+ psubw xmm13,xmm1
+ movaps [rbp+40h],xmm15
+ movdqa xmm15,xmm14
+ movaps [rbp+10h],xmm7
+ movdqa xmm7,xmm6
+ punpcklbw xmm10,xmm3
+ movdqa xmm12,[r12+rdi]
+ punpcklbw xmm7,xmm3
+ punpcklbw xmm12,xmm3
+ punpcklbw xmm15,xmm3
+ pabsw xmm3,xmm13
+ movdqa xmm13,xmm10
+ psubw xmm13,xmm15
+ movdqa [rbp+0F0h],xmm15
+ pabsw xmm15,xmm13
+ movdqa xmm13,xmm11
+ movdqa [rbp+0B0h],xmm1
+ movdqa xmm1,xmm0
+ pavgw xmm13,xmm10
+ pcmpgtw xmm1,xmm3
+ movdqa [rbp+120h],xmm13
+ movaps xmm13,xmm2
+ punpcklwd xmm4,xmm4
+ movdqa xmm3,xmm0
+ movdqa [rbp+100h],xmm1
+ psubw xmm13,xmm1
+ movdqa xmm1,xmm10
+ pcmpgtw xmm3,xmm15
+ pshufd xmm4,xmm4,0
+ psubw xmm1,xmm11
+ movdqa [rbp+0D0h],xmm10
+ psubw xmm13,xmm3
+ movdqa [rbp+110h],xmm3
+ pabsw xmm15,xmm1
+ movdqa xmm3,xmm4
+ psubw xmm10,xmm12
+ pcmpgtw xmm3,xmm15
+ pabsw xmm15,xmm10
+ movdqa xmm10,xmm0
+ psllw xmm1,2
+ movdqa [rbp+0C0h],xmm11
+ psubw xmm11,xmm7
+ pcmpgtw xmm10,xmm15
+ pabsw xmm11,xmm11
+ movdqa xmm15,xmm0
+ pand xmm3,xmm10
+ pcmpgtw xmm15,xmm11
+ movaps xmm11,xmm2
+ pxor xmm10,xmm10
+ pand xmm3,xmm15
+ pcmpgtw xmm11,xmm10
+ pcmpeqw xmm10,xmm2
+ por xmm11,xmm10
+ pand xmm3,xmm11
+ movdqa xmm11,xmm7
+ psubw xmm11,xmm12
+ pxor xmm15,xmm15
+ paddw xmm11,xmm1
+ psubw xmm15,xmm13
+ movdqa [rbp+0E0h],xmm12
+ paddw xmm11,[FOUR_16B_SSE2]
+ pxor xmm12,xmm12
+ psraw xmm11,3
+ punpckhbw xmm8,xmm12
+ pmaxsw xmm15,xmm11
+ punpckhbw xmm5,xmm12
+ movdqa xmm11,xmm8
+ pminsw xmm13,xmm15
+ psubw xmm11,xmm5
+ punpckhbw xmm9,xmm12
+ pand xmm13,xmm3
+ movdqa [rbp+130h],xmm13
+ pabsw xmm13,xmm11
+ punpckhbw xmm14,xmm12
+ movdqa xmm11,xmm9
+ psubw xmm11,xmm14
+ movdqa xmm15,xmm0
+ movdqa [rbp+140h],xmm14
+ pabsw xmm14,xmm11
+ movdqa xmm11,xmm8
+ pcmpgtw xmm15,xmm14
+ movdqa xmm1,[r12+rdi]
+ pavgw xmm11,xmm9
+ movdqa [rbp+170h],xmm11
+ movdqa xmm10,xmm9
+ punpckhbw xmm6,xmm12
+ psubw xmm10,xmm8
+ punpckhbw xmm1,xmm12
+ movdqa xmm12,xmm0
+ movaps xmm11,[rbp+0A0h]
+ pcmpgtw xmm12,xmm13
+ movaps xmm13,xmm11
+ psubw xmm13,xmm12
+ movdqa [rbp+160h],xmm15
+ psubw xmm13,xmm15
+ movdqa xmm15,xmm9
+ psubw xmm15,xmm1
+ movdqa [rbp+150h],xmm12
+ pabsw xmm12,xmm10
+ pabsw xmm14,xmm15
+ movdqa xmm15,xmm8
+ pcmpgtw xmm4,xmm12
+ movdqa xmm12,xmm0
+ psubw xmm15,xmm6
+ pcmpgtw xmm12,xmm14
+ pabsw xmm14,xmm15
+ psllw xmm10,2
+ pcmpgtw xmm0,xmm14
+ movdqa xmm14,xmm6
+ psubw xmm14,xmm1
+ pand xmm4,xmm12
+ paddw xmm14,xmm10
+ pand xmm4,xmm0
+ paddw xmm14,[FOUR_16B_SSE2]
+ pxor xmm15,xmm15
+ movaps xmm12,xmm11
+ psubw xmm15,xmm13
+ pxor xmm0,xmm0
+ psraw xmm14,3
+ pcmpgtw xmm12,xmm0
+ pcmpeqw xmm0,xmm11
+ pmaxsw xmm15,xmm14
+ por xmm12,xmm0
+ movdqa xmm0,[rbp+120h]
+ pminsw xmm13,xmm15
+ movdqa xmm15,[rbp+0B0h]
+ movdqa xmm10,xmm7
+ pand xmm4,xmm12
+ paddw xmm15,xmm0
+ pxor xmm12,xmm12
+ paddw xmm10,xmm7
+ movdqa xmm14,xmm12
+ psubw xmm15,xmm10
+ psubw xmm14,xmm2
+ psraw xmm15,1
+ pmaxsw xmm15,xmm14
+ movdqa xmm10,xmm6
+ pminsw xmm15,xmm2
+ paddw xmm10,xmm6
+ pand xmm15,xmm3
+ psubw xmm12,xmm11
+ pand xmm15,[rbp+100h]
+ pand xmm13,xmm4
+ paddw xmm7,xmm15
+ paddw xmm8,xmm13
+ movdqa xmm15,[rbp+170h]
+ psubw xmm9,xmm13
+ paddw xmm5,xmm15
+ psubw xmm5,xmm10
+ psraw xmm5,1
+ pmaxsw xmm5,xmm12
+ pminsw xmm5,xmm11
+ pand xmm5,xmm4
+ pand xmm5,[rbp+150h]
+ paddw xmm6,xmm5
+ movdqa xmm5,[rbp+0C0h]
+ packuswb xmm7,xmm6
+ movdqa xmm6,[rbp+130h]
+ paddw xmm5,xmm6
+ packuswb xmm5,xmm8
+ movdqa xmm8,[rbp+0D0h]
+ psubw xmm8,xmm6
+ movdqa xmm6,[rbp+0F0h]
+ paddw xmm6,xmm0
+ movdqa xmm0,[rbp+0E0h]
+ packuswb xmm8,xmm9
+ movdqa xmm9,xmm0
+ paddw xmm9,xmm0
+ psubw xmm6,xmm9
+ psraw xmm6,1
+ pmaxsw xmm14,xmm6
+ pminsw xmm2,xmm14
+ pand xmm2,xmm3
+ pand xmm2,[rbp+110h]
+ paddw xmm0,xmm2
+ movdqa xmm2,[rbp+140h]
+ paddw xmm2,xmm15
+ movdqa xmm15,xmm1
+ paddw xmm15,xmm1
+ psubw xmm2,xmm15
+ psraw xmm2,1
+ pmaxsw xmm12,xmm2
+ pminsw xmm11,xmm12
+ pand xmm11,xmm4
+ pand xmm11,[rbp+160h]
+ paddw xmm1,xmm11
+ movdqa [rax+rdi],xmm7
+ movdqa [r10],xmm5
+ packuswb xmm0,xmm1
+ movdqa [rdi],xmm8
+ movdqa [r12+rdi],xmm0
+ mov r12,qword [rbp+180h]
+ lea rsp,[rbp+190h]
+ pop rbp
+ ret
WELS_EXTERN DeblockLumaEq4V_ssse3
- mov rax,rsp
- push rbx
- push rbp
- mov r8, rdx
- mov r9, rcx
- mov rcx, rdi
- mov rdx, rsi
- sub rsp,1D8h
- movaps [rax-38h],xmm6
- movaps [rax-48h],xmm7
- movaps [rax-58h],xmm8
- pxor xmm1,xmm1
- movsxd r10,edx
- mov rbp,rcx
- mov r11d,r8d
- mov rdx,rcx
- mov rdi,rbp
- mov rbx,rbp
- movdqa xmm5,[rbp]
- movaps [rax-68h],xmm9
- movaps [rax-78h],xmm10
- punpcklbw xmm5,xmm1
- movaps [rax-88h],xmm11
- movaps [rax-98h],xmm12
- movaps [rax-0A8h],xmm13
- movaps [rax-0B8h],xmm14
- movdqa xmm14,[r10+rbp]
- movaps [rax-0C8h],xmm15
- lea eax,[r10*4]
- movsxd r8,eax
- lea eax,[r10+r10*2]
- movsxd rcx,eax
- lea eax,[r10+r10]
- sub rdx,r8
- punpcklbw xmm14,xmm1
- movdqa [rsp+90h],xmm5
- movdqa [rsp+30h],xmm14
- movsxd rsi,eax
- movsx eax,r11w
- sub rdi,rcx
- sub rbx,rsi
- mov r8,rbp
- sub r8,r10
- movd xmm0,eax
- movsx eax,r9w
- movdqa xmm12,[rdi]
- movdqa xmm6, [rsi+rbp]
- movdqa xmm13,[rbx]
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- punpcklbw xmm13,xmm1
- punpcklbw xmm6,xmm1
- movdqa xmm8,[r8]
- movd xmm0,eax
- movdqa xmm10,xmm11
- mov eax,2
- punpcklbw xmm8,xmm1
- punpcklbw xmm12,xmm1
- cwde
- punpcklwd xmm0,xmm0
- psraw xmm10,2
- movdqa xmm1,xmm8
- movdqa [rsp+0F0h],xmm13
- movdqa [rsp+0B0h],xmm8
- pshufd xmm7,xmm0,0
- psubw xmm1,xmm13
- movdqa xmm0,xmm5
- movdqa xmm4,xmm7
- movdqa xmm2,xmm7
- psubw xmm0,xmm8
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm5
- movdqa [rsp+40h],xmm7
- movdqa [rsp+60h],xmm6
- pcmpgtw xmm4,xmm0
- psubw xmm1,xmm14
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm4,xmm2
- movdqa xmm0,xmm11
- pcmpgtw xmm0,xmm3
- pand xmm4,xmm0
- movd xmm0,eax
- movdqa [rsp+20h],xmm4
- punpcklwd xmm0,xmm0
- pshufd xmm2,xmm0,0
- paddw xmm10,xmm2
- movdqa [rsp+0A0h],xmm2
- movdqa xmm15,xmm7
- pxor xmm4,xmm4
- movdqa xmm0,xmm8
- psubw xmm0,xmm12
- mov eax,4
- pabsw xmm0,xmm0
- movdqa xmm1,xmm10
- cwde
- pcmpgtw xmm15,xmm0
- pcmpgtw xmm1,xmm3
- movdqa xmm3,xmm7
- movdqa xmm7,[rdx]
- movdqa xmm0,xmm5
- psubw xmm0,xmm6
- pand xmm15,xmm1
- punpcklbw xmm7,xmm4
- movdqa xmm9,xmm15
- pabsw xmm0,xmm0
- psllw xmm7,1
- pandn xmm9,xmm12
- pcmpgtw xmm3,xmm0
- paddw xmm7,xmm12
- movd xmm0,eax
- pand xmm3,xmm1
- paddw xmm7,xmm12
- punpcklwd xmm0,xmm0
- paddw xmm7,xmm12
- pshufd xmm1,xmm0,0
- paddw xmm7,xmm13
- movdqa xmm0,xmm3
- pandn xmm0,xmm6
- paddw xmm7,xmm8
- movdqa [rsp+70h],xmm1
- paddw xmm7,xmm5
- movdqa [rsp+120h],xmm0
- movdqa xmm0,[rcx+rbp]
- punpcklbw xmm0,xmm4
- paddw xmm7,xmm1
- movdqa xmm4,xmm15
- psllw xmm0,1
- psraw xmm7,3
- paddw xmm0,xmm6
- pand xmm7,xmm15
- paddw xmm0,xmm6
- paddw xmm0,xmm6
- paddw xmm0,xmm14
- movdqa xmm6,xmm15
- paddw xmm0,xmm5
- pandn xmm6,xmm13
- paddw xmm0,xmm8
- paddw xmm0,xmm1
- psraw xmm0,3
- movdqa xmm1,xmm12
- paddw xmm1,xmm13
- pand xmm0,xmm3
- movdqa [rsp+100h],xmm0
- movdqa xmm0,xmm8
- paddw xmm0,xmm5
- paddw xmm1,xmm0
- movdqa xmm0,xmm3
- paddw xmm1,xmm2
- psraw xmm1,2
- pandn xmm0,xmm14
- pand xmm4,xmm1
- movdqa [rsp+0E0h],xmm0
- movdqa xmm0,xmm5
- paddw xmm0,xmm8
- movdqa xmm1,[rsp+60h]
- paddw xmm1,xmm14
- movdqa xmm14,xmm3
- paddw xmm1,xmm0
- movdqa xmm0,xmm8
- paddw xmm0,[rsp+30h]
- paddw xmm1,xmm2
- psraw xmm1,2
- pand xmm14,xmm1
- movdqa xmm1,xmm13
- paddw xmm1,xmm13
- paddw xmm1,xmm0
- paddw xmm1,xmm2
- psraw xmm1,2
- movdqa xmm0,[rsp+30h]
- movdqa xmm2,xmm13
- movdqa xmm5,xmm15
- paddw xmm0,[rsp+70h]
- pandn xmm5,xmm1
- paddw xmm2,xmm8
- movdqa xmm8,[rsp+90h]
- movdqa xmm1,xmm12
- paddw xmm2,xmm8
- psllw xmm2,1
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,xmm8
- movdqa xmm8,xmm3
- movdqa xmm2,[rsp+30h]
- paddw xmm0,xmm13
- psraw xmm1,3
- pand xmm15,xmm1
- movdqa xmm1,xmm2
- paddw xmm1,xmm2
- paddw xmm2,[rsp+90h]
- paddw xmm2,[rsp+0B0h]
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- movdqa xmm13,[r8]
- paddw xmm0, [rsp+70h]
- paddw xmm1, [rsp+0A0h]
- psllw xmm2,1
- paddw xmm2,xmm0
- psraw xmm1,2
- movdqa xmm0, [rdi]
- pandn xmm8,xmm1
- movdqa xmm1, [rsp+60h]
- paddw xmm1,xmm2
- movdqa xmm2, [rbx]
- psraw xmm1,3
- pand xmm3,xmm1
- movdqa xmm1, [rbp]
- movdqa [rsp+0D0h],xmm3
- pxor xmm3,xmm3
- punpckhbw xmm0,xmm3
- punpckhbw xmm1,xmm3
- punpckhbw xmm13,xmm3
- movdqa [rsp+0C0h],xmm0
- movdqa xmm0,[r10+rbp]
- movdqa [rsp],xmm1
- punpckhbw xmm0,xmm3
- punpckhbw xmm2,xmm3
- movdqa [rsp+80h],xmm0
- movdqa xmm0,[rsi+rbp]
- movdqa [rsp+10h],xmm13
- punpckhbw xmm0,xmm3
- movdqa [rsp+50h],xmm0
- movdqa xmm0,xmm1
- movdqa xmm1,xmm13
- psubw xmm0,xmm13
- psubw xmm1,xmm2
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,[rsp]
- movdqa xmm13,[rsp+40h]
- movdqa [rsp+110h],xmm2
- psubw xmm1, [rsp+80h]
- pcmpgtw xmm13,xmm0
- pcmpgtw xmm11,xmm3
- pabsw xmm0,xmm1
- pcmpgtw xmm10,xmm3
- movdqa xmm1, [rsp+40h]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1
- pcmpgtw xmm2,xmm0
- movdqa xmm0, [rsp+10h]
- pand xmm13,xmm2
- pand xmm13,xmm11
- movdqa xmm11,[rsp+0C0h]
- psubw xmm0,xmm11
- pabsw xmm0,xmm0
- pcmpgtw xmm3,xmm0
- pand xmm3,xmm10
- movdqa xmm0,[rsp]
- psubw xmm0,[rsp+50h]
- movdqa xmm2,[rdx]
- pabsw xmm0,xmm0
- por xmm7,xmm9
- movdqa xmm9,[rsp+20h]
- pcmpgtw xmm1,xmm0
- pand xmm9,xmm7
- movdqa xmm7,[rsp+20h]
- movdqa xmm0,xmm7
- pandn xmm0,xmm12
- movdqa xmm12,[rsp+110h]
- pand xmm1,xmm10
- movdqa xmm10,[rsp+70h]
- movdqa [rsp+40h],xmm1
- movdqa xmm1,xmm13
- por xmm9,xmm0
- pxor xmm0,xmm0
- por xmm4,xmm6
- movdqa xmm6,xmm7
- punpckhbw xmm2,xmm0
- por xmm15,xmm5
- movdqa xmm5,[rsp+20h]
- movdqa xmm0,xmm3
- psllw xmm2,1
- pandn xmm0,xmm11
- pand xmm6,xmm4
- movdqa xmm4,[rsp]
- paddw xmm2,xmm11
- pand xmm5,xmm15
- movdqa xmm15,[rsp+20h]
- paddw xmm2,xmm11
- paddw xmm2,xmm11
- paddw xmm2,xmm12
- paddw xmm2,[rsp+10h]
- paddw xmm2,[rsp]
- paddw xmm2,xmm10
- psraw xmm2,3
- pand xmm2,xmm3
- por xmm2,xmm0
- pand xmm1,xmm2
- movdqa xmm0,xmm13
- movdqa xmm2,xmm11
- pandn xmm0,xmm11
- paddw xmm2,xmm12
- por xmm1,xmm0
- packuswb xmm9,xmm1
- movdqa xmm0,xmm7
- movdqa xmm7,[rsp+0A0h]
- pandn xmm0,[rsp+0F0h]
- movdqa xmm1,xmm3
- por xmm6,xmm0
- movdqa xmm0,[rsp+10h]
- paddw xmm0,xmm4
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm3
- pandn xmm0,xmm12
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- pandn xmm0,xmm12
- movdqa xmm1,xmm12
- paddw xmm1,[rsp+10h]
- por xmm2,xmm0
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+0B0h]
- paddw xmm1,xmm4
- packuswb xmm6,xmm2
- movdqa xmm2,xmm3
- psllw xmm1,1
- por xmm5,xmm0
- movdqa xmm0,[rsp+80h]
- paddw xmm0,xmm10
- paddw xmm1,xmm0
- paddw xmm11,xmm1
- psraw xmm11,3
- movdqa xmm1,xmm12
- pand xmm2,xmm11
- paddw xmm1,xmm12
- movdqa xmm11,[rsp+80h]
- movdqa xmm0, [rsp+10h]
- por xmm14,[rsp+0E0h]
- paddw xmm0,xmm11
- movdqa xmm4,xmm15
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- paddw xmm1,xmm7
- psraw xmm1,2
- pandn xmm3,xmm1
- por xmm2,xmm3
- movdqa xmm1,xmm13
- movdqa xmm3,[rsp+10h]
- pandn xmm0,xmm3
- pand xmm1,xmm2
- movdqa xmm2,xmm11
- paddw xmm2,[rsp]
- por xmm1,xmm0
- movdqa xmm0,[rsp+0D0h]
- por xmm0,xmm8
- paddw xmm2,xmm3
- packuswb xmm5,xmm1
- movdqa xmm8,[rsp+40h]
- movdqa xmm1,[rsp+50h]
- movdqa xmm3,xmm8
- pand xmm4,xmm0
- psllw xmm2,1
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+90h]
- por xmm4,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm10
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,[rsp]
- movdqa xmm2,xmm11
- paddw xmm0,xmm12
- movdqa xmm12,[rsp]
- paddw xmm2,xmm11
- paddw xmm2,xmm0
- psraw xmm1,3
- movdqa xmm0,xmm8
- pand xmm3,xmm1
- paddw xmm2,xmm7
- movdqa xmm1,xmm13
- psraw xmm2,2
- pandn xmm0,xmm2
- por xmm3,xmm0
- movdqa xmm2,[rsp+50h]
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm3
- paddw xmm2,xmm11
- movdqa xmm3,xmm15
- por xmm1,xmm0
- pand xmm3,xmm14
- movdqa xmm14,[rsp+10h]
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+30h]
- packuswb xmm4,xmm1
- movdqa xmm1,xmm8
- por xmm3,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm14
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm8
- pandn xmm0,xmm11
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm11
- pand xmm2,xmm1
- movdqa xmm1,xmm15
- por xmm2,xmm0
- packuswb xmm3,xmm2
- movdqa xmm0,[rsp+100h]
- por xmm0,[rsp+120h]
- pand xmm1,xmm0
- movdqa xmm2,[rcx+rbp]
- movdqa xmm7,[rsp+50h]
- pandn xmm15,[rsp+60h]
- lea r11,[rsp+1D8h]
- pxor xmm0,xmm0
- por xmm1,xmm15
- movaps xmm15,[r11-0A8h]
- movdqa [rdi],xmm9
- movaps xmm9,[r11-48h]
- punpckhbw xmm2,xmm0
- psllw xmm2,1
- paddw xmm2,xmm7
- paddw xmm2,xmm7
- movdqa [rbx],xmm6
- movaps xmm6,[r11-18h]
- paddw xmm2,xmm7
- paddw xmm2,xmm11
- movaps xmm11,[r11-68h]
- paddw xmm2,xmm12
- movaps xmm12,[r11-78h]
- paddw xmm2,xmm14
- paddw xmm2,xmm10
- psraw xmm2,3
- movaps xmm10,[r11-58h]
- movaps xmm14,[r11-98h]
- movdqa xmm0,xmm13
- pand xmm2,xmm8
- pandn xmm8,xmm7
- pandn xmm13,xmm7
- por xmm2,xmm8
- movaps xmm7,[r11-28h]
- movaps xmm8,[r11-38h]
- movdqa [r8],xmm5
- pand xmm0,xmm2
- por xmm0,xmm13
- packuswb xmm1,xmm0
- movaps xmm13,[r11-88h]
- movdqa [rbp],xmm4
- movdqa [r10+rbp],xmm3
- movdqa [rsi+rbp],xmm1
- mov rsp,r11
- pop rbp
- pop rbx
- ret
+ mov rax,rsp
+ push rbx
+ push rbp
+ mov r8, rdx
+ mov r9, rcx
+ mov rcx, rdi
+ mov rdx, rsi
+ sub rsp,1D8h
+ movaps [rax-38h],xmm6
+ movaps [rax-48h],xmm7
+ movaps [rax-58h],xmm8
+ pxor xmm1,xmm1
+ movsxd r10,edx
+ mov rbp,rcx
+ mov r11d,r8d
+ mov rdx,rcx
+ mov rdi,rbp
+ mov rbx,rbp
+ movdqa xmm5,[rbp]
+ movaps [rax-68h],xmm9
+ movaps [rax-78h],xmm10
+ punpcklbw xmm5,xmm1
+ movaps [rax-88h],xmm11
+ movaps [rax-98h],xmm12
+ movaps [rax-0A8h],xmm13
+ movaps [rax-0B8h],xmm14
+ movdqa xmm14,[r10+rbp]
+ movaps [rax-0C8h],xmm15
+ lea eax,[r10*4]
+ movsxd r8,eax
+ lea eax,[r10+r10*2]
+ movsxd rcx,eax
+ lea eax,[r10+r10]
+ sub rdx,r8
+ punpcklbw xmm14,xmm1
+ movdqa [rsp+90h],xmm5
+ movdqa [rsp+30h],xmm14
+ movsxd rsi,eax
+ movsx eax,r11w
+ sub rdi,rcx
+ sub rbx,rsi
+ mov r8,rbp
+ sub r8,r10
+ movd xmm0,eax
+ movsx eax,r9w
+ movdqa xmm12,[rdi]
+ movdqa xmm6, [rsi+rbp]
+ movdqa xmm13,[rbx]
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm6,xmm1
+ movdqa xmm8,[r8]
+ movd xmm0,eax
+ movdqa xmm10,xmm11
+ mov eax,2
+ punpcklbw xmm8,xmm1
+ punpcklbw xmm12,xmm1
+ cwde
+ punpcklwd xmm0,xmm0
+ psraw xmm10,2
+ movdqa xmm1,xmm8
+ movdqa [rsp+0F0h],xmm13
+ movdqa [rsp+0B0h],xmm8
+ pshufd xmm7,xmm0,0
+ psubw xmm1,xmm13
+ movdqa xmm0,xmm5
+ movdqa xmm4,xmm7
+ movdqa xmm2,xmm7
+ psubw xmm0,xmm8
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm5
+ movdqa [rsp+40h],xmm7
+ movdqa [rsp+60h],xmm6
+ pcmpgtw xmm4,xmm0
+ psubw xmm1,xmm14
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm4,xmm2
+ movdqa xmm0,xmm11
+ pcmpgtw xmm0,xmm3
+ pand xmm4,xmm0
+ movd xmm0,eax
+ movdqa [rsp+20h],xmm4
+ punpcklwd xmm0,xmm0
+ pshufd xmm2,xmm0,0
+ paddw xmm10,xmm2
+ movdqa [rsp+0A0h],xmm2
+ movdqa xmm15,xmm7
+ pxor xmm4,xmm4
+ movdqa xmm0,xmm8
+ psubw xmm0,xmm12
+ mov eax,4
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm10
+ cwde
+ pcmpgtw xmm15,xmm0
+ pcmpgtw xmm1,xmm3
+ movdqa xmm3,xmm7
+ movdqa xmm7,[rdx]
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm6
+ pand xmm15,xmm1
+ punpcklbw xmm7,xmm4
+ movdqa xmm9,xmm15
+ pabsw xmm0,xmm0
+ psllw xmm7,1
+ pandn xmm9,xmm12
+ pcmpgtw xmm3,xmm0
+ paddw xmm7,xmm12
+ movd xmm0,eax
+ pand xmm3,xmm1
+ paddw xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ paddw xmm7,xmm12
+ pshufd xmm1,xmm0,0
+ paddw xmm7,xmm13
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm6
+ paddw xmm7,xmm8
+ movdqa [rsp+70h],xmm1
+ paddw xmm7,xmm5
+ movdqa [rsp+120h],xmm0
+ movdqa xmm0,[rcx+rbp]
+ punpcklbw xmm0,xmm4
+ paddw xmm7,xmm1
+ movdqa xmm4,xmm15
+ psllw xmm0,1
+ psraw xmm7,3
+ paddw xmm0,xmm6
+ pand xmm7,xmm15
+ paddw xmm0,xmm6
+ paddw xmm0,xmm6
+ paddw xmm0,xmm14
+ movdqa xmm6,xmm15
+ paddw xmm0,xmm5
+ pandn xmm6,xmm13
+ paddw xmm0,xmm8
+ paddw xmm0,xmm1
+ psraw xmm0,3
+ movdqa xmm1,xmm12
+ paddw xmm1,xmm13
+ pand xmm0,xmm3
+ movdqa [rsp+100h],xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,xmm5
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm3
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pandn xmm0,xmm14
+ pand xmm4,xmm1
+ movdqa [rsp+0E0h],xmm0
+ movdqa xmm0,xmm5
+ paddw xmm0,xmm8
+ movdqa xmm1,[rsp+60h]
+ paddw xmm1,xmm14
+ movdqa xmm14,xmm3
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,[rsp+30h]
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pand xmm14,xmm1
+ movdqa xmm1,xmm13
+ paddw xmm1,xmm13
+ paddw xmm1,xmm0
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ movdqa xmm0,[rsp+30h]
+ movdqa xmm2,xmm13
+ movdqa xmm5,xmm15
+ paddw xmm0,[rsp+70h]
+ pandn xmm5,xmm1
+ paddw xmm2,xmm8
+ movdqa xmm8,[rsp+90h]
+ movdqa xmm1,xmm12
+ paddw xmm2,xmm8
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,xmm8
+ movdqa xmm8,xmm3
+ movdqa xmm2,[rsp+30h]
+ paddw xmm0,xmm13
+ psraw xmm1,3
+ pand xmm15,xmm1
+ movdqa xmm1,xmm2
+ paddw xmm1,xmm2
+ paddw xmm2,[rsp+90h]
+ paddw xmm2,[rsp+0B0h]
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ movdqa xmm13,[r8]
+ paddw xmm0, [rsp+70h]
+ paddw xmm1, [rsp+0A0h]
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ psraw xmm1,2
+ movdqa xmm0, [rdi]
+ pandn xmm8,xmm1
+ movdqa xmm1, [rsp+60h]
+ paddw xmm1,xmm2
+ movdqa xmm2, [rbx]
+ psraw xmm1,3
+ pand xmm3,xmm1
+ movdqa xmm1, [rbp]
+ movdqa [rsp+0D0h],xmm3
+ pxor xmm3,xmm3
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm1,xmm3
+ punpckhbw xmm13,xmm3
+ movdqa [rsp+0C0h],xmm0
+ movdqa xmm0,[r10+rbp]
+ movdqa [rsp],xmm1
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm2,xmm3
+ movdqa [rsp+80h],xmm0
+ movdqa xmm0,[rsi+rbp]
+ movdqa [rsp+10h],xmm13
+ punpckhbw xmm0,xmm3
+ movdqa [rsp+50h],xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm1,xmm13
+ psubw xmm0,xmm13
+ psubw xmm1,xmm2
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,[rsp]
+ movdqa xmm13,[rsp+40h]
+ movdqa [rsp+110h],xmm2
+ psubw xmm1, [rsp+80h]
+ pcmpgtw xmm13,xmm0
+ pcmpgtw xmm11,xmm3
+ pabsw xmm0,xmm1
+ pcmpgtw xmm10,xmm3
+ movdqa xmm1, [rsp+40h]
+ movdqa xmm2,xmm1
+ movdqa xmm3,xmm1
+ pcmpgtw xmm2,xmm0
+ movdqa xmm0, [rsp+10h]
+ pand xmm13,xmm2
+ pand xmm13,xmm11
+ movdqa xmm11,[rsp+0C0h]
+ psubw xmm0,xmm11
+ pabsw xmm0,xmm0
+ pcmpgtw xmm3,xmm0
+ pand xmm3,xmm10
+ movdqa xmm0,[rsp]
+ psubw xmm0,[rsp+50h]
+ movdqa xmm2,[rdx]
+ pabsw xmm0,xmm0
+ por xmm7,xmm9
+ movdqa xmm9,[rsp+20h]
+ pcmpgtw xmm1,xmm0
+ pand xmm9,xmm7
+ movdqa xmm7,[rsp+20h]
+ movdqa xmm0,xmm7
+ pandn xmm0,xmm12
+ movdqa xmm12,[rsp+110h]
+ pand xmm1,xmm10
+ movdqa xmm10,[rsp+70h]
+ movdqa [rsp+40h],xmm1
+ movdqa xmm1,xmm13
+ por xmm9,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm6
+ movdqa xmm6,xmm7
+ punpckhbw xmm2,xmm0
+ por xmm15,xmm5
+ movdqa xmm5,[rsp+20h]
+ movdqa xmm0,xmm3
+ psllw xmm2,1
+ pandn xmm0,xmm11
+ pand xmm6,xmm4
+ movdqa xmm4,[rsp]
+ paddw xmm2,xmm11
+ pand xmm5,xmm15
+ movdqa xmm15,[rsp+20h]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm11
+ paddw xmm2,xmm12
+ paddw xmm2,[rsp+10h]
+ paddw xmm2,[rsp]
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ pand xmm2,xmm3
+ por xmm2,xmm0
+ pand xmm1,xmm2
+ movdqa xmm0,xmm13
+ movdqa xmm2,xmm11
+ pandn xmm0,xmm11
+ paddw xmm2,xmm12
+ por xmm1,xmm0
+ packuswb xmm9,xmm1
+ movdqa xmm0,xmm7
+ movdqa xmm7,[rsp+0A0h]
+ pandn xmm0,[rsp+0F0h]
+ movdqa xmm1,xmm3
+ por xmm6,xmm0
+ movdqa xmm0,[rsp+10h]
+ paddw xmm0,xmm4
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm12
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ pandn xmm0,xmm12
+ movdqa xmm1,xmm12
+ paddw xmm1,[rsp+10h]
+ por xmm2,xmm0
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+0B0h]
+ paddw xmm1,xmm4
+ packuswb xmm6,xmm2
+ movdqa xmm2,xmm3
+ psllw xmm1,1
+ por xmm5,xmm0
+ movdqa xmm0,[rsp+80h]
+ paddw xmm0,xmm10
+ paddw xmm1,xmm0
+ paddw xmm11,xmm1
+ psraw xmm11,3
+ movdqa xmm1,xmm12
+ pand xmm2,xmm11
+ paddw xmm1,xmm12
+ movdqa xmm11,[rsp+80h]
+ movdqa xmm0, [rsp+10h]
+ por xmm14,[rsp+0E0h]
+ paddw xmm0,xmm11
+ movdqa xmm4,xmm15
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ paddw xmm1,xmm7
+ psraw xmm1,2
+ pandn xmm3,xmm1
+ por xmm2,xmm3
+ movdqa xmm1,xmm13
+ movdqa xmm3,[rsp+10h]
+ pandn xmm0,xmm3
+ pand xmm1,xmm2
+ movdqa xmm2,xmm11
+ paddw xmm2,[rsp]
+ por xmm1,xmm0
+ movdqa xmm0,[rsp+0D0h]
+ por xmm0,xmm8
+ paddw xmm2,xmm3
+ packuswb xmm5,xmm1
+ movdqa xmm8,[rsp+40h]
+ movdqa xmm1,[rsp+50h]
+ movdqa xmm3,xmm8
+ pand xmm4,xmm0
+ psllw xmm2,1
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+90h]
+ por xmm4,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm10
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,[rsp]
+ movdqa xmm2,xmm11
+ paddw xmm0,xmm12
+ movdqa xmm12,[rsp]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm0
+ psraw xmm1,3
+ movdqa xmm0,xmm8
+ pand xmm3,xmm1
+ paddw xmm2,xmm7
+ movdqa xmm1,xmm13
+ psraw xmm2,2
+ pandn xmm0,xmm2
+ por xmm3,xmm0
+ movdqa xmm2,[rsp+50h]
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm3
+ paddw xmm2,xmm11
+ movdqa xmm3,xmm15
+ por xmm1,xmm0
+ pand xmm3,xmm14
+ movdqa xmm14,[rsp+10h]
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+30h]
+ packuswb xmm4,xmm1
+ movdqa xmm1,xmm8
+ por xmm3,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm14
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm8
+ pandn xmm0,xmm11
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm11
+ pand xmm2,xmm1
+ movdqa xmm1,xmm15
+ por xmm2,xmm0
+ packuswb xmm3,xmm2
+ movdqa xmm0,[rsp+100h]
+ por xmm0,[rsp+120h]
+ pand xmm1,xmm0
+ movdqa xmm2,[rcx+rbp]
+ movdqa xmm7,[rsp+50h]
+ pandn xmm15,[rsp+60h]
+ lea r11,[rsp+1D8h]
+ pxor xmm0,xmm0
+ por xmm1,xmm15
+ movaps xmm15,[r11-0A8h]
+ movdqa [rdi],xmm9
+ movaps xmm9,[r11-48h]
+ punpckhbw xmm2,xmm0
+ psllw xmm2,1
+ paddw xmm2,xmm7
+ paddw xmm2,xmm7
+ movdqa [rbx],xmm6
+ movaps xmm6,[r11-18h]
+ paddw xmm2,xmm7
+ paddw xmm2,xmm11
+ movaps xmm11,[r11-68h]
+ paddw xmm2,xmm12
+ movaps xmm12,[r11-78h]
+ paddw xmm2,xmm14
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ movaps xmm10,[r11-58h]
+ movaps xmm14,[r11-98h]
+ movdqa xmm0,xmm13
+ pand xmm2,xmm8
+ pandn xmm8,xmm7
+ pandn xmm13,xmm7
+ por xmm2,xmm8
+ movaps xmm7,[r11-28h]
+ movaps xmm8,[r11-38h]
+ movdqa [r8],xmm5
+ pand xmm0,xmm2
+ por xmm0,xmm13
+ packuswb xmm1,xmm0
+ movaps xmm13,[r11-88h]
+ movdqa [rbp],xmm4
+ movdqa [r10+rbp],xmm3
+ movdqa [rsi+rbp],xmm1
+ mov rsp,r11
+ pop rbp
+ pop rbx
+ ret
WELS_EXTERN DeblockChromaLt4V_ssse3
- mov rax,rsp
- push rbx
- push rbp
- mov r10, rdx
- mov r11, rcx
- mov rcx, rdi
- mov rdx, rsi
- mov rsi, r10
- mov r10, r9
- mov rbp, r8
- mov r8, rsi
- mov r9, r11
- sub rsp,0C8h
- pxor xmm1,xmm1
- mov rbx,rcx
- movsxd r11,r8d
- movsx ecx,byte [r10]
- movsx r8d,byte [r10+2]
- mov rdi,rdx
- movq xmm2,[rbx]
- movq xmm9,[r11+rbx]
- movsx edx,byte [r10+1]
- mov word [rsp+2],cx
- mov word [rsp],cx
- movsx eax,byte [r10+3]
- mov word [rsp+6],dx
- mov word [rsp+4],dx
- movdqa xmm11,xmm1
- mov word [rsp+0Eh],ax
- mov word [rsp+0Ch],ax
- lea eax,[r11+r11]
- movsxd rcx,eax
- mov rax,rbx
- mov rdx,rdi
- sub rax,rcx
- mov word [rsp+0Ah],r8w
- mov word [rsp+8],r8w
- movdqa xmm6,[rsp]
- movdqa xmm7,xmm6
- movq xmm13, [rax]
- mov rax,rdi
- sub rax,rcx
- mov rcx,rbx
- pcmpgtw xmm7,xmm1
- psubw xmm11,xmm6
- sub rcx,r11
- sub rdx,r11
- movq xmm0,[rax]
- movsx eax,r9w
- movq xmm15,[rcx]
- punpcklqdq xmm13,xmm0
- movq xmm0, [rdx]
- movdqa xmm4,xmm13
- punpcklqdq xmm15,xmm0
- movq xmm0, [rdi]
- punpcklbw xmm4,xmm1
- movdqa xmm12,xmm15
- punpcklqdq xmm2,xmm0
- movq xmm0, [r11+rdi]
- punpcklbw xmm12,xmm1
- movdqa xmm14,xmm2
- punpcklqdq xmm9,xmm0
- punpckhbw xmm2,xmm1
- punpcklbw xmm14,xmm1
- movd xmm0,eax
- mov eax, ebp ; iBeta
- punpckhbw xmm13,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm9
- movdqa [rsp+10h],xmm2
- punpcklwd xmm0,xmm0
- punpckhbw xmm9,xmm1
- punpcklbw xmm3,xmm1
- movdqa xmm1,xmm14
- pshufd xmm10,xmm0,0
- movd xmm0,eax
- mov eax,4
- cwde
- punpcklwd xmm0,xmm0
- pshufd xmm8,xmm0,0
- movd xmm0,eax
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- psubw xmm1,xmm12
- movdqa xmm2,xmm10
- lea r11,[rsp+0C8h]
- psllw xmm1,2
- movdqa xmm0,xmm4
- psubw xmm4,xmm12
- psubw xmm0,xmm3
- psubw xmm3,xmm14
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm11
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm12
- psubw xmm0,xmm14
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- movdqa xmm3,[rsp]
- pand xmm2,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- psubw xmm0,xmm9
- psubw xmm13,xmm15
- pand xmm2,xmm7
- pand xmm6,xmm2
- paddw xmm12,xmm6
- psubw xmm14,xmm6
- movdqa xmm2,[rsp+10h]
- movaps xmm6,[r11-18h]
- movdqa xmm1,xmm2
- psubw xmm1,xmm15
- psubw xmm9,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm15
- psubw xmm0,xmm2
- psraw xmm1,3
- pmaxsw xmm11,xmm1
- pabsw xmm0,xmm0
- movdqa xmm1,xmm8
- pcmpgtw xmm10,xmm0
- pabsw xmm0,xmm13
- pminsw xmm3,xmm11
- movaps xmm11,[r11-68h]
- movaps xmm13,[rsp+40h]
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm9
- movaps xmm9, [r11-48h]
- pand xmm10,xmm1
- pcmpgtw xmm8,xmm0
- pand xmm10,xmm8
- pand xmm10,xmm7
- movaps xmm8,[r11-38h]
- movaps xmm7,[r11-28h]
- pand xmm3,xmm10
- paddw xmm15,xmm3
- psubw xmm2,xmm3
- movaps xmm10,[r11-58h]
- packuswb xmm12,xmm15
- movaps xmm15,[rsp+20h]
- packuswb xmm14,xmm2
- movq [rcx],xmm12
- movq [rbx],xmm14
- psrldq xmm12,8
- psrldq xmm14,8
- movq [rdx],xmm12
- movaps xmm12,[r11-78h]
- movq [rdi],xmm14
- movaps xmm14,[rsp+30h]
- mov rsp,r11
- pop rbp
- pop rbx
- ret
+ mov rax,rsp
+ push rbx
+ push rbp
+ mov r10, rdx
+ mov r11, rcx
+ mov rcx, rdi
+ mov rdx, rsi
+ mov rsi, r10
+ mov r10, r9
+ mov rbp, r8
+ mov r8, rsi
+ mov r9, r11
+ sub rsp,0C8h
+ pxor xmm1,xmm1
+ mov rbx,rcx
+ movsxd r11,r8d
+ movsx ecx,byte [r10]
+ movsx r8d,byte [r10+2]
+ mov rdi,rdx
+ movq xmm2,[rbx]
+ movq xmm9,[r11+rbx]
+ movsx edx,byte [r10+1]
+ mov word [rsp+2],cx
+ mov word [rsp],cx
+ movsx eax,byte [r10+3]
+ mov word [rsp+6],dx
+ mov word [rsp+4],dx
+ movdqa xmm11,xmm1
+ mov word [rsp+0Eh],ax
+ mov word [rsp+0Ch],ax
+ lea eax,[r11+r11]
+ movsxd rcx,eax
+ mov rax,rbx
+ mov rdx,rdi
+ sub rax,rcx
+ mov word [rsp+0Ah],r8w
+ mov word [rsp+8],r8w
+ movdqa xmm6,[rsp]
+ movdqa xmm7,xmm6
+ movq xmm13, [rax]
+ mov rax,rdi
+ sub rax,rcx
+ mov rcx,rbx
+ pcmpgtw xmm7,xmm1
+ psubw xmm11,xmm6
+ sub rcx,r11
+ sub rdx,r11
+ movq xmm0,[rax]
+ movsx eax,r9w
+ movq xmm15,[rcx]
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rdx]
+ movdqa xmm4,xmm13
+ punpcklqdq xmm15,xmm0
+ movq xmm0, [rdi]
+ punpcklbw xmm4,xmm1
+ movdqa xmm12,xmm15
+ punpcklqdq xmm2,xmm0
+ movq xmm0, [r11+rdi]
+ punpcklbw xmm12,xmm1
+ movdqa xmm14,xmm2
+ punpcklqdq xmm9,xmm0
+ punpckhbw xmm2,xmm1
+ punpcklbw xmm14,xmm1
+ movd xmm0,eax
+ mov eax, ebp ; iBeta
+ punpckhbw xmm13,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm9
+ movdqa [rsp+10h],xmm2
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm9,xmm1
+ punpcklbw xmm3,xmm1
+ movdqa xmm1,xmm14
+ pshufd xmm10,xmm0,0
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ punpcklwd xmm0,xmm0
+ pshufd xmm8,xmm0,0
+ movd xmm0,eax
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ psubw xmm1,xmm12
+ movdqa xmm2,xmm10
+ lea r11,[rsp+0C8h]
+ psllw xmm1,2
+ movdqa xmm0,xmm4
+ psubw xmm4,xmm12
+ psubw xmm0,xmm3
+ psubw xmm3,xmm14
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm11
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm12
+ psubw xmm0,xmm14
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ movdqa xmm3,[rsp]
+ pand xmm2,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ psubw xmm0,xmm9
+ psubw xmm13,xmm15
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ paddw xmm12,xmm6
+ psubw xmm14,xmm6
+ movdqa xmm2,[rsp+10h]
+ movaps xmm6,[r11-18h]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm15
+ psubw xmm9,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm15
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ pmaxsw xmm11,xmm1
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm8
+ pcmpgtw xmm10,xmm0
+ pabsw xmm0,xmm13
+ pminsw xmm3,xmm11
+ movaps xmm11,[r11-68h]
+ movaps xmm13,[rsp+40h]
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm9
+ movaps xmm9, [r11-48h]
+ pand xmm10,xmm1
+ pcmpgtw xmm8,xmm0
+ pand xmm10,xmm8
+ pand xmm10,xmm7
+ movaps xmm8,[r11-38h]
+ movaps xmm7,[r11-28h]
+ pand xmm3,xmm10
+ paddw xmm15,xmm3
+ psubw xmm2,xmm3
+ movaps xmm10,[r11-58h]
+ packuswb xmm12,xmm15
+ movaps xmm15,[rsp+20h]
+ packuswb xmm14,xmm2
+ movq [rcx],xmm12
+ movq [rbx],xmm14
+ psrldq xmm12,8
+ psrldq xmm14,8
+ movq [rdx],xmm12
+ movaps xmm12,[r11-78h]
+ movq [rdi],xmm14
+ movaps xmm14,[rsp+30h]
+ mov rsp,r11
+ pop rbp
+ pop rbx
+ ret
WELS_EXTERN DeblockChromaEq4V_ssse3
- mov rax,rsp
- push rbx
- push rbp
+ mov rax,rsp
+ push rbx
+ push rbp
- mov rbp, r8
- mov r8, rdx
- mov r9, rcx
- mov rcx, rdi
- mov rdx, rsi
+ mov rbp, r8
+ mov r8, rdx
+ mov r9, rcx
+ mov rcx, rdi
+ mov rdx, rsi
- sub rsp,90h
- pxor xmm1,xmm1
- mov r11,rcx
- mov rbx,rdx
- mov r10d,r9d
- movq xmm13,[r11]
- lea eax,[r8+r8]
- movsxd r9,eax
- mov rax,rcx
- sub rax,r9
- movq xmm14,[rax]
- mov rax,rdx
- sub rax,r9
- movq xmm0,[rax]
- movsxd rax,r8d
- sub rcx,rax
- sub rdx,rax
- movq xmm12,[rax+r11]
- movq xmm10,[rcx]
- punpcklqdq xmm14,xmm0
- movdqa xmm8,xmm14
- movq xmm0,[rdx]
- punpcklbw xmm8,xmm1
- punpckhbw xmm14,xmm1
- punpcklqdq xmm10,xmm0
- movq xmm0,[rbx]
- movdqa xmm5,xmm10
- punpcklqdq xmm13,xmm0
- movq xmm0, [rax+rbx]
- punpcklbw xmm5,xmm1
- movsx eax,r10w
- movdqa xmm9,xmm13
- punpcklqdq xmm12,xmm0
- punpcklbw xmm9,xmm1
- punpckhbw xmm10,xmm1
- movd xmm0,eax
- mov eax, ebp ; iBeta
- punpckhbw xmm13,xmm1
- movdqa xmm7,xmm12
- punpcklwd xmm0,xmm0
- punpckhbw xmm12,xmm1
- pshufd xmm11,xmm0,0
- punpcklbw xmm7,xmm1
- movd xmm0,eax
- movdqa xmm1,xmm8
- psubw xmm1,xmm5
- punpcklwd xmm0,xmm0
- movdqa xmm6,xmm11
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm5
- psubw xmm0,xmm9
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm10
- movdqa xmm1,xmm14
- psubw xmm0,xmm13
- psubw xmm1,xmm10
- pabsw xmm0,xmm0
- pcmpgtw xmm11,xmm0
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm11,xmm2
- movdqa xmm0,xmm12
- movdqa xmm4,xmm6
- movdqa xmm1,xmm8
- mov eax,2
- cwde
- paddw xmm1,xmm8
- psubw xmm0,xmm13
- paddw xmm1,xmm5
- pabsw xmm0,xmm0
- movdqa xmm2,xmm14
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm14
- movd xmm0,eax
- pand xmm11,xmm3
- paddw xmm7,xmm7
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- paddw xmm2,xmm12
- paddw xmm12,xmm12
- pshufd xmm3,xmm0,0
- paddw xmm7,xmm9
- paddw xmm12,xmm13
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm5
- paddw xmm7,xmm8
- psraw xmm1,2
- paddw xmm12,xmm14
- paddw xmm7,xmm3
- ;movaps xmm14,[rsp]
- pand xmm4,xmm1
- paddw xmm12,xmm3
- psraw xmm7,2
- movdqa xmm1,xmm11
- por xmm4,xmm0
- psraw xmm12,2
- paddw xmm2,xmm3
- movdqa xmm0,xmm11
- pandn xmm0,xmm10
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- packuswb xmm4,xmm1
- movdqa xmm0,xmm11
- movdqa xmm1,xmm6
- pand xmm1,xmm7
- movq [rcx],xmm4
- pandn xmm6,xmm9
- pandn xmm11,xmm13
- pand xmm0,xmm12
- por xmm1,xmm6
- por xmm0,xmm11
- psrldq xmm4,8
- packuswb xmm1,xmm0
- movq [r11],xmm1
- psrldq xmm1,8
- movq [rdx],xmm4
- lea r11,[rsp+90h]
- movq [rbx],xmm1
- mov rsp,r11
- pop rbp
- pop rbx
- ret
+ sub rsp,90h
+ pxor xmm1,xmm1
+ mov r11,rcx
+ mov rbx,rdx
+ mov r10d,r9d
+ movq xmm13,[r11]
+ lea eax,[r8+r8]
+ movsxd r9,eax
+ mov rax,rcx
+ sub rax,r9
+ movq xmm14,[rax]
+ mov rax,rdx
+ sub rax,r9
+ movq xmm0,[rax]
+ movsxd rax,r8d
+ sub rcx,rax
+ sub rdx,rax
+ movq xmm12,[rax+r11]
+ movq xmm10,[rcx]
+ punpcklqdq xmm14,xmm0
+ movdqa xmm8,xmm14
+ movq xmm0,[rdx]
+ punpcklbw xmm8,xmm1
+ punpckhbw xmm14,xmm1
+ punpcklqdq xmm10,xmm0
+ movq xmm0,[rbx]
+ movdqa xmm5,xmm10
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rax+rbx]
+ punpcklbw xmm5,xmm1
+ movsx eax,r10w
+ movdqa xmm9,xmm13
+ punpcklqdq xmm12,xmm0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm10,xmm1
+ movd xmm0,eax
+ mov eax, ebp ; iBeta
+ punpckhbw xmm13,xmm1
+ movdqa xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm12,xmm1
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm7,xmm1
+ movd xmm0,eax
+ movdqa xmm1,xmm8
+ psubw xmm1,xmm5
+ punpcklwd xmm0,xmm0
+ movdqa xmm6,xmm11
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm9
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm10
+ movdqa xmm1,xmm14
+ psubw xmm0,xmm13
+ psubw xmm1,xmm10
+ pabsw xmm0,xmm0
+ pcmpgtw xmm11,xmm0
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm11,xmm2
+ movdqa xmm0,xmm12
+ movdqa xmm4,xmm6
+ movdqa xmm1,xmm8
+ mov eax,2
+ cwde
+ paddw xmm1,xmm8
+ psubw xmm0,xmm13
+ paddw xmm1,xmm5
+ pabsw xmm0,xmm0
+ movdqa xmm2,xmm14
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm14
+ movd xmm0,eax
+ pand xmm11,xmm3
+ paddw xmm7,xmm7
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ paddw xmm2,xmm12
+ paddw xmm12,xmm12
+ pshufd xmm3,xmm0,0
+ paddw xmm7,xmm9
+ paddw xmm12,xmm13
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm5
+ paddw xmm7,xmm8
+ psraw xmm1,2
+ paddw xmm12,xmm14
+ paddw xmm7,xmm3
+ ;movaps xmm14,[rsp]
+ pand xmm4,xmm1
+ paddw xmm12,xmm3
+ psraw xmm7,2
+ movdqa xmm1,xmm11
+ por xmm4,xmm0
+ psraw xmm12,2
+ paddw xmm2,xmm3
+ movdqa xmm0,xmm11
+ pandn xmm0,xmm10
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ packuswb xmm4,xmm1
+ movdqa xmm0,xmm11
+ movdqa xmm1,xmm6
+ pand xmm1,xmm7
+ movq [rcx],xmm4
+ pandn xmm6,xmm9
+ pandn xmm11,xmm13
+ pand xmm0,xmm12
+ por xmm1,xmm6
+ por xmm0,xmm11
+ psrldq xmm4,8
+ packuswb xmm1,xmm0
+ movq [r11],xmm1
+ psrldq xmm1,8
+ movq [rdx],xmm4
+ lea r11,[rsp+90h]
+ movq [rbx],xmm1
+ mov rsp,r11
+ pop rbp
+ pop rbx
+ ret
WELS_EXTERN DeblockChromaEq4H_ssse3
- mov rax,rsp
- push rbx
- push rbp
- push r12
+ mov rax,rsp
+ push rbx
+ push rbp
+ push r12
- mov rbp, r8
- mov r8, rdx
- mov r9, rcx
- mov rcx, rdi
- mov rdx, rsi
- mov rdi, rdx
+ mov rbp, r8
+ mov r8, rdx
+ mov r9, rcx
+ mov rcx, rdi
+ mov rdx, rsi
+ mov rdi, rdx
- sub rsp,140h
- lea eax,[r8*4]
- movsxd r10,eax
- mov eax,[rcx-2]
- mov [rsp+10h],eax
- lea rbx,[r10+rdx-2]
- lea r11,[r10+rcx-2]
+ sub rsp,140h
+ lea eax,[r8*4]
+ movsxd r10,eax
+ mov eax,[rcx-2]
+ mov [rsp+10h],eax
+ lea rbx,[r10+rdx-2]
+ lea r11,[r10+rcx-2]
- movdqa xmm5,[rsp+10h]
- movsxd r10,r8d
- mov eax,[r10+rcx-2]
- lea rdx,[r10+r10*2]
- mov [rsp+20h],eax
- mov eax,[rcx+r10*2-2]
- mov [rsp+30h],eax
- mov eax,[rdx+rcx-2]
- movdqa xmm2,[rsp+20h]
- mov [rsp+40h],eax
- mov eax, [rdi-2]
- movdqa xmm4,[rsp+30h]
- mov [rsp+50h],eax
- mov eax,[r10+rdi-2]
- movdqa xmm3,[rsp+40h]
- mov [rsp+60h],eax
- mov eax,[rdi+r10*2-2]
- punpckldq xmm5,[rsp+50h]
- mov [rsp+70h],eax
- mov eax, [rdx+rdi-2]
- punpckldq xmm2, [rsp+60h]
- mov [rsp+80h],eax
- mov eax,[r11]
- punpckldq xmm4, [rsp+70h]
- mov [rsp+50h],eax
- mov eax,[rbx]
- punpckldq xmm3,[rsp+80h]
- mov [rsp+60h],eax
- mov eax,[r10+r11]
- movdqa xmm0, [rsp+50h]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[r10+rbx]
- movdqa xmm0,[rsp+50h]
- movdqa xmm1,xmm5
- mov [rsp+60h],eax
- mov eax,[r11+r10*2]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[rbx+r10*2]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- mov eax, [rdx+r11]
- movdqa xmm15,xmm1
- punpckldq xmm0,[rsp+60h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax, [rdx+rbx]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm15,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm12,xmm15
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm12,xmm0
- punpckhdq xmm15,xmm0
- movdqa xmm0,xmm1
- movdqa xmm11,xmm12
- punpckldq xmm0,xmm5
- punpckhdq xmm1,xmm5
- punpcklqdq xmm11,xmm0
- punpckhqdq xmm12,xmm0
- movsx eax,r9w
- movdqa xmm14,xmm15
- punpcklqdq xmm14,xmm1
- punpckhqdq xmm15,xmm1
- pxor xmm1,xmm1
- movd xmm0,eax
- movdqa xmm4,xmm12
- movdqa xmm8,xmm11
- mov eax, ebp ; iBeta
- punpcklwd xmm0,xmm0
- punpcklbw xmm4,xmm1
- punpckhbw xmm12,xmm1
- movdqa xmm9,xmm14
- movdqa xmm7,xmm15
- movdqa xmm10,xmm15
- pshufd xmm13,xmm0,0
- punpcklbw xmm9,xmm1
- punpckhbw xmm14,xmm1
- movdqa xmm6,xmm13
- movd xmm0,eax
- movdqa [rsp],xmm11
- mov eax,2
- cwde
- punpckhbw xmm11,xmm1
- punpckhbw xmm10,xmm1
- punpcklbw xmm7,xmm1
- punpcklwd xmm0,xmm0
- punpcklbw xmm8,xmm1
- pshufd xmm3,xmm0,0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm4
- psubw xmm0,xmm9
- psubw xmm1,xmm4
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm12
- movdqa xmm1,xmm11
- psubw xmm0,xmm14
- psubw xmm1,xmm12
- movdqa xmm5,xmm6
- pabsw xmm0,xmm0
- pcmpgtw xmm13,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm2,xmm0
- paddw xmm1,xmm8
- movdqa xmm0,xmm10
- pand xmm13,xmm2
- psubw xmm0,xmm14
- paddw xmm1,xmm4
- movdqa xmm2,xmm11
- pabsw xmm0,xmm0
- paddw xmm2,xmm11
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm12
- movd xmm0,eax
- pand xmm13,xmm3
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm4
- paddw xmm2,xmm3
- psraw xmm1,2
- pand xmm5,xmm1
- por xmm5,xmm0
- paddw xmm7,xmm7
- paddw xmm10,xmm10
- psraw xmm2,2
- movdqa xmm1,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm2
- paddw xmm7,xmm9
- por xmm1,xmm0
- paddw xmm10,xmm14
- paddw xmm7,xmm8
- movdqa xmm0,xmm13
- packuswb xmm5,xmm1
- paddw xmm7,xmm3
- paddw xmm10,xmm11
- movdqa xmm1,xmm6
- paddw xmm10,xmm3
- pandn xmm6,xmm9
- psraw xmm7,2
- pand xmm1,xmm7
- psraw xmm10,2
- pandn xmm13,xmm14
- pand xmm0,xmm10
- por xmm1,xmm6
- movdqa xmm6,[rsp]
- movdqa xmm4,xmm6
- por xmm0,xmm13
- punpcklbw xmm4,xmm5
- punpckhbw xmm6,xmm5
- movdqa xmm3,xmm4
- packuswb xmm1,xmm0
- movdqa xmm0,xmm1
- punpckhbw xmm1,xmm15
- punpcklbw xmm0,xmm15
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm6
- movdqa xmm2,xmm3
- punpcklwd xmm0,xmm1
- punpckhwd xmm6,xmm1
- movdqa xmm1,xmm4
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm6
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm6
- punpckhqdq xmm2,xmm1
- movdqa [rsp+10h],xmm0
- movdqa [rsp+60h],xmm2
- movdqa xmm0,xmm3
- mov eax,[rsp+10h]
- mov [rcx-2],eax
- mov eax,[rsp+60h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [r10+rcx-2],eax
- movdqa [rsp+20h],xmm0
- mov eax, [rsp+20h]
- movdqa [rsp+70h],xmm3
- mov [rcx+r10*2-2],eax
- mov eax,[rsp+70h]
- mov [rdx+rcx-2],eax
- mov eax,[rsp+18h]
- mov [r11],eax
- mov eax,[rsp+68h]
- mov [r10+r11],eax
- mov eax,[rsp+28h]
- mov [r11+r10*2],eax
- mov eax,[rsp+78h]
- mov [rdx+r11],eax
- mov eax,[rsp+14h]
- mov [rdi-2],eax
- mov eax,[rsp+64h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+24h]
- mov [rdi+r10*2-2],eax
- mov eax, [rsp+74h]
- mov [rdx+rdi-2],eax
- mov eax, [rsp+1Ch]
- mov [rbx],eax
- mov eax, [rsp+6Ch]
- mov [r10+rbx],eax
- mov eax,[rsp+2Ch]
- mov [rbx+r10*2],eax
- mov eax,[rsp+7Ch]
- mov [rdx+rbx],eax
- lea r11,[rsp+140h]
- mov rbx, [r11+28h]
- mov rsp,r11
- pop r12
- pop rbp
- pop rbx
- ret
+ movdqa xmm5,[rsp+10h]
+ movsxd r10,r8d
+ mov eax,[r10+rcx-2]
+ lea rdx,[r10+r10*2]
+ mov [rsp+20h],eax
+ mov eax,[rcx+r10*2-2]
+ mov [rsp+30h],eax
+ mov eax,[rdx+rcx-2]
+ movdqa xmm2,[rsp+20h]
+ mov [rsp+40h],eax
+ mov eax, [rdi-2]
+ movdqa xmm4,[rsp+30h]
+ mov [rsp+50h],eax
+ mov eax,[r10+rdi-2]
+ movdqa xmm3,[rsp+40h]
+ mov [rsp+60h],eax
+ mov eax,[rdi+r10*2-2]
+ punpckldq xmm5,[rsp+50h]
+ mov [rsp+70h],eax
+ mov eax, [rdx+rdi-2]
+ punpckldq xmm2, [rsp+60h]
+ mov [rsp+80h],eax
+ mov eax,[r11]
+ punpckldq xmm4, [rsp+70h]
+ mov [rsp+50h],eax
+ mov eax,[rbx]
+ punpckldq xmm3,[rsp+80h]
+ mov [rsp+60h],eax
+ mov eax,[r10+r11]
+ movdqa xmm0, [rsp+50h]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm0,[rsp+50h]
+ movdqa xmm1,xmm5
+ mov [rsp+60h],eax
+ mov eax,[r11+r10*2]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[rbx+r10*2]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ mov eax, [rdx+r11]
+ movdqa xmm15,xmm1
+ punpckldq xmm0,[rsp+60h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax, [rdx+rbx]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm15,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm12,xmm15
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm12,xmm0
+ punpckhdq xmm15,xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm11,xmm12
+ punpckldq xmm0,xmm5
+ punpckhdq xmm1,xmm5
+ punpcklqdq xmm11,xmm0
+ punpckhqdq xmm12,xmm0
+ movsx eax,r9w
+ movdqa xmm14,xmm15
+ punpcklqdq xmm14,xmm1
+ punpckhqdq xmm15,xmm1
+ pxor xmm1,xmm1
+ movd xmm0,eax
+ movdqa xmm4,xmm12
+ movdqa xmm8,xmm11
+ mov eax, ebp ; iBeta
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm4,xmm1
+ punpckhbw xmm12,xmm1
+ movdqa xmm9,xmm14
+ movdqa xmm7,xmm15
+ movdqa xmm10,xmm15
+ pshufd xmm13,xmm0,0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm14,xmm1
+ movdqa xmm6,xmm13
+ movd xmm0,eax
+ movdqa [rsp],xmm11
+ mov eax,2
+ cwde
+ punpckhbw xmm11,xmm1
+ punpckhbw xmm10,xmm1
+ punpcklbw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm8,xmm1
+ pshufd xmm3,xmm0,0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm4
+ psubw xmm0,xmm9
+ psubw xmm1,xmm4
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm12
+ movdqa xmm1,xmm11
+ psubw xmm0,xmm14
+ psubw xmm1,xmm12
+ movdqa xmm5,xmm6
+ pabsw xmm0,xmm0
+ pcmpgtw xmm13,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm2,xmm0
+ paddw xmm1,xmm8
+ movdqa xmm0,xmm10
+ pand xmm13,xmm2
+ psubw xmm0,xmm14
+ paddw xmm1,xmm4
+ movdqa xmm2,xmm11
+ pabsw xmm0,xmm0
+ paddw xmm2,xmm11
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm12
+ movd xmm0,eax
+ pand xmm13,xmm3
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm4
+ paddw xmm2,xmm3
+ psraw xmm1,2
+ pand xmm5,xmm1
+ por xmm5,xmm0
+ paddw xmm7,xmm7
+ paddw xmm10,xmm10
+ psraw xmm2,2
+ movdqa xmm1,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm2
+ paddw xmm7,xmm9
+ por xmm1,xmm0
+ paddw xmm10,xmm14
+ paddw xmm7,xmm8
+ movdqa xmm0,xmm13
+ packuswb xmm5,xmm1
+ paddw xmm7,xmm3
+ paddw xmm10,xmm11
+ movdqa xmm1,xmm6
+ paddw xmm10,xmm3
+ pandn xmm6,xmm9
+ psraw xmm7,2
+ pand xmm1,xmm7
+ psraw xmm10,2
+ pandn xmm13,xmm14
+ pand xmm0,xmm10
+ por xmm1,xmm6
+ movdqa xmm6,[rsp]
+ movdqa xmm4,xmm6
+ por xmm0,xmm13
+ punpcklbw xmm4,xmm5
+ punpckhbw xmm6,xmm5
+ movdqa xmm3,xmm4
+ packuswb xmm1,xmm0
+ movdqa xmm0,xmm1
+ punpckhbw xmm1,xmm15
+ punpcklbw xmm0,xmm15
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm6
+ movdqa xmm2,xmm3
+ punpcklwd xmm0,xmm1
+ punpckhwd xmm6,xmm1
+ movdqa xmm1,xmm4
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm6
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm6
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+10h],xmm0
+ movdqa [rsp+60h],xmm2
+ movdqa xmm0,xmm3
+ mov eax,[rsp+10h]
+ mov [rcx-2],eax
+ mov eax,[rsp+60h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [r10+rcx-2],eax
+ movdqa [rsp+20h],xmm0
+ mov eax, [rsp+20h]
+ movdqa [rsp+70h],xmm3
+ mov [rcx+r10*2-2],eax
+ mov eax,[rsp+70h]
+ mov [rdx+rcx-2],eax
+ mov eax,[rsp+18h]
+ mov [r11],eax
+ mov eax,[rsp+68h]
+ mov [r10+r11],eax
+ mov eax,[rsp+28h]
+ mov [r11+r10*2],eax
+ mov eax,[rsp+78h]
+ mov [rdx+r11],eax
+ mov eax,[rsp+14h]
+ mov [rdi-2],eax
+ mov eax,[rsp+64h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+24h]
+ mov [rdi+r10*2-2],eax
+ mov eax, [rsp+74h]
+ mov [rdx+rdi-2],eax
+ mov eax, [rsp+1Ch]
+ mov [rbx],eax
+ mov eax, [rsp+6Ch]
+ mov [r10+rbx],eax
+ mov eax,[rsp+2Ch]
+ mov [rbx+r10*2],eax
+ mov eax,[rsp+7Ch]
+ mov [rdx+rbx],eax
+ lea r11,[rsp+140h]
+ mov rbx, [r11+28h]
+ mov rsp,r11
+ pop r12
+ pop rbp
+ pop rbx
+ ret
WELS_EXTERN DeblockChromaLt4H_ssse3
- mov rax,rsp
- push rbx
- push rbp
- push r12
- push r13
- push r14
- sub rsp,170h
+ mov rax,rsp
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ sub rsp,170h
- mov r13, r8
- mov r14, r9
- mov r8, rdx
- mov r9, rcx
- mov rdx, rdi
- mov rcx, rsi
+ mov r13, r8
+ mov r14, r9
+ mov r8, rdx
+ mov r9, rcx
+ mov rdx, rdi
+ mov rcx, rsi
- movsxd rsi,r8d
- lea eax,[r8*4]
- mov r11d,r9d
- movsxd r10,eax
- mov eax, [rcx-2]
- mov r12,rdx
- mov [rsp+40h],eax
- mov eax, [rsi+rcx-2]
- lea rbx,[r10+rcx-2]
- movdqa xmm5,[rsp+40h]
- mov [rsp+50h],eax
- mov eax, [rcx+rsi*2-2]
- lea rbp,[r10+rdx-2]
- movdqa xmm2, [rsp+50h]
- mov [rsp+60h],eax
- lea r10,[rsi+rsi*2]
- mov rdi,rcx
- mov eax,[r10+rcx-2]
- movdqa xmm4,[rsp+60h]
- mov [rsp+70h],eax
- mov eax,[rdx-2]
- mov [rsp+80h],eax
- mov eax, [rsi+rdx-2]
- movdqa xmm3,[rsp+70h]
- mov [rsp+90h],eax
- mov eax,[rdx+rsi*2-2]
- punpckldq xmm5,[rsp+80h]
- mov [rsp+0A0h],eax
- mov eax, [r10+rdx-2]
- punpckldq xmm2,[rsp+90h]
- mov [rsp+0B0h],eax
- mov eax, [rbx]
- punpckldq xmm4,[rsp+0A0h]
- mov [rsp+80h],eax
- mov eax,[rbp]
- punpckldq xmm3,[rsp+0B0h]
- mov [rsp+90h],eax
- mov eax,[rsi+rbx]
- movdqa xmm0,[rsp+80h]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rsi+rbp]
- movdqa xmm0,[rsp+80h]
- movdqa xmm1,xmm5
- mov [rsp+90h],eax
- mov eax,[rbx+rsi*2]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rbp+rsi*2]
- movdqa xmm0, [rsp+80h]
- mov [rsp+90h],eax
- mov eax,[r10+rbx]
- movdqa xmm7,xmm1
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax, [r10+rbp]
- movdqa xmm0,[rsp+80h]
- mov [rsp+90h],eax
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm7,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm6,xmm7
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm6,xmm0
- punpckhdq xmm7,xmm0
- movdqa xmm0,xmm1
- punpckldq xmm0,xmm5
- mov rax, r14 ; pTC
- punpckhdq xmm1,xmm5
- movdqa xmm9,xmm6
- punpckhqdq xmm6,xmm0
- punpcklqdq xmm9,xmm0
- movdqa xmm2,xmm7
- movdqa xmm13,xmm6
- movdqa xmm4,xmm9
- movdqa [rsp+10h],xmm9
- punpcklqdq xmm2,xmm1
- punpckhqdq xmm7,xmm1
- pxor xmm1,xmm1
- movsx ecx,byte [rax+3]
- movsx edx,byte [rax+2]
- movsx r8d,byte [rax+1]
- movsx r9d,byte [rax]
- movdqa xmm10,xmm1
- movdqa xmm15,xmm2
- punpckhbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm4,xmm1
- movsx eax,r11w
- mov word [rsp+0Eh],cx
- mov word [rsp+0Ch],cx
- movdqa xmm3,xmm7
- movdqa xmm8,xmm7
- movdqa [rsp+20h],xmm7
- punpcklbw xmm15,xmm1
- punpcklbw xmm13,xmm1
- punpcklbw xmm3,xmm1
- mov word [rsp+0Ah],dx
- mov word [rsp+8],dx
- mov word [rsp+6],r8w
- movd xmm0,eax
- movdqa [rsp+30h],xmm6
- punpckhbw xmm9,xmm1
- punpckhbw xmm8,xmm1
- punpcklwd xmm0,xmm0
- mov eax, r13d ; iBeta
- mov word [rsp+4],r8w
- mov word [rsp+2],r9w
- pshufd xmm12,xmm0,0
- mov word [rsp],r9w
- movd xmm0,eax
- mov eax,4
- cwde
- movdqa xmm14, [rsp]
- movdqa [rsp],xmm2
- movdqa xmm2,xmm12
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- psubw xmm10,xmm14
- movd xmm0,eax
- movdqa xmm7,xmm14
- movdqa xmm6,xmm14
- pcmpgtw xmm7,xmm1
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- movdqa xmm0,xmm4
- movdqa xmm1,xmm15
- psubw xmm4,xmm13
- psubw xmm0,xmm3
- psubw xmm1,xmm13
- psubw xmm3,xmm15
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm10
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm11
- movdqa xmm0,xmm13
- psubw xmm0,xmm15
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- pand xmm2,xmm1
- movdqa xmm1,xmm11
- movdqa xmm3,[rsp+30h]
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm9
- pand xmm2,xmm1
- psubw xmm0,xmm8
- psubw xmm9,xmm3
- pand xmm2,xmm7
- pand xmm6,xmm2
- psubw xmm15,xmm6
- paddw xmm13,xmm6
- movdqa xmm2,[rsp]
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- psubw xmm8,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm3
- movdqa xmm5,[rsp+10h]
- psubw xmm0,xmm2
- psraw xmm1,3
- movdqa xmm4,xmm5
- pabsw xmm0,xmm0
- pmaxsw xmm10,xmm1
- movdqa xmm1,xmm11
- pcmpgtw xmm12,xmm0
- pabsw xmm0,xmm9
- pminsw xmm14,xmm10
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm8
- pcmpgtw xmm11,xmm0
- pand xmm12,xmm1
- movdqa xmm1,[rsp+20h]
- pand xmm12,xmm11
- pand xmm12,xmm7
- pand xmm14,xmm12
- paddw xmm3,xmm14
- psubw xmm2,xmm14
- packuswb xmm13,xmm3
- packuswb xmm15,xmm2
- punpcklbw xmm4,xmm13
- punpckhbw xmm5,xmm13
- movdqa xmm0,xmm15
- punpcklbw xmm0,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm4
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm5
- movdqa xmm2,xmm3
- movdqa xmm1,xmm4
- punpcklwd xmm0,xmm15
- punpckhwd xmm5,xmm15
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm5
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm5
- punpckhqdq xmm2,xmm1
- movdqa [rsp+40h],xmm0
- movdqa xmm0,xmm3
- movdqa [rsp+90h],xmm2
- mov eax,[rsp+40h]
- mov [rdi-2],eax
- mov eax, [rsp+90h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [rsi+rdi-2],eax
- movdqa [rsp+50h],xmm0
- mov eax,[rsp+50h]
- movdqa [rsp+0A0h],xmm3
- mov [rdi+rsi*2-2],eax
- mov eax,[rsp+0A0h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+48h]
- mov [rbx],eax
- mov eax,[rsp+98h]
- mov [rsi+rbx],eax
- mov eax,[rsp+58h]
- mov [rbx+rsi*2],eax
- mov eax, [rsp+0A8h]
- mov [r10+rbx],eax
- mov eax, [rsp+44h]
- mov [r12-2],eax
- mov eax,[rsp+94h]
- mov [rsi+r12-2],eax
- mov eax,[rsp+54h]
- mov [r12+rsi*2-2],eax
- mov eax, [rsp+0A4h]
- mov [r10+r12-2],eax
- mov eax,[rsp+4Ch]
- mov [rbp],eax
- mov eax,[rsp+9Ch]
- mov [rsi+rbp],eax
- mov eax, [rsp+5Ch]
- mov [rbp+rsi*2],eax
- mov eax,[rsp+0ACh]
- mov [r10+rbp],eax
- lea r11,[rsp+170h]
- mov rsp,r11
- pop r14
- pop r13
- pop r12
- pop rbp
- pop rbx
- ret
+ movsxd rsi,r8d
+ lea eax,[r8*4]
+ mov r11d,r9d
+ movsxd r10,eax
+ mov eax, [rcx-2]
+ mov r12,rdx
+ mov [rsp+40h],eax
+ mov eax, [rsi+rcx-2]
+ lea rbx,[r10+rcx-2]
+ movdqa xmm5,[rsp+40h]
+ mov [rsp+50h],eax
+ mov eax, [rcx+rsi*2-2]
+ lea rbp,[r10+rdx-2]
+ movdqa xmm2, [rsp+50h]
+ mov [rsp+60h],eax
+ lea r10,[rsi+rsi*2]
+ mov rdi,rcx
+ mov eax,[r10+rcx-2]
+ movdqa xmm4,[rsp+60h]
+ mov [rsp+70h],eax
+ mov eax,[rdx-2]
+ mov [rsp+80h],eax
+ mov eax, [rsi+rdx-2]
+ movdqa xmm3,[rsp+70h]
+ mov [rsp+90h],eax
+ mov eax,[rdx+rsi*2-2]
+ punpckldq xmm5,[rsp+80h]
+ mov [rsp+0A0h],eax
+ mov eax, [r10+rdx-2]
+ punpckldq xmm2,[rsp+90h]
+ mov [rsp+0B0h],eax
+ mov eax, [rbx]
+ punpckldq xmm4,[rsp+0A0h]
+ mov [rsp+80h],eax
+ mov eax,[rbp]
+ punpckldq xmm3,[rsp+0B0h]
+ mov [rsp+90h],eax
+ mov eax,[rsi+rbx]
+ movdqa xmm0,[rsp+80h]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rsi+rbp]
+ movdqa xmm0,[rsp+80h]
+ movdqa xmm1,xmm5
+ mov [rsp+90h],eax
+ mov eax,[rbx+rsi*2]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rbp+rsi*2]
+ movdqa xmm0, [rsp+80h]
+ mov [rsp+90h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm7,xmm1
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax, [r10+rbp]
+ movdqa xmm0,[rsp+80h]
+ mov [rsp+90h],eax
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm7,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm6,xmm7
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm6,xmm0
+ punpckhdq xmm7,xmm0
+ movdqa xmm0,xmm1
+ punpckldq xmm0,xmm5
+ mov rax, r14 ; pTC
+ punpckhdq xmm1,xmm5
+ movdqa xmm9,xmm6
+ punpckhqdq xmm6,xmm0
+ punpcklqdq xmm9,xmm0
+ movdqa xmm2,xmm7
+ movdqa xmm13,xmm6
+ movdqa xmm4,xmm9
+ movdqa [rsp+10h],xmm9
+ punpcklqdq xmm2,xmm1
+ punpckhqdq xmm7,xmm1
+ pxor xmm1,xmm1
+ movsx ecx,byte [rax+3]
+ movsx edx,byte [rax+2]
+ movsx r8d,byte [rax+1]
+ movsx r9d,byte [rax]
+ movdqa xmm10,xmm1
+ movdqa xmm15,xmm2
+ punpckhbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm4,xmm1
+ movsx eax,r11w
+ mov word [rsp+0Eh],cx
+ mov word [rsp+0Ch],cx
+ movdqa xmm3,xmm7
+ movdqa xmm8,xmm7
+ movdqa [rsp+20h],xmm7
+ punpcklbw xmm15,xmm1
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm3,xmm1
+ mov word [rsp+0Ah],dx
+ mov word [rsp+8],dx
+ mov word [rsp+6],r8w
+ movd xmm0,eax
+ movdqa [rsp+30h],xmm6
+ punpckhbw xmm9,xmm1
+ punpckhbw xmm8,xmm1
+ punpcklwd xmm0,xmm0
+ mov eax, r13d ; iBeta
+ mov word [rsp+4],r8w
+ mov word [rsp+2],r9w
+ pshufd xmm12,xmm0,0
+ mov word [rsp],r9w
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ movdqa xmm14, [rsp]
+ movdqa [rsp],xmm2
+ movdqa xmm2,xmm12
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ psubw xmm10,xmm14
+ movd xmm0,eax
+ movdqa xmm7,xmm14
+ movdqa xmm6,xmm14
+ pcmpgtw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm15
+ psubw xmm4,xmm13
+ psubw xmm0,xmm3
+ psubw xmm1,xmm13
+ psubw xmm3,xmm15
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm10
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm11
+ movdqa xmm0,xmm13
+ psubw xmm0,xmm15
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ pand xmm2,xmm1
+ movdqa xmm1,xmm11
+ movdqa xmm3,[rsp+30h]
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm9
+ pand xmm2,xmm1
+ psubw xmm0,xmm8
+ psubw xmm9,xmm3
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ psubw xmm15,xmm6
+ paddw xmm13,xmm6
+ movdqa xmm2,[rsp]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ psubw xmm8,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm3
+ movdqa xmm5,[rsp+10h]
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ movdqa xmm4,xmm5
+ pabsw xmm0,xmm0
+ pmaxsw xmm10,xmm1
+ movdqa xmm1,xmm11
+ pcmpgtw xmm12,xmm0
+ pabsw xmm0,xmm9
+ pminsw xmm14,xmm10
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm8
+ pcmpgtw xmm11,xmm0
+ pand xmm12,xmm1
+ movdqa xmm1,[rsp+20h]
+ pand xmm12,xmm11
+ pand xmm12,xmm7
+ pand xmm14,xmm12
+ paddw xmm3,xmm14
+ psubw xmm2,xmm14
+ packuswb xmm13,xmm3
+ packuswb xmm15,xmm2
+ punpcklbw xmm4,xmm13
+ punpckhbw xmm5,xmm13
+ movdqa xmm0,xmm15
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm4
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm2,xmm3
+ movdqa xmm1,xmm4
+ punpcklwd xmm0,xmm15
+ punpckhwd xmm5,xmm15
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm5
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm5
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+40h],xmm0
+ movdqa xmm0,xmm3
+ movdqa [rsp+90h],xmm2
+ mov eax,[rsp+40h]
+ mov [rdi-2],eax
+ mov eax, [rsp+90h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [rsi+rdi-2],eax
+ movdqa [rsp+50h],xmm0
+ mov eax,[rsp+50h]
+ movdqa [rsp+0A0h],xmm3
+ mov [rdi+rsi*2-2],eax
+ mov eax,[rsp+0A0h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+48h]
+ mov [rbx],eax
+ mov eax,[rsp+98h]
+ mov [rsi+rbx],eax
+ mov eax,[rsp+58h]
+ mov [rbx+rsi*2],eax
+ mov eax, [rsp+0A8h]
+ mov [r10+rbx],eax
+ mov eax, [rsp+44h]
+ mov [r12-2],eax
+ mov eax,[rsp+94h]
+ mov [rsi+r12-2],eax
+ mov eax,[rsp+54h]
+ mov [r12+rsi*2-2],eax
+ mov eax, [rsp+0A4h]
+ mov [r10+r12-2],eax
+ mov eax,[rsp+4Ch]
+ mov [rbp],eax
+ mov eax,[rsp+9Ch]
+ mov [rsi+rbp],eax
+ mov eax, [rsp+5Ch]
+ mov [rbp+rsi*2],eax
+ mov eax,[rsp+0ACh]
+ mov [r10+rbp],eax
+ lea r11,[rsp+170h]
+ mov rsp,r11
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ ret
@@ -3233,166 +3233,166 @@
; int32_t iAlpha, int32_t iBeta)
;********************************************************************************
WELS_EXTERN DeblockChromaEq4V_ssse3
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,68h
- mov edx,[ebp+10h] ; iStride
- mov eax,[ebp+8] ; pPixCb
- mov ecx,[ebp+0Ch] ; pPixCr
- movq xmm4,[ecx]
- movq xmm5,[edx+ecx]
- push esi
- push edi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- movq xmm1,[edi]
- mov edi,ecx
- sub edi,esi
- movq xmm2,[edi]
- punpcklqdq xmm1,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm2,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm3,[edi]
- punpcklqdq xmm2,xmm3
- movq xmm3,[eax]
- punpcklqdq xmm3,xmm4
- movq xmm4,[edx+eax]
- mov edx, [ebp + 14h]
- punpcklqdq xmm4,xmm5
- movd xmm5,edx
- mov edx, [ebp + 18h]
- pxor xmm0,xmm0
- movdqa xmm6,xmm5
- punpcklwd xmm6,xmm5
- pshufd xmm5,xmm6,0
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,xmm1
- punpckhbw xmm1,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+40h],xmm1
- movdqa [esp+60h],xmm7
- movdqa xmm7,xmm2
- punpcklbw xmm7,xmm0
- movdqa [esp+10h],xmm7
- movdqa xmm7,xmm3
- punpcklbw xmm7,xmm0
- punpckhbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm7,xmm4
- punpckhbw xmm4,xmm0
- punpckhbw xmm2,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+30h],xmm3
- movdqa xmm3,[esp+10h]
- movdqa xmm1,xmm3
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa [esp+20h],xmm4
- movdqa xmm0,xmm5
- pcmpgtw xmm0,xmm1
- movdqa xmm1,[esp+60h]
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- pand xmm0,xmm4
- movdqa xmm1,xmm7
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,xmm2
- psubw xmm1,[esp+30h]
- pabsw xmm1,xmm1
- pcmpgtw xmm5,xmm1
- movdqa xmm1,[esp+40h]
- pand xmm0,xmm4
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,[esp+20h]
- psubw xmm1,[esp+30h]
- pand xmm5,xmm4
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- pand xmm5,xmm6
- mov edx,2
- movsx edx,dx
- movd xmm1,edx
- movdqa xmm4,xmm1
- punpcklwd xmm4,xmm1
- pshufd xmm1,xmm4,0
- movdqa xmm4,[esp+60h]
- movdqa xmm6,xmm4
- paddw xmm6,xmm4
- paddw xmm6,xmm3
- paddw xmm6,xmm7
- movdqa [esp+10h],xmm1
- paddw xmm6,[esp+10h]
- psraw xmm6,2
- movdqa xmm4,xmm0
- pandn xmm4,xmm3
- movdqa xmm3,[esp+40h]
- movdqa xmm1,xmm0
- pand xmm1,xmm6
- por xmm1,xmm4
- movdqa xmm6,xmm3
- paddw xmm6,xmm3
- movdqa xmm3,[esp+10h]
- paddw xmm6,xmm2
- paddw xmm6,[esp+20h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm4,xmm5
- pand xmm4,xmm6
- movdqa xmm6,xmm5
- pandn xmm6,xmm2
- por xmm4,xmm6
- packuswb xmm1,xmm4
- movdqa xmm4,[esp+50h]
- movdqa xmm6,xmm7
- paddw xmm6,xmm7
- paddw xmm6,xmm4
- paddw xmm6,[esp+60h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm2,xmm0
- pand xmm2,xmm6
- pandn xmm0,xmm4
- por xmm2,xmm0
- movdqa xmm0,[esp+20h]
- movdqa xmm6,xmm0
- paddw xmm6,xmm0
- movdqa xmm0,[esp+30h]
- paddw xmm6,xmm0
- paddw xmm6,[esp+40h]
- movdqa xmm4,xmm5
- paddw xmm6,xmm3
- movq [esi],xmm1
- psraw xmm6,2
- pand xmm4,xmm6
- pandn xmm5,xmm0
- por xmm4,xmm5
- packuswb xmm2,xmm4
- movq [eax],xmm2
- psrldq xmm1,8
- movq [edi],xmm1
- pop edi
- psrldq xmm2,8
- movq [ecx],xmm2
- pop esi
- mov esp,ebp
- pop ebp
- ret
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,68h
+ mov edx,[ebp+10h] ; iStride
+ mov eax,[ebp+8] ; pPixCb
+ mov ecx,[ebp+0Ch] ; pPixCr
+ movq xmm4,[ecx]
+ movq xmm5,[edx+ecx]
+ push esi
+ push edi
+ lea esi,[edx+edx]
+ mov edi,eax
+ sub edi,esi
+ movq xmm1,[edi]
+ mov edi,ecx
+ sub edi,esi
+ movq xmm2,[edi]
+ punpcklqdq xmm1,xmm2
+ mov esi,eax
+ sub esi,edx
+ movq xmm2,[esi]
+ mov edi,ecx
+ sub edi,edx
+ movq xmm3,[edi]
+ punpcklqdq xmm2,xmm3
+ movq xmm3,[eax]
+ punpcklqdq xmm3,xmm4
+ movq xmm4,[edx+eax]
+ mov edx, [ebp + 14h]
+ punpcklqdq xmm4,xmm5
+ movd xmm5,edx
+ mov edx, [ebp + 18h]
+ pxor xmm0,xmm0
+ movdqa xmm6,xmm5
+ punpcklwd xmm6,xmm5
+ pshufd xmm5,xmm6,0
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,xmm1
+ punpckhbw xmm1,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+40h],xmm1
+ movdqa [esp+60h],xmm7
+ movdqa xmm7,xmm2
+ punpcklbw xmm7,xmm0
+ movdqa [esp+10h],xmm7
+ movdqa xmm7,xmm3
+ punpcklbw xmm7,xmm0
+ punpckhbw xmm3,xmm0
+ movdqa [esp+50h],xmm7
+ movdqa xmm7,xmm4
+ punpckhbw xmm4,xmm0
+ punpckhbw xmm2,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+30h],xmm3
+ movdqa xmm3,[esp+10h]
+ movdqa xmm1,xmm3
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa [esp+20h],xmm4
+ movdqa xmm0,xmm5
+ pcmpgtw xmm0,xmm1
+ movdqa xmm1,[esp+60h]
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ pand xmm0,xmm4
+ movdqa xmm1,xmm7
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,xmm2
+ psubw xmm1,[esp+30h]
+ pabsw xmm1,xmm1
+ pcmpgtw xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ pand xmm0,xmm4
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,[esp+20h]
+ psubw xmm1,[esp+30h]
+ pand xmm5,xmm4
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ pand xmm5,xmm6
+ mov edx,2
+ movsx edx,dx
+ movd xmm1,edx
+ movdqa xmm4,xmm1
+ punpcklwd xmm4,xmm1
+ pshufd xmm1,xmm4,0
+ movdqa xmm4,[esp+60h]
+ movdqa xmm6,xmm4
+ paddw xmm6,xmm4
+ paddw xmm6,xmm3
+ paddw xmm6,xmm7
+ movdqa [esp+10h],xmm1
+ paddw xmm6,[esp+10h]
+ psraw xmm6,2
+ movdqa xmm4,xmm0
+ pandn xmm4,xmm3
+ movdqa xmm3,[esp+40h]
+ movdqa xmm1,xmm0
+ pand xmm1,xmm6
+ por xmm1,xmm4
+ movdqa xmm6,xmm3
+ paddw xmm6,xmm3
+ movdqa xmm3,[esp+10h]
+ paddw xmm6,xmm2
+ paddw xmm6,[esp+20h]
+ paddw xmm6,xmm3
+ psraw xmm6,2
+ movdqa xmm4,xmm5
+ pand xmm4,xmm6
+ movdqa xmm6,xmm5
+ pandn xmm6,xmm2
+ por xmm4,xmm6
+ packuswb xmm1,xmm4
+ movdqa xmm4,[esp+50h]
+ movdqa xmm6,xmm7
+ paddw xmm6,xmm7
+ paddw xmm6,xmm4
+ paddw xmm6,[esp+60h]
+ paddw xmm6,xmm3
+ psraw xmm6,2
+ movdqa xmm2,xmm0
+ pand xmm2,xmm6
+ pandn xmm0,xmm4
+ por xmm2,xmm0
+ movdqa xmm0,[esp+20h]
+ movdqa xmm6,xmm0
+ paddw xmm6,xmm0
+ movdqa xmm0,[esp+30h]
+ paddw xmm6,xmm0
+ paddw xmm6,[esp+40h]
+ movdqa xmm4,xmm5
+ paddw xmm6,xmm3
+ movq [esi],xmm1
+ psraw xmm6,2
+ pand xmm4,xmm6
+ pandn xmm5,xmm0
+ por xmm4,xmm5
+ packuswb xmm2,xmm4
+ movq [eax],xmm2
+ psrldq xmm1,8
+ movq [edi],xmm1
+ pop edi
+ psrldq xmm2,8
+ movq [ecx],xmm2
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
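
For reference while reading the block above: DeblockChromaEq4V_ssse3 is the strong (bS == 4) chroma filter. The paddw/psraw 2 sequence implements the (2*p1 + p0 + q1 + 2) >> 2 replacement, gated by the pcmpgtw masks built from iAlpha and iBeta. A minimal scalar sketch of one plane follows; the helper name is hypothetical and the assembly actually processes Cb and Cr side by side in the same registers.

    /* Scalar model of the bS==4 chroma path vectorized above;
     * a sketch, not part of deblock.asm. */
    #include <stdint.h>
    #include <stdlib.h>

    void DeblockChromaEq4_scalar (uint8_t* pPix, int32_t iStride,
                                  int32_t iAlpha, int32_t iBeta) {
        for (int32_t x = 0; x < 8; x++) {           /* 8 samples along the edge */
            uint8_t* p  = pPix + x;
            int32_t p1 = p[-2 * iStride], p0 = p[-iStride];
            int32_t q0 = p[0],            q1 = p[iStride];
            if (abs (p0 - q0) < iAlpha && abs (p1 - p0) < iBeta && abs (q1 - q0) < iBeta) {
                p[-iStride] = (uint8_t) ((2 * p1 + p0 + q1 + 2) >> 2);  /* p0' */
                p[0]        = (uint8_t) ((2 * q1 + q0 + p1 + 2) >> 2);  /* q0' */
            }
        }
    }

Where the condition fails the sample is left untouched, which is what the pand/pandn/por selects do in the vector code.
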
;******************************************************************************
; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
@@ -3400,200 +3400,200 @@
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4V_ssse3
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0E4h
- push ebx
- push esi
- mov esi, [ebp+1Ch] ; pTC
- movsx ebx, byte [esi+2]
- push edi
- movsx di,byte [esi+3]
- mov word [esp+0Ch],bx
- movsx bx,byte [esi+1]
- movsx esi,byte [esi]
- mov word [esp+0Eh],si
- movzx esi,di
- movd xmm1,esi
- movzx esi,di
- movd xmm2,esi
- mov si,word [esp+0Ch]
- mov edx, [ebp + 10h]
- mov eax, [ebp + 08h]
- movzx edi,si
- movzx esi,si
- mov ecx, [ebp + 0Ch]
- movd xmm4,esi
- movzx esi,bx
- movd xmm5,esi
- movd xmm3,edi
- movzx esi,bx
- movd xmm6,esi
- mov si,word [esp+0Eh]
- movzx edi,si
- movzx esi,si
- punpcklwd xmm6,xmm2
- pxor xmm0,xmm0
- movdqa [esp+40h],xmm0
- movd xmm7,edi
- movd xmm0,esi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+40h]
- punpcklwd xmm0,xmm4
- movq xmm4,[edx+ecx]
- punpcklwd xmm7,xmm3
- movq xmm3,[eax]
- punpcklwd xmm0,xmm6
- movq xmm6,[edi]
- punpcklwd xmm7,xmm5
- punpcklwd xmm0,xmm7
- mov edi,ecx
- sub edi,esi
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+60h],xmm2
- movq xmm2, [edi]
- punpcklqdq xmm6,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm7,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm2,[edi]
- punpcklqdq xmm7,xmm2
- movq xmm2,[ecx]
- punpcklqdq xmm3,xmm2
- movq xmm2,[edx+eax]
- movsx edx,word [ebp + 14h]
- punpcklqdq xmm2,xmm4
- movdqa [esp+0E0h],xmm2
- movd xmm2,edx
- movsx edx,word [ebp + 18h]
- movdqa xmm4,xmm2
- punpcklwd xmm4,xmm2
- movd xmm2,edx
- movdqa xmm5,xmm2
- punpcklwd xmm5,xmm2
- pshufd xmm2,xmm5,0
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- movdqa [esp+0D0h],xmm3
- pshufd xmm4,xmm4,0
- movdqa [esp+30h],xmm2
- punpckhbw xmm6,xmm1
- movdqa [esp+80h],xmm6
- movdqa xmm6,[esp+0D0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+70h],xmm6
- movdqa xmm6, [esp+0E0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+90h],xmm6
- movdqa xmm5, [esp+0E0h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0A0h],xmm7
- punpcklbw xmm3,xmm1
- mov edx,4
- punpcklbw xmm2,xmm1
- movsx edx,dx
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,[esp+30h]
- movdqa [esp+20h],xmm6
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa xmm1,[esp+60h]
- movdqa [esp+40h],xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6, [esp+20h]
- movdqa xmm7, [esp+50h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa [esp+10h],xmm0
- movdqa xmm6, [esp+10h]
- pminsw xmm6,xmm1
- movdqa [esp+10h],xmm6
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm6,xmm4
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+30h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1,[esp+50h]
- pand xmm6,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5,[esp+80h]
- psubw xmm5,[esp+90h]
- pand xmm6,xmm1
- pand xmm6,[esp+40h]
- movdqa xmm1,[esp+10h]
- pand xmm1,xmm6
- movdqa xmm6,[esp+70h]
- movdqa [esp+30h],xmm1
- movdqa xmm1,[esp+0A0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6,[esp+20h]
- movdqa xmm5,[esp+60h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+70h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+80h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+90h]
- pand xmm4,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+40h]
- pand xmm0,xmm4
- movdqa xmm4,[esp+30h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- packuswb xmm2,xmm1
- movq [esi],xmm2
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm3,xmm5
- movq [eax],xmm3
- psrldq xmm2,8
- movq [edi],xmm2
- pop edi
- pop esi
- psrldq xmm3,8
- movq [ecx],xmm3
- pop ebx
- mov esp,ebp
- pop ebp
- ret
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0E4h
+ push ebx
+ push esi
+ mov esi, [ebp+1Ch] ; pTC
+ movsx ebx, byte [esi+2]
+ push edi
+ movsx di,byte [esi+3]
+ mov word [esp+0Ch],bx
+ movsx bx,byte [esi+1]
+ movsx esi,byte [esi]
+ mov word [esp+0Eh],si
+ movzx esi,di
+ movd xmm1,esi
+ movzx esi,di
+ movd xmm2,esi
+ mov si,word [esp+0Ch]
+ mov edx, [ebp + 10h]
+ mov eax, [ebp + 08h]
+ movzx edi,si
+ movzx esi,si
+ mov ecx, [ebp + 0Ch]
+ movd xmm4,esi
+ movzx esi,bx
+ movd xmm5,esi
+ movd xmm3,edi
+ movzx esi,bx
+ movd xmm6,esi
+ mov si,word [esp+0Eh]
+ movzx edi,si
+ movzx esi,si
+ punpcklwd xmm6,xmm2
+ pxor xmm0,xmm0
+ movdqa [esp+40h],xmm0
+ movd xmm7,edi
+ movd xmm0,esi
+ lea esi,[edx+edx]
+ mov edi,eax
+ sub edi,esi
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ punpcklwd xmm0,xmm4
+ movq xmm4,[edx+ecx]
+ punpcklwd xmm7,xmm3
+ movq xmm3,[eax]
+ punpcklwd xmm0,xmm6
+ movq xmm6,[edi]
+ punpcklwd xmm7,xmm5
+ punpcklwd xmm0,xmm7
+ mov edi,ecx
+ sub edi,esi
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+60h],xmm2
+ movq xmm2, [edi]
+ punpcklqdq xmm6,xmm2
+ mov esi,eax
+ sub esi,edx
+ movq xmm7,[esi]
+ mov edi,ecx
+ sub edi,edx
+ movq xmm2,[edi]
+ punpcklqdq xmm7,xmm2
+ movq xmm2,[ecx]
+ punpcklqdq xmm3,xmm2
+ movq xmm2,[edx+eax]
+ movsx edx,word [ebp + 14h]
+ punpcklqdq xmm2,xmm4
+ movdqa [esp+0E0h],xmm2
+ movd xmm2,edx
+ movsx edx,word [ebp + 18h]
+ movdqa xmm4,xmm2
+ punpcklwd xmm4,xmm2
+ movd xmm2,edx
+ movdqa xmm5,xmm2
+ punpcklwd xmm5,xmm2
+ pshufd xmm2,xmm5,0
+ movdqa [esp+50h],xmm2
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ movdqa [esp+0D0h],xmm3
+ pshufd xmm4,xmm4,0
+ movdqa [esp+30h],xmm2
+ punpckhbw xmm6,xmm1
+ movdqa [esp+80h],xmm6
+ movdqa xmm6,[esp+0D0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+70h],xmm6
+ movdqa xmm6, [esp+0E0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+90h],xmm6
+ movdqa xmm5, [esp+0E0h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0A0h],xmm7
+ punpcklbw xmm3,xmm1
+ mov edx,4
+ punpcklbw xmm2,xmm1
+ movsx edx,dx
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,[esp+30h]
+ movdqa [esp+20h],xmm6
+ psubw xmm7,xmm5
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1,[esp+60h]
+ movdqa [esp+40h],xmm6
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6, [esp+20h]
+ movdqa xmm7, [esp+50h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6
+ movdqa [esp+10h],xmm0
+ movdqa xmm6, [esp+10h]
+ pminsw xmm6,xmm1
+ movdqa [esp+10h],xmm6
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm6,xmm4
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1, [esp+30h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1
+ movdqa xmm1,[esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5
+ movdqa xmm5,[esp+80h]
+ psubw xmm5,[esp+90h]
+ pand xmm6,xmm1
+ pand xmm6,[esp+40h]
+ movdqa xmm1,[esp+10h]
+ pand xmm1,xmm6
+ movdqa xmm6,[esp+70h]
+ movdqa [esp+30h],xmm1
+ movdqa xmm1,[esp+0A0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6,[esp+20h]
+ movdqa xmm5,[esp+60h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5
+ movdqa xmm5,[esp+70h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+80h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+90h]
+ pand xmm4,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+40h]
+ pand xmm0,xmm4
+ movdqa xmm4,[esp+30h]
+ paddw xmm2,xmm4
+ paddw xmm1,xmm0
+ packuswb xmm2,xmm1
+ movq [esi],xmm2
+ psubw xmm3,xmm4
+ psubw xmm5,xmm0
+ packuswb xmm3,xmm5
+ movq [eax],xmm3
+ psrldq xmm2,8
+ movq [edi],xmm2
+ pop edi
+ pop esi
+ psrldq xmm3,8
+ movq [ecx],xmm3
+ pop ebx
+ mov esp,ebp
+ pop ebp
+ ret
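
DeblockChromaLt4V_ssse3 (and the 64-bit DeblockChromaLt4H_ssse3 earlier in this file) is the weak (bS < 4) chroma filter: the psllw 2 / paddw / psraw 3 sequence forms the delta term and the pmaxsw/pminsw pair clamps it to ±tc before it is added to p0 and subtracted from q0. A scalar sketch of one plane follows; it assumes, as the tc > 0 gate in the code suggests, that the caller passes pTC with the chroma +1 already folded in, and the helper names are hypothetical.

    /* Scalar model of the bS<4 chroma path; a sketch, not part of deblock.asm. */
    #include <stdint.h>
    #include <stdlib.h>

    int32_t Clip3 (int32_t v, int32_t lo, int32_t hi) {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    void DeblockChromaLt4_scalar (uint8_t* pPix, int32_t iStride,
                                  int32_t iAlpha, int32_t iBeta, const int8_t* pTC) {
        for (int32_t x = 0; x < 8; x++) {
            int32_t iTc = pTC[x >> 1];               /* one tc byte per two samples */
            uint8_t* p  = pPix + x;
            int32_t p1 = p[-2 * iStride], p0 = p[-iStride];
            int32_t q0 = p[0],            q1 = p[iStride];
            if (iTc > 0 &&
                abs (p0 - q0) < iAlpha && abs (p1 - p0) < iBeta && abs (q1 - q0) < iBeta) {
                int32_t iDelta = Clip3 ((((q0 - p0) * 4) + (p1 - q1) + 4) >> 3, -iTc, iTc);
                p[-iStride] = (uint8_t) Clip3 (p0 + iDelta, 0, 255);  /* packuswb saturation */
                p[0]        = (uint8_t) Clip3 (q0 - iDelta, 0, 255);
            }
        }
    }

The H variants run exactly the same arithmetic after transposing the edge columns into rows; see the sketch after DeblockChromaEq4H_ssse3 below.
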
;***************************************************************************
; void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
@@ -3601,280 +3601,280 @@
;***************************************************************************
WELS_EXTERN DeblockChromaEq4H_ssse3
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0C8h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+18h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+7Ch]
- push edi
- mov dword [esp+14h],esi
- mov dword [esp+18h],ecx
- mov dword [esp+0Ch],edx
- mov dword [esp+10h],eax
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+0Ch]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+10h]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- movsx ecx,word [ebp+14h]
- movsx edx,word [ebp+18h]
- movdqa xmm6,[esp+80h]
- movdqa xmm4,[esp+90h]
- movdqa xmm5,[esp+0A0h]
- movdqa xmm7,[esp+0B0h]
- pxor xmm0,xmm0
- movd xmm1,ecx
- movdqa xmm2,xmm1
- punpcklwd xmm2,xmm1
- pshufd xmm1,xmm2,0
- movd xmm2,edx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3,xmm6
- punpckhbw xmm6,xmm0
- movdqa [esp+60h],xmm6
- movdqa xmm6,[esp+90h]
- punpckhbw xmm6,xmm0
- movdqa [esp+30h],xmm6
- movdqa xmm6,[esp+0A0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+40h],xmm6
- movdqa xmm6,[esp+0B0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+70h],xmm6
- punpcklbw xmm7,xmm0
- punpcklbw xmm4,xmm0
- punpcklbw xmm5,xmm0
- punpcklbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm6,xmm4
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- movdqa xmm0,xmm1
- pcmpgtw xmm0,xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm4
- pabsw xmm6,xmm6
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+30h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pcmpgtw xmm1,xmm6
- movdqa xmm6,[esp+60h]
- psubw xmm6,[esp+30h]
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+70h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pand xmm1,xmm7
- pcmpgtw xmm2,xmm6
- pand xmm1,xmm2
- mov eax,2
- movsx ecx,ax
- movd xmm2,ecx
- movdqa xmm6,xmm2
- punpcklwd xmm6,xmm2
- pshufd xmm2,xmm6,0
- movdqa [esp+20h],xmm2
- movdqa xmm2,xmm3
- paddw xmm2,xmm3
- paddw xmm2,xmm4
- paddw xmm2,[esp+50h]
- paddw xmm2,[esp+20h]
- psraw xmm2,2
- movdqa xmm6,xmm0
- pand xmm6,xmm2
- movdqa xmm2,xmm0
- pandn xmm2,xmm4
- por xmm6,xmm2
- movdqa xmm2,[esp+60h]
- movdqa xmm7,xmm2
- paddw xmm7,xmm2
- paddw xmm7,[esp+30h]
- paddw xmm7,[esp+70h]
- paddw xmm7,[esp+20h]
- movdqa xmm4,xmm1
- movdqa xmm2,xmm1
- pandn xmm2,[esp+30h]
- psraw xmm7,2
- pand xmm4,xmm7
- por xmm4,xmm2
- movdqa xmm2,[esp+50h]
- packuswb xmm6,xmm4
- movdqa [esp+90h],xmm6
- movdqa xmm6,xmm2
- paddw xmm6,xmm2
- movdqa xmm2,[esp+20h]
- paddw xmm6,xmm5
- paddw xmm6,xmm3
- movdqa xmm4,xmm0
- pandn xmm0,xmm5
- paddw xmm6,xmm2
- psraw xmm6,2
- pand xmm4,xmm6
- por xmm4,xmm0
- movdqa xmm0,[esp+70h]
- movdqa xmm5,xmm0
- paddw xmm5,xmm0
- movdqa xmm0,[esp+40h]
- paddw xmm5,xmm0
- paddw xmm5,[esp+60h]
- movdqa xmm3,xmm1
- paddw xmm5,xmm2
- psraw xmm5,2
- pand xmm3,xmm5
- pandn xmm1,xmm0
- por xmm3,xmm1
- packuswb xmm4,xmm3
- movdqa [esp+0A0h],xmm4
- mov esi,dword [esp+10h]
- movdqa xmm0,[esi]
- movdqa xmm1,[esi+10h]
- movdqa xmm2,[esi+20h]
- movdqa xmm3,[esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+0Ch]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0C8h
+ mov ecx,dword [ebp+8]
+ mov edx,dword [ebp+0Ch]
+ mov eax,dword [ebp+10h]
+ sub ecx,2
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2]
+ mov dword [esp+18h],ecx
+ mov dword [esp+4],edx
+ lea ecx,[ecx+eax*4]
+ lea edx,[edx+eax*4]
+ lea eax,[esp+7Ch]
+ push edi
+ mov dword [esp+14h],esi
+ mov dword [esp+18h],ecx
+ mov dword [esp+0Ch],edx
+ mov dword [esp+10h],eax
+ mov esi,dword [esp+1Ch]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+14h]
+ movd xmm0,dword [esi]
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h]
+ mov edi,dword [esp+0Ch]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+10h]
+ movdqa [edi],xmm0
+ movdqa [edi+10h],xmm5
+ movdqa [edi+20h],xmm1
+ movdqa [edi+30h],xmm6
+ movsx ecx,word [ebp+14h]
+ movsx edx,word [ebp+18h]
+ movdqa xmm6,[esp+80h]
+ movdqa xmm4,[esp+90h]
+ movdqa xmm5,[esp+0A0h]
+ movdqa xmm7,[esp+0B0h]
+ pxor xmm0,xmm0
+ movd xmm1,ecx
+ movdqa xmm2,xmm1
+ punpcklwd xmm2,xmm1
+ pshufd xmm1,xmm2,0
+ movd xmm2,edx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+ movdqa xmm3,xmm6
+ punpckhbw xmm6,xmm0
+ movdqa [esp+60h],xmm6
+ movdqa xmm6,[esp+90h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+30h],xmm6
+ movdqa xmm6,[esp+0A0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+40h],xmm6
+ movdqa xmm6,[esp+0B0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+70h],xmm6
+ punpcklbw xmm7,xmm0
+ punpcklbw xmm4,xmm0
+ punpcklbw xmm5,xmm0
+ punpcklbw xmm3,xmm0
+ movdqa [esp+50h],xmm7
+ movdqa xmm6,xmm4
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ movdqa xmm0,xmm1
+ pcmpgtw xmm0,xmm6
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm4
+ pabsw xmm6,xmm6
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+30h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pcmpgtw xmm1,xmm6
+ movdqa xmm6,[esp+60h]
+ psubw xmm6,[esp+30h]
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+70h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pand xmm1,xmm7
+ pcmpgtw xmm2,xmm6
+ pand xmm1,xmm2
+ mov eax,2
+ movsx ecx,ax
+ movd xmm2,ecx
+ movdqa xmm6,xmm2
+ punpcklwd xmm6,xmm2
+ pshufd xmm2,xmm6,0
+ movdqa [esp+20h],xmm2
+ movdqa xmm2,xmm3
+ paddw xmm2,xmm3
+ paddw xmm2,xmm4
+ paddw xmm2,[esp+50h]
+ paddw xmm2,[esp+20h]
+ psraw xmm2,2
+ movdqa xmm6,xmm0
+ pand xmm6,xmm2
+ movdqa xmm2,xmm0
+ pandn xmm2,xmm4
+ por xmm6,xmm2
+ movdqa xmm2,[esp+60h]
+ movdqa xmm7,xmm2
+ paddw xmm7,xmm2
+ paddw xmm7,[esp+30h]
+ paddw xmm7,[esp+70h]
+ paddw xmm7,[esp+20h]
+ movdqa xmm4,xmm1
+ movdqa xmm2,xmm1
+ pandn xmm2,[esp+30h]
+ psraw xmm7,2
+ pand xmm4,xmm7
+ por xmm4,xmm2
+ movdqa xmm2,[esp+50h]
+ packuswb xmm6,xmm4
+ movdqa [esp+90h],xmm6
+ movdqa xmm6,xmm2
+ paddw xmm6,xmm2
+ movdqa xmm2,[esp+20h]
+ paddw xmm6,xmm5
+ paddw xmm6,xmm3
+ movdqa xmm4,xmm0
+ pandn xmm0,xmm5
+ paddw xmm6,xmm2
+ psraw xmm6,2
+ pand xmm4,xmm6
+ por xmm4,xmm0
+ movdqa xmm0,[esp+70h]
+ movdqa xmm5,xmm0
+ paddw xmm5,xmm0
+ movdqa xmm0,[esp+40h]
+ paddw xmm5,xmm0
+ paddw xmm5,[esp+60h]
+ movdqa xmm3,xmm1
+ paddw xmm5,xmm2
+ psraw xmm5,2
+ pand xmm3,xmm5
+ pandn xmm1,xmm0
+ por xmm3,xmm1
+ packuswb xmm4,xmm3
+ movdqa [esp+0A0h],xmm4
+ mov esi,dword [esp+10h]
+ movdqa xmm0,[esi]
+ movdqa xmm1,[esi+10h]
+ movdqa xmm2,[esi+20h]
+ movdqa xmm3,[esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+1Ch]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+14h]
+ mov edi,dword [esp+8]
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+0Ch]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
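
The horizontal variants (DeblockChromaEq4H_ssse3 above and DeblockChromaLt4H_ssse3 below) filter a vertical edge, so the long punpcklbw/punpcklwd/punpckldq/punpcklqdq cascade is a byte transpose of the four columns straddling the edge into a stack buffer, the same V-style arithmetic runs on that buffer, and a second unpack pass followed by movd/psrldq stores scatters the result back. A scalar model of that round trip follows, reusing the hypothetical DeblockChromaEq4_scalar sketch from above and again handling one plane only.

    /* Scalar model of the transpose round trip used by the *H_ssse3 routines;
     * a sketch, not part of deblock.asm. */
    #include <stdint.h>

    void DeblockChromaEq4_scalar (uint8_t* pPix, int32_t iStride,
                                  int32_t iAlpha, int32_t iBeta);   /* sketch above */

    void DeblockChromaEq4H_scalar (uint8_t* pPix, int32_t iStride,
                                   int32_t iAlpha, int32_t iBeta) {
        uint8_t aTmp[4 * 8];                    /* 4 edge columns -> 4 rows of 8 */
        for (int32_t y = 0; y < 8; y++)
            for (int32_t x = 0; x < 4; x++)     /* gather p1 p0 | q0 q1 per row */
                aTmp[x * 8 + y] = pPix[y * iStride + x - 2];
        /* In the transposed buffer the edge looks horizontal with stride 8,
         * so the vertical-edge arithmetic applies unchanged. */
        DeblockChromaEq4_scalar (aTmp + 2 * 8, 8, iAlpha, iBeta);
        for (int32_t y = 0; y < 8; y++)
            for (int32_t x = 0; x < 4; x++)     /* scatter the filtered columns back */
                pPix[y * iStride + x - 2] = aTmp[x * 8 + y];
    }
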
;*******************************************************************************
; void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
@@ -3882,308 +3882,308 @@
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4H_ssse3
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,108h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+10h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+6Ch]
- push edi
- mov dword [esp+0Ch],esi
- mov dword [esp+18h],ecx
- mov dword [esp+10h],edx
- mov dword [esp+1Ch],eax
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+10h]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+1Ch]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- mov eax,dword [ebp+1Ch]
- movsx cx,byte [eax+3]
- movsx dx,byte [eax+2]
- movsx si,byte [eax+1]
- movsx ax,byte [eax]
- movzx edi,cx
- movzx ecx,cx
- movd xmm2,ecx
- movzx ecx,dx
- movzx edx,dx
- movd xmm3,ecx
- movd xmm4,edx
- movzx ecx,si
- movzx edx,si
- movd xmm5,ecx
- pxor xmm0,xmm0
- movd xmm6,edx
- movzx ecx,ax
- movdqa [esp+60h],xmm0
- movzx edx,ax
- movsx eax,word [ebp+14h]
- punpcklwd xmm6,xmm2
- movd xmm1,edi
- movd xmm7,ecx
- movsx ecx,word [ebp+18h]
- movd xmm0,edx
- punpcklwd xmm7,xmm3
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+60h]
- punpcklwd xmm7,xmm5
- movdqa xmm5,[esp+0A0h]
- punpcklwd xmm0,xmm4
- punpcklwd xmm0,xmm6
- movdqa xmm6, [esp+70h]
- punpcklwd xmm0,xmm7
- movdqa xmm7,[esp+80h]
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+0D0h],xmm2
- movd xmm2,eax
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm4,xmm3,0
- movd xmm2,ecx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3, [esp+90h]
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- movdqa [esp+40h],xmm2
- movdqa [esp+0B0h],xmm6
- movdqa xmm6,[esp+90h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm2,xmm1
- punpcklbw xmm3,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0F0h],xmm7
- movdqa [esp+0C0h],xmm6
- movdqa xmm6, [esp+0A0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+0E0h],xmm6
- mov edx,4
- movsx eax,dx
- movd xmm6,eax
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa [esp+30h],xmm6
- movdqa xmm7, [esp+40h]
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa [esp+60h],xmm6
- movdqa xmm1, [esp+0D0h]
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6,[esp+30h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa xmm7,[esp+50h]
- movdqa [esp+20h],xmm0
- movdqa xmm6, [esp+20h]
- pminsw xmm6,xmm1
- movdqa [esp+20h],xmm6
- movdqa xmm6,xmm4
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+40h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1, [esp+50h]
- pand xmm6,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5, [esp+0B0h]
- psubw xmm5,[esp+0E0h]
- pand xmm6,xmm1
- pand xmm6, [esp+60h]
- movdqa xmm1, [esp+20h]
- pand xmm1,xmm6
- movdqa xmm6, [esp+0C0h]
- movdqa [esp+40h],xmm1
- movdqa xmm1, [esp+0F0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6, [esp+30h]
- movdqa xmm5, [esp+0D0h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+0C0h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+0B0h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6, [esp+0E0h]
- pand xmm4,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+60h]
- pand xmm0,xmm4
- movdqa xmm4, [esp+40h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm2,xmm1
- packuswb xmm3,xmm5
- movdqa [esp+80h],xmm2
- movdqa [esp+90h],xmm3
- mov esi,dword [esp+1Ch]
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+10h]
- movdqa xmm2, [esi+20h]
- movdqa xmm3, [esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+10h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,108h
+ mov ecx,dword [ebp+8]
+ mov edx,dword [ebp+0Ch]
+ mov eax,dword [ebp+10h]
+ sub ecx,2
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2]
+ mov dword [esp+10h],ecx
+ mov dword [esp+4],edx
+ lea ecx,[ecx+eax*4]
+ lea edx,[edx+eax*4]
+ lea eax,[esp+6Ch]
+ push edi
+ mov dword [esp+0Ch],esi
+ mov dword [esp+18h],ecx
+ mov dword [esp+10h],edx
+ mov dword [esp+1Ch],eax
+ mov esi,dword [esp+14h]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+0Ch]
+ movd xmm0,dword [esi]
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h]
+ mov edi,dword [esp+10h]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+1Ch]
+ movdqa [edi],xmm0
+ movdqa [edi+10h],xmm5
+ movdqa [edi+20h],xmm1
+ movdqa [edi+30h],xmm6
+ mov eax,dword [ebp+1Ch]
+ movsx cx,byte [eax+3]
+ movsx dx,byte [eax+2]
+ movsx si,byte [eax+1]
+ movsx ax,byte [eax]
+ movzx edi,cx
+ movzx ecx,cx
+ movd xmm2,ecx
+ movzx ecx,dx
+ movzx edx,dx
+ movd xmm3,ecx
+ movd xmm4,edx
+ movzx ecx,si
+ movzx edx,si
+ movd xmm5,ecx
+ pxor xmm0,xmm0
+ movd xmm6,edx
+ movzx ecx,ax
+ movdqa [esp+60h],xmm0
+ movzx edx,ax
+ movsx eax,word [ebp+14h]
+ punpcklwd xmm6,xmm2
+ movd xmm1,edi
+ movd xmm7,ecx
+ movsx ecx,word [ebp+18h]
+ movd xmm0,edx
+ punpcklwd xmm7,xmm3
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+60h]
+ punpcklwd xmm7,xmm5
+ movdqa xmm5,[esp+0A0h]
+ punpcklwd xmm0,xmm4
+ punpcklwd xmm0,xmm6
+ movdqa xmm6, [esp+70h]
+ punpcklwd xmm0,xmm7
+ movdqa xmm7,[esp+80h]
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+0D0h],xmm2
+ movd xmm2,eax
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm4,xmm3,0
+ movd xmm2,ecx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+ movdqa xmm3, [esp+90h]
+ movdqa [esp+50h],xmm2
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa [esp+40h],xmm2
+ movdqa [esp+0B0h],xmm6
+ movdqa xmm6,[esp+90h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm2,xmm1
+ punpcklbw xmm3,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0F0h],xmm7
+ movdqa [esp+0C0h],xmm6
+ movdqa xmm6, [esp+0A0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+0E0h],xmm6
+ mov edx,4
+ movsx eax,dx
+ movd xmm6,eax
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa [esp+30h],xmm6
+ movdqa xmm7, [esp+40h]
+ psubw xmm7,xmm5
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa [esp+60h],xmm6
+ movdqa xmm1, [esp+0D0h]
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6,[esp+30h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6
+ movdqa xmm7,[esp+50h]
+ movdqa [esp+20h],xmm0
+ movdqa xmm6, [esp+20h]
+ pminsw xmm6,xmm1
+ movdqa [esp+20h],xmm6
+ movdqa xmm6,xmm4
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1, [esp+40h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1
+ movdqa xmm1, [esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5
+ movdqa xmm5, [esp+0B0h]
+ psubw xmm5,[esp+0E0h]
+ pand xmm6,xmm1
+ pand xmm6, [esp+60h]
+ movdqa xmm1, [esp+20h]
+ pand xmm1,xmm6
+ movdqa xmm6, [esp+0C0h]
+ movdqa [esp+40h],xmm1
+ movdqa xmm1, [esp+0F0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6, [esp+30h]
+ movdqa xmm5, [esp+0D0h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5
+ movdqa xmm5,[esp+0C0h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+0B0h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6, [esp+0E0h]
+ pand xmm4,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+60h]
+ pand xmm0,xmm4
+ movdqa xmm4, [esp+40h]
+ paddw xmm2,xmm4
+ paddw xmm1,xmm0
+ psubw xmm3,xmm4
+ psubw xmm5,xmm0
+ packuswb xmm2,xmm1
+ packuswb xmm3,xmm5
+ movdqa [esp+80h],xmm2
+ movdqa [esp+90h],xmm3
+ mov esi,dword [esp+1Ch]
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi+10h]
+ movdqa xmm2, [esi+20h]
+ movdqa xmm3, [esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+14h]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+0Ch]
+ mov edi,dword [esp+8]
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+10h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
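
The hunk that follows switches to the luma weak filter, DeblockLumaLt4V_ssse3 (only its first half appears in this part of the diff). Compared with the chroma Lt4 path it adds the |p2 - p0| < beta and |q2 - q0| < beta checks (the two extra pcmpgtw masks), raises tc by one for each check that passes (the two psubw of the all-ones masks), and conditionally softens p1 and q1 toward pavgw(p0, q0). A scalar sketch follows, reusing Clip3 from the chroma sketch above; the helper name is hypothetical and not part of this file.

    /* Scalar model of the bS<4 luma path; a sketch, not part of deblock.asm. */
    #include <stdint.h>
    #include <stdlib.h>

    int32_t Clip3 (int32_t v, int32_t lo, int32_t hi);   /* defined in the chroma sketch */

    void DeblockLumaLt4_scalar (uint8_t* pPix, int32_t iStride,
                                int32_t iAlpha, int32_t iBeta, const int8_t* pTC) {
        for (int32_t x = 0; x < 16; x++) {
            int32_t iTc0 = pTC[x >> 2];              /* one tc byte per four samples */
            uint8_t* p  = pPix + x;
            int32_t p2 = p[-3 * iStride], p1 = p[-2 * iStride], p0 = p[-iStride];
            int32_t q0 = p[0], q1 = p[iStride], q2 = p[2 * iStride];
            if (iTc0 < 0 ||
                abs (p0 - q0) >= iAlpha || abs (p1 - p0) >= iBeta || abs (q1 - q0) >= iBeta)
                continue;                            /* leave the column untouched */
            int32_t bAp = abs (p2 - p0) < iBeta;     /* extra small-gradient checks */
            int32_t bAq = abs (q2 - q0) < iBeta;
            int32_t iTc = iTc0 + bAp + bAq;
            int32_t iAvg   = (p0 + q0 + 1) >> 1;     /* the pavgw term */
            int32_t iDelta = Clip3 ((((q0 - p0) * 4) + (p1 - q1) + 4) >> 3, -iTc, iTc);
            p[-iStride] = (uint8_t) Clip3 (p0 + iDelta, 0, 255);
            p[0]        = (uint8_t) Clip3 (q0 - iDelta, 0, 255);
            if (bAp) p[-2 * iStride] = (uint8_t) (p1 + Clip3 ((p2 + iAvg - 2 * p1) >> 1, -iTc0, iTc0));
            if (bAq) p[iStride]      = (uint8_t) (q1 + Clip3 ((q2 + iAvg - 2 * q1) >> 1, -iTc0, iTc0));
        }
    }
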
@@ -4194,385 +4194,385 @@
WELS_EXTERN DeblockLumaLt4V_ssse3
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 420 ; 000001a4H
- mov eax, dword [ebp+8]
- mov ecx, dword [ebp+12]
+ push ebp
+ mov ebp, esp
+ and esp, -16 ; fffffff0H
+ sub esp, 420 ; 000001a4H
+ mov eax, dword [ebp+8]
+ mov ecx, dword [ebp+12]
- pxor xmm0, xmm0
- push ebx
- mov edx, dword [ebp+24]
- movdqa [esp+424-384], xmm0
- push esi
+ pxor xmm0, xmm0
+ push ebx
+ mov edx, dword [ebp+24]
+ movdqa [esp+424-384], xmm0
+ push esi
- lea esi, [ecx+ecx*2]
- push edi
- mov edi, eax
- sub edi, esi
- movdqa xmm0, [edi]
+ lea esi, [ecx+ecx*2]
+ push edi
+ mov edi, eax
+ sub edi, esi
+ movdqa xmm0, [edi]
- lea esi, [ecx+ecx]
- movdqa [esp+432-208], xmm0
- mov edi, eax
- sub edi, esi
- movdqa xmm0, [edi]
- movdqa [esp+448-208], xmm0
+ lea esi, [ecx+ecx]
+ movdqa [esp+432-208], xmm0
+ mov edi, eax
+ sub edi, esi
+ movdqa xmm0, [edi]
+ movdqa [esp+448-208], xmm0
- mov ebx, eax
- sub ebx, ecx
- movdqa xmm0, [ebx]
- movdqa [esp+464-208], xmm0
+ mov ebx, eax
+ sub ebx, ecx
+ movdqa xmm0, [ebx]
+ movdqa [esp+464-208], xmm0
- movdqa xmm0, [eax]
+ movdqa xmm0, [eax]
- add ecx, eax
- movdqa [esp+480-208], xmm0
- movdqa xmm0, [ecx]
- mov dword [esp+432-404], ecx
+ add ecx, eax
+ movdqa [esp+480-208], xmm0
+ movdqa xmm0, [ecx]
+ mov dword [esp+432-404], ecx
- movsx ecx, word [ebp+16]
- movdqa [esp+496-208], xmm0
- movdqa xmm0, [esi+eax]
+ movsx ecx, word [ebp+16]
+ movdqa [esp+496-208], xmm0
+ movdqa xmm0, [esi+eax]
- movsx si, byte [edx]
- movdqa [esp+512-208], xmm0
- movd xmm0, ecx
- movsx ecx, word [ebp+20]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- pshufd xmm0, xmm1, 0
- movdqa [esp+432-112], xmm0
- movd xmm0, ecx
- movsx cx, byte [edx+1]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- mov dword [esp+432-408], ebx
- movzx ebx, cx
- pshufd xmm0, xmm1, 0
- movd xmm1, ebx
- movzx ebx, cx
- movd xmm2, ebx
- movzx ebx, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, si
- movd xmm5, ecx
- movzx ecx, si
- movd xmm6, ecx
- movzx ecx, si
- movd xmm7, ecx
- movzx ecx, si
- movdqa [esp+432-336], xmm0
- movd xmm0, ecx
+ movsx si, byte [edx]
+ movdqa [esp+512-208], xmm0
+ movd xmm0, ecx
+ movsx ecx, word [ebp+20]
+ movdqa xmm1, xmm0
+ punpcklwd xmm1, xmm0
+ pshufd xmm0, xmm1, 0
+ movdqa [esp+432-112], xmm0
+ movd xmm0, ecx
+ movsx cx, byte [edx+1]
+ movdqa xmm1, xmm0
+ punpcklwd xmm1, xmm0
+ mov dword [esp+432-408], ebx
+ movzx ebx, cx
+ pshufd xmm0, xmm1, 0
+ movd xmm1, ebx
+ movzx ebx, cx
+ movd xmm2, ebx
+ movzx ebx, cx
+ movzx ecx, cx
+ movd xmm4, ecx
+ movzx ecx, si
+ movd xmm5, ecx
+ movzx ecx, si
+ movd xmm6, ecx
+ movzx ecx, si
+ movd xmm7, ecx
+ movzx ecx, si
+ movdqa [esp+432-336], xmm0
+ movd xmm0, ecx
- movsx cx, byte [edx+3]
- movsx dx, byte [edx+2]
- movd xmm3, ebx
- punpcklwd xmm0, xmm4
- movzx esi, cx
- punpcklwd xmm6, xmm2
- punpcklwd xmm5, xmm1
- punpcklwd xmm0, xmm6
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- punpcklwd xmm0, xmm7
- movdqa [esp+432-400], xmm0
- movd xmm0, esi
- movzx esi, cx
- movd xmm2, esi
- movzx esi, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, dx
- movd xmm3, esi
- movd xmm5, ecx
- punpcklwd xmm5, xmm0
+ movsx cx, byte [edx+3]
+ movsx dx, byte [edx+2]
+ movd xmm3, ebx
+ punpcklwd xmm0, xmm4
+ movzx esi, cx
+ punpcklwd xmm6, xmm2
+ punpcklwd xmm5, xmm1
+ punpcklwd xmm0, xmm6
+ punpcklwd xmm7, xmm3
+ punpcklwd xmm7, xmm5
+ punpcklwd xmm0, xmm7
+ movdqa [esp+432-400], xmm0
+ movd xmm0, esi
+ movzx esi, cx
+ movd xmm2, esi
+ movzx esi, cx
+ movzx ecx, cx
+ movd xmm4, ecx
+ movzx ecx, dx
+ movd xmm3, esi
+ movd xmm5, ecx
+ punpcklwd xmm5, xmm0
- movdqa xmm0, [esp+432-384]
- movzx ecx, dx
- movd xmm6, ecx
- movzx ecx, dx
- movzx edx, dx
- punpcklwd xmm6, xmm2
- movd xmm7, ecx
- movd xmm1, edx
+ movdqa xmm0, [esp+432-384]
+ movzx ecx, dx
+ movd xmm6, ecx
+ movzx ecx, dx
+ movzx edx, dx
+ punpcklwd xmm6, xmm2
+ movd xmm7, ecx
+ movd xmm1, edx
- movdqa xmm2, [esp+448-208]
- punpcklbw xmm2, xmm0
+ movdqa xmm2, [esp+448-208]
+ punpcklbw xmm2, xmm0
- mov ecx, 4
- movsx edx, cx
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- movdqa xmm5, [esp+496-208]
- movdqa xmm3, [esp+464-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-240], xmm5
- movdqa xmm5, [esp+512-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-352], xmm5
- punpcklwd xmm1, xmm4
- movdqa xmm4, [esp+432-208]
- punpcklwd xmm1, xmm6
- movdqa xmm6, [esp+480-208]
- punpcklwd xmm1, xmm7
- punpcklbw xmm6, xmm0
- punpcklbw xmm3, xmm0
- punpcklbw xmm4, xmm0
- movdqa xmm7, xmm3
- psubw xmm7, xmm4
- pabsw xmm7, xmm7
- movdqa [esp+432-272], xmm4
- movdqa xmm4, [esp+432-336]
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-352]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5
- movdqa xmm5, xmm3
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5
- movdqa xmm5, [esp+432-400]
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5
- movdqa xmm5, xmm6
- psubw xmm5, xmm3
- movdqa [esp+432-32], xmm6
- psubw xmm6, [esp+432-240]
- movdqa xmm7, xmm5
- movdqa [esp+432-384], xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6
+ mov ecx, 4
+ movsx edx, cx
+ punpcklwd xmm7, xmm3
+ punpcklwd xmm7, xmm5
+ movdqa xmm5, [esp+496-208]
+ movdqa xmm3, [esp+464-208]
+ punpcklbw xmm5, xmm0
+ movdqa [esp+432-240], xmm5
+ movdqa xmm5, [esp+512-208]
+ punpcklbw xmm5, xmm0
+ movdqa [esp+432-352], xmm5
+ punpcklwd xmm1, xmm4
+ movdqa xmm4, [esp+432-208]
+ punpcklwd xmm1, xmm6
+ movdqa xmm6, [esp+480-208]
+ punpcklwd xmm1, xmm7
+ punpcklbw xmm6, xmm0
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+ movdqa xmm7, xmm3
+ psubw xmm7, xmm4
+ pabsw xmm7, xmm7
+ movdqa [esp+432-272], xmm4
+ movdqa xmm4, [esp+432-336]
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-288], xmm5
+ movdqa xmm7, xmm6
+ psubw xmm7, [esp+432-352]
+ pabsw xmm7, xmm7
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-256], xmm5
+ movdqa xmm5, xmm3
+ pavgw xmm5, xmm6
+ movdqa [esp+432-304], xmm5
+ movdqa xmm5, [esp+432-400]
+ psubw xmm5, [esp+432-288]
+ psubw xmm5, [esp+432-256]
+ movdqa [esp+432-224], xmm5
+ movdqa xmm5, xmm6
+ psubw xmm5, xmm3
+ movdqa [esp+432-32], xmm6
+ psubw xmm6, [esp+432-240]
+ movdqa xmm7, xmm5
+ movdqa [esp+432-384], xmm5
+ movdqa xmm5, [esp+432-112]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm5, xmm7
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm4
+ pcmpgtw xmm7, xmm6
- pand xmm5, xmm7
- movdqa xmm6, xmm3
- psubw xmm6, xmm2
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+432-400]
- pand xmm5, xmm7
- movdqa xmm7, xmm6
- pcmpeqw xmm6, xmm0
- pcmpgtw xmm7, xmm0
- por xmm7, xmm6
- pand xmm5, xmm7
- movdqa [esp+432-320], xmm5
- movd xmm5, edx
- movdqa xmm6, xmm5
- punpcklwd xmm6, xmm5
- pshufd xmm5, xmm6, 0
- movdqa [esp+432-336], xmm5
- movdqa xmm5, [esp+432-224]
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm0
- psubw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- psllw xmm5, 2
- movdqa xmm7, xmm2
- psubw xmm7, [esp+432-240]
- paddw xmm7, xmm5
- paddw xmm7, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- psraw xmm7, 3
- pmaxsw xmm6, xmm7
- pminsw xmm5, xmm6
+ pand xmm5, xmm7
+ movdqa xmm6, xmm3
+ psubw xmm6, xmm2
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm4
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+432-400]
+ pand xmm5, xmm7
+ movdqa xmm7, xmm6
+ pcmpeqw xmm6, xmm0
+ pcmpgtw xmm7, xmm0
+ por xmm7, xmm6
+ pand xmm5, xmm7
+ movdqa [esp+432-320], xmm5
+ movd xmm5, edx
+ movdqa xmm6, xmm5
+ punpcklwd xmm6, xmm5
+ pshufd xmm5, xmm6, 0
+ movdqa [esp+432-336], xmm5
+ movdqa xmm5, [esp+432-224]
+ movdqa [esp+432-368], xmm5
+ movdqa xmm6, xmm0
+ psubw xmm6, xmm5
+ movdqa xmm5, [esp+432-384]
+ psllw xmm5, 2
+ movdqa xmm7, xmm2
+ psubw xmm7, [esp+432-240]
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+432-336]
+ movdqa xmm5, [esp+432-368]
+ psraw xmm7, 3
+ pmaxsw xmm6, xmm7
+ pminsw xmm5, xmm6
- pand xmm5, [esp+432-320]
- movdqa xmm6, [esp+432-400]
- movdqa [esp+432-64], xmm5
- movdqa [esp+432-384], xmm6
- movdqa xmm5, xmm0
- psubw xmm5, xmm6
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm5, [esp+432-304]
- movdqa xmm7, xmm2
- paddw xmm7, xmm2
- psubw xmm5, xmm7
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- pminsw xmm5, xmm6
+ pand xmm5, [esp+432-320]
+ movdqa xmm6, [esp+432-400]
+ movdqa [esp+432-64], xmm5
+ movdqa [esp+432-384], xmm6
+ movdqa xmm5, xmm0
+ psubw xmm5, xmm6
+ movdqa [esp+432-368], xmm5
+ movdqa xmm6, xmm5
+ movdqa xmm5, [esp+432-272]
+ paddw xmm5, [esp+432-304]
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm2
+ psubw xmm5, xmm7
+ psraw xmm5, 1
+ pmaxsw xmm6, xmm5
+ movdqa xmm5, [esp+432-384]
+ pminsw xmm5, xmm6
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-288]
- movdqa xmm6, [esp+432-240]
- movdqa [esp+432-96], xmm5
- movdqa xmm5, [esp+432-352]
- paddw xmm5, [esp+432-304]
- movdqa xmm7, xmm6
- paddw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
- psubw xmm5, xmm7
+ pand xmm5, [esp+432-320]
+ pand xmm5, [esp+432-288]
+ movdqa xmm6, [esp+432-240]
+ movdqa [esp+432-96], xmm5
+ movdqa xmm5, [esp+432-352]
+ paddw xmm5, [esp+432-304]
+ movdqa xmm7, xmm6
+ paddw xmm7, xmm6
+ movdqa xmm6, [esp+432-368]
+ psubw xmm5, xmm7
- movdqa xmm7, [esp+496-208]
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-400]
- pminsw xmm5, xmm6
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-256]
- movdqa xmm6, [esp+448-208]
- punpckhbw xmm7, xmm0
- movdqa [esp+432-352], xmm7
+ movdqa xmm7, [esp+496-208]
+ psraw xmm5, 1
+ pmaxsw xmm6, xmm5
+ movdqa xmm5, [esp+432-400]
+ pminsw xmm5, xmm6
+ pand xmm5, [esp+432-320]
+ pand xmm5, [esp+432-256]
+ movdqa xmm6, [esp+448-208]
+ punpckhbw xmm7, xmm0
+ movdqa [esp+432-352], xmm7
- movdqa xmm7, [esp+512-208]
- punpckhbw xmm6, xmm0
- movdqa [esp+432-48], xmm5
- movdqa xmm5, [esp+432-208]
- movdqa [esp+432-368], xmm6
- movdqa xmm6, [esp+464-208]
- punpckhbw xmm7, xmm0
- punpckhbw xmm5, xmm0
- movdqa [esp+432-384], xmm7
- punpckhbw xmm6, xmm0
- movdqa [esp+432-400], xmm6
+ movdqa xmm7, [esp+512-208]
+ punpckhbw xmm6, xmm0
+ movdqa [esp+432-48], xmm5
+ movdqa xmm5, [esp+432-208]
+ movdqa [esp+432-368], xmm6
+ movdqa xmm6, [esp+464-208]
+ punpckhbw xmm7, xmm0
+ punpckhbw xmm5, xmm0
+ movdqa [esp+432-384], xmm7
+ punpckhbw xmm6, xmm0
+ movdqa [esp+432-400], xmm6
- movdqa xmm7, [esp+432-400]
- movdqa xmm6, [esp+480-208]
- psubw xmm7, xmm5
- movdqa [esp+432-16], xmm5
- pabsw xmm7, xmm7
- punpckhbw xmm6, xmm0
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5
+ movdqa xmm7, [esp+432-400]
+ movdqa xmm6, [esp+480-208]
+ psubw xmm7, xmm5
+ movdqa [esp+432-16], xmm5
+ pabsw xmm7, xmm7
+ punpckhbw xmm6, xmm0
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-288], xmm5
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-384]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5
+ movdqa xmm7, xmm6
+ psubw xmm7, [esp+432-384]
+ pabsw xmm7, xmm7
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-256], xmm5
- movdqa xmm5, [esp+432-400]
- movdqa [esp+432-80], xmm6
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5
+ movdqa xmm5, [esp+432-400]
+ movdqa [esp+432-80], xmm6
+ pavgw xmm5, xmm6
+ movdqa [esp+432-304], xmm5
- movdqa xmm5, xmm1
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5
- movdqa xmm5, xmm6
- psubw xmm5, [esp+432-400]
- psubw xmm6, [esp+432-352]
- movdqa [esp+432-272], xmm5
- movdqa xmm7, xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7
- movdqa xmm7, xmm4
- pabsw xmm6, xmm6
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
+ movdqa xmm5, xmm1
+ psubw xmm5, [esp+432-288]
+ psubw xmm5, [esp+432-256]
+ movdqa [esp+432-224], xmm5
+ movdqa xmm5, xmm6
+ psubw xmm5, [esp+432-400]
+ psubw xmm6, [esp+432-352]
+ movdqa [esp+432-272], xmm5
+ movdqa xmm7, xmm5
+ movdqa xmm5, [esp+432-112]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm5, xmm7
+ movdqa xmm7, xmm4
+ pabsw xmm6, xmm6
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+432-368]
- pand xmm5, xmm7
- movdqa xmm7, [esp+432-400]
- psubw xmm7, xmm6
- psubw xmm6, [esp+432-352]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
- pand xmm5, xmm4
+ pand xmm5, xmm7
+ movdqa xmm7, [esp+432-400]
+ psubw xmm7, xmm6
+ psubw xmm6, [esp+432-352]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
+ pand xmm5, xmm4
- paddw xmm2, [esp+432-96]
- movdqa xmm4, xmm1
- pcmpgtw xmm4, xmm0
- movdqa xmm7, xmm1
- pcmpeqw xmm7, xmm0
- por xmm4, xmm7
- pand xmm5, xmm4
- movdqa xmm4, [esp+432-224]
- movdqa [esp+432-320], xmm5
- movdqa xmm5, [esp+432-272]
- movdqa xmm7, xmm0
- psubw xmm7, xmm4
- psubw xmm0, xmm1
- psllw xmm5, 2
- paddw xmm6, xmm5
- paddw xmm6, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- movdqa [esp+432-336], xmm0
- psraw xmm6, 3
- pmaxsw xmm7, xmm6
- pminsw xmm4, xmm7
- pand xmm4, [esp+432-320]
- movdqa xmm6, xmm0
- movdqa xmm0, [esp+432-16]
- paddw xmm0, [esp+432-304]
- movdqa [esp+432-272], xmm4
- movdqa xmm4, [esp+432-368]
- paddw xmm4, xmm4
- psubw xmm0, xmm4
+ paddw xmm2, [esp+432-96]
+ movdqa xmm4, xmm1
+ pcmpgtw xmm4, xmm0
+ movdqa xmm7, xmm1
+ pcmpeqw xmm7, xmm0
+ por xmm4, xmm7
+ pand xmm5, xmm4
+ movdqa xmm4, [esp+432-224]
+ movdqa [esp+432-320], xmm5
+ movdqa xmm5, [esp+432-272]
+ movdqa xmm7, xmm0
+ psubw xmm7, xmm4
+ psubw xmm0, xmm1
+ psllw xmm5, 2
+ paddw xmm6, xmm5
+ paddw xmm6, [esp+432-336]
+ movdqa xmm5, [esp+432-368]
+ movdqa [esp+432-336], xmm0
+ psraw xmm6, 3
+ pmaxsw xmm7, xmm6
+ pminsw xmm4, xmm7
+ pand xmm4, [esp+432-320]
+ movdqa xmm6, xmm0
+ movdqa xmm0, [esp+432-16]
+ paddw xmm0, [esp+432-304]
+ movdqa [esp+432-272], xmm4
+ movdqa xmm4, [esp+432-368]
+ paddw xmm4, xmm4
+ psubw xmm0, xmm4
- movdqa xmm4, [esp+432-64]
- psraw xmm0, 1
- pmaxsw xmm6, xmm0
- movdqa xmm0, [esp+432-400]
- movdqa xmm7, xmm1
- pminsw xmm7, xmm6
- movdqa xmm6, [esp+432-320]
- pand xmm7, xmm6
- pand xmm7, [esp+432-288]
- paddw xmm5, xmm7
- packuswb xmm2, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm0, xmm5
- paddw xmm3, xmm4
- packuswb xmm3, xmm0
+ movdqa xmm4, [esp+432-64]
+ psraw xmm0, 1
+ pmaxsw xmm6, xmm0
+ movdqa xmm0, [esp+432-400]
+ movdqa xmm7, xmm1
+ pminsw xmm7, xmm6
+ movdqa xmm6, [esp+432-320]
+ pand xmm7, xmm6
+ pand xmm7, [esp+432-288]
+ paddw xmm5, xmm7
+ packuswb xmm2, xmm5
+ movdqa xmm5, [esp+432-272]
+ paddw xmm0, xmm5
+ paddw xmm3, xmm4
+ packuswb xmm3, xmm0
- movdqa xmm0, [esp+432-32]
- psubw xmm0, xmm4
- movdqa xmm4, [esp+432-80]
- psubw xmm4, xmm5
+ movdqa xmm0, [esp+432-32]
+ psubw xmm0, xmm4
+ movdqa xmm4, [esp+432-80]
+ psubw xmm4, xmm5
- movdqa xmm5, [esp+432-240]
- paddw xmm5, [esp+432-48]
- packuswb xmm0, xmm4
- movdqa xmm4, [esp+432-384]
- paddw xmm4, [esp+432-304]
- movdqa [esp+480-208], xmm0
- movdqa xmm0, [esp+432-352]
- movdqa xmm7, xmm0
- paddw xmm0, xmm0
+ movdqa xmm5, [esp+432-240]
+ paddw xmm5, [esp+432-48]
+ packuswb xmm0, xmm4
+ movdqa xmm4, [esp+432-384]
+ paddw xmm4, [esp+432-304]
+ movdqa [esp+480-208], xmm0
+ movdqa xmm0, [esp+432-352]
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm0
- mov ecx, dword [esp+432-408]
+ mov ecx, dword [esp+432-408]
- mov edx, dword [esp+432-404]
- psubw xmm4, xmm0
- movdqa xmm0, [esp+432-336]
- movdqa [edi], xmm2
- psraw xmm4, 1
- pmaxsw xmm0, xmm4
- pminsw xmm1, xmm0
- movdqa xmm0, [esp+480-208]
+ mov edx, dword [esp+432-404]
+ psubw xmm4, xmm0
+ movdqa xmm0, [esp+432-336]
+ movdqa [edi], xmm2
+ psraw xmm4, 1
+ pmaxsw xmm0, xmm4
+ pminsw xmm1, xmm0
+ movdqa xmm0, [esp+480-208]
- pop edi
- pand xmm1, xmm6
- pand xmm1, [esp+428-256]
- movdqa [ecx], xmm3
- paddw xmm7, xmm1
- pop esi
- packuswb xmm5, xmm7
- movdqa [eax], xmm0
- movdqa [edx], xmm5
- pop ebx
- mov esp, ebp
- pop ebp
- ret
+ pop edi
+ pand xmm1, xmm6
+ pand xmm1, [esp+428-256]
+ movdqa [ecx], xmm3
+ paddw xmm7, xmm1
+ pop esi
+ packuswb xmm5, xmm7
+ movdqa [eax], xmm0
+ movdqa [edx], xmm5
+ pop ebx
+ mov esp, ebp
+ pop ebp
+ ret
;*******************************************************************************
@@ -4583,542 +4583,542 @@
WELS_EXTERN DeblockLumaEq4V_ssse3
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 628 ; 00000274H
- mov eax, dword [ebp+8]
- mov ecx, dword [ebp+12]
- push ebx
- push esi
+ push ebp
+ mov ebp, esp
+ and esp, -16 ; fffffff0H
+ sub esp, 628 ; 00000274H
+ mov eax, dword [ebp+8]
+ mov ecx, dword [ebp+12]
+ push ebx
+ push esi
- lea edx, [ecx*4]
- pxor xmm0, xmm0
- movdqa xmm2, xmm0
+ lea edx, [ecx*4]
+ pxor xmm0, xmm0
+ movdqa xmm2, xmm0
- movdqa xmm0, [ecx+eax]
- mov esi, eax
- sub esi, edx
- movdqa xmm3, [esi]
- movdqa xmm5, [eax]
- push edi
- lea edi, [ecx+ecx]
- lea ebx, [ecx+ecx*2]
- mov dword [esp+640-600], edi
- mov esi, eax
- sub esi, edi
- movdqa xmm1, [esi]
- movdqa [esp+720-272], xmm0
- mov edi, eax
- sub edi, ecx
- movdqa xmm4, [edi]
- add ecx, eax
- mov dword [esp+640-596], ecx
+ movdqa xmm0, [ecx+eax]
+ mov esi, eax
+ sub esi, edx
+ movdqa xmm3, [esi]
+ movdqa xmm5, [eax]
+ push edi
+ lea edi, [ecx+ecx]
+ lea ebx, [ecx+ecx*2]
+ mov dword [esp+640-600], edi
+ mov esi, eax
+ sub esi, edi
+ movdqa xmm1, [esi]
+ movdqa [esp+720-272], xmm0
+ mov edi, eax
+ sub edi, ecx
+ movdqa xmm4, [edi]
+ add ecx, eax
+ mov dword [esp+640-596], ecx
- mov ecx, dword [esp+640-600]
- movdqa xmm0, [ecx+eax]
- movdqa [esp+736-272], xmm0
+ mov ecx, dword [esp+640-600]
+ movdqa xmm0, [ecx+eax]
+ movdqa [esp+736-272], xmm0
- movdqa xmm0, [eax+ebx]
- mov edx, eax
- sub edx, ebx
+ movdqa xmm0, [eax+ebx]
+ mov edx, eax
+ sub edx, ebx
- movsx ebx, word [ebp+16]
- movdqa xmm6, [edx]
- add ecx, eax
- movdqa [esp+752-272], xmm0
- movd xmm0, ebx
+ movsx ebx, word [ebp+16]
+ movdqa xmm6, [edx]
+ add ecx, eax
+ movdqa [esp+752-272], xmm0
+ movd xmm0, ebx
- movsx ebx, word [ebp+20]
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
- movdqa [esp+640-320], xmm0
- movd xmm0, ebx
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
+ movsx ebx, word [ebp+20]
+ movdqa xmm7, xmm0
+ punpcklwd xmm7, xmm0
+ pshufd xmm0, xmm7, 0
+ movdqa [esp+640-320], xmm0
+ movd xmm0, ebx
+ movdqa xmm7, xmm0
+ punpcklwd xmm7, xmm0
+ pshufd xmm0, xmm7, 0
- movdqa xmm7, [esp+736-272]
- punpcklbw xmm7, xmm2
- movdqa [esp+640-416], xmm7
- movdqa [esp+640-512], xmm0
- movdqa xmm0, xmm1
- movdqa [esp+672-272], xmm1
- movdqa xmm1, xmm4
- movdqa [esp+704-272], xmm5
- punpcklbw xmm5, xmm2
- punpcklbw xmm1, xmm2
+ movdqa xmm7, [esp+736-272]
+ punpcklbw xmm7, xmm2
+ movdqa [esp+640-416], xmm7
+ movdqa [esp+640-512], xmm0
+ movdqa xmm0, xmm1
+ movdqa [esp+672-272], xmm1
+ movdqa xmm1, xmm4
+ movdqa [esp+704-272], xmm5
+ punpcklbw xmm5, xmm2
+ punpcklbw xmm1, xmm2
- movdqa xmm7, xmm5
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- punpcklbw xmm0, xmm2
- movdqa [esp+688-272], xmm4
- movdqa xmm4, [esp+720-272]
- movdqa [esp+640-480], xmm0
+ movdqa xmm7, xmm5
+ psubw xmm7, xmm1
+ pabsw xmm7, xmm7
+ movdqa [esp+640-560], xmm7
+ punpcklbw xmm0, xmm2
+ movdqa [esp+688-272], xmm4
+ movdqa xmm4, [esp+720-272]
+ movdqa [esp+640-480], xmm0
- movdqa xmm7, xmm1
- psubw xmm7, xmm0
+ movdqa xmm7, xmm1
+ psubw xmm7, xmm0
- movdqa xmm0, [esp+640-512]
- pabsw xmm7, xmm7
- punpcklbw xmm4, xmm2
- pcmpgtw xmm0, xmm7
- movdqa [esp+640-384], xmm4
- movdqa xmm7, xmm5
- psubw xmm7, xmm4
- movdqa xmm4, [esp+640-512]
- movdqa [esp+656-272], xmm6
- punpcklbw xmm6, xmm2
- pabsw xmm7, xmm7
- movdqa [esp+640-48], xmm2
- movdqa [esp+640-368], xmm6
- movdqa [esp+640-144], xmm1
- movdqa [esp+640-400], xmm5
- pcmpgtw xmm4, xmm7
- pand xmm0, xmm4
- movdqa xmm4, [esp+640-320]
- pcmpgtw xmm4, [esp+640-560]
- pand xmm0, xmm4
+ movdqa xmm0, [esp+640-512]
+ pabsw xmm7, xmm7
+ punpcklbw xmm4, xmm2
+ pcmpgtw xmm0, xmm7
+ movdqa [esp+640-384], xmm4
+ movdqa xmm7, xmm5
+ psubw xmm7, xmm4
+ movdqa xmm4, [esp+640-512]
+ movdqa [esp+656-272], xmm6
+ punpcklbw xmm6, xmm2
+ pabsw xmm7, xmm7
+ movdqa [esp+640-48], xmm2
+ movdqa [esp+640-368], xmm6
+ movdqa [esp+640-144], xmm1
+ movdqa [esp+640-400], xmm5
+ pcmpgtw xmm4, xmm7
+ pand xmm0, xmm4
+ movdqa xmm4, [esp+640-320]
+ pcmpgtw xmm4, [esp+640-560]
+ pand xmm0, xmm4
- mov ebx, 2
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, [esp+640-320]
- psraw xmm4, 2
- pshufd xmm7, xmm7, 0
- paddw xmm4, xmm7
- movdqa [esp+640-576], xmm4
- pcmpgtw xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
+ mov ebx, 2
+ movsx ebx, bx
+ movd xmm4, ebx
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm4
+ movdqa xmm4, [esp+640-320]
+ psraw xmm4, 2
+ pshufd xmm7, xmm7, 0
+ paddw xmm4, xmm7
+ movdqa [esp+640-576], xmm4
+ pcmpgtw xmm4, [esp+640-560]
+ movdqa [esp+640-560], xmm4
- movdqa xmm4, [esp+640-512]
- movdqa [esp+640-624], xmm7
- movdqa xmm7, xmm1
- psubw xmm7, xmm6
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
+ movdqa xmm4, [esp+640-512]
+ movdqa [esp+640-624], xmm7
+ movdqa xmm7, xmm1
+ psubw xmm7, xmm6
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
- pand xmm4, [esp+640-560]
- movdqa [esp+640-544], xmm4
- movdqa xmm4, [esp+640-512]
- movdqa xmm7, xmm5
- psubw xmm7, [esp+640-416]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
+ pand xmm4, [esp+640-560]
+ movdqa [esp+640-544], xmm4
+ movdqa xmm4, [esp+640-512]
+ movdqa xmm7, xmm5
+ psubw xmm7, [esp+640-416]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
- pand xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
+ pand xmm4, [esp+640-560]
+ movdqa [esp+640-560], xmm4
- movdqa xmm4, [esp+640-544]
- pandn xmm4, xmm6
- movdqa [esp+640-16], xmm4
- mov ebx, 4
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, xmm3
- punpcklbw xmm4, xmm2
- psllw xmm4, 1
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, [esp+640-480]
+ movdqa xmm4, [esp+640-544]
+ pandn xmm4, xmm6
+ movdqa [esp+640-16], xmm4
+ mov ebx, 4
+ movsx ebx, bx
+ movd xmm4, ebx
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm4
+ movdqa xmm4, xmm3
+ punpcklbw xmm4, xmm2
+ psllw xmm4, 1
+ paddw xmm4, xmm6
+ paddw xmm4, xmm6
+ paddw xmm4, xmm6
+ paddw xmm4, [esp+640-480]
- movdqa xmm6, [esp+640-560]
- pshufd xmm7, xmm7, 0
- paddw xmm4, xmm1
- movdqa [esp+640-592], xmm7
- paddw xmm4, xmm5
- paddw xmm4, xmm7
- movdqa xmm7, [esp+640-416]
- pandn xmm6, xmm7
- movdqa [esp+640-80], xmm6
- movdqa xmm6, [esp+752-272]
- punpcklbw xmm6, xmm2
- psllw xmm6, 1
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-384]
+ movdqa xmm6, [esp+640-560]
+ pshufd xmm7, xmm7, 0
+ paddw xmm4, xmm1
+ movdqa [esp+640-592], xmm7
+ paddw xmm4, xmm5
+ paddw xmm4, xmm7
+ movdqa xmm7, [esp+640-416]
+ pandn xmm6, xmm7
+ movdqa [esp+640-80], xmm6
+ movdqa xmm6, [esp+752-272]
+ punpcklbw xmm6, xmm2
+ psllw xmm6, 1
+ paddw xmm6, xmm7
+ paddw xmm6, xmm7
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-384]
- movdqa xmm7, [esp+640-480]
- paddw xmm6, xmm5
- paddw xmm6, xmm1
- paddw xmm6, [esp+640-592]
- psraw xmm6, 3
- pand xmm6, [esp+640-560]
- movdqa [esp+640-112], xmm6
- movdqa xmm6, [esp+640-544]
- pandn xmm6, xmm7
- movdqa [esp+640-336], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-528], xmm6
- movdqa xmm6, [esp+640-368]
- paddw xmm6, xmm7
- movdqa xmm7, xmm1
- psraw xmm4, 3
- pand xmm4, [esp+640-544]
- paddw xmm7, xmm5
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
+ movdqa xmm7, [esp+640-480]
+ paddw xmm6, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, [esp+640-592]
+ psraw xmm6, 3
+ pand xmm6, [esp+640-560]
+ movdqa [esp+640-112], xmm6
+ movdqa xmm6, [esp+640-544]
+ pandn xmm6, xmm7
+ movdqa [esp+640-336], xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa [esp+640-528], xmm6
+ movdqa xmm6, [esp+640-368]
+ paddw xmm6, xmm7
+ movdqa xmm7, xmm1
+ psraw xmm4, 3
+ pand xmm4, [esp+640-544]
+ paddw xmm7, xmm5
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-624]
+ movdqa xmm7, [esp+640-528]
- paddw xmm5, xmm1
- psraw xmm6, 2
- pand xmm7, xmm6
+ paddw xmm5, xmm1
+ psraw xmm6, 2
+ pand xmm7, xmm6
- movdqa xmm6, [esp+640-384]
- movdqa [esp+640-64], xmm7
- movdqa xmm7, [esp+640-560]
- pandn xmm7, xmm6
- movdqa [esp+640-304], xmm7
- movdqa xmm7, [esp+640-560]
- movdqa [esp+640-528], xmm7
- movdqa xmm7, [esp+640-416]
- paddw xmm7, xmm6
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pand xmm5, xmm7
- movdqa [esp+640-32], xmm5
+ movdqa xmm6, [esp+640-384]
+ movdqa [esp+640-64], xmm7
+ movdqa xmm7, [esp+640-560]
+ pandn xmm7, xmm6
+ movdqa [esp+640-304], xmm7
+ movdqa xmm7, [esp+640-560]
+ movdqa [esp+640-528], xmm7
+ movdqa xmm7, [esp+640-416]
+ paddw xmm7, xmm6
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+640-624]
+ movdqa xmm5, [esp+640-528]
+ psraw xmm7, 2
+ pand xmm5, xmm7
+ movdqa [esp+640-32], xmm5
- movdqa xmm5, [esp+640-544]
- movdqa [esp+640-528], xmm5
- movdqa xmm5, [esp+640-480]
- movdqa xmm7, xmm5
- paddw xmm7, xmm5
- movdqa xmm5, xmm1
- paddw xmm5, xmm6
- paddw xmm6, [esp+640-592]
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pandn xmm5, xmm7
- movdqa xmm7, [esp+640-480]
- paddw xmm7, xmm1
- paddw xmm7, [esp+640-400]
- movdqa xmm1, [esp+640-544]
- movdqa [esp+640-352], xmm5
- movdqa xmm5, [esp+640-368]
- psllw xmm7, 1
- paddw xmm7, xmm6
- paddw xmm5, xmm7
+ movdqa xmm5, [esp+640-544]
+ movdqa [esp+640-528], xmm5
+ movdqa xmm5, [esp+640-480]
+ movdqa xmm7, xmm5
+ paddw xmm7, xmm5
+ movdqa xmm5, xmm1
+ paddw xmm5, xmm6
+ paddw xmm6, [esp+640-592]
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+640-624]
+ movdqa xmm5, [esp+640-528]
+ psraw xmm7, 2
+ pandn xmm5, xmm7
+ movdqa xmm7, [esp+640-480]
+ paddw xmm7, xmm1
+ paddw xmm7, [esp+640-400]
+ movdqa xmm1, [esp+640-544]
+ movdqa [esp+640-352], xmm5
+ movdqa xmm5, [esp+640-368]
+ psllw xmm7, 1
+ paddw xmm7, xmm6
+ paddw xmm5, xmm7
- movdqa xmm7, [esp+640-400]
- psraw xmm5, 3
- pand xmm1, xmm5
- movdqa xmm5, [esp+640-480]
- movdqa [esp+640-96], xmm1
- movdqa xmm1, [esp+640-560]
- movdqa [esp+640-528], xmm1
- movdqa xmm1, [esp+640-384]
- movdqa xmm6, xmm1
- paddw xmm6, xmm1
- paddw xmm1, [esp+640-400]
- paddw xmm1, [esp+640-144]
- paddw xmm7, xmm5
- paddw xmm5, [esp+640-592]
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
- psraw xmm6, 2
- psllw xmm1, 1
- paddw xmm1, xmm5
+ movdqa xmm7, [esp+640-400]
+ psraw xmm5, 3
+ pand xmm1, xmm5
+ movdqa xmm5, [esp+640-480]
+ movdqa [esp+640-96], xmm1
+ movdqa xmm1, [esp+640-560]
+ movdqa [esp+640-528], xmm1
+ movdqa xmm1, [esp+640-384]
+ movdqa xmm6, xmm1
+ paddw xmm6, xmm1
+ paddw xmm1, [esp+640-400]
+ paddw xmm1, [esp+640-144]
+ paddw xmm7, xmm5
+ paddw xmm5, [esp+640-592]
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-624]
+ movdqa xmm7, [esp+640-528]
+ psraw xmm6, 2
+ psllw xmm1, 1
+ paddw xmm1, xmm5
- movdqa xmm5, [esp+656-272]
- pandn xmm7, xmm6
- movdqa xmm6, [esp+640-416]
- paddw xmm6, xmm1
- movdqa xmm1, [esp+640-560]
- psraw xmm6, 3
- pand xmm1, xmm6
+ movdqa xmm5, [esp+656-272]
+ pandn xmm7, xmm6
+ movdqa xmm6, [esp+640-416]
+ paddw xmm6, xmm1
+ movdqa xmm1, [esp+640-560]
+ psraw xmm6, 3
+ pand xmm1, xmm6
- movdqa xmm6, [esp+704-272]
- movdqa [esp+640-128], xmm1
- movdqa xmm1, [esp+672-272]
- punpckhbw xmm1, xmm2
- movdqa [esp+640-448], xmm1
- movdqa xmm1, [esp+688-272]
- punpckhbw xmm1, xmm2
- punpckhbw xmm6, xmm2
- movdqa [esp+640-288], xmm7
- punpckhbw xmm5, xmm2
- movdqa [esp+640-496], xmm1
- movdqa [esp+640-432], xmm6
+ movdqa xmm6, [esp+704-272]
+ movdqa [esp+640-128], xmm1
+ movdqa xmm1, [esp+672-272]
+ punpckhbw xmm1, xmm2
+ movdqa [esp+640-448], xmm1
+ movdqa xmm1, [esp+688-272]
+ punpckhbw xmm1, xmm2
+ punpckhbw xmm6, xmm2
+ movdqa [esp+640-288], xmm7
+ punpckhbw xmm5, xmm2
+ movdqa [esp+640-496], xmm1
+ movdqa [esp+640-432], xmm6
- movdqa xmm7, [esp+720-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-464], xmm7
+ movdqa xmm7, [esp+720-272]
+ punpckhbw xmm7, xmm2
+ movdqa [esp+640-464], xmm7
- movdqa xmm7, [esp+736-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-528], xmm7
+ movdqa xmm7, [esp+736-272]
+ punpckhbw xmm7, xmm2
+ movdqa [esp+640-528], xmm7
- movdqa xmm7, xmm6
+ movdqa xmm7, xmm6
- psubw xmm6, [esp+640-464]
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- por xmm4, [esp+640-16]
- pabsw xmm6, xmm6
- movdqa xmm7, xmm1
- psubw xmm7, [esp+640-448]
+ psubw xmm6, [esp+640-464]
+ psubw xmm7, xmm1
+ pabsw xmm7, xmm7
+ movdqa [esp+640-560], xmm7
+ por xmm4, [esp+640-16]
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm1
+ psubw xmm7, [esp+640-448]
- movdqa xmm1, [esp+640-512]
- pabsw xmm7, xmm7
- pcmpgtw xmm1, xmm7
- movdqa xmm7, [esp+640-512]
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+640-320]
- pand xmm1, xmm7
- movdqa xmm7, [esp+640-560]
- pcmpgtw xmm6, xmm7
- pand xmm1, xmm6
+ movdqa xmm1, [esp+640-512]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm1, xmm7
+ movdqa xmm7, [esp+640-512]
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+640-320]
+ pand xmm1, xmm7
+ movdqa xmm7, [esp+640-560]
+ pcmpgtw xmm6, xmm7
+ pand xmm1, xmm6
- movdqa xmm6, [esp+640-576]
- pcmpgtw xmm6, xmm7
+ movdqa xmm6, [esp+640-576]
+ pcmpgtw xmm6, xmm7
- movdqa xmm7, [esp+640-496]
- punpckhbw xmm3, xmm2
- movdqa [esp+640-560], xmm6
- movdqa xmm6, [esp+640-512]
- psubw xmm7, xmm5
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
+ movdqa xmm7, [esp+640-496]
+ punpckhbw xmm3, xmm2
+ movdqa [esp+640-560], xmm6
+ movdqa xmm6, [esp+640-512]
+ psubw xmm7, xmm5
+ pabsw xmm7, xmm7
+ pcmpgtw xmm6, xmm7
- pand xmm6, [esp+640-560]
- movdqa xmm7, [esp+640-432]
- psubw xmm7, [esp+640-528]
+ pand xmm6, [esp+640-560]
+ movdqa xmm7, [esp+640-432]
+ psubw xmm7, [esp+640-528]
- psllw xmm3, 1
- movdqa [esp+640-544], xmm6
- movdqa xmm6, [esp+640-512]
+ psllw xmm3, 1
+ movdqa [esp+640-544], xmm6
+ movdqa xmm6, [esp+640-512]
- movdqa xmm2, [esp+640-544]
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, [esp+640-448]
- paddw xmm3, [esp+640-496]
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
- pand xmm6, [esp+640-560]
- movdqa [esp+640-560], xmm6
+ movdqa xmm2, [esp+640-544]
+ paddw xmm3, xmm5
+ paddw xmm3, xmm5
+ paddw xmm3, xmm5
+ paddw xmm3, [esp+640-448]
+ paddw xmm3, [esp+640-496]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm6, xmm7
+ pand xmm6, [esp+640-560]
+ movdqa [esp+640-560], xmm6
- movdqa xmm6, xmm0
- pand xmm6, xmm4
- movdqa xmm4, xmm0
- pandn xmm4, [esp+640-368]
- por xmm6, xmm4
- movdqa xmm4, [esp+640-432]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-592]
- psraw xmm3, 3
- pand xmm3, xmm2
- pandn xmm2, xmm5
- por xmm3, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm3
- movdqa xmm3, [esp+640-64]
- por xmm3, [esp+640-336]
- movdqa xmm2, xmm1
- pandn xmm2, xmm5
- por xmm7, xmm2
+ movdqa xmm6, xmm0
+ pand xmm6, xmm4
+ movdqa xmm4, xmm0
+ pandn xmm4, [esp+640-368]
+ por xmm6, xmm4
+ movdqa xmm4, [esp+640-432]
+ paddw xmm3, xmm4
+ paddw xmm3, [esp+640-592]
+ psraw xmm3, 3
+ pand xmm3, xmm2
+ pandn xmm2, xmm5
+ por xmm3, xmm2
+ movdqa xmm7, xmm1
+ pand xmm7, xmm3
+ movdqa xmm3, [esp+640-64]
+ por xmm3, [esp+640-336]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm5
+ por xmm7, xmm2
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-480]
- por xmm2, xmm3
- packuswb xmm6, xmm7
- movdqa [esp+640-336], xmm2
- movdqa [esp+656-272], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa xmm2, xmm5
- paddw xmm2, [esp+640-448]
- movdqa xmm3, xmm1
- movdqa xmm7, [esp+640-496]
- paddw xmm7, xmm4
- paddw xmm2, xmm7
- paddw xmm2, [esp+640-624]
- movdqa xmm7, [esp+640-544]
- psraw xmm2, 2
- pand xmm6, xmm2
- movdqa xmm2, [esp+640-448]
- pandn xmm7, xmm2
- por xmm6, xmm7
- pand xmm3, xmm6
- movdqa xmm6, xmm1
- pandn xmm6, xmm2
- paddw xmm2, [esp+640-496]
- paddw xmm2, xmm4
- por xmm3, xmm6
- movdqa xmm6, [esp+640-336]
- packuswb xmm6, xmm3
- psllw xmm2, 1
- movdqa [esp+672-272], xmm6
- movdqa xmm6, [esp+640-96]
- por xmm6, [esp+640-352]
+ movdqa xmm2, xmm0
+ pand xmm2, xmm3
+ movdqa xmm3, xmm0
+ pandn xmm3, [esp+640-480]
+ por xmm2, xmm3
+ packuswb xmm6, xmm7
+ movdqa [esp+640-336], xmm2
+ movdqa [esp+656-272], xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa xmm2, xmm5
+ paddw xmm2, [esp+640-448]
+ movdqa xmm3, xmm1
+ movdqa xmm7, [esp+640-496]
+ paddw xmm7, xmm4
+ paddw xmm2, xmm7
+ paddw xmm2, [esp+640-624]
+ movdqa xmm7, [esp+640-544]
+ psraw xmm2, 2
+ pand xmm6, xmm2
+ movdqa xmm2, [esp+640-448]
+ pandn xmm7, xmm2
+ por xmm6, xmm7
+ pand xmm3, xmm6
+ movdqa xmm6, xmm1
+ pandn xmm6, xmm2
+ paddw xmm2, [esp+640-496]
+ paddw xmm2, xmm4
+ por xmm3, xmm6
+ movdqa xmm6, [esp+640-336]
+ packuswb xmm6, xmm3
+ psllw xmm2, 1
+ movdqa [esp+672-272], xmm6
+ movdqa xmm6, [esp+640-96]
+ por xmm6, [esp+640-352]
- movdqa xmm3, xmm0
- pand xmm3, xmm6
- movdqa xmm6, xmm0
- pandn xmm6, [esp+640-144]
- por xmm3, xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-352], xmm3
- movdqa xmm3, [esp+640-464]
- paddw xmm3, [esp+640-592]
- paddw xmm2, xmm3
- movdqa xmm3, [esp+640-448]
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-496]
- psraw xmm5, 3
- pand xmm6, xmm5
- movdqa xmm5, [esp+640-464]
- paddw xmm2, xmm5
- paddw xmm5, [esp+640-432]
- movdqa xmm4, xmm3
- paddw xmm4, xmm3
- paddw xmm4, xmm2
- paddw xmm4, [esp+640-624]
- movdqa xmm2, [esp+640-544]
- paddw xmm3, [esp+640-592]
- psraw xmm4, 2
- pandn xmm2, xmm4
- por xmm6, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-496]
- movdqa xmm2, xmm1
- pandn xmm2, xmm6
- por xmm7, xmm2
- movdqa xmm2, [esp+640-352]
- packuswb xmm2, xmm7
- movdqa [esp+688-272], xmm2
- movdqa xmm2, [esp+640-128]
- por xmm2, [esp+640-288]
+ movdqa xmm3, xmm0
+ pand xmm3, xmm6
+ movdqa xmm6, xmm0
+ pandn xmm6, [esp+640-144]
+ por xmm3, xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa [esp+640-352], xmm3
+ movdqa xmm3, [esp+640-464]
+ paddw xmm3, [esp+640-592]
+ paddw xmm2, xmm3
+ movdqa xmm3, [esp+640-448]
+ paddw xmm5, xmm2
+ movdqa xmm2, [esp+640-496]
+ psraw xmm5, 3
+ pand xmm6, xmm5
+ movdqa xmm5, [esp+640-464]
+ paddw xmm2, xmm5
+ paddw xmm5, [esp+640-432]
+ movdqa xmm4, xmm3
+ paddw xmm4, xmm3
+ paddw xmm4, xmm2
+ paddw xmm4, [esp+640-624]
+ movdqa xmm2, [esp+640-544]
+ paddw xmm3, [esp+640-592]
+ psraw xmm4, 2
+ pandn xmm2, xmm4
+ por xmm6, xmm2
+ movdqa xmm7, xmm1
+ pand xmm7, xmm6
+ movdqa xmm6, [esp+640-496]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm6
+ por xmm7, xmm2
+ movdqa xmm2, [esp+640-352]
+ packuswb xmm2, xmm7
+ movdqa [esp+688-272], xmm2
+ movdqa xmm2, [esp+640-128]
+ por xmm2, [esp+640-288]
- movdqa xmm4, xmm0
- pand xmm4, xmm2
- paddw xmm5, xmm6
- movdqa xmm2, xmm0
- pandn xmm2, [esp+640-400]
- por xmm4, xmm2
- movdqa xmm2, [esp+640-528]
- psllw xmm5, 1
- paddw xmm5, xmm3
- movdqa xmm3, [esp+640-560]
- paddw xmm2, xmm5
- psraw xmm2, 3
- movdqa [esp+640-288], xmm4
- movdqa xmm4, [esp+640-560]
- pand xmm4, xmm2
- movdqa xmm2, [esp+640-464]
- movdqa xmm5, xmm2
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-432]
- paddw xmm2, [esp+640-448]
- movdqa xmm7, xmm1
- paddw xmm5, xmm2
- paddw xmm5, [esp+640-624]
- movdqa xmm6, [esp+640-560]
- psraw xmm5, 2
- pandn xmm3, xmm5
- por xmm4, xmm3
- movdqa xmm3, [esp+640-32]
- por xmm3, [esp+640-304]
- pand xmm7, xmm4
- movdqa xmm4, [esp+640-432]
- movdqa xmm5, [esp+640-464]
- movdqa xmm2, xmm1
- pandn xmm2, xmm4
- paddw xmm4, [esp+640-496]
- por xmm7, xmm2
- movdqa xmm2, [esp+640-288]
- packuswb xmm2, xmm7
- movdqa [esp+704-272], xmm2
+ movdqa xmm4, xmm0
+ pand xmm4, xmm2
+ paddw xmm5, xmm6
+ movdqa xmm2, xmm0
+ pandn xmm2, [esp+640-400]
+ por xmm4, xmm2
+ movdqa xmm2, [esp+640-528]
+ psllw xmm5, 1
+ paddw xmm5, xmm3
+ movdqa xmm3, [esp+640-560]
+ paddw xmm2, xmm5
+ psraw xmm2, 3
+ movdqa [esp+640-288], xmm4
+ movdqa xmm4, [esp+640-560]
+ pand xmm4, xmm2
+ movdqa xmm2, [esp+640-464]
+ movdqa xmm5, xmm2
+ paddw xmm5, xmm2
+ movdqa xmm2, [esp+640-432]
+ paddw xmm2, [esp+640-448]
+ movdqa xmm7, xmm1
+ paddw xmm5, xmm2
+ paddw xmm5, [esp+640-624]
+ movdqa xmm6, [esp+640-560]
+ psraw xmm5, 2
+ pandn xmm3, xmm5
+ por xmm4, xmm3
+ movdqa xmm3, [esp+640-32]
+ por xmm3, [esp+640-304]
+ pand xmm7, xmm4
+ movdqa xmm4, [esp+640-432]
+ movdqa xmm5, [esp+640-464]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm4
+ paddw xmm4, [esp+640-496]
+ por xmm7, xmm2
+ movdqa xmm2, [esp+640-288]
+ packuswb xmm2, xmm7
+ movdqa [esp+704-272], xmm2
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-384]
- por xmm2, xmm3
- movdqa [esp+640-304], xmm2
- movdqa xmm2, [esp+640-528]
- movdqa xmm3, xmm2
- paddw xmm3, [esp+640-464]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-624]
- psraw xmm3, 2
- pand xmm6, xmm3
- movdqa xmm3, [esp+640-560]
- movdqa xmm4, xmm3
- pandn xmm4, xmm5
- por xmm6, xmm4
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-304]
- movdqa xmm4, xmm1
- pandn xmm4, xmm5
- por xmm7, xmm4
+ movdqa xmm2, xmm0
+ pand xmm2, xmm3
+ movdqa xmm3, xmm0
+ pandn xmm3, [esp+640-384]
+ por xmm2, xmm3
+ movdqa [esp+640-304], xmm2
+ movdqa xmm2, [esp+640-528]
+ movdqa xmm3, xmm2
+ paddw xmm3, [esp+640-464]
+ paddw xmm3, xmm4
+ paddw xmm3, [esp+640-624]
+ psraw xmm3, 2
+ pand xmm6, xmm3
+ movdqa xmm3, [esp+640-560]
+ movdqa xmm4, xmm3
+ pandn xmm4, xmm5
+ por xmm6, xmm4
+ movdqa xmm7, xmm1
+ pand xmm7, xmm6
+ movdqa xmm6, [esp+640-304]
+ movdqa xmm4, xmm1
+ pandn xmm4, xmm5
+ por xmm7, xmm4
- movdqa xmm4, xmm0
- pandn xmm0, [esp+640-416]
- packuswb xmm6, xmm7
- movdqa xmm7, [esp+640-112]
- por xmm7, [esp+640-80]
- pand xmm4, xmm7
- por xmm4, xmm0
- movdqa xmm0, [esp+752-272]
- punpckhbw xmm0, [esp+640-48]
- psllw xmm0, 1
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm5
- paddw xmm0, [esp+640-432]
- paddw xmm0, [esp+640-496]
- paddw xmm0, [esp+640-592]
- psraw xmm0, 3
- pand xmm0, xmm3
- movdqa xmm7, xmm1
- pandn xmm3, xmm2
- por xmm0, xmm3
- pand xmm7, xmm0
+ movdqa xmm4, xmm0
+ pandn xmm0, [esp+640-416]
+ packuswb xmm6, xmm7
+ movdqa xmm7, [esp+640-112]
+ por xmm7, [esp+640-80]
+ pand xmm4, xmm7
+ por xmm4, xmm0
+ movdqa xmm0, [esp+752-272]
+ punpckhbw xmm0, [esp+640-48]
+ psllw xmm0, 1
+ paddw xmm0, xmm2
+ paddw xmm0, xmm2
+ paddw xmm0, xmm2
+ paddw xmm0, xmm5
+ paddw xmm0, [esp+640-432]
+ paddw xmm0, [esp+640-496]
+ paddw xmm0, [esp+640-592]
+ psraw xmm0, 3
+ pand xmm0, xmm3
+ movdqa xmm7, xmm1
+ pandn xmm3, xmm2
+ por xmm0, xmm3
+ pand xmm7, xmm0
- movdqa xmm0, [esp+656-272]
- movdqa [edx], xmm0
+ movdqa xmm0, [esp+656-272]
+ movdqa [edx], xmm0
- movdqa xmm0, [esp+672-272]
+ movdqa xmm0, [esp+672-272]
- mov edx, dword [esp+640-596]
- movdqa [esi], xmm0
- movdqa xmm0, [esp+688-272]
- movdqa [edi], xmm0
- movdqa xmm0, [esp+704-272]
+ mov edx, dword [esp+640-596]
+ movdqa [esi], xmm0
+ movdqa xmm0, [esp+688-272]
+ movdqa [edi], xmm0
+ movdqa xmm0, [esp+704-272]
- pop edi
- pandn xmm1, xmm2
- movdqa [eax], xmm0
- por xmm7, xmm1
- pop esi
- packuswb xmm4, xmm7
- movdqa [edx], xmm6
- movdqa [ecx], xmm4
- pop ebx
- mov esp, ebp
- pop ebp
- ret
+ pop edi
+ pandn xmm1, xmm2
+ movdqa [eax], xmm0
+ por xmm7, xmm1
+ pop esi
+ packuswb xmm4, xmm7
+ movdqa [edx], xmm6
+ movdqa [ecx], xmm4
+ pop ebx
+ mov esp, ebp
+ pop ebp
+ ret
%endif
--- a/codec/common/x86/expand_picture.asm
+++ b/codec/common/x86/expand_picture.asm
@@ -77,280 +77,280 @@
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
-%macro mov_line_8x4_mmx 3 ; dst, stride, mm?
- movq [%1], %3
- movq [%1+%2], %3
- lea %1, [%1+2*%2]
- movq [%1], %3
- movq [%1+%2], %3
- lea %1, [%1+2*%2]
+%macro mov_line_8x4_mmx 3 ; dst, stride, mm?
+ movq [%1], %3
+ movq [%1+%2], %3
+ lea %1, [%1+2*%2]
+ movq [%1], %3
+ movq [%1+%2], %3
+ lea %1, [%1+2*%2]
%endmacro
-%macro mov_line_end8x4_mmx 3 ; dst, stride, mm?
- movq [%1], %3
- movq [%1+%2], %3
- lea %1, [%1+2*%2]
- movq [%1], %3
- movq [%1+%2], %3
- lea %1, [%1+%2]
+%macro mov_line_end8x4_mmx 3 ; dst, stride, mm?
+ movq [%1], %3
+ movq [%1+%2], %3
+ lea %1, [%1+2*%2]
+ movq [%1], %3
+ movq [%1+%2], %3
+ lea %1, [%1+%2]
%endmacro
-%macro mov_line_16x4_sse2 4 ; dst, stride, xmm?, u/a
- movdq%4 [%1], %3 ; top(bottom)_0
- movdq%4 [%1+%2], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdq%4 [%1], %3 ; top(bottom)_2
- movdq%4 [%1+%2], %3 ; top(bottom)_3
- lea %1, [%1+2*%2]
+%macro mov_line_16x4_sse2 4 ; dst, stride, xmm?, u/a
+ movdq%4 [%1], %3 ; top(bottom)_0
+ movdq%4 [%1+%2], %3 ; top(bottom)_1
+ lea %1, [%1+2*%2]
+ movdq%4 [%1], %3 ; top(bottom)_2
+ movdq%4 [%1+%2], %3 ; top(bottom)_3
+ lea %1, [%1+2*%2]
%endmacro
-%macro mov_line_end16x4_sse2 4 ; dst, stride, xmm?, u/a
- movdq%4 [%1], %3 ; top(bottom)_0
- movdq%4 [%1+%2], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdq%4 [%1], %3 ; top(bottom)_2
- movdq%4 [%1+%2], %3 ; top(bottom)_3
- lea %1, [%1+%2]
+%macro mov_line_end16x4_sse2 4 ; dst, stride, xmm?, u/a
+ movdq%4 [%1], %3 ; top(bottom)_0
+ movdq%4 [%1+%2], %3 ; top(bottom)_1
+ lea %1, [%1+2*%2]
+ movdq%4 [%1], %3 ; top(bottom)_2
+ movdq%4 [%1+%2], %3 ; top(bottom)_3
+ lea %1, [%1+%2]
%endmacro
-%macro mov_line_32x4_sse2 3 ; dst, stride, xmm?
- movdqa [%1], %3 ; top(bottom)_0
- movdqa [%1+16], %3 ; top(bottom)_0
- movdqa [%1+%2], %3 ; top(bottom)_1
- movdqa [%1+%2+16], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdqa [%1], %3 ; top(bottom)_2
- movdqa [%1+16], %3 ; top(bottom)_2
- movdqa [%1+%2], %3 ; top(bottom)_3
- movdqa [%1+%2+16], %3 ; top(bottom)_3
- lea %1, [%1+2*%2]
+%macro mov_line_32x4_sse2 3 ; dst, stride, xmm?
+ movdqa [%1], %3 ; top(bottom)_0
+ movdqa [%1+16], %3 ; top(bottom)_0
+ movdqa [%1+%2], %3 ; top(bottom)_1
+ movdqa [%1+%2+16], %3 ; top(bottom)_1
+ lea %1, [%1+2*%2]
+ movdqa [%1], %3 ; top(bottom)_2
+ movdqa [%1+16], %3 ; top(bottom)_2
+ movdqa [%1+%2], %3 ; top(bottom)_3
+ movdqa [%1+%2+16], %3 ; top(bottom)_3
+ lea %1, [%1+2*%2]
%endmacro
-%macro mov_line_end32x4_sse2 3 ; dst, stride, xmm?
- movdqa [%1], %3 ; top(bottom)_0
- movdqa [%1+16], %3 ; top(bottom)_0
- movdqa [%1+%2], %3 ; top(bottom)_1
- movdqa [%1+%2+16], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdqa [%1], %3 ; top(bottom)_2
- movdqa [%1+16], %3 ; top(bottom)_2
- movdqa [%1+%2], %3 ; top(bottom)_3
- movdqa [%1+%2+16], %3 ; top(bottom)_3
- lea %1, [%1+%2]
+%macro mov_line_end32x4_sse2 3 ; dst, stride, xmm?
+ movdqa [%1], %3 ; top(bottom)_0
+ movdqa [%1+16], %3 ; top(bottom)_0
+ movdqa [%1+%2], %3 ; top(bottom)_1
+ movdqa [%1+%2+16], %3 ; top(bottom)_1
+ lea %1, [%1+2*%2]
+ movdqa [%1], %3 ; top(bottom)_2
+ movdqa [%1+16], %3 ; top(bottom)_2
+ movdqa [%1+%2], %3 ; top(bottom)_3
+ movdqa [%1+%2+16], %3 ; top(bottom)_3
+ lea %1, [%1+%2]
%endmacro
-%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
+%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
;r2 [width/16(8)]
;r0 [pSrc +0], r5 [pSrc -width] r1[-stride], 32(16) ;top
;r3 [pSrc +(h-1)*stride], r4 [pSrc + (h+31)*stride],32(16); bottom
-%if %1 == 32 ; for luma
- sar r2, 04h ; width / 16(8) pixels
+%if %1 == 32 ; for luma
+ sar r2, 04h ; width / 16(8) pixels
.top_bottom_loops:
- ; top
- movdqa xmm0, [r0] ; first line of picture pData
- mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_end16x4_sse2 r5, r1, xmm0, a
+ ; top
+ movdqa xmm0, [r0] ; first line of picture pData
+ mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_end16x4_sse2 r5, r1, xmm0, a
- ; bottom
- movdqa xmm1, [r3] ; last line of picture pData
- mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_end16x4_sse2 r4, r1, xmm1, a
+ ; bottom
+ movdqa xmm1, [r3] ; last line of picture pData
+ mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_end16x4_sse2 r4, r1, xmm1, a
- lea r0, [r0+16] ; top pSrc
- lea r5, [r5+16] ; top dst
- lea r3, [r3+16] ; bottom pSrc
- lea r4, [r4+16] ; bottom dst
- neg r1 ; positive/negative stride need for next loop?
+ lea r0, [r0+16] ; top pSrc
+ lea r5, [r5+16] ; top dst
+ lea r3, [r3+16] ; bottom pSrc
+ lea r4, [r4+16] ; bottom dst
+ neg r1 ; positive/negative stride need for next loop?
- dec r2
- jnz near .top_bottom_loops
-%elif %1 == 16 ; for chroma ??
- mov r6, r2
- sar r2, 04h ; (width / 16) pixels
+ dec r2
+ jnz near .top_bottom_loops
+%elif %1 == 16 ; for chroma ??
+ mov r6, r2
+ sar r2, 04h ; (width / 16) pixels
.top_bottom_loops:
- ; top
- movdqa xmm0, [r0] ; first line of picture pData
- mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_end16x4_sse2 r5, r1, xmm0, a
+ ; top
+ movdqa xmm0, [r0] ; first line of picture pData
+ mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_end16x4_sse2 r5, r1, xmm0, a
- ; bottom
- movdqa xmm1, [r3] ; last line of picture pData
- mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_end16x4_sse2 r4, r1, xmm1, a
+ ; bottom
+ movdqa xmm1, [r3] ; last line of picture pData
+ mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_end16x4_sse2 r4, r1, xmm1, a
- lea r0, [r0+16] ; top pSrc
- lea r5, [r5+16] ; top dst
- lea r3, [r3+16] ; bottom pSrc
- lea r4, [r4+16] ; bottom dst
- neg r1 ; positive/negative stride need for next loop?
+ lea r0, [r0+16] ; top pSrc
+ lea r5, [r5+16] ; top dst
+ lea r3, [r3+16] ; bottom pSrc
+ lea r4, [r4+16] ; bottom dst
+ neg r1 ; positive/negative stride need for next loop?
- dec r2
- jnz near .top_bottom_loops
+ dec r2
+ jnz near .top_bottom_loops
- ; for remaining 8 bytes
- and r6, 0fh ; any 8 bytes left?
- test r6, r6
- jz near .to_be_continued ; no left to exit here
+ ; for remaining 8 bytes
+ and r6, 0fh ; any 8 bytes left?
+ test r6, r6
+ jz near .to_be_continued ; no left to exit here
- ; top
- movq mm0, [r0] ; remained 8 byte
- mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
- mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
- mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
- mov_line_end8x4_mmx r5, r1, mm0 ; dst, stride, mm?
- ; bottom
- movq mm1, [r3]
- mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
- mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
- mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
- mov_line_end8x4_mmx r4, r1, mm1 ; dst, stride, mm?
- WELSEMMS
+ ; top
+ movq mm0, [r0] ; remained 8 byte
+ mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
+ mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
+ mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
+ mov_line_end8x4_mmx r5, r1, mm0 ; dst, stride, mm?
+ ; bottom
+ movq mm1, [r3]
+ mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
+ mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
+ mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
+ mov_line_end8x4_mmx r4, r1, mm1 ; dst, stride, mm?
+ WELSEMMS
.to_be_continued:
%endif
%endmacro
-%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
+%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
;r6 [height]
;r0 [pSrc+0] r5[pSrc-32] r1[stride]
;r3 [pSrc+(w-1)] r4[pSrc+w]
-%if %1 == 32 ; for luma
+%if %1 == 32 ; for luma
.left_right_loops:
- ; left
- movzx r2d, byte [r0] ; pixel pData for left border
- SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [r5], xmm0
- movdqa [r5+16], xmm0
+ ; left
+ movzx r2d, byte [r0] ; pixel pData for left border
+ SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
+ movdqa [r5], xmm0
+ movdqa [r5+16], xmm0
- ; right
- movzx r2d, byte [r3]
- SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [r4], xmm1
- movdqa [r4+16], xmm1
+ ; right
+ movzx r2d, byte [r3]
+ SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
+ movdqa [r4], xmm1
+ movdqa [r4+16], xmm1
- lea r0, [r0+r1] ; left pSrc
- lea r5, [r5+r1] ; left dst
- lea r3, [r3+r1] ; right pSrc
- lea r4, [r4+r1] ; right dst
+ lea r0, [r0+r1] ; left pSrc
+ lea r5, [r5+r1] ; left dst
+ lea r3, [r3+r1] ; right pSrc
+ lea r4, [r4+r1] ; right dst
- dec r6
- jnz near .left_right_loops
-%elif %1 == 16 ; for chroma ??
+ dec r6
+ jnz near .left_right_loops
+%elif %1 == 16 ; for chroma ??
.left_right_loops:
- ; left
- movzx r2d, byte [r0] ; pixel pData for left border
- SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [r5], xmm0
+ ; left
+ movzx r2d, byte [r0] ; pixel pData for left border
+ SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
+ movdqa [r5], xmm0
- ; right
- movzx r2d, byte [r3]
- SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdq%2 [r4], xmm1 ; might not be aligned 16 bytes in case chroma planes
+ ; right
+ movzx r2d, byte [r3]
+ SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
+ movdq%2 [r4], xmm1 ; might not be aligned 16 bytes in case chroma planes
- lea r0, [r0+r1] ; left pSrc
- lea r5, [r5+r1] ; left dst
- lea r3, [r3+r1] ; right pSrc
- lea r4, [r4+r1] ; right dst
+ lea r0, [r0+r1] ; left pSrc
+ lea r5, [r5+r1] ; left dst
+ lea r3, [r3+r1] ; right pSrc
+ lea r4, [r4+r1] ; right dst
- dec r6
- jnz near .left_right_loops
+ dec r6
+ jnz near .left_right_loops
%endif
%endmacro
-%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
- ; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
- ; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
+%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
+ ; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
+ ; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
;r3:TL ,r4:TR,r5:BL,r6:BR r1:-stride
-%if %1 == 32 ; luma
- ; TL
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_end32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+%if %1 == 32 ; luma
+ ; TL
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_end32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- ; TR
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_end32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ ; TR
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_end32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- ; BL
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_end32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ ; BL
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_end32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- ; BR
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_end32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
-%elif %1 == 16 ; chroma
- ; TL
- mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
- mov_line_end16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
+ ; BR
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_end32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+%elif %1 == 16 ; chroma
+ ; TL
+ mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
+ mov_line_end16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
- ; TR
- mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
- mov_line_end16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
+ ; TR
+ mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
+ mov_line_end16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
- ; BL
- mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
- mov_line_end16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
+ ; BL
+ mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
+ mov_line_end16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
- ; BR
- mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
- mov_line_end16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
+ ; BR
+ mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
+ mov_line_end16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
%endif
%endmacro
;***********************************************************************----------------
-; void ExpandPictureLuma_sse2( uint8_t *pDst,
-; const int32_t iStride,
-; const int32_t iWidth,
-; const int32_t iHeight );
+; void ExpandPictureLuma_sse2( uint8_t *pDst,
+; const int32_t iStride,
+; const int32_t iWidth,
+; const int32_t iHeight );
;***********************************************************************----------------
WELS_EXTERN ExpandPictureLuma_sse2
@@ -403,8 +403,8 @@
exp_top_bottom_sse2 32
- ; for both left and right border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; for both left and right border
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
pop r2
pop r1
@@ -416,8 +416,8 @@
lea r4,[r3+1] ;right border dst
;prepare for cross border data: top-right with xmm4
- movzx r6d,byte [r3] ;top -rigth
- SSE2_Copy16Times xmm4,r6d
+ movzx r6d,byte [r3] ;top -rigth
+ SSE2_Copy16Times xmm4,r6d
neg r1 ;r1 = stride
@@ -438,8 +438,8 @@
pop r1
pop r0
- ; for cross border [top-left, top-right, bottom-left, bottom-right]
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; for cross border [top-left, top-right, bottom-left, bottom-right]
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, To be continued..
neg r1 ;r1 = -stride
@@ -472,13 +472,13 @@
%assign push_num 0
- ret
+ ret
;***********************************************************************----------------
-; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
-; const int32_t iStride,
-; const int32_t iWidth,
-; const int32_t iHeight );
+; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
+; const int32_t iStride,
+; const int32_t iWidth,
+; const int32_t iHeight );
;***********************************************************************----------------
WELS_EXTERN ExpandPictureChromaAlign_sse2
@@ -531,8 +531,8 @@
exp_top_bottom_sse2 16
- ; for both left and right border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; for both left and right border
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
pop r2
pop r1
@@ -557,7 +557,7 @@
push r0
push r1
push r2
- push r6
+ push r6
exp_left_right_sse2 16,a
pop r6
@@ -565,8 +565,8 @@
pop r1
pop r0
- ; for cross border [top-left, top-right, bottom-left, bottom-right]
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; for cross border [top-left, top-right, bottom-left, bottom-right]
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, To be continued..
neg r1 ;r1 = -stride
@@ -599,16 +599,16 @@
%assign push_num 0
- ret
+ ret
;***********************************************************************----------------
-; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
-; const int32_t iStride,
-; const int32_t iWidth,
-; const int32_t iHeight );
+; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
+; const int32_t iStride,
+; const int32_t iWidth,
+; const int32_t iHeight );
;***********************************************************************----------------
WELS_EXTERN ExpandPictureChromaUnalign_sse2
- push r4
+ push r4
push r5
push r6
@@ -657,8 +657,8 @@
exp_top_bottom_sse2 16
- ; for both left and right border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; for both left and right border
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
pop r2
pop r1
@@ -683,7 +683,7 @@
push r0
push r1
push r2
- push r6
+ push r6
exp_left_right_sse2 16,u
pop r6
@@ -691,8 +691,8 @@
pop r1
pop r0
- ; for cross border [top-left, top-right, bottom-left, bottom-right]
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; for cross border [top-left, top-right, bottom-left, bottom-right]
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, To be continued..
neg r1 ;r1 = -stride
@@ -725,4 +725,4 @@
%assign push_num 0
- ret
+ ret
--- a/codec/common/x86/mb_copy.asm
+++ b/codec/common/x86/mb_copy.asm
@@ -36,9 +36,9 @@
;*
;* History
;* 15/09/2009 Created
-;* 12/28/2009 Modified with larger throughput
-;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
-;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
+;* 12/28/2009 Modified with larger throughput
+;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
+;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
;*
;*
;*********************************************************************************************/
@@ -56,174 +56,174 @@
;***********************************************************************
-; void WelsCopy16x16_sse2( uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
+; void WelsCopy16x16_sse2( uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy16x16_sse2
- push r4
- push r5
- %assign push_num 2
+ push r4
+ push r5
+ %assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
- lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
- lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
+ lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
+ lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
- movdqa xmm0, [r2]
- movdqa xmm1, [r2+r3]
- movdqa xmm2, [r2+2*r3]
- movdqa xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqa xmm4, [r2]
- movdqa xmm5, [r2+r3]
- movdqa xmm6, [r2+2*r3]
- movdqa xmm7, [r2+r5]
- lea r2, [r2+4*r3]
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+2*r3]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+2*r3]
+ movdqa xmm7, [r2+r5]
+ lea r2, [r2+4*r3]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ lea r0, [r0+4*r1]
- movdqa xmm0, [r2]
- movdqa xmm1, [r2+r3]
- movdqa xmm2, [r2+2*r3]
- movdqa xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqa xmm4, [r2]
- movdqa xmm5, [r2+r3]
- movdqa xmm6, [r2+2*r3]
- movdqa xmm7, [r2+r5]
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+2*r3]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+2*r3]
+ movdqa xmm7, [r2+r5]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- POP_XMM
- LOAD_4_PARA_POP
- pop r5
- pop r4
- ret
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
;***********************************************************************
-; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
+; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
;***********************************************************************
; dst can be aligned to 16 bytes, but not sure about pSrc, 12/29/2011
WELS_EXTERN WelsCopy16x16NotAligned_sse2
- push r4
- push r5
- %assign push_num 2
+ push r4
+ push r5
+ %assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
- lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
- lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
+ lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
+ lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
- movdqu xmm0, [r2]
- movdqu xmm1, [r2+r3]
- movdqu xmm2, [r2+2*r3]
- movdqu xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqu xmm4, [r2]
- movdqu xmm5, [r2+r3]
- movdqu xmm6, [r2+2*r3]
- movdqu xmm7, [r2+r5]
- lea r2, [r2+4*r3]
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r2+r3]
+ movdqu xmm2, [r2+2*r3]
+ movdqu xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqu xmm4, [r2]
+ movdqu xmm5, [r2+r3]
+ movdqu xmm6, [r2+2*r3]
+ movdqu xmm7, [r2+r5]
+ lea r2, [r2+4*r3]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ lea r0, [r0+4*r1]
- movdqu xmm0, [r2]
- movdqu xmm1, [r2+r3]
- movdqu xmm2, [r2+2*r3]
- movdqu xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqu xmm4, [r2]
- movdqu xmm5, [r2+r3]
- movdqu xmm6, [r2+2*r3]
- movdqu xmm7, [r2+r5]
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r2+r3]
+ movdqu xmm2, [r2+2*r3]
+ movdqu xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqu xmm4, [r2]
+ movdqu xmm5, [r2+r3]
+ movdqu xmm6, [r2+2*r3]
+ movdqu xmm7, [r2+r5]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- POP_XMM
- LOAD_4_PARA_POP
- pop r5
- pop r4
- ret
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
; , 12/29/2011
;***********************************************************************
; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy16x8NotAligned_sse2
- push r4
- push r5
- %assign push_num 2
+ push r4
+ push r5
+ %assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
- lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
- lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
+ lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
+ lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
- movdqu xmm0, [r2]
- movdqu xmm1, [r2+r3]
- movdqu xmm2, [r2+2*r3]
- movdqu xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqu xmm4, [r2]
- movdqu xmm5, [r2+r3]
- movdqu xmm6, [r2+2*r3]
- movdqu xmm7, [r2+r5]
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r2+r3]
+ movdqu xmm2, [r2+2*r3]
+ movdqu xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqu xmm4, [r2]
+ movdqu xmm5, [r2+r3]
+ movdqu xmm6, [r2+2*r3]
+ movdqu xmm7, [r2+r5]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- POP_XMM
- LOAD_4_PARA_POP
- pop r5
- pop r4
- ret
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
;***********************************************************************
@@ -233,62 +233,62 @@
; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy8x16_mmx
- %assign push_num 0
+ %assign push_num 0
LOAD_4_PARA
- movq mm0, [r2]
- movq mm1, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm2, [r2]
- movq mm3, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm4, [r2]
- movq mm5, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm6, [r2]
- movq mm7, [r2+r3]
- lea r2, [r2+2*r3]
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+ movq mm7, [r2+r3]
+ lea r2, [r2+2*r3]
- movq [r0], mm0
- movq [r0+r1], mm1
- lea r0, [r0+2*r1]
- movq [r0], mm2
- movq [r0+r1], mm3
- lea r0, [r0+2*r1]
- movq [r0], mm4
- movq [r0+r1], mm5
- lea r0, [r0+2*r1]
- movq [r0], mm6
- movq [r0+r1], mm7
- lea r0, [r0+2*r1]
+ movq [r0], mm0
+ movq [r0+r1], mm1
+ lea r0, [r0+2*r1]
+ movq [r0], mm2
+ movq [r0+r1], mm3
+ lea r0, [r0+2*r1]
+ movq [r0], mm4
+ movq [r0+r1], mm5
+ lea r0, [r0+2*r1]
+ movq [r0], mm6
+ movq [r0+r1], mm7
+ lea r0, [r0+2*r1]
- movq mm0, [r2]
- movq mm1, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm2, [r2]
- movq mm3, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm4, [r2]
- movq mm5, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm6, [r2]
- movq mm7, [r2+r3]
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+ movq mm7, [r2+r3]
- movq [r0], mm0
- movq [r0+r1], mm1
- lea r0, [r0+2*r1]
- movq [r0], mm2
- movq [r0+r1], mm3
- lea r0, [r0+2*r1]
- movq [r0], mm4
- movq [r0+r1], mm5
- lea r0, [r0+2*r1]
- movq [r0], mm6
- movq [r0+r1], mm7
+ movq [r0], mm0
+ movq [r0+r1], mm1
+ lea r0, [r0+2*r1]
+ movq [r0], mm2
+ movq [r0+r1], mm3
+ lea r0, [r0+2*r1]
+ movq [r0], mm4
+ movq [r0+r1], mm5
+ lea r0, [r0+2*r1]
+ movq [r0], mm6
+ movq [r0+r1], mm7
- WELSEMMS
- LOAD_4_PARA_POP
- ret
+ WELSEMMS
+ LOAD_4_PARA_POP
+ ret
;***********************************************************************
; void WelsCopy8x8_mmx( uint8_t* Dst,
@@ -297,48 +297,48 @@
; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy8x8_mmx
- push r4
- %assign push_num 1
+ push r4
+ %assign push_num 1
LOAD_4_PARA
- lea r4, [r3+2*r3] ;edx, [ebx+2*ebx]
+ lea r4, [r3+2*r3] ;edx, [ebx+2*ebx]
- ; to prefetch next loop
- prefetchnta [r2+2*r3]
- prefetchnta [r2+r4]
- movq mm0, [r2]
- movq mm1, [r2+r3]
- lea r2, [r2+2*r3]
- ; to prefetch next loop
- prefetchnta [r2+2*r3]
- prefetchnta [r2+r4]
- movq mm2, [r2]
- movq mm3, [r2+r3]
- lea r2, [r2+2*r3]
- ; to prefetch next loop
- prefetchnta [r2+2*r3]
- prefetchnta [r2+r4]
- movq mm4, [r2]
- movq mm5, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm6, [r2]
- movq mm7, [r2+r3]
+ ; to prefetch next loop
+ prefetchnta [r2+2*r3]
+ prefetchnta [r2+r4]
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ ; to prefetch next loop
+ prefetchnta [r2+2*r3]
+ prefetchnta [r2+r4]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ ; to prefetch next loop
+ prefetchnta [r2+2*r3]
+ prefetchnta [r2+r4]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+ movq mm7, [r2+r3]
- movq [r0], mm0
- movq [r0+r1], mm1
- lea r0, [r0+2*r1]
- movq [r0], mm2
- movq [r0+r1], mm3
- lea r0, [r0+2*r1]
- movq [r0], mm4
- movq [r0+r1], mm5
- lea r0, [r0+2*r1]
- movq [r0], mm6
- movq [r0+r1], mm7
+ movq [r0], mm0
+ movq [r0+r1], mm1
+ lea r0, [r0+2*r1]
+ movq [r0], mm2
+ movq [r0+r1], mm3
+ lea r0, [r0+2*r1]
+ movq [r0], mm4
+ movq [r0+r1], mm5
+ lea r0, [r0+2*r1]
+ movq [r0], mm6
+ movq [r0+r1], mm7
- WELSEMMS
- LOAD_4_PARA_POP
- pop r4
- ret
+ WELSEMMS
+ LOAD_4_PARA_POP
+ pop r4
+ ret
; (dunhuang@cisco), 12/21/2011
;***********************************************************************
@@ -349,13 +349,13 @@
%assign push_num 0
LOAD_2_PARA
- movd xmm0, r1d ; _mv
- pshufd xmm1, xmm0, $00
- movdqa [r0 ], xmm1
- movdqa [r0+0x10], xmm1
- movdqa [r0+0x20], xmm1
- movdqa [r0+0x30], xmm1
- ret
+ movd xmm0, r1d ; _mv
+ pshufd xmm1, xmm0, $00
+ movdqa [r0 ], xmm1
+ movdqa [r0+0x10], xmm1
+ movdqa [r0+0x20], xmm1
+ movdqa [r0+0x30], xmm1
+ ret
;*******************************************************************************
; Macros and other preprocessor constants
@@ -381,14 +381,14 @@
%assign push_num 0
LOAD_7_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
- SIGN_EXTENSION r6, r6d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+ SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
- movd mm0, [r4]
+ movd mm0, [r4]
pavgb mm0, [r2]
movd [r0], mm0
@@ -398,8 +398,8 @@
lea r4, [r4+r5]
jne .height_loop
- WELSEMMS
- LOAD_7_PARA_POP
+ WELSEMMS
+ LOAD_7_PARA_POP
ret
@@ -413,29 +413,29 @@
%assign push_num 0
LOAD_7_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
- SIGN_EXTENSION r6, r6d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+ SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
- movq mm0, [r2]
+ movq mm0, [r2]
pavgb mm0, [r4]
movq [r0], mm0
movq mm0, [r2+r3]
pavgb mm0, [r4+r5]
- movq [r0+r1], mm0
+ movq [r0+r1], mm0
- lea r2, [r2+2*r3]
- lea r4, [r4+2*r5]
- lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r4, [r4+2*r5]
+ lea r0, [r0+2*r1]
sub r6, 2
jnz .height_loop
- WELSEMMS
- LOAD_7_PARA_POP
+ WELSEMMS
+ LOAD_7_PARA_POP
ret
@@ -450,46 +450,46 @@
%assign push_num 0
LOAD_7_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
- SIGN_EXTENSION r6, r6d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+ SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
- movdqu xmm0, [r2]
- movdqu xmm1, [r4]
- pavgb xmm0, xmm1
- ;pavgb xmm0, [r4]
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r4]
+ pavgb xmm0, xmm1
+ ;pavgb xmm0, [r4]
movdqu [r0], xmm0
- movdqu xmm0, [r2+r3]
- movdqu xmm1, [r4+r5]
- pavgb xmm0, xmm1
+ movdqu xmm0, [r2+r3]
+ movdqu xmm1, [r4+r5]
+ pavgb xmm0, xmm1
movdqu [r0+r1], xmm0
- movdqu xmm0, [r2+2*r3]
- movdqu xmm1, [r4+2*r5]
- pavgb xmm0, xmm1
+ movdqu xmm0, [r2+2*r3]
+ movdqu xmm1, [r4+2*r5]
+ pavgb xmm0, xmm1
movdqu [r0+2*r1], xmm0
lea r2, [r2+2*r3]
- lea r4, [r4+2*r5]
- lea r0, [r0+2*r1]
+ lea r4, [r4+2*r5]
+ lea r0, [r0+2*r1]
- movdqu xmm0, [r2+r3]
- movdqu xmm1, [r4+r5]
- pavgb xmm0, xmm1
+ movdqu xmm0, [r2+r3]
+ movdqu xmm1, [r4+r5]
+ pavgb xmm0, xmm1
movdqu [r0+r1], xmm0
lea r2, [r2+2*r3]
- lea r4, [r4+2*r5]
- lea r0, [r0+2*r1]
+ lea r4, [r4+2*r5]
+ lea r0, [r0+2*r1]
sub r6, 4
jne .height_loop
- WELSEMMS
- LOAD_7_PARA_POP
+ WELSEMMS
+ LOAD_7_PARA_POP
ret
;*******************************************************************************
@@ -497,26 +497,26 @@
; uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
WELS_EXTERN McCopyWidthEq4_mmx
- push r5
+ push r5
%assign push_num 1
LOAD_5_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
- mov r5d, [r0]
- mov [r2], r5d
+ mov r5d, [r0]
+ mov [r2], r5d
- add r0, r1
- add r2, r3
- dec r4
- jnz .height_loop
- WELSEMMS
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .height_loop
+ WELSEMMS
LOAD_5_PARA_POP
- pop r5
+ pop r5
ret
;*******************************************************************************
@@ -527,21 +527,21 @@
%assign push_num 0
LOAD_5_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
- movq mm0, [r0]
- movq [r2], mm0
- add r0, r1
- add r2, r3
- dec r4
- jnz .height_loop
+ movq mm0, [r0]
+ movq [r2], mm0
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .height_loop
- WELSEMMS
- LOAD_5_PARA_POP
+ WELSEMMS
+ LOAD_5_PARA_POP
ret
@@ -550,32 +550,32 @@
;*******************************************************************************
;read unaligned memory
%macro SSE_READ_UNA 2
- movq %1, [%2]
- movhps %1, [%2+8]
+ movq %1, [%2]
+ movhps %1, [%2+8]
%endmacro
;write unaligned memory
%macro SSE_WRITE_UNA 2
- movq [%1], %2
- movhps [%1+8], %2
+ movq [%1], %2
+ movhps [%1+8], %2
%endmacro
WELS_EXTERN McCopyWidthEq16_sse2
%assign push_num 0
LOAD_5_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
- SSE_READ_UNA xmm0, r0
- SSE_READ_UNA xmm1, r0+r1
- SSE_WRITE_UNA r2, xmm0
- SSE_WRITE_UNA r2+r3, xmm1
+ SSE_READ_UNA xmm0, r0
+ SSE_READ_UNA xmm1, r0+r1
+ SSE_WRITE_UNA r2, xmm0
+ SSE_WRITE_UNA r2+r3, xmm1
- sub r4, 2
+ sub r4, 2
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
jnz .height_loop
- LOAD_5_PARA_POP
+ LOAD_5_PARA_POP
ret
--- a/codec/common/x86/mc_chroma.asm
+++ b/codec/common/x86/mc_chroma.asm
@@ -53,10 +53,10 @@
ALIGN 16
h264_d0x20_sse2:
- dw 32,32,32,32,32,32,32,32
+ dw 32,32,32,32,32,32,32,32
ALIGN 16
h264_d0x20_mmx:
- dw 32,32,32,32
+ dw 32,32,32,32
;=============================================================================
@@ -67,152 +67,152 @@
;*******************************************************************************
; void McChromaWidthEq4_mmx( const uint8_t *src,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; const uint8_t *pABCD,
-; int32_t iHeigh );
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; const uint8_t *pABCD,
+; int32_t iHeigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
- %assign push_num 0
- LOAD_6_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
+ %assign push_num 0
+ LOAD_6_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
- movd mm3, [r4]; [eax]
- WELS_Zero mm7
- punpcklbw mm3, mm3
- movq mm4, mm3
- punpcklwd mm3, mm3
- punpckhwd mm4, mm4
+ movd mm3, [r4]; [eax]
+ WELS_Zero mm7
+ punpcklbw mm3, mm3
+ movq mm4, mm3
+ punpcklwd mm3, mm3
+ punpckhwd mm4, mm4
- movq mm5, mm3
- punpcklbw mm3, mm7
- punpckhbw mm5, mm7
+ movq mm5, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm5, mm7
- movq mm6, mm4
- punpcklbw mm4, mm7
- punpckhbw mm6, mm7
+ movq mm6, mm4
+ punpcklbw mm4, mm7
+ punpckhbw mm6, mm7
- lea r4, [r0 + r1] ;lea ebx, [esi + eax]
- movd mm0, [r0]
- movd mm1, [r0+1]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
+ lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+ movd mm0, [r0]
+ movd mm1, [r0+1]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
.xloop:
- pmullw mm0, mm3
- pmullw mm1, mm5
- paddw mm0, mm1
+ pmullw mm0, mm3
+ pmullw mm1, mm5
+ paddw mm0, mm1
- movd mm1, [r4]
- punpcklbw mm1, mm7
- movq mm2, mm1
- pmullw mm1, mm4
- paddw mm0, mm1
+ movd mm1, [r4]
+ punpcklbw mm1, mm7
+ movq mm2, mm1
+ pmullw mm1, mm4
+ paddw mm0, mm1
- movd mm1, [r4+1]
- punpcklbw mm1, mm7
- movq mm7, mm1
- pmullw mm1,mm6
- paddw mm0, mm1
- movq mm1,mm7
+ movd mm1, [r4+1]
+ punpcklbw mm1, mm7
+ movq mm7, mm1
+ pmullw mm1,mm6
+ paddw mm0, mm1
+ movq mm1,mm7
- paddw mm0, [h264_d0x20_mmx]
- psrlw mm0, 6
+ paddw mm0, [h264_d0x20_mmx]
+ psrlw mm0, 6
- WELS_Zero mm7
- packuswb mm0, mm7
- movd [r2], mm0
+ WELS_Zero mm7
+ packuswb mm0, mm7
+ movd [r2], mm0
- movq mm0, mm2
+ movq mm0, mm2
- lea r2, [r2 + r3]
- lea r4, [r4 + r1]
+ lea r2, [r2 + r3]
+ lea r4, [r4 + r1]
- dec r5
- jnz near .xloop
- WELSEMMS
- LOAD_6_PARA_POP
- ret
+ dec r5
+ jnz near .xloop
+ WELSEMMS
+ LOAD_6_PARA_POP
+ ret
;*******************************************************************************
; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; const uint8_t *pABCD,
-; int32_t iheigh );
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; const uint8_t *pABCD,
+; int32_t iheigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
- %assign push_num 0
- LOAD_6_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
- movd xmm3, [r4]
- WELS_Zero xmm7
- punpcklbw xmm3, xmm3
- punpcklwd xmm3, xmm3
+ movd xmm3, [r4]
+ WELS_Zero xmm7
+ punpcklbw xmm3, xmm3
+ punpcklwd xmm3, xmm3
- movdqa xmm4, xmm3
- punpckldq xmm3, xmm3
- punpckhdq xmm4, xmm4
- movdqa xmm5, xmm3
- movdqa xmm6, xmm4
+ movdqa xmm4, xmm3
+ punpckldq xmm3, xmm3
+ punpckhdq xmm4, xmm4
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm4
- punpcklbw xmm3, xmm7
- punpckhbw xmm5, xmm7
- punpcklbw xmm4, xmm7
- punpckhbw xmm6, xmm7
+ punpcklbw xmm3, xmm7
+ punpckhbw xmm5, xmm7
+ punpcklbw xmm4, xmm7
+ punpckhbw xmm6, xmm7
- lea r4, [r0 + r1] ;lea ebx, [esi + eax]
- movq xmm0, [r0]
- movq xmm1, [r0+1]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
+ lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+ movq xmm0, [r0]
+ movq xmm1, [r0+1]
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
.xloop:
- pmullw xmm0, xmm3
- pmullw xmm1, xmm5
- paddw xmm0, xmm1
+ pmullw xmm0, xmm3
+ pmullw xmm1, xmm5
+ paddw xmm0, xmm1
- movq xmm1, [r4]
- punpcklbw xmm1, xmm7
- movdqa xmm2, xmm1
- pmullw xmm1, xmm4
- paddw xmm0, xmm1
+ movq xmm1, [r4]
+ punpcklbw xmm1, xmm7
+ movdqa xmm2, xmm1
+ pmullw xmm1, xmm4
+ paddw xmm0, xmm1
- movq xmm1, [r4+1]
- punpcklbw xmm1, xmm7
- movdqa xmm7, xmm1
- pmullw xmm1, xmm6
- paddw xmm0, xmm1
- movdqa xmm1,xmm7
+ movq xmm1, [r4+1]
+ punpcklbw xmm1, xmm7
+ movdqa xmm7, xmm1
+ pmullw xmm1, xmm6
+ paddw xmm0, xmm1
+ movdqa xmm1,xmm7
- paddw xmm0, [h264_d0x20_sse2]
- psrlw xmm0, 6
+ paddw xmm0, [h264_d0x20_sse2]
+ psrlw xmm0, 6
- WELS_Zero xmm7
- packuswb xmm0, xmm7
- movq [r2], xmm0
+ WELS_Zero xmm7
+ packuswb xmm0, xmm7
+ movq [r2], xmm0
- movdqa xmm0, xmm2
+ movdqa xmm0, xmm2
- lea r2, [r2 + r3]
- lea r4, [r4 + r1]
+ lea r2, [r2 + r3]
+ lea r4, [r4 + r1]
- dec r5
- jnz near .xloop
+ dec r5
+ jnz near .xloop
- POP_XMM
- LOAD_6_PARA_POP
+ POP_XMM
+ LOAD_6_PARA_POP
- ret
+ ret
@@ -219,19 +219,19 @@
;***********************************************************************
; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
-; int32_t iSrcStride,
+; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; const uint8_t *pABCD,
-; int32_t iHeigh);
+; int32_t iHeigh);
;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3
- %assign push_num 0
- LOAD_6_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
pxor xmm7, xmm7
movd xmm5, [r4]
@@ -243,27 +243,27 @@
sub r2, r3 ;sub esi, edi
sub r2, r3
- movdqa xmm7, [h264_d0x20_sse2]
+ movdqa xmm7, [h264_d0x20_sse2]
- movdqu xmm0, [r0]
- movdqa xmm1, xmm0
- psrldq xmm1, 1
- punpcklbw xmm0, xmm1
+ movdqu xmm0, [r0]
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+ punpcklbw xmm0, xmm1
.hloop_chroma:
- lea r2, [r2+2*r3]
+ lea r2, [r2+2*r3]
- movdqu xmm2, [r0+r1]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm4, xmm2
+ movdqu xmm2, [r0+r1]
+ movdqa xmm3, xmm2
+ psrldq xmm3, 1
+ punpcklbw xmm2, xmm3
+ movdqa xmm4, xmm2
pmaddubsw xmm0, xmm5
pmaddubsw xmm2, xmm6
paddw xmm0, xmm2
paddw xmm0, xmm7
- psrlw xmm0, 6
+ psrlw xmm0, 6
packuswb xmm0, xmm0
movq [r2],xmm0
@@ -278,16 +278,16 @@
pmaddubsw xmm2, xmm6
paddw xmm4, xmm2
paddw xmm4, xmm7
- psrlw xmm4, 6
+ psrlw xmm4, 6
packuswb xmm4, xmm4
movq [r2+r3],xmm4
- sub r5, 2
- jnz .hloop_chroma
+ sub r5, 2
+ jnz .hloop_chroma
- POP_XMM
- LOAD_6_PARA_POP
+ POP_XMM
+ LOAD_6_PARA_POP
- ret
+ ret
--- a/codec/common/x86/mc_luma.asm
+++ b/codec/common/x86/mc_luma.asm
@@ -52,13 +52,13 @@
ALIGN 16
h264_w0x10:
- dw 16, 16, 16, 16
+ dw 16, 16, 16, 16
ALIGN 16
h264_w0x10_1:
- dw 16, 16, 16, 16, 16, 16, 16, 16
+ dw 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 16
h264_mc_hc_32:
- dw 32, 32, 32, 32, 32, 32, 32, 32
+ dw 32, 32, 32, 32, 32, 32, 32, 32
;*******************************************************************************
@@ -72,55 +72,55 @@
;*******************************************************************************
; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-; int iHeight)
+; uint8_t *pDst,
+; int iDstStride,
+; int iHeight)
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq4_mmx
%assign push_num 0
LOAD_5_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
- sub r0, 2
- WELS_Zero mm7
- movq mm6, [h264_w0x10]
+ sub r0, 2
+ WELS_Zero mm7
+ movq mm6, [h264_w0x10]
.height_loop:
- movd mm0, [r0]
- punpcklbw mm0, mm7
- movd mm1, [r0+5]
- punpcklbw mm1, mm7
- movd mm2, [r0+1]
- punpcklbw mm2, mm7
- movd mm3, [r0+4]
- punpcklbw mm3, mm7
- movd mm4, [r0+2]
- punpcklbw mm4, mm7
- movd mm5, [r0+3]
- punpcklbw mm5, mm7
+ movd mm0, [r0]
+ punpcklbw mm0, mm7
+ movd mm1, [r0+5]
+ punpcklbw mm1, mm7
+ movd mm2, [r0+1]
+ punpcklbw mm2, mm7
+ movd mm3, [r0+4]
+ punpcklbw mm3, mm7
+ movd mm4, [r0+2]
+ punpcklbw mm4, mm7
+ movd mm5, [r0+3]
+ punpcklbw mm5, mm7
- paddw mm2, mm3
- paddw mm4, mm5
- psllw mm4, 2
- psubw mm4, mm2
- paddw mm0, mm1
- paddw mm0, mm4
- psllw mm4, 2
- paddw mm0, mm4
- paddw mm0, mm6
- psraw mm0, 5
- packuswb mm0, mm7
- movd [r2], mm0
+ paddw mm2, mm3
+ paddw mm4, mm5
+ psllw mm4, 2
+ psubw mm4, mm2
+ paddw mm0, mm1
+ paddw mm0, mm4
+ psllw mm4, 2
+ paddw mm0, mm4
+ paddw mm0, mm6
+ psraw mm0, 5
+ packuswb mm0, mm7
+ movd [r2], mm0
- add r0, r1
- add r2, r3
- dec r4
- jnz .height_loop
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .height_loop
- WELSEMMS
- LOAD_5_PARA_POP
- ret
+ WELSEMMS
+ LOAD_5_PARA_POP
+ ret
;*******************************************************************************
; Macros and other preprocessor constants
@@ -128,26 +128,26 @@
%macro SSE_LOAD_8P 3
- movq %1, %3
- punpcklbw %1, %2
+ movq %1, %3
+ punpcklbw %1, %2
%endmacro
%macro FILTER_HV_W8 9
- paddw %1, %6
- movdqa %8, %3
- movdqa %7, %2
- paddw %1, [h264_w0x10_1]
- paddw %8, %4
- paddw %7, %5
- psllw %8, 2
- psubw %8, %7
- paddw %1, %8
- psllw %8, 2
- paddw %1, %8
- psraw %1, 5
- WELS_Zero %8
- packuswb %1, %8
- movq %9, %1
+ paddw %1, %6
+ movdqa %8, %3
+ movdqa %7, %2
+ paddw %1, [h264_w0x10_1]
+ paddw %8, %4
+ paddw %7, %5
+ psllw %8, 2
+ psubw %8, %7
+ paddw %1, %8
+ psllw %8, 2
+ paddw %1, %8
+ psraw %1, 5
+ WELS_Zero %8
+ packuswb %1, %8
+ movq %9, %1
%endmacro
;*******************************************************************************
@@ -159,192 +159,192 @@
;***********************************************************************
; void McHorVer22Width8HorFirst_sse2(const int16_t *pSrc,
; int16_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride
-; int32_t iHeight
+; uint8_t *pDst,
+; int32_t iDstStride
+; int32_t iHeight
; )
;***********************************************************************
WELS_EXTERN McHorVer22Width8HorFirst_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- pxor xmm7, xmm7
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ pxor xmm7, xmm7
- sub r0, r1 ;;;;;;;;need more 5 lines.
- sub r0, r1
+ sub r0, r1 ;;;;;;;;need more 5 lines.
+ sub r0, r1
.yloop_width_8:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- movdqa [r2], xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ movdqa [r2], xmm0
- add r0, r1
- add r2, r3
- dec r4
- jnz .yloop_width_8
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .yloop_width_8
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
;*******************************************************************************
; void McHorVer20WidthEq8_sse2( const uint8_t *pSrc,
; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-; int iHeight,
+; uint8_t *pDst,
+; int iDstStride,
+; int iHeight,
; );
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq8_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- lea r0, [r0-2] ;pSrc -= 2;
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ lea r0, [r0-2] ;pSrc -= 2;
- pxor xmm7, xmm7
- movdqa xmm6, [h264_w0x10_1]
+ pxor xmm7, xmm7
+ movdqa xmm6, [h264_w0x10_1]
.y_loop:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, xmm6
+ psraw xmm0, 5
- packuswb xmm0, xmm7
- movq [r2], xmm0
+ packuswb xmm0, xmm7
+ movq [r2], xmm0
- lea r2, [r2+r3]
- lea r0, [r0+r1]
- dec r4
- jnz near .y_loop
+ lea r2, [r2+r3]
+ lea r0, [r0+r1]
+ dec r4
+ jnz near .y_loop
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
;*******************************************************************************
; void McHorVer20WidthEq16_sse2( const uint8_t *pSrc,
; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-; int iHeight,
+; uint8_t *pDst,
+; int iDstStride,
+; int iHeight,
; );
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq16_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- lea r0, [r0-2] ;pSrc -= 2;
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ lea r0, [r0-2] ;pSrc -= 2;
- pxor xmm7, xmm7
- movdqa xmm6, [h264_w0x10_1]
+ pxor xmm7, xmm7
+ movdqa xmm6, [h264_w0x10_1]
.y_loop:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
- packuswb xmm0, xmm7
- movq [r2], xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, xmm6
+ psraw xmm0, 5
+ packuswb xmm0, xmm7
+ movq [r2], xmm0
- movq xmm0, [r0+8]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5+8]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1+8]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4+8]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2+8]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3+8]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0+8]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5+8]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1+8]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4+8]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2+8]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3+8]
+ punpcklbw xmm5, xmm7
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
- packuswb xmm0, xmm7
- movq [r2+8], xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, xmm6
+ psraw xmm0, 5
+ packuswb xmm0, xmm7
+ movq [r2+8], xmm0
- lea r2, [r2+r3]
- lea r0, [r0+r1]
- dec r4
- jnz near .y_loop
+ lea r2, [r2+r3]
+ lea r0, [r0+r1]
+ dec r4
+ jnz near .y_loop
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
;*******************************************************************************
@@ -355,81 +355,81 @@
; int iHeight )
;*******************************************************************************
WELS_EXTERN McHorVer02WidthEq8_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- sub r0, r1
- sub r0, r1
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ sub r0, r1
+ sub r0, r1
- WELS_Zero xmm7
+ WELS_Zero xmm7
- SSE_LOAD_8P xmm0, xmm7, [r0]
- SSE_LOAD_8P xmm1, xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm2, xmm7, [r0]
- SSE_LOAD_8P xmm3, xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm4, xmm7, [r0]
- SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+ SSE_LOAD_8P xmm0, xmm7, [r0]
+ SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm7, [r0]
+ SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm7, [r0]
+ SSE_LOAD_8P xmm5, xmm7, [r0+r1]
.start:
- FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r4
- jz near .xx_exit
+ FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r4
+ jz near .xx_exit
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm6, xmm7, [r0]
- FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
- dec r4
- jz near .xx_exit
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm6, xmm7, [r0]
+ FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+ dec r4
+ jz near .xx_exit
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm7, xmm0, [r0+r1]
- FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
- dec r4
- jz near .xx_exit
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+ FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r4
+ jz near .xx_exit
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm0, xmm1, [r0]
- FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
- dec r4
- jz near .xx_exit
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm0, xmm1, [r0]
+ FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+ dec r4
+ jz near .xx_exit
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm1, xmm2, [r0+r1]
- FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
- dec r4
- jz near .xx_exit
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+ FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+ dec r4
+ jz near .xx_exit
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm2, xmm3, [r0]
- FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
- dec r4
- jz near .xx_exit
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm3, [r0]
+ FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+ dec r4
+ jz near .xx_exit
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm3, xmm4, [r0+r1]
- FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
- dec r4
- jz near .xx_exit
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+ FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+ dec r4
+ jz near .xx_exit
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm4, xmm5, [r0]
- FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
- dec r4
- jz near .xx_exit
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm5, [r0]
+ FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+ dec r4
+ jz near .xx_exit
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm5, xmm6, [r0+r1]
- jmp near .start
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+ jmp near .start
.xx_exit:
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
;***********************************************************************
; Code
@@ -440,725 +440,725 @@
;***********************************************************************
-; void McHorVer02Height9Or17_sse2( const uint8_t *pSrc,
+; void McHorVer02Height9Or17_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
-; int32_t iWidth,
+; int32_t iWidth,
; int32_t iHeight )
;***********************************************************************
WELS_EXTERN McHorVer02Height9Or17_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
%ifndef X86_32
- push r12
- push r13
- push r14
- mov r12, r0
- mov r13, r2
- mov r14, r5
+ push r12
+ push r13
+ push r14
+ mov r12, r0
+ mov r13, r2
+ mov r14, r5
%endif
- shr r4, 3
- sub r0, r1
- sub r0, r1
+ shr r4, 3
+ sub r0, r1
+ sub r0, r1
.xloop:
- WELS_Zero xmm7
- SSE_LOAD_8P xmm0, xmm7, [r0]
- SSE_LOAD_8P xmm1, xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm2, xmm7, [r0]
- SSE_LOAD_8P xmm3, xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm4, xmm7, [r0]
- SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+ WELS_Zero xmm7
+ SSE_LOAD_8P xmm0, xmm7, [r0]
+ SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm7, [r0]
+ SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm7, [r0]
+ SSE_LOAD_8P xmm5, xmm7, [r0+r1]
- FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm6, xmm7, [r0]
- movdqa xmm0,xmm1
- movdqa xmm1,xmm2
- movdqa xmm2,xmm3
- movdqa xmm3,xmm4
- movdqa xmm4,xmm5
- movdqa xmm5,xmm6
- add r2, r3
- sub r0, r1
+ FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm6, xmm7, [r0]
+ movdqa xmm0,xmm1
+ movdqa xmm1,xmm2
+ movdqa xmm2,xmm3
+ movdqa xmm3,xmm4
+ movdqa xmm4,xmm5
+ movdqa xmm5,xmm6
+ add r2, r3
+ sub r0, r1
.start:
- FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- jz near .x_loop_dec
+ FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm6, xmm7, [r0]
- FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm6, xmm7, [r0]
+ FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm7, xmm0, [r0+r1]
- FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+ FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm0, xmm1, [r0]
- FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm0, xmm1, [r0]
+ FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm1, xmm2, [r0+r1]
- FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+ FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm2, xmm3, [r0]
- FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm3, [r0]
+ FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm3, xmm4, [r0+r1]
- FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+ FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm4, xmm5, [r0]
- FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm5, [r0]
+ FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm5, xmm6, [r0+r1]
- jmp near .start
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+ jmp near .start
.x_loop_dec:
- dec r4
- jz near .xx_exit
+ dec r4
+ jz near .xx_exit
%ifdef X86_32
- mov r0, arg1
- mov r2, arg3
- mov r5, arg6
+ mov r0, arg1
+ mov r2, arg3
+ mov r5, arg6
%else
- mov r0, r12
- mov r2, r13
- mov r5, r14
+ mov r0, r12
+ mov r2, r13
+ mov r5, r14
%endif
- sub r0, r1
- sub r0, r1
- add r0, 8
- add r2, 8
- jmp near .xloop
+ sub r0, r1
+ sub r0, r1
+ add r0, 8
+ add r2, 8
+ jmp near .xloop
.xx_exit:
%ifndef X86_32
- pop r14
- pop r13
- pop r12
+ pop r14
+ pop r13
+ pop r12
%endif
- POP_XMM
- LOAD_6_PARA_POP
- ret
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
;***********************************************************************
-; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc,
+; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight
+; uint8_t *pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight
; );
;***********************************************************************
WELS_EXTERN McHorVer20Width9Or17_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
- sub r0, 2
- pxor xmm7, xmm7
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+ sub r0, 2
+ pxor xmm7, xmm7
- cmp r4, 9
- jne near .width_17
+ cmp r4, 9
+ jne near .width_17
.yloop_width_9:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- paddw xmm0, [h264_w0x10_1]
- psraw xmm0, 5
- packuswb xmm0, xmm0
- movd [r2], xmm0
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ paddw xmm0, [h264_w0x10_1]
+ psraw xmm0, 5
+ packuswb xmm0, xmm0
+ movd [r2], xmm0
- pxor xmm7, xmm7
- movq xmm0, [r0+6]
- punpcklbw xmm0, xmm7
+ pxor xmm7, xmm7
+ movq xmm0, [r0+6]
+ punpcklbw xmm0, xmm7
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- paddw xmm2, [h264_w0x10_1]
- psraw xmm2, 5
- packuswb xmm2, xmm2
- movq [r2+1], xmm2
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ paddw xmm2, [h264_w0x10_1]
+ psraw xmm2, 5
+ packuswb xmm2, xmm2
+ movq [r2+1], xmm2
- add r0, r1
- add r2, r3
- dec r5
- jnz .yloop_width_9
- POP_XMM
- LOAD_6_PARA_POP
- ret
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_9
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
.width_17:
.yloop_width_17:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, [h264_w0x10_1]
- psraw xmm0, 5
- packuswb xmm0, xmm0
- movq [r2], xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, [h264_w0x10_1]
+ psraw xmm0, 5
+ packuswb xmm0, xmm0
+ movq [r2], xmm0
- movq xmm0, [r0+8]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5+8]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1+8]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4+8]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2+8]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3+8]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0+8]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5+8]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1+8]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4+8]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2+8]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3+8]
+ punpcklbw xmm5, xmm7
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- paddw xmm0, [h264_w0x10_1]
- psraw xmm0, 5
- packuswb xmm0, xmm0
- movd [r2+8], xmm0
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ paddw xmm0, [h264_w0x10_1]
+ psraw xmm0, 5
+ packuswb xmm0, xmm0
+ movd [r2+8], xmm0
- pxor xmm7, xmm7
- movq xmm0, [r0+6+8]
- punpcklbw xmm0, xmm7
+ pxor xmm7, xmm7
+ movq xmm0, [r0+6+8]
+ punpcklbw xmm0, xmm7
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- paddw xmm2, [h264_w0x10_1]
- psraw xmm2, 5
- packuswb xmm2, xmm2
- movq [r2+9], xmm2
- add r0, r1
- add r2, r3
- dec r5
- jnz .yloop_width_17
- POP_XMM
- LOAD_6_PARA_POP
- ret
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ paddw xmm2, [h264_w0x10_1]
+ psraw xmm2, 5
+ packuswb xmm2, xmm2
+ movq [r2+9], xmm2
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_17
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
;***********************************************************************
;void McHorVer22HorFirst_sse2
-; (const uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t * pTap,
-; int32_t iTapStride,
-; int32_t iWidth,int32_t iHeight);
+; (const uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t * pTap,
+; int32_t iTapStride,
+; int32_t iWidth,int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22HorFirst_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
- pxor xmm7, xmm7
- sub r0, r1 ;;;;;;;;need more 5 lines.
- sub r0, r1
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+ pxor xmm7, xmm7
+ sub r0, r1 ;;;;;;;;need more 5 lines.
+ sub r0, r1
- cmp r4, 9
- jne near .width_17
+ cmp r4, 9
+ jne near .width_17
.yloop_width_9:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- movd [r2], xmm0
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ movd [r2], xmm0
- pxor xmm7, xmm7
- movq xmm0, [r0+6]
- punpcklbw xmm0, xmm7
+ pxor xmm7, xmm7
+ movq xmm0, [r0+6]
+ punpcklbw xmm0, xmm7
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- movq [r2+2], xmm2
- movhps [r2+2+8], xmm2
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ movq [r2+2], xmm2
+ movhps [r2+2+8], xmm2
- add r0, r1
- add r2, r3
- dec r5
- jnz .yloop_width_9
- POP_XMM
- LOAD_6_PARA_POP
- ret
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_9
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
.width_17:
.yloop_width_17:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- movdqa [r2], xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ movdqa [r2], xmm0
- movq xmm0, [r0+8]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5+8]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1+8]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4+8]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2+8]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3+8]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0+8]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5+8]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1+8]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4+8]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2+8]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3+8]
+ punpcklbw xmm5, xmm7
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- movd [r2+16], xmm0
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ movd [r2+16], xmm0
- pxor xmm7, xmm7
- movq xmm0, [r0+6+8]
- punpcklbw xmm0, xmm7
+ pxor xmm7, xmm7
+ movq xmm0, [r0+6+8]
+ punpcklbw xmm0, xmm7
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- movq [r2+18], xmm2
- movhps [r2+18+8], xmm2
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ movq [r2+18], xmm2
+ movhps [r2+18+8], xmm2
- add r0, r1
- add r2, r3
- dec r5
- jnz .yloop_width_17
- POP_XMM
- LOAD_6_PARA_POP
- ret
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_17
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
%macro FILTER_VER 9
- paddw %1, %6
- movdqa %7, %2
- movdqa %8, %3
+ paddw %1, %6
+ movdqa %7, %2
+ movdqa %8, %3
- paddw %7, %5
- paddw %8, %4
+ paddw %7, %5
+ paddw %8, %4
- psubw %1, %7
- psraw %1, 2
- paddw %1, %8
- psubw %1, %7
- psraw %1, 2
- paddw %8, %1
- paddw %8, [h264_mc_hc_32]
- psraw %8, 6
- packuswb %8, %8
- movq %9, %8
+ psubw %1, %7
+ psraw %1, 2
+ paddw %1, %8
+ psubw %1, %7
+ psraw %1, 2
+ paddw %8, %1
+ paddw %8, [h264_mc_hc_32]
+ psraw %8, 6
+ packuswb %8, %8
+ movq %9, %8
%endmacro
;***********************************************************************
;void McHorVer22Width8VerLastAlign_sse2(
-; const uint8_t *pTap,
-; int32_t iTapStride,
-; uint8_t * pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight);
+; const uint8_t *pTap,
+; int32_t iTapStride,
+; uint8_t * pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
%ifndef X86_32
- push r12
- push r13
- push r14
- mov r12, r0
- mov r13, r2
- mov r14, r5
+ push r12
+ push r13
+ push r14
+ mov r12, r0
+ mov r13, r2
+ mov r14, r5
%endif
- shr r4, 3
+ shr r4, 3
.width_loop:
- movdqa xmm0, [r0]
- movdqa xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movdqa xmm2, [r0]
- movdqa xmm3, [r0+r1]
- lea r0, [r0+2*r1]
- movdqa xmm4, [r0]
- movdqa xmm5, [r0+r1]
+ movdqa xmm0, [r0]
+ movdqa xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqa xmm2, [r0]
+ movdqa xmm3, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqa xmm4, [r0]
+ movdqa xmm5, [r0+r1]
- FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- lea r0, [r0+2*r1]
- movdqa xmm6, [r0]
+ FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ lea r0, [r0+2*r1]
+ movdqa xmm6, [r0]
- movdqa xmm0, xmm1
- movdqa xmm1, xmm2
- movdqa xmm2, xmm3
- movdqa xmm3, xmm4
- movdqa xmm4, xmm5
- movdqa xmm5, xmm6
+ movdqa xmm0, xmm1
+ movdqa xmm1, xmm2
+ movdqa xmm2, xmm3
+ movdqa xmm3, xmm4
+ movdqa xmm4, xmm5
+ movdqa xmm5, xmm6
- add r2, r3
- sub r0, r1
+ add r2, r3
+ sub r0, r1
.start:
- FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- jz near .x_loop_dec
+ FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- movdqa xmm6, [r0]
- FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ movdqa xmm6, [r0]
+ FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- movdqa xmm7, [r0+r1]
- FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ movdqa xmm7, [r0+r1]
+ FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- movdqa xmm0, [r0]
- FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ movdqa xmm0, [r0]
+ FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0+r1]
- FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0+r1]
+ FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- movdqa xmm2, [r0]
- FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ movdqa xmm2, [r0]
+ FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- movdqa xmm3, [r0+r1]
- FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ movdqa xmm3, [r0+r1]
+ FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- movdqa xmm4, [r0]
- FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ movdqa xmm4, [r0]
+ FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- movdqa xmm5, [r0+r1]
- jmp near .start
+ lea r2, [r2+2*r3]
+ movdqa xmm5, [r0+r1]
+ jmp near .start
.x_loop_dec:
- dec r4
- jz near .exit
+ dec r4
+ jz near .exit
%ifdef X86_32
- mov r0, arg1
- mov r2, arg3
- mov r5, arg6
+ mov r0, arg1
+ mov r2, arg3
+ mov r5, arg6
%else
- mov r0, r12
- mov r2, r13
- mov r5, r14
+ mov r0, r12
+ mov r2, r13
+ mov r5, r14
%endif
- add r0, 16
- add r2, 8
- jmp .width_loop
+ add r0, 16
+ add r2, 8
+ jmp .width_loop
.exit:
%ifndef X86_32
- pop r14
- pop r13
- pop r12
+ pop r14
+ pop r13
+ pop r12
%endif
- POP_XMM
- LOAD_6_PARA_POP
- ret
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
;***********************************************************************
;void McHorVer22Width8VerLastUnAlign_sse2(
-; const uint8_t *pTap,
-; int32_t iTapStride,
-; uint8_t * pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight);
+; const uint8_t *pTap,
+; int32_t iTapStride,
+; uint8_t * pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
%ifndef X86_32
- push r12
- push r13
- push r14
- mov r12, r0
- mov r13, r2
- mov r14, r5
+ push r12
+ push r13
+ push r14
+ mov r12, r0
+ mov r13, r2
+ mov r14, r5
%endif
- shr r4, 3
+ shr r4, 3
.width_loop:
- movdqu xmm0, [r0]
- movdqu xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movdqu xmm2, [r0]
- movdqu xmm3, [r0+r1]
- lea r0, [r0+2*r1]
- movdqu xmm4, [r0]
- movdqu xmm5, [r0+r1]
+ movdqu xmm0, [r0]
+ movdqu xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqu xmm2, [r0]
+ movdqu xmm3, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqu xmm4, [r0]
+ movdqu xmm5, [r0+r1]
- FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- lea r0, [r0+2*r1]
- movdqu xmm6, [r0]
+ FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ lea r0, [r0+2*r1]
+ movdqu xmm6, [r0]
- movdqa xmm0, xmm1
- movdqa xmm1, xmm2
- movdqa xmm2, xmm3
- movdqa xmm3, xmm4
- movdqa xmm4, xmm5
- movdqa xmm5, xmm6
+ movdqa xmm0, xmm1
+ movdqa xmm1, xmm2
+ movdqa xmm2, xmm3
+ movdqa xmm3, xmm4
+ movdqa xmm4, xmm5
+ movdqa xmm5, xmm6
- add r2, r3
- sub r0, r1
+ add r2, r3
+ sub r0, r1
.start:
- FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- jz near .x_loop_dec
+ FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- movdqu xmm6, [r0]
- FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ movdqu xmm6, [r0]
+ FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- movdqu xmm7, [r0+r1]
- FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ movdqu xmm7, [r0+r1]
+ FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- movdqu xmm0, [r0]
- FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ movdqu xmm0, [r0]
+ FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- movdqu xmm1, [r0+r1]
- FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ movdqu xmm1, [r0+r1]
+ FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- movdqu xmm2, [r0]
- FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ movdqu xmm2, [r0]
+ FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- movdqu xmm3, [r0+r1]
- FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ movdqu xmm3, [r0+r1]
+ FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- movdqu xmm4, [r0]
- FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ movdqu xmm4, [r0]
+ FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- movdqu xmm5, [r0+r1]
- jmp near .start
+ lea r2, [r2+2*r3]
+ movdqu xmm5, [r0+r1]
+ jmp near .start
.x_loop_dec:
- dec r4
- jz near .exit
+ dec r4
+ jz near .exit
%ifdef X86_32
- mov r0, arg1
- mov r2, arg3
- mov r5, arg6
+ mov r0, arg1
+ mov r2, arg3
+ mov r5, arg6
%else
- mov r0, r12
- mov r2, r13
- mov r5, r14
+ mov r0, r12
+ mov r2, r13
+ mov r5, r14
%endif
- add r0, 16
- add r2, 8
- jmp .width_loop
+ add r0, 16
+ add r2, 8
+ jmp .width_loop
.exit:
%ifndef X86_32
- pop r14
- pop r13
- pop r12
+ pop r14
+ pop r13
+ pop r12
%endif
- POP_XMM
- LOAD_6_PARA_POP
- ret
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
--- a/codec/common/x86/satd_sad.asm
+++ b/codec/common/x86/satd_sad.asm
@@ -77,77 +77,77 @@
;
;***********************************************************************
%macro MMX_DW_1_2REG 2
- pxor %1, %1
- pcmpeqw %2, %2
- psubw %1, %2
+ pxor %1, %1
+ pcmpeqw %2, %2
+ psubw %1, %2
%endmacro
%macro SSE2_SumWHorizon1 2
- movdqa %2, %1
- psrldq %2, 8
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 4
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 2
- paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 8
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 4
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 2
+ paddusw %1, %2
%endmacro
%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
- SSE2_SumSub %1, %2, %5
- SSE2_SumSub %3, %4, %5
- SSE2_SumSub %2, %4, %5
- SSE2_SumSub %1, %3, %5
+ SSE2_SumSub %1, %2, %5
+ SSE2_SumSub %3, %4, %5
+ SSE2_SumSub %2, %4, %5
+ SSE2_SumSub %1, %3, %5
%endmacro
%macro SSE2_SumAbs4 7
- WELS_AbsW %1, %3
- WELS_AbsW %2, %3
- WELS_AbsW %4, %6
- WELS_AbsW %5, %6
- paddusw %1, %2
- paddusw %4, %5
- paddusw %7, %1
- paddusw %7, %4
+ WELS_AbsW %1, %3
+ WELS_AbsW %2, %3
+ WELS_AbsW %4, %6
+ WELS_AbsW %5, %6
+ paddusw %1, %2
+ paddusw %4, %5
+ paddusw %7, %1
+ paddusw %7, %4
%endmacro
%macro SSE2_SumWHorizon 3
- movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
- paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
- paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
- pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
- paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
+ movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
+ paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
+ pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
+ paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
%endmacro
%macro SSE2_GetSatd8x8 0
- SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+ SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
- SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
+ SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
- SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
+ SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
%endmacro
;***********************************************************************
@@ -156,11 +156,11 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
movd xmm0, [r0]
movd xmm1, [r0+r1]
lea r0 , [r0+2*r1]
@@ -199,14 +199,14 @@
punpcklwd xmm0, xmm4
punpckhwd xmm4, xmm2
- SSE2_XSawp dq, xmm0, xmm4, xmm3
- SSE2_XSawp qdq, xmm0, xmm3, xmm5
+ SSE2_XSawp dq, xmm0, xmm4, xmm3
+ SSE2_XSawp qdq, xmm0, xmm3, xmm5
movdqa xmm7, xmm0
paddw xmm0, xmm5
psubw xmm7, xmm5
- SSE2_XSawp qdq, xmm0, xmm7, xmm1
+ SSE2_XSawp qdq, xmm0, xmm7, xmm1
movdqa xmm2, xmm0
paddw xmm0, xmm1
@@ -214,15 +214,15 @@
WELS_AbsW xmm0, xmm3
paddusw xmm6, xmm0
- WELS_AbsW xmm2, xmm4
+ WELS_AbsW xmm2, xmm4
paddusw xmm6, xmm2
SSE2_SumWHorizon1 xmm6, xmm4
- movd retrd, xmm6
+ movd retrd, xmm6
and retrd, 0xffff
shr retrd, 1
- POP_XMM
- LOAD_4_PARA_POP
- ret
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
;***********************************************************************
;
@@ -230,20 +230,20 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm6, xmm6
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm6, xmm6
pxor xmm7, xmm7
SSE2_GetSatd8x8
psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- POP_XMM
- LOAD_4_PARA_POP
- ret
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
;***********************************************************************
;
@@ -251,25 +251,25 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm6, xmm6
- pxor xmm7, xmm7
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
- SSE2_GetSatd8x8
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSatd8x8
+ SSE2_GetSatd8x8
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- POP_XMM
- LOAD_4_PARA_POP
- ret
+ psrlw xmm6, 1
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
;***********************************************************************
;
@@ -277,30 +277,30 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- push r0
- push r2
- pxor xmm6, xmm6
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ push r0
+ push r2
+ pxor xmm6, xmm6
pxor xmm7, xmm7
- SSE2_GetSatd8x8
+ SSE2_GetSatd8x8
- pop r2
- pop r0
+ pop r2
+ pop r0
add r0, 8
add r2, 8
- SSE2_GetSatd8x8
+ SSE2_GetSatd8x8
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- POP_XMM
- LOAD_4_PARA_POP
- ret
+ psrlw xmm6, 1
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
;***********************************************************************
;
@@ -308,38 +308,38 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x16_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- push r0
- push r2
- pxor xmm6, xmm6
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ push r0
+ push r2
+ pxor xmm6, xmm6
pxor xmm7, xmm7
- SSE2_GetSatd8x8
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSatd8x8
+ SSE2_GetSatd8x8
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8
- pop r2
- pop r0
- add r0, 8
- add r2, 8
+ pop r2
+ pop r0
+ add r0, 8
+ add r2, 8
- SSE2_GetSatd8x8
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSatd8x8
+ SSE2_GetSatd8x8
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8
; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- POP_XMM
- LOAD_4_PARA_POP
- ret
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
;***********************************************************************
;
@@ -355,9 +355,9 @@
%macro SSE_DB_1_2REG 2
- pxor %1, %1
- pcmpeqw %2, %2
- psubb %1, %2
+ pxor %1, %1
+ pcmpeqw %2, %2
+ psubb %1, %2
%endmacro
;***********************************************************************
@@ -369,668 +369,668 @@
WELS_EXTERN WelsSampleSatdThree4x4_sse2
%ifdef X86_32
- push r3
- push r4
- push r5
- push r6
- %assign push_num 4
+ push r3
+ push r4
+ push r5
+ push r6
+ %assign push_num 4
%else
- %assign push_num 0
+ %assign push_num 0
%endif
- PUSH_XMM 8
+ PUSH_XMM 8
- mov r2, arg3
- mov r3, arg4
- SIGN_EXTENSION r3, r3d
+ mov r2, arg3
+ mov r3, arg4
+ SIGN_EXTENSION r3, r3d
- ; load source 4x4 samples and Hadamard transform
- movd xmm0, [r2]
- movd xmm1, [r2+r3]
- lea r2 , [r2+2*r3]
- movd xmm2, [r2]
- movd xmm3, [r2+r3]
- punpckldq xmm0, xmm2
- punpckldq xmm1, xmm3
+ ; load source 4x4 samples and Hadamard transform
+ movd xmm0, [r2]
+ movd xmm1, [r2+r3]
+ lea r2 , [r2+2*r3]
+ movd xmm2, [r2]
+ movd xmm3, [r2+r3]
+ punpckldq xmm0, xmm2
+ punpckldq xmm1, xmm3
- pxor xmm6, xmm6
- punpcklbw xmm0, xmm6
- punpcklbw xmm1, xmm6
+ pxor xmm6, xmm6
+ punpcklbw xmm0, xmm6
+ punpcklbw xmm1, xmm6
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
- SSE2_XSawp qdq, xmm0, xmm2, xmm3
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+ SSE2_XSawp qdq, xmm0, xmm2, xmm3
- movdqa xmm4, xmm0
- paddw xmm0, xmm3
- psubw xmm4, xmm3
+ movdqa xmm4, xmm0
+ paddw xmm0, xmm3
+ psubw xmm4, xmm3
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm4
- punpckhwd xmm4, xmm2
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm4, xmm2
- SSE2_XSawp dq, xmm0, xmm4, xmm3
- SSE2_XSawp qdq, xmm0, xmm3, xmm5
+ SSE2_XSawp dq, xmm0, xmm4, xmm3
+ SSE2_XSawp qdq, xmm0, xmm3, xmm5
- movdqa xmm7, xmm0
- paddw xmm0, xmm5
- psubw xmm7, xmm5
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm5
+ psubw xmm7, xmm5
- SSE2_XSawp qdq, xmm0, xmm7, xmm1
+ SSE2_XSawp qdq, xmm0, xmm7, xmm1
- ; Hadamard transform results are saved in xmm0 and xmm2
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
+ ; Hadamard transform results are saved in xmm0 and xmm2
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
- ;load top boundary samples: [a b c d]
- mov r0, arg1
- mov r1, arg2
- SIGN_EXTENSION r1, r1d
- sub r0, r1
+ ;load top boundary samples: [a b c d]
+ mov r0, arg1
+ mov r1, arg2
+ SIGN_EXTENSION r1, r1d
+ sub r0, r1
%ifdef UNIX64
- push r4
- push r5
+ push r4
+ push r5
%endif
- movzx r2d, byte [r0]
- movzx r3d, byte [r0+1]
- movzx r4d, byte [r0+2]
- movzx r5d, byte [r0+3]
+ movzx r2d, byte [r0]
+ movzx r3d, byte [r0+1]
+ movzx r4d, byte [r0+2]
+ movzx r5d, byte [r0+3]
- ; get the transform results of top boundary samples: [a b c d]
- add r3d, r2d ; r3d = a + b
- add r5d, r4d ; r5d = c + d
- add r2d, r2d ; r2d = a + a
- add r4d, r4d ; r4d = c + c
- sub r2d, r3d ; r2d = a + a - a - b = a - b
- sub r4d, r5d ; r4d = c + c - c - d = c - d
- add r5d, r3d ; r5d = (a + b) + (c + d)
- add r3d, r3d
- sub r3d, r5d ; r3d = (a + b) - (c + d)
- add r4d, r2d ; r4d = (a - b) + (c - d)
- add r2d, r2d
- sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
+ ; get the transform results of top boundary samples: [a b c d]
+ add r3d, r2d ; r3d = a + b
+ add r5d, r4d ; r5d = c + d
+ add r2d, r2d ; r2d = a + a
+ add r4d, r4d ; r4d = c + c
+ sub r2d, r3d ; r2d = a + a - a - b = a - b
+ sub r4d, r5d ; r4d = c + c - c - d = c - d
+ add r5d, r3d ; r5d = (a + b) + (c + d)
+ add r3d, r3d
+ sub r3d, r5d ; r3d = (a + b) - (c + d)
+ add r4d, r2d ; r4d = (a - b) + (c - d)
+ add r2d, r2d
+ sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
- movdqa xmm6, xmm0
- movdqa xmm7, xmm2
- movd xmm5, r5d ; store the edi for DC mode
- pxor xmm3, xmm3
- pxor xmm4, xmm4
- pinsrw xmm3, r5d, 0
- pinsrw xmm3, r4d, 4
- psllw xmm3, 2
- pinsrw xmm4, r3d, 0
- pinsrw xmm4, r2d, 4
- psllw xmm4, 2
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm2
+ movd xmm5, r5d ; store the edi for DC mode
+ pxor xmm3, xmm3
+ pxor xmm4, xmm4
+ pinsrw xmm3, r5d, 0
+ pinsrw xmm3, r4d, 4
+ psllw xmm3, 2
+ pinsrw xmm4, r3d, 0
+ pinsrw xmm4, r2d, 4
+ psllw xmm4, 2
- ; get the satd of H
- psubw xmm0, xmm3
- psubw xmm2, xmm4
+ ; get the satd of H
+ psubw xmm0, xmm3
+ psubw xmm2, xmm4
- WELS_AbsW xmm0, xmm1
- WELS_AbsW xmm2, xmm1
- paddusw xmm0, xmm2
- SSE2_SumWHorizon1 xmm0, xmm1 ; satd of V is stored in xmm0
+ WELS_AbsW xmm0, xmm1
+ WELS_AbsW xmm2, xmm1
+ paddusw xmm0, xmm2
+ SSE2_SumWHorizon1 xmm0, xmm1 ; satd of V is stored in xmm0
- ;load left boundary samples: [a b c d]'
- add r0, r1
+ ;load left boundary samples: [a b c d]'
+ add r0, r1
- movzx r2d, byte [r0-1]
- movzx r3d, byte [r0+r1-1]
- lea r0 , [r0+2*r1]
- movzx r4d, byte [r0-1]
- movzx r5d, byte [r0+r1-1]
+ movzx r2d, byte [r0-1]
+ movzx r3d, byte [r0+r1-1]
+ lea r0 , [r0+2*r1]
+ movzx r4d, byte [r0-1]
+ movzx r5d, byte [r0+r1-1]
- ; get the transform results of left boundary samples: [a b c d]'
- add r3d, r2d ; r3d = a + b
- add r5d, r4d ; r5d = c + d
- add r2d, r2d ; r2d = a + a
- add r4d, r4d ; r4d = c + c
- sub r2d, r3d ; r2d = a + a - a - b = a - b
- sub r4d, r5d ; r4d = c + c - c - d = c - d
- add r5d, r3d ; r5d = (a + b) + (c + d)
- add r3d, r3d
- sub r3d, r5d ; r3d = (a + b) - (c + d)
- add r4d, r2d ; r4d = (a - b) + (c - d)
- add r2d, r2d
- sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
+ ; get the transform results of left boundary samples: [a b c d]'
+ add r3d, r2d ; r3d = a + b
+ add r5d, r4d ; r5d = c + d
+ add r2d, r2d ; r2d = a + a
+ add r4d, r4d ; r4d = c + c
+ sub r2d, r3d ; r2d = a + a - a - b = a - b
+ sub r4d, r5d ; r4d = c + c - c - d = c - d
+ add r5d, r3d ; r5d = (a + b) + (c + d)
+ add r3d, r3d
+ sub r3d, r5d ; r3d = (a + b) - (c + d)
+ add r4d, r2d ; r4d = (a - b) + (c - d)
+ add r2d, r2d
+ sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
- ; store the transform results in xmm3
- movd xmm3, r5d
- pinsrw xmm3, r3d, 1
- pinsrw xmm3, r2d, 2
- pinsrw xmm3, r4d, 3
- psllw xmm3, 2
+ ; store the transform results in xmm3
+ movd xmm3, r5d
+ pinsrw xmm3, r3d, 1
+ pinsrw xmm3, r2d, 2
+ pinsrw xmm3, r4d, 3
+ psllw xmm3, 2
- ; get the satd of V
- movdqa xmm2, xmm6
- movdqa xmm4, xmm7
- psubw xmm2, xmm3
- WELS_AbsW xmm2, xmm1
- WELS_AbsW xmm4, xmm1
- paddusw xmm2, xmm4
- SSE2_SumWHorizon1 xmm2, xmm1 ; satd of H is stored in xmm2
+ ; get the satd of V
+ movdqa xmm2, xmm6
+ movdqa xmm4, xmm7
+ psubw xmm2, xmm3
+ WELS_AbsW xmm2, xmm1
+ WELS_AbsW xmm4, xmm1
+ paddusw xmm2, xmm4
+ SSE2_SumWHorizon1 xmm2, xmm1 ; satd of H is stored in xmm2
- ; DC result is stored in xmm1
- add r5d, 4
- movd xmm1, r5d
- paddw xmm1, xmm5
- psrlw xmm1, 3
- movdqa xmm5, xmm1
- psllw xmm1, 4
+ ; DC result is stored in xmm1
+ add r5d, 4
+ movd xmm1, r5d
+ paddw xmm1, xmm5
+ psrlw xmm1, 3
+ movdqa xmm5, xmm1
+ psllw xmm1, 4
- ; get the satd of DC
- psubw xmm6, xmm1
- WELS_AbsW xmm6, xmm1
- WELS_AbsW xmm7, xmm1
- paddusw xmm6, xmm7
- SSE2_SumWHorizon1 xmm6, xmm1 ; satd of DC is stored in xmm6
+ ; get the satd of DC
+ psubw xmm6, xmm1
+ WELS_AbsW xmm6, xmm1
+ WELS_AbsW xmm7, xmm1
+ paddusw xmm6, xmm7
+ SSE2_SumWHorizon1 xmm6, xmm1 ; satd of DC is stored in xmm6
%ifdef UNIX64
- pop r5
- pop r4
+ pop r5
+ pop r4
%endif
- ; comparing order: DC H V
+ ; comparing order: DC H V
- mov r4, arg5
- movd r2d, xmm6
- movd r3d, xmm2
- movd r6d, xmm0
+ mov r4, arg5
+ movd r2d, xmm6
+ movd r3d, xmm2
+ movd r6d, xmm0
- and r2d, 0xffff
- shr r2d, 1
- and r3d, 0xffff
- shr r3d, 1
- and r6d, 0xffff
- shr r6d, 1
- add r2d, dword arg7
- add r3d, dword arg8
- add r6d, dword arg9
- cmp r2w, r3w
- jg near not_dc
- cmp r2w, r6w
- jg near not_dc_h
+ and r2d, 0xffff
+ shr r2d, 1
+ and r3d, 0xffff
+ shr r3d, 1
+ and r6d, 0xffff
+ shr r6d, 1
+ add r2d, dword arg7
+ add r3d, dword arg8
+ add r6d, dword arg9
+ cmp r2w, r3w
+ jg near not_dc
+ cmp r2w, r6w
+ jg near not_dc_h
- ; for DC mode
- movd r3d, xmm5
- imul r3d, 0x01010101
- movd xmm5, r3d
- pshufd xmm5, xmm5, 0
- movdqa [r4], xmm5
- mov r5, arg6
- mov dword [r5], 0x02
- mov retrd, r2d
- POP_XMM
+ ; for DC mode
+ movd r3d, xmm5
+ imul r3d, 0x01010101
+ movd xmm5, r3d
+ pshufd xmm5, xmm5, 0
+ movdqa [r4], xmm5
+ mov r5, arg6
+ mov dword [r5], 0x02
+ mov retrd, r2d
+ POP_XMM
%ifdef X86_32
- pop r6
- pop r5
- pop r4
- pop r3
+ pop r6
+ pop r5
+ pop r4
+ pop r3
%endif
- ret
+ ret
not_dc:
- cmp r3w, r6w
- jg near not_dc_h
+ cmp r3w, r6w
+ jg near not_dc_h
- ; for H mode
- SSE_DB_1_2REG xmm6, xmm7
- sub r0, r1
- sub r0, r1
- movzx r6d, byte [r0-1]
- movd xmm0, r6d
- pmuludq xmm0, xmm6
+ ; for H mode
+ SSE_DB_1_2REG xmm6, xmm7
+ sub r0, r1
+ sub r0, r1
+ movzx r6d, byte [r0-1]
+ movd xmm0, r6d
+ pmuludq xmm0, xmm6
- movzx r6d, byte [r0+r1-1]
- movd xmm1, r6d
- pmuludq xmm1, xmm6
- punpckldq xmm0, xmm1
+ movzx r6d, byte [r0+r1-1]
+ movd xmm1, r6d
+ pmuludq xmm1, xmm6
+ punpckldq xmm0, xmm1
- lea r0, [r0+r1*2]
- movzx r6d, byte [r0-1]
- movd xmm2, r6d
- pmuludq xmm2, xmm6
+ lea r0, [r0+r1*2]
+ movzx r6d, byte [r0-1]
+ movd xmm2, r6d
+ pmuludq xmm2, xmm6
- movzx r6d, byte [r0+r1-1]
- movd xmm3, r6d
- pmuludq xmm3, xmm6
- punpckldq xmm2, xmm3
- punpcklqdq xmm0, xmm2
+ movzx r6d, byte [r0+r1-1]
+ movd xmm3, r6d
+ pmuludq xmm3, xmm6
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
- movdqa [r4],xmm0
+ movdqa [r4],xmm0
- mov retrd, r3d
- mov r5, arg6
- mov dword [r5], 0x01
- POP_XMM
+ mov retrd, r3d
+ mov r5, arg6
+ mov dword [r5], 0x01
+ POP_XMM
%ifdef X86_32
- pop r6
- pop r5
- pop r4
- pop r3
+ pop r6
+ pop r5
+ pop r4
+ pop r3
%endif
- ret
+ ret
not_dc_h:
- sub r0, r1
- sub r0, r1
- sub r0, r1
- movd xmm0, [r0]
- pshufd xmm0, xmm0, 0
- movdqa [r4],xmm0
- mov retrd, r6d
- mov r5, arg6
- mov dword [r5], 0x00
- POP_XMM
+ sub r0, r1
+ sub r0, r1
+ sub r0, r1
+ movd xmm0, [r0]
+ pshufd xmm0, xmm0, 0
+ movdqa [r4],xmm0
+ mov retrd, r6d
+ mov r5, arg6
+ mov dword [r5], 0x00
+ POP_XMM
%ifdef X86_32
- pop r6
- pop r5
- pop r4
- pop r3
+ pop r6
+ pop r5
+ pop r4
+ pop r3
%endif
- ret
+ ret
%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
- pmaddubsw %1, xmm5
- movdqa %2, %1
- pmaddwd %1, xmm7
- pmaddwd %2, xmm6
- movdqa %3, %1
- punpckldq %1, %2
- punpckhdq %2, %3
- movdqa %3, %1
- punpcklqdq %1, %2
- punpckhqdq %3, %2
- paddd xmm4, %1 ;for dc
- paddd xmm4, %3 ;for dc
- packssdw %1, %3
- psllw %1, 2
+ pmaddubsw %1, xmm5
+ movdqa %2, %1
+ pmaddwd %1, xmm7
+ pmaddwd %2, xmm6
+ movdqa %3, %1
+ punpckldq %1, %2
+ punpckhdq %2, %3
+ movdqa %3, %1
+ punpcklqdq %1, %2
+ punpckhqdq %3, %2
+ paddd xmm4, %1 ;for dc
+ paddd xmm4, %3 ;for dc
+ packssdw %1, %3
+ psllw %1, 2
%endmacro
%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
- pmaddubsw %1, xmm5
- movdqa %2, %1
- pmaddwd %1, xmm7
- pmaddwd %2, xmm6
- movdqa %3, %1
- punpckldq %1, %2
- punpckhdq %2, %3
- movdqa %3, %1
- punpcklqdq %1, %2
- punpckhqdq %3, %2
+ pmaddubsw %1, xmm5
+ movdqa %2, %1
+ pmaddwd %1, xmm7
+ pmaddwd %2, xmm6
+ movdqa %3, %1
+ punpckldq %1, %2
+ punpckhdq %2, %3
+ movdqa %3, %1
+ punpcklqdq %1, %2
+ punpckhqdq %3, %2
; paddd xmm4, %1 ;for dc
-; paddd xmm4, %3 ;for dc
- movdqa %4, %1
- punpcklqdq %4, %3
- packssdw %1, %3
- psllw %1, 2
+; paddd xmm4, %3 ;for dc
+ movdqa %4, %1
+ punpcklqdq %4, %3
+ packssdw %1, %3
+ psllw %1, 2
%endmacro
%macro SSE41_GetX38x4SatdDec 0
- pxor xmm7, xmm7
- movq xmm0, [r2]
- movq xmm1, [r2+r3]
- lea r2, [r2+2*r3]
- movq xmm2, [r2]
- movq xmm3, [r2+r3]
- lea r2, [r2+2*r3]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
- ;doesn't need another transpose
+ pxor xmm7, xmm7
+ movq xmm0, [r2]
+ movq xmm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq xmm2, [r2]
+ movq xmm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
+ punpcklbw xmm2, xmm7
+ punpcklbw xmm3, xmm7
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
+ ;doesn't need another transpose
%endmacro
%macro SSE41_GetX38x4SatdV 2
- pxor xmm0, xmm0
- pinsrw xmm0, word[r6+%2], 0
- pinsrw xmm0, word[r6+%2+8], 4
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[r6+%2+2], 0
- pinsrw xmm0, word[r6+%2+10], 4
- psubsw xmm0, xmm1
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[r6+%2+4], 0
- pinsrw xmm0, word[r6+%2+12], 4
- psubsw xmm0, xmm3
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[r6+%2+6], 0
- pinsrw xmm0, word[r6+%2+14], 4
- psubsw xmm0, xmm2
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[r6+%2], 0
+ pinsrw xmm0, word[r6+%2+8], 4
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[r6+%2+2], 0
+ pinsrw xmm0, word[r6+%2+10], 4
+ psubsw xmm0, xmm1
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[r6+%2+4], 0
+ pinsrw xmm0, word[r6+%2+12], 4
+ psubsw xmm0, xmm3
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[r6+%2+6], 0
+ pinsrw xmm0, word[r6+%2+14], 4
+ psubsw xmm0, xmm2
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
%endmacro
%macro SSE41_GetX38x4SatdH 3
- movq xmm0, [r6+%3+8*%1]
- punpcklqdq xmm0, xmm0
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm5, xmm0
- pabsw xmm1, xmm1
- pabsw xmm2, xmm2
- pabsw xmm3, xmm3
- paddw xmm2, xmm1;for DC
- paddw xmm2, xmm3;for DC
- paddw xmm5, xmm2
+ movq xmm0, [r6+%3+8*%1]
+ punpcklqdq xmm0, xmm0
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm5, xmm0
+ pabsw xmm1, xmm1
+ pabsw xmm2, xmm2
+ pabsw xmm3, xmm3
+ paddw xmm2, xmm1;for DC
+ paddw xmm2, xmm3;for DC
+ paddw xmm5, xmm2
%endmacro
%macro SSE41_I16X16GetX38x4SatdDC 0
- pxor xmm0, xmm0
- movq2dq xmm0, mm4
- punpcklqdq xmm0, xmm0
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm6, xmm0
- paddw xmm6, xmm2
+ pxor xmm0, xmm0
+ movq2dq xmm0, mm4
+ punpcklqdq xmm0, xmm0
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm6, xmm0
+ paddw xmm6, xmm2
%endmacro
%macro SSE41_ChromaGetX38x4SatdDC 1
- shl %1, 4
- movdqa xmm0, [r6+32+%1]
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm6, xmm0
- paddw xmm6, xmm2
+ shl %1, 4
+ movdqa xmm0, [r6+32+%1]
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm6, xmm0
+ paddw xmm6, xmm2
%endmacro
%macro SSE41_I16x16GetX38x4Satd 2
- SSE41_GetX38x4SatdDec
- SSE41_GetX38x4SatdV %1, %2
- SSE41_GetX38x4SatdH %1, %2, 32
- SSE41_I16X16GetX38x4SatdDC
+ SSE41_GetX38x4SatdDec
+ SSE41_GetX38x4SatdV %1, %2
+ SSE41_GetX38x4SatdH %1, %2, 32
+ SSE41_I16X16GetX38x4SatdDC
%endmacro
%macro SSE41_ChromaGetX38x4Satd 2
- SSE41_GetX38x4SatdDec
- SSE41_GetX38x4SatdV %1, %2
- SSE41_GetX38x4SatdH %1, %2, 16
- SSE41_ChromaGetX38x4SatdDC %1
+ SSE41_GetX38x4SatdDec
+ SSE41_GetX38x4SatdV %1, %2
+ SSE41_GetX38x4SatdH %1, %2, 16
+ SSE41_ChromaGetX38x4SatdDC %1
%endmacro
%macro SSE41_HSum8W 3
- pmaddwd %1, %2
- movhlps %3, %1
- paddd %1, %3
- pshuflw %3, %1,0Eh
- paddd %1, %3
+ pmaddwd %1, %2
+ movhlps %3, %1
+ paddd %1, %3
+ pshuflw %3, %1,0Eh
+ paddd %1, %3
%endmacro
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
- %assign push_num 0
- LOAD_7_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
+ %assign push_num 0
+ LOAD_7_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
%ifndef X86_32
- push r12
- mov r12, r2
+ push r12
+ mov r12, r2
%endif
- pxor xmm4, xmm4
- movdqa xmm5, [HSumSubDB1]
- movdqa xmm6, [HSumSubDW1]
- movdqa xmm7, [PDW1]
- sub r0, r1
- movdqu xmm0, [r0]
- movhlps xmm1, xmm0
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
- SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
- movdqa [r6], xmm0 ;V
- movdqa [r6+16], xmm1
- add r0, r1
- pinsrb xmm0, byte[r0-1], 0
- pinsrb xmm0, byte[r0+r1-1], 1
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 2
- pinsrb xmm0, byte[r0+r1-1], 3
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 4
- pinsrb xmm0, byte[r0+r1-1], 5
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 6
- pinsrb xmm0, byte[r0+r1-1], 7
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 8
- pinsrb xmm0, byte[r0+r1-1], 9
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 10
- pinsrb xmm0, byte[r0+r1-1], 11
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 12
- pinsrb xmm0, byte[r0+r1-1], 13
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 14
- pinsrb xmm0, byte[r0+r1-1], 15
- movhlps xmm1, xmm0
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
- SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
- movdqa [r6+32], xmm0 ;H
- movdqa [r6+48], xmm1
- movd r0d, xmm4 ;dc
- add r0d, 16 ;(sum+16)
- shr r0d, 5 ;((sum+16)>>5)
- shl r0d, 4 ;
- movd mm4, r0d ; mm4 copy DC
- pxor xmm4, xmm4 ;V
- pxor xmm5, xmm5 ;H
- pxor xmm6, xmm6 ;DC
+ pxor xmm4, xmm4
+ movdqa xmm5, [HSumSubDB1]
+ movdqa xmm6, [HSumSubDW1]
+ movdqa xmm7, [PDW1]
+ sub r0, r1
+ movdqu xmm0, [r0]
+ movhlps xmm1, xmm0
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+ SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+ movdqa [r6], xmm0 ;V
+ movdqa [r6+16], xmm1
+ add r0, r1
+ pinsrb xmm0, byte[r0-1], 0
+ pinsrb xmm0, byte[r0+r1-1], 1
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 2
+ pinsrb xmm0, byte[r0+r1-1], 3
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 4
+ pinsrb xmm0, byte[r0+r1-1], 5
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 6
+ pinsrb xmm0, byte[r0+r1-1], 7
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 8
+ pinsrb xmm0, byte[r0+r1-1], 9
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 10
+ pinsrb xmm0, byte[r0+r1-1], 11
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 12
+ pinsrb xmm0, byte[r0+r1-1], 13
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 14
+ pinsrb xmm0, byte[r0+r1-1], 15
+ movhlps xmm1, xmm0
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+ SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+ movdqa [r6+32], xmm0 ;H
+ movdqa [r6+48], xmm1
+ movd r0d, xmm4 ;dc
+ add r0d, 16 ;(sum+16)
+ shr r0d, 5 ;((sum+16)>>5)
+ shl r0d, 4 ;
+ movd mm4, r0d ; mm4 copy DC
+ pxor xmm4, xmm4 ;V
+ pxor xmm5, xmm5 ;H
+ pxor xmm6, xmm6 ;DC
%ifdef UNIX64
- push r4
+ push r4
%endif
- mov r0, 0
- mov r4, 0
+ mov r0, 0
+ mov r4, 0
.loop16x16_get_satd:
.loopStart1:
- SSE41_I16x16GetX38x4Satd r0, r4
- inc r0
- cmp r0, 4
- jl .loopStart1
- cmp r4, 16
- je .loop16x16_get_satd_end
+ SSE41_I16x16GetX38x4Satd r0, r4
+ inc r0
+ cmp r0, 4
+ jl .loopStart1
+ cmp r4, 16
+ je .loop16x16_get_satd_end
%ifdef X86_32
- mov r2, arg3
+ mov r2, arg3
%else
- mov r2, r12
+ mov r2, r12
%endif
- add r2, 8
- mov r0, 0
- add r4, 16
- jmp .loop16x16_get_satd
+ add r2, 8
+ mov r0, 0
+ add r4, 16
+ jmp .loop16x16_get_satd
.loop16x16_get_satd_end:
- MMX_DW_1_2REG xmm0, xmm1
- psrlw xmm4, 1 ;/2
- psrlw xmm5, 1 ;/2
- psrlw xmm6, 1 ;/2
- SSE41_HSum8W xmm4, xmm0, xmm1
- SSE41_HSum8W xmm5, xmm0, xmm1
- SSE41_HSum8W xmm6, xmm0, xmm1
+ MMX_DW_1_2REG xmm0, xmm1
+ psrlw xmm4, 1 ;/2
+ psrlw xmm5, 1 ;/2
+ psrlw xmm6, 1 ;/2
+ SSE41_HSum8W xmm4, xmm0, xmm1
+ SSE41_HSum8W xmm5, xmm0, xmm1
+ SSE41_HSum8W xmm6, xmm0, xmm1
%ifdef UNIX64
- pop r4
+ pop r4
%endif
- ; comparing order: DC H V
- movd r3d, xmm6 ;DC
- movd r1d, xmm5 ;H
- movd r0d, xmm4 ;V
+ ; comparing order: DC H V
+ movd r3d, xmm6 ;DC
+ movd r1d, xmm5 ;H
+ movd r0d, xmm4 ;V
%ifndef X86_32
- pop r12
+ pop r12
%endif
- shl r5d, 1
- add r1d, r5d
- add r3d, r5d
- mov r4, arg5
- cmp r3d, r1d
- jge near not_dc_16x16
- cmp r3d, r0d
- jge near not_dc_h_16x16
+ shl r5d, 1
+ add r1d, r5d
+ add r3d, r5d
+ mov r4, arg5
+ cmp r3d, r1d
+ jge near not_dc_16x16
+ cmp r3d, r0d
+ jge near not_dc_h_16x16
- ; for DC mode
- mov dword[r4], 2;I16_PRED_DC
- mov retrd, r3d
- jmp near return_satd_intra_16x16_x3
+ ; for DC mode
+ mov dword[r4], 2;I16_PRED_DC
+ mov retrd, r3d
+ jmp near return_satd_intra_16x16_x3
not_dc_16x16:
- ; for H mode
- cmp r1d, r0d
- jge near not_dc_h_16x16
- mov dword[r4], 1;I16_PRED_H
- mov retrd, r1d
- jmp near return_satd_intra_16x16_x3
+ ; for H mode
+ cmp r1d, r0d
+ jge near not_dc_h_16x16
+ mov dword[r4], 1;I16_PRED_H
+ mov retrd, r1d
+ jmp near return_satd_intra_16x16_x3
not_dc_h_16x16:
- ; for V mode
- mov dword[r4], 0;I16_PRED_V
- mov retrd, r0d
+ ; for V mode
+ mov dword[r4], 0;I16_PRED_V
+ mov retrd, r0d
return_satd_intra_16x16_x3:
- WELSEMMS
- POP_XMM
- LOAD_7_PARA_POP
+ WELSEMMS
+ POP_XMM
+ LOAD_7_PARA_POP
ret
%macro SSE41_ChromaGetX38x8Satd 0
- movdqa xmm5, [HSumSubDB1]
- movdqa xmm6, [HSumSubDW1]
- movdqa xmm7, [PDW1]
- sub r0, r1
- movq xmm0, [r0]
- punpcklqdq xmm0, xmm0
- SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
- movdqa [r6], xmm0 ;V
- add r0, r1
- pinsrb xmm0, byte[r0-1], 0
- pinsrb xmm0, byte[r0+r1-1], 1
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 2
- pinsrb xmm0, byte[r0+r1-1], 3
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 4
- pinsrb xmm0, byte[r0+r1-1], 5
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 6
- pinsrb xmm0, byte[r0+r1-1], 7
- punpcklqdq xmm0, xmm0
- SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
- movdqa [r6+16], xmm0 ;H
+ movdqa xmm5, [HSumSubDB1]
+ movdqa xmm6, [HSumSubDW1]
+ movdqa xmm7, [PDW1]
+ sub r0, r1
+ movq xmm0, [r0]
+ punpcklqdq xmm0, xmm0
+ SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+ movdqa [r6], xmm0 ;V
+ add r0, r1
+ pinsrb xmm0, byte[r0-1], 0
+ pinsrb xmm0, byte[r0+r1-1], 1
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 2
+ pinsrb xmm0, byte[r0+r1-1], 3
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 4
+ pinsrb xmm0, byte[r0+r1-1], 5
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 6
+ pinsrb xmm0, byte[r0+r1-1], 7
+ punpcklqdq xmm0, xmm0
+ SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+ movdqa [r6+16], xmm0 ;H
;(sum+2)>>2
- movdqa xmm6, [PDQ2]
- movdqa xmm5, xmm4
- punpckhqdq xmm5, xmm1
- paddd xmm5, xmm6
- psrld xmm5, 2
+ movdqa xmm6, [PDQ2]
+ movdqa xmm5, xmm4
+ punpckhqdq xmm5, xmm1
+ paddd xmm5, xmm6
+ psrld xmm5, 2
;(sum1+sum2+4)>>3
- paddd xmm6, xmm6
- paddd xmm4, xmm1
- paddd xmm4, xmm6
- psrld xmm4, 3
+ paddd xmm6, xmm6
+ paddd xmm4, xmm1
+ paddd xmm4, xmm6
+ psrld xmm4, 3
;satd *16
- pslld xmm5, 4
- pslld xmm4, 4
+ pslld xmm5, 4
+ pslld xmm4, 4
;temp satd
- movdqa xmm6, xmm4
- punpcklqdq xmm4, xmm5
- psllq xmm4, 32
- psrlq xmm4, 32
- movdqa [r6+32], xmm4
- punpckhqdq xmm5, xmm6
- psllq xmm5, 32
- psrlq xmm5, 32
- movdqa [r6+48], xmm5
+ movdqa xmm6, xmm4
+ punpcklqdq xmm4, xmm5
+ psllq xmm4, 32
+ psrlq xmm4, 32
+ movdqa [r6+32], xmm4
+ punpckhqdq xmm5, xmm6
+ psllq xmm5, 32
+ psrlq xmm5, 32
+ movdqa [r6+48], xmm5
- pxor xmm4, xmm4 ;V
- pxor xmm5, xmm5 ;H
- pxor xmm6, xmm6 ;DC
- mov r0, 0
- SSE41_ChromaGetX38x4Satd r0, 0
- inc r0
- SSE41_ChromaGetX38x4Satd r0, 0
+ pxor xmm4, xmm4 ;V
+ pxor xmm5, xmm5 ;H
+ pxor xmm6, xmm6 ;DC
+ mov r0, 0
+ SSE41_ChromaGetX38x4Satd r0, 0
+ inc r0
+ SSE41_ChromaGetX38x4Satd r0, 0
%endmacro
%macro SSEReg2MMX 3
- movdq2q %2, %1
- movhlps %1, %1
- movdq2q %3, %1
+ movdq2q %2, %1
+ movhlps %1, %1
+ movdq2q %3, %1
%endmacro
%macro MMXReg2SSE 4
- movq2dq %1, %3
- movq2dq %2, %4
- punpcklqdq %1, %2
+ movq2dq %1, %3
+ movq2dq %2, %4
+ punpcklqdq %1, %2
%endmacro
;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
- %assign push_num 0
- LOAD_7_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
+ %assign push_num 0
+ LOAD_7_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
loop_chroma_satdx3:
- SSE41_ChromaGetX38x8Satd
- SSEReg2MMX xmm4, mm0,mm1
- SSEReg2MMX xmm5, mm2,mm3
- SSEReg2MMX xmm6, mm5,mm6
- mov r0, arg8
- mov r2, arg9
+ SSE41_ChromaGetX38x8Satd
+ SSEReg2MMX xmm4, mm0,mm1
+ SSEReg2MMX xmm5, mm2,mm3
+ SSEReg2MMX xmm6, mm5,mm6
+ mov r0, arg8
+ mov r2, arg9
- SSE41_ChromaGetX38x8Satd
+ SSE41_ChromaGetX38x8Satd
- MMXReg2SSE xmm0, xmm3, mm0, mm1
- MMXReg2SSE xmm1, xmm3, mm2, mm3
- MMXReg2SSE xmm2, xmm3, mm5, mm6
+ MMXReg2SSE xmm0, xmm3, mm0, mm1
+ MMXReg2SSE xmm1, xmm3, mm2, mm3
+ MMXReg2SSE xmm2, xmm3, mm5, mm6
- paddw xmm4, xmm0
- paddw xmm5, xmm1
- paddw xmm6, xmm2
+ paddw xmm4, xmm0
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
- MMX_DW_1_2REG xmm0, xmm1
- psrlw xmm4, 1 ;/2
- psrlw xmm5, 1 ;/2
- psrlw xmm6, 1 ;/2
- SSE41_HSum8W xmm4, xmm0, xmm1
- SSE41_HSum8W xmm5, xmm0, xmm1
- SSE41_HSum8W xmm6, xmm0, xmm1
- ; comparing order: DC H V
- movd r3d, xmm6 ;DC
- movd r1d, xmm5 ;H
- movd r0d, xmm4 ;V
+ MMX_DW_1_2REG xmm0, xmm1
+ psrlw xmm4, 1 ;/2
+ psrlw xmm5, 1 ;/2
+ psrlw xmm6, 1 ;/2
+ SSE41_HSum8W xmm4, xmm0, xmm1
+ SSE41_HSum8W xmm5, xmm0, xmm1
+ SSE41_HSum8W xmm6, xmm0, xmm1
+ ; comparing order: DC H V
+ movd r3d, xmm6 ;DC
+ movd r1d, xmm5 ;H
+ movd r0d, xmm4 ;V
- shl r5d, 1
- add r1d, r5d
- add r0d, r5d
- cmp r3d, r1d
- jge near not_dc_8x8
- cmp r3d, r0d
- jge near not_dc_h_8x8
+ shl r5d, 1
+ add r1d, r5d
+ add r0d, r5d
+ cmp r3d, r1d
+ jge near not_dc_8x8
+ cmp r3d, r0d
+ jge near not_dc_h_8x8
- ; for DC mode
- mov dword[r4], 0;I8_PRED_DC
- mov retrd, r3d
- jmp near return_satd_intra_8x8_x3
+ ; for DC mode
+ mov dword[r4], 0;I8_PRED_DC
+ mov retrd, r3d
+ jmp near return_satd_intra_8x8_x3
not_dc_8x8:
- ; for H mode
- cmp r1d, r0d
- jge near not_dc_h_8x8
- mov dword[r4], 1;I8_PRED_H
- mov retrd, r1d
- jmp near return_satd_intra_8x8_x3
+ ; for H mode
+ cmp r1d, r0d
+ jge near not_dc_h_8x8
+ mov dword[r4], 1;I8_PRED_H
+ mov retrd, r1d
+ jmp near return_satd_intra_8x8_x3
not_dc_h_8x8:
- ; for V mode
- mov dword[r4], 2;I8_PRED_V
- mov retrd, r0d
+ ; for V mode
+ mov dword[r4], 2;I8_PRED_V
+ mov retrd, r0d
return_satd_intra_8x8_x3:
- WELSEMMS
- POP_XMM
- LOAD_7_PARA_POP
+ WELSEMMS
+ POP_XMM
+ LOAD_7_PARA_POP
ret
@@ -1040,22 +1040,22 @@
;
;***********************************************************************
%macro SSSE3_Get16BSadHVDC 2
- movd xmm6,%1
- pshufb xmm6,xmm1
- movdqa %1, xmm6
- movdqa xmm0,%2
- psadbw xmm0,xmm7
- paddw xmm4,xmm0
- movdqa xmm0,%2
- psadbw xmm0,xmm5
- paddw xmm2,xmm0
- psadbw xmm6,%2
- paddw xmm3,xmm6
+ movd xmm6,%1
+ pshufb xmm6,xmm1
+ movdqa %1, xmm6
+ movdqa xmm0,%2
+ psadbw xmm0,xmm7
+ paddw xmm4,xmm0
+ movdqa xmm0,%2
+ psadbw xmm0,xmm5
+ paddw xmm2,xmm0
+ psadbw xmm6,%2
+ paddw xmm3,xmm6
%endmacro
%macro WelsAddDCValue 4
- movzx %2, byte %1
- mov %3, %2
- add %4, %2
+ movzx %2, byte %1
+ mov %3, %2
+ add %4, %2
%endmacro
;***********************************************************************
@@ -1064,138 +1064,138 @@
;
;***********************************************************************
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
- %assign push_num 0
- LOAD_7_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
+ %assign push_num 0
+ LOAD_7_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
- push r5
- push r4
- push r3
+ push r5
+ push r4
+ push r3
- sub r0, r1
- movdqa xmm5,[r0]
- pxor xmm0,xmm0
- psadbw xmm0,xmm5
- movhlps xmm1,xmm0
- paddw xmm0,xmm1
- movd r5d, xmm0
+ sub r0, r1
+ movdqa xmm5,[r0]
+ pxor xmm0,xmm0
+ psadbw xmm0,xmm5
+ movhlps xmm1,xmm0
+ paddw xmm0,xmm1
+ movd r5d, xmm0
- add r0,r1
- lea r3,[r1+2*r1] ;ebx r3
- WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d ; esi r4d, eax r5d
- WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
- WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
- WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
- lea r0, [r0+4*r1]
- add r6, 64
- WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
- WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
- WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
- WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
- lea r0, [r0+4*r1]
- add r6, 64
- WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
- WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
- WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
- WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
- lea r0, [r0+4*r1]
- add r6, 64
- WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
- WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
- WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
- WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
- sub r6, 192
- add r5d,10h
- shr r5d,5
- movd xmm7,r5d
- pxor xmm1,xmm1
- pshufb xmm7,xmm1
- pxor xmm4,xmm4
- pxor xmm3,xmm3
- pxor xmm2,xmm2
- ;sad begin
- pop r3
- lea r4, [r3+2*r3] ;esi r4
- SSSE3_Get16BSadHVDC [r6], [r2]
- SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
- SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
- SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
- add r6, 64
- lea r2, [r2+4*r3]
- SSSE3_Get16BSadHVDC [r6], [r2]
- SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
- SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
- SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
- add r6, 64
- lea r2, [r2+4*r3]
- SSSE3_Get16BSadHVDC [r6], [r2]
- SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
- SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
- SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
- add r6, 64
- lea r2, [r2+4*r3]
- SSSE3_Get16BSadHVDC [r6], [r2]
- SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
- SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
- SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+ add r0,r1
+ lea r3,[r1+2*r1] ;ebx r3
+ WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d ; esi r4d, eax r5d
+ WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
+ WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+ WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
+ lea r0, [r0+4*r1]
+ add r6, 64
+ WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
+ WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
+ WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+ WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
+ lea r0, [r0+4*r1]
+ add r6, 64
+ WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
+ WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
+ WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+ WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
+ lea r0, [r0+4*r1]
+ add r6, 64
+ WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
+ WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
+ WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+ WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
+ sub r6, 192
+ add r5d,10h
+ shr r5d,5
+ movd xmm7,r5d
+ pxor xmm1,xmm1
+ pshufb xmm7,xmm1
+ pxor xmm4,xmm4
+ pxor xmm3,xmm3
+ pxor xmm2,xmm2
+ ;sad begin
+ pop r3
+ lea r4, [r3+2*r3] ;esi r4
+ SSSE3_Get16BSadHVDC [r6], [r2]
+ SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+ SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+ SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+ add r6, 64
+ lea r2, [r2+4*r3]
+ SSSE3_Get16BSadHVDC [r6], [r2]
+ SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+ SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+ SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+ add r6, 64
+ lea r2, [r2+4*r3]
+ SSSE3_Get16BSadHVDC [r6], [r2]
+ SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+ SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+ SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+ add r6, 64
+ lea r2, [r2+4*r3]
+ SSSE3_Get16BSadHVDC [r6], [r2]
+ SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+ SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+ SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
- pop r4
- pop r5
- pslldq xmm3,4
- por xmm3,xmm2
- movhlps xmm1,xmm3
- paddw xmm3,xmm1
- movhlps xmm0,xmm4
- paddw xmm4,xmm0
- ; comparing order: DC H V
- movd r1d, xmm4 ;DC ;ebx r1d
- movd r0d, xmm3 ;V ;ecx r0d
- psrldq xmm3, 4
- movd r2d, xmm3 ;H ;esi r2d
+ pop r4
+ pop r5
+ pslldq xmm3,4
+ por xmm3,xmm2
+ movhlps xmm1,xmm3
+ paddw xmm3,xmm1
+ movhlps xmm0,xmm4
+ paddw xmm4,xmm0
+ ; comparing order: DC H V
+ movd r1d, xmm4 ;DC ;ebx r1d
+ movd r0d, xmm3 ;V ;ecx r0d
+ psrldq xmm3, 4
+ movd r2d, xmm3 ;H ;esi r2d
- ;mov eax, [esp+36] ;lamda ;eax r5
- shl r5d, 1
- add r2d, r5d
- add r1d, r5d
- ;mov edx, [esp+32] ;edx r4
- cmp r1d, r2d
- jge near not_dc_16x16_sad
- cmp r1d, r0d
- jge near not_dc_h_16x16_sad
- ; for DC mode
- mov dword[r4], 2;I16_PRED_DC
- mov retrd, r1d
- sub r6, 192
+ ;mov eax, [esp+36] ;lamda ;eax r5
+ shl r5d, 1
+ add r2d, r5d
+ add r1d, r5d
+ ;mov edx, [esp+32] ;edx r4
+ cmp r1d, r2d
+ jge near not_dc_16x16_sad
+ cmp r1d, r0d
+ jge near not_dc_h_16x16_sad
+ ; for DC mode
+ mov dword[r4], 2;I16_PRED_DC
+ mov retrd, r1d
+ sub r6, 192
%assign x 0
%rep 16
- movdqa [r6+16*x], xmm7
+ movdqa [r6+16*x], xmm7
%assign x x+1
%endrep
- jmp near return_sad_intra_16x16_x3
+ jmp near return_sad_intra_16x16_x3
not_dc_16x16_sad:
- ; for H mode
- cmp r2d, r0d
- jge near not_dc_h_16x16_sad
- mov dword[r4], 1;I16_PRED_H
- mov retrd, r2d
- jmp near return_sad_intra_16x16_x3
+ ; for H mode
+ cmp r2d, r0d
+ jge near not_dc_h_16x16_sad
+ mov dword[r4], 1;I16_PRED_H
+ mov retrd, r2d
+ jmp near return_sad_intra_16x16_x3
not_dc_h_16x16_sad:
- ; for V mode
- mov dword[r4], 0;I16_PRED_V
- mov retrd, r0d
- sub r6, 192
+ ; for V mode
+ mov dword[r4], 0;I16_PRED_V
+ mov retrd, r0d
+ sub r6, 192
%assign x 0
%rep 16
- movdqa [r6+16*x], xmm5
+ movdqa [r6+16*x], xmm5
%assign x x+1
%endrep
return_sad_intra_16x16_x3:
- POP_XMM
- LOAD_7_PARA_POP
- ret
+ POP_XMM
+ LOAD_7_PARA_POP
+ ret
;***********************************************************************
;
@@ -1210,63 +1210,63 @@
;SSE4.1
%macro SSE41_GetSatd8x4 0
- movq xmm0, [r0]
- punpcklqdq xmm0, xmm0
- pmaddubsw xmm0, xmm7
- movq xmm1, [r0+r1]
- punpcklqdq xmm1, xmm1
- pmaddubsw xmm1, xmm7
- movq xmm2, [r2]
- punpcklqdq xmm2, xmm2
- pmaddubsw xmm2, xmm7
- movq xmm3, [r2+r3]
- punpcklqdq xmm3, xmm3
- pmaddubsw xmm3, xmm7
- psubsw xmm0, xmm2
- psubsw xmm1, xmm3
- movq xmm2, [r0+2*r1]
- punpcklqdq xmm2, xmm2
- pmaddubsw xmm2, xmm7
- movq xmm3, [r0+r4]
- punpcklqdq xmm3, xmm3
- pmaddubsw xmm3, xmm7
- movq xmm4, [r2+2*r3]
- punpcklqdq xmm4, xmm4
- pmaddubsw xmm4, xmm7
- movq xmm5, [r2+r5]
- punpcklqdq xmm5, xmm5
- pmaddubsw xmm5, xmm7
- psubsw xmm2, xmm4
- psubsw xmm3, xmm5
- SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
- pabsw xmm0, xmm0
- pabsw xmm2, xmm2
- pabsw xmm1, xmm1
- pabsw xmm3, xmm3
- movdqa xmm4, xmm3
- pblendw xmm3, xmm1, 0xAA
- pslld xmm1, 16
- psrld xmm4, 16
- por xmm1, xmm4
- pmaxuw xmm1, xmm3
- paddw xmm6, xmm1
- movdqa xmm4, xmm0
- pblendw xmm0, xmm2, 0xAA
- pslld xmm2, 16
- psrld xmm4, 16
- por xmm2, xmm4
- pmaxuw xmm0, xmm2
- paddw xmm6, xmm0
+ movq xmm0, [r0]
+ punpcklqdq xmm0, xmm0
+ pmaddubsw xmm0, xmm7
+ movq xmm1, [r0+r1]
+ punpcklqdq xmm1, xmm1
+ pmaddubsw xmm1, xmm7
+ movq xmm2, [r2]
+ punpcklqdq xmm2, xmm2
+ pmaddubsw xmm2, xmm7
+ movq xmm3, [r2+r3]
+ punpcklqdq xmm3, xmm3
+ pmaddubsw xmm3, xmm7
+ psubsw xmm0, xmm2
+ psubsw xmm1, xmm3
+ movq xmm2, [r0+2*r1]
+ punpcklqdq xmm2, xmm2
+ pmaddubsw xmm2, xmm7
+ movq xmm3, [r0+r4]
+ punpcklqdq xmm3, xmm3
+ pmaddubsw xmm3, xmm7
+ movq xmm4, [r2+2*r3]
+ punpcklqdq xmm4, xmm4
+ pmaddubsw xmm4, xmm7
+ movq xmm5, [r2+r5]
+ punpcklqdq xmm5, xmm5
+ pmaddubsw xmm5, xmm7
+ psubsw xmm2, xmm4
+ psubsw xmm3, xmm5
+ SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
+ pabsw xmm0, xmm0
+ pabsw xmm2, xmm2
+ pabsw xmm1, xmm1
+ pabsw xmm3, xmm3
+ movdqa xmm4, xmm3
+ pblendw xmm3, xmm1, 0xAA
+ pslld xmm1, 16
+ psrld xmm4, 16
+ por xmm1, xmm4
+ pmaxuw xmm1, xmm3
+ paddw xmm6, xmm1
+ movdqa xmm4, xmm0
+ pblendw xmm0, xmm2, 0xAA
+ pslld xmm2, 16
+ psrld xmm4, 16
+ por xmm2, xmm4
+ pmaxuw xmm0, xmm2
+ paddw xmm6, xmm0
%endmacro
%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
- MMX_DW_1_2REG %3, %4
- pmaddwd %2, %3
- movhlps %4, %2
- paddd %2, %4
- pshuflw %4, %2,0Eh
- paddd %2, %4
- movd %1, %2
+ MMX_DW_1_2REG %3, %4
+ pmaddwd %2, %3
+ movhlps %4, %2
+ paddd %2, %4
+ pshuflw %4, %2,0Eh
+ paddd %2, %4
+ movd %1, %2
%endmacro
;***********************************************************************
;
@@ -1274,53 +1274,53 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse41
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movdqa xmm4,[HSwapSumSubDB1]
- movd xmm2,[r2]
- movd xmm5,[r2+r3]
- shufps xmm2,xmm5,0
- movd xmm3,[r2+r3*2]
- lea r2, [r3*2+r2]
- movd xmm5,[r2+r3]
- shufps xmm3,xmm5,0
- movd xmm0,[r0]
- movd xmm5,[r0+r1]
- shufps xmm0,xmm5,0
- movd xmm1,[r0+r1*2]
- lea r0, [r1*2+r0]
- movd xmm5,[r0+r1]
- shufps xmm1,xmm5,0
- pmaddubsw xmm0,xmm4
- pmaddubsw xmm1,xmm4
- pmaddubsw xmm2,xmm4
- pmaddubsw xmm3,xmm4
- psubw xmm0,xmm2
- psubw xmm1,xmm3
- movdqa xmm2,xmm0
- paddw xmm0,xmm1
- psubw xmm1,xmm2
- movdqa xmm2,xmm0
- punpcklqdq xmm0,xmm1
- punpckhqdq xmm2,xmm1
- movdqa xmm1,xmm0
- paddw xmm0,xmm2
- psubw xmm2,xmm1
- movdqa xmm1,xmm0
- pblendw xmm0,xmm2,0AAh
- pslld xmm2,16
- psrld xmm1,16
- por xmm2,xmm1
- pabsw xmm0,xmm0
- pabsw xmm2,xmm2
- pmaxsw xmm0,xmm2
- SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
- POP_XMM
- LOAD_4_PARA_POP
- ret
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movdqa xmm4,[HSwapSumSubDB1]
+ movd xmm2,[r2]
+ movd xmm5,[r2+r3]
+ shufps xmm2,xmm5,0
+ movd xmm3,[r2+r3*2]
+ lea r2, [r3*2+r2]
+ movd xmm5,[r2+r3]
+ shufps xmm3,xmm5,0
+ movd xmm0,[r0]
+ movd xmm5,[r0+r1]
+ shufps xmm0,xmm5,0
+ movd xmm1,[r0+r1*2]
+ lea r0, [r1*2+r0]
+ movd xmm5,[r0+r1]
+ shufps xmm1,xmm5,0
+ pmaddubsw xmm0,xmm4
+ pmaddubsw xmm1,xmm4
+ pmaddubsw xmm2,xmm4
+ pmaddubsw xmm3,xmm4
+ psubw xmm0,xmm2
+ psubw xmm1,xmm3
+ movdqa xmm2,xmm0
+ paddw xmm0,xmm1
+ psubw xmm1,xmm2
+ movdqa xmm2,xmm0
+ punpcklqdq xmm0,xmm1
+ punpckhqdq xmm2,xmm1
+ movdqa xmm1,xmm0
+ paddw xmm0,xmm2
+ psubw xmm2,xmm1
+ movdqa xmm1,xmm0
+ pblendw xmm0,xmm2,0AAh
+ pslld xmm2,16
+ psrld xmm1,16
+ por xmm2,xmm1
+ pabsw xmm0,xmm0
+ pabsw xmm2,xmm2
+ pmaxsw xmm0,xmm2
+ SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
;***********************************************************************
;
@@ -1329,30 +1329,30 @@
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_sse41
%ifdef X86_32
- push r4
- push r5
+ push r4
+ push r5
%endif
- %assign push_num 2
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2]
- lea r5, [r3+r3*2]
- pxor xmm6, xmm6
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE41_GetSatd8x4
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- POP_XMM
- LOAD_4_PARA_POP
+ %assign push_num 2
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2]
+ lea r5, [r3+r3*2]
+ pxor xmm6, xmm6
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
%ifdef X86_32
- pop r5
- pop r4
+ pop r5
+ pop r4
%endif
- ret
+ ret
;***********************************************************************
;
@@ -1361,36 +1361,36 @@
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_sse41
%ifdef X86_32
- push r4
- push r5
- push r6
+ push r4
+ push r5
+ push r6
%endif
- %assign push_num 3
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2]
- lea r5, [r3+r3*2]
- pxor xmm6, xmm6
- mov r6, 0
+ %assign push_num 3
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2]
+ lea r5, [r3+r3*2]
+ pxor xmm6, xmm6
+ mov r6, 0
loop_get_satd_8x16:
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- inc r6
- cmp r6, 4
- jl loop_get_satd_8x16
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- POP_XMM
- LOAD_4_PARA_POP
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ inc r6
+ cmp r6, 4
+ jl loop_get_satd_8x16
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
%ifdef X86_32
- pop r6
- pop r5
- pop r4
+ pop r6
+ pop r5
+ pop r4
%endif
- ret
+ ret
;***********************************************************************
;
@@ -1399,42 +1399,42 @@
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse41
%ifdef X86_32
- push r4
- push r5
+ push r4
+ push r5
%endif
- %assign push_num 2
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- push r0
- push r2
+ %assign push_num 2
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ push r0
+ push r2
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2]
- lea r5, [r3+r3*2]
- pxor xmm6, xmm6
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE41_GetSatd8x4
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2]
+ lea r5, [r3+r3*2]
+ pxor xmm6, xmm6
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
- pop r2
- pop r0
- add r0, 8
- add r2, 8
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE41_GetSatd8x4
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- POP_XMM
- LOAD_4_PARA_POP
+ pop r2
+ pop r0
+ add r0, 8
+ add r2, 8
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
%ifdef X86_32
- pop r5
- pop r4
+ pop r5
+ pop r4
%endif
- ret
+ ret
;***********************************************************************
;
@@ -1444,53 +1444,53 @@
WELS_EXTERN WelsSampleSatd16x16_sse41
%ifdef X86_32
- push r4
- push r5
- push r6
+ push r4
+ push r5
+ push r6
%endif
- %assign push_num 3
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
+ %assign push_num 3
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
- push r0
- push r2
+ push r0
+ push r2
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2]
- lea r5, [r3+r3*2]
- pxor xmm6, xmm6
- mov r6, 0
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2]
+ lea r5, [r3+r3*2]
+ pxor xmm6, xmm6
+ mov r6, 0
loop_get_satd_16x16_left:
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- inc r6
- cmp r6, 4
- jl loop_get_satd_16x16_left
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ inc r6
+ cmp r6, 4
+ jl loop_get_satd_16x16_left
- pop r2
- pop r0
- add r0, 8
- add r2, 8
- mov r6, 0
+ pop r2
+ pop r0
+ add r0, 8
+ add r2, 8
+ mov r6, 0
loop_get_satd_16x16_right:
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- inc r6
- cmp r6, 4
- jl loop_get_satd_16x16_right
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- POP_XMM
- LOAD_4_PARA_POP
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ inc r6
+ cmp r6, 4
+ jl loop_get_satd_16x16_right
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
%ifdef X86_32
- pop r6
- pop r5
- pop r4
+ pop r6
+ pop r5
+ pop r4
%endif
- ret
+ ret
;***********************************************************************
;
@@ -1505,55 +1505,55 @@
;***********************************************************************
%macro SSE2_GetSad2x16 0
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqu xmm1, [r2]
- MOVDQ xmm2, [r0];[eax] must aligned 16
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
- movdqu xmm1, [r2+r3]
- MOVDQ xmm2, [r0+r1]
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqu xmm1, [r2]
+ MOVDQ xmm2, [r0];[eax] must aligned 16
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
+ movdqu xmm1, [r2+r3]
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
%endmacro
%macro SSE2_GetSad4x16 0
- movdqu xmm0, [r2]
- MOVDQ xmm2, [r0]
- psadbw xmm0, xmm2
- paddw xmm7, xmm0
- movdqu xmm1, [r2+r3]
- MOVDQ xmm2, [r0+r1]
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
- movdqu xmm1, [r2+2*r3]
- MOVDQ xmm2, [r0+2*r1];[eax] must aligned 16
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
- movdqu xmm1, [r2+r5]
- MOVDQ xmm2, [r0+r4]
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
+ movdqu xmm0, [r2]
+ MOVDQ xmm2, [r0]
+ psadbw xmm0, xmm2
+ paddw xmm7, xmm0
+ movdqu xmm1, [r2+r3]
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
+ movdqu xmm1, [r2+2*r3]
+ MOVDQ xmm2, [r0+2*r1];[eax] must aligned 16
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
+ movdqu xmm1, [r2+r5]
+ MOVDQ xmm2, [r0+r4]
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
%endmacro
%macro SSE2_GetSad8x4 0
- movq xmm0, [r0]
- movq xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movhps xmm0, [r0]
- movhps xmm1, [r0+r1]
+ movq xmm0, [r0]
+ movq xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movhps xmm0, [r0]
+ movhps xmm1, [r0+r1]
- movq xmm2, [r2]
- movq xmm3, [r2+r3]
- lea r2, [r2+2*r3]
- movhps xmm2, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm2
- psadbw xmm1, xmm3
- paddw xmm6, xmm0
- paddw xmm6, xmm1
+ movq xmm2, [r2]
+ movq xmm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movhps xmm2, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm2
+ psadbw xmm1, xmm3
+ paddw xmm6, xmm0
+ paddw xmm6, xmm1
%endmacro
;***********************************************************************
@@ -1565,39 +1565,39 @@
;***********************************************************************
WELS_EXTERN WelsSampleSad16x16_sse2
%ifdef X86_32
- push r4
- push r5
+ push r4
+ push r5
%endif
- %assign push_num 2
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- lea r4, [3*r1]
- lea r5, [3*r3]
+ %assign push_num 2
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ lea r4, [3*r1]
+ lea r5, [3*r3]
- pxor xmm7, xmm7
- SSE2_GetSad4x16
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE2_GetSad4x16
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE2_GetSad4x16
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE2_GetSad4x16
- movhlps xmm0, xmm7
- paddw xmm0, xmm7
- movd retrd, xmm0
- POP_XMM
- LOAD_4_PARA_POP
+ pxor xmm7, xmm7
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ movhlps xmm0, xmm7
+ paddw xmm0, xmm7
+ movd retrd, xmm0
+ POP_XMM
+ LOAD_4_PARA_POP
%ifdef X86_32
- pop r5
- pop r4
+ pop r5
+ pop r4
%endif
- ret
+ ret
;***********************************************************************
;
@@ -1607,55 +1607,55 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSad16x8_sse2
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movdqu xmm0, [r2]
- MOVDQ xmm2, [r0]
- psadbw xmm0, xmm2
- movdqu xmm1, [r2+r3]
- MOVDQ xmm2, [r0+r1]
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movdqu xmm0, [r2]
+ MOVDQ xmm2, [r0]
+ psadbw xmm0, xmm2
+ movdqu xmm1, [r2+r3]
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
- SSE2_GetSad2x16
- SSE2_GetSad2x16
- SSE2_GetSad2x16
+ SSE2_GetSad2x16
+ SSE2_GetSad2x16
+ SSE2_GetSad2x16
- movhlps xmm1, xmm0
- paddw xmm0, xmm1
- movd retrd, xmm0
- LOAD_4_PARA_POP
- ret
+ movhlps xmm1, xmm0
+ paddw xmm0, xmm1
+ movd retrd, xmm0
+ LOAD_4_PARA_POP
+ ret
WELS_EXTERN WelsSampleSad8x16_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 7
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 7
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
pxor xmm6, xmm6
- SSE2_GetSad8x4
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
SSE2_GetSad8x4
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSad8x4
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
SSE2_GetSad8x4
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
movhlps xmm0, xmm6
- paddw xmm0, xmm6
- movd retrd, xmm0
- POP_XMM
- LOAD_4_PARA_POP
- ret
+ paddw xmm0, xmm6
+ movd retrd, xmm0
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
@@ -1664,22 +1664,22 @@
%endmacro
WELS_EXTERN WelsSampleSad8x8_sse21
- %assign push_num 0
- mov r2, arg3
- push r2
- CACHE_SPLIT_CHECK r2, 8, 64
- jle near .pixel_sad_8x8_nsplit
- pop r2
+ %assign push_num 0
+ mov r2, arg3
+ push r2
+ CACHE_SPLIT_CHECK r2, 8, 64
+ jle near .pixel_sad_8x8_nsplit
+ pop r2
%ifdef X86_32
- push r3
- push r4
- push r5
+ push r3
+ push r4
+ push r5
%endif
- %assign push_num 3
- PUSH_XMM 8
- mov r0, arg1
- mov r1, arg2
- SIGN_EXTENSION r1, r1d
+ %assign push_num 3
+ PUSH_XMM 8
+ mov r0, arg1
+ mov r1, arg2
+ SIGN_EXTENSION r1, r1d
pxor xmm7, xmm7
;ecx r2, edx r4, edi r5
@@ -1694,109 +1694,109 @@
shl r4, 3
movd xmm5, r5d
movd xmm6, r4d
- mov r5, 8
- add r5, r2
+ mov r5, 8
+ add r5, r2
mov r3, arg4
- SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r3, r3d
movq xmm0, [r0]
- movhps xmm0, [r0+r1]
+ movhps xmm0, [r0+r1]
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
movq xmm0, [r0]
- movhps xmm0, [r0+r1]
+ movhps xmm0, [r0+r1]
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
movq xmm0, [r0]
- movhps xmm0, [r0+r1]
+ movhps xmm0, [r0+r1]
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
movq xmm0, [r0]
- movhps xmm0, [r0+r1]
+ movhps xmm0, [r0+r1]
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
movhlps xmm0, xmm7
- paddw xmm0, xmm7
- movd retrd, xmm0
- POP_XMM
+ paddw xmm0, xmm7
+ movd retrd, xmm0
+ POP_XMM
%ifdef X86_32
- pop r5
- pop r4
- pop r3
+ pop r5
+ pop r4
+ pop r3
%endif
- jmp .return
+ jmp .return
.pixel_sad_8x8_nsplit:
- pop r2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 7
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm6, xmm6
- SSE2_GetSad8x4
+ pop r2
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 7
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm6, xmm6
+ SSE2_GetSad8x4
lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
+ lea r2, [r2+2*r3]
SSE2_GetSad8x4
movhlps xmm0, xmm6
- paddw xmm0, xmm6
- movd retrd, xmm0
- POP_XMM
- LOAD_4_PARA_POP
+ paddw xmm0, xmm6
+ movd retrd, xmm0
+ POP_XMM
+ LOAD_4_PARA_POP
.return:
- ret
+ ret
;***********************************************************************
@@ -1814,624 +1814,624 @@
%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
- psadbw %1, %4
- paddw xmm5, %1
- psadbw %4, %3
- paddw xmm4, %4
- movdqu %4, [%5-1]
- psadbw %4, %2
- paddw xmm6, %4
- movdqu %4, [%5+1]
- psadbw %4, %2
- paddw xmm7, %4
+ psadbw %1, %4
+ paddw xmm5, %1
+ psadbw %4, %3
+ paddw xmm4, %4
+ movdqu %4, [%5-1]
+ psadbw %4, %2
+ paddw xmm6, %4
+ movdqu %4, [%5+1]
+ psadbw %4, %2
+ paddw xmm7, %4
%endmacro
WELS_EXTERN WelsSampleSadFour16x16_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movdqa xmm0, [r0]
- sub r2, r3
- movdqu xmm3, [r2]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movdqa xmm0, [r0]
+ sub r2, r3
+ movdqu xmm3, [r2]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- psadbw xmm3, xmm1
- paddw xmm4, xmm3
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ psadbw xmm3, xmm1
+ paddw xmm4, xmm3
- movdqu xmm2, [r2+r3-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
+ movdqu xmm2, [r2+r3-1]
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
- movdqu xmm3, [r2+r3+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ movdqu xmm3, [r2+r3+1]
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
- movdqa xmm2, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm0, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
- movdqa xmm2, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm0, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r2, [r2+2*r3]
- movdqu xmm3, [r2]
- psadbw xmm2, xmm3
- paddw xmm5, xmm2
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r2, [r2+2*r3]
+ movdqu xmm3, [r2]
+ psadbw xmm2, xmm3
+ paddw xmm5, xmm2
- movdqu xmm2, [r2-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
+ movdqu xmm2, [r2-1]
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
- movdqu xmm3, [r2+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ movdqu xmm3, [r2+1]
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movdqu xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movdqu xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6
+ movdqa [r4],xmm4
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
WELS_EXTERN WelsSampleSadFour16x8_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movdqa xmm0, [r0]
- sub r2, r3
- movdqu xmm3, [r2]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movdqa xmm0, [r0]
+ sub r2, r3
+ movdqu xmm3, [r2]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- psadbw xmm3, xmm1
- paddw xmm4, xmm3
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ psadbw xmm3, xmm1
+ paddw xmm4, xmm3
- movdqu xmm2, [r2+r3-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
+ movdqu xmm2, [r2+r3-1]
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
- movdqu xmm3, [r2+r3+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ movdqu xmm3, [r2+r3+1]
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
- movdqa xmm2, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm0, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
- lea r2, [r2+2*r3]
- movdqu xmm3, [r2]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r2, [r2+2*r3]
+ movdqu xmm3, [r2]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movdqu xmm0, [r2-1]
- psadbw xmm0, xmm1
- paddw xmm6, xmm0
+ movdqu xmm0, [r2-1]
+ psadbw xmm0, xmm1
+ paddw xmm6, xmm0
- movdqu xmm3, [r2+1]
- psadbw xmm3, xmm1
- paddw xmm7, xmm3
+ movdqu xmm3, [r2+1]
+ psadbw xmm3, xmm1
+ paddw xmm7, xmm3
- movdqu xmm3, [r2+r3]
- psadbw xmm1, xmm3
- paddw xmm5, xmm1
+ movdqu xmm3, [r2+r3]
+ psadbw xmm1, xmm3
+ paddw xmm5, xmm1
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6
+ movdqa [r4],xmm4
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
WELS_EXTERN WelsSampleSadFour8x16_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- sub r2, r3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ sub r2, r3
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6
+ movdqa [r4],xmm4
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
WELS_EXTERN WelsSampleSadFour8x8_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- sub r2, r3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ sub r2, r3
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6
+ movdqa [r4],xmm4
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
WELS_EXTERN WelsSampleSadFour4x4_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movd xmm0, [r0]
- movd xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movd xmm2, [r0]
- movd xmm3, [r0+r1]
- punpckldq xmm0, xmm1
- punpckldq xmm2, xmm3
- punpcklqdq xmm0, xmm2
- sub r2, r3
- movd xmm1, [r2]
- movd xmm2, [r2+r3]
- punpckldq xmm1, xmm2
- movd xmm2, [r2+r3-1]
- movd xmm3, [r2+r3+1]
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movd xmm0, [r0]
+ movd xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movd xmm2, [r0]
+ movd xmm3, [r0+r1]
+ punpckldq xmm0, xmm1
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
+ sub r2, r3
+ movd xmm1, [r2]
+ movd xmm2, [r2+r3]
+ punpckldq xmm1, xmm2
+ movd xmm2, [r2+r3-1]
+ movd xmm3, [r2+r3+1]
- lea r2, [r2+2*r3]
+ lea r2, [r2+2*r3]
- movd xmm4, [r2]
- movd xmm5, [r2-1]
- punpckldq xmm2, xmm5
- movd xmm5, [r2+1]
- punpckldq xmm3, xmm5
+ movd xmm4, [r2]
+ movd xmm5, [r2-1]
+ punpckldq xmm2, xmm5
+ movd xmm5, [r2+1]
+ punpckldq xmm3, xmm5
- movd xmm5, [r2+r3]
- punpckldq xmm4, xmm5
+ movd xmm5, [r2+r3]
+ punpckldq xmm4, xmm5
- punpcklqdq xmm1, xmm4 ;-L
+ punpcklqdq xmm1, xmm4 ;-L
- movd xmm5, [r2+r3-1]
- movd xmm6, [r2+r3+1]
+ movd xmm5, [r2+r3-1]
+ movd xmm6, [r2+r3+1]
- lea r2, [r2+2*r3]
- movd xmm7, [r2-1]
- punpckldq xmm5, xmm7
- punpcklqdq xmm2, xmm5 ;-1
- movd xmm7, [r2+1]
- punpckldq xmm6, xmm7
- punpcklqdq xmm3, xmm6 ;+1
- movd xmm6, [r2]
- movd xmm7, [r2+r3]
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6 ;+L
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
- psadbw xmm4, xmm0
+ lea r2, [r2+2*r3]
+ movd xmm7, [r2-1]
+ punpckldq xmm5, xmm7
+ punpcklqdq xmm2, xmm5 ;-1
+ movd xmm7, [r2+1]
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm3, xmm6 ;+1
+ movd xmm6, [r2]
+ movd xmm7, [r2+r3]
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6 ;+L
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
- movhlps xmm0, xmm1
- paddw xmm1, xmm0
- movhlps xmm0, xmm2
- paddw xmm2, xmm0
- movhlps xmm0, xmm3
- paddw xmm3, xmm0
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- punpckldq xmm1, xmm4
- punpckldq xmm2, xmm3
- punpcklqdq xmm1, xmm2
- movdqa [r4],xmm1
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ movhlps xmm0, xmm1
+ paddw xmm1, xmm0
+ movhlps xmm0, xmm2
+ paddw xmm2, xmm0
+ movhlps xmm0, xmm3
+ paddw xmm3, xmm0
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ punpckldq xmm1, xmm4
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm1, xmm2
+ movdqa [r4],xmm1
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
;***********************************************************************
;
@@ -2444,33 +2444,33 @@
;***********************************************************************
WELS_EXTERN WelsSampleSad4x4_mmx
%assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movd mm0, [r0]
- movd mm1, [r0+r1]
- punpckldq mm0, mm1
+ LOAD_4_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movd mm0, [r0]
+ movd mm1, [r0+r1]
+ punpckldq mm0, mm1
- movd mm3, [r2]
- movd mm4, [r2+r3]
- punpckldq mm3, mm4
- psadbw mm0, mm3
+ movd mm3, [r2]
+ movd mm4, [r2+r3]
+ punpckldq mm3, mm4
+ psadbw mm0, mm3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
- movd mm1, [r0]
- movd mm2, [r0+r1]
- punpckldq mm1, mm2
+ movd mm1, [r0]
+ movd mm2, [r0+r1]
+ punpckldq mm1, mm2
- movd mm3, [r2]
- movd mm4, [r2+r3]
- punpckldq mm3, mm4
- psadbw mm1, mm3
- paddw mm0, mm1
+ movd mm3, [r2]
+ movd mm4, [r2+r3]
+ punpckldq mm3, mm4
+ psadbw mm1, mm3
+ paddw mm0, mm1
movd retrd, mm0
- WELSEMMS
+ WELSEMMS
LOAD_4_PARA_POP
ret
--- a/codec/common/x86/vaa.asm
+++ b/codec/common/x86/vaa.asm
@@ -29,16 +29,16 @@
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
-;* vaa.asm
+;* vaa.asm
;*
-;* Abstract
+;* Abstract
;* sse2 for pVaa routines
;*
;* History
-;* 04/14/2010 Created
-;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
-;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
-;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
+;* 04/14/2010 Created
+;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
+;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
+;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
;*
;*************************************************************************/
%include "asm_inc.asm"
@@ -49,87 +49,87 @@
;***********************************************************************
; by comparison this outperforms the phaddw (SSSE3) approach
-%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
- ; @sum_8x2 begin
- pshufd %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 0B1h ; 10110001 B
- paddw %1, %2
- ; end of @sum_8x2
-%endmacro ; END of SUM_WORD_8x2_SSE2
+%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
+ ; @sum_8x2 begin
+ pshufd %2, %1, 04Eh ; 01001110 B
+ paddw %1, %2
+ pshuflw %2, %1, 04Eh ; 01001110 B
+ paddw %1, %2
+ pshuflw %2, %1, 0B1h ; 10110001 B
+ paddw %1, %2
+ ; end of @sum_8x2
+%endmacro ; END of SUM_WORD_8x2_SSE2
%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
- movdqa %1, [r0 ] ; line 0
- movdqa %2, [r0+r1] ; line 1
- movdqa %3, %1
- punpcklbw %1, xmm7
- punpckhbw %3, xmm7
- movdqa %4, %2
- punpcklbw %4, xmm7
- punpckhbw %2, xmm7
- paddw %1, %4
- paddw %2, %3
- movdqa %3, [r0+r2] ; line 2
- movdqa %4, [r0+r3] ; line 3
- movdqa %5, %3
- punpcklbw %3, xmm7
- punpckhbw %5, xmm7
- movdqa %6, %4
- punpcklbw %6, xmm7
- punpckhbw %4, xmm7
- paddw %3, %6
- paddw %4, %5
- paddw %1, %3 ; block 0, 1
- paddw %2, %4 ; block 2, 3
- pshufd %3, %1, 0B1h
- pshufd %4, %2, 0B1h
- paddw %1, %3
- paddw %2, %4
- movdqa %3, %1
- movdqa %4, %2
- pshuflw %5, %1, 0B1h
- pshufhw %6, %3, 0B1h
- paddw %1, %5
- paddw %3, %6
- pshuflw %5, %2, 0B1h
- pshufhw %6, %4, 0B1h
- paddw %2, %5
- paddw %4, %6
- punpcklwd %1, %2
- punpckhwd %3, %4
- punpcklwd %1, %3
- psraw %1, $04
+ movdqa %1, [r0 ] ; line 0
+ movdqa %2, [r0+r1] ; line 1
+ movdqa %3, %1
+ punpcklbw %1, xmm7
+ punpckhbw %3, xmm7
+ movdqa %4, %2
+ punpcklbw %4, xmm7
+ punpckhbw %2, xmm7
+ paddw %1, %4
+ paddw %2, %3
+ movdqa %3, [r0+r2] ; line 2
+ movdqa %4, [r0+r3] ; line 3
+ movdqa %5, %3
+ punpcklbw %3, xmm7
+ punpckhbw %5, xmm7
+ movdqa %6, %4
+ punpcklbw %6, xmm7
+ punpckhbw %4, xmm7
+ paddw %3, %6
+ paddw %4, %5
+ paddw %1, %3 ; block 0, 1
+ paddw %2, %4 ; block 2, 3
+ pshufd %3, %1, 0B1h
+ pshufd %4, %2, 0B1h
+ paddw %1, %3
+ paddw %2, %4
+ movdqa %3, %1
+ movdqa %4, %2
+ pshuflw %5, %1, 0B1h
+ pshufhw %6, %3, 0B1h
+ paddw %1, %5
+ paddw %3, %6
+ pshuflw %5, %2, 0B1h
+ pshufhw %6, %4, 0B1h
+ paddw %2, %5
+ paddw %4, %6
+ punpcklwd %1, %2
+ punpckhwd %3, %4
+ punpcklwd %1, %3
+ psraw %1, $04
%endmacro
%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
- movdqa %1, [r0 ] ; line 0
- movdqa %2, [r0+r1] ; line 1
- movdqa %3, %1
- punpcklbw %1, xmm7
- punpckhbw %3, xmm7
- movdqa %4, %2
- punpcklbw %4, xmm7
- punpckhbw %2, xmm7
- paddw %1, %4
- paddw %2, %3
- movdqa %3, [r0+r2] ; line 2
- movdqa %4, [r0+r3] ; line 3
- movdqa %5, %3
- punpcklbw %3, xmm7
- punpckhbw %5, xmm7
- movdqa %6, %4
- punpcklbw %6, xmm7
- punpckhbw %4, xmm7
- paddw %3, %6
- paddw %4, %5
- paddw %1, %3 ; block 0, 1
- paddw %2, %4 ; block 2, 3
- phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
- phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
- psraw %1, $04
+ movdqa %1, [r0 ] ; line 0
+ movdqa %2, [r0+r1] ; line 1
+ movdqa %3, %1
+ punpcklbw %1, xmm7
+ punpckhbw %3, xmm7
+ movdqa %4, %2
+ punpcklbw %4, xmm7
+ punpckhbw %2, xmm7
+ paddw %1, %4
+ paddw %2, %3
+ movdqa %3, [r0+r2] ; line 2
+ movdqa %4, [r0+r3] ; line 3
+ movdqa %5, %3
+ punpcklbw %3, xmm7
+ punpckhbw %5, xmm7
+ movdqa %6, %4
+ punpcklbw %6, xmm7
+ punpckhbw %4, xmm7
+ paddw %3, %6
+ paddw %4, %5
+ paddw %1, %3 ; block 0, 1
+ paddw %2, %4 ; block 2, 3
+ phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
+ phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
+ psraw %1, $04
%endmacro
@@ -143,7 +143,7 @@
; , 6/7/2010
;***********************************************************************
-; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize );
+; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize );
;***********************************************************************
WELS_EXTERN AnalysisVaaInfoIntra_sse2
@@ -174,71 +174,71 @@
mov r4,r2
sal r4,$01 ;r4 = 4*iLineSize
- pxor xmm7, xmm7
+ pxor xmm7, xmm7
- ; loops
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [r7], xmm0
+ ; loops
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7], xmm0
- lea r0, [r0+r4]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [r7+8], xmm0
+ lea r0, [r0+r4]
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7+8], xmm0
- lea r0, [r0+r4]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [r7+16], xmm0
+ lea r0, [r0+r4]
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7+16], xmm0
- lea r0, [r0+r4]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [r7+24], xmm0
+ lea r0, [r0+r4]
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7+24], xmm0
- movdqa xmm0, [r7] ; block 0~7
- movdqa xmm1, [r7+16] ; block 8~15
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- SUM_WORD_8x2_SSE2 xmm0, xmm3
+ movdqa xmm0, [r7] ; block 0~7
+ movdqa xmm1, [r7+16] ; block 8~15
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ SUM_WORD_8x2_SSE2 xmm0, xmm3
- pmullw xmm1, xmm1
- pmullw xmm2, xmm2
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
- punpcklwd xmm1, xmm7
- punpckhwd xmm3, xmm7
- punpcklwd xmm2, xmm7
- punpckhwd xmm4, xmm7
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm1, xmm3
- pshufd xmm2, xmm1, 01Bh
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
+ pmullw xmm1, xmm1
+ pmullw xmm2, xmm2
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+ punpcklwd xmm1, xmm7
+ punpckhwd xmm3, xmm7
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm4, xmm7
+ paddd xmm1, xmm2
+ paddd xmm3, xmm4
+ paddd xmm1, xmm3
+ pshufd xmm2, xmm1, 01Bh
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0B1h
+ paddd xmm1, xmm2
- movd r2d, xmm0
- and r2, 0ffffh ; effective low word truncated
- mov r3, r2
- imul r2, r3
- sar r2, $04
- movd retrd, xmm1
- sub retrd, r2d
+ movd r2d, xmm0
+ and r2, 0ffffh ; effective low word truncated
+ mov r3, r2
+ imul r2, r3
+ sar r2, $04
+ movd retrd, xmm1
+ sub retrd, r2d
- add r7,32
- add r7,r5
+ add r7,32
+ add r7,r5
%ifdef X86_32
- pop r6
- pop r5
- pop r4
- pop r3
+ pop r6
+ pop r5
+ pop r4
+ pop r3
%endif
- POP_XMM
+ POP_XMM
- ret
+ ret
;***********************************************************************
-; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
+; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
;***********************************************************************
WELS_EXTERN AnalysisVaaInfoIntra_ssse3
@@ -269,47 +269,47 @@
mov r4,r2
sal r4,$01 ;r4 = 4*iLineSize
- pxor xmm7, xmm7
+ pxor xmm7, xmm7
- ; loops
- VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ ; loops
+ VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7],xmm0
- lea r0,[r0+r4]
- VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+ lea r0,[r0+r4]
+ VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [r7+8],xmm1
- lea r0,[r0+r4]
- VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ lea r0,[r0+r4]
+ VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7+16],xmm0
- lea r0,[r0+r4]
- VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+ lea r0,[r0+r4]
+ VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [r7+24],xmm1
- movdqa xmm0,[r7]
- movdqa xmm1,[r7+16]
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets
+ movdqa xmm0,[r7]
+ movdqa xmm1,[r7+16]
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets
- pmullw xmm1, xmm1
- pmullw xmm2, xmm2
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
- punpcklwd xmm1, xmm7
- punpckhwd xmm3, xmm7
- punpcklwd xmm2, xmm7
- punpckhwd xmm4, xmm7
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm1, xmm3
- pshufd xmm2, xmm1, 01Bh
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
+ pmullw xmm1, xmm1
+ pmullw xmm2, xmm2
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+ punpcklwd xmm1, xmm7
+ punpckhwd xmm3, xmm7
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm4, xmm7
+ paddd xmm1, xmm2
+ paddd xmm3, xmm4
+ paddd xmm1, xmm3
+ pshufd xmm2, xmm1, 01Bh
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0B1h
+ paddd xmm1, xmm2
movd r2d, xmm0
@@ -318,94 +318,94 @@
imul r2, r3
sar r2, $04
movd retrd, xmm1
- sub retrd, r2d
+ sub retrd, r2d
- add r7,32
- add r7,r5
+ add r7,32
+ add r7,r5
%ifdef X86_32
- pop r6
- pop r5
- pop r4
- pop r3
+ pop r6
+ pop r5
+ pop r4
+ pop r3
%endif
- POP_XMM
+ POP_XMM
- ret
+ ret
;***********************************************************************
-; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
+; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
;***********************************************************************
WELS_EXTERN MdInterAnalysisVaaInfo_sse41
- %assign push_num 0
- LOAD_1_PARA
- movdqa xmm0,[r0]
- pshufd xmm1, xmm0, 01Bh
- paddd xmm1, xmm0
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
- psrad xmm1, 02h ; iAverageSad
- movdqa xmm2, xmm1
- psrad xmm2, 06h
- movdqa xmm3, xmm0 ; iSadBlock
- psrad xmm3, 06h
- psubd xmm3, xmm2
- pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets
- pshufd xmm4, xmm3, 01Bh
- paddd xmm4, xmm3
- pshufd xmm3, xmm4, 0B1h
- paddd xmm3, xmm4
- movd r0d, xmm3
- cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
+ %assign push_num 0
+ LOAD_1_PARA
+ movdqa xmm0,[r0]
+ pshufd xmm1, xmm0, 01Bh
+ paddd xmm1, xmm0
+ pshufd xmm2, xmm1, 0B1h
+ paddd xmm1, xmm2
+ psrad xmm1, 02h ; iAverageSad
+ movdqa xmm2, xmm1
+ psrad xmm2, 06h
+ movdqa xmm3, xmm0 ; iSadBlock
+ psrad xmm3, 06h
+ psubd xmm3, xmm2
+ pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets
+ pshufd xmm4, xmm3, 01Bh
+ paddd xmm4, xmm3
+ pshufd xmm3, xmm4, 0B1h
+ paddd xmm3, xmm4
+ movd r0d, xmm3
+ cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
- jb near .threshold_exit
- pshufd xmm0, xmm0, 01Bh
- pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
- movmskps retrd, xmm0
- ret
+ jb near .threshold_exit
+ pshufd xmm0, xmm0, 01Bh
+ pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
+ movmskps retrd, xmm0
+ ret
.threshold_exit:
- mov retrd, 15
- ret
+ mov retrd, 15
+ ret
;***********************************************************************
-; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
+; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
;***********************************************************************
WELS_EXTERN MdInterAnalysisVaaInfo_sse2
- %assign push_num 0
- LOAD_1_PARA
- movdqa xmm0, [r0]
- pshufd xmm1, xmm0, 01Bh
- paddd xmm1, xmm0
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
- psrad xmm1, 02h ; iAverageSad
- movdqa xmm2, xmm1
- psrad xmm2, 06h
- movdqa xmm3, xmm0 ; iSadBlock
- psrad xmm3, 06h
- psubd xmm3, xmm2
+ %assign push_num 0
+ LOAD_1_PARA
+ movdqa xmm0, [r0]
+ pshufd xmm1, xmm0, 01Bh
+ paddd xmm1, xmm0
+ pshufd xmm2, xmm1, 0B1h
+ paddd xmm1, xmm2
+ psrad xmm1, 02h ; iAverageSad
+ movdqa xmm2, xmm1
+ psrad xmm2, 06h
+ movdqa xmm3, xmm0 ; iSadBlock
+ psrad xmm3, 06h
+ psubd xmm3, xmm2
- ; to replace pmulld functionality as below
- movdqa xmm2, xmm3
- pmuludq xmm2, xmm3
- pshufd xmm4, xmm3, 0B1h
- pmuludq xmm4, xmm4
- movdqa xmm5, xmm2
- punpckldq xmm5, xmm4
- punpckhdq xmm2, xmm4
- punpcklqdq xmm5, xmm2
+ ; to replace pmulld functionality as below
+ movdqa xmm2, xmm3
+ pmuludq xmm2, xmm3
+ pshufd xmm4, xmm3, 0B1h
+ pmuludq xmm4, xmm4
+ movdqa xmm5, xmm2
+ punpckldq xmm5, xmm4
+ punpckhdq xmm2, xmm4
+ punpcklqdq xmm5, xmm2
- pshufd xmm4, xmm5, 01Bh
- paddd xmm4, xmm5
- pshufd xmm5, xmm4, 0B1h
- paddd xmm5, xmm4
+ pshufd xmm4, xmm5, 01Bh
+ paddd xmm4, xmm5
+ pshufd xmm5, xmm4, 0B1h
+ paddd xmm5, xmm4
- movd r0d, xmm5
- cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
- jb near .threshold_exit
- pshufd xmm0, xmm0, 01Bh
- pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
- movmskps retrd, xmm0
- ret
+ movd r0d, xmm5
+ cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
+ jb near .threshold_exit
+ pshufd xmm0, xmm0, 01Bh
+ pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
+ movmskps retrd, xmm0
+ ret
.threshold_exit:
- mov retrd, 15
- ret
+ mov retrd, 15
+ ret
--- a/codec/decoder/core/arm/block_add_neon.S
+++ b/codec/decoder/core/arm/block_add_neon.S
@@ -36,67 +36,67 @@
#ifdef __APPLE__
.macro ROW_TRANSFORM_1_STEP
-// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
- vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
- vshr.s16 $8, $1, #1
- vshr.s16 $9, $3, #1
- vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
- vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
-// }
+// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
+ vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
+ vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
+ vshr.s16 $8, $1, #1
+ vshr.s16 $9, $3, #1
+ vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
+ vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
+// }
.endm
-.macro TRANSFORM_4BYTES // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
+.macro TRANSFORM_4BYTES // both row & col transform used
+// { // output: f_q[0]~[3], input: e_q[0]~[3];
+ vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
+ vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
+ vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
+ vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
+// }
.endm
.macro COL_TRANSFORM_1_STEP
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
- vshr.s32 $6, $1, #1
- vshr.s32 $7, $3, #1
- vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
+// { // input: src_q[0]~[3], output: e_q[0]~[3];
+ vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
+ vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
+ vshr.s32 $6, $1, #1
+ vshr.s32 $7, $3, #1
+ vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+// }
.endm
#else
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
- vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
- vshr.s16 \arg8, \arg1, #1
- vshr.s16 \arg9, \arg3, #1
- vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
- vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
-// }
+// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
+ vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
+ vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
+ vshr.s16 \arg8, \arg1, #1
+ vshr.s16 \arg9, \arg3, #1
+ vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
+ vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
+// }
.endm
.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
+// { // output: f_q[0]~[3], input: e_q[0]~[3];
+ vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
+ vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
+ vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
+ vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
+// }
.endm
.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
- vshr.s32 \arg6, \arg1, #1
- vshr.s32 \arg7, \arg3, #1
- vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
+// { // input: src_q[0]~[3], output: e_q[0]~[3];
+ vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
+ vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
+ vshr.s32 \arg6, \arg1, #1
+ vshr.s32 \arg7, \arg3, #1
+ vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+// }
.endm
#endif
// r0 int16_t* block,
@@ -103,61 +103,61 @@
// r1 int8_t* non_zero_count,
WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
- vld1.64 {d0-d2}, [r1]
+ vld1.64 {d0-d2}, [r1]
- vceq.s8 q0, q0, #0
- vceq.s8 d2, d2, #0
- vmvn q0, q0
- vmvn d2, d2
- vabs.s8 q0, q0
- vabs.s8 d2, d2
+ vceq.s8 q0, q0, #0
+ vceq.s8 d2, d2, #0
+ vmvn q0, q0
+ vmvn d2, d2
+ vabs.s8 q0, q0
+ vabs.s8 d2, d2
- vst1.64 {d0-d2}, [r1]
+ vst1.64 {d0-d2}, [r1]
WELS_ASM_FUNC_END
-// uint8_t *pred, const int32_t stride, int16_t *rs
+// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
- vld4.s16 {d0, d1, d2, d3}, [r2] // cost 3 cycles!
+ vld4.s16 {d0, d1, d2, d3}, [r2] // cost 3 cycles!
- ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q8, q9, q10, q11, d4, d5
+ ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q8, q9, q10, q11, d4, d5
- TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11
+ TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11
- // transform element 32bits
- vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
- vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
- vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
- vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+ // transform element 32bits
+ vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+ vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+ vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+ vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
- COL_TRANSFORM_1_STEP q0, q1, q2, q3, q8, q9, q10, q11
+ COL_TRANSFORM_1_STEP q0, q1, q2, q3, q8, q9, q10, q11
- TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11
+ TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11
- //after clip_table[MAX_NEG_CROP] into [0, 255]
- mov r2, r0
- vld1.32 {d20[0]},[r0],r1
- vld1.32 {d20[1]},[r0],r1
- vld1.32 {d22[0]},[r0],r1
- vld1.32 {d22[1]},[r0]
+ //after clip_table[MAX_NEG_CROP] into [0, 255]
+ mov r2, r0
+ vld1.32 {d20[0]},[r0],r1
+ vld1.32 {d20[1]},[r0],r1
+ vld1.32 {d22[0]},[r0],r1
+ vld1.32 {d22[1]},[r0]
- vrshrn.s32 d16, q0, #6
- vrshrn.s32 d17, q1, #6
- vrshrn.s32 d18, q2, #6
- vrshrn.s32 d19, q3, #6
+ vrshrn.s32 d16, q0, #6
+ vrshrn.s32 d17, q1, #6
+ vrshrn.s32 d18, q2, #6
+ vrshrn.s32 d19, q3, #6
- vmovl.u8 q0,d20
- vmovl.u8 q1,d22
- vadd.s16 q0,q8
- vadd.s16 q1,q9
+ vmovl.u8 q0,d20
+ vmovl.u8 q1,d22
+ vadd.s16 q0,q8
+ vadd.s16 q1,q9
- vqmovun.s16 d20,q0
- vqmovun.s16 d22,q1
+ vqmovun.s16 d20,q0
+ vqmovun.s16 d22,q1
- vst1.32 {d20[0]},[r2],r1
- vst1.32 {d20[1]},[r2],r1
- vst1.32 {d22[0]},[r2],r1
- vst1.32 {d22[1]},[r2]
+ vst1.32 {d20[0]},[r2],r1
+ vst1.32 {d20[1]},[r2],r1
+ vst1.32 {d22[0]},[r2],r1
+ vst1.32 {d22[1]},[r2]
WELS_ASM_FUNC_END
#endif
--- a/codec/decoder/core/arm/intra_pred_neon.S
+++ b/codec/decoder/core/arm/intra_pred_neon.S
@@ -38,45 +38,45 @@
#ifdef __APPLE__
//Global macro
.macro GET_8BYTE_DATA
- vld1.8 {$0[0]}, [$1], $2
- vld1.8 {$0[1]}, [$1], $2
- vld1.8 {$0[2]}, [$1], $2
- vld1.8 {$0[3]}, [$1], $2
- vld1.8 {$0[4]}, [$1], $2
- vld1.8 {$0[5]}, [$1], $2
- vld1.8 {$0[6]}, [$1], $2
- vld1.8 {$0[7]}, [$1], $2
+ vld1.8 {$0[0]}, [$1], $2
+ vld1.8 {$0[1]}, [$1], $2
+ vld1.8 {$0[2]}, [$1], $2
+ vld1.8 {$0[3]}, [$1], $2
+ vld1.8 {$0[4]}, [$1], $2
+ vld1.8 {$0[5]}, [$1], $2
+ vld1.8 {$0[6]}, [$1], $2
+ vld1.8 {$0[7]}, [$1], $2
.endmacro
#else
//Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2
- vld1.8 {\arg0[0]}, [\arg1], \arg2
- vld1.8 {\arg0[1]}, [\arg1], \arg2
- vld1.8 {\arg0[2]}, [\arg1], \arg2
- vld1.8 {\arg0[3]}, [\arg1], \arg2
- vld1.8 {\arg0[4]}, [\arg1], \arg2
- vld1.8 {\arg0[5]}, [\arg1], \arg2
- vld1.8 {\arg0[6]}, [\arg1], \arg2
- vld1.8 {\arg0[7]}, [\arg1], \arg2
+ vld1.8 {\arg0[0]}, [\arg1], \arg2
+ vld1.8 {\arg0[1]}, [\arg1], \arg2
+ vld1.8 {\arg0[2]}, [\arg1], \arg2
+ vld1.8 {\arg0[3]}, [\arg1], \arg2
+ vld1.8 {\arg0[4]}, [\arg1], \arg2
+ vld1.8 {\arg0[5]}, [\arg1], \arg2
+ vld1.8 {\arg0[6]}, [\arg1], \arg2
+ vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
#endif
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
- //Get the top line data to 'q0'
- sub r2, r0, r1
- vldm r2, {d0, d1}
+ //Get the top line data to 'q0'
+ sub r2, r0, r1
+ vldm r2, {d0, d1}
- mov r2, r0
- mov r3, #4
- //Set the top line to the each line of MB(16*16)
+ mov r2, r0
+ mov r3, #4
+ //Set the top line to the each line of MB(16*16)
loop_0_get_i16x16_luma_pred_v:
- vst1.8 {d0,d1}, [r2], r1
- vst1.8 {d0,d1}, [r2], r1
- vst1.8 {d0,d1}, [r2], r1
- vst1.8 {d0,d1}, [r2], r1
- subs r3, #1
- bne loop_0_get_i16x16_luma_pred_v
+ vst1.8 {d0,d1}, [r2], r1
+ vst1.8 {d0,d1}, [r2], r1
+ vst1.8 {d0,d1}, [r2], r1
+ vst1.8 {d0,d1}, [r2], r1
+ subs r3, #1
+ bne loop_0_get_i16x16_luma_pred_v
WELS_ASM_FUNC_END
@@ -83,59 +83,59 @@
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon
- sub r2, r0, #1
- mov r3, #4
+ sub r2, r0, #1
+ mov r3, #4
loop_0_get_i16x16_luma_pred_h:
- //Get one byte data from left side
- vld1.8 {d0[],d1[]}, [r2], r1
- vld1.8 {d2[],d3[]}, [r2], r1
- vld1.8 {d4[],d5[]}, [r2], r1
- vld1.8 {d6[],d7[]}, [r2], r1
+ //Get one byte data from left side
+ vld1.8 {d0[],d1[]}, [r2], r1
+ vld1.8 {d2[],d3[]}, [r2], r1
+ vld1.8 {d4[],d5[]}, [r2], r1
+ vld1.8 {d6[],d7[]}, [r2], r1
- //Set the line of MB using the left side byte data
- vst1.8 {d0,d1}, [r0], r1
- vst1.8 {d2,d3}, [r0], r1
- vst1.8 {d4,d5}, [r0], r1
- vst1.8 {d6,d7}, [r0], r1
+ //Set the line of MB using the left side byte data
+ vst1.8 {d0,d1}, [r0], r1
+ vst1.8 {d2,d3}, [r0], r1
+ vst1.8 {d4,d5}, [r0], r1
+ vst1.8 {d6,d7}, [r0], r1
- subs r3, #1
- bne loop_0_get_i16x16_luma_pred_h
+ subs r3, #1
+ bne loop_0_get_i16x16_luma_pred_h
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon
- //stmdb sp!, { r2-r5, lr}
- //Get the left vertical line data
- sub r2, r0, #1
- GET_8BYTE_DATA d0, r2, r1
- GET_8BYTE_DATA d1, r2, r1
+ //stmdb sp!, { r2-r5, lr}
+ //Get the left vertical line data
+ sub r2, r0, #1
+ GET_8BYTE_DATA d0, r2, r1
+ GET_8BYTE_DATA d1, r2, r1
- //Get the top horizontal line data
- sub r2, r0, r1
- vldm r2, {d2, d3}
+ //Get the top horizontal line data
+ sub r2, r0, r1
+ vldm r2, {d2, d3}
- //Calculate the sum of top horizontal line data and vertical line data
- vpaddl.u8 q0, q0
- vpaddl.u8 q1, q1
- vadd.u16 q0, q0, q1
- vadd.u16 d0, d0, d1
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
+ //Calculate the sum of top horizontal line data and vertical line data
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
- //Calculate the mean value
- vrshr.u16 d0, d0, #5
- vdup.8 q0, d0[0]
+ //Calculate the mean value
+ vrshr.u16 d0, d0, #5
+ vdup.8 q0, d0[0]
- //Set the mean value to the all of member of MB
- mov r2, #4
+ //Set the mean value to the all of member of MB
+ mov r2, #4
loop_0_get_i16x16_luma_pred_dc_both:
- vst1.8 {d0,d1}, [r0], r1
- vst1.8 {d0,d1}, [r0], r1
- vst1.8 {d0,d1}, [r0], r1
- vst1.8 {d0,d1}, [r0], r1
- subs r2, #1
- bne loop_0_get_i16x16_luma_pred_dc_both
+ vst1.8 {d0,d1}, [r0], r1
+ vst1.8 {d0,d1}, [r0], r1
+ vst1.8 {d0,d1}, [r0], r1
+ vst1.8 {d0,d1}, [r0], r1
+ subs r2, #1
+ bne loop_0_get_i16x16_luma_pred_dc_both
WELS_ASM_FUNC_END
@@ -149,106 +149,106 @@
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon
- //stmdb sp!, { r2-r5, lr}
+ //stmdb sp!, { r2-r5, lr}
- //Load the table {(8,7,6,5,4,3,2,1) * 5}
- adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
- vldr d0, [r2]
+ //Load the table {(8,7,6,5,4,3,2,1) * 5}
+ adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
+ vldr d0, [r2]
- //Pack the top[-1] ~ top[6] to d1
- sub r2, r0, r1
- sub r3, r2, #1
- vld1.8 d1, [r3]
+ //Pack the top[-1] ~ top[6] to d1
+ sub r2, r0, r1
+ sub r3, r2, #1
+ vld1.8 d1, [r3]
- //Pack the top[8] ~ top[15] to d2
- add r3, #9
- vld1.8 d2, [r3]
+ //Pack the top[8] ~ top[15] to d2
+ add r3, #9
+ vld1.8 d2, [r3]
- //Save the top[15] to d6 for next step
- vdup.u8 d6, d2[7]
+ //Save the top[15] to d6 for next step
+ vdup.u8 d6, d2[7]
- //Get and pack left[-1] ~ left[6] to d4
- sub r3, r2, #1
- GET_8BYTE_DATA d4, r3, r1
+ //Get and pack left[-1] ~ left[6] to d4
+ sub r3, r2, #1
+ GET_8BYTE_DATA d4, r3, r1
- //Get and pack left[8] ~ left[15] to d3
- add r3, r1
- GET_8BYTE_DATA d3, r3, r1
+ //Get and pack left[8] ~ left[15] to d3
+ add r3, r1
+ GET_8BYTE_DATA d3, r3, r1
- //Save the left[15] to d7 for next step
- vdup.u8 d7, d3[7]
+ //Save the left[15] to d7 for next step
+ vdup.u8 d7, d3[7]
- //reverse the sequence of d2,d3
- vrev64.8 q1, q1
+ //reverse the sequence of d2,d3
+ vrev64.8 q1, q1
- vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
- vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
+ vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
+ vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
- vmovl.u8 q0, d0
- vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
- vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
+ vmovl.u8 q0, d0
+ vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
+ vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
- //Calculate the sum of items of q1, q2
- vpadd.s16 d0, d2, d3
- vpadd.s16 d1, d4, d5
- vpaddl.s16 q0, q0
- vpaddl.s32 q0, q0
+ //Calculate the sum of items of q1, q2
+ vpadd.s16 d0, d2, d3
+ vpadd.s16 d1, d4, d5
+ vpaddl.s16 q0, q0
+ vpaddl.s32 q0, q0
- //Get the value of 'b', 'c' and extend to q1, q2.
- vrshr.s64 q0, #6
- vdup.s16 q1, d0[0]
- vdup.s16 q2, d1[0]
+ //Get the value of 'b', 'c' and extend to q1, q2.
+ vrshr.s64 q0, #6
+ vdup.s16 q1, d0[0]
+ vdup.s16 q2, d1[0]
- //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
- adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
- vld1.32 {d0}, [r2]
+ //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
+ adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
+ vld1.32 {d0}, [r2]
- //Get the value of 'a' and save to q3
- vaddl.u8 q3, d6, d7
- vshl.u16 q3, #4
+ //Get the value of 'a' and save to q3
+ vaddl.u8 q3, d6, d7
+ vshl.u16 q3, #4
- //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
- vmovl.s8 q0, d0
- vmla.s16 q3, q0, q1
- vmla.s16 q3, q2, d0[0]
+ //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
+ vmovl.s8 q0, d0
+ vmla.s16 q3, q0, q1
+ vmla.s16 q3, q2, d0[0]
- //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
- vshl.s16 q8, q1, #3
- vadd.s16 q8, q3
+ //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
+ vshl.s16 q8, q1, #3
+ vadd.s16 q8, q3
- //right shift 5 bits and rounding
- vqrshrun.s16 d0, q3, #5
- vqrshrun.s16 d1, q8, #5
+ //right shift 5 bits and rounding
+ vqrshrun.s16 d0, q3, #5
+ vqrshrun.s16 d1, q8, #5
- //Set the line of MB
- vst1.u32 {d0,d1}, [r0], r1
+ //Set the line of MB
+ vst1.u32 {d0,d1}, [r0], r1
- //Do the same processing for setting other lines
- mov r2, #15
+ //Do the same processing for setting other lines
+ mov r2, #15
loop_0_get_i16x16_luma_pred_plane:
- vadd.s16 q3, q2
- vadd.s16 q8, q2
- vqrshrun.s16 d0, q3, #5
- vqrshrun.s16 d1, q8, #5
- vst1.u32 {d0,d1}, [r0], r1
- subs r2, #1
- bne loop_0_get_i16x16_luma_pred_plane
+ vadd.s16 q3, q2
+ vadd.s16 q8, q2
+ vqrshrun.s16 d0, q3, #5
+ vqrshrun.s16 d1, q8, #5
+ vst1.u32 {d0,d1}, [r0], r1
+ subs r2, #1
+ bne loop_0_get_i16x16_luma_pred_plane
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row (4 bytes)
- sub r2, r0, r1
- ldr r2, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row (4 bytes)
+ sub r2, r0, r1
+ ldr r2, [r2]
- //Set the luma MB using top line
- str r2, [r0], r1
- str r2, [r0], r1
- str r2, [r0], r1
- str r2, [r0]
+ //Set the luma MB using top line
+ str r2, [r0], r1
+ str r2, [r0], r1
+ str r2, [r0], r1
+ str r2, [r0]
WELS_ASM_FUNC_END
@@ -255,97 +255,97 @@
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the left column (4 bytes)
- sub r2, r0, #1
- vld1.8 {d0[]}, [r2], r1
- vld1.8 {d1[]}, [r2], r1
- vld1.8 {d2[]}, [r2], r1
- vld1.8 {d3[]}, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the left column (4 bytes)
+ sub r2, r0, #1
+ vld1.8 {d0[]}, [r2], r1
+ vld1.8 {d1[]}, [r2], r1
+ vld1.8 {d2[]}, [r2], r1
+ vld1.8 {d3[]}, [r2]
- //Set the luma MB using the left side byte
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d1[0]}, [r0], r1
- vst1.32 {d2[0]}, [r0], r1
- vst1.32 {d3[0]}, [r0]
+ //Set the luma MB using the left side byte
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d1[0]}, [r0], r1
+ vst1.32 {d2[0]}, [r0], r1
+ vst1.32 {d3[0]}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row data(8 bytes)
- sub r2, r0, r1
- vld1.32 {d0}, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row data(8 bytes)
+ sub r2, r0, r1
+ vld1.32 {d0}, [r2]
- //For "t7 + (t7<<1)"
- vdup.8 d1, d0[7]
+ //For "t7 + (t7<<1)"
+ vdup.8 d1, d0[7]
- //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
- vext.8 d1, d0, d1, #1
- vaddl.u8 q1, d1, d0
+ //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q1, d1, d0
- //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
- vext.8 q2, q1, q1, #14
- vadd.u16 q0, q1, q2
+ //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
+ vext.8 q2, q1, q1, #14
+ vadd.u16 q0, q1, q2
- //right shift 2 bits and rounding
- vqrshrn.u16 d0, q0, #2
+ //right shift 2 bits and rounding
+ vqrshrn.u16 d0, q0, #2
- //Save "ddl0, ddl1, ddl2, ddl3"
- vext.8 d1, d0, d0, #1
- vst1.32 d1[0], [r0], r1
+ //Save "ddl0, ddl1, ddl2, ddl3"
+ vext.8 d1, d0, d0, #1
+ vst1.32 d1[0], [r0], r1
- //Save "ddl1, ddl2, ddl3, ddl4"
- vext.8 d1, d0, d0, #2
- vst1.32 d1[0], [r0], r1
+ //Save "ddl1, ddl2, ddl3, ddl4"
+ vext.8 d1, d0, d0, #2
+ vst1.32 d1[0], [r0], r1
- //Save "ddl2, ddl3, ddl4, ddl5"
- vext.8 d1, d0, d0, #3
- vst1.32 d1[0], [r0], r1
+ //Save "ddl2, ddl3, ddl4, ddl5"
+ vext.8 d1, d0, d0, #3
+ vst1.32 d1[0], [r0], r1
- //Save "ddl3, ddl4, ddl5, ddl6"
- vst1.32 d0[1], [r0]
+ //Save "ddl3, ddl4, ddl5, ddl6"
+ vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row (4 bytes)
- sub r2, r0, r1
- vld1.32 {d0[1]}, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row (4 bytes)
+ sub r2, r0, r1
+ vld1.32 {d0[1]}, [r2]
- //Load the left column (5 bytes)
- sub r2, #1
- vld1.8 {d0[3]}, [r2], r1
- vld1.8 {d0[2]}, [r2], r1
- vld1.8 {d0[1]}, [r2], r1
- vld1.8 {d0[0]}, [r2], r1
- vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing
+ //Load the left column (5 bytes)
+ sub r2, #1
+ vld1.8 {d0[3]}, [r2], r1
+ vld1.8 {d0[2]}, [r2], r1
+ vld1.8 {d0[1]}, [r2], r1
+ vld1.8 {d0[0]}, [r2], r1
+ vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing
- vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
- //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
+ vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
+ //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
- //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
- vaddl.u8 q2, d2, d0
+ //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
+ vaddl.u8 q2, d2, d0
- //q1:{TL0+LT0,LT0+T01,...L12+L23}
- vext.8 q3, q3, q2, #14
- vadd.u16 q1, q2, q3
+ //q1:{TL0+LT0,LT0+T01,...L12+L23}
+ vext.8 q3, q3, q2, #14
+ vadd.u16 q1, q2, q3
- //right shift 2 bits and rounding
- vqrshrn.u16 d0, q1, #2
+ //right shift 2 bits and rounding
+ vqrshrn.u16 d0, q1, #2
- //Adjust the data sequence for setting luma MB of 'pred'
- vst1.32 d0[1], [r0], r1
- vext.8 d0, d0, d0, #7
- vst1.32 d0[1], [r0], r1
- vext.8 d0, d0, d0, #7
- vst1.32 d0[1], [r0], r1
- vext.8 d0, d0, d0, #7
- vst1.32 d0[1], [r0]
+ //Adjust the data sequence for setting luma MB of 'pred'
+ vst1.32 d0[1], [r0], r1
+ vext.8 d0, d0, d0, #7
+ vst1.32 d0[1], [r0], r1
+ vext.8 d0, d0, d0, #7
+ vst1.32 d0[1], [r0], r1
+ vext.8 d0, d0, d0, #7
+ vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
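
The down-right routine above packs the nine reconstructed neighbours {L3,L2,L1,L0,LT,T0,T1,T2,T3} into one vector, forms pairwise sums, adds a shifted copy and rounds with vqrshrn, which amounts to the three-tap (a + 2*b + c + 2) >> 2 filter slid along the diagonal. A scalar sketch of that computation, assuming the same pPred/kiStride convention (illustrative helper name):

    #include <stdint.h>

    /* Down-right: filter the nine neighbours and slide a 4-sample window
     * down by one filtered value per row. */
    static void I4x4PredDDR_ref(uint8_t* pPred, int32_t kiStride) {
        const uint8_t* pTop = pPred - kiStride;
        uint8_t s[9], f[7];
        s[0] = pPred[3 * kiStride - 1];   /* L3 */
        s[1] = pPred[2 * kiStride - 1];   /* L2 */
        s[2] = pPred[1 * kiStride - 1];   /* L1 */
        s[3] = pPred[0 * kiStride - 1];   /* L0 */
        s[4] = pTop[-1];                  /* LT */
        s[5] = pTop[0];                   /* T0 */
        s[6] = pTop[1];
        s[7] = pTop[2];
        s[8] = pTop[3];                   /* T3 */
        for (int i = 0; i < 7; i++)
            f[i] = (uint8_t)((s[i] + 2 * s[i + 1] + s[i + 2] + 2) >> 2);
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pPred[y * kiStride + x] = f[3 - y + x];
    }
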
@@ -352,31 +352,31 @@
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row (8 bytes)
- sub r2, r0, r1
- vld1.32 {d0}, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row (8 bytes)
+ sub r2, r0, r1
+ vld1.32 {d0}, [r2]
- vext.8 d1, d0, d0, #1
- vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
+ vext.8 d1, d0, d0, #1
+ vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
- vext.8 q2, q1, q1, #2
- vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
+ vext.8 q2, q1, q1, #2
+ vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
- //calculate the "vl0,vl1,vl2,vl3,vl4"
- vqrshrn.u16 d0, q1, #1
+ //calculate the "vl0,vl1,vl2,vl3,vl4"
+ vqrshrn.u16 d0, q1, #1
- //calculate the "vl5,vl6,vl7,vl8,vl9"
- vqrshrn.u16 d1, q2, #2
+ //calculate the "vl5,vl6,vl7,vl8,vl9"
+ vqrshrn.u16 d1, q2, #2
- //Adjust the data sequence for setting the luma MB
- vst1.32 d0[0], [r0], r1
- vst1.32 d1[0], [r0], r1
- vext.8 d0, d0, d0, #1
- vext.8 d1, d1, d1, #1
- vst1.32 d0[0], [r0], r1
- vst1.32 d1[0], [r0]
+ //Adjust the data sequence for setting the luma MB
+ vst1.32 d0[0], [r0], r1
+ vst1.32 d1[0], [r0], r1
+ vext.8 d0, d0, d0, #1
+ vext.8 d1, d1, d1, #1
+ vst1.32 d0[0], [r0], r1
+ vst1.32 d1[0], [r0]
WELS_ASM_FUNC_END
@@ -383,152 +383,152 @@
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row (4 bytes)
- sub r2, r0, r1
- vld1.32 {d0[1]}, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row (4 bytes)
+ sub r2, r0, r1
+ vld1.32 {d0[1]}, [r2]
- //Load the left column (4 bytes)
- sub r2, #1
- vld1.8 {d0[3]}, [r2], r1
- vld1.8 {d0[2]}, [r2], r1
- vld1.8 {d0[1]}, [r2], r1
- vld1.8 {d0[0]}, [r2]
+ //Load the left column (4 bytes)
+ sub r2, #1
+ vld1.8 {d0[3]}, [r2], r1
+ vld1.8 {d0[2]}, [r2], r1
+ vld1.8 {d0[1]}, [r2], r1
+ vld1.8 {d0[0]}, [r2]
- vext.8 d1, d0, d0, #7
- vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
+ vext.8 d1, d0, d0, #7
+ vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
- vext.u8 q2, q1, q1, #14
- vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
+ vext.u8 q2, q1, q1, #14
+ vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
- //Calculate the vr0 ~ vr9
- vqrshrn.u16 d1, q2, #2
- vqrshrn.u16 d0, q1, #1
+ //Calculate the vr0 ~ vr9
+ vqrshrn.u16 d1, q2, #2
+ vqrshrn.u16 d0, q1, #1
- //Adjust the data sequence for setting the luma MB
- vst1.32 d0[1], [r0], r1
- vst1.32 d1[1], [r0], r1
- add r2, r0, r1
- vst1.8 d1[3], [r0]!
- vst1.16 d0[2], [r0]!
- vst1.8 d0[6], [r0]!
- vst1.8 d1[2], [r2]!
- vst1.16 d1[2], [r2]!
- vst1.8 d1[6], [r2]
+ //Adjust the data sequence for setting the luma MB
+ vst1.32 d0[1], [r0], r1
+ vst1.32 d1[1], [r0], r1
+ add r2, r0, r1
+ vst1.8 d1[3], [r0]!
+ vst1.16 d0[2], [r0]!
+ vst1.8 d0[6], [r0]!
+ vst1.8 d1[2], [r2]!
+ vst1.16 d1[2], [r2]!
+ vst1.8 d1[6], [r2]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the left column data
- sub r2, r0, #1
- mov r3, #3
- mul r3, r1
- add r3, r2
- vld1.8 {d0[]}, [r3]
- vld1.8 {d0[4]}, [r2], r1
- vld1.8 {d0[5]}, [r2], r1
- vld1.8 {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
+ //stmdb sp!, { r2-r5, lr}
+ //Load the left column data
+ sub r2, r0, #1
+ mov r3, #3
+ mul r3, r1
+ add r3, r2
+ vld1.8 {d0[]}, [r3]
+ vld1.8 {d0[4]}, [r2], r1
+ vld1.8 {d0[5]}, [r2], r1
+ vld1.8 {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
- vext.8 d1, d0, d0, #1
- vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
+ vext.8 d1, d0, d0, #1
+ vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
- vext.u8 d2, d5, d4, #2
- vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
+ vext.u8 d2, d5, d4, #2
+ vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
- //Calculate the hu0 ~ hu5
- vqrshrn.u16 d2, q2, #1
- vqrshrn.u16 d1, q1, #2
+ //Calculate the hu0 ~ hu5
+ vqrshrn.u16 d2, q2, #1
+ vqrshrn.u16 d1, q1, #2
- //Adjust the data sequence for setting the luma MB
- vzip.8 d2, d1
- vst1.32 d1[0], [r0], r1
- vext.8 d2, d1, d1, #2
- vst1.32 d2[0], [r0], r1
- vst1.32 d1[1], [r0], r1
- vst1.32 d0[0], [r0]
+ //Adjust the data sequence for setting the luma MB
+ vzip.8 d2, d1
+ vst1.32 d1[0], [r0], r1
+ vext.8 d2, d1, d1, #2
+ vst1.32 d2[0], [r0], r1
+ vst1.32 d1[1], [r0], r1
+ vst1.32 d0[0], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the data
- sub r2, r0, r1
- sub r2, #1
- vld1.32 {d0[1]}, [r2], r1
- vld1.8 {d0[3]}, [r2], r1
- vld1.8 {d0[2]}, [r2], r1
- vld1.8 {d0[1]}, [r2], r1
- vld1.8 {d0[0]}, [r2] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
+ //stmdb sp!, { r2-r5, lr}
+ //Load the data
+ sub r2, r0, r1
+ sub r2, #1
+ vld1.32 {d0[1]}, [r2], r1
+ vld1.8 {d0[3]}, [r2], r1
+ vld1.8 {d0[2]}, [r2], r1
+ vld1.8 {d0[1]}, [r2], r1
+ vld1.8 {d0[0]}, [r2] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
- vext.8 d1, d0, d0, #7
- vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
+ vext.8 d1, d0, d0, #7
+ vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
- vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
- vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
+ vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
+ vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
- //Calculate the hd0~hd9
- vqrshrn.u16 d1, q3, #2
- vqrshrn.u16 d0, q2, #1
+ //Calculate the hd0~hd9
+ vqrshrn.u16 d1, q3, #2
+ vqrshrn.u16 d0, q2, #1
- //Adjust the data sequence for setting the luma MB
- vmov d3, d1
- vtrn.8 d0, d1
- vext.u8 d2, d1, d1, #6
- vst2.16 {d2[3], d3[3]}, [r0], r1
- vst2.16 {d0[2], d1[2]}, [r0], r1
- vmov d3, d0
- vst2.16 {d2[2], d3[2]}, [r0], r1
- vst2.16 {d0[1], d1[1]}, [r0]
+ //Adjust the data sequence for setting the luma MB
+ vmov d3, d1
+ vtrn.8 d0, d1
+ vext.u8 d2, d1, d1, #6
+ vst2.16 {d2[3], d3[3]}, [r0], r1
+ vst2.16 {d0[2], d1[2]}, [r0], r1
+ vmov d3, d0
+ vst2.16 {d2[2], d3[2]}, [r0], r1
+ vst2.16 {d0[1], d1[1]}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon
- //stmdb sp!, { r2-r5, lr}
- //Get the top row (8 byte)
- sub r2, r0, r1
- vldr d0, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ //Get the top row (8 byte)
+ sub r2, r0, r1
+ vldr d0, [r2]
- //Set the chroma MB using top row data
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0]
+ //Set the chroma MB using top row data
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
- //stmdb sp!, { r2-r5, lr}
- ////Get the left column (8 byte)
- sub r2, r0, #1
- vld1.8 {d0[]}, [r2], r1
- vld1.8 {d1[]}, [r2], r1
- vld1.8 {d2[]}, [r2], r1
- vld1.8 {d3[]}, [r2], r1
- vld1.8 {d4[]}, [r2], r1
- vld1.8 {d5[]}, [r2], r1
- vld1.8 {d6[]}, [r2], r1
- vld1.8 {d7[]}, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ ////Get the left column (8 byte)
+ sub r2, r0, #1
+ vld1.8 {d0[]}, [r2], r1
+ vld1.8 {d1[]}, [r2], r1
+ vld1.8 {d2[]}, [r2], r1
+ vld1.8 {d3[]}, [r2], r1
+ vld1.8 {d4[]}, [r2], r1
+ vld1.8 {d5[]}, [r2], r1
+ vld1.8 {d6[]}, [r2], r1
+ vld1.8 {d7[]}, [r2]
- //Set the chroma MB using left column data
- vst1.8 {d0}, [r0], r1
- vst1.8 {d1}, [r0], r1
- vst1.8 {d2}, [r0], r1
- vst1.8 {d3}, [r0], r1
- vst1.8 {d4}, [r0], r1
- vst1.8 {d5}, [r0], r1
- vst1.8 {d6}, [r0], r1
- vst1.8 {d7}, [r0]
+ //Set the chroma MB using left column data
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d1}, [r0], r1
+ vst1.8 {d2}, [r0], r1
+ vst1.8 {d3}, [r0], r1
+ vst1.8 {d4}, [r0], r1
+ vst1.8 {d5}, [r0], r1
+ vst1.8 {d6}, [r0], r1
+ vst1.8 {d7}, [r0]
WELS_ASM_FUNC_END
@@ -576,73 +576,73 @@
CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row data
- sub r2, r0, #1
- sub r2, r1
- vld1.32 {d1[0]}, [r2]
- add r2, #5
- vld1.32 {d0[0]}, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row data
+ sub r2, r0, #1
+ sub r2, r1
+ vld1.32 {d1[0]}, [r2]
+ add r2, #5
+ vld1.32 {d0[0]}, [r2]
- //Load the left column data
- sub r2, #5
- vld1.8 {d1[4]}, [r2], r1
- vld1.8 {d1[5]}, [r2], r1
- vld1.8 {d1[6]}, [r2], r1
- vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
- add r2, r1
- vld1.8 {d0[4]}, [r2], r1
- vld1.8 {d0[5]}, [r2], r1
- vld1.8 {d0[6]}, [r2], r1
- vld1.8 {d0[7]}, [r2] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
+ //Load the left column data
+ sub r2, #5
+ vld1.8 {d1[4]}, [r2], r1
+ vld1.8 {d1[5]}, [r2], r1
+ vld1.8 {d1[6]}, [r2], r1
+ vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
+ add r2, r1
+ vld1.8 {d0[4]}, [r2], r1
+ vld1.8 {d0[5]}, [r2], r1
+ vld1.8 {d0[6]}, [r2], r1
+ vld1.8 {d0[7]}, [r2] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
- //Save T7 to d3 for next step
- vdup.u8 d3, d0[3]
- //Save L7 to d4 for next step
- vdup.u8 d4, d0[7]
+ //Save T7 to d3 for next step
+ vdup.u8 d3, d0[3]
+ //Save L7 to d4 for next step
+ vdup.u8 d4, d0[7]
- //Calculate the value of 'a' and save to q2
- vaddl.u8 q2, d3, d4
- vshl.u16 q2, #4
+ //Calculate the value of 'a' and save to q2
+ vaddl.u8 q2, d3, d4
+ vshl.u16 q2, #4
- //Load the table {{1,2,3,4,1,2,3,4}*17}
- adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
- vld1.32 {d2}, [r2]
+ //Load the table {{1,2,3,4,1,2,3,4}*17}
+ adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
+ vld1.32 {d2}, [r2]
- //Calculate the 'b','c', and save to q0
- vrev32.8 d1, d1
- vsubl.u8 q0, d0, d1
- vmovl.u8 q1, d2
- vmul.s16 q0, q1
- vpaddl.s16 q0, q0
- vpaddl.s32 q0, q0
- vrshr.s64 q0, #5
+ //Calculate the 'b','c', and save to q0
+ vrev32.8 d1, d1
+ vsubl.u8 q0, d0, d1
+ vmovl.u8 q1, d2
+ vmul.s16 q0, q1
+ vpaddl.s16 q0, q0
+ vpaddl.s32 q0, q0
+ vrshr.s64 q0, #5
- //Load the table {-3,-2,-1,0,1,2,3,4} to q3
- adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
- vld1.32 {d6, d7}, [r2]
+ //Load the table {-3,-2,-1,0,1,2,3,4} to q3
+ adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
+ vld1.32 {d6, d7}, [r2]
- //Duplicate the 'b','c' to q0, q1 for SIMD instruction
- vdup.s16 q1, d1[0]
- vdup.s16 q0, d0[0]
+ //Duplicate the 'b','c' to q0, q1 for SIMD instruction
+ vdup.s16 q1, d1[0]
+ vdup.s16 q0, d0[0]
- //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
- vmla.s16 q2, q0, q3
- vmla.s16 q2, q1, d6[0]
- vqrshrun.s16 d0, q2, #5
+ //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
+ vmla.s16 q2, q0, q3
+ vmla.s16 q2, q1, d6[0]
+ vqrshrun.s16 d0, q2, #5
- //Set a line of chroma MB
- vst1.u32 {d0}, [r0], r1
+ //Set a line of chroma MB
+ vst1.u32 {d0}, [r0], r1
- //Do the same processing for each line.
- mov r2, #7
+ //Do the same processing for each line.
+ mov r2, #7
loop_0_get_i_chroma_pred_plane:
- vadd.s16 q2, q1
- vqrshrun.s16 d0, q2, #5
- vst1.u32 {d0}, [r0], r1
- subs r2, #1
- bne loop_0_get_i_chroma_pred_plane
+ vadd.s16 q2, q1
+ vqrshrun.s16 d0, q2, #5
+ vst1.u32 {d0}, [r0], r1
+ subs r2, #1
+ bne loop_0_get_i_chroma_pred_plane
WELS_ASM_FUNC_END
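
The chroma plane routine above follows the formulas in its comments: a = (left[7*kiStride] + top[7]) << 4, b and c are gradients scaled by 17 (the {1,2,3,4}*17 table), and each pixel is (a + b*(j - 3) + c*(i - 3) + 16) >> 5 with unsigned saturation. A scalar C sketch of the same plane fit, assuming the pPred/kiStride convention (helper names illustrative; the 17*H and 17*V scaling also appears in the SSE2 version of this predictor further down in the patch):

    #include <stdint.h>

    static uint8_t Clip255(int v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

    /* 8x8 chroma plane prediction: fit a plane through the top row and left column. */
    static void IChromaPredPlane_ref(uint8_t* pPred, int32_t kiStride) {
        const uint8_t* pTop = pPred - kiStride;   /* T0..T7, pTop[-1] is LT */
        int H = 0, V = 0;
        for (int i = 0; i < 4; i++) {
            H += (i + 1) * (pTop[4 + i] - pTop[2 - i]);
            V += (i + 1) * (pPred[(4 + i) * kiStride - 1] - pPred[(2 - i) * kiStride - 1]);
        }
        int a = (pPred[7 * kiStride - 1] + pTop[7]) << 4;   /* 16 * (L7 + T7) */
        int b = (17 * H + 16) >> 5;
        int c = (17 * V + 16) >> 5;
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                pPred[y * kiStride + x] = Clip255((a + b * (x - 3) + c * (y - 3) + 16) >> 5);
    }

The assembly computes one row with the multiply-accumulate and then just adds c to the accumulator per iteration of the loop, which is why the loop body is so short.
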
--- a/codec/decoder/core/x86/dct.asm
+++ b/codec/decoder/core/x86/dct.asm
@@ -54,7 +54,7 @@
%endmacro
%macro MMX_SumSub 3
- movq %3, %2
+ movq %3, %2
psubw %2, %1
paddw %1, %3
%endmacro
@@ -62,8 +62,8 @@
%macro MMX_IDCT 6
MMX_SumSub %4, %5, %6
MMX_SumSubDiv2 %3, %2, %1
- MMX_SumSub %1, %4, %6
- MMX_SumSub %3, %5, %6
+ MMX_SumSub %1, %4, %6
+ MMX_SumSub %3, %5, %6
%endmacro
@@ -96,13 +96,13 @@
movq mm2, [r2+16]
movq mm3, [r2+24]
- MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
- MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
+ MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
+ MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
- MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
+ MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
- WELS_Zero mm7
- WELS_DW32 mm6
+ WELS_Zero mm7
+ WELS_DW32 mm6
MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0]
MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1]
@@ -111,5 +111,5 @@
MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1]
- emms
+ emms
ret
--- a/codec/decoder/core/x86/intra_pred.asm
+++ b/codec/decoder/core/x86/intra_pred.asm
@@ -36,10 +36,10 @@
;*
;* History
;* 18/09/2009 Created
-;* 19/11/2010 Added
-;* WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2,
-;* WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2
-;* and WelsDecoderIChromaPredDcNA_mmx
+;* 19/11/2010 Added
+;* WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2,
+;* WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2
+;* and WelsDecoderIChromaPredDcNA_mmx
;*
;*
;*************************************************************************/
@@ -65,7 +65,7 @@
sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4
align 16
-mmx_01bytes: times 16 db 1
+mmx_01bytes: times 16 db 1
align 16
mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
@@ -81,86 +81,86 @@
;xmm0, xmm1, xmm2, eax, ecx
;lower 64 bits of xmm0 save the result
%macro SSE2_PRED_H_4X4_TWO_LINE 5
- movd %1, [%4-1]
- movdqa %3, %1
- punpcklbw %1, %3
- movdqa %3, %1
- punpcklbw %1, %3
+ movd %1, [%4-1]
+ movdqa %3, %1
+ punpcklbw %1, %3
+ movdqa %3, %1
+ punpcklbw %1, %3
- ;add %4, %5
- movd %2, [%4+%5-1]
- movdqa %3, %2
- punpcklbw %2, %3
- movdqa %3, %2
- punpcklbw %2, %3
- punpckldq %1, %2
+ ;add %4, %5
+ movd %2, [%4+%5-1]
+ movdqa %3, %2
+ punpcklbw %2, %3
+ movdqa %3, %2
+ punpcklbw %2, %3
+ punpckldq %1, %2
%endmacro
%macro LOAD_COLUMN 6
- movd %1, [%5]
- movd %2, [%5+%6]
- punpcklbw %1, %2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- punpcklbw %3, %2
- punpcklwd %1, %3
- lea %5, [%5+2*%6]
- movd %4, [%5]
- movd %2, [%5+%6]
- punpcklbw %4, %2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- lea %5, [%5+2*%6]
- punpcklbw %3, %2
- punpcklwd %4, %3
- punpckhdq %1, %4
+ movd %1, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %1, %2
+ lea %5, [%5+2*%6]
+ movd %3, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %3, %2
+ punpcklwd %1, %3
+ lea %5, [%5+2*%6]
+ movd %4, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %4, %2
+ lea %5, [%5+2*%6]
+ movd %3, [%5]
+ movd %2, [%5+%6]
+ lea %5, [%5+2*%6]
+ punpcklbw %3, %2
+ punpcklwd %4, %3
+ punpckhdq %1, %4
%endmacro
%macro SUMW_HORIZON 3
- movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
- paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
- paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
- pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
- paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
+ movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
+ paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
+ pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
+ paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
%endmacro
%macro COPY_16_TIMES 2
- movdqa %2, [%1-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
+ movdqa %2, [%1-16]
+ psrldq %2, 15
+ pmuludq %2, [mmx_01bytes]
+ pshufd %2, %2, 0
%endmacro
%macro COPY_16_TIMESS 3
- movdqa %2, [%1+%3-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
+ movdqa %2, [%1+%3-16]
+ psrldq %2, 15
+ pmuludq %2, [mmx_01bytes]
+ pshufd %2, %2, 0
%endmacro
%macro LOAD_COLUMN_C 6
- movd %1, [%5]
- movd %2, [%5+%6]
- punpcklbw %1,%2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- punpcklbw %3, %2
- punpckhwd %1, %3
- lea %5, [%5+2*%6]
+ movd %1, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %1,%2
+ lea %5, [%5+2*%6]
+ movd %3, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %3, %2
+ punpckhwd %1, %3
+ lea %5, [%5+2*%6]
%endmacro
%macro LOAD_2_LEFT_AND_ADD 0
- lea r0, [r0+2*r1]
- movzx r3, byte [r0-0x01]
- add r2, r3
- movzx r3, byte [r0+r1-0x01]
- add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01]
+ add r2, r3
+ movzx r3, byte [r0+r1-0x01]
+ add r2, r3
%endmacro
;*******************************************************************************
@@ -173,131 +173,131 @@
;*******************************************************************************
; void WelsDecoderI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
;
-; pPred must align to 16
+; pPred must align to 16
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
- movzx r2, byte [r0-1]
- movd xmm0, r2d
- pmuludq xmm0, [mmx_01bytes]
+ movzx r2, byte [r0-1]
+ movd xmm0, r2d
+ pmuludq xmm0, [mmx_01bytes]
- movzx r2, byte [r0+r1-1]
- movd xmm1, r2d
- pmuludq xmm1, [mmx_01bytes]
+ movzx r2, byte [r0+r1-1]
+ movd xmm1, r2d
+ pmuludq xmm1, [mmx_01bytes]
- lea r0, [r0+r1]
- movzx r2, byte [r0+r1-1]
- movd xmm2, r2d
- pmuludq xmm2, [mmx_01bytes]
+ lea r0, [r0+r1]
+ movzx r2, byte [r0+r1-1]
+ movd xmm2, r2d
+ pmuludq xmm2, [mmx_01bytes]
- movzx r2, byte [r0+2*r1-1]
- movd xmm3, r2d
- pmuludq xmm3, [mmx_01bytes]
+ movzx r2, byte [r0+2*r1-1]
+ movd xmm3, r2d
+ pmuludq xmm3, [mmx_01bytes]
- sub r0, r1
- movd [r0], xmm0
- movd [r0+r1], xmm1
- lea r0, [r0+2*r1]
- movd [r0], xmm2
- movd [r0+r1], xmm3
+ sub r0, r1
+ movd [r0], xmm0
+ movd [r0+r1], xmm1
+ lea r0, [r0+2*r1]
+ movd [r0], xmm2
+ movd [r0+r1], xmm3
- ret
+ ret
;*******************************************************************************
; void WelsDecoderI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_2_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- mov r4, r0 ; save r0 in r4
- sub r0, 1
- sub r0, r1
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ mov r4, r0 ; save r0 in r4
+ sub r0, 1
+ sub r0, r1
- ;for H
- pxor xmm7, xmm7
- movq xmm0, [r0]
- movdqa xmm5, [sse2_plane_dec]
- punpcklbw xmm0, xmm7
- pmullw xmm0, xmm5
- movq xmm1, [r0 + 9]
- movdqa xmm6, [sse2_plane_inc]
- punpcklbw xmm1, xmm7
- pmullw xmm1, xmm6
- psubw xmm1, xmm0
+ ;for H
+ pxor xmm7, xmm7
+ movq xmm0, [r0]
+ movdqa xmm5, [sse2_plane_dec]
+ punpcklbw xmm0, xmm7
+ pmullw xmm0, xmm5
+ movq xmm1, [r0 + 9]
+ movdqa xmm6, [sse2_plane_inc]
+ punpcklbw xmm1, xmm7
+ pmullw xmm1, xmm6
+ psubw xmm1, xmm0
- SUMW_HORIZON xmm1,xmm0,xmm2
- movd r2d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
- movsx r2, r2w
- imul r2, 5
- add r2, 32
- sar r2, 6 ; b = (5 * H + 32) >> 6;
- SSE2_Copy8Times xmm1, r2d ; xmm1 = b,b,b,b,b,b,b,b
+ SUMW_HORIZON xmm1,xmm0,xmm2
+ movd r2d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
+ movsx r2, r2w
+ imul r2, 5
+ add r2, 32
+ sar r2, 6 ; b = (5 * H + 32) >> 6;
+ SSE2_Copy8Times xmm1, r2d ; xmm1 = b,b,b,b,b,b,b,b
- movzx r3, BYTE [r0+16]
- sub r0, 3
- LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r0, r1
+ movzx r3, BYTE [r0+16]
+ sub r0, 3
+ LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r0, r1
- add r0, 3
- movzx r2, BYTE [r0+8*r1]
- add r3, r2
- shl r3, 4 ; a = (left[15*kiStride] + top[15]) << 4;
+ add r0, 3
+ movzx r2, BYTE [r0+8*r1]
+ add r3, r2
+ shl r3, 4 ; a = (left[15*kiStride] + top[15]) << 4;
- sub r0, 3
- add r0, r1
- LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r0, r1
- pxor xmm4, xmm4
- punpckhbw xmm0, xmm4
- pmullw xmm0, xmm5
- punpckhbw xmm7, xmm4
- pmullw xmm7, xmm6
- psubw xmm7, xmm0
+ sub r0, 3
+ add r0, r1
+ LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r0, r1
+ pxor xmm4, xmm4
+ punpckhbw xmm0, xmm4
+ pmullw xmm0, xmm5
+ punpckhbw xmm7, xmm4
+ pmullw xmm7, xmm6
+ psubw xmm7, xmm0
- SUMW_HORIZON xmm7,xmm0,xmm2
- movd r2d, xmm7 ; V
- movsx r2, r2w
+ SUMW_HORIZON xmm7,xmm0,xmm2
+ movd r2d, xmm7 ; V
+ movsx r2, r2w
- imul r2, 5
- add r2, 32
- sar r2, 6 ; c = (5 * V + 32) >> 6;
- SSE2_Copy8Times xmm4, r2d ; xmm4 = c,c,c,c,c,c,c,c
+ imul r2, 5
+ add r2, 32
+ sar r2, 6 ; c = (5 * V + 32) >> 6;
+ SSE2_Copy8Times xmm4, r2d ; xmm4 = c,c,c,c,c,c,c,c
- mov r0, r4
- add r3, 16
- imul r2, -7
- add r3, r2 ; s = a + 16 + (-7)*c
- SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
+ mov r0, r4
+ add r3, 16
+ imul r2, -7
+ add r3, r2 ; s = a + 16 + (-7)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
- xor r2, r2
- movdqa xmm5, [sse2_plane_inc_minus]
+ xor r2, r2
+ movdqa xmm5, [sse2_plane_inc_minus]
get_i16x16_luma_pred_plane_sse2_1:
- movdqa xmm2, xmm1
- pmullw xmm2, xmm5
- paddw xmm2, xmm0
- psraw xmm2, 5
- movdqa xmm3, xmm1
- pmullw xmm3, xmm6
- paddw xmm3, xmm0
- psraw xmm3, 5
- packuswb xmm2, xmm3
- movdqa [r0], xmm2
- paddw xmm0, xmm4
- add r0, r1
- inc r2
- cmp r2, 16
- jnz get_i16x16_luma_pred_plane_sse2_1
+ movdqa xmm2, xmm1
+ pmullw xmm2, xmm5
+ paddw xmm2, xmm0
+ psraw xmm2, 5
+ movdqa xmm3, xmm1
+ pmullw xmm3, xmm6
+ paddw xmm3, xmm0
+ psraw xmm3, 5
+ packuswb xmm2, xmm3
+ movdqa [r0], xmm2
+ paddw xmm0, xmm4
+ add r0, r1
+ inc r2
+ cmp r2, 16
+ jnz get_i16x16_luma_pred_plane_sse2_1
- POP_XMM
- pop r4
- pop r3
- ret
+ POP_XMM
+ pop r4
+ pop r3
+ ret
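
Per its inline comments, the 16x16 plane predictor above computes b = (5 * H + 32) >> 6 from the top row, c = (5 * V + 32) >> 6 from the left column, a = (left[15*kiStride] + top[15]) << 4, and then evaluates the plane row by row with a shift by 5 and unsigned saturation (packuswb). A scalar equivalent, assuming the same parameter convention as the prototype above (helper name illustrative):

    #include <stdint.h>

    static void I16x16PredPlane_ref(uint8_t* pPred, int32_t kiStride) {
        const uint8_t* pTop = pPred - kiStride;
        int H = 0, V = 0;
        for (int i = 0; i < 8; i++) {
            H += (i + 1) * (pTop[8 + i] - pTop[6 - i]);
            V += (i + 1) * (pPred[(8 + i) * kiStride - 1] - pPred[(6 - i) * kiStride - 1]);
        }
        int a = (pPred[15 * kiStride - 1] + pTop[15]) << 4;   /* 16 * (L15 + T15) */
        int b = (5 * H + 32) >> 6;
        int c = (5 * V + 32) >> 6;
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++) {
                int v = (a + b * (x - 7) + c * (y - 7) + 16) >> 5;
                pPred[y * kiStride + x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }
    }
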
@@ -306,31 +306,31 @@
;*******************************************************************************
%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 2
- lea %1, [%1+%2*2]
+ lea %1, [%1+%2*2]
- COPY_16_TIMES %1, xmm0
- movdqa [%1], xmm0
- COPY_16_TIMESS %1, xmm0, %2
- movdqa [%1+%2], xmm0
+ COPY_16_TIMES %1, xmm0
+ movdqa [%1], xmm0
+ COPY_16_TIMESS %1, xmm0, %2
+ movdqa [%1+%2], xmm0
%endmacro
WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
- COPY_16_TIMES r0, xmm0
- movdqa [r0], xmm0
- COPY_16_TIMESS r0, xmm0, r1
- movdqa [r0+r1], xmm0
+ COPY_16_TIMES r0, xmm0
+ movdqa [r0], xmm0
+ COPY_16_TIMESS r0, xmm0, r1
+ movdqa [r0+r1], xmm0
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
ret
@@ -338,9 +338,9 @@
; void WelsDecoderI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredV_sse2
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
sub r0, r1
movdqa xmm0, [r0]
@@ -376,252 +376,252 @@
; void WelsDecoderIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredPlane_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_2_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- mov r4, r0
- sub r0, 1
- sub r0, r1
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ mov r4, r0
+ sub r0, 1
+ sub r0, r1
- pxor mm7, mm7
- movq mm0, [r0]
- movq mm5, [sse2_plane_dec_c]
- punpcklbw mm0, mm7
- pmullw mm0, mm5
- movq mm1, [r0 + 5]
- movq mm6, [sse2_plane_inc_c]
- punpcklbw mm1, mm7
- pmullw mm1, mm6
- psubw mm1, mm0
+ pxor mm7, mm7
+ movq mm0, [r0]
+ movq mm5, [sse2_plane_dec_c]
+ punpcklbw mm0, mm7
+ pmullw mm0, mm5
+ movq mm1, [r0 + 5]
+ movq mm6, [sse2_plane_inc_c]
+ punpcklbw mm1, mm7
+ pmullw mm1, mm6
+ psubw mm1, mm0
- movq2dq xmm1, mm1
- pxor xmm2, xmm2
- SUMW_HORIZON xmm1,xmm0,xmm2
- movd r2d, xmm1
- movsx r2, r2w
- imul r2, 17
- add r2, 16
- sar r2, 5 ; b = (17 * H + 16) >> 5;
- SSE2_Copy8Times xmm1, r2d ; mm1 = b,b,b,b,b,b,b,b
+ movq2dq xmm1, mm1
+ pxor xmm2, xmm2
+ SUMW_HORIZON xmm1,xmm0,xmm2
+ movd r2d, xmm1
+ movsx r2, r2w
+ imul r2, 17
+ add r2, 16
+ sar r2, 5 ; b = (17 * H + 16) >> 5;
+ SSE2_Copy8Times xmm1, r2d ; mm1 = b,b,b,b,b,b,b,b
- movzx r3, BYTE [r0+8]
- sub r0, 3
- LOAD_COLUMN_C mm0, mm2, mm3, mm4, r0, r1
+ movzx r3, BYTE [r0+8]
+ sub r0, 3
+ LOAD_COLUMN_C mm0, mm2, mm3, mm4, r0, r1
- add r0, 3
- movzx r2, BYTE [r0+4*r1]
- add r3, r2
- shl r3, 4 ; a = (left[7*kiStride] + top[7]) << 4;
+ add r0, 3
+ movzx r2, BYTE [r0+4*r1]
+ add r3, r2
+ shl r3, 4 ; a = (left[7*kiStride] + top[7]) << 4;
- sub r0, 3
- add r0, r1
- LOAD_COLUMN_C mm7, mm2, mm3, mm4, r0, r1
- pxor mm4, mm4
- punpckhbw mm0, mm4
- pmullw mm0, mm5
- punpckhbw mm7, mm4
- pmullw mm7, mm6
- psubw mm7, mm0
+ sub r0, 3
+ add r0, r1
+ LOAD_COLUMN_C mm7, mm2, mm3, mm4, r0, r1
+ pxor mm4, mm4
+ punpckhbw mm0, mm4
+ pmullw mm0, mm5
+ punpckhbw mm7, mm4
+ pmullw mm7, mm6
+ psubw mm7, mm0
- movq2dq xmm7, mm7
- pxor xmm2, xmm2
- SUMW_HORIZON xmm7,xmm0,xmm2
- movd r2d, xmm7 ; V
- movsx r2, r2w
+ movq2dq xmm7, mm7
+ pxor xmm2, xmm2
+ SUMW_HORIZON xmm7,xmm0,xmm2
+ movd r2d, xmm7 ; V
+ movsx r2, r2w
- imul r2, 17
- add r2, 16
- sar r2, 5 ; c = (17 * V + 16) >> 5;
- SSE2_Copy8Times xmm4, r2d ; mm4 = c,c,c,c,c,c,c,c
+ imul r2, 17
+ add r2, 16
+ sar r2, 5 ; c = (17 * V + 16) >> 5;
+ SSE2_Copy8Times xmm4, r2d ; mm4 = c,c,c,c,c,c,c,c
- mov r0, r4
- add r3, 16
- imul r2, -3
- add r3, r2 ; s = a + 16 + (-3)*c
- SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
+ mov r0, r4
+ add r3, 16
+ imul r2, -3
+ add r3, r2 ; s = a + 16 + (-3)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
- xor r2, r2
- movdqa xmm5, [sse2_plane_mul_b_c]
+ xor r2, r2
+ movdqa xmm5, [sse2_plane_mul_b_c]
get_i_chroma_pred_plane_sse2_1:
- movdqa xmm2, xmm1
- pmullw xmm2, xmm5
- paddw xmm2, xmm0
- psraw xmm2, 5
- packuswb xmm2, xmm2
- movq [r0], xmm2
- paddw xmm0, xmm4
- add r0, r1
- inc r2
- cmp r2, 8
- jnz get_i_chroma_pred_plane_sse2_1
+ movdqa xmm2, xmm1
+ pmullw xmm2, xmm5
+ paddw xmm2, xmm0
+ psraw xmm2, 5
+ packuswb xmm2, xmm2
+ movq [r0], xmm2
+ paddw xmm0, xmm4
+ add r0, r1
+ inc r2
+ cmp r2, 8
+ jnz get_i_chroma_pred_plane_sse2_1
- POP_XMM
- pop r4
- pop r3
- WELSEMMS
- ret
+ POP_XMM
+ pop r4
+ pop r3
+ WELSEMMS
+ ret
;*******************************************************************************
-; 0 |1 |2 |3 |4 |
-; 6 |7 |8 |9 |10|
-; 11|12|13|14|15|
-; 16|17|18|19|20|
-; 21|22|23|24|25|
-; 7 is the start pixel of current 4x4 block
-; pPred[7] = ([6]+[0]*2+[1]+2)/4
+; 0 |1 |2 |3 |4 |
+; 6 |7 |8 |9 |10|
+; 11|12|13|14|15|
+; 16|17|18|19|20|
+; 21|22|23|24|25|
+; 7 is the start pixel of current 4x4 block
+; pPred[7] = ([6]+[0]*2+[1]+2)/4
;
; void WelsDecoderI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
;
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
- movq mm1,[r2+r1-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
- movq mm2,[r2-8] ;get value of 6 mm2[8] = 6
- sub r2, r1 ;mov eax to above line of current block(postion of 1)
- punpckhbw mm2,[r2-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
- movd mm3,[r2] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
- punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
- psllq mm3,18h ;mm3[5]=[1]
- psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
- movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- lea r2,[r2+r1*2-8h] ;set eax point to 12
- movq mm4,[r2+r1] ;get value of 16, mm4[8]=[16]
- psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
- psrlq mm4,38h ;mm4[1]=[16]
- por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
- movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
- movq mm4,[r2+r1*2] ;mm4[8]=[21]
- psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
- psrlq mm4,38h ;mm4[1]=[21]
- por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
- movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
- pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
- pxor mm1,mm4 ;find odd value in the lowest bit of each byte
- pand mm1,[mmx_01bytes] ;set the odd bit
- psubusb mm3,mm1 ;decrease 1 from odd bytes
- pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
+ movq mm1,[r2+r1-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
+ movq mm2,[r2-8] ;get value of 6 mm2[8] = 6
+ sub r2, r1 ;mov eax to above line of current block(postion of 1)
+ punpckhbw mm2,[r2-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+ movd mm3,[r2] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+ punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
+ psllq mm3,18h ;mm3[5]=[1]
+ psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+ por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
+ movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+ lea r2,[r2+r1*2-8h] ;set eax point to 12
+ movq mm4,[r2+r1] ;get value of 16, mm4[8]=[16]
+ psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
+ psrlq mm4,38h ;mm4[1]=[16]
+ por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
+ movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
+ movq mm4,[r2+r1*2] ;mm4[8]=[21]
+ psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
+ psrlq mm4,38h ;mm4[1]=[21]
+ por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
+ movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
+ pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
+ pxor mm1,mm4 ;find odd value in the lowest bit of each byte
+ pand mm1,[mmx_01bytes] ;set the odd bit
+ psubusb mm3,mm1 ;decrease 1 from odd bytes
+ pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
- lea r0,[r0+r1]
- movd [r0+2*r1],mm2
- sub r0,r1
- psrlq mm2,8
- movd [r0+2*r1],mm2
- psrlq mm2,8
- movd [r0+r1],mm2
- psrlq mm2,8
- movd [r0],mm2
- WELSEMMS
- ret
+ lea r0,[r0+r1]
+ movd [r0+2*r1],mm2
+ sub r0,r1
+ psrlq mm2,8
+ movd [r0+2*r1],mm2
+ psrlq mm2,8
+ movd [r0+r1],mm2
+ psrlq mm2,8
+ movd [r0],mm2
+ WELSEMMS
+ ret
;*******************************************************************************
-; void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
+; void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
; copy 8 pixel of 8 line from left
;*******************************************************************************
%macro MMX_PRED_H_8X8_ONE_LINE 4
- movq %1, [%3-8]
- psrlq %1, 38h
+ movq %1, [%3-8]
+ psrlq %1, 38h
- pmullw %1, [mmx_01bytes]
- pshufw %1, %1, 0
- movq [%4], %1
+ pmullw %1, [mmx_01bytes]
+ pshufw %1, %1, 0
+ movq [%4], %1
%endmacro
%macro MMX_PRED_H_8X8_ONE_LINEE 4
- movq %1, [%3+r1-8]
- psrlq %1, 38h
+ movq %1, [%3+r1-8]
+ psrlq %1, 38h
- pmullw %1, [mmx_01bytes]
- pshufw %1, %1, 0
- movq [%4], %1
+ pmullw %1, [mmx_01bytes]
+ pshufw %1, %1, 0
+ movq [%4], %1
%endmacro
WELS_EXTERN WelsDecoderIChromaPredH_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
- movq mm0, [r2-8]
- psrlq mm0, 38h
+ movq mm0, [r2-8]
+ psrlq mm0, 38h
- pmullw mm0, [mmx_01bytes]
- pshufw mm0, mm0, 0
- movq [r0], mm0
+ pmullw mm0, [mmx_01bytes]
+ pshufw mm0, mm0, 0
+ movq [r0], mm0
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
- lea r2, [r2+r1*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
+ lea r2, [r2+r1*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
- lea r0, [r0+2*r1]
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+ lea r0, [r0+2*r1]
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
- lea r2, [r2+r1*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
+ lea r2, [r2+r1*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
- lea r0, [r0+2*r1]
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+ lea r0, [r0+2*r1]
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
- lea r2, [r2+r1*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
+ lea r2, [r2+r1*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
- lea r0, [r0+2*r1]
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+ lea r0, [r0+2*r1]
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
- WELSEMMS
- ret
+ WELSEMMS
+ ret
;*******************************************************************************
-; void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
+; void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
; copy 8 pixels from top 8 pixels
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredV_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
- sub r0, r1
- movq mm0, [r0]
+ sub r0, r1
+ movq mm0, [r0]
- movq [r0+r1], mm0
- movq [r0+2*r1], mm0
- lea r0, [r0+2*r1]
- movq [r0+r1], mm0
- movq [r0+2*r1], mm0
- lea r0, [r0+2*r1]
- movq [r0+r1], mm0
- movq [r0+2*r1], mm0
- lea r0, [r0+2*r1]
- movq [r0+r1], mm0
- movq [r0+2*r1], mm0
+ movq [r0+r1], mm0
+ movq [r0+2*r1], mm0
+ lea r0, [r0+2*r1]
+ movq [r0+r1], mm0
+ movq [r0+2*r1], mm0
+ lea r0, [r0+2*r1]
+ movq [r0+r1], mm0
+ movq [r0+2*r1], mm0
+ lea r0, [r0+2*r1]
+ movq [r0+r1], mm0
+ movq [r0+2*r1], mm0
- WELSEMMS
- ret
+ WELSEMMS
+ ret
;*******************************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; t3 will never be used
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; t3 will never be used
; destination:
-; |a |b |c |d |
-; |e |f |a |b |
-; |g |h |e |f |
-; |i |j |g |h |
+; |a |b |c |d |
+; |e |f |a |b |
+; |g |h |e |f |
+; |i |j |g |h |
; a = (1 + lt + l0)>>1
; e = (1 + l0 + l1)>>1
@@ -640,73 +640,73 @@
; void WelsDecoderI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
- sub r2, r1
- movd mm0, [r2-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
- psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
+ sub r2, r1
+ movd mm0, [r2-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+ psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
- movd mm1, [r2+2*r1-4]
- punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1
- lea r2, [r2+2*r1]
- movd mm2, [r2+2*r1-4]
- punpcklbw mm2, [r2+r1-4] ; mm2[7] = l2, mm2[6] = l3
- punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
- psrlq mm2, 20h
- pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
+ movd mm1, [r2+2*r1-4]
+ punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r2, [r2+2*r1]
+ movd mm2, [r2+2*r1-4]
+ punpcklbw mm2, [r2+r1-4] ; mm2[7] = l2, mm2[6] = l3
+ punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
+ psrlq mm2, 20h
+ pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
- movq mm1, mm0
- psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
- movq mm2, mm0
- psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
- movq mm3, mm2
- movq mm4, mm1
- pavgb mm1, mm0
+ movq mm1, mm0
+ psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
+ movq mm2, mm0
+ psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
+ movq mm3, mm2
+ movq mm4, mm1
+ pavgb mm1, mm0
- pxor mm4, mm0 ; find odd value in the lowest bit of each byte
- pand mm4, [mmx_01bytes] ; set the odd bit
- psubusb mm1, mm4 ; decrease 1 from odd bytes
+ pxor mm4, mm0 ; find odd value in the lowest bit of each byte
+ pand mm4, [mmx_01bytes] ; set the odd bit
+ psubusb mm1, mm4 ; decrease 1 from odd bytes
- pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
+ pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
- movq mm4, mm0
- pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
- punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
+ movq mm4, mm0
+ pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
+ punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
- psrlq mm2, 20h
- psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
- movq mm4, mm3
- psrlq mm4, 10h ; mm4 = [0 0 b a f e h j]
- pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
- psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
+ psrlq mm2, 20h
+ psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
+ movq mm4, mm3
+ psrlq mm4, 10h ; mm4 = [0 0 b a f e h j]
+ pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
+ psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
- movd [r0], mm2
- lea r0, [r0+r1]
- movd [r0+2*r1], mm3
- sub r0, r1
- psrlq mm3, 10h
- movd [r0+2*r1], mm3
- psrlq mm3, 10h
- movd [r0+r1], mm3
- WELSEMMS
- ret
+ movd [r0], mm2
+ lea r0, [r0+r1]
+ movd [r0+2*r1], mm3
+ sub r0, r1
+ psrlq mm3, 10h
+ movd [r0+2*r1], mm3
+ psrlq mm3, 10h
+ movd [r0+r1], mm3
+ WELSEMMS
+ ret
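
The horizontal-down routine above produces the ten values a..j laid out in its header comment. The header only spells out a = (1 + lt + l0) >> 1 and e = (1 + l0 + l1) >> 1; the remaining taps below follow the standard H.264 definition of this mode, so treat this as a reference sketch rather than a transcript of the assembly (helper name illustrative, same pPred/kiStride convention):

    #include <stdint.h>

    static void I4x4PredHD_ref(uint8_t* pPred, int32_t kiStride) {
        const uint8_t* pTop = pPred - kiStride;
        uint8_t lt = pTop[-1], t0 = pTop[0], t1 = pTop[1], t2 = pTop[2];
        uint8_t l0 = pPred[-1];
        uint8_t l1 = pPred[kiStride - 1];
        uint8_t l2 = pPred[2 * kiStride - 1];
        uint8_t l3 = pPred[3 * kiStride - 1];
        uint8_t a = (uint8_t)((lt + l0 + 1) >> 1);
        uint8_t b = (uint8_t)((l0 + 2 * lt + t0 + 2) >> 2);
        uint8_t c = (uint8_t)((lt + 2 * t0 + t1 + 2) >> 2);
        uint8_t d = (uint8_t)((t0 + 2 * t1 + t2 + 2) >> 2);
        uint8_t e = (uint8_t)((l0 + l1 + 1) >> 1);
        uint8_t f = (uint8_t)((lt + 2 * l0 + l1 + 2) >> 2);
        uint8_t g = (uint8_t)((l1 + l2 + 1) >> 1);
        uint8_t h = (uint8_t)((l0 + 2 * l1 + l2 + 2) >> 2);
        uint8_t i = (uint8_t)((l2 + l3 + 1) >> 1);
        uint8_t j = (uint8_t)((l1 + 2 * l2 + l3 + 2) >> 2);
        const uint8_t rows[4][4] = {
            { a, b, c, d }, { e, f, a, b }, { g, h, e, f }, { i, j, g, h }
        };
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pPred[y * kiStride + x] = rows[y][x];
    }
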
;*******************************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; t3 will never be used
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; t3 will never be used
; destination:
-; |a |b |c |d |
-; |c |d |e |f |
-; |e |f |g |g |
-; |g |g |g |g |
+; |a |b |c |d |
+; |c |d |e |f |
+; |e |f |g |g |
+; |g |g |g |g |
; a = (1 + l0 + l1)>>1
; c = (1 + l1 + l2)>>1
@@ -722,74 +722,74 @@
; void WelsDecoderI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
- movd mm0, [r2-4] ; mm0[3] = l0
- punpcklbw mm0, [r2+r1-4] ; mm0[7] = l1, mm0[6] = l0
- lea r2, [r2+2*r1]
- movd mm2, [r2-4] ; mm2[3] = l2
- movd mm4, [r2+r1-4] ; mm4[3] = l3
- punpcklbw mm2, mm4
- punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
+ movd mm0, [r2-4] ; mm0[3] = l0
+ punpcklbw mm0, [r2+r1-4] ; mm0[7] = l1, mm0[6] = l0
+ lea r2, [r2+2*r1]
+ movd mm2, [r2-4] ; mm2[3] = l2
+ movd mm4, [r2+r1-4] ; mm4[3] = l3
+ punpcklbw mm2, mm4
+ punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
- psrlq mm4, 18h
- psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
- psrlq mm0, 8h
- pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
+ psrlq mm4, 18h
+ psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
+ psrlq mm0, 8h
+ pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
- movq mm1, mm0
- psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
- movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
- pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
+ movq mm1, mm0
+ psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
+ movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
+ pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
- movq mm2, mm0
- psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
- movq mm5, mm2
- pavgb mm2, mm0
+ movq mm2, mm0
+ psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
+ movq mm5, mm2
+ pavgb mm2, mm0
- pxor mm5, mm0 ; find odd value in the lowest bit of each byte
- pand mm5, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm5 ; decrease 1 from odd bytes
+ pxor mm5, mm0 ; find odd value in the lowest bit of each byte
+ pand mm5, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm5 ; decrease 1 from odd bytes
- pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
+ pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
- psrlq mm2, 8h
- pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
+ psrlq mm2, 8h
+ pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
- punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
- punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
- punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
+ punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
+ punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
+ punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
- psrlq mm4, 20h
- lea r0, [r0+r1]
- movd [r0+2*r1], mm4
+ psrlq mm4, 20h
+ lea r0, [r0+r1]
+ movd [r0+2*r1], mm4
- sub r0, r1
- movd [r0], mm1
- psrlq mm1, 10h
- movd [r0+r1], mm1
- psrlq mm1, 10h
- movd [r0+2*r1], mm1
- WELSEMMS
- ret
+ sub r0, r1
+ movd [r0], mm1
+ psrlq mm1, 10h
+ movd [r0+r1], mm1
+ psrlq mm1, 10h
+ movd [r0+2*r1], mm1
+ WELSEMMS
+ ret
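
Horizontal-up works from the left column only and repeats l3 once the samples run out, as the destination table in the header shows. Only a = (1 + l0 + l1) >> 1 and c = (1 + l1 + l2) >> 1 are stated there; the rest of the taps in this sketch follow the standard H.264 definition of the mode (helper name illustrative, same pPred/kiStride convention):

    #include <stdint.h>

    static void I4x4PredHU_ref(uint8_t* pPred, int32_t kiStride) {
        uint8_t l0 = pPred[-1];
        uint8_t l1 = pPred[kiStride - 1];
        uint8_t l2 = pPred[2 * kiStride - 1];
        uint8_t l3 = pPred[3 * kiStride - 1];
        uint8_t a = (uint8_t)((l0 + l1 + 1) >> 1);
        uint8_t b = (uint8_t)((l0 + 2 * l1 + l2 + 2) >> 2);
        uint8_t c = (uint8_t)((l1 + l2 + 1) >> 1);
        uint8_t d = (uint8_t)((l1 + 2 * l2 + l3 + 2) >> 2);
        uint8_t e = (uint8_t)((l2 + l3 + 1) >> 1);
        uint8_t f = (uint8_t)((l2 + 3 * l3 + 2) >> 2);
        uint8_t g = l3;                 /* everything past the last left sample */
        const uint8_t rows[4][4] = {
            { a, b, c, d }, { c, d, e, f }, { e, f, g, g }, { g, g, g, g }
        };
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pPred[y * kiStride + x] = rows[y][x];
    }
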
;*******************************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; l3 will never be used
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; l3 will never be used
; destination:
-; |a |b |c |d |
-; |e |f |g |h |
-; |i |a |b |c |
-; |j |e |f |g |
+; |a |b |c |d |
+; |e |f |g |h |
+; |i |a |b |c |
+; |j |e |f |g |
; a = (1 + lt + t0)>>1
; b = (1 + t0 + t1)>>1
@@ -807,77 +807,77 @@
; void WelsDecoderI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
- sub r2, r1
- movq mm0, [r2-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
- psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
+ sub r2, r1
+ movq mm0, [r2-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+ psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
- movd mm1, [r2+2*r1-4]
- punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1
- lea r2, [r2+2*r1]
- movq mm2, [r2+r1-8] ; mm2[7] = l2
- punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
- psrlq mm2, 28h
- pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
+ movd mm1, [r2+2*r1-4]
+ punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r2, [r2+2*r1]
+ movq mm2, [r2+r1-8] ; mm2[7] = l2
+ punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
+ psrlq mm2, 28h
+ pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
- movq mm1, mm0
- psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
- pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
+ movq mm1, mm0
+ psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
+ pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
- movq mm2, mm0
- psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
- movq mm3, mm2
- pavgb mm2, mm0
+ movq mm2, mm0
+ psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
+ movq mm3, mm2
+ pavgb mm2, mm0
- pxor mm3, mm0 ; find odd value in the lowest bit of each byte
- pand mm3, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm3 ; decrease 1 from odd bytes
+ pxor mm3, mm0 ; find odd value in the lowest bit of each byte
+ pand mm3, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm3 ; decrease 1 from odd bytes
- movq mm3, mm0
- psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
- pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
- movq mm2, mm3
+ movq mm3, mm0
+ psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
+ pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
+ movq mm2, mm3
- psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
- movd [r0], mm1
+ psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
+ movd [r0], mm1
- psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
- movd [r0+r1], mm2
+ psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
+ movd [r0+r1], mm2
- movq mm4, mm3
- psllq mm4, 20h
- psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
+ movq mm4, mm3
+ psllq mm4, 20h
+ psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
- movq mm5, mm3
- psllq mm5, 28h
- psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
+ movq mm5, mm3
+ psllq mm5, 28h
+ psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
- psllq mm1, 8h
- pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
- movd [r0+2*r1], mm4
+ psllq mm1, 8h
+ pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
+ movd [r0+2*r1], mm4
- psllq mm2, 8h
- pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
- lea r0, [r0+2*r1]
- movd [r0+r1], mm5
- WELSEMMS
- ret
+ psllq mm2, 8h
+ pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
+ lea r0, [r0+2*r1]
+ movd [r0+r1], mm5
+ WELSEMMS
+ ret
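
Vertical-right fills the a..j pattern shown in the header: the top row holds the two-tap averages of neighbouring top samples, the second row the three-tap filtered values, and the lower rows reuse them shifted right with i and j taken from the left column. The header only gives a = (1 + lt + t0) >> 1 and b = (1 + t0 + t1) >> 1; the remaining taps in this sketch follow the standard H.264 definition of the mode, so read it as a reference, not a restatement of the assembly (helper name illustrative):

    #include <stdint.h>

    static void I4x4PredVR_ref(uint8_t* pPred, int32_t kiStride) {
        const uint8_t* pTop = pPred - kiStride;
        uint8_t lt = pTop[-1], t0 = pTop[0], t1 = pTop[1], t2 = pTop[2], t3 = pTop[3];
        uint8_t l0 = pPred[-1];
        uint8_t l1 = pPred[kiStride - 1];
        uint8_t l2 = pPred[2 * kiStride - 1];
        uint8_t a = (uint8_t)((lt + t0 + 1) >> 1);
        uint8_t b = (uint8_t)((t0 + t1 + 1) >> 1);
        uint8_t c = (uint8_t)((t1 + t2 + 1) >> 1);
        uint8_t d = (uint8_t)((t2 + t3 + 1) >> 1);
        uint8_t e = (uint8_t)((l0 + 2 * lt + t0 + 2) >> 2);
        uint8_t f = (uint8_t)((lt + 2 * t0 + t1 + 2) >> 2);
        uint8_t g = (uint8_t)((t0 + 2 * t1 + t2 + 2) >> 2);
        uint8_t h = (uint8_t)((t1 + 2 * t2 + t3 + 2) >> 2);
        uint8_t i = (uint8_t)((lt + 2 * l0 + l1 + 2) >> 2);
        uint8_t j = (uint8_t)((l0 + 2 * l1 + l2 + 2) >> 2);
        const uint8_t rows[4][4] = {
            { a, b, c, d }, { e, f, g, h }, { i, a, b, c }, { j, e, f, g }
        };
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pPred[y * kiStride + x] = rows[y][x];
    }
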
;*******************************************************************************
-; lt|t0|t1|t2|t3|t4|t5|t6|t7
-; l0|
-; l1|
-; l2|
-; l3|
-; lt,t0,t1,t2,t3 will never be used
+; lt|t0|t1|t2|t3|t4|t5|t6|t7
+; l0|
+; l1|
+; l2|
+; l3|
+; lt,t0,t1,t2,t3 will never be used
; destination:
-; |a |b |c |d |
-; |b |c |d |e |
-; |c |d |e |f |
-; |d |e |f |g |
+; |a |b |c |d |
+; |b |c |d |e |
+; |c |d |e |f |
+; |d |e |f |g |
; a = (2 + t0 + t2 + (t1<<1))>>2
; b = (2 + t1 + t3 + (t2<<1))>>2
@@ -893,56 +893,56 @@
; void WelsDecoderI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
- sub r2, r1
- movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
- movq mm1, mm0
- movq mm2, mm0
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
+ sub r2, r1
+ movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+ movq mm1, mm0
+ movq mm2, mm0
- movq mm3, mm0
- psrlq mm3, 38h
- psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
+ movq mm3, mm0
+ psrlq mm3, 38h
+ psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
- psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
- psrlq mm2, 8h
- pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
+ psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
+ psrlq mm2, 8h
+ pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
- movq mm3, mm1
- pavgb mm1, mm2
- pxor mm3, mm2 ; find odd value in the lowest bit of each byte
- pand mm3, [mmx_01bytes] ; set the odd bit
- psubusb mm1, mm3 ; decrease 1 from odd bytes
+ movq mm3, mm1
+ pavgb mm1, mm2
+ pxor mm3, mm2 ; find odd value in the lowest bit of each byte
+ pand mm3, [mmx_01bytes] ; set the odd bit
+ psubusb mm1, mm3 ; decrease 1 from odd bytes
- pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
+ pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
- psrlq mm0, 8h
- movd [r0], mm0
- psrlq mm0, 8h
- movd [r0+r1], mm0
- psrlq mm0, 8h
- movd [r0+2*r1], mm0
- psrlq mm0, 8h
- lea r0, [r0+2*r1]
- movd [r0+r1], mm0
- WELSEMMS
- ret
+ psrlq mm0, 8h
+ movd [r0], mm0
+ psrlq mm0, 8h
+ movd [r0+r1], mm0
+ psrlq mm0, 8h
+ movd [r0+2*r1], mm0
+ psrlq mm0, 8h
+ lea r0, [r0+2*r1]
+ movd [r0+r1], mm0
+ WELSEMMS
+ ret
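
The down-left routine works entirely from the eight samples above the block. Per the header, each output is (2 + t[k] + t[k+2] + (t[k+1] << 1)) >> 2, t7 is reused past the end of the row, and output row y is the window of filtered values starting at offset y. A scalar sketch of that (helper name illustrative, same pPred/kiStride convention):

    #include <stdint.h>

    static void I4x4PredDDL_ref(uint8_t* pPred, int32_t kiStride) {
        const uint8_t* t = pPred - kiStride;   /* t[0..7] above the block */
        uint8_t f[7];
        for (int i = 0; i < 7; i++) {
            uint8_t last = (i + 2 < 8) ? t[i + 2] : t[7];   /* repeat t7 past the end */
            f[i] = (uint8_t)((t[i] + 2 * t[i + 1] + last + 2) >> 2);
        }
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pPred[y * kiStride + x] = f[x + y];
    }
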
;*******************************************************************************
-; lt|t0|t1|t2|t3|t4|t5|t6|t7
-; l0|
-; l1|
-; l2|
-; l3|
-; lt,t0,t1,t2,t3 will never be used
+; lt|t0|t1|t2|t3|t4|t5|t6|t7
+; l0|
+; l1|
+; l2|
+; l3|
+; lt,t0,t1,t2,t3 will never be used
; destination:
-; |a |b |c |d |
-; |e |f |g |h |
-; |b |c |d |i |
-; |f |g |h |j |
+; |a |b |c |d |
+; |e |f |g |h |
+; |b |c |d |i |
+; |f |g |h |j |
; a = (1 + t0 + t1)>>1
; b = (1 + t1 + t2)>>1
@@ -961,40 +961,40 @@
; void WelsDecoderI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
- sub r2, r1
- movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
- movq mm1, mm0
- movq mm2, mm0
+ sub r2, r1
+ movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+ movq mm1, mm0
+ movq mm2, mm0
- psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
- psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
+ psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
+ psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
- movq mm3, mm1
- pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
+ movq mm3, mm1
+ pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
- movq mm4, mm2
- pavgb mm2, mm0
- pxor mm4, mm0 ; find odd value in the lowest bit of each byte
- pand mm4, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm4 ; decrease 1 from odd bytes
+ movq mm4, mm2
+ pavgb mm2, mm0
+ pxor mm4, mm0 ; find odd value in the lowest bit of each byte
+ pand mm4, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm4 ; decrease 1 from odd bytes
- pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
+ pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
- movd [r0], mm3
- psrlq mm3, 8h
- movd [r0+2*r1], mm3
+ movd [r0], mm3
+ psrlq mm3, 8h
+ movd [r0+2*r1], mm3
- movd [r0+r1], mm2
- psrlq mm2, 8h
- lea r0, [r0+2*r1]
- movd [r0+r1], mm2
- WELSEMMS
- ret
+ movd [r0+r1], mm2
+ psrlq mm2, 8h
+ lea r0, [r0+2*r1]
+ movd [r0+r1], mm2
+ WELSEMMS
+ ret
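
Vertical-left interleaves two filtered copies of the top row: even rows take the two-tap averages (1 + t[k] + t[k+1]) >> 1 and odd rows the three-tap (2 + t[k] + 2*t[k+1] + t[k+2]) >> 2 values, each advanced by one sample for the lower pair of rows, matching the a..j layout in the header. Only a and b are spelled out there, so this is a reference sketch of the mode (helper name illustrative):

    #include <stdint.h>

    static void I4x4PredVL_ref(uint8_t* pPred, int32_t kiStride) {
        const uint8_t* t = pPred - kiStride;   /* t[0..6] are used */
        uint8_t avg[5], flt[5];
        for (int i = 0; i < 5; i++) {
            avg[i] = (uint8_t)((t[i] + t[i + 1] + 1) >> 1);
            flt[i] = (uint8_t)((t[i] + 2 * t[i + 1] + t[i + 2] + 2) >> 2);
        }
        for (int x = 0; x < 4; x++) {
            pPred[0 * kiStride + x] = avg[x];       /* a b c d */
            pPred[1 * kiStride + x] = flt[x];       /* e f g h */
            pPred[2 * kiStride + x] = avg[x + 1];   /* b c d i */
            pPred[3 * kiStride + x] = flt[x + 1];   /* f g h j */
        }
    }
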
;*******************************************************************************
;
@@ -1001,93 +1001,93 @@
; void WelsDecoderIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDc_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r4, r0
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r4, r0
- sub r0, r1
- movq mm0, [r0]
+ sub r0, r1
+ movq mm0, [r0]
- movzx r2, byte [r0+r1-0x01] ; l1
- lea r0, [r0+2*r1]
- movzx r3, byte [r0-0x01] ; l2
- add r2, r3
- movzx r3, byte [r0+r1-0x01] ; l3
- add r2, r3
- lea r0, [r0+2*r1]
- movzx r3, byte [r0-0x01] ; l4
- add r2, r3
- movd mm1, r2d ; mm1 = l1+l2+l3+l4
+ movzx r2, byte [r0+r1-0x01] ; l1
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01] ; l2
+ add r2, r3
+ movzx r3, byte [r0+r1-0x01] ; l3
+ add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01] ; l4
+ add r2, r3
+ movd mm1, r2d ; mm1 = l1+l2+l3+l4
- movzx r2, byte [r0+r1-0x01] ; l5
- lea r0, [r0+2*r1]
- movzx r3, byte [r0-0x01] ; l6
- add r2, r3
- movzx r3, byte [r0+r1-0x01] ; l7
- add r2, r3
- lea r0, [r0+2*r1]
- movzx r3, byte [r0-0x01] ; l8
- add r2, r3
- movd mm2, r2d ; mm2 = l5+l6+l7+l8
+ movzx r2, byte [r0+r1-0x01] ; l5
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01] ; l6
+ add r2, r3
+ movzx r3, byte [r0+r1-0x01] ; l7
+ add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01] ; l8
+ add r2, r3
+ movd mm2, r2d ; mm2 = l5+l6+l7+l8
- movq mm3, mm0
- psrlq mm0, 0x20
- psllq mm3, 0x20
- psrlq mm3, 0x20
- pxor mm4, mm4
- psadbw mm0, mm4
- psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+ movq mm3, mm0
+ psrlq mm0, 0x20
+ psllq mm3, 0x20
+ psrlq mm3, 0x20
+ pxor mm4, mm4
+ psadbw mm0, mm4
+ psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
- paddq mm3, mm1
- movq mm1, mm2
- paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+ paddq mm3, mm1
+ movq mm1, mm2
+ paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
- movq mm4, [mmx_0x02]
+ movq mm4, [mmx_0x02]
- paddq mm0, mm4
- psrlq mm0, 0x02
+ paddq mm0, mm4
+ psrlq mm0, 0x02
- paddq mm2, mm4
- psrlq mm2, 0x02
+ paddq mm2, mm4
+ psrlq mm2, 0x02
- paddq mm3, mm4
- paddq mm3, mm4
- psrlq mm3, 0x03
+ paddq mm3, mm4
+ paddq mm3, mm4
+ psrlq mm3, 0x03
- paddq mm1, mm4
- paddq mm1, mm4
- psrlq mm1, 0x03
+ paddq mm1, mm4
+ paddq mm1, mm4
+ psrlq mm1, 0x03
- pmuludq mm0, [mmx_01bytes]
- pmuludq mm3, [mmx_01bytes]
- psllq mm0, 0x20
- pxor mm0, mm3 ; mm0 = m_up
+ pmuludq mm0, [mmx_01bytes]
+ pmuludq mm3, [mmx_01bytes]
+ psllq mm0, 0x20
+ pxor mm0, mm3 ; mm0 = m_up
- pmuludq mm2, [mmx_01bytes]
- pmuludq mm1, [mmx_01bytes]
- psllq mm1, 0x20
- pxor mm1, mm2 ; mm2 = m_down
+ pmuludq mm2, [mmx_01bytes]
+ pmuludq mm1, [mmx_01bytes]
+ psllq mm1, 0x20
+ pxor mm1, mm2 ; mm2 = m_down
- movq [r4], mm0
- movq [r4+r1], mm0
- movq [r4+2*r1], mm0
- lea r4, [r4+2*r1]
- movq [r4+r1], mm0
+ movq [r4], mm0
+ movq [r4+r1], mm0
+ movq [r4+2*r1], mm0
+ lea r4, [r4+2*r1]
+ movq [r4+r1], mm0
- movq [r4+2*r1], mm1
- lea r4, [r4+2*r1]
- movq [r4+r1], mm1
- movq [r4+2*r1], mm1
- lea r4, [r4+2*r1]
- movq [r4+r1], mm1
+ movq [r4+2*r1], mm1
+ lea r4, [r4+2*r1]
+ movq [r4+r1], mm1
+ movq [r4+2*r1], mm1
+ lea r4, [r4+2*r1]
+ movq [r4+r1], mm1
- pop r4
- pop r3
- WELSEMMS
- ret
+ pop r4
+ pop r3
+ WELSEMMS
+ ret
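
The chroma DC routine above gathers four sums, as its comments note: the left and right halves of the row above, and the upper and lower halves of the left column. The two 4x4 sub-blocks on the main diagonal average eight neighbours each ((sum + 4) >> 3, the double paddq of mmx_0x02 followed by psrlq 3), while the off-diagonal sub-blocks average only the four neighbours directly above or to their left ((sum + 2) >> 2). A scalar sketch of the resulting 2x2 arrangement of DC values (helper name illustrative):

    #include <stdint.h>

    static void IChromaPredDc_ref(uint8_t* pPred, int32_t kiStride) {
        const uint8_t* pTop = pPred - kiStride;
        int topL = 0, topR = 0, leftT = 0, leftB = 0;
        for (int i = 0; i < 4; i++) {
            topL  += pTop[i];
            topR  += pTop[4 + i];
            leftT += pPred[i * kiStride - 1];
            leftB += pPred[(4 + i) * kiStride - 1];
        }
        const uint8_t dc[2][2] = {
            { (uint8_t)((topL + leftT + 4) >> 3), (uint8_t)((topR + 2) >> 2) },
            { (uint8_t)((leftB + 2) >> 2),        (uint8_t)((topR + leftB + 4) >> 3) }
        };
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                pPred[y * kiStride + x] = dc[y >> 2][x >> 2];
    }
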
@@ -1096,75 +1096,75 @@
; void WelsDecoderI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r4, r0
- sub r0, r1
- movdqa xmm0, [r0] ; read one row
- pxor xmm1, xmm1
- psadbw xmm0, xmm1
- movdqa xmm1, xmm0
- psrldq xmm1, 0x08
- pslldq xmm0, 0x08
- psrldq xmm0, 0x08
- paddw xmm0, xmm1
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r4, r0
+ sub r0, r1
+ movdqa xmm0, [r0] ; read one row
+ pxor xmm1, xmm1
+ psadbw xmm0, xmm1
+ movdqa xmm1, xmm0
+ psrldq xmm1, 0x08
+ pslldq xmm0, 0x08
+ psrldq xmm0, 0x08
+ paddw xmm0, xmm1
- movzx r2, byte [r0+r1-0x01]
- movzx r3, byte [r0+2*r1-0x01]
- add r2, r3
- lea r0, [r0+r1]
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- add r2, 0x10
- movd xmm1, r2d
- paddw xmm0, xmm1
- psrld xmm0, 0x05
- pmuludq xmm0, [mmx_01bytes]
- pshufd xmm0, xmm0, 0
+ movzx r2, byte [r0+r1-0x01]
+ movzx r3, byte [r0+2*r1-0x01]
+ add r2, r3
+ lea r0, [r0+r1]
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ add r2, 0x10
+ movd xmm1, r2d
+ paddw xmm0, xmm1
+ psrld xmm0, 0x05
+ pmuludq xmm0, [mmx_01bytes]
+ pshufd xmm0, xmm0, 0
- movdqa [r4], xmm0
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
+ movdqa [r4], xmm0
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [r4+r1], xmm0
+ movdqa [r4+r1], xmm0
- pop r4
- pop r3
+ pop r4
+ pop r3
- ret
+ ret
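
The 16x16 luma DC routine sums the 16 bytes above the block (psadbw) and the 16 bytes to its left (the LOAD_2_LEFT_AND_ADD chain), adds 16 and shifts right by 5 (the add r2, 0x10 / psrld 5 pair), then broadcasts the byte across all rows. A scalar sketch (helper name illustrative, same pPred/kiStride convention):

    #include <stdint.h>

    static void I16x16PredDc_ref(uint8_t* pPred, int32_t kiStride) {
        const uint8_t* pTop = pPred - kiStride;
        int sum = 16;                               /* rounding term */
        for (int i = 0; i < 16; i++)
            sum += pTop[i] + pPred[i * kiStride - 1];
        uint8_t dc = (uint8_t)(sum >> 5);
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++)
                pPred[y * kiStride + x] = dc;
    }
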
;*******************************************************************************
; for intra prediction as follows, 11/19/2010
@@ -1171,239 +1171,239 @@
;*******************************************************************************
;*******************************************************************************
-; void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
+; void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
- %assign push_num 0
- LOAD_2_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- mov r2, r0
- sub r2, r1
- movdqa xmm0, [r2] ; pPred-kiStride, top line
- pxor xmm7, xmm7
- psadbw xmm0, xmm7
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- paddw xmm0, xmm1
- xor r2, r2
- movd r2d, xmm0
- ;movdqa xmm1, xmm0
- ;punpcklbw xmm0, xmm7
- ;punpckhbw xmm1, xmm7
+ %assign push_num 0
+ LOAD_2_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
+ sub r2, r1
+ movdqa xmm0, [r2] ; pPred-kiStride, top line
+ pxor xmm7, xmm7
+ psadbw xmm0, xmm7
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddw xmm0, xmm1
+ xor r2, r2
+ movd r2d, xmm0
+ ;movdqa xmm1, xmm0
+ ;punpcklbw xmm0, xmm7
+ ;punpckhbw xmm1, xmm7
- ;paddw xmm0, xmm1 ; (ub.max(ff) << 4) will not excceed of uw, so can perform it in unit of unsigned word scope
- ;pshufd xmm1, xmm0, 04eh ; 01001110, w3w2w1w0,w7w6w5w4
- ;paddw xmm0, xmm1 ; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
- ;pshufd xmm1, xmm0, 0b1h ; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
- ;paddw xmm0, xmm1 ; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
- ;pshuflw xmm1, xmm0, 0b1h ; 10110001
- ;paddw xmm0, xmm1 ; sum in word unit (x8)
- ;xor r3, r3
- ;movd r3d, xmm0
- ;and edx, 0ffffh
+    ;paddw xmm0, xmm1 ; (ub.max(ff) << 4) will not exceed uw, so this can be done in unsigned-word units
+ ;pshufd xmm1, xmm0, 04eh ; 01001110, w3w2w1w0,w7w6w5w4
+ ;paddw xmm0, xmm1 ; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
+ ;pshufd xmm1, xmm0, 0b1h ; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
+ ;paddw xmm0, xmm1 ; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
+ ;pshuflw xmm1, xmm0, 0b1h ; 10110001
+ ;paddw xmm0, xmm1 ; sum in word unit (x8)
+ ;xor r3, r3
+ ;movd r3d, xmm0
+ ;and edx, 0ffffh
- add r2, 8
- sar r2, 4
- SSE2_Copy16Times xmm1, r2d
- ;mov dh, dl
- ;mov r2, edx
- ;shl r2, 010h
- ;or edx, r2
- ;movd xmm1, edx
- ;pshufd xmm0, xmm1, 00h
- ;movdqa xmm1, xmm0
- movdqa xmm0, xmm1
- lea r2, [2*r1+r1] ; 3*kiStride
+ add r2, 8
+ sar r2, 4
+ SSE2_Copy16Times xmm1, r2d
+ ;mov dh, dl
+ ;mov r2, edx
+ ;shl r2, 010h
+ ;or edx, r2
+ ;movd xmm1, edx
+ ;pshufd xmm0, xmm1, 00h
+ ;movdqa xmm1, xmm0
+ movdqa xmm0, xmm1
+ lea r2, [2*r1+r1] ; 3*kiStride
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
- POP_XMM
- ret
+ POP_XMM
+ ret
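
Similarly, a hedged C sketch of the DC-top variant above (names illustrative): only the row above the block is available, so the average is (sum + 8) >> 4 over the 16 top neighbours.

#include <stdint.h>

static void I16x16LumaPredDcTop_ref (uint8_t* pPred, const int32_t kiStride) {
    int32_t iSum = 0;
    for (int i = 0; i < 16; i++)
        iSum += pPred[-kiStride + i];          /* top neighbours only */
    const uint8_t kuiDc = (uint8_t) ((iSum + 8) >> 4);
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            pPred[y * kiStride + x] = kuiDc;
}
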
;*******************************************************************************
-; void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
+; void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- lea r2, [2*r1+r1] ; 3*kiStride
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ lea r2, [2*r1+r1] ; 3*kiStride
- movdqa xmm0, [sse2_dc_0x80]
- movdqa xmm1, xmm0
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
+ movdqa xmm0, [sse2_dc_0x80]
+ movdqa xmm1, xmm0
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
- ret
+ ret
;*******************************************************************************
-; void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
+; void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx
- push r3
- push r4
- %assign push_num 2
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r4, r0
- ; for left
- dec r0
- xor r2, r2
- xor r3, r3
- movzx r2, byte [r0]
- movzx r3, byte [r0+r1]
- add r2, r3
- lea r0, [r0+2*r1]
- movzx r3, byte [r0]
- add r2, r3
- movzx r3, byte [r0+r1]
- add r2, r3
- add r2, 02h
- sar r2, 02h
- ;SSE2_Copy16Times mm0, r2d
- mov r3, r2
- sal r3, 8
- or r2, r3
- movd mm1, r2d
- pshufw mm0, mm1, 00h
- ;mov bh, bl
- ;movd mm1, ebx
- ;pshufw mm0, mm1, 00h ; up64
- movq mm1, mm0
- xor r2, r2
- lea r0, [r0+2*r1]
- movzx r2, byte [r0]
- movzx r3, byte [r0+r1]
- add r2, r3
- lea r0, [r0+2*r1]
- movzx r3, byte [r0]
- add r2, r3
- movzx r3, byte [r0+r1]
- add r2, r3
- add r2, 02h
- sar r2, 02h
- mov r3, r2
- sal r3, 8
- or r2, r3
- movd mm3, r2d
- pshufw mm2, mm3, 00h
- ;mov bh, bl
- ;movd mm3, ebx
- ;pshufw mm2, mm3, 00h ; down64
- ;SSE2_Copy16Times mm2, r2d
- movq mm3, mm2
- lea r2, [2*r1+r1]
- movq [r4], mm0
- movq [r4+r1], mm1
- movq [r4+2*r1], mm0
- movq [r4+r2], mm1
- lea r4, [r4+4*r1]
- movq [r4], mm2
- movq [r4+r1], mm3
- movq [r4+2*r1], mm2
- movq [r4+r2], mm3
- pop r4
- pop r3
- emms
- ret
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r4, r0
+ ; for left
+ dec r0
+ xor r2, r2
+ xor r3, r3
+ movzx r2, byte [r0]
+ movzx r3, byte [r0+r1]
+ add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0]
+ add r2, r3
+ movzx r3, byte [r0+r1]
+ add r2, r3
+ add r2, 02h
+ sar r2, 02h
+ ;SSE2_Copy16Times mm0, r2d
+ mov r3, r2
+ sal r3, 8
+ or r2, r3
+ movd mm1, r2d
+ pshufw mm0, mm1, 00h
+ ;mov bh, bl
+ ;movd mm1, ebx
+ ;pshufw mm0, mm1, 00h ; up64
+ movq mm1, mm0
+ xor r2, r2
+ lea r0, [r0+2*r1]
+ movzx r2, byte [r0]
+ movzx r3, byte [r0+r1]
+ add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0]
+ add r2, r3
+ movzx r3, byte [r0+r1]
+ add r2, r3
+ add r2, 02h
+ sar r2, 02h
+ mov r3, r2
+ sal r3, 8
+ or r2, r3
+ movd mm3, r2d
+ pshufw mm2, mm3, 00h
+ ;mov bh, bl
+ ;movd mm3, ebx
+ ;pshufw mm2, mm3, 00h ; down64
+ ;SSE2_Copy16Times mm2, r2d
+ movq mm3, mm2
+ lea r2, [2*r1+r1]
+ movq [r4], mm0
+ movq [r4+r1], mm1
+ movq [r4+2*r1], mm0
+ movq [r4+r2], mm1
+ lea r4, [r4+4*r1]
+ movq [r4], mm2
+ movq [r4+r1], mm3
+ movq [r4+2*r1], mm2
+ movq [r4+r2], mm3
+ pop r4
+ pop r3
+ emms
+ ret
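
A small C sketch of the chroma DC-left routine above (illustrative names, not taken from the source): the upper four and lower four left neighbours are averaged separately with (sum + 2) >> 2, and each average fills its half of the 8x8 block.

#include <stdint.h>

static void IChromaPredDcLeft_ref (uint8_t* pPred, const int32_t kiStride) {
    int32_t iSumUp = 0, iSumDown = 0;
    for (int i = 0; i < 4; i++) {
        iSumUp   += pPred[i * kiStride - 1];        /* left[0..3] */
        iSumDown += pPred[(i + 4) * kiStride - 1];  /* left[4..7] */
    }
    const uint8_t kuiUp   = (uint8_t) ((iSumUp   + 2) >> 2);
    const uint8_t kuiDown = (uint8_t) ((iSumDown + 2) >> 2);
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            pPred[y * kiStride + x] = (y < 4) ? kuiUp : kuiDown;
}
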
;*******************************************************************************
-; void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
+; void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
- %assign push_num 0
- LOAD_2_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- mov r2, r0
- sub r2, r1
- movq xmm0, [r2] ; top: 8x1 pixels
- pxor xmm7, xmm7
- punpcklbw xmm0, xmm7 ; ext 8x2 words
- pshufd xmm1, xmm0, 0B1h ; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
- paddw xmm0, xmm1 ; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2
- movdqa xmm1, xmm0
- pshuflw xmm2, xmm0, 0B1h ; 10110001 B, .. w0+2 w1+3 w0+2 w1+3
- pshufhw xmm3, xmm1, 0B1h ; 10110001 B, w4+6 w5+7 w4+6 w5+7 ..
- paddw xmm0, xmm2 ; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3
- paddw xmm1, xmm3 ; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
- punpckhqdq xmm1, xmm7
- punpcklqdq xmm0, xmm1 ; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
- movdqa xmm6, [sse2_wd_0x02]
- paddw xmm0, xmm6
- psraw xmm0, 02h
- packuswb xmm0, xmm7
- lea r2, [2*r1+r1]
- movq [r0], xmm0
- movq [r0+r1], xmm0
- movq [r0+2*r1], xmm0
- movq [r0+r2], xmm0
- lea r0, [r0+4*r1]
- movq [r0], xmm0
- movq [r0+r1], xmm0
- movq [r0+2*r1], xmm0
- movq [r0+r2], xmm0
- POP_XMM
- ret
+ %assign push_num 0
+ LOAD_2_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
+ sub r2, r1
+ movq xmm0, [r2] ; top: 8x1 pixels
+ pxor xmm7, xmm7
+ punpcklbw xmm0, xmm7 ; ext 8x2 words
+ pshufd xmm1, xmm0, 0B1h ; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
+ paddw xmm0, xmm1 ; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2
+ movdqa xmm1, xmm0
+ pshuflw xmm2, xmm0, 0B1h ; 10110001 B, .. w0+2 w1+3 w0+2 w1+3
+ pshufhw xmm3, xmm1, 0B1h ; 10110001 B, w4+6 w5+7 w4+6 w5+7 ..
+ paddw xmm0, xmm2 ; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3
+ paddw xmm1, xmm3 ; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
+ punpckhqdq xmm1, xmm7
+ punpcklqdq xmm0, xmm1 ; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
+ movdqa xmm6, [sse2_wd_0x02]
+ paddw xmm0, xmm6
+ psraw xmm0, 02h
+ packuswb xmm0, xmm7
+ lea r2, [2*r1+r1]
+ movq [r0], xmm0
+ movq [r0+r1], xmm0
+ movq [r0+2*r1], xmm0
+ movq [r0+r2], xmm0
+ lea r0, [r0+4*r1]
+ movq [r0], xmm0
+ movq [r0+r1], xmm0
+ movq [r0+2*r1], xmm0
+ movq [r0+r2], xmm0
+ POP_XMM
+ ret
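
And the DC-top chroma counterpart above, sketched in C (names illustrative): the left and right halves of the top row are averaged separately, each average filling four columns of the 8x8 block, which matches the per-half sums and the (sum + 2) >> 2 rounding in the SSE2 code.

#include <stdint.h>

static void IChromaPredDcTop_ref (uint8_t* pPred, const int32_t kiStride) {
    int32_t iSum03 = 0, iSum47 = 0;
    for (int i = 0; i < 4; i++) {
        iSum03 += pPred[-kiStride + i];        /* top[0..3] */
        iSum47 += pPred[-kiStride + i + 4];    /* top[4..7] */
    }
    const uint8_t kuiLeftHalf  = (uint8_t) ((iSum03 + 2) >> 2);
    const uint8_t kuiRightHalf = (uint8_t) ((iSum47 + 2) >> 2);
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            pPred[y * kiStride + x] = (x < 4) ? kuiLeftHalf : kuiRightHalf;
}
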
;*******************************************************************************
-; void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
+; void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- lea r2, [2*r1+r1]
- movq mm0, [sse2_dc_0x80]
- movq mm1, mm0
- movq [r0], mm0
- movq [r0+r1], mm1
- movq [r0+2*r1], mm0
- movq [r0+r2], mm1
- lea r0, [r0+4*r1]
- movq [r0], mm0
- movq [r0+r1], mm1
- movq [r0+2*r1], mm0
- movq [r0+r2], mm1
- emms
- ret
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ lea r2, [2*r1+r1]
+ movq mm0, [sse2_dc_0x80]
+ movq mm1, mm0
+ movq [r0], mm0
+ movq [r0+r1], mm1
+ movq [r0+2*r1], mm0
+ movq [r0+r2], mm1
+ lea r0, [r0+4*r1]
+ movq [r0], mm0
+ movq [r0+r1], mm1
+ movq [r0+2*r1], mm0
+ movq [r0+r2], mm1
+ emms
+ ret
--- a/codec/encoder/core/arm/intra_pred_neon.S
+++ b/codec/encoder/core/arm/intra_pred_neon.S
@@ -38,107 +38,107 @@
#ifdef __APPLE__
//Global macro
.macro GET_8BYTE_DATA
- vld1.8 {$0[0]}, [$1], $2
- vld1.8 {$0[1]}, [$1], $2
- vld1.8 {$0[2]}, [$1], $2
- vld1.8 {$0[3]}, [$1], $2
- vld1.8 {$0[4]}, [$1], $2
- vld1.8 {$0[5]}, [$1], $2
- vld1.8 {$0[6]}, [$1], $2
- vld1.8 {$0[7]}, [$1], $2
+ vld1.8 {$0[0]}, [$1], $2
+ vld1.8 {$0[1]}, [$1], $2
+ vld1.8 {$0[2]}, [$1], $2
+ vld1.8 {$0[3]}, [$1], $2
+ vld1.8 {$0[4]}, [$1], $2
+ vld1.8 {$0[5]}, [$1], $2
+ vld1.8 {$0[6]}, [$1], $2
+ vld1.8 {$0[7]}, [$1], $2
.endm
#else
//Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2
- vld1.8 {\arg0[0]}, [\arg1], \arg2
- vld1.8 {\arg0[1]}, [\arg1], \arg2
- vld1.8 {\arg0[2]}, [\arg1], \arg2
- vld1.8 {\arg0[3]}, [\arg1], \arg2
- vld1.8 {\arg0[4]}, [\arg1], \arg2
- vld1.8 {\arg0[5]}, [\arg1], \arg2
- vld1.8 {\arg0[6]}, [\arg1], \arg2
- vld1.8 {\arg0[7]}, [\arg1], \arg2
+ vld1.8 {\arg0[0]}, [\arg1], \arg2
+ vld1.8 {\arg0[1]}, [\arg1], \arg2
+ vld1.8 {\arg0[2]}, [\arg1], \arg2
+ vld1.8 {\arg0[3]}, [\arg1], \arg2
+ vld1.8 {\arg0[4]}, [\arg1], \arg2
+ vld1.8 {\arg0[5]}, [\arg1], \arg2
+ vld1.8 {\arg0[6]}, [\arg1], \arg2
+ vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
#endif
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
- //Get the top line data to 'q0'
- sub r3, r1, r2
- vldm r3, {d0, d1}
+ //Get the top line data to 'q0'
+ sub r3, r1, r2
+ vldm r3, {d0, d1}
- //mov r2, #16
- mov r3, #4
- //Set the top line to the each line of MB(16*16)
+ //mov r2, #16
+ mov r3, #4
+    //Set the top line to each line of the MB (16*16)
loop_0_get_i16x16_luma_pred_v:
- vst1.8 {d0,d1}, [r0]!
- vst1.8 {d0,d1}, [r0]!
- vst1.8 {d0,d1}, [r0]!
- vst1.8 {d0,d1}, [r0]!
- subs r3, #1
- bne loop_0_get_i16x16_luma_pred_v
+ vst1.8 {d0,d1}, [r0]!
+ vst1.8 {d0,d1}, [r0]!
+ vst1.8 {d0,d1}, [r0]!
+ vst1.8 {d0,d1}, [r0]!
+ subs r3, #1
+ bne loop_0_get_i16x16_luma_pred_v
WELS_ASM_FUNC_END
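
On the encoder side the prediction is written into a contiguous 16x16 buffer rather than back into the picture. A hedged C sketch of the vertical predictor above, assuming the encoder-style argument order (pPred output buffer, pRef reconstructed picture, kiStride), which is how r0, r1 and r2 appear to be used:

#include <stdint.h>

static void I16x16LumaPredV_ref (uint8_t* pPred, const uint8_t* pRef, const int32_t kiStride) {
    const uint8_t* kpTop = pRef - kiStride;    /* the row above the block     */
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            pPred[y * 16 + x] = kpTop[x];      /* copy it into every row      */
}
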
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
//stmdb sp!, {r4, lr}
- sub r1, r1, #1
- mov r3, #4
+ sub r1, r1, #1
+ mov r3, #4
loop_0_get_i16x16_luma_pred_h:
- //Get one byte data from left side
- vld1.8 {d0[],d1[]}, [r1], r2
- vld1.8 {d2[],d3[]}, [r1], r2
- vld1.8 {d4[],d5[]}, [r1], r2
- vld1.8 {d6[],d7[]}, [r1], r2
+ //Get one byte data from left side
+ vld1.8 {d0[],d1[]}, [r1], r2
+ vld1.8 {d2[],d3[]}, [r1], r2
+ vld1.8 {d4[],d5[]}, [r1], r2
+ vld1.8 {d6[],d7[]}, [r1], r2
- //Set the line of MB using the left side byte data
- vst1.8 {d0,d1}, [r0]!
- //add r0, #16
- vst1.8 {d2,d3}, [r0]!
- //add r0, #16
- vst1.8 {d4,d5}, [r0]!
- //add r0, #16
- vst1.8 {d6,d7}, [r0]!
- //add r0, #16
+ //Set the line of MB using the left side byte data
+ vst1.8 {d0,d1}, [r0]!
+ //add r0, #16
+ vst1.8 {d2,d3}, [r0]!
+ //add r0, #16
+ vst1.8 {d4,d5}, [r0]!
+ //add r0, #16
+ vst1.8 {d6,d7}, [r0]!
+ //add r0, #16
- subs r3, #1
- bne loop_0_get_i16x16_luma_pred_h
+ subs r3, #1
+ bne loop_0_get_i16x16_luma_pred_h
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
- //stmdb sp!, { r2-r5, lr}
- //Get the left vertical line data
- sub r3, r1, #1
- GET_8BYTE_DATA d0, r3, r2
- GET_8BYTE_DATA d1, r3, r2
+ //stmdb sp!, { r2-r5, lr}
+ //Get the left vertical line data
+ sub r3, r1, #1
+ GET_8BYTE_DATA d0, r3, r2
+ GET_8BYTE_DATA d1, r3, r2
- //Get the top horizontal line data
- sub r3, r1, r2
- vldm r3, {d2, d3}
+ //Get the top horizontal line data
+ sub r3, r1, r2
+ vldm r3, {d2, d3}
- //Calculate the sum of top horizontal line data and vertical line data
- vpaddl.u8 q0, q0
- vpaddl.u8 q1, q1
- vadd.u16 q0, q0, q1
- vadd.u16 d0, d0, d1
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
+ //Calculate the sum of top horizontal line data and vertical line data
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
- //Calculate the mean value
- vrshr.u16 d0, d0, #5
- vdup.8 q0, d0[0]
+ //Calculate the mean value
+ vrshr.u16 d0, d0, #5
+ vdup.8 q0, d0[0]
- //Set the mean value to the all of member of MB
- mov r3, #4
+    //Set the mean value to all pixels of the MB
+ mov r3, #4
loop_0_get_i16x16_luma_pred_dc_both:
- vst1.8 {d0,d1}, [r0]!
- vst1.8 {d0,d1}, [r0]!
- vst1.8 {d0,d1}, [r0]!
- vst1.8 {d0,d1}, [r0]!
- subs r3, #1
- bne loop_0_get_i16x16_luma_pred_dc_both
+ vst1.8 {d0,d1}, [r0]!
+ vst1.8 {d0,d1}, [r0]!
+ vst1.8 {d0,d1}, [r0]!
+ vst1.8 {d0,d1}, [r0]!
+ subs r3, #1
+ bne loop_0_get_i16x16_luma_pred_dc_both
WELS_ASM_FUNC_END
@@ -151,383 +151,383 @@
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
- //stmdb sp!, { r4, lr}
+ //stmdb sp!, { r4, lr}
- //Load the table {(8,7,6,5,4,3,2,1) * 5}
- adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
- vldr d0, [r3]
+ //Load the table {(8,7,6,5,4,3,2,1) * 5}
+ adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
+ vldr d0, [r3]
- //Pack the top[-1] ~ top[6] to d1
- sub r3, r1, r2
- sub r1, r3, #1
- vld1.8 d1, [r1]
+ //Pack the top[-1] ~ top[6] to d1
+ sub r3, r1, r2
+ sub r1, r3, #1
+ vld1.8 d1, [r1]
- //Pack the top[8] ~ top[15] to d2
- add r1, #9
- vld1.8 d2, [r1]
+ //Pack the top[8] ~ top[15] to d2
+ add r1, #9
+ vld1.8 d2, [r1]
- //Save the top[15] to d6 for next step
- vdup.u8 d6, d2[7]
+ //Save the top[15] to d6 for next step
+ vdup.u8 d6, d2[7]
- //Get and pack left[-1] ~ left[6] to d4
- sub r1, r3, #1
- GET_8BYTE_DATA d4, r1, r2
+ //Get and pack left[-1] ~ left[6] to d4
+ sub r1, r3, #1
+ GET_8BYTE_DATA d4, r1, r2
- //Get and pack left[8] ~ left[15] to d3
- add r1, r2
- GET_8BYTE_DATA d3, r1, r2
+ //Get and pack left[8] ~ left[15] to d3
+ add r1, r2
+ GET_8BYTE_DATA d3, r1, r2
- //Save the left[15] to d7 for next step
- vdup.u8 d7, d3[7]
+ //Save the left[15] to d7 for next step
+ vdup.u8 d7, d3[7]
- //revert the sequence of d2,d3
- vrev64.8 q1, q1
+    //reverse the sequence of d2, d3
+ vrev64.8 q1, q1
- vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
- vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
+ vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
+ vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
- vmovl.u8 q0, d0
- vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
- vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
+ vmovl.u8 q0, d0
+ vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
+ vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
- //Calculate the sum of items of q1, q2
- vpadd.s16 d0, d2, d3
- vpadd.s16 d1, d4, d5
- vpaddl.s16 q0, q0
- vpaddl.s32 q0, q0
+ //Calculate the sum of items of q1, q2
+ vpadd.s16 d0, d2, d3
+ vpadd.s16 d1, d4, d5
+ vpaddl.s16 q0, q0
+ vpaddl.s32 q0, q0
- //Get the value of 'b', 'c' and extend to q1, q2.
- vrshr.s64 q0, #6
- vdup.s16 q1, d0[0]
- vdup.s16 q2, d1[0]
+ //Get the value of 'b', 'c' and extend to q1, q2.
+ vrshr.s64 q0, #6
+ vdup.s16 q1, d0[0]
+ vdup.s16 q2, d1[0]
- //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
- adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
- vld1.32 {d0}, [r3]
+ //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
+ adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
+ vld1.32 {d0}, [r3]
- //Get the value of 'a' and save to q3
- vaddl.u8 q3, d6, d7
- vshl.u16 q3, #4
+ //Get the value of 'a' and save to q3
+ vaddl.u8 q3, d6, d7
+ vshl.u16 q3, #4
- //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
- vmovl.s8 q0, d0
- vmla.s16 q3, q0, q1
- vmla.s16 q3, q2, d0[0]
+ //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
+ vmovl.s8 q0, d0
+ vmla.s16 q3, q0, q1
+ vmla.s16 q3, q2, d0[0]
- //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
- vshl.s16 q8, q1, #3
- vadd.s16 q8, q3
+ //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
+ vshl.s16 q8, q1, #3
+ vadd.s16 q8, q3
- //right shift 5 bits and rounding
- vqrshrun.s16 d0, q3, #5
- vqrshrun.s16 d1, q8, #5
+ //right shift 5 bits and rounding
+ vqrshrun.s16 d0, q3, #5
+ vqrshrun.s16 d1, q8, #5
- //Set the line of MB
- vst1.u32 {d0,d1}, [r0]!
+ //Set the line of MB
+ vst1.u32 {d0,d1}, [r0]!
- //Do the same processing for setting other lines
- mov r3, #15
+ //Do the same processing for setting other lines
+ mov r3, #15
loop_0_get_i16x16_luma_pred_plane:
- vadd.s16 q3, q2
- vadd.s16 q8, q2
- vqrshrun.s16 d0, q3, #5
- vqrshrun.s16 d1, q8, #5
- vst1.u32 {d0,d1}, [r0]!
- subs r3, #1
- bne loop_0_get_i16x16_luma_pred_plane
+ vadd.s16 q3, q2
+ vadd.s16 q8, q2
+ vqrshrun.s16 d0, q3, #5
+ vqrshrun.s16 d1, q8, #5
+ vst1.u32 {d0,d1}, [r0]!
+ subs r3, #1
+ bne loop_0_get_i16x16_luma_pred_plane
WELS_ASM_FUNC_END
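
The plane predictor above follows the standard H.264 formulation; the {(8..1)*5} table and the rounding shift by 6 are consistent with b = (5*H + 32) >> 6 and c = (5*V + 32) >> 6. Below is a hedged C sketch with illustrative names and a contiguous output buffer, as in the other encoder routines.

#include <stdint.h>

static uint8_t Clip255 (int32_t v) {
    return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void I16x16LumaPredPlane_ref (uint8_t* pPred, const uint8_t* pRef, const int32_t kiStride) {
    const uint8_t* kpTop  = pRef - kiStride;   /* top[-1..15] reachable via kpTop[-1]   */
    const uint8_t* kpLeft = pRef - 1;          /* left[y] is kpLeft[y * kiStride]       */
    int32_t iH = 0, iV = 0;
    for (int i = 1; i <= 8; i++) {
        iH += i * (kpTop[7 + i] - kpTop[7 - i]);
        iV += i * (kpLeft[(7 + i) * kiStride] - kpLeft[(7 - i) * kiStride]);
    }
    const int32_t iA = 16 * (kpTop[15] + kpLeft[15 * kiStride]);
    const int32_t iB = (5 * iH + 32) >> 6;
    const int32_t iC = (5 * iV + 32) >> 6;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            pPred[y * 16 + x] = Clip255 ((iA + iB * (x - 7) + iC * (y - 7) + 16) >> 5);
}

The saturating narrow (vqrshrun) in the assembly plays the role of the Clip255 call here.
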
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredV_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row (4 bytes)
- sub r3, r1, r2
- ldr r3, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row (4 bytes)
+ sub r3, r1, r2
+ ldr r3, [r3]
- //Set the luma MB using top line
- str r3, [r0], #4
- str r3, [r0], #4
- str r3, [r0], #4
- str r3, [r0]
+ //Set the luma MB using top line
+ str r3, [r0], #4
+ str r3, [r0], #4
+ str r3, [r0], #4
+ str r3, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredH_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the left column (4 bytes)
- sub r3, r1, #1
- vld1.8 {d0[]}, [r3], r2
- vld1.8 {d1[]}, [r3], r2
- vld1.8 {d2[]}, [r3], r2
- vld1.8 {d3[]}, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the left column (4 bytes)
+ sub r3, r1, #1
+ vld1.8 {d0[]}, [r3], r2
+ vld1.8 {d1[]}, [r3], r2
+ vld1.8 {d2[]}, [r3], r2
+ vld1.8 {d3[]}, [r3]
- //Set the luma MB using the left side byte
- vst1.32 {d0[0]}, [r0]!
- vst1.32 {d1[0]}, [r0]!
- vst1.32 {d2[0]}, [r0]!
- vst1.32 {d3[0]}, [r0]
+ //Set the luma MB using the left side byte
+ vst1.32 {d0[0]}, [r0]!
+ vst1.32 {d1[0]}, [r0]!
+ vst1.32 {d2[0]}, [r0]!
+ vst1.32 {d3[0]}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDL_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row data(8 bytes)
- sub r3, r1, r2
- vld1.32 {d0}, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row data(8 bytes)
+ sub r3, r1, r2
+ vld1.32 {d0}, [r3]
- //For "t7 + (t7<<1)"
- vdup.8 d1, d0[7]
+ //For "t7 + (t7<<1)"
+ vdup.8 d1, d0[7]
- //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
- vext.8 d1, d0, d1, #1
- vaddl.u8 q1, d1, d0
+ //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q1, d1, d0
- //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
- vext.8 q2, q1, q1, #14
- vadd.u16 q0, q1, q2
+ //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
+ vext.8 q2, q1, q1, #14
+ vadd.u16 q0, q1, q2
- //right shift 2 bits and rounding
- vqrshrn.u16 d0, q0, #2
+ //right shift 2 bits and rounding
+ vqrshrn.u16 d0, q0, #2
- //Save "ddl0, ddl1, ddl2, ddl3"
- vext.8 d1, d0, d0, #1
- vst1.32 d1[0], [r0]!
+ //Save "ddl0, ddl1, ddl2, ddl3"
+ vext.8 d1, d0, d0, #1
+ vst1.32 d1[0], [r0]!
- //Save "ddl1, ddl2, ddl3, ddl4"
- vext.8 d1, d0, d0, #2
- vst1.32 d1[0], [r0]!
+ //Save "ddl1, ddl2, ddl3, ddl4"
+ vext.8 d1, d0, d0, #2
+ vst1.32 d1[0], [r0]!
- //Save "ddl2, ddl3, ddl4, ddl5"
- vext.8 d1, d0, d0, #3
- vst1.32 d1[0], [r0]!
+ //Save "ddl2, ddl3, ddl4, ddl5"
+ vext.8 d1, d0, d0, #3
+ vst1.32 d1[0], [r0]!
- //Save "ddl3, ddl4, ddl5, ddl6"
- vst1.32 d0[1], [r0]
+ //Save "ddl3, ddl4, ddl5, ddl6"
+ vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
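
The diagonal-down-left case above applies the familiar (p[i] + 2*p[i+1] + p[i+2] + 2) >> 2 filter along the top row, with the last top sample repeated past the end. In C (illustrative names, contiguous 4x4 output):

#include <stdint.h>

static void I4x4LumaPredDDL_ref (uint8_t* pPred, const uint8_t* pRef, const int32_t kiStride) {
    const uint8_t* kpTop = pRef - kiStride;    /* t[0..7] above the 4x4 block */
    uint8_t t[9];
    for (int i = 0; i < 8; i++)
        t[i] = kpTop[i];
    t[8] = kpTop[7];                           /* t7 repeated past the end    */
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            pPred[y * 4 + x] = (uint8_t) ((t[x + y] + 2 * t[x + y + 1] + t[x + y + 2] + 2) >> 2);
}
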
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDR_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row (4 bytes)
- sub r3, r1, r2
- vld1.32 {d0[1]}, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row (4 bytes)
+ sub r3, r1, r2
+ vld1.32 {d0[1]}, [r3]
- //Load the left column (5 bytes)
- sub r3, #1
- vld1.8 {d0[3]}, [r3], r2
- vld1.8 {d0[2]}, [r3], r2
- vld1.8 {d0[1]}, [r3], r2
- vld1.8 {d0[0]}, [r3], r2
- vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
+ //Load the left column (5 bytes)
+ sub r3, #1
+ vld1.8 {d0[3]}, [r3], r2
+ vld1.8 {d0[2]}, [r3], r2
+ vld1.8 {d0[1]}, [r3], r2
+ vld1.8 {d0[0]}, [r3], r2
+ vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
- vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
- //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
+ vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
+ //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
- //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
- vaddl.u8 q2, d2, d0
+ //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
+ vaddl.u8 q2, d2, d0
- //q1:{TL0+LT0,LT0+T01,...L12+L23}
- vext.8 q3, q3, q2, #14
- vadd.u16 q1, q2, q3
+ //q1:{TL0+LT0,LT0+T01,...L12+L23}
+ vext.8 q3, q3, q2, #14
+ vadd.u16 q1, q2, q3
- //right shift 2 bits and rounding
- vqrshrn.u16 d0, q1, #2
+ //right shift 2 bits and rounding
+ vqrshrn.u16 d0, q1, #2
- //Adjust the data sequence for setting luma MB of 'pred'
- vst1.32 d0[1], [r0]!
- vext.8 d0, d0, d0, #7
- vst1.32 d0[1], [r0]!
- vext.8 d0, d0, d0, #7
- vst1.32 d0[1], [r0]!
- vext.8 d0, d0, d0, #7
- vst1.32 d0[1], [r0]
+ //Adjust the data sequence for setting luma MB of 'pred'
+ vst1.32 d0[1], [r0]!
+ vext.8 d0, d0, d0, #7
+ vst1.32 d0[1], [r0]!
+ vext.8 d0, d0, d0, #7
+ vst1.32 d0[1], [r0]!
+ vext.8 d0, d0, d0, #7
+ vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVL_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row (8 bytes)
- sub r3, r1, r2
- vld1.32 {d0}, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row (8 bytes)
+ sub r3, r1, r2
+ vld1.32 {d0}, [r3]
- vext.8 d1, d0, d0, #1
- vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
+ vext.8 d1, d0, d0, #1
+ vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
- vext.8 q2, q1, q1, #2
- vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
+ vext.8 q2, q1, q1, #2
+ vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
- //calculate the "vl0,vl1,vl2,vl3,vl4"
- vqrshrn.u16 d0, q1, #1
+ //calculate the "vl0,vl1,vl2,vl3,vl4"
+ vqrshrn.u16 d0, q1, #1
- //calculate the "vl5,vl6,vl7,vl8,vl9"
- vqrshrn.u16 d1, q2, #2
+ //calculate the "vl5,vl6,vl7,vl8,vl9"
+ vqrshrn.u16 d1, q2, #2
- //Adjust the data sequence for setting the luma MB
- vst1.32 d0[0], [r0]!
- vst1.32 d1[0], [r0]!
- vext.8 d0, d0, d0, #1
- vext.8 d1, d1, d1, #1
- vst1.32 d0[0], [r0]!
- vst1.32 d1[0], [r0]
+ //Adjust the data sequence for setting the luma MB
+ vst1.32 d0[0], [r0]!
+ vst1.32 d1[0], [r0]!
+ vext.8 d0, d0, d0, #1
+ vext.8 d1, d1, d1, #1
+ vst1.32 d0[0], [r0]!
+ vst1.32 d1[0], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVR_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row (4 bytes)
- sub r3, r1, r2
- vld1.32 {d0[1]}, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row (4 bytes)
+ sub r3, r1, r2
+ vld1.32 {d0[1]}, [r3]
- //Load the left column (4 bytes)
- sub r3, #1
- vld1.8 {d0[3]}, [r3], r2
- vld1.8 {d0[2]}, [r3], r2
- vld1.8 {d0[1]}, [r3], r2
- vld1.8 {d0[0]}, [r3]
+ //Load the left column (4 bytes)
+ sub r3, #1
+ vld1.8 {d0[3]}, [r3], r2
+ vld1.8 {d0[2]}, [r3], r2
+ vld1.8 {d0[1]}, [r3], r2
+ vld1.8 {d0[0]}, [r3]
- vext.8 d1, d0, d0, #7
- vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
+ vext.8 d1, d0, d0, #7
+ vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
- vext.u8 q2, q1, q1, #14
- vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
+ vext.u8 q2, q1, q1, #14
+ vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
- //Calculate the vr0 ~ vr9
- vqrshrn.u16 d1, q2, #2
- vqrshrn.u16 d0, q1, #1
+ //Calculate the vr0 ~ vr9
+ vqrshrn.u16 d1, q2, #2
+ vqrshrn.u16 d0, q1, #1
- //Adjust the data sequence for setting the luma MB
- vst1.32 d0[1], [r0]!
- vst1.32 d1[1], [r0]!
- //add r2, r0, r1
- vst1.8 d1[3], [r0]!
- vst1.16 d0[2], [r0]!
- vst1.8 d0[6], [r0]!
- vst1.8 d1[2], [r0]!
- vst1.16 d1[2], [r0]!
- vst1.8 d1[6], [r0]
+ //Adjust the data sequence for setting the luma MB
+ vst1.32 d0[1], [r0]!
+ vst1.32 d1[1], [r0]!
+ //add r2, r0, r1
+ vst1.8 d1[3], [r0]!
+ vst1.16 d0[2], [r0]!
+ vst1.8 d0[6], [r0]!
+ vst1.8 d1[2], [r0]!
+ vst1.16 d1[2], [r0]!
+ vst1.8 d1[6], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHU_neon
- //stmdb sp!, { r4, lr}
- //Load the left column data
- sub r3, r1, #1
- mov r1, #3
- mul r1, r2
- add r1, r3
- vld1.8 {d0[]}, [r1]
- vld1.8 {d0[4]}, [r3], r2
- vld1.8 {d0[5]}, [r3], r2
- vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
+ //stmdb sp!, { r4, lr}
+ //Load the left column data
+ sub r3, r1, #1
+ mov r1, #3
+ mul r1, r2
+ add r1, r3
+ vld1.8 {d0[]}, [r1]
+ vld1.8 {d0[4]}, [r3], r2
+ vld1.8 {d0[5]}, [r3], r2
+ vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
- vext.8 d1, d0, d0, #1
- vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
+ vext.8 d1, d0, d0, #1
+ vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
- vext.u8 d2, d5, d4, #2
- vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
+ vext.u8 d2, d5, d4, #2
+ vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
- //Calculate the hu0 ~ hu5
- vqrshrn.u16 d2, q2, #1
- vqrshrn.u16 d1, q1, #2
+ //Calculate the hu0 ~ hu5
+ vqrshrn.u16 d2, q2, #1
+ vqrshrn.u16 d1, q1, #2
- //Adjust the data sequence for setting the luma MB
- vzip.8 d2, d1
- vst1.32 d1[0], [r0]!
- vext.8 d2, d1, d1, #2
- vst1.32 d2[0], [r0]!
- vst1.32 d1[1], [r0]!
- vst1.32 d0[0], [r0]
+ //Adjust the data sequence for setting the luma MB
+ vzip.8 d2, d1
+ vst1.32 d1[0], [r0]!
+ vext.8 d2, d1, d1, #2
+ vst1.32 d2[0], [r0]!
+ vst1.32 d1[1], [r0]!
+ vst1.32 d0[0], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHD_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the data
- sub r3, r1, r2
- sub r3, #1
- vld1.32 {d0[1]}, [r3], r2
- vld1.8 {d0[3]}, [r3], r2
- vld1.8 {d0[2]}, [r3], r2
- vld1.8 {d0[1]}, [r3], r2
- vld1.8 {d0[0]}, [r3] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
+ //stmdb sp!, { r2-r5, lr}
+ //Load the data
+ sub r3, r1, r2
+ sub r3, #1
+ vld1.32 {d0[1]}, [r3], r2
+ vld1.8 {d0[3]}, [r3], r2
+ vld1.8 {d0[2]}, [r3], r2
+ vld1.8 {d0[1]}, [r3], r2
+ vld1.8 {d0[0]}, [r3] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
- vext.8 d1, d0, d0, #7
- vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
+ vext.8 d1, d0, d0, #7
+ vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
- vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
- vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
+ vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
+ vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
- //Calculate the hd0~hd9
- vqrshrn.u16 d1, q3, #2
- vqrshrn.u16 d0, q2, #1
+ //Calculate the hd0~hd9
+ vqrshrn.u16 d1, q3, #2
+ vqrshrn.u16 d0, q2, #1
- //Adjust the data sequence for setting the luma MB
- vmov d3, d1
- vtrn.8 d0, d1
- vext.u8 d2, d1, d1, #6
- vst2.16 {d2[3], d3[3]}, [r0]!
- vst2.16 {d0[2], d1[2]}, [r0]!
- vmov d3, d0
- vst2.16 {d2[2], d3[2]}, [r0]!
- vst2.16 {d0[1], d1[1]}, [r0]
+ //Adjust the data sequence for setting the luma MB
+ vmov d3, d1
+ vtrn.8 d0, d1
+ vext.u8 d2, d1, d1, #6
+ vst2.16 {d2[3], d3[3]}, [r0]!
+ vst2.16 {d0[2], d1[2]}, [r0]!
+ vmov d3, d0
+ vst2.16 {d2[2], d3[2]}, [r0]!
+ vst2.16 {d0[1], d1[1]}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIChromaPredV_neon
- //stmdb sp!, { r2-r5, lr}
- //Get the top row (8 byte)
- sub r3, r1, r2
- vldr d0, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ //Get the top row (8 byte)
+ sub r3, r1, r2
+ vldr d0, [r3]
- //Set the chroma MB using top row data
- vst1.8 {d0}, [r0]!
- vst1.8 {d0}, [r0]!
- vst1.8 {d0}, [r0]!
- vst1.8 {d0}, [r0]!
- vst1.8 {d0}, [r0]!
- vst1.8 {d0}, [r0]!
- vst1.8 {d0}, [r0]!
- vst1.8 {d0}, [r0]
+ //Set the chroma MB using top row data
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon
- //stmdb sp!, { r2-r5, lr}
- ////Get the left column (8 byte)
- sub r3, r1, #1
- vld1.8 {d0[]}, [r3], r2
- vld1.8 {d1[]}, [r3], r2
- vld1.8 {d2[]}, [r3], r2
- vld1.8 {d3[]}, [r3], r2
- vld1.8 {d4[]}, [r3], r2
- vld1.8 {d5[]}, [r3], r2
- vld1.8 {d6[]}, [r3], r2
- vld1.8 {d7[]}, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ ////Get the left column (8 byte)
+ sub r3, r1, #1
+ vld1.8 {d0[]}, [r3], r2
+ vld1.8 {d1[]}, [r3], r2
+ vld1.8 {d2[]}, [r3], r2
+ vld1.8 {d3[]}, [r3], r2
+ vld1.8 {d4[]}, [r3], r2
+ vld1.8 {d5[]}, [r3], r2
+ vld1.8 {d6[]}, [r3], r2
+ vld1.8 {d7[]}, [r3]
- //Set the chroma MB using left column data
- vst1.8 {d0}, [r0]!
- vst1.8 {d1}, [r0]!
- vst1.8 {d2}, [r0]!
- vst1.8 {d3}, [r0]!
- vst1.8 {d4}, [r0]!
- vst1.8 {d5}, [r0]!
- vst1.8 {d6}, [r0]!
- vst1.8 {d7}, [r0]
+ //Set the chroma MB using left column data
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d1}, [r0]!
+ vst1.8 {d2}, [r0]!
+ vst1.8 {d3}, [r0]!
+ vst1.8 {d4}, [r0]!
+ vst1.8 {d5}, [r0]!
+ vst1.8 {d6}, [r0]!
+ vst1.8 {d7}, [r0]
WELS_ASM_FUNC_END
@@ -575,73 +575,73 @@
CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
WELS_ASM_FUNC_BEGIN WelsIChromaPredPlane_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row data
- sub r3, r1, #1
- sub r3, r2
- vld1.32 {d1[0]}, [r3]
- add r3, #5
- vld1.32 {d0[0]}, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row data
+ sub r3, r1, #1
+ sub r3, r2
+ vld1.32 {d1[0]}, [r3]
+ add r3, #5
+ vld1.32 {d0[0]}, [r3]
- //Load the left column data
- sub r3, #5
- vld1.8 {d1[4]}, [r3], r2
- vld1.8 {d1[5]}, [r3], r2
- vld1.8 {d1[6]}, [r3], r2
- vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
- add r3, r2
- vld1.8 {d0[4]}, [r3], r2
- vld1.8 {d0[5]}, [r3], r2
- vld1.8 {d0[6]}, [r3], r2
- vld1.8 {d0[7]}, [r3] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
+ //Load the left column data
+ sub r3, #5
+ vld1.8 {d1[4]}, [r3], r2
+ vld1.8 {d1[5]}, [r3], r2
+ vld1.8 {d1[6]}, [r3], r2
+ vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
+ add r3, r2
+ vld1.8 {d0[4]}, [r3], r2
+ vld1.8 {d0[5]}, [r3], r2
+ vld1.8 {d0[6]}, [r3], r2
+ vld1.8 {d0[7]}, [r3] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
- //Save T7 to d3 for next step
- vdup.u8 d3, d0[3]
- //Save L7 to d4 for next step
- vdup.u8 d4, d0[7]
+ //Save T7 to d3 for next step
+ vdup.u8 d3, d0[3]
+ //Save L7 to d4 for next step
+ vdup.u8 d4, d0[7]
- //Calculate the value of 'a' and save to q2
- vaddl.u8 q2, d3, d4
- vshl.u16 q2, #4
+ //Calculate the value of 'a' and save to q2
+ vaddl.u8 q2, d3, d4
+ vshl.u16 q2, #4
- //Load the table {{1,2,3,4,1,2,3,4}*17}
- adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
- vld1.32 {d2}, [r3]
+ //Load the table {{1,2,3,4,1,2,3,4}*17}
+ adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
+ vld1.32 {d2}, [r3]
- //Calculate the 'b','c', and save to q0
- vrev32.8 d1, d1
- vsubl.u8 q0, d0, d1
- vmovl.u8 q1, d2
- vmul.s16 q0, q1
- vpaddl.s16 q0, q0
- vpaddl.s32 q0, q0
- vrshr.s64 q0, #5
+ //Calculate the 'b','c', and save to q0
+ vrev32.8 d1, d1
+ vsubl.u8 q0, d0, d1
+ vmovl.u8 q1, d2
+ vmul.s16 q0, q1
+ vpaddl.s16 q0, q0
+ vpaddl.s32 q0, q0
+ vrshr.s64 q0, #5
- //Load the table {-3,-2,-1,0,1,2,3,4} to q3
- adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
- vld1.32 {d6, d7}, [r3]
+ //Load the table {-3,-2,-1,0,1,2,3,4} to q3
+ adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
+ vld1.32 {d6, d7}, [r3]
- //Duplicate the 'b','c' to q0, q1 for SIMD instruction
- vdup.s16 q1, d1[0]
- vdup.s16 q0, d0[0]
+ //Duplicate the 'b','c' to q0, q1 for SIMD instruction
+ vdup.s16 q1, d1[0]
+ vdup.s16 q0, d0[0]
- //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
- vmla.s16 q2, q0, q3
- vmla.s16 q2, q1, d6[0]
- vqrshrun.s16 d0, q2, #5
+ //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
+ vmla.s16 q2, q0, q3
+ vmla.s16 q2, q1, d6[0]
+ vqrshrun.s16 d0, q2, #5
- //Set a line of chroma MB
- vst1.u32 {d0}, [r0]!
+ //Set a line of chroma MB
+ vst1.u32 {d0}, [r0]!
- //Do the same processing for each line.
- mov r3, #7
+ //Do the same processing for each line.
+ mov r3, #7
loop_0_get_i_chroma_pred_plane:
- vadd.s16 q2, q1
- vqrshrun.s16 d0, q2, #5
- vst1.u32 {d0}, [r0]!
- subs r3, #1
- bne loop_0_get_i_chroma_pred_plane
+ vadd.s16 q2, q1
+ vqrshrun.s16 d0, q2, #5
+ vst1.u32 {d0}, [r0]!
+ subs r3, #1
+ bne loop_0_get_i_chroma_pred_plane
WELS_ASM_FUNC_END
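
The chroma plane predictor is the 8x8 analogue of the luma one; the {1,2,3,4}*17 table and the rounding shift by 5 are consistent with b = (17*H + 16) >> 5 and c = (17*V + 16) >> 5, and the per-pixel expression matches the "(a + b*(j-3) + c*(i-3) + 16) >> 5" form the comments refer to. A hedged C sketch with illustrative names:

#include <stdint.h>

static uint8_t Clip255 (int32_t v) {
    return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void IChromaPredPlane_ref (uint8_t* pPred, const uint8_t* pRef, const int32_t kiStride) {
    const uint8_t* kpTop  = pRef - kiStride;
    const uint8_t* kpLeft = pRef - 1;
    int32_t iH = 0, iV = 0;
    for (int i = 1; i <= 4; i++) {
        iH += i * (kpTop[3 + i] - kpTop[3 - i]);
        iV += i * (kpLeft[(3 + i) * kiStride] - kpLeft[(3 - i) * kiStride]);
    }
    const int32_t iA = 16 * (kpTop[7] + kpLeft[7 * kiStride]);
    const int32_t iB = (17 * iH + 16) >> 5;
    const int32_t iC = (17 * iV + 16) >> 5;
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            pPred[y * 8 + x] = Clip255 ((iA + iB * (x - 3) + iC * (y - 3) + 16) >> 5);
}
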
--- a/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
+++ b/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
@@ -38,59 +38,59 @@
#ifdef __APPLE__
//The data sequence will be used
.macro GET_8BYTE_DATA_L0
- vld1.8 {$0[0]}, [$1], $2
- vld1.8 {$0[1]}, [$1], $2
- vld1.8 {$0[2]}, [$1], $2
- vld1.8 {$0[3]}, [$1], $2
- vld1.8 {$0[4]}, [$1], $2
- vld1.8 {$0[5]}, [$1], $2
- vld1.8 {$0[6]}, [$1], $2
- vld1.8 {$0[7]}, [$1], $2
+ vld1.8 {$0[0]}, [$1], $2
+ vld1.8 {$0[1]}, [$1], $2
+ vld1.8 {$0[2]}, [$1], $2
+ vld1.8 {$0[3]}, [$1], $2
+ vld1.8 {$0[4]}, [$1], $2
+ vld1.8 {$0[5]}, [$1], $2
+ vld1.8 {$0[6]}, [$1], $2
+ vld1.8 {$0[7]}, [$1], $2
.endm
.macro HDM_TRANSFORM_4X4_L0
- //Do the vertical transform
- vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
- vsubl.u8 q1, $0, $1 //{2,6,10,14,3,7,11,15}
- vswp d1, d2
- vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
- vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
+ //Do the vertical transform
+ vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
+ vsubl.u8 q1, $0, $1 //{2,6,10,14,3,7,11,15}
+ vswp d1, d2
+ vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
+ vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
- //Do the horizontal transform
- vtrn.32 q2, q1
- vadd.s16 q0, q2, q1
- vsub.s16 q1, q2, q1
+ //Do the horizontal transform
+ vtrn.32 q2, q1
+ vadd.s16 q0, q2, q1
+ vsub.s16 q1, q2, q1
- vtrn.16 q0, q1
- vadd.s16 q2, q0, q1
- vsub.s16 q1, q0, q1
+ vtrn.16 q0, q1
+ vadd.s16 q2, q0, q1
+ vsub.s16 q1, q0, q1
- vmov.s16 d0, d4
- vmov.s16 d1, d2
+ vmov.s16 d0, d4
+ vmov.s16 d1, d2
- vabs.s16 d3, d3
+ vabs.s16 d3, d3
- //16x16_v
- vtrn.32 d0, d1 //{0,1,3,2}
- vaba.s16 $5, d0, $2 //16x16_v
- vaba.s16 $5, d1, $8
- vaba.s16 $5, d5, $8
- vadd.u16 $5, d3
+ //16x16_v
+ vtrn.32 d0, d1 //{0,1,3,2}
+ vaba.s16 $5, d0, $2 //16x16_v
+ vaba.s16 $5, d1, $8
+ vaba.s16 $5, d5, $8
+ vadd.u16 $5, d3
- //16x16_h
- vtrn.16 d4, d5 //{0,4,12,8}
- vaba.s16 $6, d4, $3 //16x16_h
- vabs.s16 d2, d2
- vabs.s16 d5, d5
- vadd.u16 d2, d3
- vadd.u16 d2, d5
- vadd.u16 $6, d2
+ //16x16_h
+ vtrn.16 d4, d5 //{0,4,12,8}
+ vaba.s16 $6, d4, $3 //16x16_h
+ vabs.s16 d2, d2
+ vabs.s16 d5, d5
+ vadd.u16 d2, d3
+ vadd.u16 d2, d5
+ vadd.u16 $6, d2
- //16x16_dc_both
- vaba.s16 $7, d4, $4 //16x16_dc_both
- vadd.u16 $7, d2
+ //16x16_dc_both
+ vaba.s16 $7, d4, $4 //16x16_dc_both
+ vadd.u16 $7, d2
.endm
@@ -97,58 +97,58 @@
#else
//The data sequence will be used
.macro GET_8BYTE_DATA_L0 arg0, arg1, arg2
- vld1.8 {\arg0[0]}, [\arg1], \arg2
- vld1.8 {\arg0[1]}, [\arg1], \arg2
- vld1.8 {\arg0[2]}, [\arg1], \arg2
- vld1.8 {\arg0[3]}, [\arg1], \arg2
- vld1.8 {\arg0[4]}, [\arg1], \arg2
- vld1.8 {\arg0[5]}, [\arg1], \arg2
- vld1.8 {\arg0[6]}, [\arg1], \arg2
- vld1.8 {\arg0[7]}, [\arg1], \arg2
+ vld1.8 {\arg0[0]}, [\arg1], \arg2
+ vld1.8 {\arg0[1]}, [\arg1], \arg2
+ vld1.8 {\arg0[2]}, [\arg1], \arg2
+ vld1.8 {\arg0[3]}, [\arg1], \arg2
+ vld1.8 {\arg0[4]}, [\arg1], \arg2
+ vld1.8 {\arg0[5]}, [\arg1], \arg2
+ vld1.8 {\arg0[6]}, [\arg1], \arg2
+ vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
.macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2,arg3, arg4, arg5, arg6, arg7, arg8
- //Do the vertical transform
- vaddl.u8 q0, \arg0, \arg1 //{0,4,8,12,1,5,9,13}
- vsubl.u8 q1, \arg0, \arg1 //{2,6,10,14,3,7,11,15}
- vswp d1, d2
- vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
- vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
+ //Do the vertical transform
+ vaddl.u8 q0, \arg0, \arg1 //{0,4,8,12,1,5,9,13}
+ vsubl.u8 q1, \arg0, \arg1 //{2,6,10,14,3,7,11,15}
+ vswp d1, d2
+ vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
+ vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
- //Do the horizontal transform
- vtrn.32 q2, q1
- vadd.s16 q0, q2, q1
- vsub.s16 q1, q2, q1
+ //Do the horizontal transform
+ vtrn.32 q2, q1
+ vadd.s16 q0, q2, q1
+ vsub.s16 q1, q2, q1
- vtrn.16 q0, q1
- vadd.s16 q2, q0, q1
- vsub.s16 q1, q0, q1
+ vtrn.16 q0, q1
+ vadd.s16 q2, q0, q1
+ vsub.s16 q1, q0, q1
- vmov.s16 d0, d4
- vmov.s16 d1, d2
+ vmov.s16 d0, d4
+ vmov.s16 d1, d2
- vabs.s16 d3, d3
+ vabs.s16 d3, d3
- //16x16_v
- vtrn.32 d0, d1 //{0,1,3,2}
- vaba.s16 \arg5, d0, \arg2 //16x16_v
- vaba.s16 \arg5, d1, \arg8
- vaba.s16 \arg5, d5, \arg8
- vadd.u16 \arg5, d3
+ //16x16_v
+ vtrn.32 d0, d1 //{0,1,3,2}
+ vaba.s16 \arg5, d0, \arg2 //16x16_v
+ vaba.s16 \arg5, d1, \arg8
+ vaba.s16 \arg5, d5, \arg8
+ vadd.u16 \arg5, d3
- //16x16_h
- vtrn.16 d4, d5 //{0,4,12,8}
- vaba.s16 \arg6, d4, \arg3 //16x16_h
- vabs.s16 d2, d2
- vabs.s16 d5, d5
- vadd.u16 d2, d3
- vadd.u16 d2, d5
- vadd.u16 \arg6, d2
+ //16x16_h
+ vtrn.16 d4, d5 //{0,4,12,8}
+ vaba.s16 \arg6, d4, \arg3 //16x16_h
+ vabs.s16 d2, d2
+ vabs.s16 d5, d5
+ vadd.u16 d2, d3
+ vadd.u16 d2, d5
+ vadd.u16 \arg6, d2
- //16x16_dc_both
- vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
- vadd.u16 \arg7, d2
+ //16x16_dc_both
+ vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
+ vadd.u16 \arg7, d2
.endm
#endif
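
HDM_TRANSFORM_4X4_L0 accumulates Hadamard-domain differences (SATD) for the vertical, horizontal and DC candidates in a single pass over each 4x4 block, comparing against candidate predictions that were pre-scaled into the transform domain. The sketch below is a generic 4x4 Hadamard SATD in C, not a line-by-line translation of the macro; names are illustrative.

#include <stdint.h>
#include <stdlib.h>

static int32_t Satd4x4_ref (const uint8_t* pSrc, int32_t iSrcStride,
                            const uint8_t* pPred, int32_t iPredStride) {
    int32_t d[4][4], m[4][4], iSatd = 0;
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            d[y][x] = pSrc[y * iSrcStride + x] - pPred[y * iPredStride + x];
    for (int y = 0; y < 4; y++) {              /* horizontal 4-point Hadamard */
        const int32_t s0 = d[y][0] + d[y][3], s1 = d[y][1] + d[y][2];
        const int32_t t0 = d[y][0] - d[y][3], t1 = d[y][1] - d[y][2];
        m[y][0] = s0 + s1; m[y][1] = s0 - s1;
        m[y][2] = t0 + t1; m[y][3] = t0 - t1;
    }
    for (int x = 0; x < 4; x++) {              /* vertical pass + absolute sum */
        const int32_t s0 = m[0][x] + m[3][x], s1 = m[1][x] + m[2][x];
        const int32_t t0 = m[0][x] - m[3][x], t1 = m[1][x] - m[2][x];
        iSatd += abs (s0 + s1) + abs (s0 - s1) + abs (t0 + t1) + abs (t0 - t1);
    }
    return iSatd;
}

The assembly folds the three per-mode accumulators (the arg5/arg6/arg7 registers) into this per-block work and, as seen further down, applies a rounding right shift by 1 to the accumulated totals before the final comparison.
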
@@ -156,63 +156,63 @@
stmdb sp!, {r4-r7, lr}
vpush {q4-q7}
- //Get the top line data to 'q15'(16 bytes)
- sub r7, r0, r1
+ //Get the top line data to 'q15'(16 bytes)
+ sub r7, r0, r1
vld1.8 {q15}, [r7]
- //Get the left colume data to 'q14' (16 bytes)
- sub r7, r0, #1
- GET_8BYTE_DATA_L0 d28, r7, r1
- GET_8BYTE_DATA_L0 d29, r7, r1
+    //Get the left column data to 'q14' (16 bytes)
+ sub r7, r0, #1
+ GET_8BYTE_DATA_L0 d28, r7, r1
+ GET_8BYTE_DATA_L0 d29, r7, r1
- //Calculate the mean value and save to 'q13->d27(reserve the d26)' (2 bytes)
- //Calculate the 16x16_dc_both mode SATD
- vaddl.u8 q0, d30, d31
- vaddl.u8 q1, d28, d29
- vadd.u16 q0, q1
- vadd.u16 d0, d1
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
+ //Calculate the mean value and save to 'q13->d27(reserve the d26)' (2 bytes)
+ //Calculate the 16x16_dc_both mode SATD
+ vaddl.u8 q0, d30, d31
+ vaddl.u8 q1, d28, d29
+ vadd.u16 q0, q1
+ vadd.u16 d0, d1
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
- //Calculate the mean value
- vrshr.u16 d0, #5
- vshl.u16 d27, d0, #4
+ //Calculate the mean value
+ vrshr.u16 d0, #5
+ vshl.u16 d27, d0, #4
- //Calculate the 16x16_v mode SATD and save to "q11, 12"
- vshll.u8 q0, d30, #2
- vshll.u8 q1, d31, #2
- vtrn.32 q0, q1
- vadd.s16 q2, q0, q1
- vsub.s16 q1, q0, q1
- vtrn.16 q2, q1
- vadd.s16 q12, q2, q1
- vsub.s16 q11, q2, q1
- vtrn.32 q12, q11 //{0,1,3,2, 4,5,7,6} q12
- //{8,9,11,10, 12,13,15,14} q11
+ //Calculate the 16x16_v mode SATD and save to "q11, 12"
+ vshll.u8 q0, d30, #2
+ vshll.u8 q1, d31, #2
+ vtrn.32 q0, q1
+ vadd.s16 q2, q0, q1
+ vsub.s16 q1, q0, q1
+ vtrn.16 q2, q1
+ vadd.s16 q12, q2, q1
+ vsub.s16 q11, q2, q1
+ vtrn.32 q12, q11 //{0,1,3,2, 4,5,7,6} q12
+ //{8,9,11,10, 12,13,15,14} q11
//Calculate the 16x16_h mode SATD and save to "q9, q10"
- vshll.u8 q0, d28, #2
- vshll.u8 q1, d29, #2
- vtrn.32 q0, q1
- vadd.s16 q2, q0, q1
- vsub.s16 q1, q0, q1
- vtrn.16 q2, q1
- vadd.s16 q10, q2, q1
- vsub.s16 q9, q2, q1
- vtrn.32 q10, q9 //{0,1,3,2, 4,5,7,6} q10
- //{8,9,11,10, 12,13,15,14} q9
+ vshll.u8 q0, d28, #2
+ vshll.u8 q1, d29, #2
+ vtrn.32 q0, q1
+ vadd.s16 q2, q0, q1
+ vsub.s16 q1, q0, q1
+ vtrn.16 q2, q1
+ vadd.s16 q10, q2, q1
+ vsub.s16 q9, q2, q1
+ vtrn.32 q10, q9 //{0,1,3,2, 4,5,7,6} q10
+ //{8,9,11,10, 12,13,15,14} q9
- vmov.i32 d17, #0//Save the SATD of DC_BOTH
- vmov.i32 d16, #0//Save the SATD of H
- vmov.i32 d15, #0//Save the SATD of V
- vmov.i32 d14, #0//For zero D register
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
- vld1.32 {q3}, [r2], r3
- vld1.32 {q4}, [r2], r3
- vld1.32 {q5}, [r2], r3
- vld1.32 {q6}, [r2], r3
- vtrn.32 q3, q4
- vtrn.32 q5, q6
+ vmov.i32 d17, #0//Save the SATD of DC_BOTH
+ vmov.i32 d16, #0//Save the SATD of H
+ vmov.i32 d15, #0//Save the SATD of V
+ vmov.i32 d14, #0//For zero D register
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ vld1.32 {q3}, [r2], r3
+ vld1.32 {q4}, [r2], r3
+ vld1.32 {q5}, [r2], r3
+ vld1.32 {q6}, [r2], r3
+ vtrn.32 q3, q4
+ vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d20, d27, d15, d16, d17, d14
@@ -219,13 +219,13 @@
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d20, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
- vld1.32 {q3}, [r2], r3
- vld1.32 {q4}, [r2], r3
- vld1.32 {q5}, [r2], r3
- vld1.32 {q6}, [r2], r3
- vtrn.32 q3, q4
- vtrn.32 q5, q6
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ vld1.32 {q3}, [r2], r3
+ vld1.32 {q4}, [r2], r3
+ vld1.32 {q5}, [r2], r3
+ vld1.32 {q6}, [r2], r3
+ vtrn.32 q3, q4
+ vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d21, d27, d15, d16, d17, d14
@@ -232,13 +232,13 @@
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d21, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
- vld1.32 {q3}, [r2], r3
- vld1.32 {q4}, [r2], r3
- vld1.32 {q5}, [r2], r3
- vld1.32 {q6}, [r2], r3
- vtrn.32 q3, q4
- vtrn.32 q5, q6
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ vld1.32 {q3}, [r2], r3
+ vld1.32 {q4}, [r2], r3
+ vld1.32 {q5}, [r2], r3
+ vld1.32 {q6}, [r2], r3
+ vtrn.32 q3, q4
+ vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d18, d27, d15, d16, d17, d14
@@ -245,13 +245,13 @@
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d18, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
- vld1.32 {q3}, [r2], r3
- vld1.32 {q4}, [r2], r3
- vld1.32 {q5}, [r2], r3
- vld1.32 {q6}, [r2], r3
- vtrn.32 q3, q4
- vtrn.32 q5, q6
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ vld1.32 {q3}, [r2], r3
+ vld1.32 {q4}, [r2], r3
+ vld1.32 {q5}, [r2], r3
+ vld1.32 {q6}, [r2], r3
+ vtrn.32 q3, q4
+ vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d19, d27, d15, d16, d17, d14
@@ -258,29 +258,29 @@
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d19, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14
- //Get the data from stack
- ldr r5, [sp, #84] //the addr of Best_mode
- ldr r6, [sp, #88] //the value of i_lambda
+ //Get the data from stack
+ ldr r5, [sp, #84] //the addr of Best_mode
+ ldr r6, [sp, #88] //the value of i_lambda
- //vadd.u16 d24, d25
- vrshr.u16 d15, #1
- vpaddl.u16 d15, d15
- vpaddl.u32 d15, d15
- vmov.u32 r0, d15[0]
+ //vadd.u16 d24, d25
+ vrshr.u16 d15, #1
+ vpaddl.u16 d15, d15
+ vpaddl.u32 d15, d15
+ vmov.u32 r0, d15[0]
- //vadd.u16 d22, d23
- vrshr.u16 d16, #1
- vpaddl.u16 d16, d16
- vpaddl.u32 d16, d16
- vmov.u32 r1, d16[0]
- add r1, r1, r6, lsl #1
+ //vadd.u16 d22, d23
+ vrshr.u16 d16, #1
+ vpaddl.u16 d16, d16
+ vpaddl.u32 d16, d16
+ vmov.u32 r1, d16[0]
+ add r1, r1, r6, lsl #1
- //vadd.u16 d20, d21
- vrshr.u16 d17, #1
- vpaddl.u16 d17, d17
- vpaddl.u32 d17, d17
- vmov.u32 r2, d17[0]
- add r2, r2, r6, lsl #1
+ //vadd.u16 d20, d21
+ vrshr.u16 d17, #1
+ vpaddl.u16 d17, d17
+ vpaddl.u32 d17, d17
+ vmov.u32 r2, d17[0]
+ add r2, r2, r6, lsl #1
mov r4, #0
cmp r1, r0
@@ -300,77 +300,77 @@
WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Sad_neon
stmdb sp!, {r4-r7, lr}
- //Get the top line data to 'q15'(16 bytes)
- sub r4, r0, r1
+ //Get the top line data to 'q15'(16 bytes)
+ sub r4, r0, r1
vld1.8 {q15}, [r4]
- //Get the left colume data to 'q14' (16 bytes)
- sub r4, r0, #1
- GET_8BYTE_DATA_L0 d28, r4, r1
- GET_8BYTE_DATA_L0 d29, r4, r1
+    //Get the left column data to 'q14' (16 bytes)
+ sub r4, r0, #1
+ GET_8BYTE_DATA_L0 d28, r4, r1
+ GET_8BYTE_DATA_L0 d29, r4, r1
- //Calculate the mean value and save to 'q13' (8 bytes)
- //Calculate the 16x16_dc_both mode SATD
- vaddl.u8 q0, d30, d31
- vaddl.u8 q1, d28, d29
- vadd.u16 q0, q1
- vadd.u16 d0, d1
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
+ //Calculate the mean value and save to 'q13' (8 bytes)
+ //Calculate the 16x16_dc_both mode SATD
+ vaddl.u8 q0, d30, d31
+ vaddl.u8 q1, d28, d29
+ vadd.u16 q0, q1
+ vadd.u16 d0, d1
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
- //Calculate the mean value
- vrshr.u16 d0, d0, #5
- vdup.8 q13, d0[0]
+ //Calculate the mean value
+ vrshr.u16 d0, d0, #5
+ vdup.8 q13, d0[0]
- sub r4, r0, #1
+ sub r4, r0, #1
- vmov.i32 q12, #0//Save the SATD of DC_BOTH
- vmov.i32 q11, #0//Save the SATD of H
- vmov.i32 q10, #0//Save the SATD of V
+ vmov.i32 q12, #0//Save the SATD of DC_BOTH
+ vmov.i32 q11, #0//Save the SATD of H
+ vmov.i32 q10, #0//Save the SATD of V
- mov lr, #16
+ mov lr, #16
sad_intra_16x16_x3_opt_loop0:
//Get the left colume data to 'd0' (16 bytes)
- vld1.8 {d0[]}, [r4], r1
+ vld1.8 {d0[]}, [r4], r1
- //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
- vld1.8 {q1}, [r2], r3
+ //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+ vld1.8 {q1}, [r2], r3
- subs lr, #1
- //Do the SAD for top colume
- vabal.u8 q12, d30, d2
- vabal.u8 q12, d31, d3
+ subs lr, #1
+    //Do the SAD for the top row
+ vabal.u8 q12, d30, d2
+ vabal.u8 q12, d31, d3
- //Do the SAD for left colume
- vabal.u8 q11, d0, d2
- vabal.u8 q11, d0, d3
+    //Do the SAD for the left column
+ vabal.u8 q11, d0, d2
+ vabal.u8 q11, d0, d3
- //Do the SAD for mean value
- vabal.u8 q10, d26, d2
- vabal.u8 q10, d26, d3
+ //Do the SAD for mean value
+ vabal.u8 q10, d26, d2
+ vabal.u8 q10, d26, d3
- bne sad_intra_16x16_x3_opt_loop0
+ bne sad_intra_16x16_x3_opt_loop0
- //Get the data from stack
- ldr r5, [sp, #20] //the addr of Best_mode
- ldr r6, [sp, #24] //the value of i_lambda
+ //Get the data from stack
+ ldr r5, [sp, #20] //the addr of Best_mode
+ ldr r6, [sp, #24] //the value of i_lambda
- vadd.u16 d24, d25
- vpaddl.u16 d24, d24
- vpaddl.u32 d24, d24
- vmov.u32 r0, d24[0]
+ vadd.u16 d24, d25
+ vpaddl.u16 d24, d24
+ vpaddl.u32 d24, d24
+ vmov.u32 r0, d24[0]
- vadd.u16 d22, d23
- vpaddl.u16 d22, d22
- vpaddl.u32 d22, d22
- vmov.u32 r1, d22[0]
- add r1, r1, r6, lsl #1
+ vadd.u16 d22, d23
+ vpaddl.u16 d22, d22
+ vpaddl.u32 d22, d22
+ vmov.u32 r1, d22[0]
+ add r1, r1, r6, lsl #1
- vadd.u16 d20, d21
- vpaddl.u16 d20, d20
- vpaddl.u32 d20, d20
- vmov.u32 r2, d20[0]
- add r2, r2, r6, lsl #1
+ vadd.u16 d20, d21
+ vpaddl.u16 d20, d20
+ vpaddl.u32 d20, d20
+ vmov.u32 r2, d20[0]
+ add r2, r2, r6, lsl #1
mov r4, #0
cmp r1, r0
@@ -382,7 +382,7 @@
str r4, [r5]
- ldmia sp!, {r4-r7, lr}
+ ldmia sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
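
The epilogue of this routine reduces the three accumulators to scalar costs, adds a 2*iLambda penalty to some of the candidates (the "add rX, rX, r6, lsl #1" instructions), and stores the index of the cheapest one through the Best_mode pointer fetched from the stack. A hedged C sketch of that final decision; the candidate ordering and the returned value are illustrative assumptions, not taken from the source.

#include <stdint.h>

static int32_t PickBestOf3_ref (int32_t iCost0, int32_t iCost1, int32_t iCost2,
                                int32_t iLambda, int32_t* pBestMode) {
    /* Here iCost0 carries no penalty while the other two candidates pay
     * 2*iLambda, mirroring the lambda adds in the assembly above. */
    iCost1 += iLambda << 1;
    iCost2 += iLambda << 1;
    int32_t iBestMode = 0, iBestCost = iCost0;
    if (iCost1 < iBestCost) { iBestMode = 1; iBestCost = iCost1; }
    if (iCost2 < iBestCost) { iBestMode = 2; iBestCost = iCost2; }
    *pBestMode = iBestMode;
    return iBestCost;
}
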
@@ -389,24 +389,24 @@
WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Sad_neon
stmdb sp!, {r4-r7, lr}
- //Get the data from stack
- ldr r4, [sp, #32] //p_dec_cr
- ldr r5, [sp, #36] //p_enc_cr
+ //Get the data from stack
+ ldr r4, [sp, #32] //p_dec_cr
+ ldr r5, [sp, #36] //p_enc_cr
- //Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
- sub r6, r0, #1
- GET_8BYTE_DATA_L0 d28, r6, r1
- sub r6, r4, #1
- GET_8BYTE_DATA_L0 d30, r6, r1
+    //Get the left column data to 'd28(cb), d30(cr)' (16 bytes)
+ sub r6, r0, #1
+ GET_8BYTE_DATA_L0 d28, r6, r1
+ sub r6, r4, #1
+ GET_8BYTE_DATA_L0 d30, r6, r1
- //Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
- sub r6, r0, r1
+ //Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
+ sub r6, r0, r1
vld1.8 {d29}, [r6]
- sub r6, r4, r1
+ sub r6, r4, r1
vld1.8 {d31}, [r6]
- //Calculate the sum of left column and top row
- vmov.i32 q0, q14
+ //Calculate the sum of left column and top row
+ vmov.i32 q0, q14
vpaddl.u8 q0, q0
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //'m1' save to d2
@@ -416,13 +416,13 @@
//duplicate the 'mx' to a vector line
vdup.8 d27, d2[0]
vdup.8 d26, d1[4]
- vtrn.32 d27, d26
+ vtrn.32 d27, d26
vdup.8 d26, d0[4]
vdup.8 d25, d2[4]
vtrn.32 d26, d25 //Save to "d27, d26"
- vmov.i32 q0, q15
+ vmov.i32 q0, q15
vpaddl.u8 q0, q0
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //'m1' save to d2
@@ -432,94 +432,94 @@
//duplicate the 'mx' to a vector line
vdup.8 d25, d2[0]
vdup.8 d24, d1[4]
- vtrn.32 d25, d24
+ vtrn.32 d25, d24
vdup.8 d24, d0[4]
vdup.8 d23, d2[4]
- vtrn.32 d24, d23 //Save to "d25, d24"
+ vtrn.32 d24, d23 //Save to "d25, d24"
- vmov.i32 q11, #0//Save the SATD of DC_BOTH
- vmov.i32 q10, #0//Save the SATD of H
- vmov.i32 q9 , #0//Save the SATD of V
- sub r6, r0, #1
- sub r7, r4, #1
- mov lr, #4
+ vmov.i32 q11, #0//Save the SATD of DC_BOTH
+ vmov.i32 q10, #0//Save the SATD of H
+ vmov.i32 q9 , #0//Save the SATD of V
+ sub r6, r0, #1
+ sub r7, r4, #1
+ mov lr, #4
sad_intra_8x8_x3_opt_loop0:
- //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
- vld1.8 {d0}, [r2], r3
- vld1.8 {d1}, [r5], r3
+ //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+ vld1.8 {d0}, [r2], r3
+ vld1.8 {d1}, [r5], r3
//Get the left colume data to 'd0' (16 bytes)
- vld1.8 {d2[]}, [r6], r1
- vld1.8 {d3[]}, [r7], r1
+ vld1.8 {d2[]}, [r6], r1
+ vld1.8 {d3[]}, [r7], r1
- subs lr, #1
+ subs lr, #1
- //Do the SAD for top colume
- vabal.u8 q11, d29, d0
- vabal.u8 q11, d31, d1
+    //Do the SAD for the top row
+ vabal.u8 q11, d29, d0
+ vabal.u8 q11, d31, d1
- //Do the SAD for left colume
- vabal.u8 q10, d2, d0
- vabal.u8 q10, d3, d1
+    //Do the SAD for the left column
+ vabal.u8 q10, d2, d0
+ vabal.u8 q10, d3, d1
- //Do the SAD for mean value
- vabal.u8 q9, d27, d0
- vabal.u8 q9, d25, d1
+ //Do the SAD for mean value
+ vabal.u8 q9, d27, d0
+ vabal.u8 q9, d25, d1
- bne sad_intra_8x8_x3_opt_loop0
+ bne sad_intra_8x8_x3_opt_loop0
- mov lr, #4
+ mov lr, #4
sad_intra_8x8_x3_opt_loop1:
- //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
- vld1.8 {d0}, [r2], r3
- vld1.8 {d1}, [r5], r3
+ //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+ vld1.8 {d0}, [r2], r3
+ vld1.8 {d1}, [r5], r3
//Get the left colume data to 'd0' (16 bytes)
- vld1.8 {d2[]}, [r6], r1
- vld1.8 {d3[]}, [r7], r1
+ vld1.8 {d2[]}, [r6], r1
+ vld1.8 {d3[]}, [r7], r1
- subs lr, #1
+ subs lr, #1
- //Do the SAD for top colume
- vabal.u8 q11, d29, d0
- vabal.u8 q11, d31, d1
+    //Do the SAD for the top row
+ vabal.u8 q11, d29, d0
+ vabal.u8 q11, d31, d1
- //Do the SAD for left colume
- vabal.u8 q10, d2, d0
- vabal.u8 q10, d3, d1
+    //Do the SAD for the left column
+ vabal.u8 q10, d2, d0
+ vabal.u8 q10, d3, d1
- //Do the SAD for mean value
- vabal.u8 q9, d26, d0
- vabal.u8 q9, d24, d1
+ //Do the SAD for mean value
+ vabal.u8 q9, d26, d0
+ vabal.u8 q9, d24, d1
- bne sad_intra_8x8_x3_opt_loop1
- //Get the data from stack
- ldr r5, [sp, #20] //the addr of Best_mode
- ldr r6, [sp, #24] //the value of i_lambda
+ bne sad_intra_8x8_x3_opt_loop1
+ //Get the data from stack
+ ldr r5, [sp, #20] //the addr of Best_mode
+ ldr r6, [sp, #24] //the value of i_lambda
- vadd.u16 d22, d23
- vpaddl.u16 d22, d22
- vpaddl.u32 d22, d22
- vmov.u32 r0, d22[0]
- add r0, r0, r6, lsl #1
+ vadd.u16 d22, d23
+ vpaddl.u16 d22, d22
+ vpaddl.u32 d22, d22
+ vmov.u32 r0, d22[0]
+ add r0, r0, r6, lsl #1
- vadd.u16 d20, d21
- vpaddl.u16 d20, d20
- vpaddl.u32 d20, d20
- vmov.u32 r1, d20[0]
- add r1, r1, r6, lsl #1
+ vadd.u16 d20, d21
+ vpaddl.u16 d20, d20
+ vpaddl.u32 d20, d20
+ vmov.u32 r1, d20[0]
+ add r1, r1, r6, lsl #1
- vadd.u16 d18, d19
- vpaddl.u16 d18, d18
- vpaddl.u32 d18, d18
- vmov.u32 r2, d18[0]
+ vadd.u16 d18, d19
+ vpaddl.u16 d18, d18
+ vpaddl.u32 d18, d18
+ vmov.u32 r2, d18[0]
mov r4, #2
cmp r1, r0
@@ -531,7 +531,7 @@
str r4, [r5]
- ldmia sp!, {r4-r7, lr}
+ ldmia sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
@@ -539,47 +539,47 @@
stmdb sp!, {r4-r7, lr}
vpush {q4-q7}
- //Get the data from stack
- ldr r4, [sp, #96] //p_dec_cr
- ldr r5, [sp, #100] //p_enc_cr
+ //Get the data from stack
+ ldr r4, [sp, #96] //p_dec_cr
+ ldr r5, [sp, #100] //p_enc_cr
- //Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
- sub r6, r0, r1
+ //Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
+ sub r6, r0, r1
vld1.8 {d29}, [r6]
- sub r6, r4, r1
+ sub r6, r4, r1
vld1.8 {d31}, [r6]
- //Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
- sub r6, r0, #1
- GET_8BYTE_DATA_L0 d28, r6, r1
- sub r6, r4, #1
- GET_8BYTE_DATA_L0 d30, r6, r1
+    //Get the left column data to 'd28(cb), d30(cr)' (16 bytes)
+ sub r6, r0, #1
+ GET_8BYTE_DATA_L0 d28, r6, r1
+ sub r6, r4, #1
+ GET_8BYTE_DATA_L0 d30, r6, r1
- //Calculate the 16x16_v mode SATD and save to "q12, 13"
- vshll.u8 q0, d29, #2
- vshll.u8 q1, d31, #2
- vtrn.32 q0, q1
- vadd.s16 q2, q0, q1
- vsub.s16 q1, q0, q1
- vtrn.16 q2, q1
- vadd.s16 q13, q2, q1
- vsub.s16 q12, q2, q1
- vtrn.32 q13, q12 //{0,1,3,2, 4,5,7,6} q13
- //{8,9,11,10, 12,13,15,14} q12
+ //Calculate the 16x16_v mode SATD and save to "q12, 13"
+ vshll.u8 q0, d29, #2
+ vshll.u8 q1, d31, #2
+ vtrn.32 q0, q1
+ vadd.s16 q2, q0, q1
+ vsub.s16 q1, q0, q1
+ vtrn.16 q2, q1
+ vadd.s16 q13, q2, q1
+ vsub.s16 q12, q2, q1
+ vtrn.32 q13, q12 //{0,1,3,2, 4,5,7,6} q13
+ //{8,9,11,10, 12,13,15,14} q12
//Calculate the 16x16_h mode SATD and save to "q10, q11"
- vshll.u8 q0, d28, #2
- vshll.u8 q1, d30, #2
- vtrn.32 q0, q1
- vadd.s16 q2, q0, q1
- vsub.s16 q1, q0, q1
- vtrn.16 q2, q1
- vadd.s16 q11, q2, q1
- vsub.s16 q10, q2, q1
- vtrn.32 q11, q10 //{0,1,3,2, 4,5,7,6} q11
- //{8,9,11,10, 12,13,15,14} q10
+ vshll.u8 q0, d28, #2
+ vshll.u8 q1, d30, #2
+ vtrn.32 q0, q1
+ vadd.s16 q2, q0, q1
+ vsub.s16 q1, q0, q1
+ vtrn.16 q2, q1
+ vadd.s16 q11, q2, q1
+ vsub.s16 q10, q2, q1
+ vtrn.32 q11, q10 //{0,1,3,2, 4,5,7,6} q11
+ //{8,9,11,10, 12,13,15,14} q10
- //Calculate the sum of left column and top row
- //vmov.i32 q0, q14
+ //Calculate the sum of left column and top row
+ //vmov.i32 q0, q14
vpaddl.u8 q0, q14
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1
@@ -588,77 +588,77 @@
vpaddl.u16 q2, q2
vadd.u32 d3, d4, d5
- vtrn.32 q0, q2
- vrshr.u32 q1, #3
- vrshr.u32 q2, #2
- vshll.u32 q9, d4, #4 // {2cb, 2cr} q9
- vshll.u32 q8, d5, #4 // {1cb, 1cr} q8
- vshll.u32 q7, d2, #4 // {0cb, 3cb} q7
- vshll.u32 q6, d3, #4 // {0cr, 3cr} q6
+ vtrn.32 q0, q2
+ vrshr.u32 q1, #3
+ vrshr.u32 q2, #2
+ vshll.u32 q9, d4, #4 // {2cb, 2cr} q9
+ vshll.u32 q8, d5, #4 // {1cb, 1cr} q8
+ vshll.u32 q7, d2, #4 // {0cb, 3cb} q7
+ vshll.u32 q6, d3, #4 // {0cr, 3cr} q6
vmov.i32 d28, #0//Save the SATD of DC_BOTH
- vmov.i32 d10, #0//Save the SATD of H
- vmov.i32 d11, #0//Save the SATD of V
- vmov.i32 d30, #0//For zero D register
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
- vld1.32 {d6}, [r2], r3
- vld1.32 {d7}, [r2], r3
- vld1.32 {d8}, [r2], r3
- vld1.32 {d9}, [r2], r3
- vtrn.32 d6, d7
- vtrn.32 d8, d9
+ vmov.i32 d10, #0//Save the SATD of H
+ vmov.i32 d11, #0//Save the SATD of V
+ vmov.i32 d30, #0//For zero D register
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ vld1.32 {d6}, [r2], r3
+ vld1.32 {d7}, [r2], r3
+ vld1.32 {d8}, [r2], r3
+ vld1.32 {d9}, [r2], r3
+ vtrn.32 d6, d7
+ vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d26, d22, d14, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d27, d22, d16, d11, d10, d28, d30
- vld1.32 {d6}, [r5], r3
- vld1.32 {d7}, [r5], r3
- vld1.32 {d8}, [r5], r3
- vld1.32 {d9}, [r5], r3
- vtrn.32 d6, d7
- vtrn.32 d8, d9
+ vld1.32 {d6}, [r5], r3
+ vld1.32 {d7}, [r5], r3
+ vld1.32 {d8}, [r5], r3
+ vld1.32 {d9}, [r5], r3
+ vtrn.32 d6, d7
+ vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d24, d20, d12, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
- vld1.32 {d6}, [r2], r3
- vld1.32 {d7}, [r2], r3
- vld1.32 {d8}, [r2], r3
- vld1.32 {d9}, [r2], r3
- vtrn.32 d6, d7
- vtrn.32 d8, d9
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ vld1.32 {d6}, [r2], r3
+ vld1.32 {d7}, [r2], r3
+ vld1.32 {d8}, [r2], r3
+ vld1.32 {d9}, [r2], r3
+ vtrn.32 d6, d7
+ vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d27, d23, d15, d11, d10, d28, d30
- vld1.32 {d6}, [r5], r3
- vld1.32 {d7}, [r5], r3
- vld1.32 {d8}, [r5], r3
- vld1.32 {d9}, [r5], r3
- vtrn.32 d6, d7
- vtrn.32 d8, d9
+ vld1.32 {d6}, [r5], r3
+ vld1.32 {d7}, [r5], r3
+ vld1.32 {d8}, [r5], r3
+ vld1.32 {d9}, [r5], r3
+ vtrn.32 d6, d7
+ vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d24, d21, d19, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30
- //Get the data from stack
- ldr r5, [sp, #84] //the addr of Best_mode
- ldr r6, [sp, #88] //the value of i_lambda
+ //Get the data from stack
+ ldr r5, [sp, #84] //the addr of Best_mode
+ ldr r6, [sp, #88] //the value of i_lambda
- vrshr.u16 d11, #1
- vpaddl.u16 d11, d11
- vpaddl.u32 d11, d11
- vmov.u32 lr, d11[0]
- add lr, lr, r6, lsl #1
+ vrshr.u16 d11, #1
+ vpaddl.u16 d11, d11
+ vpaddl.u32 d11, d11
+ vmov.u32 lr, d11[0]
+ add lr, lr, r6, lsl #1
- vrshr.u16 d10, #1
- vpaddl.u16 d10, d10
- vpaddl.u32 d10, d10
- vmov.u32 r3, d10[0]
- add r3, r3, r6, lsl #1
+ vrshr.u16 d10, #1
+ vpaddl.u16 d10, d10
+ vpaddl.u32 d10, d10
+ vmov.u32 r3, d10[0]
+ add r3, r3, r6, lsl #1
- vrshr.u16 d28, #1
- vpaddl.u16 d28, d28
- vpaddl.u32 d28, d28
- vmov.u32 r2, d28[0]
+ vrshr.u16 d28, #1
+ vpaddl.u16 d28, d28
+ vpaddl.u32 d28, d28
+ vmov.u32 r2, d28[0]
mov r6, #2
cmp r3, lr
@@ -671,8 +671,8 @@
str r6, [r5]
mov r0, lr
- vpop {q4-q7}
- ldmia sp!, {r4-r7, lr}
+ vpop {q4-q7}
+ ldmia sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
@@ -680,118 +680,118 @@
stmdb sp!, {r4-r7, lr}
//Get the top line data to 'd31[0~3]'(4 bytes)
- sub r7, r0, r1
+ sub r7, r0, r1
vld1.32 {d31[0]}, [r7]
- //Get the left colume data to 'd31[4~7]' (4 bytes)
- sub r7, r0, #1
+ //Get the left colume data to 'd31[4~7]' (4 bytes)
+ sub r7, r0, #1
vld1.8 {d31[4]}, [r7], r1
vld1.8 {d31[5]}, [r7], r1
vld1.8 {d31[6]}, [r7], r1
vld1.8 {d31[7]}, [r7], r1
- //Calculate the mean value and save to 'd30' (2 bytes)
- vpaddl.u8 d0, d31
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
- //Calculate the mean value
- vrshr.u16 d0, #3
- vshl.u16 d30, d0, #4
+ //Calculate the mean value and save to 'd30' (2 bytes)
+ vpaddl.u8 d0, d31
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ //Calculate the mean value
+ vrshr.u16 d0, #3
+ vshl.u16 d30, d0, #4
- //Calculate the 16x16_v mode SATD and save to "d29"
+ //Calculate the 16x16_v mode SATD and save to "d29"
//Calculate the 16x16_h mode SATD and save to "d28"
- vshll.u8 q0, d31, #2
- vtrn.32 d0, d1
- vadd.s16 d2, d0, d1
- vsub.s16 d1, d0, d1
- vtrn.16 d2, d1
- vadd.s16 d29, d2, d1
- vsub.s16 d28, d2, d1
- vtrn.32 d29, d28 //{0,1,3,2 top} d29
- //{0,1,3,2 left} d28
+ vshll.u8 q0, d31, #2
+ vtrn.32 d0, d1
+ vadd.s16 d2, d0, d1
+ vsub.s16 d1, d0, d1
+ vtrn.16 d2, d1
+ vadd.s16 d29, d2, d1
+ vsub.s16 d28, d2, d1
+ vtrn.32 d29, d28 //{0,1,3,2 top} d29
+ //{0,1,3,2 left} d28
vmov.i32 d27, #0//Save the SATD of DC_BOTH
- vmov.i32 d26, #0//Save the SATD of H
- vmov.i32 d25, #0//Save the SATD of V
- vmov.i32 d24, #0//For zero D register
+ vmov.i32 d26, #0//Save the SATD of H
+ vmov.i32 d25, #0//Save the SATD of V
+ vmov.i32 d24, #0//For zero D register
- //Load the p_enc data and save to "d22,d23"--- 4X4 bytes
- vld1.32 {d23[0]}, [r2], r3
- vld1.32 {d23[1]}, [r2], r3
- vld1.32 {d22[0]}, [r2], r3
- vld1.32 {d22[1]}, [r2], r3
+ //Load the p_enc data and save to "d22,d23"--- 4X4 bytes
+ vld1.32 {d23[0]}, [r2], r3
+ vld1.32 {d23[1]}, [r2], r3
+ vld1.32 {d22[0]}, [r2], r3
+ vld1.32 {d22[1]}, [r2], r3
HDM_TRANSFORM_4X4_L0 d23, d22, d29, d28, d30, d25, d26, d27, d24
- //Get the data from stack
- ldr r5, [sp, #28] //the value of lambda2
- ldr r6, [sp, #32] //the value of lambda1
- ldr r7, [sp, #36] //the value of lambda0
+ //Get the data from stack
+ ldr r5, [sp, #28] //the value of lambda2
+ ldr r6, [sp, #32] //the value of lambda1
+ ldr r7, [sp, #36] //the value of lambda0
- vrshr.u16 d25, #1
- vpaddl.u16 d25, d25
- vpaddl.u32 d25, d25
- vmov.u32 r0, d25[0]
- add r0, r7
+ vrshr.u16 d25, #1
+ vpaddl.u16 d25, d25
+ vpaddl.u32 d25, d25
+ vmov.u32 r0, d25[0]
+ add r0, r7
- vrshr.u16 d26, #1
- vpaddl.u16 d26, d26
- vpaddl.u32 d26, d26
- vmov.u32 r1, d26[0]
- add r1, r6
+ vrshr.u16 d26, #1
+ vpaddl.u16 d26, d26
+ vpaddl.u32 d26, d26
+ vmov.u32 r1, d26[0]
+ add r1, r6
- vrshr.u16 d27, #1
- vpaddl.u16 d27, d27
- vpaddl.u32 d27, d27
- vmov.u32 r2, d27[0]
- add r2, r5
+ vrshr.u16 d27, #1
+ vpaddl.u16 d27, d27
+ vpaddl.u32 d27, d27
+ vmov.u32 r2, d27[0]
+ add r2, r5
- ldr r5, [sp, #20] //p_dst
- ldr r6, [sp, #24] //the addr of Best_mode
+ ldr r5, [sp, #20] //p_dst
+ ldr r6, [sp, #24] //the addr of Best_mode
- mov r4, r0
- cmp r1, r4
- movcc r4, r1
- cmp r2, r4
- movcc r4, r2
+ mov r4, r0
+ cmp r1, r4
+ movcc r4, r1
+ cmp r2, r4
+ movcc r4, r2
- //The compare sequence affect the resule
- cmp r4, r2
- bne satd_intra_4x4_x3_opt_jump0
- mov r0, #2
- str r0, [r6]
- vshr.u32 d0, d30, #4 // {2cb, 2cr} q9
- vdup.8 q1, d0[0]
- vst1.8 {q1}, [r5]
- //...
- bl satd_intra_4x4_x3_opt_end
+ //The compare sequence affect the resule
+ cmp r4, r2
+ bne satd_intra_4x4_x3_opt_jump0
+ mov r0, #2
+ str r0, [r6]
+ vshr.u32 d0, d30, #4 // {2cb, 2cr} q9
+ vdup.8 q1, d0[0]
+ vst1.8 {q1}, [r5]
+ //...
+ bl satd_intra_4x4_x3_opt_end
satd_intra_4x4_x3_opt_jump0:
- cmp r4, r1
- bne satd_intra_4x4_x3_opt_jump1
- mov r0, #1
- str r0, [r6]
- vdup.8 d0, d31[4]
- vdup.8 d1, d31[5]
- vdup.8 d2, d31[6]
- vdup.8 d3, d31[7]
- vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
+ cmp r4, r1
+ bne satd_intra_4x4_x3_opt_jump1
+ mov r0, #1
+ str r0, [r6]
+ vdup.8 d0, d31[4]
+ vdup.8 d1, d31[5]
+ vdup.8 d2, d31[6]
+ vdup.8 d3, d31[7]
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
- bl satd_intra_4x4_x3_opt_end
+ bl satd_intra_4x4_x3_opt_end
satd_intra_4x4_x3_opt_jump1:
- mov r0, #0
- str r0, [r6]
- vst1.32 {d31[0]}, [r5]!
- vst1.32 {d31[0]}, [r5]!
- vst1.32 {d31[0]}, [r5]!
- vst1.32 {d31[0]}, [r5]!
+ mov r0, #0
+ str r0, [r6]
+ vst1.32 {d31[0]}, [r5]!
+ vst1.32 {d31[0]}, [r5]!
+ vst1.32 {d31[0]}, [r5]!
+ vst1.32 {d31[0]}, [r5]!
satd_intra_4x4_x3_opt_end:
- mov r0, r4
+ mov r0, r4
- ldmia sp!, {r4-r7, lr}
+ ldmia sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
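
// Editor's note: a minimal C sketch of the mode decision performed by the
// satd_intra_4x4_x3_opt_* tail above. It only restates what the visible
// instructions do (vrshr #1, add lambda, cmp/movcc, str best mode); the
// function and parameter names are illustrative, not from the source.
//
//   #include <stdint.h>
//
//   static int PickIntra4x4Mode(uint32_t satd_v, uint32_t satd_h, uint32_t satd_dc,
//                               uint32_t lambda0, uint32_t lambda1, uint32_t lambda2,
//                               int32_t* best_mode) {
//       uint32_t cost_v  = ((satd_v  + 1) >> 1) + lambda0;  /* vrshr.u16 #1 + add r0, r7 */
//       uint32_t cost_h  = ((satd_h  + 1) >> 1) + lambda1;  /* ... + add r1, r6 */
//       uint32_t cost_dc = ((satd_dc + 1) >> 1) + lambda2;  /* ... + add r2, r5 */
//
//       uint32_t best = cost_v;                  /* mov r4, r0 */
//       if (cost_h  < best) best = cost_h;       /* cmp/movcc   */
//       if (cost_dc < best) best = cost_dc;
//
//       /* The compare sequence matters: DC is tested first, then H, else V. */
//       if (best == cost_dc)      *best_mode = 2;
//       else if (best == cost_h)  *best_mode = 1;
//       else                      *best_mode = 0;
//       return (int)best;                        /* returned in r0 */
//   }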
#endif
--- a/codec/encoder/core/arm/pixel_neon.S
+++ b/codec/encoder/core/arm/pixel_neon.S
@@ -66,10 +66,10 @@
vsub.s16 q3, q12, q13
vadd.s16 q8, q10, q11
- vsub.s16 q9, q10, q11
+ vsub.s16 q9, q10, q11
vadd.s16 q10, q14, q15
- vsub.s16 q11, q14, q15
+ vsub.s16 q11, q14, q15
vadd.s16 q12, q0, q2
vsub.s16 q14, q0, q2
@@ -372,28 +372,28 @@
WELS_ASM_FUNC_BEGIN WelsSampleSad4x4_neon
stmdb sp!, {r4-r5, lr}
- //Loading a horizontal line data (4 bytes)
- //line 0
- ldr r4, [r0], r1
- ldr r5, [r2], r3
- usad8 lr, r4, r5
+ //Loading a horizontal line data (4 bytes)
+ //line 0
+ ldr r4, [r0], r1
+ ldr r5, [r2], r3
+ usad8 lr, r4, r5
//line 1
- ldr r4, [r0], r1
- ldr r5, [r2], r3
- usada8 lr, r4, r5, lr
+ ldr r4, [r0], r1
+ ldr r5, [r2], r3
+ usada8 lr, r4, r5, lr
//line 2
- ldr r4, [r0], r1
- ldr r5, [r2], r3
- usada8 lr, r4, r5, lr
+ ldr r4, [r0], r1
+ ldr r5, [r2], r3
+ usada8 lr, r4, r5, lr
- //line 3
- ldr r4, [r0]
- ldr r5, [r2]
- usada8 r0, r4, r5, lr
+ //line 3
+ ldr r4, [r0]
+ ldr r5, [r2]
+ usada8 r0, r4, r5, lr
- ldmia sp!, {r4-r5, lr}
+ ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
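
// Editor's note: WelsSampleSad4x4_neon above accumulates |pix1[i] - pix2[i]|
// over each 4-byte row with usad8/usada8. A plain C reference of the same
// computation, for readers unfamiliar with those instructions:
//
//   #include <stdint.h>
//   #include <stdlib.h>
//
//   static int32_t Sad4x4_c(const uint8_t* pix1, int stride1,
//                           const uint8_t* pix2, int stride2) {
//       int32_t sad = 0;
//       for (int y = 0; y < 4; y++) {
//           for (int x = 0; x < 4; x++)
//               sad += abs(pix1[x] - pix2[x]);   /* usad8 / usada8 per 4-byte word */
//           pix1 += stride1;
//           pix2 += stride2;
//       }
//       return sad;
//   }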
@@ -401,76 +401,76 @@
stmdb sp!, {r4-r5, lr}
- //Generate the pix2 start addr
- sub r4, r2, #1
- add r5, r2, #1
- sub r2, r3
+ //Generate the pix2 start addr
+ sub r4, r2, #1
+ add r5, r2, #1
+ sub r2, r3
//Loading a horizontal line data (16 bytes)
- vld1.8 {q0}, [r0], r1 //save pix1
+ vld1.8 {q0}, [r0], r1 //save pix1
- vld1.8 {q1}, [r2], r3 //save pix2 - stride
- vld1.8 {q10}, [r2], r3 //save pix2
- vld1.8 {q2}, [r2], r3 //save pix2 + stride
+ vld1.8 {q1}, [r2], r3 //save pix2 - stride
+ vld1.8 {q10}, [r2], r3 //save pix2
+ vld1.8 {q2}, [r2], r3 //save pix2 + stride
- vld1.8 {q3}, [r4], r3 //save pix2 - 1
- vld1.8 {q8}, [r5], r3 //save pix2 + 1
+ vld1.8 {q3}, [r4], r3 //save pix2 - 1
+ vld1.8 {q8}, [r5], r3 //save pix2 + 1
- //Do the SAD for 16 bytes
- vabdl.u8 q15, d0, d2
- vabal.u8 q15, d1, d3
+ //Do the SAD for 16 bytes
+ vabdl.u8 q15, d0, d2
+ vabal.u8 q15, d1, d3
- vabdl.u8 q13, d0, d4
- vabal.u8 q13, d1, d5
+ vabdl.u8 q13, d0, d4
+ vabal.u8 q13, d1, d5
- vabdl.u8 q11, d0, d6
- vabal.u8 q11, d1, d7
+ vabdl.u8 q11, d0, d6
+ vabal.u8 q11, d1, d7
- vabdl.u8 q9, d0, d16
- vabal.u8 q9, d1, d17
+ vabdl.u8 q9, d0, d16
+ vabal.u8 q9, d1, d17
- mov lr, #15
+ mov lr, #15
pixel_sad_4_16x16_loop_0:
//Loading a horizontal line data (16 bytes)
- vld1.8 {q0}, [r0], r1 //save pix1
- vmov.8 q1, q10 //save pix2 - stride
- vmov.8 q10, q2
- vabal.u8 q15, d0, d2
- vld1.8 {q2}, [r2], r3 //save pix2 + stride
- vabal.u8 q15, d1, d3
- vld1.8 {q3}, [r4], r3 //save pix2 - 1
- vabal.u8 q13, d0, d4
- vld1.8 {q8}, [r5], r3 //save pix2 + 1
+ vld1.8 {q0}, [r0], r1 //save pix1
+ vmov.8 q1, q10 //save pix2 - stride
+ vmov.8 q10, q2
+ vabal.u8 q15, d0, d2
+ vld1.8 {q2}, [r2], r3 //save pix2 + stride
+ vabal.u8 q15, d1, d3
+ vld1.8 {q3}, [r4], r3 //save pix2 - 1
+ vabal.u8 q13, d0, d4
+ vld1.8 {q8}, [r5], r3 //save pix2 + 1
vabal.u8 q13, d1, d5
- subs lr, #1
+ subs lr, #1
- vabal.u8 q11, d0, d6
- vabal.u8 q11, d1, d7
+ vabal.u8 q11, d0, d6
+ vabal.u8 q11, d1, d7
- vabal.u8 q9, d0, d16
- vabal.u8 q9, d1, d17
+ vabal.u8 q9, d0, d16
+ vabal.u8 q9, d1, d17
- bne pixel_sad_4_16x16_loop_0
+ bne pixel_sad_4_16x16_loop_0
//Save SAD to 'r0'
- ldr r0, [sp, #12]
+ ldr r0, [sp, #12]
- vadd.u16 d0, d30, d31
- vadd.u16 d1, d26, d27
- vadd.u16 d2, d22, d23
- vadd.u16 d3, d18, d19
+ vadd.u16 d0, d30, d31
+ vadd.u16 d1, d26, d27
+ vadd.u16 d2, d22, d23
+ vadd.u16 d3, d18, d19
- vpaddl.u16 q0, q0
- vpaddl.u16 q1, q1
+ vpaddl.u16 q0, q0
+ vpaddl.u16 q1, q1
- vpaddl.u32 q0, q0
- vpaddl.u32 q1, q1
+ vpaddl.u32 q0, q0
+ vpaddl.u32 q1, q1
- vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
- ldmia sp!, {r4-r5, lr}
+ ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
@@ -477,75 +477,75 @@
WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x8_neon
stmdb sp!, {r4-r5, lr}
- //Generate the pix2 start addr
- sub r4, r2, #1
- add r5, r2, #1
- sub r2, r3
+ //Generate the pix2 start addr
+ sub r4, r2, #1
+ add r5, r2, #1
+ sub r2, r3
//Loading a horizontal line data (16 bytes)
- vld1.8 {q0}, [r0], r1 //save pix1
+ vld1.8 {q0}, [r0], r1 //save pix1
- vld1.8 {q1}, [r2], r3 //save pix2 - stride
- vld1.8 {q10}, [r2], r3 //save pix2
- vld1.8 {q2}, [r2], r3 //save pix2 + stride
+ vld1.8 {q1}, [r2], r3 //save pix2 - stride
+ vld1.8 {q10}, [r2], r3 //save pix2
+ vld1.8 {q2}, [r2], r3 //save pix2 + stride
- vld1.8 {q3}, [r4], r3 //save pix2 - 1
- vld1.8 {q8}, [r5], r3 //save pix2 + 1
+ vld1.8 {q3}, [r4], r3 //save pix2 - 1
+ vld1.8 {q8}, [r5], r3 //save pix2 + 1
- //Do the SAD for 16 bytes
- vabdl.u8 q15, d0, d2
- vabal.u8 q15, d1, d3
+ //Do the SAD for 16 bytes
+ vabdl.u8 q15, d0, d2
+ vabal.u8 q15, d1, d3
- vabdl.u8 q13, d0, d4
- vabal.u8 q13, d1, d5
+ vabdl.u8 q13, d0, d4
+ vabal.u8 q13, d1, d5
- vabdl.u8 q11, d0, d6
- vabal.u8 q11, d1, d7
+ vabdl.u8 q11, d0, d6
+ vabal.u8 q11, d1, d7
- vabdl.u8 q9, d0, d16
- vabal.u8 q9, d1, d17
+ vabdl.u8 q9, d0, d16
+ vabal.u8 q9, d1, d17
- mov lr, #7
+ mov lr, #7
pixel_sad_4_16x8_loop_0:
//Loading a horizontal line data (16 bytes)
- vld1.8 {q0}, [r0], r1 //save pix1
- vmov.8 q1, q10 //save pix2 - stride
- vmov.8 q10, q2
- vabal.u8 q15, d0, d2
- vld1.8 {q2}, [r2], r3 //save pix2 + stride
- vabal.u8 q15, d1, d3
- vld1.8 {q3}, [r4], r3 //save pix2 - 1
- vabal.u8 q13, d0, d4
- vld1.8 {q8}, [r5], r3 //save pix2 + 1
+ vld1.8 {q0}, [r0], r1 //save pix1
+ vmov.8 q1, q10 //save pix2 - stride
+ vmov.8 q10, q2
+ vabal.u8 q15, d0, d2
+ vld1.8 {q2}, [r2], r3 //save pix2 + stride
+ vabal.u8 q15, d1, d3
+ vld1.8 {q3}, [r4], r3 //save pix2 - 1
+ vabal.u8 q13, d0, d4
+ vld1.8 {q8}, [r5], r3 //save pix2 + 1
vabal.u8 q13, d1, d5
- subs lr, #1
+ subs lr, #1
- vabal.u8 q11, d0, d6
- vabal.u8 q11, d1, d7
+ vabal.u8 q11, d0, d6
+ vabal.u8 q11, d1, d7
- vabal.u8 q9, d0, d16
- vabal.u8 q9, d1, d17
+ vabal.u8 q9, d0, d16
+ vabal.u8 q9, d1, d17
- bne pixel_sad_4_16x8_loop_0
+ bne pixel_sad_4_16x8_loop_0
//Save SAD to 'r0'
- ldr r0, [sp, #12]
+ ldr r0, [sp, #12]
- vadd.u16 d0, d30, d31
- vadd.u16 d1, d26, d27
- vadd.u16 d2, d22, d23
- vadd.u16 d3, d18, d19
+ vadd.u16 d0, d30, d31
+ vadd.u16 d1, d26, d27
+ vadd.u16 d2, d22, d23
+ vadd.u16 d3, d18, d19
- vpaddl.u16 q0, q0
- vpaddl.u16 q1, q1
+ vpaddl.u16 q0, q0
+ vpaddl.u16 q1, q1
- vpaddl.u32 q0, q0
- vpaddl.u32 q1, q1
+ vpaddl.u32 q0, q0
+ vpaddl.u32 q1, q1
- vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
- ldmia sp!, {r4-r5, lr}
+ ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
@@ -552,189 +552,189 @@
WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x16_neon
stmdb sp!, {r4-r5, lr}
- //Generate the pix2 start addr
- sub r4, r2, #1
- add r5, r2, #1
- sub r2, r3
+ //Generate the pix2 start addr
+ sub r4, r2, #1
+ add r5, r2, #1
+ sub r2, r3
//Loading a horizontal line data (8 bytes)
- vld1.8 {d0}, [r0], r1 //save pix1
+ vld1.8 {d0}, [r0], r1 //save pix1
- vld1.8 {d1}, [r2], r3 //save pix2 - stride
- vld1.8 {d6}, [r2], r3 //save pix2
- vld1.8 {d2}, [r2], r3 //save pix2 + stride
+ vld1.8 {d1}, [r2], r3 //save pix2 - stride
+ vld1.8 {d6}, [r2], r3 //save pix2
+ vld1.8 {d2}, [r2], r3 //save pix2 + stride
- vld1.8 {d3}, [r4], r3 //save pix2 - 1
- vld1.8 {d4}, [r5], r3 //save pix2 + 1
+ vld1.8 {d3}, [r4], r3 //save pix2 - 1
+ vld1.8 {d4}, [r5], r3 //save pix2 + 1
- //Do the SAD for 8 bytes
- vabdl.u8 q15, d0, d1
- vabdl.u8 q14, d0, d2
- vabdl.u8 q13, d0, d3
- vabdl.u8 q12, d0, d4
+ //Do the SAD for 8 bytes
+ vabdl.u8 q15, d0, d1
+ vabdl.u8 q14, d0, d2
+ vabdl.u8 q13, d0, d3
+ vabdl.u8 q12, d0, d4
- mov lr, #15
+ mov lr, #15
pixel_sad_4_8x16_loop_0:
//Loading a horizontal line data (8 bytes)
- vld1.8 {d0}, [r0], r1 //save pix1
- vmov.8 d1, d6 //save pix2 - stride
- vmov.8 d6, d2
- vld1.8 {d2}, [r2], r3 //save pix2 + stride
- vld1.8 {d3}, [r4], r3 //save pix2 - 1
- vabal.u8 q15, d0, d1
+ vld1.8 {d0}, [r0], r1 //save pix1
+ vmov.8 d1, d6 //save pix2 - stride
+ vmov.8 d6, d2
+ vld1.8 {d2}, [r2], r3 //save pix2 + stride
+ vld1.8 {d3}, [r4], r3 //save pix2 - 1
+ vabal.u8 q15, d0, d1
- vld1.8 {d4}, [r5], r3 //save pix2 + 1
- //Do the SAD for 8 bytes
- vabal.u8 q14, d0, d2
- vabal.u8 q13, d0, d3
- vabal.u8 q12, d0, d4
+ vld1.8 {d4}, [r5], r3 //save pix2 + 1
+ //Do the SAD for 8 bytes
+ vabal.u8 q14, d0, d2
+ vabal.u8 q13, d0, d3
+ vabal.u8 q12, d0, d4
subs lr, #1
- bne pixel_sad_4_8x16_loop_0
+ bne pixel_sad_4_8x16_loop_0
//Save SAD to 'r0'
- ldr r0, [sp, #12]
+ ldr r0, [sp, #12]
- vadd.u16 d0, d30, d31
- vadd.u16 d1, d28, d29
- vadd.u16 d2, d26, d27
- vadd.u16 d3, d24, d25
+ vadd.u16 d0, d30, d31
+ vadd.u16 d1, d28, d29
+ vadd.u16 d2, d26, d27
+ vadd.u16 d3, d24, d25
- vpaddl.u16 q0, q0
- vpaddl.u16 q1, q1
+ vpaddl.u16 q0, q0
+ vpaddl.u16 q1, q1
- vpaddl.u32 q0, q0
- vpaddl.u32 q1, q1
+ vpaddl.u32 q0, q0
+ vpaddl.u32 q1, q1
- vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
- ldmia sp!, {r4-r5, lr}
+ ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x8_neon
- stmdb sp!, {r4-r5, lr}
+ stmdb sp!, {r4-r5, lr}
- //Generate the pix2 start addr
- sub r4, r2, #1
- add r5, r2, #1
- sub r2, r3
+ //Generate the pix2 start addr
+ sub r4, r2, #1
+ add r5, r2, #1
+ sub r2, r3
//Loading a horizontal line data (8 bytes)
- vld1.8 {d0}, [r0], r1 //save pix1
+ vld1.8 {d0}, [r0], r1 //save pix1
- vld1.8 {d1}, [r2], r3 //save pix2 - stride
- vld1.8 {d6}, [r2], r3 //save pix2
- vld1.8 {d2}, [r2], r3 //save pix2 + stride
+ vld1.8 {d1}, [r2], r3 //save pix2 - stride
+ vld1.8 {d6}, [r2], r3 //save pix2
+ vld1.8 {d2}, [r2], r3 //save pix2 + stride
- vld1.8 {d3}, [r4], r3 //save pix2 - 1
- vld1.8 {d4}, [r5], r3 //save pix2 + 1
+ vld1.8 {d3}, [r4], r3 //save pix2 - 1
+ vld1.8 {d4}, [r5], r3 //save pix2 + 1
- //Do the SAD for 8 bytes
- vabdl.u8 q15, d0, d1
- vabdl.u8 q14, d0, d2
- vabdl.u8 q13, d0, d3
- vabdl.u8 q12, d0, d4
+ //Do the SAD for 8 bytes
+ vabdl.u8 q15, d0, d1
+ vabdl.u8 q14, d0, d2
+ vabdl.u8 q13, d0, d3
+ vabdl.u8 q12, d0, d4
- mov lr, #7
+ mov lr, #7
pixel_sad_4_8x8_loop_0:
//Loading a horizontal line data (8 bytes)
- vld1.8 {d0}, [r0], r1 //save pix1
- vmov.8 d1, d6 //save pix2 - stride
- vmov.8 d6, d2
- vld1.8 {d2}, [r2], r3 //save pix2 + stride
- vld1.8 {d3}, [r4], r3 //save pix2 - 1
- vabal.u8 q15, d0, d1
+ vld1.8 {d0}, [r0], r1 //save pix1
+ vmov.8 d1, d6 //save pix2 - stride
+ vmov.8 d6, d2
+ vld1.8 {d2}, [r2], r3 //save pix2 + stride
+ vld1.8 {d3}, [r4], r3 //save pix2 - 1
+ vabal.u8 q15, d0, d1
- vld1.8 {d4}, [r5], r3 //save pix2 + 1
- //Do the SAD for 8 bytes
- vabal.u8 q14, d0, d2
- vabal.u8 q13, d0, d3
- vabal.u8 q12, d0, d4
+ vld1.8 {d4}, [r5], r3 //save pix2 + 1
+ //Do the SAD for 8 bytes
+ vabal.u8 q14, d0, d2
+ vabal.u8 q13, d0, d3
+ vabal.u8 q12, d0, d4
subs lr, #1
- bne pixel_sad_4_8x8_loop_0
+ bne pixel_sad_4_8x8_loop_0
//Save SAD to 'r0'
- ldr r0, [sp, #12]
+ ldr r0, [sp, #12]
- vadd.u16 d0, d30, d31
- vadd.u16 d1, d28, d29
- vadd.u16 d2, d26, d27
- vadd.u16 d3, d24, d25
+ vadd.u16 d0, d30, d31
+ vadd.u16 d1, d28, d29
+ vadd.u16 d2, d26, d27
+ vadd.u16 d3, d24, d25
- vpaddl.u16 q0, q0
- vpaddl.u16 q1, q1
+ vpaddl.u16 q0, q0
+ vpaddl.u16 q1, q1
- vpaddl.u32 q0, q0
- vpaddl.u32 q1, q1
+ vpaddl.u32 q0, q0
+ vpaddl.u32 q1, q1
- vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
- ldmia sp!, {r4-r5, lr}
+ ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSadFour4x4_neon
- vld1.32 {d0[0]}, [r0], r1
- vld1.32 {d0[1]}, [r0], r1
- vld1.32 {d1[0]}, [r0], r1
- vld1.32 {d1[1]}, [r0]
+ vld1.32 {d0[0]}, [r0], r1
+ vld1.32 {d0[1]}, [r0], r1
+ vld1.32 {d1[0]}, [r0], r1
+ vld1.32 {d1[1]}, [r0]
- sub r0, r2, r3
- vld1.32 {d2[0]}, [r0], r3
- vld1.32 {d2[1]}, [r0], r3
- vld1.32 {d3[0]}, [r0], r3
- vld1.32 {d3[1]}, [r0], r3
- vld1.32 {d4[0]}, [r0], r3
- vld1.32 {d4[1]}, [r0]
+ sub r0, r2, r3
+ vld1.32 {d2[0]}, [r0], r3
+ vld1.32 {d2[1]}, [r0], r3
+ vld1.32 {d3[0]}, [r0], r3
+ vld1.32 {d3[1]}, [r0], r3
+ vld1.32 {d4[0]}, [r0], r3
+ vld1.32 {d4[1]}, [r0]
- sub r0, r2, #1
- vld1.32 {d5[0]}, [r0], r3
- vld1.32 {d5[1]}, [r0], r3
- vld1.32 {d6[0]}, [r0], r3
- vld1.32 {d6[1]}, [r0]
+ sub r0, r2, #1
+ vld1.32 {d5[0]}, [r0], r3
+ vld1.32 {d5[1]}, [r0], r3
+ vld1.32 {d6[0]}, [r0], r3
+ vld1.32 {d6[1]}, [r0]
- add r0, r2, #1
- vld1.32 {d7[0]}, [r0], r3
- vld1.32 {d7[1]}, [r0], r3
- vld1.32 {d8[0]}, [r0], r3
- vld1.32 {d8[1]}, [r0]
+ add r0, r2, #1
+ vld1.32 {d7[0]}, [r0], r3
+ vld1.32 {d7[1]}, [r0], r3
+ vld1.32 {d8[0]}, [r0], r3
+ vld1.32 {d8[1]}, [r0]
- vabdl.u8 q15, d0, d2
- vabdl.u8 q14, d1, d3
+ vabdl.u8 q15, d0, d2
+ vabdl.u8 q14, d1, d3
- vabdl.u8 q13, d0, d3
- vabdl.u8 q12, d1, d4
+ vabdl.u8 q13, d0, d3
+ vabdl.u8 q12, d1, d4
- vabdl.u8 q11, d0, d5
- vabdl.u8 q10, d1, d6
+ vabdl.u8 q11, d0, d5
+ vabdl.u8 q10, d1, d6
- vabdl.u8 q9, d0, d7
- vabdl.u8 q8, d1, d8
+ vabdl.u8 q9, d0, d7
+ vabdl.u8 q8, d1, d8
- //Save SAD to 'r4'
- ldr r0, [sp]
- vadd.u16 q0, q14, q15
- vadd.u16 q1, q12, q13
- vadd.u16 q2, q10, q11
- vadd.u16 q3, q8 , q9
+ //Save SAD to 'r4'
+ ldr r0, [sp]
+ vadd.u16 q0, q14, q15
+ vadd.u16 q1, q12, q13
+ vadd.u16 q2, q10, q11
+ vadd.u16 q3, q8 , q9
- vadd.u16 d0, d1
- vadd.u16 d1, d2, d3
- vadd.u16 d2, d4, d5
- vadd.u16 d3, d6, d7
+ vadd.u16 d0, d1
+ vadd.u16 d1, d2, d3
+ vadd.u16 d2, d4, d5
+ vadd.u16 d3, d6, d7
- vpaddl.u16 q0, q0
- vpaddl.u16 q1, q1
+ vpaddl.u16 q0, q0
+ vpaddl.u16 q1, q1
- vpaddl.u32 q0, q0
- vpaddl.u32 q1, q1
+ vpaddl.u32 q0, q0
+ vpaddl.u32 q1, q1
- vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
WELS_ASM_FUNC_END
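
// Editor's note: the WelsSampleSadFour*_neon routines above compute one SAD
// per candidate position around pix2 (pix2 - stride, pix2 + stride, pix2 - 1,
// pix2 + 1) and store four sums via vst4.32. A functional C sketch follows;
// the output ordering here is a simplification of the vst4.32 lane layout,
// and the helper name is illustrative.
//
//   #include <stdint.h>
//   #include <stdlib.h>
//
//   static void SadFour_c(const uint8_t* pix1, int stride1,
//                         const uint8_t* pix2, int stride2,
//                         int width, int height, int32_t sad[4]) {
//       const uint8_t* cand[4] = { pix2 - stride2, pix2 + stride2, pix2 - 1, pix2 + 1 };
//       for (int c = 0; c < 4; c++) {
//           const uint8_t* p1 = pix1;
//           const uint8_t* p2 = cand[c];
//           int32_t s = 0;
//           for (int y = 0; y < height; y++) {
//               for (int x = 0; x < width; x++)
//                   s += abs(p1[x] - p2[x]);
//               p1 += stride1;
//               p2 += stride2;
//           }
//           sad[c] = s;
//       }
//   }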
@@ -834,16 +834,16 @@
WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon
//Load the pix1 data --- 16 bytes
- vld1.32 {d0[0]}, [r0], r1
- vld1.32 {d0[1]}, [r0], r1
- vld1.32 {d1[0]}, [r0], r1
- vld1.32 {d1[1]}, [r0]
+ vld1.32 {d0[0]}, [r0], r1
+ vld1.32 {d0[1]}, [r0], r1
+ vld1.32 {d1[0]}, [r0], r1
+ vld1.32 {d1[1]}, [r0]
//Load the pix2 data --- 16 bytes
- vld1.32 {d2[0]}, [r2], r3
- vld1.32 {d2[1]}, [r2], r3
- vld1.32 {d3[0]}, [r2], r3
- vld1.32 {d3[1]}, [r2]
+ vld1.32 {d2[0]}, [r2], r3
+ vld1.32 {d2[1]}, [r2], r3
+ vld1.32 {d3[0]}, [r2], r3
+ vld1.32 {d3[1]}, [r2]
//Get the difference
vsubl.u8 q15, d0, d2 //{0,1,2,3,4,5,6,7}
@@ -874,7 +874,7 @@
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
- vmov.u32 r0, d0[0]
+ vmov.u32 r0, d0[0]
WELS_ASM_FUNC_END
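
// Editor's note: WelsSampleSatd4x4_neon loads both 4x4 blocks, takes the
// differences (vsubl.u8) and reduces a Hadamard-transformed result as shown
// above. A C sketch of that SATD, assuming the usual 4x4 Hadamard butterfly;
// the final >>1 normalisation is an assumption based on the vrshr #1 steps
// seen elsewhere in this file, not on the elided hunk lines.
//
//   #include <stdint.h>
//   #include <stdlib.h>
//
//   static int32_t Satd4x4_c(const uint8_t* pix1, int stride1,
//                            const uint8_t* pix2, int stride2) {
//       int32_t d[4][4], sum = 0;
//       for (int y = 0; y < 4; y++)
//           for (int x = 0; x < 4; x++)
//               d[y][x] = pix1[y * stride1 + x] - pix2[y * stride2 + x];
//       for (int y = 0; y < 4; y++) {            /* horizontal butterflies */
//           int32_t s0 = d[y][0] + d[y][3], s3 = d[y][0] - d[y][3];
//           int32_t s1 = d[y][1] + d[y][2], s2 = d[y][1] - d[y][2];
//           d[y][0] = s0 + s1; d[y][2] = s0 - s1;
//           d[y][1] = s3 + s2; d[y][3] = s3 - s2;
//       }
//       for (int x = 0; x < 4; x++) {            /* vertical butterflies */
//           int32_t s0 = d[0][x] + d[3][x], s3 = d[0][x] - d[3][x];
//           int32_t s1 = d[1][x] + d[2][x], s2 = d[1][x] - d[2][x];
//           d[0][x] = s0 + s1; d[2][x] = s0 - s1;
//           d[1][x] = s3 + s2; d[3][x] = s3 - s2;
//       }
//       for (int y = 0; y < 4; y++)
//           for (int x = 0; x < 4; x++)
//               sum += abs(d[y][x]);
//       return sum >> 1;                         /* assumed normalisation */
//   }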
--- a/codec/encoder/core/arm/reconstruct_neon.S
+++ b/codec/encoder/core/arm/reconstruct_neon.S
@@ -36,591 +36,591 @@
#ifdef __APPLE__
.macro LOAD_4x4_DATA_FOR_DCT
-// { // input: $0~$3, src1*, src1_stride, src2*, src2_stride
- vld2.16 {$0[0],$1[0]}, [$4], $5
- vld2.16 {$2[0],$3[0]}, [$6], $7
- vld2.16 {$0[1],$1[1]}, [$4], $5
- vld2.16 {$2[1],$3[1]}, [$6], $7
+// { // input: $0~$3, src1*, src1_stride, src2*, src2_stride
+ vld2.16 {$0[0],$1[0]}, [$4], $5
+ vld2.16 {$2[0],$3[0]}, [$6], $7
+ vld2.16 {$0[1],$1[1]}, [$4], $5
+ vld2.16 {$2[1],$3[1]}, [$6], $7
- vld2.16 {$0[2],$1[2]}, [$4], $5
- vld2.16 {$2[2],$3[2]}, [$6], $7
- vld2.16 {$0[3],$1[3]}, [$4], $5
- vld2.16 {$2[3],$3[3]}, [$6], $7
-// }
+ vld2.16 {$0[2],$1[2]}, [$4], $5
+ vld2.16 {$2[2],$3[2]}, [$6], $7
+ vld2.16 {$0[3],$1[3]}, [$4], $5
+ vld2.16 {$2[3],$3[3]}, [$6], $7
+// }
.endm
.macro LOAD_8x8_DATA_FOR_DCT
-// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
- vld1.64 {$0}, [$8], r2
- vld1.64 {$4}, [$9], r4
- vld1.64 {$1}, [$8], r2
- vld1.64 {$5}, [$9], r4
+// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
+ vld1.64 {$0}, [$8], r2
+ vld1.64 {$4}, [$9], r4
+ vld1.64 {$1}, [$8], r2
+ vld1.64 {$5}, [$9], r4
- vld1.64 {$2}, [$8], r2
- vld1.64 {$6}, [$9], r4
- vld1.64 {$3}, [$8], r2
- vld1.64 {$7}, [$9], r4
-// }
+ vld1.64 {$2}, [$8], r2
+ vld1.64 {$6}, [$9], r4
+ vld1.64 {$3}, [$8], r2
+ vld1.64 {$7}, [$9], r4
+// }
.endm
.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
-// { // input: src_d[0]~[3], working: [4]~[7]
- vadd.s16 $4, $0, $3 //int16 s[0] = data[i] + data[i3];
- vsub.s16 $7, $0, $3 //int16 s[3] = data[i] - data[i3];
- vadd.s16 $5, $1, $2 //int16 s[1] = data[i1] + data[i2];
- vsub.s16 $6, $1, $2 //int16 s[2] = data[i1] - data[i2];
+// { // input: src_d[0]~[3], working: [4]~[7]
+ vadd.s16 $4, $0, $3 //int16 s[0] = data[i] + data[i3];
+ vsub.s16 $7, $0, $3 //int16 s[3] = data[i] - data[i3];
+ vadd.s16 $5, $1, $2 //int16 s[1] = data[i1] + data[i2];
+ vsub.s16 $6, $1, $2 //int16 s[2] = data[i1] - data[i2];
- vadd.s16 $0, $4, $5 //int16 dct[i ] = s[0] + s[1];
- vsub.s16 $2, $4, $5 //int16 dct[i2] = s[0] - s[1];
- vshl.s16 $1, $7, #1
- vshl.s16 $3, $6, #1
- vadd.s16 $1, $1, $6 //int16 dct[i1] = (s[3] << 1) + s[2];
- vsub.s16 $3, $7, $3 //int16 dct[i3] = s[3] - (s[2] << 1);
-// }
+ vadd.s16 $0, $4, $5 //int16 dct[i ] = s[0] + s[1];
+ vsub.s16 $2, $4, $5 //int16 dct[i2] = s[0] - s[1];
+ vshl.s16 $1, $7, #1
+ vshl.s16 $3, $6, #1
+ vadd.s16 $1, $1, $6 //int16 dct[i1] = (s[3] << 1) + s[2];
+ vsub.s16 $3, $7, $3 //int16 dct[i3] = s[3] - (s[2] << 1);
+// }
.endm
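
// Editor's note: the inline comments in DCT_ROW_TRANSFORM_TOTAL_16BITS spell
// out the H.264 forward 4x4 transform butterfly. Collected into one C helper
// (name illustrative), applied to one row or one transposed column:
//
//   #include <stdint.h>
//
//   static void DctRowTransform(int16_t* data, int i, int i1, int i2, int i3) {
//       int16_t s0 = data[i]  + data[i3];
//       int16_t s3 = data[i]  - data[i3];
//       int16_t s1 = data[i1] + data[i2];
//       int16_t s2 = data[i1] - data[i2];
//
//       data[i]  = s0 + s1;
//       data[i2] = s0 - s1;
//       data[i1] = (int16_t)((s3 << 1) + s2);
//       data[i3] = (int16_t)(s3 - (s2 << 1));
//   }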
.macro MATRIX_TRANSFORM_EACH_16BITS
-// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
- vtrn.s16 $0, $1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
- vtrn.s16 $2, $3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
- vtrn.32 $0, $2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
- vtrn.32 $1, $3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-// }
+// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
+ vtrn.s16 $0, $1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+ vtrn.s16 $2, $3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+ vtrn.32 $0, $2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+ vtrn.32 $1, $3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+// }
.endm
-.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
- veor.s16 $6, $6 // init 0 , and keep 0;
- vaba.s16 $1, $0, $6 // f + abs(coef - 0)
- vmull.s16 $7, $2, $4
- vmull.s16 $8, $3, $5
- vshr.s32 $7, #16
- vshr.s32 $8, #16
- vmovn.s32 $2, $7
- vmovn.s32 $3, $8
+.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
+// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
+ veor.s16 $6, $6 // init 0 , and keep 0;
+ vaba.s16 $1, $0, $6 // f + abs(coef - 0)
+ vmull.s16 $7, $2, $4
+ vmull.s16 $8, $3, $5
+ vshr.s32 $7, #16
+ vshr.s32 $8, #16
+ vmovn.s32 $2, $7
+ vmovn.s32 $3, $8
- vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111
- vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 $6, #1
- vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
-// }
+ vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111
+ vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched
+ vshl.s16 $6, #1
+ vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
+// }
.endm
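
// Editor's note: NEWQUANT_COEF_EACH_16BITS quantises a vector of coefficients
// with a rounding offset (ff) and multiplier (mf), then restores the sign of
// the input. A scalar C sketch of one lane (helper name illustrative):
//
//   #include <stdint.h>
//   #include <stdlib.h>
//
//   static int16_t QuantCoef(int16_t coef, int16_t ff, int16_t mf) {
//       int32_t level = ff + abs(coef);               /* vaba: f + |coef - 0|    */
//       level = (level * mf) >> 16;                   /* vmull.s16 + vshr.s32 #16 */
//       return (int16_t)(coef > 0 ? level : -level);  /* vcgt/vbif/vsub sign fix  */
//   }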
-.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
- veor.s16 $6, $6 // init 0 , and keep 0;
- vaba.s16 $1, $0, $6 // f + abs(coef - 0)
- vmull.s16 $7, $2, $4
- vmull.s16 $8, $3, $5
- vshr.s32 $7, #16
- vshr.s32 $8, #16
- vmovn.s32 $2, $7
- vmovn.s32 $3, $8
+.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef;
+// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
+ veor.s16 $6, $6 // init 0 , and keep 0;
+ vaba.s16 $1, $0, $6 // f + abs(coef - 0)
+ vmull.s16 $7, $2, $4
+ vmull.s16 $8, $3, $5
+ vshr.s32 $7, #16
+ vshr.s32 $8, #16
+ vmovn.s32 $2, $7
+ vmovn.s32 $3, $8
- vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111
- vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 $6, #1
- vmax.s16 $9, $2, $3
- vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
-// }
+ vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111
+ vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched
+ vshl.s16 $6, #1
+ vmax.s16 $9, $2, $3
+ vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
+// }
.endm
-.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), mf , working_d (all 0), working_q
- vaba.s16 $1, $0, $3 // f + abs(coef - 0)
- vmull.s16 $4, $1, $2 // *= mf
- vshr.s32 $4, #16
- vmovn.s32 $1, $4 // >> 16
+.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
+// { // input: coef, ff (dst), mf , working_d (all 0), working_q
+ vaba.s16 $1, $0, $3 // f + abs(coef - 0)
+ vmull.s16 $4, $1, $2 // *= mf
+ vshr.s32 $4, #16
+ vmovn.s32 $1, $4 // >> 16
- vcgt.s16 $2, $0, #0 // if true, location of coef == 11111111
- vbif.s16 $3, $1, $2 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 $3, #1
- vsub.s16 $1, $1, $3 // if x > 0, -= 0; else x-= 2x
-// }
+ vcgt.s16 $2, $0, #0 // if true, location of coef == 11111111
+ vbif.s16 $3, $1, $2 // if (x<0) reserved part; else keep 0 untouched
+ vshl.s16 $3, #1
+ vsub.s16 $1, $1, $3 // if x > 0, -= 0; else x-= 2x
+// }
.endm
.macro DC_ZERO_COUNT_IN_DUALWORD
-// { // input: coef, dst_d, working_d (all 0x01)
- vceq.s16 $1, $0, #0
- vand.s16 $1, $2
- vpadd.s16 $1, $1, $1
- vpadd.s16 $1, $1, $1
-// }
+// { // input: coef, dst_d, working_d (all 0x01)
+ vceq.s16 $1, $0, #0
+ vand.s16 $1, $2
+ vpadd.s16 $1, $1, $1
+ vpadd.s16 $1, $1, $1
+// }
.endm
.macro SELECT_MAX_IN_ABS_COEF
-// { // input: coef_0, coef_1, max_q (identy to follow two)
- vmax.s16 $2, $0, $1 // max 1st in $3 & max 2nd in $4
- vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] & max 2nd in $3[2][3]
- vpmax.s16 $3, $3, $4 // max 1st in $3[0][1]
-// }
+// { // input: coef_0, coef_1, max_q (identy to follow two)
+ vmax.s16 $2, $0, $1 // max 1st in $3 & max 2nd in $4
+ vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] & max 2nd in $3[2][3]
+ vpmax.s16 $3, $3, $4 // max 1st in $3[0][1]
+// }
.endm
.macro ZERO_COUNT_IN_2_QUARWORD
-// { // input: coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q
- vceq.s16 $0, #0
- vceq.s16 $1, #0
- vand.s16 $0, $2
- vand.s16 $1, $2
+// { // input: coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q
+ vceq.s16 $0, #0
+ vceq.s16 $1, #0
+ vand.s16 $0, $2
+ vand.s16 $1, $2
- vpadd.s16 $3, $3, $5
- vpadd.s16 $4, $4, $6
- vpadd.s16 $3, $3, $4 // 8-->4
- vpadd.s16 $3, $3, $3
- vpadd.s16 $3, $3, $3
-// }
+ vpadd.s16 $3, $3, $5
+ vpadd.s16 $4, $4, $6
+ vpadd.s16 $3, $3, $4 // 8-->4
+ vpadd.s16 $3, $3, $3
+ vpadd.s16 $3, $3, $3
+// }
.endm
.macro HDM_QUANT_2x2_TOTAL_16BITS
-// { // input: src_d[0]~[3], working_d, dst_d
- vshr.s64 $1, $0, #32
- vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
- vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
- vtrn.s16 $2, $1
- vtrn.s32 $2, $1
-// }
+// { // input: src_d[0]~[3], working_d, dst_d
+ vshr.s64 $1, $0, #32
+ vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
+ vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
+ vtrn.s16 $2, $1
+ vtrn.s32 $2, $1
+// }
.endm
.macro IHDM_4x4_TOTAL_16BITS
-// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
- vshr.s64 $1, $0, #32
- vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
- vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
- vtrn.s16 $2, $1
- vrev32.16 $1, $1
- vtrn.s32 $2, $1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
+// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
+ vshr.s64 $1, $0, #32
+ vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
+ vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
+ vtrn.s16 $2, $1
+ vrev32.16 $1, $1
+ vtrn.s32 $2, $1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
- vrev64.16 $1, $2
- vadd.s16 $0, $2, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
- vsub.s16 $1, $2, $1
- vrev32.16 $1, $1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
- vtrn.s32 $0, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
-// }
+ vrev64.16 $1, $2
+ vadd.s16 $0, $2, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
+ vsub.s16 $1, $2, $1
+ vrev32.16 $1, $1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
+ vtrn.s32 $0, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
+// }
.endm
.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
-// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
- vmovl.u8 $4,$0
- vmovl.u8 $5,$1
- vadd.s16 $4,$2
- vadd.s16 $5,$3
- vqmovun.s16 $0,$4
- vqmovun.s16 $1,$5
-// }
+// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
+ vmovl.u8 $4,$0
+ vmovl.u8 $5,$1
+ vadd.s16 $4,$2
+ vadd.s16 $5,$3
+ vqmovun.s16 $0,$4
+ vqmovun.s16 $1,$5
+// }
.endm
.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
-// { // input: src_d[0]~[3], output: e_d[0]~[3];
- vadd.s16 $4, $0, $2 //int16 e[i][0] = src[0] + src[2];
- vsub.s16 $5, $0, $2 //int16 e[i][1] = src[0] - src[2];
- vshr.s16 $6, $1, #1
- vshr.s16 $7, $3, #1
- vsub.s16 $6, $6, $3 //int16 e[i][2] = (src[1]>>1)-src[3];
- vadd.s16 $7, $1, $7 //int16 e[i][3] = src[1] + (src[3]>>1);
-// }
+// { // input: src_d[0]~[3], output: e_d[0]~[3];
+ vadd.s16 $4, $0, $2 //int16 e[i][0] = src[0] + src[2];
+ vsub.s16 $5, $0, $2 //int16 e[i][1] = src[0] - src[2];
+ vshr.s16 $6, $1, #1
+ vshr.s16 $7, $3, #1
+ vsub.s16 $6, $6, $3 //int16 e[i][2] = (src[1]>>1)-src[3];
+ vadd.s16 $7, $1, $7 //int16 e[i][3] = src[1] + (src[3]>>1);
+// }
.endm
-.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s16 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s16 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s16 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s16 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
+.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
+// { // output: f_q[0]~[3], input: e_q[0]~[3];
+ vadd.s16 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
+ vadd.s16 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
+ vsub.s16 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
+ vsub.s16 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
+// }
.endm
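
// Editor's note: ROW_TRANSFORM_1_STEP_TOTAL_16BITS followed by
// TRANSFORM_TOTAL_16BITS is the H.264 inverse 4x4 transform butterfly, as the
// inline comments state. The same steps in C for one row or column (helper
// name illustrative):
//
//   #include <stdint.h>
//
//   static void InverseTransformStep(const int16_t src[4], int16_t f[4]) {
//       int16_t e0 = src[0] + src[2];
//       int16_t e1 = src[0] - src[2];
//       int16_t e2 = (int16_t)((src[1] >> 1) - src[3]);
//       int16_t e3 = (int16_t)(src[1] + (src[3] >> 1));
//
//       f[0] = e0 + e3;
//       f[1] = e1 + e2;
//       f[2] = e1 - e2;
//       f[3] = e0 - e3;
//   }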
.macro ROW_TRANSFORM_0_STEP
-// { // input: src_d[0]~[3], output: e_q[0]~[3];
- vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
- vsubl.s16 $6, $1, $3 //int32 e[i][2] = src[1] - src[3];
- vaddl.s16 $7, $1, $3 //int32 e[i][3] = src[1] + src[3];
-// }
+// { // input: src_d[0]~[3], output: e_q[0]~[3];
+ vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
+ vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
+ vsubl.s16 $6, $1, $3 //int32 e[i][2] = src[1] - src[3];
+ vaddl.s16 $7, $1, $3 //int32 e[i][3] = src[1] + src[3];
+// }
.endm
.macro ROW_TRANSFORM_1_STEP
-// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
- vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
- vshr.s16 $8, $1, #1
- vshr.s16 $9, $3, #1
- vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
- vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
-// }
+// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
+ vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
+ vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
+ vshr.s16 $8, $1, #1
+ vshr.s16 $9, $3, #1
+ vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
+ vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
+// }
.endm
-.macro TRANSFORM_4BYTES // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
+.macro TRANSFORM_4BYTES // both row & col transform used
+// { // output: f_q[0]~[3], input: e_q[0]~[3];
+ vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
+ vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
+ vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
+ vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
+// }
.endm
.macro COL_TRANSFORM_0_STEP
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
- vsub.s32 $6, $1, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 $7, $1, $3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
+// { // input: src_q[0]~[3], output: e_q[0]~[3];
+ vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
+ vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
+ vsub.s32 $6, $1, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ vadd.s32 $7, $1, $3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+// }
.endm
.macro COL_TRANSFORM_1_STEP
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
- vshr.s32 $6, $1, #1
- vshr.s32 $7, $3, #1
- vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
+// { // input: src_q[0]~[3], output: e_q[0]~[3];
+ vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
+ vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
+ vshr.s32 $6, $1, #1
+ vshr.s32 $7, $3, #1
+ vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+// }
.endm
#else
.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
- vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5
- vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7
- vld2.16 {\arg0[1],\arg1[1]}, [\arg4], \arg5
- vld2.16 {\arg2[1],\arg3[1]}, [\arg6], \arg7
+// { // input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
+ vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5
+ vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7
+ vld2.16 {\arg0[1],\arg1[1]}, [\arg4], \arg5
+ vld2.16 {\arg2[1],\arg3[1]}, [\arg6], \arg7
- vld2.16 {\arg0[2],\arg1[2]}, [\arg4], \arg5
- vld2.16 {\arg2[2],\arg3[2]}, [\arg6], \arg7
- vld2.16 {\arg0[3],\arg1[3]}, [\arg4], \arg5
- vld2.16 {\arg2[3],\arg3[3]}, [\arg6], \arg7
-// }
+ vld2.16 {\arg0[2],\arg1[2]}, [\arg4], \arg5
+ vld2.16 {\arg2[2],\arg3[2]}, [\arg6], \arg7
+ vld2.16 {\arg0[3],\arg1[3]}, [\arg4], \arg5
+ vld2.16 {\arg2[3],\arg3[3]}, [\arg6], \arg7
+// }
.endm
.macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-// { // input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
- vld1.64 {\arg0}, [\arg8], r2
- vld1.64 {\arg4}, [\arg9], r4
- vld1.64 {\arg1}, [\arg8], r2
- vld1.64 {\arg5}, [\arg9], r4
+// { // input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
+ vld1.64 {\arg0}, [\arg8], r2
+ vld1.64 {\arg4}, [\arg9], r4
+ vld1.64 {\arg1}, [\arg8], r2
+ vld1.64 {\arg5}, [\arg9], r4
- vld1.64 {\arg2}, [\arg8], r2
- vld1.64 {\arg6}, [\arg9], r4
- vld1.64 {\arg3}, [\arg8], r2
- vld1.64 {\arg7}, [\arg9], r4
-// }
+ vld1.64 {\arg2}, [\arg8], r2
+ vld1.64 {\arg6}, [\arg9], r4
+ vld1.64 {\arg3}, [\arg8], r2
+ vld1.64 {\arg7}, [\arg9], r4
+// }
.endm
.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_d[0]~[3], working: [4]~[7]
- vadd.s16 \arg4, \arg0, \arg3 //int16 s[0] = data[i] + data[i3];
- vsub.s16 \arg7, \arg0, \arg3 //int16 s[3] = data[i] - data[i3];
- vadd.s16 \arg5, \arg1, \arg2 //int16 s[1] = data[i1] + data[i2];
- vsub.s16 \arg6, \arg1, \arg2 //int16 s[2] = data[i1] - data[i2];
+// { // input: src_d[0]~[3], working: [4]~[7]
+ vadd.s16 \arg4, \arg0, \arg3 //int16 s[0] = data[i] + data[i3];
+ vsub.s16 \arg7, \arg0, \arg3 //int16 s[3] = data[i] - data[i3];
+ vadd.s16 \arg5, \arg1, \arg2 //int16 s[1] = data[i1] + data[i2];
+ vsub.s16 \arg6, \arg1, \arg2 //int16 s[2] = data[i1] - data[i2];
- vadd.s16 \arg0, \arg4, \arg5 //int16 dct[i ] = s[0] + s[1];
- vsub.s16 \arg2, \arg4, \arg5 //int16 dct[i2] = s[0] - s[1];
- vshl.s16 \arg1, \arg7, #1
- vshl.s16 \arg3, \arg6, #1
- vadd.s16 \arg1, \arg1, \arg6 //int16 dct[i1] = (s[3] << 1) + s[2];
- vsub.s16 \arg3, \arg7, \arg3 //int16 dct[i3] = s[3] - (s[2] << 1);
-// }
+ vadd.s16 \arg0, \arg4, \arg5 //int16 dct[i ] = s[0] + s[1];
+ vsub.s16 \arg2, \arg4, \arg5 //int16 dct[i2] = s[0] - s[1];
+ vshl.s16 \arg1, \arg7, #1
+ vshl.s16 \arg3, \arg6, #1
+ vadd.s16 \arg1, \arg1, \arg6 //int16 dct[i1] = (s[3] << 1) + s[2];
+ vsub.s16 \arg3, \arg7, \arg3 //int16 dct[i3] = s[3] - (s[2] << 1);
+// }
.endm
.macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3
-// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
- vtrn.s16 \arg0, \arg1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
- vtrn.s16 \arg2, \arg3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
- vtrn.32 \arg0, \arg2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
- vtrn.32 \arg1, \arg3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-// }
+// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
+ vtrn.s16 \arg0, \arg1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+ vtrn.s16 \arg2, \arg3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+ vtrn.32 \arg0, \arg2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+ vtrn.32 \arg1, \arg3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+// }
.endm
.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
- veor.s16 \arg6, \arg6 // init 0 , and keep 0;
- vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0)
- vmull.s16 \arg7, \arg2, \arg4
- vmull.s16 \arg8, \arg3, \arg5
- vshr.s32 \arg7, #16
- vshr.s32 \arg8, #16
- vmovn.s32 \arg2, \arg7
- vmovn.s32 \arg3, \arg8
+// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
+ veor.s16 \arg6, \arg6 // init 0 , and keep 0;
+ vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0)
+ vmull.s16 \arg7, \arg2, \arg4
+ vmull.s16 \arg8, \arg3, \arg5
+ vshr.s32 \arg7, #16
+ vshr.s32 \arg8, #16
+ vmovn.s32 \arg2, \arg7
+ vmovn.s32 \arg3, \arg8
- vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111
- vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 \arg6, #1
- vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x
-// }
+ vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111
+ vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched
+ vshl.s16 \arg6, #1
+ vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x
+// }
.endm
.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
- veor.s16 \arg6, \arg6 // init 0 , and keep 0;
- vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0)
- vmull.s16 \arg7, \arg2, \arg4
- vmull.s16 \arg8, \arg3, \arg5
- vshr.s32 \arg7, #16
- vshr.s32 \arg8, #16
- vmovn.s32 \arg2, \arg7
- vmovn.s32 \arg3, \arg8
+// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
+ veor.s16 \arg6, \arg6 // init 0 , and keep 0;
+ vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0)
+ vmull.s16 \arg7, \arg2, \arg4
+ vmull.s16 \arg8, \arg3, \arg5
+ vshr.s32 \arg7, #16
+ vshr.s32 \arg8, #16
+ vmovn.s32 \arg2, \arg7
+ vmovn.s32 \arg3, \arg8
- vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111
- vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 \arg6, #1
- vmax.s16 \arg9, \arg2, \arg3
- vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x
-// }
+ vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111
+ vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched
+ vshl.s16 \arg6, #1
+ vmax.s16 \arg9, \arg2, \arg3
+ vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x
+// }
.endm
.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
-// { // input: coef, ff (dst), mf , working_d (all 0), working_q
- vaba.s16 \arg1, \arg0, \arg3 // f + abs(coef - 0)
- vmull.s16 \arg4, \arg1, \arg2 // *= mf
- vshr.s32 \arg4, #16
- vmovn.s32 \arg1, \arg4 // >> 16
+// { // input: coef, ff (dst), mf , working_d (all 0), working_q
+ vaba.s16 \arg1, \arg0, \arg3 // f + abs(coef - 0)
+ vmull.s16 \arg4, \arg1, \arg2 // *= mf
+ vshr.s32 \arg4, #16
+ vmovn.s32 \arg1, \arg4 // >> 16
- vcgt.s16 \arg2, \arg0, #0 // if true, location of coef == 11111111
- vbif.s16 \arg3, \arg1, \arg2 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 \arg3, #1
- vsub.s16 \arg1, \arg1, \arg3 // if x > 0, -= 0; else x-= 2x
-// }
+ vcgt.s16 \arg2, \arg0, #0 // if true, location of coef == 11111111
+ vbif.s16 \arg3, \arg1, \arg2 // if (x<0) reserved part; else keep 0 untouched
+ vshl.s16 \arg3, #1
+ vsub.s16 \arg1, \arg1, \arg3 // if x > 0, -= 0; else x-= 2x
+// }
.endm
.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
-// { // input: coef, dst_d, working_d (all 0x01)
- vceq.s16 \arg1, \arg0, #0
- vand.s16 \arg1, \arg2
- vpadd.s16 \arg1, \arg1, \arg1
- vpadd.s16 \arg1, \arg1, \arg1
-// }
+// { // input: coef, dst_d, working_d (all 0x01)
+ vceq.s16 \arg1, \arg0, #0
+ vand.s16 \arg1, \arg2
+ vpadd.s16 \arg1, \arg1, \arg1
+ vpadd.s16 \arg1, \arg1, \arg1
+// }
.endm
.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4
-// { // input: coef_0, coef_1, max_q (identy to follow two), output: max_d0, max_d1
- vmax.s16 \arg2, \arg0, \arg1 // max 1st in \arg3 & max 2nd in \arg4
- vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]
- vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1]
-// }
+// { // input: coef_0, coef_1, max_q (identy to follow two), output: max_d0, max_d1
+ vmax.s16 \arg2, \arg0, \arg1 // max 1st in \arg3 & max 2nd in \arg4
+ vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]
+ vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1]
+// }
.endm
.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6
-// { // input: coef_0 (identy to \arg3 \arg4), coef_1(identy to \arg5 \arg6), mask_q
- vceq.s16 \arg0, #0
- vceq.s16 \arg1, #0
- vand.s16 \arg0, \arg2
- vand.s16 \arg1, \arg2
+// { // input: coef_0 (identy to \arg3 \arg4), coef_1(identy to \arg5 \arg6), mask_q
+ vceq.s16 \arg0, #0
+ vceq.s16 \arg1, #0
+ vand.s16 \arg0, \arg2
+ vand.s16 \arg1, \arg2
- vpadd.s16 \arg3, \arg3, \arg5
- vpadd.s16 \arg4, \arg4, \arg6
- vpadd.s16 \arg3, \arg3, \arg4 // 8-->4
- vpadd.s16 \arg3, \arg3, \arg3
- vpadd.s16 \arg3, \arg3, \arg3
-// }
+ vpadd.s16 \arg3, \arg3, \arg5
+ vpadd.s16 \arg4, \arg4, \arg6
+ vpadd.s16 \arg3, \arg3, \arg4 // 8-->4
+ vpadd.s16 \arg3, \arg3, \arg3
+ vpadd.s16 \arg3, \arg3, \arg3
+// }
.endm
.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
-// { // input: src_d[0]~[3], working_d, dst_d
- vshr.s64 \arg1, \arg0, #32
- vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
- vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
- vtrn.s16 \arg2, \arg1
- vtrn.s32 \arg2, \arg1
-// }
+// { // input: src_d[0]~[3], working_d, dst_d
+ vshr.s64 \arg1, \arg0, #32
+ vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
+ vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
+ vtrn.s16 \arg2, \arg1
+ vtrn.s32 \arg2, \arg1
+// }
.endm
.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
-// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
- vshr.s64 \arg1, \arg0, #32
- vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
- vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
- vtrn.s16 \arg2, \arg1
- vrev32.16 \arg1, \arg1
- vtrn.s32 \arg2, \arg1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
+// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
+ vshr.s64 \arg1, \arg0, #32
+ vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
+ vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
+ vtrn.s16 \arg2, \arg1
+ vrev32.16 \arg1, \arg1
+ vtrn.s32 \arg2, \arg1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
- vrev64.16 \arg1, \arg2
- vadd.s16 \arg0, \arg2, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
- vsub.s16 \arg1, \arg2, \arg1
- vrev32.16 \arg1, \arg1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
- vtrn.s32 \arg0, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
-// }
+ vrev64.16 \arg1, \arg2
+ vadd.s16 \arg0, \arg2, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
+ vsub.s16 \arg1, \arg2, \arg1
+ vrev32.16 \arg1, \arg1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
+ vtrn.s32 \arg0, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
+// }
.endm
.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5
-// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
- vmovl.u8 \arg4,\arg0
- vmovl.u8 \arg5,\arg1
- vadd.s16 \arg4,\arg2
- vadd.s16 \arg5,\arg3
- vqmovun.s16 \arg0,\arg4
- vqmovun.s16 \arg1,\arg5
-// }
+// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
+ vmovl.u8 \arg4,\arg0
+ vmovl.u8 \arg5,\arg1
+ vadd.s16 \arg4,\arg2
+ vadd.s16 \arg5,\arg3
+ vqmovun.s16 \arg0,\arg4
+ vqmovun.s16 \arg1,\arg5
+// }
.endm
.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_d[0]~[3], output: e_d[0]~[3];
- vadd.s16 \arg4, \arg0, \arg2 //int16 e[i][0] = src[0] + src[2];
- vsub.s16 \arg5, \arg0, \arg2 //int16 e[i][1] = src[0] - src[2];
- vshr.s16 \arg6, \arg1, #1
- vshr.s16 \arg7, \arg3, #1
- vsub.s16 \arg6, \arg6, \arg3 //int16 e[i][2] = (src[1]>>1)-src[3];
- vadd.s16 \arg7, \arg1, \arg7 //int16 e[i][3] = src[1] + (src[3]>>1);
-// }
+// { // input: src_d[0]~[3], output: e_d[0]~[3];
+ vadd.s16 \arg4, \arg0, \arg2 //int16 e[i][0] = src[0] + src[2];
+ vsub.s16 \arg5, \arg0, \arg2 //int16 e[i][1] = src[0] - src[2];
+ vshr.s16 \arg6, \arg1, #1
+ vshr.s16 \arg7, \arg3, #1
+ vsub.s16 \arg6, \arg6, \arg3 //int16 e[i][2] = (src[1]>>1)-src[3];
+ vadd.s16 \arg7, \arg1, \arg7 //int16 e[i][3] = src[1] + (src[3]>>1);
+// }
.endm
-.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s16 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s16 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s16 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s16 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
+.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
+// { // output: f_q[0]~[3], input: e_q[0]~[3];
+ vadd.s16 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
+ vadd.s16 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
+ vsub.s16 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
+ vsub.s16 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
+// }
.endm
.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_d[0]~[3], output: e_q[0]~[3];
- vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
- vsubl.s16 \arg6, \arg1, \arg3 //int32 e[i][2] = src[1] - src[3];
- vaddl.s16 \arg7, \arg1, \arg3 //int32 e[i][3] = src[1] + src[3];
-// }
+// { // input: src_d[0]~[3], output: e_q[0]~[3];
+ vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
+ vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
+ vsubl.s16 \arg6, \arg1, \arg3 //int32 e[i][2] = src[1] - src[3];
+ vaddl.s16 \arg7, \arg1, \arg3 //int32 e[i][3] = src[1] + src[3];
+// }
.endm
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9
- vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
- vshr.s16 \arg8, \arg1, #1
- vshr.s16 \arg9, \arg3, #1
- vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
- vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
-// }
+// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9
+ vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
+ vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
+ vshr.s16 \arg8, \arg1, #1
+ vshr.s16 \arg9, \arg3, #1
+ vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
+ vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
+// }
.endm
-.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
+.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
+// { // output: f_q[0]~[3], input: e_q[0]~[3];
+ vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
+ vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
+ vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
+ vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
+// }
.endm
.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
- vsub.s32 \arg6, \arg1, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 \arg7, \arg1, \arg3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
+// { // input: src_q[0]~[3], output: e_q[0]~[3];
+ vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
+ vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
+ vsub.s32 \arg6, \arg1, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ vadd.s32 \arg7, \arg1, \arg3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+// }
.endm
.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
- vshr.s32 \arg6, \arg1, #1
- vshr.s32 \arg7, \arg3, #1
- vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
+// { // input: src_q[0]~[3], output: e_q[0]~[3];
+ vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
+ vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
+ vshr.s32 \arg6, \arg1, #1
+ vshr.s32 \arg7, \arg3, #1
+ vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+// }
.endm
#endif
WELS_ASM_FUNC_BEGIN WelsDctT4_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- LOAD_4x4_DATA_FOR_DCT d4, d5, d6, d7, r1, r2, r3, r4
+ LOAD_4x4_DATA_FOR_DCT d4, d5, d6, d7, r1, r2, r3, r4
- vsubl.u8 q0, d4, d6
- vsubl.u8 q1, d5, d7
- vtrn.s32 q0, q1
- vswp d1, d2
+ vsubl.u8 q0, d4, d6
+ vsubl.u8 q1, d5, d7
+ vtrn.s32 q0, q1
+ vswp d1, d2
- // horizontal transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+ // horizontal transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
- // transform element
- MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
+ // transform element
+ MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
- // vertical transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+ // vertical transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
- // transform element
- MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
+ // transform element
+ MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
- vst1.s16 {q0, q1}, [r0]!
+ vst1.s16 {q0, q1}, [r0]!
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDctFourT4_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- LOAD_8x8_DATA_FOR_DCT d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
+ LOAD_8x8_DATA_FOR_DCT d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
- vsubl.u8 q0, d16, d20
- vsubl.u8 q1, d17, d21
- vsubl.u8 q2, d18, d22
- vsubl.u8 q3, d19, d23
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+ vsubl.u8 q0, d16, d20
+ vsubl.u8 q1, d17, d21
+ vsubl.u8 q2, d18, d22
+ vsubl.u8 q3, d19, d23
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
- // horizontal transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ // horizontal transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- // transform element
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+ // transform element
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
- // vertical transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ // vertical transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- vswp d1, d2
- vswp d5, d6
- vswp q1, q2
- vst1.s16 {q0, q1}, [r0]!
- vst1.s16 {q2, q3}, [r0]!
+ vswp d1, d2
+ vswp d5, d6
+ vswp q1, q2
+ vst1.s16 {q0, q1}, [r0]!
+ vst1.s16 {q2, q3}, [r0]!
- ////////////////
- LOAD_8x8_DATA_FOR_DCT d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
+ ////////////////
+ LOAD_8x8_DATA_FOR_DCT d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
- vsubl.u8 q0, d16, d20
- vsubl.u8 q1, d17, d21
- vsubl.u8 q2, d18, d22
- vsubl.u8 q3, d19, d23
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+ vsubl.u8 q0, d16, d20
+ vsubl.u8 q1, d17, d21
+ vsubl.u8 q2, d18, d22
+ vsubl.u8 q3, d19, d23
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
- // horizontal transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ // horizontal transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- // transform element
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+ // transform element
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
- // vertical transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ // vertical transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- vswp d1, d2
- vswp d5, d6
- vswp q1, q2
- vst1.s16 {q0, q1}, [r0]!
- vst1.s16 {q2, q3}, [r0]!
+ vswp d1, d2
+ vswp d5, d6
+ vswp q1, q2
+ vst1.s16 {q0, q1}, [r0]!
+ vst1.s16 {q2, q3}, [r0]!
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsQuant4x4_neon
- vld1.s16 {q2}, [r1]
- vld1.s16 {q0, q1}, [r0]
- vld1.s16 {q3}, [r2]
+ vld1.s16 {q2}, [r1]
+ vld1.s16 {q0, q1}, [r0]
+ vld1.s16 {q3}, [r2]
- vmov q8, q2
+ vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q9, q10, q11
- vst1.s16 {q2}, [r0]!
+ NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q9, q10, q11
+ vst1.s16 {q2}, [r0]!
- NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r0]!
+ NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r0]!
WELS_ASM_FUNC_END
@@ -627,266 +627,266 @@
WELS_ASM_FUNC_BEGIN WelsQuant4x4Dc_neon
- vld1.s16 {q0, q1}, [r0]
- vdup.s16 q2, r1 // even ff range [0, 768]
- vdup.s16 q3, r2
+ vld1.s16 {q0, q1}, [r0]
+ vdup.s16 q2, r1 // even ff range [0, 768]
+ vdup.s16 q3, r2
- vmov q8, q2
+ vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q9, q10, q11
- vst1.s16 {q2}, [r0]!
+ NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q9, q10, q11
+ vst1.s16 {q2}, [r0]!
- NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r0]!
+ NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r0]!
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsQuantFour4x4_neon
- vld1.s16 {q2}, [r1]
- vld1.s16 {q3}, [r2]
- mov r1, r0
+ vld1.s16 {q2}, [r1]
+ vld1.s16 {q3}, [r2]
+ mov r1, r0
- vld1.s16 {q0, q1}, [r0]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r1]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r1]!
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r1]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r1]!
- vld1.s16 {q0, q1}, [r0]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r1]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r1]!
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r1]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r1]!
- vld1.s16 {q0, q1}, [r0]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r1]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r1]!
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r1]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r1]!
- vld1.s16 {q0, q1}, [r0]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r1]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r1]!
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r1]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r1]!
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsQuantFour4x4Max_neon
- vld1.s16 {q2}, [r1]
- vld1.s16 {q3}, [r2]
- mov r1, r0
+ vld1.s16 {q2}, [r1]
+ vld1.s16 {q3}, [r2]
+ mov r1, r0
- vld1.s16 {q0, q1}, [r0]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
- vst1.s16 {q8}, [r1]!
- vmov q12, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
- vst1.s16 {q12}, [r1]! // then 1st 16 elem in d26 & d28
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
+ vst1.s16 {q8}, [r1]!
+ vmov q12, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
+ vst1.s16 {q12}, [r1]! // then 1st 16 elem in d26 & d28
- vld1.s16 {q0, q1}, [r0]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
- vst1.s16 {q8}, [r1]!
- vmov q12, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
- vst1.s16 {q12}, [r1]! // then 2nd 16 elem in d27 & d29
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
+ vst1.s16 {q8}, [r1]!
+ vmov q12, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
+ vst1.s16 {q12}, [r1]! // then 2nd 16 elem in d27 & d29
- SELECT_MAX_IN_ABS_COEF q13, q14, q0, d0, d1
- vst1.s32 {d0[0]}, [r3]!
+ SELECT_MAX_IN_ABS_COEF q13, q14, q0, d0, d1
+ vst1.s32 {d0[0]}, [r3]!
- ///////////
- vld1.s16 {q0, q1}, [r0]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
- vst1.s16 {q8}, [r1]!
- vmov q12, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
- vst1.s16 {q12}, [r1]! // then 3rd 16 elem in d26 & d28
+ ///////////
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
+ vst1.s16 {q8}, [r1]!
+ vmov q12, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
+ vst1.s16 {q12}, [r1]! // then 3rd 16 elem in d26 & d28
- vld1.s16 {q0, q1}, [r0]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
- vst1.s16 {q8}, [r1]!
- vmov q12, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
- vst1.s16 {q12}, [r1]! // then 4th 16 elem in d27 & d29
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
+ vst1.s16 {q8}, [r1]!
+ vmov q12, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
+ vst1.s16 {q12}, [r1]! // then 4th 16 elem in d27 & d29
- SELECT_MAX_IN_ABS_COEF q13, q14, q0, d0, d1
- vst1.s32 {d0[0]}, [r3]!
+ SELECT_MAX_IN_ABS_COEF q13, q14, q0, d0, d1
+ vst1.s32 {d0[0]}, [r3]!
WELS_ASM_FUNC_END
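For reference, the WelsQuant* routines above all reduce, per coefficient, to sign(coef) * (((|coef| + ff) * mf) >> 16), with the same eight ff/mf values reused for both halves of a 4x4 block (that is how NEWQUANT_COEF_EACH_16BITS is invoked twice per block). A minimal scalar sketch; the function and parameter names here are illustrative and not taken from the codec sources:

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar model of one 4x4 quantization pass: ff/mf hold 8 rounding/scale
       values applied to coefficients 0-7 and again to 8-15, matching the two
       NEWQUANT_COEF_EACH_16BITS invocations in the NEON code above. */
    static void Quant4x4Scalar(int16_t dct[16], const int16_t ff[8], const int16_t mf[8]) {
        for (int i = 0; i < 16; i++) {
            int32_t v = (int32_t)((((int64_t)abs(dct[i]) + ff[i & 7]) * mf[i & 7]) >> 16);
            dct[i] = (int16_t)(dct[i] >= 0 ? v : -v);
        }
    }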
WELS_ASM_FUNC_BEGIN WelsHadamardT4Dc_neon
- push {r2,r3}
- mov r2, #64 // 2*16*sizeof(int16_t)
- add r3, r1, #32
+ push {r2,r3}
+ mov r2, #64 // 2*16*sizeof(int16_t)
+ add r3, r1, #32
- vld1.s16 {d0}, [r1], r2
- vld1.s16 {d1}, [r3], r2
- vld1.s16 {d4}, [r1], r2
- vld1.s16 {d5}, [r3], r2
- vld1.s16 {d2}, [r1], r2
- vld1.s16 {d3}, [r3], r2
- vld1.s16 {d6}, [r1], r2
- vld1.s16 {d7}, [r3], r2
- vtrn.16 q0, q2 // d0[0 4], d1[1 5]
- vtrn.16 q1, q3 // d2[2 6], d3[3 7]
+ vld1.s16 {d0}, [r1], r2
+ vld1.s16 {d1}, [r3], r2
+ vld1.s16 {d4}, [r1], r2
+ vld1.s16 {d5}, [r3], r2
+ vld1.s16 {d2}, [r1], r2
+ vld1.s16 {d3}, [r3], r2
+ vld1.s16 {d6}, [r1], r2
+ vld1.s16 {d7}, [r3], r2
+ vtrn.16 q0, q2 // d0[0 4], d1[1 5]
+ vtrn.16 q1, q3 // d2[2 6], d3[3 7]
- vld1.s16 {d16}, [r1], r2
- vld1.s16 {d17}, [r3], r2
- vld1.s16 {d20}, [r1], r2
- vld1.s16 {d21}, [r3], r2
- vld1.s16 {d18}, [r1], r2
- vld1.s16 {d19}, [r3], r2
- vld1.s16 {d22}, [r1], r2
- vld1.s16 {d23}, [r3], r2
- vtrn.16 q8, q10 //d16[08 12],d17[09 13]
- vtrn.16 q9, q11 //d18[10 14],d19[11 15]
+ vld1.s16 {d16}, [r1], r2
+ vld1.s16 {d17}, [r3], r2
+ vld1.s16 {d20}, [r1], r2
+ vld1.s16 {d21}, [r3], r2
+ vld1.s16 {d18}, [r1], r2
+ vld1.s16 {d19}, [r3], r2
+ vld1.s16 {d22}, [r1], r2
+ vld1.s16 {d23}, [r3], r2
+ vtrn.16 q8, q10 //d16[08 12],d17[09 13]
+ vtrn.16 q9, q11 //d18[10 14],d19[11 15]
- vtrn.32 q0, q8 // d0 [0 4 08 12] = dct[idx], d1[1 5 09 13] = dct[idx+16]
- vtrn.32 q1, q9 // d2 [2 6 10 14] = dct[idx+64], d3[3 7 11 15] = dct[idx+80]
+ vtrn.32 q0, q8 // d0 [0 4 08 12] = dct[idx], d1[1 5 09 13] = dct[idx+16]
+ vtrn.32 q1, q9 // d2 [2 6 10 14] = dct[idx+64], d3[3 7 11 15] = dct[idx+80]
- ROW_TRANSFORM_0_STEP d0, d1, d3, d2, q8, q11, q10, q9
+ ROW_TRANSFORM_0_STEP d0, d1, d3, d2, q8, q11, q10, q9
- TRANSFORM_4BYTES q0, q1, q3, q2, q8, q11, q10, q9
+ TRANSFORM_4BYTES q0, q1, q3, q2, q8, q11, q10, q9
- // transform element 32bits
- vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
- vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
- vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
- vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+ // transform element 32bits
+ vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+ vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+ vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+ vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
- COL_TRANSFORM_0_STEP q0, q1, q3, q2, q8, q11, q10, q9
+ COL_TRANSFORM_0_STEP q0, q1, q3, q2, q8, q11, q10, q9
- TRANSFORM_4BYTES q0, q1, q3, q2, q8, q11, q10, q9
+ TRANSFORM_4BYTES q0, q1, q3, q2, q8, q11, q10, q9
- vrshrn.s32 d16, q0, #1
- vrshrn.s32 d17, q1, #1
- vrshrn.s32 d18, q2, #1
- vrshrn.s32 d19, q3, #1
- vst1.16 {q8, q9}, [r0] //store
+ vrshrn.s32 d16, q0, #1
+ vrshrn.s32 d17, q1, #1
+ vrshrn.s32 d18, q2, #1
+ vrshrn.s32 d19, q3, #1
+ vst1.16 {q8, q9}, [r0] //store
- pop {r2,r3}
+ pop {r2,r3}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2_neon
- vdup.s16 d1, r1 //ff
- vdup.s16 d2, r2 //mf
- veor d3, d3
+ vdup.s16 d1, r1 //ff
+ vdup.s16 d2, r2 //mf
+ veor d3, d3
- mov r1, #32
- mov r2, r0
+ mov r1, #32
+ mov r2, r0
- vld1.s16 {d0[0]}, [r0], r1 //rs[00]
- vst1.s16 {d3[0]}, [r2], r1 //rs[00]=0
- vld1.s16 {d0[1]}, [r0], r1 //rs[16]
- vst1.s16 {d3[0]}, [r2], r1 //rs[16]=0
- vld1.s16 {d0[2]}, [r0], r1 //rs[32]
- vst1.s16 {d3[0]}, [r2], r1 //rs[32]=0
- vld1.s16 {d0[3]}, [r0], r1 //rs[48]
- vst1.s16 {d3[0]}, [r2], r1 //rs[48]=0
+ vld1.s16 {d0[0]}, [r0], r1 //rs[00]
+ vst1.s16 {d3[0]}, [r2], r1 //rs[00]=0
+ vld1.s16 {d0[1]}, [r0], r1 //rs[16]
+ vst1.s16 {d3[0]}, [r2], r1 //rs[16]=0
+ vld1.s16 {d0[2]}, [r0], r1 //rs[32]
+ vst1.s16 {d3[0]}, [r2], r1 //rs[32]=0
+ vld1.s16 {d0[3]}, [r0], r1 //rs[48]
+ vst1.s16 {d3[0]}, [r2], r1 //rs[48]=0
- HDM_QUANT_2x2_TOTAL_16BITS d0, d4, d5 // output d5
+ HDM_QUANT_2x2_TOTAL_16BITS d0, d4, d5 // output d5
- HDM_QUANT_2x2_TOTAL_16BITS d5, d4, d0 // output d0
+ HDM_QUANT_2x2_TOTAL_16BITS d5, d4, d0 // output d0
- QUANT_DUALWORD_COEF_EACH_16BITS d0, d1, d2, d3, q2
+ QUANT_DUALWORD_COEF_EACH_16BITS d0, d1, d2, d3, q2
- vst1.s16 d1, [r3] // store to dct
- ldr r2, [sp, #0]
- vst1.s16 d1, [r2] // store to block
+ vst1.s16 d1, [r3] // store to dct
+ ldr r2, [sp, #0]
+ vst1.s16 d1, [r2] // store to block
- mov r1, #1
- vdup.s16 d3, r1
- DC_ZERO_COUNT_IN_DUALWORD d1, d0, d3
+ mov r1, #1
+ vdup.s16 d3, r1
+ DC_ZERO_COUNT_IN_DUALWORD d1, d0, d3
- vmov r0, r1, d0
- and r0, #0x07 // range [0~4]
- rsb r0, #4
+ vmov r0, r1, d0
+ and r0, #0x07 // range [0~4]
+ rsb r0, #4
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_neon
- vdup.s16 d3, r1
- mov r1, #32
- vld1.s16 {d0[0]}, [r0], r1 //rs[00]
- vld1.s16 {d0[1]}, [r0], r1 //rs[16]
- vld1.s16 {d0[2]}, [r0], r1 //rs[32]
- vld1.s16 {d0[3]}, [r0], r1 //rs[48]
+ vdup.s16 d3, r1
+ mov r1, #32
+ vld1.s16 {d0[0]}, [r0], r1 //rs[00]
+ vld1.s16 {d0[1]}, [r0], r1 //rs[16]
+ vld1.s16 {d0[2]}, [r0], r1 //rs[32]
+ vld1.s16 {d0[3]}, [r0], r1 //rs[48]
- HDM_QUANT_2x2_TOTAL_16BITS d0, d1, d2 // output d2
+ HDM_QUANT_2x2_TOTAL_16BITS d0, d1, d2 // output d2
- HDM_QUANT_2x2_TOTAL_16BITS d2, d1, d0 // output d0
+ HDM_QUANT_2x2_TOTAL_16BITS d2, d1, d0 // output d0
- vabs.s16 d1, d0
- vcgt.s16 d1, d1, d3 // abs(dct[i])>threshold;
- vmov r0, r1, d1
- orr r0, r1
+ vabs.s16 d1, d0
+ vcgt.s16 d1, d1, d3 // abs(dct[i])>threshold;
+ vmov r0, r1, d1
+ orr r0, r1
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsGetNoneZeroCount_neon
- push {r1}
- vld1.s16 {q0, q1}, [r0]
- vmov.s16 q8, #1
+ push {r1}
+ vld1.s16 {q0, q1}, [r0]
+ vmov.s16 q8, #1
- ZERO_COUNT_IN_2_QUARWORD q0, q1, q8, d0, d1, d2, d3
- vmov r0, r1, d0
- and r0, #0x1F // range [0~16]
- rsb r0, #16
- pop {r1}
+ ZERO_COUNT_IN_2_QUARWORD q0, q1, q8, d0, d1, d2, d3
+ vmov r0, r1, d0
+ and r0, #0x1F // range [0~16]
+ rsb r0, #16
+ pop {r1}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDequant4x4_neon
- vld1.s16 {q0, q1}, [r0]
- vld1.u16 {q2}, [r1]
+ vld1.s16 {q0, q1}, [r0]
+ vld1.u16 {q2}, [r1]
- vmul.s16 q8, q0, q2
- vmul.s16 q9, q1, q2
+ vmul.s16 q8, q0, q2
+ vmul.s16 q9, q1, q2
- vst1.s16 {q8, q9}, [r0]
+ vst1.s16 {q8, q9}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDequantFour4x4_neon
- vld1.u16 {q12}, [r1]
- mov r1, r0
- vld1.s16 {q0, q1}, [r0]!
- vld1.s16 {q2, q3}, [r0]!
- vmul.s16 q0, q0, q12
- vld1.s16 {q8, q9}, [r0]!
- vmul.s16 q1, q1, q12
- vld1.s16 {q10, q11}, [r0]!
+ vld1.u16 {q12}, [r1]
+ mov r1, r0
+ vld1.s16 {q0, q1}, [r0]!
+ vld1.s16 {q2, q3}, [r0]!
+ vmul.s16 q0, q0, q12
+ vld1.s16 {q8, q9}, [r0]!
+ vmul.s16 q1, q1, q12
+ vld1.s16 {q10, q11}, [r0]!
- vst1.s16 {q0, q1}, [r1]!
+ vst1.s16 {q0, q1}, [r1]!
- vmul.s16 q2, q2, q12
- vmul.s16 q3, q3, q12
- vmul.s16 q8, q8, q12
- vst1.s16 {q2, q3}, [r1]!
+ vmul.s16 q2, q2, q12
+ vmul.s16 q3, q3, q12
+ vmul.s16 q8, q8, q12
+ vst1.s16 {q2, q3}, [r1]!
- vmul.s16 q9, q9, q12
- vmul.s16 q10, q10, q12
- vmul.s16 q11, q11, q12
- vst1.s16 {q8, q9}, [r1]!
- vst1.s16 {q10, q11}, [r1]!
+ vmul.s16 q9, q9, q12
+ vmul.s16 q10, q10, q12
+ vmul.s16 q11, q11, q12
+ vst1.s16 {q8, q9}, [r1]!
+ vst1.s16 {q10, q11}, [r1]!
WELS_ASM_FUNC_END
@@ -893,258 +893,258 @@
WELS_ASM_FUNC_BEGIN WelsDequantIHadamard4x4_neon
- vld1.s16 {q0, q1}, [r0]
- vdup.s16 q8, r1
+ vld1.s16 {q0, q1}, [r0]
+ vdup.s16 q8, r1
- IHDM_4x4_TOTAL_16BITS q0, q2, q3
- IHDM_4x4_TOTAL_16BITS q1, q2, q3
+ IHDM_4x4_TOTAL_16BITS q0, q2, q3
+ IHDM_4x4_TOTAL_16BITS q1, q2, q3
- MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
+ MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
- IHDM_4x4_TOTAL_16BITS q0, q2, q3
- vmul.s16 q0, q8
+ IHDM_4x4_TOTAL_16BITS q0, q2, q3
+ vmul.s16 q0, q8
- IHDM_4x4_TOTAL_16BITS q1, q2, q3
- vmul.s16 q1, q8
+ IHDM_4x4_TOTAL_16BITS q1, q2, q3
+ vmul.s16 q1, q8
- MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
- vst1.s16 {q0, q1}, [r0]
+ MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
+ vst1.s16 {q0, q1}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIDctT4Rec_neon
- vld1.u32 {d16[0]}, [r2], r3
- push {r4}
- ldr r4, [sp, #4]
- vld1.u32 {d16[1]}, [r2], r3
+ vld1.u32 {d16[0]}, [r2], r3
+ push {r4}
+ ldr r4, [sp, #4]
+ vld1.u32 {d16[1]}, [r2], r3
- vld4.s16 {d0, d1, d2, d3}, [r4] // cost 3 cycles!
- vld1.u32 {d17[0]}, [r2], r3
- vld1.u32 {d17[1]}, [r2], r3 // q7 is pred
+ vld4.s16 {d0, d1, d2, d3}, [r4] // cost 3 cycles!
+ vld1.u32 {d17[0]}, [r2], r3
+ vld1.u32 {d17[1]}, [r2], r3 // q7 is pred
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
- TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+ TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
- MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
+ MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
- TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
- vrshr.s16 d0, d0, #6
- vrshr.s16 d1, d1, #6
- vrshr.s16 d2, d2, #6
- vrshr.s16 d3, d3, #6
+ TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+ vrshr.s16 d0, d0, #6
+ vrshr.s16 d1, d1, #6
+ vrshr.s16 d2, d2, #6
+ vrshr.s16 d3, d3, #6
- //after rounding 6, clip into [0, 255]
- vmovl.u8 q2,d16
- vadd.s16 q0,q2
- vqmovun.s16 d16,q0
- vst1.32 {d16[0]},[r0],r1
- vst1.32 {d16[1]},[r0],r1
+ //after rounding 6, clip into [0, 255]
+ vmovl.u8 q2,d16
+ vadd.s16 q0,q2
+ vqmovun.s16 d16,q0
+ vst1.32 {d16[0]},[r0],r1
+ vst1.32 {d16[1]},[r0],r1
- vmovl.u8 q2,d17
- vadd.s16 q1,q2
- vqmovun.s16 d17,q1
- vst1.32 {d17[0]},[r0],r1
- vst1.32 {d17[1]},[r0]
+ vmovl.u8 q2,d17
+ vadd.s16 q1,q2
+ vqmovun.s16 d17,q1
+ vst1.32 {d17[0]},[r0],r1
+ vst1.32 {d17[1]},[r0]
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIDctFourT4Rec_neon
- vld1.u64 {d24}, [r2], r3
- push {r4}
- ldr r4, [sp, #4]
- vld1.u64 {d25}, [r2], r3
+ vld1.u64 {d24}, [r2], r3
+ push {r4}
+ ldr r4, [sp, #4]
+ vld1.u64 {d25}, [r2], r3
- vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles!
- vld1.u64 {d26}, [r2], r3
- vld1.u64 {d27}, [r2], r3
- vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles!
- vswp d1, d4
- vswp d3, d6
- vswp q1, q2 // q0~q3
+ vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles!
+ vld1.u64 {d26}, [r2], r3
+ vld1.u64 {d27}, [r2], r3
+ vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles!
+ vswp d1, d4
+ vswp d3, d6
+ vswp q1, q2 // q0~q3
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- vrshr.s16 q0, q0, #6
- vrshr.s16 q1, q1, #6
- vrshr.s16 q2, q2, #6
- vrshr.s16 q3, q3, #6
+ TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ vrshr.s16 q0, q0, #6
+ vrshr.s16 q1, q1, #6
+ vrshr.s16 q2, q2, #6
+ vrshr.s16 q3, q3, #6
- //after rounding 6, clip into [0, 255]
- vmovl.u8 q8,d24
- vadd.s16 q0,q8
- vqmovun.s16 d24,q0
- vst1.u8 {d24},[r0],r1
+ //after rounding 6, clip into [0, 255]
+ vmovl.u8 q8,d24
+ vadd.s16 q0,q8
+ vqmovun.s16 d24,q0
+ vst1.u8 {d24},[r0],r1
- vmovl.u8 q8,d25
- vadd.s16 q1,q8
- vqmovun.s16 d25,q1
- vst1.u8 {d25},[r0],r1
+ vmovl.u8 q8,d25
+ vadd.s16 q1,q8
+ vqmovun.s16 d25,q1
+ vst1.u8 {d25},[r0],r1
- vmovl.u8 q8,d26
- vadd.s16 q2,q8
- vqmovun.s16 d26,q2
- vst1.u8 {d26},[r0],r1
+ vmovl.u8 q8,d26
+ vadd.s16 q2,q8
+ vqmovun.s16 d26,q2
+ vst1.u8 {d26},[r0],r1
- vmovl.u8 q8,d27
- vadd.s16 q3,q8
- vqmovun.s16 d27,q3
- vst1.u8 {d27},[r0],r1
+ vmovl.u8 q8,d27
+ vadd.s16 q3,q8
+ vqmovun.s16 d27,q3
+ vst1.u8 {d27},[r0],r1
- vld1.u64 {d24}, [r2], r3
- vld1.u64 {d25}, [r2], r3
+ vld1.u64 {d24}, [r2], r3
+ vld1.u64 {d25}, [r2], r3
- vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles!
- vld1.u64 {d26}, [r2], r3
- vld1.u64 {d27}, [r2], r3
- vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles!
- vswp d1, d4
- vswp d3, d6
- vswp q1, q2 // q0~q3
+ vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles!
+ vld1.u64 {d26}, [r2], r3
+ vld1.u64 {d27}, [r2], r3
+ vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles!
+ vswp d1, d4
+ vswp d3, d6
+ vswp q1, q2 // q0~q3
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- vrshr.s16 q0, q0, #6
- vrshr.s16 q1, q1, #6
- vrshr.s16 q2, q2, #6
- vrshr.s16 q3, q3, #6
+ TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ vrshr.s16 q0, q0, #6
+ vrshr.s16 q1, q1, #6
+ vrshr.s16 q2, q2, #6
+ vrshr.s16 q3, q3, #6
- //after rounding 6, clip into [0, 255]
- vmovl.u8 q8,d24
- vadd.s16 q0,q8
- vqmovun.s16 d24,q0
- vst1.u8 {d24},[r0],r1
+ //after rounding 6, clip into [0, 255]
+ vmovl.u8 q8,d24
+ vadd.s16 q0,q8
+ vqmovun.s16 d24,q0
+ vst1.u8 {d24},[r0],r1
- vmovl.u8 q8,d25
- vadd.s16 q1,q8
- vqmovun.s16 d25,q1
- vst1.u8 {d25},[r0],r1
+ vmovl.u8 q8,d25
+ vadd.s16 q1,q8
+ vqmovun.s16 d25,q1
+ vst1.u8 {d25},[r0],r1
- vmovl.u8 q8,d26
- vadd.s16 q2,q8
- vqmovun.s16 d26,q2
- vst1.u8 {d26},[r0],r1
+ vmovl.u8 q8,d26
+ vadd.s16 q2,q8
+ vqmovun.s16 d26,q2
+ vst1.u8 {d26},[r0],r1
- vmovl.u8 q8,d27
- vadd.s16 q3,q8
- vqmovun.s16 d27,q3
- vst1.u8 {d27},[r0],r1
+ vmovl.u8 q8,d27
+ vadd.s16 q3,q8
+ vqmovun.s16 d27,q3
+ vst1.u8 {d27},[r0],r1
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIDctRecI16x16Dc_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- vld1.s16 {q8,q9}, [r4]
- vrshr.s16 q8, q8, #6
- vrshr.s16 q9, q9, #6
+ vld1.s16 {q8,q9}, [r4]
+ vrshr.s16 q8, q8, #6
+ vrshr.s16 q9, q9, #6
- vdup.s16 d20, d16[0]
- vdup.s16 d21, d16[1]
- vdup.s16 d22, d16[2]
- vdup.s16 d23, d16[3]
+ vdup.s16 d20, d16[0]
+ vdup.s16 d21, d16[1]
+ vdup.s16 d22, d16[2]
+ vdup.s16 d23, d16[3]
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vdup.s16 d20, d17[0]
- vdup.s16 d21, d17[1]
- vdup.s16 d22, d17[2]
- vdup.s16 d23, d17[3]
+ vdup.s16 d20, d17[0]
+ vdup.s16 d21, d17[1]
+ vdup.s16 d22, d17[2]
+ vdup.s16 d23, d17[3]
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vdup.s16 d20, d18[0]
- vdup.s16 d21, d18[1]
- vdup.s16 d22, d18[2]
- vdup.s16 d23, d18[3]
+ vdup.s16 d20, d18[0]
+ vdup.s16 d21, d18[1]
+ vdup.s16 d22, d18[2]
+ vdup.s16 d23, d18[3]
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vdup.s16 d20, d19[0]
- vdup.s16 d21, d19[1]
- vdup.s16 d22, d19[2]
- vdup.s16 d23, d19[3]
+ vdup.s16 d20, d19[0]
+ vdup.s16 d21, d19[1]
+ vdup.s16 d22, d19[2]
+ vdup.s16 d23, d19[3]
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
#endif
--- a/codec/encoder/core/x86/coeff.asm
+++ b/codec/encoder/core/x86/coeff.asm
@@ -55,262 +55,262 @@
align 16
byte_1pos_table:
- db 0,0,0,0,0,0,0,0, ;0
- db 0,0,0,0,0,0,0,1, ;1
- db 1,0,0,0,0,0,0,1, ;2
- db 1,0,0,0,0,0,0,2, ;3
- db 2,0,0,0,0,0,0,1, ;4
- db 2,0,0,0,0,0,0,2, ;5
- db 2,1,0,0,0,0,0,2, ;6
- db 2,1,0,0,0,0,0,3, ;7
- db 3,0,0,0,0,0,0,1, ;8
- db 3,0,0,0,0,0,0,2, ;9
- db 3,1,0,0,0,0,0,2, ;10
- db 3,1,0,0,0,0,0,3, ;11
- db 3,2,0,0,0,0,0,2, ;12
- db 3,2,0,0,0,0,0,3, ;13
- db 3,2,1,0,0,0,0,3, ;14
- db 3,2,1,0,0,0,0,4, ;15
- db 4,0,0,0,0,0,0,1, ;16
- db 4,0,0,0,0,0,0,2, ;17
- db 4,1,0,0,0,0,0,2, ;18
- db 4,1,0,0,0,0,0,3, ;19
- db 4,2,0,0,0,0,0,2, ;20
- db 4,2,0,0,0,0,0,3, ;21
- db 4,2,1,0,0,0,0,3, ;22
- db 4,2,1,0,0,0,0,4, ;23
- db 4,3,0,0,0,0,0,2, ;24
- db 4,3,0,0,0,0,0,3, ;25
- db 4,3,1,0,0,0,0,3, ;26
- db 4,3,1,0,0,0,0,4, ;27
- db 4,3,2,0,0,0,0,3, ;28
- db 4,3,2,0,0,0,0,4, ;29
- db 4,3,2,1,0,0,0,4, ;30
- db 4,3,2,1,0,0,0,5, ;31
- db 5,0,0,0,0,0,0,1, ;32
- db 5,0,0,0,0,0,0,2, ;33
- db 5,1,0,0,0,0,0,2, ;34
- db 5,1,0,0,0,0,0,3, ;35
- db 5,2,0,0,0,0,0,2, ;36
- db 5,2,0,0,0,0,0,3, ;37
- db 5,2,1,0,0,0,0,3, ;38
- db 5,2,1,0,0,0,0,4, ;39
- db 5,3,0,0,0,0,0,2, ;40
- db 5,3,0,0,0,0,0,3, ;41
- db 5,3,1,0,0,0,0,3, ;42
- db 5,3,1,0,0,0,0,4, ;43
- db 5,3,2,0,0,0,0,3, ;44
- db 5,3,2,0,0,0,0,4, ;45
- db 5,3,2,1,0,0,0,4, ;46
- db 5,3,2,1,0,0,0,5, ;47
- db 5,4,0,0,0,0,0,2, ;48
- db 5,4,0,0,0,0,0,3, ;49
- db 5,4,1,0,0,0,0,3, ;50
- db 5,4,1,0,0,0,0,4, ;51
- db 5,4,2,0,0,0,0,3, ;52
- db 5,4,2,0,0,0,0,4, ;53
- db 5,4,2,1,0,0,0,4, ;54
- db 5,4,2,1,0,0,0,5, ;55
- db 5,4,3,0,0,0,0,3, ;56
- db 5,4,3,0,0,0,0,4, ;57
- db 5,4,3,1,0,0,0,4, ;58
- db 5,4,3,1,0,0,0,5, ;59
- db 5,4,3,2,0,0,0,4, ;60
- db 5,4,3,2,0,0,0,5, ;61
- db 5,4,3,2,1,0,0,5, ;62
- db 5,4,3,2,1,0,0,6, ;63
- db 6,0,0,0,0,0,0,1, ;64
- db 6,0,0,0,0,0,0,2, ;65
- db 6,1,0,0,0,0,0,2, ;66
- db 6,1,0,0,0,0,0,3, ;67
- db 6,2,0,0,0,0,0,2, ;68
- db 6,2,0,0,0,0,0,3, ;69
- db 6,2,1,0,0,0,0,3, ;70
- db 6,2,1,0,0,0,0,4, ;71
- db 6,3,0,0,0,0,0,2, ;72
- db 6,3,0,0,0,0,0,3, ;73
- db 6,3,1,0,0,0,0,3, ;74
- db 6,3,1,0,0,0,0,4, ;75
- db 6,3,2,0,0,0,0,3, ;76
- db 6,3,2,0,0,0,0,4, ;77
- db 6,3,2,1,0,0,0,4, ;78
- db 6,3,2,1,0,0,0,5, ;79
- db 6,4,0,0,0,0,0,2, ;80
- db 6,4,0,0,0,0,0,3, ;81
- db 6,4,1,0,0,0,0,3, ;82
- db 6,4,1,0,0,0,0,4, ;83
- db 6,4,2,0,0,0,0,3, ;84
- db 6,4,2,0,0,0,0,4, ;85
- db 6,4,2,1,0,0,0,4, ;86
- db 6,4,2,1,0,0,0,5, ;87
- db 6,4,3,0,0,0,0,3, ;88
- db 6,4,3,0,0,0,0,4, ;89
- db 6,4,3,1,0,0,0,4, ;90
- db 6,4,3,1,0,0,0,5, ;91
- db 6,4,3,2,0,0,0,4, ;92
- db 6,4,3,2,0,0,0,5, ;93
- db 6,4,3,2,1,0,0,5, ;94
- db 6,4,3,2,1,0,0,6, ;95
- db 6,5,0,0,0,0,0,2, ;96
- db 6,5,0,0,0,0,0,3, ;97
- db 6,5,1,0,0,0,0,3, ;98
- db 6,5,1,0,0,0,0,4, ;99
- db 6,5,2,0,0,0,0,3, ;100
- db 6,5,2,0,0,0,0,4, ;101
- db 6,5,2,1,0,0,0,4, ;102
- db 6,5,2,1,0,0,0,5, ;103
- db 6,5,3,0,0,0,0,3, ;104
- db 6,5,3,0,0,0,0,4, ;105
- db 6,5,3,1,0,0,0,4, ;106
- db 6,5,3,1,0,0,0,5, ;107
- db 6,5,3,2,0,0,0,4, ;108
- db 6,5,3,2,0,0,0,5, ;109
- db 6,5,3,2,1,0,0,5, ;110
- db 6,5,3,2,1,0,0,6, ;111
- db 6,5,4,0,0,0,0,3, ;112
- db 6,5,4,0,0,0,0,4, ;113
- db 6,5,4,1,0,0,0,4, ;114
- db 6,5,4,1,0,0,0,5, ;115
- db 6,5,4,2,0,0,0,4, ;116
- db 6,5,4,2,0,0,0,5, ;117
- db 6,5,4,2,1,0,0,5, ;118
- db 6,5,4,2,1,0,0,6, ;119
- db 6,5,4,3,0,0,0,4, ;120
- db 6,5,4,3,0,0,0,5, ;121
- db 6,5,4,3,1,0,0,5, ;122
- db 6,5,4,3,1,0,0,6, ;123
- db 6,5,4,3,2,0,0,5, ;124
- db 6,5,4,3,2,0,0,6, ;125
- db 6,5,4,3,2,1,0,6, ;126
- db 6,5,4,3,2,1,0,7, ;127
- db 7,0,0,0,0,0,0,1, ;128
- db 7,0,0,0,0,0,0,2, ;129
- db 7,1,0,0,0,0,0,2, ;130
- db 7,1,0,0,0,0,0,3, ;131
- db 7,2,0,0,0,0,0,2, ;132
- db 7,2,0,0,0,0,0,3, ;133
- db 7,2,1,0,0,0,0,3, ;134
- db 7,2,1,0,0,0,0,4, ;135
- db 7,3,0,0,0,0,0,2, ;136
- db 7,3,0,0,0,0,0,3, ;137
- db 7,3,1,0,0,0,0,3, ;138
- db 7,3,1,0,0,0,0,4, ;139
- db 7,3,2,0,0,0,0,3, ;140
- db 7,3,2,0,0,0,0,4, ;141
- db 7,3,2,1,0,0,0,4, ;142
- db 7,3,2,1,0,0,0,5, ;143
- db 7,4,0,0,0,0,0,2, ;144
- db 7,4,0,0,0,0,0,3, ;145
- db 7,4,1,0,0,0,0,3, ;146
- db 7,4,1,0,0,0,0,4, ;147
- db 7,4,2,0,0,0,0,3, ;148
- db 7,4,2,0,0,0,0,4, ;149
- db 7,4,2,1,0,0,0,4, ;150
- db 7,4,2,1,0,0,0,5, ;151
- db 7,4,3,0,0,0,0,3, ;152
- db 7,4,3,0,0,0,0,4, ;153
- db 7,4,3,1,0,0,0,4, ;154
- db 7,4,3,1,0,0,0,5, ;155
- db 7,4,3,2,0,0,0,4, ;156
- db 7,4,3,2,0,0,0,5, ;157
- db 7,4,3,2,1,0,0,5, ;158
- db 7,4,3,2,1,0,0,6, ;159
- db 7,5,0,0,0,0,0,2, ;160
- db 7,5,0,0,0,0,0,3, ;161
- db 7,5,1,0,0,0,0,3, ;162
- db 7,5,1,0,0,0,0,4, ;163
- db 7,5,2,0,0,0,0,3, ;164
- db 7,5,2,0,0,0,0,4, ;165
- db 7,5,2,1,0,0,0,4, ;166
- db 7,5,2,1,0,0,0,5, ;167
- db 7,5,3,0,0,0,0,3, ;168
- db 7,5,3,0,0,0,0,4, ;169
- db 7,5,3,1,0,0,0,4, ;170
- db 7,5,3,1,0,0,0,5, ;171
- db 7,5,3,2,0,0,0,4, ;172
- db 7,5,3,2,0,0,0,5, ;173
- db 7,5,3,2,1,0,0,5, ;174
- db 7,5,3,2,1,0,0,6, ;175
- db 7,5,4,0,0,0,0,3, ;176
- db 7,5,4,0,0,0,0,4, ;177
- db 7,5,4,1,0,0,0,4, ;178
- db 7,5,4,1,0,0,0,5, ;179
- db 7,5,4,2,0,0,0,4, ;180
- db 7,5,4,2,0,0,0,5, ;181
- db 7,5,4,2,1,0,0,5, ;182
- db 7,5,4,2,1,0,0,6, ;183
- db 7,5,4,3,0,0,0,4, ;184
- db 7,5,4,3,0,0,0,5, ;185
- db 7,5,4,3,1,0,0,5, ;186
- db 7,5,4,3,1,0,0,6, ;187
- db 7,5,4,3,2,0,0,5, ;188
- db 7,5,4,3,2,0,0,6, ;189
- db 7,5,4,3,2,1,0,6, ;190
- db 7,5,4,3,2,1,0,7, ;191
- db 7,6,0,0,0,0,0,2, ;192
- db 7,6,0,0,0,0,0,3, ;193
- db 7,6,1,0,0,0,0,3, ;194
- db 7,6,1,0,0,0,0,4, ;195
- db 7,6,2,0,0,0,0,3, ;196
- db 7,6,2,0,0,0,0,4, ;197
- db 7,6,2,1,0,0,0,4, ;198
- db 7,6,2,1,0,0,0,5, ;199
- db 7,6,3,0,0,0,0,3, ;200
- db 7,6,3,0,0,0,0,4, ;201
- db 7,6,3,1,0,0,0,4, ;202
- db 7,6,3,1,0,0,0,5, ;203
- db 7,6,3,2,0,0,0,4, ;204
- db 7,6,3,2,0,0,0,5, ;205
- db 7,6,3,2,1,0,0,5, ;206
- db 7,6,3,2,1,0,0,6, ;207
- db 7,6,4,0,0,0,0,3, ;208
- db 7,6,4,0,0,0,0,4, ;209
- db 7,6,4,1,0,0,0,4, ;210
- db 7,6,4,1,0,0,0,5, ;211
- db 7,6,4,2,0,0,0,4, ;212
- db 7,6,4,2,0,0,0,5, ;213
- db 7,6,4,2,1,0,0,5, ;214
- db 7,6,4,2,1,0,0,6, ;215
- db 7,6,4,3,0,0,0,4, ;216
- db 7,6,4,3,0,0,0,5, ;217
- db 7,6,4,3,1,0,0,5, ;218
- db 7,6,4,3,1,0,0,6, ;219
- db 7,6,4,3,2,0,0,5, ;220
- db 7,6,4,3,2,0,0,6, ;221
- db 7,6,4,3,2,1,0,6, ;222
- db 7,6,4,3,2,1,0,7, ;223
- db 7,6,5,0,0,0,0,3, ;224
- db 7,6,5,0,0,0,0,4, ;225
- db 7,6,5,1,0,0,0,4, ;226
- db 7,6,5,1,0,0,0,5, ;227
- db 7,6,5,2,0,0,0,4, ;228
- db 7,6,5,2,0,0,0,5, ;229
- db 7,6,5,2,1,0,0,5, ;230
- db 7,6,5,2,1,0,0,6, ;231
- db 7,6,5,3,0,0,0,4, ;232
- db 7,6,5,3,0,0,0,5, ;233
- db 7,6,5,3,1,0,0,5, ;234
- db 7,6,5,3,1,0,0,6, ;235
- db 7,6,5,3,2,0,0,5, ;236
- db 7,6,5,3,2,0,0,6, ;237
- db 7,6,5,3,2,1,0,6, ;238
- db 7,6,5,3,2,1,0,7, ;239
- db 7,6,5,4,0,0,0,4, ;240
- db 7,6,5,4,0,0,0,5, ;241
- db 7,6,5,4,1,0,0,5, ;242
- db 7,6,5,4,1,0,0,6, ;243
- db 7,6,5,4,2,0,0,5, ;244
- db 7,6,5,4,2,0,0,6, ;245
- db 7,6,5,4,2,1,0,6, ;246
- db 7,6,5,4,2,1,0,7, ;247
- db 7,6,5,4,3,0,0,5, ;248
- db 7,6,5,4,3,0,0,6, ;249
- db 7,6,5,4,3,1,0,6, ;250
- db 7,6,5,4,3,1,0,7, ;251
- db 7,6,5,4,3,2,0,6, ;252
- db 7,6,5,4,3,2,0,7, ;253
- db 7,6,5,4,3,2,1,7, ;254
- db 7,6,5,4,3,2,1,8, ;255
+ db 0,0,0,0,0,0,0,0, ;0
+ db 0,0,0,0,0,0,0,1, ;1
+ db 1,0,0,0,0,0,0,1, ;2
+ db 1,0,0,0,0,0,0,2, ;3
+ db 2,0,0,0,0,0,0,1, ;4
+ db 2,0,0,0,0,0,0,2, ;5
+ db 2,1,0,0,0,0,0,2, ;6
+ db 2,1,0,0,0,0,0,3, ;7
+ db 3,0,0,0,0,0,0,1, ;8
+ db 3,0,0,0,0,0,0,2, ;9
+ db 3,1,0,0,0,0,0,2, ;10
+ db 3,1,0,0,0,0,0,3, ;11
+ db 3,2,0,0,0,0,0,2, ;12
+ db 3,2,0,0,0,0,0,3, ;13
+ db 3,2,1,0,0,0,0,3, ;14
+ db 3,2,1,0,0,0,0,4, ;15
+ db 4,0,0,0,0,0,0,1, ;16
+ db 4,0,0,0,0,0,0,2, ;17
+ db 4,1,0,0,0,0,0,2, ;18
+ db 4,1,0,0,0,0,0,3, ;19
+ db 4,2,0,0,0,0,0,2, ;20
+ db 4,2,0,0,0,0,0,3, ;21
+ db 4,2,1,0,0,0,0,3, ;22
+ db 4,2,1,0,0,0,0,4, ;23
+ db 4,3,0,0,0,0,0,2, ;24
+ db 4,3,0,0,0,0,0,3, ;25
+ db 4,3,1,0,0,0,0,3, ;26
+ db 4,3,1,0,0,0,0,4, ;27
+ db 4,3,2,0,0,0,0,3, ;28
+ db 4,3,2,0,0,0,0,4, ;29
+ db 4,3,2,1,0,0,0,4, ;30
+ db 4,3,2,1,0,0,0,5, ;31
+ db 5,0,0,0,0,0,0,1, ;32
+ db 5,0,0,0,0,0,0,2, ;33
+ db 5,1,0,0,0,0,0,2, ;34
+ db 5,1,0,0,0,0,0,3, ;35
+ db 5,2,0,0,0,0,0,2, ;36
+ db 5,2,0,0,0,0,0,3, ;37
+ db 5,2,1,0,0,0,0,3, ;38
+ db 5,2,1,0,0,0,0,4, ;39
+ db 5,3,0,0,0,0,0,2, ;40
+ db 5,3,0,0,0,0,0,3, ;41
+ db 5,3,1,0,0,0,0,3, ;42
+ db 5,3,1,0,0,0,0,4, ;43
+ db 5,3,2,0,0,0,0,3, ;44
+ db 5,3,2,0,0,0,0,4, ;45
+ db 5,3,2,1,0,0,0,4, ;46
+ db 5,3,2,1,0,0,0,5, ;47
+ db 5,4,0,0,0,0,0,2, ;48
+ db 5,4,0,0,0,0,0,3, ;49
+ db 5,4,1,0,0,0,0,3, ;50
+ db 5,4,1,0,0,0,0,4, ;51
+ db 5,4,2,0,0,0,0,3, ;52
+ db 5,4,2,0,0,0,0,4, ;53
+ db 5,4,2,1,0,0,0,4, ;54
+ db 5,4,2,1,0,0,0,5, ;55
+ db 5,4,3,0,0,0,0,3, ;56
+ db 5,4,3,0,0,0,0,4, ;57
+ db 5,4,3,1,0,0,0,4, ;58
+ db 5,4,3,1,0,0,0,5, ;59
+ db 5,4,3,2,0,0,0,4, ;60
+ db 5,4,3,2,0,0,0,5, ;61
+ db 5,4,3,2,1,0,0,5, ;62
+ db 5,4,3,2,1,0,0,6, ;63
+ db 6,0,0,0,0,0,0,1, ;64
+ db 6,0,0,0,0,0,0,2, ;65
+ db 6,1,0,0,0,0,0,2, ;66
+ db 6,1,0,0,0,0,0,3, ;67
+ db 6,2,0,0,0,0,0,2, ;68
+ db 6,2,0,0,0,0,0,3, ;69
+ db 6,2,1,0,0,0,0,3, ;70
+ db 6,2,1,0,0,0,0,4, ;71
+ db 6,3,0,0,0,0,0,2, ;72
+ db 6,3,0,0,0,0,0,3, ;73
+ db 6,3,1,0,0,0,0,3, ;74
+ db 6,3,1,0,0,0,0,4, ;75
+ db 6,3,2,0,0,0,0,3, ;76
+ db 6,3,2,0,0,0,0,4, ;77
+ db 6,3,2,1,0,0,0,4, ;78
+ db 6,3,2,1,0,0,0,5, ;79
+ db 6,4,0,0,0,0,0,2, ;80
+ db 6,4,0,0,0,0,0,3, ;81
+ db 6,4,1,0,0,0,0,3, ;82
+ db 6,4,1,0,0,0,0,4, ;83
+ db 6,4,2,0,0,0,0,3, ;84
+ db 6,4,2,0,0,0,0,4, ;85
+ db 6,4,2,1,0,0,0,4, ;86
+ db 6,4,2,1,0,0,0,5, ;87
+ db 6,4,3,0,0,0,0,3, ;88
+ db 6,4,3,0,0,0,0,4, ;89
+ db 6,4,3,1,0,0,0,4, ;90
+ db 6,4,3,1,0,0,0,5, ;91
+ db 6,4,3,2,0,0,0,4, ;92
+ db 6,4,3,2,0,0,0,5, ;93
+ db 6,4,3,2,1,0,0,5, ;94
+ db 6,4,3,2,1,0,0,6, ;95
+ db 6,5,0,0,0,0,0,2, ;96
+ db 6,5,0,0,0,0,0,3, ;97
+ db 6,5,1,0,0,0,0,3, ;98
+ db 6,5,1,0,0,0,0,4, ;99
+ db 6,5,2,0,0,0,0,3, ;100
+ db 6,5,2,0,0,0,0,4, ;101
+ db 6,5,2,1,0,0,0,4, ;102
+ db 6,5,2,1,0,0,0,5, ;103
+ db 6,5,3,0,0,0,0,3, ;104
+ db 6,5,3,0,0,0,0,4, ;105
+ db 6,5,3,1,0,0,0,4, ;106
+ db 6,5,3,1,0,0,0,5, ;107
+ db 6,5,3,2,0,0,0,4, ;108
+ db 6,5,3,2,0,0,0,5, ;109
+ db 6,5,3,2,1,0,0,5, ;110
+ db 6,5,3,2,1,0,0,6, ;111
+ db 6,5,4,0,0,0,0,3, ;112
+ db 6,5,4,0,0,0,0,4, ;113
+ db 6,5,4,1,0,0,0,4, ;114
+ db 6,5,4,1,0,0,0,5, ;115
+ db 6,5,4,2,0,0,0,4, ;116
+ db 6,5,4,2,0,0,0,5, ;117
+ db 6,5,4,2,1,0,0,5, ;118
+ db 6,5,4,2,1,0,0,6, ;119
+ db 6,5,4,3,0,0,0,4, ;120
+ db 6,5,4,3,0,0,0,5, ;121
+ db 6,5,4,3,1,0,0,5, ;122
+ db 6,5,4,3,1,0,0,6, ;123
+ db 6,5,4,3,2,0,0,5, ;124
+ db 6,5,4,3,2,0,0,6, ;125
+ db 6,5,4,3,2,1,0,6, ;126
+ db 6,5,4,3,2,1,0,7, ;127
+ db 7,0,0,0,0,0,0,1, ;128
+ db 7,0,0,0,0,0,0,2, ;129
+ db 7,1,0,0,0,0,0,2, ;130
+ db 7,1,0,0,0,0,0,3, ;131
+ db 7,2,0,0,0,0,0,2, ;132
+ db 7,2,0,0,0,0,0,3, ;133
+ db 7,2,1,0,0,0,0,3, ;134
+ db 7,2,1,0,0,0,0,4, ;135
+ db 7,3,0,0,0,0,0,2, ;136
+ db 7,3,0,0,0,0,0,3, ;137
+ db 7,3,1,0,0,0,0,3, ;138
+ db 7,3,1,0,0,0,0,4, ;139
+ db 7,3,2,0,0,0,0,3, ;140
+ db 7,3,2,0,0,0,0,4, ;141
+ db 7,3,2,1,0,0,0,4, ;142
+ db 7,3,2,1,0,0,0,5, ;143
+ db 7,4,0,0,0,0,0,2, ;144
+ db 7,4,0,0,0,0,0,3, ;145
+ db 7,4,1,0,0,0,0,3, ;146
+ db 7,4,1,0,0,0,0,4, ;147
+ db 7,4,2,0,0,0,0,3, ;148
+ db 7,4,2,0,0,0,0,4, ;149
+ db 7,4,2,1,0,0,0,4, ;150
+ db 7,4,2,1,0,0,0,5, ;151
+ db 7,4,3,0,0,0,0,3, ;152
+ db 7,4,3,0,0,0,0,4, ;153
+ db 7,4,3,1,0,0,0,4, ;154
+ db 7,4,3,1,0,0,0,5, ;155
+ db 7,4,3,2,0,0,0,4, ;156
+ db 7,4,3,2,0,0,0,5, ;157
+ db 7,4,3,2,1,0,0,5, ;158
+ db 7,4,3,2,1,0,0,6, ;159
+ db 7,5,0,0,0,0,0,2, ;160
+ db 7,5,0,0,0,0,0,3, ;161
+ db 7,5,1,0,0,0,0,3, ;162
+ db 7,5,1,0,0,0,0,4, ;163
+ db 7,5,2,0,0,0,0,3, ;164
+ db 7,5,2,0,0,0,0,4, ;165
+ db 7,5,2,1,0,0,0,4, ;166
+ db 7,5,2,1,0,0,0,5, ;167
+ db 7,5,3,0,0,0,0,3, ;168
+ db 7,5,3,0,0,0,0,4, ;169
+ db 7,5,3,1,0,0,0,4, ;170
+ db 7,5,3,1,0,0,0,5, ;171
+ db 7,5,3,2,0,0,0,4, ;172
+ db 7,5,3,2,0,0,0,5, ;173
+ db 7,5,3,2,1,0,0,5, ;174
+ db 7,5,3,2,1,0,0,6, ;175
+ db 7,5,4,0,0,0,0,3, ;176
+ db 7,5,4,0,0,0,0,4, ;177
+ db 7,5,4,1,0,0,0,4, ;178
+ db 7,5,4,1,0,0,0,5, ;179
+ db 7,5,4,2,0,0,0,4, ;180
+ db 7,5,4,2,0,0,0,5, ;181
+ db 7,5,4,2,1,0,0,5, ;182
+ db 7,5,4,2,1,0,0,6, ;183
+ db 7,5,4,3,0,0,0,4, ;184
+ db 7,5,4,3,0,0,0,5, ;185
+ db 7,5,4,3,1,0,0,5, ;186
+ db 7,5,4,3,1,0,0,6, ;187
+ db 7,5,4,3,2,0,0,5, ;188
+ db 7,5,4,3,2,0,0,6, ;189
+ db 7,5,4,3,2,1,0,6, ;190
+ db 7,5,4,3,2,1,0,7, ;191
+ db 7,6,0,0,0,0,0,2, ;192
+ db 7,6,0,0,0,0,0,3, ;193
+ db 7,6,1,0,0,0,0,3, ;194
+ db 7,6,1,0,0,0,0,4, ;195
+ db 7,6,2,0,0,0,0,3, ;196
+ db 7,6,2,0,0,0,0,4, ;197
+ db 7,6,2,1,0,0,0,4, ;198
+ db 7,6,2,1,0,0,0,5, ;199
+ db 7,6,3,0,0,0,0,3, ;200
+ db 7,6,3,0,0,0,0,4, ;201
+ db 7,6,3,1,0,0,0,4, ;202
+ db 7,6,3,1,0,0,0,5, ;203
+ db 7,6,3,2,0,0,0,4, ;204
+ db 7,6,3,2,0,0,0,5, ;205
+ db 7,6,3,2,1,0,0,5, ;206
+ db 7,6,3,2,1,0,0,6, ;207
+ db 7,6,4,0,0,0,0,3, ;208
+ db 7,6,4,0,0,0,0,4, ;209
+ db 7,6,4,1,0,0,0,4, ;210
+ db 7,6,4,1,0,0,0,5, ;211
+ db 7,6,4,2,0,0,0,4, ;212
+ db 7,6,4,2,0,0,0,5, ;213
+ db 7,6,4,2,1,0,0,5, ;214
+ db 7,6,4,2,1,0,0,6, ;215
+ db 7,6,4,3,0,0,0,4, ;216
+ db 7,6,4,3,0,0,0,5, ;217
+ db 7,6,4,3,1,0,0,5, ;218
+ db 7,6,4,3,1,0,0,6, ;219
+ db 7,6,4,3,2,0,0,5, ;220
+ db 7,6,4,3,2,0,0,6, ;221
+ db 7,6,4,3,2,1,0,6, ;222
+ db 7,6,4,3,2,1,0,7, ;223
+ db 7,6,5,0,0,0,0,3, ;224
+ db 7,6,5,0,0,0,0,4, ;225
+ db 7,6,5,1,0,0,0,4, ;226
+ db 7,6,5,1,0,0,0,5, ;227
+ db 7,6,5,2,0,0,0,4, ;228
+ db 7,6,5,2,0,0,0,5, ;229
+ db 7,6,5,2,1,0,0,5, ;230
+ db 7,6,5,2,1,0,0,6, ;231
+ db 7,6,5,3,0,0,0,4, ;232
+ db 7,6,5,3,0,0,0,5, ;233
+ db 7,6,5,3,1,0,0,5, ;234
+ db 7,6,5,3,1,0,0,6, ;235
+ db 7,6,5,3,2,0,0,5, ;236
+ db 7,6,5,3,2,0,0,6, ;237
+ db 7,6,5,3,2,1,0,6, ;238
+ db 7,6,5,3,2,1,0,7, ;239
+ db 7,6,5,4,0,0,0,4, ;240
+ db 7,6,5,4,0,0,0,5, ;241
+ db 7,6,5,4,1,0,0,5, ;242
+ db 7,6,5,4,1,0,0,6, ;243
+ db 7,6,5,4,2,0,0,5, ;244
+ db 7,6,5,4,2,0,0,6, ;245
+ db 7,6,5,4,2,1,0,6, ;246
+ db 7,6,5,4,2,1,0,7, ;247
+ db 7,6,5,4,3,0,0,5, ;248
+ db 7,6,5,4,3,0,0,6, ;249
+ db 7,6,5,4,3,1,0,6, ;250
+ db 7,6,5,4,3,1,0,7, ;251
+ db 7,6,5,4,3,2,0,6, ;252
+ db 7,6,5,4,3,2,0,7, ;253
+ db 7,6,5,4,3,2,1,7, ;254
+ db 7,6,5,4,3,2,1,8, ;255
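Each 8-byte row of byte_1pos_table lists, for the index value, the positions of its set bits in descending order (zero-padded), with the number of set bits in the last byte; CavlcParamCal_sse2 below indexes it with the nonzero-coefficient byte mask produced by pmovmskb. The table can be regenerated with a short C program (a sketch, not part of the build):

    #include <stdio.h>

    int main(void) {
        for (int b = 0; b < 256; b++) {
            unsigned char e[8] = {0};
            int n = 0;
            for (int bit = 7; bit >= 0; bit--)   /* highest set bit first */
                if (b & (1 << bit))
                    e[n++] = (unsigned char)bit;
            e[7] = (unsigned char)n;             /* popcount in the last byte */
            printf("db %d,%d,%d,%d,%d,%d,%d,%d, ;%d\n",
                   e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], b);
        }
        return 0;
    }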
;***********************************************************************
; Code
@@ -323,43 +323,43 @@
;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;***********************************************************************
WELS_EXTERN CavlcParamCal_sse2
- push ebx
- push edi
- push esi
+ push ebx
+ push edi
+ push esi
- mov eax, [esp+16] ;coffLevel
- mov edi, [esp+24] ;Level
- mov ebx, [esp+32] ;endIdx
- cmp ebx, 3
- jne .Level16
- pxor xmm1, xmm1
- movq xmm0, [eax] ; removed QWORD
- jmp .Cal_begin
+ mov eax, [esp+16] ;coffLevel
+ mov edi, [esp+24] ;Level
+ mov ebx, [esp+32] ;endIdx
+ cmp ebx, 3
+ jne .Level16
+ pxor xmm1, xmm1
+ movq xmm0, [eax] ; removed QWORD
+ jmp .Cal_begin
.Level16:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax+16]
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax+16]
.Cal_begin:
- movdqa xmm2, xmm0
- packsswb xmm0, xmm1
- movdqa xmm4, xmm0
- pxor xmm3, xmm3
- pcmpgtb xmm0, xmm3
- pcmpgtb xmm3, xmm4
- por xmm0, xmm3
- pmovmskb edx, xmm0
- cmp edx, 0
- je near .return
- movdqa xmm6, [sse2_b_1]
- pcmpeqw xmm7, xmm7 ;generate -1
- mov ebx, 0xff
- ;pinsrw xmm6, ebx, 3
+ movdqa xmm2, xmm0
+ packsswb xmm0, xmm1
+ movdqa xmm4, xmm0
+ pxor xmm3, xmm3
+ pcmpgtb xmm0, xmm3
+ pcmpgtb xmm3, xmm4
+ por xmm0, xmm3
+ pmovmskb edx, xmm0
+ cmp edx, 0
+ je near .return
+ movdqa xmm6, [sse2_b_1]
+ pcmpeqw xmm7, xmm7 ;generate -1
+ mov ebx, 0xff
+ ;pinsrw xmm6, ebx, 3
mov bl, dh
- lea ebx, [byte_1pos_table+8*ebx]
- movq xmm0, [ebx]
- pextrw ecx, xmm0, 3
- shr ecx, 8
+ lea ebx, [byte_1pos_table+8*ebx]
+ movq xmm0, [ebx]
+ pextrw ecx, xmm0, 3
+ shr ecx, 8
mov dh, cl
.loopHighFind0:
@@ -367,19 +367,19 @@
je .loopHighFind0End
;mov esi, [ebx]
;and esi, 0xff
- movzx esi, byte [ebx]
+ movzx esi, byte [ebx]
add esi, 8
mov esi, [eax+2*esi]
mov [edi], si
add edi, 2
;add ebx, 1
- inc ebx
+ inc ebx
dec ecx
- jmp .loopHighFind0
+ jmp .loopHighFind0
.loopHighFind0End:
mov cl, dh
cmp cl, 8
- pand xmm0, xmm6
+ pand xmm0, xmm6
jne .LowByteFind0
sub edi, 2
mov esi, [eax+16]
@@ -387,8 +387,8 @@
add edi, 2
.LowByteFind0:
and edx, 0xff
- lea ebx, [byte_1pos_table+8*edx]
- movq xmm1, [ebx]
+ lea ebx, [byte_1pos_table+8*edx]
+ movq xmm1, [ebx]
pextrw esi, xmm1, 3
or esi, 0xff
or ecx, 0xff00
@@ -398,16 +398,16 @@
.loopLowFind0:
cmp esi, 0
je .loopLowFind0End
- ;mov edx, [ebx]
- ;and edx, 0xff
- movzx edx, byte [ebx]
- mov edx, [eax+2*edx]
- mov [edi], dx
- add edi, 2
- ;add ebx, 1
- inc ebx
+ ;mov edx, [ebx]
+ ;and edx, 0xff
+ movzx edx, byte [ebx]
+ mov edx, [eax+2*edx]
+ mov [edi], dx
+ add edi, 2
+ ;add ebx, 1
+ inc ebx
dec esi
- jmp .loopLowFind0
+ jmp .loopLowFind0
.loopLowFind0End:
cmp ch, 8
jne .getLevelEnd
@@ -415,12 +415,12 @@
mov edx, [eax]
mov [edi], dx
.getLevelEnd:
- mov edx, [esp+28] ;total_coeffs
+ mov edx, [esp+28] ;total_coeffs
;mov ebx, ecx
;and ebx, 0xff
- movzx ebx, byte cl
+ movzx ebx, byte cl
add cl, ch
- mov [edx], cl
+ mov [edx], cl
;getRun
movq xmm5, [sse2_b8]
paddb xmm0, xmm5
@@ -430,7 +430,7 @@
sub eax, ebx
shl eax, 3
shl ebx, 3
- pinsrw xmm2, ebx, 0
+ pinsrw xmm2, ebx, 0
pinsrw xmm3, eax, 0
psllq xmm0, xmm3
psrlq xmm0, xmm3
@@ -441,19 +441,19 @@
por xmm0, xmm1
pextrw eax, xmm0, 0
- and eax, 0xff
+ and eax, 0xff
inc eax
sub al, cl
- movdqa xmm1, xmm0
- paddb xmm1, xmm7
- psrldq xmm0, 1
- psubb xmm1, xmm0
+ movdqa xmm1, xmm0
+ paddb xmm1, xmm7
+ psrldq xmm0, 1
+ psubb xmm1, xmm0
mov ecx, [esp+20] ;run
- movdqa [ecx], xmm1
+ movdqa [ecx], xmm1
;getRunEnd
.return:
- pop esi
- pop edi
- pop ebx
- ret
+ pop esi
+ pop edi
+ pop ebx
+ ret
%endif
--- a/codec/encoder/core/x86/dct.asm
+++ b/codec/encoder/core/x86/dct.asm
@@ -50,17 +50,17 @@
align 16
SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
- dw 10, 13, 10, 13, 13, 16, 13, 16,
+ dw 10, 13, 10, 13, 13, 16, 13, 16,
dw 11, 14, 11, 14, 14, 18, 14, 18,
- dw 11, 14, 11, 14, 14, 18, 14, 18,
- dw 13, 16, 13, 16, 16, 20, 16, 20,
- dw 13, 16, 13, 16, 16, 20, 16, 20,
+ dw 11, 14, 11, 14, 14, 18, 14, 18,
+ dw 13, 16, 13, 16, 16, 20, 16, 20,
+ dw 13, 16, 13, 16, 16, 20, 16, 20,
dw 14, 18, 14, 18, 18, 23, 18, 23,
- dw 14, 18, 14, 18, 18, 23, 18, 23,
- dw 16, 20, 16, 20, 20, 25, 20, 25,
- dw 16, 20, 16, 20, 20, 25, 20, 25,
+ dw 14, 18, 14, 18, 18, 23, 18, 23,
+ dw 16, 20, 16, 20, 20, 25, 20, 25,
+ dw 16, 20, 16, 20, 20, 25, 20, 25,
dw 18, 23, 18, 23, 23, 29, 23, 29,
- dw 18, 23, 18, 23, 23, 29, 23, 29
+ dw 18, 23, 18, 23, 23, 29, 23, 29
;***********************************************************************
@@ -68,27 +68,27 @@
;***********************************************************************
%macro MMX_LoadDiff4P 5
- movd %1, [%3]
- movd %2, [%4]
- punpcklbw %1, %5
- punpcklbw %2, %5
- psubw %1, %2
+ movd %1, [%3]
+ movd %2, [%4]
+ punpcklbw %1, %5
+ punpcklbw %2, %5
+ psubw %1, %2
%endmacro
%macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
- MMX_LoadDiff4P %1, %9, %5, %7, %10
- MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
- lea %5, [%5+2*%6]
- lea %7, [%7+2*%8]
- MMX_LoadDiff4P %3, %9, %5, %7, %10
- MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
+ MMX_LoadDiff4P %1, %9, %5, %7, %10
+ MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
+ lea %5, [%5+2*%6]
+ lea %7, [%7+2*%8]
+ MMX_LoadDiff4P %3, %9, %5, %7, %10
+ MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
%endmacro
%macro MMX_SumSubMul2 3
- movq %3, %1
- psllw %1, $01
- paddw %1, %2
- psllw %2, $01
+ movq %3, %1
+ psllw %1, $01
+ paddw %1, %2
+ psllw %2, $01
psubw %3, %2
%endmacro
@@ -101,15 +101,15 @@
%endmacro
%macro MMX_SumSub 3
- movq %3, %2
+ movq %3, %2
psubw %2, %1
paddw %1, %3
%endmacro
%macro MMX_DCT 6
- MMX_SumSub %4, %1, %6
- MMX_SumSub %3, %2, %6
- MMX_SumSub %3, %4, %6
+ MMX_SumSub %4, %1, %6
+ MMX_SumSub %3, %2, %6
+ MMX_SumSub %3, %4, %6
MMX_SumSubMul2 %1, %2, %5
%endmacro
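MMX_DCT (and the SSE2_DCT counterpart further down) chain these sum/sub helpers into the H.264 4x4 integer core transform, applied first to rows and then, after a transpose, to columns. One 1-D pass corresponds to the following scalar butterfly; this is an illustrative sketch, not code from the tree:

    #include <stdint.h>

    /* One 4-point pass of the forward core transform:
       s0 = x0+x3, s3 = x0-x3, s1 = x1+x2, s2 = x1-x2;
       y0 = s0+s1, y2 = s0-s1, y1 = 2*s3+s2, y3 = s3-2*s2. */
    static void Dct4Pass(const int16_t x[4], int16_t y[4]) {
        const int16_t s0 = x[0] + x[3], s3 = x[0] - x[3];
        const int16_t s1 = x[1] + x[2], s2 = x[1] - x[2];
        y[0] = s0 + s1;
        y[2] = s0 - s1;
        y[1] = (int16_t)(2 * s3 + s2);
        y[3] = (int16_t)(s3 - 2 * s2);
    }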
@@ -116,8 +116,8 @@
%macro MMX_IDCT 6
MMX_SumSub %4, %5, %6
MMX_SumSubDiv2 %3, %2, %1
- MMX_SumSub %1, %4, %6
- MMX_SumSub %3, %5, %6
+ MMX_SumSub %1, %4, %6
+ MMX_SumSub %3, %5, %6
%endmacro
%macro MMX_StoreDiff4P 6
@@ -142,11 +142,11 @@
MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7
- MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
- MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2
+ MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
+ MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2
- MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
- MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5
+ MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
+ MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5
movq [r0+ 0], mm2
movq [r0+ 8], mm1
@@ -170,22 +170,22 @@
movq mm2, [r4+16]
movq mm3, [r4+24]
- MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
- MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
- MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
- MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
+ MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
+ MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
+ MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
+ MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
- WELS_Zero mm7
- WELS_DW32 mm6
+ WELS_Zero mm7
+ WELS_DW32 mm6
- MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2]
- MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
+ MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2]
+ MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
- MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2]
- MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
+ MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2]
+ MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
- WELSEMMS
+ WELSEMMS
LOAD_5_PARA_POP
ret
@@ -194,21 +194,21 @@
; SSE2 functions
;***********************************************************************
%macro SSE2_Store4x8p 6
- SSE2_XSawp qdq, %2, %3, %6
- SSE2_XSawp qdq, %4, %5, %3
- MOVDQ [%1+0x00], %2
- MOVDQ [%1+0x10], %4
- MOVDQ [%1+0x20], %6
- MOVDQ [%1+0x30], %3
+ SSE2_XSawp qdq, %2, %3, %6
+ SSE2_XSawp qdq, %4, %5, %3
+ MOVDQ [%1+0x00], %2
+ MOVDQ [%1+0x10], %4
+ MOVDQ [%1+0x20], %6
+ MOVDQ [%1+0x30], %3
%endmacro
%macro SSE2_Load4x8p 6
- MOVDQ %2, [%1+0x00]
- MOVDQ %4, [%1+0x10]
- MOVDQ %6, [%1+0x20]
- MOVDQ %3, [%1+0x30]
- SSE2_XSawp qdq, %4, %3, %5
- SSE2_XSawp qdq, %2, %6, %3
+ MOVDQ %2, [%1+0x00]
+ MOVDQ %4, [%1+0x10]
+ MOVDQ %6, [%1+0x20]
+ MOVDQ %3, [%1+0x30]
+ SSE2_XSawp qdq, %4, %3, %5
+ SSE2_XSawp qdq, %2, %6, %3
%endmacro
%macro SSE2_SumSubMul2 3
@@ -231,57 +231,57 @@
%macro SSE2_StoreDiff8p 6
paddw %1, %3
psraw %1, $06
- movq %2, %6
+ movq %2, %6
punpcklbw %2, %4
paddsw %2, %1
packuswb %2, %2
- movq %5, %2
+ movq %5, %2
%endmacro
%macro SSE2_StoreDiff8p 5
- movq %2, %5
+ movq %2, %5
punpcklbw %2, %3
paddsw %2, %1
packuswb %2, %2
- movq %4, %2
+ movq %4, %2
%endmacro
-%macro SSE2_Load8DC 6
- movdqa %1, %6 ; %1 = dc0 dc1
- paddw %1, %5
- psraw %1, $06 ; (dc + 32) >> 6
+%macro SSE2_Load8DC 6
+ movdqa %1, %6 ; %1 = dc0 dc1
+ paddw %1, %5
+ psraw %1, $06 ; (dc + 32) >> 6
- movdqa %2, %1
- psrldq %2, 4
- punpcklwd %2, %2
- punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklwd %2, %2
+ punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
- movdqa %3, %1
- psrldq %3, 8
- punpcklwd %3, %3
- punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
+ movdqa %3, %1
+ psrldq %3, 8
+ punpcklwd %3, %3
+ punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
- movdqa %4, %1
- psrldq %4, 12
- punpcklwd %4, %4
- punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
+ movdqa %4, %1
+ psrldq %4, 12
+ punpcklwd %4, %4
+ punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
- punpcklwd %1, %1
- punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
+ punpcklwd %1, %1
+ punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
%endmacro
%macro SSE2_DCT 6
- SSE2_SumSub %6, %3, %5
- SSE2_SumSub %1, %2, %5
- SSE2_SumSub %3, %2, %5
- SSE2_SumSubMul2 %6, %1, %4
+ SSE2_SumSub %6, %3, %5
+ SSE2_SumSub %1, %2, %5
+ SSE2_SumSub %3, %2, %5
+ SSE2_SumSubMul2 %6, %1, %4
%endmacro
%macro SSE2_IDCT 7
SSE2_SumSub %7, %2, %6
SSE2_SumSubDiv2 %1, %3, %5, %4
- SSE2_SumSub %2, %1, %5
- SSE2_SumSub %7, %4, %5
+ SSE2_SumSub %2, %1, %5
+ SSE2_SumSub %7, %4, %5
%endmacro
;***********************************************************************
@@ -294,42 +294,42 @@
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r4, r4d
pxor xmm7, xmm7
- ;Load 4x8
- SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3]
+ ;Load 4x8
+ SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
- lea r1, [r1 + 2 * r2]
- lea r3, [r3 + 2 * r4]
- SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
+ lea r1, [r1 + 2 * r2]
+ lea r3, [r3 + 2 * r4]
+ SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
- SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
- SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
- SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
- SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
+ SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
+ SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
+ SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
+ SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
- SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
+ SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
- lea r1, [r1 + 2 * r2]
- lea r3, [r3 + 2 * r4]
+ lea r1, [r1 + 2 * r2]
+ lea r3, [r3 + 2 * r4]
- ;Load 4x8
- SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ]
+ ;Load 4x8
+ SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ]
SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4]
- lea r1, [r1 + 2 * r2]
- lea r3, [r3 + 2 * r4]
+ lea r1, [r1 + 2 * r2]
+ lea r3, [r3 + 2 * r4]
SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
- SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
- SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
- SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
- SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
+ SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
+ SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
+ SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
+ SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
- lea r0, [r0+64]
- SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
+ lea r0, [r0+64]
+ SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
- POP_XMM
- LOAD_5_PARA_POP
+ POP_XMM
+ LOAD_5_PARA_POP
ret
@@ -337,59 +337,59 @@
; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
;***********************************************************************
WELS_EXTERN WelsIDctFourT4Rec_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- ;Load 4x8
- SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ ;Load 4x8
+ SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
- SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
- SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
- SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
- SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
+ SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
+ SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
+ SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
+ SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
- WELS_Zero xmm7
- WELS_DW32 xmm6
+ WELS_Zero xmm7
+ WELS_DW32 xmm6
- SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
- SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
- SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
+ SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
+ SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
+ SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
- add r4, 64
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
+ add r4, 64
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
- SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
- SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
+ SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
+ SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
- SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
+ SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
- WELS_Zero xmm7
- WELS_DW32 xmm6
+ WELS_Zero xmm7
+ WELS_DW32 xmm6
- SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
- SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
- SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
- POP_XMM
- LOAD_5_PARA_POP
- ; pop esi
- ; pop ebx
+ SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
+ SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
+ SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
+ POP_XMM
+ LOAD_5_PARA_POP
+ ; pop esi
+ ; pop ebx
ret
%macro SSE2_StoreDiff4x8p 8
- SSE2_StoreDiff8p %1, %3, %4, [%5], [%6]
- SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]
- SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8]
- SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
+ SSE2_StoreDiff8p %1, %3, %4, [%5], [%6]
+ SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]
+ SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8]
+ SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
%endmacro
;***********************************************************************
@@ -396,76 +396,76 @@
; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
;***********************************************************************
WELS_EXTERN WelsIDctRecI16x16Dc_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm7, xmm7
- WELS_DW32 xmm6
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm7, xmm7
+ WELS_DW32 xmm6
- SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+ SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
- SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+ SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
- POP_XMM
- LOAD_5_PARA_POP
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+ POP_XMM
+ LOAD_5_PARA_POP
ret
%macro SSE2_SumSubD 3
- movdqa %3, %2
+ movdqa %3, %2
paddd %2, %1
psubd %1, %3
%endmacro
%macro SSE2_SumSubDiv2D 4
- paddd %1, %2
- paddd %1, %3
- psrad %1, 1
- movdqa %4, %1
- psubd %4, %2
+ paddd %1, %2
+ paddd %1, %3
+ psrad %1, 1
+ movdqa %4, %1
+ psubd %4, %2
%endmacro
-%macro SSE2_Load4Col 5
- movsx r2, WORD[%5]
- movd %1, r2d
- movsx r2, WORD[%5 + 0x20]
- movd %2, r2d
- punpckldq %1, %2
- movsx r2, WORD[%5 + 0x80]
- movd %3, r2d
- movsx r2, WORD[%5 + 0xa0]
- movd %4, r2d
- punpckldq %3, %4
- punpcklqdq %1, %3
+%macro SSE2_Load4Col 5
+ movsx r2, WORD[%5]
+ movd %1, r2d
+ movsx r2, WORD[%5 + 0x20]
+ movd %2, r2d
+ punpckldq %1, %2
+ movsx r2, WORD[%5 + 0x80]
+ movd %3, r2d
+ movsx r2, WORD[%5 + 0xa0]
+ movd %4, r2d
+ punpckldq %3, %4
+ punpcklqdq %1, %3
%endmacro
;***********************************************************************
@@ -472,33 +472,33 @@
;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
;***********************************************************************
WELS_EXTERN WelsHadamardT4Dc_sse2
- %assign push_num 0
- LOAD_2_PARA
- PUSH_XMM 8
- SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
- SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
- SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
- SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, r1 + 0x140
+ %assign push_num 0
+ LOAD_2_PARA
+ PUSH_XMM 8
+ SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
+ SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
+ SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
+ SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, r1 + 0x140
- SSE2_SumSubD xmm1, xmm2, xmm7
- SSE2_SumSubD xmm3, xmm4, xmm7
- SSE2_SumSubD xmm2, xmm4, xmm7
- SSE2_SumSubD xmm1, xmm3, xmm7
+ SSE2_SumSubD xmm1, xmm2, xmm7
+ SSE2_SumSubD xmm3, xmm4, xmm7
+ SSE2_SumSubD xmm2, xmm4, xmm7
+ SSE2_SumSubD xmm1, xmm3, xmm7
- SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1
+ SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1
- SSE2_SumSubD xmm4, xmm3, xmm7
- SSE2_SumSubD xmm5, xmm1, xmm7
+ SSE2_SumSubD xmm4, xmm3, xmm7
+ SSE2_SumSubD xmm5, xmm1, xmm7
- WELS_DD1 xmm6
- SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
- SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
- SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1
+ WELS_DD1 xmm6
+ SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
+ SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
+ SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1
- packssdw xmm3, xmm4
- packssdw xmm2, xmm1
- movdqa [r0+ 0], xmm3
- movdqa [r0+16], xmm2
+ packssdw xmm3, xmm4
+ packssdw xmm2, xmm1
+ movdqa [r0+ 0], xmm3
+ movdqa [r0+16], xmm2
- POP_XMM
- ret
+ POP_XMM
+ ret
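
WelsHadamardT4Dc_sse2 gathers the sixteen luma DC terms (SSE2_Load4Col steps through the coefficient buffer in 0x20/0x80 strides, i.e. one 16-coefficient 4x4 block per step), runs a 4x4 Hadamard butterfly in both directions with the (x + 1) >> 1 normalisation from SSE2_SumSubDiv2D, and packs the result to int16. A scalar sketch of the transform only; the gather is omitted, the input is assumed to already hold the 4x4 DC matrix, and the output ordering here is the plain butterfly order rather than the lane order produced by the SIMD transposes above:

#include <stdint.h>

/* Scalar sketch of a 4x4 Hadamard on the luma DC terms with (x+1)>>1 scaling. */
void HadamardT4Dc_ref(int16_t luma_dc[16], const int32_t in[16]) {
    int32_t t[16];
    for (int i = 0; i < 4; i++) {              /* horizontal butterflies */
        int32_t s01 = in[i*4+0] + in[i*4+1], d01 = in[i*4+0] - in[i*4+1];
        int32_t s23 = in[i*4+2] + in[i*4+3], d23 = in[i*4+2] - in[i*4+3];
        t[i*4+0] = s01 + s23; t[i*4+1] = d01 + d23;
        t[i*4+2] = s01 - s23; t[i*4+3] = d01 - d23;
    }
    for (int j = 0; j < 4; j++) {              /* vertical butterflies + rounding */
        int32_t s01 = t[0*4+j] + t[1*4+j], d01 = t[0*4+j] - t[1*4+j];
        int32_t s23 = t[2*4+j] + t[3*4+j], d23 = t[2*4+j] - t[3*4+j];
        luma_dc[0*4+j] = (int16_t)((s01 + s23 + 1) >> 1);
        luma_dc[1*4+j] = (int16_t)((d01 + d23 + 1) >> 1);
        luma_dc[2*4+j] = (int16_t)((s01 - s23 + 1) >> 1);
        luma_dc[3*4+j] = (int16_t)((d01 - d23 + 1) >> 1);
    }
}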
--- a/codec/encoder/core/x86/intra_pred.asm
+++ b/codec/encoder/core/x86/intra_pred.asm
@@ -61,7 +61,7 @@
sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4
align 16
-mmx_01bytes: times 16 db 1
+mmx_01bytes: times 16 db 1
align 16
mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
@@ -73,106 +73,106 @@
;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
;%1 will keep the last result
%macro SSE_DB_1_2REG 2
- pxor %1, %1
- pcmpeqw %2, %2
- psubb %1, %2
+ pxor %1, %1
+ pcmpeqw %2, %2
+ psubb %1, %2
%endmacro
;xmm0, xmm1, xmm2, eax, ecx
;lower 64 bits of xmm0 save the result
%macro SSE2_PRED_H_4X4_TWO_LINE 5
- movd %1, [%4-1]
- movdqa %3, %1
- punpcklbw %1, %3
- movdqa %3, %1
- punpcklbw %1, %3
+ movd %1, [%4-1]
+ movdqa %3, %1
+ punpcklbw %1, %3
+ movdqa %3, %1
+ punpcklbw %1, %3
- ;add %4, %5
- movd %2, [%4+%5-1]
- movdqa %3, %2
- punpcklbw %2, %3
- movdqa %3, %2
- punpcklbw %2, %3
- punpckldq %1, %2
+ ;add %4, %5
+ movd %2, [%4+%5-1]
+ movdqa %3, %2
+ punpcklbw %2, %3
+ movdqa %3, %2
+ punpcklbw %2, %3
+ punpckldq %1, %2
%endmacro
%macro SUMW_HORIZON1 2
- movdqa %2, %1
- psrldq %2, 8
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 4
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 2
- paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 8
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 4
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 2
+ paddusw %1, %2
%endmacro
%macro LOAD_COLUMN 6
- movd %1, [%5]
- movd %2, [%5+%6]
- punpcklbw %1, %2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- punpcklbw %3, %2
- punpcklwd %1, %3
- lea %5, [%5+2*%6]
- movd %4, [%5]
- movd %2, [%5+%6]
- punpcklbw %4, %2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- lea %5, [%5+2*%6]
- punpcklbw %3, %2
- punpcklwd %4, %3
- punpckhdq %1, %4
+ movd %1, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %1, %2
+ lea %5, [%5+2*%6]
+ movd %3, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %3, %2
+ punpcklwd %1, %3
+ lea %5, [%5+2*%6]
+ movd %4, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %4, %2
+ lea %5, [%5+2*%6]
+ movd %3, [%5]
+ movd %2, [%5+%6]
+ lea %5, [%5+2*%6]
+ punpcklbw %3, %2
+ punpcklwd %4, %3
+ punpckhdq %1, %4
%endmacro
%macro SUMW_HORIZON 3
- movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
- paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
- paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
- pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
- paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
+ movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
+ paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
+ pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
+ paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
%endmacro
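
SUMW_HORIZON folds the eight 16-bit lanes of an XMM register down to a single 32-bit total (the lane comments track the partial sums d04, d15, ...), which the callers then read back with movd. In scalar terms the reduction is simply:

#include <stdint.h>

/* Horizontal sum of eight 16-bit lanes, as done by SUMW_HORIZON. */
static int32_t sumw_horizon_ref(const int16_t d[8]) {
    int32_t sum = 0;
    for (int i = 0; i < 8; i++)
        sum += d[i];
    return sum;   /* the SIMD version leaves this in the low dword */
}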
%macro COPY_16_TIMES 2
- movdqa %2, [%1-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
+ movdqa %2, [%1-16]
+ psrldq %2, 15
+ pmuludq %2, [mmx_01bytes]
+ pshufd %2, %2, 0
%endmacro
%macro COPY_16_TIMESS 3
- movdqa %2, [%1+%3-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
+ movdqa %2, [%1+%3-16]
+ psrldq %2, 15
+ pmuludq %2, [mmx_01bytes]
+ pshufd %2, %2, 0
%endmacro
%macro LOAD_COLUMN_C 6
- movd %1, [%5]
- movd %2, [%5+%6]
- punpcklbw %1,%2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- punpcklbw %3, %2
- punpckhwd %1, %3
- lea %5, [%5+2*%6]
+ movd %1, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %1,%2
+ lea %5, [%5+2*%6]
+ movd %3, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %3, %2
+ punpckhwd %1, %3
+ lea %5, [%5+2*%6]
%endmacro
%macro LOAD_2_LEFT_AND_ADD 0
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01]
- add r3, r4
- movzx r4, byte [r1+r2-0x01]
- add r3, r4
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01]
+ add r3, r4
+ movzx r4, byte [r1+r2-0x01]
+ add r3, r4
%endmacro
;***********************************************************************
@@ -184,127 +184,127 @@
;***********************************************************************
; void WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
-; pred must align to 16
+; pred must align to 16
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredH_sse2
- push r3
- %assign push_num 1
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- movzx r3, byte [r1-1]
- movd xmm0, r3d
- pmuludq xmm0, [mmx_01bytes]
+ push r3
+ %assign push_num 1
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ movzx r3, byte [r1-1]
+ movd xmm0, r3d
+ pmuludq xmm0, [mmx_01bytes]
- movzx r3, byte [r1+r2-1]
- movd xmm1, r3d
- pmuludq xmm1, [mmx_01bytes]
+ movzx r3, byte [r1+r2-1]
+ movd xmm1, r3d
+ pmuludq xmm1, [mmx_01bytes]
- unpcklps xmm0, xmm1
+ unpcklps xmm0, xmm1
- lea r1, [r1+r2*2]
- movzx r3, byte [r1-1]
- movd xmm2, r3d
- pmuludq xmm2, [mmx_01bytes]
+ lea r1, [r1+r2*2]
+ movzx r3, byte [r1-1]
+ movd xmm2, r3d
+ pmuludq xmm2, [mmx_01bytes]
- movzx r3, byte [r1+r2-1]
- movd xmm3, r3d
- pmuludq xmm3, [mmx_01bytes]
+ movzx r3, byte [r1+r2-1]
+ movd xmm3, r3d
+ pmuludq xmm3, [mmx_01bytes]
- unpcklps xmm2, xmm3
- unpcklpd xmm0, xmm2
+ unpcklps xmm2, xmm3
+ unpcklpd xmm0, xmm2
- movdqa [r0], xmm0
- pop r3
- ret
+ movdqa [r0], xmm0
+ pop r3
+ ret
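
The horizontal 4x4 predictor replicates each left-neighbour byte across its row: pmuludq against mmx_01bytes multiplies the byte by 0x01010101 (the byte-broadcast trick), and unpcklps/unpcklpd pack the four rows into one aligned 16-byte store, so pred here is a packed 4x4 buffer with stride 4. A scalar sketch under that layout assumption (the name is illustrative):

#include <stdint.h>

/* Horizontal 4x4 intra prediction: row y is filled with the pixel to the
 * left of that row. pred is assumed to be a packed 4x4 block (stride 4). */
void I4x4LumaPredH_ref(uint8_t *pred, const uint8_t *ref, int32_t stride) {
    for (int y = 0; y < 4; y++) {
        uint8_t left = ref[y * stride - 1];
        for (int x = 0; x < 4; x++)
            pred[y * 4 + x] = left;
    }
}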
;***********************************************************************
; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredPlane_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r2, r2d
- sub r1, 1
- sub r1, r2
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r2, r2d
+ sub r1, 1
+ sub r1, r2
- ;for H
- pxor xmm7, xmm7
- movq xmm0, [r1]
- movdqa xmm5, [sse2_plane_dec]
- punpcklbw xmm0, xmm7
- pmullw xmm0, xmm5
- movq xmm1, [r1 + 9]
- movdqa xmm6, [sse2_plane_inc]
- punpcklbw xmm1, xmm7
- pmullw xmm1, xmm6
- psubw xmm1, xmm0
+ ;for H
+ pxor xmm7, xmm7
+ movq xmm0, [r1]
+ movdqa xmm5, [sse2_plane_dec]
+ punpcklbw xmm0, xmm7
+ pmullw xmm0, xmm5
+ movq xmm1, [r1 + 9]
+ movdqa xmm6, [sse2_plane_inc]
+ punpcklbw xmm1, xmm7
+ pmullw xmm1, xmm6
+ psubw xmm1, xmm0
- SUMW_HORIZON xmm1,xmm0,xmm2
- movd r3d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
- movsx r3, r3w
- imul r3, 5
- add r3, 32
- sar r3, 6 ; b = (5 * H + 32) >> 6;
- SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b
+ SUMW_HORIZON xmm1,xmm0,xmm2
+ movd r3d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
+ movsx r3, r3w
+ imul r3, 5
+ add r3, 32
+ sar r3, 6 ; b = (5 * H + 32) >> 6;
+ SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b
- movzx r4, BYTE [r1+16]
- sub r1, 3
- LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r1, r2
+ movzx r4, BYTE [r1+16]
+ sub r1, 3
+ LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r1, r2
- add r1, 3
- movzx r3, BYTE [r1+8*r2]
- add r4, r3
- shl r4, 4 ; a = (left[15*stride] + top[15]) << 4;
+ add r1, 3
+ movzx r3, BYTE [r1+8*r2]
+ add r4, r3
+ shl r4, 4 ; a = (left[15*stride] + top[15]) << 4;
- sub r1, 3
- add r1, r2
- LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r1, r2
- pxor xmm4, xmm4
- punpckhbw xmm0, xmm4
- pmullw xmm0, xmm5
- punpckhbw xmm7, xmm4
- pmullw xmm7, xmm6
- psubw xmm7, xmm0
+ sub r1, 3
+ add r1, r2
+ LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r1, r2
+ pxor xmm4, xmm4
+ punpckhbw xmm0, xmm4
+ pmullw xmm0, xmm5
+ punpckhbw xmm7, xmm4
+ pmullw xmm7, xmm6
+ psubw xmm7, xmm0
- SUMW_HORIZON xmm7,xmm0,xmm2
- movd r3d, xmm7 ; V
- movsx r3, r3w
- imul r3, 5
- add r3, 32
- sar r3, 6 ; c = (5 * V + 32) >> 6;
- SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c
+ SUMW_HORIZON xmm7,xmm0,xmm2
+ movd r3d, xmm7 ; V
+ movsx r3, r3w
+ imul r3, 5
+ add r3, 32
+ sar r3, 6 ; c = (5 * V + 32) >> 6;
+ SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c
- add r4, 16
- imul r3, -7
- add r3, r4 ; s = a + 16 + (-7)*c
- SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
+ add r4, 16
+ imul r3, -7
+ add r3, r4 ; s = a + 16 + (-7)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
- xor r3, r3
- movdqa xmm5, [sse2_plane_inc_minus]
+ xor r3, r3
+ movdqa xmm5, [sse2_plane_inc_minus]
get_i16x16_luma_pred_plane_sse2_1:
- movdqa xmm2, xmm1
- pmullw xmm2, xmm5
- paddw xmm2, xmm0
- psraw xmm2, 5
- movdqa xmm3, xmm1
- pmullw xmm3, xmm6
- paddw xmm3, xmm0
- psraw xmm3, 5
- packuswb xmm2, xmm3
- movdqa [r0], xmm2
- paddw xmm0, xmm4
- add r0, 16
- inc r3
- cmp r3, 16
- jnz get_i16x16_luma_pred_plane_sse2_1
- POP_XMM
- pop r4
- pop r3
- ret
+ movdqa xmm2, xmm1
+ pmullw xmm2, xmm5
+ paddw xmm2, xmm0
+ psraw xmm2, 5
+ movdqa xmm3, xmm1
+ pmullw xmm3, xmm6
+ paddw xmm3, xmm0
+ psraw xmm3, 5
+ packuswb xmm2, xmm3
+ movdqa [r0], xmm2
+ paddw xmm0, xmm4
+ add r0, 16
+ inc r3
+ cmp r3, 16
+ jnz get_i16x16_luma_pred_plane_sse2_1
+ POP_XMM
+ pop r4
+ pop r3
+ ret
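
The 16x16 plane predictor follows the usual H.264 plane mode: an (i + 1)-weighted horizontal gradient H from the top row and a vertical gradient V from the left column give b = (5*H + 32) >> 6 and c = (5*V + 32) >> 6, the anchor is a = 16 * (top[15] + left[15*stride]), and the loop above walks one output row per iteration starting from s = a + 16 - 7*c and adding c each time. A scalar sketch, assuming a packed 16x16 pred buffer; the clip helper and names are illustrative:

#include <stdint.h>

static uint8_t clip255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

/* 16x16 plane prediction; pred is a packed 16x16 buffer (stride 16),
 * ref points at the current block with `stride` giving the picture pitch. */
void I16x16LumaPredPlane_ref(uint8_t *pred, const uint8_t *ref, int32_t stride) {
    const uint8_t *top = ref - stride, *left = ref - 1;
    int H = 0, V = 0;
    for (int i = 0; i < 8; i++) {
        H += (i + 1) * (top[8 + i] - top[6 - i]);
        V += (i + 1) * (left[(8 + i) * stride] - left[(6 - i) * stride]);
    }
    int a = 16 * (top[15] + left[15 * stride]);
    int b = (5 * H + 32) >> 6;
    int c = (5 * V + 32) >> 6;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            pred[y * 16 + x] = clip255((a + b * (x - 7) + c * (y - 7) + 16) >> 5);
}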
;***********************************************************************
; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
@@ -311,38 +311,38 @@
;***********************************************************************
%macro SSE2_PRED_H_16X16_ONE_LINE 0
- add r0, 16
- add r1, r2
- movzx r3, byte [r1]
- SSE2_Copy16Times xmm0, r3d
- movdqa [r0], xmm0
+ add r0, 16
+ add r1, r2
+ movzx r3, byte [r1]
+ SSE2_Copy16Times xmm0, r3d
+ movdqa [r0], xmm0
%endmacro
WELS_EXTERN WelsI16x16LumaPredH_sse2
- push r3
- %assign push_num 1
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- dec r1
- movzx r3, byte [r1]
- SSE2_Copy16Times xmm0, r3d
- movdqa [r0], xmm0
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- pop r3
+ push r3
+ %assign push_num 1
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ dec r1
+ movzx r3, byte [r1]
+ SSE2_Copy16Times xmm0, r3d
+ movdqa [r0], xmm0
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ pop r3
ret
;***********************************************************************
@@ -378,289 +378,289 @@
; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
WELS_EXTERN WelsIChromaPredPlane_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r2, r2d
- sub r1, 1
- sub r1, r2
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r2, r2d
+ sub r1, 1
+ sub r1, r2
- pxor mm7, mm7
- movq mm0, [r1]
- movq mm5, [sse2_plane_dec_c]
- punpcklbw mm0, mm7
- pmullw mm0, mm5
- movq mm1, [r1 + 5]
- movq mm6, [sse2_plane_inc_c]
- punpcklbw mm1, mm7
- pmullw mm1, mm6
- psubw mm1, mm0
+ pxor mm7, mm7
+ movq mm0, [r1]
+ movq mm5, [sse2_plane_dec_c]
+ punpcklbw mm0, mm7
+ pmullw mm0, mm5
+ movq mm1, [r1 + 5]
+ movq mm6, [sse2_plane_inc_c]
+ punpcklbw mm1, mm7
+ pmullw mm1, mm6
+ psubw mm1, mm0
- movq2dq xmm1, mm1
- pxor xmm2, xmm2
- SUMW_HORIZON xmm1,xmm0,xmm2
- movd r3d, xmm1
- movsx r3, r3w
- imul r3, 17
- add r3, 16
- sar r3, 5 ; b = (17 * H + 16) >> 5;
- SSE2_Copy8Times xmm1, r3d ; mm1 = b,b,b,b,b,b,b,b
+ movq2dq xmm1, mm1
+ pxor xmm2, xmm2
+ SUMW_HORIZON xmm1,xmm0,xmm2
+ movd r3d, xmm1
+ movsx r3, r3w
+ imul r3, 17
+ add r3, 16
+ sar r3, 5 ; b = (17 * H + 16) >> 5;
+ SSE2_Copy8Times xmm1, r3d ; mm1 = b,b,b,b,b,b,b,b
- movzx r3, BYTE [r1+8]
- sub r1, 3
- LOAD_COLUMN_C mm0, mm2, mm3, mm4, r1, r2
+ movzx r3, BYTE [r1+8]
+ sub r1, 3
+ LOAD_COLUMN_C mm0, mm2, mm3, mm4, r1, r2
- add r1, 3
- movzx r4, BYTE [r1+4*r2]
- add r4, r3
- shl r4, 4 ; a = (left[7*stride] + top[7]) << 4;
+ add r1, 3
+ movzx r4, BYTE [r1+4*r2]
+ add r4, r3
+ shl r4, 4 ; a = (left[7*stride] + top[7]) << 4;
- sub r1, 3
- add r1, r2
- LOAD_COLUMN_C mm7, mm2, mm3, mm4, r1, r2
- pxor mm4, mm4
- punpckhbw mm0, mm4
- pmullw mm0, mm5
- punpckhbw mm7, mm4
- pmullw mm7, mm6
- psubw mm7, mm0
+ sub r1, 3
+ add r1, r2
+ LOAD_COLUMN_C mm7, mm2, mm3, mm4, r1, r2
+ pxor mm4, mm4
+ punpckhbw mm0, mm4
+ pmullw mm0, mm5
+ punpckhbw mm7, mm4
+ pmullw mm7, mm6
+ psubw mm7, mm0
- movq2dq xmm7, mm7
- pxor xmm2, xmm2
- SUMW_HORIZON xmm7,xmm0,xmm2
- movd r3d, xmm7 ; V
- movsx r3, r3w
- imul r3, 17
- add r3, 16
- sar r3, 5 ; c = (17 * V + 16) >> 5;
- SSE2_Copy8Times xmm4, r3d ; mm4 = c,c,c,c,c,c,c,c
+ movq2dq xmm7, mm7
+ pxor xmm2, xmm2
+ SUMW_HORIZON xmm7,xmm0,xmm2
+ movd r3d, xmm7 ; V
+ movsx r3, r3w
+ imul r3, 17
+ add r3, 16
+ sar r3, 5 ; c = (17 * V + 16) >> 5;
+ SSE2_Copy8Times xmm4, r3d ; mm4 = c,c,c,c,c,c,c,c
- add r4, 16
- imul r3, -3
- add r3, r4 ; s = a + 16 + (-3)*c
- SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
+ add r4, 16
+ imul r3, -3
+ add r3, r4 ; s = a + 16 + (-3)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
- xor r3, r3
- movdqa xmm5, [sse2_plane_mul_b_c]
+ xor r3, r3
+ movdqa xmm5, [sse2_plane_mul_b_c]
get_i_chroma_pred_plane_sse2_1:
- movdqa xmm2, xmm1
- pmullw xmm2, xmm5
- paddw xmm2, xmm0
- psraw xmm2, 5
- packuswb xmm2, xmm2
- movq [r0], xmm2
- paddw xmm0, xmm4
- add r0, 8
- inc r3
- cmp r3, 8
- jnz get_i_chroma_pred_plane_sse2_1
- POP_XMM
- pop r4
- pop r3
- WELSEMMS
- ret
+ movdqa xmm2, xmm1
+ pmullw xmm2, xmm5
+ paddw xmm2, xmm0
+ psraw xmm2, 5
+ packuswb xmm2, xmm2
+ movq [r0], xmm2
+ paddw xmm0, xmm4
+ add r0, 8
+ inc r3
+ cmp r3, 8
+ jnz get_i_chroma_pred_plane_sse2_1
+ POP_XMM
+ pop r4
+ pop r3
+ WELSEMMS
+ ret
;***********************************************************************
-; 0 |1 |2 |3 |4 |
-; 6 |7 |8 |9 |10|
-; 11|12|13|14|15|
-; 16|17|18|19|20|
-; 21|22|23|24|25|
-; 7 is the start pixel of current 4x4 block
-; pred[7] = ([6]+[0]*2+[1]+2)/4
+; 0 |1 |2 |3 |4 |
+; 6 |7 |8 |9 |10|
+; 11|12|13|14|15|
+; 16|17|18|19|20|
+; 21|22|23|24|25|
+; 7 is the start pixel of current 4x4 block
+; pred[7] = ([6]+[0]*2+[1]+2)/4
;
; void WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDR_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- movq mm1,[r1+r2-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
- movq mm2,[r1-8] ;get value of 6 mm2[8] = 6
- sub r1, r2 ;mov eax to above line of current block(postion of 1)
- punpckhbw mm2,[r1-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
- movd mm3,[r1] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
- punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
- psllq mm3,18h ;mm3[5]=[1]
- psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
- movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- lea r1,[r1+r2*2-8h] ;set eax point to 12
- movq mm4,[r1+r2] ;get value of 16, mm4[8]=[16]
- psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
- psrlq mm4,38h ;mm4[1]=[16]
- por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
- movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
- movq mm4,[r1+r2*2] ;mm4[8]=[21]
- psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
- psrlq mm4,38h ;mm4[1]=[21]
- por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
- movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
- pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
- pxor mm1,mm4 ;find odd value in the lowest bit of each byte
- pand mm1,[mmx_01bytes] ;set the odd bit
- psubusb mm3,mm1 ;decrease 1 from odd bytes
- pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+    movq mm1,[r1+r2-8]          ;get value of 11; the -8 offset is intended to speed up the movq, mm1[8] = 11
+ movq mm2,[r1-8] ;get value of 6 mm2[8] = 6
+    sub r1, r2                          ;move r1 to the line above the current block (position of 1)
+ punpckhbw mm2,[r1-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+ movd mm3,[r1] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+ punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
+ psllq mm3,18h ;mm3[5]=[1]
+ psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+ por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
+ movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+ lea r1,[r1+r2*2-8h] ;set eax point to 12
+ movq mm4,[r1+r2] ;get value of 16, mm4[8]=[16]
+ psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
+ psrlq mm4,38h ;mm4[1]=[16]
+ por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
+ movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
+ movq mm4,[r1+r2*2] ;mm4[8]=[21]
+ psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
+ psrlq mm4,38h ;mm4[1]=[21]
+ por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
+ movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
+ pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
+ pxor mm1,mm4 ;find odd value in the lowest bit of each byte
+ pand mm1,[mmx_01bytes] ;set the odd bit
+ psubusb mm3,mm1 ;decrease 1 from odd bytes
+ pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
- movd [r0+12],mm2
- psrlq mm2,8
- movd [r0+8],mm2
- psrlq mm2,8
- movd [r0+4],mm2
- psrlq mm2,8
- movd [r0],mm2
- WELSEMMS
- ret
+ movd [r0+12],mm2
+ psrlq mm2,8
+ movd [r0+8],mm2
+ psrlq mm2,8
+ movd [r0+4],mm2
+ psrlq mm2,8
+ movd [r0],mm2
+ WELSEMMS
+ ret
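
The interesting part of the DDR routine (and of several predictors below) is how the 3-tap filter from the comment block, pred = (left + 2*centre + right + 2) >> 2, is built from byte averages: pavgb of the two outer pixels rounds up, the pxor/pand mmx_01bytes/psubusb triple knocks that rounding back off whenever the outer pair sums to an odd value, and a second pavgb against the centre pixel restores the +2 rounding. A scalar sketch of that trick:

#include <stdint.h>

/* (l + 2*c + r + 2) >> 2 computed the way the MMX code does it with pavgb:
 * pavgb rounds up, so the first average is corrected downwards when l + r
 * is odd, and the second pavgb supplies the final rounding. */
static uint8_t filter3_pavgb(uint8_t l, uint8_t c, uint8_t r) {
    uint8_t avg_lr = (uint8_t)((l + r + 1) >> 1);   /* pavgb l, r */
    avg_lr -= (l ^ r) & 1;                          /* undo the round-up on odd sums */
    return (uint8_t)((avg_lr + c + 1) >> 1);        /* pavgb with the centre pixel */
}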
;***********************************************************************
-; 0 |1 |2 |3 |4 |
-; 5 |6 |7 |8 |9 |
-; 10|11|12|13|14|
-; 15|16|17|18|19|
-; 20|21|22|23|24|
-; 6 is the start pixel of current 4x4 block
-; pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
+; 0 |1 |2 |3 |4 |
+; 5 |6 |7 |8 |9 |
+; 10|11|12|13|14|
+; 15|16|17|18|19|
+; 20|21|22|23|24|
+; 6 is the start pixel of current 4x4 block
+; pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
;
; void WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDc_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- movzx r4, byte [r1-1h]
- sub r1, r2
- movd xmm0, [r1]
- pxor xmm1, xmm1
- psadbw xmm0, xmm1
- xor r3, r3
- movd r3d, xmm0
- add r3, r4
- movzx r4, byte [r1+r2*2-1h]
- add r3, r4
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ movzx r4, byte [r1-1h]
+ sub r1, r2
+ movd xmm0, [r1]
+ pxor xmm1, xmm1
+ psadbw xmm0, xmm1
+ xor r3, r3
+ movd r3d, xmm0
+ add r3, r4
+ movzx r4, byte [r1+r2*2-1h]
+ add r3, r4
- lea r1, [r1+r2*2-1]
- movzx r4, byte [r1+r2]
- add r3, r4
+ lea r1, [r1+r2*2-1]
+ movzx r4, byte [r1+r2]
+ add r3, r4
- movzx r4, byte [r1+r2*2]
- add r3, r4
- add r3, 4
- sar r3, 3
- imul r3, 0x01010101
+ movzx r4, byte [r1+r2*2]
+ add r3, r4
+ add r3, 4
+ sar r3, 3
+ imul r3, 0x01010101
- movd xmm0, r3d
- pshufd xmm0, xmm0, 0
- movdqa [r0], xmm0
- pop r4
- pop r3
- ret
+ movd xmm0, r3d
+ pshufd xmm0, xmm0, 0
+ movdqa [r0], xmm0
+ pop r4
+ pop r3
+ ret
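
DC mode for a 4x4 block: psadbw against zero sums the four top neighbours in one instruction, the four left neighbours are accumulated with movzx/add, and the rounded average (sum + 4) >> 3 is broadcast to all sixteen output bytes via imul 0x01010101 plus pshufd. A scalar sketch, assuming a packed 4x4 output and that both neighbour edges are available:

#include <stdint.h>

/* 4x4 DC prediction using both top and left neighbours. */
void I4x4LumaPredDc_ref(uint8_t *pred, const uint8_t *ref, int32_t stride) {
    int sum = 4;                                   /* rounding term */
    for (int i = 0; i < 4; i++) {
        sum += ref[-stride + i];                   /* top row */
        sum += ref[i * stride - 1];                /* left column */
    }
    uint8_t dc = (uint8_t)(sum >> 3);
    for (int i = 0; i < 16; i++)
        pred[i] = dc;                              /* packed 4x4 output, stride 4 */
}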
;***********************************************************************
-; void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
+; void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
; copy 8 pixels of 8 lines from the left
;***********************************************************************
%macro MMX_PRED_H_8X8_ONE_LINE 4
- movq %1, [%3-8]
- psrlq %1, 38h
+ movq %1, [%3-8]
+ psrlq %1, 38h
- ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
- pmullw %1, [mmx_01bytes]
- pshufw %1, %1, 0
- movq [%4], %1
+ ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
+ pmullw %1, [mmx_01bytes]
+ pshufw %1, %1, 0
+ movq [%4], %1
%endmacro
%macro MMX_PRED_H_8X8_ONE_LINEE 4
- movq %1, [%3+r2-8]
- psrlq %1, 38h
+ movq %1, [%3+r2-8]
+ psrlq %1, 38h
- ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
- pmullw %1, [mmx_01bytes]
- pshufw %1, %1, 0
- movq [%4], %1
+ ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
+ pmullw %1, [mmx_01bytes]
+ pshufw %1, %1, 0
+ movq [%4], %1
%endmacro
WELS_EXTERN WelsIChromaPredH_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- movq mm0, [r1-8]
- psrlq mm0, 38h
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ movq mm0, [r1-8]
+ psrlq mm0, 38h
- ;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes
- pmullw mm0, [mmx_01bytes]
- pshufw mm0, mm0, 0
- movq [r0], mm0
+ ;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes
+ pmullw mm0, [mmx_01bytes]
+ pshufw mm0, mm0, 0
+ movq [r0], mm0
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+8
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+8
- lea r1,[r1+r2*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+16
+ lea r1,[r1+r2*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+16
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+24
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+24
- lea r1,[r1+r2*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+32
+ lea r1,[r1+r2*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+32
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+40
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+40
- lea r1,[r1+r2*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48
+ lea r1,[r1+r2*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+56
- WELSEMMS
- ret
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+56
+ WELSEMMS
+ ret
;***********************************************************************
-; void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+; void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
; copy pixels from top 4 pixels
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredV_sse2
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movd xmm0, [r1]
- pshufd xmm0, xmm0, 0
- movdqa [r0], xmm0
- ret
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movd xmm0, [r1]
+ pshufd xmm0, xmm0, 0
+ movdqa [r0], xmm0
+ ret
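
Vertical mode is the degenerate case: the four bytes above the block are broadcast to every row, which movd plus pshufd 0 does in two instructions. A scalar sketch:

#include <stdint.h>

/* Vertical 4x4 prediction: every row repeats the four pixels above the block. */
void I4x4LumaPredV_ref(uint8_t *pred, const uint8_t *ref, int32_t stride) {
    const uint8_t *top = ref - stride;
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            pred[y * 4 + x] = top[x];   /* packed 4x4 output, stride 4 */
}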
;***********************************************************************
-; void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+; void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
; copy 8 pixels from top 8 pixels
;***********************************************************************
WELS_EXTERN WelsIChromaPredV_sse2
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movq xmm0, [r1]
- movdqa xmm1, xmm0
- punpcklqdq xmm0, xmm1
- movdqa [r0], xmm0
- movdqa [r0+16], xmm0
- movdqa [r0+32], xmm0
- movdqa [r0+48], xmm0
- ret
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movq xmm0, [r1]
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm1
+ movdqa [r0], xmm0
+ movdqa [r0+16], xmm0
+ movdqa [r0+32], xmm0
+ movdqa [r0+48], xmm0
+ ret
;***********************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; t3 will never been used
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; t3 will never be used
; destination:
-; |a |b |c |d |
-; |e |f |a |b |
-; |g |h |e |f |
-; |i |j |g |h |
+; |a |b |c |d |
+; |e |f |a |b |
+; |g |h |e |f |
+; |i |j |g |h |
; a = (1 + lt + l0)>>1
; e = (1 + l0 + l1)>>1
@@ -679,68 +679,68 @@
; void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHD_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movd mm0, [r1-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
- psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movd mm0, [r1-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+ psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
- movd mm1, [r1+2*r2-4]
- punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
- lea r1, [r1+2*r2]
- movd mm2, [r1+2*r2-4]
- punpcklbw mm2, [r1+r2-4] ; mm2[7] = l2, mm2[6] = l3
- punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
- psrlq mm2, 20h
- pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
+ movd mm1, [r1+2*r2-4]
+ punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r1, [r1+2*r2]
+ movd mm2, [r1+2*r2-4]
+ punpcklbw mm2, [r1+r2-4] ; mm2[7] = l2, mm2[6] = l3
+ punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
+ psrlq mm2, 20h
+ pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
- movq mm1, mm0
- psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
- movq mm2, mm0
- psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
- movq mm3, mm2
- movq mm4, mm1
- pavgb mm1, mm0
+ movq mm1, mm0
+ psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
+ movq mm2, mm0
+ psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
+ movq mm3, mm2
+ movq mm4, mm1
+ pavgb mm1, mm0
- pxor mm4, mm0 ; find odd value in the lowest bit of each byte
- pand mm4, [mmx_01bytes] ; set the odd bit
- psubusb mm1, mm4 ; decrease 1 from odd bytes
+ pxor mm4, mm0 ; find odd value in the lowest bit of each byte
+ pand mm4, [mmx_01bytes] ; set the odd bit
+ psubusb mm1, mm4 ; decrease 1 from odd bytes
- pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
+ pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
- movq mm4, mm0
- pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
- punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
+ movq mm4, mm0
+ pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
+ punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
- psrlq mm2, 20h
- psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
- movq mm4, mm3
- psrlq mm4, 10h ; mm4 = [0 0 b a f e h j]
- pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
- psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
+ psrlq mm2, 20h
+ psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
+ movq mm4, mm3
+ psrlq mm4, 10h ; mm4 = [0 0 b a f e h j]
+ pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
+ psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
- movd [r0], mm2
- movd [r0+12], mm3
- psrlq mm3, 10h
- movd [r0+8], mm3
- psrlq mm3, 10h
- movd [r0+4], mm3
- WELSEMMS
- ret
+ movd [r0], mm2
+ movd [r0+12], mm3
+ psrlq mm3, 10h
+ movd [r0+8], mm3
+ psrlq mm3, 10h
+ movd [r0+4], mm3
+ WELSEMMS
+ ret
;***********************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; t3 will never been used
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; t3 will never be used
; destination:
-; |a |b |c |d |
-; |c |d |e |f |
-; |e |f |g |g |
-; |g |g |g |g |
+; |a |b |c |d |
+; |c |d |e |f |
+; |e |f |g |g |
+; |g |g |g |g |
; a = (1 + l0 + l1)>>1
; c = (1 + l1 + l2)>>1
@@ -756,70 +756,70 @@
; void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHU_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- movd mm0, [r1-4] ; mm0[3] = l0
- punpcklbw mm0, [r1+r2-4] ; mm0[7] = l1, mm0[6] = l0
- lea r1, [r1+2*r2]
- movd mm2, [r1-4] ; mm2[3] = l2
- movd mm4, [r1+r2-4] ; mm4[3] = l3
- punpcklbw mm2, mm4
- punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ movd mm0, [r1-4] ; mm0[3] = l0
+ punpcklbw mm0, [r1+r2-4] ; mm0[7] = l1, mm0[6] = l0
+ lea r1, [r1+2*r2]
+ movd mm2, [r1-4] ; mm2[3] = l2
+ movd mm4, [r1+r2-4] ; mm4[3] = l3
+ punpcklbw mm2, mm4
+ punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
- psrlq mm4, 18h
- psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
- psrlq mm0, 8h
- pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
+ psrlq mm4, 18h
+ psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
+ psrlq mm0, 8h
+ pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
- movq mm1, mm0
- psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
- movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
- pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
+ movq mm1, mm0
+ psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
+ movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
+ pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
- movq mm2, mm0
- psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
- movq mm5, mm2
- pavgb mm2, mm0
+ movq mm2, mm0
+ psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
+ movq mm5, mm2
+ pavgb mm2, mm0
- pxor mm5, mm0 ; find odd value in the lowest bit of each byte
- pand mm5, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm5 ; decrease 1 from odd bytes
+ pxor mm5, mm0 ; find odd value in the lowest bit of each byte
+ pand mm5, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm5 ; decrease 1 from odd bytes
- pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
+ pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
- psrlq mm2, 8h
- pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
+ psrlq mm2, 8h
+ pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
- punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
- punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
- punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
+ punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
+ punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
+ punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
- psrlq mm4, 20h
- movd [r0+12], mm4
+ psrlq mm4, 20h
+ movd [r0+12], mm4
- movd [r0], mm1
- psrlq mm1, 10h
- movd [r0+4], mm1
- psrlq mm1, 10h
- movd [r0+8], mm1
- WELSEMMS
- ret
+ movd [r0], mm1
+ psrlq mm1, 10h
+ movd [r0+4], mm1
+ psrlq mm1, 10h
+ movd [r0+8], mm1
+ WELSEMMS
+ ret
;***********************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; l3 will never been used
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; l3 will never be used
; destination:
-; |a |b |c |d |
-; |e |f |g |h |
-; |i |a |b |c |
-; |j |e |f |g |
+; |a |b |c |d |
+; |e |f |g |h |
+; |i |a |b |c |
+; |j |e |f |g |
; a = (1 + lt + t0)>>1
; b = (1 + t0 + t1)>>1
@@ -837,75 +837,75 @@
; void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVR_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movq mm0, [r1-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
- psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movq mm0, [r1-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+ psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
- movd mm1, [r1+2*r2-4]
- punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
- lea r1, [r1+2*r2]
- movq mm2, [r1+r2-8] ; mm2[7] = l2
- punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
- psrlq mm2, 28h
- pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
+ movd mm1, [r1+2*r2-4]
+ punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r1, [r1+2*r2]
+ movq mm2, [r1+r2-8] ; mm2[7] = l2
+ punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
+ psrlq mm2, 28h
+ pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
- movq mm1, mm0
- psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
- pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
+ movq mm1, mm0
+ psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
+ pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
- movq mm2, mm0
- psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
- movq mm3, mm2
- pavgb mm2, mm0
+ movq mm2, mm0
+ psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
+ movq mm3, mm2
+ pavgb mm2, mm0
- pxor mm3, mm0 ; find odd value in the lowest bit of each byte
- pand mm3, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm3 ; decrease 1 from odd bytes
+ pxor mm3, mm0 ; find odd value in the lowest bit of each byte
+ pand mm3, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm3 ; decrease 1 from odd bytes
- movq mm3, mm0
- psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
- pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
- movq mm2, mm3
+ movq mm3, mm0
+ psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
+ pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
+ movq mm2, mm3
- psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
- movd [r0], mm1
+ psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
+ movd [r0], mm1
- psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
- movd [r0+4], mm2
+ psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
+ movd [r0+4], mm2
- movq mm4, mm3
- psllq mm4, 20h
- psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
+ movq mm4, mm3
+ psllq mm4, 20h
+ psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
- movq mm5, mm3
- psllq mm5, 28h
- psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
+ movq mm5, mm3
+ psllq mm5, 28h
+ psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
- psllq mm1, 8h
- pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
- movd [r0+8], mm4
+ psllq mm1, 8h
+ pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
+ movd [r0+8], mm4
- psllq mm2, 8h
- pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
- movd [r0+12], mm5
- WELSEMMS
- ret
+ psllq mm2, 8h
+ pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
+ movd [r0+12], mm5
+ WELSEMMS
+ ret
;***********************************************************************
-; lt|t0|t1|t2|t3|t4|t5|t6|t7
-; l0|
-; l1|
-; l2|
-; l3|
-; lt,t0,t1,t2,t3 will never been used
+; lt|t0|t1|t2|t3|t4|t5|t6|t7
+; l0|
+; l1|
+; l2|
+; l3|
+; lt,t0,t1,t2,t3 will never be used
; destination:
-; |a |b |c |d |
-; |b |c |d |e |
-; |c |d |e |f |
-; |d |e |f |g |
+; |a |b |c |d |
+; |b |c |d |e |
+; |c |d |e |f |
+; |d |e |f |g |
; a = (2 + t0 + t2 + (t1<<1))>>2
; b = (2 + t1 + t3 + (t2<<1))>>2
@@ -921,54 +921,54 @@
; void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
- movq mm1, mm0
- movq mm2, mm0
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+ movq mm1, mm0
+ movq mm2, mm0
- movq mm3, mm0
- psrlq mm3, 38h
- psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
+ movq mm3, mm0
+ psrlq mm3, 38h
+ psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
- psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
- psrlq mm2, 8h
- pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
+ psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
+ psrlq mm2, 8h
+ pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
- movq mm3, mm1
- pavgb mm1, mm2
- pxor mm3, mm2 ; find odd value in the lowest bit of each byte
- pand mm3, [mmx_01bytes] ; set the odd bit
- psubusb mm1, mm3 ; decrease 1 from odd bytes
+ movq mm3, mm1
+ pavgb mm1, mm2
+ pxor mm3, mm2 ; find odd value in the lowest bit of each byte
+ pand mm3, [mmx_01bytes] ; set the odd bit
+ psubusb mm1, mm3 ; decrease 1 from odd bytes
- pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
+ pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
- psrlq mm0, 8h
- movd [r0], mm0
- psrlq mm0, 8h
- movd [r0+4], mm0
- psrlq mm0, 8h
- movd [r0+8], mm0
- psrlq mm0, 8h
- movd [r0+12], mm0
- WELSEMMS
- ret
+ psrlq mm0, 8h
+ movd [r0], mm0
+ psrlq mm0, 8h
+ movd [r0+4], mm0
+ psrlq mm0, 8h
+ movd [r0+8], mm0
+ psrlq mm0, 8h
+ movd [r0+12], mm0
+ WELSEMMS
+ ret
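
DDL applies the same 3-tap filter to the eight top neighbours t0..t7, with t7 duplicated past the end (the psrlq/psllq 38h pair builds that duplicate); the seven filtered values a..g are then written out as a sliding 4-byte window, which is why mm0 is stored four times with psrlq 8 in between. A scalar sketch of the formula given in the comment block:

#include <stdint.h>

/* Diagonal down-left 4x4 prediction: pred[y][x] = (2 + t[x+y] + t[x+y+2]
 * + 2*t[x+y+1]) >> 2, with t7 repeated past the end of the top row. */
void I4x4LumaPredDDL_ref(uint8_t *pred, const uint8_t *ref, int32_t stride) {
    const uint8_t *t = ref - stride;               /* t0..t7 */
    uint8_t out[7];
    for (int i = 0; i < 7; i++) {
        int right = (i + 2 < 8) ? t[i + 2] : t[7]; /* duplicate t7 */
        out[i] = (uint8_t)((2 + t[i] + right + 2 * t[i + 1]) >> 2);
    }
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            pred[y * 4 + x] = out[x + y];          /* packed 4x4 output */
}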
;***********************************************************************
-; lt|t0|t1|t2|t3|t4|t5|t6|t7
-; l0|
-; l1|
-; l2|
-; l3|
-; lt,t0,t1,t2,t3 will never been used
+; lt|t0|t1|t2|t3|t4|t5|t6|t7
+; l0|
+; l1|
+; l2|
+; l3|
+; lt,t0,t1,t2,t3 will never be used
; destination:
-; |a |b |c |d |
-; |e |f |g |h |
-; |b |c |d |i |
-; |f |g |h |j |
+; |a |b |c |d |
+; |e |f |g |h |
+; |b |c |d |i |
+; |f |g |h |j |
; a = (1 + t0 + t1)>>1
; b = (1 + t1 + t2)>>1
@@ -987,37 +987,37 @@
; void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVL_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
- movq mm1, mm0
- movq mm2, mm0
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+ movq mm1, mm0
+ movq mm2, mm0
- psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
- psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
+ psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
+ psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
- movq mm3, mm1
- pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
+ movq mm3, mm1
+ pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
- movq mm4, mm2
- pavgb mm2, mm0
- pxor mm4, mm0 ; find odd value in the lowest bit of each byte
- pand mm4, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm4 ; decrease 1 from odd bytes
+ movq mm4, mm2
+ pavgb mm2, mm0
+ pxor mm4, mm0 ; find odd value in the lowest bit of each byte
+ pand mm4, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm4 ; decrease 1 from odd bytes
- pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
+ pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
- movd [r0], mm3
- psrlq mm3, 8h
- movd [r0+8], mm3
+ movd [r0], mm3
+ psrlq mm3, 8h
+ movd [r0+8], mm3
- movd [r0+4], mm2
- psrlq mm2, 8h
- movd [r0+12], mm2
- WELSEMMS
- ret
+ movd [r0+4], mm2
+ psrlq mm2, 8h
+ movd [r0+12], mm2
+ WELSEMMS
+ ret
;***********************************************************************
;
@@ -1024,88 +1024,88 @@
; void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;***********************************************************************
WELS_EXTERN WelsIChromaPredDc_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movq mm0, [r1]
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movq mm0, [r1]
- movzx r3, byte [r1+r2-0x01] ; l1
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l2
- add r3, r4
- movzx r4, byte [r1+r2-0x01] ; l3
- add r3, r4
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l4
- add r3, r4
- movd mm1, r3d ; mm1 = l1+l2+l3+l4
+ movzx r3, byte [r1+r2-0x01] ; l1
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l2
+ add r3, r4
+ movzx r4, byte [r1+r2-0x01] ; l3
+ add r3, r4
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l4
+ add r3, r4
+ movd mm1, r3d ; mm1 = l1+l2+l3+l4
- movzx r3, byte [r1+r2-0x01] ; l5
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l6
- add r3, r4
- movzx r4, byte [r1+r2-0x01] ; l7
- add r3, r4
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l8
- add r3, r4
- movd mm2, r3d ; mm2 = l5+l6+l7+l8
+ movzx r3, byte [r1+r2-0x01] ; l5
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l6
+ add r3, r4
+ movzx r4, byte [r1+r2-0x01] ; l7
+ add r3, r4
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l8
+ add r3, r4
+ movd mm2, r3d ; mm2 = l5+l6+l7+l8
- movq mm3, mm0
- psrlq mm0, 0x20
- psllq mm3, 0x20
- psrlq mm3, 0x20
- pxor mm4, mm4
- psadbw mm0, mm4
- psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+ movq mm3, mm0
+ psrlq mm0, 0x20
+ psllq mm3, 0x20
+ psrlq mm3, 0x20
+ pxor mm4, mm4
+ psadbw mm0, mm4
+ psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
- paddq mm3, mm1
- movq mm1, mm2
- paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+ paddq mm3, mm1
+ movq mm1, mm2
+ paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
- movq mm4, [mmx_0x02]
+ movq mm4, [mmx_0x02]
- paddq mm0, mm4
- psrlq mm0, 0x02
+ paddq mm0, mm4
+ psrlq mm0, 0x02
- paddq mm2, mm4
- psrlq mm2, 0x02
+ paddq mm2, mm4
+ psrlq mm2, 0x02
- paddq mm3, mm4
- paddq mm3, mm4
- psrlq mm3, 0x03
+ paddq mm3, mm4
+ paddq mm3, mm4
+ psrlq mm3, 0x03
- paddq mm1, mm4
- paddq mm1, mm4
- psrlq mm1, 0x03
+ paddq mm1, mm4
+ paddq mm1, mm4
+ psrlq mm1, 0x03
- pmuludq mm0, [mmx_01bytes]
- pmuludq mm3, [mmx_01bytes]
- psllq mm0, 0x20
- pxor mm0, mm3 ; mm0 = m_up
+ pmuludq mm0, [mmx_01bytes]
+ pmuludq mm3, [mmx_01bytes]
+ psllq mm0, 0x20
+ pxor mm0, mm3 ; mm0 = m_up
- pmuludq mm2, [mmx_01bytes]
- pmuludq mm1, [mmx_01bytes]
- psllq mm1, 0x20
- pxor mm1, mm2 ; mm2 = m_down
+ pmuludq mm2, [mmx_01bytes]
+ pmuludq mm1, [mmx_01bytes]
+ psllq mm1, 0x20
+ pxor mm1, mm2 ; mm2 = m_down
- movq [r0], mm0
- movq [r0+0x08], mm0
- movq [r0+0x10], mm0
- movq [r0+0x18], mm0
+ movq [r0], mm0
+ movq [r0+0x08], mm0
+ movq [r0+0x10], mm0
+ movq [r0+0x18], mm0
- movq [r0+0x20], mm1
- movq [r0+0x28], mm1
- movq [r0+0x30], mm1
- movq [r0+0x38], mm1
+ movq [r0+0x20], mm1
+ movq [r0+0x28], mm1
+ movq [r0+0x30], mm1
+ movq [r0+0x38], mm1
- pop r4
- pop r3
- WELSEMMS
- ret
+ pop r4
+ pop r3
+ WELSEMMS
+ ret
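
Chroma DC works per 4x4 quadrant of the 8x8 block: the top-left and bottom-right quadrants average four top plus four left samples ((sum + 4) >> 3, hence the two paddq mm4 before the shift by 3), while the top-right and bottom-left quadrants only have one neighbour edge in this path and use (sum + 2) >> 2. The two broadcast patterns m_up and m_down are each stored over four rows. A scalar sketch, assuming both neighbour edges are available and a packed 8x8 output:

#include <stdint.h>

/* 8x8 chroma DC prediction: four quadrant DC values. pred is packed (stride 8). */
void IChromaPredDc_ref(uint8_t *pred, const uint8_t *ref, int32_t stride) {
    const uint8_t *top = ref - stride;
    int sum_t0 = 0, sum_t1 = 0, sum_l0 = 0, sum_l1 = 0;
    for (int i = 0; i < 4; i++) {
        sum_t0 += top[i];              sum_t1 += top[4 + i];
        sum_l0 += ref[i * stride - 1]; sum_l1 += ref[(4 + i) * stride - 1];
    }
    uint8_t dc[4] = {
        (uint8_t)((sum_t0 + sum_l0 + 4) >> 3),   /* top-left: both neighbours */
        (uint8_t)((sum_t1 + 2) >> 2),            /* top-right: top only */
        (uint8_t)((sum_l1 + 2) >> 2),            /* bottom-left: left only */
        (uint8_t)((sum_t1 + sum_l1 + 4) >> 3),   /* bottom-right: both */
    };
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            pred[y * 8 + x] = dc[(y >> 2) * 2 + (x >> 2)];
}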
@@ -1114,56 +1114,56 @@
; void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredDc_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movdqa xmm0, [r1] ; read one row
- pxor xmm1, xmm1
- psadbw xmm0, xmm1
- movdqa xmm1, xmm0
- psrldq xmm1, 0x08
- pslldq xmm0, 0x08
- psrldq xmm0, 0x08
- paddw xmm0, xmm1
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movdqa xmm0, [r1] ; read one row
+ pxor xmm1, xmm1
+ psadbw xmm0, xmm1
+ movdqa xmm1, xmm0
+ psrldq xmm1, 0x08
+ pslldq xmm0, 0x08
+ psrldq xmm0, 0x08
+ paddw xmm0, xmm1
- movzx r3, byte [r1+r2-0x01]
- movzx r4, byte [r1+2*r2-0x01]
- add r3, r4
- lea r1, [r1+r2]
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- add r3, 0x10
- movd xmm1, r3d
- paddw xmm0, xmm1
- psrld xmm0, 0x05
- pmuludq xmm0, [mmx_01bytes]
- pshufd xmm0, xmm0, 0
+ movzx r3, byte [r1+r2-0x01]
+ movzx r4, byte [r1+2*r2-0x01]
+ add r3, r4
+ lea r1, [r1+r2]
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ add r3, 0x10
+ movd xmm1, r3d
+ paddw xmm0, xmm1
+ psrld xmm0, 0x05
+ pmuludq xmm0, [mmx_01bytes]
+ pshufd xmm0, xmm0, 0
- movdqa [r0], xmm0
- movdqa [r0+0x10], xmm0
- movdqa [r0+0x20], xmm0
- movdqa [r0+0x30], xmm0
- movdqa [r0+0x40], xmm0
- movdqa [r0+0x50], xmm0
- movdqa [r0+0x60], xmm0
- movdqa [r0+0x70], xmm0
- movdqa [r0+0x80], xmm0
- movdqa [r0+0x90], xmm0
- movdqa [r0+0xa0], xmm0
- movdqa [r0+0xb0], xmm0
- movdqa [r0+0xc0], xmm0
- movdqa [r0+0xd0], xmm0
- movdqa [r0+0xe0], xmm0
- movdqa [r0+0xf0], xmm0
+ movdqa [r0], xmm0
+ movdqa [r0+0x10], xmm0
+ movdqa [r0+0x20], xmm0
+ movdqa [r0+0x30], xmm0
+ movdqa [r0+0x40], xmm0
+ movdqa [r0+0x50], xmm0
+ movdqa [r0+0x60], xmm0
+ movdqa [r0+0x70], xmm0
+ movdqa [r0+0x80], xmm0
+ movdqa [r0+0x90], xmm0
+ movdqa [r0+0xa0], xmm0
+ movdqa [r0+0xb0], xmm0
+ movdqa [r0+0xc0], xmm0
+ movdqa [r0+0xd0], xmm0
+ movdqa [r0+0xe0], xmm0
+ movdqa [r0+0xf0], xmm0
- pop r4
- pop r3
- ret
\ No newline at end of file
+ pop r4
+ pop r3
+ ret
\ No newline at end of file
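
The 16x16 DC predictor above sums the whole top row with a single psadbw (folding the two 64-bit partial sums afterwards), accumulates the sixteen left neighbours through LOAD_2_LEFT_AND_ADD, and broadcasts (sum + 16) >> 5 to the packed 16x16 output. A scalar sketch:

#include <stdint.h>

/* 16x16 luma DC prediction (both neighbour edges assumed available). */
void I16x16LumaPredDc_ref(uint8_t *pred, const uint8_t *ref, int32_t stride) {
    int sum = 16;                                  /* rounding term */
    for (int i = 0; i < 16; i++) {
        sum += ref[-stride + i];                   /* top row, done with psadbw above */
        sum += ref[i * stride - 1];                /* left column */
    }
    uint8_t dc = (uint8_t)(sum >> 5);
    for (int i = 0; i < 256; i++)
        pred[i] = dc;                              /* packed 16x16 output */
}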
--- a/codec/encoder/core/x86/matrix_transpose.asm
+++ b/codec/encoder/core/x86/matrix_transpose.asm
@@ -34,153 +34,153 @@
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE_8x8B_MMX 10
- MMX_XSwap bw, %1, %2, %8
- MMX_XSwap bw, %3, %4, %2
- MMX_XSwap bw, %5, %6, %4
- movq %6, %9
- movq %10, %4
- MMX_XSwap bw, %7, %6, %4
+ MMX_XSwap bw, %1, %2, %8
+ MMX_XSwap bw, %3, %4, %2
+ MMX_XSwap bw, %5, %6, %4
+ movq %6, %9
+ movq %10, %4
+ MMX_XSwap bw, %7, %6, %4
- MMX_XSwap wd, %1, %3, %6
- MMX_XSwap wd, %8, %2, %3
- MMX_XSwap wd, %5, %7, %2
- movq %7, %10
- movq %10, %3
- MMX_XSwap wd, %7, %4, %3
+ MMX_XSwap wd, %1, %3, %6
+ MMX_XSwap wd, %8, %2, %3
+ MMX_XSwap wd, %5, %7, %2
+ movq %7, %10
+ movq %10, %3
+ MMX_XSwap wd, %7, %4, %3
- MMX_XSwap dq, %1, %5, %4
- MMX_XSwap dq, %6, %2, %5
- MMX_XSwap dq, %8, %7, %2
- movq %7, %10
- movq %10, %5
- MMX_XSwap dq, %7, %3, %5
+ MMX_XSwap dq, %1, %5, %4
+ MMX_XSwap dq, %6, %2, %5
+ MMX_XSwap dq, %8, %7, %2
+ movq %7, %10
+ movq %10, %5
+ MMX_XSwap dq, %7, %3, %5
- movq %3, %10
+ movq %3, %10
%endmacro
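
TRANSPOSE_8x8B_MMX interleaves bytes, then words, then dwords; after the three passes each register holds one column of the original 8x8 byte block, in the permuted register order listed in the out: comment, and the %9/%10 operands spill one value to memory because MMX only has eight registers. Together with TRANSPOSE8x8_WRITE_MMX the net effect is simply:

#include <stdint.h>

/* Behavioural equivalent of TRANSPOSE_8x8B_MMX + TRANSPOSE8x8_WRITE_MMX:
 * dst[x][y] = src[y][x] for an 8x8 block of bytes. */
void transpose8x8b_ref(uint8_t *dst, int32_t dst_stride,
                       const uint8_t *src, int32_t src_stride) {
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            dst[x * dst_stride + y] = src[y * src_stride + x];
}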
;in: m0, m3, m5, m2, m7, m1, m6, m4
-%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
- movq [%1], mm0 ; result of line 1, x8 bytes
- movq [%1+%2], mm3 ; result of line 2
- lea %1, [%1+2*%2]
- movq [%1], mm5 ; result of line 3
- movq [%1+%2], mm2 ; result of line 4
- lea %1, [%1+2*%2]
- movq [%1], mm7 ; result of line 5
- movq [%1+%2], mm1 ; result of line 6
- lea %1, [%1+2*%2]
- movq [%1], mm6 ; result of line 7
- movq [%1+%2], mm4 ; result of line 8
+%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
+ movq [%1], mm0 ; result of line 1, x8 bytes
+ movq [%1+%2], mm3 ; result of line 2
+ lea %1, [%1+2*%2]
+ movq [%1], mm5 ; result of line 3
+ movq [%1+%2], mm2 ; result of line 4
+ lea %1, [%1+2*%2]
+ movq [%1], mm7 ; result of line 5
+ movq [%1+%2], mm1 ; result of line 6
+ lea %1, [%1+2*%2]
+ movq [%1], mm6 ; result of line 7
+ movq [%1+%2], mm4 ; result of line 8
%endmacro
;in: m0, m3, m5, m2, m7, m1, m6, m4
-%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
- movq [%1], mm0 ; result of line 1, x8 bytes
- movq [%1+%2], mm3 ; result of line 2
- lea %3, [%1+2*%2]
- movq [%3], mm5 ; result of line 3
- movq [%3+%2], mm2 ; result of line 4
- lea %3, [%3+2*%2]
- movq [%3], mm7 ; result of line 5
- movq [%3+%2], mm1 ; result of line 6
- lea %3, [%3+2*%2]
- movq [%3], mm6 ; result of line 7
- movq [%3+%2], mm4 ; result of line 8
-%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX
+%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
+ movq [%1], mm0 ; result of line 1, x8 bytes
+ movq [%1+%2], mm3 ; result of line 2
+ lea %3, [%1+2*%2]
+ movq [%3], mm5 ; result of line 3
+ movq [%3+%2], mm2 ; result of line 4
+ lea %3, [%3+2*%2]
+ movq [%3], mm7 ; result of line 5
+ movq [%3+%2], mm1 ; result of line 6
+ lea %3, [%3+2*%2]
+ movq [%3], mm6 ; result of line 7
+ movq [%3+%2], mm4 ; result of line 8
+%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX
; for transpose 16x8
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
-%macro TRANSPOSE_8x16B_SSE2 10
- SSE2_XSawp bw, %1, %2, %8
- SSE2_XSawp bw, %3, %4, %2
- SSE2_XSawp bw, %5, %6, %4
- movdqa %6, %9
- movdqa %10, %4
- SSE2_XSawp bw, %7, %6, %4
+%macro TRANSPOSE_8x16B_SSE2 10
+ SSE2_XSawp bw, %1, %2, %8
+ SSE2_XSawp bw, %3, %4, %2
+ SSE2_XSawp bw, %5, %6, %4
+ movdqa %6, %9
+ movdqa %10, %4
+ SSE2_XSawp bw, %7, %6, %4
- SSE2_XSawp wd, %1, %3, %6
- SSE2_XSawp wd, %8, %2, %3
- SSE2_XSawp wd, %5, %7, %2
- movdqa %7, %10
- movdqa %10, %3
- SSE2_XSawp wd, %7, %4, %3
+ SSE2_XSawp wd, %1, %3, %6
+ SSE2_XSawp wd, %8, %2, %3
+ SSE2_XSawp wd, %5, %7, %2
+ movdqa %7, %10
+ movdqa %10, %3
+ SSE2_XSawp wd, %7, %4, %3
- SSE2_XSawp dq, %1, %5, %4
- SSE2_XSawp dq, %6, %2, %5
- SSE2_XSawp dq, %8, %7, %2
- movdqa %7, %10
- movdqa %10, %5
- SSE2_XSawp dq, %7, %3, %5
+ SSE2_XSawp dq, %1, %5, %4
+ SSE2_XSawp dq, %6, %2, %5
+ SSE2_XSawp dq, %8, %7, %2
+ movdqa %7, %10
+ movdqa %10, %5
+ SSE2_XSawp dq, %7, %3, %5
- SSE2_XSawp qdq, %1, %8, %3
- SSE2_XSawp qdq, %4, %2, %8
- SSE2_XSawp qdq, %6, %7, %2
- movdqa %7, %10
- movdqa %10, %1
- SSE2_XSawp qdq, %7, %5, %1
- movdqa %5, %10
-%endmacro ; end of TRANSPOSE_8x16B_SSE2
+ SSE2_XSawp qdq, %1, %8, %3
+ SSE2_XSawp qdq, %4, %2, %8
+ SSE2_XSawp qdq, %6, %7, %2
+ movdqa %7, %10
+ movdqa %10, %1
+ SSE2_XSawp qdq, %7, %5, %1
+ movdqa %5, %10
+%endmacro ; end of TRANSPOSE_8x16B_SSE2
-%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride
- movq [%1], xmm4 ; result of line 1, x8 bytes
- movq [%1+%2], xmm2 ; result of line 2
- lea %1, [%1+2*%2]
- movq [%1], xmm3 ; result of line 3
- movq [%1+%2], xmm7 ; result of line 4
+%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride
+ movq [%1], xmm4 ; result of line 1, x8 bytes
+ movq [%1+%2], xmm2 ; result of line 2
+ lea %1, [%1+2*%2]
+ movq [%1], xmm3 ; result of line 3
+ movq [%1+%2], xmm7 ; result of line 4
- lea %1, [%1+2*%2]
- movq [%1], xmm5 ; result of line 5
- movq [%1+%2], xmm1 ; result of line 6
- lea %1, [%1+2*%2]
- movq [%1], xmm6 ; result of line 7
- movq [%1+%2], xmm0 ; result of line 8
+ lea %1, [%1+2*%2]
+ movq [%1], xmm5 ; result of line 5
+ movq [%1+%2], xmm1 ; result of line 6
+ lea %1, [%1+2*%2]
+ movq [%1], xmm6 ; result of line 7
+ movq [%1+%2], xmm0 ; result of line 8
- lea %1, [%1+2*%2]
- movhpd [%1], xmm4 ; result of line 9
- movhpd [%1+%2], xmm2 ; result of line 10
- lea %1, [%1+2*%2]
- movhpd [%1], xmm3 ; result of line 11
- movhpd [%1+%2], xmm7 ; result of line 12
+ lea %1, [%1+2*%2]
+ movhpd [%1], xmm4 ; result of line 9
+ movhpd [%1+%2], xmm2 ; result of line 10
+ lea %1, [%1+2*%2]
+ movhpd [%1], xmm3 ; result of line 11
+ movhpd [%1+%2], xmm7 ; result of line 12
- lea %1, [%1+2*%2]
- movhpd [%1], xmm5 ; result of line 13
- movhpd [%1+%2], xmm1 ; result of line 14
- lea %1, [%1+2*%2]
- movhpd [%1], xmm6 ; result of line 15
- movhpd [%1+%2], xmm0 ; result of line 16
-%endmacro ; end of TRANSPOSE_WRITE_RESULT_SSE2
+ lea %1, [%1+2*%2]
+ movhpd [%1], xmm5 ; result of line 13
+ movhpd [%1+%2], xmm1 ; result of line 14
+ lea %1, [%1+2*%2]
+ movhpd [%1], xmm6 ; result of line 15
+ movhpd [%1+%2], xmm0 ; result of line 16
+%endmacro ; end of TRANSPOSE_WRITE_RESULT_SSE2
-%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32
- movq [%1], xmm4 ; result of line 1, x8 bytes
- movq [%1+%2], xmm2 ; result of line 2
- lea %3, [%1+2*%2]
- movq [%3], xmm3 ; result of line 3
- movq [%3+%2], xmm7 ; result of line 4
+%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32
+ movq [%1], xmm4 ; result of line 1, x8 bytes
+ movq [%1+%2], xmm2 ; result of line 2
+ lea %3, [%1+2*%2]
+ movq [%3], xmm3 ; result of line 3
+ movq [%3+%2], xmm7 ; result of line 4
- lea %3, [%3+2*%2]
- movq [%3], xmm5 ; result of line 5
- movq [%3+%2], xmm1 ; result of line 6
- lea %3, [%3+2*%2]
- movq [%3], xmm6 ; result of line 7
- movq [%3+%2], xmm0 ; result of line 8
+ lea %3, [%3+2*%2]
+ movq [%3], xmm5 ; result of line 5
+ movq [%3+%2], xmm1 ; result of line 6
+ lea %3, [%3+2*%2]
+ movq [%3], xmm6 ; result of line 7
+ movq [%3+%2], xmm0 ; result of line 8
- lea %3, [%3+2*%2]
- movhpd [%3], xmm4 ; result of line 9
- movhpd [%3+%2], xmm2 ; result of line 10
- lea %3, [%3+2*%2]
- movhpd [%3], xmm3 ; result of line 11
- movhpd [%3+%2], xmm7 ; result of line 12
+ lea %3, [%3+2*%2]
+ movhpd [%3], xmm4 ; result of line 9
+ movhpd [%3+%2], xmm2 ; result of line 10
+ lea %3, [%3+2*%2]
+ movhpd [%3], xmm3 ; result of line 11
+ movhpd [%3+%2], xmm7 ; result of line 12
- lea %3, [%3+2*%2]
- movhpd [%3], xmm5 ; result of line 13
- movhpd [%3+%2], xmm1 ; result of line 14
- lea %3, [%3+2*%2]
- movhpd [%3], xmm6 ; result of line 15
- movhpd [%3+%2], xmm0 ; result of line 16
-%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2
+ lea %3, [%3+2*%2]
+ movhpd [%3], xmm5 ; result of line 13
+ movhpd [%3+%2], xmm1 ; result of line 14
+ lea %3, [%3+2*%2]
+ movhpd [%3], xmm6 ; result of line 15
+ movhpd [%3+%2], xmm0 ; result of line 16
+%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2
SECTION .text
@@ -187,209 +187,209 @@
WELS_EXTERN TransposeMatrixBlock16x16_sse2
; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
- push r4
- push r5
- %assign push_num 2
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
+ push r4
+ push r5
+ %assign push_num 2
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
- mov r4, r7
- and r4, 0Fh
- sub r7, 10h
- sub r7, r4
- lea r5, [r3+r3*2]
- ; top 8x16 block
- movdqa xmm0, [r2]
- movdqa xmm1, [r2+r3]
- movdqa xmm2, [r2+r3*2]
- movdqa xmm3, [r2+r5]
- lea r2, [r2+r3*4]
- movdqa xmm4, [r2]
- movdqa xmm5, [r2+r3]
- movdqa xmm6, [r2+r3*2]
+ mov r4, r7
+ and r4, 0Fh
+ sub r7, 10h
+ sub r7, r4
+ lea r5, [r3+r3*2]
+ ; top 8x16 block
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+r3*2]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+r3*4]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+r3*2]
- ;in: m0, m1, m2, m3, m4, m5, m6, m7
- ;out: m4, m2, m3, m7, m5, m1, m6, m0
- TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m4, m2, m3, m7, m5, m1, m6, m0
+ TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
- TRANSPOSE8x16_WRITE_SSE2 r0, r1
+ TRANSPOSE8x16_WRITE_SSE2 r0, r1
- ; bottom 8x16 block
- lea r2, [r2+r3*4]
- movdqa xmm0, [r2]
- movdqa xmm1, [r2+r3]
- movdqa xmm2, [r2+r3*2]
- movdqa xmm3, [r2+r5]
- lea r2, [r2+r3*4]
- movdqa xmm4, [r2]
- movdqa xmm5, [r2+r3]
- movdqa xmm6, [r2+r3*2]
+ ; bottom 8x16 block
+ lea r2, [r2+r3*4]
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+r3*2]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+r3*4]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+r3*2]
- ;in: m0, m1, m2, m3, m4, m5, m6, m7
- ;out: m4, m2, m3, m7, m5, m1, m6, m0
- TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m4, m2, m3, m7, m5, m1, m6, m0
+ TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
- mov r5, r1
- sal r5, 4
- sub r0, r5
- lea r0, [r0+r1*2+8]
- TRANSPOSE8x16_WRITE_SSE2 r0, r1
+ mov r5, r1
+ sal r5, 4
+ sub r0, r5
+ lea r0, [r0+r1*2+8]
+ TRANSPOSE8x16_WRITE_SSE2 r0, r1
- add r7, r4
- add r7, 10h
- POP_XMM
- LOAD_4_PARA_POP
- pop r5
- pop r4
- ret
+ add r7, r4
+ add r7, 10h
+ POP_XMM
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
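
The 16x16 variant runs the 8x16 transpose twice: the top eight source rows become destination columns 0..7, then the destination pointer is rewound and offset by 8 bytes so the bottom eight rows land in columns 8..15. r7, which these calling-convention macros appear to use for the stack pointer, is temporarily aligned down to 16 bytes so that TRANSPOSE_8x16B_SSE2 has an aligned spill slot. Behaviourally the whole routine is a 16x16 byte transpose:

#include <stdint.h>

/* What TransposeMatrixBlock16x16_sse2 computes: a 16x16 byte transpose. */
void transpose16x16_ref(uint8_t *dst, int32_t dst_stride,
                        const uint8_t *src, int32_t src_stride) {
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            dst[x * dst_stride + y] = src[y * src_stride + x];
}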
WELS_EXTERN TransposeMatrixBlocksx16_sse2
; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
- push r5
- push r6
- %assign push_num 2
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- mov r5, r7
- and r5, 0Fh
- sub r7, 10h
- sub r7, r5
+ push r5
+ push r6
+ %assign push_num 2
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ mov r5, r7
+ and r5, 0Fh
+ sub r7, 10h
+ sub r7, r5
TRANSPOSE_LOOP_SSE2:
- ; explictly loading next loop data
- lea r6, [r2+r3*8]
- push r4
+        ; explicitly loading next loop data
+ lea r6, [r2+r3*8]
+ push r4
%rep 8
- mov r4, [r6]
- mov r4, [r6+r3]
- lea r6, [r6+r3*2]
+ mov r4, [r6]
+ mov r4, [r6+r3]
+ lea r6, [r6+r3*2]
%endrep
- pop r4
- ; top 8x16 block
- movdqa xmm0, [r2]
- movdqa xmm1, [r2+r3]
- lea r2, [r2+r3*2]
- movdqa xmm2, [r2]
- movdqa xmm3, [r2+r3]
- lea r2, [r2+r3*2]
- movdqa xmm4, [r2]
- movdqa xmm5, [r2+r3]
- lea r2, [r2+r3*2]
- movdqa xmm6, [r2]
+ pop r4
+ ; top 8x16 block
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm2, [r2]
+ movdqa xmm3, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm6, [r2]
- ;in: m0, m1, m2, m3, m4, m5, m6, m7
- ;out: m4, m2, m3, m7, m5, m1, m6, m0
- TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
- TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
- lea r2, [r2+r3*2]
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m4, m2, m3, m7, m5, m1, m6, m0
+ TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
+ TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
+ lea r2, [r2+r3*2]
- ; bottom 8x16 block
- movdqa xmm0, [r2]
- movdqa xmm1, [r2+r3]
- lea r2, [r2+r3*2]
- movdqa xmm2, [r2]
- movdqa xmm3, [r2+r3]
- lea r2, [r2+r3*2]
- movdqa xmm4, [r2]
- movdqa xmm5, [r2+r3]
- lea r2, [r2+r3*2]
- movdqa xmm6, [r2]
+ ; bottom 8x16 block
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm2, [r2]
+ movdqa xmm3, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm6, [r2]
- ;in: m0, m1, m2, m3, m4, m5, m6, m7
- ;out: m4, m2, m3, m7, m5, m1, m6, m0
- TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
- TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6
- lea r2, [r2+r3*2]
- lea r0, [r0+16]
- dec r4
- jg near TRANSPOSE_LOOP_SSE2
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m4, m2, m3, m7, m5, m1, m6, m0
+ TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
+ TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6
+ lea r2, [r2+r3*2]
+ lea r0, [r0+16]
+ dec r4
+ jg near TRANSPOSE_LOOP_SSE2
- add r7, r5
- add r7, 10h
- POP_XMM
- LOAD_5_PARA_POP
- pop r6
- pop r5
- ret
+ add r7, r5
+ add r7, 10h
+ POP_XMM
+ LOAD_5_PARA_POP
+ pop r6
+ pop r5
+ ret
WELS_EXTERN TransposeMatrixBlock8x8_mmx
; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- sub r7, 8
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ sub r7, 8
- movq mm0, [r2]
- movq mm1, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm2, [r2]
- movq mm3, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm4, [r2]
- movq mm5, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm6, [r2]
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
- ;in: m0, m1, m2, m3, m4, m5, m6, m7
- ;out: m0, m3, m5, m2, m7, m1, m6, m4
- TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m0, m3, m5, m2, m7, m1, m6, m4
+ TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
- TRANSPOSE8x8_WRITE_MMX r0, r1
+ TRANSPOSE8x8_WRITE_MMX r0, r1
- emms
- add r7, 8
- LOAD_4_PARA_POP
- ret
+ emms
+ add r7, 8
+ LOAD_4_PARA_POP
+ ret
WELS_EXTERN TransposeMatrixBlocksx8_mmx
; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
- push r5
- push r6
- %assign push_num 2
- LOAD_5_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- sub r7, 8
+ push r5
+ push r6
+ %assign push_num 2
+ LOAD_5_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ sub r7, 8
- lea r5, [r2+r3*8]
+ lea r5, [r2+r3*8]
TRANSPOSE_BLOCKS_X8_LOOP_MMX:
-    ; explicitly loading next loop data
+    ; explicitly loading next loop data
%rep 4
- mov r6, [r5]
- mov r6, [r5+r3]
- lea r5, [r5+r3*2]
+ mov r6, [r5]
+ mov r6, [r5+r3]
+ lea r5, [r5+r3*2]
%endrep
- movq mm0, [r2]
- movq mm1, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm2, [r2]
- movq mm3, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm4, [r2]
- movq mm5, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm6, [r2]
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
- ;in: m0, m1, m2, m3, m4, m5, m6, m7
- ;out: m0, m3, m5, m2, m7, m1, m6, m4
- TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m0, m3, m5, m2, m7, m1, m6, m4
+ TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
- TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
- lea r0, [r0+8]
- lea r2, [r2+2*r3]
- dec r4
- jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
+ TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
+ lea r0, [r0+8]
+ lea r2, [r2+2*r3]
+ dec r4
+ jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
- emms
- add r7, 8
- LOAD_5_PARA_POP
- pop r6
- pop r5
- ret
+ emms
+ add r7, 8
+ LOAD_5_PARA_POP
+ pop r6
+ pop r5
+ ret
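
Note on the routines reindented above: TransposeMatrixBlock16x16_sse2 and the TransposeMatrixBlocksx16/8x8 variants implement a plain byte-matrix transpose, per the prototypes in the comments (the stack fiddling with r7 only carves out an aligned scratch area for the shuffle macros). A minimal scalar sketch of the 16x16 case, with an illustrative name, not code from this change:

    #include <stdint.h>

    /* Scalar equivalent of the 16x16 transpose: dst[j][i] = src[i][j]. */
    static void TransposeMatrixBlock16x16_sketch(void* dst, int32_t dst_stride,
                                                 const void* src, int32_t src_stride) {
        uint8_t* d = (uint8_t*)dst;
        const uint8_t* s = (const uint8_t*)src;
        for (int i = 0; i < 16; i++)
            for (int j = 0; j < 16; j++)
                d[j * dst_stride + i] = s[i * src_stride + j];
    }
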
--- a/codec/encoder/core/x86/memzero.asm
+++ b/codec/encoder/core/x86/memzero.asm
@@ -51,10 +51,10 @@
;void WelsPrefetchZero_mmx(int8_t const*_A);
;***********************************************************************
WELS_EXTERN WelsPrefetchZero_mmx
- %assign push_num 0
- LOAD_1_PARA
- prefetchnta [r0]
- ret
+ %assign push_num 0
+ LOAD_1_PARA
+ prefetchnta [r0]
+ ret
;***********************************************************************
@@ -62,23 +62,23 @@
;***********************************************************************
WELS_EXTERN WelsSetMemZeroAligned64_sse2
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- neg r1
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ neg r1
- pxor xmm0, xmm0
+ pxor xmm0, xmm0
.memzeroa64_sse2_loops:
- movdqa [r0], xmm0
- movdqa [r0+16], xmm0
- movdqa [r0+32], xmm0
- movdqa [r0+48], xmm0
- add r0, 0x40
+ movdqa [r0], xmm0
+ movdqa [r0+16], xmm0
+ movdqa [r0+32], xmm0
+ movdqa [r0+48], xmm0
+ add r0, 0x40
- add r1, 0x40
- jnz near .memzeroa64_sse2_loops
+ add r1, 0x40
+ jnz near .memzeroa64_sse2_loops
- ret
+ ret
;***********************************************************************
; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -85,28 +85,28 @@
;***********************************************************************
WELS_EXTERN WelsSetMemZeroSize64_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- neg r1
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ neg r1
- pxor mm0, mm0
+ pxor mm0, mm0
.memzero64_mmx_loops:
- movq [r0], mm0
- movq [r0+8], mm0
- movq [r0+16], mm0
- movq [r0+24], mm0
- movq [r0+32], mm0
- movq [r0+40], mm0
- movq [r0+48], mm0
- movq [r0+56], mm0
- add r0, 0x40
+ movq [r0], mm0
+ movq [r0+8], mm0
+ movq [r0+16], mm0
+ movq [r0+24], mm0
+ movq [r0+32], mm0
+ movq [r0+40], mm0
+ movq [r0+48], mm0
+ movq [r0+56], mm0
+ add r0, 0x40
- add r1, 0x40
- jnz near .memzero64_mmx_loops
+ add r1, 0x40
+ jnz near .memzero64_mmx_loops
- WELSEMMS
- ret
+ WELSEMMS
+ ret
;***********************************************************************
; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
@@ -113,20 +113,20 @@
;***********************************************************************
WELS_EXTERN WelsSetMemZeroSize8_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- neg r1
- pxor mm0, mm0
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ neg r1
+ pxor mm0, mm0
.memzero8_mmx_loops:
- movq [r0], mm0
- add r0, 0x08
+ movq [r0], mm0
+ add r0, 0x08
- add r1, 0x08
- jnz near .memzero8_mmx_loops
+ add r1, 0x08
+ jnz near .memzero8_mmx_loops
- WELSEMMS
- ret
+ WELSEMMS
+ ret
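
The memzero routines above all share one pattern: negate the byte count, store zeroed registers in fixed-size chunks, and count the negated size back up to zero. A scalar sketch of what WelsSetMemZeroAligned64_sse2 does, assuming (as the loop implies) that the size is a positive multiple of 64:

    #include <stdint.h>
    #include <string.h>

    /* Scalar equivalent: clear iSize bytes in 64-byte chunks. */
    static void SetMemZeroAligned64_sketch(void* pDst, int32_t iSize) {
        uint8_t* p = (uint8_t*)pDst;
        for (int32_t i = 0; i < iSize; i += 64)
            memset(p + i, 0, 64);
    }
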
--- a/codec/encoder/core/x86/quant.asm
+++ b/codec/encoder/core/x86/quant.asm
@@ -49,140 +49,140 @@
;************************************************
%macro SSE2_Quant8 5
- MOVDQ %1, %5
- pxor %2, %2
- pcmpgtw %2, %1
- pxor %1, %2
- psubw %1, %2
- paddusw %1, %3
- pmulhuw %1, %4
- pxor %1, %2
- psubw %1, %2
- MOVDQ %5, %1
+ MOVDQ %1, %5
+ pxor %2, %2
+ pcmpgtw %2, %1
+ pxor %1, %2
+ psubw %1, %2
+ paddusw %1, %3
+ pmulhuw %1, %4
+ pxor %1, %2
+ psubw %1, %2
+ MOVDQ %5, %1
%endmacro
%macro SSE2_QuantMax8 6
- MOVDQ %1, %5
- pxor %2, %2
- pcmpgtw %2, %1
- pxor %1, %2
- psubw %1, %2
- paddusw %1, %3
- pmulhuw %1, %4
- pmaxsw %6, %1
- pxor %1, %2
- psubw %1, %2
- MOVDQ %5, %1
+ MOVDQ %1, %5
+ pxor %2, %2
+ pcmpgtw %2, %1
+ pxor %1, %2
+ psubw %1, %2
+ paddusw %1, %3
+ pmulhuw %1, %4
+ pmaxsw %6, %1
+ pxor %1, %2
+ psubw %1, %2
+ MOVDQ %5, %1
%endmacro
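
The SSE2_Quant8 / SSE2_QuantMax8 macros above quantize eight int16 coefficients at a time: take the absolute value, add the rounding offset ff (saturating unsigned add), multiply by mf keeping only the high 16 bits, then restore the sign; the Max variant additionally tracks the per-block maximum. A scalar sketch of one coefficient (helper name illustrative):

    #include <stdint.h>

    /* One coefficient of the quant step: sign(x) * (((|x| + ff) * mf) >> 16). */
    static int16_t quant_one_sketch(int16_t x, uint16_t ff, uint16_t mf) {
        uint32_t a = (uint32_t)(x < 0 ? -x : x) + ff;
        if (a > 0xFFFF) a = 0xFFFF;                    /* model paddusw saturation */
        uint16_t q = (uint16_t)((a * mf) >> 16);       /* pmulhuw keeps the high word */
        return (int16_t)(x < 0 ? -q : q);
    }
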
-%define pDct esp + 4
-%define ff esp + 8
-%define mf esp + 12
-%define max esp + 16
+%define pDct esp + 4
+%define ff esp + 8
+%define mf esp + 12
+%define max esp + 16
;***********************************************************************
-; void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
+; void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;***********************************************************************
WELS_EXTERN WelsQuant4x4_sse2
- %assign push_num 0
- LOAD_3_PARA
- movdqa xmm2, [r1]
- movdqa xmm3, [r2]
+ %assign push_num 0
+ LOAD_3_PARA
+ movdqa xmm2, [r1]
+ movdqa xmm3, [r2]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
- ret
+ ret
;***********************************************************************
;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
;***********************************************************************
WELS_EXTERN WelsQuant4x4Dc_sse2
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSIONW r1, r1w
- SIGN_EXTENSIONW r2, r2w
- SSE2_Copy8Times xmm3, r2d
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSIONW r1, r1w
+ SIGN_EXTENSIONW r2, r2w
+ SSE2_Copy8Times xmm3, r2d
- SSE2_Copy8Times xmm2, r1d
+ SSE2_Copy8Times xmm2, r1d
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
- ret
+ ret
;***********************************************************************
-; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
+; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4_sse2
- %assign push_num 0
- LOAD_3_PARA
- MOVDQ xmm2, [r1]
- MOVDQ xmm3, [r2]
+ %assign push_num 0
+ LOAD_3_PARA
+ MOVDQ xmm2, [r1]
+ MOVDQ xmm3, [r2]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
- ret
+ ret
;***********************************************************************
-; void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f, int16_t *mf, int16_t *max);
+; void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f, int16_t *mf, int16_t *max);
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4Max_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- MOVDQ xmm2, [r1]
- MOVDQ xmm3, [r2]
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ MOVDQ xmm2, [r1]
+ MOVDQ xmm3, [r2]
- pxor xmm4, xmm4
- pxor xmm5, xmm5
- pxor xmm6, xmm6
- pxor xmm7, xmm7
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 ], xmm4
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
+ pxor xmm4, xmm4
+ pxor xmm5, xmm5
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 ], xmm4
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
- SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
- pmaxsw xmm0, xmm4
- pmaxsw xmm0, xmm5
- pmaxsw xmm0, xmm7
- movdqa xmm1, xmm0
- punpckhqdq xmm0, xmm1
- pmaxsw xmm0, xmm1
+ SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
+ pmaxsw xmm0, xmm4
+ pmaxsw xmm0, xmm5
+ pmaxsw xmm0, xmm7
+ movdqa xmm1, xmm0
+ punpckhqdq xmm0, xmm1
+ pmaxsw xmm0, xmm1
- movq [r3], xmm0
- POP_XMM
- LOAD_4_PARA_POP
- ret
+ movq [r3], xmm0
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
%macro MMX_Copy4Times 2
- movd %1, %2
- punpcklwd %1, %1
- punpckldq %1, %1
+ movd %1, %2
+ punpcklwd %1, %1
+ punpckldq %1, %1
%endmacro
SECTION .text
%macro MMX_Quant4 4
- pxor %2, %2
- pcmpgtw %2, %1
- pxor %1, %2
- psubw %1, %2
- paddusw %1, %3
- pmulhuw %1, %4
- pxor %1, %2
- psubw %1, %2
+ pxor %2, %2
+ pcmpgtw %2, %1
+ pxor %1, %2
+ psubw %1, %2
+ paddusw %1, %3
+ pmulhuw %1, %4
+ pxor %1, %2
+ psubw %1, %2
%endmacro
;***********************************************************************
@@ -189,101 +189,101 @@
;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2_mmx
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENSIONW r1, r1w
- SIGN_EXTENSIONW r2, r2w
- movd mm0, [r0]
- movd mm1, [r0 + 0x20]
- punpcklwd mm0, mm1
- movd mm3, [r0 + 0x40]
- movd mm1, [r0 + 0x60]
- punpcklwd mm3, mm1
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENSIONW r1, r1w
+ SIGN_EXTENSIONW r2, r2w
+ movd mm0, [r0]
+ movd mm1, [r0 + 0x20]
+ punpcklwd mm0, mm1
+ movd mm3, [r0 + 0x40]
+ movd mm1, [r0 + 0x60]
+ punpcklwd mm3, mm1
- ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
- movq mm5, mm3
- paddw mm3, mm0
- psubw mm0, mm5
- punpcklwd mm3, mm0
- movq mm1, mm3
- psrlq mm1, 32
- movq mm5, mm1
- paddw mm1, mm3
- psubw mm3, mm5
- punpcklwd mm1, mm3
+ ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
+ movq mm5, mm3
+ paddw mm3, mm0
+ psubw mm0, mm5
+ punpcklwd mm3, mm0
+ movq mm1, mm3
+ psrlq mm1, 32
+ movq mm5, mm1
+ paddw mm1, mm3
+ psubw mm3, mm5
+ punpcklwd mm1, mm3
- ;quant_2x2_dc
- MMX_Copy4Times mm3, r2d
- MMX_Copy4Times mm2, r1d
- MMX_Quant4 mm1, mm0, mm2, mm3
+ ;quant_2x2_dc
+ MMX_Copy4Times mm3, r2d
+ MMX_Copy4Times mm2, r1d
+ MMX_Quant4 mm1, mm0, mm2, mm3
- ; store dct_2x2
- movq [r3], mm1
- movq [r4], mm1
+ ; store dct_2x2
+ movq [r3], mm1
+ movq [r4], mm1
- ; pNonZeroCount of dct_2x2
- pcmpeqb mm2, mm2 ; mm2 = FF
- pxor mm3, mm3
- packsswb mm1, mm3
- pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
- psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
- psadbw mm1, mm3 ;
- mov r1w, 0
- mov [r0], r1w
- mov [r0 + 0x20], r1w
- mov [r0 + 0x40], r1w
- mov [r0 + 0x60], r1w
+ ; pNonZeroCount of dct_2x2
+ pcmpeqb mm2, mm2 ; mm2 = FF
+ pxor mm3, mm3
+ packsswb mm1, mm3
+ pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
+ psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
+ psadbw mm1, mm3 ;
+ mov r1w, 0
+ mov [r0], r1w
+ mov [r0 + 0x20], r1w
+ mov [r0 + 0x40], r1w
+ mov [r0 + 0x60], r1w
- movd retrd, mm1
+ movd retrd, mm1
- WELSEMMS
- LOAD_5_PARA_POP
- ret
+ WELSEMMS
+ LOAD_5_PARA_POP
+ ret
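
WelsHadamardQuant2x2_mmx gathers the four DC terms sitting at byte offsets 0x00/0x20/0x40/0x60, runs a 2x2 Hadamard butterfly over them, quantizes the result with the same |x|*mf>>16 scheme as above, writes it to both pDct and block, zeroes the source positions, and returns the number of nonzero outputs (the pcmpeqb/psubsb/psadbw tail). A loose scalar sketch; the output ordering of the Hadamard and the omission of saturation are simplifications:

    #include <stdint.h>

    /* Rough scalar model of the 2x2 DC Hadamard + quant. */
    static int32_t hadamard_quant_2x2_sketch(int16_t* rs, int16_t ff, int16_t mf,
                                             int16_t* pDct, int16_t* block) {
        int16_t a = rs[0], b = rs[16], c = rs[32], d = rs[48];
        int16_t h[4] = { (int16_t)(a + b + c + d), (int16_t)(a - b + c - d),
                         (int16_t)(a + b - c - d), (int16_t)(a - b - c + d) };
        int32_t nz = 0;
        for (int i = 0; i < 4; i++) {
            uint16_t t = (uint16_t)(h[i] < 0 ? -h[i] : h[i]);
            uint32_t q = (((uint32_t)t + (uint16_t)ff) * (uint16_t)mf) >> 16;
            int16_t v = (int16_t)(h[i] < 0 ? -(int32_t)q : (int32_t)q);
            pDct[i] = block[i] = v;
            nz += (v != 0);
        }
        rs[0] = rs[16] = rs[32] = rs[48] = 0;
        return nz;
    }
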
;***********************************************************************
;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff, int16_t mf);
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSIONW r1, r1w
- SIGN_EXTENSIONW r2, r2w
- movd mm0, [r0]
- movd mm1, [r0 + 0x20]
- punpcklwd mm0, mm1
- movd mm3, [r0 + 0x40]
- movd mm1, [r0 + 0x60]
- punpcklwd mm3, mm1
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSIONW r1, r1w
+ SIGN_EXTENSIONW r2, r2w
+ movd mm0, [r0]
+ movd mm1, [r0 + 0x20]
+ punpcklwd mm0, mm1
+ movd mm3, [r0 + 0x40]
+ movd mm1, [r0 + 0x60]
+ punpcklwd mm3, mm1
- ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
- movq mm5, mm3
- paddw mm3, mm0
- psubw mm0, mm5
- punpcklwd mm3, mm0
- movq mm1, mm3
- psrlq mm1, 32
- movq mm5, mm1
- paddw mm1, mm3
- psubw mm3, mm5
- punpcklwd mm1, mm3
+ ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
+ movq mm5, mm3
+ paddw mm3, mm0
+ psubw mm0, mm5
+ punpcklwd mm3, mm0
+ movq mm1, mm3
+ psrlq mm1, 32
+ movq mm5, mm1
+ paddw mm1, mm3
+ psubw mm3, mm5
+ punpcklwd mm1, mm3
- ;quant_2x2_dc
- MMX_Copy4Times mm3, r2d
- MMX_Copy4Times mm2, r1d
- MMX_Quant4 mm1, mm0, mm2, mm3
+ ;quant_2x2_dc
+ MMX_Copy4Times mm3, r2d
+ MMX_Copy4Times mm2, r1d
+ MMX_Quant4 mm1, mm0, mm2, mm3
- ; pNonZeroCount of dct_2x2
- pcmpeqb mm2, mm2 ; mm2 = FF
- pxor mm3, mm3
- packsswb mm1, mm3
- pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
- psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
- psadbw mm1, mm3 ;
- movd retrd, mm1
+ ; pNonZeroCount of dct_2x2
+ pcmpeqb mm2, mm2 ; mm2 = FF
+ pxor mm3, mm3
+ packsswb mm1, mm3
+ pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
+ psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
+ psadbw mm1, mm3 ;
+ movd retrd, mm1
- WELSEMMS
- ret
+ WELSEMMS
+ ret
%macro SSE2_DeQuant8 3
@@ -297,12 +297,12 @@
; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
;***********************************************************************
WELS_EXTERN WelsDequant4x4_sse2
- %assign push_num 0
- LOAD_2_PARA
+ %assign push_num 0
+ LOAD_2_PARA
- movdqa xmm1, [r1]
- SSE2_DeQuant8 [r0 ], xmm0, xmm1
- SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1
+ movdqa xmm1, [r1]
+ SSE2_DeQuant8 [r0 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1
ret
@@ -311,18 +311,18 @@
;***********************************************************************====
WELS_EXTERN WelsDequantFour4x4_sse2
- %assign push_num 0
- LOAD_2_PARA
+ %assign push_num 0
+ LOAD_2_PARA
- movdqa xmm1, [r1]
- SSE2_DeQuant8 [r0 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x10 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x20 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x30 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x40 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x50 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x60 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x70 ], xmm0, xmm1
+ movdqa xmm1, [r1]
+ SSE2_DeQuant8 [r0 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x10 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x20 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x30 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x40 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x50 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x60 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x70 ], xmm0, xmm1
ret
@@ -330,41 +330,41 @@
;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
;***********************************************************************
WELS_EXTERN WelsDequantIHadamard4x4_sse2
- %assign push_num 0
- LOAD_2_PARA
- %ifndef X86_32
- movzx r1, r1w
- %endif
+ %assign push_num 0
+ LOAD_2_PARA
+ %ifndef X86_32
+ movzx r1, r1w
+ %endif
- ; WelsDequantLumaDc4x4
- SSE2_Copy8Times xmm1, r1d
- ;psrlw xmm1, 2 ; for the (>>2) in ihdm
- MOVDQ xmm0, [r0]
- MOVDQ xmm2, [r0+0x10]
- pmullw xmm0, xmm1
- pmullw xmm2, xmm1
+ ; WelsDequantLumaDc4x4
+ SSE2_Copy8Times xmm1, r1d
+ ;psrlw xmm1, 2 ; for the (>>2) in ihdm
+ MOVDQ xmm0, [r0]
+ MOVDQ xmm2, [r0+0x10]
+ pmullw xmm0, xmm1
+ pmullw xmm2, xmm1
- ; ihdm_4x4
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- movdqa xmm3, xmm2
- psrldq xmm3, 8
+ ; ihdm_4x4
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ movdqa xmm3, xmm2
+ psrldq xmm3, 8
- SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
- SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
- SSE2_SumSub xmm3, xmm2, xmm5 ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
- SSE2_SumSub xmm0, xmm1, xmm5 ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
+ SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
+ SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
+ SSE2_SumSub xmm3, xmm2, xmm5 ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
+ SSE2_SumSub xmm0, xmm1, xmm5 ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
- SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4
- SSE2_SumSub xmm2, xmm4, xmm5
- SSE2_SumSub xmm1, xmm0, xmm5
- SSE2_SumSub xmm4, xmm0, xmm5
- SSE2_SumSub xmm2, xmm1, xmm5
- SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
+ SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4
+ SSE2_SumSub xmm2, xmm4, xmm5
+ SSE2_SumSub xmm1, xmm0, xmm5
+ SSE2_SumSub xmm4, xmm0, xmm5
+ SSE2_SumSub xmm2, xmm1, xmm5
+ SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
- punpcklqdq xmm0, xmm1
- MOVDQ [r0], xmm0
+ punpcklqdq xmm0, xmm1
+ MOVDQ [r0], xmm0
- punpcklqdq xmm2, xmm3
- MOVDQ [r0+16], xmm2
- ret
+ punpcklqdq xmm2, xmm3
+ MOVDQ [r0+16], xmm2
+ ret
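
The dequant entries run in the opposite direction: WelsDequant4x4_sse2 and WelsDequantFour4x4_sse2 scale the coefficients back by mf (the SSE2_DeQuant8 body sits outside this hunk, but the single movdqa of [r1] reused for every 16-byte chunk implies an element-wise multiply by an 8-entry vector), and WelsDequantIHadamard4x4_sse2 additionally applies the inverse 4x4 Hadamard to the DC plane. A hedged scalar sketch of the simple case:

    #include <stdint.h>

    /* Assumed scalar behaviour of the 4x4 dequant: coefficient-wise scale by mf,
       with one 8-entry mf vector reused for both halves of the block. */
    static void dequant4x4_sketch(int16_t* pDct, const uint16_t* mf) {
        for (int i = 0; i < 16; i++)
            pDct[i] = (int16_t)(pDct[i] * mf[i & 7]);
    }
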
--- a/codec/encoder/core/x86/sample_sc.asm
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -35,123 +35,123 @@
;**********************************************************************************************************************************
;
-; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16 base_cost[8], int32_t *index_min_cost )
+; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16 base_cost[8], int32_t *index_min_cost )
;
-; \note:
-; src need align with 16 bytes, ref is optional
-; \return value:
-; return minimal SAD cost, according index carried by index_min_cost
+; \note:
+; src need align with 16 bytes, ref is optional
+; \return value:
+; return minimal SAD cost, according index carried by index_min_cost
;**********************************************************************************************************************************
; try 8 mv via offset
; xmm7 store sad costs
-%macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
- movdqa xmm0, [%1]
- movdqu xmm1, [%2]
- movdqu xmm2, [%2+8h]
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
+%macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
+ movdqa xmm0, [%1]
+ movdqu xmm1, [%2]
+ movdqu xmm2, [%2+8h]
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
- mpsadbw xmm1, xmm0, 0 ; 000 B
- paddw xmm7, xmm1 ; accumulate cost
+ mpsadbw xmm1, xmm0, 0 ; 000 B
+ paddw xmm7, xmm1 ; accumulate cost
- mpsadbw xmm3, xmm0, 5 ; 101 B
- paddw xmm7, xmm3 ; accumulate cost
+ mpsadbw xmm3, xmm0, 5 ; 101 B
+ paddw xmm7, xmm3 ; accumulate cost
- mpsadbw xmm2, xmm0, 2 ; 010 B
- paddw xmm7, xmm2 ; accumulate cost
+ mpsadbw xmm2, xmm0, 2 ; 010 B
+ paddw xmm7, xmm2 ; accumulate cost
- mpsadbw xmm4, xmm0, 7 ; 111 B
- paddw xmm7, xmm4 ; accumulate cost
+ mpsadbw xmm4, xmm0, 7 ; 111 B
+ paddw xmm7, xmm4 ; accumulate cost
- add %1, %3
- add %2, %4
-%endmacro ; end of SAD_16x16_LINE_SSE41
-%macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
- movdqa xmm0, [%1]
- movdqu xmm1, [%2]
- movdqu xmm2, [%2+8h]
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
+ add %1, %3
+ add %2, %4
+%endmacro ; end of SAD_16x16_LINE_SSE41
+%macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
+ movdqa xmm0, [%1]
+ movdqu xmm1, [%2]
+ movdqu xmm2, [%2+8h]
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
- mpsadbw xmm1, xmm0, 0 ; 000 B
- paddw xmm7, xmm1 ; accumulate cost
+ mpsadbw xmm1, xmm0, 0 ; 000 B
+ paddw xmm7, xmm1 ; accumulate cost
- mpsadbw xmm3, xmm0, 5 ; 101 B
- paddw xmm7, xmm3 ; accumulate cost
+ mpsadbw xmm3, xmm0, 5 ; 101 B
+ paddw xmm7, xmm3 ; accumulate cost
- mpsadbw xmm2, xmm0, 2 ; 010 B
- paddw xmm7, xmm2 ; accumulate cost
+ mpsadbw xmm2, xmm0, 2 ; 010 B
+ paddw xmm7, xmm2 ; accumulate cost
- mpsadbw xmm4, xmm0, 7 ; 111 B
- paddw xmm7, xmm4 ; accumulate cost
-%endmacro ; end of SAD_16x16_LINE_SSE41E
+ mpsadbw xmm4, xmm0, 7 ; 111 B
+ paddw xmm7, xmm4 ; accumulate cost
+%endmacro ; end of SAD_16x16_LINE_SSE41E
WELS_EXTERN SampleSad16x16Hor8_sse41
;push ebx
;push esi
- ;mov eax, [esp+12] ; src
- ;mov ecx, [esp+16] ; stride_src
- ;mov ebx, [esp+20] ; ref
- ;mov edx, [esp+24] ; stride_ref
- ;mov esi, [esp+28] ; base_cost
+ ;mov eax, [esp+12] ; src
+ ;mov ecx, [esp+16] ; stride_src
+ ;mov ebx, [esp+20] ; ref
+ ;mov edx, [esp+24] ; stride_ref
+ ;mov esi, [esp+28] ; base_cost
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm7, xmm7
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm7, xmm7
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41E r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41E r0, r2, r1, r3
- pxor xmm0, xmm0
- movdqa xmm6, xmm7
- punpcklwd xmm6, xmm0
- punpckhwd xmm7, xmm0
+ pxor xmm0, xmm0
+ movdqa xmm6, xmm7
+ punpcklwd xmm6, xmm0
+ punpckhwd xmm7, xmm0
- movdqa xmm5, [r4]
- movdqa xmm4, xmm5
- punpcklwd xmm4, xmm0
- punpckhwd xmm5, xmm0
+ movdqa xmm5, [r4]
+ movdqa xmm4, xmm5
+ punpcklwd xmm4, xmm0
+ punpckhwd xmm5, xmm0
- paddd xmm4, xmm6
- paddd xmm5, xmm7
- movdqa xmm3, xmm4
- pminud xmm3, xmm5
- pshufd xmm2, xmm3, 01001110B
- pminud xmm2, xmm3
- pshufd xmm3, xmm2, 10110001B
- pminud xmm2, xmm3
- movd retrd, xmm2
- pcmpeqd xmm4, xmm2
- movmskps r2d, xmm4
- bsf r1d, r2d
- jnz near WRITE_INDEX
+ paddd xmm4, xmm6
+ paddd xmm5, xmm7
+ movdqa xmm3, xmm4
+ pminud xmm3, xmm5
+ pshufd xmm2, xmm3, 01001110B
+ pminud xmm2, xmm3
+ pshufd xmm3, xmm2, 10110001B
+ pminud xmm2, xmm3
+ movd retrd, xmm2
+ pcmpeqd xmm4, xmm2
+ movmskps r2d, xmm4
+ bsf r1d, r2d
+ jnz near WRITE_INDEX
- pcmpeqd xmm5, xmm2
- movmskps r2d, xmm5
- bsf r1d, r2d
- add r1d, 4
+ pcmpeqd xmm5, xmm2
+ movmskps r2d, xmm5
+ bsf r1d, r2d
+ add r1d, 4
WRITE_INDEX:
- mov [r5], r1d
+ mov [r5], r1d
POP_XMM
LOAD_6_PARA_POP
ret
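
SampleSad16x16Hor8_sse41 evaluates eight horizontally shifted candidates in one pass: each mpsadbw line accumulates the SADs for ref+0..ref+7, the base_cost vector is added, and the minimum plus its index are extracted at the end. A scalar model of the contract described by the prototype comment (helper name illustrative):

    #include <stdint.h>

    /* Try ref+k for k = 0..7; return the cheapest total cost, store its index. */
    static uint32_t sad16x16_hor8_sketch(const uint8_t* src, int32_t stride_src,
                                         const uint8_t* ref, int32_t stride_ref,
                                         const uint16_t base_cost[8],
                                         int32_t* index_min_cost) {
        uint32_t best = UINT32_MAX;
        for (int k = 0; k < 8; k++) {
            uint32_t cost = base_cost[k];
            for (int y = 0; y < 16; y++)
                for (int x = 0; x < 16; x++) {
                    int d = src[y * stride_src + x] - ref[y * stride_ref + x + k];
                    cost += (uint32_t)(d < 0 ? -d : d);
                }
            if (cost < best) { best = cost; *index_min_cost = k; }
        }
        return best;
    }
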
@@ -158,66 +158,66 @@
;**********************************************************************************************************************************
;
-; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
+; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
-; \note:
-; src and ref is optional to align with 16 due inter 8x8
-; \return value:
-; return minimal SAD cost, according index carried by index_min_cost
+; \note:
+; src and ref is optional to align with 16 due inter 8x8
+; \return value:
+; return minimal SAD cost, according index carried by index_min_cost
;
;**********************************************************************************************************************************
; try 8 mv via offset
; xmm7 store sad costs
-%macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
- movdqu xmm0, [%1]
- movdqu xmm1, [%2]
- movdqa xmm2, xmm1
+%macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
+ movdqu xmm0, [%1]
+ movdqu xmm1, [%2]
+ movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0 ; 000 B
- paddw xmm7, xmm1 ; accumulate cost
+ mpsadbw xmm1, xmm0, 0 ; 000 B
+ paddw xmm7, xmm1 ; accumulate cost
- mpsadbw xmm2, xmm0, 5 ; 101 B
- paddw xmm7, xmm2 ; accumulate cost
+ mpsadbw xmm2, xmm0, 5 ; 101 B
+ paddw xmm7, xmm2 ; accumulate cost
- add %1, %3
- add %2, %4
-%endmacro ; end of SAD_8x8_LINE_SSE41
-%macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
- movdqu xmm0, [%1]
- movdqu xmm1, [%2]
- movdqa xmm2, xmm1
+ add %1, %3
+ add %2, %4
+%endmacro ; end of SAD_8x8_LINE_SSE41
+%macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
+ movdqu xmm0, [%1]
+ movdqu xmm1, [%2]
+ movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0 ; 000 B
- paddw xmm7, xmm1 ; accumulate cost
+ mpsadbw xmm1, xmm0, 0 ; 000 B
+ paddw xmm7, xmm1 ; accumulate cost
- mpsadbw xmm2, xmm0, 5 ; 101 B
- paddw xmm7, xmm2 ; accumulate cost
-%endmacro ; end of SAD_8x8_LINE_SSE41E
+ mpsadbw xmm2, xmm0, 5 ; 101 B
+ paddw xmm7, xmm2 ; accumulate cost
+%endmacro ; end of SAD_8x8_LINE_SSE41E
WELS_EXTERN SampleSad8x8Hor8_sse41
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movdqa xmm7, [r4] ; load base cost list
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movdqa xmm7, [r4] ; load base cost list
- SAD_8x8_LINE_SSE41 r0, r2, r1, r3
- SAD_8x8_LINE_SSE41 r0, r2, r1, r3
- SAD_8x8_LINE_SSE41 r0, r2, r1, r3
- SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
- SAD_8x8_LINE_SSE41 r0, r2, r1, r3
- SAD_8x8_LINE_SSE41 r0, r2, r1, r3
- SAD_8x8_LINE_SSE41 r0, r2, r1, r3
- SAD_8x8_LINE_SSE41E r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41E r0, r2, r1, r3
- phminposuw xmm0, xmm7 ; horizon search the minimal sad cost and its index
- movd retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
- mov r1d, retrd
- and retrd, 0xFFFF
- sar r1d, 16
- mov [r5], r1d
+ phminposuw xmm0, xmm7 ; horizon search the minimal sad cost and its index
+ movd retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
+ mov r1d, retrd
+ and retrd, 0xFFFF
+ sar r1d, 16
+ mov [r5], r1d
POP_XMM
LOAD_6_PARA_POP
--- a/codec/encoder/core/x86/score.asm
+++ b/codec/encoder/core/x86/score.asm
@@ -104,32 +104,32 @@
align 16
high_mask_table:
- db 0, 0, 0, 3, 0, 2, 3, 6, 0, 2
- db 2, 5, 3, 5, 6, 9, 0, 1, 2, 5
- db 2, 4, 5, 8, 3, 5, 5, 8, 6, 8
- db 9,12, 0, 1, 1, 4, 2, 4, 5, 8
- db 2, 4, 4, 7, 5, 7, 8,11, 3, 4
- db 5, 8, 5, 7, 8,11, 6, 8, 8,11
- db 9,11,12,15, 0, 1, 1, 4, 1, 3
- db 4, 7, 2, 4, 4, 7, 5, 7, 8,11
- db 2, 3, 4, 7, 4, 6, 7,10, 5, 7
- db 7,10, 8,10,11,14, 3, 4, 4, 7
- db 5, 7, 8,11, 5, 7, 7,10, 8,10
- db 11,14, 6, 7, 8,11, 8,10,11,14
- db 9,11,11,14,12,14,15,18, 0, 0
- db 1, 4, 1, 3, 4, 7, 1, 3, 3, 6
- db 4, 6, 7,10, 2, 3, 4, 7, 4, 6
- db 7,10, 5, 7, 7,10, 8,10,11,14
- db 2, 3, 3, 6, 4, 6, 7,10, 4, 6
- db 6, 9, 7, 9,10,13, 5, 6, 7,10
- db 7, 9,10,13, 8,10,10,13,11,13
- db 14,17, 3, 4, 4, 7, 4, 6, 7,10
- db 5, 7, 7,10, 8,10,11,14, 5, 6
- db 7,10, 7, 9,10,13, 8,10,10,13
- db 11,13,14,17, 6, 7, 7,10, 8,10
- db 11,14, 8,10,10,13,11,13,14,17
- db 9,10,11,14,11,13,14,17,12,14
- db 14,17,15,17,18,21
+ db 0, 0, 0, 3, 0, 2, 3, 6, 0, 2
+ db 2, 5, 3, 5, 6, 9, 0, 1, 2, 5
+ db 2, 4, 5, 8, 3, 5, 5, 8, 6, 8
+ db 9,12, 0, 1, 1, 4, 2, 4, 5, 8
+ db 2, 4, 4, 7, 5, 7, 8,11, 3, 4
+ db 5, 8, 5, 7, 8,11, 6, 8, 8,11
+ db 9,11,12,15, 0, 1, 1, 4, 1, 3
+ db 4, 7, 2, 4, 4, 7, 5, 7, 8,11
+ db 2, 3, 4, 7, 4, 6, 7,10, 5, 7
+ db 7,10, 8,10,11,14, 3, 4, 4, 7
+ db 5, 7, 8,11, 5, 7, 7,10, 8,10
+ db 11,14, 6, 7, 8,11, 8,10,11,14
+ db 9,11,11,14,12,14,15,18, 0, 0
+ db 1, 4, 1, 3, 4, 7, 1, 3, 3, 6
+ db 4, 6, 7,10, 2, 3, 4, 7, 4, 6
+ db 7,10, 5, 7, 7,10, 8,10,11,14
+ db 2, 3, 3, 6, 4, 6, 7,10, 4, 6
+ db 6, 9, 7, 9,10,13, 5, 6, 7,10
+ db 7, 9,10,13, 8,10,10,13,11,13
+ db 14,17, 3, 4, 4, 7, 4, 6, 7,10
+ db 5, 7, 7,10, 8,10,11,14, 5, 6
+ db 7,10, 7, 9,10,13, 8,10,10,13
+ db 11,13,14,17, 6, 7, 7,10, 8,10
+ db 11,14, 8,10,10,13,11,13,14,17
+ db 9,10,11,14,11,13,14,17,12,14
+ db 14,17,15,17,18,21
align 16
low_mask_table:
@@ -167,78 +167,78 @@
;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
;***********************************************************************
WELS_EXTERN WelsScan4x4DcAc_sse2
- %ifdef X86_32
- push r3
- %assign push_num 1
- %else
- %assign push_num 0
- %endif
- LOAD_2_PARA
- movdqa xmm0, [r1] ; 7 6 5 4 3 2 1 0
- movdqa xmm1, [r1+16] ; f e d c b a 9 8
- pextrw r2d, xmm0, 7 ; ecx = 7
- pextrw r3d, xmm1, 2 ; edx = a
- pextrw r1d, xmm0, 5 ; eax = 5
- pinsrw xmm1, r2d, 2 ; f e d c b 7 9 8
- pinsrw xmm0, r1d, 7 ; 5 6 5 4 3 2 1 0
- pextrw r2d, xmm1, 0 ; ecx = 8
- pinsrw xmm0, r2d, 5 ; 5 6 8 4 3 2 1 0
- pinsrw xmm1, r3d, 0 ; f e d c b 7 9 a
- pshufd xmm2, xmm0, 0xd8 ; 5 6 3 2 8 4 1 0
- pshufd xmm3, xmm1, 0xd8 ; f e b 7 d c 9 a
- pshufhw xmm0, xmm2, 0x93 ; 6 3 2 5 8 4 1 0
- pshuflw xmm1, xmm3, 0x39 ; f e b 7 a d c 9
- movdqa [r0],xmm0
- movdqa [r0+16], xmm1
- %ifdef X86_32
- pop r3
- %endif
- ret
+ %ifdef X86_32
+ push r3
+ %assign push_num 1
+ %else
+ %assign push_num 0
+ %endif
+ LOAD_2_PARA
+ movdqa xmm0, [r1] ; 7 6 5 4 3 2 1 0
+ movdqa xmm1, [r1+16] ; f e d c b a 9 8
+ pextrw r2d, xmm0, 7 ; ecx = 7
+ pextrw r3d, xmm1, 2 ; edx = a
+ pextrw r1d, xmm0, 5 ; eax = 5
+ pinsrw xmm1, r2d, 2 ; f e d c b 7 9 8
+ pinsrw xmm0, r1d, 7 ; 5 6 5 4 3 2 1 0
+ pextrw r2d, xmm1, 0 ; ecx = 8
+ pinsrw xmm0, r2d, 5 ; 5 6 8 4 3 2 1 0
+ pinsrw xmm1, r3d, 0 ; f e d c b 7 9 a
+ pshufd xmm2, xmm0, 0xd8 ; 5 6 3 2 8 4 1 0
+ pshufd xmm3, xmm1, 0xd8 ; f e b 7 d c 9 a
+ pshufhw xmm0, xmm2, 0x93 ; 6 3 2 5 8 4 1 0
+ pshuflw xmm1, xmm3, 0x39 ; f e b 7 a d c 9
+ movdqa [r0],xmm0
+ movdqa [r0+16], xmm1
+ %ifdef X86_32
+ pop r3
+ %endif
+ ret
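
The shuffle comments above ("6 3 2 5 8 4 1 0" for the low half, "f e b 7 a d c 9" for the high half) spell out the standard 4x4 zig-zag order. A scalar sketch of the scan:

    #include <stdint.h>

    /* Scalar model of the DC+AC scan: reorder the 4x4 block into zig-zag order.
       The table below matches the index order in the SSE2 shuffle comments. */
    static const uint8_t kZigzag4x4[16] =
        { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };

    static void scan4x4_dcac_sketch(int16_t level[16], const int16_t* pDct) {
        for (int i = 0; i < 16; i++)
            level[i] = pDct[kZigzag4x4[i]];
    }
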
;***********************************************************************
;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
;***********************************************************************
WELS_EXTERN WelsScan4x4DcAc_ssse3
- %assign push_num 0
- LOAD_2_PARA
- movdqa xmm0, [r1]
- movdqa xmm1, [r1+16]
- pextrw r2d, xmm0, 7 ; ecx = [7]
- pextrw r1d, xmm1, 0 ; eax = [8]
- pinsrw xmm0, r1d, 7 ; xmm0[7] = [8]
- pinsrw xmm1, r2d, 0 ; xmm1[0] = [7]
- pshufb xmm1, [pb_scanacdc_maskb]
- pshufb xmm0, [pb_scanacdc_maska]
+ %assign push_num 0
+ LOAD_2_PARA
+ movdqa xmm0, [r1]
+ movdqa xmm1, [r1+16]
+ pextrw r2d, xmm0, 7 ; ecx = [7]
+ pextrw r1d, xmm1, 0 ; eax = [8]
+ pinsrw xmm0, r1d, 7 ; xmm0[7] = [8]
+ pinsrw xmm1, r2d, 0 ; xmm1[0] = [7]
+ pshufb xmm1, [pb_scanacdc_maskb]
+ pshufb xmm0, [pb_scanacdc_maska]
- movdqa [r0],xmm0
- movdqa [r0+16], xmm1
- ret
+ movdqa [r0],xmm0
+ movdqa [r0+16], xmm1
+ ret
;***********************************************************************
;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
;***********************************************************************
WELS_EXTERN WelsScan4x4Ac_sse2
- %assign push_num 0
- LOAD_2_PARA
- movdqa xmm0, [r1]
- movdqa xmm1, [r1+16]
- movdqa xmm2, xmm0
- punpcklqdq xmm0, xmm1
- punpckhqdq xmm2, xmm1
+ %assign push_num 0
+ LOAD_2_PARA
+ movdqa xmm0, [r1]
+ movdqa xmm1, [r1+16]
+ movdqa xmm2, xmm0
+ punpcklqdq xmm0, xmm1
+ punpckhqdq xmm2, xmm1
- movdqa xmm3, xmm0
- punpckldq xmm0, xmm2
- punpckhdq xmm3, xmm2
- pextrw r1d , xmm0, 3
- pextrw r2d , xmm0, 7
- pinsrw xmm0, r1d, 7
- pextrw r1d, xmm3, 4
- pinsrw xmm3, r2d, 4
- pextrw r2d, xmm3, 0
- pinsrw xmm3, r1d, 0
- pinsrw xmm0, r2d, 3
+ movdqa xmm3, xmm0
+ punpckldq xmm0, xmm2
+ punpckhdq xmm3, xmm2
+ pextrw r1d , xmm0, 3
+ pextrw r2d , xmm0, 7
+ pinsrw xmm0, r1d, 7
+ pextrw r1d, xmm3, 4
+ pinsrw xmm3, r2d, 4
+ pextrw r2d, xmm3, 0
+ pinsrw xmm3, r1d, 0
+ pinsrw xmm0, r2d, 3
- pshufhw xmm1, xmm0, 0x93
- pshuflw xmm2, xmm3, 0x39
+ pshufhw xmm1, xmm0, 0x93
+ pshuflw xmm2, xmm3, 0x39
movdqa xmm3, xmm2
psrldq xmm1, 2
@@ -245,9 +245,9 @@
pslldq xmm3, 14
por xmm1, xmm3
psrldq xmm2, 2
- movdqa [r0],xmm1
- movdqa [r0+16], xmm2
- ret
+ movdqa [r0],xmm1
+ movdqa [r0+16], xmm2
+ ret
;***********************************************************************
@@ -254,19 +254,19 @@
;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;***********************************************************************
WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
- %ifdef X86_32
- push r3
- %assign push_num 1
- %else
- %assign push_num 0
- %endif
- LOAD_1_PARA
- movdqa xmm0, [r0]
- movdqa xmm1, [r0+16]
+ %ifdef X86_32
+ push r3
+ %assign push_num 1
+ %else
+ %assign push_num 0
+ %endif
+ LOAD_1_PARA
+ movdqa xmm0, [r0]
+ movdqa xmm1, [r0+16]
- packsswb xmm0, xmm1
- ; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
- xor r3, r3
+ packsswb xmm0, xmm1
+ ; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
+ xor r3, r3
pxor xmm3, xmm3
pcmpeqb xmm0, xmm3
pmovmskb r3d, xmm0
@@ -273,39 +273,39 @@
xor r3, 0xffff
- xor r0, r0
- mov r2, 7
- mov r1, 8
+ xor r0, r0
+ mov r2, 7
+ mov r1, 8
.loop_low8_find1:
- bt r3, r2
- jc .loop_high8_find1
- dec r2
- jnz .loop_low8_find1
+ bt r3, r2
+ jc .loop_high8_find1
+ dec r2
+ jnz .loop_low8_find1
.loop_high8_find1:
- bt r3, r1
- jc .find1end
- inc r1
- cmp r1,16
- jb .loop_high8_find1
+ bt r3, r1
+ jc .find1end
+ inc r1
+ cmp r1,16
+ jb .loop_high8_find1
.find1end:
- sub r1, r2
- sub r1, 1
- lea r2, [i_ds_table]
- add r0b, [r2+r1]
- mov r1, r3
- and r3, 0xff
- shr r1, 8
- and r1, 0xff
- lea r2 , [low_mask_table]
- add r0b, [r2 +r3]
- lea r2, [high_mask_table]
- add r0b, [r2+r1]
- %ifdef X86_32
- pop r3
- %else
- mov retrd, r0d
- %endif
- ret
+ sub r1, r2
+ sub r1, 1
+ lea r2, [i_ds_table]
+ add r0b, [r2+r1]
+ mov r1, r3
+ and r3, 0xff
+ shr r1, 8
+ and r1, 0xff
+ lea r2 , [low_mask_table]
+ add r0b, [r2 +r3]
+ lea r2, [high_mask_table]
+ add r0b, [r2+r1]
+ %ifdef X86_32
+ pop r3
+ %else
+ mov retrd, r0d
+ %endif
+ ret
;***********************************************************************
@@ -312,28 +312,28 @@
; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;***********************************************************************
WELS_EXTERN WelsGetNoneZeroCount_sse2
- %assign push_num 0
- LOAD_1_PARA
- movdqa xmm0, [r0]
- movdqa xmm1, [r0+16]
- pxor xmm2, xmm2
- pcmpeqw xmm0, xmm2
- pcmpeqw xmm1, xmm2
- packsswb xmm1, xmm0
- xor r1, r1
- pmovmskb r1d, xmm1
- xor r1d, 0xffff
- mov r2, r1
- and r1, 0xff
- shr r2, 8
-; and ecx, 0xff ; we do not need this due to high 16bits equal to 0 yet
-; xor retr, retr
- ;add al, [nozero_count_table+r2]
- lea r0 , [nozero_count_table]
- movzx r2, byte [r0+r2]
- movzx r1, byte [r0+r1]
- mov retrq, r2
- add retrq, r1
- ;add al, [nozero_count_table+r1]
- ret
+ %assign push_num 0
+ LOAD_1_PARA
+ movdqa xmm0, [r0]
+ movdqa xmm1, [r0+16]
+ pxor xmm2, xmm2
+ pcmpeqw xmm0, xmm2
+ pcmpeqw xmm1, xmm2
+ packsswb xmm1, xmm0
+ xor r1, r1
+ pmovmskb r1d, xmm1
+ xor r1d, 0xffff
+ mov r2, r1
+ and r1, 0xff
+ shr r2, 8
+; and ecx, 0xff ; we do not need this due to high 16bits equal to 0 yet
+; xor retr, retr
+ ;add al, [nozero_count_table+r2]
+ lea r0 , [nozero_count_table]
+ movzx r2, byte [r0+r2]
+ movzx r1, byte [r0+r1]
+ mov retrq, r2
+ add retrq, r1
+ ;add al, [nozero_count_table+r1]
+ ret
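
WelsGetNoneZeroCount_sse2 builds a 16-bit mask of nonzero coefficients (pcmpeqw against zero, pmovmskb, xor 0xffff) and adds two nozero_count_table lookups, one per mask byte; the net effect is just a population count of nonzero entries. Scalar sketch:

    #include <stdint.h>

    /* Scalar equivalent: count the nonzero values among the 16 coefficients. */
    static int32_t get_nonzero_count_sketch(const int16_t* level) {
        int32_t n = 0;
        for (int i = 0; i < 16; i++)
            n += (level[i] != 0);
        return n;
    }
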
--- a/codec/processing/src/arm/adaptive_quantization.S
+++ b/codec/processing/src/arm/adaptive_quantization.S
@@ -36,17 +36,17 @@
#ifdef __APPLE__
.macro SQR_ADD_16BYTES
- vmull.u8 q3, $0, $0
- vmull.u8 q8, $1, $1
- vpadal.u16 $2, q3
- vpadal.u16 $2, q8
+ vmull.u8 q3, $0, $0
+ vmull.u8 q8, $1, $1
+ vpadal.u16 $2, q3
+ vpadal.u16 $2, q8
.endm
#else
.macro SQR_ADD_16BYTES arg0, arg1, arg2
- vmull.u8 q3, \arg0, \arg0
- vmull.u8 q8, \arg1, \arg1
- vpadal.u16 \arg2, q3
- vpadal.u16 \arg2, q8
+ vmull.u8 q3, \arg0, \arg0
+ vmull.u8 q8, \arg1, \arg1
+ vpadal.u16 \arg2, q3
+ vpadal.u16 \arg2, q8
.endm
#endif
@@ -54,66 +54,66 @@
WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
stmdb sp!, {r4}
- vld1.8 {q15}, [r0], r1 //save the ref data (16bytes)
- vld1.8 {q14}, [r2], r3 //save the src data (16bytes)
+ vld1.8 {q15}, [r0], r1 //save the ref data (16bytes)
+ vld1.8 {q14}, [r2], r3 //save the src data (16bytes)
- vabd.u8 q13, q14, q15
- vmull.u8 q12, d27, d27
- vmull.u8 q11, d26, d26
- vaddl.u16 q12, d24, d25
- vpadal.u16 q12, q11 //sqr
+ vabd.u8 q13, q14, q15
+ vmull.u8 q12, d27, d27
+ vmull.u8 q11, d26, d26
+ vaddl.u16 q12, d24, d25
+ vpadal.u16 q12, q11 //sqr
vaddl.u8 q13, d26, d27 //sum
- vaddl.u8 q10, d28, d29 //sum_cur
+ vaddl.u8 q10, d28, d29 //sum_cur
- vmull.u8 q9, d29, d29
- vmull.u8 q8, d28, d28
- vaddl.u16 q9, d18, d19 //sqr_cur
- vpadal.u16 q9, q8
+ vmull.u8 q9, d29, d29
+ vmull.u8 q8, d28, d28
+ vaddl.u16 q9, d18, d19 //sqr_cur
+ vpadal.u16 q9, q8
- mov r4, #15
+ mov r4, #15
pixel_var_16x16_loop0:
- vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
- vld1.8 {q1}, [r2], r3 //save the src data (16bytes)
+ vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
+ vld1.8 {q1}, [r2], r3 //save the src data (16bytes)
- vabd.u8 q2, q0, q1
+ vabd.u8 q2, q0, q1
- //q10 save sum_cur
- vpadal.u8 q10, q1
+ //q10 save sum_cur
+ vpadal.u8 q10, q1
- //q12 save sqr
- SQR_ADD_16BYTES d4, d5, q12
+ //q12 save sqr
+ SQR_ADD_16BYTES d4, d5, q12
//q13 save sum
- vpadal.u8 q13, q2
+ vpadal.u8 q13, q2
- subs r4, #1
+ subs r4, #1
- //q9 save sqr_cur
- SQR_ADD_16BYTES d2, d3, q9
+ //q9 save sqr_cur
+ SQR_ADD_16BYTES d2, d3, q9
- bne pixel_var_16x16_loop0
+ bne pixel_var_16x16_loop0
- vadd.u16 d0, d26, d27 //sum
- vadd.u16 d1, d20, d21 //sum_cur
- vpaddl.u16 q0, q0
- vadd.u32 d2, d24, d25 //sqr
- vadd.u32 d3, d18, d19 //sqr_cur
- vpadd.u32 d0, d0, d1
- vpadd.u32 d1, d2, d3
+ vadd.u16 d0, d26, d27 //sum
+ vadd.u16 d1, d20, d21 //sum_cur
+ vpaddl.u16 q0, q0
+ vadd.u32 d2, d24, d25 //sqr
+ vadd.u32 d3, d18, d19 //sqr_cur
+ vpadd.u32 d0, d0, d1
+ vpadd.u32 d1, d2, d3
- ldr r4, [sp, #4]
+ ldr r4, [sp, #4]
- vshr.u32 q0, q0, #8
- vmul.u32 d0, d0
- vsub.u32 d0, d1, d0
+ vshr.u32 q0, q0, #8
+ vmul.u32 d0, d0
+ vsub.u32 d0, d1, d0
vmovl.u32 q0, d0
- vst2.16 {d0[0], d1[0]}, [r4]
+ vst2.16 {d0[0], d1[0]}, [r4]
- ldmia sp!, {r4}
+ ldmia sp!, {r4}
WELS_ASM_FUNC_END
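
SampleVariance16x16_neon keeps four running totals over the 16x16 block: sum and sum of squares of the frame difference |ref - src|, and sum and sum of squares of the source pixels; the final vshr/vmul/vsub sequence turns each pair into a variance (mean of squares minus square of mean, means taken over the 256 pixels), and the two vst2.16 lanes store the low 16 bits of each. A scalar sketch; the struct and field names are illustrative, not taken from this diff:

    #include <stdint.h>

    typedef struct { uint16_t uiMotionIndex; uint16_t uiTextureIndex; } SMotionTextureSketch;

    /* Variance of the frame difference ("motion") and of the source block ("texture"). */
    static void sample_variance16x16_sketch(const uint8_t* ref, int32_t ref_stride,
                                            const uint8_t* src, int32_t src_stride,
                                            SMotionTextureSketch* out) {
        uint32_t sum_d = 0, sqr_d = 0, sum_s = 0, sqr_s = 0;
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++) {
                uint8_t s = src[y * src_stride + x], r = ref[y * ref_stride + x];
                uint32_t d = s > r ? (uint32_t)(s - r) : (uint32_t)(r - s);
                sum_d += d;  sqr_d += d * d;
                sum_s += s;  sqr_s += (uint32_t)s * s;
            }
        out->uiMotionIndex  = (uint16_t)(sqr_d / 256 - (sum_d / 256) * (sum_d / 256));
        out->uiTextureIndex = (uint16_t)(sqr_s / 256 - (sum_s / 256) * (sum_s / 256));
    }
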
--- a/codec/processing/src/arm/down_sample_neon.S
+++ b/codec/processing/src/arm/down_sample_neon.S
@@ -30,196 +30,196 @@
*
*/
-#ifdef HAVE_NEON
+#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
-WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
- stmdb sp!, {r4-r8, lr}
+WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
+ stmdb sp!, {r4-r8, lr}
- //Get the width and height
- ldr r4, [sp, #24] //src_width
- ldr r5, [sp, #28] //src_height
+ //Get the width and height
+ ldr r4, [sp, #24] //src_width
+ ldr r5, [sp, #28] //src_height
- //Initialize the register
- mov r6, r2
- mov r8, r0
- mov lr, #0
- lsr r5, #1
+ //Initialize the register
+ mov r6, r2
+ mov r8, r0
+ mov lr, #0
+ lsr r5, #1
- //Save the tailer for the unasigned size
- mla r7, r1, r5, r0
- vld1.32 {q15}, [r7]
+ //Save the tailer for the unasigned size
+ mla r7, r1, r5, r0
+ vld1.32 {q15}, [r7]
- add r7, r2, r3
-    //processing a column of data
+ add r7, r2, r3
+    //processing a column of data
comp_ds_bilinear_loop0:
- vld1.8 {q0,q1}, [r2]!
- vld1.8 {q2,q3}, [r7]!
- vpaddl.u8 q0, q0
- vpaddl.u8 q1, q1
- vpaddl.u8 q2, q2
- vpaddl.u8 q3, q3
- vrshr.u16 q0, #1
- vrshr.u16 q1, #1
- vrshr.u16 q2, #1
- vrshr.u16 q3, #1
- vrhadd.u16 q0, q2
- vrhadd.u16 q1, q3
- vmovn.u16 d0, q0
- vmovn.u16 d1, q1
- vst1.32 {q0}, [r0]!
- add lr, #32
+ vld1.8 {q0,q1}, [r2]!
+ vld1.8 {q2,q3}, [r7]!
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vrshr.u16 q0, #1
+ vrshr.u16 q1, #1
+ vrshr.u16 q2, #1
+ vrshr.u16 q3, #1
+ vrhadd.u16 q0, q2
+ vrhadd.u16 q1, q3
+ vmovn.u16 d0, q0
+ vmovn.u16 d1, q1
+ vst1.32 {q0}, [r0]!
+ add lr, #32
- cmp lr, r4
- movcs lr, #0
- addcs r6, r6, r3, lsl #1
- movcs r2, r6
- addcs r7, r2, r3
- addcs r8, r1
- movcs r0, r8
- subscs r5, #1
- bne comp_ds_bilinear_loop0
+ cmp lr, r4
+ movcs lr, #0
+ addcs r6, r6, r3, lsl #1
+ movcs r2, r6
+ addcs r7, r2, r3
+ addcs r8, r1
+ movcs r0, r8
+ subscs r5, #1
+ bne comp_ds_bilinear_loop0
- //restore the tailer for the unasigned size
- vst1.32 {q15}, [r0]
+ //restore the tailer for the unasigned size
+ vst1.32 {q15}, [r0]
- ldmia sp!, {r4-r8,lr}
+ ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END
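
The dyadic downsamplers halve the picture in both directions; per the vpaddl / vrshr #1 / vrhadd sequence, each output pixel is a rounded average of a 2x2 source block, computed as a rounded horizontal average per row and then a rounded average of the two rows. Scalar sketch of one output pixel:

    #include <stdint.h>

    /* One output pixel of the dyadic bilinear downsample, matching the
       vpaddl -> vrshr #1 -> vrhadd rounding order of the NEON code. */
    static uint8_t dyadic_downsample_pixel_sketch(const uint8_t* src, int32_t src_stride,
                                                  int32_t x, int32_t y) {
        const uint8_t* p = src + 2 * y * src_stride + 2 * x;
        uint16_t top = (uint16_t)((p[0] + p[1] + 1) >> 1);
        uint16_t bot = (uint16_t)((p[src_stride] + p[src_stride + 1] + 1) >> 1);
        return (uint8_t)((top + bot + 1) >> 1);
    }
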
-WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
- stmdb sp!, {r4-r7, lr}
+WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
+ stmdb sp!, {r4-r7, lr}
- //Get the width and height
- ldr r4, [sp, #20] //src_width
- ldr r5, [sp, #24] //src_height
+ //Get the width and height
+ ldr r4, [sp, #20] //src_width
+ ldr r5, [sp, #24] //src_height
- //Get the difference
- sub lr, r3, r4
- sub r1, r1, r4, lsr #1
+ //Get the difference
+ sub lr, r3, r4
+ sub r1, r1, r4, lsr #1
- lsr r5, #1
+ lsr r5, #1
-    //processing a column of data
+    //processing a column of data
comp_ds_bilinear_w_x8_loop0:
- lsr r6, r4, #3
- add r7, r2, r3
- //processing a line data
+ lsr r6, r4, #3
+ add r7, r2, r3
+ //processing a line data
comp_ds_bilinear_w_x8_loop1:
- vld1.8 {d0}, [r2]!
- vld1.8 {d1}, [r7]!
- vpaddl.u8 q0, q0
- vrshr.u16 q0, #1
- vrhadd.u16 d0, d1
+ vld1.8 {d0}, [r2]!
+ vld1.8 {d1}, [r7]!
+ vpaddl.u8 q0, q0
+ vrshr.u16 q0, #1
+ vrhadd.u16 d0, d1
- vmovn.u16 d0, q0
- vst1.32 {d0[0]}, [r0]!
- subs r6, #1
- bne comp_ds_bilinear_w_x8_loop1
+ vmovn.u16 d0, q0
+ vst1.32 {d0[0]}, [r0]!
+ subs r6, #1
+ bne comp_ds_bilinear_w_x8_loop1
- add r2, r7, lr
- add r0, r1
- subs r5, #1
- bne comp_ds_bilinear_w_x8_loop0
+ add r2, r7, lr
+ add r0, r1
+ subs r5, #1
+ bne comp_ds_bilinear_w_x8_loop0
- ldmia sp!, {r4-r7,lr}
+ ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
- stmdb sp!, {r4-r7, lr}
+WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
+ stmdb sp!, {r4-r7, lr}
- //Get the width and height
- ldr r4, [sp, #20] //src_width
- ldr r5, [sp, #24] //src_height
+ //Get the width and height
+ ldr r4, [sp, #20] //src_width
+ ldr r5, [sp, #24] //src_height
- //Get the difference
- sub lr, r3, r4
- sub r1, r1, r4, lsr #1
+ //Get the difference
+ sub lr, r3, r4
+ sub r1, r1, r4, lsr #1
- lsr r5, #1
+ lsr r5, #1
-    //processing a column of data
+    //processing a column of data
comp_ds_bilinear_w_x16_loop0:
- lsr r6, r4, #4
- add r7, r2, r3
- //processing a line data
+ lsr r6, r4, #4
+ add r7, r2, r3
+ //processing a line data
comp_ds_bilinear_w_x16_loop1:
- vld1.8 {q0}, [r2]!
- vld1.8 {q1}, [r7]!
- vpaddl.u8 q0, q0
- vpaddl.u8 q1, q1
- vrshr.u16 q0, #1
- vrshr.u16 q1, #1
- vrhadd.u16 q0, q1
+ vld1.8 {q0}, [r2]!
+ vld1.8 {q1}, [r7]!
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vrshr.u16 q0, #1
+ vrshr.u16 q1, #1
+ vrhadd.u16 q0, q1
- vmovn.u16 d0, q0
- vst1.32 {d0}, [r0]!
- subs r6, #1
- bne comp_ds_bilinear_w_x16_loop1
+ vmovn.u16 d0, q0
+ vst1.32 {d0}, [r0]!
+ subs r6, #1
+ bne comp_ds_bilinear_w_x16_loop1
- add r2, r7, lr
- add r0, r1
- subs r5, #1
- bne comp_ds_bilinear_w_x16_loop0
+ add r2, r7, lr
+ add r0, r1
+ subs r5, #1
+ bne comp_ds_bilinear_w_x16_loop0
- ldmia sp!, {r4-r7,lr}
+ ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
- stmdb sp!, {r4-r7, lr}
+WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
+ stmdb sp!, {r4-r7, lr}
- //Get the width and height
- ldr r4, [sp, #20] //src_width
- ldr r5, [sp, #24] //src_height
+ //Get the width and height
+ ldr r4, [sp, #20] //src_width
+ ldr r5, [sp, #24] //src_height
- //Get the difference
- sub lr, r3, r4
- sub r1, r1, r4, lsr #1
+ //Get the difference
+ sub lr, r3, r4
+ sub r1, r1, r4, lsr #1
- lsr r5, #1
+ lsr r5, #1
-    //processing a column of data
+    //processing a column of data
comp_ds_bilinear_w_x32_loop0:
- lsr r6, r4, #5
- add r7, r2, r3
- //processing a line data
+ lsr r6, r4, #5
+ add r7, r2, r3
+ //processing a line data
comp_ds_bilinear_w_x32_loop1:
- vld1.8 {q0,q1}, [r2]!
- vld1.8 {q2,q3}, [r7]!
- vpaddl.u8 q0, q0
- vpaddl.u8 q1, q1
- vpaddl.u8 q2, q2
- vpaddl.u8 q3, q3
- vrshr.u16 q0, #1
- vrshr.u16 q1, #1
- vrshr.u16 q2, #1
- vrshr.u16 q3, #1
- vrhadd.u16 q0, q2
- vrhadd.u16 q1, q3
+ vld1.8 {q0,q1}, [r2]!
+ vld1.8 {q2,q3}, [r7]!
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vrshr.u16 q0, #1
+ vrshr.u16 q1, #1
+ vrshr.u16 q2, #1
+ vrshr.u16 q3, #1
+ vrhadd.u16 q0, q2
+ vrhadd.u16 q1, q3
- vmovn.u16 d0, q0
- vmovn.u16 d1, q1
- vst1.32 {q0}, [r0]!
- subs r6, #1
- bne comp_ds_bilinear_w_x32_loop1
+ vmovn.u16 d0, q0
+ vmovn.u16 d1, q1
+ vst1.32 {q0}, [r0]!
+ subs r6, #1
+ bne comp_ds_bilinear_w_x32_loop1
- add r2, r7, lr
- add r0, r1
- subs r5, #1
- bne comp_ds_bilinear_w_x32_loop0
+ add r2, r7, lr
+ add r0, r1
+ subs r5, #1
+ bne comp_ds_bilinear_w_x32_loop0
- ldmia sp!, {r4-r7,lr}
+ ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
@@ -226,117 +226,117 @@
WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
stmdb sp!, {r4-r12, lr}
- //Get the data from stack
- ldr r4, [sp, #40] //the addr of src
- ldr r5, [sp, #44] //the value of src_stride
+ //Get the data from stack
+ ldr r4, [sp, #40] //the addr of src
+ ldr r5, [sp, #44] //the value of src_stride
ldr r6, [sp, #48] //the value of scaleX
ldr r7, [sp, #52] //the value of scaleY
mov r10, #32768
sub r10, #1
- and r8, r6, r10 // r8 uinc(scaleX mod 32767)
+ and r8, r6, r10 // r8 uinc(scaleX mod 32767)
mov r11, #-1
- mul r11, r8 // r11 -uinc
+ mul r11, r8 // r11 -uinc
vdup.s16 d2, r8
vdup.s16 d0, r11
vzip.s16 d0, d2 // uinc -uinc uinc -uinc
- and r9, r7, r10 // r9 vinc(scaleY mod 32767)
+ and r9, r7, r10 // r9 vinc(scaleY mod 32767)
mov r11, #-1
- mul r11, r9 // r11 -vinc
+ mul r11, r9 // r11 -vinc
- vdup.s16 d2, r9
- vdup.s16 d3, r11
- vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc
+ vdup.s16 d2, r9
+ vdup.s16 d3, r11
+ vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc
- mov r11, #0x40000000
+ mov r11, #0x40000000
mov r12, #0x4000
sub r12, #1
add r11, r12
- vdup.s32 d1, r11; //init u 16384 16383 16384 16383
+ vdup.s32 d1, r11; //init u 16384 16383 16384 16383
- mov r11, #16384
+ mov r11, #16384
vdup.s16 d16, r11
sub r11, #1
- vdup.s16 d17, r11
- vext.8 d7, d17, d16, #4 //init v 16384 16384 16383 16383
+ vdup.s16 d17, r11
+ vext.8 d7, d17, d16, #4 //init v 16384 16384 16383 16383
- veor q14, q14
- sub r1, r2 // stride - width
- mov r8, #16384 // yInverse
- sub r3, #1
+ veor q14, q14
+ sub r1, r2 // stride - width
+ mov r8, #16384 // yInverse
+ sub r3, #1
_HEIGHT:
ldr r4, [sp, #40] //the addr of src
- mov r11, r8
- lsr r11, #15
- mul r11, r5
- add r11, r4 // get current row address
- mov r12, r11
- add r12, r5
+ mov r11, r8
+ lsr r11, #15
+ mul r11, r5
+ add r11, r4 // get current row address
+ mov r12, r11
+ add r12, r5
- mov r9, #16384 // xInverse
- sub r10, r2, #1
+ mov r9, #16384 // xInverse
+ sub r10, r2, #1
vmov.s16 d6, d1
_WIDTH:
- mov lr, r9
- lsr lr, #15
+ mov lr, r9
+ lsr lr, #15
add r4, r11,lr
- vld2.8 {d28[0],d29[0]}, [r4] //q14: 0000000b0000000a;
+ vld2.8 {d28[0],d29[0]}, [r4] //q14: 0000000b0000000a;
add r4, r12,lr
- vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a;
- vzip.32 d28, d29 //q14: 000d000c000b000a;
+ vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a;
+ vzip.32 d28, d29 //q14: 000d000c000b000a;
- vmull.u16 q13, d6, d7 //q13: init u * init v
- vmull.u32 q12, d26,d28
- vmlal.u32 q12, d27,d29
- vqadd.u64 d24, d24,d25
- vrshr.u64 d24, #30
+ vmull.u16 q13, d6, d7 //q13: init u * init v
+ vmull.u32 q12, d26,d28
+ vmlal.u32 q12, d27,d29
+ vqadd.u64 d24, d24,d25
+ vrshr.u64 d24, #30
- vst1.8 {d24[0]}, [r0]!
- add r9, r6
- vadd.u16 d6, d0 // inc u
- vshl.u16 d6, #1
- vshr.u16 d6, #1
- subs r10, #1
- bne _WIDTH
+ vst1.8 {d24[0]}, [r0]!
+ add r9, r6
+ vadd.u16 d6, d0 // inc u
+ vshl.u16 d6, #1
+ vshr.u16 d6, #1
+ subs r10, #1
+ bne _WIDTH
WIDTH_END:
- lsr r9, #15
+ lsr r9, #15
add r4,r11,r9
- vld1.8 {d24[0]}, [r4]
- vst1.8 {d24[0]}, [r0]
- add r0, #1
- add r8, r7
- add r0, r1
- vadd.s16 d7, d5 // inc v
- vshl.u16 d7, #1
- vshr.u16 d7, #1
- subs r3, #1
- bne _HEIGHT
+ vld1.8 {d24[0]}, [r4]
+ vst1.8 {d24[0]}, [r0]
+ add r0, #1
+ add r8, r7
+ add r0, r1
+ vadd.s16 d7, d5 // inc v
+ vshl.u16 d7, #1
+ vshr.u16 d7, #1
+ subs r3, #1
+ bne _HEIGHT
LAST_ROW:
ldr r4, [sp, #40] //the addr of src
- lsr r8, #15
- mul r8, r5
- add r4, r8 // get current row address
- mov r9, #16384
+ lsr r8, #15
+ mul r8, r5
+ add r4, r8 // get current row address
+ mov r9, #16384
_LAST_ROW_WIDTH:
- mov r11, r9
- lsr r11, #15
+ mov r11, r9
+ lsr r11, #15
- add r3, r4,r11
- vld1.8 {d0[0]}, [r3]
- vst1.8 {d0[0]}, [r0]
- add r0, #1
- add r9, r6
- subs r2, #1
- bne _LAST_ROW_WIDTH
+ add r3, r4,r11
+ vld1.8 {d0[0]}, [r3]
+ vst1.8 {d0[0]}, [r0]
+ add r0, #1
+ add r9, r6
+ subs r2, #1
+ bne _LAST_ROW_WIDTH
- ldmia sp!, {r4-r12, lr}
+ ldmia sp!, {r4-r12, lr}
WELS_ASM_FUNC_END
#endif
--- a/codec/processing/src/arm/pixel_sad_neon.S
+++ b/codec/processing/src/arm/pixel_sad_neon.S
@@ -37,32 +37,32 @@
WELS_ASM_FUNC_BEGIN WelsProcessingSampleSad8x8_neon
stmdb sp!, {lr}
- //Loading a horizontal line data (8 bytes)
- vld1.8 {d0}, [r0], r1
- vld1.8 {d1}, [r2], r3
+ //Loading a horizontal line data (8 bytes)
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d1}, [r2], r3
- //Do the SAD for 8 bytes
- vabdl.u8 q1, d0, d1
+ //Do the SAD for 8 bytes
+ vabdl.u8 q1, d0, d1
- mov lr, #7
+ mov lr, #7
pixel_sad_8x8_loop0:
//Loading a horizontal line data (8 bytes)
- vld1.8 {d0}, [r0], r1
- vld1.8 {d1}, [r2], r3
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d1}, [r2], r3
- subs lr, #1
+ subs lr, #1
- //Do the SAD for 8 bytes
- vabal.u8 q1, d0, d1
- bne pixel_sad_8x8_loop0
+ //Do the SAD for 8 bytes
+ vabal.u8 q1, d0, d1
+ bne pixel_sad_8x8_loop0
- vadd.u16 d2, d3
- vpaddl.u16 d2, d2
- vpaddl.u32 d2, d2
- vmov.u32 r0, d2[0]//TBO...
+ vadd.u16 d2, d3
+ vpaddl.u16 d2, d2
+ vpaddl.u32 d2, d2
+ vmov.u32 r0, d2[0]//TBO...
- ldmia sp!, {lr}
+ ldmia sp!, {lr}
WELS_ASM_FUNC_END
#endif
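
WelsProcessingSampleSad8x8_neon is a straightforward 8x8 SAD: one vabdl to seed the accumulator, seven vabal lines to accumulate, then a horizontal reduction into r0. Scalar sketch:

    #include <stdint.h>

    /* Scalar equivalent of the 8x8 SAD. */
    static int32_t sample_sad8x8_sketch(const uint8_t* a, int32_t a_stride,
                                        const uint8_t* b, int32_t b_stride) {
        int32_t sad = 0;
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++) {
                int d = a[y * a_stride + x] - b[y * b_stride + x];
                sad += d < 0 ? -d : d;
            }
        return sad;
    }
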
--- a/codec/processing/src/arm/vaa_calc_neon.S
+++ b/codec/processing/src/arm/vaa_calc_neon.S
@@ -37,61 +37,61 @@
#ifdef __APPLE__
.macro ABS_SUB_SUM_16BYTES
- vld1.32 {q15}, [$0], $2
- vld1.32 {q14}, [$1], $2
- vabal.u8 $3, d30, d28
- vabal.u8 $4, d31, d29
+ vld1.32 {q15}, [$0], $2
+ vld1.32 {q14}, [$1], $2
+ vabal.u8 $3, d30, d28
+ vabal.u8 $4, d31, d29
.endm
.macro ABS_SUB_SUM_8x16BYTES
- vld1.32 {q15}, [$0], $2
- vld1.32 {q14}, [$1], $2
- vabdl.u8 $3, d30, d28
- vabdl.u8 $4, d31, d29
+ vld1.32 {q15}, [$0], $2
+ vld1.32 {q14}, [$1], $2
+ vabdl.u8 $3, d30, d28
+ vabdl.u8 $4, d31, d29
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+ ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+ ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+ ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+ ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+ ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+ ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+ ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
.endm
.macro SAD_8X16BITS
- vadd.u16 d31, $0, $1
- vpaddl.u16 d31, d31
- vpaddl.u32 $2, d31
+ vadd.u16 d31, $0, $1
+ vpaddl.u16 d31, d31
+ vpaddl.u32 $2, d31
.endm
#else
.macro ABS_SUB_SUM_16BYTES arg0, arg1, arg2, arg3, arg4
- vld1.32 {q15}, [\arg0], \arg2
- vld1.32 {q14}, [\arg1], \arg2
- vabal.u8 \arg3, d30, d28
- vabal.u8 \arg4, d31, d29
+ vld1.32 {q15}, [\arg0], \arg2
+ vld1.32 {q14}, [\arg1], \arg2
+ vabal.u8 \arg3, d30, d28
+ vabal.u8 \arg4, d31, d29
.endm
.macro ABS_SUB_SUM_8x16BYTES arg0, arg1, arg2, arg3, arg4
- vld1.32 {q15}, [\arg0], \arg2
- vld1.32 {q14}, [\arg1], \arg2
- vabdl.u8 \arg3, d30, d28
- vabdl.u8 \arg4, d31, d29
+ vld1.32 {q15}, [\arg0], \arg2
+ vld1.32 {q14}, [\arg1], \arg2
+ vabdl.u8 \arg3, d30, d28
+ vabdl.u8 \arg4, d31, d29
- ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
- ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
- ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
- ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
- ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
- ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
- ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+ ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+ ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+ ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+ ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+ ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+ ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+ ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
.endm
.macro SAD_8X16BITS arg0, arg1, arg2
- vadd.u16 d31, \arg0, \arg1
- vpaddl.u16 d31, d31
- vpaddl.u32 \arg2, d31
+ vadd.u16 d31, \arg0, \arg1
+ vpaddl.u16 d31, d31
+ vpaddl.u32 \arg2, d31
.endm
#endif
@@ -100,16 +100,16 @@
stmdb sp!, {r4-r8}
- ldr r4, [sp, #20] //load pic_stride
- ldr r5, [sp, #28] //load psad8x8
+ ldr r4, [sp, #20] //load pic_stride
+ ldr r5, [sp, #28] //load psad8x8
- //Initial the Q8 register for save the "psadframe"
- vmov.s64 q8, #0
+ //Initial the Q8 register for save the "psadframe"
+ vmov.s64 q8, #0
- //Get the jump distance to use on loop codes
- lsl r8, r4, #4
- sub r7, r8, #16 //R7 keep the 16*pic_stride-16
- sub r8, r2 //R8 keep the 16*pic_stride-pic_width
+ //Get the jump distance to use on loop codes
+ lsl r8, r4, #4
+ sub r7, r8, #16 //R7 keep the 16*pic_stride-16
+ sub r8, r2 //R8 keep the 16*pic_stride-pic_width
vaa_calc_sad_loop0:
@@ -118,44 +118,44 @@
vaa_calc_sad_loop1:
- //Process the 16x16 bytes
- ABS_SUB_SUM_8x16BYTES r0, r1, r4, q0, q1
- ABS_SUB_SUM_8x16BYTES r0, r1, r4, q2, q3
+ //Process the 16x16 bytes
+ ABS_SUB_SUM_8x16BYTES r0, r1, r4, q0, q1
+ ABS_SUB_SUM_8x16BYTES r0, r1, r4, q2, q3
- //Do the SAD
- SAD_8X16BITS d0, d1, d0
- SAD_8X16BITS d2, d3, d1
- SAD_8X16BITS d4, d5, d2
- SAD_8X16BITS d6, d7, d3
+ //Do the SAD
+ SAD_8X16BITS d0, d1, d0
+ SAD_8X16BITS d2, d3, d1
+ SAD_8X16BITS d4, d5, d2
+ SAD_8X16BITS d6, d7, d3
- //Write to "psad8x8" buffer
- vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]!
+ //Write to "psad8x8" buffer
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]!
- //Adjust the input address
- sub r0, r7
- sub r1, r7
+ //Adjust the input address
+ sub r0, r7
+ sub r1, r7
- subs r6, #16
+ subs r6, #16
- //Save to calculate "psadframe"
- vadd.u32 q0, q1
- vadd.u32 q8, q0
+ //Save to calculate "psadframe"
+ vadd.u32 q0, q1
+ vadd.u32 q8, q0
- bne vaa_calc_sad_loop1
+ bne vaa_calc_sad_loop1
- //Adjust the input address
- add r0, r8
- add r1, r8
+ //Adjust the input address
+ add r0, r8
+ add r1, r8
subs r3, #16
- bne vaa_calc_sad_loop0
+ bne vaa_calc_sad_loop0
- ldr r6, [sp, #24] //load psadframe
- vadd.u32 d16, d17
- vst1.32 {d16[0]}, [r6]
+ ldr r6, [sp, #24] //load psadframe
+ vadd.u32 d16, d17
+ vst1.32 {d16[0]}, [r6]
- ldmia sp!, {r4-r8}
+ ldmia sp!, {r4-r8}
WELS_ASM_FUNC_END
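
The function ending here walks the frame in 16x16 macroblocks, emitting one SAD per 8x8 quadrant into psad8x8 and summing everything into psadframe (held in q8 until the final store). A scalar sketch, assuming the parameter layout implied by the stack-offset comments (pic_stride at sp+20, psadframe at sp+24, psad8x8 at sp+28); the prototype is inferred, not the verified one:

#include <cstdint>

// Illustrative scalar version of the SAD pass: four 8x8 SADs per macroblock
// (stored top-left, top-right, bottom-left, bottom-right) plus a frame total.
static void VAACalcSad_ref (const uint8_t* pCur, const uint8_t* pRef,
                            int32_t iWidth, int32_t iHeight, int32_t iStride,
                            uint32_t* pFrameSad, uint32_t* pSad8x8) {
  uint32_t uiFrameSad = 0;
  uint32_t* pOut = pSad8x8;
  for (int32_t my = 0; my < iHeight; my += 16) {
    for (int32_t mx = 0; mx < iWidth; mx += 16) {
      for (int32_t blk = 0; blk < 4; ++blk) {
        int32_t iBase = (my + (blk >> 1) * 8) * iStride + mx + (blk & 1) * 8;
        uint32_t uiSad = 0;
        for (int32_t y = 0; y < 8; ++y)
          for (int32_t x = 0; x < 8; ++x) {
            int32_t d = pCur[iBase + y * iStride + x] - pRef[iBase + y * iStride + x];
            uiSad += (d < 0) ? -d : d;
          }
        *pOut++ = uiSad;
        uiFrameSad += uiSad;
      }
    }
  }
  *pFrameSad = uiFrameSad;
}
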
@@ -162,26 +162,26 @@
#ifdef __APPLE__
.macro SAD_SD_MAD_16BYTES
- vld1.32 {q0}, [$0], $2
- vld1.32 {q1}, [$1], $2
+ vld1.32 {q0}, [$0], $2
+ vld1.32 {q1}, [$1], $2
- vpadal.u8 $3, q0
- vpadal.u8 $4, q1
+ vpadal.u8 $3, q0
+ vpadal.u8 $4, q1
- vabd.u8 q0, q0, q1
- vmax.u8 $5, q0
- vpadal.u8 $6, q0
+ vabd.u8 q0, q0, q1
+ vmax.u8 $5, q0
+ vpadal.u8 $6, q0
.endm
.macro SAD_SD_MAD_8x16BYTES
- vld1.32 {q0}, [$0], $2
- vld1.32 {q1}, [$1], $2
+ vld1.32 {q0}, [$0], $2
+ vld1.32 {q1}, [$1], $2
- vpaddl.u8 q2, q0
- vpaddl.u8 q3, q1
+ vpaddl.u8 q2, q0
+ vpaddl.u8 q3, q1
- vabd.u8 $3, q0, q1
- vpaddl.u8 $4, $3 //abs_diff
+ vabd.u8 $3, q0, q1
+ vpaddl.u8 $4, $3 //abs_diff
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
@@ -192,41 +192,41 @@
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
- vsub.u16 $5, q2, q3
+ vsub.u16 $5, q2, q3
.endm
.macro SAD_SD_MAD_CALC
- vpmax.u8 d0, $0, $1 //8bytes
- vpmax.u8 d0, d0, d0 //4bytes
- vpmax.u8 $2, d0, d0 //2bytes
+ vpmax.u8 d0, $0, $1 //8bytes
+ vpmax.u8 d0, d0, d0 //4bytes
+ vpmax.u8 $2, d0, d0 //2bytes
- vpaddl.u16 $3, $3
- vpaddl.u32 $3, $3
- vpaddl.s16 $4, $4
- vpaddl.s32 $4, $4
+ vpaddl.u16 $3, $3
+ vpaddl.u32 $3, $3
+ vpaddl.s16 $4, $4
+ vpaddl.s32 $4, $4
.endm
#else
.macro SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6
- vld1.32 {q0}, [\arg0], \arg2
- vld1.32 {q1}, [\arg1], \arg2
+ vld1.32 {q0}, [\arg0], \arg2
+ vld1.32 {q1}, [\arg1], \arg2
- vpadal.u8 \arg3, q0
- vpadal.u8 \arg4, q1
+ vpadal.u8 \arg3, q0
+ vpadal.u8 \arg4, q1
- vabd.u8 q0, q0, q1
- vmax.u8 \arg5, q0
- vpadal.u8 \arg6, q0
+ vabd.u8 q0, q0, q1
+ vmax.u8 \arg5, q0
+ vpadal.u8 \arg6, q0
.endm
.macro SAD_SD_MAD_8x16BYTES arg0, arg1, arg2, arg3, arg4, arg5
- vld1.32 {q0}, [\arg0], \arg2
- vld1.32 {q1}, [\arg1], \arg2
+ vld1.32 {q0}, [\arg0], \arg2
+ vld1.32 {q1}, [\arg1], \arg2
- vpaddl.u8 q2, q0
- vpaddl.u8 q3, q1
+ vpaddl.u8 q2, q0
+ vpaddl.u8 q3, q1
- vabd.u8 \arg3, q0, q1
- vpaddl.u8 \arg4, \arg3 //abs_diff
+ vabd.u8 \arg3, q0, q1
+ vpaddl.u8 \arg4, \arg3 //abs_diff
SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
@@ -237,18 +237,18 @@
SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
- vsub.u16 \arg5, q2, q3
+ vsub.u16 \arg5, q2, q3
.endm
.macro SAD_SD_MAD_CALC arg0, arg1, arg2, arg3, arg4
- vpmax.u8 d0, \arg0, \arg1 //8bytes
- vpmax.u8 d0, d0, d0 //4bytes
- vpmax.u8 \arg2, d0, d0 //2bytes
+ vpmax.u8 d0, \arg0, \arg1 //8bytes
+ vpmax.u8 d0, d0, d0 //4bytes
+ vpmax.u8 \arg2, d0, d0 //2bytes
- vpaddl.u16 \arg3, \arg3
- vpaddl.u32 \arg3, \arg3
- vpaddl.s16 \arg4, \arg4
- vpaddl.s32 \arg4, \arg4
+ vpaddl.u16 \arg3, \arg3
+ vpaddl.u32 \arg3, \arg3
+ vpaddl.s16 \arg4, \arg4
+ vpaddl.s32 \arg4, \arg4
.endm
#endif
@@ -256,18 +256,18 @@
stmdb sp!, {r4-r10}
- ldr r4, [sp, #28] //load pic_stride
- ldr r5, [sp, #36] //load psad8x8
+ ldr r4, [sp, #28] //load pic_stride
+ ldr r5, [sp, #36] //load psad8x8
ldr r6, [sp, #40] //load psd8x8
ldr r7, [sp, #44] //load pmad8x8
- //Initial the Q4 register for save the "psadframe"
- vmov.s64 q15, #0
+ //Initial the Q4 register for save the "psadframe"
+ vmov.s64 q15, #0
- //Get the jump distance to use on loop codes
- lsl r10, r4, #4
- sub r9, r10, #16 //R9 keep the 16*pic_stride-16
- sub r10, r2 //R10 keep the 16*pic_stride-pic_width
+ //Get the jump distance to use on loop codes
+ lsl r10, r4, #4
+ sub r9, r10, #16 //R9 keep the 16*pic_stride-16
+ sub r10, r2 //R10 keep the 16*pic_stride-pic_width
vaa_calc_sad_bgd_loop0:
@@ -276,40 +276,40 @@
vaa_calc_sad_bgd_loop1:
- //Process the 16x16 bytes pmad psad psd
- SAD_SD_MAD_8x16BYTES r0, r1, r4, q13, q11, q9
- SAD_SD_MAD_8x16BYTES r0, r1, r4, q14, q12, q10
+ //Process the 16x16 bytes pmad psad psd
+ SAD_SD_MAD_8x16BYTES r0, r1, r4, q13, q11, q9
+ SAD_SD_MAD_8x16BYTES r0, r1, r4, q14, q12, q10
SAD_SD_MAD_CALC d26, d27, d16, q11, q9
SAD_SD_MAD_CALC d28, d29, d17, q12, q10
- //Write to "psad8x8" buffer
- vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]!
- //Adjust the input address
- sub r0, r9
- sub r1, r9
- //Write to "psd8x8" buffer
- vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]!
- subs r8, #16
- //Write to "pmad8x8" buffer
- vst2.16 {d16[0],d17[0]}, [r7]!
- //Save to calculate "psadframe"
- vadd.u32 q11, q12
- vadd.u32 q15, q11
+ //Write to "psad8x8" buffer
+ vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]!
+ //Adjust the input address
+ sub r0, r9
+ sub r1, r9
+ //Write to "psd8x8" buffer
+ vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]!
+ subs r8, #16
+ //Write to "pmad8x8" buffer
+ vst2.16 {d16[0],d17[0]}, [r7]!
+ //Save to calculate "psadframe"
+ vadd.u32 q11, q12
+ vadd.u32 q15, q11
- bne vaa_calc_sad_bgd_loop1
+ bne vaa_calc_sad_bgd_loop1
- //Adjust the input address
- add r0, r10
- add r1, r10
+ //Adjust the input address
+ add r0, r10
+ add r1, r10
subs r3, #16
- bne vaa_calc_sad_bgd_loop0
+ bne vaa_calc_sad_bgd_loop0
- ldr r8, [sp, #32] //load psadframe
- vadd.u32 d30, d31
- vst1.32 {d30[0]}, [r8]
- ldmia sp!, {r4-r10}
+ ldr r8, [sp, #32] //load psadframe
+ vadd.u32 d30, d31
+ vst1.32 {d30[0]}, [r8]
+ ldmia sp!, {r4-r10}
WELS_ASM_FUNC_END
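
Compared with the plain SAD routine, the background-detection variant above also emits, per 8x8 block, the signed difference of the pixel sums (psd8x8) and the maximum absolute pixel difference (pmad8x8); that is what the extra vpadal/vmax work in the SAD_SD_MAD_* macros implements. A scalar sketch of those per-block statistics, with illustrative names:

#include <cstdint>
#include <algorithm>

// Per-8x8 statistics gathered by the _bgd variant, in scalar form.
struct SBlockStat8x8 {
  uint32_t uiSad;   // sum of |cur - ref|             -> psad8x8
  int32_t  iSd;     // sum(cur) - sum(ref)            -> psd8x8
  uint8_t  uiMad;   // max |cur - ref| over the block -> pmad8x8
};

static SBlockStat8x8 BlockStat8x8_ref (const uint8_t* pCur, const uint8_t* pRef,
                                       int32_t iStride) {
  SBlockStat8x8 s = {0, 0, 0};
  for (int32_t y = 0; y < 8; ++y) {
    for (int32_t x = 0; x < 8; ++x) {
      int32_t d = pCur[x] - pRef[x];
      uint8_t ad = (uint8_t) (d < 0 ? -d : d);
      s.uiSad += ad;
      s.iSd   += d;                        // equals sum(cur) - sum(ref)
      s.uiMad  = std::max (s.uiMad, ad);
    }
    pCur += iStride;
    pRef += iStride;
  }
  return s;
}
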
@@ -316,344 +316,344 @@
#ifdef __APPLE__
.macro SSD_MUL_SUM_16BYTES_RESET
- vmull.u8 $3, $0, $0
- vpaddl.u16 $2, $3
+ vmull.u8 $3, $0, $0
+ vpaddl.u16 $2, $3
- vmull.u8 $3, $1, $1
- vpadal.u16 $2, $3
+ vmull.u8 $3, $1, $1
+ vpadal.u16 $2, $3
.endm
.macro SSD_MUL_SUM_16BYTES
- vmull.u8 $3, $0, $0
- vpadal.u16 $2, $3
+ vmull.u8 $3, $0, $0
+ vpadal.u16 $2, $3
- vmull.u8 $3, $1, $1
- vpadal.u16 $2, $3
+ vmull.u8 $3, $1, $1
+ vpadal.u16 $2, $3
.endm
.macro SAD_SSD_BGD_16
- vld1.8 {q0}, [$0], $2 //load cur_row
+ vld1.8 {q0}, [$0], $2 //load cur_row
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
+ vpadal.u8 q3, q0 //add cur_row together
+ vpadal.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
+ vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
- vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
- SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
- vld1.8 {q1}, [$1], $2 //load ref_row
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vld1.8 {q1}, [$1], $2 //load ref_row
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
//the last row of a 16x16 block
.macro SAD_SSD_BGD_16_end
- vld1.8 {q0}, [$0], $1 //load cur_row
+ vld1.8 {q0}, [$0], $1 //load cur_row
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
+ vpadal.u8 q3, q0 //add cur_row together
+ vpadal.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
+ vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
- vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16
+ vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16
- SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
//for the begin of a 8x16 block, use some instructions to reset the register
.macro SAD_SSD_BGD_16_RESET_8x8
- vld1.8 {q0}, [$0], $2 //load cur_row
+ vld1.8 {q0}, [$0], $2 //load cur_row
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
+ vpaddl.u8 q3, q0 //add cur_row together
+ vpaddl.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
+ vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
- vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
- SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
- vld1.8 {q1}, [$1], $2 //load ref_row
+ vld1.8 {q1}, [$1], $2 //load ref_row
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
//for the begin of a 16x16 block, use some instructions to reset the register
.macro SAD_SSD_BGD_16_RESET_16x16
- vld1.8 {q0}, [$0], $2 //load cur_row
- vld1.8 {q1}, [$1], $2 //load ref_row
+ vld1.8 {q0}, [$0], $2 //load cur_row
+ vld1.8 {q1}, [$1], $2 //load ref_row
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
+ vpaddl.u8 q3, q0 //add cur_row together
+ vpaddl.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
+ vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
- vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
- SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
- vld1.8 {q1}, [$1], $2 //load ref_row
+ vld1.8 {q1}, [$1], $2 //load ref_row
- vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11 //q10 for lsqsum reset for every 16x16
.endm
//for each 8x16 block
.macro SAD_SSD_BGD_CALC_8x16
- vpmax.u8 d10, d10, d11 //4 numbers
- vpmax.u8 d10, d10, d10 //2 numbers
- vpmax.u8 d10, d10, d10 //1 number1
+ vpmax.u8 d10, d10, d11 //4 numbers
+ vpmax.u8 d10, d10, d10 //2 numbers
+ vpmax.u8 d10, d10, d10 //1 number1
- vmov $0, d10 //d26 d27 keeps the l_mad
+ vmov $0, d10 //d26 d27 keeps the l_mad
- //p_sd8x8 fix me
- vpaddl.u16 q3, q3
- vpaddl.u16 q4, q4
+ //p_sd8x8 fix me
+ vpaddl.u16 q3, q3
+ vpaddl.u16 q4, q4
- vsub.i32 $1, q3, q4
- vpaddl.u32 $1, $1
+ vsub.i32 $1, q3, q4
+ vpaddl.u32 $1, $1
- //psad8x8
- vpaddl.u16 $2, $2
- vpaddl.u32 $2, $2
+ //psad8x8
+ vpaddl.u16 $2, $2
+ vpaddl.u32 $2, $2
- //psadframe
- vadd.i32 q12, $2
+ //psadframe
+ vadd.i32 q12, $2
.endm
.macro SAD_SSD_BGD_16x16
- //for one 8x16
- SAD_SSD_BGD_16_RESET_16x16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
+ //for one 8x16
+ SAD_SSD_BGD_16_RESET_16x16 $0, $1, $2, q6
+ SAD_SSD_BGD_16 $0, $1, $2, q6
+ SAD_SSD_BGD_16 $0, $1, $2, q6
+ SAD_SSD_BGD_16 $0, $1, $2, q6
+ SAD_SSD_BGD_16 $0, $1, $2, q6
+ SAD_SSD_BGD_16 $0, $1, $2, q6
+ SAD_SSD_BGD_16 $0, $1, $2, q6
+ SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_CALC_8x16 d26, q14, q6
+ SAD_SSD_BGD_CALC_8x16 d26, q14, q6
- //for another 8x16
- SAD_SSD_BGD_16_RESET_8x8 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16_end $0, $2, q7
+ //for another 8x16
+ SAD_SSD_BGD_16_RESET_8x8 $0, $1, $2, q7
+ SAD_SSD_BGD_16 $0, $1, $2, q7
+ SAD_SSD_BGD_16 $0, $1, $2, q7
+ SAD_SSD_BGD_16 $0, $1, $2, q7
+ SAD_SSD_BGD_16 $0, $1, $2, q7
+ SAD_SSD_BGD_16 $0, $1, $2, q7
+ SAD_SSD_BGD_16 $0, $1, $2, q7
+ SAD_SSD_BGD_16_end $0, $2, q7
- SAD_SSD_BGD_CALC_8x16 d27, q15, q7
+ SAD_SSD_BGD_CALC_8x16 d27, q15, q7
.endm
.macro SSD_SAD_SD_MAD_PADDL
- vpaddl.s16 $0, $0
- vpaddl.s32 $0, $0
- vadd.i32 $1, $1, $2
+ vpaddl.s16 $0, $0
+ vpaddl.s32 $0, $0
+ vadd.i32 $1, $1, $2
.endm
#else
.macro SSD_MUL_SUM_16BYTES_RESET arg0, arg1, arg2, arg3
- vmull.u8 \arg3, \arg0, \arg0
- vpaddl.u16 \arg2, \arg3
+ vmull.u8 \arg3, \arg0, \arg0
+ vpaddl.u16 \arg2, \arg3
- vmull.u8 \arg3, \arg1, \arg1
- vpadal.u16 \arg2, \arg3
+ vmull.u8 \arg3, \arg1, \arg1
+ vpadal.u16 \arg2, \arg3
.endm
.macro SSD_MUL_SUM_16BYTES arg0, arg1, arg2, arg3
- vmull.u8 \arg3, \arg0, \arg0
- vpadal.u16 \arg2, \arg3
+ vmull.u8 \arg3, \arg0, \arg0
+ vpadal.u16 \arg2, \arg3
- vmull.u8 \arg3, \arg1, \arg1
- vpadal.u16 \arg2, \arg3
+ vmull.u8 \arg3, \arg1, \arg1
+ vpadal.u16 \arg2, \arg3
.endm
.macro SAD_SSD_BGD_16 arg0, arg1, arg2, arg3
- vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+ vld1.8 {q0}, [\arg0], \arg2 //load cur_row
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
+ vpadal.u8 q3, q0 //add cur_row together
+ vpadal.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
+ vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
- vpadal.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpadal.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
- SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
- vld1.8 {q1}, [\arg1], \arg2 //load ref_row
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vld1.8 {q1}, [\arg1], \arg2 //load ref_row
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
//the last row of a 16x16 block
.macro SAD_SSD_BGD_16_end arg0, arg1, arg2
- vld1.8 {q0}, [\arg0], \arg1 //load cur_row
+ vld1.8 {q0}, [\arg0], \arg1 //load cur_row
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
+ vpadal.u8 q3, q0 //add cur_row together
+ vpadal.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
+ vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
- vpadal.u8 \arg2, q2 //l_sad for 16 bytes reset for every 8x16
+ vpadal.u8 \arg2, q2 //l_sad for 16 bytes reset for every 8x16
- SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
//for the begin of a 8x16 block, use some instructions to reset the register
.macro SAD_SSD_BGD_16_RESET_8x8 arg0, arg1, arg2, arg3
- vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+ vld1.8 {q0}, [\arg0], \arg2 //load cur_row
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
+ vpaddl.u8 q3, q0 //add cur_row together
+ vpaddl.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
+ vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
- vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
- SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
- vld1.8 {q1}, [\arg1], \arg2 //load ref_row
+ vld1.8 {q1}, [\arg1], \arg2 //load ref_row
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
//for the begin of a 16x16 block, use some instructions to reset the register
.macro SAD_SSD_BGD_16_RESET_16x16 arg0, arg1, arg2, arg3
- vld1.8 {q0}, [\arg0], \arg2 //load cur_row
- vld1.8 {q1}, [\arg1], \arg2 //load ref_row
+ vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+ vld1.8 {q1}, [\arg1], \arg2 //load ref_row
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
+ vpaddl.u8 q3, q0 //add cur_row together
+ vpaddl.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
+ vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
- vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
- SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
- vld1.8 {q1}, [\arg1], \arg2 //load ref_row
+ vld1.8 {q1}, [\arg1], \arg2 //load ref_row
- vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11 //q10 for lsqsum reset for every 16x16
.endm
//for each 8x16 block
.macro SAD_SSD_BGD_CALC_8x16 arg0, arg1, arg2
- vpmax.u8 d10, d10, d11 //4 numbers
- vpmax.u8 d10, d10, d10 //2 numbers
- vpmax.u8 d10, d10, d10 //1 number1
+ vpmax.u8 d10, d10, d11 //4 numbers
+ vpmax.u8 d10, d10, d10 //2 numbers
+ vpmax.u8 d10, d10, d10 //1 number1
- vmov \arg0, d10 //d26 d27 keeps the l_mad
+ vmov \arg0, d10 //d26 d27 keeps the l_mad
- //p_sd8x8
- vpaddl.u16 q3, q3
- vpaddl.u16 q4, q4
+ //p_sd8x8
+ vpaddl.u16 q3, q3
+ vpaddl.u16 q4, q4
- vsub.i32 \arg1, q3, q4
- vpaddl.u32 \arg1, \arg1
+ vsub.i32 \arg1, q3, q4
+ vpaddl.u32 \arg1, \arg1
- //psad8x8
- vpaddl.u16 \arg2, \arg2
- vpaddl.u32 \arg2, \arg2
+ //psad8x8
+ vpaddl.u16 \arg2, \arg2
+ vpaddl.u32 \arg2, \arg2
- //psadframe
- vadd.i32 q12, \arg2
+ //psadframe
+ vadd.i32 q12, \arg2
.endm
.macro SAD_SSD_BGD_16x16 arg0, arg1, arg2
- //for one 8x16
- SAD_SSD_BGD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+ //for one 8x16
+ SAD_SSD_BGD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_BGD_CALC_8x16 d26, q14, q6
+ SAD_SSD_BGD_CALC_8x16 d26, q14, q6
- //for another 8x16
- SAD_SSD_BGD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_BGD_16_end \arg0, \arg2, q7
+ //for another 8x16
+ SAD_SSD_BGD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_BGD_16_end \arg0, \arg2, q7
- SAD_SSD_BGD_CALC_8x16 d27, q15, q7
+ SAD_SSD_BGD_CALC_8x16 d27, q15, q7
.endm
.macro SSD_SAD_SD_MAD_PADDL arg0, arg1, arg2
- vpaddl.s16 \arg0, \arg0
- vpaddl.s32 \arg0, \arg0
- vadd.i32 \arg1, \arg1, \arg2
+ vpaddl.s16 \arg0, \arg0
+ vpaddl.s32 \arg0, \arg0
+ vadd.i32 \arg1, \arg1, \arg2
.endm
#endif
WELS_ASM_FUNC_BEGIN VAACalcSadSsdBgd_neon
- stmdb sp!, {r0-r12, r14}
- vpush {q4-q7}
+ stmdb sp!, {r0-r12, r14}
+ vpush {q4-q7}
- ldr r4, [sp, #120] //r4 keeps the pic_stride
+ ldr r4, [sp, #120] //r4 keeps the pic_stride
- sub r5, r4, #1
- lsl r5, r5, #4 //r5 keeps the little step
+ sub r5, r4, #1
+ lsl r5, r5, #4 //r5 keeps the little step
- lsl r6, r4, #4
- sub r6, r2, r6 //r6 keeps the big step
+ lsl r6, r4, #4
+ sub r6, r2, r6 //r6 keeps the big step
- ldr r8, [sp, #128]//psad8x8
- ldr r9, [sp, #132]//psum16x16
- ldr r10, [sp, #136]//psqsum16x16
- ldr r11, [sp, #140]//psqdiff16x16
- ldr r12, [sp, #144]//p_sd8x8
- ldr r14, [sp, #148]//p_mad8x8
+ ldr r8, [sp, #128]//psad8x8
+ ldr r9, [sp, #132]//psum16x16
+ ldr r10, [sp, #136]//psqsum16x16
+ ldr r11, [sp, #140]//psqdiff16x16
+ ldr r12, [sp, #144]//p_sd8x8
+ ldr r14, [sp, #148]//p_mad8x8
- vmov.i8 q12, #0
+ vmov.i8 q12, #0
vaa_calc_sad_ssd_bgd_height_loop:
@@ -660,7 +660,7 @@
mov r7, r2
vaa_calc_sad_ssd_bgd_width_loop:
- //l_sd q14&q15, l_mad q13, l_sad q6 & q7, l_sqdiff q8, l_sum q9, l_sqsum q10
+ //l_sd q14&q15, l_mad q13, l_sad q6 & q7, l_sqdiff q8, l_sum q9, l_sqsum q10
SAD_SSD_BGD_16x16 r0,r1,r4
//psad8x8
@@ -694,20 +694,20 @@
bne vaa_calc_sad_ssd_bgd_width_loop
- sub r0, r0, r6 //jump to next 16 x width
- sub r1, r1, r6 //jump to next 16 x width
+ sub r0, r0, r6 //jump to next 16 x width
+ sub r1, r1, r6 //jump to next 16 x width
subs r3, #16
bne vaa_calc_sad_ssd_bgd_height_loop
- //psadframe
- ldr r7, [sp, #124]//psadframe
+ //psadframe
+ ldr r7, [sp, #124]//psadframe
- vadd.i32 d24, d24, d25
- vst1.32 {d24[0]}, [r7]
+ vadd.i32 d24, d24, d25
+ vst1.32 {d24[0]}, [r7]
- vpop {q4-q7}
- ldmia sp!, {r0-r12, r14}
+ vpop {q4-q7}
+ ldmia sp!, {r0-r12, r14}
WELS_ASM_FUNC_END
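
VAACalcSadSsdBgd_neon combines everything: per 8x8 block it produces SAD, SD and MAD as above, and per 16x16 macroblock it additionally keeps the pixel sum (q9), the sum of squared pixels (q10) and the sum of squared differences (q8), matching the psum16x16/psqsum16x16/psqdiff16x16 outputs named in the stack comments. A scalar sketch of the three per-macroblock accumulators; the prototype is illustrative:

#include <cstdint>

// Per-16x16 accumulators of the SadSsdBgd pass, computed the scalar way.
static void MbStats16x16_ref (const uint8_t* pCur, const uint8_t* pRef,
                              int32_t iStride,
                              uint32_t* pSum, uint32_t* pSqSum, uint32_t* pSqDiff) {
  uint32_t uiSum = 0, uiSqSum = 0, uiSqDiff = 0;
  for (int32_t y = 0; y < 16; ++y) {
    for (int32_t x = 0; x < 16; ++x) {
      int32_t c = pCur[x];
      int32_t d = c - pRef[x];
      uiSum    += c;          // l_sum    (q9)  -> psum16x16
      uiSqSum  += c * c;      // l_sqsum  (q10) -> psqsum16x16
      uiSqDiff += d * d;      // l_sqdiff (q8)  -> psqdiff16x16
    }
    pCur += iStride;
    pRef += iStride;
  }
  *pSum = uiSum;
  *pSqSum = uiSqSum;
  *pSqDiff = uiSqDiff;
}
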
@@ -714,223 +714,223 @@
#ifdef __APPLE__
.macro SAD_VAR_16
- vld1.8 {q0}, [$0], $2 //load cur_row
+ vld1.8 {q0}, [$0], $2 //load cur_row
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
+ vpadal.u8 q3, q0 //add cur_row together
+ vpadal.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
- vld1.8 {q1}, [$1], $2
+ vld1.8 {q1}, [$1], $2
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
.macro SAD_VAR_16_END
- vld1.8 {q0}, [$0], $1 //load cur_row
+ vld1.8 {q0}, [$0], $1 //load cur_row
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
+ vpadal.u8 q3, q0 //add cur_row together
+ vpadal.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16
+ vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
.macro SAD_VAR_16_RESET_16x16
- vld1.8 {q0}, [$0], $2 //load cur_row
- vld1.8 {q1}, [$1], $2
+ vld1.8 {q0}, [$0], $2 //load cur_row
+ vld1.8 {q1}, [$1], $2
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
+ vpaddl.u8 q3, q0 //add cur_row together
+ vpaddl.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
- vld1.8 {q1}, [$1], $2
+ vld1.8 {q1}, [$1], $2
- vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
+ SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
.endm
.macro SAD_VAR_16_RESET_8x8
- vld1.8 {q0}, [$0], $2 //load cur_row
+ vld1.8 {q0}, [$0], $2 //load cur_row
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
+ vpaddl.u8 q3, q0 //add cur_row together
+ vpaddl.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
- vld1.8 {q1}, [$1], $2
+ vld1.8 {q1}, [$1], $2
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
.macro SAD_VAR_16x16
- //for one 8x16
- SAD_VAR_16_RESET_16x16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
+ //for one 8x16
+ SAD_VAR_16_RESET_16x16 $0, $1, $2, q6
+ SAD_VAR_16 $0, $1, $2, q6
+ SAD_VAR_16 $0, $1, $2, q6
+ SAD_VAR_16 $0, $1, $2, q6
+ SAD_VAR_16 $0, $1, $2, q6
+ SAD_VAR_16 $0, $1, $2, q6
+ SAD_VAR_16 $0, $1, $2, q6
+ SAD_VAR_16 $0, $1, $2, q6
- vpaddl.u16 q6, q6
- vpaddl.u32 q6, q6
- vadd.i32 q12, q6
+ vpaddl.u16 q6, q6
+ vpaddl.u32 q6, q6
+ vadd.i32 q12, q6
- //for another 8x16
- SAD_VAR_16_RESET_8x8 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16_END $0, $2, q7
+ //for another 8x16
+ SAD_VAR_16_RESET_8x8 $0, $1, $2, q7
+ SAD_VAR_16 $0, $1, $2, q7
+ SAD_VAR_16 $0, $1, $2, q7
+ SAD_VAR_16 $0, $1, $2, q7
+ SAD_VAR_16 $0, $1, $2, q7
+ SAD_VAR_16 $0, $1, $2, q7
+ SAD_VAR_16 $0, $1, $2, q7
+ SAD_VAR_16_END $0, $2, q7
- vpaddl.u16 q7, q7
- vpaddl.u32 q7, q7
+ vpaddl.u16 q7, q7
+ vpaddl.u32 q7, q7
- vadd.i32 q12, q7
+ vadd.i32 q12, q7
.endm
#else
.macro SAD_VAR_16 arg0, arg1, arg2, arg3
- vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+ vld1.8 {q0}, [\arg0], \arg2 //load cur_row
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
+ vpadal.u8 q3, q0 //add cur_row together
+ vpadal.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vpadal.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpadal.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
- vld1.8 {q1}, [\arg1], \arg2
+ vld1.8 {q1}, [\arg1], \arg2
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
.macro SAD_VAR_16_END arg0, arg1, arg2
- vld1.8 {q0}, [\arg0], \arg1 //load cur_row
+ vld1.8 {q0}, [\arg0], \arg1 //load cur_row
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
+ vpadal.u8 q3, q0 //add cur_row together
+ vpadal.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vpadal.u8 \arg2, q2 //l_sad for 16 bytes reset for every 8x16
+ vpadal.u8 \arg2, q2 //l_sad for 16 bytes reset for every 8x16
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
.macro SAD_VAR_16_RESET_16x16 arg0, arg1, arg2, arg3
- vld1.8 {q0}, [\arg0], \arg2 //load cur_row
- vld1.8 {q1}, [\arg1], \arg2
+ vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+ vld1.8 {q1}, [\arg1], \arg2
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
+ vpaddl.u8 q3, q0 //add cur_row together
+ vpaddl.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
- vld1.8 {q1}, [\arg1], \arg2
+ vld1.8 {q1}, [\arg1], \arg2
- vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
+ SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
.endm
.macro SAD_VAR_16_RESET_8x8 arg0, arg1, arg2, arg3
- vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+ vld1.8 {q0}, [\arg0], \arg2 //load cur_row
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
+ vpaddl.u8 q3, q0 //add cur_row together
+ vpaddl.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
- vld1.8 {q1}, [\arg1], \arg2
+ vld1.8 {q1}, [\arg1], \arg2
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
.macro SAD_VAR_16x16 arg0, arg1, arg2
- //for one 8x16
- SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, q6
- SAD_VAR_16 \arg0, \arg1, \arg2, q6
- SAD_VAR_16 \arg0, \arg1, \arg2, q6
- SAD_VAR_16 \arg0, \arg1, \arg2, q6
- SAD_VAR_16 \arg0, \arg1, \arg2, q6
- SAD_VAR_16 \arg0, \arg1, \arg2, q6
- SAD_VAR_16 \arg0, \arg1, \arg2, q6
- SAD_VAR_16 \arg0, \arg1, \arg2, q6
+ //for one 8x16
+ SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, q6
+ SAD_VAR_16 \arg0, \arg1, \arg2, q6
+ SAD_VAR_16 \arg0, \arg1, \arg2, q6
+ SAD_VAR_16 \arg0, \arg1, \arg2, q6
+ SAD_VAR_16 \arg0, \arg1, \arg2, q6
+ SAD_VAR_16 \arg0, \arg1, \arg2, q6
+ SAD_VAR_16 \arg0, \arg1, \arg2, q6
+ SAD_VAR_16 \arg0, \arg1, \arg2, q6
- vpaddl.u16 q6, q6
- vpaddl.u32 q6, q6
- vadd.i32 q12, q6
+ vpaddl.u16 q6, q6
+ vpaddl.u32 q6, q6
+ vadd.i32 q12, q6
- //for another 8x16
- SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, q7
- SAD_VAR_16 \arg0, \arg1, \arg2, q7
- SAD_VAR_16 \arg0, \arg1, \arg2, q7
- SAD_VAR_16 \arg0, \arg1, \arg2, q7
- SAD_VAR_16 \arg0, \arg1, \arg2, q7
- SAD_VAR_16 \arg0, \arg1, \arg2, q7
- SAD_VAR_16 \arg0, \arg1, \arg2, q7
- SAD_VAR_16_END \arg0, \arg2, q7
+ //for another 8x16
+ SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, q7
+ SAD_VAR_16 \arg0, \arg1, \arg2, q7
+ SAD_VAR_16 \arg0, \arg1, \arg2, q7
+ SAD_VAR_16 \arg0, \arg1, \arg2, q7
+ SAD_VAR_16 \arg0, \arg1, \arg2, q7
+ SAD_VAR_16 \arg0, \arg1, \arg2, q7
+ SAD_VAR_16 \arg0, \arg1, \arg2, q7
+ SAD_VAR_16_END \arg0, \arg2, q7
- vpaddl.u16 q7, q7
- vpaddl.u32 q7, q7
+ vpaddl.u16 q7, q7
+ vpaddl.u32 q7, q7
- vadd.i32 q12, q7
+ vadd.i32 q12, q7
.endm
#endif
WELS_ASM_FUNC_BEGIN VAACalcSadVar_neon
- stmdb sp!, {r4-r11}
- vpush {q4}
- vpush {q6-q7}
+ stmdb sp!, {r4-r11}
+ vpush {q4}
+ vpush {q6-q7}
- ldr r4, [sp, #80] //r4 keeps the pic_stride
+ ldr r4, [sp, #80] //r4 keeps the pic_stride
- sub r5, r4, #1
- lsl r5, r5, #4 //r5 keeps the little step
+ sub r5, r4, #1
+ lsl r5, r5, #4 //r5 keeps the little step
- lsl r6, r4, #4
- sub r6, r2, r6 //r6 keeps the big step
+ lsl r6, r4, #4
+ sub r6, r2, r6 //r6 keeps the big step
- ldr r7, [sp, #84] //psadframe
- ldr r8, [sp, #88] //psad8x8
- ldr r9, [sp, #92] //psum16x16
- ldr r10, [sp, #96] //psqsum16x16
+ ldr r7, [sp, #84] //psadframe
+ ldr r8, [sp, #88] //psad8x8
+ ldr r9, [sp, #92] //psum16x16
+ ldr r10, [sp, #96] //psqsum16x16
- vmov.i8 q12, #0
+ vmov.i8 q12, #0
vaa_calc_sad_var_height_loop:
mov r11, r2
@@ -956,154 +956,154 @@
bne vaa_calc_sad_var_width_loop
- sub r0, r0, r6 //jump to next 16 x width
- sub r1, r1, r6 //jump to next 16 x width
+ sub r0, r0, r6 //jump to next 16 x width
+ sub r1, r1, r6 //jump to next 16 x width
subs r3, #16
bne vaa_calc_sad_var_height_loop
- vadd.i32 d24, d24, d25
- vst1.32 {d24[0]}, [r7]
+ vadd.i32 d24, d24, d25
+ vst1.32 {d24[0]}, [r7]
- vpop {q6-q7}
- vpop {q4}
- ldmia sp!, {r4-r11}
+ vpop {q6-q7}
+ vpop {q4}
+ ldmia sp!, {r4-r11}
WELS_ASM_FUNC_END
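
VAACalcSadVar_neon keeps the per-8x8 SADs and, per 16x16 block, the pixel sum and squared sum, from which the caller can derive the block variance. In scalar terms it boils down to the sketch below; the division by 256 and the exact fixed-point handling on the caller side are assumptions, not taken from this file:

#include <cstdint>

// Variance of a 16x16 block from the sum / squared-sum pair that the
// assembly stores into psum16x16 / psqsum16x16 (illustrative arithmetic).
static uint32_t MbVariance_ref (uint32_t uiSum, uint32_t uiSqSum) {
  uint32_t uiMean = uiSum / 256;                 // 256 pixels per macroblock
  return uiSqSum / 256 - uiMean * uiMean;
}
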
#ifdef __APPLE__
.macro SAD_SSD_16
- SAD_VAR_16 $0, $1, $2, $3
+ SAD_VAR_16 $0, $1, $2, $3
- SSD_MUL_SUM_16BYTES d4,d5,q8, q11
+ SSD_MUL_SUM_16BYTES d4,d5,q8, q11
.endm
.macro SAD_SSD_16_END
- SAD_VAR_16_END $0, $1, $2
+ SAD_VAR_16_END $0, $1, $2
- SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
.endm
.macro SAD_SSD_16_RESET_16x16
- SAD_VAR_16_RESET_16x16 $0, $1, $2, $3
+ SAD_VAR_16_RESET_16x16 $0, $1, $2, $3
- SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
.endm
.macro SAD_SSD_16_RESET_8x8
- SAD_VAR_16_RESET_8x8 $0, $1, $2, $3
+ SAD_VAR_16_RESET_8x8 $0, $1, $2, $3
- SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
.endm
.macro SAD_SSD_16x16
- //for one 8x16
- SAD_SSD_16_RESET_16x16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
+ //for one 8x16
+ SAD_SSD_16_RESET_16x16 $0, $1, $2, q6
+ SAD_SSD_16 $0, $1, $2, q6
+ SAD_SSD_16 $0, $1, $2, q6
+ SAD_SSD_16 $0, $1, $2, q6
+ SAD_SSD_16 $0, $1, $2, q6
+ SAD_SSD_16 $0, $1, $2, q6
+ SAD_SSD_16 $0, $1, $2, q6
+ SAD_SSD_16 $0, $1, $2, q6
- vpaddl.u16 q6, q6
- vpaddl.u32 q6, q6
- vadd.i32 q12, q6
+ vpaddl.u16 q6, q6
+ vpaddl.u32 q6, q6
+ vadd.i32 q12, q6
- //for another 8x16
- SAD_SSD_16_RESET_8x8 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16_END $0, $2, q7
+ //for another 8x16
+ SAD_SSD_16_RESET_8x8 $0, $1, $2, q7
+ SAD_SSD_16 $0, $1, $2, q7
+ SAD_SSD_16 $0, $1, $2, q7
+ SAD_SSD_16 $0, $1, $2, q7
+ SAD_SSD_16 $0, $1, $2, q7
+ SAD_SSD_16 $0, $1, $2, q7
+ SAD_SSD_16 $0, $1, $2, q7
+ SAD_SSD_16_END $0, $2, q7
- vpaddl.u16 q7, q7
- vpaddl.u32 q7, q7
+ vpaddl.u16 q7, q7
+ vpaddl.u32 q7, q7
- vadd.i32 q12, q7
+ vadd.i32 q12, q7
.endm
#else
.macro SAD_SSD_16 arg0, arg1, arg2, arg3
- SAD_VAR_16 \arg0, \arg1, \arg2, \arg3
+ SAD_VAR_16 \arg0, \arg1, \arg2, \arg3
- SSD_MUL_SUM_16BYTES d4,d5,q8, q11
+ SSD_MUL_SUM_16BYTES d4,d5,q8, q11
.endm
.macro SAD_SSD_16_END arg0, arg1, arg2
- SAD_VAR_16_END \arg0, \arg1, \arg2
+ SAD_VAR_16_END \arg0, \arg1, \arg2
- SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
.endm
.macro SAD_SSD_16_RESET_16x16 arg0, arg1, arg2, arg3
- SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, \arg3
+ SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, \arg3
- SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
.endm
.macro SAD_SSD_16_RESET_8x8 arg0, arg1, arg2, arg3
- SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, \arg3
+ SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, \arg3
- SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
.endm
.macro SAD_SSD_16x16 arg0, arg1, arg2
- //for one 8x16
- SAD_SSD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
- SAD_SSD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_16 \arg0, \arg1, \arg2, q6
+ //for one 8x16
+ SAD_SSD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_16 \arg0, \arg1, \arg2, q6
- vpaddl.u16 q6, q6
- vpaddl.u32 q6, q6
- vadd.i32 q12, q6
+ vpaddl.u16 q6, q6
+ vpaddl.u32 q6, q6
+ vadd.i32 q12, q6
- //for another 8x16
- SAD_SSD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
- SAD_SSD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_16_END \arg0, \arg2, q7
+ //for another 8x16
+ SAD_SSD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
+ SAD_SSD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_16_END \arg0, \arg2, q7
- vpaddl.u16 q7, q7
- vpaddl.u32 q7, q7
+ vpaddl.u16 q7, q7
+ vpaddl.u32 q7, q7
- vadd.i32 q12, q7
+ vadd.i32 q12, q7
.endm
#endif
WELS_ASM_FUNC_BEGIN VAACalcSadSsd_neon
- stmdb sp!, {r4-r12}
- vpush {q4}
- vpush {q6-q7}
+ stmdb sp!, {r4-r12}
+ vpush {q4}
+ vpush {q6-q7}
- ldr r4, [sp, #84] //r4 keeps the pic_stride
+ ldr r4, [sp, #84] //r4 keeps the pic_stride
- sub r5, r4, #1
- lsl r5, r5, #4 //r5 keeps the little step
+ sub r5, r4, #1
+ lsl r5, r5, #4 //r5 keeps the little step
- lsl r6, r4, #4
- sub r6, r2, r6 //r6 keeps the big step
+ lsl r6, r4, #4
+ sub r6, r2, r6 //r6 keeps the big step
- ldr r7, [sp, #88] //psadframe
- ldr r8, [sp, #92] //psad8x8
- ldr r9, [sp, #96] //psum16x16
- ldr r10, [sp, #100] //psqsum16x16
- ldr r11, [sp, #104] //psqdiff16x16
+ ldr r7, [sp, #88] //psadframe
+ ldr r8, [sp, #92] //psad8x8
+ ldr r9, [sp, #96] //psum16x16
+ ldr r10, [sp, #100] //psqsum16x16
+ ldr r11, [sp, #104] //psqdiff16x16
- vmov.i8 q12, #0
+ vmov.i8 q12, #0
vaa_calc_sad_ssd_height_loop:
mov r12, r2
@@ -1136,18 +1136,18 @@
bne vaa_calc_sad_ssd_width_loop
- sub r0, r0, r6 //jump to next 16 x width
- sub r1, r1, r6 //jump to next 16 x width
+ sub r0, r0, r6 //jump to next 16 x width
+ sub r1, r1, r6 //jump to next 16 x width
subs r3, #16
- bne vaa_calc_sad_ssd_height_loop
+ bne vaa_calc_sad_ssd_height_loop
- vadd.i32 d24, d24, d25
- vst1.32 {d24[0]}, [r7]
+ vadd.i32 d24, d24, d25
+ vst1.32 {d24[0]}, [r7]
- vpop {q6-q7}
- vpop {q4}
- ldmia sp!, {r4-r12}
+ vpop {q6-q7}
+ vpop {q4}
+ ldmia sp!, {r4-r12}
WELS_ASM_FUNC_END
#endif
--- a/codec/processing/src/x86/denoisefilter.asm
+++ b/codec/processing/src/x86/denoisefilter.asm
@@ -56,217 +56,217 @@
;***********************************************************************
SECTION .text
-%macro WEIGHT_LINE 9
- movq %2, %9
- punpcklbw %2, %7
- movdqa %8, %2
+%macro WEIGHT_LINE 9
+ movq %2, %9
+ punpcklbw %2, %7
+ movdqa %8, %2
- movdqa %1, %6
- psubusb %1, %8
- psubusb %8, %6
- por %8, %1 ; ABS(curPixel - centerPixel);
+ movdqa %1, %6
+ psubusb %1, %8
+ psubusb %8, %6
+ por %8, %1 ; ABS(curPixel - centerPixel);
- movdqa %1, %3
- psubusb %1, %8
+ movdqa %1, %3
+ psubusb %1, %8
- pmullw %1, %1
- psrlw %1, 5
- pmullw %2, %1
- paddusw %4, %1
- paddusw %5, %2
+ pmullw %1, %1
+ psrlw %1, 5
+ pmullw %2, %1
+ paddusw %4, %1
+ paddusw %5, %2
%endmacro
-%macro WEIGHT_LINE1_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- paddw %3, %2
+%macro WEIGHT_LINE1_UV 4
+ movdqa %2, %1
+ punpcklbw %2, %4
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 1
+ punpcklbw %2, %4
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 2
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 3
+ punpcklbw %2, %4
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklbw %2, %4
+ paddw %3, %2
%endmacro
-%macro WEIGHT_LINE2_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- paddw %3, %2
+%macro WEIGHT_LINE2_UV 4
+ movdqa %2, %1
+ punpcklbw %2, %4
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 1
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 2
+ punpcklbw %2, %4
+ psllw %2, 2
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 3
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklbw %2, %4
+ paddw %3, %2
%endmacro
-%macro WEIGHT_LINE3_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
+%macro WEIGHT_LINE3_UV 4
+ movdqa %2, %1
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 1
+ punpcklbw %2, %4
+ psllw %2, 2
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- pmullw %2, [sse2_20]
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 2
+ punpcklbw %2, %4
+ pmullw %2, [sse2_20]
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 3
+ punpcklbw %2, %4
+ psllw %2, 2
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
%endmacro
;***********************************************************************
; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
-; 1 2 3
-; 4 0 5
-; 6 7 8
-; 0: the center point
+; 1 2 3
+; 4 0 5
+; 6 7 8
+; 0: the center point
WELS_EXTERN BilateralLumaFilter8_sse2
- push r3
- %assign push_num 1
- LOAD_2_PARA
- PUSH_XMM 8
+ push r3
+ %assign push_num 1
+ LOAD_2_PARA
+ PUSH_XMM 8
- pxor xmm7, xmm7
+ pxor xmm7, xmm7
- mov r3, r0
+ mov r3, r0
- movq xmm6, [r0]
- punpcklbw xmm6, xmm7
- movdqa xmm3, [sse2_32]
- pxor xmm4, xmm4 ; nTotWeight
- pxor xmm5, xmm5 ; nSum
+ movq xmm6, [r0]
+ punpcklbw xmm6, xmm7
+ movdqa xmm3, [sse2_32]
+ pxor xmm4, xmm4 ; nTotWeight
+ pxor xmm5, xmm5 ; nSum
- dec r0
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 4
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 5
+ dec r0
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 4
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 5
- sub r0, r1
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 1
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 2
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 3
+ sub r0, r1
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 1
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 2
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 3
- lea r0, [r0 + r1 * 2]
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 6
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 7
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 8
+ lea r0, [r0 + r1 * 2]
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 6
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 7
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 8
- pcmpeqw xmm0, xmm0
- psrlw xmm0, 15
- psllw xmm0, 8
- psubusw xmm0, xmm4
- pmullw xmm0, xmm6
- paddusw xmm5, xmm0
- psrlw xmm5, 8
- packuswb xmm5, xmm5
- movq [r3], xmm5
+ pcmpeqw xmm0, xmm0
+ psrlw xmm0, 15
+ psllw xmm0, 8
+ psubusw xmm0, xmm4
+ pmullw xmm0, xmm6
+ paddusw xmm5, xmm0
+ psrlw xmm5, 8
+ packuswb xmm5, xmm5
+ movq [r3], xmm5
- POP_XMM
- pop r3
- %assign push_num 0
+ POP_XMM
+ pop r3
+ %assign push_num 0
- ret
+ ret
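
The bilateral luma filter above weights each of the eight neighbours by how close it is to the centre pixel: WEIGHT_LINE computes (32 - |p - c|)^2 >> 5 per neighbour, the centre receives whatever is left of a total weight budget of 256, and the accumulated sum is scaled back with >> 8. A scalar sketch of one output pixel (the saturating-add details of the SSE2 version are glossed over):

#include <cstdint>
#include <cstdlib>

// One output sample of the 3x3 bilateral luma filter, scalar form.
// pPix points at the centre pixel; iStride is the picture stride.
static uint8_t BilateralLuma_ref (const uint8_t* pPix, int32_t iStride) {
  const int32_t kOffsets[8] = {
    -iStride - 1, -iStride, -iStride + 1,        // pixels 1 2 3
    -1,                      1,                  // pixels 4   5
     iStride - 1,  iStride,  iStride + 1         // pixels 6 7 8
  };
  int32_t iCenter = pPix[0];
  int32_t iTotalWeight = 0;
  int32_t iSum = 0;
  for (int32_t i = 0; i < 8; ++i) {
    int32_t iPixel = pPix[kOffsets[i]];
    int32_t iWeight = 32 - std::abs (iPixel - iCenter);
    if (iWeight < 0) iWeight = 0;                // psubusb saturates at 0
    iWeight = (iWeight * iWeight) >> 5;
    iTotalWeight += iWeight;
    iSum += iPixel * iWeight;
  }
  iSum += iCenter * (256 - iTotalWeight);        // centre takes the remainder
  return (uint8_t) (iSum >> 8);
}
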
;***********************************************************************
-; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
+; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
;5x5 filter:
-;1 1 2 1 1
-;1 2 4 2 1
-;2 4 20 4 2
-;1 2 4 2 1
-;1 1 2 1 1
+;1 1 2 1 1
+;1 2 4 2 1
+;2 4 20 4 2
+;1 2 4 2 1
+;1 1 2 1 1
WELS_EXTERN WaverageChromaFilter8_sse2
- push r3
+ push r3
- %assign push_num 1
+ %assign push_num 1
- LOAD_2_PARA
+ LOAD_2_PARA
- mov r3, r1
- add r3, r3
- sub r0, r3 ; pixels - 2 * stride
- sub r0, 2
+ mov r3, r1
+ add r3, r3
+ sub r0, r3 ; pixels - 2 * stride
+ sub r0, 2
- pxor xmm0, xmm0
- pxor xmm3, xmm3
+ pxor xmm0, xmm0
+ pxor xmm3, xmm3
- movdqu xmm1, [r0]
- WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
+ movdqu xmm1, [r0]
+ WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
- movdqu xmm1, [r0 + r1]
- WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
+ movdqu xmm1, [r0 + r1]
+ WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
- add r0, r3
- movdqu xmm1, [r0]
- WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0
+ add r0, r3
+ movdqu xmm1, [r0]
+ WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0
- movdqu xmm1, [r0 + r1]
- WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
+ movdqu xmm1, [r0 + r1]
+ WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
- movdqu xmm1, [r0 + r1 * 2]
- WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
+ movdqu xmm1, [r0 + r1 * 2]
+ WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
- psrlw xmm3, 6
- packuswb xmm3, xmm3
- movq [r0 + 2], xmm3
+ psrlw xmm3, 6
+ packuswb xmm3, xmm3
+ movq [r0 + 2], xmm3
- pop r3
+ pop r3
- %assign push_num 0
- ret
+ %assign push_num 0
+ ret
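
The chroma routine is a plain weighted average over the 5x5 neighbourhood using the kernel listed in the comment above; the weights sum to 64, which is why the result is scaled with >> 6. Scalar sketch for one pixel:

#include <cstdint>

// 5x5 weighted average used by the chroma denoise filter; weights sum to 64.
static uint8_t WaverageChroma_ref (const uint8_t* pPix, int32_t iStride) {
  static const int32_t kKernel[5][5] = {
    {1, 1,  2, 1, 1},
    {1, 2,  4, 2, 1},
    {2, 4, 20, 4, 2},
    {1, 2,  4, 2, 1},
    {1, 1,  2, 1, 1}
  };
  int32_t iSum = 0;
  for (int32_t y = -2; y <= 2; ++y)
    for (int32_t x = -2; x <= 2; ++x)
      iSum += kKernel[y + 2][x + 2] * pPix[y * iStride + x];
  return (uint8_t) (iSum >> 6);                  // 64 = sum of kernel weights
}
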
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -29,13 +29,13 @@
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
-;* upsampling.asm
+;* upsampling.asm
;*
;* Abstract
-;* SIMD for pixel domain down sampling
+;* SIMD for pixel domain down sampling
;*
;* History
-;* 10/22/2009 Created
+;* 10/22/2009 Created
;*
;*************************************************************************/
%include "asm_inc.asm"
@@ -61,9 +61,9 @@
ALIGN 16
shufb_mask_low:
- db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
+ db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
shufb_mask_high:
- db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
+ db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
;***********************************************************************
@@ -73,737 +73,737 @@
SECTION .text
;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
+; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
- push ebx
- push edx
- push esi
- push edi
- push ebp
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
- sar ebp, $01 ; iSrcHeight >> 1
+ sar ebp, $01 ; iSrcHeight >> 1
.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 32 bytes
.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+8] ; 1st pSrc line + 8
- movq mm2, [esi+ecx] ; 2nd pSrc line
- movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
+ ; 1st part horizonal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+8] ; 1st pSrc line + 8
+ movq mm2, [esi+ecx] ; 2nd pSrc line
+ movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm4, mm5 ; d c D C b a B A
- pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
+ ; to handle mm0, mm1, mm2, mm3
+ pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm4, mm5 ; d c D C b a B A
+ pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
- pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm5, mm6 ; h g H G f e F E
- pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
+ pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm5, mm6 ; h g H G f e F E
+ pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
- pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm6, mm7 ; l k L K j i J I
- pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
+ pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm6, mm7 ; l k L K j i J I
+ pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
- pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm7, mm0 ; p o P O n m N M
- pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
+ pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm7, mm0 ; p o P O n m N M
+ pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
- ; to handle mm4, mm5, mm6, mm7
- movq mm0, mm4 ;
- punpckldq mm0, mm5 ; H G F E D C B A
- punpckhdq mm4, mm5 ; h g f e d c b a
+ ; to handle mm4, mm5, mm6, mm7
+ movq mm0, mm4 ;
+ punpckldq mm0, mm5 ; H G F E D C B A
+ punpckhdq mm4, mm5 ; h g f e d c b a
- movq mm1, mm6
- punpckldq mm1, mm7 ; P O N M L K J I
- punpckhdq mm6, mm7 ; p o n m l k j i
+ movq mm1, mm6
+ punpckldq mm1, mm7 ; P O N M L K J I
+ punpckhdq mm6, mm7 ; p o n m l k j i
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
- pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
+ ; avg within MB horizon width (16 x 2 lines)
+ pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+ pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
- ; 2nd part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm1, [esi+16] ; 1st pSrc line + 16
- movq mm2, [esi+24] ; 1st pSrc line + 24
- movq mm3, [esi+ecx+16] ; 2nd pSrc line + 16
- movq mm4, [esi+ecx+24] ; 2nd pSrc line + 24
+ ; 2nd part horizonal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm1, [esi+16] ; 1st pSrc line + 16
+ movq mm2, [esi+24] ; 1st pSrc line + 24
+ movq mm3, [esi+ecx+16] ; 2nd pSrc line + 16
+ movq mm4, [esi+ecx+24] ; 2nd pSrc line + 24
- ; to handle mm1, mm2, mm3, mm4
- pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm5, mm6 ; d c D C b a B A
- pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5
+ ; to handle mm1, mm2, mm3, mm4
+ pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm5, mm6 ; d c D C b a B A
+ pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5
- pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm6, mm7 ; h g H G f e F E
- pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6
+ pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm6, mm7 ; h g H G f e F E
+ pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6
- pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm7, mm1 ; l k L K j i J I
- pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7
+ pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm7, mm1 ; l k L K j i J I
+ pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7
- pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm1, mm2 ; p o P O n m N M
- pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1
+ pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm1, mm2 ; p o P O n m N M
+ pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1
- ; to handle mm5, mm6, mm7, mm1
- movq mm2, mm5
- punpckldq mm2, mm6 ; H G F E D C B A
- punpckhdq mm5, mm6 ; h g f e d c b a
+ ; to handle mm5, mm6, mm7, mm1
+ movq mm2, mm5
+ punpckldq mm2, mm6 ; H G F E D C B A
+ punpckhdq mm5, mm6 ; h g f e d c b a
- movq mm3, mm7
- punpckldq mm3, mm1 ; P O N M L K J I
- punpckhdq mm7, mm1 ; p o n m l k j i
+ movq mm3, mm7
+ punpckldq mm3, mm1 ; P O N M L K J I
+ punpckhdq mm7, mm1 ; p o n m l k j i
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
- pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part
+ ; avg within MB horizon width (16 x 2 lines)
+ pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+ pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part
- movq [edi ], mm0
- movq [edi+8], mm2
+ movq [edi ], mm0
+ movq [edi+8], mm2
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
- dec eax
- jg near .xloops
+ dec eax
+ jg near .xloops
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
- dec ebp
- jg near .yloops
+ dec ebp
+ jg near .yloops
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
+ WELSEMMS
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
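For reference, the DyadicBilinearDownsampler* routines in this file all compute the same thing: each output pixel is the rounded average of a 2x2 source block, averaged horizontally within each row and then vertically, which is what the chained pavgb instructions implement. A minimal scalar C sketch of that computation follows; the function name and signature are illustrative, not taken from the source:

    #include <stdint.h>

    /* Halve width and height by averaging each 2x2 source block with rounding,
     * horizontally first and then vertically, as the chained pavgb does. */
    static void DyadicBilinearDownsampleRef(uint8_t* pDst, int iDstStride,
                                            const uint8_t* pSrc, int iSrcStride,
                                            int iSrcWidth, int iSrcHeight) {
        for (int y = 0; y < iSrcHeight / 2; ++y) {
            const uint8_t* pRow0 = pSrc + (2 * y) * iSrcStride;
            const uint8_t* pRow1 = pRow0 + iSrcStride;
            uint8_t* pOut = pDst + y * iDstStride;
            for (int x = 0; x < iSrcWidth / 2; ++x) {
                int iTop = (pRow0[2 * x] + pRow0[2 * x + 1] + 1) >> 1; /* (A+a+1)>>1, row 0 */
                int iBot = (pRow1[2 * x] + pRow1[2 * x + 1] + 1) >> 1; /* (I+i+1)>>1, row 1 */
                pOut[x] = (uint8_t)((iTop + iBot + 1) >> 1);           /* blend the two rows */
            }
        }
    }

The Widthx16 and Widthx8 variants that follow perform the same computation; they differ only in how many source bytes are consumed per loop iteration and in how the even and odd pixels are deinterleaved.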
;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
+; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
- push ebx
- push edx
- push esi
- push edi
- push ebp
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
- sar ebp, $01 ; iSrcHeight >> 1
+ sar ebp, $01 ; iSrcHeight >> 1
.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 16 bytes
.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+8] ; 1st pSrc line + 8
- movq mm2, [esi+ecx] ; 2nd pSrc line
- movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
+ ; 1st part horizonal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+8] ; 1st pSrc line + 8
+ movq mm2, [esi+ecx] ; 2nd pSrc line
+ movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm4, mm5 ; d c D C b a B A
- pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
+ ; to handle mm0, mm1, mm2, mm3
+ pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm4, mm5 ; d c D C b a B A
+ pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
- pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm5, mm6 ; h g H G f e F E
- pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
+ pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm5, mm6 ; h g H G f e F E
+ pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
- pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm6, mm7 ; l k L K j i J I
- pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
+ pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm6, mm7 ; l k L K j i J I
+ pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
- pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm7, mm0 ; p o P O n m N M
- pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
+ pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm7, mm0 ; p o P O n m N M
+ pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
- ; to handle mm4, mm5, mm6, mm7
- movq mm0, mm4 ;
- punpckldq mm0, mm5 ; H G F E D C B A
- punpckhdq mm4, mm5 ; h g f e d c b a
+ ; to handle mm4, mm5, mm6, mm7
+ movq mm0, mm4 ;
+ punpckldq mm0, mm5 ; H G F E D C B A
+ punpckhdq mm4, mm5 ; h g f e d c b a
- movq mm1, mm6
- punpckldq mm1, mm7 ; P O N M L K J I
- punpckhdq mm6, mm7 ; p o n m l k j i
+ movq mm1, mm6
+ punpckldq mm1, mm7 ; P O N M L K J I
+ punpckhdq mm6, mm7 ; p o n m l k j i
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
- pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
+ ; avg within MB horizon width (16 x 2 lines)
+ pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+ pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
- movq [edi ], mm0
+ movq [edi ], mm0
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
- dec eax
- jg near .xloops
+ dec eax
+ jg near .xloops
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
- dec ebp
- jg near .yloops
+ dec ebp
+ jg near .yloops
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
+ WELSEMMS
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
+; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
- push ebx
- push edx
- push esi
- push edi
- push ebp
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
- sar ebp, $01 ; iSrcHeight >> 1
+ sar ebp, $01 ; iSrcHeight >> 1
.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $02 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 8 bytes
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $02 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 8 bytes
.xloops:
- ; 1st part horizonal loop: x8 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A
- ;2nd Line Src: mm1: h H g G f F e E
- ;=> target:
- ;: H G F E D C B A
- ;: h g f e d c b a
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+ecx] ; 2nd pSrc line
+ ; 1st part horizonal loop: x8 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A
+ ;2nd Line Src: mm1: h H g G f F e E
+ ;=> target:
+ ;: H G F E D C B A
+ ;: h g f e d c b a
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+ecx] ; 2nd pSrc line
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm2, mm3 ; d c D C b a B A
- pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm4
+ ; to handle mm0, mm1, mm2, mm3
+ pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm2, mm3 ; d c D C b a B A
+ pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm4
- pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm4, mm5 ; h g H G f e F E
- pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5
+ pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm4, mm5 ; h g H G f e F E
+ pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5
- ; to handle mm2, mm4
- movq mm0, mm2 ;
- punpckldq mm0, mm4 ; H G F E D C B A
- punpckhdq mm2, mm4 ; h g f e d c b a
+ ; to handle mm2, mm4
+ movq mm0, mm2 ;
+ punpckldq mm0, mm4 ; H G F E D C B A
+ punpckhdq mm2, mm4 ; h g f e d c b a
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
- pshufw mm1, mm0, 04eh ; 01001110 B
- pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
+ ; avg within MB horizon width (16 x 2 lines)
+ pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
+ pshufw mm1, mm0, 04eh ; 01001110 B
+ pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
- movd [edi], mm0
+ movd [edi], mm0
- ; next unit
- lea esi, [esi+8]
- lea edi, [edi+4]
+ ; next unit
+ lea esi, [esi+8]
+ lea edi, [edi+4]
- dec eax
- jg near .xloops
+ dec eax
+ jg near .xloops
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
- dec ebp
- jg near .yloops
+ dec ebp
+ jg near .yloops
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
+ WELSEMMS
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
+; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
- push ebx
- push edx
- push esi
- push edi
- push ebp
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
- sar ebp, $01 ; iSrcHeight >> 1
+ sar ebp, $01 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
+ movdqa xmm7, [shufb_mask_low] ; mask low
+ movdqa xmm6, [shufb_mask_high] ; mask high
.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 32 bytes
.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
- ; xmm1: p P o O n N m M l L k K j J i I
- ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
- ; xmm3: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: P O N M L K J I H G F E D C B A
- ;: p o n m l k j i h g f e d c b a
- ;: P .. A
- ;: p .. a
+ ; 1st part horizonal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
+ ; xmm1: p P o O n N m M l L k K j J i I
+ ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
+ ; xmm3: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: P O N M L K J I H G F E D C B A
+ ;: p o n m l k j i h g f e d c b a
+ ;: P .. A
+ ;: p .. a
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movdqa xmm0, [esi] ; 1st_src_line
- movdqa xmm1, [esi+16] ; 1st_src_line + 16
- movdqa xmm2, [esi+ecx] ; 2nd_src_line
- movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa xmm0, [esi] ; 1st_src_line
+ movdqa xmm1, [esi+16] ; 1st_src_line + 16
+ movdqa xmm2, [esi+ecx] ; 2nd_src_line
+ movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
- ; packing & avg
- movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- ; another implementation for xmm4 high bits
-; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm4
+ ; packing & avg
+ movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ ; another implementation for xmm4 high bits
+; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm4
- movdqa xmm5, xmm1
- pshufb xmm1, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm1
-; psrlw xmm5, 8
- pavgb xmm1, xmm5
+ movdqa xmm5, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm1
+; psrlw xmm5, 8
+ pavgb xmm1, xmm5
- movdqa xmm4, xmm2
- pshufb xmm2, xmm7
- pshufb xmm4, xmm6
-; psubb xmm4, xmm2
-; psrlw xmm4, 8
- pavgb xmm2, xmm4
+ movdqa xmm4, xmm2
+ pshufb xmm2, xmm7
+ pshufb xmm4, xmm6
+; psubb xmm4, xmm2
+; psrlw xmm4, 8
+ pavgb xmm2, xmm4
- movdqa xmm5, xmm3
- pshufb xmm3, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm3
-; psrlw xmm5, 8
- pavgb xmm3, xmm5
+ movdqa xmm5, xmm3
+ pshufb xmm3, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm3
+; psrlw xmm5, 8
+ pavgb xmm3, xmm5
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- pavgb xmm0, xmm2
+ packuswb xmm0, xmm1
+ packuswb xmm2, xmm3
+ pavgb xmm0, xmm2
- ; write pDst
- movdqa [edi], xmm0
+ ; write pDst
+ movdqa [edi], xmm0
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
- dec eax
- jg near .xloops
+ dec eax
+ jg near .xloops
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
- dec ebp
- jg near .yloops
+ dec ebp
+ jg near .yloops
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
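The _ssse3 variants replace the pshufw/punpck transposes of the MMX path with two pshufb lookups that zero-extend the even and odd source bytes into 16-bit lanes, so a single pavgb per row yields the horizontal averages and packuswb repacks them to bytes before the final row blend. A sketch with SSE intrinsics; the mask contents are inferred from the comments on shufb_mask_low and shufb_mask_high and should be treated as an assumption:

    #include <stdint.h>
    #include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8 */

    /* One 16-output-pixel step of the SSSE3 path: split even/odd bytes of two
     * 32-byte source rows, average horizontally, repack, then blend the rows. */
    static inline __m128i Downsample32To16_ssse3(__m128i r0lo, __m128i r0hi,
                                                 __m128i r1lo, __m128i r1hi) {
        const __m128i kEven = _mm_setr_epi8(0, -1, 2, -1, 4, -1, 6, -1,
                                            8, -1, 10, -1, 12, -1, 14, -1);
        const __m128i kOdd  = _mm_setr_epi8(1, -1, 3, -1, 5, -1, 7, -1,
                                            9, -1, 11, -1, 13, -1, 15, -1);
        /* horizontal (A+a+1)>>1 within each 16-byte chunk, results in 16-bit lanes */
        __m128i h0lo = _mm_avg_epu8(_mm_shuffle_epi8(r0lo, kEven), _mm_shuffle_epi8(r0lo, kOdd));
        __m128i h0hi = _mm_avg_epu8(_mm_shuffle_epi8(r0hi, kEven), _mm_shuffle_epi8(r0hi, kOdd));
        __m128i h1lo = _mm_avg_epu8(_mm_shuffle_epi8(r1lo, kEven), _mm_shuffle_epi8(r1lo, kOdd));
        __m128i h1hi = _mm_avg_epu8(_mm_shuffle_epi8(r1hi, kEven), _mm_shuffle_epi8(r1hi, kOdd));
        /* repack the word lanes to bytes, then average the two rows vertically */
        return _mm_avg_epu8(_mm_packus_epi16(h0lo, h0hi), _mm_packus_epi16(h1lo, h1hi));
    }

The _sse4 variants further down use the same inner step; they differ only in fetching the source rows with movntdqa (streaming, non-temporal loads), which requires 16-byte-aligned addresses and chiefly pays off when the source buffer lives in write-combining memory.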
;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
+; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
- push ebx
- push edx
- push esi
- push edi
- push ebp
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
- sar ebp, $01 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
+ sar ebp, $01 ; iSrcHeight >> 1
+ movdqa xmm7, [shufb_mask_low] ; mask low
+ movdqa xmm6, [shufb_mask_high] ; mask high
.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 16 bytes
.xloops:
- ; horizonal loop: x16 bytes by source
- ; mem hi<- ->lo
- ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
- ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
+ ; horizonal loop: x16 bytes by source
+ ; mem hi<- ->lo
+ ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
+ ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movdqa xmm0, [esi] ; 1st_src_line
- movdqa xmm1, [esi+ecx] ; 2nd_src_line
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa xmm0, [esi] ; 1st_src_line
+ movdqa xmm1, [esi+ecx] ; 2nd_src_line
- ; packing & avg
- movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- ; another implementation for xmm2 high bits
-; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm2
+ ; packing & avg
+ movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ ; another implementation for xmm2 high bits
+; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm2
- movdqa xmm3, xmm1
- pshufb xmm1, xmm7
- pshufb xmm3, xmm6
-; psubb xmm3, xmm1
-; psrlw xmm3, 8
- pavgb xmm1, xmm3
+ movdqa xmm3, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm3, xmm6
+; psubb xmm3, xmm1
+; psrlw xmm3, 8
+ pavgb xmm1, xmm3
- pavgb xmm0, xmm1
- packuswb xmm0, xmm1
+ pavgb xmm0, xmm1
+ packuswb xmm0, xmm1
- ; write pDst
- movq [edi], xmm0
+ ; write pDst
+ movq [edi], xmm0
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
- dec eax
- jg near .xloops
+ dec eax
+ jg near .xloops
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
- dec ebp
- jg near .yloops
+ dec ebp
+ jg near .yloops
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
+; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
- push ebx
- push edx
- push esi
- push edi
- push ebp
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
- sar ebp, $01 ; iSrcHeight >> 1
+ sar ebp, $01 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
+ movdqa xmm7, [shufb_mask_low] ; mask low
+ movdqa xmm6, [shufb_mask_high] ; mask high
.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 32 bytes
.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
- ; xmm1: p P o O n N m M l L k K j J i I
- ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
- ; xmm3: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: P O N M L K J I H G F E D C B A
- ;: p o n m l k j i h g f e d c b a
- ;: P .. A
- ;: p .. a
+ ; 1st part horizonal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
+ ; xmm1: p P o O n N m M l L k K j J i I
+ ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
+ ; xmm3: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: P O N M L K J I H G F E D C B A
+ ;: p o n m l k j i h g f e d c b a
+ ;: P .. A
+ ;: p .. a
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movntdqa xmm0, [esi] ; 1st_src_line
- movntdqa xmm1, [esi+16] ; 1st_src_line + 16
- movntdqa xmm2, [esi+ecx] ; 2nd_src_line
- movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movntdqa xmm0, [esi] ; 1st_src_line
+ movntdqa xmm1, [esi+16] ; 1st_src_line + 16
+ movntdqa xmm2, [esi+ecx] ; 2nd_src_line
+ movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
- ; packing & avg
- movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm4
+ ; packing & avg
+ movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm4
- movdqa xmm5, xmm1
- pshufb xmm1, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm1
-; psrlw xmm5, 8
- pavgb xmm1, xmm5
+ movdqa xmm5, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm1
+; psrlw xmm5, 8
+ pavgb xmm1, xmm5
- movdqa xmm4, xmm2
- pshufb xmm2, xmm7
- pshufb xmm4, xmm6
-; psubb xmm4, xmm2
-; psrlw xmm4, 8
- pavgb xmm2, xmm4
+ movdqa xmm4, xmm2
+ pshufb xmm2, xmm7
+ pshufb xmm4, xmm6
+; psubb xmm4, xmm2
+; psrlw xmm4, 8
+ pavgb xmm2, xmm4
- movdqa xmm5, xmm3
- pshufb xmm3, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm3
-; psrlw xmm5, 8
- pavgb xmm3, xmm5
+ movdqa xmm5, xmm3
+ pshufb xmm3, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm3
+; psrlw xmm5, 8
+ pavgb xmm3, xmm5
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- pavgb xmm0, xmm2
+ packuswb xmm0, xmm1
+ packuswb xmm2, xmm3
+ pavgb xmm0, xmm2
- ; write pDst
- movdqa [edi], xmm0
+ ; write pDst
+ movdqa [edi], xmm0
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
- dec eax
- jg near .xloops
+ dec eax
+ jg near .xloops
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
- dec ebp
- jg near .yloops
+ dec ebp
+ jg near .yloops
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
+; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
- push ebx
- push edx
- push esi
- push edi
- push ebp
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
- sar ebp, $01 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
+ sar ebp, $01 ; iSrcHeight >> 1
+ movdqa xmm7, [shufb_mask_low] ; mask low
+ movdqa xmm6, [shufb_mask_high] ; mask high
.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 16 bytes
.xloops:
- ; horizonal loop: x16 bytes by source
- ; mem hi<- ->lo
- ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
- ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
+ ; horizonal loop: x16 bytes by source
+ ; mem hi<- ->lo
+ ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
+ ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movntdqa xmm0, [esi] ; 1st_src_line
- movntdqa xmm1, [esi+ecx] ; 2nd_src_line
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movntdqa xmm0, [esi] ; 1st_src_line
+ movntdqa xmm1, [esi+ecx] ; 2nd_src_line
- ; packing & avg
- movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm2
+ ; packing & avg
+ movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm2
- movdqa xmm3, xmm1
- pshufb xmm1, xmm7
- pshufb xmm3, xmm6
-; psubb xmm3, xmm1
-; psrlw xmm3, 8
- pavgb xmm1, xmm3
+ movdqa xmm3, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm3, xmm6
+; psubb xmm3, xmm1
+; psrlw xmm3, 8
+ pavgb xmm1, xmm3
- pavgb xmm0, xmm1
- packuswb xmm0, xmm1
+ pavgb xmm0, xmm1
+ packuswb xmm0, xmm1
- ; write pDst
- movq [edi], xmm0
+ ; write pDst
+ movq [edi], xmm0
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
- dec eax
- jg near .xloops
+ dec eax
+ jg near .xloops
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
- dec ebp
- jg near .yloops
+ dec ebp
+ jg near .yloops
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
@@ -811,202 +811,202 @@
;**************************************************************************************************************
;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
; unsigned int uiScaleX, unsigned int uiScaleY );
;{
;**************************************************************************************************************
-WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
- push ebp
- push esi
- push edi
- push ebx
-%define pushsize 16
-%define localsize 28
-%define pDstData esp + pushsize + localsize + 4
-%define dwDstStride esp + pushsize + localsize + 8
-%define dwDstWidth esp + pushsize + localsize + 12
-%define dwDstHeight esp + pushsize + localsize + 16
-%define pSrcData esp + pushsize + localsize + 20
-%define dwSrcStride esp + pushsize + localsize + 24
-%define dwSrcWidth esp + pushsize + localsize + 28
-%define dwSrcHeight esp + pushsize + localsize + 32
-%define scale esp + 0
-%define uiScaleX esp + pushsize + localsize + 36
-%define uiScaleY esp + pushsize + localsize + 40
-%define tmpHeight esp + 12
-%define yInverse esp + 16
-%define xInverse esp + 20
-%define dstStep esp + 24
- sub esp, localsize
+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
+ push ebp
+ push esi
+ push edi
+ push ebx
+%define pushsize 16
+%define localsize 28
+%define pDstData esp + pushsize + localsize + 4
+%define dwDstStride esp + pushsize + localsize + 8
+%define dwDstWidth esp + pushsize + localsize + 12
+%define dwDstHeight esp + pushsize + localsize + 16
+%define pSrcData esp + pushsize + localsize + 20
+%define dwSrcStride esp + pushsize + localsize + 24
+%define dwSrcWidth esp + pushsize + localsize + 28
+%define dwSrcHeight esp + pushsize + localsize + 32
+%define scale esp + 0
+%define uiScaleX esp + pushsize + localsize + 36
+%define uiScaleY esp + pushsize + localsize + 40
+%define tmpHeight esp + 12
+%define yInverse esp + 16
+%define xInverse esp + 20
+%define dstStep esp + 24
+ sub esp, localsize
- pxor xmm0, xmm0
- mov edx, 32767
- mov eax, [uiScaleX]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm1, eax ; uinc(uiScaleX mod 32767)
- movd xmm2, ebx ; -uinc
- psllq xmm1, 32
- por xmm1, xmm2 ; 0 0 uinc -uinc (dword)
- pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc
+ pxor xmm0, xmm0
+ mov edx, 32767
+ mov eax, [uiScaleX]
+ and eax, 32767
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767
+ movd xmm1, eax ; uinc(uiScaleX mod 32767)
+ movd xmm2, ebx ; -uinc
+ psllq xmm1, 32
+ por xmm1, xmm2 ; 0 0 uinc -uinc (dword)
+ pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc
- mov eax, [uiScaleY]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm6, eax ; vinc(uiScaleY mod 32767)
- movd xmm2, ebx ; -vinc
- psllq xmm6, 32
- por xmm6, xmm2 ; 0 0 vinc -vinc (dword)
- pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc
+ mov eax, [uiScaleY]
+ and eax, 32767
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767
+ movd xmm6, eax ; vinc(uiScaleY mod 32767)
+ movd xmm2, ebx ; -vinc
+ psllq xmm6, 32
+ por xmm6, xmm2 ; 0 0 vinc -vinc (dword)
+ pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc
- mov edx, 40003fffh
- movd xmm5, edx
- punpcklwd xmm5, xmm0 ; 16384 16383
- pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383
+ mov edx, 40003fffh
+ movd xmm5, edx
+ punpcklwd xmm5, xmm0 ; 16384 16383
+ pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383
DOWNSAMPLE:
- mov eax, [dwDstHeight]
- mov edi, [pDstData]
- mov edx, [dwDstStride]
- mov ecx, [dwDstWidth]
- sub edx, ecx
- mov [dstStep], edx ; stride - width
- dec eax
- mov [tmpHeight], eax
- mov eax, 16384
- mov [yInverse], eax
+ mov eax, [dwDstHeight]
+ mov edi, [pDstData]
+ mov edx, [dwDstStride]
+ mov ecx, [dwDstWidth]
+ sub edx, ecx
+ mov [dstStep], edx ; stride - width
+ dec eax
+ mov [tmpHeight], eax
+ mov eax, 16384
+ mov [yInverse], eax
- pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383
+ pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383
HEIGHT:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
- mov ebp, esi
- add ebp, [dwSrcStride]
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride]
+ add esi, eax ; get current row address
+ mov ebp, esi
+ add ebp, [dwSrcStride]
- mov eax, 16384
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
- dec ecx
+ mov eax, 16384
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth]
+ dec ecx
- movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383
+ movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383
WIDTH:
- mov eax, [xInverse]
- shr eax, 15
+ mov eax, [xInverse]
+ shr eax, 15
- movd xmm1, [esi+eax] ; xxxxxxba
- movd xmm2, [ebp+eax] ; xxxxxxdc
- pxor xmm0, xmm0
- punpcklwd xmm1, xmm2 ; xxxxdcba
- punpcklbw xmm1, xmm0 ; 0d0c0b0a
- punpcklwd xmm1, xmm0 ; 000d000c000b000a
+ movd xmm1, [esi+eax] ; xxxxxxba
+ movd xmm2, [ebp+eax] ; xxxxxxdc
+ pxor xmm0, xmm0
+ punpcklwd xmm1, xmm2 ; xxxxdcba
+ punpcklbw xmm1, xmm0 ; 0d0c0b0a
+ punpcklwd xmm1, xmm0 ; 000d000c000b000a
- movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
- pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
- movdqa xmm0, xmm2
- pmuludq xmm2, xmm1
- psrlq xmm0, 32
- psrlq xmm1, 32
- pmuludq xmm0, xmm1
- paddq xmm2, xmm0
- pshufd xmm1, xmm2, 00001110b
- paddq xmm2, xmm1
- psrlq xmm2, 29
+ movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
+ pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
+ movdqa xmm0, xmm2
+ pmuludq xmm2, xmm1
+ psrlq xmm0, 32
+ psrlq xmm1, 32
+ pmuludq xmm0, xmm1
+ paddq xmm2, xmm0
+ pshufd xmm1, xmm2, 00001110b
+ paddq xmm2, xmm1
+ psrlq xmm2, 29
- movd eax, xmm2
- inc eax
- shr eax, 1
- mov [edi], al
- inc edi
+ movd eax, xmm2
+ inc eax
+ shr eax, 1
+ mov [edi], al
+ inc edi
- mov eax, [uiScaleX]
- add [xInverse], eax
+ mov eax, [uiScaleX]
+ add [xInverse], eax
- paddw xmm3, xmm7 ; inc u
- psllw xmm3, 1
- psrlw xmm3, 1
+ paddw xmm3, xmm7 ; inc u
+ psllw xmm3, 1
+ psrlw xmm3, 1
- loop WIDTH
+ loop WIDTH
WIDTH_END:
- mov eax, [xInverse]
- shr eax, 15
- mov cl, [esi+eax]
- mov [edi], cl
- inc edi
+ mov eax, [xInverse]
+ shr eax, 15
+ mov cl, [esi+eax]
+ mov [edi], cl
+ inc edi
- mov eax, [uiScaleY]
- add [yInverse], eax
- add edi, [dstStep]
+ mov eax, [uiScaleY]
+ add [yInverse], eax
+ add edi, [dstStep]
- paddw xmm4, xmm6 ; inc v
- psllw xmm4, 1
- psrlw xmm4, 1
+ paddw xmm4, xmm6 ; inc v
+ psllw xmm4, 1
+ psrlw xmm4, 1
- dec dword [tmpHeight]
- jg HEIGHT
+ dec dword [tmpHeight]
+ jg HEIGHT
LAST_ROW:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride]
+ add esi, eax ; get current row address
- mov eax, 16384
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
+ mov eax, 16384
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth]
LAST_ROW_WIDTH:
- mov eax, [xInverse]
- shr eax, 15
+ mov eax, [xInverse]
+ shr eax, 15
- mov al, [esi+eax]
- mov [edi], al
- inc edi
+ mov al, [esi+eax]
+ mov [edi], al
+ inc edi
- mov eax, [uiScaleX]
- add [xInverse], eax
+ mov eax, [uiScaleX]
+ add [xInverse], eax
- loop LAST_ROW_WIDTH
+ loop LAST_ROW_WIDTH
LAST_ROW_END:
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef pushsize
-%undef localsize
-%undef pSrcData
-%undef dwSrcWidth
-%undef dwSrcHeight
-%undef dwSrcStride
-%undef pDstData
-%undef dwDstWidth
-%undef dwDstHeight
-%undef dwDstStride
-%undef scale
-%undef uiScaleX
-%undef uiScaleY
-%undef tmpHeight
-%undef yInverse
-%undef xInverse
-%undef dstStep
- ret
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef pushsize
+%undef localsize
+%undef pSrcData
+%undef dwSrcWidth
+%undef dwSrcHeight
+%undef dwSrcStride
+%undef pDstData
+%undef dwDstWidth
+%undef dwDstHeight
+%undef dwDstStride
+%undef scale
+%undef uiScaleX
+%undef uiScaleY
+%undef tmpHeight
+%undef yInverse
+%undef xInverse
+%undef dstStep
+ ret
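GeneralBilinearAccurateDownsampler_sse2 handles arbitrary scaling ratios: xInverse and yInverse step through the source in Q15 increments of uiScaleX and uiScaleY, each destination pixel is a bilinear blend of the four neighbouring source pixels, and the last column and last row fall back to nearest-neighbour picks. A scalar sketch of that flow is below; the SSE2 code carries the u/v fractions in packed 15-bit lanes and rounds via ((acc >> 29) + 1) >> 1, so the plain rounding here is an approximation of the assembly, not a bit-exact model, and the names are illustrative:

    #include <stdint.h>

    /* Scalar outline of the accurate general downsampler with Q15 stepping. */
    static void GeneralBilinearDownsampleRef(uint8_t* pDst, int iDstStride,
                                             int iDstWidth, int iDstHeight,
                                             const uint8_t* pSrc, int iSrcStride,
                                             uint32_t uiScaleX, uint32_t uiScaleY) {
        uint32_t yInverse = 16384;                       /* Q15 source y position */
        for (int j = 0; j < iDstHeight - 1; ++j) {
            const uint8_t* pRow = pSrc + (yInverse >> 15) * iSrcStride;
            uint32_t v = yInverse & 0x7fff;              /* Q15 vertical fraction */
            uint32_t xInverse = 16384;                   /* Q15 source x position */
            for (int i = 0; i < iDstWidth - 1; ++i) {
                uint32_t x = xInverse >> 15;
                uint32_t u = xInverse & 0x7fff;          /* Q15 horizontal fraction */
                uint64_t acc = (uint64_t)(32768 - u) * (32768 - v) * pRow[x]
                             + (uint64_t)u * (32768 - v) * pRow[x + 1]
                             + (uint64_t)(32768 - u) * v * pRow[x + iSrcStride]
                             + (uint64_t)u * v * pRow[x + iSrcStride + 1];
                pDst[j * iDstStride + i] = (uint8_t)((acc + (1u << 29)) >> 30);
                xInverse += uiScaleX;
            }
            /* last column of the row: nearest-neighbour pick from the top row */
            pDst[j * iDstStride + iDstWidth - 1] = pRow[xInverse >> 15];
            yInverse += uiScaleY;
        }
        /* last row: nearest-neighbour picks only */
        const uint8_t* pLast = pSrc + (yInverse >> 15) * iSrcStride;
        uint32_t xInverse = 16384;
        for (int i = 0; i < iDstWidth; ++i) {
            pDst[(iDstHeight - 1) * iDstStride + i] = pLast[xInverse >> 15];
            xInverse += uiScaleX;
        }
    }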
@@ -1013,193 +1013,193 @@
;**************************************************************************************************************
;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
; unsigned int uiScaleX, unsigned int uiScaleY );
;{
;**************************************************************************************************************
-WELS_EXTERN GeneralBilinearFastDownsampler_sse2
- push ebp
- push esi
- push edi
- push ebx
-%define pushsize 16
-%define localsize 28
-%define pDstData esp + pushsize + localsize + 4
-%define dwDstStride esp + pushsize + localsize + 8
-%define dwDstWidth esp + pushsize + localsize + 12
-%define dwDstHeight esp + pushsize + localsize + 16
-%define pSrcData esp + pushsize + localsize + 20
-%define dwSrcStride esp + pushsize + localsize + 24
-%define dwSrcWidth esp + pushsize + localsize + 28
-%define dwSrcHeight esp + pushsize + localsize + 32
-%define scale esp + 0
-%define uiScaleX esp + pushsize + localsize + 36
-%define uiScaleY esp + pushsize + localsize + 40
-%define tmpHeight esp + 12
-%define yInverse esp + 16
-%define xInverse esp + 20
-%define dstStep esp + 24
- sub esp, localsize
+WELS_EXTERN GeneralBilinearFastDownsampler_sse2
+ push ebp
+ push esi
+ push edi
+ push ebx
+%define pushsize 16
+%define localsize 28
+%define pDstData esp + pushsize + localsize + 4
+%define dwDstStride esp + pushsize + localsize + 8
+%define dwDstWidth esp + pushsize + localsize + 12
+%define dwDstHeight esp + pushsize + localsize + 16
+%define pSrcData esp + pushsize + localsize + 20
+%define dwSrcStride esp + pushsize + localsize + 24
+%define dwSrcWidth esp + pushsize + localsize + 28
+%define dwSrcHeight esp + pushsize + localsize + 32
+%define scale esp + 0
+%define uiScaleX esp + pushsize + localsize + 36
+%define uiScaleY esp + pushsize + localsize + 40
+%define tmpHeight esp + 12
+%define yInverse esp + 16
+%define xInverse esp + 20
+%define dstStep esp + 24
+ sub esp, localsize
- pxor xmm0, xmm0
- mov edx, 65535
- mov eax, [uiScaleX]
- and eax, edx
- mov ebx, eax
- neg ebx
- and ebx, 65535
- movd xmm1, eax ; uinc(uiScaleX mod 65536)
- movd xmm2, ebx ; -uinc
- psllq xmm1, 32
- por xmm1, xmm2 ; 0 uinc 0 -uinc
- pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc
+ pxor xmm0, xmm0
+ mov edx, 65535
+ mov eax, [uiScaleX]
+ and eax, edx
+ mov ebx, eax
+ neg ebx
+ and ebx, 65535
+ movd xmm1, eax ; uinc(uiScaleX mod 65536)
+ movd xmm2, ebx ; -uinc
+ psllq xmm1, 32
+ por xmm1, xmm2 ; 0 uinc 0 -uinc
+ pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc
- mov eax, [uiScaleY]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm6, eax ; vinc(uiScaleY mod 32767)
- movd xmm2, ebx ; -vinc
- psllq xmm6, 32
- por xmm6, xmm2 ; 0 vinc 0 -vinc
- pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc
+ mov eax, [uiScaleY]
+ and eax, 32767
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767
+ movd xmm6, eax ; vinc(uiScaleY mod 32767)
+ movd xmm2, ebx ; -vinc
+ psllq xmm6, 32
+ por xmm6, xmm2 ; 0 vinc 0 -vinc
+ pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc
- mov edx, 80007fffh ; 32768 32767
- movd xmm5, edx
- pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767
- mov ebx, 16384
+ mov edx, 80007fffh ; 32768 32767
+ movd xmm5, edx
+ pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767
+ mov ebx, 16384
FAST_DOWNSAMPLE:
- mov eax, [dwDstHeight]
- mov edi, [pDstData]
- mov edx, [dwDstStride]
- mov ecx, [dwDstWidth]
- sub edx, ecx
- mov [dstStep], edx ; stride - width
- dec eax
- mov [tmpHeight], eax
- mov eax, 16384
- mov [yInverse], eax
+ mov eax, [dwDstHeight]
+ mov edi, [pDstData]
+ mov edx, [dwDstStride]
+ mov ecx, [dwDstWidth]
+ sub edx, ecx
+ mov [dstStep], edx ; stride - width
+ dec eax
+ mov [tmpHeight], eax
+ mov eax, 16384
+ mov [yInverse], eax
- pshuflw xmm4, xmm5, 01010000b
- psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383
+ pshuflw xmm4, xmm5, 01010000b
+ psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383
FAST_HEIGHT:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
- mov ebp, esi
- add ebp, [dwSrcStride]
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride]
+ add esi, eax ; get current row address
+ mov ebp, esi
+ add ebp, [dwSrcStride]
- mov eax, 32768
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
- dec ecx
+ mov eax, 32768
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth]
+ dec ecx
- movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767
+ movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767
FAST_WIDTH:
- mov eax, [xInverse]
- shr eax, 16
+ mov eax, [xInverse]
+ shr eax, 16
- movd xmm1, [esi+eax] ; xxxxxxba
- movd xmm2, [ebp+eax] ; xxxxxxdc
- punpcklwd xmm1, xmm2 ; xxxxdcba
- punpcklbw xmm1, xmm0 ; 0d0c0b0a
+ movd xmm1, [esi+eax] ; xxxxxxba
+ movd xmm2, [ebp+eax] ; xxxxxxdc
+ punpcklwd xmm1, xmm2 ; xxxxdcba
+ punpcklbw xmm1, xmm0 ; 0d0c0b0a
- movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
- pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
- pmaddwd xmm2, xmm1
- pshufd xmm1, xmm2, 00000001b
- paddd xmm2, xmm1
- movd xmm1, ebx
- paddd xmm2, xmm1
- psrld xmm2, 15
+ movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
+ pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
+ pmaddwd xmm2, xmm1
+ pshufd xmm1, xmm2, 00000001b
+ paddd xmm2, xmm1
+ movd xmm1, ebx
+ paddd xmm2, xmm1
+ psrld xmm2, 15
- packuswb xmm2, xmm0
- movd eax, xmm2
- mov [edi], al
- inc edi
+ packuswb xmm2, xmm0
+ movd eax, xmm2
+ mov [edi], al
+ inc edi
- mov eax, [uiScaleX]
- add [xInverse], eax
+ mov eax, [uiScaleX]
+ add [xInverse], eax
- paddw xmm3, xmm7 ; inc u
+ paddw xmm3, xmm7 ; inc u
- loop FAST_WIDTH
+ loop FAST_WIDTH
FAST_WIDTH_END:
- mov eax, [xInverse]
- shr eax, 16
- mov cl, [esi+eax]
- mov [edi], cl
- inc edi
+ mov eax, [xInverse]
+ shr eax, 16
+ mov cl, [esi+eax]
+ mov [edi], cl
+ inc edi
- mov eax, [uiScaleY]
- add [yInverse], eax
- add edi, [dstStep]
+ mov eax, [uiScaleY]
+ add [yInverse], eax
+ add edi, [dstStep]
- paddw xmm4, xmm6 ; inc v
- psllw xmm4, 1
- psrlw xmm4, 1
+ paddw xmm4, xmm6 ; inc v
+ psllw xmm4, 1
+ psrlw xmm4, 1
- dec dword [tmpHeight]
- jg FAST_HEIGHT
+ dec dword [tmpHeight]
+ jg FAST_HEIGHT
FAST_LAST_ROW:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride]
+ add esi, eax ; get current row address
- mov eax, 32768
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
+ mov eax, 32768
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth]
FAST_LAST_ROW_WIDTH:
- mov eax, [xInverse]
- shr eax, 16
+ mov eax, [xInverse]
+ shr eax, 16
- mov al, [esi+eax]
- mov [edi], al
- inc edi
+ mov al, [esi+eax]
+ mov [edi], al
+ inc edi
- mov eax, [uiScaleX]
- add [xInverse], eax
+ mov eax, [uiScaleX]
+ add [xInverse], eax
- loop FAST_LAST_ROW_WIDTH
+ loop FAST_LAST_ROW_WIDTH
FAST_LAST_ROW_END:
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef pushsize
-%undef localsize
-%undef pSrcData
-%undef dwSrcWidth
-%undef dwSrcHeight
-%undef dwSrcStride
-%undef pDstData
-%undef dwDstWidth
-%undef dwDstHeight
-%undef dwDstStride
-%undef scale
-%undef uiScaleX
-%undef uiScaleY
-%undef tmpHeight
-%undef yInverse
-%undef xInverse
-%undef dstStep
- ret
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef pushsize
+%undef localsize
+%undef pSrcData
+%undef dwSrcWidth
+%undef dwSrcHeight
+%undef dwSrcStride
+%undef pDstData
+%undef dwDstWidth
+%undef dwDstHeight
+%undef dwDstStride
+%undef scale
+%undef uiScaleX
+%undef uiScaleY
+%undef tmpHeight
+%undef yInverse
+%undef xInverse
+%undef dstStep
+ ret
%endif
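GeneralBilinearFastDownsampler_sse2 trades a little precision for speed: the horizontal fraction is kept in Q16, pmulhuw folds the u and v fractions into roughly Q15 tap weights, and pmaddwd applies them to the four source pixels before a +16384, >>15 round. A rough scalar model of that per-pixel blend, not bit-exact; the assembly's initial 32768/32767 weight pairs and 16-bit wrap-around are simplified away, and the names are illustrative:

    #include <stdint.h>

    /* One output pixel of the fast path: u is a Q16 fraction, v a Q15 fraction. */
    static inline uint8_t FastBilinearPixel(const uint8_t* pRow0, const uint8_t* pRow1,
                                            uint32_t x, uint32_t u, uint32_t v) {
        uint32_t w00 = ((65536 - u) * (32768 - v)) >> 16;   /* pmulhuw: high half of the product */
        uint32_t w01 = (u * (32768 - v)) >> 16;
        uint32_t w10 = ((65536 - u) * v) >> 16;
        uint32_t w11 = (u * v) >> 16;
        uint32_t acc = w00 * pRow0[x] + w01 * pRow0[x + 1]
                     + w10 * pRow1[x] + w11 * pRow1[x + 1];
        return (uint8_t)((acc + 16384) >> 15);              /* round and drop the Q15 scale */
    }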
--- a/codec/processing/src/x86/vaa.asm
+++ b/codec/processing/src/x86/vaa.asm
@@ -48,100 +48,100 @@
; Macros and other preprocessor constants
;***********************************************************************
%macro SUM_SQR_SSE2 3 ; dst, pSrc, zero
- movdqa %1, %2
- punpcklbw %1, %3
- punpckhbw %2, %3
- pmaddwd %1, %1
- pmaddwd %2, %2
- paddd %1, %2
- pshufd %2, %1, 04Eh ; 01001110 B
- paddd %1, %2
- pshufd %2, %1, 0B1h ; 10110001 B
- paddd %1, %2
+ movdqa %1, %2
+ punpcklbw %1, %3
+ punpckhbw %2, %3
+ pmaddwd %1, %1
+ pmaddwd %2, %2
+ paddd %1, %2
+ pshufd %2, %1, 04Eh ; 01001110 B
+ paddd %1, %2
+ pshufd %2, %1, 0B1h ; 10110001 B
+ paddd %1, %2
%endmacro ; END OF SUM_SQR_SSE2
%macro WELS_SAD_16x2_SSE2 3 ;esi :%1 edi:%2 ebx:%3
- movdqa xmm1, [%1]
- movdqa xmm2, [%2]
- movdqa xmm3, [%1+%3]
- movdqa xmm4, [%2+%3]
- psadbw xmm1, xmm2
- psadbw xmm3, xmm4
- paddd xmm6, xmm1
- paddd xmm6, xmm3
- lea %1, [%1+%3*2]
- lea %2, [%2+%3*2]
+ movdqa xmm1, [%1]
+ movdqa xmm2, [%2]
+ movdqa xmm3, [%1+%3]
+ movdqa xmm4, [%2+%3]
+ psadbw xmm1, xmm2
+ psadbw xmm3, xmm4
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+ lea %1, [%1+%3*2]
+ lea %2, [%2+%3*2]
%endmacro
; by comparing it outperforms than phaddw(SSSE3) sets
%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
- ; @sum_8x2 begin
- pshufd %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 0B1h ; 10110001 B
- paddw %1, %2
- ; end of @sum_8x2
+ ; @sum_8x2 begin
+ pshufd %2, %1, 04Eh ; 01001110 B
+ paddw %1, %2
+ pshuflw %2, %1, 04Eh ; 01001110 B
+ paddw %1, %2
+ pshuflw %2, %1, 0B1h ; 10110001 B
+ paddw %1, %2
+ ; end of @sum_8x2
%endmacro ; END of SUM_WORD_8x2_SSE2
%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 3 ;esi:%1,edi:%2,ebx:%3
- movdqa xmm1, [%1]
- movdqa xmm2, [%2]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm2
- paddd xmm6, xmm3
+ movdqa xmm1, [%1]
+ movdqa xmm2, [%2]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm2
+ paddd xmm6, xmm3
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd xmm5, xmm3
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd xmm5, xmm3
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm4, xmm1
- paddd xmm4, xmm2
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm4, xmm1
+ paddd xmm4, xmm2
- add %1, %3
- add %2, %3
+ add %1, %3
+ add %2, %3
%endmacro
%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 3 ;esi:%1 edi:%2 ebx:%3
- movdqa xmm1, [%1]
- movdqa xmm2, [%2]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm2
- paddd xmm7, xmm3 ; sad
+ movdqa xmm1, [%1]
+ movdqa xmm2, [%2]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm2
+ paddd xmm7, xmm3 ; sad
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; diff
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; diff
- movdqa xmm2, xmm1
- psadbw xmm2, xmm0
- paddd xmm6, xmm2 ; sum
+ movdqa xmm2, xmm1
+ psadbw xmm2, xmm0
+ paddd xmm6, xmm2 ; sum
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm5, xmm1
- paddd xmm5, xmm2 ; sqsum
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm5, xmm1
+ paddd xmm5, xmm2 ; sqsum
- movdqa xmm1, xmm3
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd xmm4, xmm1
- paddd xmm4, xmm3 ; sqdiff
+ movdqa xmm1, xmm3
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd xmm4, xmm1
+ paddd xmm4, xmm3 ; sqdiff
- add %1, %3
- add %2, %3
+ add %1, %3
+ add %2, %3
%endmacro
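The WELS_SAD_* macros above each fold one 16-byte row into a set of running statistics; WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 accumulates the SAD against the reference row, the sum and sum of squares of the current row, and the sum of squared absolute differences. A scalar model of one such row, with illustrative names:

    #include <stdint.h>
    #include <stdlib.h>

    /* What one invocation of WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 adds to its
     * accumulators for a 16-pixel row. */
    static void SadSumSqsumSqdiffRow16(const uint8_t* pCur, const uint8_t* pRef,
                                       uint32_t* pSad, uint32_t* pSum,
                                       uint32_t* pSqsum, uint32_t* pSqdiff) {
        for (int i = 0; i < 16; ++i) {
            uint32_t diff = (uint32_t)abs((int)pCur[i] - (int)pRef[i]);
            *pSad    += diff;                 /* psadbw against the reference   */
            *pSum    += pCur[i];              /* psadbw against zero            */
            *pSqsum  += pCur[i] * pCur[i];    /* punpck + pmaddwd on the pixels */
            *pSqdiff += diff * diff;          /* punpck + pmaddwd on the diffs  */
        }
    }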
%macro WELS_SAD_SD_MAD_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7
@@ -149,40 +149,40 @@
%define sum_cur_reg %2
%define sum_ref_reg %3
%define mad_reg %4
- movdqa xmm1, [%5]
- movdqa xmm2, [%6]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd sum_cur_reg, xmm3 ; sum_cur
- movdqa xmm3, xmm2
- psadbw xmm3, xmm0
- paddd sum_ref_reg, xmm3 ; sum_ref
+ movdqa xmm1, [%5]
+ movdqa xmm2, [%6]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd sum_cur_reg, xmm3 ; sum_cur
+ movdqa xmm3, xmm2
+ psadbw xmm3, xmm0
+ paddd sum_ref_reg, xmm3 ; sum_ref
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; abs diff
- pmaxub mad_reg, xmm3 ; max abs diff
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; abs diff
+ pmaxub mad_reg, xmm3 ; max abs diff
- psadbw xmm3, xmm0
- paddd sad_reg, xmm3 ; sad
+ psadbw xmm3, xmm0
+ paddd sad_reg, xmm3 ; sad
- add %5, %7
- add %6, %7
+ add %5, %7
+ add %6, %7
%endmacro
%macro WELS_MAX_REG_SSE2 1 ; xmm1, xmm2, xmm3 can be used
%define max_reg %1
- movdqa xmm1, max_reg
- psrldq xmm1, 4
- pmaxub max_reg, xmm1
- movdqa xmm1, max_reg
- psrldq xmm1, 2
- pmaxub max_reg, xmm1
- movdqa xmm1, max_reg
- psrldq xmm1, 1
- pmaxub max_reg, xmm1
+ movdqa xmm1, max_reg
+ psrldq xmm1, 4
+ pmaxub max_reg, xmm1
+ movdqa xmm1, max_reg
+ psrldq xmm1, 2
+ pmaxub max_reg, xmm1
+ movdqa xmm1, max_reg
+ psrldq xmm1, 1
+ pmaxub max_reg, xmm1
%endmacro
%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7
@@ -190,50 +190,50 @@
%define sum_reg %2
%define mad_reg %3
%define sqdiff_reg %4
- movdqa xmm1, [%5]
- movdqa xmm2, xmm1
- movdqa xmm3, xmm1
- punpcklbw xmm2, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
- paddd xmm2, xmm3
- movdqa xmm3, xmm2
- psllq xmm2, 32
- psrlq xmm3, 32
- psllq xmm3, 32
- paddd xmm2, xmm3
- paddd sad_reg, xmm2 ; sqsum
+ movdqa xmm1, [%5]
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm1
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+ paddd xmm2, xmm3
+ movdqa xmm3, xmm2
+ psllq xmm2, 32
+ psrlq xmm3, 32
+ psllq xmm3, 32
+ paddd xmm2, xmm3
+ paddd sad_reg, xmm2 ; sqsum
- movdqa xmm2, [%6]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd sum_reg, xmm3 ; sum_cur
- movdqa xmm3, xmm2
- psadbw xmm3, xmm0
- pslldq xmm3, 4
- paddd sum_reg, xmm3 ; sum_ref
+ movdqa xmm2, [%6]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd sum_reg, xmm3 ; sum_cur
+ movdqa xmm3, xmm2
+ psadbw xmm3, xmm0
+ pslldq xmm3, 4
+ paddd sum_reg, xmm3 ; sum_ref
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; abs diff
- pmaxub mad_reg, xmm3 ; max abs diff
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; abs diff
+ pmaxub mad_reg, xmm3 ; max abs diff
- movdqa xmm1, xmm3
- psadbw xmm3, xmm0
- paddd sad_reg, xmm3 ; sad
+ movdqa xmm1, xmm3
+ psadbw xmm3, xmm0
+ paddd sad_reg, xmm3 ; sad
- movdqa xmm3, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd sqdiff_reg, xmm1
- paddd sqdiff_reg, xmm3 ; sqdiff
+ movdqa xmm3, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd sqdiff_reg, xmm1
+ paddd sqdiff_reg, xmm3 ; sqdiff
- add %5, %7
- add %6, %7
+ add %5, %7
+ add %6, %7
%endmacro
@@ -249,99 +249,99 @@
; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
;***********************************************************************
WELS_EXTERN SampleVariance16x16_sse2
- push esi
- push edi
- push ebx
+ push esi
+ push edi
+ push ebx
- sub esp, 16
- %define SUM [esp]
- %define SUM_CUR [esp+4]
- %define SQR [esp+8]
- %define SQR_CUR [esp+12]
- %define PUSH_SIZE 28 ; 12 + 16
+ sub esp, 16
+ %define SUM [esp]
+ %define SUM_CUR [esp+4]
+ %define SQR [esp+8]
+ %define SQR_CUR [esp+12]
+ %define PUSH_SIZE 28 ; 12 + 16
- mov edi, [esp+PUSH_SIZE+4] ; y_ref
- mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride
- mov esi, [esp+PUSH_SIZE+12] ; y_src
- mov eax, [esp+PUSH_SIZE+16] ; y_src_stride
- mov ecx, 010h ; height = 16
+ mov edi, [esp+PUSH_SIZE+4] ; y_ref
+ mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride
+ mov esi, [esp+PUSH_SIZE+12] ; y_src
+ mov eax, [esp+PUSH_SIZE+16] ; y_src_stride
+ mov ecx, 010h ; height = 16
- pxor xmm7, xmm7
- movdqu SUM, xmm7
+ pxor xmm7, xmm7
+ movdqu SUM, xmm7
.hloops:
- movdqa xmm0, [edi] ; y_ref
- movdqa xmm1, [esi] ; y_src
- movdqa xmm2, xmm0 ; store first for future process
- movdqa xmm3, xmm1
- ; sum += diff;
- movdqa xmm4, xmm0
- psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
- ; to be continued for sum
- pshufd xmm5, xmm4, 0C6h ; 11000110 B
- paddw xmm4, xmm5
- movd ebx, xmm4
- add SUM, ebx
+ movdqa xmm0, [edi] ; y_ref
+ movdqa xmm1, [esi] ; y_src
+ movdqa xmm2, xmm0 ; store first for future process
+ movdqa xmm3, xmm1
+ ; sum += diff;
+ movdqa xmm4, xmm0
+ psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
+ ; to be continued for sum
+ pshufd xmm5, xmm4, 0C6h ; 11000110 B
+ paddw xmm4, xmm5
+ movd ebx, xmm4
+ add SUM, ebx
- ; sqr += diff * diff;
- pmaxub xmm0, xmm1
- pminub xmm1, xmm2
- psubb xmm0, xmm1 ; diff
- SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
- movd ebx, xmm1
- add SQR, ebx
+ ; sqr += diff * diff;
+ pmaxub xmm0, xmm1
+ pminub xmm1, xmm2
+ psubb xmm0, xmm1 ; diff
+ SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+ movd ebx, xmm1
+ add SQR, ebx
- ; sum_cur += y_src[x];
- movdqa xmm0, xmm3 ; cur_orig
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7
- punpckhbw xmm1, xmm7
- paddw xmm0, xmm1 ; 8x2
- SUM_WORD_8x2_SSE2 xmm0, xmm1
- movd ebx, xmm0
- and ebx, 0ffffh
- add SUM_CUR, ebx
+ ; sum_cur += y_src[x];
+ movdqa xmm0, xmm3 ; cur_orig
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7
+ punpckhbw xmm1, xmm7
+ paddw xmm0, xmm1 ; 8x2
+ SUM_WORD_8x2_SSE2 xmm0, xmm1
+ movd ebx, xmm0
+ and ebx, 0ffffh
+ add SUM_CUR, ebx
- ; sqr_cur += y_src[x] * y_src[x];
- SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
- movd ebx, xmm0
- add SQR_CUR, ebx
+ ; sqr_cur += y_src[x] * y_src[x];
+ SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+ movd ebx, xmm0
+ add SQR_CUR, ebx
- lea edi, [edi+edx]
- lea esi, [esi+eax]
- dec ecx
- jnz near .hloops
+ lea edi, [edi+edx]
+ lea esi, [esi+eax]
+ dec ecx
+ jnz near .hloops
- mov ebx, 0
- mov bx, word SUM
- sar ebx, 8
- imul ebx, ebx
- mov ecx, SQR
- sar ecx, 8
- sub ecx, ebx
- mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture
- mov [edi], cx ; to store uiMotionIndex
- mov ebx, 0
- mov bx, word SUM_CUR
- sar ebx, 8
- imul ebx, ebx
- mov ecx, SQR_CUR
- sar ecx, 8
- sub ecx, ebx
- mov [edi+2], cx ; to store uiTextureIndex
+ mov ebx, 0
+ mov bx, word SUM
+ sar ebx, 8
+ imul ebx, ebx
+ mov ecx, SQR
+ sar ecx, 8
+ sub ecx, ebx
+ mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture
+ mov [edi], cx ; to store uiMotionIndex
+ mov ebx, 0
+ mov bx, word SUM_CUR
+ sar ebx, 8
+ imul ebx, ebx
+ mov ecx, SQR_CUR
+ sar ecx, 8
+ sub ecx, ebx
+ mov [edi+2], cx ; to store uiTextureIndex
- %undef SUM
- %undef SUM_CUR
- %undef SQR
- %undef SQR_CUR
- %undef PUSH_SIZE
+ %undef SUM
+ %undef SUM_CUR
+ %undef SQR
+ %undef SQR_CUR
+ %undef PUSH_SIZE
- add esp, 16
- pop ebx
- pop edi
- pop esi
+ add esp, 16
+ pop ebx
+ pop edi
+ pop esi
- ret
+ ret
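Reference sketch (not part of this whitespace-only patch): the SampleVariance16x16_sse2 body above computes two variances over a 16x16 luma block, Var(|ref - src|) as the motion index and Var(src) as the texture index, both divided by the 256 pixels via the shift-by-8. A minimal scalar C equivalent, with the SMotionTextureUnit field layout inferred from the 16-bit stores at [edi] and [edi+2], might look like:

    /* Hypothetical scalar counterpart; struct layout assumed from the asm stores. */
    typedef struct {
        unsigned short uiMotionIndex;
        unsigned short uiTextureIndex;
    } SMotionTextureUnitSketch;

    static void SampleVariance16x16_c (const unsigned char* y_ref, int ref_stride,
                                       const unsigned char* y_src, int src_stride,
                                       SMotionTextureUnitSketch* out) {
        int sum = 0, sqr = 0, sum_cur = 0, sqr_cur = 0;
        for (int y = 0; y < 16; y++) {
            for (int x = 0; x < 16; x++) {
                int diff = y_ref[x] - y_src[x];
                if (diff < 0) diff = -diff;      /* pmaxub/pminub/psubb gives |diff| */
                sum     += diff;                 /* psadbw accumulation into SUM     */
                sqr     += diff * diff;          /* SUM_SQR_SSE2 into SQR            */
                sum_cur += y_src[x];             /* SUM_CUR                          */
                sqr_cur += y_src[x] * y_src[x];  /* SQR_CUR                          */
            }
            y_ref += ref_stride;
            y_src += src_stride;
        }
        /* sar by 8 in the asm divides by the 256 pixels of the block */
        out->uiMotionIndex  = (unsigned short) ((sqr     >> 8) - ((sum     >> 8) * (sum     >> 8)));
        out->uiTextureIndex = (unsigned short) ((sqr_cur >> 8) - ((sum_cur >> 8) * (sum_cur >> 8)));
    }
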
@@ -360,67 +360,67 @@
%define psadframe esp + pushsize + 24
%define psad8x8 esp + pushsize + 28
%define pushsize 12
- push esi
- push edi
- push ebx
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
+ push esi
+ push edi
+ push ebx
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
height_loop:
- mov ecx, dword [iPicWidth]
- push esi
- push edi
+ mov ecx, dword [iPicWidth]
+ push esi
+ push edi
width_loop:
- pxor xmm6, xmm6 ;
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- paddd xmm7, xmm6
- movd [edx], xmm6
- psrldq xmm6, 8
- movd [edx+4], xmm6
+ pxor xmm6, xmm6 ;
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ paddd xmm7, xmm6
+ movd [edx], xmm6
+ psrldq xmm6, 8
+ movd [edx+4], xmm6
- pxor xmm6, xmm6
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- paddd xmm7, xmm6
- movd [edx+8], xmm6
- psrldq xmm6, 8
- movd [edx+12], xmm6
+ pxor xmm6, xmm6
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ paddd xmm7, xmm6
+ movd [edx+8], xmm6
+ psrldq xmm6, 8
+ movd [edx+12], xmm6
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
- dec ecx
- jnz width_loop
+ dec ecx
+ jnz width_loop
- pop edi
- pop esi
- add esi, eax
- add edi, eax
+ pop edi
+ pop esi
+ add esi, eax
+ add edi, eax
- dec dword [iPicHeight]
- jnz height_loop
+ dec dword [iPicHeight]
+ jnz height_loop
- mov edx, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [edx], xmm7
+ mov edx, [psadframe]
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5
+ movd [edx], xmm7
%undef cur_data
%undef ref_data
@@ -430,10 +430,10 @@
%undef psadframe
%undef psad8x8
%undef pushsize
- pop ebx
- pop edi
- pop esi
- ret
+ pop ebx
+ pop edi
+ pop esi
+ ret
%else ;64-bit
@@ -441,98 +441,98 @@
; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
;***********************************************************************
WELS_EXTERN SampleVariance16x16_sse2
- %define SUM r10;[esp]
- %define SUM_CUR r11;[esp+4]
- %define SQR r13;[esp+8]
- %define SQR_CUR r15;[esp+12]
+ %define SUM r10;[esp]
+ %define SUM_CUR r11;[esp+4]
+ %define SQR r13;[esp+8]
+ %define SQR_CUR r15;[esp+12]
- push r12
- push r13
- push r14
- push r15
- %assign push_num 4
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1,r1d
- SIGN_EXTENSION r3,r3d
+ push r12
+ push r13
+ push r14
+ push r15
+ %assign push_num 4
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1,r1d
+ SIGN_EXTENSION r3,r3d
- mov r12,010h
- pxor xmm7, xmm7
- movq SUM, xmm7
- movq SUM_CUR,xmm7
- movq SQR,xmm7
- movq SQR_CUR,xmm7
+ mov r12,010h
+ pxor xmm7, xmm7
+ movq SUM, xmm7
+ movq SUM_CUR,xmm7
+ movq SQR,xmm7
+ movq SQR_CUR,xmm7
.hloops:
- mov r14,0
- movdqa xmm0, [r0] ; y_ref
- movdqa xmm1, [r2] ; y_src
- movdqa xmm2, xmm0 ; store first for future process
- movdqa xmm3, xmm1
- ; sum += diff;
- movdqa xmm4, xmm0
- psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
- ; to be continued for sum
- pshufd xmm5, xmm4, 0C6h ; 11000110 B
- paddw xmm4, xmm5
- movd r14d, xmm4
- add SUM, r14
+ mov r14,0
+ movdqa xmm0, [r0] ; y_ref
+ movdqa xmm1, [r2] ; y_src
+ movdqa xmm2, xmm0 ; store first for future process
+ movdqa xmm3, xmm1
+ ; sum += diff;
+ movdqa xmm4, xmm0
+ psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
+ ; to be continued for sum
+ pshufd xmm5, xmm4, 0C6h ; 11000110 B
+ paddw xmm4, xmm5
+ movd r14d, xmm4
+ add SUM, r14
- ; sqr += diff * diff;
- pmaxub xmm0, xmm1
- pminub xmm1, xmm2
- psubb xmm0, xmm1 ; diff
- SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
- movd r14d, xmm1
- add SQR, r14
+ ; sqr += diff * diff;
+ pmaxub xmm0, xmm1
+ pminub xmm1, xmm2
+ psubb xmm0, xmm1 ; diff
+ SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+ movd r14d, xmm1
+ add SQR, r14
- ; sum_cur += y_src[x];
- movdqa xmm0, xmm3 ; cur_orig
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7
- punpckhbw xmm1, xmm7
- paddw xmm0, xmm1 ; 8x2
- SUM_WORD_8x2_SSE2 xmm0, xmm1
- movd r14d, xmm0
- and r14, 0ffffh
- add SUM_CUR, r14
+ ; sum_cur += y_src[x];
+ movdqa xmm0, xmm3 ; cur_orig
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7
+ punpckhbw xmm1, xmm7
+ paddw xmm0, xmm1 ; 8x2
+ SUM_WORD_8x2_SSE2 xmm0, xmm1
+ movd r14d, xmm0
+ and r14, 0ffffh
+ add SUM_CUR, r14
- ; sqr_cur += y_src[x] * y_src[x];
- SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
- movd r14d, xmm0
- add SQR_CUR, r14
+ ; sqr_cur += y_src[x] * y_src[x];
+ SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+ movd r14d, xmm0
+ add SQR_CUR, r14
- lea r0, [r0+r1]
- lea r2, [r2+r3]
- dec r12
- jnz near .hloops
+ lea r0, [r0+r1]
+ lea r2, [r2+r3]
+ dec r12
+ jnz near .hloops
- mov r0, SUM
- sar r0, 8
- imul r0, r0
- mov r1, SQR
- sar r1, 8
- sub r1, r0
- mov [r4], r1w ; to store uiMotionIndex
- mov r0, SUM_CUR
- sar r0, 8
- imul r0, r0
- mov r1, SQR_CUR
- sar r1, 8
- sub r1, r0
- mov [r4+2], r1w ; to store uiTextureIndex
+ mov r0, SUM
+ sar r0, 8
+ imul r0, r0
+ mov r1, SQR
+ sar r1, 8
+ sub r1, r0
+ mov [r4], r1w ; to store uiMotionIndex
+ mov r0, SUM_CUR
+ sar r0, 8
+ imul r0, r0
+ mov r1, SQR_CUR
+ sar r1, 8
+ sub r1, r0
+ mov [r4+2], r1w ; to store uiTextureIndex
- POP_XMM
- LOAD_5_PARA_POP
- pop r15
- pop r14
- pop r13
- pop r12
+ POP_XMM
+ LOAD_5_PARA_POP
+ pop r15
+ pop r14
+ pop r13
+ pop r12
- %assign push_num 0
+ %assign push_num 0
- ret
+ ret
;*************************************************************************************************************
@@ -550,69 +550,69 @@
%define psadframe r5
%define psad8x8 r6
- push r12
- push r13
- %assign push_num 2
- LOAD_7_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
- SIGN_EXTENSION r4,r4d
+ push r12
+ push r13
+ %assign push_num 2
+ LOAD_7_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r2,r2d
+ SIGN_EXTENSION r3,r3d
+ SIGN_EXTENSION r4,r4d
- mov r12,r4
- shr r2, 4 ; iPicWidth/16
- shr r3, 4 ; iPicHeight/16
+ mov r12,r4
+ shr r2, 4 ; iPicWidth/16
+ shr r3, 4 ; iPicHeight/16
- shl r12, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
+ shl r12, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
height_loop:
- mov r13, r2
- push r0
- push r1
+ mov r13, r2
+ push r0
+ push r1
width_loop:
- pxor xmm6, xmm6
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- paddd xmm7, xmm6
- movd [r6], xmm6
- psrldq xmm6, 8
- movd [r6+4], xmm6
+ pxor xmm6, xmm6
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ paddd xmm7, xmm6
+ movd [r6], xmm6
+ psrldq xmm6, 8
+ movd [r6+4], xmm6
- pxor xmm6, xmm6
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- paddd xmm7, xmm6
- movd [r6+8], xmm6
- psrldq xmm6, 8
- movd [r6+12], xmm6
+ pxor xmm6, xmm6
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ paddd xmm7, xmm6
+ movd [r6+8], xmm6
+ psrldq xmm6, 8
+ movd [r6+12], xmm6
- add r6, 16
- sub r0, r12
- sub r1, r12
- add r0, 16
- add r1, 16
+ add r6, 16
+ sub r0, r12
+ sub r1, r12
+ add r0, 16
+ add r1, 16
- dec r13
- jnz width_loop
+ dec r13
+ jnz width_loop
- pop r1
- pop r0
- add r0, r12
- add r1, r12
+ pop r1
+ pop r0
+ add r0, r12
+ add r1, r12
- dec r3
- jnz height_loop
+ dec r3
+ jnz height_loop
- ;mov r13, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [psadframe], xmm7
+ ;mov r13, [psadframe]
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5
+ movd [psadframe], xmm7
%undef cur_data
%undef ref_data
@@ -622,12 +622,12 @@
%undef psadframe
%undef psad8x8
%undef pushsize
- POP_XMM
- LOAD_7_PARA_POP
- pop r13
- pop r12
- %assign push_num 0
- ret
+ POP_XMM
+ LOAD_7_PARA_POP
+ pop r13
+ pop r12
+ %assign push_num 0
+ ret
%endif
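Reference sketch (again outside the patch itself): the frame-SAD routine above writes four 8x8 SADs per 16x16 macroblock to psad8x8 (top-left, top-right, bottom-left, bottom-right, matching the [edx]/[edx+4] then [edx+8]/[edx+12] stores) and sums them into psadframe. A hypothetical scalar equivalent under those assumptions:

    static void CalcSadFrame_c (const unsigned char* cur, const unsigned char* ref,
                                int pic_width, int pic_height, int stride,
                                int* psadframe, int* psad8x8) {
        int frame_sad = 0;
        for (int mb_y = 0; mb_y < (pic_height >> 4); mb_y++) {
            for (int mb_x = 0; mb_x < (pic_width >> 4); mb_x++) {
                for (int b = 0; b < 4; b++) {            /* four 8x8 blocks per MB */
                    int ox = mb_x * 16 + (b & 1) * 8;
                    int oy = mb_y * 16 + (b >> 1) * 8;
                    int sad = 0;
                    for (int y = 0; y < 8; y++) {
                        for (int x = 0; x < 8; x++) {
                            int d = cur[(oy + y) * stride + ox + x]
                                  - ref[(oy + y) * stride + ox + x];
                            sad += d < 0 ? -d : d;       /* psadbw per row */
                        }
                    }
                    *psad8x8++ = sad;
                    frame_sad += sad;
                }
            }
        }
        *psadframe = frame_sad;
    }
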
@@ -653,103 +653,103 @@
%define tmp_esi esp + 0
%define tmp_edi esp + 4
%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
var_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi
+ mov [tmp_edi], edi
var_width_loop:
- pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
- pxor xmm5, xmm5 ; pSum16x16
- pxor xmm4, xmm4 ; sqsum_16x16
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- paddd xmm7, xmm6
- movd [edx], xmm6
- psrldq xmm6, 8
- movd [edx+4], xmm6
+ pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
+ pxor xmm5, xmm5 ; pSum16x16
+ pxor xmm4, xmm4 ; sqsum_16x16
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ paddd xmm7, xmm6
+ movd [edx], xmm6
+ psrldq xmm6, 8
+ movd [edx+4], xmm6
- pxor xmm6, xmm6
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- paddd xmm7, xmm6
- movd [edx+8], xmm6
- psrldq xmm6, 8
- movd [edx+12], xmm6
+ pxor xmm6, xmm6
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ paddd xmm7, xmm6
+ movd [edx+8], xmm6
+ psrldq xmm6, 8
+ movd [edx+12], xmm6
- mov ebp, [psum16x16]
- movdqa xmm1, xmm5
- psrldq xmm1, 8
- paddd xmm5, xmm1
- movd [ebp], xmm5
- add dword [psum16x16], 4
+ mov ebp, [psum16x16]
+ movdqa xmm1, xmm5
+ psrldq xmm1, 8
+ paddd xmm5, xmm1
+ movd [ebp], xmm5
+ add dword [psum16x16], 4
- movdqa xmm5, xmm4
- psrldq xmm5, 8
- paddd xmm4, xmm5
- movdqa xmm3, xmm4
- psrldq xmm3, 4
- paddd xmm4, xmm3
+ movdqa xmm5, xmm4
+ psrldq xmm5, 8
+ paddd xmm4, xmm5
+ movdqa xmm3, xmm4
+ psrldq xmm3, 4
+ paddd xmm4, xmm3
- mov ebp, [psqsum16x16]
- movd [ebp], xmm4
- add dword [psqsum16x16], 4
+ mov ebp, [psqsum16x16]
+ movd [ebp], xmm4
+ add dword [psqsum16x16], 4
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
- dec ecx
- jnz var_width_loop
+ dec ecx
+ jnz var_width_loop
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax
+ add edi, eax
- dec dword [iPicHeight]
- jnz var_height_loop
+ dec dword [iPicHeight]
+ jnz var_height_loop
- mov edx, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [edx], xmm7
+ mov edx, [psadframe]
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5
+ movd [edx], xmm7
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
%undef cur_data
%undef ref_data
%undef iPicWidth
@@ -763,7 +763,7 @@
%undef tmp_edi
%undef pushsize
%undef localsize
- ret
+ ret
%else ;64-bit
@@ -784,112 +784,112 @@
%define psum16x16 arg8
%define psqsum16x16 arg9
- push r12
- push r13
- push r14
- push r15
- %assign push_num 4
- PUSH_XMM 8
+ push r12
+ push r13
+ push r14
+ push r15
+ %assign push_num 4
+ PUSH_XMM 8
%ifdef WIN64
- mov r4, arg5 ;iPicStride
- mov r5, arg6 ;psad8x8
+ mov r4, arg5 ;iPicStride
+ mov r5, arg6 ;psad8x8
%endif
- mov r14,arg7
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
- SIGN_EXTENSION r4,r4d
+ mov r14,arg7
+ SIGN_EXTENSION r2,r2d
+ SIGN_EXTENSION r3,r3d
+ SIGN_EXTENSION r4,r4d
- mov r13,r4
- shr r2,4
- shr r3,4
+ mov r13,r4
+ shr r2,4
+ shr r3,4
- shl r13,4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
+ shl r13,4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
var_height_loop:
- push r2
- %assign push_num push_num+1
- mov r11, r0
- mov r12, r1
+ push r2
+ %assign push_num push_num+1
+ mov r11, r0
+ mov r12, r1
var_width_loop:
- pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
- pxor xmm5, xmm5 ; pSum16x16
- pxor xmm4, xmm4 ; sqsum_16x16
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- paddd xmm7, xmm6
- movd [r14], xmm6
- psrldq xmm6, 8
- movd [r14+4], xmm6
+ pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
+ pxor xmm5, xmm5 ; pSum16x16
+ pxor xmm4, xmm4 ; sqsum_16x16
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ paddd xmm7, xmm6
+ movd [r14], xmm6
+ psrldq xmm6, 8
+ movd [r14+4], xmm6
- pxor xmm6, xmm6
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- paddd xmm7, xmm6
- movd [r14+8], xmm6
- psrldq xmm6, 8
- movd [r14+12], xmm6
+ pxor xmm6, xmm6
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ paddd xmm7, xmm6
+ movd [r14+8], xmm6
+ psrldq xmm6, 8
+ movd [r14+12], xmm6
- mov r15, psum16x16
- movdqa xmm1, xmm5
- psrldq xmm1, 8
- paddd xmm5, xmm1
- movd [r15], xmm5
- add dword psum16x16, 4
+ mov r15, psum16x16
+ movdqa xmm1, xmm5
+ psrldq xmm1, 8
+ paddd xmm5, xmm1
+ movd [r15], xmm5
+ add dword psum16x16, 4
- movdqa xmm5, xmm4
- psrldq xmm5, 8
- paddd xmm4, xmm5
- movdqa xmm3, xmm4
- psrldq xmm3, 4
- paddd xmm4, xmm3
+ movdqa xmm5, xmm4
+ psrldq xmm5, 8
+ paddd xmm4, xmm5
+ movdqa xmm3, xmm4
+ psrldq xmm3, 4
+ paddd xmm4, xmm3
- mov r15, psqsum16x16
- movd [r15], xmm4
- add dword psqsum16x16, 4
+ mov r15, psqsum16x16
+ movd [r15], xmm4
+ add dword psqsum16x16, 4
- add r14,16
- sub r0, r13
- sub r1, r13
- add r0, 16
- add r1, 16
+ add r14,16
+ sub r0, r13
+ sub r1, r13
+ add r0, 16
+ add r1, 16
- dec r2
- jnz var_width_loop
+ dec r2
+ jnz var_width_loop
- pop r2
- %assign push_num push_num-1
- mov r0, r11
- mov r1, r12
- add r0, r13
- add r1, r13
- dec r3
- jnz var_height_loop
+ pop r2
+ %assign push_num push_num-1
+ mov r0, r11
+ mov r1, r12
+ add r0, r13
+ add r1, r13
+ dec r3
+ jnz var_height_loop
- mov r15, psadframe
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [r15], xmm7
+ mov r15, psadframe
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5
+ movd [r15], xmm7
- POP_XMM
- pop r15
- pop r14
- pop r13
- pop r12
+ POP_XMM
+ pop r15
+ pop r14
+ pop r13
+ pop r12
%assign push_num 0
%undef cur_data
%undef ref_data
@@ -904,7 +904,7 @@
%undef tmp_edi
%undef pushsize
%undef localsize
- ret
+ ret
%endif
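Reference sketch for the extra per-16x16 accumulators handled above and in the next routine (pointer names follow the %define labels; the C-side signatures are assumed): psum16x16 receives the sum of current pixels, psqsum16x16 the sum of their squares, and the SadSsd variant below additionally writes psqdiff16x16, the sum of squared differences. One output dword per macroblock, with the pointers advanced by 4 as in the asm:

    static void Sum_SqSum_SqDiff_16x16_c (const unsigned char* cur, const unsigned char* ref,
                                          int stride, int* psum, int* psqsum, int* psqdiff) {
        int sum = 0, sqsum = 0, sqdiff = 0;
        for (int y = 0; y < 16; y++) {
            for (int x = 0; x < 16; x++) {
                int c = cur[y * stride + x];
                int d = c - ref[y * stride + x];
                sum    += c;          /* psum16x16   */
                sqsum  += c * c;      /* psqsum16x16 */
                sqdiff += d * d;      /* psqdiff16x16, SadSsd variant only */
            }
        }
        *psum    = sum;
        *psqsum  = sqsum;
        *psqdiff = sqdiff;
    }
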
@@ -932,118 +932,118 @@
%define tmp_edi esp + 4
%define tmp_sadframe esp + 8
%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
- mov ecx, [iPicWidth]
- mov ecx, [iPicHeight]
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
+ mov ecx, [iPicWidth]
+ mov ecx, [iPicHeight]
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- mov ecx, [iPicWidth]
- mov ecx, [iPicHeight]
- pxor xmm0, xmm0
- movd [tmp_sadframe], xmm0
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ mov ecx, [iPicWidth]
+ mov ecx, [iPicHeight]
+ pxor xmm0, xmm0
+ movd [tmp_sadframe], xmm0
sqdiff_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi
+ mov [tmp_edi], edi
sqdiff_width_loop:
- pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
- pxor xmm6, xmm6 ; pSum16x16
- pxor xmm5, xmm5 ; sqsum_16x16 four dword
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- movdqa xmm1, xmm7
- movd [edx], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [edx+4], xmm7
- movd ebp, xmm1
- add [tmp_sadframe], ebp
+ pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
+ pxor xmm6, xmm6 ; pSum16x16
+ pxor xmm5, xmm5 ; sqsum_16x16 four dword
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ movdqa xmm1, xmm7
+ movd [edx], xmm7
+ psrldq xmm7, 8
+ paddd xmm1, xmm7
+ movd [edx+4], xmm7
+ movd ebp, xmm1
+ add [tmp_sadframe], ebp
- pxor xmm7, xmm7
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- movdqa xmm1, xmm7
- movd [edx+8], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [edx+12], xmm7
- movd ebp, xmm1
- add [tmp_sadframe], ebp
+ pxor xmm7, xmm7
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ movdqa xmm1, xmm7
+ movd [edx+8], xmm7
+ psrldq xmm7, 8
+ paddd xmm1, xmm7
+ movd [edx+12], xmm7
+ movd ebp, xmm1
+ add [tmp_sadframe], ebp
- mov ebp, [psum16x16]
- movdqa xmm1, xmm6
- psrldq xmm1, 8
- paddd xmm6, xmm1
- movd [ebp], xmm6
- add dword [psum16x16], 4
+ mov ebp, [psum16x16]
+ movdqa xmm1, xmm6
+ psrldq xmm1, 8
+ paddd xmm6, xmm1
+ movd [ebp], xmm6
+ add dword [psum16x16], 4
- mov ebp, [psqsum16x16]
- pshufd xmm6, xmm5, 14 ;00001110
- paddd xmm6, xmm5
- pshufd xmm5, xmm6, 1 ;00000001
- paddd xmm5, xmm6
- movd [ebp], xmm5
- add dword [psqsum16x16], 4
+ mov ebp, [psqsum16x16]
+ pshufd xmm6, xmm5, 14 ;00001110
+ paddd xmm6, xmm5
+ pshufd xmm5, xmm6, 1 ;00000001
+ paddd xmm5, xmm6
+ movd [ebp], xmm5
+ add dword [psqsum16x16], 4
- mov ebp, [psqdiff16x16]
- pshufd xmm5, xmm4, 14 ; 00001110
- paddd xmm5, xmm4
- pshufd xmm4, xmm5, 1 ; 00000001
- paddd xmm4, xmm5
- movd [ebp], xmm4
- add dword [psqdiff16x16], 4
+ mov ebp, [psqdiff16x16]
+ pshufd xmm5, xmm4, 14 ; 00001110
+ paddd xmm5, xmm4
+ pshufd xmm4, xmm5, 1 ; 00000001
+ paddd xmm4, xmm5
+ movd [ebp], xmm4
+ add dword [psqdiff16x16], 4
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
- dec ecx
- jnz sqdiff_width_loop
+ dec ecx
+ jnz sqdiff_width_loop
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax
+ add edi, eax
- dec dword [iPicHeight]
- jnz sqdiff_height_loop
+ dec dword [iPicHeight]
+ jnz sqdiff_height_loop
- mov ebx, [tmp_sadframe]
- mov eax, [psadframe]
- mov [eax], ebx
+ mov ebx, [tmp_sadframe]
+ mov eax, [psadframe]
+ mov [eax], ebx
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
%undef cur_data
%undef ref_data
%undef iPicWidth
@@ -1059,7 +1059,7 @@
%undef tmp_sadframe
%undef pushsize
%undef localsize
- ret
+ ret
%else
@@ -1083,128 +1083,128 @@
%define psqsum16x16 arg9;
%define psqdiff16x16 arg10
- push r12
- push r13
- push r14
- push r15
- %assign push_num 4
- PUSH_XMM 10
+ push r12
+ push r13
+ push r14
+ push r15
+ %assign push_num 4
+ PUSH_XMM 10
%ifdef WIN64
- mov r4,arg5
+ mov r4,arg5
%endif
- mov r14,arg7
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
- SIGN_EXTENSION r4,r4d
+ mov r14,arg7
+ SIGN_EXTENSION r2,r2d
+ SIGN_EXTENSION r3,r3d
+ SIGN_EXTENSION r4,r4d
- mov r13,r4
- shr r2,4 ; iPicWidth/16
- shr r3,4 ; iPicHeight/16
- shl r13,4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm8, xmm8 ;framesad
- pxor xmm9, xmm9
+ mov r13,r4
+ shr r2,4 ; iPicWidth/16
+ shr r3,4 ; iPicHeight/16
+ shl r13,4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm8, xmm8 ;framesad
+ pxor xmm9, xmm9
sqdiff_height_loop:
- ;mov ecx, dword [iPicWidth]
- ;mov r14,r2
- push r2
- %assign push_num push_num +1
- mov r10, r0
- mov r11, r1
+ ;mov ecx, dword [iPicWidth]
+ ;mov r14,r2
+ push r2
+ %assign push_num push_num +1
+ mov r10, r0
+ mov r11, r1
sqdiff_width_loop:
- pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
- pxor xmm6, xmm6 ; pSum16x16
- pxor xmm5, xmm5 ; sqsum_16x16 four dword
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- movdqa xmm1, xmm7
- movd [r14], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [r14+4], xmm7
- movd r15d, xmm1
- movd xmm9, r15d
- paddd xmm8,xmm9
+ pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
+ pxor xmm6, xmm6 ; pSum16x16
+ pxor xmm5, xmm5 ; sqsum_16x16 four dword
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ movdqa xmm1, xmm7
+ movd [r14], xmm7
+ psrldq xmm7, 8
+ paddd xmm1, xmm7
+ movd [r14+4], xmm7
+ movd r15d, xmm1
+ movd xmm9, r15d
+ paddd xmm8,xmm9
- pxor xmm7, xmm7
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- movdqa xmm1, xmm7
- movd [r14+8], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [r14+12], xmm7
- movd r15d, xmm1
- movd xmm9, r15d
- paddd xmm8,xmm9
+ pxor xmm7, xmm7
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ movdqa xmm1, xmm7
+ movd [r14+8], xmm7
+ psrldq xmm7, 8
+ paddd xmm1, xmm7
+ movd [r14+12], xmm7
+ movd r15d, xmm1
+ movd xmm9, r15d
+ paddd xmm8,xmm9
- mov r15, psum16x16
- movdqa xmm1, xmm6
- psrldq xmm1, 8
- paddd xmm6, xmm1
- movd [r15], xmm6
- add dword psum16x16, 4
+ mov r15, psum16x16
+ movdqa xmm1, xmm6
+ psrldq xmm1, 8
+ paddd xmm6, xmm1
+ movd [r15], xmm6
+ add dword psum16x16, 4
- mov r15, psqsum16x16
- pshufd xmm6, xmm5, 14 ;00001110
- paddd xmm6, xmm5
- pshufd xmm5, xmm6, 1 ;00000001
- paddd xmm5, xmm6
- movd [r15], xmm5
- add dword psqsum16x16, 4
+ mov r15, psqsum16x16
+ pshufd xmm6, xmm5, 14 ;00001110
+ paddd xmm6, xmm5
+ pshufd xmm5, xmm6, 1 ;00000001
+ paddd xmm5, xmm6
+ movd [r15], xmm5
+ add dword psqsum16x16, 4
- mov r15, psqdiff16x16
- pshufd xmm5, xmm4, 14 ; 00001110
- paddd xmm5, xmm4
- pshufd xmm4, xmm5, 1 ; 00000001
- paddd xmm4, xmm5
- movd [r15], xmm4
- add dword psqdiff16x16, 4
+ mov r15, psqdiff16x16
+ pshufd xmm5, xmm4, 14 ; 00001110
+ paddd xmm5, xmm4
+ pshufd xmm4, xmm5, 1 ; 00000001
+ paddd xmm4, xmm5
+ movd [r15], xmm4
+ add dword psqdiff16x16, 4
- add r14,16
- sub r0, r13
- sub r1, r13
- add r0, 16
- add r1, 16
+ add r14,16
+ sub r0, r13
+ sub r1, r13
+ add r0, 16
+ add r1, 16
- dec r2
- jnz sqdiff_width_loop
+ dec r2
+ jnz sqdiff_width_loop
- pop r2
- %assign push_num push_num -1
+ pop r2
+ %assign push_num push_num -1
- mov r0, r10
- mov r1, r11
- add r0, r13
- add r1, r13
+ mov r0, r10
+ mov r1, r11
+ add r0, r13
+ add r1, r13
- dec r3
- jnz sqdiff_height_loop
+ dec r3
+ jnz sqdiff_height_loop
- mov r13, psadframe
- movd [r13], xmm8
+ mov r13, psadframe
+ movd [r13], xmm8
- POP_XMM
- pop r15
- pop r14
- pop r13
- pop r12
- %assign push_num 0
+ POP_XMM
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %assign push_num 0
%undef cur_data
%undef ref_data
@@ -1221,7 +1221,7 @@
%undef tmp_sadframe
%undef pushsize
%undef localsize
- ret
+ ret
@@ -1249,145 +1249,145 @@
%define tmp_edi esp + 4
%define tmp_ecx esp + 8
%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov eax, ebx
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov eax, ebx
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- xor ebp, ebp
- pxor xmm0, xmm0
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ xor ebp, ebp
+ pxor xmm0, xmm0
bgd_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi
+ mov [tmp_edi], edi
bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8
- pxor xmm6, xmm6 ; sum_cur_8x8
- pxor xmm5, xmm5 ; sum_ref_8x8
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ pxor xmm7, xmm7 ; pSad8x8
+ pxor xmm6, xmm6 ; sum_cur_8x8
+ pxor xmm5, xmm5 ; sum_ref_8x8
+ pxor xmm4, xmm4 ; pMad8x8
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm4
- ;movdqa xmm1, xmm4
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm4, xmm0
- ;punpcklwd xmm4, xmm0
- ;movd [edx+4], xmm4
- ;add edx, 8
- ;mov [p_mad8x8], edx
- mov [tmp_ecx], ecx
- movhlps xmm1, xmm4
- movd ecx, xmm4
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
+ ;movdqa xmm1, xmm4
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm4, xmm0
+ ;punpcklwd xmm4, xmm0
+ ;movd [edx+4], xmm4
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ mov [tmp_ecx], ecx
+ movhlps xmm1, xmm4
+ movd ecx, xmm4
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx
- pslldq xmm7, 4
- pslldq xmm6, 4
- pslldq xmm5, 4
+ pslldq xmm7, 4
+ pslldq xmm6, 4
+ pslldq xmm5, 4
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ pxor xmm4, xmm4 ; pMad8x8
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm4
- ;movdqa xmm1, xmm4
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm4, xmm0
- ;punpcklwd xmm4, xmm0
- ;movd [edx+4], xmm4
- ;add edx, 8
- ;mov [p_mad8x8], edx
- movhlps xmm1, xmm4
- movd ecx, xmm4
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
+ ;movdqa xmm1, xmm4
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm4, xmm0
+ ;punpcklwd xmm4, xmm0
+ ;movd [edx+4], xmm4
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ movhlps xmm1, xmm4
+ movd ecx, xmm4
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx
- ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
+ ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
- mov edx, [psad8x8]
- pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
- movdqa [edx], xmm1
- add edx, 16
- mov [psad8x8], edx ; sad8x8
+ mov edx, [psad8x8]
+ pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
+ movdqa [edx], xmm1
+ add edx, 16
+ mov [psad8x8], edx ; sad8x8
- paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
- pshufd xmm2, xmm1, 00000011b
- paddd xmm1, xmm2
- movd edx, xmm1
- add ebp, edx ; sad frame
+ paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
+ pshufd xmm2, xmm1, 00000011b
+ paddd xmm1, xmm2
+ movd edx, xmm1
+ add ebp, edx ; sad frame
- mov edx, [p_sd8x8]
- psubd xmm6, xmm5
- pshufd xmm1, xmm6, 10001101b
- movdqa [edx], xmm1
- add edx, 16
- mov [p_sd8x8], edx
+ mov edx, [p_sd8x8]
+ psubd xmm6, xmm5
+ pshufd xmm1, xmm6, 10001101b
+ movdqa [edx], xmm1
+ add edx, 16
+ mov [p_sd8x8], edx
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
- mov ecx, [tmp_ecx]
- dec ecx
- jnz bgd_width_loop
+ mov ecx, [tmp_ecx]
+ dec ecx
+ jnz bgd_width_loop
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax
+ add edi, eax
- dec dword [iPicHeight]
- jnz bgd_height_loop
+ dec dword [iPicHeight]
+ jnz bgd_height_loop
- mov edx, [psadframe]
- mov [edx], ebp
+ mov edx, [psadframe]
+ mov [edx], ebp
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
%undef cur_data
%undef ref_data
%undef iPicWidth
@@ -1401,7 +1401,7 @@
%undef tmp_edi
%undef pushsize
%undef localsize
- ret
+ ret
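Reference sketch for the background-detection terms gathered above (names follow the p_sd8x8/p_mad8x8 labels; the scalar form is an assumption): per 8x8 block the loop records the SAD, the signed sum difference sd = sum(cur) - sum(ref) from the psubd of the two sums, and the maximum absolute pixel difference mad reduced by WELS_MAX_REG_SSE2 and stored as a single byte:

    static void Sad_Sd_Mad_8x8_c (const unsigned char* cur, const unsigned char* ref,
                                  int stride, int* sad, int* sd, unsigned char* mad) {
        int sum_cur = 0, sum_ref = 0, s = 0;
        unsigned char m = 0;
        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++) {
                int c = cur[y * stride + x];
                int r = ref[y * stride + x];
                int d = c > r ? c - r : r - c;   /* abs diff via pmaxub/pminub/psubb */
                s += d;
                if (d > m) m = (unsigned char) d;
                sum_cur += c;
                sum_ref += r;
            }
        }
        *sad = s;
        *sd  = sum_cur - sum_ref;   /* psubd xmm6, xmm5 in the asm */
        *mad = m;
    }
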
@@ -1431,190 +1431,190 @@
%define tmp_sadframe esp + 8
%define tmp_ecx esp + 12
%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov eax, ebx
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov eax, ebx
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- movd [tmp_sadframe], xmm0
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ movd [tmp_sadframe], xmm0
sqdiff_bgd_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi
+ mov [tmp_edi], edi
sqdiff_bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- mov edx, [psad8x8]
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [edx], xmm2
- movd [edx+4], xmm1
- add edx, 8
- mov [psad8x8], edx ; sad8x8
+ mov edx, [psad8x8]
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b
+ movd [edx], xmm2
+ movd [edx+4], xmm1
+ add edx, 8
+ mov [psad8x8], edx ; sad8x8
- paddd xmm1, xmm2
- movd edx, xmm1
- add [tmp_sadframe], edx ; iFrameSad
+ paddd xmm1, xmm2
+ movd edx, xmm1
+ add [tmp_sadframe], edx ; iFrameSad
- mov edx, [psum16x16]
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd [edx], xmm1 ; sum
+ mov edx, [psum16x16]
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd [edx], xmm1 ; sum
- mov edx, [p_sd8x8]
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [edx], xmm1
- add edx, 8
- mov [p_sd8x8], edx
+ mov edx, [p_sd8x8]
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [edx], xmm1
+ add edx, 8
+ mov [p_sd8x8], edx
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm5
- ;movdqa xmm1, xmm5
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm5, xmm0
- ;punpcklwd xmm5, xmm0
- ;movd [edx+4], xmm5
- ;add edx, 8
- ;mov [p_mad8x8], edx
- mov [tmp_ecx], ecx
- movhlps xmm1, xmm5
- movd ecx, xmm5
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm5
+ ;movdqa xmm1, xmm5
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm5, xmm0
+ ;punpcklwd xmm5, xmm0
+ ;movd [edx+4], xmm5
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ mov [tmp_ecx], ecx
+ movhlps xmm1, xmm5
+ movd ecx, xmm5
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx
- psrlq xmm7, 32
- psllq xmm7, 32 ; clear sad
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ psrlq xmm7, 32
+ psllq xmm7, 32 ; clear sad
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- mov edx, [psad8x8]
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [edx], xmm2
- movd [edx+4], xmm1
- add edx, 8
- mov [psad8x8], edx ; sad8x8
+ mov edx, [psad8x8]
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b
+ movd [edx], xmm2
+ movd [edx+4], xmm1
+ add edx, 8
+ mov [psad8x8], edx ; sad8x8
- paddd xmm1, xmm2
- movd edx, xmm1
- add [tmp_sadframe], edx ; iFrameSad
+ paddd xmm1, xmm2
+ movd edx, xmm1
+ add [tmp_sadframe], edx ; iFrameSad
- mov edx, [psum16x16]
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd ebp, xmm1 ; sum
- add [edx], ebp
- add edx, 4
- mov [psum16x16], edx
+ mov edx, [psum16x16]
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd ebp, xmm1 ; sum
+ add [edx], ebp
+ add edx, 4
+ mov [psum16x16], edx
- mov edx, [psqsum16x16]
- psrlq xmm7, 32
- pshufd xmm2, xmm7, 00001110b
- paddd xmm2, xmm7
- movd [edx], xmm2 ; sqsum
- add edx, 4
- mov [psqsum16x16], edx
+ mov edx, [psqsum16x16]
+ psrlq xmm7, 32
+ pshufd xmm2, xmm7, 00001110b
+ paddd xmm2, xmm7
+ movd [edx], xmm2 ; sqsum
+ add edx, 4
+ mov [psqsum16x16], edx
- mov edx, [p_sd8x8]
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [edx], xmm1
- add edx, 8
- mov [p_sd8x8], edx
+ mov edx, [p_sd8x8]
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [edx], xmm1
+ add edx, 8
+ mov [p_sd8x8], edx
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm5
- ;movdqa xmm1, xmm5
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm5, xmm0
- ;punpcklwd xmm5, xmm0
- ;movd [edx+4], xmm5
- ;add edx, 8
- ;mov [p_mad8x8], edx
- movhlps xmm1, xmm5
- movd ecx, xmm5
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm5
+ ;movdqa xmm1, xmm5
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm5, xmm0
+ ;punpcklwd xmm5, xmm0
+ ;movd [edx+4], xmm5
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ movhlps xmm1, xmm5
+ movd ecx, xmm5
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx
- mov edx, [psqdiff16x16]
- pshufd xmm1, xmm4, 00001110b
- paddd xmm4, xmm1
- pshufd xmm1, xmm4, 00000001b
- paddd xmm4, xmm1
- movd [edx], xmm4
- add edx, 4
- mov [psqdiff16x16], edx
+ mov edx, [psqdiff16x16]
+ pshufd xmm1, xmm4, 00001110b
+ paddd xmm4, xmm1
+ pshufd xmm1, xmm4, 00000001b
+ paddd xmm4, xmm1
+ movd [edx], xmm4
+ add edx, 4
+ mov [psqdiff16x16], edx
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
- mov ecx, [tmp_ecx]
- dec ecx
- jnz sqdiff_bgd_width_loop
+ mov ecx, [tmp_ecx]
+ dec ecx
+ jnz sqdiff_bgd_width_loop
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax
+ add edi, eax
- dec dword [iPicHeight]
- jnz sqdiff_bgd_height_loop
+ dec dword [iPicHeight]
+ jnz sqdiff_bgd_height_loop
- mov edx, [psadframe]
- mov ebp, [tmp_sadframe]
- mov [edx], ebp
+ mov edx, [psadframe]
+ mov ebp, [tmp_sadframe]
+ mov [edx], ebp
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
%undef cur_data
%undef ref_data
%undef iPicWidth
@@ -1631,7 +1631,7 @@
%undef tmp_edi
%undef pushsize
%undef localsize
- ret
+ ret
%else
;*************************************************************************************************************
@@ -1651,142 +1651,142 @@
%define p_sd8x8 arg8;
%define p_mad8x8 arg9;
- push r12
- push r13
- push r14
- push r15
+ push r12
+ push r13
+ push r14
+ push r15
%assign push_num 4
- PUSH_XMM 10
+ PUSH_XMM 10
%ifdef WIN64
- mov r4,arg5
- ; mov r5,arg6
+ mov r4,arg5
+ ; mov r5,arg6
%endif
- mov r14,arg7
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
- SIGN_EXTENSION r4,r4d
+ mov r14,arg7
+ SIGN_EXTENSION r2,r2d
+ SIGN_EXTENSION r3,r3d
+ SIGN_EXTENSION r4,r4d
- mov r13,r4
- mov r15,r0
- shr r2,4
- shr r3,4
- shl r13,4
- pxor xmm0, xmm0
- pxor xmm8, xmm8
- pxor xmm9, xmm9
+ mov r13,r4
+ mov r15,r0
+ shr r2,4
+ shr r3,4
+ shl r13,4
+ pxor xmm0, xmm0
+ pxor xmm8, xmm8
+ pxor xmm9, xmm9
bgd_height_loop:
- ;mov ecx, dword [iPicWidth]
- push r2
- %assign push_num push_num+1
- mov r10, r15
- mov r11, r1
+ ;mov ecx, dword [iPicWidth]
+ push r2
+ %assign push_num push_num+1
+ mov r10, r15
+ mov r11, r1
bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8
- pxor xmm6, xmm6 ; sum_cur_8x8
- pxor xmm5, xmm5 ; sum_ref_8x8
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ pxor xmm7, xmm7 ; pSad8x8
+ pxor xmm6, xmm6 ; sum_cur_8x8
+ pxor xmm5, xmm5 ; sum_ref_8x8
+ pxor xmm4, xmm4 ; pMad8x8
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- mov r14, p_mad8x8
- WELS_MAX_REG_SSE2 xmm4
+ mov r14, p_mad8x8
+ WELS_MAX_REG_SSE2 xmm4
- ;mov [tmp_ecx], ecx
- movhlps xmm1, xmm4
- movd r0d, xmm4
+ ;mov [tmp_ecx], ecx
+ movhlps xmm1, xmm4
+ movd r0d, xmm4
- mov [r14], r0b
- movd r0d, xmm1
- mov [r14+1],r0b
- add r14, 2
- ;mov p_mad8x8, r14
+ mov [r14], r0b
+ movd r0d, xmm1
+ mov [r14+1],r0b
+ add r14, 2
+ ;mov p_mad8x8, r14
- pslldq xmm7, 4
- pslldq xmm6, 4
- pslldq xmm5, 4
+ pslldq xmm7, 4
+ pslldq xmm6, 4
+ pslldq xmm5, 4
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ pxor xmm4, xmm4 ; pMad8x8
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- ;mov r14, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4
+ ;mov r14, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm4
- movhlps xmm1, xmm4
- movd r0d, xmm4
- mov [r14], r0b
- movd r0d, xmm1
- mov [r14+1],r0b
- add r14, 2
- mov p_mad8x8, r14
+ movhlps xmm1, xmm4
+ movd r0d, xmm4
+ mov [r14], r0b
+ movd r0d, xmm1
+ mov [r14+1],r0b
+ add r14, 2
+ mov p_mad8x8, r14
- ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
+ ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
- mov r14, psad8x8
- pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
- movdqa [r14], xmm1
- add r14, 16
- mov psad8x8, r14 ; sad8x8
+ mov r14, psad8x8
+ pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
+ movdqa [r14], xmm1
+ add r14, 16
+ mov psad8x8, r14 ; sad8x8
- paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
- pshufd xmm2, xmm1, 00000011b
- paddd xmm1, xmm2
- movd r14d, xmm1
- movd xmm9, r14d
- paddd xmm8, xmm9 ; sad frame
+ paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
+ pshufd xmm2, xmm1, 00000011b
+ paddd xmm1, xmm2
+ movd r14d, xmm1
+ movd xmm9, r14d
+ paddd xmm8, xmm9 ; sad frame
- mov r14, p_sd8x8
- psubd xmm6, xmm5
- pshufd xmm1, xmm6, 10001101b
- movdqa [r14], xmm1
- add r14, 16
- mov p_sd8x8, r14
+ mov r14, p_sd8x8
+ psubd xmm6, xmm5
+ pshufd xmm1, xmm6, 10001101b
+ movdqa [r14], xmm1
+ add r14, 16
+ mov p_sd8x8, r14
- ;add edx, 16
- sub r15, r13
- sub r1, r13
- add r15, 16
- add r1, 16
+ ;add edx, 16
+ sub r15, r13
+ sub r1, r13
+ add r15, 16
+ add r1, 16
- dec r2
- jnz bgd_width_loop
- pop r2
+ dec r2
+ jnz bgd_width_loop
+ pop r2
%assign push_num push_num-1
- mov r15, r10
- mov r1, r11
- add r15, r13
- add r1, r13
+ mov r15, r10
+ mov r1, r11
+ add r15, r13
+ add r1, r13
- dec r3
- jnz bgd_height_loop
+ dec r3
+ jnz bgd_height_loop
- mov r13, psadframe
- movd [r13], xmm8
+ mov r13, psadframe
+ movd [r13], xmm8
- POP_XMM
- pop r15
- pop r14
- pop r13
- pop r12
+ POP_XMM
+ pop r15
+ pop r14
+ pop r13
+ pop r12
%assign push_num 0
%undef cur_data
%undef ref_data
@@ -1801,7 +1801,7 @@
%undef tmp_edi
%undef pushsize
%undef localsize
- ret
+ ret
@@ -1826,189 +1826,189 @@
%define p_sd8x8 arg11
%define p_mad8x8 arg12
- push r12
- push r13
- push r14
- push r15
+ push r12
+ push r13
+ push r14
+ push r15
%assign push_num 4
- PUSH_XMM 10
+ PUSH_XMM 10
%ifdef WIN64
- mov r4,arg5
- ;mov r5,arg6
+ mov r4,arg5
+ ;mov r5,arg6
%endif
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
- SIGN_EXTENSION r4,r4d
+ SIGN_EXTENSION r2,r2d
+ SIGN_EXTENSION r3,r3d
+ SIGN_EXTENSION r4,r4d
- mov r13,r4
- shr r2, 4 ; iPicWidth/16
- shr r3, 4 ; iPicHeight/16
- shl r13, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm8, xmm8
- pxor xmm9, xmm9
+ mov r13,r4
+ shr r2, 4 ; iPicWidth/16
+ shr r3, 4 ; iPicHeight/16
+ shl r13, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm8, xmm8
+ pxor xmm9, xmm9
sqdiff_bgd_height_loop:
- mov r10, r0
- mov r11, r1
- push r2
+ mov r10, r0
+ mov r11, r1
+ push r2
%assign push_num push_num+1
sqdiff_bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- mov r14, psad8x8
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [r14], xmm2
- movd [r14+4], xmm1
- add r14, 8
- mov psad8x8, r14 ; sad8x8
+ mov r14, psad8x8
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b
+ movd [r14], xmm2
+ movd [r14+4], xmm1
+ add r14, 8
+ mov psad8x8, r14 ; sad8x8
- paddd xmm1, xmm2
- movd r14d, xmm1
- movd xmm9,r14d
- paddd xmm8, xmm9 ; iFrameSad
+ paddd xmm1, xmm2
+ movd r14d, xmm1
+ movd xmm9,r14d
+ paddd xmm8, xmm9 ; iFrameSad
- mov r14, psum16x16
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd [r14], xmm1 ; sum
+ mov r14, psum16x16
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd [r14], xmm1 ; sum
- mov r14, p_sd8x8
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [r14], xmm1
- add r14, 8
- mov p_sd8x8, r14
+ mov r14, p_sd8x8
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [r14], xmm1
+ add r14, 8
+ mov p_sd8x8, r14
- mov r14, p_mad8x8
- WELS_MAX_REG_SSE2 xmm5
+ mov r14, p_mad8x8
+ WELS_MAX_REG_SSE2 xmm5
- movhlps xmm1, xmm5
- push r0
- movd r0d, xmm5
- mov [r14], r0b
- movd r0d, xmm1
- mov [r14+1],r0b
- pop r0
- add r14, 2
- mov p_mad8x8, r14
+ movhlps xmm1, xmm5
+ push r0
+ movd r0d, xmm5
+ mov [r14], r0b
+ movd r0d, xmm1
+ mov [r14+1],r0b
+ pop r0
+ add r14, 2
+ mov p_mad8x8, r14
- psrlq xmm7, 32
- psllq xmm7, 32 ; clear sad
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ psrlq xmm7, 32
+ psllq xmm7, 32 ; clear sad
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- mov r14, psad8x8
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [r14], xmm2
- movd [r14+4], xmm1
- add r14, 8
- mov psad8x8, r14 ; sad8x8
+ mov r14, psad8x8
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b
+ movd [r14], xmm2
+ movd [r14+4], xmm1
+ add r14, 8
+ mov psad8x8, r14 ; sad8x8
- paddd xmm1, xmm2
- movd r14d, xmm1
- movd xmm9, r14d
- paddd xmm8, xmm9 ; iFrameSad
+ paddd xmm1, xmm2
+ movd r14d, xmm1
+ movd xmm9, r14d
+ paddd xmm8, xmm9 ; iFrameSad
- mov r14, psum16x16
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd r15d, xmm1 ; sum
- add [r14], r15d
- add r14, 4
- mov psum16x16, r14
+ mov r14, psum16x16
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd r15d, xmm1 ; sum
+ add [r14], r15d
+ add r14, 4
+ mov psum16x16, r14
- mov r14, psqsum16x16
- psrlq xmm7, 32
- pshufd xmm2, xmm7, 00001110b
- paddd xmm2, xmm7
- movd [r14], xmm2 ; sqsum
- add r14, 4
- mov psqsum16x16, r14
+ mov r14, psqsum16x16
+ psrlq xmm7, 32
+ pshufd xmm2, xmm7, 00001110b
+ paddd xmm2, xmm7
+ movd [r14], xmm2 ; sqsum
+ add r14, 4
+ mov psqsum16x16, r14
- mov r14, p_sd8x8
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [r14], xmm1
- add r14, 8
- mov p_sd8x8, r14
+ mov r14, p_sd8x8
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [r14], xmm1
+ add r14, 8
+ mov p_sd8x8, r14
- mov r14, p_mad8x8
- WELS_MAX_REG_SSE2 xmm5
+ mov r14, p_mad8x8
+ WELS_MAX_REG_SSE2 xmm5
- movhlps xmm1, xmm5
- push r0
- movd r0d, xmm5
- mov [r14], r0b
- movd r0d, xmm1
- mov [r14+1],r0b
- pop r0
- add r14, 2
- mov p_mad8x8, r14
+ movhlps xmm1, xmm5
+ push r0
+ movd r0d, xmm5
+ mov [r14], r0b
+ movd r0d, xmm1
+ mov [r14+1],r0b
+ pop r0
+ add r14, 2
+ mov p_mad8x8, r14
- mov r14, psqdiff16x16
- pshufd xmm1, xmm4, 00001110b
- paddd xmm4, xmm1
- pshufd xmm1, xmm4, 00000001b
- paddd xmm4, xmm1
- movd [r14], xmm4
- add r14, 4
- mov psqdiff16x16, r14
+ mov r14, psqdiff16x16
+ pshufd xmm1, xmm4, 00001110b
+ paddd xmm4, xmm1
+ pshufd xmm1, xmm4, 00000001b
+ paddd xmm4, xmm1
+ movd [r14], xmm4
+ add r14, 4
+ mov psqdiff16x16, r14
- add r14, 16
- sub r0, r13
- sub r1, r13
- add r0, 16
- add r1, 16
+ add r14, 16
+ sub r0, r13
+ sub r1, r13
+ add r0, 16
+ add r1, 16
- dec r2
- jnz sqdiff_bgd_width_loop
- pop r2
- %assign push_num push_num-1
- mov r0, r10
- mov r1, r11
- add r0, r13
- add r1, r13
+ dec r2
+ jnz sqdiff_bgd_width_loop
+ pop r2
+ %assign push_num push_num-1
+ mov r0, r10
+ mov r1, r11
+ add r0, r13
+ add r1, r13
- dec r3
- jnz sqdiff_bgd_height_loop
+ dec r3
+ jnz sqdiff_bgd_height_loop
- mov r14, psadframe
- movd [r14], xmm8
+ mov r14, psadframe
+ movd [r14], xmm8
- POP_XMM
- pop r15
- pop r14
- pop r13
- pop r12
+ POP_XMM
+ pop r15
+ pop r14
+ pop r13
+ pop r12
%assign push_num 0
%undef cur_data
%undef ref_data
@@ -2026,5 +2026,5 @@
%undef tmp_edi
%undef pushsize
%undef localsize
- ret
+ ret
%endif