ref: c2c355b62341874eae06f281386be73452d266da
parent: d9998335174171f0b37b0d963f8081cf13545972
parent: 57f6bcc4b0da529101c25fd97349e9e55a6a5cee
author: Ethan Hugg <[email protected]>
date: Mon Jun 2 03:12:30 EDT 2014
Merge pull request #911 from mstorsjo/reformat-asm

Convert all tabs to spaces in assembly sources, unify indentation
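The change below is purely mechanical: tab characters in the .S files are expanded to spaces and the operand columns are lined up, with no instruction changes. As an illustration only (this is a hypothetical sketch, not the tool actually used for this pull request), a tab-to-space normalization of this kind could be scripted roughly as follows; the 4-column tab stop is an assumption.

#!/usr/bin/env python3
# Hypothetical helper, not part of this commit: expand tabs to spaces in
# assembly sources so indentation is uniform across files.
import sys

def convert(path, tab_width=4):
    with open(path, encoding="utf-8") as f:
        text = f.read()
    # str.expandtabs pads each tab out to the next tab stop.
    converted = "\n".join(line.expandtabs(tab_width) for line in text.splitlines())
    with open(path, "w", encoding="utf-8") as f:
        f.write(converted + "\n")

if __name__ == "__main__":
    for src in sys.argv[1:]:   # e.g. codec/common/arm/*.S
        convert(src)

The real reformatting also re-aligns mnemonic and operand columns by hand, which a simple tab expansion does not capture; the sketch only shows the tab-to-space part.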
--- a/codec/common/arm/copy_mb_neon.S
+++ b/codec/common/arm/copy_mb_neon.S
@@ -35,76 +35,76 @@
#include "arm_arch_common_macro.S"
#ifdef __APPLE__
-.macro LOAD_ALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, src*, src_stride
- vld1.64 {$0}, [$4,:128], $5
- vld1.64 {$1}, [$4,:128], $5
- vld1.64 {$2}, [$4,:128], $5
- vld1.64 {$3}, [$4,:128], $5
-// }
+.macro LOAD_ALIGNED_DATA_WITH_STRIDE
+// { // input: $0~$3, src*, src_stride
+ vld1.64 {$0}, [$4,:128], $5
+ vld1.64 {$1}, [$4,:128], $5
+ vld1.64 {$2}, [$4,:128], $5
+ vld1.64 {$3}, [$4,:128], $5
+// }
.endm
-.macro STORE_ALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, dst*, dst_stride
- vst1.64 {$0}, [$4,:128], $5
- vst1.64 {$1}, [$4,:128], $5
- vst1.64 {$2}, [$4,:128], $5
- vst1.64 {$3}, [$4,:128], $5
-// }
+.macro STORE_ALIGNED_DATA_WITH_STRIDE
+// { // input: $0~$3, dst*, dst_stride
+ vst1.64 {$0}, [$4,:128], $5
+ vst1.64 {$1}, [$4,:128], $5
+ vst1.64 {$2}, [$4,:128], $5
+ vst1.64 {$3}, [$4,:128], $5
+// }
.endm
-.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, src*, src_stride
- vld1.64 {$0}, [$4], $5
- vld1.64 {$1}, [$4], $5
- vld1.64 {$2}, [$4], $5
- vld1.64 {$3}, [$4], $5
-// }
+.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
+// { // input: $0~$3, src*, src_stride
+ vld1.64 {$0}, [$4], $5
+ vld1.64 {$1}, [$4], $5
+ vld1.64 {$2}, [$4], $5
+ vld1.64 {$3}, [$4], $5
+// }
.endm
-.macro STORE_UNALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, dst*, dst_stride
- vst1.64 {$0}, [$4], $5
- vst1.64 {$1}, [$4], $5
- vst1.64 {$2}, [$4], $5
- vst1.64 {$3}, [$4], $5
-// }
+.macro STORE_UNALIGNED_DATA_WITH_STRIDE
+// { // input: $0~$3, dst*, dst_stride
+ vst1.64 {$0}, [$4], $5
+ vst1.64 {$1}, [$4], $5
+ vst1.64 {$2}, [$4], $5
+ vst1.64 {$3}, [$4], $5
+// }
.endm
#else
-.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-// { // input: \arg0~\arg3, src*, src_stride
- vld1.64 {\arg0}, [\arg4,:128], \arg5
- vld1.64 {\arg1}, [\arg4,:128], \arg5
- vld1.64 {\arg2}, [\arg4,:128], \arg5
- vld1.64 {\arg3}, [\arg4,:128], \arg5
-// }
+.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: \arg0~\arg3, src*, src_stride
+ vld1.64 {\arg0}, [\arg4,:128], \arg5
+ vld1.64 {\arg1}, [\arg4,:128], \arg5
+ vld1.64 {\arg2}, [\arg4,:128], \arg5
+ vld1.64 {\arg3}, [\arg4,:128], \arg5
+// }
.endm
-.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-// { // input: \arg0~\arg3, dst*, dst_stride
- vst1.64 {\arg0}, [\arg4,:128], \arg5
- vst1.64 {\arg1}, [\arg4,:128], \arg5
- vst1.64 {\arg2}, [\arg4,:128], \arg5
- vst1.64 {\arg3}, [\arg4,:128], \arg5
-// }
+.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: \arg0~\arg3, dst*, dst_stride
+ vst1.64 {\arg0}, [\arg4,:128], \arg5
+ vst1.64 {\arg1}, [\arg4,:128], \arg5
+ vst1.64 {\arg2}, [\arg4,:128], \arg5
+ vst1.64 {\arg3}, [\arg4,:128], \arg5
+// }
.endm
-.macro LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-// { // input: \arg0~\arg3, src*, src_stride
- vld1.64 {\arg0}, [\arg4], \arg5
- vld1.64 {\arg1}, [\arg4], \arg5
- vld1.64 {\arg2}, [\arg4], \arg5
- vld1.64 {\arg3}, [\arg4], \arg5
-// }
+.macro LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: \arg0~\arg3, src*, src_stride
+ vld1.64 {\arg0}, [\arg4], \arg5
+ vld1.64 {\arg1}, [\arg4], \arg5
+ vld1.64 {\arg2}, [\arg4], \arg5
+ vld1.64 {\arg3}, [\arg4], \arg5
+// }
.endm
-.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-// { // input: \arg0~\arg3, dst*, dst_stride
- vst1.64 {\arg0}, [\arg4], \arg5
- vst1.64 {\arg1}, [\arg4], \arg5
- vst1.64 {\arg2}, [\arg4], \arg5
- vst1.64 {\arg3}, [\arg4], \arg5
-// }
+.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: \arg0~\arg3, dst*, dst_stride
+ vst1.64 {\arg0}, [\arg4], \arg5
+ vst1.64 {\arg1}, [\arg4], \arg5
+ vst1.64 {\arg2}, [\arg4], \arg5
+ vst1.64 {\arg3}, [\arg4], \arg5
+// }
.endm
#endif
@@ -112,13 +112,13 @@
WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon
- LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
- LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
WELS_ASM_FUNC_END
@@ -125,21 +125,21 @@
WELS_ASM_FUNC_BEGIN WelsCopy16x16_neon
- LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
+ LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
- STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
+ STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
- LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
+ LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
- STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
+ STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
- LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
+ LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
- STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
+ STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
- LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
+ LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
- STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
+ STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
WELS_ASM_FUNC_END
@@ -146,21 +146,21 @@
WELS_ASM_FUNC_BEGIN WelsCopy16x16NotAligned_neon
- LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
- LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
- LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
- LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
WELS_ASM_FUNC_END
@@ -167,13 +167,13 @@
WELS_ASM_FUNC_BEGIN WelsCopy16x8NotAligned_neon
- LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
- LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
WELS_ASM_FUNC_END
@@ -180,21 +180,21 @@
WELS_ASM_FUNC_BEGIN WelsCopy8x16_neon
- LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
- LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
- LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
- LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
+ LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
- STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
+ STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
WELS_ASM_FUNC_END
--- a/codec/common/arm/deblocking_neon.S
+++ b/codec/common/arm/deblocking_neon.S
@@ -1,35 +1,35 @@
/*!
-* \copy
-* Copyright (c) 2013, Cisco Systems
-* All rights reserved.
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
-* Redistribution and use in source and binary forms, with or without
-* modification, are permitted provided that the following conditions
-* are met:
-
-* * Redistributions of source code must retain the above copyright
-* notice, this list of conditions and the following disclaimer.
-
-* * Redistributions in binary form must reproduce the above copyright
-* notice, this list of conditions and the following disclaimer in
-* the documentation and/or other materials provided with the
-* distribution.
-
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-* POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
#ifdef HAVE_NEON
.text
@@ -36,815 +36,815 @@
#include "arm_arch_common_macro.S"
#ifdef __APPLE__
-.macro JMP_IF_128BITS_IS_ZERO
- vorr.s16 $2, $0, $1
- vmov r3, r2, $2
- orr r3, r3, r2
- cmp r3, #0
+.macro JMP_IF_128BITS_IS_ZERO
+ vorr.s16 $2, $0, $1
+ vmov r3, r2, $2
+ orr r3, r3, r2
+ cmp r3, #0
.endm
-.macro MASK_MATRIX
- vabd.u8 $6, $1, $2
- vcgt.u8 $6, $4, $6
+.macro MASK_MATRIX
+ vabd.u8 $6, $1, $2
+ vcgt.u8 $6, $4, $6
- vabd.u8 $4, $0, $1
- vclt.u8 $4, $4, $5
- vand.u8 $6, $6, $4
+ vabd.u8 $4, $0, $1
+ vclt.u8 $4, $4, $5
+ vand.u8 $6, $6, $4
- vabd.u8 $4, $3, $2
- vclt.u8 $4, $4, $5
- vand.u8 $6, $6, $4
+ vabd.u8 $4, $3, $2
+ vclt.u8 $4, $4, $5
+ vand.u8 $6, $6, $4
.endm
-.macro DIFF_LUMA_LT4_P1_Q1
+.macro DIFF_LUMA_LT4_P1_Q1
vmov.i8 $9, #128
- vrhadd.u8 $8, $2, $3
- vhadd.u8 $8, $0, $8
- vsub.s8 $8, $8, $9
- vsub.s8 $9, $1, $9
- vqsub.s8 $8, $8, $9
- vmax.s8 $8, $8, $5
- vmin.s8 $8, $8, $6
- vabd.u8 $9, $0, $2
- vclt.u8 $9, $9, $4
- vand.s8 $8, $8, $9
- vand.s8 $8, $8, $7
- vadd.u8 $8, $1, $8
- vabs.s8 $9, $9
+ vrhadd.u8 $8, $2, $3
+ vhadd.u8 $8, $0, $8
+ vsub.s8 $8, $8, $9
+ vsub.s8 $9, $1, $9
+ vqsub.s8 $8, $8, $9
+ vmax.s8 $8, $8, $5
+ vmin.s8 $8, $8, $6
+ vabd.u8 $9, $0, $2
+ vclt.u8 $9, $9, $4
+ vand.s8 $8, $8, $9
+ vand.s8 $8, $8, $7
+ vadd.u8 $8, $1, $8
+ vabs.s8 $9, $9
.endm
-.macro DIFF_LUMA_LT4_P0_Q0
- vsubl.u8 $5, $0, $3
- vsubl.u8 $6, $2, $1
- vshl.s16 $6, $6, #2
- vadd.s16 $5, $5, $6
- vqrshrn.s16 $4, $5, #3
+.macro DIFF_LUMA_LT4_P0_Q0
+ vsubl.u8 $5, $0, $3
+ vsubl.u8 $6, $2, $1
+ vshl.s16 $6, $6, #2
+ vadd.s16 $5, $5, $6
+ vqrshrn.s16 $4, $5, #3
.endm
-.macro DIFF_LUMA_EQ4_P2P1P0
- vaddl.u8 q4, $1, $2
- vaddl.u8 q5, $3, $4
- vadd.u16 q5, q4, q5
+.macro DIFF_LUMA_EQ4_P2P1P0
+ vaddl.u8 q4, $1, $2
+ vaddl.u8 q5, $3, $4
+ vadd.u16 q5, q4, q5
- vaddl.u8 q4, $0, $1
- vshl.u16 q4, q4, #1
- vadd.u16 q4, q5, q4
+ vaddl.u8 q4, $0, $1
+ vshl.u16 q4, q4, #1
+ vadd.u16 q4, q5, q4
- vrshrn.u16 $0, q5, #2
- vrshrn.u16 $7, q4, #3
+ vrshrn.u16 $0, q5, #2
+ vrshrn.u16 $7, q4, #3
- vshl.u16 q5, q5, #1
- vsubl.u8 q4, $5, $1
- vadd.u16 q5, q4,q5
+ vshl.u16 q5, q5, #1
+ vsubl.u8 q4, $5, $1
+ vadd.u16 q5, q4,q5
- vaddl.u8 q4, $2, $5
- vaddw.u8 q4, q4, $2
- vaddw.u8 q4, q4, $3
+ vaddl.u8 q4, $2, $5
+ vaddw.u8 q4, q4, $2
+ vaddw.u8 q4, q4, $3
- vrshrn.u16 d10,q5, #3
- vrshrn.u16 d8, q4, #2
- vbsl.u8 $6, d10, d8
+ vrshrn.u16 d10,q5, #3
+ vrshrn.u16 d8, q4, #2
+ vbsl.u8 $6, d10, d8
.endm
-.macro DIFF_LUMA_EQ4_MASK
- vmov $3, $2
- vbsl.u8 $3, $0, $1
+.macro DIFF_LUMA_EQ4_MASK
+ vmov $3, $2
+ vbsl.u8 $3, $0, $1
.endm
-.macro DIFF_CHROMA_EQ4_P0Q0
- vaddl.u8 $4, $0, $3
- vaddw.u8 $5, $4, $1
- vaddw.u8 $6, $4, $2
- vaddw.u8 $5, $5, $0
+.macro DIFF_CHROMA_EQ4_P0Q0
+ vaddl.u8 $4, $0, $3
+ vaddw.u8 $5, $4, $1
+ vaddw.u8 $6, $4, $2
+ vaddw.u8 $5, $5, $0
- vaddw.u8 $6, $6, $3
- vrshrn.u16 $7, $5, #2
- vrshrn.u16 $8, $6, #2
+ vaddw.u8 $6, $6, $3
+ vrshrn.u16 $7, $5, #2
+ vrshrn.u16 $8, $6, #2
.endm
-.macro LOAD_CHROMA_DATA_4
- vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
- vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
+.macro LOAD_CHROMA_DATA_4
+ vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
+ vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
.endm
-.macro STORE_CHROMA_DATA_4
- vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
- vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
+.macro STORE_CHROMA_DATA_4
+ vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
+ vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
.endm
-.macro LOAD_LUMA_DATA_3
- vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1
- vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
+.macro LOAD_LUMA_DATA_3
+ vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1
+ vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
.endm
-.macro STORE_LUMA_DATA_4
- vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
- vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
+.macro STORE_LUMA_DATA_4
+ vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
+ vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
.endm
-.macro STORE_LUMA_DATA_3
- vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1
- vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
+.macro STORE_LUMA_DATA_3
+ vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1
+ vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
.endm
-.macro EXTRACT_DELTA_INTO_TWO_PART
- vcge.s8 $1, $0, #0
- vand $1, $0, $1
- vsub.s8 $0, $1, $0
+.macro EXTRACT_DELTA_INTO_TWO_PART
+ vcge.s8 $1, $0, #0
+ vand $1, $0, $1
+ vsub.s8 $0, $1, $0
.endm
#else
-.macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
- vorr.s16 \arg2, \arg0, \arg1
- vmov r3, r2, \arg2
- orr r3, r3, r2
- cmp r3, #0
+.macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
+ vorr.s16 \arg2, \arg0, \arg1
+ vmov r3, r2, \arg2
+ orr r3, r3, r2
+ cmp r3, #0
.endm
-.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
- vabd.u8 \arg6, \arg1, \arg2
- vcgt.u8 \arg6, \arg4, \arg6
+.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
+ vabd.u8 \arg6, \arg1, \arg2
+ vcgt.u8 \arg6, \arg4, \arg6
- vabd.u8 \arg4, \arg0, \arg1
- vclt.u8 \arg4, \arg4, \arg5
- vand.u8 \arg6, \arg6, \arg4
+ vabd.u8 \arg4, \arg0, \arg1
+ vclt.u8 \arg4, \arg4, \arg5
+ vand.u8 \arg6, \arg6, \arg4
- vabd.u8 \arg4, \arg3, \arg2
- vclt.u8 \arg4, \arg4, \arg5
- vand.u8 \arg6, \arg6, \arg4
+ vabd.u8 \arg4, \arg3, \arg2
+ vclt.u8 \arg4, \arg4, \arg5
+ vand.u8 \arg6, \arg6, \arg4
.endm
-.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
vmov.i8 \arg9, #128
- vrhadd.u8 \arg8, \arg2, \arg3
- vhadd.u8 \arg8, \arg0, \arg8
- vsub.s8 \arg8, \arg8, \arg9
- vsub.s8 \arg9, \arg1, \arg9
+ vrhadd.u8 \arg8, \arg2, \arg3
+ vhadd.u8 \arg8, \arg0, \arg8
+ vsub.s8 \arg8, \arg8, \arg9
+ vsub.s8 \arg9, \arg1, \arg9
vqsub.s8 \arg8, \arg8, \arg9
- vmax.s8 \arg8, \arg8, \arg5
- vmin.s8 \arg8, \arg8, \arg6
- vabd.u8 \arg9, \arg0, \arg2
- vclt.u8 \arg9, \arg9, \arg4
- vand.s8 \arg8, \arg8, \arg9
- vand.s8 \arg8, \arg8, \arg7
- vadd.u8 \arg8, \arg1, \arg8
- vabs.s8 \arg9, \arg9
+ vmax.s8 \arg8, \arg8, \arg5
+ vmin.s8 \arg8, \arg8, \arg6
+ vabd.u8 \arg9, \arg0, \arg2
+ vclt.u8 \arg9, \arg9, \arg4
+ vand.s8 \arg8, \arg8, \arg9
+ vand.s8 \arg8, \arg8, \arg7
+ vadd.u8 \arg8, \arg1, \arg8
+ vabs.s8 \arg9, \arg9
.endm
-.macro DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- vsubl.u8 \arg5, \arg0, \arg3
- vsubl.u8 \arg6, \arg2, \arg1
- vshl.s16 \arg6, \arg6, #2
- vadd.s16 \arg5, \arg5, \arg6
- vqrshrn.s16 \arg4, \arg5, #3
+.macro DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+ vsubl.u8 \arg5, \arg0, \arg3
+ vsubl.u8 \arg6, \arg2, \arg1
+ vshl.s16 \arg6, \arg6, #2
+ vadd.s16 \arg5, \arg5, \arg6
+ vqrshrn.s16 \arg4, \arg5, #3
.endm
-.macro DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
- vaddl.u8 q4, \arg1, \arg2
- vaddl.u8 q5, \arg3, \arg4
- vadd.u16 q5, q4, q5
+.macro DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+ vaddl.u8 q4, \arg1, \arg2
+ vaddl.u8 q5, \arg3, \arg4
+ vadd.u16 q5, q4, q5
- vaddl.u8 q4, \arg0, \arg1
- vshl.u16 q4, q4, #1
- vadd.u16 q4, q5, q4
+ vaddl.u8 q4, \arg0, \arg1
+ vshl.u16 q4, q4, #1
+ vadd.u16 q4, q5, q4
- vrshrn.u16 \arg0, q5, #2
- vrshrn.u16 \arg7, q4, #3
+ vrshrn.u16 \arg0, q5, #2
+ vrshrn.u16 \arg7, q4, #3
- vshl.u16 q5, q5, #1
- vsubl.u8 q4, \arg5, \arg1
- vadd.u16 q5, q4,q5
+ vshl.u16 q5, q5, #1
+ vsubl.u8 q4, \arg5, \arg1
+ vadd.u16 q5, q4,q5
- vaddl.u8 q4, \arg2, \arg5
- vaddw.u8 q4, q4, \arg2
- vaddw.u8 q4, q4, \arg3
+ vaddl.u8 q4, \arg2, \arg5
+ vaddw.u8 q4, q4, \arg2
+ vaddw.u8 q4, q4, \arg3
- vrshrn.u16 d10,q5, #3
- vrshrn.u16 d8, q4, #2
- vbsl.u8 \arg6, d10, d8
+ vrshrn.u16 d10,q5, #3
+ vrshrn.u16 d8, q4, #2
+ vbsl.u8 \arg6, d10, d8
.endm
-.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
- vmov \arg3, \arg2
- vbsl.u8 \arg3, \arg0, \arg1
+.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
+ vmov \arg3, \arg2
+ vbsl.u8 \arg3, \arg0, \arg1
.endm
-.macro DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
- vaddl.u8 \arg4, \arg0, \arg3
- vaddw.u8 \arg5, \arg4, \arg1
- vaddw.u8 \arg6, \arg4, \arg2
- vaddw.u8 \arg5, \arg5, \arg0
- vaddw.u8 \arg6, \arg6, \arg3
- vrshrn.u16 \arg7, \arg5, #2
- vrshrn.u16 \arg8, \arg6, #2
+.macro DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+ vaddl.u8 \arg4, \arg0, \arg3
+ vaddw.u8 \arg5, \arg4, \arg1
+ vaddw.u8 \arg6, \arg4, \arg2
+ vaddw.u8 \arg5, \arg5, \arg0
+ vaddw.u8 \arg6, \arg6, \arg3
+ vrshrn.u16 \arg7, \arg5, #2
+ vrshrn.u16 \arg8, \arg6, #2
.endm
-.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
- vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
- vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
+.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+ vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
+ vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
.endm
-.macro STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
- vst4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
- vst4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
+.macro STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+ vst4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
+ vst4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
.endm
-.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- vld3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1
- vld3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
+.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+ vld3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1
+ vld3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
.endm
-.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
- vst4.u8 {\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1
- vst4.u8 {\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
+.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
+ vst4.u8 {\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1
+ vst4.u8 {\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
.endm
-.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- vst3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1
- vst3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
+.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+ vst3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1
+ vst3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
.endm
-.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
- vcge.s8 \arg1, \arg0, #0
- vand \arg1, \arg0, \arg1
- vsub.s8 \arg0, \arg1, \arg0
+.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
+ vcge.s8 \arg1, \arg0, #0
+ vand \arg1, \arg0, \arg1
+ vsub.s8 \arg0, \arg1, \arg0
.endm
#endif
WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon
- vpush {q4-q7}
- vdup.u8 q11, r2
- vdup.u8 q9, r3
+ vpush {q4-q7}
+ vdup.u8 q11, r2
+ vdup.u8 q9, r3
- add r2, r1, r1, lsl #1
- sub r2, r0, r2
- vld1.u8 {q0}, [r2], r1
- vld1.u8 {q3}, [r0], r1
- vld1.u8 {q1}, [r2], r1
- vld1.u8 {q4}, [r0], r1
- vld1.u8 {q2}, [r2]
- vld1.u8 {q5}, [r0]
- sub r2, r2, r1
+ add r2, r1, r1, lsl #1
+ sub r2, r0, r2
+ vld1.u8 {q0}, [r2], r1
+ vld1.u8 {q3}, [r0], r1
+ vld1.u8 {q1}, [r2], r1
+ vld1.u8 {q4}, [r0], r1
+ vld1.u8 {q2}, [r2]
+ vld1.u8 {q5}, [r0]
+ sub r2, r2, r1
- ldr r3, [sp, #64]
- vld1.s8 {d31}, [r3]
- vdup.s8 d28, d31[0]
- vdup.s8 d30, d31[1]
- vdup.s8 d29, d31[2]
- vdup.s8 d31, d31[3]
- vtrn.32 d28, d30
- vtrn.32 d29, d31
- vcge.s8 q10, q14, #0
+ ldr r3, [sp, #64]
+ vld1.s8 {d31}, [r3]
+ vdup.s8 d28, d31[0]
+ vdup.s8 d30, d31[1]
+ vdup.s8 d29, d31[2]
+ vdup.s8 d31, d31[3]
+ vtrn.32 d28, d30
+ vtrn.32 d29, d31
+ vcge.s8 q10, q14, #0
- MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
- vand.u8 q10, q10, q15
+ MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
+ vand.u8 q10, q10, q15
- veor q15, q15
- vsub.i8 q15,q15,q14
+ veor q15, q15
+ vsub.i8 q15,q15,q14
- DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
- vst1.u8 {q6}, [r2], r1
+ DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
+ vst1.u8 {q6}, [r2], r1
- DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
+ DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
- vabs.s8 q12, q12
- vabs.s8 q13, q13
- vadd.u8 q14,q14,q12
- vadd.u8 q14,q14,q13
- veor q15, q15
- vsub.i8 q15,q15,q14
+ vabs.s8 q12, q12
+ vabs.s8 q13, q13
+ vadd.u8 q14,q14,q12
+ vadd.u8 q14,q14,q13
+ veor q15, q15
+ vsub.i8 q15,q15,q14
- DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
- DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
- vmax.s8 q8, q8, q15
- vmin.s8 q8, q8, q14
- vand.s8 q8, q8, q10
- EXTRACT_DELTA_INTO_TWO_PART q8, q9
- vqadd.u8 q2, q2, q9
- vqsub.u8 q2, q2, q8
- vst1.u8 {q2}, [r2], r1
- vqsub.u8 q3, q3, q9
- vqadd.u8 q3, q3, q8
- vst1.u8 {q3}, [r2] , r1
- vst1.u8 {q7}, [r2]
+ DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
+ DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
+ vmax.s8 q8, q8, q15
+ vmin.s8 q8, q8, q14
+ vand.s8 q8, q8, q10
+ EXTRACT_DELTA_INTO_TWO_PART q8, q9
+ vqadd.u8 q2, q2, q9
+ vqsub.u8 q2, q2, q8
+ vst1.u8 {q2}, [r2], r1
+ vqsub.u8 q3, q3, q9
+ vqadd.u8 q3, q3, q8
+ vst1.u8 {q3}, [r2] , r1
+ vst1.u8 {q7}, [r2]
- vpop {q4-q7}
+ vpop {q4-q7}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN DeblockLumaEq4V_neon
- vpush {q4-q7}
+ vpush {q4-q7}
- vdup.u8 q5, r2
- vdup.u8 q4, r3
+ vdup.u8 q5, r2
+ vdup.u8 q4, r3
- sub r3, r0, r1, lsl #2
- vld1.u8 {q8}, [r3], r1
- vld1.u8 {q12}, [r0], r1
- vld1.u8 {q9}, [r3], r1
- vld1.u8 {q13}, [r0], r1
- vld1.u8 {q10}, [r3], r1
- vld1.u8 {q14}, [r0], r1
- vld1.u8 {q11}, [r3]
- vld1.u8 {q15}, [r0]
- sub r3, r3, r1 , lsl #1
+ sub r3, r0, r1, lsl #2
+ vld1.u8 {q8}, [r3], r1
+ vld1.u8 {q12}, [r0], r1
+ vld1.u8 {q9}, [r3], r1
+ vld1.u8 {q13}, [r0], r1
+ vld1.u8 {q10}, [r3], r1
+ vld1.u8 {q14}, [r0], r1
+ vld1.u8 {q11}, [r3]
+ vld1.u8 {q15}, [r0]
+ sub r3, r3, r1 , lsl #1
- MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
+ MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
- mov r2, r2, lsr #2
- add r2, r2, #2
- vdup.u8 q5, r2
- vabd.u8 q0, q11, q12
- vclt.u8 q7, q0, q5
+ mov r2, r2, lsr #2
+ add r2, r2, #2
+ vdup.u8 q5, r2
+ vabd.u8 q0, q11, q12
+ vclt.u8 q7, q0, q5
- vabd.u8 q1, q9, q11
- vclt.u8 q1, q1, q4
- vand.s8 q1, q1, q7
+ vabd.u8 q1, q9, q11
+ vclt.u8 q1, q1, q4
+ vand.s8 q1, q1, q7
- vabd.u8 q2, q14,q12
- vclt.u8 q2, q2, q4
- vand.s8 q2, q2, q7
- vand.u8 q7, q7, q6
+ vabd.u8 q2, q14,q12
+ vclt.u8 q2, q2, q4
+ vand.s8 q2, q2, q7
+ vand.u8 q7, q7, q6
- vmov q3, q1
+ vmov q3, q1
- DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
- DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
+ DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
+ DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
- vand.u8 q3, q7, q3
- DIFF_LUMA_EQ4_MASK q0, q9, q3, q4
- vst1.u8 {q4}, [r3], r1
- DIFF_LUMA_EQ4_MASK q8,q10, q3, q4
- vst1.u8 {q4}, [r3], r1
- DIFF_LUMA_EQ4_MASK q1,q11, q6, q4
- vst1.u8 {q4}, [r3], r1
+ vand.u8 q3, q7, q3
+ DIFF_LUMA_EQ4_MASK q0, q9, q3, q4
+ vst1.u8 {q4}, [r3], r1
+ DIFF_LUMA_EQ4_MASK q8,q10, q3, q4
+ vst1.u8 {q4}, [r3], r1
+ DIFF_LUMA_EQ4_MASK q1,q11, q6, q4
+ vst1.u8 {q4}, [r3], r1
- vmov q0, q2
- DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d6
- DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d7
+ vmov q0, q2
+ DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d6
+ DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d7
- vand.u8 q0, q7, q0
- DIFF_LUMA_EQ4_MASK q2, q12, q6, q4
- vst1.u8 {q4}, [r3], r1
- DIFF_LUMA_EQ4_MASK q15, q13, q0, q4
- vst1.u8 {q4}, [r3], r1
- DIFF_LUMA_EQ4_MASK q3, q14, q0, q4
- vst1.u8 {q4}, [r3], r1
+ vand.u8 q0, q7, q0
+ DIFF_LUMA_EQ4_MASK q2, q12, q6, q4
+ vst1.u8 {q4}, [r3], r1
+ DIFF_LUMA_EQ4_MASK q15, q13, q0, q4
+ vst1.u8 {q4}, [r3], r1
+ DIFF_LUMA_EQ4_MASK q3, q14, q0, q4
+ vst1.u8 {q4}, [r3], r1
- vpop {q4-q7}
+ vpop {q4-q7}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN DeblockLumaLt4H_neon
- vpush {q4-q7}
+ vpush {q4-q7}
- vdup.u8 q11, r2
- vdup.u8 q9, r3
+ vdup.u8 q11, r2
+ vdup.u8 q9, r3
- sub r2, r0, #3
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 0
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 1
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 2
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 3
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 4
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 5
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 6
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 7
+ sub r2, r0, #3
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 0
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 1
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 2
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 3
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 4
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 5
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 6
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 7
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 0
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 1
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 2
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 3
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 4
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 5
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 6
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 7
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 0
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 1
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 2
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 3
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 4
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 5
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 6
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 7
- vswp d1, d2
- vswp d3, d4
- vswp d1, d4
- vswp d7, d8
- vswp d9, d10
- vswp d7, d10
+ vswp d1, d2
+ vswp d3, d4
+ vswp d1, d4
+ vswp d7, d8
+ vswp d9, d10
+ vswp d7, d10
- sub r0, r0, r1, lsl #4
+ sub r0, r0, r1, lsl #4
- ldr r3, [sp, #64]
- vld1.s8 {d31}, [r3]
- vdup.s8 d28, d31[0]
- vdup.s8 d30, d31[1]
- vdup.s8 d29, d31[2]
- vdup.s8 d31, d31[3]
- vtrn.32 d28, d30
- vtrn.32 d29, d31
- vcge.s8 q10, q14, #0
+ ldr r3, [sp, #64]
+ vld1.s8 {d31}, [r3]
+ vdup.s8 d28, d31[0]
+ vdup.s8 d30, d31[1]
+ vdup.s8 d29, d31[2]
+ vdup.s8 d31, d31[3]
+ vtrn.32 d28, d30
+ vtrn.32 d29, d31
+ vcge.s8 q10, q14, #0
- MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
- vand.u8 q10, q10, q15
+ MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
+ vand.u8 q10, q10, q15
- veor q15, q15
- vsub.i8 q15,q15,q14
+ veor q15, q15
+ vsub.i8 q15,q15,q14
- DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
- DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
+ DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
+ DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
- vabs.s8 q12, q12
- vabs.s8 q13, q13
- vadd.u8 q14,q14,q12
- vadd.u8 q14,q14,q13
- veor q15, q15
- vsub.i8 q15,q15,q14
+ vabs.s8 q12, q12
+ vabs.s8 q13, q13
+ vadd.u8 q14,q14,q12
+ vadd.u8 q14,q14,q13
+ veor q15, q15
+ vsub.i8 q15,q15,q14
- DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
- DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
- vmax.s8 q8, q8, q15
- vmin.s8 q8, q8, q14
- vand.s8 q8, q8, q10
- EXTRACT_DELTA_INTO_TWO_PART q8, q9
- vqadd.u8 q2, q2, q9
- vqsub.u8 q2, q2, q8
+ DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
+ DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
+ vmax.s8 q8, q8, q15
+ vmin.s8 q8, q8, q14
+ vand.s8 q8, q8, q10
+ EXTRACT_DELTA_INTO_TWO_PART q8, q9
+ vqadd.u8 q2, q2, q9
+ vqsub.u8 q2, q2, q8
- vqsub.u8 q3, q3, q9
- vqadd.u8 q3, q3, q8
+ vqsub.u8 q3, q3, q9
+ vqadd.u8 q3, q3, q8
- sub r0, #2
- add r2, r0, r1
- lsl r1, #1
+ sub r0, #2
+ add r2, r0, r1
+ lsl r1, #1
- vmov q1, q6
- vmov q4, q7
+ vmov q1, q6
+ vmov q4, q7
- vswp q2, q3
- vswp d3, d6
- vswp d5, d8
+ vswp q2, q3
+ vswp d3, d6
+ vswp d5, d8
- STORE_LUMA_DATA_4 d2, d3, d4, d5, 0, 1
- STORE_LUMA_DATA_4 d2, d3, d4, d5, 2, 3
- STORE_LUMA_DATA_4 d2, d3, d4, d5, 4, 5
- STORE_LUMA_DATA_4 d2, d3, d4, d5, 6, 7
+ STORE_LUMA_DATA_4 d2, d3, d4, d5, 0, 1
+ STORE_LUMA_DATA_4 d2, d3, d4, d5, 2, 3
+ STORE_LUMA_DATA_4 d2, d3, d4, d5, 4, 5
+ STORE_LUMA_DATA_4 d2, d3, d4, d5, 6, 7
- STORE_LUMA_DATA_4 d6, d7, d8, d9, 0, 1
- STORE_LUMA_DATA_4 d6, d7, d8, d9, 2, 3
- STORE_LUMA_DATA_4 d6, d7, d8, d9, 4, 5
- STORE_LUMA_DATA_4 d6, d7, d8, d9, 6, 7
+ STORE_LUMA_DATA_4 d6, d7, d8, d9, 0, 1
+ STORE_LUMA_DATA_4 d6, d7, d8, d9, 2, 3
+ STORE_LUMA_DATA_4 d6, d7, d8, d9, 4, 5
+ STORE_LUMA_DATA_4 d6, d7, d8, d9, 6, 7
- vpop {q4-q7}
+ vpop {q4-q7}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN DeblockLumaEq4H_neon
- vpush {q4-q7}
- vdup.u8 q5, r2
- vdup.u8 q4, r3
+ vpush {q4-q7}
+ vdup.u8 q5, r2
+ vdup.u8 q4, r3
- sub r3, r0, #4 // pix -= 4
+ sub r3, r0, #4 // pix -= 4
- vld1.u8 {d16}, [r3], r1
- vld1.u8 {d17}, [r3], r1
- vld1.u8 {d18}, [r3], r1
- vld1.u8 {d19}, [r3], r1
- vld1.u8 {d20}, [r3], r1
- vld1.u8 {d21}, [r3], r1
- vld1.u8 {d22}, [r3], r1
- vld1.u8 {d23}, [r3], r1
- vld1.u8 {d24}, [r3], r1
- vld1.u8 {d25}, [r3], r1
- vld1.u8 {d26}, [r3], r1
- vld1.u8 {d27}, [r3], r1
- vld1.u8 {d28}, [r3], r1
- vld1.u8 {d29}, [r3], r1
- vld1.u8 {d30}, [r3], r1
- vld1.u8 {d31}, [r3], r1
+ vld1.u8 {d16}, [r3], r1
+ vld1.u8 {d17}, [r3], r1
+ vld1.u8 {d18}, [r3], r1
+ vld1.u8 {d19}, [r3], r1
+ vld1.u8 {d20}, [r3], r1
+ vld1.u8 {d21}, [r3], r1
+ vld1.u8 {d22}, [r3], r1
+ vld1.u8 {d23}, [r3], r1
+ vld1.u8 {d24}, [r3], r1
+ vld1.u8 {d25}, [r3], r1
+ vld1.u8 {d26}, [r3], r1
+ vld1.u8 {d27}, [r3], r1
+ vld1.u8 {d28}, [r3], r1
+ vld1.u8 {d29}, [r3], r1
+ vld1.u8 {d30}, [r3], r1
+ vld1.u8 {d31}, [r3], r1
- vtrn.u32 d16, d20
- vtrn.u32 d17, d21
- vtrn.u32 d18, d22
- vtrn.u32 d19, d23
- vtrn.u32 d24, d28
- vtrn.u32 d25, d29
- vtrn.u32 d26, d30
- vtrn.u32 d27, d31
+ vtrn.u32 d16, d20
+ vtrn.u32 d17, d21
+ vtrn.u32 d18, d22
+ vtrn.u32 d19, d23
+ vtrn.u32 d24, d28
+ vtrn.u32 d25, d29
+ vtrn.u32 d26, d30
+ vtrn.u32 d27, d31
- vtrn.u16 d16, d18
- vtrn.u16 d17, d19
- vtrn.u16 d20, d22
- vtrn.u16 d21, d23
- vtrn.u16 d24, d26
- vtrn.u16 d25, d27
- vtrn.u16 d28, d30
- vtrn.u16 d29, d31
+ vtrn.u16 d16, d18
+ vtrn.u16 d17, d19
+ vtrn.u16 d20, d22
+ vtrn.u16 d21, d23
+ vtrn.u16 d24, d26
+ vtrn.u16 d25, d27
+ vtrn.u16 d28, d30
+ vtrn.u16 d29, d31
- vtrn.u8 d16, d17
- vtrn.u8 d18, d19
- vtrn.u8 d20, d21
- vtrn.u8 d22, d23
- vtrn.u8 d24, d25
- vtrn.u8 d26, d27
- vtrn.u8 d28, d29
- vtrn.u8 d30, d31
+ vtrn.u8 d16, d17
+ vtrn.u8 d18, d19
+ vtrn.u8 d20, d21
+ vtrn.u8 d22, d23
+ vtrn.u8 d24, d25
+ vtrn.u8 d26, d27
+ vtrn.u8 d28, d29
+ vtrn.u8 d30, d31
- vswp d17, d24
- vswp d19, d26
- vswp d21, d28
- vswp d23, d30
+ vswp d17, d24
+ vswp d19, d26
+ vswp d21, d28
+ vswp d23, d30
- vswp q12, q9
- vswp q14, q11
+ vswp q12, q9
+ vswp q14, q11
- vswp q12, q10
- vswp q13, q11
+ vswp q12, q10
+ vswp q13, q11
- MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
+ MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
- mov r2, r2, lsr #2
- add r2, r2, #2
- vdup.u8 q5, r2
- vabd.u8 q0, q11, q12
- vclt.u8 q7, q0, q5
+ mov r2, r2, lsr #2
+ add r2, r2, #2
+ vdup.u8 q5, r2
+ vabd.u8 q0, q11, q12
+ vclt.u8 q7, q0, q5
- vabd.u8 q1, q9, q11
- vclt.u8 q1, q1, q4
- vand.s8 q1, q1, q7
+ vabd.u8 q1, q9, q11
+ vclt.u8 q1, q1, q4
+ vand.s8 q1, q1, q7
- vabd.u8 q2, q14,q12
- vclt.u8 q2, q2, q4
- vand.s8 q2, q2, q7
- vand.u8 q7, q7, q6
+ vabd.u8 q2, q14,q12
+ vclt.u8 q2, q2, q4
+ vand.s8 q2, q2, q7
+ vand.u8 q7, q7, q6
- vmov q3, q1
+ vmov q3, q1
- DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
- DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
+ DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
+ DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
- vand.u8 q3, q7, q3
- DIFF_LUMA_EQ4_MASK q0, q9, q3, q4
- vmov q9, q4
- vbsl.u8 q3, q8, q10
- DIFF_LUMA_EQ4_MASK q1,q11, q6, q8
+ vand.u8 q3, q7, q3
+ DIFF_LUMA_EQ4_MASK q0, q9, q3, q4
+ vmov q9, q4
+ vbsl.u8 q3, q8, q10
+ DIFF_LUMA_EQ4_MASK q1,q11, q6, q8
- vand.u8 q7, q7, q2
+ vand.u8 q7, q7, q2
- DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d0
- DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d1
+ DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d0
+ DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d1
- vbsl.u8 q6, q2, q12
- DIFF_LUMA_EQ4_MASK q15, q13, q7, q4
+ vbsl.u8 q6, q2, q12
+ DIFF_LUMA_EQ4_MASK q15, q13, q7, q4
- vbsl.u8 q7, q0, q14
+ vbsl.u8 q7, q0, q14
- vmov q5, q6
- vmov q2, q9
- vmov q6, q4
- vmov q4, q8
+ vmov q5, q6
+ vmov q2, q9
+ vmov q6, q4
+ vmov q4, q8
- vswp d8, d6
- vswp d5, d7
- vswp d5, d8
- vswp d14, d12
- vswp d11, d13
- vswp d11, d14
+ vswp d8, d6
+ vswp d5, d7
+ vswp d5, d8
+ vswp d14, d12
+ vswp d11, d13
+ vswp d11, d14
- sub r3, r0, #3
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,0
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,1
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,2
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,3
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,4
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,5
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,6
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,7
+ sub r3, r0, #3
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,0
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,1
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,2
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,3
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,4
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,5
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,6
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,7
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,0
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,1
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,2
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,3
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,4
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,5
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,6
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,7
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,0
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,1
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,2
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,3
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,4
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,5
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,6
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,7
- vpop {q4-q7}
+ vpop {q4-q7}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN DeblockChromaLt4V_neon
- vdup.u8 q11, r3
- ldr r3, [sp, #0]
+ vdup.u8 q11, r3
+ ldr r3, [sp, #0]
- sub r0, r0, r2 , lsl #1
- sub r1, r1, r2, lsl #1
- vdup.u8 q9, r3
- ldr r3, [sp, #4]
+ sub r0, r0, r2 , lsl #1
+ sub r1, r1, r2, lsl #1
+ vdup.u8 q9, r3
+ ldr r3, [sp, #4]
- vld1.u8 {d0}, [r0], r2
- vld1.u8 {d1}, [r1], r2
- vld1.u8 {d2}, [r0], r2
- vld1.u8 {d3}, [r1], r2
- vld1.u8 {d4}, [r0], r2
- vld1.u8 {d5}, [r1], r2
- vld1.u8 {d6}, [r0]
- vld1.u8 {d7}, [r1]
+ vld1.u8 {d0}, [r0], r2
+ vld1.u8 {d1}, [r1], r2
+ vld1.u8 {d2}, [r0], r2
+ vld1.u8 {d3}, [r1], r2
+ vld1.u8 {d4}, [r0], r2
+ vld1.u8 {d5}, [r1], r2
+ vld1.u8 {d6}, [r0]
+ vld1.u8 {d7}, [r1]
- sub r0, r0, r2, lsl #1
- sub r1, r1, r2, lsl #1
+ sub r0, r0, r2, lsl #1
+ sub r1, r1, r2, lsl #1
- vld1.s8 {d31}, [r3]
- vmovl.u8 q14,d31
- vshl.u64 d29,d28,#8
- vorr d28,d29
- vmov d29, d28
- veor q15, q15
- vsub.i8 q15,q15,q14
+ vld1.s8 {d31}, [r3]
+ vmovl.u8 q14,d31
+ vshl.u64 d29,d28,#8
+ vorr d28,d29
+ vmov d29, d28
+ veor q15, q15
+ vsub.i8 q15,q15,q14
- MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
+ MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
- DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
- DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
- vmax.s8 q8, q8, q15
- vmin.s8 q8, q8, q14
+ DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
+ DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
+ vmax.s8 q8, q8, q15
+ vmin.s8 q8, q8, q14
- vand.s8 q8, q8, q10
- vcge.s8 q14, q14, #0
- vand.s8 q8, q8, q14
- EXTRACT_DELTA_INTO_TWO_PART q8, q10
- vqadd.u8 q1, q1, q10
- vqsub.u8 q1, q1, q8
- vst1.u8 {d2}, [r0], r2
- vst1.u8 {d3}, [r1], r2
- vqsub.u8 q2, q2, q10
- vqadd.u8 q2, q2, q8
- vst1.u8 {d4}, [r0]
- vst1.u8 {d5}, [r1]
+ vand.s8 q8, q8, q10
+ vcge.s8 q14, q14, #0
+ vand.s8 q8, q8, q14
+ EXTRACT_DELTA_INTO_TWO_PART q8, q10
+ vqadd.u8 q1, q1, q10
+ vqsub.u8 q1, q1, q8
+ vst1.u8 {d2}, [r0], r2
+ vst1.u8 {d3}, [r1], r2
+ vqsub.u8 q2, q2, q10
+ vqadd.u8 q2, q2, q8
+ vst1.u8 {d4}, [r0]
+ vst1.u8 {d5}, [r1]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN DeblockChromaEq4V_neon
- vpush {q4-q5}
+ vpush {q4-q5}
- vdup.u8 q11, r3
- ldr r3, [sp, #32]
+ vdup.u8 q11, r3
+ ldr r3, [sp, #32]
- sub r0, r0, r2 , lsl #1
- sub r1, r1, r2, lsl #1
- vdup.u8 q9, r3
- vld1.u8 {d0}, [r0], r2 // q0::p1
- vld1.u8 {d1}, [r1], r2
- vld1.u8 {d2}, [r0], r2 // q1::p0
- vld1.u8 {d3}, [r1], r2
- vld1.u8 {d4}, [r0], r2 // q2::q0
- vld1.u8 {d5}, [r1], r2
- vld1.u8 {d6}, [r0] // q3::q1
- vld1.u8 {d7}, [r1]
+ sub r0, r0, r2 , lsl #1
+ sub r1, r1, r2, lsl #1
+ vdup.u8 q9, r3
+ vld1.u8 {d0}, [r0], r2 // q0::p1
+ vld1.u8 {d1}, [r1], r2
+ vld1.u8 {d2}, [r0], r2 // q1::p0
+ vld1.u8 {d3}, [r1], r2
+ vld1.u8 {d4}, [r0], r2 // q2::q0
+ vld1.u8 {d5}, [r1], r2
+ vld1.u8 {d6}, [r0] // q3::q1
+ vld1.u8 {d7}, [r1]
- sub r0, r0, r2, lsl #1 // pix = [-1*src_stride]
- sub r1, r1, r2, lsl #1
+ sub r0, r0, r2, lsl #1 // pix = [-1*src_stride]
+ sub r1, r1, r2, lsl #1
- MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
+ MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
- vmov q11, q10
+ vmov q11, q10
- DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q4, q5, q8, d30, d0 // Cb::p0' q0'
- DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q12, q13, q14, d31, d1 // Cr::p0' q0'
+ DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q4, q5, q8, d30, d0 // Cb::p0' q0'
+ DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q12, q13, q14, d31, d1 // Cr::p0' q0'
- vbsl.u8 q10, q15, q1
- vst1.u8 {d20}, [r0], r2
- vst1.u8 {d21}, [r1], r2
+ vbsl.u8 q10, q15, q1
+ vst1.u8 {d20}, [r0], r2
+ vst1.u8 {d21}, [r1], r2
- vbsl.u8 q11, q0, q2
- vst1.u8 {d22}, [r0]
- vst1.u8 {d23}, [r1]
+ vbsl.u8 q11, q0, q2
+ vst1.u8 {d22}, [r0]
+ vst1.u8 {d23}, [r1]
- vpop {q4-q5}
+ vpop {q4-q5}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN DeblockChromaLt4H_neon
- vdup.u8 q11, r3
- ldr r3, [sp, #0]
+ vdup.u8 q11, r3
+ ldr r3, [sp, #0]
- sub r0, r0, #2
- vdup.u8 q9, r3
- ldr r3, [sp, #4]
- sub r1, r1, #2
- vld1.s8 {d31}, [r3]
+ sub r0, r0, #2
+ vdup.u8 q9, r3
+ ldr r3, [sp, #4]
+ sub r1, r1, #2
+ vld1.s8 {d31}, [r3]
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
- vswp q1, q2
- vswp d1, d2
- vswp d6, d5
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
+ vswp q1, q2
+ vswp d1, d2
+ vswp d6, d5
- vmovl.u8 q14, d31
- vshl.u64 d29,d28,#8
- vorr d28,d29
- vmov d29, d28
- veor q15, q15
- vsub.i8 q15,q15,q14
+ vmovl.u8 q14, d31
+ vshl.u64 d29,d28,#8
+ vorr d28,d29
+ vmov d29, d28
+ veor q15, q15
+ vsub.i8 q15,q15,q14
- MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
+ MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
- DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
- DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
- vmax.s8 q8, q8, q15
- vmin.s8 q8, q8, q14
+ DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
+ DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
+ vmax.s8 q8, q8, q15
+ vmin.s8 q8, q8, q14
- vand.s8 q8, q8, q10
- vcge.s8 q14, q14, #0
- vand.s8 q8, q8, q14
- EXTRACT_DELTA_INTO_TWO_PART q8, q10
- vqadd.u8 q1, q1, q10
- vqsub.u8 q1, q1, q8
- vqsub.u8 q2, q2, q10
- vqadd.u8 q2, q2, q8
+ vand.s8 q8, q8, q10
+ vcge.s8 q14, q14, #0
+ vand.s8 q8, q8, q14
+ EXTRACT_DELTA_INTO_TWO_PART q8, q10
+ vqadd.u8 q1, q1, q10
+ vqsub.u8 q1, q1, q8
+ vqsub.u8 q2, q2, q10
+ vqadd.u8 q2, q2, q8
- sub r0, r0, r2, lsl #3
- sub r1, r1, r2, lsl #3
- vswp d1, d2
- vswp d6, d5
- vswp q1, q2
+ sub r0, r0, r2, lsl #3
+ sub r1, r1, r2, lsl #3
+ vswp d1, d2
+ vswp d6, d5
+ vswp q1, q2
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon
- vpush {q4-q5}
- vdup.u8 q11, r3
- ldr r3, [sp, #32]
+ vpush {q4-q5}
+ vdup.u8 q11, r3
+ ldr r3, [sp, #32]
- sub r0, r0, #2
- sub r1, r1, #2
+ sub r0, r0, #2
+ sub r1, r1, #2
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
- vswp q1, q2
- vswp d1, d2
- vswp d6, d5
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
+ vswp q1, q2
+ vswp d1, d2
+ vswp d6, d5
- vdup.u8 q9, r3
- MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
- vmov q11, q10
+ vdup.u8 q9, r3
+ MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
+ vmov q11, q10
- DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q8, q9, q12, d8, d10
- DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q13, q14, q15, d9, d11
+ DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q8, q9, q12, d8, d10
+ DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q13, q14, q15, d9, d11
- vbsl.u8 q10, q4, q1
- vbsl.u8 q11, q5, q2
- sub r0, r0, r2, lsl #3 // pix: 0th row [-2]
- sub r1, r1, r2, lsl #3
+ vbsl.u8 q10, q4, q1
+ vbsl.u8 q11, q5, q2
+ sub r0, r0, r2, lsl #3 // pix: 0th row [-2]
+ sub r1, r1, r2, lsl #3
- vmov q1, q10
- vmov q2, q11
- vswp d1, d2
- vswp d6, d5
- vswp q1, q2
- // Cb:d0d1d2d3, Cr:d4d5d6d7
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
+ vmov q1, q10
+ vmov q2, q11
+ vswp d1, d2
+ vswp d6, d5
+ vswp q1, q2
+ // Cb:d0d1d2d3, Cr:d4d5d6d7
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
- vpop {q4-q5}
+ vpop {q4-q5}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
- vld1.64 {d0-d2}, [r0]
+ vld1.64 {d0-d2}, [r0]
- vceq.s8 q0, q0, #0
- vceq.s8 d2, d2, #0
- vmvn q0, q0
- vmvn d2, d2
- vabs.s8 q0, q0
- vabs.s8 d2, d2
+ vceq.s8 q0, q0, #0
+ vceq.s8 d2, d2, #0
+ vmvn q0, q0
+ vmvn d2, d2
+ vabs.s8 q0, q0
+ vabs.s8 d2, d2
- vst1.64 {d0-d2}, [r0]
+ vst1.64 {d0-d2}, [r0]
WELS_ASM_FUNC_END
#ifdef __APPLE__
@@ -851,37 +851,37 @@
.macro BS_NZC_CHECK
vld1.8 {d0,d1}, [$0]
/* Arrange the input data --- TOP */
- ands r6, $1, #2
- beq bs_nzc_check_jump0
+ ands r6, $1, #2
+ beq bs_nzc_check_jump0
sub r6, $0, $2, lsl #4
- sub r6, $2, lsl #3
+ sub r6, $2, lsl #3
add r6, #12
vld1.32 d3[1], [r6]
bs_nzc_check_jump0:
vext.8 q1, q1, q0, #12
- vadd.u8 $3, q0, q1
+ vadd.u8 $3, q0, q1
/* Arrange the input data --- LEFT */
- ands r6, $1, #1
- beq bs_nzc_check_jump1
+ ands r6, $1, #1
+ beq bs_nzc_check_jump1
sub r6, $0, #21
- add r7, r6, #4
+ add r7, r6, #4
vld1.8 d3[4], [r6]
- add r6, r7, #4
+ add r6, r7, #4
vld1.8 d3[5], [r7]
- add r7, r6, #4
+ add r7, r6, #4
vld1.8 d3[6], [r6]
vld1.8 d3[7], [r7]
bs_nzc_check_jump1:
- vzip.8 d0, d1
- vzip.8 d0, d1
+ vzip.8 d0, d1
+ vzip.8 d0, d1
vext.8 q1, q1, q0, #12
- vadd.u8 $4, q0, q1
+ vadd.u8 $4, q0, q1
.endm
.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
@@ -888,7 +888,7 @@
mov r6, #4
vabd.s16 q8, $0, $1
vabd.s16 q9, $1, $2
- vdup.s16 $0, r6
+ vdup.s16 $0, r6
vabd.s16 q10, $2, $3
vabd.s16 q11, $3, $4
@@ -897,7 +897,7 @@
vcge.s16 q10, $0
vcge.s16 q11, $0
- vpadd.i16 d16, d16, d17
+ vpadd.i16 d16, d16, d17
vpadd.i16 d17, d18, d19
vpadd.i16 d18, d20, d21
vpadd.i16 d19, d22, d23
@@ -910,8 +910,8 @@
vldm $0, {q0,q1,q2,q3}
/* Arrange the input data --- TOP */
- ands r6, $1, #2
- beq bs_mv_check_jump0
+ ands r6, $1, #2
+ beq bs_mv_check_jump0
sub r6, $0, $2, lsl #6
add r6, #48
@@ -921,22 +921,22 @@
BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4
/* Arrange the input data --- LEFT */
- ands r6, $1, #1
- beq bs_mv_check_jump1
+ ands r6, $1, #1
+ beq bs_mv_check_jump1
sub r6, $0, #52
add r7, r6, #16
- vld1.32 d8[0], [r6]
- add r6, r7, #16
+ vld1.32 d8[0], [r6]
+ add r6, r7, #16
vld1.32 d8[1], [r7]
- add r7, r6, #16
+ add r7, r6, #16
vld1.32 d9[0], [r6]
vld1.32 d9[1], [r7]
bs_mv_check_jump1:
- vzip.32 q0, q2
- vzip.32 q1, q3
- vzip.32 q0, q1
+ vzip.32 q0, q2
+ vzip.32 q1, q3
+ vzip.32 q0, q1
vzip.32 q2, q3
BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6
.endm
@@ -1038,41 +1038,41 @@
WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
- stmdb sp!, {r5-r7}
- vpush {q4}
+ stmdb sp!, {r5-r7}
+ vpush {q4}
- ldr r5, [sp, #28] //Save BS to r5
+ ldr r5, [sp, #28] //Save BS to r5
- /* Checking the nzc status */
- BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
+ /* Checking the nzc status */
+ BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
- /* For checking bS[I] = 2 */
- mov r6, #2
- vcgt.s8 q14, q14, #0
- vdup.u8 q0, r6
- vcgt.s8 q15, q15, #0
+ /* For checking bS[I] = 2 */
+ mov r6, #2
+ vcgt.s8 q14, q14, #0
+ vdup.u8 q0, r6
+ vcgt.s8 q15, q15, #0
- vand.u8 q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
- vand.u8 q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
+ vand.u8 q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
+ vand.u8 q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
- /* Checking the mv status*/
- BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
+ /* Checking the mv status*/
+ BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
- /* For checking bS[I] = 1 */
+ /* For checking bS[I] = 1 */
mov r6, #1
- vdup.u8 q0, r6
+ vdup.u8 q0, r6
- vand.u8 q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
- vand.u8 q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
+ vand.u8 q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
+ vand.u8 q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
- /* Check bS[I] is '1' or '2' */
- vmax.u8 q1, q12, q14
- vmax.u8 q0, q13, q15
+ /* Check bS[I] is '1' or '2' */
+ vmax.u8 q1, q12, q14
+ vmax.u8 q0, q13, q15
- //vstm r5, {q0, q1}
+ //vstm r5, {q0, q1}
vst1.32 {q0, q1}, [r5]
- vpop {q4}
- ldmia sp!, {r5-r7}
+ vpop {q4}
+ ldmia sp!, {r5-r7}
WELS_ASM_FUNC_END
#endif
--- a/codec/common/arm/expand_picture_neon.S
+++ b/codec/common/arm/expand_picture_neon.S
@@ -37,119 +37,119 @@
WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
stmdb sp!, {r4-r8}
- //Save the dst
- mov r7, r0
- mov r8, r3
+ //Save the dst
+ mov r7, r0
+ mov r8, r3
- add r4, r7, r2
- sub r4, #1
+ add r4, r7, r2
+ sub r4, #1
//For the left and right expand
_expand_picture_luma_loop2:
- sub r5, r7, #32
- add r6, r4, #1
+ sub r5, r7, #32
+ add r6, r4, #1
- vld1.8 {d0[], d1[]}, [r7], r1
- vld1.8 {d2[], d3[]}, [r4], r1
+ vld1.8 {d0[], d1[]}, [r7], r1
+ vld1.8 {d2[], d3[]}, [r4], r1
- vst1.8 {q0}, [r5]!
- vst1.8 {q0}, [r5]
- vst1.8 {q1}, [r6]!
- vst1.8 {q1}, [r6]
- subs r8, #1
- bne _expand_picture_luma_loop2
+ vst1.8 {q0}, [r5]!
+ vst1.8 {q0}, [r5]
+ vst1.8 {q1}, [r6]!
+ vst1.8 {q1}, [r6]
+ subs r8, #1
+ bne _expand_picture_luma_loop2
- //for the top and bottom expand
- add r2, #64
- sub r0, #32
- mla r4, r1, r3, r0
- sub r4, r1
+ //for the top and bottom expand
+ add r2, #64
+ sub r0, #32
+ mla r4, r1, r3, r0
+ sub r4, r1
_expand_picture_luma_loop0:
- mov r5, #32
+ mov r5, #32
mls r5, r5, r1, r0
- add r6, r4, r1
- vld1.8 {q0}, [r0]!
- vld1.8 {q1}, [r4]!
+ add r6, r4, r1
+ vld1.8 {q0}, [r0]!
+ vld1.8 {q1}, [r4]!
- mov r8, #32
+ mov r8, #32
_expand_picture_luma_loop1:
- vst1.8 {q0}, [r5], r1
- vst1.8 {q1}, [r6], r1
- subs r8, #1
+ vst1.8 {q0}, [r5], r1
+ vst1.8 {q1}, [r6], r1
+ subs r8, #1
bne _expand_picture_luma_loop1
- subs r2, #16
- bne _expand_picture_luma_loop0
+ subs r2, #16
+ bne _expand_picture_luma_loop0
//vldreq.32 d0, [r0]
- ldmia sp!, {r4-r8}
+ ldmia sp!, {r4-r8}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
stmdb sp!, {r4-r9}
- //Save the dst
- mov r7, r0
- mov r8, r3
+ //Save the dst
+ mov r7, r0
+ mov r8, r3
- add r4, r7, r2
- sub r4, #1
+ add r4, r7, r2
+ sub r4, #1
//For the left and right expand
_expand_picture_chroma_loop2:
- sub r5, r7, #16
- add r6, r4, #1
+ sub r5, r7, #16
+ add r6, r4, #1
- vld1.8 {d0[], d1[]}, [r7], r1
- vld1.8 {d2[], d3[]}, [r4], r1
+ vld1.8 {d0[], d1[]}, [r7], r1
+ vld1.8 {d2[], d3[]}, [r4], r1
- vst1.8 {q0}, [r5]
- vst1.8 {q1}, [r6]
- subs r8, #1
- bne _expand_picture_chroma_loop2
+ vst1.8 {q0}, [r5]
+ vst1.8 {q1}, [r6]
+ subs r8, #1
+ bne _expand_picture_chroma_loop2
- //for the top and bottom expand
- add r2, #32
- mov r9, r2
- bic r2, #15
- sub r0, #16
- mla r4, r1, r3, r0
- sub r4, r1
+ //for the top and bottom expand
+ add r2, #32
+ mov r9, r2
+ bic r2, #15
+ sub r0, #16
+ mla r4, r1, r3, r0
+ sub r4, r1
_expand_picture_chroma_loop0:
- mov r5, #16
- mls r5, r5, r1, r0
- add r6, r4, r1
- vld1.8 {q0}, [r0]!
- vld1.8 {q1}, [r4]!
+ mov r5, #16
+ mls r5, r5, r1, r0
+ add r6, r4, r1
+ vld1.8 {q0}, [r0]!
+ vld1.8 {q1}, [r4]!
- mov r8, #16
+ mov r8, #16
_expand_picture_chroma_loop1:
- vst1.8 {q0}, [r5], r1
- vst1.8 {q1}, [r6], r1
- subs r8, #1
- bne _expand_picture_chroma_loop1
+ vst1.8 {q0}, [r5], r1
+ vst1.8 {q1}, [r6], r1
+ subs r8, #1
+ bne _expand_picture_chroma_loop1
- subs r2, #16
- bne _expand_picture_chroma_loop0
+ subs r2, #16
+ bne _expand_picture_chroma_loop0
//vldreq.32 d0, [r0]
- and r9, #15
- cmp r9, #8
- bne _expand_picture_chroma_end
- mov r5, #16
- mls r5, r5, r1, r0
- add r6, r4, r1
- vld1.8 {d0}, [r0]!
- vld1.8 {d2}, [r4]!
- mov r8, #16
+ and r9, #15
+ cmp r9, #8
+ bne _expand_picture_chroma_end
+ mov r5, #16
+ mls r5, r5, r1, r0
+ add r6, r4, r1
+ vld1.8 {d0}, [r0]!
+ vld1.8 {d2}, [r4]!
+ mov r8, #16
_expand_picture_chroma_loop3:
- vst1.8 {d0}, [r5], r1
- vst1.8 {d2}, [r6], r1
- subs r8, #1
- bne _expand_picture_chroma_loop3
+ vst1.8 {d0}, [r5], r1
+ vst1.8 {d2}, [r6], r1
+ subs r8, #1
+ bne _expand_picture_chroma_loop3
_expand_picture_chroma_end:
- ldmia sp!, {r4-r9}
+ ldmia sp!, {r4-r9}
WELS_ASM_FUNC_END
#endif
--- a/codec/common/arm/mc_neon.S
+++ b/codec/common/arm/mc_neon.S
@@ -35,2176 +35,2176 @@
#include "arm_arch_common_macro.S"
#ifdef __APPLE__
-.macro AVERAGE_TWO_8BITS
-// { // input:dst_d, src_d A and B; working: q13
- vaddl.u8 q13, $2, $1
- vrshrn.u16 $0, q13, #1
-// }
+.macro AVERAGE_TWO_8BITS
+// { // input:dst_d, src_d A and B; working: q13
+ vaddl.u8 q13, $2, $1
+ vrshrn.u16 $0, q13, #1
+// }
.endm
-.macro FILTER_6TAG_8BITS
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
-// }
+.macro FILTER_6TAG_8BITS
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 $6, q12, #5
+// }
.endm
-.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
-// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
- vrev64.8 $2, $0 // X[5][4][3][2][1][0]O
- vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]*
- vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32]
- vpadd.s16 $0, $0, $0
- vpadd.s16 $0, $0, $0
- vqrshrun.s16 $0, $4, #5
-// }
+.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
+// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
+ vrev64.8 $2, $0 // X[5][4][3][2][1][0]O
+ vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]*
+ vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32]
+ vpadd.s16 $0, $0, $0
+ vpadd.s16 $0, $0, $0
+ vqrshrun.s16 $0, $4, #5
+// }
.endm
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
- vaddl.u8 q13, $2, $6
- vrshrn.u16 $6, q13, #1
-// }
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 $6, q12, #5
+ vaddl.u8 q13, $2, $6
+ vrshrn.u16 $6, q13, #1
+// }
.endm
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
- vaddl.u8 q13, $3, $6
- vrshrn.u16 $6, q13, #1
-// }
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 $6, q12, #5
+ vaddl.u8 q13, $3, $6
+ vrshrn.u16 $6, q13, #1
+// }
.endm
-.macro FILTER_6TAG_8BITS_TO_16BITS
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
- vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
+.macro FILTER_6TAG_8BITS_TO_16BITS
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
+ vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
.endm
-.macro FILTER_3_IN_16BITS_TO_8BITS
-// { // input:a, b, c, dst_d;
- vsub.s16 $0, $0, $1 //a-b
- vshr.s16 $0, $0, #2 //(a-b)/4
- vsub.s16 $0, $0, $1 //(a-b)/4-b
- vadd.s16 $0, $0, $2 //(a-b)/4-b+c
- vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
- vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 $3, $0, #6 //(+32)>>6
-// }
+.macro FILTER_3_IN_16BITS_TO_8BITS
+// { // input:a, b, c, dst_d;
+ vsub.s16 $0, $0, $1 //a-b
+ vshr.s16 $0, $0, #2 //(a-b)/4
+ vsub.s16 $0, $0, $1 //(a-b)/4-b
+ vadd.s16 $0, $0, $2 //(a-b)/4-b+c
+ vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 $3, $0, #6 //(+32)>>6
+// }
.endm
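FILTER_3_IN_16BITS_TO_8BITS above evaluates the horizontal pass on 16-bit intermediates without any multiply, relying on the identity its comments trace step by step (ignoring the truncation of the two arithmetic right shifts):

    ((a - b)/4 - b + c)/4 + c
      = (a - b - 4*b + 4*c)/16 + c
      = (a - 5*b + 4*c + 16*c)/16
      = (a - 5*b + 20*c)/16

Here a, b and c are the tap sums produced by UNPACK_2_16BITS_TO_ABC below (a = src[-2]+src[3], b = src[-1]+src[2], c = src[0]+src[1]); the final vqrshrun.s16 ..., #6 then applies the rounded (x + 32) >> 6 and saturates to 8 bits.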
-.macro UNPACK_2_16BITS_TO_ABC
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
- vext.16 $4, $0, $1, #2 //src[0]
- vext.16 $3, $0, $1, #3 //src[1]
- vadd.s16 $4, $3 //c=src[0]+src[1]
+.macro UNPACK_2_16BITS_TO_ABC
+// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+ vext.16 $4, $0, $1, #2 //src[0]
+ vext.16 $3, $0, $1, #3 //src[1]
+ vadd.s16 $4, $3 //c=src[0]+src[1]
- vext.16 $3, $0, $1, #1 //src[-1]
- vext.16 $2, $0, $1, #4 //src[2]
- vadd.s16 $3, $2 //b=src[-1]+src[2]
+ vext.16 $3, $0, $1, #1 //src[-1]
+ vext.16 $2, $0, $1, #4 //src[2]
+ vadd.s16 $3, $2 //b=src[-1]+src[2]
- vext.16 $2, $0, $1, #5 //src[3]
- vadd.s16 $2, $0 //a=src[-2]+src[3]
-// }
+ vext.16 $2, $0, $1, #5 //src[3]
+ vadd.s16 $2, $0 //a=src[-2]+src[3]
+// }
.endm
-.macro UNPACK_1_IN_8x16BITS_TO_8BITS
-// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
- vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5],
- vrev64.16 $1, $1
- vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5],
- vshr.s64 $1, $2, #16
- vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0
+.macro UNPACK_1_IN_8x16BITS_TO_8BITS
+// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+ vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5],
+ vrev64.16 $1, $1
+ vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5],
+ vshr.s64 $1, $2, #16
+ vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0
- vsub.s16 $0, $0, $1 //a-b
- vshr.s16 $0, $0, #2 //(a-b)/4
- vsub.s16 $0, $0, $1 //(a-b)/4-b
- vadd.s16 $0, $0, $2 //(a-b)/4-b+c
- vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
- vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 $0, $3, #6 //(+32)>>6
-// }
+ vsub.s16 $0, $0, $1 //a-b
+ vshr.s16 $0, $0, #2 //(a-b)/4
+ vsub.s16 $0, $0, $1 //(a-b)/4-b
+ vadd.s16 $0, $0, $2 //(a-b)/4-b+c
+ vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 $0, $3, #6 //(+32)>>6
+// }
.endm
#else
-.macro AVERAGE_TWO_8BITS arg0, arg1, arg2
-// { // input:dst_d, src_d A and B; working: q13
- vaddl.u8 q13, \arg2, \arg1
- vrshrn.u16 \arg0, q13, #1
-// }
+.macro AVERAGE_TWO_8BITS arg0, arg1, arg2
+// { // input:dst_d, src_d A and B; working: q13
+ vaddl.u8 q13, \arg2, \arg1
+ vrshrn.u16 \arg0, q13, #1
+// }
.endm
-.macro FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 \arg6, q12, #5
-// }
+.macro FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 \arg6, q12, #5
+// }
.endm
-.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used
-// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
- vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O
- vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]*
- vmul.s16 \arg0, \arg2, \arg1 // 0+1*[50]-5*[41]+20[32]
- vpadd.s16 \arg0, \arg0, \arg0
- vpadd.s16 \arg0, \arg0, \arg0
- vqrshrun.s16 \arg0, \arg4, #5
-// }
+.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used
+// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
+ vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O
+ vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]*
+ vmul.s16 \arg0, \arg2, \arg1 // 0+1*[50]-5*[41]+20[32]
+ vpadd.s16 \arg0, \arg0, \arg0
+ vpadd.s16 \arg0, \arg0, \arg0
+ vqrshrun.s16 \arg0, \arg4, #5
+// }
.endm
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 \arg6, q12, #5
- vaddl.u8 q13, \arg2, \arg6
- vrshrn.u16 \arg6, q13, #1
-// }
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 \arg6, q12, #5
+ vaddl.u8 q13, \arg2, \arg6
+ vrshrn.u16 \arg6, q13, #1
+// }
.endm
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 \arg6, q12, #5
- vaddl.u8 q13, \arg3, \arg6
- vrshrn.u16 \arg6, q13, #1
-// }
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 \arg6, q12, #5
+ vaddl.u8 q13, \arg3, \arg6
+ vrshrn.u16 \arg6, q13, #1
+// }
.endm
-.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
- vaddl.u8 \arg6, \arg0, \arg5 //dst_q=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 \arg6, q13, \arg7 //dst_q += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 \arg6, q13, \arg8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
+.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
+ vaddl.u8 \arg6, \arg0, \arg5 //dst_q=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 \arg6, q13, \arg7 //dst_q += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 \arg6, q13, \arg8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
.endm
-.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3
-// { // input:a, b, c, dst_d;
- vsub.s16 \arg0, \arg0, \arg1 //a-b
- vshr.s16 \arg0, \arg0, #2 //(a-b)/4
- vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
- vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
- vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
- vadd.s16 \arg0, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 \arg3, \arg0, #6 //(+32)>>6
-// }
+.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3
+// { // input:a, b, c, dst_d;
+ vsub.s16 \arg0, \arg0, \arg1 //a-b
+ vshr.s16 \arg0, \arg0, #2 //(a-b)/4
+ vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
+ vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
+ vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 \arg0, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 \arg3, \arg0, #6 //(+32)>>6
+// }
.endm
-.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
- vext.16 \arg4, \arg0, \arg1, #2 //src[0]
- vext.16 \arg3, \arg0, \arg1, #3 //src[1]
- vadd.s16 \arg4, \arg3 //c=src[0]+src[1]
+.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
+// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+ vext.16 \arg4, \arg0, \arg1, #2 //src[0]
+ vext.16 \arg3, \arg0, \arg1, #3 //src[1]
+ vadd.s16 \arg4, \arg3 //c=src[0]+src[1]
- vext.16 \arg3, \arg0, \arg1, #1 //src[-1]
- vext.16 \arg2, \arg0, \arg1, #4 //src[2]
- vadd.s16 \arg3,\arg2 //b=src[-1]+src[2]
+ vext.16 \arg3, \arg0, \arg1, #1 //src[-1]
+ vext.16 \arg2, \arg0, \arg1, #4 //src[2]
+ vadd.s16 \arg3,\arg2 //b=src[-1]+src[2]
- vext.16 \arg2, \arg0, \arg1, #5 //src[3]
- vadd.s16 \arg2, \arg0 //a=src[-2]+src[3]
-// }
+ vext.16 \arg2, \arg0, \arg1, #5 //src[3]
+ vadd.s16 \arg2, \arg0 //a=src[-2]+src[3]
+// }
.endm
-.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
-// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
- vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5]
- vrev64.16 \arg1, \arg1
- vadd.u16 \arg2, \arg1 // C[2+3],B[1+4],A[0+5]
- vshr.s64 \arg1, \arg2, #16
- vshr.s64 \arg0, \arg2, #32 // Output: C \arg2, B \arg1, A \arg0
+.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
+// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+ vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5]
+ vrev64.16 \arg1, \arg1
+ vadd.u16 \arg2, \arg1 // C[2+3],B[1+4],A[0+5]
+ vshr.s64 \arg1, \arg2, #16
+ vshr.s64 \arg0, \arg2, #32 // Output: C \arg2, B \arg1, A \arg0
- vsub.s16 \arg0, \arg0, \arg1 //a-b
- vshr.s16 \arg0, \arg0, #2 //(a-b)/4
- vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
- vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
- vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
- vadd.s16 \arg1, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6
-// }
+ vsub.s16 \arg0, \arg0, \arg1 //a-b
+ vshr.s16 \arg0, \arg0, #2 //(a-b)/4
+ vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
+ vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
+ vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 \arg1, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6
+// }
.endm
#endif
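FILTER_6TAG_8BITS and its _TO_16BITS and _AVERAGE_WITH_x variants implement the H.264 six-tap luma interpolation kernel (1, -5, 20, 20, -5, 1), with the weights 20 and 5 kept in q14/q15. A scalar model of one half-pel output pixel, given as an illustrative sketch rather than the project's C fallback (the helper name is invented):

    #include <stdint.h>

    /* One output sample of FILTER_6TAG_8BITS; s points at src[0]. */
    static inline uint8_t SixTapHalfPel (const uint8_t* s) {
      int32_t v = (s[-2] + s[3])            // a  =      src[-2] + src[3]
                - 5 * (s[-1] + s[2])        //      -5 * (src[-1] + src[2])
                + 20 * (s[0] + s[1]);       //     +20 * (src[0]  + src[1])
      v = (v + 16) >> 5;                    // vqrshrun.s16 ..., #5: round and shift
      return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));  // saturate to [0, 255]
    }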
WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w16_h_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
- pld [r0]
- pld [r0, #16]
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
+ pld [r0]
+ pld [r0, #16]
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q8, q0, q1, #3 //q8=src[1]
- vext.8 q9, q0, q1, #4 //q9=src[2]
- vext.8 q10, q0, q1, #5 //q10=src[3]
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q8, q0, q1, #3 //q8=src[1]
+ vext.8 q9, q0, q1, #4 //q9=src[2]
+ vext.8 q10, q0, q1, #5 //q10=src[3]
- FILTER_6TAG_8BITS d0, d4, d6, d16, d18, d20, d2, q14, q15
+ FILTER_6TAG_8BITS d0, d4, d6, d16, d18, d20, d2, q14, q15
- FILTER_6TAG_8BITS d1, d5, d7, d17, d19, d21, d3, q14, q15
+ FILTER_6TAG_8BITS d1, d5, d7, d17, d19, d21, d3, q14, q15
- sub r4, #1
- vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
+ sub r4, #1
+ vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
- cmp r4, #0
- bne w16_h_mc_luma_loop
- pop {r4}
+ cmp r4, #0
+ bne w16_h_mc_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
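McHorVer20WidthEq16_neon and the variants that follow load both filter constants from a single immediate: vmov.u16 q14, #0x0014 sets every lane to 20, and vshr.u16 q15, q14, #2 derives the second weight from it, since 0x14 = 20 and 20 >> 2 = 5. The HorVerXY suffix appears to encode the quarter-pel offset of the requested position, X horizontally and Y vertically: 20 is the horizontal half-pel filter, 10/30 the horizontal quarter-pel positions, 01/02/03 their vertical counterparts, and 22 the centre position handled further below.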
WELS_ASM_FUNC_BEGIN McHorVer20WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w8_h_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
- pld [r0]
+ vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
+ pld [r0]
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
- FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d1, q14, q15
+ FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d1, q14, q15
- sub r4, #1
- vst1.u8 {d1}, [r2], r3
+ sub r4, #1
+ vst1.u8 {d1}, [r2], r3
- cmp r4, #0
- bne w8_h_mc_luma_loop
- pop {r4}
+ cmp r4, #0
+ bne w8_h_mc_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer20WidthEq4_neon
- push {r4, r5, r6}
- ldr r6, [sp, #12]
+ push {r4, r5, r6}
+ ldr r6, [sp, #12]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w4_h_mc_luma_loop:
- vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
- pld [r0]
- vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
- pld [r0]
+ vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
+ pld [r0]
+ vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
+ pld [r0]
- vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
- vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
- vext.8 q3, q2, q2, #1 //src[0:6 *]
- vext.8 q8, q2, q2, #2 //src[1:6 * *]
+ vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
+ vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
+ vext.8 q3, q2, q2, #1 //src[0:6 *]
+ vext.8 q8, q2, q2, #2 //src[1:6 * *]
- vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
- vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
- vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
- vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
+ vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+ vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
+ vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
+ vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
- FILTER_6TAG_8BITS d0, d4, d6, d7, d2, d5, d1, q14, q15
+ FILTER_6TAG_8BITS d0, d4, d6, d7, d2, d5, d1, q14, q15
- vmov r4, r5, d1
- str r4, [r2], r3
- str r5, [r2], r3
+ vmov r4, r5, d1
+ str r4, [r2], r3
+ str r5, [r2], r3
- sub r6, #2
- cmp r6, #0
- bne w4_h_mc_luma_loop
+ sub r6, #2
+ cmp r6, #0
+ bne w4_h_mc_luma_loop
- pop {r4, r5, r6}
+ pop {r4, r5, r6}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer10WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w16_xy_10_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
- pld [r0]
- pld [r0, #16]
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
+ pld [r0]
+ pld [r0, #16]
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q8, q0, q1, #3 //q8=src[1]
- vext.8 q9, q0, q1, #4 //q9=src[2]
- vext.8 q10, q0, q1, #5 //q10=src[3]
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q8, q0, q1, #3 //q8=src[1]
+ vext.8 q9, q0, q1, #4 //q9=src[2]
+ vext.8 q10, q0, q1, #5 //q10=src[3]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d16, d18, d20, d2, q14, q15
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d16, d18, d20, d2, q14, q15
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d5, d7, d17, d19, d21, d3, q14, q15
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d5, d7, d17, d19, d21, d3, q14, q15
- sub r4, #1
- vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
+ sub r4, #1
+ vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
- cmp r4, #0
- bne w16_xy_10_mc_luma_loop
- pop {r4}
+ cmp r4, #0
+ bne w16_xy_10_mc_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
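The only difference between McHorVer10WidthEq16_neon and the plain half-pel routine above is the macro used: FILTER_6TAG_8BITS_AVERAGE_WITH_0 appends one widening add and a rounding narrow to average the clipped half-pel value with src[0], while the _WITH_1 variant used by the 30 functions averages with src[1], which is how H.264 forms the quarter-pel samples on either side of a half-pel position. With SixTapHalfPel the illustrative helper sketched earlier:

    half = SixTapHalfPel(src)               // position 2 (half-pel)
    q1   = (half + src[0] + 1) >> 1         // position 1, FILTER_..._AVERAGE_WITH_0
    q3   = (half + src[1] + 1) >> 1         // position 3, FILTER_..._AVERAGE_WITH_1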
WELS_ASM_FUNC_BEGIN McHorVer10WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w8_xy_10_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
- pld [r0]
+ vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
+ pld [r0]
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d3, d4, d5, d6, d1, q14, q15
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d3, d4, d5, d6, d1, q14, q15
- sub r4, #1
- vst1.u8 {d1}, [r2], r3
+ sub r4, #1
+ vst1.u8 {d1}, [r2], r3
- cmp r4, #0
- bne w8_xy_10_mc_luma_loop
- pop {r4}
+ cmp r4, #0
+ bne w8_xy_10_mc_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer10WidthEq4_neon
- push {r4, r5, r6}
- ldr r6, [sp, #12]
+ push {r4, r5, r6}
+ ldr r6, [sp, #12]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w4_xy_10_mc_luma_loop:
- vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
- pld [r0]
- vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
- pld [r0]
+ vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
+ pld [r0]
+ vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
+ pld [r0]
- vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
- vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
- vext.8 q3, q2, q2, #1 //src[0:6 *]
- vext.8 q8, q2, q2, #2 //src[1:6 * *]
+ vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
+ vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
+ vext.8 q3, q2, q2, #1 //src[0:6 *]
+ vext.8 q8, q2, q2, #2 //src[1:6 * *]
- vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
- vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
- vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
- vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
+ vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+ vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
+ vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
+ vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d7, d2, d5, d1, q14, q15
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d7, d2, d5, d1, q14, q15
- vmov r4, r5, d1
- str r4, [r2], r3
- str r5, [r2], r3
+ vmov r4, r5, d1
+ str r4, [r2], r3
+ str r5, [r2], r3
- sub r6, #2
- cmp r6, #0
- bne w4_xy_10_mc_luma_loop
+ sub r6, #2
+ cmp r6, #0
+ bne w4_xy_10_mc_luma_loop
- pop {r4, r5, r6}
+ pop {r4, r5, r6}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer30WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w16_xy_30_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
- pld [r0]
- pld [r0, #16]
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
+ pld [r0]
+ pld [r0, #16]
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q8, q0, q1, #3 //q8=src[1]
- vext.8 q9, q0, q1, #4 //q9=src[2]
- vext.8 q10, q0, q1, #5 //q10=src[3]
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q8, q0, q1, #3 //q8=src[1]
+ vext.8 q9, q0, q1, #4 //q9=src[2]
+ vext.8 q10, q0, q1, #5 //q10=src[3]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d16, d18, d20, d2, q14, q15
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d16, d18, d20, d2, q14, q15
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d5, d7, d17, d19, d21, d3, q14, q15
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d5, d7, d17, d19, d21, d3, q14, q15
- sub r4, #1
- vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
+ sub r4, #1
+ vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
- cmp r4, #0
- bne w16_xy_30_mc_luma_loop
- pop {r4}
+ cmp r4, #0
+ bne w16_xy_30_mc_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer30WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w8_xy_30_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
- pld [r0]
+ vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
+ pld [r0]
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d3, d4, d5, d6, d1, q14, q15
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d3, d4, d5, d6, d1, q14, q15
- sub r4, #1
- vst1.u8 {d1}, [r2], r3
+ sub r4, #1
+ vst1.u8 {d1}, [r2], r3
- cmp r4, #0
- bne w8_xy_30_mc_luma_loop
- pop {r4}
+ cmp r4, #0
+ bne w8_xy_30_mc_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer30WidthEq4_neon
- push {r4, r5, r6}
- ldr r6, [sp, #12]
+ push {r4, r5, r6}
+ ldr r6, [sp, #12]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w4_xy_30_mc_luma_loop:
- vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
- pld [r0]
- vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
- pld [r0]
+ vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
+ pld [r0]
+ vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
+ pld [r0]
- vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
- vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
- vext.8 q3, q2, q2, #1 //src[0:6 *]
- vext.8 q8, q2, q2, #2 //src[1:6 * *]
+ vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
+ vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
+ vext.8 q3, q2, q2, #1 //src[0:6 *]
+ vext.8 q8, q2, q2, #2 //src[1:6 * *]
- vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
- vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
- vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
- vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
+ vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+ vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
+ vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
+ vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d7, d2, d5, d1, q14, q15
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d7, d2, d5, d1, q14, q15
- vmov r4, r5, d1
- str r4, [r2], r3
- str r5, [r2], r3
+ vmov r4, r5, d1
+ str r4, [r2], r3
+ str r5, [r2], r3
- sub r6, #2
- cmp r6, #0
- bne w4_xy_30_mc_luma_loop
+ sub r6, #2
+ cmp r6, #0
+ bne w4_xy_30_mc_luma_loop
- pop {r4, r5, r6}
+ pop {r4, r5, r6}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer01WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q8}, [r0], r1 //q8=src[2]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q8}, [r0], r1 //q8=src[2]
w16_xy_01_luma_loop:
- vld1.u8 {q9}, [r0], r1 //q9=src[3]
+ vld1.u8 {q9}, [r0], r1 //q9=src[3]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d17, d19, d1, d21, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d4, d6, d16, d18, d0, d2, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d5, d7, d17, d19, d1, d3, d21, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d4, d6, d16, d18, d0, d2, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d5, d7, d17, d19, d1, d3, d21, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d6, d16, d18, d0, d2, d4, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d7, d17, d19, d1, d3, d5, d21, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d6, d16, d18, d0, d2, d4, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d7, d17, d19, d1, d3, d5, d21, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d16, d18, d0, d2, d4, d6, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d17, d19, d1, d3, d5, d7, d21, q14, q15
- vld1.u8 {q8}, [r0], r1 //read 6th row
- vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d16, d18, d0, d2, d4, d6, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d17, d19, d1, d3, d5, d7, d21, q14, q15
+ vld1.u8 {q8}, [r0], r1 //read 6th row
+ vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d18, d0, d2, d4, d6, d16, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d19, d1, d3, d5, d7, d17, d21, q14, q15
- vld1.u8 {q9}, [r0], r1 //read 7th row
- vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d18, d0, d2, d4, d6, d16, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d19, d1, d3, d5, d7, d17, d21, q14, q15
+ vld1.u8 {q9}, [r0], r1 //read 7th row
+ vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d17, d19, d1, d21, q14, q15
- vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
- //q2, q3, q4, q5, q0 --> q0~q4
- vswp q0, q8
- vswp q0, q2
- vmov q1, q3
- vmov q3, q9 //q0~q4
+ //q2, q3, q4, q5, q0 --> q0~q4
+ vswp q0, q8
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q9 //q0~q4
- sub r4, #8
- cmp r4, #0
- bne w16_xy_01_luma_loop
- pop {r4}
+ sub r4, #8
+ cmp r4, #0
+ bne w16_xy_01_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
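Vertical loops such as w16_xy_01_luma_loop above keep six source rows resident in q0-q3, q8 and q9, produce eight output rows per iteration by rotating which register plays src[-2]..src[3], and restore the canonical order with the vswp/vmov block before branching back. A scalar sketch of the same sliding-window idea, including the final averaging with the src[0] row that the _AVERAGE_WITH_0 variant performs (all names are illustrative, not the project's C code):

    #include <stdint.h>

    /* Vertical 6-tap with a six-row sliding window, averaged with the row
     * at offset 0 as in the McHorVer01 functions. */
    static void VerQpelRowsSketch (const uint8_t* src, int32_t src_stride,
                                   uint8_t* dst, int32_t dst_stride,
                                   int32_t width, int32_t height) {
      const uint8_t* row[6];
      for (int32_t i = 0; i < 6; i++)
        row[i] = src + (i - 2) * src_stride;              // rows src[-2] .. src[3]
      for (int32_t y = 0; y < height; y++) {
        for (int32_t x = 0; x < width; x++) {
          int32_t v = (row[0][x] + row[5][x])
                    - 5 * (row[1][x] + row[4][x])
                    + 20 * (row[2][x] + row[3][x]);
          v = (v + 16) >> 5;                              // half-pel value
          if (v < 0) v = 0; else if (v > 255) v = 255;    // clip to 8 bits
          dst[x] = (uint8_t) ((v + row[2][x] + 1) >> 1);  // average with src[0]
        }
        for (int32_t i = 0; i < 5; i++) row[i] = row[i + 1]; // slide window down
        row[5] += src_stride;                                // bring in the next row
        dst += dst_stride;
      }
    }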
WELS_ASM_FUNC_BEGIN McHorVer01WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
w8_xy_01_mc_luma_loop:
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d16, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d2, d3, d4, d5, d0, d16, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d2, d3, d4, d5, d0, d16, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d16, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d4, d5, d0, d1, d2, d16, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d4, d5, d0, d1, d2, d16, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
- sub r4, #4
- cmp r4, #0
- bne w8_xy_01_mc_luma_loop
+ sub r4, #4
+ cmp r4, #0
+ bne w8_xy_01_mc_luma_loop
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer01WidthEq4_neon
- push {r4, r5, r6, r7}
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- ldr r4, [r0], r1 //r4=src[-2]
- ldr r5, [r0], r1 //r5=src[-1]
+ push {r4, r5, r6, r7}
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ ldr r4, [r0], r1 //r4=src[-2]
+ ldr r5, [r0], r1 //r5=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- ldr r6, [r0], r1 //r6=src[0]
- ldr r7, [r0], r1 //r7=src[1]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ ldr r6, [r0], r1 //r6=src[0]
+ ldr r7, [r0], r1 //r7=src[1]
- vmov d0, r4, r5
- vmov d1, r5, r6
- vmov d2, r6, r7
+ vmov d0, r4, r5
+ vmov d1, r5, r6
+ vmov d2, r6, r7
- ldr r4, [r0], r1 //r4=src[2]
- vmov d3, r7, r4
- ldr r7, [sp, #16]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d3, r7, r4
+ ldr r7, [sp, #16]
w4_xy_01_mc_luma_loop:
-// pld [r0]
- //using reserving r4
- ldr r5, [r0], r1 //r5=src[3]
- ldr r6, [r0], r1 //r6=src[0]
- vmov d4, r4, r5
- vmov d5, r5, r6 //reserved r6
+// pld [r0]
+ //using reserving r4
+ ldr r5, [r0], r1 //r5=src[3]
+ ldr r6, [r0], r1 //r6=src[0]
+ vmov d4, r4, r5
+ vmov d5, r5, r6 //reserved r6
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d16, q14, q15
- vmov r4, r5, d16
- str r4, [r2], r3 //write 1st 4Byte
- str r5, [r2], r3 //write 2nd 4Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vmov r4, r5, d16
+ str r4, [r2], r3 //write 1st 4Byte
+ str r5, [r2], r3 //write 2nd 4Byte
- ldr r5, [r0], r1 //r5=src[1]
- ldr r4, [r0], r1 //r4=src[2]
- vmov d0, r6, r5
- vmov d1, r5, r4 //reserved r4
+ ldr r5, [r0], r1 //r5=src[1]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d0, r6, r5
+ vmov d1, r5, r4 //reserved r4
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d16, q14, q15
- vmov r5, r6, d16
- str r5, [r2], r3 //write 3rd 4Byte
- str r6, [r2], r3 //write 4th 4Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vmov r5, r6, d16
+ str r5, [r2], r3 //write 3rd 4Byte
+ str r6, [r2], r3 //write 4th 4Byte
- //d4, d5, d0, d1 --> d0, d1, d2, d3
- vmov q1, q0
- vmov q0, q2
+ //d4, d5, d0, d1 --> d0, d1, d2, d3
+ vmov q1, q0
+ vmov q0, q2
- sub r7, #4
- cmp r7, #0
- bne w4_xy_01_mc_luma_loop
+ sub r7, #4
+ cmp r7, #0
+ bne w4_xy_01_mc_luma_loop
- pop {r4, r5, r6, r7}
+ pop {r4, r5, r6, r7}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer03WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q8}, [r0], r1 //q8=src[2]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q8}, [r0], r1 //q8=src[2]
w16_xy_03_luma_loop:
- vld1.u8 {q9}, [r0], r1 //q9=src[3]
+ vld1.u8 {q9}, [r0], r1 //q9=src[3]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d17, d19, d1, d21, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d4, d6, d16, d18, d0, d2, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d5, d7, d17, d19, d1, d3, d21, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d4, d6, d16, d18, d0, d2, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d5, d7, d17, d19, d1, d3, d21, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d6, d16, d18, d0, d2, d4, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d7, d17, d19, d1, d3, d5, d21, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d6, d16, d18, d0, d2, d4, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d7, d17, d19, d1, d3, d5, d21, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d16, d18, d0, d2, d4, d6, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d17, d19, d1, d3, d5, d7, d21, q14, q15
- vld1.u8 {q8}, [r0], r1 //read 6th row
- vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d16, d18, d0, d2, d4, d6, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d17, d19, d1, d3, d5, d7, d21, q14, q15
+ vld1.u8 {q8}, [r0], r1 //read 6th row
+ vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d18, d0, d2, d4, d6, d16, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d19, d1, d3, d5, d7, d17, d21, q14, q15
- vld1.u8 {q9}, [r0], r1 //read 7th row
- vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d18, d0, d2, d4, d6, d16, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d19, d1, d3, d5, d7, d17, d21, q14, q15
+ vld1.u8 {q9}, [r0], r1 //read 7th row
+ vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d17, d19, d1, d21, q14, q15
- vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
- //q2, q3, q8, q9, q0 --> q0~q8
- vswp q0, q8
- vswp q0, q2
- vmov q1, q3
- vmov q3, q9 //q0~q8
+ //q2, q3, q8, q9, q0 --> q0~q8
+ vswp q0, q8
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q9 //q0~q8
- sub r4, #8
- cmp r4, #0
- bne w16_xy_03_luma_loop
- pop {r4}
+ sub r4, #8
+ cmp r4, #0
+ bne w16_xy_03_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer03WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
w8_xy_03_mc_luma_loop:
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d16, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d2, d3, d4, d5, d0, d16, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d2, d3, d4, d5, d0, d16, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d16, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d4, d5, d0, d1, d2, d16, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d4, d5, d0, d1, d2, d16, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
- sub r4, #4
- cmp r4, #0
- bne w8_xy_03_mc_luma_loop
+ sub r4, #4
+ cmp r4, #0
+ bne w8_xy_03_mc_luma_loop
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer03WidthEq4_neon
- push {r4, r5, r6, r7}
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- ldr r4, [r0], r1 //r4=src[-2]
- ldr r5, [r0], r1 //r5=src[-1]
+ push {r4, r5, r6, r7}
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ ldr r4, [r0], r1 //r4=src[-2]
+ ldr r5, [r0], r1 //r5=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- ldr r6, [r0], r1 //r6=src[0]
- ldr r7, [r0], r1 //r7=src[1]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ ldr r6, [r0], r1 //r6=src[0]
+ ldr r7, [r0], r1 //r7=src[1]
- vmov d0, r4, r5
- vmov d1, r5, r6
- vmov d2, r6, r7
+ vmov d0, r4, r5
+ vmov d1, r5, r6
+ vmov d2, r6, r7
- ldr r4, [r0], r1 //r4=src[2]
- vmov d3, r7, r4
- ldr r7, [sp, #16]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d3, r7, r4
+ ldr r7, [sp, #16]
w4_xy_03_mc_luma_loop:
-// pld [r0]
- //using reserving r4
- ldr r5, [r0], r1 //r5=src[3]
- ldr r6, [r0], r1 //r6=src[0]
- vmov d4, r4, r5
- vmov d5, r5, r6 //reserved r6
+// pld [r0]
+ //using reserving r4
+ ldr r5, [r0], r1 //r5=src[3]
+ ldr r6, [r0], r1 //r6=src[0]
+ vmov d4, r4, r5
+ vmov d5, r5, r6 //reserved r6
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d16, q14, q15
- vmov r4, r5, d16
- str r4, [r2], r3 //write 1st 4Byte
- str r5, [r2], r3 //write 2nd 4Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vmov r4, r5, d16
+ str r4, [r2], r3 //write 1st 4Byte
+ str r5, [r2], r3 //write 2nd 4Byte
- ldr r5, [r0], r1 //r5=src[1]
- ldr r4, [r0], r1 //r4=src[2]
- vmov d0, r6, r5
- vmov d1, r5, r4 //reserved r4
+ ldr r5, [r0], r1 //r5=src[1]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d0, r6, r5
+ vmov d1, r5, r4 //reserved r4
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d16, q14, q15
- vmov r5, r6, d16
- str r5, [r2], r3 //write 3rd 4Byte
- str r6, [r2], r3 //write 4th 4Byte
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vmov r5, r6, d16
+ str r5, [r2], r3 //write 3rd 4Byte
+ str r6, [r2], r3 //write 4th 4Byte
- //d4, d5, d0, d1 --> d0, d1, d2, d3
- vmov q1, q0
- vmov q0, q2
+ //d4, d5, d0, d1 --> d0, d1, d2, d3
+ vmov q1, q0
+ vmov q0, q2
- sub r7, #4
- cmp r7, #0
- bne w4_xy_03_mc_luma_loop
+ sub r7, #4
+ cmp r7, #0
+ bne w4_xy_03_mc_luma_loop
- pop {r4, r5, r6, r7}
+ pop {r4, r5, r6, r7}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer02WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q8}, [r0], r1 //q8=src[2]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q8}, [r0], r1 //q8=src[2]
w16_v_mc_luma_loop:
- vld1.u8 {q9}, [r0], r1 //q9=src[3]
+ vld1.u8 {q9}, [r0], r1 //q9=src[3]
- FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
- FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
+ FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
- FILTER_6TAG_8BITS d4, d6, d16, d18, d0, d2, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d5, d7, d17, d19, d1, d3, d21, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
+ FILTER_6TAG_8BITS d4, d6, d16, d18, d0, d2, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d5, d7, d17, d19, d1, d3, d21, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
- FILTER_6TAG_8BITS d6, d16, d18, d0, d2, d4, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d7, d17, d19, d1, d3, d5, d21, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
+ FILTER_6TAG_8BITS d6, d16, d18, d0, d2, d4, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d7, d17, d19, d1, d3, d5, d21, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
- FILTER_6TAG_8BITS d16, d18, d0, d2, d4, d6, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d17, d19, d1, d3, d5, d7, d21, q14, q15
- vld1.u8 {q8}, [r0], r1 //read 6th row
- vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
+ FILTER_6TAG_8BITS d16, d18, d0, d2, d4, d6, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d17, d19, d1, d3, d5, d7, d21, q14, q15
+ vld1.u8 {q8}, [r0], r1 //read 6th row
+ vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
- FILTER_6TAG_8BITS d18, d0, d2, d4, d6, d16, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d19, d1, d3, d5, d7, d17, d21, q14, q15
- vld1.u8 {q9}, [r0], r1 //read 7th row
- vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
+ FILTER_6TAG_8BITS d18, d0, d2, d4, d6, d16, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d19, d1, d3, d5, d7, d17, d21, q14, q15
+ vld1.u8 {q9}, [r0], r1 //read 7th row
+ vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
- FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
- FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
- vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
+ FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
- //q2, q3, q8, q9, q0 --> q0~q8
- vswp q0, q8
- vswp q0, q2
- vmov q1, q3
- vmov q3, q9 //q0~q8
+ //q2, q3, q8, q9, q0 --> q0~q8
+ vswp q0, q8
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q9 //q0~q8
- sub r4, #8
- cmp r4, #0
- bne w16_v_mc_luma_loop
- pop {r4}
+ sub r4, #8
+ cmp r4, #0
+ bne w16_v_mc_luma_loop
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer02WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
w8_v_mc_luma_loop:
- pld [r0]
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
- pld [r0]
- FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
- pld [r0]
- FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
- pld [r0]
- FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
- sub r4, #4
- cmp r4, #0
- bne w8_v_mc_luma_loop
+ sub r4, #4
+ cmp r4, #0
+ bne w8_v_mc_luma_loop
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer02WidthEq4_neon
- push {r4, r5, r6, r7}
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- ldr r4, [r0], r1 //r4=src[-2]
- ldr r5, [r0], r1 //r5=src[-1]
+ push {r4, r5, r6, r7}
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ ldr r4, [r0], r1 //r4=src[-2]
+ ldr r5, [r0], r1 //r5=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- ldr r6, [r0], r1 //r6=src[0]
- ldr r7, [r0], r1 //r7=src[1]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ ldr r6, [r0], r1 //r6=src[0]
+ ldr r7, [r0], r1 //r7=src[1]
- vmov d0, r4, r5
- vmov d1, r5, r6
- vmov d2, r6, r7
+ vmov d0, r4, r5
+ vmov d1, r5, r6
+ vmov d2, r6, r7
- ldr r4, [r0], r1 //r4=src[2]
- vmov d3, r7, r4
- ldr r7, [sp, #16]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d3, r7, r4
+ ldr r7, [sp, #16]
w4_v_mc_luma_loop:
-// pld [r0]
- //using reserving r4
- ldr r5, [r0], r1 //r5=src[3]
- ldr r6, [r0], r1 //r6=src[0]
- vmov d4, r4, r5
- vmov d5, r5, r6 //reserved r6
+// pld [r0]
+ //using reserving r4
+ ldr r5, [r0], r1 //r5=src[3]
+ ldr r6, [r0], r1 //r6=src[0]
+ vmov d4, r4, r5
+ vmov d5, r5, r6 //reserved r6
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
- vmov r4, r5, d16
- str r4, [r2], r3 //write 1st 4Byte
- str r5, [r2], r3 //write 2nd 4Byte
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vmov r4, r5, d16
+ str r4, [r2], r3 //write 1st 4Byte
+ str r5, [r2], r3 //write 2nd 4Byte
- ldr r5, [r0], r1 //r5=src[1]
- ldr r4, [r0], r1 //r4=src[2]
- vmov d0, r6, r5
- vmov d1, r5, r4 //reserved r4
+ ldr r5, [r0], r1 //r5=src[1]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d0, r6, r5
+ vmov d1, r5, r4 //reserved r4
- FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
- vmov r5, r6, d16
- str r5, [r2], r3 //write 3rd 4Byte
- str r6, [r2], r3 //write 4th 4Byte
+ FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vmov r5, r6, d16
+ str r5, [r2], r3 //write 3rd 4Byte
+ str r6, [r2], r3 //write 4th 4Byte
- //d4, d5, d0, d1 --> d0, d1, d2, d3
- vmov q1, q0
- vmov q0, q2
+ //d4, d5, d0, d1 --> d0, d1, d2, d3
+ vmov q1, q0
+ vmov q0, q2
- sub r7, #4
- cmp r7, #0
- bne w4_v_mc_luma_loop
+ sub r7, #4
+ cmp r7, #0
+ bne w4_v_mc_luma_loop
- pop {r4, r5, r6, r7}
+ pop {r4, r5, r6, r7}
WELS_ASM_FUNC_END
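McHorVer22WidthEq16_neon below chains the two passes: FILTER_6TAG_8BITS_TO_16BITS leaves the vertical tap sum a - 5*b + 20*c unscaled as 16-bit intermediates, UNPACK_2_16BITS_TO_ABC regroups those intermediates into the horizontal a/b/c sums, and FILTER_3_IN_16BITS_TO_8BITS finishes with the multiplier-free /16 followed by the rounded (x + 32) >> 6. Written out for one centre half-pel sample, with m[] the 16-bit vertical intermediates:

    out = clip8(((m[-2] + m[3] - 5*(m[-1] + m[2]) + 20*(m[0] + m[1])) / 16 + 32) >> 6)

Since 16 * 64 = 1024, this matches the (sum + 512) >> 10 normalisation H.264 uses for the centre position, up to the truncation of the intermediate shifts noted earlier.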
WELS_ASM_FUNC_BEGIN McHorVer22WidthEq16_neon
- push {r4}
- vpush {q4-q7}
- ldr r4, [sp, #68]
+ push {r4}
+ vpush {q4-q7}
+ ldr r4, [sp, #68]
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0-d2}, [r0], r1 //use 21(16+5), =src[-2]
- vld1.u8 {d3-d5}, [r0], r1 //use 21(16+5), =src[-1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0-d2}, [r0], r1 //use 21(16+5), =src[-2]
+ vld1.u8 {d3-d5}, [r0], r1 //use 21(16+5), =src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d6-d8}, [r0], r1 //use 21(16+5), =src[0]
- vld1.u8 {d9-d11}, [r0], r1 //use 21(16+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2]
+ vld1.u8 {d6-d8}, [r0], r1 //use 21(16+5), =src[0]
+ vld1.u8 {d9-d11}, [r0], r1 //use 21(16+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2]
w16_hv_mc_luma_loop:
- vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
+ vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
- vst1.u8 {q0}, [r2], r3 //write 16Byte
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
+ vst1.u8 {q0}, [r2], r3 //write 16Byte
- vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
+ vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
- vst1.u8 {d3, d4}, [r2], r3 //write 16Byte
+ vst1.u8 {d3, d4}, [r2], r3 //write 16Byte
- vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
+ vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
- vst1.u8 {d6, d7}, [r2], r3 //write 16Byte
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
+ vst1.u8 {d6, d7}, [r2], r3 //write 16Byte
- vld1.u8 {d6-d8}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
- vst1.u8 {d9, d10}, [r2], r3 //write 16Byte
+ vld1.u8 {d6-d8}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
+ vst1.u8 {d9, d10}, [r2], r3 //write 16Byte
- //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
- vswp q0, q6
- vswp q6, q3
- vmov q5, q2
- vmov q2, q8
+ //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+ vswp q0, q6
+ vswp q6, q3
+ vmov q5, q2
+ vmov q2, q8
- vmov d20,d8
- vmov q4, q1
- vmov q1, q7
- vmov d14,d20
+ vmov d20,d8
+ vmov q4, q1
+ vmov q1, q7
+ vmov d14,d20
- sub r4, #4
- cmp r4, #0
- bne w16_hv_mc_luma_loop
- vpop {q4-q7}
- pop {r4}
+ sub r4, #4
+ cmp r4, #0
+ bne w16_hv_mc_luma_loop
+ vpop {q4-q7}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer22WidthEq8_neon
- push {r4}
- vpush {q4}
- ldr r4, [sp, #20]
+ push {r4}
+ vpush {q4}
+ ldr r4, [sp, #20]
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //use 13(8+5), =src[-2]
- vld1.u8 {q1}, [r0], r1 //use 13(8+5), =src[-1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //use 13(8+5), =src[-2]
+ vld1.u8 {q1}, [r0], r1 //use 13(8+5), =src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //use 13(8+5), =src[0]
- vld1.u8 {q3}, [r0], r1 //use 13(8+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {q4}, [r0], r1 //use 13(8+5), =src[2]
+ vld1.u8 {q2}, [r0], r1 //use 13(8+5), =src[0]
+ vld1.u8 {q3}, [r0], r1 //use 13(8+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {q4}, [r0], r1 //use 13(8+5), =src[2]
w8_hv_mc_luma_loop:
- vld1.u8 {q8}, [r0], r1 //use 13(8+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2], r3 //write 8Byte
+ vld1.u8 {q8}, [r0], r1 //use 13(8+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2], r3 //write 8Byte
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, d0, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15 // 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2], r3 //write 8Byte
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, d0, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15 // 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2], r3 //write 8Byte
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15 // 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2], r3 //write 8Byte
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15 // 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2], r3 //write 8Byte
- vld1.u8 {q2}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15 // 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2], r3 //write 8Byte
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15 // 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2], r3 //write 8Byte
- //q4~q5, q0~q2, --> q0~q4
- vswp q0, q4
- vswp q2, q4
- vmov q3, q1
- vmov q1, q8
+ //q4~q5, q0~q2, --> q0~q4
+ vswp q0, q4
+ vswp q2, q4
+ vmov q3, q1
+ vmov q1, q8
- sub r4, #4
- cmp r4, #0
- bne w8_hv_mc_luma_loop
- vpop {q4}
- pop {r4}
+ sub r4, #4
+ cmp r4, #0
+ bne w8_hv_mc_luma_loop
+ vpop {q4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer22WidthEq4_neon
- push {r4 ,r5, r6}
- vpush {q4-q7}
- ldr r6, [sp, #76]
+ push {r4 ,r5, r6}
+ vpush {q4-q7}
+ ldr r6, [sp, #76]
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[-2]
- vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[-1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[-2]
+ vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //use 9(4+5), =src[0]
- vld1.u8 {q3}, [r0], r1 //use 9(4+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {q4}, [r0], r1 //use 9(4+5), =src[2]
+ vld1.u8 {q2}, [r0], r1 //use 9(4+5), =src[0]
+ vld1.u8 {q3}, [r0], r1 //use 9(4+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {q4}, [r0], r1 //use 9(4+5), =src[2]
w4_hv_mc_luma_loop:
- vld1.u8 {q5}, [r0], r1 //use 9(4+5), =src[3]
- vld1.u8 {q6}, [r0], r1 //use 9(4+5), =src[4]
+ vld1.u8 {q5}, [r0], r1 //use 9(4+5), =src[3]
+ vld1.u8 {q6}, [r0], r1 //use 9(4+5), =src[4]
- //the 1st&2nd row
- pld [r0]
- pld [r0, r1]
- // vertical filtered
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q7, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q8, q14, q15 // 1 avail
+ //the 1st&2nd row
+ pld [r0]
+ pld [r0, r1]
+ // vertical filtered
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q7, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q8, q14, q15 // 1 avail
- FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8,d10, d12, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9,d11, d13,q10, q14, q15 // 1 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
- UNPACK_2_16BITS_TO_ABC q9,q10, q0, q7, q8 //4 avail
+ FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8,d10, d12, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9,d11, d13,q10, q14, q15 // 1 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
+ UNPACK_2_16BITS_TO_ABC q9,q10, q0, q7, q8 //4 avail
- vmov d23, d0
- vmov d25, d14
- vmov d27, d16
+ vmov d23, d0
+ vmov d25, d14
+ vmov d27, d16
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
- vmov r4 ,r5, d22
- str r4, [r2], r3 //write 4Byte
- str r5, [r2], r3 //write 4Byte
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
+ vmov r4 ,r5, d22
+ str r4, [r2], r3 //write 4Byte
+ str r5, [r2], r3 //write 4Byte
- //the 3rd&4th row
- vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[3]
- vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[4]
- pld [r0]
- pld [r0, r1]
- // vertical filtered
- FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d12, d0, q7, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d13, d1, q8, q14, q15 // 1 avail
+ //the 3rd&4th row
+ vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[3]
+ vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[4]
+ pld [r0]
+ pld [r0, r1]
+ // vertical filtered
+ FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d12, d0, q7, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d13, d1, q8, q14, q15 // 1 avail
- FILTER_6TAG_8BITS_TO_16BITS d6, d8,d10, d12, d0, d2, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7, d9,d11, d13, d1, d3,q10, q14, q15 // 1 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
- UNPACK_2_16BITS_TO_ABC q9,q10, q2, q7, q8 //4 avail
+ FILTER_6TAG_8BITS_TO_16BITS d6, d8,d10, d12, d0, d2, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7, d9,d11, d13, d1, d3,q10, q14, q15 // 1 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
+ UNPACK_2_16BITS_TO_ABC q9,q10, q2, q7, q8 //4 avail
- vmov d23, d4
- vmov d25, d14
- vmov d27, d16
+ vmov d23, d4
+ vmov d25, d14
+ vmov d27, d16
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
- vmov r4 ,r5, d22
- str r4, [r2], r3 //write 4Byte
- str r5, [r2], r3 //write 4Byte
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
+ vmov r4 ,r5, d22
+ str r4, [r2], r3 //write 4Byte
+ str r5, [r2], r3 //write 4Byte
- //q4~q6, q0~q1, --> q0~q4
- vswp q4, q0
- vmov q3, q4
- vmov q4, q1
- vmov q1, q5
- vmov q2, q6
+ //q4~q6, q0~q1, --> q0~q4
+ vswp q4, q0
+ vmov q3, q4
+ vmov q4, q1
+ vmov q1, q5
+ vmov q2, q6
- sub r6, #4
- cmp r6, #0
- bne w4_hv_mc_luma_loop
+ sub r6, #4
+ cmp r6, #0
+ bne w4_hv_mc_luma_loop
- vpop {q4-q7}
- pop {r4, r5, r6}
+ vpop {q4-q7}
+ pop {r4, r5, r6}
WELS_ASM_FUNC_END
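The McHorVer22* functions above compute the center half-pel position in two separable passes: FILTER_6TAG_8BITS_TO_16BITS applies the vertical six-tap at 16-bit precision, then UNPACK_2_16BITS_TO_ABC and FILTER_3_IN_16BITS_TO_8BITS run the horizontal six-tap over those intermediates. A scalar sketch of that arithmetic follows, assuming the usual H.264 second-stage rounding of (acc + 512) >> 10; the name and signature are illustrative only.

#include <stdint.h>

static uint8_t clip255(int v) {
    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* Center half-pel ("22" case): vertical 6-tap into 16-bit intermediates,
 * then a horizontal 6-tap over them.  The NEON code interleaves both passes
 * row by row; this version only shows the math for widths up to 16. */
static void mc_luma_hv_block_c(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride,
                               int w, int h) {
    int tmp[16 + 5]; /* vertical results for columns -2 .. w+2 of one row */
    for (int y = 0; y < h; y++) {
        const uint8_t *s = src + y * src_stride - 2; /* start at column -2 */
        for (int x = 0; x < w + 5; x++) {
            tmp[x] = s[x - 2 * src_stride] + s[x + 3 * src_stride]
                   - 5 * (s[x - src_stride] + s[x + 2 * src_stride])
                   + 20 * (s[x] + s[x + src_stride]);
        }
        for (int x = 0; x < w; x++) {
            int acc = tmp[x] + tmp[x + 5]
                    - 5 * (tmp[x + 1] + tmp[x + 4])
                    + 20 * (tmp[x + 2] + tmp[x + 3]);
            dst[y * dst_stride + x] = clip255((acc + 512) >> 10);
        }
    }
}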
WELS_ASM_FUNC_BEGIN McCopyWidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
w16_copy_loop:
- vld1.u8 {q0}, [r0], r1
- sub r4, #2
- vld1.u8 {q1}, [r0], r1
- vst1.u8 {q0}, [r2], r3
- cmp r4, #0
- vst1.u8 {q1}, [r2], r3
- bne w16_copy_loop
+ vld1.u8 {q0}, [r0], r1
+ sub r4, #2
+ vld1.u8 {q1}, [r0], r1
+ vst1.u8 {q0}, [r2], r3
+ cmp r4, #0
+ vst1.u8 {q1}, [r2], r3
+ bne w16_copy_loop
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McCopyWidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
w8_copy_loop:
- vld1.u8 {d0}, [r0], r1
- vld1.u8 {d1}, [r0], r1
- vst1.u8 {d0}, [r2], r3
- vst1.u8 {d1}, [r2], r3
- sub r4, #2
- cmp r4, #0
- bne w8_copy_loop
+ vld1.u8 {d0}, [r0], r1
+ vld1.u8 {d1}, [r0], r1
+ vst1.u8 {d0}, [r2], r3
+ vst1.u8 {d1}, [r2], r3
+ sub r4, #2
+ cmp r4, #0
+ bne w8_copy_loop
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McCopyWidthEq4_neon
- push {r4, r5, r6}
- ldr r4, [sp, #12]
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
w4_copy_loop:
- ldr r5, [r0], r1
- ldr r6, [r0], r1
- str r5, [r2], r3
- str r6, [r2], r3
+ ldr r5, [r0], r1
+ ldr r6, [r0], r1
+ str r5, [r2], r3
+ str r6, [r2], r3
- sub r4, #2
- cmp r4, #0
- bne w4_copy_loop
+ sub r4, #2
+ cmp r4, #0
+ bne w4_copy_loop
- pop {r4, r5, r6}
+ pop {r4, r5, r6}
WELS_ASM_FUNC_END
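The McCopyWidthEq* helpers are plain strided block copies; the variants only differ in the load/store width chosen for each block size. A trivial C equivalent, with an illustrative signature:

#include <stdint.h>
#include <string.h>

static void mc_copy_c(const uint8_t *src, int src_stride,
                      uint8_t *dst, int dst_stride, int w, int h) {
    for (int y = 0; y < h; y++)
        memcpy(dst + y * dst_stride, src + y * src_stride, (size_t)w);
}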
WELS_ASM_FUNC_BEGIN PixelAvgWidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
w16_pix_avg_loop:
- vld1.u8 {q0}, [r2]!
- vld1.u8 {q1}, [r3]!
- vld1.u8 {q2}, [r2]!
- vld1.u8 {q3}, [r3]!
+ vld1.u8 {q0}, [r2]!
+ vld1.u8 {q1}, [r3]!
+ vld1.u8 {q2}, [r2]!
+ vld1.u8 {q3}, [r3]!
- vld1.u8 {q8}, [r2]!
- vld1.u8 {q9}, [r3]!
- vld1.u8 {q10}, [r2]!
- vld1.u8 {q11}, [r3]!
+ vld1.u8 {q8}, [r2]!
+ vld1.u8 {q9}, [r3]!
+ vld1.u8 {q10}, [r2]!
+ vld1.u8 {q11}, [r3]!
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {q0}, [r0], r1
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {q0}, [r0], r1
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {q2}, [r0], r1
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {q2}, [r0], r1
- AVERAGE_TWO_8BITS d16, d16, d18
- AVERAGE_TWO_8BITS d17, d17, d19
- vst1.u8 {q8}, [r0], r1
+ AVERAGE_TWO_8BITS d16, d16, d18
+ AVERAGE_TWO_8BITS d17, d17, d19
+ vst1.u8 {q8}, [r0], r1
- AVERAGE_TWO_8BITS d20, d20, d22
- AVERAGE_TWO_8BITS d21, d21, d23
- vst1.u8 {q10}, [r0], r1
+ AVERAGE_TWO_8BITS d20, d20, d22
+ AVERAGE_TWO_8BITS d21, d21, d23
+ vst1.u8 {q10}, [r0], r1
- sub r4, #4
- cmp r4, #0
- bne w16_pix_avg_loop
+ sub r4, #4
+ cmp r4, #0
+ bne w16_pix_avg_loop
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN PixelAvgWidthEq8_neon
- push {r4, r5}
- ldr r4, [sp, #8]
- mov r5, #16
+ push {r4, r5}
+ ldr r4, [sp, #8]
+ mov r5, #16
w8_pix_avg_loop:
- vld1.u8 {d0}, [r2], r5
- vld1.u8 {d2}, [r3], r5
- vld1.u8 {d1}, [r2], r5
- vld1.u8 {d3}, [r3], r5
+ vld1.u8 {d0}, [r2], r5
+ vld1.u8 {d2}, [r3], r5
+ vld1.u8 {d1}, [r2], r5
+ vld1.u8 {d3}, [r3], r5
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {d0}, [r0], r1
- vst1.u8 {d1}, [r0], r1
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {d0}, [r0], r1
+ vst1.u8 {d1}, [r0], r1
- vld1.u8 {d4}, [r2], r5
- vld1.u8 {d6}, [r3], r5
- vld1.u8 {d5}, [r2], r5
- vld1.u8 {d7}, [r3], r5
+ vld1.u8 {d4}, [r2], r5
+ vld1.u8 {d6}, [r3], r5
+ vld1.u8 {d5}, [r2], r5
+ vld1.u8 {d7}, [r3], r5
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {d4}, [r0], r1
- vst1.u8 {d5}, [r0], r1
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {d4}, [r0], r1
+ vst1.u8 {d5}, [r0], r1
- sub r4, #4
- cmp r4, #0
- bne w8_pix_avg_loop
+ sub r4, #4
+ cmp r4, #0
+ bne w8_pix_avg_loop
- pop {r4, r5}
+ pop {r4, r5}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN PixelAvgWidthEq4_neon
- push {r4-r8}
- ldr r4, [sp, #20]
+ push {r4-r8}
+ ldr r4, [sp, #20]
w4_pix_avg_loop:
- ldr r5, [r2]
- ldr r6, [r2, #16]
- ldr r7, [r3]
- ldr r8, [r3, #16]
- add r2, #32
- add r3, #32
+ ldr r5, [r2]
+ ldr r6, [r2, #16]
+ ldr r7, [r3]
+ ldr r8, [r3, #16]
+ add r2, #32
+ add r3, #32
- vmov d0, r5, r6
- vmov d1, r7, r8
- AVERAGE_TWO_8BITS d0, d0, d1
- vmov r5, r6, d0
+ vmov d0, r5, r6
+ vmov d1, r7, r8
+ AVERAGE_TWO_8BITS d0, d0, d1
+ vmov r5, r6, d0
- str r5, [r0], r1
- str r6, [r0], r1
+ str r5, [r0], r1
+ str r6, [r0], r1
- sub r4, #2
- cmp r4, #0
- bne w4_pix_avg_loop
+ sub r4, #2
+ cmp r4, #0
+ bne w4_pix_avg_loop
- pop {r4-r8}
+ pop {r4-r8}
WELS_ASM_FUNC_END
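The PixelAvgWidthEq* functions blend two predictions. Assuming AVERAGE_TWO_8BITS performs a per-byte rounding average, (a + b + 1) >> 1 as vrhadd.u8 would, the scalar equivalent is below; note that the Eq8 and Eq4 variants above read both inputs with a fixed 16-byte line stride, so their sources are packed intermediate buffers. The signature is illustrative.

#include <stdint.h>

static void pixel_avg_c(uint8_t *dst, int dst_stride,
                        const uint8_t *src_a, int stride_a,
                        const uint8_t *src_b, int stride_b,
                        int w, int h) {
    for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++)
            dst[y * dst_stride + x] = (uint8_t)
                ((src_a[y * stride_a + x] + src_b[y * stride_b + x] + 1) >> 1);
}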
WELS_ASM_FUNC_BEGIN McChromaWidthEq8_neon
- push {r4, r5}
- ldr r4, [sp, #8]
- ldr r5, [sp, #12]
-// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
-// we can opti it by adding vert only/ hori only cases, to be continue
- vld1.u8 {d31}, [r4] //load A/B/C/D
- vld1.u8 {q0}, [r0], r1 //src[x]
+ push {r4, r5}
+ ldr r4, [sp, #8]
+ ldr r5, [sp, #12]
+// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*src[x+stride+1]}
+// this could be optimized further by adding vertical-only / horizontal-only special cases
+ vld1.u8 {d31}, [r4] //load A/B/C/D
+ vld1.u8 {q0}, [r0], r1 //src[x]
- vdup.u8 d28, d31[0] //A
- vdup.u8 d29, d31[1] //B
- vdup.u8 d30, d31[2] //C
- vdup.u8 d31, d31[3] //D
+ vdup.u8 d28, d31[0] //A
+ vdup.u8 d29, d31[1] //B
+ vdup.u8 d30, d31[2] //C
+ vdup.u8 d31, d31[3] //D
- vext.u8 d1, d0, d1, #1 //src[x+1]
+ vext.u8 d1, d0, d1, #1 //src[x+1]
-w8_mc_chroma_loop: // each two pxl row
- vld1.u8 {q1}, [r0], r1 //src[x+stride]
- vld1.u8 {q2}, [r0], r1 //src[x+2*stride]
- vext.u8 d3, d2, d3, #1 //src[x+stride+1]
- vext.u8 d5, d4, d5, #1 //src[x+2*stride+1]
+w8_mc_chroma_loop: // processes two pixel rows per iteration
+ vld1.u8 {q1}, [r0], r1 //src[x+stride]
+ vld1.u8 {q2}, [r0], r1 //src[x+2*stride]
+ vext.u8 d3, d2, d3, #1 //src[x+stride+1]
+ vext.u8 d5, d4, d5, #1 //src[x+2*stride+1]
- vmull.u8 q3, d0, d28 //(src[x] * A)
- vmlal.u8 q3, d1, d29 //+=(src[x+1] * B)
- vmlal.u8 q3, d2, d30 //+=(src[x+stride] * C)
- vmlal.u8 q3, d3, d31 //+=(src[x+stride+1] * D)
- vrshrn.u16 d6, q3, #6
- vst1.u8 d6, [r2], r3
+ vmull.u8 q3, d0, d28 //(src[x] * A)
+ vmlal.u8 q3, d1, d29 //+=(src[x+1] * B)
+ vmlal.u8 q3, d2, d30 //+=(src[x+stride] * C)
+ vmlal.u8 q3, d3, d31 //+=(src[x+stride+1] * D)
+ vrshrn.u16 d6, q3, #6
+ vst1.u8 d6, [r2], r3
- vmull.u8 q3, d2, d28 //(src[x] * A)
- vmlal.u8 q3, d3, d29 //+=(src[x+1] * B)
- vmlal.u8 q3, d4, d30 //+=(src[x+stride] * C)
- vmlal.u8 q3, d5, d31 //+=(src[x+stride+1] * D)
- vrshrn.u16 d6, q3, #6
- vst1.u8 d6, [r2], r3
+ vmull.u8 q3, d2, d28 //(src[x] * A)
+ vmlal.u8 q3, d3, d29 //+=(src[x+1] * B)
+ vmlal.u8 q3, d4, d30 //+=(src[x+stride] * C)
+ vmlal.u8 q3, d5, d31 //+=(src[x+stride+1] * D)
+ vrshrn.u16 d6, q3, #6
+ vst1.u8 d6, [r2], r3
- vmov q0, q2
- sub r5, #2
- cmp r5, #0
- bne w8_mc_chroma_loop
+ vmov q0, q2
+ sub r5, #2
+ cmp r5, #0
+ bne w8_mc_chroma_loop
- pop {r4, r5}
+ pop {r4, r5}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McChromaWidthEq4_neon
- push {r4, r5, r6}
- ldr r4, [sp, #12]
- ldr r6, [sp, #16]
-// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
-// we can opti it by adding vert only/ hori only cases, to be continue
- vld1.u8 {d31}, [r4] //load A/B/C/D
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+ ldr r6, [sp, #16]
+// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*src[x+stride+1]}
+// this could be optimized further by adding vertical-only / horizontal-only special cases
+ vld1.u8 {d31}, [r4] //load A/B/C/D
- vdup.u8 d28, d31[0] //A
- vdup.u8 d29, d31[1] //B
- vdup.u8 d30, d31[2] //C
- vdup.u8 d31, d31[3] //D
+ vdup.u8 d28, d31[0] //A
+ vdup.u8 d29, d31[1] //B
+ vdup.u8 d30, d31[2] //C
+ vdup.u8 d31, d31[3] //D
-w4_mc_chroma_loop: // each two pxl row
- vld1.u8 {d0}, [r0], r1 //a::src[x]
- vld1.u8 {d2}, [r0], r1 //b::src[x+stride]
- vld1.u8 {d4}, [r0] //c::src[x+2*stride]
+w4_mc_chroma_loop: // processes two pixel rows per iteration
+ vld1.u8 {d0}, [r0], r1 //a::src[x]
+ vld1.u8 {d2}, [r0], r1 //b::src[x+stride]
+ vld1.u8 {d4}, [r0] //c::src[x+2*stride]
- vshr.u64 d1, d0, #8
- vshr.u64 d3, d2, #8
- vshr.u64 d5, d4, #8
+ vshr.u64 d1, d0, #8
+ vshr.u64 d3, d2, #8
+ vshr.u64 d5, d4, #8
- vmov q3, q1 //b::[0:7]+b::[1~8]
- vtrn.32 q0, q1 //d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
- vtrn.32 q3, q2 //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
+ vmov q3, q1 //b::[0:7]+b::[1~8]
+ vtrn.32 q0, q1 //d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
+ vtrn.32 q3, q2 //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
- vmull.u8 q1, d0, d28 //(src[x] * A)
- vmlal.u8 q1, d1, d29 //+=(src[x+1] * B)
- vmlal.u8 q1, d6, d30 //+=(src[x+stride] * C)
- vmlal.u8 q1, d7, d31 //+=(src[x+stride+1] * D)
+ vmull.u8 q1, d0, d28 //(src[x] * A)
+ vmlal.u8 q1, d1, d29 //+=(src[x+1] * B)
+ vmlal.u8 q1, d6, d30 //+=(src[x+stride] * C)
+ vmlal.u8 q1, d7, d31 //+=(src[x+stride+1] * D)
- vrshrn.u16 d2, q1, #6
- vmov r4, r5, d2
- str r4, [r2], r3
- str r5, [r2], r3
+ vrshrn.u16 d2, q1, #6
+ vmov r4, r5, d2
+ str r4, [r2], r3
+ str r5, [r2], r3
- sub r6, #2
- cmp r6, #0
- bne w4_mc_chroma_loop
+ sub r6, #2
+ cmp r6, #0
+ bne w4_mc_chroma_loop
- pop {r4, r5, r6}
+ pop {r4, r5, r6}
WELS_ASM_FUNC_END
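Both chroma functions implement the weighted 2x2 interpolation named in the "normal case" comment, with vrshrn.u16 #6 providing the (acc + 32) >> 6 rounding; because the four weights A, B, C, D sum to 64, no clamp is needed. A scalar sketch with an illustrative name and signature:

#include <stdint.h>

static void mc_chroma_c(const uint8_t *src, int src_stride,
                        uint8_t *dst, int dst_stride,
                        const uint8_t abcd[4], int w, int h) {
    const int A = abcd[0], B = abcd[1], C = abcd[2], D = abcd[3];
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int acc = A * src[x] + B * src[x + 1]
                    + C * src[x + src_stride] + D * src[x + src_stride + 1];
            dst[x] = (uint8_t)((acc + 32) >> 6);
        }
        src += src_stride;
        dst += dst_stride;
    }
}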
WELS_ASM_FUNC_BEGIN McHorVer20Width17_neon
- push {r4-r5}
- mov r4, #20
- mov r5, #1
- sub r4, r4, r4, lsl #(16-2)
- lsl r5, #16
- ror r4, #16
- vmov d3, r5, r4 // 0x0014FFFB00010000
+ push {r4-r5}
+ mov r4, #20
+ mov r5, #1
+ sub r4, r4, r4, lsl #(16-2)
+ lsl r5, #16
+ ror r4, #16
+ vmov d3, r5, r4 // 0x0014FFFB00010000
- sub r3, #16
- ldr r4, [sp, #8]
+ sub r3, #16
+ ldr r4, [sp, #8]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w17_h_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 22(17+5); q0=src[-2]
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 22(17+5); q0=src[-2]
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q8, q0, q1, #3 //q8=src[1]
- vext.8 q9, q0, q1, #4 //q9=src[2]
- vext.8 q10, q0, q1, #5 //q10=src[3]
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q8, q0, q1, #3 //q8=src[1]
+ vext.8 q9, q0, q1, #4 //q9=src[2]
+ vext.8 q10, q0, q1, #5 //q10=src[3]
- FILTER_6TAG_8BITS d0, d4, d6, d16, d18, d20, d22, q14, q15
+ FILTER_6TAG_8BITS d0, d4, d6, d16, d18, d20, d22, q14, q15
- FILTER_6TAG_8BITS d1, d5, d7, d17, d19, d21, d23, q14, q15
+ FILTER_6TAG_8BITS d1, d5, d7, d17, d19, d21, d23, q14, q15
- vst1.u8 {d22, d23}, [r2]! //write [0:15] Byte
+ vst1.u8 {d22, d23}, [r2]! //write [0:15] Byte
- vsli.64 d2, d2, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
- FILTER_SINGLE_TAG_8BITS d2, d3, d22, q11, q1
+ vsli.64 d2, d2, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+ FILTER_SINGLE_TAG_8BITS d2, d3, d22, q11, q1
- vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
+ vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
- sub r4, #1
- cmp r4, #0
- bne w17_h_mc_luma_loop
- pop {r4-r5}
+ sub r4, #1
+ cmp r4, #0
+ bne w17_h_mc_luma_loop
+ pop {r4-r5}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer20Width9_neon
- push {r4-r5}
- mov r4, #20
- mov r5, #1
- sub r4, r4, r4, lsl #(16-2)
- lsl r5, #16
- ror r4, #16
- vmov d7, r5, r4 // 0x0014FFFB00010000
+ push {r4-r5}
+ mov r4, #20
+ mov r5, #1
+ sub r4, r4, r4, lsl #(16-2)
+ lsl r5, #16
+ ror r4, #16
+ vmov d7, r5, r4 // 0x0014FFFB00010000
- sub r3, #8
- ldr r4, [sp, #8]
+ sub r3, #8
+ ldr r4, [sp, #8]
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
w9_h_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 14(9+5); q0=src[-2]
- pld [r0]
+ vld1.u8 {d0,d1}, [r0], r1 //only use 14(9+5); q0=src[-2]
+ pld [r0]
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
- FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d16, q14, q15
+ FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d16, q14, q15
- sub r4, #1
- vst1.u8 {d16}, [r2]! //write [0:7] Byte
+ sub r4, #1
+ vst1.u8 {d16}, [r2]! //write [0:7] Byte
- vsli.64 d2, d1, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
- FILTER_SINGLE_TAG_8BITS d2, d7, d18, q9, q1
- vst1.u8 {d2[0]}, [r2], r3 //write 8th Byte
+ vsli.64 d2, d1, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+ FILTER_SINGLE_TAG_8BITS d2, d7, d18, q9, q1
+ vst1.u8 {d2[0]}, [r2], r3 //write 8th Byte
- cmp r4, #0
- bne w9_h_mc_luma_loop
- pop {r4-r5}
+ cmp r4, #0
+ bne w9_h_mc_luma_loop
+ pop {r4-r5}
WELS_ASM_FUNC_END
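McHorVer20Width17_neon and McHorVer20Width9_neon filter 16 (respectively 8) columns per row with the vector macros and produce the final odd column with FILTER_SINGLE_TAG_8BITS; the mov/sub/lsl/ror sequence at the top of each function packs the halfword constants {0, 1, -5, 20} (0x0014FFFB00010000) that the single-column filter multiplies against. In scalar form every column is simply the same horizontal six-tap; the sketch below is illustrative, not the project's API.

#include <stdint.h>

static uint8_t clip255(int v) {
    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* Horizontal-only half-pel ("20" case) for one row of odd width (17 or 9). */
static void mc_luma_hor_row_c(const uint8_t *src, uint8_t *dst, int w) {
    for (int x = 0; x < w; x++) {
        int acc = src[x - 2] + src[x + 3]
                - 5 * (src[x - 1] + src[x + 2])
                + 20 * (src[x] + src[x + 1]);
        dst[x] = clip255((acc + 16) >> 5);
    }
}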
WELS_ASM_FUNC_BEGIN McHorVer02Height17_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q8}, [r0], r1 //q8=src[2]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q8}, [r0], r1 //q8=src[2]
w17_v_mc_luma_loop:
- vld1.u8 {q9}, [r0], r1 //q9=src[3]
+ vld1.u8 {q9}, [r0], r1 //q9=src[3]
- FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
- FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
+ FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
- FILTER_6TAG_8BITS d4, d6, d16, d18, d0, d2, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d5, d7, d17, d19, d1, d3, d21, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
+ FILTER_6TAG_8BITS d4, d6, d16, d18, d0, d2, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d5, d7, d17, d19, d1, d3, d21, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
- FILTER_6TAG_8BITS d6, d16, d18, d0, d2, d4, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d7, d17, d19, d1, d3, d5, d21, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
+ FILTER_6TAG_8BITS d6, d16, d18, d0, d2, d4, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d7, d17, d19, d1, d3, d5, d21, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
- FILTER_6TAG_8BITS d16, d18, d0, d2, d4, d6, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d17, d19, d1, d3, d5, d7, d21, q14, q15
- vld1.u8 {q8}, [r0], r1 //read 6th row
- vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
+ FILTER_6TAG_8BITS d16, d18, d0, d2, d4, d6, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d17, d19, d1, d3, d5, d7, d21, q14, q15
+ vld1.u8 {q8}, [r0], r1 //read 6th row
+ vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
- FILTER_6TAG_8BITS d18, d0, d2, d4, d6, d16, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d19, d1, d3, d5, d7, d17, d21, q14, q15
- vld1.u8 {q9}, [r0], r1 //read 7th row
- vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
+ FILTER_6TAG_8BITS d18, d0, d2, d4, d6, d16, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d19, d1, d3, d5, d7, d17, d21, q14, q15
+ vld1.u8 {q9}, [r0], r1 //read 7th row
+ vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
- FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
- FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
- vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
+ FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
- //q2, q3, q8, q9, q0 --> q0~q8
- vswp q0, q8
- vswp q0, q2
- vmov q1, q3
- vmov q3, q9 //q0~q8
+ //q2, q3, q8, q9, q0 --> q0~q8
+ vswp q0, q8
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q9 //q0~q8
- sub r4, #8
- cmp r4, #1
- bne w17_v_mc_luma_loop
- // the last 16Bytes
- vld1.u8 {q9}, [r0], r1 //q9=src[3]
- FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
- FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
- vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
+ sub r4, #8
+ cmp r4, #1
+ bne w17_v_mc_luma_loop
+ // the last 16Bytes
+ vld1.u8 {q9}, [r0], r1 //q9=src[3]
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer02Height9_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
w9_v_mc_luma_loop:
- pld [r0]
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
- pld [r0]
- FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
- pld [r0]
- FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
- pld [r0]
- FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
- sub r4, #4
- cmp r4, #1
- bne w9_v_mc_luma_loop
+ sub r4, #4
+ cmp r4, #1
+ bne w9_v_mc_luma_loop
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
- vst1.u8 {d16}, [r2], r3 //write last 8Byte
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vst1.u8 {d16}, [r2], r3 //write last 8Byte
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer22Width17_neon
- push {r4}
- vpush {q4-q7}
- ldr r4, [sp, #68]
+ push {r4}
+ vpush {q4-q7}
+ ldr r4, [sp, #68]
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0-d2}, [r0], r1 //use 21(17+5), =src[-2]
- vld1.u8 {d3-d5}, [r0], r1 //use 21(17+5), =src[-1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0-d2}, [r0], r1 //use 21(17+5), =src[-2]
+ vld1.u8 {d3-d5}, [r0], r1 //use 21(17+5), =src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d6-d8}, [r0], r1 //use 21(17+5), =src[0]
- vld1.u8 {d9-d11}, [r0], r1 //use 21(17+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {d12-d14}, [r0], r1 //use 21(17+5), =src[2]
- sub r3, #16
+ vld1.u8 {d6-d8}, [r0], r1 //use 21(17+5), =src[0]
+ vld1.u8 {d9-d11}, [r0], r1 //use 21(17+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {d12-d14}, [r0], r1 //use 21(17+5), =src[2]
+ sub r3, #16
w17_hv_mc_luma_loop:
- vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
- vst1.u8 {d0, d1}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
- vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
+ vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
+ vst1.u8 {d0, d1}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
+ vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
- vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
- vst1.u8 {d3, d4}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d5, d22, d23, q11 //output to d5[0]
- vst1.u8 {d5[0]}, [r2], r3 //write 16th Byte
+ vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
+ vst1.u8 {d3, d4}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d5, d22, d23, q11 //output to d5[0]
+ vst1.u8 {d5[0]}, [r2], r3 //write 16th Byte
- vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
- vst1.u8 {d6, d7}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d8, d22, d23, q11 //output to d8[0]
- vst1.u8 {d8[0]}, [r2], r3 //write 16th Byte
+ vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
+ vst1.u8 {d6, d7}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d8, d22, d23, q11 //output to d8[0]
+ vst1.u8 {d8[0]}, [r2], r3 //write 16th Byte
- vld1.u8 {d6-d8}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
- vst1.u8 {d9, d10}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d11, d22, d23, q11 //output to d11[0]
- vst1.u8 {d11[0]}, [r2], r3 //write 16th Byte
+ vld1.u8 {d6-d8}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
+ vst1.u8 {d9, d10}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d11, d22, d23, q11 //output to d11[0]
+ vst1.u8 {d11[0]}, [r2], r3 //write 16th Byte
- //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
- vswp q0, q6
- vswp q6, q3
- vmov q5, q2
- vmov q2, q8
+ //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+ vswp q0, q6
+ vswp q6, q3
+ vmov q5, q2
+ vmov q2, q8
- vmov d20,d8
- vmov q4, q1
- vmov q1, q7
- vmov d14,d20
+ vmov d20,d8
+ vmov q4, q1
+ vmov q1, q7
+ vmov d14,d20
- sub r4, #4
- cmp r4, #1
- bne w17_hv_mc_luma_loop
- //the last row
- vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
- vst1.u8 {q0}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
- vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
+ sub r4, #4
+ cmp r4, #1
+ bne w17_hv_mc_luma_loop
+ //the last row
+ vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
+ vst1.u8 {q0}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
+ vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
- vpop {q4-q7}
- pop {r4}
+ vpop {q4-q7}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN McHorVer22Width9_neon
- push {r4}
- vpush {q4}
- ldr r4, [sp, #20]
+ push {r4}
+ vpush {q4}
+ ldr r4, [sp, #20]
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //use 14(9+5), =src[-2]
- vld1.u8 {q1}, [r0], r1 //use 14(9+5), =src[-1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //use 14(9+5), =src[-2]
+ vld1.u8 {q1}, [r0], r1 //use 14(9+5), =src[-1]
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //use 14(9+5), =src[0]
- vld1.u8 {q3}, [r0], r1 //use 14(9+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {q4}, [r0], r1 //use 14(9+5), =src[2]
- sub r3, #8
+ vld1.u8 {q2}, [r0], r1 //use 14(9+5), =src[0]
+ vld1.u8 {q3}, [r0], r1 //use 14(9+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {q4}, [r0], r1 //use 14(9+5), =src[2]
+ sub r3, #8
w9_hv_mc_luma_loop:
- vld1.u8 {q8}, [r0], r1 //use 14(9+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
- vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
+ vld1.u8 {q8}, [r0], r1 //use 14(9+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
+ vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, d0, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
- vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, d0, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
+ vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
- vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
+ vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
- vld1.u8 {q2}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
- vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
+ vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
- //q4~q8, q0~q2, --> q0~q4
- vswp q0, q4
- vswp q2, q4
- vmov q3, q1
- vmov q1, q8
+ //q4~q8, q0~q2, --> q0~q4
+ vswp q0, q4
+ vswp q2, q4
+ vmov q3, q1
+ vmov q1, q8
- sub r4, #4
- cmp r4, #1
- bne w9_hv_mc_luma_loop
- //the last row
- vld1.u8 {q8}, [r0], r1 //use 14(9+5), =src[3]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
- vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
- vpop {q4}
- pop {r4}
+ sub r4, #4
+ cmp r4, #1
+ bne w9_hv_mc_luma_loop
+ //the last row
+ vld1.u8 {q8}, [r0], r1 //use 14(9+5), =src[3]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
+ vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
+ vpop {q4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq16_neon
- push {r4, r5, r6}
- ldr r4, [sp, #12]
- ldr r5, [sp, #16]
- ldr r6, [sp, #20]
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #16]
+ ldr r6, [sp, #20]
enc_w16_pix_avg_loop:
- vld1.u8 {q0}, [r2], r3
- vld1.u8 {q1}, [r4], r5
- vld1.u8 {q2}, [r2], r3
- vld1.u8 {q3}, [r4], r5
+ vld1.u8 {q0}, [r2], r3
+ vld1.u8 {q1}, [r4], r5
+ vld1.u8 {q2}, [r2], r3
+ vld1.u8 {q3}, [r4], r5
- vld1.u8 {q8}, [r2], r3
- vld1.u8 {q9}, [r4], r5
- vld1.u8 {q10}, [r2], r3
- vld1.u8 {q11}, [r4], r5
+ vld1.u8 {q8}, [r2], r3
+ vld1.u8 {q9}, [r4], r5
+ vld1.u8 {q10}, [r2], r3
+ vld1.u8 {q11}, [r4], r5
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {q0}, [r0], r1
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {q0}, [r0], r1
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {q2}, [r0], r1
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {q2}, [r0], r1
- AVERAGE_TWO_8BITS d16, d16, d18
- AVERAGE_TWO_8BITS d17, d17, d19
- vst1.u8 {q8}, [r0], r1
+ AVERAGE_TWO_8BITS d16, d16, d18
+ AVERAGE_TWO_8BITS d17, d17, d19
+ vst1.u8 {q8}, [r0], r1
- AVERAGE_TWO_8BITS d20, d20, d22
- AVERAGE_TWO_8BITS d21, d21, d23
- vst1.u8 {q10}, [r0], r1
+ AVERAGE_TWO_8BITS d20, d20, d22
+ AVERAGE_TWO_8BITS d21, d21, d23
+ vst1.u8 {q10}, [r0], r1
- sub r6, #4
- cmp r6, #0
- bne enc_w16_pix_avg_loop
+ sub r6, #4
+ cmp r6, #0
+ bne enc_w16_pix_avg_loop
- pop {r4, r5, r6}
+ pop {r4, r5, r6}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq8_neon
- push {r4, r5, r6}
- ldr r4, [sp, #12]
- ldr r5, [sp, #16]
- ldr r6, [sp, #20]
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #16]
+ ldr r6, [sp, #20]
enc_w8_pix_avg_loop:
- vld1.u8 {d0}, [r2], r3
- vld1.u8 {d2}, [r4], r5
- vld1.u8 {d1}, [r2], r3
- vld1.u8 {d3}, [r4], r5
+ vld1.u8 {d0}, [r2], r3
+ vld1.u8 {d2}, [r4], r5
+ vld1.u8 {d1}, [r2], r3
+ vld1.u8 {d3}, [r4], r5
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {d0}, [r0], r1
- vst1.u8 {d1}, [r0], r1
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {d0}, [r0], r1
+ vst1.u8 {d1}, [r0], r1
- vld1.u8 {d4}, [r2], r3
- vld1.u8 {d6}, [r4], r5
- vld1.u8 {d5}, [r2], r3
- vld1.u8 {d7}, [r4], r5
+ vld1.u8 {d4}, [r2], r3
+ vld1.u8 {d6}, [r4], r5
+ vld1.u8 {d5}, [r2], r3
+ vld1.u8 {d7}, [r4], r5
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {d4}, [r0], r1
- vst1.u8 {d5}, [r0], r1
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {d4}, [r0], r1
+ vst1.u8 {d5}, [r0], r1
- sub r6, #4
- cmp r6, #0
- bne enc_w8_pix_avg_loop
+ sub r6, #4
+ cmp r6, #0
+ bne enc_w8_pix_avg_loop
- pop {r4, r5, r6}
+ pop {r4, r5, r6}
WELS_ASM_FUNC_END
#endif
--- a/codec/common/arm64/expand_picture_aarch64_neon.S
+++ b/codec/common/arm64/expand_picture_aarch64_neon.S
@@ -53,88 +53,88 @@
sub x8, x8, #1
cbnz x8, _expand_picture_luma_loop2
//for the top and bottom expand
- add x2, x2, #64
- sub x0, x0, #32
+ add x2, x2, #64
+ sub x0, x0, #32
madd x4, x1, x3, x0
sub x4, x4, x1
_expand_picture_luma_loop0:
- mov x5, #32
+ mov x5, #32
msub x5, x5, x1, x0
- add x6, x4, x1
+ add x6, x4, x1
ld1 {v0.16b}, [x0], x10
ld1 {v1.16b}, [x4], x10
- mov x8, #32
+ mov x8, #32
_expand_picture_luma_loop1:
- st1 {v0.16b}, [x5], x1
- st1 {v1.16b}, [x6], x1
- sub x8, x8, #1
+ st1 {v0.16b}, [x5], x1
+ st1 {v1.16b}, [x6], x1
+ sub x8, x8, #1
cbnz x8, _expand_picture_luma_loop1
- sub x2, x2, #16
- cbnz x2, _expand_picture_luma_loop0
+ sub x2, x2, #16
+ cbnz x2, _expand_picture_luma_loop0
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN ExpandPictureChroma_AArch64_neon
- //Save the dst
- mov x7, x0
- mov x8, x3
+ //Save the dst
+ mov x7, x0
+ mov x8, x3
mov x10, #16
- add x4, x7, x2
- sub x4, x4, #1
+ add x4, x7, x2
+ sub x4, x4, #1
//For the left and right expand
_expand_picture_chroma_loop2:
- sub x5, x7, #16
- add x6, x4, #1
+ sub x5, x7, #16
+ add x6, x4, #1
- ld1r {v0.16b}, [x7], x1
- ld1r {v1.16b}, [x4], x1
+ ld1r {v0.16b}, [x7], x1
+ ld1r {v1.16b}, [x4], x1
- st1 {v0.16b}, [x5]
- st1 {v1.16b}, [x6]
- sub x8, x8, #1
- cbnz x8, _expand_picture_chroma_loop2
+ st1 {v0.16b}, [x5]
+ st1 {v1.16b}, [x6]
+ sub x8, x8, #1
+ cbnz x8, _expand_picture_chroma_loop2
- //for the top and bottom expand
- add x2, x2, #32
+ //for the top and bottom expand
+ add x2, x2, #32
//
mov x9, x2
mov x11, #15
bic x2, x2, x11
//
- sub x0, x0, #16
- madd x4, x1, x3, x0
- sub x4, x4, x1
+ sub x0, x0, #16
+ madd x4, x1, x3, x0
+ sub x4, x4, x1
_expand_picture_chroma_loop0:
- mov x5, #16
+ mov x5, #16
msub x5, x5, x1, x0
- add x6, x4, x1
- ld1 {v0.16b}, [x0], x10
- ld1 {v1.16b}, [x4], x10
+ add x6, x4, x1
+ ld1 {v0.16b}, [x0], x10
+ ld1 {v1.16b}, [x4], x10
- mov x8, #16
+ mov x8, #16
_expand_picture_chroma_loop1:
- st1 {v0.16b}, [x5], x1
- st1 {v1.16b}, [x6], x1
- sub x8, x8, #1
+ st1 {v0.16b}, [x5], x1
+ st1 {v1.16b}, [x6], x1
+ sub x8, x8, #1
cbnz x8, _expand_picture_chroma_loop1
- sub x2, x2, #16
- cbnz x2, _expand_picture_chroma_loop0
+ sub x2, x2, #16
+ cbnz x2, _expand_picture_chroma_loop0
and x9, x9, #15
sub x9, x9, #8
cbnz x9, _expand_picture_chroma_end
- mov x5, #16
+ mov x5, #16
msub x5, x5, x1, x0
- add x6, x4, x1
- ld1 {v0.8b}, [x0]
- ld1 {v1.8b}, [x4]
+ add x6, x4, x1
+ ld1 {v0.8b}, [x0]
+ ld1 {v1.8b}, [x4]
- mov x8, #16
+ mov x8, #16
_expand_picture_chroma_loop3:
- st1 {v0.8b}, [x5], x1
- st1 {v1.8b}, [x6], x1
- sub x8, x8, #1
+ st1 {v0.8b}, [x5], x1
+ st1 {v1.8b}, [x6], x1
+ sub x8, x8, #1
cbnz x8, _expand_picture_chroma_loop3
_expand_picture_chroma_end:
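
The ExpandPictureLuma/Chroma routines reformatted above pad a decoded plane: the left and right margins replicate the first and last pixel of each row, and the top and bottom margins copy the (already widened) first and last rows outward. A hedged C sketch follows; the function and parameter names are assumptions, and iPadding would be 32 for luma and 16 for chroma as in the constants above.

#include <string.h>
#include <stdint.h>

static void ExpandPlane_c (uint8_t* pPic, int iStride, int iWidth, int iHeight, int iPadding) {
  for (int y = 0; y < iHeight; y++) {
    uint8_t* row = pPic + y * iStride;
    memset (row - iPadding, row[0],          iPadding);   /* left margin  */
    memset (row + iWidth,   row[iWidth - 1], iPadding);   /* right margin */
  }
  uint8_t* top    = pPic - iPadding;                          /* widened first row */
  uint8_t* bottom = pPic + (iHeight - 1) * iStride - iPadding; /* widened last row  */
  for (int y = 1; y <= iPadding; y++) {
    memcpy (top    - y * iStride, top,    iWidth + 2 * iPadding);  /* rows above */
    memcpy (bottom + y * iStride, bottom, iWidth + 2 * iPadding);  /* rows below */
  }
}

The extra chroma_loop3 path above handles planes whose width is not a multiple of 16 by copying one final 8-byte column; the sketch omits that special case.
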
--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -38,32 +38,32 @@
#ifdef __APPLE__
-.macro FILTER_6TAG_8BITS1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS1
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
- uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
+ uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun $6.8b, v18.8h, #5
-// }
+// }
.endm
-.macro FILTER_6TAG_8BITS2
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS2
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
+ uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun2 $6.16b, v18.8h, #5
-// }
+// }
.endm
-.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
- uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
+ uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -70,13 +70,13 @@
sqrshrun $6.8b, v18.8h, #5
uaddl v19.8h, $2.8b, $6.8b
rshrn $6.8b, v19.8h, #1
-// }
+// }
.endm
-.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
+ uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -83,13 +83,13 @@
sqrshrun2 $6.16b, v18.8h, #5
uaddl2 v19.8h, $2.16b, $6.16b
rshrn2 $6.16b, v19.8h, #1
-// }
+// }
.endm
-.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
- uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
+ uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -96,13 +96,13 @@
sqrshrun $6.8b, v18.8h, #5
uaddl v19.8h, $3.8b, $6.8b
rshrn $6.8b, v19.8h, #1
-// }
+// }
.endm
-.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
+ uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -109,134 +109,134 @@
sqrshrun2 $6.16b, v18.8h, #5
uaddl2 v19.8h, $3.16b, $6.16b
rshrn2 $6.16b, v19.8h, #1
-// }
+// }
.endm
-.macro FILTER_6TAG_8BITS_TO_16BITS1
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
- uaddl $6.8h, $0.8b, $5.8b //dst_q=src[-2]+src[3]
- uaddl v31.8h, $2.8b, $3.8b //src[0]+src[1]
- mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
- uaddl v31.8h, $1.8b, $4.8b //src[-1]+src[2]
- mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
+.macro FILTER_6TAG_8BITS_TO_16BITS1
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+ uaddl $6.8h, $0.8b, $5.8b //dst_q=src[-2]+src[3]
+ uaddl v31.8h, $2.8b, $3.8b //src[0]+src[1]
+ mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
+ uaddl v31.8h, $1.8b, $4.8b //src[-1]+src[2]
+ mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
.endm
-.macro FILTER_6TAG_8BITS_TO_16BITS2
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
- uaddl2 $6.8h, $0.16b, $5.16b //dst_q=src[-2]+src[3]
- uaddl2 v31.8h, $2.16b, $3.16b //src[0]+src[1]
- mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
- uaddl2 v31.8h, $1.16b, $4.16b //src[-1]+src[2]
- mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
+.macro FILTER_6TAG_8BITS_TO_16BITS2
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+ uaddl2 $6.8h, $0.16b, $5.16b //dst_q=src[-2]+src[3]
+ uaddl2 v31.8h, $2.16b, $3.16b //src[0]+src[1]
+ mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
+ uaddl2 v31.8h, $1.16b, $4.16b //src[-1]+src[2]
+ mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
.endm
-.macro FILTER_3_IN_16BITS_TO_8BITS1
-// { // input:a, b, c, dst_d;
- sub $0.8h, $0.8h, $1.8h //a-b
- sshr $0.8h, $0.8h, #2 //(a-b)/4
- sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
- add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
- sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
- add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- sqrshrun $3.8b, $0.8h, #6 //(+32)>>6
-// }
+.macro FILTER_3_IN_16BITS_TO_8BITS1
+// { // input:a, b, c, dst_d;
+ sub $0.8h, $0.8h, $1.8h //a-b
+ sshr $0.8h, $0.8h, #2 //(a-b)/4
+ sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
+ add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
+ sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
+ add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ sqrshrun $3.8b, $0.8h, #6 //(+32)>>6
+// }
.endm
-.macro FILTER_3_IN_16BITS_TO_8BITS2
-// { // input:a, b, c, dst_d;
- sub $0.8h, $0.8h, $1.8h //a-b
- sshr $0.8h, $0.8h, #2 //(a-b)/4
- sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
- add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
- sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
- add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- sqrshrun2 $3.16b, $0.8h, #6 //(+32)>>6
-// }
+.macro FILTER_3_IN_16BITS_TO_8BITS2
+// { // input:a, b, c, dst_d;
+ sub $0.8h, $0.8h, $1.8h //a-b
+ sshr $0.8h, $0.8h, #2 //(a-b)/4
+ sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
+ add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
+ sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
+ add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ sqrshrun2 $3.16b, $0.8h, #6 //(+32)>>6
+// }
.endm
-.macro UNPACK_2_16BITS_TO_ABC
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
- ext $4.16b, $0.16b, $1.16b, #4 //src[0]
- ext $3.16b, $0.16b, $1.16b, #6 //src[1]
- add $4.8h, $4.8h, $3.8h //c=src[0]+src[1]
+.macro UNPACK_2_16BITS_TO_ABC
+// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+ ext $4.16b, $0.16b, $1.16b, #4 //src[0]
+ ext $3.16b, $0.16b, $1.16b, #6 //src[1]
+ add $4.8h, $4.8h, $3.8h //c=src[0]+src[1]
- ext $3.16b, $0.16b, $1.16b, #2 //src[-1]
- ext $2.16b, $0.16b, $1.16b, #8 //src[2]
- add $3.8h, $3.8h, $2.8h //b=src[-1]+src[2]
+ ext $3.16b, $0.16b, $1.16b, #2 //src[-1]
+ ext $2.16b, $0.16b, $1.16b, #8 //src[2]
+ add $3.8h, $3.8h, $2.8h //b=src[-1]+src[2]
- ext $2.16b, $0.16b, $1.16b, #10 //src[3]
- add $2.8h, $2.8h, $0.8h //a=src[-2]+src[3]
-// }
+ ext $2.16b, $0.16b, $1.16b, #10 //src[3]
+ add $2.8h, $2.8h, $0.8h //a=src[-2]+src[3]
+// }
.endm
-.macro AVERAGE_TWO_8BITS1
-// { // input:dst_d, src_d A and B; working: v5
- uaddl v30.8h, $2.8b, $1.8b
- rshrn $0.8b, v30.8h, #1
-// }
+.macro AVERAGE_TWO_8BITS1
+// { // input:dst_d, src_d A and B; working: v5
+ uaddl v30.8h, $2.8b, $1.8b
+ rshrn $0.8b, v30.8h, #1
+// }
.endm
-.macro AVERAGE_TWO_8BITS2
-// { // input:dst_d, src_d A and B; working: v5
- uaddl2 v30.8h, $2.16b, $1.16b
- rshrn2 $0.16b, v30.8h, #1
-// }
+.macro AVERAGE_TWO_8BITS2
+// { // input:dst_d, src_d A and B; working: v5
+ uaddl2 v30.8h, $2.16b, $1.16b
+ rshrn2 $0.16b, v30.8h, #1
+// }
.endm
-.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
-// { // input: src_d{Y[0][1][2][3][4][5]X},
- rev64 $2.8b, $0.8b // X[5][4][3][2][1][0]O
- uaddl $2.8h, $0.8b, $2.8b // each 16bits, *[50][41][32][23][14][05]*
- mul $2.4h, $2.4h, $1.4h // 0+1*[50]-5*[41]+20[32]
+.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
+// { // input: src_d{Y[0][1][2][3][4][5]X},
+ rev64 $2.8b, $0.8b // X[5][4][3][2][1][0]O
+ uaddl $2.8h, $0.8b, $2.8b // each 16bits, *[50][41][32][23][14][05]*
+ mul $2.4h, $2.4h, $1.4h // 0+1*[50]-5*[41]+20[32]
addv $3, $2.4h
sqrshrun $0.8b, $0.8h, #5
-// }
+// }
.endm
-.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
-// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
+.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
+// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
ext.16b $3, $1, $1, #14 // X[0][1][2][3][4][5]O
ext.16b $4, $3, $3, #8 // [3][4][5]OX[0][1][2]
- rev64 $4.8h, $4.8h // X[5][4][3][2][1][0]O
+ rev64 $4.8h, $4.8h // X[5][4][3][2][1][0]O
add $3.8h, $3.8h, $4.8h // each 16bits, *[50][41][32][23][14][05]*
- smull $3.4s, $3.4h, $2.4h // 0+1*[50]-5*[41]+20[32]
+ smull $3.4s, $3.4h, $2.4h // 0+1*[50]-5*[41]+20[32]
saddlv $5, $3.4s
//sshr $0.2d, $0.2d, #4
sqrshrun $0.2s, $0.2d, #10
uqxtn $0.4h, $0.4s
uqxtn $0.8b, $0.8h
- // }
+ // }
.endm
#else
-.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
- uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
+ uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun \arg6\().8b, v18.8h, #5
-// }
+// }
.endm
-.macro FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
+ uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun2 \arg6\().16b, v18.8h, #5
-// }
+// }
.endm
-.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
- uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
+ uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -243,13 +243,13 @@
sqrshrun \arg6\().8b, v18.8h, #5
uaddl v19.8h, \arg2\().8b, \arg6\().8b
rshrn \arg6\().8b, v19.8h, #1
-// }
+// }
.endm
-.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
+ uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -256,13 +256,13 @@
sqrshrun2 \arg6\().16b, v18.8h, #5
uaddl2 v19.8h, \arg2\().16b, \arg6\().16b
rshrn2 \arg6\().16b, v19.8h, #1
-// }
+// }
.endm
-.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
- uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
+ uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -269,13 +269,13 @@
sqrshrun \arg6\().8b, v18.8h, #5
uaddl v19.8h, \arg3\().8b, \arg6\().8b
rshrn \arg6\().8b, v19.8h, #1
-// }
+// }
.endm
-.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
+ uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -282,106 +282,106 @@
sqrshrun2 \arg6\().16b, v18.8h, #5
uaddl2 v19.8h, \arg3\().16b, \arg6\().16b
rshrn2 \arg6\().16b, v19.8h, #1
-// }
+// }
.endm
-.macro FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
- uaddl \arg6\().8h, \arg0\().8b, \arg5\().8b //dst_q=src[-2]+src[3]
- uaddl v31.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
- mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles
- uaddl v31.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
- mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
+.macro FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+ uaddl \arg6\().8h, \arg0\().8b, \arg5\().8b //dst_q=src[-2]+src[3]
+ uaddl v31.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
+ mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles
+ uaddl v31.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
+ mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
.endm
-.macro FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
- uaddl2 \arg6\().8h, \arg0\().16b, \arg5\().16b //dst_q=src[-2]+src[3]
- uaddl2 v31.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
- mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles
- uaddl2 v31.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
- mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
+.macro FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+ uaddl2 \arg6\().8h, \arg0\().16b, \arg5\().16b //dst_q=src[-2]+src[3]
+ uaddl2 v31.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
+ mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles
+ uaddl2 v31.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
+ mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
.endm
-.macro FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3
-// { // input:a, b, c, dst_d;
- sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b
- sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
- sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b
- add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c
- sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4
- add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- sqrshrun \arg3\().8b, \arg0\().8h, #6 //(+32)>>6
-// }
+.macro FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3
+// { // input:a, b, c, dst_d;
+ sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b
+ sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
+ sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b
+ add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c
+ sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4
+ add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ sqrshrun \arg3\().8b, \arg0\().8h, #6 //(+32)>>6
+// }
.endm
-.macro FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3
-// { // input:a, b, c, dst_d;
- sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b
- sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
- sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b
- add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c
- sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4
- add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- sqrshrun2 \arg3\().16b, \arg0\().8h, #6 //(+32)>>6
-// }
+.macro FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3
+// { // input:a, b, c, dst_d;
+ sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b
+ sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
+ sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b
+ add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c
+ sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4
+ add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ sqrshrun2 \arg3\().16b, \arg0\().8h, #6 //(+32)>>6
+// }
.endm
-.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
- ext \arg4\().16b, \arg0\().16b, \arg1\().16b, #4 //src[0]
- ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #6 //src[1]
- add \arg4\().8h, \arg4\().8h, \arg3\().8h //c=src[0]+src[1]
+.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
+// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+ ext \arg4\().16b, \arg0\().16b, \arg1\().16b, #4 //src[0]
+ ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #6 //src[1]
+ add \arg4\().8h, \arg4\().8h, \arg3\().8h //c=src[0]+src[1]
- ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #2 //src[-1]
- ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #8 //src[2]
- add \arg3\().8h, \arg3\().8h, \arg2\().8h //b=src[-1]+src[2]
+ ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #2 //src[-1]
+ ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #8 //src[2]
+ add \arg3\().8h, \arg3\().8h, \arg2\().8h //b=src[-1]+src[2]
- ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #10 //src[3]
- add \arg2\().8h, \arg2\().8h, \arg0\().8h //a=src[-2]+src[3]
-// }
+ ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #10 //src[3]
+ add \arg2\().8h, \arg2\().8h, \arg0\().8h //a=src[-2]+src[3]
+// }
.endm
-.macro AVERAGE_TWO_8BITS1 arg0, arg1, arg2
-// { // input:dst_d, src_d A and B; working: v5
- uaddl v30.8h, \arg2\().8b, \arg1\().8b
- rshrn \arg0\().8b, v30.8h, #1
-// }
+.macro AVERAGE_TWO_8BITS1 arg0, arg1, arg2
+// { // input:dst_d, src_d A and B; working: v5
+ uaddl v30.8h, \arg2\().8b, \arg1\().8b
+ rshrn \arg0\().8b, v30.8h, #1
+// }
.endm
-.macro AVERAGE_TWO_8BITS2 arg0, arg1, arg2
-// { // input:dst_d, src_d A and B; working: v5
- uaddl2 v30.8h, \arg2\().16b, \arg1\().16b
- rshrn2 \arg0\().16b, v30.8h, #1
-// }
+.macro AVERAGE_TWO_8BITS2 arg0, arg1, arg2
+// { // input:dst_d, src_d A and B; working: v5
+ uaddl2 v30.8h, \arg2\().16b, \arg1\().16b
+ rshrn2 \arg0\().16b, v30.8h, #1
+// }
.endm
-.macro FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3
+.macro FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3
// when width=17/9, used
-// { // input: src_d{Y[0][1][2][3][4][5]X},
- rev64 \arg2\().8b, \arg0\().8b // X[5][4][3][2][1][0]O
- uaddl \arg2\().8h, \arg0\().8b, \arg2\().8b // each 16bits, *[50][41][32][23][14][05]*
- mul \arg2\().4h, \arg2\().4h, \arg1\().4h // 0+1*[50]-5*[41]+20[32]
+// { // input: src_d{Y[0][1][2][3][4][5]X},
+ rev64 \arg2\().8b, \arg0\().8b // X[5][4][3][2][1][0]O
+ uaddl \arg2\().8h, \arg0\().8b, \arg2\().8b // each 16bits, *[50][41][32][23][14][05]*
+ mul \arg2\().4h, \arg2\().4h, \arg1\().4h // 0+1*[50]-5*[41]+20[32]
addv \arg3, \arg2\().4h
sqrshrun \arg0\().8b, \arg0\().8h, #5
-// }
+// }
.endm
-.macro UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5
-// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
+.macro UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5
+// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
ext \arg3\().16b, \arg1\().16b, \arg1\().16b, #14 // X[0][1][2][3][4][5]O
ext \arg4\().16b, \arg3\().16b, \arg3\().16b, #8 // [3][4][5]OX[0][1][2]
- rev64 \arg4\().8h, \arg4\().8h // X[5][4][3][2][1][0]O
+ rev64 \arg4\().8h, \arg4\().8h // X[5][4][3][2][1][0]O
add \arg3\().8h, \arg3\().8h, \arg4\().8h // each 16bits, *[50][41][32][23][14][05]*
- smull \arg3\().4s, \arg3\().4h, \arg2\().4h // 0+1*[50]-5*[41]+20[32]
+ smull \arg3\().4s, \arg3\().4h, \arg2\().4h // 0+1*[50]-5*[41]+20[32]
saddlv \arg5, \arg3\().4s
//sshr \arg0\().2d, \arg0\().2d, #4
sqrshrun \arg0\().2s, \arg0\().2d, #10
uqxtn \arg0\().4h, \arg0\().4s
uqxtn \arg0\().8b, \arg0\().8h
- // }
+ // }
.endm
#endif
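
The FILTER_6TAG_8BITS* macros defined above evaluate the standard H.264 6-tap half-pel luma filter, exactly as their inline comments state: (src[-2]+src[3]) - 5*(src[-1]+src[2]) + 20*(src[0]+src[1]), rounded and narrowed by sqrshrun #5. A scalar reference, with illustrative names only:

#include <stdint.h>

static inline uint8_t Clip255 (int v) {
  return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void HorFilter6Tap_c (uint8_t* dst, const uint8_t* src, int width) {
  for (int x = 0; x < width; x++) {
    int sum = (src[x - 2] + src[x + 3])
            - 5 * (src[x - 1] + src[x + 2])
            + 20 * (src[x]     + src[x + 1]);
    dst[x] = Clip255 ((sum + 16) >> 5);   /* sqrshrun ..., #5: round (+16), shift, saturate to u8 */
  }
}

The _AVERAGE_WITH_0/_1 variants additionally average the filtered value with src[0] or src[1] (the uaddl/rshrn pair at the end of those macros), which produces the quarter-pel positions.
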
@@ -405,7 +405,7 @@
sub x4, x4, #1
st1 {v20.16b}, [x2], x3 //write 16Byte
- cbnz x4, w16_h_mc_luma_loop
+ cbnz x4, w16_h_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq8_AArch64_neon
@@ -426,7 +426,7 @@
sub x4, x4, #1
st1 {v20.8b}, [x2], x3 //write 8Byte
- cbnz x4, w8_h_mc_luma_loop
+ cbnz x4, w8_h_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq4_AArch64_neon
@@ -461,7 +461,7 @@
st1 {v20.s}[0], [x2], x3 //write 4Byte
st1 {v20.s}[1], [x2], x3 //write 4Byte
sub x4, x4, #1
- cbnz x4, w4_h_mc_luma_loop
+ cbnz x4, w4_h_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq16_AArch64_neon
@@ -483,7 +483,7 @@
sub x4, x4, #1
st1 {v20.16b}, [x2], x3 //write 16Byte
- cbnz x4, w16_xy_10_mc_luma_loop
+ cbnz x4, w16_xy_10_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -505,7 +505,7 @@
sub x4, x4, #1
st1 {v20.8b}, [x2], x3 //write 8Byte
- cbnz x4, w8_xy_10_mc_luma_loop
+ cbnz x4, w8_xy_10_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq4_AArch64_neon
@@ -540,7 +540,7 @@
st1 {v20.s}[0], [x2], x3 //write 4Byte
st1 {v20.s}[1], [x2], x3 //write 4Byte
sub x4, x4, #1
- cbnz x4, w4_xy_10_mc_luma_loop
+ cbnz x4, w4_xy_10_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -563,7 +563,7 @@
sub x4, x4, #1
st1 {v20.16b}, [x2], x3 //write 16Byte
- cbnz x4, w16_xy_30_mc_luma_loop
+ cbnz x4, w16_xy_30_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -585,7 +585,7 @@
sub x4, x4, #1
st1 {v20.8b}, [x2], x3 //write 8Byte
- cbnz x4, w8_xy_30_mc_luma_loop
+ cbnz x4, w8_xy_30_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McHorVer30WidthEq4_AArch64_neon
@@ -620,7 +620,7 @@
st1 {v20.s}[0], [x2], x3 //write 4Byte
st1 {v20.s}[1], [x2], x3 //write 4Byte
sub x4, x4, #1
- cbnz x4, w4_xy_30_mc_luma_loop
+ cbnz x4, w4_xy_30_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -703,7 +703,7 @@
mov.16b v4, v6
mov.16b v6, v7
sub x4, x4, #8
- cbnz x4, w16_xy_01_mc_luma_loop
+ cbnz x4, w16_xy_01_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -753,7 +753,7 @@
mov.16b v6, v4
mov.16b v4, v7
sub x4, x4, #4
- cbnz x4, w8_xy_01_mc_luma_loop
+ cbnz x4, w8_xy_01_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -809,7 +809,7 @@
mov.8b v5, v21
sub x4, x4, #4
- cbnz x4, w4_xy_01_mc_luma_loop
+ cbnz x4, w4_xy_01_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -892,7 +892,7 @@
mov.16b v4, v6
mov.16b v6, v7
sub x4, x4, #8
- cbnz x4, w16_xy_03_mc_luma_loop
+ cbnz x4, w16_xy_03_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -942,7 +942,7 @@
mov.16b v6, v4
mov.16b v4, v7
sub x4, x4, #4
- cbnz x4, w8_xy_03_mc_luma_loop
+ cbnz x4, w8_xy_03_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -998,7 +998,7 @@
mov.8b v5, v21
sub x4, x4, #4
- cbnz x4, w4_xy_03_mc_luma_loop
+ cbnz x4, w4_xy_03_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -1081,7 +1081,7 @@
mov.16b v4, v6
mov.16b v6, v7
sub x4, x4, #8
- cbnz x4, w16_xy_02_mc_luma_loop
+ cbnz x4, w16_xy_02_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -1131,7 +1131,7 @@
mov.16b v6, v4
mov.16b v4, v7
sub x4, x4, #4
- cbnz x4, w8_xy_02_mc_luma_loop
+ cbnz x4, w8_xy_02_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -1187,7 +1187,7 @@
mov.8b v5, v21
sub x4, x4, #4
- cbnz x4, w4_xy_02_mc_luma_loop
+ cbnz x4, w4_xy_02_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -1220,12 +1220,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 0 line
//prfm pldl1strm, [x0, x1]
@@ -1234,12 +1234,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 1 line
//prfm pldl1strm, [x0, x1]
@@ -1248,12 +1248,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 2 line
//prfm pldl1strm, [x0, x1]
@@ -1262,12 +1262,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 3 line
//prfm pldl1strm, [x0, x1]
@@ -1276,12 +1276,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 4 line
//prfm pldl1strm, [x0, x1]
@@ -1290,12 +1290,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 5 line
//prfm pldl1strm, [x0, x1]
@@ -1304,12 +1304,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 6 line
//prfm pldl1strm, [x0, x1]
@@ -1318,12 +1318,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x3 //write 16Byte : 7 line
mov.16b v5, v11
@@ -1348,7 +1348,7 @@
mov.16b v16, v30
sub x4, x4, #8
- cbnz x4, w16_hv_mc_luma_loop
+ cbnz x4, w16_hv_mc_luma_loop
ldp d14, d15, [sp], #16
ldp d12, d13, [sp], #16
@@ -1381,8 +1381,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x3 //write 8Byte : 0 line
//prfm pldl1strm, [x0, x1]
@@ -1391,8 +1391,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x3 //write 8Byte : 1 line
//prfm pldl1strm, [x0, x1]
@@ -1401,8 +1401,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x3 //write 8Byte : 2 line
//prfm pldl1strm, [x0, x1]
@@ -1411,8 +1411,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x3 //write 8Byte : 3 line
@@ -1424,7 +1424,7 @@
mov.16b v4, v30
sub x4, x4, #4
- cbnz x4, w8_hv_mc_luma_loop
+ cbnz x4, w8_hv_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -1458,12 +1458,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v22, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v23, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26
- UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30
+ UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26
+ UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30
zip1 v24.2d, v24.2d, v28.2d
zip1 v25.2d, v25.2d, v29.2d
zip1 v26.2d, v26.2d, v30.2d
- FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
+ FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
st1 {v27.s}[0], [x2], x3 //write 4Byte : 0 line
st1 {v27.s}[1], [x2], x3 //write 4Byte : 1 line
@@ -1478,12 +1478,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v22, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v23, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26
- UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30
+ UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26
+ UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30
zip1 v24.2d, v24.2d, v28.2d
zip1 v25.2d, v25.2d, v29.2d
zip1 v26.2d, v26.2d, v30.2d
- FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
+ FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
st1 {v27.s}[0], [x2], x3 //write 4Byte : 2 line
st1 {v27.s}[1], [x2], x3 //write 4Byte : 3 line
@@ -1495,7 +1495,7 @@
mov.16b v4, v30
sub x4, x4, #4
- cbnz x4, w4_hv_mc_luma_loop
+ cbnz x4, w4_hv_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq16_AArch64_neon
@@ -1509,7 +1509,7 @@
st1 {v1.16b}, [x2], x3 //write 16Byte : 1 line
sub x4, x4, #2
- cbnz x4, w16_copy_loop
+ cbnz x4, w16_copy_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq8_AArch64_neon
@@ -1523,7 +1523,7 @@
     st1 {v1.8b}, [x2], x3 //write 8Byte : 1 line
sub x4, x4, #2
- cbnz x4, w8_copy_loop
+ cbnz x4, w8_copy_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq4_AArch64_neon
@@ -1537,7 +1537,7 @@
     st1 {v1.s}[0], [x2], x3 //write 4Byte : 1 line
sub x4, x4, #2
- cbnz x4, w4_copy_loop
+ cbnz x4, w4_copy_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN PixStrideAvgWidthEq16_AArch64_neon
@@ -1570,7 +1570,7 @@
st1 {v16.16b}, [x0], x1 //write 16Byte : 3 line
sub x6, x6, #4
- cbnz x6, enc_w16_pix_avg_loop
+ cbnz x6, enc_w16_pix_avg_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN PixStrideAvgWidthEq8_AArch64_neon
@@ -1607,7 +1607,7 @@
st1 {v16.8b}, [x0], x1 //write 8Byte : 3 line
sub x6, x6, #4
- cbnz x6, enc_w8_pix_avg_loop
+ cbnz x6, enc_w8_pix_avg_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq16_AArch64_neon
@@ -1649,7 +1649,7 @@
st1 {v16.16b}, [x0], x1 //write 16Byte : 3 line
sub x6, x6, #4
- cbnz x6, w16_pix_avg_loop
+ cbnz x6, w16_pix_avg_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq8_AArch64_neon
@@ -1686,7 +1686,7 @@
st1 {v16.8b}, [x0], x1 //write 8Byte : 3 line
sub x6, x6, #4
- cbnz x6, w8_pix_avg_loop
+ cbnz x6, w8_pix_avg_loop
WELS_ASM_ARCH64_FUNC_END
@@ -1707,7 +1707,7 @@
st1 {v2.s}[1], [x0], x1 //write 4Byte : 1 line
sub x6, x6, #2
- cbnz x6, w4_pix_avg_loop
+ cbnz x6, w4_pix_avg_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon
@@ -1738,7 +1738,7 @@
mov.16b v0, v18
mov.16b v1, v19
sub x5, x5, #2
- cbnz x5, w8_mc_chroma_loop
+ cbnz x5, w8_mc_chroma_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon
@@ -1767,7 +1767,7 @@
mov.8b v0, v18
mov.8b v1, v19
sub x5, x5, #2
- cbnz x5, w4_mc_chroma_loop
+ cbnz x5, w4_mc_chroma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -1793,11 +1793,11 @@
st1 {v20.16b}, [x2], x5 //write 16Byte
ext.8b v21, v4, v4, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
- FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
- st1 {v21.b}[0], [x2], x3 //write 16th Byte
+ FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
+ st1 {v21.b}[0], [x2], x3 //write 16th Byte
sub x4, x4, #1
- cbnz x4, w17_h_mc_luma_loop
+ cbnz x4, w17_h_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20Width9_AArch64_neon
@@ -1821,11 +1821,11 @@
st1 {v20.8b}, [x2], x5 //write 8Byte
ext.8b v21, v3, v3, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
- FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
- st1 {v21.b}[0], [x2], x3 //write 9th Byte
+ FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
+ st1 {v21.b}[0], [x2], x3 //write 9th Byte
sub x4, x4, #1
- cbnz x4, w9_h_mc_luma_loop
+ cbnz x4, w9_h_mc_luma_loop
WELS_ASM_ARCH64_FUNC_END
@@ -1863,12 +1863,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15 Byte : 0 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 0 line
@@ -1879,12 +1879,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15Byte : 1 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 1 line
@@ -1895,12 +1895,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15Byte : 2 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 2 line
@@ -1911,12 +1911,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15Byte : 3 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 3 line
@@ -1927,12 +1927,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15Byte : 4 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 4 line
@@ -1943,12 +1943,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15Byte : 5 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 5 line
@@ -1959,12 +1959,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15Byte : 6 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 6 line
@@ -1975,12 +1975,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15Byte : 7 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 7 line
@@ -2007,7 +2007,7 @@
mov.16b v16, v30
sub x4, x4, #8
- cbnz x4, w17_hv_mc_luma_loop
+ cbnz x4, w17_hv_mc_luma_loop
//prfm pldl1strm, [x0, x1]
ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17=src[3*stride]
@@ -2015,12 +2015,12 @@
FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
// vertical filtered into v21/v22
FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
- UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+ UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
st1 {v26.16b}, [x2], x5 //write 0:15 Byte : 0 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 16th Byte : 0 line
@@ -2061,8 +2061,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x5 //write 0:7Byte : 0 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 8th Byte : 0 line
@@ -2073,8 +2073,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x5 //write 0:7Byte : 1 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 8th Byte : 1 line
@@ -2085,8 +2085,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x5 //write 0:7Byte : 2 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 8th Byte : 2 line
@@ -2097,8 +2097,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x5 //write 0:7Byte : 3 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 8th Byte : 3 line
@@ -2112,7 +2112,7 @@
mov.16b v4, v30
sub x4, x4, #4
- cbnz x4, w9_hv_mc_luma_loop
+ cbnz x4, w9_hv_mc_luma_loop
//prfm pldl1strm, [x0, x1]
ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
@@ -2120,8 +2120,8 @@
FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
// horizon filtered
- UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
- FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
st1 {v26.8b}, [x2], x5 //write 0:7Byte : 0 line
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
st1 {v26.b}[0], [x2], x3 //write 8th Byte : 0 line
@@ -2207,7 +2207,7 @@
mov.16b v4, v6
mov.16b v6, v7
sub x4, x4, #8
- cbnz x4, w17_v_mc_luma_loop
+ cbnz x4, w17_v_mc_luma_loop
//prfm pldl1strm, [x0, x1]
ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
@@ -2262,7 +2262,7 @@
mov.16b v6, v4
mov.16b v4, v7
sub x4, x4, #4
- cbnz x4, w9_v_mc_luma_loop
+ cbnz x4, w9_v_mc_luma_loop
//prfm pldl1strm, [x0, x1]
ld1 {v7.8b}, [x0], x1 // v7=src[3*stride]
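
The McHorVer22-style loops in this file chain a vertical 6-tap pass kept as 16-bit intermediates (FILTER_6TAG_8BITS_TO_16BITS*) with a horizontal 6-tap pass over those intermediates (UNPACK_2_16BITS_TO_ABC plus FILTER_3_IN_16BITS_TO_8BITS*). A scalar reference of that center half-pel position, using the standard (sum + 512) >> 10 form; all names are illustrative, and the NEON code evaluates the same value with the >>2 shift ladder shown in the macros so it stays within 16 bits. The source is assumed to be border-expanded, as the expand_picture routines guarantee.

#include <stdint.h>

static int Filter6 (int m2, int m1, int p0, int p1, int p2, int p3) {
  return (m2 + p3) - 5 * (m1 + p2) + 20 * (p0 + p1);
}

static void CenterHalfPel_c (uint8_t* dst, int dst_stride,
                             const uint8_t* src, int src_stride,
                             int width, int height) {
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++) {
      int t[6];
      for (int i = -2; i <= 3; i++) {                    /* vertical pass, wide intermediates */
        const uint8_t* p = src + y * src_stride + (x + i);
        t[i + 2] = Filter6 (p[-2 * src_stride], p[-src_stride], p[0],
                            p[src_stride], p[2 * src_stride], p[3 * src_stride]);
      }
      int sum = Filter6 (t[0], t[1], t[2], t[3], t[4], t[5]);  /* horizontal pass */
      int v = (sum + 512) >> 10;
      dst[y * dst_stride + x] = (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}
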
--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -44,15 +44,15 @@
;***********************************************************************
%if 1
- %define MOVDQ movdqa
+ %define MOVDQ movdqa
%else
- %define MOVDQ movdqu
+ %define MOVDQ movdqu
%endif
%if 1
- %define WELSEMMS emms
+ %define WELSEMMS emms
%else
- %define WELSEMMS
+ %define WELSEMMS
%endif
@@ -220,7 +220,7 @@
%macro LOAD_1_PARA 0
%ifdef X86_32
- mov r0, [esp + push_num*4 + 4]
+ mov r0, [esp + push_num*4 + 4]
%endif
%endmacro
@@ -234,8 +234,8 @@
%macro LOAD_3_PARA 0
%ifdef X86_32
mov r0, [esp + push_num*4 + 4]
- mov r1, [esp + push_num*4 + 8]
- mov r2, [esp + push_num*4 + 12]
+ mov r1, [esp + push_num*4 + 8]
+ mov r2, [esp + push_num*4 + 12]
%endif
%endmacro
@@ -267,7 +267,7 @@
%macro LOAD_6_PARA 0
%ifdef X86_32
- push r3
+ push r3
push r4
push r5
%assign push_num push_num+3
@@ -310,7 +310,7 @@
%macro LOAD_4_PARA_POP 0
%ifdef X86_32
- pop r3
+ pop r3
%endif
%endmacro
@@ -317,7 +317,7 @@
%macro LOAD_5_PARA_POP 0
%ifdef X86_32
pop r4
- pop r3
+ pop r3
%endif
%endmacro
@@ -324,8 +324,8 @@
%macro LOAD_6_PARA_POP 0
%ifdef X86_32
pop r5
- pop r4
- pop r3
+ pop r4
+ pop r3
%endif
%endmacro
@@ -416,13 +416,13 @@
%macro SIGN_EXTENSION 2
%ifndef X86_32
- movsxd %1, %2
+ movsxd %1, %2
%endif
%endmacro
%macro SIGN_EXTENSIONW 2
%ifndef X86_32
- movsx %1, %2
+ movsx %1, %2
%endif
%endmacro
@@ -438,13 +438,13 @@
%endmacro
%macro WELS_AbsW 2
- pxor %2, %2
+ pxor %2, %2
psubw %2, %1
pmaxsw %1, %2
%endmacro
%macro MMX_XSwap 4
- movq %4, %2
+ movq %4, %2
punpckh%1 %4, %3
punpckl%1 %2, %3
%endmacro
@@ -485,35 +485,35 @@
;in: m1, m2, m3, m4, m5, m6, m7, m8
;pOut: m5, m3, m4, m8, m6, m2, m7, m1
%macro SSE2_TransTwo8x8B 9
- movdqa %9, %8
- SSE2_XSawp bw, %1, %2, %8
- SSE2_XSawp bw, %3, %4, %2
- SSE2_XSawp bw, %5, %6, %4
- movdqa %6, %9
- movdqa %9, %4
- SSE2_XSawp bw, %7, %6, %4
+ movdqa %9, %8
+ SSE2_XSawp bw, %1, %2, %8
+ SSE2_XSawp bw, %3, %4, %2
+ SSE2_XSawp bw, %5, %6, %4
+ movdqa %6, %9
+ movdqa %9, %4
+ SSE2_XSawp bw, %7, %6, %4
- SSE2_XSawp wd, %1, %3, %6
- SSE2_XSawp wd, %8, %2, %3
- SSE2_XSawp wd, %5, %7, %2
- movdqa %7, %9
- movdqa %9, %3
- SSE2_XSawp wd, %7, %4, %3
+ SSE2_XSawp wd, %1, %3, %6
+ SSE2_XSawp wd, %8, %2, %3
+ SSE2_XSawp wd, %5, %7, %2
+ movdqa %7, %9
+ movdqa %9, %3
+ SSE2_XSawp wd, %7, %4, %3
- SSE2_XSawp dq, %1, %5, %4
- SSE2_XSawp dq, %6, %2, %5
- SSE2_XSawp dq, %8, %7, %2
- movdqa %7, %9
- movdqa %9, %5
- SSE2_XSawp dq, %7, %3, %5
+ SSE2_XSawp dq, %1, %5, %4
+ SSE2_XSawp dq, %6, %2, %5
+ SSE2_XSawp dq, %8, %7, %2
+ movdqa %7, %9
+ movdqa %9, %5
+ SSE2_XSawp dq, %7, %3, %5
- SSE2_XSawp qdq, %1, %8, %3
- SSE2_XSawp qdq, %4, %2, %8
- SSE2_XSawp qdq, %6, %7, %2
- movdqa %7, %9
- movdqa %9, %1
- SSE2_XSawp qdq, %7, %5, %1
- movdqa %5, %9
+ SSE2_XSawp qdq, %1, %8, %3
+ SSE2_XSawp qdq, %4, %2, %8
+ SSE2_XSawp qdq, %6, %7, %2
+ movdqa %7, %9
+ movdqa %9, %1
+ SSE2_XSawp qdq, %7, %5, %1
+ movdqa %5, %9
%endmacro
;xmm0, xmm6, xmm7, [eax], [ecx]
@@ -528,32 +528,32 @@
; m2 = m1 + m2, m1 = m1 - m2
%macro SSE2_SumSub 3
- movdqa %3, %2
+ movdqa %3, %2
paddw %2, %1
psubw %1, %3
%endmacro
-%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
- mov %3h, %3l
- movd %1, e%3x ; i.e, 1% = eax (=b0)
- pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
- pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
+ mov %3h, %3l
+ movd %1, e%3x ; i.e, 1% = eax (=b0)
+ pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
+ pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
%endmacro
;copy a dw into a xmm for 8 times
-%macro SSE2_Copy8Times 2
- movd %1, %2
- punpcklwd %1, %1
- pshufd %1, %1, 0
+%macro SSE2_Copy8Times 2
+ movd %1, %2
+ punpcklwd %1, %1
+ pshufd %1, %1, 0
%endmacro
;copy a db into a xmm for 16 times
-%macro SSE2_Copy16Times 2
- movd %1, %2
- pshuflw %1, %1, 0
- punpcklqdq %1, %1
- packuswb %1, %1
+%macro SSE2_Copy16Times 2
+ movd %1, %2
+ pshuflw %1, %1, 0
+ punpcklqdq %1, %1
+ packuswb %1, %1
%endmacro
@@ -564,35 +564,35 @@
;dw 32,32,32,32,32,32,32,32 for xmm
;dw 32,32,32,32 for mm
%macro WELS_DW32 1
- pcmpeqw %1,%1
- psrlw %1,15
- psllw %1,5
+ pcmpeqw %1,%1
+ psrlw %1,15
+ psllw %1,5
%endmacro
;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
;dw 1, 1, 1, 1 for mm
%macro WELS_DW1 1
- pcmpeqw %1,%1
- psrlw %1,15
+ pcmpeqw %1,%1
+ psrlw %1,15
%endmacro
;all 0 for xmm and mm
-%macro WELS_Zero 1
- pxor %1, %1
+%macro WELS_Zero 1
+ pxor %1, %1
%endmacro
;dd 1, 1, 1, 1 for xmm
;dd 1, 1 for mm
%macro WELS_DD1 1
- pcmpeqw %1,%1
- psrld %1,31
+ pcmpeqw %1,%1
+ psrld %1,31
%endmacro
;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
%macro WELS_DB1 1
- pcmpeqw %1,%1
- psrlw %1,15
- packuswb %1,%1
+ pcmpeqw %1,%1
+ psrlw %1,15
+ packuswb %1,%1
%endmacro
--- a/codec/common/x86/cpuid.asm
+++ b/codec/common/x86/cpuid.asm
@@ -29,13 +29,13 @@
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
-;* cpu_mmx.asm
+;* cpu_mmx.asm
;*
;* Abstract
-;* verify cpuid feature support and cpuid detection
+;* verify cpuid feature support and cpuid detection
;*
;* History
-;* 04/29/2009 Created
+;* 04/29/2009 Created
;*
;*************************************************************************/
@@ -115,13 +115,13 @@
%elifdef X86_32
WELS_EXTERN WelsCPUId
- push ebx
- push edi
+ push ebx
+ push edi
- mov eax, [esp+12] ; operating index
+ mov eax, [esp+12] ; operating index
mov edi, [esp+24]
mov ecx, [edi]
- cpuid ; cpuid
+ cpuid ; cpuid
; processing various information return
mov edi, [esp+16]
@@ -133,7 +133,7 @@
mov edi, [esp+28]
mov [edi], edx
- pop edi
+ pop edi
pop ebx
ret
@@ -145,31 +145,31 @@
;****************************************************************************************************
WELS_EXTERN WelsCPUSupportAVX
%ifdef WIN64
- mov eax, ecx
- mov ecx, edx
+ mov eax, ecx
+ mov ecx, edx
%elifdef UNIX64
- mov eax, edi
- mov ecx, esi
+ mov eax, edi
+ mov ecx, esi
%else
- mov eax, [esp+4]
- mov ecx, [esp+8]
+ mov eax, [esp+4]
+ mov ecx, [esp+8]
%endif
- ; refer to detection of AVX addressed in INTEL AVX manual document
- and ecx, 018000000H
- cmp ecx, 018000000H ; check both OSXSAVE and AVX feature flags
- jne avx_not_supported
- ; processor supports AVX instructions and XGETBV is enabled by OS
- mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
- XGETBV ; result in EDX:EAX
- and eax, 06H
- cmp eax, 06H ; check OS has enabled both XMM and YMM state support
- jne avx_not_supported
- mov eax, 1
- ret
+ ; refer to detection of AVX addressed in INTEL AVX manual document
+ and ecx, 018000000H
+ cmp ecx, 018000000H ; check both OSXSAVE and AVX feature flags
+ jne avx_not_supported
+ ; processor supports AVX instructions and XGETBV is enabled by OS
+ mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
+ XGETBV ; result in EDX:EAX
+ and eax, 06H
+ cmp eax, 06H ; check OS has enabled both XMM and YMM state support
+ jne avx_not_supported
+ mov eax, 1
+ ret
avx_not_supported:
- mov eax, 0
- ret
+ mov eax, 0
+ ret
; need call after cpuid=1 and eax, ecx flag got then
@@ -178,35 +178,35 @@
;****************************************************************************************************
WELS_EXTERN WelsCPUSupportFMA
%ifdef WIN64
- mov eax, ecx
- mov ecx, edx
+ mov eax, ecx
+ mov ecx, edx
%elifdef UNIX64
- mov eax, edi
- mov ecx, esi
+ mov eax, edi
+ mov ecx, esi
%else
- mov eax, [esp+4]
- mov ecx, [esp+8]
+ mov eax, [esp+4]
+ mov ecx, [esp+8]
%endif
- ; refer to detection of FMA addressed in INTEL AVX manual document
- and ecx, 018001000H
- cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
- jne fma_not_supported
- ; processor supports AVX,FMA instructions and XGETBV is enabled by OS
- mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
- XGETBV ; result in EDX:EAX
- and eax, 06H
- cmp eax, 06H ; check OS has enabled both XMM and YMM state support
- jne fma_not_supported
- mov eax, 1
- ret
+ ; refer to detection of FMA addressed in INTEL AVX manual document
+ and ecx, 018001000H
+ cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
+ jne fma_not_supported
+ ; processor supports AVX,FMA instructions and XGETBV is enabled by OS
+ mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
+ XGETBV ; result in EDX:EAX
+ and eax, 06H
+ cmp eax, 06H ; check OS has enabled both XMM and YMM state support
+ jne fma_not_supported
+ mov eax, 1
+ ret
fma_not_supported:
- mov eax, 0
- ret
+ mov eax, 0
+ ret
;******************************************************************************************
; void WelsEmms()
;******************************************************************************************
WELS_EXTERN WelsEmms
- emms ; empty mmx technology states
- ret
+ emms ; empty mmx technology states
+ ret
--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -57,1032 +57,1032 @@
WELS_EXTERN DeblockLumaLt4V_ssse3
- push rbp
- mov r11,[rsp + 16 + 20h] ; pTC
- PUSH_XMM 16
- sub rsp,1B0h
- lea rbp,[rsp+20h]
- movd xmm4,r8d
- movd xmm2,r9d
- mov qword [rbp+180h],r12
- mov r10,rcx
- movsxd r12,edx
- add edx,edx
- movsxd rdx,edx
- sub r10,r12
- movsx r8d,byte [r11]
- pxor xmm3,xmm3
- punpcklwd xmm2,xmm2
- movaps [rbp+50h],xmm14
- lea rax,[r12+r12*2]
- movdqa xmm14,[rdx+rcx]
- neg rax
- pshufd xmm0,xmm2,0
- movd xmm2,r8d
- movsx edx,byte [r11+1]
- movsx r8d,byte [r11+2]
- movsx r11d,byte [r11+3]
- movaps [rbp+70h],xmm12
- movd xmm1,edx
- movaps [rbp+80h],xmm11
- movd xmm12,r8d
- movd xmm11,r11d
- movdqa xmm5, [rax+rcx]
- lea rax,[r12+r12]
- punpcklwd xmm12,xmm12
- neg rax
- punpcklwd xmm11,xmm11
- movaps [rbp],xmm8
- movdqa xmm8, [r10]
- punpcklwd xmm2,xmm2
- punpcklwd xmm1,xmm1
- punpcklqdq xmm12,xmm12
- punpcklqdq xmm11,xmm11
- punpcklqdq xmm2,xmm2
- punpcklqdq xmm1,xmm1
- shufps xmm12,xmm11,88h
- movdqa xmm11,xmm8
- movaps [rbp+30h],xmm9
- movdqa xmm9,[rcx]
- shufps xmm2,xmm1,88h
- movdqa xmm1,xmm5
- punpcklbw xmm11,xmm3
- movaps [rbp+20h],xmm6
- movaps [rbp+60h],xmm13
- movdqa xmm13,xmm11
- movaps [rbp+90h],xmm10
- movdqa xmm10,xmm9
- movdqa xmm6,[rax+rcx]
- punpcklbw xmm1,xmm3
- movaps [rbp+0A0h],xmm12
- psubw xmm13,xmm1
- movaps [rbp+40h],xmm15
- movdqa xmm15,xmm14
- movaps [rbp+10h],xmm7
- movdqa xmm7,xmm6
- punpcklbw xmm10,xmm3
- movdqa xmm12,[r12+rcx]
- punpcklbw xmm7,xmm3
- punpcklbw xmm12,xmm3
- punpcklbw xmm15,xmm3
- pabsw xmm3,xmm13
- movdqa xmm13,xmm10
- psubw xmm13,xmm15
- movdqa [rbp+0F0h],xmm15
- pabsw xmm15,xmm13
- movdqa xmm13,xmm11
- movdqa [rbp+0B0h],xmm1
- movdqa xmm1,xmm0
- pavgw xmm13,xmm10
- pcmpgtw xmm1,xmm3
- movdqa [rbp+120h],xmm13
- movaps xmm13,xmm2
- punpcklwd xmm4,xmm4
- movdqa xmm3,xmm0
- movdqa [rbp+100h],xmm1
- psubw xmm13,xmm1
- movdqa xmm1,xmm10
- pcmpgtw xmm3,xmm15
- pshufd xmm4,xmm4,0
- psubw xmm1,xmm11
- movdqa [rbp+0D0h],xmm10
- psubw xmm13,xmm3
- movdqa [rbp+110h],xmm3
- pabsw xmm15,xmm1
- movdqa xmm3,xmm4
- psubw xmm10,xmm12
- pcmpgtw xmm3,xmm15
- pabsw xmm15,xmm10
- movdqa xmm10,xmm0
- psllw xmm1,2
- movdqa [rbp+0C0h],xmm11
- psubw xmm11,xmm7
- pcmpgtw xmm10,xmm15
- pabsw xmm11,xmm11
- movdqa xmm15,xmm0
- pand xmm3,xmm10
- pcmpgtw xmm15,xmm11
- movaps xmm11,xmm2
- pxor xmm10,xmm10
- pand xmm3,xmm15
- pcmpgtw xmm11,xmm10
- pcmpeqw xmm10,xmm2
- por xmm11,xmm10
- pand xmm3,xmm11
- movdqa xmm11,xmm7
- psubw xmm11,xmm12
- pxor xmm15,xmm15
- paddw xmm11,xmm1
- psubw xmm15,xmm13
- movdqa [rbp+0E0h],xmm12
- paddw xmm11,[FOUR_16B_SSE2]
- pxor xmm12,xmm12
- psraw xmm11,3
- punpckhbw xmm8,xmm12
- pmaxsw xmm15,xmm11
- punpckhbw xmm5,xmm12
- movdqa xmm11,xmm8
- pminsw xmm13,xmm15
- psubw xmm11,xmm5
- punpckhbw xmm9,xmm12
- pand xmm13,xmm3
- movdqa [rbp+130h],xmm13
- pabsw xmm13,xmm11
- punpckhbw xmm14,xmm12
- movdqa xmm11,xmm9
- psubw xmm11,xmm14
- movdqa xmm15,xmm0
- movdqa [rbp+140h],xmm14
- pabsw xmm14,xmm11
- movdqa xmm11,xmm8
- pcmpgtw xmm15,xmm14
- movdqa xmm1,[r12+rcx]
- pavgw xmm11,xmm9
- movdqa [rbp+170h],xmm11
- movdqa xmm10,xmm9
- punpckhbw xmm6,xmm12
- psubw xmm10,xmm8
- punpckhbw xmm1,xmm12
- movdqa xmm12,xmm0
- movaps xmm11,[rbp+0A0h]
- pcmpgtw xmm12,xmm13
- movaps xmm13,xmm11
- psubw xmm13,xmm12
- movdqa [rbp+160h],xmm15
- psubw xmm13,xmm15
- movdqa xmm15,xmm9
- psubw xmm15,xmm1
- movdqa [rbp+150h],xmm12
- pabsw xmm12,xmm10
- pabsw xmm14,xmm15
- movdqa xmm15,xmm8
- pcmpgtw xmm4,xmm12
- movdqa xmm12,xmm0
- psubw xmm15,xmm6
- pcmpgtw xmm12,xmm14
- pabsw xmm14,xmm15
- psllw xmm10,2
- pcmpgtw xmm0,xmm14
- movdqa xmm14,xmm6
- psubw xmm14,xmm1
- pand xmm4,xmm12
- paddw xmm14,xmm10
- pand xmm4,xmm0
- paddw xmm14,[FOUR_16B_SSE2]
- pxor xmm15,xmm15
- movaps xmm12,xmm11
- psubw xmm15,xmm13
- pxor xmm0,xmm0
- psraw xmm14,3
- pcmpgtw xmm12,xmm0
- pcmpeqw xmm0,xmm11
- pmaxsw xmm15,xmm14
- por xmm12,xmm0
- movdqa xmm0,[rbp+120h]
- pminsw xmm13,xmm15
- movdqa xmm15,[rbp+0B0h]
- movdqa xmm10,xmm7
- pand xmm4,xmm12
- paddw xmm15,xmm0
- pxor xmm12,xmm12
- paddw xmm10,xmm7
- movdqa xmm14,xmm12
- psubw xmm15,xmm10
- psubw xmm14,xmm2
- psraw xmm15,1
- pmaxsw xmm15,xmm14
- movdqa xmm10,xmm6
- pminsw xmm15,xmm2
- paddw xmm10,xmm6
- pand xmm15,xmm3
- psubw xmm12,xmm11
- pand xmm15,[rbp+100h]
- pand xmm13,xmm4
- paddw xmm7,xmm15
- paddw xmm8,xmm13
- movdqa xmm15,[rbp+170h]
- psubw xmm9,xmm13
- paddw xmm5,xmm15
- psubw xmm5,xmm10
- psraw xmm5,1
- pmaxsw xmm5,xmm12
- pminsw xmm5,xmm11
- pand xmm5,xmm4
- pand xmm5,[rbp+150h]
- paddw xmm6,xmm5
- movdqa xmm5,[rbp+0C0h]
- packuswb xmm7,xmm6
- movdqa xmm6,[rbp+130h]
- paddw xmm5,xmm6
- packuswb xmm5,xmm8
- movdqa xmm8,[rbp+0D0h]
- psubw xmm8,xmm6
- movdqa xmm6,[rbp+0F0h]
- paddw xmm6,xmm0
- movdqa xmm0,[rbp+0E0h]
- packuswb xmm8,xmm9
- movdqa xmm9,xmm0
- paddw xmm9,xmm0
- psubw xmm6,xmm9
- psraw xmm6,1
- pmaxsw xmm14,xmm6
- pminsw xmm2,xmm14
- pand xmm2,xmm3
- pand xmm2,[rbp+110h]
- paddw xmm0,xmm2
- movdqa xmm2,[rbp+140h]
- paddw xmm2,xmm15
- movdqa xmm15,xmm1
- paddw xmm15,xmm1
- psubw xmm2,xmm15
- psraw xmm2,1
- pmaxsw xmm12,xmm2
- pminsw xmm11,xmm12
- pand xmm11,xmm4
- pand xmm11,[rbp+160h]
- paddw xmm1,xmm11
- movdqa [rax+rcx],xmm7
- movdqa [r10],xmm5
- packuswb xmm0,xmm1
- movdqa [rcx],xmm8
- movdqa [r12+rcx],xmm0
- mov r12,qword [rbp+180h]
- lea rsp,[rbp+190h]
- POP_XMM
- pop rbp
- ret
+ push rbp
+ mov r11,[rsp + 16 + 20h] ; pTC
+ PUSH_XMM 16
+ sub rsp,1B0h
+ lea rbp,[rsp+20h]
+ movd xmm4,r8d
+ movd xmm2,r9d
+ mov qword [rbp+180h],r12
+ mov r10,rcx
+ movsxd r12,edx
+ add edx,edx
+ movsxd rdx,edx
+ sub r10,r12
+ movsx r8d,byte [r11]
+ pxor xmm3,xmm3
+ punpcklwd xmm2,xmm2
+ movaps [rbp+50h],xmm14
+ lea rax,[r12+r12*2]
+ movdqa xmm14,[rdx+rcx]
+ neg rax
+ pshufd xmm0,xmm2,0
+ movd xmm2,r8d
+ movsx edx,byte [r11+1]
+ movsx r8d,byte [r11+2]
+ movsx r11d,byte [r11+3]
+ movaps [rbp+70h],xmm12
+ movd xmm1,edx
+ movaps [rbp+80h],xmm11
+ movd xmm12,r8d
+ movd xmm11,r11d
+ movdqa xmm5, [rax+rcx]
+ lea rax,[r12+r12]
+ punpcklwd xmm12,xmm12
+ neg rax
+ punpcklwd xmm11,xmm11
+ movaps [rbp],xmm8
+ movdqa xmm8, [r10]
+ punpcklwd xmm2,xmm2
+ punpcklwd xmm1,xmm1
+ punpcklqdq xmm12,xmm12
+ punpcklqdq xmm11,xmm11
+ punpcklqdq xmm2,xmm2
+ punpcklqdq xmm1,xmm1
+ shufps xmm12,xmm11,88h
+ movdqa xmm11,xmm8
+ movaps [rbp+30h],xmm9
+ movdqa xmm9,[rcx]
+ shufps xmm2,xmm1,88h
+ movdqa xmm1,xmm5
+ punpcklbw xmm11,xmm3
+ movaps [rbp+20h],xmm6
+ movaps [rbp+60h],xmm13
+ movdqa xmm13,xmm11
+ movaps [rbp+90h],xmm10
+ movdqa xmm10,xmm9
+ movdqa xmm6,[rax+rcx]
+ punpcklbw xmm1,xmm3
+ movaps [rbp+0A0h],xmm12
+ psubw xmm13,xmm1
+ movaps [rbp+40h],xmm15
+ movdqa xmm15,xmm14
+ movaps [rbp+10h],xmm7
+ movdqa xmm7,xmm6
+ punpcklbw xmm10,xmm3
+ movdqa xmm12,[r12+rcx]
+ punpcklbw xmm7,xmm3
+ punpcklbw xmm12,xmm3
+ punpcklbw xmm15,xmm3
+ pabsw xmm3,xmm13
+ movdqa xmm13,xmm10
+ psubw xmm13,xmm15
+ movdqa [rbp+0F0h],xmm15
+ pabsw xmm15,xmm13
+ movdqa xmm13,xmm11
+ movdqa [rbp+0B0h],xmm1
+ movdqa xmm1,xmm0
+ pavgw xmm13,xmm10
+ pcmpgtw xmm1,xmm3
+ movdqa [rbp+120h],xmm13
+ movaps xmm13,xmm2
+ punpcklwd xmm4,xmm4
+ movdqa xmm3,xmm0
+ movdqa [rbp+100h],xmm1
+ psubw xmm13,xmm1
+ movdqa xmm1,xmm10
+ pcmpgtw xmm3,xmm15
+ pshufd xmm4,xmm4,0
+ psubw xmm1,xmm11
+ movdqa [rbp+0D0h],xmm10
+ psubw xmm13,xmm3
+ movdqa [rbp+110h],xmm3
+ pabsw xmm15,xmm1
+ movdqa xmm3,xmm4
+ psubw xmm10,xmm12
+ pcmpgtw xmm3,xmm15
+ pabsw xmm15,xmm10
+ movdqa xmm10,xmm0
+ psllw xmm1,2
+ movdqa [rbp+0C0h],xmm11
+ psubw xmm11,xmm7
+ pcmpgtw xmm10,xmm15
+ pabsw xmm11,xmm11
+ movdqa xmm15,xmm0
+ pand xmm3,xmm10
+ pcmpgtw xmm15,xmm11
+ movaps xmm11,xmm2
+ pxor xmm10,xmm10
+ pand xmm3,xmm15
+ pcmpgtw xmm11,xmm10
+ pcmpeqw xmm10,xmm2
+ por xmm11,xmm10
+ pand xmm3,xmm11
+ movdqa xmm11,xmm7
+ psubw xmm11,xmm12
+ pxor xmm15,xmm15
+ paddw xmm11,xmm1
+ psubw xmm15,xmm13
+ movdqa [rbp+0E0h],xmm12
+ paddw xmm11,[FOUR_16B_SSE2]
+ pxor xmm12,xmm12
+ psraw xmm11,3
+ punpckhbw xmm8,xmm12
+ pmaxsw xmm15,xmm11
+ punpckhbw xmm5,xmm12
+ movdqa xmm11,xmm8
+ pminsw xmm13,xmm15
+ psubw xmm11,xmm5
+ punpckhbw xmm9,xmm12
+ pand xmm13,xmm3
+ movdqa [rbp+130h],xmm13
+ pabsw xmm13,xmm11
+ punpckhbw xmm14,xmm12
+ movdqa xmm11,xmm9
+ psubw xmm11,xmm14
+ movdqa xmm15,xmm0
+ movdqa [rbp+140h],xmm14
+ pabsw xmm14,xmm11
+ movdqa xmm11,xmm8
+ pcmpgtw xmm15,xmm14
+ movdqa xmm1,[r12+rcx]
+ pavgw xmm11,xmm9
+ movdqa [rbp+170h],xmm11
+ movdqa xmm10,xmm9
+ punpckhbw xmm6,xmm12
+ psubw xmm10,xmm8
+ punpckhbw xmm1,xmm12
+ movdqa xmm12,xmm0
+ movaps xmm11,[rbp+0A0h]
+ pcmpgtw xmm12,xmm13
+ movaps xmm13,xmm11
+ psubw xmm13,xmm12
+ movdqa [rbp+160h],xmm15
+ psubw xmm13,xmm15
+ movdqa xmm15,xmm9
+ psubw xmm15,xmm1
+ movdqa [rbp+150h],xmm12
+ pabsw xmm12,xmm10
+ pabsw xmm14,xmm15
+ movdqa xmm15,xmm8
+ pcmpgtw xmm4,xmm12
+ movdqa xmm12,xmm0
+ psubw xmm15,xmm6
+ pcmpgtw xmm12,xmm14
+ pabsw xmm14,xmm15
+ psllw xmm10,2
+ pcmpgtw xmm0,xmm14
+ movdqa xmm14,xmm6
+ psubw xmm14,xmm1
+ pand xmm4,xmm12
+ paddw xmm14,xmm10
+ pand xmm4,xmm0
+ paddw xmm14,[FOUR_16B_SSE2]
+ pxor xmm15,xmm15
+ movaps xmm12,xmm11
+ psubw xmm15,xmm13
+ pxor xmm0,xmm0
+ psraw xmm14,3
+ pcmpgtw xmm12,xmm0
+ pcmpeqw xmm0,xmm11
+ pmaxsw xmm15,xmm14
+ por xmm12,xmm0
+ movdqa xmm0,[rbp+120h]
+ pminsw xmm13,xmm15
+ movdqa xmm15,[rbp+0B0h]
+ movdqa xmm10,xmm7
+ pand xmm4,xmm12
+ paddw xmm15,xmm0
+ pxor xmm12,xmm12
+ paddw xmm10,xmm7
+ movdqa xmm14,xmm12
+ psubw xmm15,xmm10
+ psubw xmm14,xmm2
+ psraw xmm15,1
+ pmaxsw xmm15,xmm14
+ movdqa xmm10,xmm6
+ pminsw xmm15,xmm2
+ paddw xmm10,xmm6
+ pand xmm15,xmm3
+ psubw xmm12,xmm11
+ pand xmm15,[rbp+100h]
+ pand xmm13,xmm4
+ paddw xmm7,xmm15
+ paddw xmm8,xmm13
+ movdqa xmm15,[rbp+170h]
+ psubw xmm9,xmm13
+ paddw xmm5,xmm15
+ psubw xmm5,xmm10
+ psraw xmm5,1
+ pmaxsw xmm5,xmm12
+ pminsw xmm5,xmm11
+ pand xmm5,xmm4
+ pand xmm5,[rbp+150h]
+ paddw xmm6,xmm5
+ movdqa xmm5,[rbp+0C0h]
+ packuswb xmm7,xmm6
+ movdqa xmm6,[rbp+130h]
+ paddw xmm5,xmm6
+ packuswb xmm5,xmm8
+ movdqa xmm8,[rbp+0D0h]
+ psubw xmm8,xmm6
+ movdqa xmm6,[rbp+0F0h]
+ paddw xmm6,xmm0
+ movdqa xmm0,[rbp+0E0h]
+ packuswb xmm8,xmm9
+ movdqa xmm9,xmm0
+ paddw xmm9,xmm0
+ psubw xmm6,xmm9
+ psraw xmm6,1
+ pmaxsw xmm14,xmm6
+ pminsw xmm2,xmm14
+ pand xmm2,xmm3
+ pand xmm2,[rbp+110h]
+ paddw xmm0,xmm2
+ movdqa xmm2,[rbp+140h]
+ paddw xmm2,xmm15
+ movdqa xmm15,xmm1
+ paddw xmm15,xmm1
+ psubw xmm2,xmm15
+ psraw xmm2,1
+ pmaxsw xmm12,xmm2
+ pminsw xmm11,xmm12
+ pand xmm11,xmm4
+ pand xmm11,[rbp+160h]
+ paddw xmm1,xmm11
+ movdqa [rax+rcx],xmm7
+ movdqa [r10],xmm5
+ packuswb xmm0,xmm1
+ movdqa [rcx],xmm8
+ movdqa [r12+rcx],xmm0
+ mov r12,qword [rbp+180h]
+ lea rsp,[rbp+190h]
+ POP_XMM
+ pop rbp
+ ret
WELS_EXTERN DeblockLumaEq4V_ssse3
- mov rax,rsp
- push rbx
- push rbp
- push rsi
- push rdi
- sub rsp,1D8h
- movaps [rax-38h],xmm6
- movaps [rax-48h],xmm7
- movaps [rax-58h],xmm8
- pxor xmm1,xmm1
- movsxd r10,edx
- mov rbp,rcx
- mov r11d,r8d
- mov rdx,rcx
- mov rdi,rbp
- mov rbx,rbp
- movdqa xmm5,[rbp]
- movaps [rax-68h],xmm9
- movaps [rax-78h],xmm10
- punpcklbw xmm5,xmm1
- movaps [rax-88h],xmm11
- movaps [rax-98h],xmm12
- movaps [rax-0A8h],xmm13
- movaps [rax-0B8h],xmm14
- movdqa xmm14,[r10+rbp]
- movaps [rax-0C8h],xmm15
- lea eax,[r10*4]
- movsxd r8,eax
- lea eax,[r10+r10*2]
- movsxd rcx,eax
- lea eax,[r10+r10]
- sub rdx,r8
- punpcklbw xmm14,xmm1
- movdqa [rsp+90h],xmm5
- movdqa [rsp+30h],xmm14
- movsxd rsi,eax
- movsx eax,r11w
- sub rdi,rcx
- sub rbx,rsi
- mov r8,rbp
- sub r8,r10
- movd xmm0,eax
- movsx eax,r9w
- movdqa xmm12,[rdi]
- movdqa xmm6, [rsi+rbp]
- movdqa xmm13,[rbx]
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- punpcklbw xmm13,xmm1
- punpcklbw xmm6,xmm1
- movdqa xmm8,[r8]
- movd xmm0,eax
- movdqa xmm10,xmm11
- mov eax,2
- punpcklbw xmm8,xmm1
- punpcklbw xmm12,xmm1
- cwde
- punpcklwd xmm0,xmm0
- psraw xmm10,2
- movdqa xmm1,xmm8
- movdqa [rsp+0F0h],xmm13
- movdqa [rsp+0B0h],xmm8
- pshufd xmm7,xmm0,0
- psubw xmm1,xmm13
- movdqa xmm0,xmm5
- movdqa xmm4,xmm7
- movdqa xmm2,xmm7
- psubw xmm0,xmm8
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm5
- movdqa [rsp+40h],xmm7
- movdqa [rsp+60h],xmm6
- pcmpgtw xmm4,xmm0
- psubw xmm1,xmm14
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm4,xmm2
- movdqa xmm0,xmm11
- pcmpgtw xmm0,xmm3
- pand xmm4,xmm0
- movd xmm0,eax
- movdqa [rsp+20h],xmm4
- punpcklwd xmm0,xmm0
- pshufd xmm2,xmm0,0
- paddw xmm10,xmm2
- movdqa [rsp+0A0h],xmm2
- movdqa xmm15,xmm7
- pxor xmm4,xmm4
- movdqa xmm0,xmm8
- psubw xmm0,xmm12
- mov eax,4
- pabsw xmm0,xmm0
- movdqa xmm1,xmm10
- cwde
- pcmpgtw xmm15,xmm0
- pcmpgtw xmm1,xmm3
- movdqa xmm3,xmm7
- movdqa xmm7,[rdx]
- movdqa xmm0,xmm5
- psubw xmm0,xmm6
- pand xmm15,xmm1
- punpcklbw xmm7,xmm4
- movdqa xmm9,xmm15
- pabsw xmm0,xmm0
- psllw xmm7,1
- pandn xmm9,xmm12
- pcmpgtw xmm3,xmm0
- paddw xmm7,xmm12
- movd xmm0,eax
- pand xmm3,xmm1
- paddw xmm7,xmm12
- punpcklwd xmm0,xmm0
- paddw xmm7,xmm12
- pshufd xmm1,xmm0,0
- paddw xmm7,xmm13
- movdqa xmm0,xmm3
- pandn xmm0,xmm6
- paddw xmm7,xmm8
- movdqa [rsp+70h],xmm1
- paddw xmm7,xmm5
- movdqa [rsp+120h],xmm0
- movdqa xmm0,[rcx+rbp]
- punpcklbw xmm0,xmm4
- paddw xmm7,xmm1
- movdqa xmm4,xmm15
- psllw xmm0,1
- psraw xmm7,3
- paddw xmm0,xmm6
- pand xmm7,xmm15
- paddw xmm0,xmm6
- paddw xmm0,xmm6
- paddw xmm0,xmm14
- movdqa xmm6,xmm15
- paddw xmm0,xmm5
- pandn xmm6,xmm13
- paddw xmm0,xmm8
- paddw xmm0,xmm1
- psraw xmm0,3
- movdqa xmm1,xmm12
- paddw xmm1,xmm13
- pand xmm0,xmm3
- movdqa [rsp+100h],xmm0
- movdqa xmm0,xmm8
- paddw xmm0,xmm5
- paddw xmm1,xmm0
- movdqa xmm0,xmm3
- paddw xmm1,xmm2
- psraw xmm1,2
- pandn xmm0,xmm14
- pand xmm4,xmm1
- movdqa [rsp+0E0h],xmm0
- movdqa xmm0,xmm5
- paddw xmm0,xmm8
- movdqa xmm1,[rsp+60h]
- paddw xmm1,xmm14
- movdqa xmm14,xmm3
- paddw xmm1,xmm0
- movdqa xmm0,xmm8
- paddw xmm0,[rsp+30h]
- paddw xmm1,xmm2
- psraw xmm1,2
- pand xmm14,xmm1
- movdqa xmm1,xmm13
- paddw xmm1,xmm13
- paddw xmm1,xmm0
- paddw xmm1,xmm2
- psraw xmm1,2
- movdqa xmm0,[rsp+30h]
- movdqa xmm2,xmm13
- movdqa xmm5,xmm15
- paddw xmm0,[rsp+70h]
- pandn xmm5,xmm1
- paddw xmm2,xmm8
- movdqa xmm8,[rsp+90h]
- movdqa xmm1,xmm12
- paddw xmm2,xmm8
- psllw xmm2,1
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,xmm8
- movdqa xmm8,xmm3
- movdqa xmm2,[rsp+30h]
- paddw xmm0,xmm13
- psraw xmm1,3
- pand xmm15,xmm1
- movdqa xmm1,xmm2
- paddw xmm1,xmm2
- paddw xmm2,[rsp+90h]
- paddw xmm2,[rsp+0B0h]
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- movdqa xmm13,[r8]
- paddw xmm0, [rsp+70h]
- paddw xmm1, [rsp+0A0h]
- psllw xmm2,1
- paddw xmm2,xmm0
- psraw xmm1,2
- movdqa xmm0, [rdi]
- pandn xmm8,xmm1
- movdqa xmm1, [rsp+60h]
- paddw xmm1,xmm2
- movdqa xmm2, [rbx]
- psraw xmm1,3
- pand xmm3,xmm1
- movdqa xmm1, [rbp]
- movdqa [rsp+0D0h],xmm3
- pxor xmm3,xmm3
- punpckhbw xmm0,xmm3
- punpckhbw xmm1,xmm3
- punpckhbw xmm13,xmm3
- movdqa [rsp+0C0h],xmm0
- movdqa xmm0,[r10+rbp]
- movdqa [rsp],xmm1
- punpckhbw xmm0,xmm3
- punpckhbw xmm2,xmm3
- movdqa [rsp+80h],xmm0
- movdqa xmm0,[rsi+rbp]
- movdqa [rsp+10h],xmm13
- punpckhbw xmm0,xmm3
- movdqa [rsp+50h],xmm0
- movdqa xmm0,xmm1
- movdqa xmm1,xmm13
- psubw xmm0,xmm13
- psubw xmm1,xmm2
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,[rsp]
- movdqa xmm13,[rsp+40h]
- movdqa [rsp+110h],xmm2
- psubw xmm1, [rsp+80h]
- pcmpgtw xmm13,xmm0
- pcmpgtw xmm11,xmm3
- pabsw xmm0,xmm1
- pcmpgtw xmm10,xmm3
- movdqa xmm1, [rsp+40h]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1
- pcmpgtw xmm2,xmm0
- movdqa xmm0, [rsp+10h]
- pand xmm13,xmm2
- pand xmm13,xmm11
- movdqa xmm11,[rsp+0C0h]
- psubw xmm0,xmm11
- pabsw xmm0,xmm0
- pcmpgtw xmm3,xmm0
- pand xmm3,xmm10
- movdqa xmm0,[rsp]
- psubw xmm0,[rsp+50h]
- movdqa xmm2,[rdx]
- pabsw xmm0,xmm0
- por xmm7,xmm9
- movdqa xmm9,[rsp+20h]
- pcmpgtw xmm1,xmm0
- pand xmm9,xmm7
- movdqa xmm7,[rsp+20h]
- movdqa xmm0,xmm7
- pandn xmm0,xmm12
- movdqa xmm12,[rsp+110h]
- pand xmm1,xmm10
- movdqa xmm10,[rsp+70h]
- movdqa [rsp+40h],xmm1
- movdqa xmm1,xmm13
- por xmm9,xmm0
- pxor xmm0,xmm0
- por xmm4,xmm6
- movdqa xmm6,xmm7
- punpckhbw xmm2,xmm0
- por xmm15,xmm5
- movdqa xmm5,[rsp+20h]
- movdqa xmm0,xmm3
- psllw xmm2,1
- pandn xmm0,xmm11
- pand xmm6,xmm4
- movdqa xmm4,[rsp]
- paddw xmm2,xmm11
- pand xmm5,xmm15
- movdqa xmm15,[rsp+20h]
- paddw xmm2,xmm11
- paddw xmm2,xmm11
- paddw xmm2,xmm12
- paddw xmm2,[rsp+10h]
- paddw xmm2,[rsp]
- paddw xmm2,xmm10
- psraw xmm2,3
- pand xmm2,xmm3
- por xmm2,xmm0
- pand xmm1,xmm2
- movdqa xmm0,xmm13
- movdqa xmm2,xmm11
- pandn xmm0,xmm11
- paddw xmm2,xmm12
- por xmm1,xmm0
- packuswb xmm9,xmm1
- movdqa xmm0,xmm7
- movdqa xmm7,[rsp+0A0h]
- pandn xmm0,[rsp+0F0h]
- movdqa xmm1,xmm3
- por xmm6,xmm0
- movdqa xmm0,[rsp+10h]
- paddw xmm0,xmm4
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm3
- pandn xmm0,xmm12
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- pandn xmm0,xmm12
- movdqa xmm1,xmm12
- paddw xmm1,[rsp+10h]
- por xmm2,xmm0
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+0B0h]
- paddw xmm1,xmm4
- packuswb xmm6,xmm2
- movdqa xmm2,xmm3
- psllw xmm1,1
- por xmm5,xmm0
- movdqa xmm0,[rsp+80h]
- paddw xmm0,xmm10
- paddw xmm1,xmm0
- paddw xmm11,xmm1
- psraw xmm11,3
- movdqa xmm1,xmm12
- pand xmm2,xmm11
- paddw xmm1,xmm12
- movdqa xmm11,[rsp+80h]
- movdqa xmm0, [rsp+10h]
- por xmm14,[rsp+0E0h]
- paddw xmm0,xmm11
- movdqa xmm4,xmm15
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- paddw xmm1,xmm7
- psraw xmm1,2
- pandn xmm3,xmm1
- por xmm2,xmm3
- movdqa xmm1,xmm13
- movdqa xmm3,[rsp+10h]
- pandn xmm0,xmm3
- pand xmm1,xmm2
- movdqa xmm2,xmm11
- paddw xmm2,[rsp]
- por xmm1,xmm0
- movdqa xmm0,[rsp+0D0h]
- por xmm0,xmm8
- paddw xmm2,xmm3
- packuswb xmm5,xmm1
- movdqa xmm8,[rsp+40h]
- movdqa xmm1,[rsp+50h]
- movdqa xmm3,xmm8
- pand xmm4,xmm0
- psllw xmm2,1
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+90h]
- por xmm4,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm10
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,[rsp]
- movdqa xmm2,xmm11
- paddw xmm0,xmm12
- movdqa xmm12,[rsp]
- paddw xmm2,xmm11
- paddw xmm2,xmm0
- psraw xmm1,3
- movdqa xmm0,xmm8
- pand xmm3,xmm1
- paddw xmm2,xmm7
- movdqa xmm1,xmm13
- psraw xmm2,2
- pandn xmm0,xmm2
- por xmm3,xmm0
- movdqa xmm2,[rsp+50h]
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm3
- paddw xmm2,xmm11
- movdqa xmm3,xmm15
- por xmm1,xmm0
- pand xmm3,xmm14
- movdqa xmm14,[rsp+10h]
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+30h]
- packuswb xmm4,xmm1
- movdqa xmm1,xmm8
- por xmm3,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm14
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm8
- pandn xmm0,xmm11
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm11
- pand xmm2,xmm1
- movdqa xmm1,xmm15
- por xmm2,xmm0
- packuswb xmm3,xmm2
- movdqa xmm0,[rsp+100h]
- por xmm0,[rsp+120h]
- pand xmm1,xmm0
- movdqa xmm2,[rcx+rbp]
- movdqa xmm7,[rsp+50h]
- pandn xmm15,[rsp+60h]
- lea r11,[rsp+1D8h]
- pxor xmm0,xmm0
- por xmm1,xmm15
- movaps xmm15,[r11-0A8h]
- movdqa [rdi],xmm9
- movaps xmm9,[r11-48h]
- punpckhbw xmm2,xmm0
- psllw xmm2,1
- paddw xmm2,xmm7
- paddw xmm2,xmm7
- movdqa [rbx],xmm6
- movaps xmm6,[r11-18h]
- paddw xmm2,xmm7
- paddw xmm2,xmm11
- movaps xmm11,[r11-68h]
- paddw xmm2,xmm12
- movaps xmm12,[r11-78h]
- paddw xmm2,xmm14
- paddw xmm2,xmm10
- psraw xmm2,3
- movaps xmm10,[r11-58h]
- movaps xmm14,[r11-98h]
- movdqa xmm0,xmm13
- pand xmm2,xmm8
- pandn xmm8,xmm7
- pandn xmm13,xmm7
- por xmm2,xmm8
- movaps xmm7,[r11-28h]
- movaps xmm8,[r11-38h]
- movdqa [r8],xmm5
- pand xmm0,xmm2
- por xmm0,xmm13
- packuswb xmm1,xmm0
- movaps xmm13,[r11-88h]
- movdqa [rbp],xmm4
- movdqa [r10+rbp],xmm3
- movdqa [rsi+rbp],xmm1
- mov rsp,r11
- pop rdi
- pop rsi
- pop rbp
- pop rbx
- ret
+ mov rax,rsp
+ push rbx
+ push rbp
+ push rsi
+ push rdi
+ sub rsp,1D8h
+ movaps [rax-38h],xmm6
+ movaps [rax-48h],xmm7
+ movaps [rax-58h],xmm8
+ pxor xmm1,xmm1
+ movsxd r10,edx
+ mov rbp,rcx
+ mov r11d,r8d
+ mov rdx,rcx
+ mov rdi,rbp
+ mov rbx,rbp
+ movdqa xmm5,[rbp]
+ movaps [rax-68h],xmm9
+ movaps [rax-78h],xmm10
+ punpcklbw xmm5,xmm1
+ movaps [rax-88h],xmm11
+ movaps [rax-98h],xmm12
+ movaps [rax-0A8h],xmm13
+ movaps [rax-0B8h],xmm14
+ movdqa xmm14,[r10+rbp]
+ movaps [rax-0C8h],xmm15
+ lea eax,[r10*4]
+ movsxd r8,eax
+ lea eax,[r10+r10*2]
+ movsxd rcx,eax
+ lea eax,[r10+r10]
+ sub rdx,r8
+ punpcklbw xmm14,xmm1
+ movdqa [rsp+90h],xmm5
+ movdqa [rsp+30h],xmm14
+ movsxd rsi,eax
+ movsx eax,r11w
+ sub rdi,rcx
+ sub rbx,rsi
+ mov r8,rbp
+ sub r8,r10
+ movd xmm0,eax
+ movsx eax,r9w
+ movdqa xmm12,[rdi]
+ movdqa xmm6, [rsi+rbp]
+ movdqa xmm13,[rbx]
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm6,xmm1
+ movdqa xmm8,[r8]
+ movd xmm0,eax
+ movdqa xmm10,xmm11
+ mov eax,2
+ punpcklbw xmm8,xmm1
+ punpcklbw xmm12,xmm1
+ cwde
+ punpcklwd xmm0,xmm0
+ psraw xmm10,2
+ movdqa xmm1,xmm8
+ movdqa [rsp+0F0h],xmm13
+ movdqa [rsp+0B0h],xmm8
+ pshufd xmm7,xmm0,0
+ psubw xmm1,xmm13
+ movdqa xmm0,xmm5
+ movdqa xmm4,xmm7
+ movdqa xmm2,xmm7
+ psubw xmm0,xmm8
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm5
+ movdqa [rsp+40h],xmm7
+ movdqa [rsp+60h],xmm6
+ pcmpgtw xmm4,xmm0
+ psubw xmm1,xmm14
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm4,xmm2
+ movdqa xmm0,xmm11
+ pcmpgtw xmm0,xmm3
+ pand xmm4,xmm0
+ movd xmm0,eax
+ movdqa [rsp+20h],xmm4
+ punpcklwd xmm0,xmm0
+ pshufd xmm2,xmm0,0
+ paddw xmm10,xmm2
+ movdqa [rsp+0A0h],xmm2
+ movdqa xmm15,xmm7
+ pxor xmm4,xmm4
+ movdqa xmm0,xmm8
+ psubw xmm0,xmm12
+ mov eax,4
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm10
+ cwde
+ pcmpgtw xmm15,xmm0
+ pcmpgtw xmm1,xmm3
+ movdqa xmm3,xmm7
+ movdqa xmm7,[rdx]
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm6
+ pand xmm15,xmm1
+ punpcklbw xmm7,xmm4
+ movdqa xmm9,xmm15
+ pabsw xmm0,xmm0
+ psllw xmm7,1
+ pandn xmm9,xmm12
+ pcmpgtw xmm3,xmm0
+ paddw xmm7,xmm12
+ movd xmm0,eax
+ pand xmm3,xmm1
+ paddw xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ paddw xmm7,xmm12
+ pshufd xmm1,xmm0,0
+ paddw xmm7,xmm13
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm6
+ paddw xmm7,xmm8
+ movdqa [rsp+70h],xmm1
+ paddw xmm7,xmm5
+ movdqa [rsp+120h],xmm0
+ movdqa xmm0,[rcx+rbp]
+ punpcklbw xmm0,xmm4
+ paddw xmm7,xmm1
+ movdqa xmm4,xmm15
+ psllw xmm0,1
+ psraw xmm7,3
+ paddw xmm0,xmm6
+ pand xmm7,xmm15
+ paddw xmm0,xmm6
+ paddw xmm0,xmm6
+ paddw xmm0,xmm14
+ movdqa xmm6,xmm15
+ paddw xmm0,xmm5
+ pandn xmm6,xmm13
+ paddw xmm0,xmm8
+ paddw xmm0,xmm1
+ psraw xmm0,3
+ movdqa xmm1,xmm12
+ paddw xmm1,xmm13
+ pand xmm0,xmm3
+ movdqa [rsp+100h],xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,xmm5
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm3
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pandn xmm0,xmm14
+ pand xmm4,xmm1
+ movdqa [rsp+0E0h],xmm0
+ movdqa xmm0,xmm5
+ paddw xmm0,xmm8
+ movdqa xmm1,[rsp+60h]
+ paddw xmm1,xmm14
+ movdqa xmm14,xmm3
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,[rsp+30h]
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pand xmm14,xmm1
+ movdqa xmm1,xmm13
+ paddw xmm1,xmm13
+ paddw xmm1,xmm0
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ movdqa xmm0,[rsp+30h]
+ movdqa xmm2,xmm13
+ movdqa xmm5,xmm15
+ paddw xmm0,[rsp+70h]
+ pandn xmm5,xmm1
+ paddw xmm2,xmm8
+ movdqa xmm8,[rsp+90h]
+ movdqa xmm1,xmm12
+ paddw xmm2,xmm8
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,xmm8
+ movdqa xmm8,xmm3
+ movdqa xmm2,[rsp+30h]
+ paddw xmm0,xmm13
+ psraw xmm1,3
+ pand xmm15,xmm1
+ movdqa xmm1,xmm2
+ paddw xmm1,xmm2
+ paddw xmm2,[rsp+90h]
+ paddw xmm2,[rsp+0B0h]
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ movdqa xmm13,[r8]
+ paddw xmm0, [rsp+70h]
+ paddw xmm1, [rsp+0A0h]
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ psraw xmm1,2
+ movdqa xmm0, [rdi]
+ pandn xmm8,xmm1
+ movdqa xmm1, [rsp+60h]
+ paddw xmm1,xmm2
+ movdqa xmm2, [rbx]
+ psraw xmm1,3
+ pand xmm3,xmm1
+ movdqa xmm1, [rbp]
+ movdqa [rsp+0D0h],xmm3
+ pxor xmm3,xmm3
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm1,xmm3
+ punpckhbw xmm13,xmm3
+ movdqa [rsp+0C0h],xmm0
+ movdqa xmm0,[r10+rbp]
+ movdqa [rsp],xmm1
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm2,xmm3
+ movdqa [rsp+80h],xmm0
+ movdqa xmm0,[rsi+rbp]
+ movdqa [rsp+10h],xmm13
+ punpckhbw xmm0,xmm3
+ movdqa [rsp+50h],xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm1,xmm13
+ psubw xmm0,xmm13
+ psubw xmm1,xmm2
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,[rsp]
+ movdqa xmm13,[rsp+40h]
+ movdqa [rsp+110h],xmm2
+ psubw xmm1, [rsp+80h]
+ pcmpgtw xmm13,xmm0
+ pcmpgtw xmm11,xmm3
+ pabsw xmm0,xmm1
+ pcmpgtw xmm10,xmm3
+ movdqa xmm1, [rsp+40h]
+ movdqa xmm2,xmm1
+ movdqa xmm3,xmm1
+ pcmpgtw xmm2,xmm0
+ movdqa xmm0, [rsp+10h]
+ pand xmm13,xmm2
+ pand xmm13,xmm11
+ movdqa xmm11,[rsp+0C0h]
+ psubw xmm0,xmm11
+ pabsw xmm0,xmm0
+ pcmpgtw xmm3,xmm0
+ pand xmm3,xmm10
+ movdqa xmm0,[rsp]
+ psubw xmm0,[rsp+50h]
+ movdqa xmm2,[rdx]
+ pabsw xmm0,xmm0
+ por xmm7,xmm9
+ movdqa xmm9,[rsp+20h]
+ pcmpgtw xmm1,xmm0
+ pand xmm9,xmm7
+ movdqa xmm7,[rsp+20h]
+ movdqa xmm0,xmm7
+ pandn xmm0,xmm12
+ movdqa xmm12,[rsp+110h]
+ pand xmm1,xmm10
+ movdqa xmm10,[rsp+70h]
+ movdqa [rsp+40h],xmm1
+ movdqa xmm1,xmm13
+ por xmm9,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm6
+ movdqa xmm6,xmm7
+ punpckhbw xmm2,xmm0
+ por xmm15,xmm5
+ movdqa xmm5,[rsp+20h]
+ movdqa xmm0,xmm3
+ psllw xmm2,1
+ pandn xmm0,xmm11
+ pand xmm6,xmm4
+ movdqa xmm4,[rsp]
+ paddw xmm2,xmm11
+ pand xmm5,xmm15
+ movdqa xmm15,[rsp+20h]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm11
+ paddw xmm2,xmm12
+ paddw xmm2,[rsp+10h]
+ paddw xmm2,[rsp]
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ pand xmm2,xmm3
+ por xmm2,xmm0
+ pand xmm1,xmm2
+ movdqa xmm0,xmm13
+ movdqa xmm2,xmm11
+ pandn xmm0,xmm11
+ paddw xmm2,xmm12
+ por xmm1,xmm0
+ packuswb xmm9,xmm1
+ movdqa xmm0,xmm7
+ movdqa xmm7,[rsp+0A0h]
+ pandn xmm0,[rsp+0F0h]
+ movdqa xmm1,xmm3
+ por xmm6,xmm0
+ movdqa xmm0,[rsp+10h]
+ paddw xmm0,xmm4
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm12
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ pandn xmm0,xmm12
+ movdqa xmm1,xmm12
+ paddw xmm1,[rsp+10h]
+ por xmm2,xmm0
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+0B0h]
+ paddw xmm1,xmm4
+ packuswb xmm6,xmm2
+ movdqa xmm2,xmm3
+ psllw xmm1,1
+ por xmm5,xmm0
+ movdqa xmm0,[rsp+80h]
+ paddw xmm0,xmm10
+ paddw xmm1,xmm0
+ paddw xmm11,xmm1
+ psraw xmm11,3
+ movdqa xmm1,xmm12
+ pand xmm2,xmm11
+ paddw xmm1,xmm12
+ movdqa xmm11,[rsp+80h]
+ movdqa xmm0, [rsp+10h]
+ por xmm14,[rsp+0E0h]
+ paddw xmm0,xmm11
+ movdqa xmm4,xmm15
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ paddw xmm1,xmm7
+ psraw xmm1,2
+ pandn xmm3,xmm1
+ por xmm2,xmm3
+ movdqa xmm1,xmm13
+ movdqa xmm3,[rsp+10h]
+ pandn xmm0,xmm3
+ pand xmm1,xmm2
+ movdqa xmm2,xmm11
+ paddw xmm2,[rsp]
+ por xmm1,xmm0
+ movdqa xmm0,[rsp+0D0h]
+ por xmm0,xmm8
+ paddw xmm2,xmm3
+ packuswb xmm5,xmm1
+ movdqa xmm8,[rsp+40h]
+ movdqa xmm1,[rsp+50h]
+ movdqa xmm3,xmm8
+ pand xmm4,xmm0
+ psllw xmm2,1
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+90h]
+ por xmm4,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm10
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,[rsp]
+ movdqa xmm2,xmm11
+ paddw xmm0,xmm12
+ movdqa xmm12,[rsp]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm0
+ psraw xmm1,3
+ movdqa xmm0,xmm8
+ pand xmm3,xmm1
+ paddw xmm2,xmm7
+ movdqa xmm1,xmm13
+ psraw xmm2,2
+ pandn xmm0,xmm2
+ por xmm3,xmm0
+ movdqa xmm2,[rsp+50h]
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm3
+ paddw xmm2,xmm11
+ movdqa xmm3,xmm15
+ por xmm1,xmm0
+ pand xmm3,xmm14
+ movdqa xmm14,[rsp+10h]
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+30h]
+ packuswb xmm4,xmm1
+ movdqa xmm1,xmm8
+ por xmm3,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm14
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm8
+ pandn xmm0,xmm11
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm11
+ pand xmm2,xmm1
+ movdqa xmm1,xmm15
+ por xmm2,xmm0
+ packuswb xmm3,xmm2
+ movdqa xmm0,[rsp+100h]
+ por xmm0,[rsp+120h]
+ pand xmm1,xmm0
+ movdqa xmm2,[rcx+rbp]
+ movdqa xmm7,[rsp+50h]
+ pandn xmm15,[rsp+60h]
+ lea r11,[rsp+1D8h]
+ pxor xmm0,xmm0
+ por xmm1,xmm15
+ movaps xmm15,[r11-0A8h]
+ movdqa [rdi],xmm9
+ movaps xmm9,[r11-48h]
+ punpckhbw xmm2,xmm0
+ psllw xmm2,1
+ paddw xmm2,xmm7
+ paddw xmm2,xmm7
+ movdqa [rbx],xmm6
+ movaps xmm6,[r11-18h]
+ paddw xmm2,xmm7
+ paddw xmm2,xmm11
+ movaps xmm11,[r11-68h]
+ paddw xmm2,xmm12
+ movaps xmm12,[r11-78h]
+ paddw xmm2,xmm14
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ movaps xmm10,[r11-58h]
+ movaps xmm14,[r11-98h]
+ movdqa xmm0,xmm13
+ pand xmm2,xmm8
+ pandn xmm8,xmm7
+ pandn xmm13,xmm7
+ por xmm2,xmm8
+ movaps xmm7,[r11-28h]
+ movaps xmm8,[r11-38h]
+ movdqa [r8],xmm5
+ pand xmm0,xmm2
+ por xmm0,xmm13
+ packuswb xmm1,xmm0
+ movaps xmm13,[r11-88h]
+ movdqa [rbp],xmm4
+ movdqa [r10+rbp],xmm3
+ movdqa [rsi+rbp],xmm1
+ mov rsp,r11
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
+ ret
WELS_EXTERN DeblockChromaLt4V_ssse3
- mov rax,rsp
- push rbx
- push rdi
- PUSH_XMM 16
- sub rsp,0C8h
- mov r10,qword [rax + 30h] ; pTC
- pxor xmm1,xmm1
- mov rbx,rcx
- movsxd r11,r8d
- movsx ecx,byte [r10]
- movsx r8d,byte [r10+2]
- mov rdi,rdx
- movq xmm2,[rbx]
- movq xmm9,[r11+rbx]
- movsx edx,byte [r10+1]
- mov word [rsp+2],cx
- mov word [rsp],cx
- movsx eax,byte [r10+3]
- mov word [rsp+6],dx
- mov word [rsp+4],dx
- movdqa xmm11,xmm1
- mov word [rsp+0Eh],ax
- mov word [rsp+0Ch],ax
- lea eax,[r11+r11]
- movsxd rcx,eax
- mov rax,rbx
- mov rdx,rdi
- sub rax,rcx
- mov word [rsp+0Ah],r8w
- mov word [rsp+8],r8w
- movdqa xmm6,[rsp]
- movdqa xmm7,xmm6
- movq xmm13, [rax]
- mov rax,rdi
- sub rax,rcx
- mov rcx,rbx
- pcmpgtw xmm7,xmm1
- psubw xmm11,xmm6
- sub rcx,r11
- sub rdx,r11
- movq xmm0,[rax]
- movsx eax,r9w
- movq xmm15,[rcx]
- punpcklqdq xmm13,xmm0
- movq xmm0, [rdx]
- movdqa xmm4,xmm13
- punpcklqdq xmm15,xmm0
- movq xmm0, [rdi]
- punpcklbw xmm4,xmm1
- movdqa xmm12,xmm15
- punpcklqdq xmm2,xmm0
- movq xmm0, [r11+rdi]
- punpcklbw xmm12,xmm1
- movdqa xmm14,xmm2
- punpcklqdq xmm9,xmm0
- punpckhbw xmm2,xmm1
- punpcklbw xmm14,xmm1
- movd xmm0,eax
- movsx eax,word [rsp + 0C8h + 38h + 160] ; iBeta
- punpckhbw xmm13,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm9
- movdqa [rsp+10h],xmm2
- punpcklwd xmm0,xmm0
- punpckhbw xmm9,xmm1
- punpcklbw xmm3,xmm1
- movdqa xmm1,xmm14
- pshufd xmm10,xmm0,0
- movd xmm0,eax
- mov eax,4
- cwde
- punpcklwd xmm0,xmm0
- pshufd xmm8,xmm0,0
- movd xmm0,eax
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- psubw xmm1,xmm12
- movdqa xmm2,xmm10
- lea r11,[rsp+0C8h]
- psllw xmm1,2
- movdqa xmm0,xmm4
- psubw xmm4,xmm12
- psubw xmm0,xmm3
- psubw xmm3,xmm14
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm11
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm12
- psubw xmm0,xmm14
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- movdqa xmm3,[rsp]
- pand xmm2,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- psubw xmm0,xmm9
- psubw xmm13,xmm15
- pand xmm2,xmm7
- pand xmm6,xmm2
- paddw xmm12,xmm6
- psubw xmm14,xmm6
- movdqa xmm2,[rsp+10h]
- movaps xmm6,[r11-18h]
- movdqa xmm1,xmm2
- psubw xmm1,xmm15
- psubw xmm9,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm15
- psubw xmm0,xmm2
- psraw xmm1,3
- pmaxsw xmm11,xmm1
- pabsw xmm0,xmm0
- movdqa xmm1,xmm8
- pcmpgtw xmm10,xmm0
- pabsw xmm0,xmm13
- pminsw xmm3,xmm11
- movaps xmm11,[r11-68h]
- movaps xmm13,[rsp+40h]
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm9
- movaps xmm9, [r11-48h]
- pand xmm10,xmm1
- pcmpgtw xmm8,xmm0
- pand xmm10,xmm8
- pand xmm10,xmm7
- movaps xmm8,[r11-38h]
- movaps xmm7,[r11-28h]
- pand xmm3,xmm10
- paddw xmm15,xmm3
- psubw xmm2,xmm3
- movaps xmm10,[r11-58h]
- packuswb xmm12,xmm15
- movaps xmm15,[rsp+20h]
- packuswb xmm14,xmm2
- movq [rcx],xmm12
- movq [rbx],xmm14
- psrldq xmm12,8
- psrldq xmm14,8
- movq [rdx],xmm12
- movaps xmm12,[r11-78h]
- movq [rdi],xmm14
- movaps xmm14,[rsp+30h]
- mov rsp,r11
- POP_XMM
- pop rdi
- pop rbx
- ret
+ mov rax,rsp
+ push rbx
+ push rdi
+ PUSH_XMM 16
+ sub rsp,0C8h
+ mov r10,qword [rax + 30h] ; pTC
+ pxor xmm1,xmm1
+ mov rbx,rcx
+ movsxd r11,r8d
+ movsx ecx,byte [r10]
+ movsx r8d,byte [r10+2]
+ mov rdi,rdx
+ movq xmm2,[rbx]
+ movq xmm9,[r11+rbx]
+ movsx edx,byte [r10+1]
+ mov word [rsp+2],cx
+ mov word [rsp],cx
+ movsx eax,byte [r10+3]
+ mov word [rsp+6],dx
+ mov word [rsp+4],dx
+ movdqa xmm11,xmm1
+ mov word [rsp+0Eh],ax
+ mov word [rsp+0Ch],ax
+ lea eax,[r11+r11]
+ movsxd rcx,eax
+ mov rax,rbx
+ mov rdx,rdi
+ sub rax,rcx
+ mov word [rsp+0Ah],r8w
+ mov word [rsp+8],r8w
+ movdqa xmm6,[rsp]
+ movdqa xmm7,xmm6
+ movq xmm13, [rax]
+ mov rax,rdi
+ sub rax,rcx
+ mov rcx,rbx
+ pcmpgtw xmm7,xmm1
+ psubw xmm11,xmm6
+ sub rcx,r11
+ sub rdx,r11
+ movq xmm0,[rax]
+ movsx eax,r9w
+ movq xmm15,[rcx]
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rdx]
+ movdqa xmm4,xmm13
+ punpcklqdq xmm15,xmm0
+ movq xmm0, [rdi]
+ punpcklbw xmm4,xmm1
+ movdqa xmm12,xmm15
+ punpcklqdq xmm2,xmm0
+ movq xmm0, [r11+rdi]
+ punpcklbw xmm12,xmm1
+ movdqa xmm14,xmm2
+ punpcklqdq xmm9,xmm0
+ punpckhbw xmm2,xmm1
+ punpcklbw xmm14,xmm1
+ movd xmm0,eax
+ movsx eax,word [rsp + 0C8h + 38h + 160] ; iBeta
+ punpckhbw xmm13,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm9
+ movdqa [rsp+10h],xmm2
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm9,xmm1
+ punpcklbw xmm3,xmm1
+ movdqa xmm1,xmm14
+ pshufd xmm10,xmm0,0
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ punpcklwd xmm0,xmm0
+ pshufd xmm8,xmm0,0
+ movd xmm0,eax
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ psubw xmm1,xmm12
+ movdqa xmm2,xmm10
+ lea r11,[rsp+0C8h]
+ psllw xmm1,2
+ movdqa xmm0,xmm4
+ psubw xmm4,xmm12
+ psubw xmm0,xmm3
+ psubw xmm3,xmm14
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm11
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm12
+ psubw xmm0,xmm14
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ movdqa xmm3,[rsp]
+ pand xmm2,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ psubw xmm0,xmm9
+ psubw xmm13,xmm15
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ paddw xmm12,xmm6
+ psubw xmm14,xmm6
+ movdqa xmm2,[rsp+10h]
+ movaps xmm6,[r11-18h]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm15
+ psubw xmm9,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm15
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ pmaxsw xmm11,xmm1
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm8
+ pcmpgtw xmm10,xmm0
+ pabsw xmm0,xmm13
+ pminsw xmm3,xmm11
+ movaps xmm11,[r11-68h]
+ movaps xmm13,[rsp+40h]
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm9
+ movaps xmm9, [r11-48h]
+ pand xmm10,xmm1
+ pcmpgtw xmm8,xmm0
+ pand xmm10,xmm8
+ pand xmm10,xmm7
+ movaps xmm8,[r11-38h]
+ movaps xmm7,[r11-28h]
+ pand xmm3,xmm10
+ paddw xmm15,xmm3
+ psubw xmm2,xmm3
+ movaps xmm10,[r11-58h]
+ packuswb xmm12,xmm15
+ movaps xmm15,[rsp+20h]
+ packuswb xmm14,xmm2
+ movq [rcx],xmm12
+ movq [rbx],xmm14
+ psrldq xmm12,8
+ psrldq xmm14,8
+ movq [rdx],xmm12
+ movaps xmm12,[r11-78h]
+ movq [rdi],xmm14
+ movaps xmm14,[rsp+30h]
+ mov rsp,r11
+ POP_XMM
+ pop rdi
+ pop rbx
+ ret
WELS_EXTERN DeblockChromaEq4V_ssse3
- mov rax,rsp
- push rbx
- PUSH_XMM 15
- sub rsp,90h
- pxor xmm1,xmm1
- mov r11,rcx
- mov rbx,rdx
- mov r10d,r9d
- movq xmm13,[r11]
- lea eax,[r8+r8]
- movsxd r9,eax
- mov rax,rcx
- sub rax,r9
- movq xmm14,[rax]
- mov rax,rdx
- sub rax,r9
- movq xmm0,[rax]
- movsxd rax,r8d
- sub rcx,rax
- sub rdx,rax
- movq xmm12,[rax+r11]
- movq xmm10,[rcx]
- punpcklqdq xmm14,xmm0
- movdqa xmm8,xmm14
- movq xmm0,[rdx]
- punpcklbw xmm8,xmm1
- punpckhbw xmm14,xmm1
- punpcklqdq xmm10,xmm0
- movq xmm0,[rbx]
- movdqa xmm5,xmm10
- punpcklqdq xmm13,xmm0
- movq xmm0, [rax+rbx]
- punpcklbw xmm5,xmm1
- movsx eax,r10w
- movdqa xmm9,xmm13
- punpcklqdq xmm12,xmm0
- punpcklbw xmm9,xmm1
- punpckhbw xmm10,xmm1
- movd xmm0,eax
- movsx eax,word [rsp + 90h + 8h + 28h + 144] ; iBeta
- punpckhbw xmm13,xmm1
- movdqa xmm7,xmm12
- punpcklwd xmm0,xmm0
- punpckhbw xmm12,xmm1
- pshufd xmm11,xmm0,0
- punpcklbw xmm7,xmm1
- movd xmm0,eax
- movdqa xmm1,xmm8
- psubw xmm1,xmm5
- punpcklwd xmm0,xmm0
- movdqa xmm6,xmm11
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm5
- psubw xmm0,xmm9
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm10
- movdqa xmm1,xmm14
- psubw xmm0,xmm13
- psubw xmm1,xmm10
- pabsw xmm0,xmm0
- pcmpgtw xmm11,xmm0
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm11,xmm2
- movdqa xmm0,xmm12
- movdqa xmm4,xmm6
- movdqa xmm1,xmm8
- mov eax,2
- cwde
- paddw xmm1,xmm8
- psubw xmm0,xmm13
- paddw xmm1,xmm5
- pabsw xmm0,xmm0
- movdqa xmm2,xmm14
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm14
- movd xmm0,eax
- pand xmm11,xmm3
- paddw xmm7,xmm7
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- paddw xmm2,xmm12
- paddw xmm12,xmm12
- pshufd xmm3,xmm0,0
- paddw xmm7,xmm9
- paddw xmm12,xmm13
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm5
- paddw xmm7,xmm8
- psraw xmm1,2
- paddw xmm12,xmm14
- paddw xmm7,xmm3
- movaps xmm14,[rsp]
- pand xmm4,xmm1
- paddw xmm12,xmm3
- psraw xmm7,2
- movdqa xmm1,xmm11
- por xmm4,xmm0
- psraw xmm12,2
- paddw xmm2,xmm3
- movdqa xmm0,xmm11
- pandn xmm0,xmm10
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- packuswb xmm4,xmm1
- movdqa xmm0,xmm11
- movdqa xmm1,xmm6
- pand xmm1,xmm7
- movaps xmm7,[rsp+70h]
- movq [rcx],xmm4
- pandn xmm6,xmm9
- pandn xmm11,xmm13
- pand xmm0,xmm12
- por xmm1,xmm6
- por xmm0,xmm11
- psrldq xmm4,8
- packuswb xmm1,xmm0
- movq [r11],xmm1
- psrldq xmm1,8
- movq [rdx],xmm4
- lea r11,[rsp+90h]
- movaps xmm6,[r11-10h]
- movaps xmm8,[r11-30h]
- movaps xmm9,[r11-40h]
- movq [rbx],xmm1
- movaps xmm10,[r11-50h]
- movaps xmm11,[r11-60h]
- movaps xmm12,[r11-70h]
- movaps xmm13,[r11-80h]
- mov rsp,r11
- POP_XMM
- pop rbx
- ret
+ mov rax,rsp
+ push rbx
+ PUSH_XMM 15
+ sub rsp,90h
+ pxor xmm1,xmm1
+ mov r11,rcx
+ mov rbx,rdx
+ mov r10d,r9d
+ movq xmm13,[r11]
+ lea eax,[r8+r8]
+ movsxd r9,eax
+ mov rax,rcx
+ sub rax,r9
+ movq xmm14,[rax]
+ mov rax,rdx
+ sub rax,r9
+ movq xmm0,[rax]
+ movsxd rax,r8d
+ sub rcx,rax
+ sub rdx,rax
+ movq xmm12,[rax+r11]
+ movq xmm10,[rcx]
+ punpcklqdq xmm14,xmm0
+ movdqa xmm8,xmm14
+ movq xmm0,[rdx]
+ punpcklbw xmm8,xmm1
+ punpckhbw xmm14,xmm1
+ punpcklqdq xmm10,xmm0
+ movq xmm0,[rbx]
+ movdqa xmm5,xmm10
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rax+rbx]
+ punpcklbw xmm5,xmm1
+ movsx eax,r10w
+ movdqa xmm9,xmm13
+ punpcklqdq xmm12,xmm0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm10,xmm1
+ movd xmm0,eax
+ movsx eax,word [rsp + 90h + 8h + 28h + 144] ; iBeta
+ punpckhbw xmm13,xmm1
+ movdqa xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm12,xmm1
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm7,xmm1
+ movd xmm0,eax
+ movdqa xmm1,xmm8
+ psubw xmm1,xmm5
+ punpcklwd xmm0,xmm0
+ movdqa xmm6,xmm11
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm9
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm10
+ movdqa xmm1,xmm14
+ psubw xmm0,xmm13
+ psubw xmm1,xmm10
+ pabsw xmm0,xmm0
+ pcmpgtw xmm11,xmm0
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm11,xmm2
+ movdqa xmm0,xmm12
+ movdqa xmm4,xmm6
+ movdqa xmm1,xmm8
+ mov eax,2
+ cwde
+ paddw xmm1,xmm8
+ psubw xmm0,xmm13
+ paddw xmm1,xmm5
+ pabsw xmm0,xmm0
+ movdqa xmm2,xmm14
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm14
+ movd xmm0,eax
+ pand xmm11,xmm3
+ paddw xmm7,xmm7
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ paddw xmm2,xmm12
+ paddw xmm12,xmm12
+ pshufd xmm3,xmm0,0
+ paddw xmm7,xmm9
+ paddw xmm12,xmm13
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm5
+ paddw xmm7,xmm8
+ psraw xmm1,2
+ paddw xmm12,xmm14
+ paddw xmm7,xmm3
+ movaps xmm14,[rsp]
+ pand xmm4,xmm1
+ paddw xmm12,xmm3
+ psraw xmm7,2
+ movdqa xmm1,xmm11
+ por xmm4,xmm0
+ psraw xmm12,2
+ paddw xmm2,xmm3
+ movdqa xmm0,xmm11
+ pandn xmm0,xmm10
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ packuswb xmm4,xmm1
+ movdqa xmm0,xmm11
+ movdqa xmm1,xmm6
+ pand xmm1,xmm7
+ movaps xmm7,[rsp+70h]
+ movq [rcx],xmm4
+ pandn xmm6,xmm9
+ pandn xmm11,xmm13
+ pand xmm0,xmm12
+ por xmm1,xmm6
+ por xmm0,xmm11
+ psrldq xmm4,8
+ packuswb xmm1,xmm0
+ movq [r11],xmm1
+ psrldq xmm1,8
+ movq [rdx],xmm4
+ lea r11,[rsp+90h]
+ movaps xmm6,[r11-10h]
+ movaps xmm8,[r11-30h]
+ movaps xmm9,[r11-40h]
+ movq [rbx],xmm1
+ movaps xmm10,[r11-50h]
+ movaps xmm11,[r11-60h]
+ movaps xmm12,[r11-70h]
+ movaps xmm13,[r11-80h]
+ mov rsp,r11
+ POP_XMM
+ pop rbx
+ ret
@@ -1089,548 +1089,548 @@
WELS_EXTERN DeblockChromaEq4H_ssse3
- mov rax,rsp
- mov [rax+20h],rbx
- push rdi
- PUSH_XMM 16
- sub rsp,140h
- mov rdi,rdx
- lea eax,[r8*4]
- movsxd r10,eax
- mov eax,[rcx-2]
- mov [rsp+10h],eax
- lea rbx,[r10+rdx-2]
- lea r11,[r10+rcx-2]
- movdqa xmm5,[rsp+10h]
- movsxd r10,r8d
- mov eax,[r10+rcx-2]
- lea rdx,[r10+r10*2]
- mov [rsp+20h],eax
- mov eax,[rcx+r10*2-2]
- mov [rsp+30h],eax
- mov eax,[rdx+rcx-2]
- movdqa xmm2,[rsp+20h]
- mov [rsp+40h],eax
- mov eax, [rdi-2]
- movdqa xmm4,[rsp+30h]
- mov [rsp+50h],eax
- mov eax,[r10+rdi-2]
- movdqa xmm3,[rsp+40h]
- mov [rsp+60h],eax
- mov eax,[rdi+r10*2-2]
- punpckldq xmm5,[rsp+50h]
- mov [rsp+70h],eax
- mov eax, [rdx+rdi-2]
- punpckldq xmm2, [rsp+60h]
- mov [rsp+80h],eax
- mov eax,[r11]
- punpckldq xmm4, [rsp+70h]
- mov [rsp+50h],eax
- mov eax,[rbx]
- punpckldq xmm3,[rsp+80h]
- mov [rsp+60h],eax
- mov eax,[r10+r11]
- movdqa xmm0, [rsp+50h]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[r10+rbx]
- movdqa xmm0,[rsp+50h]
- movdqa xmm1,xmm5
- mov [rsp+60h],eax
- mov eax,[r11+r10*2]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[rbx+r10*2]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- mov eax, [rdx+r11]
- movdqa xmm15,xmm1
- punpckldq xmm0,[rsp+60h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax, [rdx+rbx]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm15,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm12,xmm15
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm12,xmm0
- punpckhdq xmm15,xmm0
- movdqa xmm0,xmm1
- movdqa xmm11,xmm12
- punpckldq xmm0,xmm5
- punpckhdq xmm1,xmm5
- punpcklqdq xmm11,xmm0
- punpckhqdq xmm12,xmm0
- movsx eax,r9w
- movdqa xmm14,xmm15
- punpcklqdq xmm14,xmm1
- punpckhqdq xmm15,xmm1
- pxor xmm1,xmm1
- movd xmm0,eax
- movdqa xmm4,xmm12
- movdqa xmm8,xmm11
- movsx eax,word [rsp+170h + 160] ; iBeta
- punpcklwd xmm0,xmm0
- punpcklbw xmm4,xmm1
- punpckhbw xmm12,xmm1
- movdqa xmm9,xmm14
- movdqa xmm7,xmm15
- movdqa xmm10,xmm15
- pshufd xmm13,xmm0,0
- punpcklbw xmm9,xmm1
- punpckhbw xmm14,xmm1
- movdqa xmm6,xmm13
- movd xmm0,eax
- movdqa [rsp],xmm11
- mov eax,2
- cwde
- punpckhbw xmm11,xmm1
- punpckhbw xmm10,xmm1
- punpcklbw xmm7,xmm1
- punpcklwd xmm0,xmm0
- punpcklbw xmm8,xmm1
- pshufd xmm3,xmm0,0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm4
- psubw xmm0,xmm9
- psubw xmm1,xmm4
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm12
- movdqa xmm1,xmm11
- psubw xmm0,xmm14
- psubw xmm1,xmm12
- movdqa xmm5,xmm6
- pabsw xmm0,xmm0
- pcmpgtw xmm13,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm2,xmm0
- paddw xmm1,xmm8
- movdqa xmm0,xmm10
- pand xmm13,xmm2
- psubw xmm0,xmm14
- paddw xmm1,xmm4
- movdqa xmm2,xmm11
- pabsw xmm0,xmm0
- paddw xmm2,xmm11
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm12
- movd xmm0,eax
- pand xmm13,xmm3
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm4
- paddw xmm2,xmm3
- psraw xmm1,2
- pand xmm5,xmm1
- por xmm5,xmm0
- paddw xmm7,xmm7
- paddw xmm10,xmm10
- psraw xmm2,2
- movdqa xmm1,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm2
- paddw xmm7,xmm9
- por xmm1,xmm0
- paddw xmm10,xmm14
- paddw xmm7,xmm8
- movdqa xmm0,xmm13
- packuswb xmm5,xmm1
- paddw xmm7,xmm3
- paddw xmm10,xmm11
- movdqa xmm1,xmm6
- paddw xmm10,xmm3
- pandn xmm6,xmm9
- psraw xmm7,2
- pand xmm1,xmm7
- psraw xmm10,2
- pandn xmm13,xmm14
- pand xmm0,xmm10
- por xmm1,xmm6
- movdqa xmm6,[rsp]
- movdqa xmm4,xmm6
- por xmm0,xmm13
- punpcklbw xmm4,xmm5
- punpckhbw xmm6,xmm5
- movdqa xmm3,xmm4
- packuswb xmm1,xmm0
- movdqa xmm0,xmm1
- punpckhbw xmm1,xmm15
- punpcklbw xmm0,xmm15
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm6
- movdqa xmm2,xmm3
- punpcklwd xmm0,xmm1
- punpckhwd xmm6,xmm1
- movdqa xmm1,xmm4
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm6
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm6
- punpckhqdq xmm2,xmm1
- movdqa [rsp+10h],xmm0
- movdqa [rsp+60h],xmm2
- movdqa xmm0,xmm3
- mov eax,[rsp+10h]
- mov [rcx-2],eax
- mov eax,[rsp+60h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [r10+rcx-2],eax
- movdqa [rsp+20h],xmm0
- mov eax, [rsp+20h]
- movdqa [rsp+70h],xmm3
- mov [rcx+r10*2-2],eax
- mov eax,[rsp+70h]
- mov [rdx+rcx-2],eax
- mov eax,[rsp+18h]
- mov [r11],eax
- mov eax,[rsp+68h]
- mov [r10+r11],eax
- mov eax,[rsp+28h]
- mov [r11+r10*2],eax
- mov eax,[rsp+78h]
- mov [rdx+r11],eax
- mov eax,[rsp+14h]
- mov [rdi-2],eax
- mov eax,[rsp+64h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+24h]
- mov [rdi+r10*2-2],eax
- mov eax, [rsp+74h]
- mov [rdx+rdi-2],eax
- mov eax, [rsp+1Ch]
- mov [rbx],eax
- mov eax, [rsp+6Ch]
- mov [r10+rbx],eax
- mov eax,[rsp+2Ch]
- mov [rbx+r10*2],eax
- mov eax,[rsp+7Ch]
- mov [rdx+rbx],eax
- lea rsp,[rsp+140h]
- POP_XMM
- mov rbx, [rsp+28h]
- pop rdi
- ret
+ mov rax,rsp
+ mov [rax+20h],rbx
+ push rdi
+ PUSH_XMM 16
+ sub rsp,140h
+ mov rdi,rdx
+ lea eax,[r8*4]
+ movsxd r10,eax
+ mov eax,[rcx-2]
+ mov [rsp+10h],eax
+ lea rbx,[r10+rdx-2]
+ lea r11,[r10+rcx-2]
+ movdqa xmm5,[rsp+10h]
+ movsxd r10,r8d
+ mov eax,[r10+rcx-2]
+ lea rdx,[r10+r10*2]
+ mov [rsp+20h],eax
+ mov eax,[rcx+r10*2-2]
+ mov [rsp+30h],eax
+ mov eax,[rdx+rcx-2]
+ movdqa xmm2,[rsp+20h]
+ mov [rsp+40h],eax
+ mov eax, [rdi-2]
+ movdqa xmm4,[rsp+30h]
+ mov [rsp+50h],eax
+ mov eax,[r10+rdi-2]
+ movdqa xmm3,[rsp+40h]
+ mov [rsp+60h],eax
+ mov eax,[rdi+r10*2-2]
+ punpckldq xmm5,[rsp+50h]
+ mov [rsp+70h],eax
+ mov eax, [rdx+rdi-2]
+ punpckldq xmm2, [rsp+60h]
+ mov [rsp+80h],eax
+ mov eax,[r11]
+ punpckldq xmm4, [rsp+70h]
+ mov [rsp+50h],eax
+ mov eax,[rbx]
+ punpckldq xmm3,[rsp+80h]
+ mov [rsp+60h],eax
+ mov eax,[r10+r11]
+ movdqa xmm0, [rsp+50h]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm0,[rsp+50h]
+ movdqa xmm1,xmm5
+ mov [rsp+60h],eax
+ mov eax,[r11+r10*2]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[rbx+r10*2]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ mov eax, [rdx+r11]
+ movdqa xmm15,xmm1
+ punpckldq xmm0,[rsp+60h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax, [rdx+rbx]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm15,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm12,xmm15
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm12,xmm0
+ punpckhdq xmm15,xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm11,xmm12
+ punpckldq xmm0,xmm5
+ punpckhdq xmm1,xmm5
+ punpcklqdq xmm11,xmm0
+ punpckhqdq xmm12,xmm0
+ movsx eax,r9w
+ movdqa xmm14,xmm15
+ punpcklqdq xmm14,xmm1
+ punpckhqdq xmm15,xmm1
+ pxor xmm1,xmm1
+ movd xmm0,eax
+ movdqa xmm4,xmm12
+ movdqa xmm8,xmm11
+ movsx eax,word [rsp+170h + 160] ; iBeta
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm4,xmm1
+ punpckhbw xmm12,xmm1
+ movdqa xmm9,xmm14
+ movdqa xmm7,xmm15
+ movdqa xmm10,xmm15
+ pshufd xmm13,xmm0,0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm14,xmm1
+ movdqa xmm6,xmm13
+ movd xmm0,eax
+ movdqa [rsp],xmm11
+ mov eax,2
+ cwde
+ punpckhbw xmm11,xmm1
+ punpckhbw xmm10,xmm1
+ punpcklbw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm8,xmm1
+ pshufd xmm3,xmm0,0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm4
+ psubw xmm0,xmm9
+ psubw xmm1,xmm4
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm12
+ movdqa xmm1,xmm11
+ psubw xmm0,xmm14
+ psubw xmm1,xmm12
+ movdqa xmm5,xmm6
+ pabsw xmm0,xmm0
+ pcmpgtw xmm13,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm2,xmm0
+ paddw xmm1,xmm8
+ movdqa xmm0,xmm10
+ pand xmm13,xmm2
+ psubw xmm0,xmm14
+ paddw xmm1,xmm4
+ movdqa xmm2,xmm11
+ pabsw xmm0,xmm0
+ paddw xmm2,xmm11
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm12
+ movd xmm0,eax
+ pand xmm13,xmm3
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm4
+ paddw xmm2,xmm3
+ psraw xmm1,2
+ pand xmm5,xmm1
+ por xmm5,xmm0
+ paddw xmm7,xmm7
+ paddw xmm10,xmm10
+ psraw xmm2,2
+ movdqa xmm1,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm2
+ paddw xmm7,xmm9
+ por xmm1,xmm0
+ paddw xmm10,xmm14
+ paddw xmm7,xmm8
+ movdqa xmm0,xmm13
+ packuswb xmm5,xmm1
+ paddw xmm7,xmm3
+ paddw xmm10,xmm11
+ movdqa xmm1,xmm6
+ paddw xmm10,xmm3
+ pandn xmm6,xmm9
+ psraw xmm7,2
+ pand xmm1,xmm7
+ psraw xmm10,2
+ pandn xmm13,xmm14
+ pand xmm0,xmm10
+ por xmm1,xmm6
+ movdqa xmm6,[rsp]
+ movdqa xmm4,xmm6
+ por xmm0,xmm13
+ punpcklbw xmm4,xmm5
+ punpckhbw xmm6,xmm5
+ movdqa xmm3,xmm4
+ packuswb xmm1,xmm0
+ movdqa xmm0,xmm1
+ punpckhbw xmm1,xmm15
+ punpcklbw xmm0,xmm15
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm6
+ movdqa xmm2,xmm3
+ punpcklwd xmm0,xmm1
+ punpckhwd xmm6,xmm1
+ movdqa xmm1,xmm4
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm6
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm6
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+10h],xmm0
+ movdqa [rsp+60h],xmm2
+ movdqa xmm0,xmm3
+ mov eax,[rsp+10h]
+ mov [rcx-2],eax
+ mov eax,[rsp+60h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [r10+rcx-2],eax
+ movdqa [rsp+20h],xmm0
+ mov eax, [rsp+20h]
+ movdqa [rsp+70h],xmm3
+ mov [rcx+r10*2-2],eax
+ mov eax,[rsp+70h]
+ mov [rdx+rcx-2],eax
+ mov eax,[rsp+18h]
+ mov [r11],eax
+ mov eax,[rsp+68h]
+ mov [r10+r11],eax
+ mov eax,[rsp+28h]
+ mov [r11+r10*2],eax
+ mov eax,[rsp+78h]
+ mov [rdx+r11],eax
+ mov eax,[rsp+14h]
+ mov [rdi-2],eax
+ mov eax,[rsp+64h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+24h]
+ mov [rdi+r10*2-2],eax
+ mov eax, [rsp+74h]
+ mov [rdx+rdi-2],eax
+ mov eax, [rsp+1Ch]
+ mov [rbx],eax
+ mov eax, [rsp+6Ch]
+ mov [r10+rbx],eax
+ mov eax,[rsp+2Ch]
+ mov [rbx+r10*2],eax
+ mov eax,[rsp+7Ch]
+ mov [rdx+rbx],eax
+ lea rsp,[rsp+140h]
+ POP_XMM
+ mov rbx, [rsp+28h]
+ pop rdi
+ ret
WELS_EXTERN DeblockChromaLt4H_ssse3
- mov rax,rsp
- push rbx
- push rbp
- push rsi
- push rdi
- push r12
- PUSH_XMM 16
- sub rsp,170h
+ mov rax,rsp
+ push rbx
+ push rbp
+ push rsi
+ push rdi
+ push r12
+ PUSH_XMM 16
+ sub rsp,170h
- movsxd rsi,r8d
- lea eax,[r8*4]
- mov r11d,r9d
- movsxd r10,eax
- mov eax, [rcx-2]
- mov r12,rdx
- mov [rsp+40h],eax
- mov eax, [rsi+rcx-2]
- lea rbx,[r10+rcx-2]
- movdqa xmm5,[rsp+40h]
- mov [rsp+50h],eax
- mov eax, [rcx+rsi*2-2]
- lea rbp,[r10+rdx-2]
- movdqa xmm2, [rsp+50h]
- mov [rsp+60h],eax
- lea r10,[rsi+rsi*2]
- mov rdi,rcx
- mov eax,[r10+rcx-2]
- movdqa xmm4,[rsp+60h]
- mov [rsp+70h],eax
- mov eax,[rdx-2]
- mov [rsp+80h],eax
- mov eax, [rsi+rdx-2]
- movdqa xmm3,[rsp+70h]
- mov [rsp+90h],eax
- mov eax,[rdx+rsi*2-2]
- punpckldq xmm5,[rsp+80h]
- mov [rsp+0A0h],eax
- mov eax, [r10+rdx-2]
- punpckldq xmm2,[rsp+90h]
- mov [rsp+0B0h],eax
- mov eax, [rbx]
- punpckldq xmm4,[rsp+0A0h]
- mov [rsp+80h],eax
- mov eax,[rbp]
- punpckldq xmm3,[rsp+0B0h]
- mov [rsp+90h],eax
- mov eax,[rsi+rbx]
- movdqa xmm0,[rsp+80h]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rsi+rbp]
- movdqa xmm0,[rsp+80h]
- movdqa xmm1,xmm5
- mov [rsp+90h],eax
- mov eax,[rbx+rsi*2]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rbp+rsi*2]
- movdqa xmm0, [rsp+80h]
- mov [rsp+90h],eax
- mov eax,[r10+rbx]
- movdqa xmm7,xmm1
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax, [r10+rbp]
- movdqa xmm0,[rsp+80h]
- mov [rsp+90h],eax
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm7,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm6,xmm7
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm6,xmm0
- punpckhdq xmm7,xmm0
- movdqa xmm0,xmm1
- punpckldq xmm0,xmm5
- mov rax, [rsp+1C8h+160] ; pTC
- punpckhdq xmm1,xmm5
- movdqa xmm9,xmm6
- punpckhqdq xmm6,xmm0
- punpcklqdq xmm9,xmm0
- movdqa xmm2,xmm7
- movdqa xmm13,xmm6
- movdqa xmm4,xmm9
- movdqa [rsp+10h],xmm9
- punpcklqdq xmm2,xmm1
- punpckhqdq xmm7,xmm1
- pxor xmm1,xmm1
- movsx ecx,byte [rax+3]
- movsx edx,byte [rax+2]
- movsx r8d,byte [rax+1]
- movsx r9d,byte [rax]
- movdqa xmm10,xmm1
- movdqa xmm15,xmm2
- punpckhbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm4,xmm1
- movsx eax,r11w
- mov word [rsp+0Eh],cx
- mov word [rsp+0Ch],cx
- movdqa xmm3,xmm7
- movdqa xmm8,xmm7
- movdqa [rsp+20h],xmm7
- punpcklbw xmm15,xmm1
- punpcklbw xmm13,xmm1
- punpcklbw xmm3,xmm1
- mov word [rsp+0Ah],dx
- mov word [rsp+8],dx
- mov word [rsp+6],r8w
- movd xmm0,eax
- movdqa [rsp+30h],xmm6
- punpckhbw xmm9,xmm1
- punpckhbw xmm8,xmm1
- punpcklwd xmm0,xmm0
- movsx eax,word [rsp+1C0h+160] ; iBeta
- mov word [rsp+4],r8w
- mov word [rsp+2],r9w
- pshufd xmm12,xmm0,0
- mov word [rsp],r9w
- movd xmm0,eax
- mov eax,4
- cwde
- movdqa xmm14, [rsp]
- movdqa [rsp],xmm2
- movdqa xmm2,xmm12
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- psubw xmm10,xmm14
- movd xmm0,eax
- movdqa xmm7,xmm14
- movdqa xmm6,xmm14
- pcmpgtw xmm7,xmm1
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- movdqa xmm0,xmm4
- movdqa xmm1,xmm15
- psubw xmm4,xmm13
- psubw xmm0,xmm3
- psubw xmm1,xmm13
- psubw xmm3,xmm15
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm10
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm11
- movdqa xmm0,xmm13
- psubw xmm0,xmm15
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- pand xmm2,xmm1
- movdqa xmm1,xmm11
- movdqa xmm3,[rsp+30h]
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm9
- pand xmm2,xmm1
- psubw xmm0,xmm8
- psubw xmm9,xmm3
- pand xmm2,xmm7
- pand xmm6,xmm2
- psubw xmm15,xmm6
- paddw xmm13,xmm6
- movdqa xmm2,[rsp]
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- psubw xmm8,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm3
- movdqa xmm5,[rsp+10h]
- psubw xmm0,xmm2
- psraw xmm1,3
- movdqa xmm4,xmm5
- pabsw xmm0,xmm0
- pmaxsw xmm10,xmm1
- movdqa xmm1,xmm11
- pcmpgtw xmm12,xmm0
- pabsw xmm0,xmm9
- pminsw xmm14,xmm10
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm8
- pcmpgtw xmm11,xmm0
- pand xmm12,xmm1
- movdqa xmm1,[rsp+20h]
- pand xmm12,xmm11
- pand xmm12,xmm7
- pand xmm14,xmm12
- paddw xmm3,xmm14
- psubw xmm2,xmm14
- packuswb xmm13,xmm3
- packuswb xmm15,xmm2
- punpcklbw xmm4,xmm13
- punpckhbw xmm5,xmm13
- movdqa xmm0,xmm15
- punpcklbw xmm0,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm4
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm5
- movdqa xmm2,xmm3
- movdqa xmm1,xmm4
- punpcklwd xmm0,xmm15
- punpckhwd xmm5,xmm15
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm5
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm5
- punpckhqdq xmm2,xmm1
- movdqa [rsp+40h],xmm0
- movdqa xmm0,xmm3
- movdqa [rsp+90h],xmm2
- mov eax,[rsp+40h]
- mov [rdi-2],eax
- mov eax, [rsp+90h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [rsi+rdi-2],eax
- movdqa [rsp+50h],xmm0
- mov eax,[rsp+50h]
- movdqa [rsp+0A0h],xmm3
- mov [rdi+rsi*2-2],eax
- mov eax,[rsp+0A0h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+48h]
- mov [rbx],eax
- mov eax,[rsp+98h]
- mov [rsi+rbx],eax
- mov eax,[rsp+58h]
- mov [rbx+rsi*2],eax
- mov eax, [rsp+0A8h]
- mov [r10+rbx],eax
- mov eax, [rsp+44h]
- mov [r12-2],eax
- mov eax,[rsp+94h]
- mov [rsi+r12-2],eax
- mov eax,[rsp+54h]
- mov [r12+rsi*2-2],eax
- mov eax, [rsp+0A4h]
- mov [r10+r12-2],eax
- mov eax,[rsp+4Ch]
- mov [rbp],eax
- mov eax,[rsp+9Ch]
- mov [rsi+rbp],eax
- mov eax, [rsp+5Ch]
- mov [rbp+rsi*2],eax
- mov eax,[rsp+0ACh]
- mov [r10+rbp],eax
- lea r11,[rsp+170h]
- mov rsp,r11
- POP_XMM
- pop r12
- pop rdi
- pop rsi
- pop rbp
- pop rbx
- ret
+ movsxd rsi,r8d
+ lea eax,[r8*4]
+ mov r11d,r9d
+ movsxd r10,eax
+ mov eax, [rcx-2]
+ mov r12,rdx
+ mov [rsp+40h],eax
+ mov eax, [rsi+rcx-2]
+ lea rbx,[r10+rcx-2]
+ movdqa xmm5,[rsp+40h]
+ mov [rsp+50h],eax
+ mov eax, [rcx+rsi*2-2]
+ lea rbp,[r10+rdx-2]
+ movdqa xmm2, [rsp+50h]
+ mov [rsp+60h],eax
+ lea r10,[rsi+rsi*2]
+ mov rdi,rcx
+ mov eax,[r10+rcx-2]
+ movdqa xmm4,[rsp+60h]
+ mov [rsp+70h],eax
+ mov eax,[rdx-2]
+ mov [rsp+80h],eax
+ mov eax, [rsi+rdx-2]
+ movdqa xmm3,[rsp+70h]
+ mov [rsp+90h],eax
+ mov eax,[rdx+rsi*2-2]
+ punpckldq xmm5,[rsp+80h]
+ mov [rsp+0A0h],eax
+ mov eax, [r10+rdx-2]
+ punpckldq xmm2,[rsp+90h]
+ mov [rsp+0B0h],eax
+ mov eax, [rbx]
+ punpckldq xmm4,[rsp+0A0h]
+ mov [rsp+80h],eax
+ mov eax,[rbp]
+ punpckldq xmm3,[rsp+0B0h]
+ mov [rsp+90h],eax
+ mov eax,[rsi+rbx]
+ movdqa xmm0,[rsp+80h]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rsi+rbp]
+ movdqa xmm0,[rsp+80h]
+ movdqa xmm1,xmm5
+ mov [rsp+90h],eax
+ mov eax,[rbx+rsi*2]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rbp+rsi*2]
+ movdqa xmm0, [rsp+80h]
+ mov [rsp+90h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm7,xmm1
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax, [r10+rbp]
+ movdqa xmm0,[rsp+80h]
+ mov [rsp+90h],eax
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm7,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm6,xmm7
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm6,xmm0
+ punpckhdq xmm7,xmm0
+ movdqa xmm0,xmm1
+ punpckldq xmm0,xmm5
+ mov rax, [rsp+1C8h+160] ; pTC
+ punpckhdq xmm1,xmm5
+ movdqa xmm9,xmm6
+ punpckhqdq xmm6,xmm0
+ punpcklqdq xmm9,xmm0
+ movdqa xmm2,xmm7
+ movdqa xmm13,xmm6
+ movdqa xmm4,xmm9
+ movdqa [rsp+10h],xmm9
+ punpcklqdq xmm2,xmm1
+ punpckhqdq xmm7,xmm1
+ pxor xmm1,xmm1
+ movsx ecx,byte [rax+3]
+ movsx edx,byte [rax+2]
+ movsx r8d,byte [rax+1]
+ movsx r9d,byte [rax]
+ movdqa xmm10,xmm1
+ movdqa xmm15,xmm2
+ punpckhbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm4,xmm1
+ movsx eax,r11w
+ mov word [rsp+0Eh],cx
+ mov word [rsp+0Ch],cx
+ movdqa xmm3,xmm7
+ movdqa xmm8,xmm7
+ movdqa [rsp+20h],xmm7
+ punpcklbw xmm15,xmm1
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm3,xmm1
+ mov word [rsp+0Ah],dx
+ mov word [rsp+8],dx
+ mov word [rsp+6],r8w
+ movd xmm0,eax
+ movdqa [rsp+30h],xmm6
+ punpckhbw xmm9,xmm1
+ punpckhbw xmm8,xmm1
+ punpcklwd xmm0,xmm0
+ movsx eax,word [rsp+1C0h+160] ; iBeta
+ mov word [rsp+4],r8w
+ mov word [rsp+2],r9w
+ pshufd xmm12,xmm0,0
+ mov word [rsp],r9w
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ movdqa xmm14, [rsp]
+ movdqa [rsp],xmm2
+ movdqa xmm2,xmm12
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ psubw xmm10,xmm14
+ movd xmm0,eax
+ movdqa xmm7,xmm14
+ movdqa xmm6,xmm14
+ pcmpgtw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm15
+ psubw xmm4,xmm13
+ psubw xmm0,xmm3
+ psubw xmm1,xmm13
+ psubw xmm3,xmm15
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm10
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm11
+ movdqa xmm0,xmm13
+ psubw xmm0,xmm15
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ pand xmm2,xmm1
+ movdqa xmm1,xmm11
+ movdqa xmm3,[rsp+30h]
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm9
+ pand xmm2,xmm1
+ psubw xmm0,xmm8
+ psubw xmm9,xmm3
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ psubw xmm15,xmm6
+ paddw xmm13,xmm6
+ movdqa xmm2,[rsp]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ psubw xmm8,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm3
+ movdqa xmm5,[rsp+10h]
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ movdqa xmm4,xmm5
+ pabsw xmm0,xmm0
+ pmaxsw xmm10,xmm1
+ movdqa xmm1,xmm11
+ pcmpgtw xmm12,xmm0
+ pabsw xmm0,xmm9
+ pminsw xmm14,xmm10
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm8
+ pcmpgtw xmm11,xmm0
+ pand xmm12,xmm1
+ movdqa xmm1,[rsp+20h]
+ pand xmm12,xmm11
+ pand xmm12,xmm7
+ pand xmm14,xmm12
+ paddw xmm3,xmm14
+ psubw xmm2,xmm14
+ packuswb xmm13,xmm3
+ packuswb xmm15,xmm2
+ punpcklbw xmm4,xmm13
+ punpckhbw xmm5,xmm13
+ movdqa xmm0,xmm15
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm4
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm2,xmm3
+ movdqa xmm1,xmm4
+ punpcklwd xmm0,xmm15
+ punpckhwd xmm5,xmm15
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm5
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm5
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+40h],xmm0
+ movdqa xmm0,xmm3
+ movdqa [rsp+90h],xmm2
+ mov eax,[rsp+40h]
+ mov [rdi-2],eax
+ mov eax, [rsp+90h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [rsi+rdi-2],eax
+ movdqa [rsp+50h],xmm0
+ mov eax,[rsp+50h]
+ movdqa [rsp+0A0h],xmm3
+ mov [rdi+rsi*2-2],eax
+ mov eax,[rsp+0A0h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+48h]
+ mov [rbx],eax
+ mov eax,[rsp+98h]
+ mov [rsi+rbx],eax
+ mov eax,[rsp+58h]
+ mov [rbx+rsi*2],eax
+ mov eax, [rsp+0A8h]
+ mov [r10+rbx],eax
+ mov eax, [rsp+44h]
+ mov [r12-2],eax
+ mov eax,[rsp+94h]
+ mov [rsi+r12-2],eax
+ mov eax,[rsp+54h]
+ mov [r12+rsi*2-2],eax
+ mov eax, [rsp+0A4h]
+ mov [r10+r12-2],eax
+ mov eax,[rsp+4Ch]
+ mov [rbp],eax
+ mov eax,[rsp+9Ch]
+ mov [rsi+rbp],eax
+ mov eax, [rsp+5Ch]
+ mov [rbp+rsi*2],eax
+ mov eax,[rsp+0ACh]
+ mov [r10+rbp],eax
+ lea r11,[rsp+170h]
+ mov rsp,r11
+ POP_XMM
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
+ ret
@@ -1638,1591 +1638,1591 @@
WELS_EXTERN DeblockLumaLt4V_ssse3
- push rbp
- mov r11,r8 ; pTC
- sub rsp,1B0h
- lea rbp,[rsp+20h]
- movd xmm4,edx
- movd xmm2,ecx
- mov qword [rbp+180h],r12
- mov r10,rdi
- movsxd r12,esi
- add rsi,rsi
- movsxd rdx,esi
- sub r10,r12
- movsx r8d,byte [r11]
- pxor xmm3,xmm3
- punpcklwd xmm2,xmm2
- movaps [rbp+50h],xmm14
- lea rax,[r12+r12*2]
- movdqa xmm14,[rdx+rdi]
- neg rax
- pshufd xmm0,xmm2,0
- movd xmm2,r8d
- movsx rsi,byte [r11+1]
- movsx r8d,byte [r11+2]
- movsx r11d,byte [r11+3]
- movaps [rbp+70h],xmm12
- movd xmm1,esi
- movaps [rbp+80h],xmm11
- movd xmm12,r8d
- movd xmm11,r11d
- movdqa xmm5, [rax+rdi]
- lea rax,[r12+r12]
- punpcklwd xmm12,xmm12
- neg rax
- punpcklwd xmm11,xmm11
- movaps [rbp],xmm8
- movdqa xmm8, [r10]
- punpcklwd xmm2,xmm2
- punpcklwd xmm1,xmm1
- punpcklqdq xmm12,xmm12
- punpcklqdq xmm11,xmm11
- punpcklqdq xmm2,xmm2
- punpcklqdq xmm1,xmm1
- shufps xmm12,xmm11,88h
- movdqa xmm11,xmm8
- movaps [rbp+30h],xmm9
- movdqa xmm9,[rdi]
- shufps xmm2,xmm1,88h
- movdqa xmm1,xmm5
- punpcklbw xmm11,xmm3
- movaps [rbp+20h],xmm6
- movaps [rbp+60h],xmm13
- movdqa xmm13,xmm11
- movaps [rbp+90h],xmm10
- movdqa xmm10,xmm9
- movdqa xmm6,[rax+rdi]
- punpcklbw xmm1,xmm3
- movaps [rbp+0A0h],xmm12
- psubw xmm13,xmm1
- movaps [rbp+40h],xmm15
- movdqa xmm15,xmm14
- movaps [rbp+10h],xmm7
- movdqa xmm7,xmm6
- punpcklbw xmm10,xmm3
- movdqa xmm12,[r12+rdi]
- punpcklbw xmm7,xmm3
- punpcklbw xmm12,xmm3
- punpcklbw xmm15,xmm3
- pabsw xmm3,xmm13
- movdqa xmm13,xmm10
- psubw xmm13,xmm15
- movdqa [rbp+0F0h],xmm15
- pabsw xmm15,xmm13
- movdqa xmm13,xmm11
- movdqa [rbp+0B0h],xmm1
- movdqa xmm1,xmm0
- pavgw xmm13,xmm10
- pcmpgtw xmm1,xmm3
- movdqa [rbp+120h],xmm13
- movaps xmm13,xmm2
- punpcklwd xmm4,xmm4
- movdqa xmm3,xmm0
- movdqa [rbp+100h],xmm1
- psubw xmm13,xmm1
- movdqa xmm1,xmm10
- pcmpgtw xmm3,xmm15
- pshufd xmm4,xmm4,0
- psubw xmm1,xmm11
- movdqa [rbp+0D0h],xmm10
- psubw xmm13,xmm3
- movdqa [rbp+110h],xmm3
- pabsw xmm15,xmm1
- movdqa xmm3,xmm4
- psubw xmm10,xmm12
- pcmpgtw xmm3,xmm15
- pabsw xmm15,xmm10
- movdqa xmm10,xmm0
- psllw xmm1,2
- movdqa [rbp+0C0h],xmm11
- psubw xmm11,xmm7
- pcmpgtw xmm10,xmm15
- pabsw xmm11,xmm11
- movdqa xmm15,xmm0
- pand xmm3,xmm10
- pcmpgtw xmm15,xmm11
- movaps xmm11,xmm2
- pxor xmm10,xmm10
- pand xmm3,xmm15
- pcmpgtw xmm11,xmm10
- pcmpeqw xmm10,xmm2
- por xmm11,xmm10
- pand xmm3,xmm11
- movdqa xmm11,xmm7
- psubw xmm11,xmm12
- pxor xmm15,xmm15
- paddw xmm11,xmm1
- psubw xmm15,xmm13
- movdqa [rbp+0E0h],xmm12
- paddw xmm11,[FOUR_16B_SSE2]
- pxor xmm12,xmm12
- psraw xmm11,3
- punpckhbw xmm8,xmm12
- pmaxsw xmm15,xmm11
- punpckhbw xmm5,xmm12
- movdqa xmm11,xmm8
- pminsw xmm13,xmm15
- psubw xmm11,xmm5
- punpckhbw xmm9,xmm12
- pand xmm13,xmm3
- movdqa [rbp+130h],xmm13
- pabsw xmm13,xmm11
- punpckhbw xmm14,xmm12
- movdqa xmm11,xmm9
- psubw xmm11,xmm14
- movdqa xmm15,xmm0
- movdqa [rbp+140h],xmm14
- pabsw xmm14,xmm11
- movdqa xmm11,xmm8
- pcmpgtw xmm15,xmm14
- movdqa xmm1,[r12+rdi]
- pavgw xmm11,xmm9
- movdqa [rbp+170h],xmm11
- movdqa xmm10,xmm9
- punpckhbw xmm6,xmm12
- psubw xmm10,xmm8
- punpckhbw xmm1,xmm12
- movdqa xmm12,xmm0
- movaps xmm11,[rbp+0A0h]
- pcmpgtw xmm12,xmm13
- movaps xmm13,xmm11
- psubw xmm13,xmm12
- movdqa [rbp+160h],xmm15
- psubw xmm13,xmm15
- movdqa xmm15,xmm9
- psubw xmm15,xmm1
- movdqa [rbp+150h],xmm12
- pabsw xmm12,xmm10
- pabsw xmm14,xmm15
- movdqa xmm15,xmm8
- pcmpgtw xmm4,xmm12
- movdqa xmm12,xmm0
- psubw xmm15,xmm6
- pcmpgtw xmm12,xmm14
- pabsw xmm14,xmm15
- psllw xmm10,2
- pcmpgtw xmm0,xmm14
- movdqa xmm14,xmm6
- psubw xmm14,xmm1
- pand xmm4,xmm12
- paddw xmm14,xmm10
- pand xmm4,xmm0
- paddw xmm14,[FOUR_16B_SSE2]
- pxor xmm15,xmm15
- movaps xmm12,xmm11
- psubw xmm15,xmm13
- pxor xmm0,xmm0
- psraw xmm14,3
- pcmpgtw xmm12,xmm0
- pcmpeqw xmm0,xmm11
- pmaxsw xmm15,xmm14
- por xmm12,xmm0
- movdqa xmm0,[rbp+120h]
- pminsw xmm13,xmm15
- movdqa xmm15,[rbp+0B0h]
- movdqa xmm10,xmm7
- pand xmm4,xmm12
- paddw xmm15,xmm0
- pxor xmm12,xmm12
- paddw xmm10,xmm7
- movdqa xmm14,xmm12
- psubw xmm15,xmm10
- psubw xmm14,xmm2
- psraw xmm15,1
- pmaxsw xmm15,xmm14
- movdqa xmm10,xmm6
- pminsw xmm15,xmm2
- paddw xmm10,xmm6
- pand xmm15,xmm3
- psubw xmm12,xmm11
- pand xmm15,[rbp+100h]
- pand xmm13,xmm4
- paddw xmm7,xmm15
- paddw xmm8,xmm13
- movdqa xmm15,[rbp+170h]
- psubw xmm9,xmm13
- paddw xmm5,xmm15
- psubw xmm5,xmm10
- psraw xmm5,1
- pmaxsw xmm5,xmm12
- pminsw xmm5,xmm11
- pand xmm5,xmm4
- pand xmm5,[rbp+150h]
- paddw xmm6,xmm5
- movdqa xmm5,[rbp+0C0h]
- packuswb xmm7,xmm6
- movdqa xmm6,[rbp+130h]
- paddw xmm5,xmm6
- packuswb xmm5,xmm8
- movdqa xmm8,[rbp+0D0h]
- psubw xmm8,xmm6
- movdqa xmm6,[rbp+0F0h]
- paddw xmm6,xmm0
- movdqa xmm0,[rbp+0E0h]
- packuswb xmm8,xmm9
- movdqa xmm9,xmm0
- paddw xmm9,xmm0
- psubw xmm6,xmm9
- psraw xmm6,1
- pmaxsw xmm14,xmm6
- pminsw xmm2,xmm14
- pand xmm2,xmm3
- pand xmm2,[rbp+110h]
- paddw xmm0,xmm2
- movdqa xmm2,[rbp+140h]
- paddw xmm2,xmm15
- movdqa xmm15,xmm1
- paddw xmm15,xmm1
- psubw xmm2,xmm15
- psraw xmm2,1
- pmaxsw xmm12,xmm2
- pminsw xmm11,xmm12
- pand xmm11,xmm4
- pand xmm11,[rbp+160h]
- paddw xmm1,xmm11
- movdqa [rax+rdi],xmm7
- movdqa [r10],xmm5
- packuswb xmm0,xmm1
- movdqa [rdi],xmm8
- movdqa [r12+rdi],xmm0
- mov r12,qword [rbp+180h]
- lea rsp,[rbp+190h]
- pop rbp
- ret
+ push rbp
+ mov r11,r8 ; pTC
+ sub rsp,1B0h
+ lea rbp,[rsp+20h]
+ movd xmm4,edx
+ movd xmm2,ecx
+ mov qword [rbp+180h],r12
+ mov r10,rdi
+ movsxd r12,esi
+ add rsi,rsi
+ movsxd rdx,esi
+ sub r10,r12
+ movsx r8d,byte [r11]
+ pxor xmm3,xmm3
+ punpcklwd xmm2,xmm2
+ movaps [rbp+50h],xmm14
+ lea rax,[r12+r12*2]
+ movdqa xmm14,[rdx+rdi]
+ neg rax
+ pshufd xmm0,xmm2,0
+ movd xmm2,r8d
+ movsx rsi,byte [r11+1]
+ movsx r8d,byte [r11+2]
+ movsx r11d,byte [r11+3]
+ movaps [rbp+70h],xmm12
+ movd xmm1,esi
+ movaps [rbp+80h],xmm11
+ movd xmm12,r8d
+ movd xmm11,r11d
+ movdqa xmm5, [rax+rdi]
+ lea rax,[r12+r12]
+ punpcklwd xmm12,xmm12
+ neg rax
+ punpcklwd xmm11,xmm11
+ movaps [rbp],xmm8
+ movdqa xmm8, [r10]
+ punpcklwd xmm2,xmm2
+ punpcklwd xmm1,xmm1
+ punpcklqdq xmm12,xmm12
+ punpcklqdq xmm11,xmm11
+ punpcklqdq xmm2,xmm2
+ punpcklqdq xmm1,xmm1
+ shufps xmm12,xmm11,88h
+ movdqa xmm11,xmm8
+ movaps [rbp+30h],xmm9
+ movdqa xmm9,[rdi]
+ shufps xmm2,xmm1,88h
+ movdqa xmm1,xmm5
+ punpcklbw xmm11,xmm3
+ movaps [rbp+20h],xmm6
+ movaps [rbp+60h],xmm13
+ movdqa xmm13,xmm11
+ movaps [rbp+90h],xmm10
+ movdqa xmm10,xmm9
+ movdqa xmm6,[rax+rdi]
+ punpcklbw xmm1,xmm3
+ movaps [rbp+0A0h],xmm12
+ psubw xmm13,xmm1
+ movaps [rbp+40h],xmm15
+ movdqa xmm15,xmm14
+ movaps [rbp+10h],xmm7
+ movdqa xmm7,xmm6
+ punpcklbw xmm10,xmm3
+ movdqa xmm12,[r12+rdi]
+ punpcklbw xmm7,xmm3
+ punpcklbw xmm12,xmm3
+ punpcklbw xmm15,xmm3
+ pabsw xmm3,xmm13
+ movdqa xmm13,xmm10
+ psubw xmm13,xmm15
+ movdqa [rbp+0F0h],xmm15
+ pabsw xmm15,xmm13
+ movdqa xmm13,xmm11
+ movdqa [rbp+0B0h],xmm1
+ movdqa xmm1,xmm0
+ pavgw xmm13,xmm10
+ pcmpgtw xmm1,xmm3
+ movdqa [rbp+120h],xmm13
+ movaps xmm13,xmm2
+ punpcklwd xmm4,xmm4
+ movdqa xmm3,xmm0
+ movdqa [rbp+100h],xmm1
+ psubw xmm13,xmm1
+ movdqa xmm1,xmm10
+ pcmpgtw xmm3,xmm15
+ pshufd xmm4,xmm4,0
+ psubw xmm1,xmm11
+ movdqa [rbp+0D0h],xmm10
+ psubw xmm13,xmm3
+ movdqa [rbp+110h],xmm3
+ pabsw xmm15,xmm1
+ movdqa xmm3,xmm4
+ psubw xmm10,xmm12
+ pcmpgtw xmm3,xmm15
+ pabsw xmm15,xmm10
+ movdqa xmm10,xmm0
+ psllw xmm1,2
+ movdqa [rbp+0C0h],xmm11
+ psubw xmm11,xmm7
+ pcmpgtw xmm10,xmm15
+ pabsw xmm11,xmm11
+ movdqa xmm15,xmm0
+ pand xmm3,xmm10
+ pcmpgtw xmm15,xmm11
+ movaps xmm11,xmm2
+ pxor xmm10,xmm10
+ pand xmm3,xmm15
+ pcmpgtw xmm11,xmm10
+ pcmpeqw xmm10,xmm2
+ por xmm11,xmm10
+ pand xmm3,xmm11
+ movdqa xmm11,xmm7
+ psubw xmm11,xmm12
+ pxor xmm15,xmm15
+ paddw xmm11,xmm1
+ psubw xmm15,xmm13
+ movdqa [rbp+0E0h],xmm12
+ paddw xmm11,[FOUR_16B_SSE2]
+ pxor xmm12,xmm12
+ psraw xmm11,3
+ punpckhbw xmm8,xmm12
+ pmaxsw xmm15,xmm11
+ punpckhbw xmm5,xmm12
+ movdqa xmm11,xmm8
+ pminsw xmm13,xmm15
+ psubw xmm11,xmm5
+ punpckhbw xmm9,xmm12
+ pand xmm13,xmm3
+ movdqa [rbp+130h],xmm13
+ pabsw xmm13,xmm11
+ punpckhbw xmm14,xmm12
+ movdqa xmm11,xmm9
+ psubw xmm11,xmm14
+ movdqa xmm15,xmm0
+ movdqa [rbp+140h],xmm14
+ pabsw xmm14,xmm11
+ movdqa xmm11,xmm8
+ pcmpgtw xmm15,xmm14
+ movdqa xmm1,[r12+rdi]
+ pavgw xmm11,xmm9
+ movdqa [rbp+170h],xmm11
+ movdqa xmm10,xmm9
+ punpckhbw xmm6,xmm12
+ psubw xmm10,xmm8
+ punpckhbw xmm1,xmm12
+ movdqa xmm12,xmm0
+ movaps xmm11,[rbp+0A0h]
+ pcmpgtw xmm12,xmm13
+ movaps xmm13,xmm11
+ psubw xmm13,xmm12
+ movdqa [rbp+160h],xmm15
+ psubw xmm13,xmm15
+ movdqa xmm15,xmm9
+ psubw xmm15,xmm1
+ movdqa [rbp+150h],xmm12
+ pabsw xmm12,xmm10
+ pabsw xmm14,xmm15
+ movdqa xmm15,xmm8
+ pcmpgtw xmm4,xmm12
+ movdqa xmm12,xmm0
+ psubw xmm15,xmm6
+ pcmpgtw xmm12,xmm14
+ pabsw xmm14,xmm15
+ psllw xmm10,2
+ pcmpgtw xmm0,xmm14
+ movdqa xmm14,xmm6
+ psubw xmm14,xmm1
+ pand xmm4,xmm12
+ paddw xmm14,xmm10
+ pand xmm4,xmm0
+ paddw xmm14,[FOUR_16B_SSE2]
+ pxor xmm15,xmm15
+ movaps xmm12,xmm11
+ psubw xmm15,xmm13
+ pxor xmm0,xmm0
+ psraw xmm14,3
+ pcmpgtw xmm12,xmm0
+ pcmpeqw xmm0,xmm11
+ pmaxsw xmm15,xmm14
+ por xmm12,xmm0
+ movdqa xmm0,[rbp+120h]
+ pminsw xmm13,xmm15
+ movdqa xmm15,[rbp+0B0h]
+ movdqa xmm10,xmm7
+ pand xmm4,xmm12
+ paddw xmm15,xmm0
+ pxor xmm12,xmm12
+ paddw xmm10,xmm7
+ movdqa xmm14,xmm12
+ psubw xmm15,xmm10
+ psubw xmm14,xmm2
+ psraw xmm15,1
+ pmaxsw xmm15,xmm14
+ movdqa xmm10,xmm6
+ pminsw xmm15,xmm2
+ paddw xmm10,xmm6
+ pand xmm15,xmm3
+ psubw xmm12,xmm11
+ pand xmm15,[rbp+100h]
+ pand xmm13,xmm4
+ paddw xmm7,xmm15
+ paddw xmm8,xmm13
+ movdqa xmm15,[rbp+170h]
+ psubw xmm9,xmm13
+ paddw xmm5,xmm15
+ psubw xmm5,xmm10
+ psraw xmm5,1
+ pmaxsw xmm5,xmm12
+ pminsw xmm5,xmm11
+ pand xmm5,xmm4
+ pand xmm5,[rbp+150h]
+ paddw xmm6,xmm5
+ movdqa xmm5,[rbp+0C0h]
+ packuswb xmm7,xmm6
+ movdqa xmm6,[rbp+130h]
+ paddw xmm5,xmm6
+ packuswb xmm5,xmm8
+ movdqa xmm8,[rbp+0D0h]
+ psubw xmm8,xmm6
+ movdqa xmm6,[rbp+0F0h]
+ paddw xmm6,xmm0
+ movdqa xmm0,[rbp+0E0h]
+ packuswb xmm8,xmm9
+ movdqa xmm9,xmm0
+ paddw xmm9,xmm0
+ psubw xmm6,xmm9
+ psraw xmm6,1
+ pmaxsw xmm14,xmm6
+ pminsw xmm2,xmm14
+ pand xmm2,xmm3
+ pand xmm2,[rbp+110h]
+ paddw xmm0,xmm2
+ movdqa xmm2,[rbp+140h]
+ paddw xmm2,xmm15
+ movdqa xmm15,xmm1
+ paddw xmm15,xmm1
+ psubw xmm2,xmm15
+ psraw xmm2,1
+ pmaxsw xmm12,xmm2
+ pminsw xmm11,xmm12
+ pand xmm11,xmm4
+ pand xmm11,[rbp+160h]
+ paddw xmm1,xmm11
+ movdqa [rax+rdi],xmm7
+ movdqa [r10],xmm5
+ packuswb xmm0,xmm1
+ movdqa [rdi],xmm8
+ movdqa [r12+rdi],xmm0
+ mov r12,qword [rbp+180h]
+ lea rsp,[rbp+190h]
+ pop rbp
+ ret
WELS_EXTERN DeblockLumaEq4V_ssse3
- mov rax,rsp
- push rbx
- push rbp
- mov r8, rdx
- mov r9, rcx
- mov rcx, rdi
- mov rdx, rsi
- sub rsp,1D8h
- movaps [rax-38h],xmm6
- movaps [rax-48h],xmm7
- movaps [rax-58h],xmm8
- pxor xmm1,xmm1
- movsxd r10,edx
- mov rbp,rcx
- mov r11d,r8d
- mov rdx,rcx
- mov rdi,rbp
- mov rbx,rbp
- movdqa xmm5,[rbp]
- movaps [rax-68h],xmm9
- movaps [rax-78h],xmm10
- punpcklbw xmm5,xmm1
- movaps [rax-88h],xmm11
- movaps [rax-98h],xmm12
- movaps [rax-0A8h],xmm13
- movaps [rax-0B8h],xmm14
- movdqa xmm14,[r10+rbp]
- movaps [rax-0C8h],xmm15
- lea eax,[r10*4]
- movsxd r8,eax
- lea eax,[r10+r10*2]
- movsxd rcx,eax
- lea eax,[r10+r10]
- sub rdx,r8
- punpcklbw xmm14,xmm1
- movdqa [rsp+90h],xmm5
- movdqa [rsp+30h],xmm14
- movsxd rsi,eax
- movsx eax,r11w
- sub rdi,rcx
- sub rbx,rsi
- mov r8,rbp
- sub r8,r10
- movd xmm0,eax
- movsx eax,r9w
- movdqa xmm12,[rdi]
- movdqa xmm6, [rsi+rbp]
- movdqa xmm13,[rbx]
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- punpcklbw xmm13,xmm1
- punpcklbw xmm6,xmm1
- movdqa xmm8,[r8]
- movd xmm0,eax
- movdqa xmm10,xmm11
- mov eax,2
- punpcklbw xmm8,xmm1
- punpcklbw xmm12,xmm1
- cwde
- punpcklwd xmm0,xmm0
- psraw xmm10,2
- movdqa xmm1,xmm8
- movdqa [rsp+0F0h],xmm13
- movdqa [rsp+0B0h],xmm8
- pshufd xmm7,xmm0,0
- psubw xmm1,xmm13
- movdqa xmm0,xmm5
- movdqa xmm4,xmm7
- movdqa xmm2,xmm7
- psubw xmm0,xmm8
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm5
- movdqa [rsp+40h],xmm7
- movdqa [rsp+60h],xmm6
- pcmpgtw xmm4,xmm0
- psubw xmm1,xmm14
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm4,xmm2
- movdqa xmm0,xmm11
- pcmpgtw xmm0,xmm3
- pand xmm4,xmm0
- movd xmm0,eax
- movdqa [rsp+20h],xmm4
- punpcklwd xmm0,xmm0
- pshufd xmm2,xmm0,0
- paddw xmm10,xmm2
- movdqa [rsp+0A0h],xmm2
- movdqa xmm15,xmm7
- pxor xmm4,xmm4
- movdqa xmm0,xmm8
- psubw xmm0,xmm12
- mov eax,4
- pabsw xmm0,xmm0
- movdqa xmm1,xmm10
- cwde
- pcmpgtw xmm15,xmm0
- pcmpgtw xmm1,xmm3
- movdqa xmm3,xmm7
- movdqa xmm7,[rdx]
- movdqa xmm0,xmm5
- psubw xmm0,xmm6
- pand xmm15,xmm1
- punpcklbw xmm7,xmm4
- movdqa xmm9,xmm15
- pabsw xmm0,xmm0
- psllw xmm7,1
- pandn xmm9,xmm12
- pcmpgtw xmm3,xmm0
- paddw xmm7,xmm12
- movd xmm0,eax
- pand xmm3,xmm1
- paddw xmm7,xmm12
- punpcklwd xmm0,xmm0
- paddw xmm7,xmm12
- pshufd xmm1,xmm0,0
- paddw xmm7,xmm13
- movdqa xmm0,xmm3
- pandn xmm0,xmm6
- paddw xmm7,xmm8
- movdqa [rsp+70h],xmm1
- paddw xmm7,xmm5
- movdqa [rsp+120h],xmm0
- movdqa xmm0,[rcx+rbp]
- punpcklbw xmm0,xmm4
- paddw xmm7,xmm1
- movdqa xmm4,xmm15
- psllw xmm0,1
- psraw xmm7,3
- paddw xmm0,xmm6
- pand xmm7,xmm15
- paddw xmm0,xmm6
- paddw xmm0,xmm6
- paddw xmm0,xmm14
- movdqa xmm6,xmm15
- paddw xmm0,xmm5
- pandn xmm6,xmm13
- paddw xmm0,xmm8
- paddw xmm0,xmm1
- psraw xmm0,3
- movdqa xmm1,xmm12
- paddw xmm1,xmm13
- pand xmm0,xmm3
- movdqa [rsp+100h],xmm0
- movdqa xmm0,xmm8
- paddw xmm0,xmm5
- paddw xmm1,xmm0
- movdqa xmm0,xmm3
- paddw xmm1,xmm2
- psraw xmm1,2
- pandn xmm0,xmm14
- pand xmm4,xmm1
- movdqa [rsp+0E0h],xmm0
- movdqa xmm0,xmm5
- paddw xmm0,xmm8
- movdqa xmm1,[rsp+60h]
- paddw xmm1,xmm14
- movdqa xmm14,xmm3
- paddw xmm1,xmm0
- movdqa xmm0,xmm8
- paddw xmm0,[rsp+30h]
- paddw xmm1,xmm2
- psraw xmm1,2
- pand xmm14,xmm1
- movdqa xmm1,xmm13
- paddw xmm1,xmm13
- paddw xmm1,xmm0
- paddw xmm1,xmm2
- psraw xmm1,2
- movdqa xmm0,[rsp+30h]
- movdqa xmm2,xmm13
- movdqa xmm5,xmm15
- paddw xmm0,[rsp+70h]
- pandn xmm5,xmm1
- paddw xmm2,xmm8
- movdqa xmm8,[rsp+90h]
- movdqa xmm1,xmm12
- paddw xmm2,xmm8
- psllw xmm2,1
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,xmm8
- movdqa xmm8,xmm3
- movdqa xmm2,[rsp+30h]
- paddw xmm0,xmm13
- psraw xmm1,3
- pand xmm15,xmm1
- movdqa xmm1,xmm2
- paddw xmm1,xmm2
- paddw xmm2,[rsp+90h]
- paddw xmm2,[rsp+0B0h]
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- movdqa xmm13,[r8]
- paddw xmm0, [rsp+70h]
- paddw xmm1, [rsp+0A0h]
- psllw xmm2,1
- paddw xmm2,xmm0
- psraw xmm1,2
- movdqa xmm0, [rdi]
- pandn xmm8,xmm1
- movdqa xmm1, [rsp+60h]
- paddw xmm1,xmm2
- movdqa xmm2, [rbx]
- psraw xmm1,3
- pand xmm3,xmm1
- movdqa xmm1, [rbp]
- movdqa [rsp+0D0h],xmm3
- pxor xmm3,xmm3
- punpckhbw xmm0,xmm3
- punpckhbw xmm1,xmm3
- punpckhbw xmm13,xmm3
- movdqa [rsp+0C0h],xmm0
- movdqa xmm0,[r10+rbp]
- movdqa [rsp],xmm1
- punpckhbw xmm0,xmm3
- punpckhbw xmm2,xmm3
- movdqa [rsp+80h],xmm0
- movdqa xmm0,[rsi+rbp]
- movdqa [rsp+10h],xmm13
- punpckhbw xmm0,xmm3
- movdqa [rsp+50h],xmm0
- movdqa xmm0,xmm1
- movdqa xmm1,xmm13
- psubw xmm0,xmm13
- psubw xmm1,xmm2
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,[rsp]
- movdqa xmm13,[rsp+40h]
- movdqa [rsp+110h],xmm2
- psubw xmm1, [rsp+80h]
- pcmpgtw xmm13,xmm0
- pcmpgtw xmm11,xmm3
- pabsw xmm0,xmm1
- pcmpgtw xmm10,xmm3
- movdqa xmm1, [rsp+40h]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1
- pcmpgtw xmm2,xmm0
- movdqa xmm0, [rsp+10h]
- pand xmm13,xmm2
- pand xmm13,xmm11
- movdqa xmm11,[rsp+0C0h]
- psubw xmm0,xmm11
- pabsw xmm0,xmm0
- pcmpgtw xmm3,xmm0
- pand xmm3,xmm10
- movdqa xmm0,[rsp]
- psubw xmm0,[rsp+50h]
- movdqa xmm2,[rdx]
- pabsw xmm0,xmm0
- por xmm7,xmm9
- movdqa xmm9,[rsp+20h]
- pcmpgtw xmm1,xmm0
- pand xmm9,xmm7
- movdqa xmm7,[rsp+20h]
- movdqa xmm0,xmm7
- pandn xmm0,xmm12
- movdqa xmm12,[rsp+110h]
- pand xmm1,xmm10
- movdqa xmm10,[rsp+70h]
- movdqa [rsp+40h],xmm1
- movdqa xmm1,xmm13
- por xmm9,xmm0
- pxor xmm0,xmm0
- por xmm4,xmm6
- movdqa xmm6,xmm7
- punpckhbw xmm2,xmm0
- por xmm15,xmm5
- movdqa xmm5,[rsp+20h]
- movdqa xmm0,xmm3
- psllw xmm2,1
- pandn xmm0,xmm11
- pand xmm6,xmm4
- movdqa xmm4,[rsp]
- paddw xmm2,xmm11
- pand xmm5,xmm15
- movdqa xmm15,[rsp+20h]
- paddw xmm2,xmm11
- paddw xmm2,xmm11
- paddw xmm2,xmm12
- paddw xmm2,[rsp+10h]
- paddw xmm2,[rsp]
- paddw xmm2,xmm10
- psraw xmm2,3
- pand xmm2,xmm3
- por xmm2,xmm0
- pand xmm1,xmm2
- movdqa xmm0,xmm13
- movdqa xmm2,xmm11
- pandn xmm0,xmm11
- paddw xmm2,xmm12
- por xmm1,xmm0
- packuswb xmm9,xmm1
- movdqa xmm0,xmm7
- movdqa xmm7,[rsp+0A0h]
- pandn xmm0,[rsp+0F0h]
- movdqa xmm1,xmm3
- por xmm6,xmm0
- movdqa xmm0,[rsp+10h]
- paddw xmm0,xmm4
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm3
- pandn xmm0,xmm12
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- pandn xmm0,xmm12
- movdqa xmm1,xmm12
- paddw xmm1,[rsp+10h]
- por xmm2,xmm0
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+0B0h]
- paddw xmm1,xmm4
- packuswb xmm6,xmm2
- movdqa xmm2,xmm3
- psllw xmm1,1
- por xmm5,xmm0
- movdqa xmm0,[rsp+80h]
- paddw xmm0,xmm10
- paddw xmm1,xmm0
- paddw xmm11,xmm1
- psraw xmm11,3
- movdqa xmm1,xmm12
- pand xmm2,xmm11
- paddw xmm1,xmm12
- movdqa xmm11,[rsp+80h]
- movdqa xmm0, [rsp+10h]
- por xmm14,[rsp+0E0h]
- paddw xmm0,xmm11
- movdqa xmm4,xmm15
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- paddw xmm1,xmm7
- psraw xmm1,2
- pandn xmm3,xmm1
- por xmm2,xmm3
- movdqa xmm1,xmm13
- movdqa xmm3,[rsp+10h]
- pandn xmm0,xmm3
- pand xmm1,xmm2
- movdqa xmm2,xmm11
- paddw xmm2,[rsp]
- por xmm1,xmm0
- movdqa xmm0,[rsp+0D0h]
- por xmm0,xmm8
- paddw xmm2,xmm3
- packuswb xmm5,xmm1
- movdqa xmm8,[rsp+40h]
- movdqa xmm1,[rsp+50h]
- movdqa xmm3,xmm8
- pand xmm4,xmm0
- psllw xmm2,1
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+90h]
- por xmm4,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm10
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,[rsp]
- movdqa xmm2,xmm11
- paddw xmm0,xmm12
- movdqa xmm12,[rsp]
- paddw xmm2,xmm11
- paddw xmm2,xmm0
- psraw xmm1,3
- movdqa xmm0,xmm8
- pand xmm3,xmm1
- paddw xmm2,xmm7
- movdqa xmm1,xmm13
- psraw xmm2,2
- pandn xmm0,xmm2
- por xmm3,xmm0
- movdqa xmm2,[rsp+50h]
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm3
- paddw xmm2,xmm11
- movdqa xmm3,xmm15
- por xmm1,xmm0
- pand xmm3,xmm14
- movdqa xmm14,[rsp+10h]
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+30h]
- packuswb xmm4,xmm1
- movdqa xmm1,xmm8
- por xmm3,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm14
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm8
- pandn xmm0,xmm11
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm11
- pand xmm2,xmm1
- movdqa xmm1,xmm15
- por xmm2,xmm0
- packuswb xmm3,xmm2
- movdqa xmm0,[rsp+100h]
- por xmm0,[rsp+120h]
- pand xmm1,xmm0
- movdqa xmm2,[rcx+rbp]
- movdqa xmm7,[rsp+50h]
- pandn xmm15,[rsp+60h]
- lea r11,[rsp+1D8h]
- pxor xmm0,xmm0
- por xmm1,xmm15
- movaps xmm15,[r11-0A8h]
- movdqa [rdi],xmm9
- movaps xmm9,[r11-48h]
- punpckhbw xmm2,xmm0
- psllw xmm2,1
- paddw xmm2,xmm7
- paddw xmm2,xmm7
- movdqa [rbx],xmm6
- movaps xmm6,[r11-18h]
- paddw xmm2,xmm7
- paddw xmm2,xmm11
- movaps xmm11,[r11-68h]
- paddw xmm2,xmm12
- movaps xmm12,[r11-78h]
- paddw xmm2,xmm14
- paddw xmm2,xmm10
- psraw xmm2,3
- movaps xmm10,[r11-58h]
- movaps xmm14,[r11-98h]
- movdqa xmm0,xmm13
- pand xmm2,xmm8
- pandn xmm8,xmm7
- pandn xmm13,xmm7
- por xmm2,xmm8
- movaps xmm7,[r11-28h]
- movaps xmm8,[r11-38h]
- movdqa [r8],xmm5
- pand xmm0,xmm2
- por xmm0,xmm13
- packuswb xmm1,xmm0
- movaps xmm13,[r11-88h]
- movdqa [rbp],xmm4
- movdqa [r10+rbp],xmm3
- movdqa [rsi+rbp],xmm1
- mov rsp,r11
- pop rbp
- pop rbx
- ret
+ mov rax,rsp
+ push rbx
+ push rbp
+ mov r8, rdx
+ mov r9, rcx
+ mov rcx, rdi
+ mov rdx, rsi
+ sub rsp,1D8h
+ movaps [rax-38h],xmm6
+ movaps [rax-48h],xmm7
+ movaps [rax-58h],xmm8
+ pxor xmm1,xmm1
+ movsxd r10,edx
+ mov rbp,rcx
+ mov r11d,r8d
+ mov rdx,rcx
+ mov rdi,rbp
+ mov rbx,rbp
+ movdqa xmm5,[rbp]
+ movaps [rax-68h],xmm9
+ movaps [rax-78h],xmm10
+ punpcklbw xmm5,xmm1
+ movaps [rax-88h],xmm11
+ movaps [rax-98h],xmm12
+ movaps [rax-0A8h],xmm13
+ movaps [rax-0B8h],xmm14
+ movdqa xmm14,[r10+rbp]
+ movaps [rax-0C8h],xmm15
+ lea eax,[r10*4]
+ movsxd r8,eax
+ lea eax,[r10+r10*2]
+ movsxd rcx,eax
+ lea eax,[r10+r10]
+ sub rdx,r8
+ punpcklbw xmm14,xmm1
+ movdqa [rsp+90h],xmm5
+ movdqa [rsp+30h],xmm14
+ movsxd rsi,eax
+ movsx eax,r11w
+ sub rdi,rcx
+ sub rbx,rsi
+ mov r8,rbp
+ sub r8,r10
+ movd xmm0,eax
+ movsx eax,r9w
+ movdqa xmm12,[rdi]
+ movdqa xmm6, [rsi+rbp]
+ movdqa xmm13,[rbx]
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm6,xmm1
+ movdqa xmm8,[r8]
+ movd xmm0,eax
+ movdqa xmm10,xmm11
+ mov eax,2
+ punpcklbw xmm8,xmm1
+ punpcklbw xmm12,xmm1
+ cwde
+ punpcklwd xmm0,xmm0
+ psraw xmm10,2
+ movdqa xmm1,xmm8
+ movdqa [rsp+0F0h],xmm13
+ movdqa [rsp+0B0h],xmm8
+ pshufd xmm7,xmm0,0
+ psubw xmm1,xmm13
+ movdqa xmm0,xmm5
+ movdqa xmm4,xmm7
+ movdqa xmm2,xmm7
+ psubw xmm0,xmm8
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm5
+ movdqa [rsp+40h],xmm7
+ movdqa [rsp+60h],xmm6
+ pcmpgtw xmm4,xmm0
+ psubw xmm1,xmm14
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm4,xmm2
+ movdqa xmm0,xmm11
+ pcmpgtw xmm0,xmm3
+ pand xmm4,xmm0
+ movd xmm0,eax
+ movdqa [rsp+20h],xmm4
+ punpcklwd xmm0,xmm0
+ pshufd xmm2,xmm0,0
+ paddw xmm10,xmm2
+ movdqa [rsp+0A0h],xmm2
+ movdqa xmm15,xmm7
+ pxor xmm4,xmm4
+ movdqa xmm0,xmm8
+ psubw xmm0,xmm12
+ mov eax,4
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm10
+ cwde
+ pcmpgtw xmm15,xmm0
+ pcmpgtw xmm1,xmm3
+ movdqa xmm3,xmm7
+ movdqa xmm7,[rdx]
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm6
+ pand xmm15,xmm1
+ punpcklbw xmm7,xmm4
+ movdqa xmm9,xmm15
+ pabsw xmm0,xmm0
+ psllw xmm7,1
+ pandn xmm9,xmm12
+ pcmpgtw xmm3,xmm0
+ paddw xmm7,xmm12
+ movd xmm0,eax
+ pand xmm3,xmm1
+ paddw xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ paddw xmm7,xmm12
+ pshufd xmm1,xmm0,0
+ paddw xmm7,xmm13
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm6
+ paddw xmm7,xmm8
+ movdqa [rsp+70h],xmm1
+ paddw xmm7,xmm5
+ movdqa [rsp+120h],xmm0
+ movdqa xmm0,[rcx+rbp]
+ punpcklbw xmm0,xmm4
+ paddw xmm7,xmm1
+ movdqa xmm4,xmm15
+ psllw xmm0,1
+ psraw xmm7,3
+ paddw xmm0,xmm6
+ pand xmm7,xmm15
+ paddw xmm0,xmm6
+ paddw xmm0,xmm6
+ paddw xmm0,xmm14
+ movdqa xmm6,xmm15
+ paddw xmm0,xmm5
+ pandn xmm6,xmm13
+ paddw xmm0,xmm8
+ paddw xmm0,xmm1
+ psraw xmm0,3
+ movdqa xmm1,xmm12
+ paddw xmm1,xmm13
+ pand xmm0,xmm3
+ movdqa [rsp+100h],xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,xmm5
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm3
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pandn xmm0,xmm14
+ pand xmm4,xmm1
+ movdqa [rsp+0E0h],xmm0
+ movdqa xmm0,xmm5
+ paddw xmm0,xmm8
+ movdqa xmm1,[rsp+60h]
+ paddw xmm1,xmm14
+ movdqa xmm14,xmm3
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,[rsp+30h]
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pand xmm14,xmm1
+ movdqa xmm1,xmm13
+ paddw xmm1,xmm13
+ paddw xmm1,xmm0
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ movdqa xmm0,[rsp+30h]
+ movdqa xmm2,xmm13
+ movdqa xmm5,xmm15
+ paddw xmm0,[rsp+70h]
+ pandn xmm5,xmm1
+ paddw xmm2,xmm8
+ movdqa xmm8,[rsp+90h]
+ movdqa xmm1,xmm12
+ paddw xmm2,xmm8
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,xmm8
+ movdqa xmm8,xmm3
+ movdqa xmm2,[rsp+30h]
+ paddw xmm0,xmm13
+ psraw xmm1,3
+ pand xmm15,xmm1
+ movdqa xmm1,xmm2
+ paddw xmm1,xmm2
+ paddw xmm2,[rsp+90h]
+ paddw xmm2,[rsp+0B0h]
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ movdqa xmm13,[r8]
+ paddw xmm0, [rsp+70h]
+ paddw xmm1, [rsp+0A0h]
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ psraw xmm1,2
+ movdqa xmm0, [rdi]
+ pandn xmm8,xmm1
+ movdqa xmm1, [rsp+60h]
+ paddw xmm1,xmm2
+ movdqa xmm2, [rbx]
+ psraw xmm1,3
+ pand xmm3,xmm1
+ movdqa xmm1, [rbp]
+ movdqa [rsp+0D0h],xmm3
+ pxor xmm3,xmm3
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm1,xmm3
+ punpckhbw xmm13,xmm3
+ movdqa [rsp+0C0h],xmm0
+ movdqa xmm0,[r10+rbp]
+ movdqa [rsp],xmm1
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm2,xmm3
+ movdqa [rsp+80h],xmm0
+ movdqa xmm0,[rsi+rbp]
+ movdqa [rsp+10h],xmm13
+ punpckhbw xmm0,xmm3
+ movdqa [rsp+50h],xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm1,xmm13
+ psubw xmm0,xmm13
+ psubw xmm1,xmm2
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,[rsp]
+ movdqa xmm13,[rsp+40h]
+ movdqa [rsp+110h],xmm2
+ psubw xmm1, [rsp+80h]
+ pcmpgtw xmm13,xmm0
+ pcmpgtw xmm11,xmm3
+ pabsw xmm0,xmm1
+ pcmpgtw xmm10,xmm3
+ movdqa xmm1, [rsp+40h]
+ movdqa xmm2,xmm1
+ movdqa xmm3,xmm1
+ pcmpgtw xmm2,xmm0
+ movdqa xmm0, [rsp+10h]
+ pand xmm13,xmm2
+ pand xmm13,xmm11
+ movdqa xmm11,[rsp+0C0h]
+ psubw xmm0,xmm11
+ pabsw xmm0,xmm0
+ pcmpgtw xmm3,xmm0
+ pand xmm3,xmm10
+ movdqa xmm0,[rsp]
+ psubw xmm0,[rsp+50h]
+ movdqa xmm2,[rdx]
+ pabsw xmm0,xmm0
+ por xmm7,xmm9
+ movdqa xmm9,[rsp+20h]
+ pcmpgtw xmm1,xmm0
+ pand xmm9,xmm7
+ movdqa xmm7,[rsp+20h]
+ movdqa xmm0,xmm7
+ pandn xmm0,xmm12
+ movdqa xmm12,[rsp+110h]
+ pand xmm1,xmm10
+ movdqa xmm10,[rsp+70h]
+ movdqa [rsp+40h],xmm1
+ movdqa xmm1,xmm13
+ por xmm9,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm6
+ movdqa xmm6,xmm7
+ punpckhbw xmm2,xmm0
+ por xmm15,xmm5
+ movdqa xmm5,[rsp+20h]
+ movdqa xmm0,xmm3
+ psllw xmm2,1
+ pandn xmm0,xmm11
+ pand xmm6,xmm4
+ movdqa xmm4,[rsp]
+ paddw xmm2,xmm11
+ pand xmm5,xmm15
+ movdqa xmm15,[rsp+20h]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm11
+ paddw xmm2,xmm12
+ paddw xmm2,[rsp+10h]
+ paddw xmm2,[rsp]
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ pand xmm2,xmm3
+ por xmm2,xmm0
+ pand xmm1,xmm2
+ movdqa xmm0,xmm13
+ movdqa xmm2,xmm11
+ pandn xmm0,xmm11
+ paddw xmm2,xmm12
+ por xmm1,xmm0
+ packuswb xmm9,xmm1
+ movdqa xmm0,xmm7
+ movdqa xmm7,[rsp+0A0h]
+ pandn xmm0,[rsp+0F0h]
+ movdqa xmm1,xmm3
+ por xmm6,xmm0
+ movdqa xmm0,[rsp+10h]
+ paddw xmm0,xmm4
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm12
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ pandn xmm0,xmm12
+ movdqa xmm1,xmm12
+ paddw xmm1,[rsp+10h]
+ por xmm2,xmm0
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+0B0h]
+ paddw xmm1,xmm4
+ packuswb xmm6,xmm2
+ movdqa xmm2,xmm3
+ psllw xmm1,1
+ por xmm5,xmm0
+ movdqa xmm0,[rsp+80h]
+ paddw xmm0,xmm10
+ paddw xmm1,xmm0
+ paddw xmm11,xmm1
+ psraw xmm11,3
+ movdqa xmm1,xmm12
+ pand xmm2,xmm11
+ paddw xmm1,xmm12
+ movdqa xmm11,[rsp+80h]
+ movdqa xmm0, [rsp+10h]
+ por xmm14,[rsp+0E0h]
+ paddw xmm0,xmm11
+ movdqa xmm4,xmm15
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ paddw xmm1,xmm7
+ psraw xmm1,2
+ pandn xmm3,xmm1
+ por xmm2,xmm3
+ movdqa xmm1,xmm13
+ movdqa xmm3,[rsp+10h]
+ pandn xmm0,xmm3
+ pand xmm1,xmm2
+ movdqa xmm2,xmm11
+ paddw xmm2,[rsp]
+ por xmm1,xmm0
+ movdqa xmm0,[rsp+0D0h]
+ por xmm0,xmm8
+ paddw xmm2,xmm3
+ packuswb xmm5,xmm1
+ movdqa xmm8,[rsp+40h]
+ movdqa xmm1,[rsp+50h]
+ movdqa xmm3,xmm8
+ pand xmm4,xmm0
+ psllw xmm2,1
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+90h]
+ por xmm4,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm10
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,[rsp]
+ movdqa xmm2,xmm11
+ paddw xmm0,xmm12
+ movdqa xmm12,[rsp]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm0
+ psraw xmm1,3
+ movdqa xmm0,xmm8
+ pand xmm3,xmm1
+ paddw xmm2,xmm7
+ movdqa xmm1,xmm13
+ psraw xmm2,2
+ pandn xmm0,xmm2
+ por xmm3,xmm0
+ movdqa xmm2,[rsp+50h]
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm3
+ paddw xmm2,xmm11
+ movdqa xmm3,xmm15
+ por xmm1,xmm0
+ pand xmm3,xmm14
+ movdqa xmm14,[rsp+10h]
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+30h]
+ packuswb xmm4,xmm1
+ movdqa xmm1,xmm8
+ por xmm3,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm14
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm8
+ pandn xmm0,xmm11
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm11
+ pand xmm2,xmm1
+ movdqa xmm1,xmm15
+ por xmm2,xmm0
+ packuswb xmm3,xmm2
+ movdqa xmm0,[rsp+100h]
+ por xmm0,[rsp+120h]
+ pand xmm1,xmm0
+ movdqa xmm2,[rcx+rbp]
+ movdqa xmm7,[rsp+50h]
+ pandn xmm15,[rsp+60h]
+ lea r11,[rsp+1D8h]
+ pxor xmm0,xmm0
+ por xmm1,xmm15
+ movaps xmm15,[r11-0A8h]
+ movdqa [rdi],xmm9
+ movaps xmm9,[r11-48h]
+ punpckhbw xmm2,xmm0
+ psllw xmm2,1
+ paddw xmm2,xmm7
+ paddw xmm2,xmm7
+ movdqa [rbx],xmm6
+ movaps xmm6,[r11-18h]
+ paddw xmm2,xmm7
+ paddw xmm2,xmm11
+ movaps xmm11,[r11-68h]
+ paddw xmm2,xmm12
+ movaps xmm12,[r11-78h]
+ paddw xmm2,xmm14
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ movaps xmm10,[r11-58h]
+ movaps xmm14,[r11-98h]
+ movdqa xmm0,xmm13
+ pand xmm2,xmm8
+ pandn xmm8,xmm7
+ pandn xmm13,xmm7
+ por xmm2,xmm8
+ movaps xmm7,[r11-28h]
+ movaps xmm8,[r11-38h]
+ movdqa [r8],xmm5
+ pand xmm0,xmm2
+ por xmm0,xmm13
+ packuswb xmm1,xmm0
+ movaps xmm13,[r11-88h]
+ movdqa [rbp],xmm4
+ movdqa [r10+rbp],xmm3
+ movdqa [rsi+rbp],xmm1
+ mov rsp,r11
+ pop rbp
+ pop rbx
+ ret
WELS_EXTERN DeblockChromaLt4V_ssse3
- mov rax,rsp
- push rbx
- push rbp
- mov r10, rdx
- mov r11, rcx
- mov rcx, rdi
- mov rdx, rsi
- mov rsi, r10
- mov r10, r9
- mov rbp, r8
- mov r8, rsi
- mov r9, r11
- sub rsp,0C8h
- pxor xmm1,xmm1
- mov rbx,rcx
- movsxd r11,r8d
- movsx ecx,byte [r10]
- movsx r8d,byte [r10+2]
- mov rdi,rdx
- movq xmm2,[rbx]
- movq xmm9,[r11+rbx]
- movsx edx,byte [r10+1]
- mov word [rsp+2],cx
- mov word [rsp],cx
- movsx eax,byte [r10+3]
- mov word [rsp+6],dx
- mov word [rsp+4],dx
- movdqa xmm11,xmm1
- mov word [rsp+0Eh],ax
- mov word [rsp+0Ch],ax
- lea eax,[r11+r11]
- movsxd rcx,eax
- mov rax,rbx
- mov rdx,rdi
- sub rax,rcx
- mov word [rsp+0Ah],r8w
- mov word [rsp+8],r8w
- movdqa xmm6,[rsp]
- movdqa xmm7,xmm6
- movq xmm13, [rax]
- mov rax,rdi
- sub rax,rcx
- mov rcx,rbx
- pcmpgtw xmm7,xmm1
- psubw xmm11,xmm6
- sub rcx,r11
- sub rdx,r11
- movq xmm0,[rax]
- movsx eax,r9w
- movq xmm15,[rcx]
- punpcklqdq xmm13,xmm0
- movq xmm0, [rdx]
- movdqa xmm4,xmm13
- punpcklqdq xmm15,xmm0
- movq xmm0, [rdi]
- punpcklbw xmm4,xmm1
- movdqa xmm12,xmm15
- punpcklqdq xmm2,xmm0
- movq xmm0, [r11+rdi]
- punpcklbw xmm12,xmm1
- movdqa xmm14,xmm2
- punpcklqdq xmm9,xmm0
- punpckhbw xmm2,xmm1
- punpcklbw xmm14,xmm1
- movd xmm0,eax
- mov eax, ebp ; iBeta
- punpckhbw xmm13,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm9
- movdqa [rsp+10h],xmm2
- punpcklwd xmm0,xmm0
- punpckhbw xmm9,xmm1
- punpcklbw xmm3,xmm1
- movdqa xmm1,xmm14
- pshufd xmm10,xmm0,0
- movd xmm0,eax
- mov eax,4
- cwde
- punpcklwd xmm0,xmm0
- pshufd xmm8,xmm0,0
- movd xmm0,eax
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- psubw xmm1,xmm12
- movdqa xmm2,xmm10
- lea r11,[rsp+0C8h]
- psllw xmm1,2
- movdqa xmm0,xmm4
- psubw xmm4,xmm12
- psubw xmm0,xmm3
- psubw xmm3,xmm14
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm11
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm12
- psubw xmm0,xmm14
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- movdqa xmm3,[rsp]
- pand xmm2,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- psubw xmm0,xmm9
- psubw xmm13,xmm15
- pand xmm2,xmm7
- pand xmm6,xmm2
- paddw xmm12,xmm6
- psubw xmm14,xmm6
- movdqa xmm2,[rsp+10h]
- movaps xmm6,[r11-18h]
- movdqa xmm1,xmm2
- psubw xmm1,xmm15
- psubw xmm9,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm15
- psubw xmm0,xmm2
- psraw xmm1,3
- pmaxsw xmm11,xmm1
- pabsw xmm0,xmm0
- movdqa xmm1,xmm8
- pcmpgtw xmm10,xmm0
- pabsw xmm0,xmm13
- pminsw xmm3,xmm11
- movaps xmm11,[r11-68h]
- movaps xmm13,[rsp+40h]
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm9
- movaps xmm9, [r11-48h]
- pand xmm10,xmm1
- pcmpgtw xmm8,xmm0
- pand xmm10,xmm8
- pand xmm10,xmm7
- movaps xmm8,[r11-38h]
- movaps xmm7,[r11-28h]
- pand xmm3,xmm10
- paddw xmm15,xmm3
- psubw xmm2,xmm3
- movaps xmm10,[r11-58h]
- packuswb xmm12,xmm15
- movaps xmm15,[rsp+20h]
- packuswb xmm14,xmm2
- movq [rcx],xmm12
- movq [rbx],xmm14
- psrldq xmm12,8
- psrldq xmm14,8
- movq [rdx],xmm12
- movaps xmm12,[r11-78h]
- movq [rdi],xmm14
- movaps xmm14,[rsp+30h]
- mov rsp,r11
- pop rbp
- pop rbx
- ret
+ mov rax,rsp
+ push rbx
+ push rbp
+ mov r10, rdx
+ mov r11, rcx
+ mov rcx, rdi
+ mov rdx, rsi
+ mov rsi, r10
+ mov r10, r9
+ mov rbp, r8
+ mov r8, rsi
+ mov r9, r11
+ sub rsp,0C8h
+ pxor xmm1,xmm1
+ mov rbx,rcx
+ movsxd r11,r8d
+ movsx ecx,byte [r10]
+ movsx r8d,byte [r10+2]
+ mov rdi,rdx
+ movq xmm2,[rbx]
+ movq xmm9,[r11+rbx]
+ movsx edx,byte [r10+1]
+ mov word [rsp+2],cx
+ mov word [rsp],cx
+ movsx eax,byte [r10+3]
+ mov word [rsp+6],dx
+ mov word [rsp+4],dx
+ movdqa xmm11,xmm1
+ mov word [rsp+0Eh],ax
+ mov word [rsp+0Ch],ax
+ lea eax,[r11+r11]
+ movsxd rcx,eax
+ mov rax,rbx
+ mov rdx,rdi
+ sub rax,rcx
+ mov word [rsp+0Ah],r8w
+ mov word [rsp+8],r8w
+ movdqa xmm6,[rsp]
+ movdqa xmm7,xmm6
+ movq xmm13, [rax]
+ mov rax,rdi
+ sub rax,rcx
+ mov rcx,rbx
+ pcmpgtw xmm7,xmm1
+ psubw xmm11,xmm6
+ sub rcx,r11
+ sub rdx,r11
+ movq xmm0,[rax]
+ movsx eax,r9w
+ movq xmm15,[rcx]
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rdx]
+ movdqa xmm4,xmm13
+ punpcklqdq xmm15,xmm0
+ movq xmm0, [rdi]
+ punpcklbw xmm4,xmm1
+ movdqa xmm12,xmm15
+ punpcklqdq xmm2,xmm0
+ movq xmm0, [r11+rdi]
+ punpcklbw xmm12,xmm1
+ movdqa xmm14,xmm2
+ punpcklqdq xmm9,xmm0
+ punpckhbw xmm2,xmm1
+ punpcklbw xmm14,xmm1
+ movd xmm0,eax
+ mov eax, ebp ; iBeta
+ punpckhbw xmm13,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm9
+ movdqa [rsp+10h],xmm2
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm9,xmm1
+ punpcklbw xmm3,xmm1
+ movdqa xmm1,xmm14
+ pshufd xmm10,xmm0,0
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ punpcklwd xmm0,xmm0
+ pshufd xmm8,xmm0,0
+ movd xmm0,eax
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ psubw xmm1,xmm12
+ movdqa xmm2,xmm10
+ lea r11,[rsp+0C8h]
+ psllw xmm1,2
+ movdqa xmm0,xmm4
+ psubw xmm4,xmm12
+ psubw xmm0,xmm3
+ psubw xmm3,xmm14
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm11
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm12
+ psubw xmm0,xmm14
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ movdqa xmm3,[rsp]
+ pand xmm2,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ psubw xmm0,xmm9
+ psubw xmm13,xmm15
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ paddw xmm12,xmm6
+ psubw xmm14,xmm6
+ movdqa xmm2,[rsp+10h]
+ movaps xmm6,[r11-18h]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm15
+ psubw xmm9,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm15
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ pmaxsw xmm11,xmm1
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm8
+ pcmpgtw xmm10,xmm0
+ pabsw xmm0,xmm13
+ pminsw xmm3,xmm11
+ movaps xmm11,[r11-68h]
+ movaps xmm13,[rsp+40h]
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm9
+ movaps xmm9, [r11-48h]
+ pand xmm10,xmm1
+ pcmpgtw xmm8,xmm0
+ pand xmm10,xmm8
+ pand xmm10,xmm7
+ movaps xmm8,[r11-38h]
+ movaps xmm7,[r11-28h]
+ pand xmm3,xmm10
+ paddw xmm15,xmm3
+ psubw xmm2,xmm3
+ movaps xmm10,[r11-58h]
+ packuswb xmm12,xmm15
+ movaps xmm15,[rsp+20h]
+ packuswb xmm14,xmm2
+ movq [rcx],xmm12
+ movq [rbx],xmm14
+ psrldq xmm12,8
+ psrldq xmm14,8
+ movq [rdx],xmm12
+ movaps xmm12,[r11-78h]
+ movq [rdi],xmm14
+ movaps xmm14,[rsp+30h]
+ mov rsp,r11
+ pop rbp
+ pop rbx
+ ret
WELS_EXTERN DeblockChromaEq4V_ssse3
- mov rax,rsp
- push rbx
- push rbp
+ mov rax,rsp
+ push rbx
+ push rbp
- mov rbp, r8
- mov r8, rdx
- mov r9, rcx
- mov rcx, rdi
- mov rdx, rsi
+ mov rbp, r8
+ mov r8, rdx
+ mov r9, rcx
+ mov rcx, rdi
+ mov rdx, rsi
- sub rsp,90h
- pxor xmm1,xmm1
- mov r11,rcx
- mov rbx,rdx
- mov r10d,r9d
- movq xmm13,[r11]
- lea eax,[r8+r8]
- movsxd r9,eax
- mov rax,rcx
- sub rax,r9
- movq xmm14,[rax]
- mov rax,rdx
- sub rax,r9
- movq xmm0,[rax]
- movsxd rax,r8d
- sub rcx,rax
- sub rdx,rax
- movq xmm12,[rax+r11]
- movq xmm10,[rcx]
- punpcklqdq xmm14,xmm0
- movdqa xmm8,xmm14
- movq xmm0,[rdx]
- punpcklbw xmm8,xmm1
- punpckhbw xmm14,xmm1
- punpcklqdq xmm10,xmm0
- movq xmm0,[rbx]
- movdqa xmm5,xmm10
- punpcklqdq xmm13,xmm0
- movq xmm0, [rax+rbx]
- punpcklbw xmm5,xmm1
- movsx eax,r10w
- movdqa xmm9,xmm13
- punpcklqdq xmm12,xmm0
- punpcklbw xmm9,xmm1
- punpckhbw xmm10,xmm1
- movd xmm0,eax
- mov eax, ebp ; iBeta
- punpckhbw xmm13,xmm1
- movdqa xmm7,xmm12
- punpcklwd xmm0,xmm0
- punpckhbw xmm12,xmm1
- pshufd xmm11,xmm0,0
- punpcklbw xmm7,xmm1
- movd xmm0,eax
- movdqa xmm1,xmm8
- psubw xmm1,xmm5
- punpcklwd xmm0,xmm0
- movdqa xmm6,xmm11
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm5
- psubw xmm0,xmm9
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm10
- movdqa xmm1,xmm14
- psubw xmm0,xmm13
- psubw xmm1,xmm10
- pabsw xmm0,xmm0
- pcmpgtw xmm11,xmm0
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm11,xmm2
- movdqa xmm0,xmm12
- movdqa xmm4,xmm6
- movdqa xmm1,xmm8
- mov eax,2
- cwde
- paddw xmm1,xmm8
- psubw xmm0,xmm13
- paddw xmm1,xmm5
- pabsw xmm0,xmm0
- movdqa xmm2,xmm14
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm14
- movd xmm0,eax
- pand xmm11,xmm3
- paddw xmm7,xmm7
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- paddw xmm2,xmm12
- paddw xmm12,xmm12
- pshufd xmm3,xmm0,0
- paddw xmm7,xmm9
- paddw xmm12,xmm13
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm5
- paddw xmm7,xmm8
- psraw xmm1,2
- paddw xmm12,xmm14
- paddw xmm7,xmm3
- ;movaps xmm14,[rsp]
- pand xmm4,xmm1
- paddw xmm12,xmm3
- psraw xmm7,2
- movdqa xmm1,xmm11
- por xmm4,xmm0
- psraw xmm12,2
- paddw xmm2,xmm3
- movdqa xmm0,xmm11
- pandn xmm0,xmm10
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- packuswb xmm4,xmm1
- movdqa xmm0,xmm11
- movdqa xmm1,xmm6
- pand xmm1,xmm7
- movq [rcx],xmm4
- pandn xmm6,xmm9
- pandn xmm11,xmm13
- pand xmm0,xmm12
- por xmm1,xmm6
- por xmm0,xmm11
- psrldq xmm4,8
- packuswb xmm1,xmm0
- movq [r11],xmm1
- psrldq xmm1,8
- movq [rdx],xmm4
- lea r11,[rsp+90h]
- movq [rbx],xmm1
- mov rsp,r11
- pop rbp
- pop rbx
- ret
+ sub rsp,90h
+ pxor xmm1,xmm1
+ mov r11,rcx
+ mov rbx,rdx
+ mov r10d,r9d
+ movq xmm13,[r11]
+ lea eax,[r8+r8]
+ movsxd r9,eax
+ mov rax,rcx
+ sub rax,r9
+ movq xmm14,[rax]
+ mov rax,rdx
+ sub rax,r9
+ movq xmm0,[rax]
+ movsxd rax,r8d
+ sub rcx,rax
+ sub rdx,rax
+ movq xmm12,[rax+r11]
+ movq xmm10,[rcx]
+ punpcklqdq xmm14,xmm0
+ movdqa xmm8,xmm14
+ movq xmm0,[rdx]
+ punpcklbw xmm8,xmm1
+ punpckhbw xmm14,xmm1
+ punpcklqdq xmm10,xmm0
+ movq xmm0,[rbx]
+ movdqa xmm5,xmm10
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rax+rbx]
+ punpcklbw xmm5,xmm1
+ movsx eax,r10w
+ movdqa xmm9,xmm13
+ punpcklqdq xmm12,xmm0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm10,xmm1
+ movd xmm0,eax
+ mov eax, ebp ; iBeta
+ punpckhbw xmm13,xmm1
+ movdqa xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm12,xmm1
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm7,xmm1
+ movd xmm0,eax
+ movdqa xmm1,xmm8
+ psubw xmm1,xmm5
+ punpcklwd xmm0,xmm0
+ movdqa xmm6,xmm11
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm9
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm10
+ movdqa xmm1,xmm14
+ psubw xmm0,xmm13
+ psubw xmm1,xmm10
+ pabsw xmm0,xmm0
+ pcmpgtw xmm11,xmm0
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm11,xmm2
+ movdqa xmm0,xmm12
+ movdqa xmm4,xmm6
+ movdqa xmm1,xmm8
+ mov eax,2
+ cwde
+ paddw xmm1,xmm8
+ psubw xmm0,xmm13
+ paddw xmm1,xmm5
+ pabsw xmm0,xmm0
+ movdqa xmm2,xmm14
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm14
+ movd xmm0,eax
+ pand xmm11,xmm3
+ paddw xmm7,xmm7
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ paddw xmm2,xmm12
+ paddw xmm12,xmm12
+ pshufd xmm3,xmm0,0
+ paddw xmm7,xmm9
+ paddw xmm12,xmm13
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm5
+ paddw xmm7,xmm8
+ psraw xmm1,2
+ paddw xmm12,xmm14
+ paddw xmm7,xmm3
+ ;movaps xmm14,[rsp]
+ pand xmm4,xmm1
+ paddw xmm12,xmm3
+ psraw xmm7,2
+ movdqa xmm1,xmm11
+ por xmm4,xmm0
+ psraw xmm12,2
+ paddw xmm2,xmm3
+ movdqa xmm0,xmm11
+ pandn xmm0,xmm10
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ packuswb xmm4,xmm1
+ movdqa xmm0,xmm11
+ movdqa xmm1,xmm6
+ pand xmm1,xmm7
+ movq [rcx],xmm4
+ pandn xmm6,xmm9
+ pandn xmm11,xmm13
+ pand xmm0,xmm12
+ por xmm1,xmm6
+ por xmm0,xmm11
+ psrldq xmm4,8
+ packuswb xmm1,xmm0
+ movq [r11],xmm1
+ psrldq xmm1,8
+ movq [rdx],xmm4
+ lea r11,[rsp+90h]
+ movq [rbx],xmm1
+ mov rsp,r11
+ pop rbp
+ pop rbx
+ ret
WELS_EXTERN DeblockChromaEq4H_ssse3
- mov rax,rsp
- push rbx
- push rbp
- push r12
+ mov rax,rsp
+ push rbx
+ push rbp
+ push r12
- mov rbp, r8
- mov r8, rdx
- mov r9, rcx
- mov rcx, rdi
- mov rdx, rsi
- mov rdi, rdx
+ mov rbp, r8
+ mov r8, rdx
+ mov r9, rcx
+ mov rcx, rdi
+ mov rdx, rsi
+ mov rdi, rdx
- sub rsp,140h
- lea eax,[r8*4]
- movsxd r10,eax
- mov eax,[rcx-2]
- mov [rsp+10h],eax
- lea rbx,[r10+rdx-2]
- lea r11,[r10+rcx-2]
+ sub rsp,140h
+ lea eax,[r8*4]
+ movsxd r10,eax
+ mov eax,[rcx-2]
+ mov [rsp+10h],eax
+ lea rbx,[r10+rdx-2]
+ lea r11,[r10+rcx-2]
- movdqa xmm5,[rsp+10h]
- movsxd r10,r8d
- mov eax,[r10+rcx-2]
- lea rdx,[r10+r10*2]
- mov [rsp+20h],eax
- mov eax,[rcx+r10*2-2]
- mov [rsp+30h],eax
- mov eax,[rdx+rcx-2]
- movdqa xmm2,[rsp+20h]
- mov [rsp+40h],eax
- mov eax, [rdi-2]
- movdqa xmm4,[rsp+30h]
- mov [rsp+50h],eax
- mov eax,[r10+rdi-2]
- movdqa xmm3,[rsp+40h]
- mov [rsp+60h],eax
- mov eax,[rdi+r10*2-2]
- punpckldq xmm5,[rsp+50h]
- mov [rsp+70h],eax
- mov eax, [rdx+rdi-2]
- punpckldq xmm2, [rsp+60h]
- mov [rsp+80h],eax
- mov eax,[r11]
- punpckldq xmm4, [rsp+70h]
- mov [rsp+50h],eax
- mov eax,[rbx]
- punpckldq xmm3,[rsp+80h]
- mov [rsp+60h],eax
- mov eax,[r10+r11]
- movdqa xmm0, [rsp+50h]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[r10+rbx]
- movdqa xmm0,[rsp+50h]
- movdqa xmm1,xmm5
- mov [rsp+60h],eax
- mov eax,[r11+r10*2]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[rbx+r10*2]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- mov eax, [rdx+r11]
- movdqa xmm15,xmm1
- punpckldq xmm0,[rsp+60h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax, [rdx+rbx]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm15,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm12,xmm15
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm12,xmm0
- punpckhdq xmm15,xmm0
- movdqa xmm0,xmm1
- movdqa xmm11,xmm12
- punpckldq xmm0,xmm5
- punpckhdq xmm1,xmm5
- punpcklqdq xmm11,xmm0
- punpckhqdq xmm12,xmm0
- movsx eax,r9w
- movdqa xmm14,xmm15
- punpcklqdq xmm14,xmm1
- punpckhqdq xmm15,xmm1
- pxor xmm1,xmm1
- movd xmm0,eax
- movdqa xmm4,xmm12
- movdqa xmm8,xmm11
- mov eax, ebp ; iBeta
- punpcklwd xmm0,xmm0
- punpcklbw xmm4,xmm1
- punpckhbw xmm12,xmm1
- movdqa xmm9,xmm14
- movdqa xmm7,xmm15
- movdqa xmm10,xmm15
- pshufd xmm13,xmm0,0
- punpcklbw xmm9,xmm1
- punpckhbw xmm14,xmm1
- movdqa xmm6,xmm13
- movd xmm0,eax
- movdqa [rsp],xmm11
- mov eax,2
- cwde
- punpckhbw xmm11,xmm1
- punpckhbw xmm10,xmm1
- punpcklbw xmm7,xmm1
- punpcklwd xmm0,xmm0
- punpcklbw xmm8,xmm1
- pshufd xmm3,xmm0,0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm4
- psubw xmm0,xmm9
- psubw xmm1,xmm4
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm12
- movdqa xmm1,xmm11
- psubw xmm0,xmm14
- psubw xmm1,xmm12
- movdqa xmm5,xmm6
- pabsw xmm0,xmm0
- pcmpgtw xmm13,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm2,xmm0
- paddw xmm1,xmm8
- movdqa xmm0,xmm10
- pand xmm13,xmm2
- psubw xmm0,xmm14
- paddw xmm1,xmm4
- movdqa xmm2,xmm11
- pabsw xmm0,xmm0
- paddw xmm2,xmm11
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm12
- movd xmm0,eax
- pand xmm13,xmm3
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm4
- paddw xmm2,xmm3
- psraw xmm1,2
- pand xmm5,xmm1
- por xmm5,xmm0
- paddw xmm7,xmm7
- paddw xmm10,xmm10
- psraw xmm2,2
- movdqa xmm1,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm2
- paddw xmm7,xmm9
- por xmm1,xmm0
- paddw xmm10,xmm14
- paddw xmm7,xmm8
- movdqa xmm0,xmm13
- packuswb xmm5,xmm1
- paddw xmm7,xmm3
- paddw xmm10,xmm11
- movdqa xmm1,xmm6
- paddw xmm10,xmm3
- pandn xmm6,xmm9
- psraw xmm7,2
- pand xmm1,xmm7
- psraw xmm10,2
- pandn xmm13,xmm14
- pand xmm0,xmm10
- por xmm1,xmm6
- movdqa xmm6,[rsp]
- movdqa xmm4,xmm6
- por xmm0,xmm13
- punpcklbw xmm4,xmm5
- punpckhbw xmm6,xmm5
- movdqa xmm3,xmm4
- packuswb xmm1,xmm0
- movdqa xmm0,xmm1
- punpckhbw xmm1,xmm15
- punpcklbw xmm0,xmm15
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm6
- movdqa xmm2,xmm3
- punpcklwd xmm0,xmm1
- punpckhwd xmm6,xmm1
- movdqa xmm1,xmm4
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm6
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm6
- punpckhqdq xmm2,xmm1
- movdqa [rsp+10h],xmm0
- movdqa [rsp+60h],xmm2
- movdqa xmm0,xmm3
- mov eax,[rsp+10h]
- mov [rcx-2],eax
- mov eax,[rsp+60h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [r10+rcx-2],eax
- movdqa [rsp+20h],xmm0
- mov eax, [rsp+20h]
- movdqa [rsp+70h],xmm3
- mov [rcx+r10*2-2],eax
- mov eax,[rsp+70h]
- mov [rdx+rcx-2],eax
- mov eax,[rsp+18h]
- mov [r11],eax
- mov eax,[rsp+68h]
- mov [r10+r11],eax
- mov eax,[rsp+28h]
- mov [r11+r10*2],eax
- mov eax,[rsp+78h]
- mov [rdx+r11],eax
- mov eax,[rsp+14h]
- mov [rdi-2],eax
- mov eax,[rsp+64h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+24h]
- mov [rdi+r10*2-2],eax
- mov eax, [rsp+74h]
- mov [rdx+rdi-2],eax
- mov eax, [rsp+1Ch]
- mov [rbx],eax
- mov eax, [rsp+6Ch]
- mov [r10+rbx],eax
- mov eax,[rsp+2Ch]
- mov [rbx+r10*2],eax
- mov eax,[rsp+7Ch]
- mov [rdx+rbx],eax
- lea r11,[rsp+140h]
- mov rbx, [r11+28h]
- mov rsp,r11
- pop r12
- pop rbp
- pop rbx
- ret
+ movdqa xmm5,[rsp+10h]
+ movsxd r10,r8d
+ mov eax,[r10+rcx-2]
+ lea rdx,[r10+r10*2]
+ mov [rsp+20h],eax
+ mov eax,[rcx+r10*2-2]
+ mov [rsp+30h],eax
+ mov eax,[rdx+rcx-2]
+ movdqa xmm2,[rsp+20h]
+ mov [rsp+40h],eax
+ mov eax, [rdi-2]
+ movdqa xmm4,[rsp+30h]
+ mov [rsp+50h],eax
+ mov eax,[r10+rdi-2]
+ movdqa xmm3,[rsp+40h]
+ mov [rsp+60h],eax
+ mov eax,[rdi+r10*2-2]
+ punpckldq xmm5,[rsp+50h]
+ mov [rsp+70h],eax
+ mov eax, [rdx+rdi-2]
+ punpckldq xmm2, [rsp+60h]
+ mov [rsp+80h],eax
+ mov eax,[r11]
+ punpckldq xmm4, [rsp+70h]
+ mov [rsp+50h],eax
+ mov eax,[rbx]
+ punpckldq xmm3,[rsp+80h]
+ mov [rsp+60h],eax
+ mov eax,[r10+r11]
+ movdqa xmm0, [rsp+50h]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm0,[rsp+50h]
+ movdqa xmm1,xmm5
+ mov [rsp+60h],eax
+ mov eax,[r11+r10*2]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[rbx+r10*2]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ mov eax, [rdx+r11]
+ movdqa xmm15,xmm1
+ punpckldq xmm0,[rsp+60h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax, [rdx+rbx]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm15,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm12,xmm15
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm12,xmm0
+ punpckhdq xmm15,xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm11,xmm12
+ punpckldq xmm0,xmm5
+ punpckhdq xmm1,xmm5
+ punpcklqdq xmm11,xmm0
+ punpckhqdq xmm12,xmm0
+ movsx eax,r9w
+ movdqa xmm14,xmm15
+ punpcklqdq xmm14,xmm1
+ punpckhqdq xmm15,xmm1
+ pxor xmm1,xmm1
+ movd xmm0,eax
+ movdqa xmm4,xmm12
+ movdqa xmm8,xmm11
+ mov eax, ebp ; iBeta
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm4,xmm1
+ punpckhbw xmm12,xmm1
+ movdqa xmm9,xmm14
+ movdqa xmm7,xmm15
+ movdqa xmm10,xmm15
+ pshufd xmm13,xmm0,0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm14,xmm1
+ movdqa xmm6,xmm13
+ movd xmm0,eax
+ movdqa [rsp],xmm11
+ mov eax,2
+ cwde
+ punpckhbw xmm11,xmm1
+ punpckhbw xmm10,xmm1
+ punpcklbw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm8,xmm1
+ pshufd xmm3,xmm0,0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm4
+ psubw xmm0,xmm9
+ psubw xmm1,xmm4
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm12
+ movdqa xmm1,xmm11
+ psubw xmm0,xmm14
+ psubw xmm1,xmm12
+ movdqa xmm5,xmm6
+ pabsw xmm0,xmm0
+ pcmpgtw xmm13,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm2,xmm0
+ paddw xmm1,xmm8
+ movdqa xmm0,xmm10
+ pand xmm13,xmm2
+ psubw xmm0,xmm14
+ paddw xmm1,xmm4
+ movdqa xmm2,xmm11
+ pabsw xmm0,xmm0
+ paddw xmm2,xmm11
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm12
+ movd xmm0,eax
+ pand xmm13,xmm3
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm4
+ paddw xmm2,xmm3
+ psraw xmm1,2
+ pand xmm5,xmm1
+ por xmm5,xmm0
+ paddw xmm7,xmm7
+ paddw xmm10,xmm10
+ psraw xmm2,2
+ movdqa xmm1,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm2
+ paddw xmm7,xmm9
+ por xmm1,xmm0
+ paddw xmm10,xmm14
+ paddw xmm7,xmm8
+ movdqa xmm0,xmm13
+ packuswb xmm5,xmm1
+ paddw xmm7,xmm3
+ paddw xmm10,xmm11
+ movdqa xmm1,xmm6
+ paddw xmm10,xmm3
+ pandn xmm6,xmm9
+ psraw xmm7,2
+ pand xmm1,xmm7
+ psraw xmm10,2
+ pandn xmm13,xmm14
+ pand xmm0,xmm10
+ por xmm1,xmm6
+ movdqa xmm6,[rsp]
+ movdqa xmm4,xmm6
+ por xmm0,xmm13
+ punpcklbw xmm4,xmm5
+ punpckhbw xmm6,xmm5
+ movdqa xmm3,xmm4
+ packuswb xmm1,xmm0
+ movdqa xmm0,xmm1
+ punpckhbw xmm1,xmm15
+ punpcklbw xmm0,xmm15
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm6
+ movdqa xmm2,xmm3
+ punpcklwd xmm0,xmm1
+ punpckhwd xmm6,xmm1
+ movdqa xmm1,xmm4
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm6
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm6
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+10h],xmm0
+ movdqa [rsp+60h],xmm2
+ movdqa xmm0,xmm3
+ mov eax,[rsp+10h]
+ mov [rcx-2],eax
+ mov eax,[rsp+60h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [r10+rcx-2],eax
+ movdqa [rsp+20h],xmm0
+ mov eax, [rsp+20h]
+ movdqa [rsp+70h],xmm3
+ mov [rcx+r10*2-2],eax
+ mov eax,[rsp+70h]
+ mov [rdx+rcx-2],eax
+ mov eax,[rsp+18h]
+ mov [r11],eax
+ mov eax,[rsp+68h]
+ mov [r10+r11],eax
+ mov eax,[rsp+28h]
+ mov [r11+r10*2],eax
+ mov eax,[rsp+78h]
+ mov [rdx+r11],eax
+ mov eax,[rsp+14h]
+ mov [rdi-2],eax
+ mov eax,[rsp+64h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+24h]
+ mov [rdi+r10*2-2],eax
+ mov eax, [rsp+74h]
+ mov [rdx+rdi-2],eax
+ mov eax, [rsp+1Ch]
+ mov [rbx],eax
+ mov eax, [rsp+6Ch]
+ mov [r10+rbx],eax
+ mov eax,[rsp+2Ch]
+ mov [rbx+r10*2],eax
+ mov eax,[rsp+7Ch]
+ mov [rdx+rbx],eax
+ lea r11,[rsp+140h]
+ mov rbx, [r11+28h]
+ mov rsp,r11
+ pop r12
+ pop rbp
+ pop rbx
+ ret
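For reference, the tail of the function above undoes the earlier transpose and writes each row's four edge pixels back with a single 32-bit store (the "mov [rcx-2],eax" pattern, repeated across the row base registers). A minimal scalar sketch of that store-back step, with hypothetical names and assuming the 16x4 block layout implied by the surrounding code; it is illustrative only and not part of this patch:

#include <stdint.h>
#include <string.h>

/* Illustrative scalar model (not part of this patch): write a filtered
 * rows x 4 block back one row at a time, starting two pixels to the left
 * of the vertical edge, i.e. one dword store per row at [base + r*stride - 2]. */
static void store_edge_block(uint8_t *base, int stride, int rows,
                             const uint8_t block[][4]) {
    for (int r = 0; r < rows; r++)
        memcpy(base + r * stride - 2, block[r], 4);
}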
WELS_EXTERN DeblockChromaLt4H_ssse3
- mov rax,rsp
- push rbx
- push rbp
- push r12
- push r13
- push r14
- sub rsp,170h
+ mov rax,rsp
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ sub rsp,170h
- mov r13, r8
- mov r14, r9
- mov r8, rdx
- mov r9, rcx
- mov rdx, rdi
- mov rcx, rsi
+ mov r13, r8
+ mov r14, r9
+ mov r8, rdx
+ mov r9, rcx
+ mov rdx, rdi
+ mov rcx, rsi
- movsxd rsi,r8d
- lea eax,[r8*4]
- mov r11d,r9d
- movsxd r10,eax
- mov eax, [rcx-2]
- mov r12,rdx
- mov [rsp+40h],eax
- mov eax, [rsi+rcx-2]
- lea rbx,[r10+rcx-2]
- movdqa xmm5,[rsp+40h]
- mov [rsp+50h],eax
- mov eax, [rcx+rsi*2-2]
- lea rbp,[r10+rdx-2]
- movdqa xmm2, [rsp+50h]
- mov [rsp+60h],eax
- lea r10,[rsi+rsi*2]
- mov rdi,rcx
- mov eax,[r10+rcx-2]
- movdqa xmm4,[rsp+60h]
- mov [rsp+70h],eax
- mov eax,[rdx-2]
- mov [rsp+80h],eax
- mov eax, [rsi+rdx-2]
- movdqa xmm3,[rsp+70h]
- mov [rsp+90h],eax
- mov eax,[rdx+rsi*2-2]
- punpckldq xmm5,[rsp+80h]
- mov [rsp+0A0h],eax
- mov eax, [r10+rdx-2]
- punpckldq xmm2,[rsp+90h]
- mov [rsp+0B0h],eax
- mov eax, [rbx]
- punpckldq xmm4,[rsp+0A0h]
- mov [rsp+80h],eax
- mov eax,[rbp]
- punpckldq xmm3,[rsp+0B0h]
- mov [rsp+90h],eax
- mov eax,[rsi+rbx]
- movdqa xmm0,[rsp+80h]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rsi+rbp]
- movdqa xmm0,[rsp+80h]
- movdqa xmm1,xmm5
- mov [rsp+90h],eax
- mov eax,[rbx+rsi*2]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rbp+rsi*2]
- movdqa xmm0, [rsp+80h]
- mov [rsp+90h],eax
- mov eax,[r10+rbx]
- movdqa xmm7,xmm1
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax, [r10+rbp]
- movdqa xmm0,[rsp+80h]
- mov [rsp+90h],eax
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm7,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm6,xmm7
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm6,xmm0
- punpckhdq xmm7,xmm0
- movdqa xmm0,xmm1
- punpckldq xmm0,xmm5
- mov rax, r14 ; pTC
- punpckhdq xmm1,xmm5
- movdqa xmm9,xmm6
- punpckhqdq xmm6,xmm0
- punpcklqdq xmm9,xmm0
- movdqa xmm2,xmm7
- movdqa xmm13,xmm6
- movdqa xmm4,xmm9
- movdqa [rsp+10h],xmm9
- punpcklqdq xmm2,xmm1
- punpckhqdq xmm7,xmm1
- pxor xmm1,xmm1
- movsx ecx,byte [rax+3]
- movsx edx,byte [rax+2]
- movsx r8d,byte [rax+1]
- movsx r9d,byte [rax]
- movdqa xmm10,xmm1
- movdqa xmm15,xmm2
- punpckhbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm4,xmm1
- movsx eax,r11w
- mov word [rsp+0Eh],cx
- mov word [rsp+0Ch],cx
- movdqa xmm3,xmm7
- movdqa xmm8,xmm7
- movdqa [rsp+20h],xmm7
- punpcklbw xmm15,xmm1
- punpcklbw xmm13,xmm1
- punpcklbw xmm3,xmm1
- mov word [rsp+0Ah],dx
- mov word [rsp+8],dx
- mov word [rsp+6],r8w
- movd xmm0,eax
- movdqa [rsp+30h],xmm6
- punpckhbw xmm9,xmm1
- punpckhbw xmm8,xmm1
- punpcklwd xmm0,xmm0
- mov eax, r13d ; iBeta
- mov word [rsp+4],r8w
- mov word [rsp+2],r9w
- pshufd xmm12,xmm0,0
- mov word [rsp],r9w
- movd xmm0,eax
- mov eax,4
- cwde
- movdqa xmm14, [rsp]
- movdqa [rsp],xmm2
- movdqa xmm2,xmm12
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- psubw xmm10,xmm14
- movd xmm0,eax
- movdqa xmm7,xmm14
- movdqa xmm6,xmm14
- pcmpgtw xmm7,xmm1
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- movdqa xmm0,xmm4
- movdqa xmm1,xmm15
- psubw xmm4,xmm13
- psubw xmm0,xmm3
- psubw xmm1,xmm13
- psubw xmm3,xmm15
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm10
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm11
- movdqa xmm0,xmm13
- psubw xmm0,xmm15
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- pand xmm2,xmm1
- movdqa xmm1,xmm11
- movdqa xmm3,[rsp+30h]
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm9
- pand xmm2,xmm1
- psubw xmm0,xmm8
- psubw xmm9,xmm3
- pand xmm2,xmm7
- pand xmm6,xmm2
- psubw xmm15,xmm6
- paddw xmm13,xmm6
- movdqa xmm2,[rsp]
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- psubw xmm8,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm3
- movdqa xmm5,[rsp+10h]
- psubw xmm0,xmm2
- psraw xmm1,3
- movdqa xmm4,xmm5
- pabsw xmm0,xmm0
- pmaxsw xmm10,xmm1
- movdqa xmm1,xmm11
- pcmpgtw xmm12,xmm0
- pabsw xmm0,xmm9
- pminsw xmm14,xmm10
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm8
- pcmpgtw xmm11,xmm0
- pand xmm12,xmm1
- movdqa xmm1,[rsp+20h]
- pand xmm12,xmm11
- pand xmm12,xmm7
- pand xmm14,xmm12
- paddw xmm3,xmm14
- psubw xmm2,xmm14
- packuswb xmm13,xmm3
- packuswb xmm15,xmm2
- punpcklbw xmm4,xmm13
- punpckhbw xmm5,xmm13
- movdqa xmm0,xmm15
- punpcklbw xmm0,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm4
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm5
- movdqa xmm2,xmm3
- movdqa xmm1,xmm4
- punpcklwd xmm0,xmm15
- punpckhwd xmm5,xmm15
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm5
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm5
- punpckhqdq xmm2,xmm1
- movdqa [rsp+40h],xmm0
- movdqa xmm0,xmm3
- movdqa [rsp+90h],xmm2
- mov eax,[rsp+40h]
- mov [rdi-2],eax
- mov eax, [rsp+90h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [rsi+rdi-2],eax
- movdqa [rsp+50h],xmm0
- mov eax,[rsp+50h]
- movdqa [rsp+0A0h],xmm3
- mov [rdi+rsi*2-2],eax
- mov eax,[rsp+0A0h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+48h]
- mov [rbx],eax
- mov eax,[rsp+98h]
- mov [rsi+rbx],eax
- mov eax,[rsp+58h]
- mov [rbx+rsi*2],eax
- mov eax, [rsp+0A8h]
- mov [r10+rbx],eax
- mov eax, [rsp+44h]
- mov [r12-2],eax
- mov eax,[rsp+94h]
- mov [rsi+r12-2],eax
- mov eax,[rsp+54h]
- mov [r12+rsi*2-2],eax
- mov eax, [rsp+0A4h]
- mov [r10+r12-2],eax
- mov eax,[rsp+4Ch]
- mov [rbp],eax
- mov eax,[rsp+9Ch]
- mov [rsi+rbp],eax
- mov eax, [rsp+5Ch]
- mov [rbp+rsi*2],eax
- mov eax,[rsp+0ACh]
- mov [r10+rbp],eax
- lea r11,[rsp+170h]
- mov rsp,r11
- pop r14
- pop r13
- pop r12
- pop rbp
- pop rbx
- ret
+ movsxd rsi,r8d
+ lea eax,[r8*4]
+ mov r11d,r9d
+ movsxd r10,eax
+ mov eax, [rcx-2]
+ mov r12,rdx
+ mov [rsp+40h],eax
+ mov eax, [rsi+rcx-2]
+ lea rbx,[r10+rcx-2]
+ movdqa xmm5,[rsp+40h]
+ mov [rsp+50h],eax
+ mov eax, [rcx+rsi*2-2]
+ lea rbp,[r10+rdx-2]
+ movdqa xmm2, [rsp+50h]
+ mov [rsp+60h],eax
+ lea r10,[rsi+rsi*2]
+ mov rdi,rcx
+ mov eax,[r10+rcx-2]
+ movdqa xmm4,[rsp+60h]
+ mov [rsp+70h],eax
+ mov eax,[rdx-2]
+ mov [rsp+80h],eax
+ mov eax, [rsi+rdx-2]
+ movdqa xmm3,[rsp+70h]
+ mov [rsp+90h],eax
+ mov eax,[rdx+rsi*2-2]
+ punpckldq xmm5,[rsp+80h]
+ mov [rsp+0A0h],eax
+ mov eax, [r10+rdx-2]
+ punpckldq xmm2,[rsp+90h]
+ mov [rsp+0B0h],eax
+ mov eax, [rbx]
+ punpckldq xmm4,[rsp+0A0h]
+ mov [rsp+80h],eax
+ mov eax,[rbp]
+ punpckldq xmm3,[rsp+0B0h]
+ mov [rsp+90h],eax
+ mov eax,[rsi+rbx]
+ movdqa xmm0,[rsp+80h]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rsi+rbp]
+ movdqa xmm0,[rsp+80h]
+ movdqa xmm1,xmm5
+ mov [rsp+90h],eax
+ mov eax,[rbx+rsi*2]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rbp+rsi*2]
+ movdqa xmm0, [rsp+80h]
+ mov [rsp+90h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm7,xmm1
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax, [r10+rbp]
+ movdqa xmm0,[rsp+80h]
+ mov [rsp+90h],eax
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm7,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm6,xmm7
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm6,xmm0
+ punpckhdq xmm7,xmm0
+ movdqa xmm0,xmm1
+ punpckldq xmm0,xmm5
+ mov rax, r14 ; pTC
+ punpckhdq xmm1,xmm5
+ movdqa xmm9,xmm6
+ punpckhqdq xmm6,xmm0
+ punpcklqdq xmm9,xmm0
+ movdqa xmm2,xmm7
+ movdqa xmm13,xmm6
+ movdqa xmm4,xmm9
+ movdqa [rsp+10h],xmm9
+ punpcklqdq xmm2,xmm1
+ punpckhqdq xmm7,xmm1
+ pxor xmm1,xmm1
+ movsx ecx,byte [rax+3]
+ movsx edx,byte [rax+2]
+ movsx r8d,byte [rax+1]
+ movsx r9d,byte [rax]
+ movdqa xmm10,xmm1
+ movdqa xmm15,xmm2
+ punpckhbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm4,xmm1
+ movsx eax,r11w
+ mov word [rsp+0Eh],cx
+ mov word [rsp+0Ch],cx
+ movdqa xmm3,xmm7
+ movdqa xmm8,xmm7
+ movdqa [rsp+20h],xmm7
+ punpcklbw xmm15,xmm1
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm3,xmm1
+ mov word [rsp+0Ah],dx
+ mov word [rsp+8],dx
+ mov word [rsp+6],r8w
+ movd xmm0,eax
+ movdqa [rsp+30h],xmm6
+ punpckhbw xmm9,xmm1
+ punpckhbw xmm8,xmm1
+ punpcklwd xmm0,xmm0
+ mov eax, r13d ; iBeta
+ mov word [rsp+4],r8w
+ mov word [rsp+2],r9w
+ pshufd xmm12,xmm0,0
+ mov word [rsp],r9w
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ movdqa xmm14, [rsp]
+ movdqa [rsp],xmm2
+ movdqa xmm2,xmm12
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ psubw xmm10,xmm14
+ movd xmm0,eax
+ movdqa xmm7,xmm14
+ movdqa xmm6,xmm14
+ pcmpgtw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm15
+ psubw xmm4,xmm13
+ psubw xmm0,xmm3
+ psubw xmm1,xmm13
+ psubw xmm3,xmm15
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm10
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm11
+ movdqa xmm0,xmm13
+ psubw xmm0,xmm15
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ pand xmm2,xmm1
+ movdqa xmm1,xmm11
+ movdqa xmm3,[rsp+30h]
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm9
+ pand xmm2,xmm1
+ psubw xmm0,xmm8
+ psubw xmm9,xmm3
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ psubw xmm15,xmm6
+ paddw xmm13,xmm6
+ movdqa xmm2,[rsp]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ psubw xmm8,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm3
+ movdqa xmm5,[rsp+10h]
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ movdqa xmm4,xmm5
+ pabsw xmm0,xmm0
+ pmaxsw xmm10,xmm1
+ movdqa xmm1,xmm11
+ pcmpgtw xmm12,xmm0
+ pabsw xmm0,xmm9
+ pminsw xmm14,xmm10
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm8
+ pcmpgtw xmm11,xmm0
+ pand xmm12,xmm1
+ movdqa xmm1,[rsp+20h]
+ pand xmm12,xmm11
+ pand xmm12,xmm7
+ pand xmm14,xmm12
+ paddw xmm3,xmm14
+ psubw xmm2,xmm14
+ packuswb xmm13,xmm3
+ packuswb xmm15,xmm2
+ punpcklbw xmm4,xmm13
+ punpckhbw xmm5,xmm13
+ movdqa xmm0,xmm15
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm4
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm2,xmm3
+ movdqa xmm1,xmm4
+ punpcklwd xmm0,xmm15
+ punpckhwd xmm5,xmm15
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm5
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm5
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+40h],xmm0
+ movdqa xmm0,xmm3
+ movdqa [rsp+90h],xmm2
+ mov eax,[rsp+40h]
+ mov [rdi-2],eax
+ mov eax, [rsp+90h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [rsi+rdi-2],eax
+ movdqa [rsp+50h],xmm0
+ mov eax,[rsp+50h]
+ movdqa [rsp+0A0h],xmm3
+ mov [rdi+rsi*2-2],eax
+ mov eax,[rsp+0A0h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+48h]
+ mov [rbx],eax
+ mov eax,[rsp+98h]
+ mov [rsi+rbx],eax
+ mov eax,[rsp+58h]
+ mov [rbx+rsi*2],eax
+ mov eax, [rsp+0A8h]
+ mov [r10+rbx],eax
+ mov eax, [rsp+44h]
+ mov [r12-2],eax
+ mov eax,[rsp+94h]
+ mov [rsi+r12-2],eax
+ mov eax,[rsp+54h]
+ mov [r12+rsi*2-2],eax
+ mov eax, [rsp+0A4h]
+ mov [r10+r12-2],eax
+ mov eax,[rsp+4Ch]
+ mov [rbp],eax
+ mov eax,[rsp+9Ch]
+ mov [rsi+rbp],eax
+ mov eax, [rsp+5Ch]
+ mov [rbp+rsi*2],eax
+ mov eax,[rsp+0ACh]
+ mov [r10+rbp],eax
+ lea r11,[rsp+170h]
+ mov rsp,r11
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ ret
@@ -3233,166 +3233,166 @@
; int32_t iAlpha, int32_t iBeta)
;********************************************************************************
WELS_EXTERN DeblockChromaEq4V_ssse3
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,68h
- mov edx,[ebp+10h] ; iStride
- mov eax,[ebp+8] ; pPixCb
- mov ecx,[ebp+0Ch] ; pPixCr
- movq xmm4,[ecx]
- movq xmm5,[edx+ecx]
- push esi
- push edi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- movq xmm1,[edi]
- mov edi,ecx
- sub edi,esi
- movq xmm2,[edi]
- punpcklqdq xmm1,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm2,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm3,[edi]
- punpcklqdq xmm2,xmm3
- movq xmm3,[eax]
- punpcklqdq xmm3,xmm4
- movq xmm4,[edx+eax]
- mov edx, [ebp + 14h]
- punpcklqdq xmm4,xmm5
- movd xmm5,edx
- mov edx, [ebp + 18h]
- pxor xmm0,xmm0
- movdqa xmm6,xmm5
- punpcklwd xmm6,xmm5
- pshufd xmm5,xmm6,0
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,xmm1
- punpckhbw xmm1,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+40h],xmm1
- movdqa [esp+60h],xmm7
- movdqa xmm7,xmm2
- punpcklbw xmm7,xmm0
- movdqa [esp+10h],xmm7
- movdqa xmm7,xmm3
- punpcklbw xmm7,xmm0
- punpckhbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm7,xmm4
- punpckhbw xmm4,xmm0
- punpckhbw xmm2,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+30h],xmm3
- movdqa xmm3,[esp+10h]
- movdqa xmm1,xmm3
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa [esp+20h],xmm4
- movdqa xmm0,xmm5
- pcmpgtw xmm0,xmm1
- movdqa xmm1,[esp+60h]
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- pand xmm0,xmm4
- movdqa xmm1,xmm7
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,xmm2
- psubw xmm1,[esp+30h]
- pabsw xmm1,xmm1
- pcmpgtw xmm5,xmm1
- movdqa xmm1,[esp+40h]
- pand xmm0,xmm4
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,[esp+20h]
- psubw xmm1,[esp+30h]
- pand xmm5,xmm4
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- pand xmm5,xmm6
- mov edx,2
- movsx edx,dx
- movd xmm1,edx
- movdqa xmm4,xmm1
- punpcklwd xmm4,xmm1
- pshufd xmm1,xmm4,0
- movdqa xmm4,[esp+60h]
- movdqa xmm6,xmm4
- paddw xmm6,xmm4
- paddw xmm6,xmm3
- paddw xmm6,xmm7
- movdqa [esp+10h],xmm1
- paddw xmm6,[esp+10h]
- psraw xmm6,2
- movdqa xmm4,xmm0
- pandn xmm4,xmm3
- movdqa xmm3,[esp+40h]
- movdqa xmm1,xmm0
- pand xmm1,xmm6
- por xmm1,xmm4
- movdqa xmm6,xmm3
- paddw xmm6,xmm3
- movdqa xmm3,[esp+10h]
- paddw xmm6,xmm2
- paddw xmm6,[esp+20h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm4,xmm5
- pand xmm4,xmm6
- movdqa xmm6,xmm5
- pandn xmm6,xmm2
- por xmm4,xmm6
- packuswb xmm1,xmm4
- movdqa xmm4,[esp+50h]
- movdqa xmm6,xmm7
- paddw xmm6,xmm7
- paddw xmm6,xmm4
- paddw xmm6,[esp+60h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm2,xmm0
- pand xmm2,xmm6
- pandn xmm0,xmm4
- por xmm2,xmm0
- movdqa xmm0,[esp+20h]
- movdqa xmm6,xmm0
- paddw xmm6,xmm0
- movdqa xmm0,[esp+30h]
- paddw xmm6,xmm0
- paddw xmm6,[esp+40h]
- movdqa xmm4,xmm5
- paddw xmm6,xmm3
- movq [esi],xmm1
- psraw xmm6,2
- pand xmm4,xmm6
- pandn xmm5,xmm0
- por xmm4,xmm5
- packuswb xmm2,xmm4
- movq [eax],xmm2
- psrldq xmm1,8
- movq [edi],xmm1
- pop edi
- psrldq xmm2,8
- movq [ecx],xmm2
- pop esi
- mov esp,ebp
- pop ebp
- ret
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,68h
+ mov edx,[ebp+10h] ; iStride
+ mov eax,[ebp+8] ; pPixCb
+ mov ecx,[ebp+0Ch] ; pPixCr
+ movq xmm4,[ecx]
+ movq xmm5,[edx+ecx]
+ push esi
+ push edi
+ lea esi,[edx+edx]
+ mov edi,eax
+ sub edi,esi
+ movq xmm1,[edi]
+ mov edi,ecx
+ sub edi,esi
+ movq xmm2,[edi]
+ punpcklqdq xmm1,xmm2
+ mov esi,eax
+ sub esi,edx
+ movq xmm2,[esi]
+ mov edi,ecx
+ sub edi,edx
+ movq xmm3,[edi]
+ punpcklqdq xmm2,xmm3
+ movq xmm3,[eax]
+ punpcklqdq xmm3,xmm4
+ movq xmm4,[edx+eax]
+ mov edx, [ebp + 14h]
+ punpcklqdq xmm4,xmm5
+ movd xmm5,edx
+ mov edx, [ebp + 18h]
+ pxor xmm0,xmm0
+ movdqa xmm6,xmm5
+ punpcklwd xmm6,xmm5
+ pshufd xmm5,xmm6,0
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,xmm1
+ punpckhbw xmm1,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+40h],xmm1
+ movdqa [esp+60h],xmm7
+ movdqa xmm7,xmm2
+ punpcklbw xmm7,xmm0
+ movdqa [esp+10h],xmm7
+ movdqa xmm7,xmm3
+ punpcklbw xmm7,xmm0
+ punpckhbw xmm3,xmm0
+ movdqa [esp+50h],xmm7
+ movdqa xmm7,xmm4
+ punpckhbw xmm4,xmm0
+ punpckhbw xmm2,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+30h],xmm3
+ movdqa xmm3,[esp+10h]
+ movdqa xmm1,xmm3
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa [esp+20h],xmm4
+ movdqa xmm0,xmm5
+ pcmpgtw xmm0,xmm1
+ movdqa xmm1,[esp+60h]
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ pand xmm0,xmm4
+ movdqa xmm1,xmm7
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,xmm2
+ psubw xmm1,[esp+30h]
+ pabsw xmm1,xmm1
+ pcmpgtw xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ pand xmm0,xmm4
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,[esp+20h]
+ psubw xmm1,[esp+30h]
+ pand xmm5,xmm4
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ pand xmm5,xmm6
+ mov edx,2
+ movsx edx,dx
+ movd xmm1,edx
+ movdqa xmm4,xmm1
+ punpcklwd xmm4,xmm1
+ pshufd xmm1,xmm4,0
+ movdqa xmm4,[esp+60h]
+ movdqa xmm6,xmm4
+ paddw xmm6,xmm4
+ paddw xmm6,xmm3
+ paddw xmm6,xmm7
+ movdqa [esp+10h],xmm1
+ paddw xmm6,[esp+10h]
+ psraw xmm6,2
+ movdqa xmm4,xmm0
+ pandn xmm4,xmm3
+ movdqa xmm3,[esp+40h]
+ movdqa xmm1,xmm0
+ pand xmm1,xmm6
+ por xmm1,xmm4
+ movdqa xmm6,xmm3
+ paddw xmm6,xmm3
+ movdqa xmm3,[esp+10h]
+ paddw xmm6,xmm2
+ paddw xmm6,[esp+20h]
+ paddw xmm6,xmm3
+ psraw xmm6,2
+ movdqa xmm4,xmm5
+ pand xmm4,xmm6
+ movdqa xmm6,xmm5
+ pandn xmm6,xmm2
+ por xmm4,xmm6
+ packuswb xmm1,xmm4
+ movdqa xmm4,[esp+50h]
+ movdqa xmm6,xmm7
+ paddw xmm6,xmm7
+ paddw xmm6,xmm4
+ paddw xmm6,[esp+60h]
+ paddw xmm6,xmm3
+ psraw xmm6,2
+ movdqa xmm2,xmm0
+ pand xmm2,xmm6
+ pandn xmm0,xmm4
+ por xmm2,xmm0
+ movdqa xmm0,[esp+20h]
+ movdqa xmm6,xmm0
+ paddw xmm6,xmm0
+ movdqa xmm0,[esp+30h]
+ paddw xmm6,xmm0
+ paddw xmm6,[esp+40h]
+ movdqa xmm4,xmm5
+ paddw xmm6,xmm3
+ movq [esi],xmm1
+ psraw xmm6,2
+ pand xmm4,xmm6
+ pandn xmm5,xmm0
+ por xmm4,xmm5
+ packuswb xmm2,xmm4
+ movq [eax],xmm2
+ psrldq xmm1,8
+ movq [edi],xmm1
+ pop edi
+ psrldq xmm2,8
+ movq [ecx],xmm2
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
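The body above is the strong (bS == 4) chroma filter: pixels are widened to 16-bit lanes, the |p0-q0| < iAlpha and |p1-p0| < iBeta, |q1-q0| < iBeta masks are built with pabsw/pcmpgtw, and the new p0/q0 values are formed with the rounding constant 2 and a psraw by 2, then selected per lane with pand/pandn/por. A scalar C sketch of the per-pixel arithmetic (function and variable names are illustrative, not taken from the patch):

#include <stdint.h>
#include <stdlib.h>

/* Illustrative scalar model of the bS==4 chroma filter for one pixel pair
 * across the edge; the SIMD code applies this to 8 Cb + 8 Cr positions at
 * once and uses pand/pandn masks instead of the branch. */
static void chroma_eq4_pixel(uint8_t *p0, uint8_t *q0, uint8_t p1, uint8_t q1,
                             int alpha, int beta) {
    if (abs(*p0 - *q0) < alpha && abs(p1 - *p0) < beta && abs(q1 - *q0) < beta) {
        uint8_t np0 = (uint8_t)((2 * p1 + *p0 + q1 + 2) >> 2);  /* paddw x3, +2, psraw 2 */
        uint8_t nq0 = (uint8_t)((2 * q1 + *q0 + p1 + 2) >> 2);
        *p0 = np0;
        *q0 = nq0;
    }
}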
;******************************************************************************
; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
@@ -3400,200 +3400,200 @@
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4V_ssse3
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0E4h
- push ebx
- push esi
- mov esi, [ebp+1Ch] ; pTC
- movsx ebx, byte [esi+2]
- push edi
- movsx di,byte [esi+3]
- mov word [esp+0Ch],bx
- movsx bx,byte [esi+1]
- movsx esi,byte [esi]
- mov word [esp+0Eh],si
- movzx esi,di
- movd xmm1,esi
- movzx esi,di
- movd xmm2,esi
- mov si,word [esp+0Ch]
- mov edx, [ebp + 10h]
- mov eax, [ebp + 08h]
- movzx edi,si
- movzx esi,si
- mov ecx, [ebp + 0Ch]
- movd xmm4,esi
- movzx esi,bx
- movd xmm5,esi
- movd xmm3,edi
- movzx esi,bx
- movd xmm6,esi
- mov si,word [esp+0Eh]
- movzx edi,si
- movzx esi,si
- punpcklwd xmm6,xmm2
- pxor xmm0,xmm0
- movdqa [esp+40h],xmm0
- movd xmm7,edi
- movd xmm0,esi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+40h]
- punpcklwd xmm0,xmm4
- movq xmm4,[edx+ecx]
- punpcklwd xmm7,xmm3
- movq xmm3,[eax]
- punpcklwd xmm0,xmm6
- movq xmm6,[edi]
- punpcklwd xmm7,xmm5
- punpcklwd xmm0,xmm7
- mov edi,ecx
- sub edi,esi
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+60h],xmm2
- movq xmm2, [edi]
- punpcklqdq xmm6,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm7,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm2,[edi]
- punpcklqdq xmm7,xmm2
- movq xmm2,[ecx]
- punpcklqdq xmm3,xmm2
- movq xmm2,[edx+eax]
- movsx edx,word [ebp + 14h]
- punpcklqdq xmm2,xmm4
- movdqa [esp+0E0h],xmm2
- movd xmm2,edx
- movsx edx,word [ebp + 18h]
- movdqa xmm4,xmm2
- punpcklwd xmm4,xmm2
- movd xmm2,edx
- movdqa xmm5,xmm2
- punpcklwd xmm5,xmm2
- pshufd xmm2,xmm5,0
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- movdqa [esp+0D0h],xmm3
- pshufd xmm4,xmm4,0
- movdqa [esp+30h],xmm2
- punpckhbw xmm6,xmm1
- movdqa [esp+80h],xmm6
- movdqa xmm6,[esp+0D0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+70h],xmm6
- movdqa xmm6, [esp+0E0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+90h],xmm6
- movdqa xmm5, [esp+0E0h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0A0h],xmm7
- punpcklbw xmm3,xmm1
- mov edx,4
- punpcklbw xmm2,xmm1
- movsx edx,dx
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,[esp+30h]
- movdqa [esp+20h],xmm6
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa xmm1,[esp+60h]
- movdqa [esp+40h],xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6, [esp+20h]
- movdqa xmm7, [esp+50h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa [esp+10h],xmm0
- movdqa xmm6, [esp+10h]
- pminsw xmm6,xmm1
- movdqa [esp+10h],xmm6
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm6,xmm4
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+30h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1,[esp+50h]
- pand xmm6,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5,[esp+80h]
- psubw xmm5,[esp+90h]
- pand xmm6,xmm1
- pand xmm6,[esp+40h]
- movdqa xmm1,[esp+10h]
- pand xmm1,xmm6
- movdqa xmm6,[esp+70h]
- movdqa [esp+30h],xmm1
- movdqa xmm1,[esp+0A0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6,[esp+20h]
- movdqa xmm5,[esp+60h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+70h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+80h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+90h]
- pand xmm4,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+40h]
- pand xmm0,xmm4
- movdqa xmm4,[esp+30h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- packuswb xmm2,xmm1
- movq [esi],xmm2
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm3,xmm5
- movq [eax],xmm3
- psrldq xmm2,8
- movq [edi],xmm2
- pop edi
- pop esi
- psrldq xmm3,8
- movq [ecx],xmm3
- pop ebx
- mov esp,ebp
- pop ebp
- ret
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0E4h
+ push ebx
+ push esi
+ mov esi, [ebp+1Ch] ; pTC
+ movsx ebx, byte [esi+2]
+ push edi
+ movsx di,byte [esi+3]
+ mov word [esp+0Ch],bx
+ movsx bx,byte [esi+1]
+ movsx esi,byte [esi]
+ mov word [esp+0Eh],si
+ movzx esi,di
+ movd xmm1,esi
+ movzx esi,di
+ movd xmm2,esi
+ mov si,word [esp+0Ch]
+ mov edx, [ebp + 10h]
+ mov eax, [ebp + 08h]
+ movzx edi,si
+ movzx esi,si
+ mov ecx, [ebp + 0Ch]
+ movd xmm4,esi
+ movzx esi,bx
+ movd xmm5,esi
+ movd xmm3,edi
+ movzx esi,bx
+ movd xmm6,esi
+ mov si,word [esp+0Eh]
+ movzx edi,si
+ movzx esi,si
+ punpcklwd xmm6,xmm2
+ pxor xmm0,xmm0
+ movdqa [esp+40h],xmm0
+ movd xmm7,edi
+ movd xmm0,esi
+ lea esi,[edx+edx]
+ mov edi,eax
+ sub edi,esi
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ punpcklwd xmm0,xmm4
+ movq xmm4,[edx+ecx]
+ punpcklwd xmm7,xmm3
+ movq xmm3,[eax]
+ punpcklwd xmm0,xmm6
+ movq xmm6,[edi]
+ punpcklwd xmm7,xmm5
+ punpcklwd xmm0,xmm7
+ mov edi,ecx
+ sub edi,esi
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+60h],xmm2
+ movq xmm2, [edi]
+ punpcklqdq xmm6,xmm2
+ mov esi,eax
+ sub esi,edx
+ movq xmm7,[esi]
+ mov edi,ecx
+ sub edi,edx
+ movq xmm2,[edi]
+ punpcklqdq xmm7,xmm2
+ movq xmm2,[ecx]
+ punpcklqdq xmm3,xmm2
+ movq xmm2,[edx+eax]
+ movsx edx,word [ebp + 14h]
+ punpcklqdq xmm2,xmm4
+ movdqa [esp+0E0h],xmm2
+ movd xmm2,edx
+ movsx edx,word [ebp + 18h]
+ movdqa xmm4,xmm2
+ punpcklwd xmm4,xmm2
+ movd xmm2,edx
+ movdqa xmm5,xmm2
+ punpcklwd xmm5,xmm2
+ pshufd xmm2,xmm5,0
+ movdqa [esp+50h],xmm2
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ movdqa [esp+0D0h],xmm3
+ pshufd xmm4,xmm4,0
+ movdqa [esp+30h],xmm2
+ punpckhbw xmm6,xmm1
+ movdqa [esp+80h],xmm6
+ movdqa xmm6,[esp+0D0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+70h],xmm6
+ movdqa xmm6, [esp+0E0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+90h],xmm6
+ movdqa xmm5, [esp+0E0h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0A0h],xmm7
+ punpcklbw xmm3,xmm1
+ mov edx,4
+ punpcklbw xmm2,xmm1
+ movsx edx,dx
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,[esp+30h]
+ movdqa [esp+20h],xmm6
+ psubw xmm7,xmm5
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1,[esp+60h]
+ movdqa [esp+40h],xmm6
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6, [esp+20h]
+ movdqa xmm7, [esp+50h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6
+ movdqa [esp+10h],xmm0
+ movdqa xmm6, [esp+10h]
+ pminsw xmm6,xmm1
+ movdqa [esp+10h],xmm6
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm6,xmm4
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1, [esp+30h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1
+ movdqa xmm1,[esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5
+ movdqa xmm5,[esp+80h]
+ psubw xmm5,[esp+90h]
+ pand xmm6,xmm1
+ pand xmm6,[esp+40h]
+ movdqa xmm1,[esp+10h]
+ pand xmm1,xmm6
+ movdqa xmm6,[esp+70h]
+ movdqa [esp+30h],xmm1
+ movdqa xmm1,[esp+0A0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6,[esp+20h]
+ movdqa xmm5,[esp+60h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5
+ movdqa xmm5,[esp+70h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+80h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+90h]
+ pand xmm4,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+40h]
+ pand xmm0,xmm4
+ movdqa xmm4,[esp+30h]
+ paddw xmm2,xmm4
+ paddw xmm1,xmm0
+ packuswb xmm2,xmm1
+ movq [esi],xmm2
+ psubw xmm3,xmm4
+ psubw xmm5,xmm0
+ packuswb xmm3,xmm5
+ movq [eax],xmm3
+ psrldq xmm2,8
+ movq [edi],xmm2
+ pop edi
+ pop esi
+ psrldq xmm3,8
+ movq [ecx],xmm3
+ pop ebx
+ mov esp,ebp
+ pop ebp
+ ret
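The Lt4 (bS < 4) path above computes a tc-clipped delta instead: the psllw 2 / paddw 4 / psraw 3 sequence forms ((q0 - p0)*4 + (p1 - q1) + 4) >> 3, pmaxsw/pminsw clip it to [-tc, tc], and lanes whose tc entry is not positive or which fail the alpha/beta tests are masked off before packuswb saturates the result. A scalar sketch under the same assumptions, with illustrative names:

#include <stdint.h>
#include <stdlib.h>

static uint8_t clip_u8(int v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

/* Illustrative scalar model of the bS<4 chroma filter for one pixel pair;
 * tc comes from the per-edge pTC table, and a non-positive tc disables the lane. */
static void chroma_lt4_pixel(uint8_t *p0, uint8_t *q0, uint8_t p1, uint8_t q1,
                             int alpha, int beta, int tc) {
    if (tc <= 0)
        return;
    if (abs(*p0 - *q0) >= alpha || abs(p1 - *p0) >= beta || abs(q1 - *q0) >= beta)
        return;
    int d = ((*q0 - *p0) * 4 + (p1 - q1) + 4) >> 3;  /* psllw 2, paddw 4, psraw 3 */
    if (d < -tc) d = -tc;                             /* pmaxsw with -tc */
    if (d >  tc) d =  tc;                             /* pminsw with  tc */
    *p0 = clip_u8(*p0 + d);                           /* packuswb saturation */
    *q0 = clip_u8(*q0 - d);
}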
;***************************************************************************
; void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
@@ -3601,280 +3601,280 @@
;***************************************************************************
WELS_EXTERN DeblockChromaEq4H_ssse3
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0C8h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+18h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+7Ch]
- push edi
- mov dword [esp+14h],esi
- mov dword [esp+18h],ecx
- mov dword [esp+0Ch],edx
- mov dword [esp+10h],eax
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+0Ch]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+10h]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- movsx ecx,word [ebp+14h]
- movsx edx,word [ebp+18h]
- movdqa xmm6,[esp+80h]
- movdqa xmm4,[esp+90h]
- movdqa xmm5,[esp+0A0h]
- movdqa xmm7,[esp+0B0h]
- pxor xmm0,xmm0
- movd xmm1,ecx
- movdqa xmm2,xmm1
- punpcklwd xmm2,xmm1
- pshufd xmm1,xmm2,0
- movd xmm2,edx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3,xmm6
- punpckhbw xmm6,xmm0
- movdqa [esp+60h],xmm6
- movdqa xmm6,[esp+90h]
- punpckhbw xmm6,xmm0
- movdqa [esp+30h],xmm6
- movdqa xmm6,[esp+0A0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+40h],xmm6
- movdqa xmm6,[esp+0B0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+70h],xmm6
- punpcklbw xmm7,xmm0
- punpcklbw xmm4,xmm0
- punpcklbw xmm5,xmm0
- punpcklbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm6,xmm4
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- movdqa xmm0,xmm1
- pcmpgtw xmm0,xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm4
- pabsw xmm6,xmm6
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+30h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pcmpgtw xmm1,xmm6
- movdqa xmm6,[esp+60h]
- psubw xmm6,[esp+30h]
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+70h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pand xmm1,xmm7
- pcmpgtw xmm2,xmm6
- pand xmm1,xmm2
- mov eax,2
- movsx ecx,ax
- movd xmm2,ecx
- movdqa xmm6,xmm2
- punpcklwd xmm6,xmm2
- pshufd xmm2,xmm6,0
- movdqa [esp+20h],xmm2
- movdqa xmm2,xmm3
- paddw xmm2,xmm3
- paddw xmm2,xmm4
- paddw xmm2,[esp+50h]
- paddw xmm2,[esp+20h]
- psraw xmm2,2
- movdqa xmm6,xmm0
- pand xmm6,xmm2
- movdqa xmm2,xmm0
- pandn xmm2,xmm4
- por xmm6,xmm2
- movdqa xmm2,[esp+60h]
- movdqa xmm7,xmm2
- paddw xmm7,xmm2
- paddw xmm7,[esp+30h]
- paddw xmm7,[esp+70h]
- paddw xmm7,[esp+20h]
- movdqa xmm4,xmm1
- movdqa xmm2,xmm1
- pandn xmm2,[esp+30h]
- psraw xmm7,2
- pand xmm4,xmm7
- por xmm4,xmm2
- movdqa xmm2,[esp+50h]
- packuswb xmm6,xmm4
- movdqa [esp+90h],xmm6
- movdqa xmm6,xmm2
- paddw xmm6,xmm2
- movdqa xmm2,[esp+20h]
- paddw xmm6,xmm5
- paddw xmm6,xmm3
- movdqa xmm4,xmm0
- pandn xmm0,xmm5
- paddw xmm6,xmm2
- psraw xmm6,2
- pand xmm4,xmm6
- por xmm4,xmm0
- movdqa xmm0,[esp+70h]
- movdqa xmm5,xmm0
- paddw xmm5,xmm0
- movdqa xmm0,[esp+40h]
- paddw xmm5,xmm0
- paddw xmm5,[esp+60h]
- movdqa xmm3,xmm1
- paddw xmm5,xmm2
- psraw xmm5,2
- pand xmm3,xmm5
- pandn xmm1,xmm0
- por xmm3,xmm1
- packuswb xmm4,xmm3
- movdqa [esp+0A0h],xmm4
- mov esi,dword [esp+10h]
- movdqa xmm0,[esi]
- movdqa xmm1,[esi+10h]
- movdqa xmm2,[esi+20h]
- movdqa xmm3,[esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+0Ch]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0C8h
+ mov ecx,dword [ebp+8]
+ mov edx,dword [ebp+0Ch]
+ mov eax,dword [ebp+10h]
+ sub ecx,2
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2]
+ mov dword [esp+18h],ecx
+ mov dword [esp+4],edx
+ lea ecx,[ecx+eax*4]
+ lea edx,[edx+eax*4]
+ lea eax,[esp+7Ch]
+ push edi
+ mov dword [esp+14h],esi
+ mov dword [esp+18h],ecx
+ mov dword [esp+0Ch],edx
+ mov dword [esp+10h],eax
+ mov esi,dword [esp+1Ch]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+14h]
+ movd xmm0,dword [esi]
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h]
+ mov edi,dword [esp+0Ch]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+10h]
+ movdqa [edi],xmm0
+ movdqa [edi+10h],xmm5
+ movdqa [edi+20h],xmm1
+ movdqa [edi+30h],xmm6
+ movsx ecx,word [ebp+14h]
+ movsx edx,word [ebp+18h]
+ movdqa xmm6,[esp+80h]
+ movdqa xmm4,[esp+90h]
+ movdqa xmm5,[esp+0A0h]
+ movdqa xmm7,[esp+0B0h]
+ pxor xmm0,xmm0
+ movd xmm1,ecx
+ movdqa xmm2,xmm1
+ punpcklwd xmm2,xmm1
+ pshufd xmm1,xmm2,0
+ movd xmm2,edx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+ movdqa xmm3,xmm6
+ punpckhbw xmm6,xmm0
+ movdqa [esp+60h],xmm6
+ movdqa xmm6,[esp+90h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+30h],xmm6
+ movdqa xmm6,[esp+0A0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+40h],xmm6
+ movdqa xmm6,[esp+0B0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+70h],xmm6
+ punpcklbw xmm7,xmm0
+ punpcklbw xmm4,xmm0
+ punpcklbw xmm5,xmm0
+ punpcklbw xmm3,xmm0
+ movdqa [esp+50h],xmm7
+ movdqa xmm6,xmm4
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ movdqa xmm0,xmm1
+ pcmpgtw xmm0,xmm6
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm4
+ pabsw xmm6,xmm6
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+30h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pcmpgtw xmm1,xmm6
+ movdqa xmm6,[esp+60h]
+ psubw xmm6,[esp+30h]
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+70h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pand xmm1,xmm7
+ pcmpgtw xmm2,xmm6
+ pand xmm1,xmm2
+ mov eax,2
+ movsx ecx,ax
+ movd xmm2,ecx
+ movdqa xmm6,xmm2
+ punpcklwd xmm6,xmm2
+ pshufd xmm2,xmm6,0
+ movdqa [esp+20h],xmm2
+ movdqa xmm2,xmm3
+ paddw xmm2,xmm3
+ paddw xmm2,xmm4
+ paddw xmm2,[esp+50h]
+ paddw xmm2,[esp+20h]
+ psraw xmm2,2
+ movdqa xmm6,xmm0
+ pand xmm6,xmm2
+ movdqa xmm2,xmm0
+ pandn xmm2,xmm4
+ por xmm6,xmm2
+ movdqa xmm2,[esp+60h]
+ movdqa xmm7,xmm2
+ paddw xmm7,xmm2
+ paddw xmm7,[esp+30h]
+ paddw xmm7,[esp+70h]
+ paddw xmm7,[esp+20h]
+ movdqa xmm4,xmm1
+ movdqa xmm2,xmm1
+ pandn xmm2,[esp+30h]
+ psraw xmm7,2
+ pand xmm4,xmm7
+ por xmm4,xmm2
+ movdqa xmm2,[esp+50h]
+ packuswb xmm6,xmm4
+ movdqa [esp+90h],xmm6
+ movdqa xmm6,xmm2
+ paddw xmm6,xmm2
+ movdqa xmm2,[esp+20h]
+ paddw xmm6,xmm5
+ paddw xmm6,xmm3
+ movdqa xmm4,xmm0
+ pandn xmm0,xmm5
+ paddw xmm6,xmm2
+ psraw xmm6,2
+ pand xmm4,xmm6
+ por xmm4,xmm0
+ movdqa xmm0,[esp+70h]
+ movdqa xmm5,xmm0
+ paddw xmm5,xmm0
+ movdqa xmm0,[esp+40h]
+ paddw xmm5,xmm0
+ paddw xmm5,[esp+60h]
+ movdqa xmm3,xmm1
+ paddw xmm5,xmm2
+ psraw xmm5,2
+ pand xmm3,xmm5
+ pandn xmm1,xmm0
+ por xmm3,xmm1
+ packuswb xmm4,xmm3
+ movdqa [esp+0A0h],xmm4
+ mov esi,dword [esp+10h]
+ movdqa xmm0,[esi]
+ movdqa xmm1,[esi+10h]
+ movdqa xmm2,[esi+20h]
+ movdqa xmm3,[esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+1Ch]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+14h]
+ mov edi,dword [esp+8]
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+0Ch]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
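For the horizontal variants the edge is vertical, so the code above first gathers the four pixels straddling it from every row (note the sub ecx,2 / sub edx,2 and the dword loads), transposes them with the punpcklbw/punpcklwd/punpckldq/punpcklqdq cascade into the same p1/p0/q0/q1 row layout the vertical filter uses, filters, and then transposes back before storing. A scalar sketch of the gather step (hypothetical names, not from the patch):

#include <stdint.h>

/* Illustrative scalar model of the column gather: collect p1,p0,q0,q1 from
 * each of `rows` image rows so the row-oriented filter kernel can be reused. */
static void gather_edge_block(const uint8_t *base, int stride, int rows,
                              uint8_t *p1, uint8_t *p0,
                              uint8_t *q0, uint8_t *q1) {
    for (int r = 0; r < rows; r++) {
        const uint8_t *line = base + r * stride - 2;  /* two pixels left of the edge */
        p1[r] = line[0];
        p0[r] = line[1];
        q0[r] = line[2];
        q1[r] = line[3];
    }
}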
;*******************************************************************************
; void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
@@ -3882,308 +3882,308 @@
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4H_ssse3
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,108h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+10h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+6Ch]
- push edi
- mov dword [esp+0Ch],esi
- mov dword [esp+18h],ecx
- mov dword [esp+10h],edx
- mov dword [esp+1Ch],eax
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+10h]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+1Ch]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- mov eax,dword [ebp+1Ch]
- movsx cx,byte [eax+3]
- movsx dx,byte [eax+2]
- movsx si,byte [eax+1]
- movsx ax,byte [eax]
- movzx edi,cx
- movzx ecx,cx
- movd xmm2,ecx
- movzx ecx,dx
- movzx edx,dx
- movd xmm3,ecx
- movd xmm4,edx
- movzx ecx,si
- movzx edx,si
- movd xmm5,ecx
- pxor xmm0,xmm0
- movd xmm6,edx
- movzx ecx,ax
- movdqa [esp+60h],xmm0
- movzx edx,ax
- movsx eax,word [ebp+14h]
- punpcklwd xmm6,xmm2
- movd xmm1,edi
- movd xmm7,ecx
- movsx ecx,word [ebp+18h]
- movd xmm0,edx
- punpcklwd xmm7,xmm3
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+60h]
- punpcklwd xmm7,xmm5
- movdqa xmm5,[esp+0A0h]
- punpcklwd xmm0,xmm4
- punpcklwd xmm0,xmm6
- movdqa xmm6, [esp+70h]
- punpcklwd xmm0,xmm7
- movdqa xmm7,[esp+80h]
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+0D0h],xmm2
- movd xmm2,eax
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm4,xmm3,0
- movd xmm2,ecx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3, [esp+90h]
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- movdqa [esp+40h],xmm2
- movdqa [esp+0B0h],xmm6
- movdqa xmm6,[esp+90h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm2,xmm1
- punpcklbw xmm3,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0F0h],xmm7
- movdqa [esp+0C0h],xmm6
- movdqa xmm6, [esp+0A0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+0E0h],xmm6
- mov edx,4
- movsx eax,dx
- movd xmm6,eax
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa [esp+30h],xmm6
- movdqa xmm7, [esp+40h]
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa [esp+60h],xmm6
- movdqa xmm1, [esp+0D0h]
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6,[esp+30h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa xmm7,[esp+50h]
- movdqa [esp+20h],xmm0
- movdqa xmm6, [esp+20h]
- pminsw xmm6,xmm1
- movdqa [esp+20h],xmm6
- movdqa xmm6,xmm4
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+40h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1, [esp+50h]
- pand xmm6,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5, [esp+0B0h]
- psubw xmm5,[esp+0E0h]
- pand xmm6,xmm1
- pand xmm6, [esp+60h]
- movdqa xmm1, [esp+20h]
- pand xmm1,xmm6
- movdqa xmm6, [esp+0C0h]
- movdqa [esp+40h],xmm1
- movdqa xmm1, [esp+0F0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6, [esp+30h]
- movdqa xmm5, [esp+0D0h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+0C0h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+0B0h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6, [esp+0E0h]
- pand xmm4,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+60h]
- pand xmm0,xmm4
- movdqa xmm4, [esp+40h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm2,xmm1
- packuswb xmm3,xmm5
- movdqa [esp+80h],xmm2
- movdqa [esp+90h],xmm3
- mov esi,dword [esp+1Ch]
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+10h]
- movdqa xmm2, [esi+20h]
- movdqa xmm3, [esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+10h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,108h
+ mov ecx,dword [ebp+8]
+ mov edx,dword [ebp+0Ch]
+ mov eax,dword [ebp+10h]
+ sub ecx,2
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2]
+ mov dword [esp+10h],ecx
+ mov dword [esp+4],edx
+ lea ecx,[ecx+eax*4]
+ lea edx,[edx+eax*4]
+ lea eax,[esp+6Ch]
+ push edi
+ mov dword [esp+0Ch],esi
+ mov dword [esp+18h],ecx
+ mov dword [esp+10h],edx
+ mov dword [esp+1Ch],eax
+ mov esi,dword [esp+14h]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+0Ch]
+ movd xmm0,dword [esi]
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h]
+ mov edi,dword [esp+10h]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+1Ch]
+ movdqa [edi],xmm0
+ movdqa [edi+10h],xmm5
+ movdqa [edi+20h],xmm1
+ movdqa [edi+30h],xmm6
+ mov eax,dword [ebp+1Ch]
+ movsx cx,byte [eax+3]
+ movsx dx,byte [eax+2]
+ movsx si,byte [eax+1]
+ movsx ax,byte [eax]
+ movzx edi,cx
+ movzx ecx,cx
+ movd xmm2,ecx
+ movzx ecx,dx
+ movzx edx,dx
+ movd xmm3,ecx
+ movd xmm4,edx
+ movzx ecx,si
+ movzx edx,si
+ movd xmm5,ecx
+ pxor xmm0,xmm0
+ movd xmm6,edx
+ movzx ecx,ax
+ movdqa [esp+60h],xmm0
+ movzx edx,ax
+ movsx eax,word [ebp+14h]
+ punpcklwd xmm6,xmm2
+ movd xmm1,edi
+ movd xmm7,ecx
+ movsx ecx,word [ebp+18h]
+ movd xmm0,edx
+ punpcklwd xmm7,xmm3
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+60h]
+ punpcklwd xmm7,xmm5
+ movdqa xmm5,[esp+0A0h]
+ punpcklwd xmm0,xmm4
+ punpcklwd xmm0,xmm6
+ movdqa xmm6, [esp+70h]
+ punpcklwd xmm0,xmm7
+ movdqa xmm7,[esp+80h]
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+0D0h],xmm2
+ movd xmm2,eax
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm4,xmm3,0
+ movd xmm2,ecx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+ movdqa xmm3, [esp+90h]
+ movdqa [esp+50h],xmm2
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa [esp+40h],xmm2
+ movdqa [esp+0B0h],xmm6
+ movdqa xmm6,[esp+90h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm2,xmm1
+ punpcklbw xmm3,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0F0h],xmm7
+ movdqa [esp+0C0h],xmm6
+ movdqa xmm6, [esp+0A0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+0E0h],xmm6
+ mov edx,4
+ movsx eax,dx
+ movd xmm6,eax
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa [esp+30h],xmm6
+ movdqa xmm7, [esp+40h]
+ psubw xmm7,xmm5
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa [esp+60h],xmm6
+ movdqa xmm1, [esp+0D0h]
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6,[esp+30h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6
+ movdqa xmm7,[esp+50h]
+ movdqa [esp+20h],xmm0
+ movdqa xmm6, [esp+20h]
+ pminsw xmm6,xmm1
+ movdqa [esp+20h],xmm6
+ movdqa xmm6,xmm4
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1, [esp+40h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1
+ movdqa xmm1, [esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5
+ movdqa xmm5, [esp+0B0h]
+ psubw xmm5,[esp+0E0h]
+ pand xmm6,xmm1
+ pand xmm6, [esp+60h]
+ movdqa xmm1, [esp+20h]
+ pand xmm1,xmm6
+ movdqa xmm6, [esp+0C0h]
+ movdqa [esp+40h],xmm1
+ movdqa xmm1, [esp+0F0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6, [esp+30h]
+ movdqa xmm5, [esp+0D0h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5
+ movdqa xmm5,[esp+0C0h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+0B0h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6, [esp+0E0h]
+ pand xmm4,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+60h]
+ pand xmm0,xmm4
+ movdqa xmm4, [esp+40h]
+ paddw xmm2,xmm4
+ paddw xmm1,xmm0
+ psubw xmm3,xmm4
+ psubw xmm5,xmm0
+ packuswb xmm2,xmm1
+ packuswb xmm3,xmm5
+ movdqa [esp+80h],xmm2
+ movdqa [esp+90h],xmm3
+ mov esi,dword [esp+1Ch]
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi+10h]
+ movdqa xmm2, [esi+20h]
+ movdqa xmm3, [esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+14h]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+0Ch]
+ mov edi,dword [esp+8]
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+10h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
@@ -4194,385 +4194,385 @@
WELS_EXTERN DeblockLumaLt4V_ssse3
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 420 ; 000001a4H
- mov eax, dword [ebp+8]
- mov ecx, dword [ebp+12]
+ push ebp
+ mov ebp, esp
+ and esp, -16 ; fffffff0H
+ sub esp, 420 ; 000001a4H
+ mov eax, dword [ebp+8]
+ mov ecx, dword [ebp+12]
- pxor xmm0, xmm0
- push ebx
- mov edx, dword [ebp+24]
- movdqa [esp+424-384], xmm0
- push esi
+ pxor xmm0, xmm0
+ push ebx
+ mov edx, dword [ebp+24]
+ movdqa [esp+424-384], xmm0
+ push esi
- lea esi, [ecx+ecx*2]
- push edi
- mov edi, eax
- sub edi, esi
- movdqa xmm0, [edi]
+ lea esi, [ecx+ecx*2]
+ push edi
+ mov edi, eax
+ sub edi, esi
+ movdqa xmm0, [edi]
- lea esi, [ecx+ecx]
- movdqa [esp+432-208], xmm0
- mov edi, eax
- sub edi, esi
- movdqa xmm0, [edi]
- movdqa [esp+448-208], xmm0
+ lea esi, [ecx+ecx]
+ movdqa [esp+432-208], xmm0
+ mov edi, eax
+ sub edi, esi
+ movdqa xmm0, [edi]
+ movdqa [esp+448-208], xmm0
- mov ebx, eax
- sub ebx, ecx
- movdqa xmm0, [ebx]
- movdqa [esp+464-208], xmm0
+ mov ebx, eax
+ sub ebx, ecx
+ movdqa xmm0, [ebx]
+ movdqa [esp+464-208], xmm0
- movdqa xmm0, [eax]
+ movdqa xmm0, [eax]
- add ecx, eax
- movdqa [esp+480-208], xmm0
- movdqa xmm0, [ecx]
- mov dword [esp+432-404], ecx
+ add ecx, eax
+ movdqa [esp+480-208], xmm0
+ movdqa xmm0, [ecx]
+ mov dword [esp+432-404], ecx
- movsx ecx, word [ebp+16]
- movdqa [esp+496-208], xmm0
- movdqa xmm0, [esi+eax]
+ movsx ecx, word [ebp+16]
+ movdqa [esp+496-208], xmm0
+ movdqa xmm0, [esi+eax]
- movsx si, byte [edx]
- movdqa [esp+512-208], xmm0
- movd xmm0, ecx
- movsx ecx, word [ebp+20]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- pshufd xmm0, xmm1, 0
- movdqa [esp+432-112], xmm0
- movd xmm0, ecx
- movsx cx, byte [edx+1]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- mov dword [esp+432-408], ebx
- movzx ebx, cx
- pshufd xmm0, xmm1, 0
- movd xmm1, ebx
- movzx ebx, cx
- movd xmm2, ebx
- movzx ebx, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, si
- movd xmm5, ecx
- movzx ecx, si
- movd xmm6, ecx
- movzx ecx, si
- movd xmm7, ecx
- movzx ecx, si
- movdqa [esp+432-336], xmm0
- movd xmm0, ecx
+ movsx si, byte [edx]
+ movdqa [esp+512-208], xmm0
+ movd xmm0, ecx
+ movsx ecx, word [ebp+20]
+ movdqa xmm1, xmm0
+ punpcklwd xmm1, xmm0
+ pshufd xmm0, xmm1, 0
+ movdqa [esp+432-112], xmm0
+ movd xmm0, ecx
+ movsx cx, byte [edx+1]
+ movdqa xmm1, xmm0
+ punpcklwd xmm1, xmm0
+ mov dword [esp+432-408], ebx
+ movzx ebx, cx
+ pshufd xmm0, xmm1, 0
+ movd xmm1, ebx
+ movzx ebx, cx
+ movd xmm2, ebx
+ movzx ebx, cx
+ movzx ecx, cx
+ movd xmm4, ecx
+ movzx ecx, si
+ movd xmm5, ecx
+ movzx ecx, si
+ movd xmm6, ecx
+ movzx ecx, si
+ movd xmm7, ecx
+ movzx ecx, si
+ movdqa [esp+432-336], xmm0
+ movd xmm0, ecx
- movsx cx, byte [edx+3]
- movsx dx, byte [edx+2]
- movd xmm3, ebx
- punpcklwd xmm0, xmm4
- movzx esi, cx
- punpcklwd xmm6, xmm2
- punpcklwd xmm5, xmm1
- punpcklwd xmm0, xmm6
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- punpcklwd xmm0, xmm7
- movdqa [esp+432-400], xmm0
- movd xmm0, esi
- movzx esi, cx
- movd xmm2, esi
- movzx esi, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, dx
- movd xmm3, esi
- movd xmm5, ecx
- punpcklwd xmm5, xmm0
+ movsx cx, byte [edx+3]
+ movsx dx, byte [edx+2]
+ movd xmm3, ebx
+ punpcklwd xmm0, xmm4
+ movzx esi, cx
+ punpcklwd xmm6, xmm2
+ punpcklwd xmm5, xmm1
+ punpcklwd xmm0, xmm6
+ punpcklwd xmm7, xmm3
+ punpcklwd xmm7, xmm5
+ punpcklwd xmm0, xmm7
+ movdqa [esp+432-400], xmm0
+ movd xmm0, esi
+ movzx esi, cx
+ movd xmm2, esi
+ movzx esi, cx
+ movzx ecx, cx
+ movd xmm4, ecx
+ movzx ecx, dx
+ movd xmm3, esi
+ movd xmm5, ecx
+ punpcklwd xmm5, xmm0
- movdqa xmm0, [esp+432-384]
- movzx ecx, dx
- movd xmm6, ecx
- movzx ecx, dx
- movzx edx, dx
- punpcklwd xmm6, xmm2
- movd xmm7, ecx
- movd xmm1, edx
+ movdqa xmm0, [esp+432-384]
+ movzx ecx, dx
+ movd xmm6, ecx
+ movzx ecx, dx
+ movzx edx, dx
+ punpcklwd xmm6, xmm2
+ movd xmm7, ecx
+ movd xmm1, edx
- movdqa xmm2, [esp+448-208]
- punpcklbw xmm2, xmm0
+ movdqa xmm2, [esp+448-208]
+ punpcklbw xmm2, xmm0
- mov ecx, 4
- movsx edx, cx
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- movdqa xmm5, [esp+496-208]
- movdqa xmm3, [esp+464-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-240], xmm5
- movdqa xmm5, [esp+512-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-352], xmm5
- punpcklwd xmm1, xmm4
- movdqa xmm4, [esp+432-208]
- punpcklwd xmm1, xmm6
- movdqa xmm6, [esp+480-208]
- punpcklwd xmm1, xmm7
- punpcklbw xmm6, xmm0
- punpcklbw xmm3, xmm0
- punpcklbw xmm4, xmm0
- movdqa xmm7, xmm3
- psubw xmm7, xmm4
- pabsw xmm7, xmm7
- movdqa [esp+432-272], xmm4
- movdqa xmm4, [esp+432-336]
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-352]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5
- movdqa xmm5, xmm3
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5
- movdqa xmm5, [esp+432-400]
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5
- movdqa xmm5, xmm6
- psubw xmm5, xmm3
- movdqa [esp+432-32], xmm6
- psubw xmm6, [esp+432-240]
- movdqa xmm7, xmm5
- movdqa [esp+432-384], xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6
+ mov ecx, 4
+ movsx edx, cx
+ punpcklwd xmm7, xmm3
+ punpcklwd xmm7, xmm5
+ movdqa xmm5, [esp+496-208]
+ movdqa xmm3, [esp+464-208]
+ punpcklbw xmm5, xmm0
+ movdqa [esp+432-240], xmm5
+ movdqa xmm5, [esp+512-208]
+ punpcklbw xmm5, xmm0
+ movdqa [esp+432-352], xmm5
+ punpcklwd xmm1, xmm4
+ movdqa xmm4, [esp+432-208]
+ punpcklwd xmm1, xmm6
+ movdqa xmm6, [esp+480-208]
+ punpcklwd xmm1, xmm7
+ punpcklbw xmm6, xmm0
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+ movdqa xmm7, xmm3
+ psubw xmm7, xmm4
+ pabsw xmm7, xmm7
+ movdqa [esp+432-272], xmm4
+ movdqa xmm4, [esp+432-336]
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-288], xmm5
+ movdqa xmm7, xmm6
+ psubw xmm7, [esp+432-352]
+ pabsw xmm7, xmm7
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-256], xmm5
+ movdqa xmm5, xmm3
+ pavgw xmm5, xmm6
+ movdqa [esp+432-304], xmm5
+ movdqa xmm5, [esp+432-400]
+ psubw xmm5, [esp+432-288]
+ psubw xmm5, [esp+432-256]
+ movdqa [esp+432-224], xmm5
+ movdqa xmm5, xmm6
+ psubw xmm5, xmm3
+ movdqa [esp+432-32], xmm6
+ psubw xmm6, [esp+432-240]
+ movdqa xmm7, xmm5
+ movdqa [esp+432-384], xmm5
+ movdqa xmm5, [esp+432-112]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm5, xmm7
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm4
+ pcmpgtw xmm7, xmm6
- pand xmm5, xmm7
- movdqa xmm6, xmm3
- psubw xmm6, xmm2
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+432-400]
- pand xmm5, xmm7
- movdqa xmm7, xmm6
- pcmpeqw xmm6, xmm0
- pcmpgtw xmm7, xmm0
- por xmm7, xmm6
- pand xmm5, xmm7
- movdqa [esp+432-320], xmm5
- movd xmm5, edx
- movdqa xmm6, xmm5
- punpcklwd xmm6, xmm5
- pshufd xmm5, xmm6, 0
- movdqa [esp+432-336], xmm5
- movdqa xmm5, [esp+432-224]
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm0
- psubw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- psllw xmm5, 2
- movdqa xmm7, xmm2
- psubw xmm7, [esp+432-240]
- paddw xmm7, xmm5
- paddw xmm7, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- psraw xmm7, 3
- pmaxsw xmm6, xmm7
- pminsw xmm5, xmm6
+ pand xmm5, xmm7
+ movdqa xmm6, xmm3
+ psubw xmm6, xmm2
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm4
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+432-400]
+ pand xmm5, xmm7
+ movdqa xmm7, xmm6
+ pcmpeqw xmm6, xmm0
+ pcmpgtw xmm7, xmm0
+ por xmm7, xmm6
+ pand xmm5, xmm7
+ movdqa [esp+432-320], xmm5
+ movd xmm5, edx
+ movdqa xmm6, xmm5
+ punpcklwd xmm6, xmm5
+ pshufd xmm5, xmm6, 0
+ movdqa [esp+432-336], xmm5
+ movdqa xmm5, [esp+432-224]
+ movdqa [esp+432-368], xmm5
+ movdqa xmm6, xmm0
+ psubw xmm6, xmm5
+ movdqa xmm5, [esp+432-384]
+ psllw xmm5, 2
+ movdqa xmm7, xmm2
+ psubw xmm7, [esp+432-240]
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+432-336]
+ movdqa xmm5, [esp+432-368]
+ psraw xmm7, 3
+ pmaxsw xmm6, xmm7
+ pminsw xmm5, xmm6
- pand xmm5, [esp+432-320]
- movdqa xmm6, [esp+432-400]
- movdqa [esp+432-64], xmm5
- movdqa [esp+432-384], xmm6
- movdqa xmm5, xmm0
- psubw xmm5, xmm6
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm5, [esp+432-304]
- movdqa xmm7, xmm2
- paddw xmm7, xmm2
- psubw xmm5, xmm7
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- pminsw xmm5, xmm6
+ pand xmm5, [esp+432-320]
+ movdqa xmm6, [esp+432-400]
+ movdqa [esp+432-64], xmm5
+ movdqa [esp+432-384], xmm6
+ movdqa xmm5, xmm0
+ psubw xmm5, xmm6
+ movdqa [esp+432-368], xmm5
+ movdqa xmm6, xmm5
+ movdqa xmm5, [esp+432-272]
+ paddw xmm5, [esp+432-304]
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm2
+ psubw xmm5, xmm7
+ psraw xmm5, 1
+ pmaxsw xmm6, xmm5
+ movdqa xmm5, [esp+432-384]
+ pminsw xmm5, xmm6
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-288]
- movdqa xmm6, [esp+432-240]
- movdqa [esp+432-96], xmm5
- movdqa xmm5, [esp+432-352]
- paddw xmm5, [esp+432-304]
- movdqa xmm7, xmm6
- paddw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
- psubw xmm5, xmm7
+ pand xmm5, [esp+432-320]
+ pand xmm5, [esp+432-288]
+ movdqa xmm6, [esp+432-240]
+ movdqa [esp+432-96], xmm5
+ movdqa xmm5, [esp+432-352]
+ paddw xmm5, [esp+432-304]
+ movdqa xmm7, xmm6
+ paddw xmm7, xmm6
+ movdqa xmm6, [esp+432-368]
+ psubw xmm5, xmm7
- movdqa xmm7, [esp+496-208]
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-400]
- pminsw xmm5, xmm6
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-256]
- movdqa xmm6, [esp+448-208]
- punpckhbw xmm7, xmm0
- movdqa [esp+432-352], xmm7
+ movdqa xmm7, [esp+496-208]
+ psraw xmm5, 1
+ pmaxsw xmm6, xmm5
+ movdqa xmm5, [esp+432-400]
+ pminsw xmm5, xmm6
+ pand xmm5, [esp+432-320]
+ pand xmm5, [esp+432-256]
+ movdqa xmm6, [esp+448-208]
+ punpckhbw xmm7, xmm0
+ movdqa [esp+432-352], xmm7
- movdqa xmm7, [esp+512-208]
- punpckhbw xmm6, xmm0
- movdqa [esp+432-48], xmm5
- movdqa xmm5, [esp+432-208]
- movdqa [esp+432-368], xmm6
- movdqa xmm6, [esp+464-208]
- punpckhbw xmm7, xmm0
- punpckhbw xmm5, xmm0
- movdqa [esp+432-384], xmm7
- punpckhbw xmm6, xmm0
- movdqa [esp+432-400], xmm6
+ movdqa xmm7, [esp+512-208]
+ punpckhbw xmm6, xmm0
+ movdqa [esp+432-48], xmm5
+ movdqa xmm5, [esp+432-208]
+ movdqa [esp+432-368], xmm6
+ movdqa xmm6, [esp+464-208]
+ punpckhbw xmm7, xmm0
+ punpckhbw xmm5, xmm0
+ movdqa [esp+432-384], xmm7
+ punpckhbw xmm6, xmm0
+ movdqa [esp+432-400], xmm6
- movdqa xmm7, [esp+432-400]
- movdqa xmm6, [esp+480-208]
- psubw xmm7, xmm5
- movdqa [esp+432-16], xmm5
- pabsw xmm7, xmm7
- punpckhbw xmm6, xmm0
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5
+ movdqa xmm7, [esp+432-400]
+ movdqa xmm6, [esp+480-208]
+ psubw xmm7, xmm5
+ movdqa [esp+432-16], xmm5
+ pabsw xmm7, xmm7
+ punpckhbw xmm6, xmm0
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-288], xmm5
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-384]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5
+ movdqa xmm7, xmm6
+ psubw xmm7, [esp+432-384]
+ pabsw xmm7, xmm7
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-256], xmm5
- movdqa xmm5, [esp+432-400]
- movdqa [esp+432-80], xmm6
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5
+ movdqa xmm5, [esp+432-400]
+ movdqa [esp+432-80], xmm6
+ pavgw xmm5, xmm6
+ movdqa [esp+432-304], xmm5
- movdqa xmm5, xmm1
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5
- movdqa xmm5, xmm6
- psubw xmm5, [esp+432-400]
- psubw xmm6, [esp+432-352]
- movdqa [esp+432-272], xmm5
- movdqa xmm7, xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7
- movdqa xmm7, xmm4
- pabsw xmm6, xmm6
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
+ movdqa xmm5, xmm1
+ psubw xmm5, [esp+432-288]
+ psubw xmm5, [esp+432-256]
+ movdqa [esp+432-224], xmm5
+ movdqa xmm5, xmm6
+ psubw xmm5, [esp+432-400]
+ psubw xmm6, [esp+432-352]
+ movdqa [esp+432-272], xmm5
+ movdqa xmm7, xmm5
+ movdqa xmm5, [esp+432-112]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm5, xmm7
+ movdqa xmm7, xmm4
+ pabsw xmm6, xmm6
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+432-368]
- pand xmm5, xmm7
- movdqa xmm7, [esp+432-400]
- psubw xmm7, xmm6
- psubw xmm6, [esp+432-352]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
- pand xmm5, xmm4
+ pand xmm5, xmm7
+ movdqa xmm7, [esp+432-400]
+ psubw xmm7, xmm6
+ psubw xmm6, [esp+432-352]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
+ pand xmm5, xmm4
- paddw xmm2, [esp+432-96]
- movdqa xmm4, xmm1
- pcmpgtw xmm4, xmm0
- movdqa xmm7, xmm1
- pcmpeqw xmm7, xmm0
- por xmm4, xmm7
- pand xmm5, xmm4
- movdqa xmm4, [esp+432-224]
- movdqa [esp+432-320], xmm5
- movdqa xmm5, [esp+432-272]
- movdqa xmm7, xmm0
- psubw xmm7, xmm4
- psubw xmm0, xmm1
- psllw xmm5, 2
- paddw xmm6, xmm5
- paddw xmm6, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- movdqa [esp+432-336], xmm0
- psraw xmm6, 3
- pmaxsw xmm7, xmm6
- pminsw xmm4, xmm7
- pand xmm4, [esp+432-320]
- movdqa xmm6, xmm0
- movdqa xmm0, [esp+432-16]
- paddw xmm0, [esp+432-304]
- movdqa [esp+432-272], xmm4
- movdqa xmm4, [esp+432-368]
- paddw xmm4, xmm4
- psubw xmm0, xmm4
+ paddw xmm2, [esp+432-96]
+ movdqa xmm4, xmm1
+ pcmpgtw xmm4, xmm0
+ movdqa xmm7, xmm1
+ pcmpeqw xmm7, xmm0
+ por xmm4, xmm7
+ pand xmm5, xmm4
+ movdqa xmm4, [esp+432-224]
+ movdqa [esp+432-320], xmm5
+ movdqa xmm5, [esp+432-272]
+ movdqa xmm7, xmm0
+ psubw xmm7, xmm4
+ psubw xmm0, xmm1
+ psllw xmm5, 2
+ paddw xmm6, xmm5
+ paddw xmm6, [esp+432-336]
+ movdqa xmm5, [esp+432-368]
+ movdqa [esp+432-336], xmm0
+ psraw xmm6, 3
+ pmaxsw xmm7, xmm6
+ pminsw xmm4, xmm7
+ pand xmm4, [esp+432-320]
+ movdqa xmm6, xmm0
+ movdqa xmm0, [esp+432-16]
+ paddw xmm0, [esp+432-304]
+ movdqa [esp+432-272], xmm4
+ movdqa xmm4, [esp+432-368]
+ paddw xmm4, xmm4
+ psubw xmm0, xmm4
- movdqa xmm4, [esp+432-64]
- psraw xmm0, 1
- pmaxsw xmm6, xmm0
- movdqa xmm0, [esp+432-400]
- movdqa xmm7, xmm1
- pminsw xmm7, xmm6
- movdqa xmm6, [esp+432-320]
- pand xmm7, xmm6
- pand xmm7, [esp+432-288]
- paddw xmm5, xmm7
- packuswb xmm2, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm0, xmm5
- paddw xmm3, xmm4
- packuswb xmm3, xmm0
+ movdqa xmm4, [esp+432-64]
+ psraw xmm0, 1
+ pmaxsw xmm6, xmm0
+ movdqa xmm0, [esp+432-400]
+ movdqa xmm7, xmm1
+ pminsw xmm7, xmm6
+ movdqa xmm6, [esp+432-320]
+ pand xmm7, xmm6
+ pand xmm7, [esp+432-288]
+ paddw xmm5, xmm7
+ packuswb xmm2, xmm5
+ movdqa xmm5, [esp+432-272]
+ paddw xmm0, xmm5
+ paddw xmm3, xmm4
+ packuswb xmm3, xmm0
- movdqa xmm0, [esp+432-32]
- psubw xmm0, xmm4
- movdqa xmm4, [esp+432-80]
- psubw xmm4, xmm5
+ movdqa xmm0, [esp+432-32]
+ psubw xmm0, xmm4
+ movdqa xmm4, [esp+432-80]
+ psubw xmm4, xmm5
- movdqa xmm5, [esp+432-240]
- paddw xmm5, [esp+432-48]
- packuswb xmm0, xmm4
- movdqa xmm4, [esp+432-384]
- paddw xmm4, [esp+432-304]
- movdqa [esp+480-208], xmm0
- movdqa xmm0, [esp+432-352]
- movdqa xmm7, xmm0
- paddw xmm0, xmm0
+ movdqa xmm5, [esp+432-240]
+ paddw xmm5, [esp+432-48]
+ packuswb xmm0, xmm4
+ movdqa xmm4, [esp+432-384]
+ paddw xmm4, [esp+432-304]
+ movdqa [esp+480-208], xmm0
+ movdqa xmm0, [esp+432-352]
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm0
- mov ecx, dword [esp+432-408]
+ mov ecx, dword [esp+432-408]
- mov edx, dword [esp+432-404]
- psubw xmm4, xmm0
- movdqa xmm0, [esp+432-336]
- movdqa [edi], xmm2
- psraw xmm4, 1
- pmaxsw xmm0, xmm4
- pminsw xmm1, xmm0
- movdqa xmm0, [esp+480-208]
+ mov edx, dword [esp+432-404]
+ psubw xmm4, xmm0
+ movdqa xmm0, [esp+432-336]
+ movdqa [edi], xmm2
+ psraw xmm4, 1
+ pmaxsw xmm0, xmm4
+ pminsw xmm1, xmm0
+ movdqa xmm0, [esp+480-208]
- pop edi
- pand xmm1, xmm6
- pand xmm1, [esp+428-256]
- movdqa [ecx], xmm3
- paddw xmm7, xmm1
- pop esi
- packuswb xmm5, xmm7
- movdqa [eax], xmm0
- movdqa [edx], xmm5
- pop ebx
- mov esp, ebp
- pop ebp
- ret
+ pop edi
+ pand xmm1, xmm6
+ pand xmm1, [esp+428-256]
+ movdqa [ecx], xmm3
+ paddw xmm7, xmm1
+ pop esi
+ packuswb xmm5, xmm7
+ movdqa [eax], xmm0
+ movdqa [edx], xmm5
+ pop ebx
+ mov esp, ebp
+ pop ebp
+ ret
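
DeblockLumaLt4V_ssse3 above is the bS<4 (weak) vertical luma filter; the four bytes read from the pTc pointer appear to be the per-4-pixel tc0 thresholds. As a reading aid only, a hedged scalar restatement of the standard weak-filter equations that the SIMD vectorizes (my paraphrase of the spec, not code copied from this repository):

    #include <stdint.h>
    #include <stdlib.h>

    static int clip3(int lo, int hi, int v) { return v < lo ? lo : (v > hi ? hi : v); }
    static uint8_t clip255(int v)           { return (uint8_t)clip3(0, 255, v); }

    /* One sample position across the edge, bS < 4.  p2,p1,p0 | q0,q1,q2 are the
     * pixels on each side; iAlpha, iBeta and tc0 come from the usual QP tables. */
    static void weak_filter_luma(uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1,
                                 int p2, int q2, int iAlpha, int iBeta, int tc0) {
        if (abs(*p0 - *q0) >= iAlpha || abs(*p1 - *p0) >= iBeta || abs(*q1 - *q0) >= iBeta)
            return;                                          /* edge left untouched */
        int small_p = abs(p2 - *p0) < iBeta;                 /* p1 is also adjusted */
        int small_q = abs(q2 - *q0) < iBeta;                 /* q1 is also adjusted */
        int tc    = tc0 + small_p + small_q;
        int delta = clip3(-tc, tc, (((*q0 - *p0) << 2) + (*p1 - *q1) + 4) >> 3);
        int avg   = (*p0 + *q0 + 1) >> 1;
        if (small_p) *p1 = clip255(*p1 + clip3(-tc0, tc0, (p2 + avg - 2 * *p1) >> 1));
        if (small_q) *q1 = clip255(*q1 + clip3(-tc0, tc0, (q2 + avg - 2 * *q1) >> 1));
        *p0 = clip255(*p0 + delta);
        *q0 = clip255(*q0 - delta);
    }
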
;*******************************************************************************
@@ -4583,542 +4583,542 @@
WELS_EXTERN DeblockLumaEq4V_ssse3
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 628 ; 00000274H
- mov eax, dword [ebp+8]
- mov ecx, dword [ebp+12]
- push ebx
- push esi
+ push ebp
+ mov ebp, esp
+ and esp, -16 ; fffffff0H
+ sub esp, 628 ; 00000274H
+ mov eax, dword [ebp+8]
+ mov ecx, dword [ebp+12]
+ push ebx
+ push esi
- lea edx, [ecx*4]
- pxor xmm0, xmm0
- movdqa xmm2, xmm0
+ lea edx, [ecx*4]
+ pxor xmm0, xmm0
+ movdqa xmm2, xmm0
- movdqa xmm0, [ecx+eax]
- mov esi, eax
- sub esi, edx
- movdqa xmm3, [esi]
- movdqa xmm5, [eax]
- push edi
- lea edi, [ecx+ecx]
- lea ebx, [ecx+ecx*2]
- mov dword [esp+640-600], edi
- mov esi, eax
- sub esi, edi
- movdqa xmm1, [esi]
- movdqa [esp+720-272], xmm0
- mov edi, eax
- sub edi, ecx
- movdqa xmm4, [edi]
- add ecx, eax
- mov dword [esp+640-596], ecx
+ movdqa xmm0, [ecx+eax]
+ mov esi, eax
+ sub esi, edx
+ movdqa xmm3, [esi]
+ movdqa xmm5, [eax]
+ push edi
+ lea edi, [ecx+ecx]
+ lea ebx, [ecx+ecx*2]
+ mov dword [esp+640-600], edi
+ mov esi, eax
+ sub esi, edi
+ movdqa xmm1, [esi]
+ movdqa [esp+720-272], xmm0
+ mov edi, eax
+ sub edi, ecx
+ movdqa xmm4, [edi]
+ add ecx, eax
+ mov dword [esp+640-596], ecx
- mov ecx, dword [esp+640-600]
- movdqa xmm0, [ecx+eax]
- movdqa [esp+736-272], xmm0
+ mov ecx, dword [esp+640-600]
+ movdqa xmm0, [ecx+eax]
+ movdqa [esp+736-272], xmm0
- movdqa xmm0, [eax+ebx]
- mov edx, eax
- sub edx, ebx
+ movdqa xmm0, [eax+ebx]
+ mov edx, eax
+ sub edx, ebx
- movsx ebx, word [ebp+16]
- movdqa xmm6, [edx]
- add ecx, eax
- movdqa [esp+752-272], xmm0
- movd xmm0, ebx
+ movsx ebx, word [ebp+16]
+ movdqa xmm6, [edx]
+ add ecx, eax
+ movdqa [esp+752-272], xmm0
+ movd xmm0, ebx
- movsx ebx, word [ebp+20]
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
- movdqa [esp+640-320], xmm0
- movd xmm0, ebx
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
+ movsx ebx, word [ebp+20]
+ movdqa xmm7, xmm0
+ punpcklwd xmm7, xmm0
+ pshufd xmm0, xmm7, 0
+ movdqa [esp+640-320], xmm0
+ movd xmm0, ebx
+ movdqa xmm7, xmm0
+ punpcklwd xmm7, xmm0
+ pshufd xmm0, xmm7, 0
- movdqa xmm7, [esp+736-272]
- punpcklbw xmm7, xmm2
- movdqa [esp+640-416], xmm7
- movdqa [esp+640-512], xmm0
- movdqa xmm0, xmm1
- movdqa [esp+672-272], xmm1
- movdqa xmm1, xmm4
- movdqa [esp+704-272], xmm5
- punpcklbw xmm5, xmm2
- punpcklbw xmm1, xmm2
+ movdqa xmm7, [esp+736-272]
+ punpcklbw xmm7, xmm2
+ movdqa [esp+640-416], xmm7
+ movdqa [esp+640-512], xmm0
+ movdqa xmm0, xmm1
+ movdqa [esp+672-272], xmm1
+ movdqa xmm1, xmm4
+ movdqa [esp+704-272], xmm5
+ punpcklbw xmm5, xmm2
+ punpcklbw xmm1, xmm2
- movdqa xmm7, xmm5
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- punpcklbw xmm0, xmm2
- movdqa [esp+688-272], xmm4
- movdqa xmm4, [esp+720-272]
- movdqa [esp+640-480], xmm0
+ movdqa xmm7, xmm5
+ psubw xmm7, xmm1
+ pabsw xmm7, xmm7
+ movdqa [esp+640-560], xmm7
+ punpcklbw xmm0, xmm2
+ movdqa [esp+688-272], xmm4
+ movdqa xmm4, [esp+720-272]
+ movdqa [esp+640-480], xmm0
- movdqa xmm7, xmm1
- psubw xmm7, xmm0
+ movdqa xmm7, xmm1
+ psubw xmm7, xmm0
- movdqa xmm0, [esp+640-512]
- pabsw xmm7, xmm7
- punpcklbw xmm4, xmm2
- pcmpgtw xmm0, xmm7
- movdqa [esp+640-384], xmm4
- movdqa xmm7, xmm5
- psubw xmm7, xmm4
- movdqa xmm4, [esp+640-512]
- movdqa [esp+656-272], xmm6
- punpcklbw xmm6, xmm2
- pabsw xmm7, xmm7
- movdqa [esp+640-48], xmm2
- movdqa [esp+640-368], xmm6
- movdqa [esp+640-144], xmm1
- movdqa [esp+640-400], xmm5
- pcmpgtw xmm4, xmm7
- pand xmm0, xmm4
- movdqa xmm4, [esp+640-320]
- pcmpgtw xmm4, [esp+640-560]
- pand xmm0, xmm4
+ movdqa xmm0, [esp+640-512]
+ pabsw xmm7, xmm7
+ punpcklbw xmm4, xmm2
+ pcmpgtw xmm0, xmm7
+ movdqa [esp+640-384], xmm4
+ movdqa xmm7, xmm5
+ psubw xmm7, xmm4
+ movdqa xmm4, [esp+640-512]
+ movdqa [esp+656-272], xmm6
+ punpcklbw xmm6, xmm2
+ pabsw xmm7, xmm7
+ movdqa [esp+640-48], xmm2
+ movdqa [esp+640-368], xmm6
+ movdqa [esp+640-144], xmm1
+ movdqa [esp+640-400], xmm5
+ pcmpgtw xmm4, xmm7
+ pand xmm0, xmm4
+ movdqa xmm4, [esp+640-320]
+ pcmpgtw xmm4, [esp+640-560]
+ pand xmm0, xmm4
- mov ebx, 2
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, [esp+640-320]
- psraw xmm4, 2
- pshufd xmm7, xmm7, 0
- paddw xmm4, xmm7
- movdqa [esp+640-576], xmm4
- pcmpgtw xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
+ mov ebx, 2
+ movsx ebx, bx
+ movd xmm4, ebx
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm4
+ movdqa xmm4, [esp+640-320]
+ psraw xmm4, 2
+ pshufd xmm7, xmm7, 0
+ paddw xmm4, xmm7
+ movdqa [esp+640-576], xmm4
+ pcmpgtw xmm4, [esp+640-560]
+ movdqa [esp+640-560], xmm4
- movdqa xmm4, [esp+640-512]
- movdqa [esp+640-624], xmm7
- movdqa xmm7, xmm1
- psubw xmm7, xmm6
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
+ movdqa xmm4, [esp+640-512]
+ movdqa [esp+640-624], xmm7
+ movdqa xmm7, xmm1
+ psubw xmm7, xmm6
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
- pand xmm4, [esp+640-560]
- movdqa [esp+640-544], xmm4
- movdqa xmm4, [esp+640-512]
- movdqa xmm7, xmm5
- psubw xmm7, [esp+640-416]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
+ pand xmm4, [esp+640-560]
+ movdqa [esp+640-544], xmm4
+ movdqa xmm4, [esp+640-512]
+ movdqa xmm7, xmm5
+ psubw xmm7, [esp+640-416]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
- pand xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
+ pand xmm4, [esp+640-560]
+ movdqa [esp+640-560], xmm4
- movdqa xmm4, [esp+640-544]
- pandn xmm4, xmm6
- movdqa [esp+640-16], xmm4
- mov ebx, 4
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, xmm3
- punpcklbw xmm4, xmm2
- psllw xmm4, 1
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, [esp+640-480]
+ movdqa xmm4, [esp+640-544]
+ pandn xmm4, xmm6
+ movdqa [esp+640-16], xmm4
+ mov ebx, 4
+ movsx ebx, bx
+ movd xmm4, ebx
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm4
+ movdqa xmm4, xmm3
+ punpcklbw xmm4, xmm2
+ psllw xmm4, 1
+ paddw xmm4, xmm6
+ paddw xmm4, xmm6
+ paddw xmm4, xmm6
+ paddw xmm4, [esp+640-480]
- movdqa xmm6, [esp+640-560]
- pshufd xmm7, xmm7, 0
- paddw xmm4, xmm1
- movdqa [esp+640-592], xmm7
- paddw xmm4, xmm5
- paddw xmm4, xmm7
- movdqa xmm7, [esp+640-416]
- pandn xmm6, xmm7
- movdqa [esp+640-80], xmm6
- movdqa xmm6, [esp+752-272]
- punpcklbw xmm6, xmm2
- psllw xmm6, 1
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-384]
+ movdqa xmm6, [esp+640-560]
+ pshufd xmm7, xmm7, 0
+ paddw xmm4, xmm1
+ movdqa [esp+640-592], xmm7
+ paddw xmm4, xmm5
+ paddw xmm4, xmm7
+ movdqa xmm7, [esp+640-416]
+ pandn xmm6, xmm7
+ movdqa [esp+640-80], xmm6
+ movdqa xmm6, [esp+752-272]
+ punpcklbw xmm6, xmm2
+ psllw xmm6, 1
+ paddw xmm6, xmm7
+ paddw xmm6, xmm7
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-384]
- movdqa xmm7, [esp+640-480]
- paddw xmm6, xmm5
- paddw xmm6, xmm1
- paddw xmm6, [esp+640-592]
- psraw xmm6, 3
- pand xmm6, [esp+640-560]
- movdqa [esp+640-112], xmm6
- movdqa xmm6, [esp+640-544]
- pandn xmm6, xmm7
- movdqa [esp+640-336], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-528], xmm6
- movdqa xmm6, [esp+640-368]
- paddw xmm6, xmm7
- movdqa xmm7, xmm1
- psraw xmm4, 3
- pand xmm4, [esp+640-544]
- paddw xmm7, xmm5
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
+ movdqa xmm7, [esp+640-480]
+ paddw xmm6, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, [esp+640-592]
+ psraw xmm6, 3
+ pand xmm6, [esp+640-560]
+ movdqa [esp+640-112], xmm6
+ movdqa xmm6, [esp+640-544]
+ pandn xmm6, xmm7
+ movdqa [esp+640-336], xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa [esp+640-528], xmm6
+ movdqa xmm6, [esp+640-368]
+ paddw xmm6, xmm7
+ movdqa xmm7, xmm1
+ psraw xmm4, 3
+ pand xmm4, [esp+640-544]
+ paddw xmm7, xmm5
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-624]
+ movdqa xmm7, [esp+640-528]
- paddw xmm5, xmm1
- psraw xmm6, 2
- pand xmm7, xmm6
+ paddw xmm5, xmm1
+ psraw xmm6, 2
+ pand xmm7, xmm6
- movdqa xmm6, [esp+640-384]
- movdqa [esp+640-64], xmm7
- movdqa xmm7, [esp+640-560]
- pandn xmm7, xmm6
- movdqa [esp+640-304], xmm7
- movdqa xmm7, [esp+640-560]
- movdqa [esp+640-528], xmm7
- movdqa xmm7, [esp+640-416]
- paddw xmm7, xmm6
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pand xmm5, xmm7
- movdqa [esp+640-32], xmm5
+ movdqa xmm6, [esp+640-384]
+ movdqa [esp+640-64], xmm7
+ movdqa xmm7, [esp+640-560]
+ pandn xmm7, xmm6
+ movdqa [esp+640-304], xmm7
+ movdqa xmm7, [esp+640-560]
+ movdqa [esp+640-528], xmm7
+ movdqa xmm7, [esp+640-416]
+ paddw xmm7, xmm6
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+640-624]
+ movdqa xmm5, [esp+640-528]
+ psraw xmm7, 2
+ pand xmm5, xmm7
+ movdqa [esp+640-32], xmm5
- movdqa xmm5, [esp+640-544]
- movdqa [esp+640-528], xmm5
- movdqa xmm5, [esp+640-480]
- movdqa xmm7, xmm5
- paddw xmm7, xmm5
- movdqa xmm5, xmm1
- paddw xmm5, xmm6
- paddw xmm6, [esp+640-592]
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pandn xmm5, xmm7
- movdqa xmm7, [esp+640-480]
- paddw xmm7, xmm1
- paddw xmm7, [esp+640-400]
- movdqa xmm1, [esp+640-544]
- movdqa [esp+640-352], xmm5
- movdqa xmm5, [esp+640-368]
- psllw xmm7, 1
- paddw xmm7, xmm6
- paddw xmm5, xmm7
+ movdqa xmm5, [esp+640-544]
+ movdqa [esp+640-528], xmm5
+ movdqa xmm5, [esp+640-480]
+ movdqa xmm7, xmm5
+ paddw xmm7, xmm5
+ movdqa xmm5, xmm1
+ paddw xmm5, xmm6
+ paddw xmm6, [esp+640-592]
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+640-624]
+ movdqa xmm5, [esp+640-528]
+ psraw xmm7, 2
+ pandn xmm5, xmm7
+ movdqa xmm7, [esp+640-480]
+ paddw xmm7, xmm1
+ paddw xmm7, [esp+640-400]
+ movdqa xmm1, [esp+640-544]
+ movdqa [esp+640-352], xmm5
+ movdqa xmm5, [esp+640-368]
+ psllw xmm7, 1
+ paddw xmm7, xmm6
+ paddw xmm5, xmm7
- movdqa xmm7, [esp+640-400]
- psraw xmm5, 3
- pand xmm1, xmm5
- movdqa xmm5, [esp+640-480]
- movdqa [esp+640-96], xmm1
- movdqa xmm1, [esp+640-560]
- movdqa [esp+640-528], xmm1
- movdqa xmm1, [esp+640-384]
- movdqa xmm6, xmm1
- paddw xmm6, xmm1
- paddw xmm1, [esp+640-400]
- paddw xmm1, [esp+640-144]
- paddw xmm7, xmm5
- paddw xmm5, [esp+640-592]
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
- psraw xmm6, 2
- psllw xmm1, 1
- paddw xmm1, xmm5
+ movdqa xmm7, [esp+640-400]
+ psraw xmm5, 3
+ pand xmm1, xmm5
+ movdqa xmm5, [esp+640-480]
+ movdqa [esp+640-96], xmm1
+ movdqa xmm1, [esp+640-560]
+ movdqa [esp+640-528], xmm1
+ movdqa xmm1, [esp+640-384]
+ movdqa xmm6, xmm1
+ paddw xmm6, xmm1
+ paddw xmm1, [esp+640-400]
+ paddw xmm1, [esp+640-144]
+ paddw xmm7, xmm5
+ paddw xmm5, [esp+640-592]
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-624]
+ movdqa xmm7, [esp+640-528]
+ psraw xmm6, 2
+ psllw xmm1, 1
+ paddw xmm1, xmm5
- movdqa xmm5, [esp+656-272]
- pandn xmm7, xmm6
- movdqa xmm6, [esp+640-416]
- paddw xmm6, xmm1
- movdqa xmm1, [esp+640-560]
- psraw xmm6, 3
- pand xmm1, xmm6
+ movdqa xmm5, [esp+656-272]
+ pandn xmm7, xmm6
+ movdqa xmm6, [esp+640-416]
+ paddw xmm6, xmm1
+ movdqa xmm1, [esp+640-560]
+ psraw xmm6, 3
+ pand xmm1, xmm6
- movdqa xmm6, [esp+704-272]
- movdqa [esp+640-128], xmm1
- movdqa xmm1, [esp+672-272]
- punpckhbw xmm1, xmm2
- movdqa [esp+640-448], xmm1
- movdqa xmm1, [esp+688-272]
- punpckhbw xmm1, xmm2
- punpckhbw xmm6, xmm2
- movdqa [esp+640-288], xmm7
- punpckhbw xmm5, xmm2
- movdqa [esp+640-496], xmm1
- movdqa [esp+640-432], xmm6
+ movdqa xmm6, [esp+704-272]
+ movdqa [esp+640-128], xmm1
+ movdqa xmm1, [esp+672-272]
+ punpckhbw xmm1, xmm2
+ movdqa [esp+640-448], xmm1
+ movdqa xmm1, [esp+688-272]
+ punpckhbw xmm1, xmm2
+ punpckhbw xmm6, xmm2
+ movdqa [esp+640-288], xmm7
+ punpckhbw xmm5, xmm2
+ movdqa [esp+640-496], xmm1
+ movdqa [esp+640-432], xmm6
- movdqa xmm7, [esp+720-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-464], xmm7
+ movdqa xmm7, [esp+720-272]
+ punpckhbw xmm7, xmm2
+ movdqa [esp+640-464], xmm7
- movdqa xmm7, [esp+736-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-528], xmm7
+ movdqa xmm7, [esp+736-272]
+ punpckhbw xmm7, xmm2
+ movdqa [esp+640-528], xmm7
- movdqa xmm7, xmm6
+ movdqa xmm7, xmm6
- psubw xmm6, [esp+640-464]
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- por xmm4, [esp+640-16]
- pabsw xmm6, xmm6
- movdqa xmm7, xmm1
- psubw xmm7, [esp+640-448]
+ psubw xmm6, [esp+640-464]
+ psubw xmm7, xmm1
+ pabsw xmm7, xmm7
+ movdqa [esp+640-560], xmm7
+ por xmm4, [esp+640-16]
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm1
+ psubw xmm7, [esp+640-448]
- movdqa xmm1, [esp+640-512]
- pabsw xmm7, xmm7
- pcmpgtw xmm1, xmm7
- movdqa xmm7, [esp+640-512]
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+640-320]
- pand xmm1, xmm7
- movdqa xmm7, [esp+640-560]
- pcmpgtw xmm6, xmm7
- pand xmm1, xmm6
+ movdqa xmm1, [esp+640-512]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm1, xmm7
+ movdqa xmm7, [esp+640-512]
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+640-320]
+ pand xmm1, xmm7
+ movdqa xmm7, [esp+640-560]
+ pcmpgtw xmm6, xmm7
+ pand xmm1, xmm6
- movdqa xmm6, [esp+640-576]
- pcmpgtw xmm6, xmm7
+ movdqa xmm6, [esp+640-576]
+ pcmpgtw xmm6, xmm7
- movdqa xmm7, [esp+640-496]
- punpckhbw xmm3, xmm2
- movdqa [esp+640-560], xmm6
- movdqa xmm6, [esp+640-512]
- psubw xmm7, xmm5
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
+ movdqa xmm7, [esp+640-496]
+ punpckhbw xmm3, xmm2
+ movdqa [esp+640-560], xmm6
+ movdqa xmm6, [esp+640-512]
+ psubw xmm7, xmm5
+ pabsw xmm7, xmm7
+ pcmpgtw xmm6, xmm7
- pand xmm6, [esp+640-560]
- movdqa xmm7, [esp+640-432]
- psubw xmm7, [esp+640-528]
+ pand xmm6, [esp+640-560]
+ movdqa xmm7, [esp+640-432]
+ psubw xmm7, [esp+640-528]
- psllw xmm3, 1
- movdqa [esp+640-544], xmm6
- movdqa xmm6, [esp+640-512]
+ psllw xmm3, 1
+ movdqa [esp+640-544], xmm6
+ movdqa xmm6, [esp+640-512]
- movdqa xmm2, [esp+640-544]
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, [esp+640-448]
- paddw xmm3, [esp+640-496]
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
- pand xmm6, [esp+640-560]
- movdqa [esp+640-560], xmm6
+ movdqa xmm2, [esp+640-544]
+ paddw xmm3, xmm5
+ paddw xmm3, xmm5
+ paddw xmm3, xmm5
+ paddw xmm3, [esp+640-448]
+ paddw xmm3, [esp+640-496]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm6, xmm7
+ pand xmm6, [esp+640-560]
+ movdqa [esp+640-560], xmm6
- movdqa xmm6, xmm0
- pand xmm6, xmm4
- movdqa xmm4, xmm0
- pandn xmm4, [esp+640-368]
- por xmm6, xmm4
- movdqa xmm4, [esp+640-432]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-592]
- psraw xmm3, 3
- pand xmm3, xmm2
- pandn xmm2, xmm5
- por xmm3, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm3
- movdqa xmm3, [esp+640-64]
- por xmm3, [esp+640-336]
- movdqa xmm2, xmm1
- pandn xmm2, xmm5
- por xmm7, xmm2
+ movdqa xmm6, xmm0
+ pand xmm6, xmm4
+ movdqa xmm4, xmm0
+ pandn xmm4, [esp+640-368]
+ por xmm6, xmm4
+ movdqa xmm4, [esp+640-432]
+ paddw xmm3, xmm4
+ paddw xmm3, [esp+640-592]
+ psraw xmm3, 3
+ pand xmm3, xmm2
+ pandn xmm2, xmm5
+ por xmm3, xmm2
+ movdqa xmm7, xmm1
+ pand xmm7, xmm3
+ movdqa xmm3, [esp+640-64]
+ por xmm3, [esp+640-336]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm5
+ por xmm7, xmm2
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-480]
- por xmm2, xmm3
- packuswb xmm6, xmm7
- movdqa [esp+640-336], xmm2
- movdqa [esp+656-272], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa xmm2, xmm5
- paddw xmm2, [esp+640-448]
- movdqa xmm3, xmm1
- movdqa xmm7, [esp+640-496]
- paddw xmm7, xmm4
- paddw xmm2, xmm7
- paddw xmm2, [esp+640-624]
- movdqa xmm7, [esp+640-544]
- psraw xmm2, 2
- pand xmm6, xmm2
- movdqa xmm2, [esp+640-448]
- pandn xmm7, xmm2
- por xmm6, xmm7
- pand xmm3, xmm6
- movdqa xmm6, xmm1
- pandn xmm6, xmm2
- paddw xmm2, [esp+640-496]
- paddw xmm2, xmm4
- por xmm3, xmm6
- movdqa xmm6, [esp+640-336]
- packuswb xmm6, xmm3
- psllw xmm2, 1
- movdqa [esp+672-272], xmm6
- movdqa xmm6, [esp+640-96]
- por xmm6, [esp+640-352]
+ movdqa xmm2, xmm0
+ pand xmm2, xmm3
+ movdqa xmm3, xmm0
+ pandn xmm3, [esp+640-480]
+ por xmm2, xmm3
+ packuswb xmm6, xmm7
+ movdqa [esp+640-336], xmm2
+ movdqa [esp+656-272], xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa xmm2, xmm5
+ paddw xmm2, [esp+640-448]
+ movdqa xmm3, xmm1
+ movdqa xmm7, [esp+640-496]
+ paddw xmm7, xmm4
+ paddw xmm2, xmm7
+ paddw xmm2, [esp+640-624]
+ movdqa xmm7, [esp+640-544]
+ psraw xmm2, 2
+ pand xmm6, xmm2
+ movdqa xmm2, [esp+640-448]
+ pandn xmm7, xmm2
+ por xmm6, xmm7
+ pand xmm3, xmm6
+ movdqa xmm6, xmm1
+ pandn xmm6, xmm2
+ paddw xmm2, [esp+640-496]
+ paddw xmm2, xmm4
+ por xmm3, xmm6
+ movdqa xmm6, [esp+640-336]
+ packuswb xmm6, xmm3
+ psllw xmm2, 1
+ movdqa [esp+672-272], xmm6
+ movdqa xmm6, [esp+640-96]
+ por xmm6, [esp+640-352]
- movdqa xmm3, xmm0
- pand xmm3, xmm6
- movdqa xmm6, xmm0
- pandn xmm6, [esp+640-144]
- por xmm3, xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-352], xmm3
- movdqa xmm3, [esp+640-464]
- paddw xmm3, [esp+640-592]
- paddw xmm2, xmm3
- movdqa xmm3, [esp+640-448]
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-496]
- psraw xmm5, 3
- pand xmm6, xmm5
- movdqa xmm5, [esp+640-464]
- paddw xmm2, xmm5
- paddw xmm5, [esp+640-432]
- movdqa xmm4, xmm3
- paddw xmm4, xmm3
- paddw xmm4, xmm2
- paddw xmm4, [esp+640-624]
- movdqa xmm2, [esp+640-544]
- paddw xmm3, [esp+640-592]
- psraw xmm4, 2
- pandn xmm2, xmm4
- por xmm6, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-496]
- movdqa xmm2, xmm1
- pandn xmm2, xmm6
- por xmm7, xmm2
- movdqa xmm2, [esp+640-352]
- packuswb xmm2, xmm7
- movdqa [esp+688-272], xmm2
- movdqa xmm2, [esp+640-128]
- por xmm2, [esp+640-288]
+ movdqa xmm3, xmm0
+ pand xmm3, xmm6
+ movdqa xmm6, xmm0
+ pandn xmm6, [esp+640-144]
+ por xmm3, xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa [esp+640-352], xmm3
+ movdqa xmm3, [esp+640-464]
+ paddw xmm3, [esp+640-592]
+ paddw xmm2, xmm3
+ movdqa xmm3, [esp+640-448]
+ paddw xmm5, xmm2
+ movdqa xmm2, [esp+640-496]
+ psraw xmm5, 3
+ pand xmm6, xmm5
+ movdqa xmm5, [esp+640-464]
+ paddw xmm2, xmm5
+ paddw xmm5, [esp+640-432]
+ movdqa xmm4, xmm3
+ paddw xmm4, xmm3
+ paddw xmm4, xmm2
+ paddw xmm4, [esp+640-624]
+ movdqa xmm2, [esp+640-544]
+ paddw xmm3, [esp+640-592]
+ psraw xmm4, 2
+ pandn xmm2, xmm4
+ por xmm6, xmm2
+ movdqa xmm7, xmm1
+ pand xmm7, xmm6
+ movdqa xmm6, [esp+640-496]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm6
+ por xmm7, xmm2
+ movdqa xmm2, [esp+640-352]
+ packuswb xmm2, xmm7
+ movdqa [esp+688-272], xmm2
+ movdqa xmm2, [esp+640-128]
+ por xmm2, [esp+640-288]
- movdqa xmm4, xmm0
- pand xmm4, xmm2
- paddw xmm5, xmm6
- movdqa xmm2, xmm0
- pandn xmm2, [esp+640-400]
- por xmm4, xmm2
- movdqa xmm2, [esp+640-528]
- psllw xmm5, 1
- paddw xmm5, xmm3
- movdqa xmm3, [esp+640-560]
- paddw xmm2, xmm5
- psraw xmm2, 3
- movdqa [esp+640-288], xmm4
- movdqa xmm4, [esp+640-560]
- pand xmm4, xmm2
- movdqa xmm2, [esp+640-464]
- movdqa xmm5, xmm2
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-432]
- paddw xmm2, [esp+640-448]
- movdqa xmm7, xmm1
- paddw xmm5, xmm2
- paddw xmm5, [esp+640-624]
- movdqa xmm6, [esp+640-560]
- psraw xmm5, 2
- pandn xmm3, xmm5
- por xmm4, xmm3
- movdqa xmm3, [esp+640-32]
- por xmm3, [esp+640-304]
- pand xmm7, xmm4
- movdqa xmm4, [esp+640-432]
- movdqa xmm5, [esp+640-464]
- movdqa xmm2, xmm1
- pandn xmm2, xmm4
- paddw xmm4, [esp+640-496]
- por xmm7, xmm2
- movdqa xmm2, [esp+640-288]
- packuswb xmm2, xmm7
- movdqa [esp+704-272], xmm2
+ movdqa xmm4, xmm0
+ pand xmm4, xmm2
+ paddw xmm5, xmm6
+ movdqa xmm2, xmm0
+ pandn xmm2, [esp+640-400]
+ por xmm4, xmm2
+ movdqa xmm2, [esp+640-528]
+ psllw xmm5, 1
+ paddw xmm5, xmm3
+ movdqa xmm3, [esp+640-560]
+ paddw xmm2, xmm5
+ psraw xmm2, 3
+ movdqa [esp+640-288], xmm4
+ movdqa xmm4, [esp+640-560]
+ pand xmm4, xmm2
+ movdqa xmm2, [esp+640-464]
+ movdqa xmm5, xmm2
+ paddw xmm5, xmm2
+ movdqa xmm2, [esp+640-432]
+ paddw xmm2, [esp+640-448]
+ movdqa xmm7, xmm1
+ paddw xmm5, xmm2
+ paddw xmm5, [esp+640-624]
+ movdqa xmm6, [esp+640-560]
+ psraw xmm5, 2
+ pandn xmm3, xmm5
+ por xmm4, xmm3
+ movdqa xmm3, [esp+640-32]
+ por xmm3, [esp+640-304]
+ pand xmm7, xmm4
+ movdqa xmm4, [esp+640-432]
+ movdqa xmm5, [esp+640-464]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm4
+ paddw xmm4, [esp+640-496]
+ por xmm7, xmm2
+ movdqa xmm2, [esp+640-288]
+ packuswb xmm2, xmm7
+ movdqa [esp+704-272], xmm2
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-384]
- por xmm2, xmm3
- movdqa [esp+640-304], xmm2
- movdqa xmm2, [esp+640-528]
- movdqa xmm3, xmm2
- paddw xmm3, [esp+640-464]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-624]
- psraw xmm3, 2
- pand xmm6, xmm3
- movdqa xmm3, [esp+640-560]
- movdqa xmm4, xmm3
- pandn xmm4, xmm5
- por xmm6, xmm4
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-304]
- movdqa xmm4, xmm1
- pandn xmm4, xmm5
- por xmm7, xmm4
+ movdqa xmm2, xmm0
+ pand xmm2, xmm3
+ movdqa xmm3, xmm0
+ pandn xmm3, [esp+640-384]
+ por xmm2, xmm3
+ movdqa [esp+640-304], xmm2
+ movdqa xmm2, [esp+640-528]
+ movdqa xmm3, xmm2
+ paddw xmm3, [esp+640-464]
+ paddw xmm3, xmm4
+ paddw xmm3, [esp+640-624]
+ psraw xmm3, 2
+ pand xmm6, xmm3
+ movdqa xmm3, [esp+640-560]
+ movdqa xmm4, xmm3
+ pandn xmm4, xmm5
+ por xmm6, xmm4
+ movdqa xmm7, xmm1
+ pand xmm7, xmm6
+ movdqa xmm6, [esp+640-304]
+ movdqa xmm4, xmm1
+ pandn xmm4, xmm5
+ por xmm7, xmm4
- movdqa xmm4, xmm0
- pandn xmm0, [esp+640-416]
- packuswb xmm6, xmm7
- movdqa xmm7, [esp+640-112]
- por xmm7, [esp+640-80]
- pand xmm4, xmm7
- por xmm4, xmm0
- movdqa xmm0, [esp+752-272]
- punpckhbw xmm0, [esp+640-48]
- psllw xmm0, 1
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm5
- paddw xmm0, [esp+640-432]
- paddw xmm0, [esp+640-496]
- paddw xmm0, [esp+640-592]
- psraw xmm0, 3
- pand xmm0, xmm3
- movdqa xmm7, xmm1
- pandn xmm3, xmm2
- por xmm0, xmm3
- pand xmm7, xmm0
+ movdqa xmm4, xmm0
+ pandn xmm0, [esp+640-416]
+ packuswb xmm6, xmm7
+ movdqa xmm7, [esp+640-112]
+ por xmm7, [esp+640-80]
+ pand xmm4, xmm7
+ por xmm4, xmm0
+ movdqa xmm0, [esp+752-272]
+ punpckhbw xmm0, [esp+640-48]
+ psllw xmm0, 1
+ paddw xmm0, xmm2
+ paddw xmm0, xmm2
+ paddw xmm0, xmm2
+ paddw xmm0, xmm5
+ paddw xmm0, [esp+640-432]
+ paddw xmm0, [esp+640-496]
+ paddw xmm0, [esp+640-592]
+ psraw xmm0, 3
+ pand xmm0, xmm3
+ movdqa xmm7, xmm1
+ pandn xmm3, xmm2
+ por xmm0, xmm3
+ pand xmm7, xmm0
- movdqa xmm0, [esp+656-272]
- movdqa [edx], xmm0
+ movdqa xmm0, [esp+656-272]
+ movdqa [edx], xmm0
- movdqa xmm0, [esp+672-272]
+ movdqa xmm0, [esp+672-272]
- mov edx, dword [esp+640-596]
- movdqa [esi], xmm0
- movdqa xmm0, [esp+688-272]
- movdqa [edi], xmm0
- movdqa xmm0, [esp+704-272]
+ mov edx, dword [esp+640-596]
+ movdqa [esi], xmm0
+ movdqa xmm0, [esp+688-272]
+ movdqa [edi], xmm0
+ movdqa xmm0, [esp+704-272]
- pop edi
- pandn xmm1, xmm2
- movdqa [eax], xmm0
- por xmm7, xmm1
- pop esi
- packuswb xmm4, xmm7
- movdqa [edx], xmm6
- movdqa [ecx], xmm4
- pop ebx
- mov esp, ebp
- pop ebp
- ret
+ pop edi
+ pandn xmm1, xmm2
+ movdqa [eax], xmm0
+ por xmm7, xmm1
+ pop esi
+ packuswb xmm4, xmm7
+ movdqa [edx], xmm6
+ movdqa [ecx], xmm4
+ pop ebx
+ mov esp, ebp
+ pop ebp
+ ret
%endif
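
DeblockLumaEq4V_ssse3, which ends just above, is the bS==4 strong filter. Again purely as orientation, a hedged scalar sketch of the p-side arithmetic it vectorizes (the q side mirrors it); this paraphrases the standard equations and assumes the outer |p0-q0|<alpha, |p1-p0|<beta, |q1-q0|<beta gate has already passed:

    #include <stdint.h>
    #include <stdlib.h>

    /* p[0..3] = p0..p3, q[0..1] = q0,q1; out[0..2] = filtered p0..p2. */
    static void strong_filter_p(const uint8_t p[4], const uint8_t q[2],
                                int iAlpha, int iBeta, uint8_t out[3]) {
        int p0 = p[0], p1 = p[1], p2 = p[2], p3 = p[3], q0 = q[0], q1 = q[1];
        if (abs(p0 - q0) < ((iAlpha >> 2) + 2) && abs(p2 - p0) < iBeta) {
            out[0] = (uint8_t)((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3);
            out[1] = (uint8_t)((p2 + p1 + p0 + q0 + 2) >> 2);
            out[2] = (uint8_t)((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3);
        } else {
            out[0] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2);  /* only p0 changes */
            out[1] = (uint8_t)p1;
            out[2] = (uint8_t)p2;
        }
    }
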
--- a/codec/common/x86/expand_picture.asm
+++ b/codec/common/x86/expand_picture.asm
@@ -77,280 +77,280 @@
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
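
The ASCII diagram ending above (only its last rows fall inside this hunk) shows the padded frame layout: the decoded area is surrounded by a replicated border of 32 luma (16 chroma) pixels so that later reads, e.g. for unrestricted motion vectors, never leave the allocation. As an orientation aid for the macros below, a hedged scalar model of the whole expansion; the names and exact loop order are mine:

    #include <stdint.h>
    #include <string.h>

    /* p points at the top-left decoded pixel of a w x h plane whose allocation
     * already includes a 'pad'-pixel border (pad = 32 for luma, 16 for chroma). */
    static void expand_picture_c(uint8_t *p, int stride, int w, int h, int pad) {
        for (int y = 1; y <= pad; ++y) {             /* exp_top_bottom_sse2 */
            memcpy(p - y * stride, p, w);
            memcpy(p + (h - 1 + y) * stride, p + (h - 1) * stride, w);
        }
        for (int y = 0; y < h; ++y) {                /* exp_left_right_sse2 */
            memset(p + y * stride - pad, p[y * stride],         pad);
            memset(p + y * stride + w,   p[y * stride + w - 1], pad);
        }
        for (int y = 1; y <= pad; ++y) {             /* exp_cross_sse2 (corners) */
            memset(p - y * stride - pad,           p[0],                        pad);
            memset(p - y * stride + w,             p[w - 1],                    pad);
            memset(p + (h - 1 + y) * stride - pad, p[(h - 1) * stride],         pad);
            memset(p + (h - 1 + y) * stride + w,   p[(h - 1) * stride + w - 1], pad);
        }
    }
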
-%macro mov_line_8x4_mmx 3 ; dst, stride, mm?
- movq [%1], %3
- movq [%1+%2], %3
- lea %1, [%1+2*%2]
- movq [%1], %3
- movq [%1+%2], %3
- lea %1, [%1+2*%2]
+%macro mov_line_8x4_mmx 3 ; dst, stride, mm?
+ movq [%1], %3
+ movq [%1+%2], %3
+ lea %1, [%1+2*%2]
+ movq [%1], %3
+ movq [%1+%2], %3
+ lea %1, [%1+2*%2]
%endmacro
-%macro mov_line_end8x4_mmx 3 ; dst, stride, mm?
- movq [%1], %3
- movq [%1+%2], %3
- lea %1, [%1+2*%2]
- movq [%1], %3
- movq [%1+%2], %3
- lea %1, [%1+%2]
+%macro mov_line_end8x4_mmx 3 ; dst, stride, mm?
+ movq [%1], %3
+ movq [%1+%2], %3
+ lea %1, [%1+2*%2]
+ movq [%1], %3
+ movq [%1+%2], %3
+ lea %1, [%1+%2]
%endmacro
-%macro mov_line_16x4_sse2 4 ; dst, stride, xmm?, u/a
- movdq%4 [%1], %3 ; top(bottom)_0
- movdq%4 [%1+%2], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdq%4 [%1], %3 ; top(bottom)_2
- movdq%4 [%1+%2], %3 ; top(bottom)_3
- lea %1, [%1+2*%2]
+%macro mov_line_16x4_sse2 4 ; dst, stride, xmm?, u/a
+ movdq%4 [%1], %3 ; top(bottom)_0
+ movdq%4 [%1+%2], %3 ; top(bottom)_1
+ lea %1, [%1+2*%2]
+ movdq%4 [%1], %3 ; top(bottom)_2
+ movdq%4 [%1+%2], %3 ; top(bottom)_3
+ lea %1, [%1+2*%2]
%endmacro
-%macro mov_line_end16x4_sse2 4 ; dst, stride, xmm?, u/a
- movdq%4 [%1], %3 ; top(bottom)_0
- movdq%4 [%1+%2], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdq%4 [%1], %3 ; top(bottom)_2
- movdq%4 [%1+%2], %3 ; top(bottom)_3
- lea %1, [%1+%2]
+%macro mov_line_end16x4_sse2 4 ; dst, stride, xmm?, u/a
+ movdq%4 [%1], %3 ; top(bottom)_0
+ movdq%4 [%1+%2], %3 ; top(bottom)_1
+ lea %1, [%1+2*%2]
+ movdq%4 [%1], %3 ; top(bottom)_2
+ movdq%4 [%1+%2], %3 ; top(bottom)_3
+ lea %1, [%1+%2]
%endmacro
-%macro mov_line_32x4_sse2 3 ; dst, stride, xmm?
- movdqa [%1], %3 ; top(bottom)_0
- movdqa [%1+16], %3 ; top(bottom)_0
- movdqa [%1+%2], %3 ; top(bottom)_1
- movdqa [%1+%2+16], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdqa [%1], %3 ; top(bottom)_2
- movdqa [%1+16], %3 ; top(bottom)_2
- movdqa [%1+%2], %3 ; top(bottom)_3
- movdqa [%1+%2+16], %3 ; top(bottom)_3
- lea %1, [%1+2*%2]
+%macro mov_line_32x4_sse2 3 ; dst, stride, xmm?
+ movdqa [%1], %3 ; top(bottom)_0
+ movdqa [%1+16], %3 ; top(bottom)_0
+ movdqa [%1+%2], %3 ; top(bottom)_1
+ movdqa [%1+%2+16], %3 ; top(bottom)_1
+ lea %1, [%1+2*%2]
+ movdqa [%1], %3 ; top(bottom)_2
+ movdqa [%1+16], %3 ; top(bottom)_2
+ movdqa [%1+%2], %3 ; top(bottom)_3
+ movdqa [%1+%2+16], %3 ; top(bottom)_3
+ lea %1, [%1+2*%2]
%endmacro
-%macro mov_line_end32x4_sse2 3 ; dst, stride, xmm?
- movdqa [%1], %3 ; top(bottom)_0
- movdqa [%1+16], %3 ; top(bottom)_0
- movdqa [%1+%2], %3 ; top(bottom)_1
- movdqa [%1+%2+16], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdqa [%1], %3 ; top(bottom)_2
- movdqa [%1+16], %3 ; top(bottom)_2
- movdqa [%1+%2], %3 ; top(bottom)_3
- movdqa [%1+%2+16], %3 ; top(bottom)_3
- lea %1, [%1+%2]
+%macro mov_line_end32x4_sse2 3 ; dst, stride, xmm?
+ movdqa [%1], %3 ; top(bottom)_0
+ movdqa [%1+16], %3 ; top(bottom)_0
+ movdqa [%1+%2], %3 ; top(bottom)_1
+ movdqa [%1+%2+16], %3 ; top(bottom)_1
+ lea %1, [%1+2*%2]
+ movdqa [%1], %3 ; top(bottom)_2
+ movdqa [%1+16], %3 ; top(bottom)_2
+ movdqa [%1+%2], %3 ; top(bottom)_3
+ movdqa [%1+%2+16], %3 ; top(bottom)_3
+ lea %1, [%1+%2]
%endmacro
-%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
+%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
;r2 [width/16(8)]
;r0 [pSrc +0], r5 [pSrc -width] r1[-stride], 32(16) ;top
;r3 [pSrc +(h-1)*stride], r4 [pSrc + (h+31)*stride],32(16); bottom
-%if %1 == 32 ; for luma
- sar r2, 04h ; width / 16(8) pixels
+%if %1 == 32 ; for luma
+ sar r2, 04h ; width / 16(8) pixels
.top_bottom_loops:
- ; top
- movdqa xmm0, [r0] ; first line of picture pData
- mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_end16x4_sse2 r5, r1, xmm0, a
+ ; top
+ movdqa xmm0, [r0] ; first line of picture pData
+ mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_end16x4_sse2 r5, r1, xmm0, a
- ; bottom
- movdqa xmm1, [r3] ; last line of picture pData
- mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_end16x4_sse2 r4, r1, xmm1, a
+ ; bottom
+ movdqa xmm1, [r3] ; last line of picture pData
+ mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_end16x4_sse2 r4, r1, xmm1, a
- lea r0, [r0+16] ; top pSrc
- lea r5, [r5+16] ; top dst
- lea r3, [r3+16] ; bottom pSrc
- lea r4, [r4+16] ; bottom dst
- neg r1 ; positive/negative stride need for next loop?
+ lea r0, [r0+16] ; top pSrc
+ lea r5, [r5+16] ; top dst
+ lea r3, [r3+16] ; bottom pSrc
+ lea r4, [r4+16] ; bottom dst
+ neg r1 ; positive/negative stride needed for next loop?
- dec r2
- jnz near .top_bottom_loops
-%elif %1 == 16 ; for chroma ??
- mov r6, r2
- sar r2, 04h ; (width / 16) pixels
+ dec r2
+ jnz near .top_bottom_loops
+%elif %1 == 16 ; for chroma ??
+ mov r6, r2
+ sar r2, 04h ; (width / 16) pixels
.top_bottom_loops:
- ; top
- movdqa xmm0, [r0] ; first line of picture pData
- mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_end16x4_sse2 r5, r1, xmm0, a
+ ; top
+ movdqa xmm0, [r0] ; first line of picture pData
+ mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_end16x4_sse2 r5, r1, xmm0, a
- ; bottom
- movdqa xmm1, [r3] ; last line of picture pData
- mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_end16x4_sse2 r4, r1, xmm1, a
+ ; bottom
+ movdqa xmm1, [r3] ; last line of picture pData
+ mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_end16x4_sse2 r4, r1, xmm1, a
- lea r0, [r0+16] ; top pSrc
- lea r5, [r5+16] ; top dst
- lea r3, [r3+16] ; bottom pSrc
- lea r4, [r4+16] ; bottom dst
- neg r1 ; positive/negative stride need for next loop?
+ lea r0, [r0+16] ; top pSrc
+ lea r5, [r5+16] ; top dst
+ lea r3, [r3+16] ; bottom pSrc
+ lea r4, [r4+16] ; bottom dst
+ neg r1 ; positive/negative stride needed for next loop?
- dec r2
- jnz near .top_bottom_loops
+ dec r2
+ jnz near .top_bottom_loops
- ; for remaining 8 bytes
- and r6, 0fh ; any 8 bytes left?
- test r6, r6
- jz near .to_be_continued ; no left to exit here
+ ; for remaining 8 bytes
+ and r6, 0fh ; any 8 bytes left?
+ test r6, r6
+ jz near .to_be_continued ; none left, exit here
- ; top
- movq mm0, [r0] ; remained 8 byte
- mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
- mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
- mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
- mov_line_end8x4_mmx r5, r1, mm0 ; dst, stride, mm?
- ; bottom
- movq mm1, [r3]
- mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
- mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
- mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
- mov_line_end8x4_mmx r4, r1, mm1 ; dst, stride, mm?
- WELSEMMS
+ ; top
+ movq mm0, [r0] ; remaining 8 bytes
+ mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
+ mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
+ mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
+ mov_line_end8x4_mmx r5, r1, mm0 ; dst, stride, mm?
+ ; bottom
+ movq mm1, [r3]
+ mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
+ mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
+ mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
+ mov_line_end8x4_mmx r4, r1, mm1 ; dst, stride, mm?
+ WELSEMMS
.to_be_continued:
%endif
%endmacro
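
exp_top_bottom_sse2 above stamps the first and last picture rows into every padding row, 16 aligned bytes at a time; for chroma widths that are not a multiple of 16 it finishes with an 8-byte MMX store, which is why that path ends in WELSEMMS. A hedged sketch of just that chunking (names illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Stamp one picture row (the first or the last) into pad_rows padding rows. */
    static void stamp_rows(uint8_t *dst, int stride, const uint8_t *src_row,
                           int width, int pad_rows) {
        int chunks = width >> 4;                     /* sar r2, 04h           */
        int tail   = width & 0x0f;                   /* and r6, 0fh (chroma)  */
        for (int c = 0; c < chunks; ++c)             /* 16-byte movdqa stores */
            for (int y = 0; y < pad_rows; ++y)
                memcpy(dst + y * stride + 16 * c, src_row + 16 * c, 16);
        if (tail)                                    /* 8-byte movq remainder */
            for (int y = 0; y < pad_rows; ++y)
                memcpy(dst + y * stride + 16 * chunks, src_row + 16 * chunks, 8);
    }
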
-%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
+%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
;r6 [height]
;r0 [pSrc+0] r5[pSrc-32] r1[stride]
;r3 [pSrc+(w-1)] r4[pSrc+w]
-%if %1 == 32 ; for luma
+%if %1 == 32 ; for luma
.left_right_loops:
- ; left
- movzx r2d, byte [r0] ; pixel pData for left border
- SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [r5], xmm0
- movdqa [r5+16], xmm0
+ ; left
+ movzx r2d, byte [r0] ; pixel pData for left border
+ SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
+ movdqa [r5], xmm0
+ movdqa [r5+16], xmm0
- ; right
- movzx r2d, byte [r3]
- SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [r4], xmm1
- movdqa [r4+16], xmm1
+ ; right
+ movzx r2d, byte [r3]
+ SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
+ movdqa [r4], xmm1
+ movdqa [r4+16], xmm1
- lea r0, [r0+r1] ; left pSrc
- lea r5, [r5+r1] ; left dst
- lea r3, [r3+r1] ; right pSrc
- lea r4, [r4+r1] ; right dst
+ lea r0, [r0+r1] ; left pSrc
+ lea r5, [r5+r1] ; left dst
+ lea r3, [r3+r1] ; right pSrc
+ lea r4, [r4+r1] ; right dst
- dec r6
- jnz near .left_right_loops
-%elif %1 == 16 ; for chroma ??
+ dec r6
+ jnz near .left_right_loops
+%elif %1 == 16 ; for chroma ??
.left_right_loops:
- ; left
- movzx r2d, byte [r0] ; pixel pData for left border
- SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [r5], xmm0
+ ; left
+ movzx r2d, byte [r0] ; pixel pData for left border
+ SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
+ movdqa [r5], xmm0
- ; right
- movzx r2d, byte [r3]
- SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdq%2 [r4], xmm1 ; might not be aligned 16 bytes in case chroma planes
+ ; right
+ movzx r2d, byte [r3]
+ SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
+ movdq%2 [r4], xmm1 ; might not be 16-byte aligned in the case of chroma planes
- lea r0, [r0+r1] ; left pSrc
- lea r5, [r5+r1] ; left dst
- lea r3, [r3+r1] ; right pSrc
- lea r4, [r4+r1] ; right dst
+ lea r0, [r0+r1] ; left pSrc
+ lea r5, [r5+r1] ; left dst
+ lea r3, [r3+r1] ; right pSrc
+ lea r4, [r4+r1] ; right dst
- dec r6
- jnz near .left_right_loops
+ dec r6
+ jnz near .left_right_loops
%endif
%endmacro
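
exp_left_right_sse2 broadcasts the leftmost and rightmost pixel of each row across the padding columns (SSE2_Copy16Times builds the broadcast register). Its second parameter exists because the right-border destination of a chroma plane, pSrc + width, is not guaranteed to be 16-byte aligned, so the caller selects movdqa or movdqu. A small intrinsics sketch of the unaligned case (function and variable names are mine):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Pad the right border of one chroma row with its edge pixel. */
    static void pad_right_chroma_row(uint8_t *row, int width) {
        __m128i v = _mm_set1_epi8((char)row[width - 1]);   /* SSE2_Copy16Times  */
        _mm_storeu_si128((__m128i *)(row + width), v);      /* movdqu [r4], xmm1 */
    }
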
-%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
- ; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
- ; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
+%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
+ ; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
+ ; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
;r3:TL ,r4:TR,r5:BL,r6:BR r1:-stride
-%if %1 == 32 ; luma
- ; TL
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_end32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+%if %1 == 32 ; luma
+ ; TL
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_end32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- ; TR
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_end32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ ; TR
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_end32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- ; BL
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_end32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ ; BL
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_end32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- ; BR
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_end32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
-%elif %1 == 16 ; chroma
- ; TL
- mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
- mov_line_end16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
+ ; BR
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_end32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+%elif %1 == 16 ; chroma
+ ; TL
+ mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
+ mov_line_end16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
- ; TR
- mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
- mov_line_end16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
+ ; TR
+ mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
+ mov_line_end16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
- ; BL
- mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
- mov_line_end16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
+ ; BL
+ mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
+ mov_line_end16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
- ; BR
- mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
- mov_line_end16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
+ ; BR
+ mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
+ mov_line_end16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
%endif
%endmacro
;***********************************************************************----------------
-; void ExpandPictureLuma_sse2( uint8_t *pDst,
-; const int32_t iStride,
-; const int32_t iWidth,
-; const int32_t iHeight );
+; void ExpandPictureLuma_sse2( uint8_t *pDst,
+; const int32_t iStride,
+; const int32_t iWidth,
+; const int32_t iHeight );
;***********************************************************************----------------
WELS_EXTERN ExpandPictureLuma_sse2
@@ -403,8 +403,8 @@
exp_top_bottom_sse2 32
- ; for both left and right border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; for both left and right border
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
pop r2
pop r1
@@ -416,8 +416,8 @@
lea r4,[r3+1] ;right border dst
;prepare for cross border data: top-right with xmm4
- movzx r6d,byte [r3] ;top -rigth
- SSE2_Copy16Times xmm4,r6d
+ movzx r6d,byte [r3] ;top-right
+ SSE2_Copy16Times xmm4,r6d
neg r1 ;r1 = stride
@@ -438,8 +438,8 @@
pop r1
pop r0
- ; for cross border [top-left, top-right, bottom-left, bottom-right]
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; for cross border [top-left, top-right, bottom-left, bottom-right]
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, To be continued..
neg r1 ;r1 = -stride
@@ -472,13 +472,13 @@
%assign push_num 0
- ret
+ ret
;***********************************************************************----------------
-; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
-; const int32_t iStride,
-; const int32_t iWidth,
-; const int32_t iHeight );
+; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
+; const int32_t iStride,
+; const int32_t iWidth,
+; const int32_t iHeight );
;***********************************************************************----------------
WELS_EXTERN ExpandPictureChromaAlign_sse2
@@ -531,8 +531,8 @@
exp_top_bottom_sse2 16
- ; for both left and right border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; for both left and right border
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
pop r2
pop r1
@@ -557,7 +557,7 @@
push r0
push r1
push r2
- push r6
+ push r6
exp_left_right_sse2 16,a
pop r6
@@ -565,8 +565,8 @@
pop r1
pop r0
- ; for cross border [top-left, top-right, bottom-left, bottom-right]
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; for cross border [top-left, top-right, bottom-left, bottom-right]
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, To be continued..
neg r1 ;r1 = -stride
@@ -599,16 +599,16 @@
%assign push_num 0
- ret
+ ret
;***********************************************************************----------------
-; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
-; const int32_t iStride,
-; const int32_t iWidth,
-; const int32_t iHeight );
+; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
+; const int32_t iStride,
+; const int32_t iWidth,
+; const int32_t iHeight );
;***********************************************************************----------------
WELS_EXTERN ExpandPictureChromaUnalign_sse2
- push r4
+ push r4
push r5
push r6
@@ -657,8 +657,8 @@
exp_top_bottom_sse2 16
- ; for both left and right border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; for both left and right border
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
pop r2
pop r1
@@ -683,7 +683,7 @@
push r0
push r1
push r2
- push r6
+ push r6
exp_left_right_sse2 16,u
pop r6
@@ -691,8 +691,8 @@
pop r1
pop r0
- ; for cross border [top-left, top-right, bottom-left, bottom-right]
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; for cross border [top-left, top-right, bottom-left, bottom-right]
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, To be continued..
neg r1 ;r1 = -stride
@@ -725,4 +725,4 @@
%assign push_num 0
- ret
+ ret
--- a/codec/common/x86/mb_copy.asm
+++ b/codec/common/x86/mb_copy.asm
@@ -36,9 +36,9 @@
;*
;* History
;* 15/09/2009 Created
-;* 12/28/2009 Modified with larger throughput
-;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
-;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
+;* 12/28/2009 Modified with larger throughput
+;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
+;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
;*
;*
;*********************************************************************************************/
@@ -56,174 +56,174 @@
;***********************************************************************
-; void WelsCopy16x16_sse2( uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
+; void WelsCopy16x16_sse2( uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy16x16_sse2
- push r4
- push r5
- %assign push_num 2
+ push r4
+ push r5
+ %assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
- lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
- lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
+ lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
+ lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
- movdqa xmm0, [r2]
- movdqa xmm1, [r2+r3]
- movdqa xmm2, [r2+2*r3]
- movdqa xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqa xmm4, [r2]
- movdqa xmm5, [r2+r3]
- movdqa xmm6, [r2+2*r3]
- movdqa xmm7, [r2+r5]
- lea r2, [r2+4*r3]
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+2*r3]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+2*r3]
+ movdqa xmm7, [r2+r5]
+ lea r2, [r2+4*r3]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ lea r0, [r0+4*r1]
- movdqa xmm0, [r2]
- movdqa xmm1, [r2+r3]
- movdqa xmm2, [r2+2*r3]
- movdqa xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqa xmm4, [r2]
- movdqa xmm5, [r2+r3]
- movdqa xmm6, [r2+2*r3]
- movdqa xmm7, [r2+r5]
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+2*r3]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+2*r3]
+ movdqa xmm7, [r2+r5]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- POP_XMM
- LOAD_4_PARA_POP
- pop r5
- pop r4
- ret
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
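
WelsCopy16x16_sse2 above is a strided 16x16 byte block copy, unrolled four rows per load/store group with aligned 128-bit moves. A minimal scalar sketch of the same operation (the C name below is illustrative, not part of the codec) could be:

    #include <stdint.h>
    #include <string.h>

    /* Scalar equivalent of WelsCopy16x16_sse2: copy a 16x16 block between strided planes. */
    static void Copy16x16_c (uint8_t* pDst, int32_t iStrideD,
                             const uint8_t* pSrc, int32_t iStrideS) {
      for (int i = 0; i < 16; ++i) {
        memcpy (pDst, pSrc, 16);  /* one row; the SSE2 path uses movdqa loads/stores */
        pDst += iStrideD;
        pSrc += iStrideS;
      }
    }
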
;***********************************************************************
-; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
+; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
;***********************************************************************
; dst can be aligned to 16 bytes, but not sure about pSrc, 12/29/2011
WELS_EXTERN WelsCopy16x16NotAligned_sse2
- push r4
- push r5
- %assign push_num 2
+ push r4
+ push r5
+ %assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
- lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
- lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
+ lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
+ lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
- movdqu xmm0, [r2]
- movdqu xmm1, [r2+r3]
- movdqu xmm2, [r2+2*r3]
- movdqu xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqu xmm4, [r2]
- movdqu xmm5, [r2+r3]
- movdqu xmm6, [r2+2*r3]
- movdqu xmm7, [r2+r5]
- lea r2, [r2+4*r3]
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r2+r3]
+ movdqu xmm2, [r2+2*r3]
+ movdqu xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqu xmm4, [r2]
+ movdqu xmm5, [r2+r3]
+ movdqu xmm6, [r2+2*r3]
+ movdqu xmm7, [r2+r5]
+ lea r2, [r2+4*r3]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ lea r0, [r0+4*r1]
- movdqu xmm0, [r2]
- movdqu xmm1, [r2+r3]
- movdqu xmm2, [r2+2*r3]
- movdqu xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqu xmm4, [r2]
- movdqu xmm5, [r2+r3]
- movdqu xmm6, [r2+2*r3]
- movdqu xmm7, [r2+r5]
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r2+r3]
+ movdqu xmm2, [r2+2*r3]
+ movdqu xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqu xmm4, [r2]
+ movdqu xmm5, [r2+r3]
+ movdqu xmm6, [r2+2*r3]
+ movdqu xmm7, [r2+r5]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- POP_XMM
- LOAD_4_PARA_POP
- pop r5
- pop r4
- ret
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
; , 12/29/2011
;***********************************************************************
; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy16x8NotAligned_sse2
- push r4
- push r5
- %assign push_num 2
+ push r4
+ push r5
+ %assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
- lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
- lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
+ lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
+ lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
- movdqu xmm0, [r2]
- movdqu xmm1, [r2+r3]
- movdqu xmm2, [r2+2*r3]
- movdqu xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqu xmm4, [r2]
- movdqu xmm5, [r2+r3]
- movdqu xmm6, [r2+2*r3]
- movdqu xmm7, [r2+r5]
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r2+r3]
+ movdqu xmm2, [r2+2*r3]
+ movdqu xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqu xmm4, [r2]
+ movdqu xmm5, [r2+r3]
+ movdqu xmm6, [r2+2*r3]
+ movdqu xmm7, [r2+r5]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- POP_XMM
- LOAD_4_PARA_POP
- pop r5
- pop r4
- ret
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
;***********************************************************************
@@ -233,62 +233,62 @@
; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy8x16_mmx
- %assign push_num 0
+ %assign push_num 0
LOAD_4_PARA
- movq mm0, [r2]
- movq mm1, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm2, [r2]
- movq mm3, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm4, [r2]
- movq mm5, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm6, [r2]
- movq mm7, [r2+r3]
- lea r2, [r2+2*r3]
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+ movq mm7, [r2+r3]
+ lea r2, [r2+2*r3]
- movq [r0], mm0
- movq [r0+r1], mm1
- lea r0, [r0+2*r1]
- movq [r0], mm2
- movq [r0+r1], mm3
- lea r0, [r0+2*r1]
- movq [r0], mm4
- movq [r0+r1], mm5
- lea r0, [r0+2*r1]
- movq [r0], mm6
- movq [r0+r1], mm7
- lea r0, [r0+2*r1]
+ movq [r0], mm0
+ movq [r0+r1], mm1
+ lea r0, [r0+2*r1]
+ movq [r0], mm2
+ movq [r0+r1], mm3
+ lea r0, [r0+2*r1]
+ movq [r0], mm4
+ movq [r0+r1], mm5
+ lea r0, [r0+2*r1]
+ movq [r0], mm6
+ movq [r0+r1], mm7
+ lea r0, [r0+2*r1]
- movq mm0, [r2]
- movq mm1, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm2, [r2]
- movq mm3, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm4, [r2]
- movq mm5, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm6, [r2]
- movq mm7, [r2+r3]
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+ movq mm7, [r2+r3]
- movq [r0], mm0
- movq [r0+r1], mm1
- lea r0, [r0+2*r1]
- movq [r0], mm2
- movq [r0+r1], mm3
- lea r0, [r0+2*r1]
- movq [r0], mm4
- movq [r0+r1], mm5
- lea r0, [r0+2*r1]
- movq [r0], mm6
- movq [r0+r1], mm7
+ movq [r0], mm0
+ movq [r0+r1], mm1
+ lea r0, [r0+2*r1]
+ movq [r0], mm2
+ movq [r0+r1], mm3
+ lea r0, [r0+2*r1]
+ movq [r0], mm4
+ movq [r0+r1], mm5
+ lea r0, [r0+2*r1]
+ movq [r0], mm6
+ movq [r0+r1], mm7
- WELSEMMS
- LOAD_4_PARA_POP
- ret
+ WELSEMMS
+ LOAD_4_PARA_POP
+ ret
;***********************************************************************
; void WelsCopy8x8_mmx( uint8_t* Dst,
@@ -297,48 +297,48 @@
; int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy8x8_mmx
- push r4
- %assign push_num 1
+ push r4
+ %assign push_num 1
LOAD_4_PARA
- lea r4, [r3+2*r3] ;edx, [ebx+2*ebx]
+ lea r4, [r3+2*r3] ;edx, [ebx+2*ebx]
- ; to prefetch next loop
- prefetchnta [r2+2*r3]
- prefetchnta [r2+r4]
- movq mm0, [r2]
- movq mm1, [r2+r3]
- lea r2, [r2+2*r3]
- ; to prefetch next loop
- prefetchnta [r2+2*r3]
- prefetchnta [r2+r4]
- movq mm2, [r2]
- movq mm3, [r2+r3]
- lea r2, [r2+2*r3]
- ; to prefetch next loop
- prefetchnta [r2+2*r3]
- prefetchnta [r2+r4]
- movq mm4, [r2]
- movq mm5, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm6, [r2]
- movq mm7, [r2+r3]
+ ; to prefetch next loop
+ prefetchnta [r2+2*r3]
+ prefetchnta [r2+r4]
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ ; to prefetch next loop
+ prefetchnta [r2+2*r3]
+ prefetchnta [r2+r4]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ ; to prefetch next loop
+ prefetchnta [r2+2*r3]
+ prefetchnta [r2+r4]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+ movq mm7, [r2+r3]
- movq [r0], mm0
- movq [r0+r1], mm1
- lea r0, [r0+2*r1]
- movq [r0], mm2
- movq [r0+r1], mm3
- lea r0, [r0+2*r1]
- movq [r0], mm4
- movq [r0+r1], mm5
- lea r0, [r0+2*r1]
- movq [r0], mm6
- movq [r0+r1], mm7
+ movq [r0], mm0
+ movq [r0+r1], mm1
+ lea r0, [r0+2*r1]
+ movq [r0], mm2
+ movq [r0+r1], mm3
+ lea r0, [r0+2*r1]
+ movq [r0], mm4
+ movq [r0+r1], mm5
+ lea r0, [r0+2*r1]
+ movq [r0], mm6
+ movq [r0+r1], mm7
- WELSEMMS
- LOAD_4_PARA_POP
- pop r4
- ret
+ WELSEMMS
+ LOAD_4_PARA_POP
+ pop r4
+ ret
; (dunhuang@cisco), 12/21/2011
;***********************************************************************
@@ -349,13 +349,13 @@
%assign push_num 0
LOAD_2_PARA
- movd xmm0, r1d ; _mv
- pshufd xmm1, xmm0, $00
- movdqa [r0 ], xmm1
- movdqa [r0+0x10], xmm1
- movdqa [r0+0x20], xmm1
- movdqa [r0+0x30], xmm1
- ret
+ movd xmm0, r1d ; _mv
+ pshufd xmm1, xmm0, $00
+ movdqa [r0 ], xmm1
+ movdqa [r0+0x10], xmm1
+ movdqa [r0+0x20], xmm1
+ movdqa [r0+0x30], xmm1
+ ret
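
UpdateMbMv_sse2 above broadcasts a single 32-bit motion vector into 64 bytes of storage (four aligned 16-byte stores), i.e. into all sixteen 4x4-block MV slots of one macroblock. A hedged scalar sketch, treating the destination simply as sixteen 32-bit entries:

    #include <stdint.h>

    /* Fill 16 consecutive 32-bit MV slots with the same value (what the four movdqa stores do). */
    static void UpdateMbMv_c (uint32_t* pMvBuffer, uint32_t uiMv) {
      for (int i = 0; i < 16; ++i)
        pMvBuffer[i] = uiMv;
    }
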
;*******************************************************************************
; Macros and other preprocessor constants
@@ -381,14 +381,14 @@
%assign push_num 0
LOAD_7_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
- SIGN_EXTENSION r6, r6d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+ SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
- movd mm0, [r4]
+ movd mm0, [r4]
pavgb mm0, [r2]
movd [r0], mm0
@@ -398,8 +398,8 @@
lea r4, [r4+r5]
jne .height_loop
- WELSEMMS
- LOAD_7_PARA_POP
+ WELSEMMS
+ LOAD_7_PARA_POP
ret
@@ -413,29 +413,29 @@
%assign push_num 0
LOAD_7_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
- SIGN_EXTENSION r6, r6d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+ SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
- movq mm0, [r2]
+ movq mm0, [r2]
pavgb mm0, [r4]
movq [r0], mm0
movq mm0, [r2+r3]
pavgb mm0, [r4+r5]
- movq [r0+r1], mm0
+ movq [r0+r1], mm0
- lea r2, [r2+2*r3]
- lea r4, [r4+2*r5]
- lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r4, [r4+2*r5]
+ lea r0, [r0+2*r1]
sub r6, 2
jnz .height_loop
- WELSEMMS
- LOAD_7_PARA_POP
+ WELSEMMS
+ LOAD_7_PARA_POP
ret
@@ -450,46 +450,46 @@
%assign push_num 0
LOAD_7_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
- SIGN_EXTENSION r6, r6d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+ SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
- movdqu xmm0, [r2]
- movdqu xmm1, [r4]
- pavgb xmm0, xmm1
- ;pavgb xmm0, [r4]
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r4]
+ pavgb xmm0, xmm1
+ ;pavgb xmm0, [r4]
movdqu [r0], xmm0
- movdqu xmm0, [r2+r3]
- movdqu xmm1, [r4+r5]
- pavgb xmm0, xmm1
+ movdqu xmm0, [r2+r3]
+ movdqu xmm1, [r4+r5]
+ pavgb xmm0, xmm1
movdqu [r0+r1], xmm0
- movdqu xmm0, [r2+2*r3]
- movdqu xmm1, [r4+2*r5]
- pavgb xmm0, xmm1
+ movdqu xmm0, [r2+2*r3]
+ movdqu xmm1, [r4+2*r5]
+ pavgb xmm0, xmm1
movdqu [r0+2*r1], xmm0
lea r2, [r2+2*r3]
- lea r4, [r4+2*r5]
- lea r0, [r0+2*r1]
+ lea r4, [r4+2*r5]
+ lea r0, [r0+2*r1]
- movdqu xmm0, [r2+r3]
- movdqu xmm1, [r4+r5]
- pavgb xmm0, xmm1
+ movdqu xmm0, [r2+r3]
+ movdqu xmm1, [r4+r5]
+ pavgb xmm0, xmm1
movdqu [r0+r1], xmm0
lea r2, [r2+2*r3]
- lea r4, [r4+2*r5]
- lea r0, [r0+2*r1]
+ lea r4, [r4+2*r5]
+ lea r0, [r0+2*r1]
sub r6, 4
jne .height_loop
- WELSEMMS
- LOAD_7_PARA_POP
+ WELSEMMS
+ LOAD_7_PARA_POP
ret
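
The PixelAvg* routines in this file all reduce to the per-byte rounded average that pavgb implements. A scalar sketch of the width-16 variant (argument order follows the seven-parameter LOAD_7_PARA convention above; the names are illustrative):

    #include <stdint.h>

    /* Rounded bi-prediction average of two 16-pixel-wide blocks, the scalar meaning of pavgb. */
    static void PixelAvgWidthEq16_c (uint8_t* pDst, int32_t iDstStride,
                                     const uint8_t* pSrcA, int32_t iSrcAStride,
                                     const uint8_t* pSrcB, int32_t iSrcBStride,
                                     int32_t iHeight) {
      for (int y = 0; y < iHeight; ++y) {
        for (int x = 0; x < 16; ++x)
          pDst[x] = (uint8_t) ((pSrcA[x] + pSrcB[x] + 1) >> 1);  /* pavgb rounds upward */
        pDst  += iDstStride;
        pSrcA += iSrcAStride;
        pSrcB += iSrcBStride;
      }
    }
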
;*******************************************************************************
@@ -497,26 +497,26 @@
; uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
WELS_EXTERN McCopyWidthEq4_mmx
- push r5
+ push r5
%assign push_num 1
LOAD_5_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
- mov r5d, [r0]
- mov [r2], r5d
+ mov r5d, [r0]
+ mov [r2], r5d
- add r0, r1
- add r2, r3
- dec r4
- jnz .height_loop
- WELSEMMS
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .height_loop
+ WELSEMMS
LOAD_5_PARA_POP
- pop r5
+ pop r5
ret
;*******************************************************************************
@@ -527,21 +527,21 @@
%assign push_num 0
LOAD_5_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
- movq mm0, [r0]
- movq [r2], mm0
- add r0, r1
- add r2, r3
- dec r4
- jnz .height_loop
+ movq mm0, [r0]
+ movq [r2], mm0
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .height_loop
- WELSEMMS
- LOAD_5_PARA_POP
+ WELSEMMS
+ LOAD_5_PARA_POP
ret
@@ -550,32 +550,32 @@
;*******************************************************************************
;read unaligned memory
%macro SSE_READ_UNA 2
- movq %1, [%2]
- movhps %1, [%2+8]
+ movq %1, [%2]
+ movhps %1, [%2+8]
%endmacro
;write unaligned memory
%macro SSE_WRITE_UNA 2
- movq [%1], %2
- movhps [%1+8], %2
+ movq [%1], %2
+ movhps [%1+8], %2
%endmacro
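
SSE_READ_UNA and SSE_WRITE_UNA above split an unaligned 16-byte access into two 8-byte halves, movq for the low quadword and movhps for the high one, which avoids the alignment requirement of movdqa; whether this beats a plain movdqu depends on the microarchitecture. Roughly, one read/write pair behaves like:

    #include <stdint.h>
    #include <string.h>

    /* Unaligned 16-byte copy done as two 8-byte halves, mirroring SSE_READ_UNA + SSE_WRITE_UNA. */
    static void CopyUna16 (uint8_t* dst, const uint8_t* src) {
      memcpy (dst,     src,     8);  /* movq  : low 64 bits  */
      memcpy (dst + 8, src + 8, 8);  /* movhps: high 64 bits */
    }
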
WELS_EXTERN McCopyWidthEq16_sse2
%assign push_num 0
LOAD_5_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
- SSE_READ_UNA xmm0, r0
- SSE_READ_UNA xmm1, r0+r1
- SSE_WRITE_UNA r2, xmm0
- SSE_WRITE_UNA r2+r3, xmm1
+ SSE_READ_UNA xmm0, r0
+ SSE_READ_UNA xmm1, r0+r1
+ SSE_WRITE_UNA r2, xmm0
+ SSE_WRITE_UNA r2+r3, xmm1
- sub r4, 2
+ sub r4, 2
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
jnz .height_loop
- LOAD_5_PARA_POP
+ LOAD_5_PARA_POP
ret
--- a/codec/common/x86/mc_chroma.asm
+++ b/codec/common/x86/mc_chroma.asm
@@ -53,10 +53,10 @@
ALIGN 16
h264_d0x20_sse2:
- dw 32,32,32,32,32,32,32,32
+ dw 32,32,32,32,32,32,32,32
ALIGN 16
h264_d0x20_mmx:
- dw 32,32,32,32
+ dw 32,32,32,32
;=============================================================================
@@ -67,152 +67,152 @@
;*******************************************************************************
; void McChromaWidthEq4_mmx( const uint8_t *src,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; const uint8_t *pABCD,
-; int32_t iHeigh );
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; const uint8_t *pABCD,
+; int32_t iHeigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
- %assign push_num 0
- LOAD_6_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
+ %assign push_num 0
+ LOAD_6_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
- movd mm3, [r4]; [eax]
- WELS_Zero mm7
- punpcklbw mm3, mm3
- movq mm4, mm3
- punpcklwd mm3, mm3
- punpckhwd mm4, mm4
+ movd mm3, [r4]; [eax]
+ WELS_Zero mm7
+ punpcklbw mm3, mm3
+ movq mm4, mm3
+ punpcklwd mm3, mm3
+ punpckhwd mm4, mm4
- movq mm5, mm3
- punpcklbw mm3, mm7
- punpckhbw mm5, mm7
+ movq mm5, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm5, mm7
- movq mm6, mm4
- punpcklbw mm4, mm7
- punpckhbw mm6, mm7
+ movq mm6, mm4
+ punpcklbw mm4, mm7
+ punpckhbw mm6, mm7
- lea r4, [r0 + r1] ;lea ebx, [esi + eax]
- movd mm0, [r0]
- movd mm1, [r0+1]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
+ lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+ movd mm0, [r0]
+ movd mm1, [r0+1]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
.xloop:
- pmullw mm0, mm3
- pmullw mm1, mm5
- paddw mm0, mm1
+ pmullw mm0, mm3
+ pmullw mm1, mm5
+ paddw mm0, mm1
- movd mm1, [r4]
- punpcklbw mm1, mm7
- movq mm2, mm1
- pmullw mm1, mm4
- paddw mm0, mm1
+ movd mm1, [r4]
+ punpcklbw mm1, mm7
+ movq mm2, mm1
+ pmullw mm1, mm4
+ paddw mm0, mm1
- movd mm1, [r4+1]
- punpcklbw mm1, mm7
- movq mm7, mm1
- pmullw mm1,mm6
- paddw mm0, mm1
- movq mm1,mm7
+ movd mm1, [r4+1]
+ punpcklbw mm1, mm7
+ movq mm7, mm1
+ pmullw mm1,mm6
+ paddw mm0, mm1
+ movq mm1,mm7
- paddw mm0, [h264_d0x20_mmx]
- psrlw mm0, 6
+ paddw mm0, [h264_d0x20_mmx]
+ psrlw mm0, 6
- WELS_Zero mm7
- packuswb mm0, mm7
- movd [r2], mm0
+ WELS_Zero mm7
+ packuswb mm0, mm7
+ movd [r2], mm0
- movq mm0, mm2
+ movq mm0, mm2
- lea r2, [r2 + r3]
- lea r4, [r4 + r1]
+ lea r2, [r2 + r3]
+ lea r4, [r4 + r1]
- dec r5
- jnz near .xloop
- WELSEMMS
- LOAD_6_PARA_POP
- ret
+ dec r5
+ jnz near .xloop
+ WELSEMMS
+ LOAD_6_PARA_POP
+ ret
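
McChromaWidthEq4_mmx (and the wider SSE2/SSSE3 variants that follow) implements the H.264 chroma quarter-sample interpolation: pABCD holds four precomputed bilinear weights that sum to 64, and the h264_d0x20_* constants above provide the +32 rounding term for the final >>6. A scalar sketch of the width-4 case:

    #include <stdint.h>

    /* Bilinear chroma interpolation, scalar form of McChromaWidthEq4_mmx.
       pABCD[0..3] are the weights for the four neighbouring samples. */
    static void McChromaWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride,
                                    uint8_t* pDst, int32_t iDstStride,
                                    const uint8_t* pABCD, int32_t iHeight) {
      const int a = pABCD[0], b = pABCD[1], c = pABCD[2], d = pABCD[3];
      for (int y = 0; y < iHeight; ++y) {
        for (int x = 0; x < 4; ++x)
          pDst[x] = (uint8_t) ((a * pSrc[x]              + b * pSrc[x + 1] +
                                c * pSrc[x + iSrcStride] + d * pSrc[x + iSrcStride + 1] +
                                32) >> 6);
        pSrc += iSrcStride;
        pDst += iDstStride;
      }
    }
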
;*******************************************************************************
; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; const uint8_t *pABCD,
-; int32_t iheigh );
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; const uint8_t *pABCD,
+; int32_t iheigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
- %assign push_num 0
- LOAD_6_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
- movd xmm3, [r4]
- WELS_Zero xmm7
- punpcklbw xmm3, xmm3
- punpcklwd xmm3, xmm3
+ movd xmm3, [r4]
+ WELS_Zero xmm7
+ punpcklbw xmm3, xmm3
+ punpcklwd xmm3, xmm3
- movdqa xmm4, xmm3
- punpckldq xmm3, xmm3
- punpckhdq xmm4, xmm4
- movdqa xmm5, xmm3
- movdqa xmm6, xmm4
+ movdqa xmm4, xmm3
+ punpckldq xmm3, xmm3
+ punpckhdq xmm4, xmm4
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm4
- punpcklbw xmm3, xmm7
- punpckhbw xmm5, xmm7
- punpcklbw xmm4, xmm7
- punpckhbw xmm6, xmm7
+ punpcklbw xmm3, xmm7
+ punpckhbw xmm5, xmm7
+ punpcklbw xmm4, xmm7
+ punpckhbw xmm6, xmm7
- lea r4, [r0 + r1] ;lea ebx, [esi + eax]
- movq xmm0, [r0]
- movq xmm1, [r0+1]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
+ lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+ movq xmm0, [r0]
+ movq xmm1, [r0+1]
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
.xloop:
- pmullw xmm0, xmm3
- pmullw xmm1, xmm5
- paddw xmm0, xmm1
+ pmullw xmm0, xmm3
+ pmullw xmm1, xmm5
+ paddw xmm0, xmm1
- movq xmm1, [r4]
- punpcklbw xmm1, xmm7
- movdqa xmm2, xmm1
- pmullw xmm1, xmm4
- paddw xmm0, xmm1
+ movq xmm1, [r4]
+ punpcklbw xmm1, xmm7
+ movdqa xmm2, xmm1
+ pmullw xmm1, xmm4
+ paddw xmm0, xmm1
- movq xmm1, [r4+1]
- punpcklbw xmm1, xmm7
- movdqa xmm7, xmm1
- pmullw xmm1, xmm6
- paddw xmm0, xmm1
- movdqa xmm1,xmm7
+ movq xmm1, [r4+1]
+ punpcklbw xmm1, xmm7
+ movdqa xmm7, xmm1
+ pmullw xmm1, xmm6
+ paddw xmm0, xmm1
+ movdqa xmm1,xmm7
- paddw xmm0, [h264_d0x20_sse2]
- psrlw xmm0, 6
+ paddw xmm0, [h264_d0x20_sse2]
+ psrlw xmm0, 6
- WELS_Zero xmm7
- packuswb xmm0, xmm7
- movq [r2], xmm0
+ WELS_Zero xmm7
+ packuswb xmm0, xmm7
+ movq [r2], xmm0
- movdqa xmm0, xmm2
+ movdqa xmm0, xmm2
- lea r2, [r2 + r3]
- lea r4, [r4 + r1]
+ lea r2, [r2 + r3]
+ lea r4, [r4 + r1]
- dec r5
- jnz near .xloop
+ dec r5
+ jnz near .xloop
- POP_XMM
- LOAD_6_PARA_POP
+ POP_XMM
+ LOAD_6_PARA_POP
- ret
+ ret
@@ -219,19 +219,19 @@
;***********************************************************************
; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
-; int32_t iSrcStride,
+; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; const uint8_t *pABCD,
-; int32_t iHeigh);
+; int32_t iHeigh);
;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3
- %assign push_num 0
- LOAD_6_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
pxor xmm7, xmm7
movd xmm5, [r4]
@@ -243,27 +243,27 @@
sub r2, r3 ;sub esi, edi
sub r2, r3
- movdqa xmm7, [h264_d0x20_sse2]
+ movdqa xmm7, [h264_d0x20_sse2]
- movdqu xmm0, [r0]
- movdqa xmm1, xmm0
- psrldq xmm1, 1
- punpcklbw xmm0, xmm1
+ movdqu xmm0, [r0]
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+ punpcklbw xmm0, xmm1
.hloop_chroma:
- lea r2, [r2+2*r3]
+ lea r2, [r2+2*r3]
- movdqu xmm2, [r0+r1]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm4, xmm2
+ movdqu xmm2, [r0+r1]
+ movdqa xmm3, xmm2
+ psrldq xmm3, 1
+ punpcklbw xmm2, xmm3
+ movdqa xmm4, xmm2
pmaddubsw xmm0, xmm5
pmaddubsw xmm2, xmm6
paddw xmm0, xmm2
paddw xmm0, xmm7
- psrlw xmm0, 6
+ psrlw xmm0, 6
packuswb xmm0, xmm0
movq [r2],xmm0
@@ -278,16 +278,16 @@
pmaddubsw xmm2, xmm6
paddw xmm4, xmm2
paddw xmm4, xmm7
- psrlw xmm4, 6
+ psrlw xmm4, 6
packuswb xmm4, xmm4
movq [r2+r3],xmm4
- sub r5, 2
- jnz .hloop_chroma
+ sub r5, 2
+ jnz .hloop_chroma
- POP_XMM
- LOAD_6_PARA_POP
+ POP_XMM
+ LOAD_6_PARA_POP
- ret
+ ret
--- a/codec/common/x86/mc_luma.asm
+++ b/codec/common/x86/mc_luma.asm
@@ -52,13 +52,13 @@
ALIGN 16
h264_w0x10:
- dw 16, 16, 16, 16
+ dw 16, 16, 16, 16
ALIGN 16
h264_w0x10_1:
- dw 16, 16, 16, 16, 16, 16, 16, 16
+ dw 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 16
h264_mc_hc_32:
- dw 32, 32, 32, 32, 32, 32, 32, 32
+ dw 32, 32, 32, 32, 32, 32, 32, 32
;*******************************************************************************
@@ -72,55 +72,55 @@
;*******************************************************************************
; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-; int iHeight)
+; uint8_t *pDst,
+; int iDstStride,
+; int iHeight)
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq4_mmx
%assign push_num 0
LOAD_5_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
- sub r0, 2
- WELS_Zero mm7
- movq mm6, [h264_w0x10]
+ sub r0, 2
+ WELS_Zero mm7
+ movq mm6, [h264_w0x10]
.height_loop:
- movd mm0, [r0]
- punpcklbw mm0, mm7
- movd mm1, [r0+5]
- punpcklbw mm1, mm7
- movd mm2, [r0+1]
- punpcklbw mm2, mm7
- movd mm3, [r0+4]
- punpcklbw mm3, mm7
- movd mm4, [r0+2]
- punpcklbw mm4, mm7
- movd mm5, [r0+3]
- punpcklbw mm5, mm7
+ movd mm0, [r0]
+ punpcklbw mm0, mm7
+ movd mm1, [r0+5]
+ punpcklbw mm1, mm7
+ movd mm2, [r0+1]
+ punpcklbw mm2, mm7
+ movd mm3, [r0+4]
+ punpcklbw mm3, mm7
+ movd mm4, [r0+2]
+ punpcklbw mm4, mm7
+ movd mm5, [r0+3]
+ punpcklbw mm5, mm7
- paddw mm2, mm3
- paddw mm4, mm5
- psllw mm4, 2
- psubw mm4, mm2
- paddw mm0, mm1
- paddw mm0, mm4
- psllw mm4, 2
- paddw mm0, mm4
- paddw mm0, mm6
- psraw mm0, 5
- packuswb mm0, mm7
- movd [r2], mm0
+ paddw mm2, mm3
+ paddw mm4, mm5
+ psllw mm4, 2
+ psubw mm4, mm2
+ paddw mm0, mm1
+ paddw mm0, mm4
+ psllw mm4, 2
+ paddw mm0, mm4
+ paddw mm0, mm6
+ psraw mm0, 5
+ packuswb mm0, mm7
+ movd [r2], mm0
- add r0, r1
- add r2, r3
- dec r4
- jnz .height_loop
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .height_loop
- WELSEMMS
- LOAD_5_PARA_POP
- ret
+ WELSEMMS
+ LOAD_5_PARA_POP
+ ret
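
The paddw/psllw/psubw sequence in McHorVer20WidthEq4_mmx evaluates the H.264 six-tap half-sample filter (1, -5, 20, 20, -5, 1) without multiplies: 4*(c+d)-(b+e) is formed once, added, shifted left by two and added again, which yields 20*(c+d)-5*(b+e). With the +16 from h264_w0x10 and the arithmetic shift by 5, each output pixel is, in scalar terms:

    #include <stdint.h>

    static inline uint8_t Clip255 (int v) {
      return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));   /* packuswb saturation */
    }

    /* One horizontal half-sample value from the six taps around position p[0]. */
    static inline uint8_t HorFilterOnePixel (const uint8_t* p) {
      int iSum = (p[-2] + p[3]) - 5 * (p[-1] + p[2]) + 20 * (p[0] + p[1]);
      return Clip255 ((iSum + 16) >> 5);
    }
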
;*******************************************************************************
; Macros and other preprocessor constants
@@ -128,26 +128,26 @@
%macro SSE_LOAD_8P 3
- movq %1, %3
- punpcklbw %1, %2
+ movq %1, %3
+ punpcklbw %1, %2
%endmacro
%macro FILTER_HV_W8 9
- paddw %1, %6
- movdqa %8, %3
- movdqa %7, %2
- paddw %1, [h264_w0x10_1]
- paddw %8, %4
- paddw %7, %5
- psllw %8, 2
- psubw %8, %7
- paddw %1, %8
- psllw %8, 2
- paddw %1, %8
- psraw %1, 5
- WELS_Zero %8
- packuswb %1, %8
- movq %9, %1
+ paddw %1, %6
+ movdqa %8, %3
+ movdqa %7, %2
+ paddw %1, [h264_w0x10_1]
+ paddw %8, %4
+ paddw %7, %5
+ psllw %8, 2
+ psubw %8, %7
+ paddw %1, %8
+ psllw %8, 2
+ paddw %1, %8
+ psraw %1, 5
+ WELS_Zero %8
+ packuswb %1, %8
+ movq %9, %1
%endmacro
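
FILTER_HV_W8 is the vertical counterpart of the same kernel, applied to eight 16-bit columns held in registers %1..%6: with r0..r5 denoting the six source rows at one column, it produces clip8(((r0 + r5) - 5*(r1 + r4) + 20*(r2 + r3) + 16) >> 5), the +16 coming from h264_w0x10_1 and the clip from packuswb.
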
;*******************************************************************************
@@ -159,192 +159,192 @@
;***********************************************************************
; void McHorVer22Width8HorFirst_sse2(const int16_t *pSrc,
; int16_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride
-; int32_t iHeight
+; uint8_t *pDst,
+; int32_t iDstStride
+; int32_t iHeight
; )
;***********************************************************************
WELS_EXTERN McHorVer22Width8HorFirst_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- pxor xmm7, xmm7
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ pxor xmm7, xmm7
- sub r0, r1 ;;;;;;;;need more 5 lines.
- sub r0, r1
+ sub r0, r1 ;;;;;;;;need 5 more lines.
+ sub r0, r1
.yloop_width_8:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- movdqa [r2], xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ movdqa [r2], xmm0
- add r0, r1
- add r2, r3
- dec r4
- jnz .yloop_width_8
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .yloop_width_8
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
;*******************************************************************************
; void McHorVer20WidthEq8_sse2( const uint8_t *pSrc,
; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-; int iHeight,
+; uint8_t *pDst,
+; int iDstStride,
+; int iHeight,
; );
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq8_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- lea r0, [r0-2] ;pSrc -= 2;
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ lea r0, [r0-2] ;pSrc -= 2;
- pxor xmm7, xmm7
- movdqa xmm6, [h264_w0x10_1]
+ pxor xmm7, xmm7
+ movdqa xmm6, [h264_w0x10_1]
.y_loop:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, xmm6
+ psraw xmm0, 5
- packuswb xmm0, xmm7
- movq [r2], xmm0
+ packuswb xmm0, xmm7
+ movq [r2], xmm0
- lea r2, [r2+r3]
- lea r0, [r0+r1]
- dec r4
- jnz near .y_loop
+ lea r2, [r2+r3]
+ lea r0, [r0+r1]
+ dec r4
+ jnz near .y_loop
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
;*******************************************************************************
; void McHorVer20WidthEq16_sse2( const uint8_t *pSrc,
; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-; int iHeight,
+; uint8_t *pDst,
+; int iDstStride,
+; int iHeight,
; );
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq16_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- lea r0, [r0-2] ;pSrc -= 2;
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ lea r0, [r0-2] ;pSrc -= 2;
- pxor xmm7, xmm7
- movdqa xmm6, [h264_w0x10_1]
+ pxor xmm7, xmm7
+ movdqa xmm6, [h264_w0x10_1]
.y_loop:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
- packuswb xmm0, xmm7
- movq [r2], xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, xmm6
+ psraw xmm0, 5
+ packuswb xmm0, xmm7
+ movq [r2], xmm0
- movq xmm0, [r0+8]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5+8]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1+8]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4+8]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2+8]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3+8]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0+8]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5+8]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1+8]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4+8]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2+8]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3+8]
+ punpcklbw xmm5, xmm7
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
- packuswb xmm0, xmm7
- movq [r2+8], xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, xmm6
+ psraw xmm0, 5
+ packuswb xmm0, xmm7
+ movq [r2+8], xmm0
- lea r2, [r2+r3]
- lea r0, [r0+r1]
- dec r4
- jnz near .y_loop
+ lea r2, [r2+r3]
+ lea r0, [r0+r1]
+ dec r4
+ jnz near .y_loop
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
;*******************************************************************************
@@ -355,81 +355,81 @@
; int iHeight )
;*******************************************************************************
WELS_EXTERN McHorVer02WidthEq8_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- sub r0, r1
- sub r0, r1
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ sub r0, r1
+ sub r0, r1
- WELS_Zero xmm7
+ WELS_Zero xmm7
- SSE_LOAD_8P xmm0, xmm7, [r0]
- SSE_LOAD_8P xmm1, xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm2, xmm7, [r0]
- SSE_LOAD_8P xmm3, xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm4, xmm7, [r0]
- SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+ SSE_LOAD_8P xmm0, xmm7, [r0]
+ SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm7, [r0]
+ SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm7, [r0]
+ SSE_LOAD_8P xmm5, xmm7, [r0+r1]
.start:
- FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r4
- jz near .xx_exit
+ FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r4
+ jz near .xx_exit
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm6, xmm7, [r0]
- FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
- dec r4
- jz near .xx_exit
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm6, xmm7, [r0]
+ FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+ dec r4
+ jz near .xx_exit
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm7, xmm0, [r0+r1]
- FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
- dec r4
- jz near .xx_exit
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+ FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r4
+ jz near .xx_exit
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm0, xmm1, [r0]
- FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
- dec r4
- jz near .xx_exit
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm0, xmm1, [r0]
+ FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+ dec r4
+ jz near .xx_exit
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm1, xmm2, [r0+r1]
- FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
- dec r4
- jz near .xx_exit
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+ FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+ dec r4
+ jz near .xx_exit
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm2, xmm3, [r0]
- FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
- dec r4
- jz near .xx_exit
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm3, [r0]
+ FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+ dec r4
+ jz near .xx_exit
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm3, xmm4, [r0+r1]
- FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
- dec r4
- jz near .xx_exit
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+ FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+ dec r4
+ jz near .xx_exit
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm4, xmm5, [r0]
- FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
- dec r4
- jz near .xx_exit
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm5, [r0]
+ FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+ dec r4
+ jz near .xx_exit
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm5, xmm6, [r0+r1]
- jmp near .start
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+ jmp near .start
.xx_exit:
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
;***********************************************************************
; Code
@@ -440,725 +440,725 @@
;***********************************************************************
-; void McHorVer02Height9Or17_sse2( const uint8_t *pSrc,
+; void McHorVer02Height9Or17_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
-; int32_t iWidth,
+; int32_t iWidth,
; int32_t iHeight )
;***********************************************************************
WELS_EXTERN McHorVer02Height9Or17_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
%ifndef X86_32
- push r12
- push r13
- push r14
- mov r12, r0
- mov r13, r2
- mov r14, r5
+ push r12
+ push r13
+ push r14
+ mov r12, r0
+ mov r13, r2
+ mov r14, r5
%endif
- shr r4, 3
- sub r0, r1
- sub r0, r1
+ shr r4, 3
+ sub r0, r1
+ sub r0, r1
.xloop:
- WELS_Zero xmm7
- SSE_LOAD_8P xmm0, xmm7, [r0]
- SSE_LOAD_8P xmm1, xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm2, xmm7, [r0]
- SSE_LOAD_8P xmm3, xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm4, xmm7, [r0]
- SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+ WELS_Zero xmm7
+ SSE_LOAD_8P xmm0, xmm7, [r0]
+ SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm7, [r0]
+ SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm7, [r0]
+ SSE_LOAD_8P xmm5, xmm7, [r0+r1]
- FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm6, xmm7, [r0]
- movdqa xmm0,xmm1
- movdqa xmm1,xmm2
- movdqa xmm2,xmm3
- movdqa xmm3,xmm4
- movdqa xmm4,xmm5
- movdqa xmm5,xmm6
- add r2, r3
- sub r0, r1
+ FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm6, xmm7, [r0]
+ movdqa xmm0,xmm1
+ movdqa xmm1,xmm2
+ movdqa xmm2,xmm3
+ movdqa xmm3,xmm4
+ movdqa xmm4,xmm5
+ movdqa xmm5,xmm6
+ add r2, r3
+ sub r0, r1
.start:
- FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- jz near .x_loop_dec
+ FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm6, xmm7, [r0]
- FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm6, xmm7, [r0]
+ FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm7, xmm0, [r0+r1]
- FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+ FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm0, xmm1, [r0]
- FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm0, xmm1, [r0]
+ FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm1, xmm2, [r0+r1]
- FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+ FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm2, xmm3, [r0]
- FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm3, [r0]
+ FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm3, xmm4, [r0+r1]
- FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+ FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm4, xmm5, [r0]
- FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm5, [r0]
+ FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm5, xmm6, [r0+r1]
- jmp near .start
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+ jmp near .start
.x_loop_dec:
- dec r4
- jz near .xx_exit
+ dec r4
+ jz near .xx_exit
%ifdef X86_32
- mov r0, arg1
- mov r2, arg3
- mov r5, arg6
+ mov r0, arg1
+ mov r2, arg3
+ mov r5, arg6
%else
- mov r0, r12
- mov r2, r13
- mov r5, r14
+ mov r0, r12
+ mov r2, r13
+ mov r5, r14
%endif
- sub r0, r1
- sub r0, r1
- add r0, 8
- add r2, 8
- jmp near .xloop
+ sub r0, r1
+ sub r0, r1
+ add r0, 8
+ add r2, 8
+ jmp near .xloop
.xx_exit:
%ifndef X86_32
- pop r14
- pop r13
- pop r12
+ pop r14
+ pop r13
+ pop r12
%endif
- POP_XMM
- LOAD_6_PARA_POP
- ret
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
;***********************************************************************
-; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc,
+; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight
+; uint8_t *pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight
; );
;***********************************************************************
WELS_EXTERN McHorVer20Width9Or17_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
- sub r0, 2
- pxor xmm7, xmm7
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+ sub r0, 2
+ pxor xmm7, xmm7
- cmp r4, 9
- jne near .width_17
+ cmp r4, 9
+ jne near .width_17
.yloop_width_9:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- paddw xmm0, [h264_w0x10_1]
- psraw xmm0, 5
- packuswb xmm0, xmm0
- movd [r2], xmm0
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ paddw xmm0, [h264_w0x10_1]
+ psraw xmm0, 5
+ packuswb xmm0, xmm0
+ movd [r2], xmm0
- pxor xmm7, xmm7
- movq xmm0, [r0+6]
- punpcklbw xmm0, xmm7
+ pxor xmm7, xmm7
+ movq xmm0, [r0+6]
+ punpcklbw xmm0, xmm7
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- paddw xmm2, [h264_w0x10_1]
- psraw xmm2, 5
- packuswb xmm2, xmm2
- movq [r2+1], xmm2
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ paddw xmm2, [h264_w0x10_1]
+ psraw xmm2, 5
+ packuswb xmm2, xmm2
+ movq [r2+1], xmm2
- add r0, r1
- add r2, r3
- dec r5
- jnz .yloop_width_9
- POP_XMM
- LOAD_6_PARA_POP
- ret
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_9
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
.width_17:
.yloop_width_17:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, [h264_w0x10_1]
- psraw xmm0, 5
- packuswb xmm0, xmm0
- movq [r2], xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, [h264_w0x10_1]
+ psraw xmm0, 5
+ packuswb xmm0, xmm0
+ movq [r2], xmm0
- movq xmm0, [r0+8]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5+8]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1+8]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4+8]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2+8]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3+8]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0+8]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5+8]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1+8]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4+8]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2+8]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3+8]
+ punpcklbw xmm5, xmm7
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- paddw xmm0, [h264_w0x10_1]
- psraw xmm0, 5
- packuswb xmm0, xmm0
- movd [r2+8], xmm0
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ paddw xmm0, [h264_w0x10_1]
+ psraw xmm0, 5
+ packuswb xmm0, xmm0
+ movd [r2+8], xmm0
- pxor xmm7, xmm7
- movq xmm0, [r0+6+8]
- punpcklbw xmm0, xmm7
+ pxor xmm7, xmm7
+ movq xmm0, [r0+6+8]
+ punpcklbw xmm0, xmm7
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- paddw xmm2, [h264_w0x10_1]
- psraw xmm2, 5
- packuswb xmm2, xmm2
- movq [r2+9], xmm2
- add r0, r1
- add r2, r3
- dec r5
- jnz .yloop_width_17
- POP_XMM
- LOAD_6_PARA_POP
- ret
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ paddw xmm2, [h264_w0x10_1]
+ psraw xmm2, 5
+ packuswb xmm2, xmm2
+ movq [r2+9], xmm2
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_17
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
;***********************************************************************
;void McHorVer22HorFirst_sse2
-; (const uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t * pTap,
-; int32_t iTapStride,
-; int32_t iWidth,int32_t iHeight);
+; (const uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t * pTap,
+; int32_t iTapStride,
+; int32_t iWidth,int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22HorFirst_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
- pxor xmm7, xmm7
- sub r0, r1 ;;;;;;;;need more 5 lines.
- sub r0, r1
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+ pxor xmm7, xmm7
+ sub r0, r1 ;;;;;;;;need 5 more lines.
+ sub r0, r1
- cmp r4, 9
- jne near .width_17
+ cmp r4, 9
+ jne near .width_17
.yloop_width_9:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- movd [r2], xmm0
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ movd [r2], xmm0
- pxor xmm7, xmm7
- movq xmm0, [r0+6]
- punpcklbw xmm0, xmm7
+ pxor xmm7, xmm7
+ movq xmm0, [r0+6]
+ punpcklbw xmm0, xmm7
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- movq [r2+2], xmm2
- movhps [r2+2+8], xmm2
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ movq [r2+2], xmm2
+ movhps [r2+2+8], xmm2
- add r0, r1
- add r2, r3
- dec r5
- jnz .yloop_width_9
- POP_XMM
- LOAD_6_PARA_POP
- ret
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_9
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
.width_17:
.yloop_width_17:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- movdqa [r2], xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ movdqa [r2], xmm0
- movq xmm0, [r0+8]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5+8]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1+8]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4+8]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2+8]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3+8]
- punpcklbw xmm5, xmm7
+ movq xmm0, [r0+8]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5+8]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1+8]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4+8]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2+8]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3+8]
+ punpcklbw xmm5, xmm7
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- movd [r2+16], xmm0
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ movd [r2+16], xmm0
- pxor xmm7, xmm7
- movq xmm0, [r0+6+8]
- punpcklbw xmm0, xmm7
+ pxor xmm7, xmm7
+ movq xmm0, [r0+6+8]
+ punpcklbw xmm0, xmm7
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- movq [r2+18], xmm2
- movhps [r2+18+8], xmm2
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ movq [r2+18], xmm2
+ movhps [r2+18+8], xmm2
- add r0, r1
- add r2, r3
- dec r5
- jnz .yloop_width_17
- POP_XMM
- LOAD_6_PARA_POP
- ret
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_17
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
%macro FILTER_VER 9
- paddw %1, %6
- movdqa %7, %2
- movdqa %8, %3
+ paddw %1, %6
+ movdqa %7, %2
+ movdqa %8, %3
- paddw %7, %5
- paddw %8, %4
+ paddw %7, %5
+ paddw %8, %4
- psubw %1, %7
- psraw %1, 2
- paddw %1, %8
- psubw %1, %7
- psraw %1, 2
- paddw %8, %1
- paddw %8, [h264_mc_hc_32]
- psraw %8, 6
- packuswb %8, %8
- movq %9, %8
+ psubw %1, %7
+ psraw %1, 2
+ paddw %1, %8
+ psubw %1, %7
+ psraw %1, 2
+ paddw %8, %1
+ paddw %8, [h264_mc_hc_32]
+ psraw %8, 6
+ packuswb %8, %8
+ movq %9, %8
%endmacro
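
FILTER_VER differs from FILTER_HV_W8 in that its inputs are the un-normalised 16-bit taps produced by the horizontal-first pass (McHorVer22HorFirst_sse2 / McHorVer22Width8HorFirst_sse2), so the combined scale factor to remove is 1024 rather than 32. The two interleaved psraw-by-2 steps keep the running sums inside 16 bits; up to that intermediate truncation the macro computes, per column, roughly clip8(((t0 + t5) - 5*(t1 + t4) + 20*(t2 + t3) + 512) >> 10), with h264_mc_hc_32 supplying the 32 added before the final >>6.
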
;***********************************************************************
;void McHorVer22Width8VerLastAlign_sse2(
-; const uint8_t *pTap,
-; int32_t iTapStride,
-; uint8_t * pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight);
+; const uint8_t *pTap,
+; int32_t iTapStride,
+; uint8_t * pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
%ifndef X86_32
- push r12
- push r13
- push r14
- mov r12, r0
- mov r13, r2
- mov r14, r5
+ push r12
+ push r13
+ push r14
+ mov r12, r0
+ mov r13, r2
+ mov r14, r5
%endif
- shr r4, 3
+ shr r4, 3
.width_loop:
- movdqa xmm0, [r0]
- movdqa xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movdqa xmm2, [r0]
- movdqa xmm3, [r0+r1]
- lea r0, [r0+2*r1]
- movdqa xmm4, [r0]
- movdqa xmm5, [r0+r1]
+ movdqa xmm0, [r0]
+ movdqa xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqa xmm2, [r0]
+ movdqa xmm3, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqa xmm4, [r0]
+ movdqa xmm5, [r0+r1]
- FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- lea r0, [r0+2*r1]
- movdqa xmm6, [r0]
+ FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ lea r0, [r0+2*r1]
+ movdqa xmm6, [r0]
- movdqa xmm0, xmm1
- movdqa xmm1, xmm2
- movdqa xmm2, xmm3
- movdqa xmm3, xmm4
- movdqa xmm4, xmm5
- movdqa xmm5, xmm6
+ movdqa xmm0, xmm1
+ movdqa xmm1, xmm2
+ movdqa xmm2, xmm3
+ movdqa xmm3, xmm4
+ movdqa xmm4, xmm5
+ movdqa xmm5, xmm6
- add r2, r3
- sub r0, r1
+ add r2, r3
+ sub r0, r1
.start:
- FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- jz near .x_loop_dec
+ FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- movdqa xmm6, [r0]
- FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ movdqa xmm6, [r0]
+ FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- movdqa xmm7, [r0+r1]
- FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ movdqa xmm7, [r0+r1]
+ FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- movdqa xmm0, [r0]
- FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ movdqa xmm0, [r0]
+ FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0+r1]
- FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0+r1]
+ FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- movdqa xmm2, [r0]
- FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ movdqa xmm2, [r0]
+ FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- movdqa xmm3, [r0+r1]
- FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ movdqa xmm3, [r0+r1]
+ FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- movdqa xmm4, [r0]
- FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ movdqa xmm4, [r0]
+ FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- movdqa xmm5, [r0+r1]
- jmp near .start
+ lea r2, [r2+2*r3]
+ movdqa xmm5, [r0+r1]
+ jmp near .start
.x_loop_dec:
- dec r4
- jz near .exit
+ dec r4
+ jz near .exit
%ifdef X86_32
- mov r0, arg1
- mov r2, arg3
- mov r5, arg6
+ mov r0, arg1
+ mov r2, arg3
+ mov r5, arg6
%else
- mov r0, r12
- mov r2, r13
- mov r5, r14
+ mov r0, r12
+ mov r2, r13
+ mov r5, r14
%endif
- add r0, 16
- add r2, 8
- jmp .width_loop
+ add r0, 16
+ add r2, 8
+ jmp .width_loop
.exit:
%ifndef X86_32
- pop r14
- pop r13
- pop r12
+ pop r14
+ pop r13
+ pop r12
%endif
- POP_XMM
- LOAD_6_PARA_POP
- ret
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
;***********************************************************************
;void McHorVer22Width8VerLastUnAlign_sse2(
-; const uint8_t *pTap,
-; int32_t iTapStride,
-; uint8_t * pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight);
+; const uint8_t *pTap,
+; int32_t iTapStride,
+; uint8_t * pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
- %assign push_num 0
+ %assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
%ifndef X86_32
- push r12
- push r13
- push r14
- mov r12, r0
- mov r13, r2
- mov r14, r5
+ push r12
+ push r13
+ push r14
+ mov r12, r0
+ mov r13, r2
+ mov r14, r5
%endif
- shr r4, 3
+ shr r4, 3
.width_loop:
- movdqu xmm0, [r0]
- movdqu xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movdqu xmm2, [r0]
- movdqu xmm3, [r0+r1]
- lea r0, [r0+2*r1]
- movdqu xmm4, [r0]
- movdqu xmm5, [r0+r1]
+ movdqu xmm0, [r0]
+ movdqu xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqu xmm2, [r0]
+ movdqu xmm3, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqu xmm4, [r0]
+ movdqu xmm5, [r0+r1]
- FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- lea r0, [r0+2*r1]
- movdqu xmm6, [r0]
+ FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ lea r0, [r0+2*r1]
+ movdqu xmm6, [r0]
- movdqa xmm0, xmm1
- movdqa xmm1, xmm2
- movdqa xmm2, xmm3
- movdqa xmm3, xmm4
- movdqa xmm4, xmm5
- movdqa xmm5, xmm6
+ movdqa xmm0, xmm1
+ movdqa xmm1, xmm2
+ movdqa xmm2, xmm3
+ movdqa xmm3, xmm4
+ movdqa xmm4, xmm5
+ movdqa xmm5, xmm6
- add r2, r3
- sub r0, r1
+ add r2, r3
+ sub r0, r1
.start:
- FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- jz near .x_loop_dec
+ FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- movdqu xmm6, [r0]
- FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ movdqu xmm6, [r0]
+ FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- movdqu xmm7, [r0+r1]
- FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ movdqu xmm7, [r0+r1]
+ FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- movdqu xmm0, [r0]
- FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ movdqu xmm0, [r0]
+ FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- movdqu xmm1, [r0+r1]
- FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ movdqu xmm1, [r0+r1]
+ FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- movdqu xmm2, [r0]
- FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ movdqu xmm2, [r0]
+ FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- movdqu xmm3, [r0+r1]
- FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
- dec r5
- jz near .x_loop_dec
+ lea r2, [r2+2*r3]
+ movdqu xmm3, [r0+r1]
+ FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+ dec r5
+ jz near .x_loop_dec
- lea r0, [r0+2*r1]
- movdqu xmm4, [r0]
- FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
- dec r5
- jz near .x_loop_dec
+ lea r0, [r0+2*r1]
+ movdqu xmm4, [r0]
+ FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
- lea r2, [r2+2*r3]
- movdqu xmm5, [r0+r1]
- jmp near .start
+ lea r2, [r2+2*r3]
+ movdqu xmm5, [r0+r1]
+ jmp near .start
.x_loop_dec:
- dec r4
- jz near .exit
+ dec r4
+ jz near .exit
%ifdef X86_32
- mov r0, arg1
- mov r2, arg3
- mov r5, arg6
+ mov r0, arg1
+ mov r2, arg3
+ mov r5, arg6
%else
- mov r0, r12
- mov r2, r13
- mov r5, r14
+ mov r0, r12
+ mov r2, r13
+ mov r5, r14
%endif
- add r0, 16
- add r2, 8
- jmp .width_loop
+ add r0, 16
+ add r2, 8
+ jmp .width_loop
.exit:
%ifndef X86_32
- pop r14
- pop r13
- pop r12
+ pop r14
+ pop r13
+ pop r12
%endif
- POP_XMM
- LOAD_6_PARA_POP
- ret
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
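
For orientation, a scalar sketch (C; not part of this patch) of what the McHorVer22Width8VerLast*_sse2 routines above compute, assuming FILTER_VER applies the standard H.264 half-sample weights (1, -5, 20, 20, -5, 1) to the 16-bit intermediate taps with (+512, >>10) rounding; the macro itself is defined elsewhere in this file, the name McHorVer22VerLast_ref is illustrative only, and the sketch takes the tap stride in elements rather than bytes. In the asm, six xmm registers hold the current six tap rows and are rotated so that each additional output row costs only one fresh load.

    #include <stdint.h>

    static uint8_t Clip255 (int32_t v) {
        return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* 6-tap vertical pass over 16-bit intermediate taps: each output sample
     * needs six consecutive tap rows, mirroring the FILTER_VER usage above. */
    static void McHorVer22VerLast_ref (const int16_t* pTap, int32_t iTapStride,
                                       uint8_t* pDst, int32_t iDstStride,
                                       int32_t iWidth, int32_t iHeight) {
        for (int32_t y = 0; y < iHeight; y++) {
            for (int32_t x = 0; x < iWidth; x++) {
                const int16_t* p = pTap + y * iTapStride + x;
                int32_t v = p[0] + p[5 * iTapStride]
                          - 5 * (p[1 * iTapStride] + p[4 * iTapStride])
                          + 20 * (p[2 * iTapStride] + p[3 * iTapStride]);
                pDst[y * iDstStride + x] = Clip255 ((v + 512) >> 10);
            }
        }
    }
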
--- a/codec/common/x86/satd_sad.asm
+++ b/codec/common/x86/satd_sad.asm
@@ -77,77 +77,77 @@
;
;***********************************************************************
%macro MMX_DW_1_2REG 2
- pxor %1, %1
- pcmpeqw %2, %2
- psubw %1, %2
+ pxor %1, %1
+ pcmpeqw %2, %2
+ psubw %1, %2
%endmacro
-%macro SSE2_SumWHorizon1 2
- movdqa %2, %1
- psrldq %2, 8
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 4
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 2
- paddusw %1, %2
+%macro SSE2_SumWHorizon1 2
+ movdqa %2, %1
+ psrldq %2, 8
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 4
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 2
+ paddusw %1, %2
%endmacro
%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
- SSE2_SumSub %1, %2, %5
- SSE2_SumSub %3, %4, %5
- SSE2_SumSub %2, %4, %5
- SSE2_SumSub %1, %3, %5
+ SSE2_SumSub %1, %2, %5
+ SSE2_SumSub %3, %4, %5
+ SSE2_SumSub %2, %4, %5
+ SSE2_SumSub %1, %3, %5
%endmacro
%macro SSE2_SumAbs4 7
- WELS_AbsW %1, %3
- WELS_AbsW %2, %3
- WELS_AbsW %4, %6
- WELS_AbsW %5, %6
- paddusw %1, %2
- paddusw %4, %5
- paddusw %7, %1
- paddusw %7, %4
+ WELS_AbsW %1, %3
+ WELS_AbsW %2, %3
+ WELS_AbsW %4, %6
+ WELS_AbsW %5, %6
+ paddusw %1, %2
+ paddusw %4, %5
+ paddusw %7, %1
+ paddusw %7, %4
%endmacro
-%macro SSE2_SumWHorizon 3
- movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
- paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
- paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
- pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
- paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
+%macro SSE2_SumWHorizon 3
+ movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
+ paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
+ pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
+ paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
%endmacro
%macro SSE2_GetSatd8x8 0
- SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+ SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
- SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
+ SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
- SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
+ SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
%endmacro
;***********************************************************************
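
For reference, a scalar sketch (C; not part of this patch) of the per-4x4 quantity the SSE2_GetSatd8x8 macro above and the WelsSampleSatd*_sse2 functions below accumulate: the sum of absolute values of the 4x4 Hadamard transform of the source/reference difference, halved at the end (the final shr/psrlw by 1). The name Satd4x4_ref is illustrative only.

    #include <stdint.h>
    #include <stdlib.h>

    static int32_t Satd4x4_ref (const uint8_t* pSrc, int32_t iSrcStride,
                                const uint8_t* pRef, int32_t iRefStride) {
        int32_t d[4][4], t[4][4], iSum = 0;
        for (int i = 0; i < 4; i++)                /* residual */
            for (int j = 0; j < 4; j++)
                d[i][j] = pSrc[i * iSrcStride + j] - pRef[i * iRefStride + j];
        for (int i = 0; i < 4; i++) {              /* horizontal butterflies */
            int a = d[i][0] + d[i][3], b = d[i][1] + d[i][2];
            int c = d[i][0] - d[i][3], e = d[i][1] - d[i][2];
            t[i][0] = a + b; t[i][1] = a - b; t[i][2] = c - e; t[i][3] = c + e;
        }
        for (int j = 0; j < 4; j++) {              /* vertical butterflies + sum */
            int a = t[0][j] + t[3][j], b = t[1][j] + t[2][j];
            int c = t[0][j] - t[3][j], e = t[1][j] - t[2][j];
            iSum += abs (a + b) + abs (a - b) + abs (c - e) + abs (c + e);
        }
        return iSum >> 1;
    }
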
@@ -156,11 +156,11 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
movd xmm0, [r0]
movd xmm1, [r0+r1]
lea r0 , [r0+2*r1]
@@ -199,14 +199,14 @@
punpcklwd xmm0, xmm4
punpckhwd xmm4, xmm2
- SSE2_XSawp dq, xmm0, xmm4, xmm3
- SSE2_XSawp qdq, xmm0, xmm3, xmm5
+ SSE2_XSawp dq, xmm0, xmm4, xmm3
+ SSE2_XSawp qdq, xmm0, xmm3, xmm5
movdqa xmm7, xmm0
paddw xmm0, xmm5
psubw xmm7, xmm5
- SSE2_XSawp qdq, xmm0, xmm7, xmm1
+ SSE2_XSawp qdq, xmm0, xmm7, xmm1
movdqa xmm2, xmm0
paddw xmm0, xmm1
@@ -214,15 +214,15 @@
WELS_AbsW xmm0, xmm3
paddusw xmm6, xmm0
- WELS_AbsW xmm2, xmm4
+ WELS_AbsW xmm2, xmm4
paddusw xmm6, xmm2
SSE2_SumWHorizon1 xmm6, xmm4
- movd retrd, xmm6
+ movd retrd, xmm6
and retrd, 0xffff
shr retrd, 1
- POP_XMM
- LOAD_4_PARA_POP
- ret
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
;***********************************************************************
;
@@ -230,20 +230,20 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm6, xmm6
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm6, xmm6
pxor xmm7, xmm7
SSE2_GetSatd8x8
psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- POP_XMM
- LOAD_4_PARA_POP
- ret
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
;***********************************************************************
;
@@ -251,25 +251,25 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm6, xmm6
- pxor xmm7, xmm7
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
- SSE2_GetSatd8x8
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSatd8x8
+ SSE2_GetSatd8x8
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- POP_XMM
- LOAD_4_PARA_POP
- ret
+ psrlw xmm6, 1
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
;***********************************************************************
;
@@ -277,30 +277,30 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- push r0
- push r2
- pxor xmm6, xmm6
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ push r0
+ push r2
+ pxor xmm6, xmm6
pxor xmm7, xmm7
- SSE2_GetSatd8x8
+ SSE2_GetSatd8x8
- pop r2
- pop r0
+ pop r2
+ pop r0
add r0, 8
add r2, 8
- SSE2_GetSatd8x8
+ SSE2_GetSatd8x8
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- POP_XMM
- LOAD_4_PARA_POP
- ret
+ psrlw xmm6, 1
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
;***********************************************************************
;
@@ -308,38 +308,38 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x16_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- push r0
- push r2
- pxor xmm6, xmm6
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ push r0
+ push r2
+ pxor xmm6, xmm6
pxor xmm7, xmm7
- SSE2_GetSatd8x8
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSatd8x8
+ SSE2_GetSatd8x8
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8
- pop r2
- pop r0
- add r0, 8
- add r2, 8
+ pop r2
+ pop r0
+ add r0, 8
+ add r2, 8
- SSE2_GetSatd8x8
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSatd8x8
+ SSE2_GetSatd8x8
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8
; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- POP_XMM
- LOAD_4_PARA_POP
- ret
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
;***********************************************************************
;
@@ -355,9 +355,9 @@
%macro SSE_DB_1_2REG 2
- pxor %1, %1
- pcmpeqw %2, %2
- psubb %1, %2
+ pxor %1, %1
+ pcmpeqw %2, %2
+ psubb %1, %2
%endmacro
;***********************************************************************
@@ -369,668 +369,668 @@
WELS_EXTERN WelsSampleSatdThree4x4_sse2
%ifdef X86_32
- push r3
- push r4
- push r5
- push r6
- %assign push_num 4
+ push r3
+ push r4
+ push r5
+ push r6
+ %assign push_num 4
%else
- %assign push_num 0
+ %assign push_num 0
%endif
- PUSH_XMM 8
+ PUSH_XMM 8
- mov r2, arg3
- mov r3, arg4
- SIGN_EXTENSION r3, r3d
+ mov r2, arg3
+ mov r3, arg4
+ SIGN_EXTENSION r3, r3d
- ; load source 4x4 samples and Hadamard transform
- movd xmm0, [r2]
- movd xmm1, [r2+r3]
- lea r2 , [r2+2*r3]
- movd xmm2, [r2]
- movd xmm3, [r2+r3]
- punpckldq xmm0, xmm2
- punpckldq xmm1, xmm3
+ ; load source 4x4 samples and Hadamard transform
+ movd xmm0, [r2]
+ movd xmm1, [r2+r3]
+ lea r2 , [r2+2*r3]
+ movd xmm2, [r2]
+ movd xmm3, [r2+r3]
+ punpckldq xmm0, xmm2
+ punpckldq xmm1, xmm3
- pxor xmm6, xmm6
- punpcklbw xmm0, xmm6
- punpcklbw xmm1, xmm6
+ pxor xmm6, xmm6
+ punpcklbw xmm0, xmm6
+ punpcklbw xmm1, xmm6
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
- SSE2_XSawp qdq, xmm0, xmm2, xmm3
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+ SSE2_XSawp qdq, xmm0, xmm2, xmm3
- movdqa xmm4, xmm0
- paddw xmm0, xmm3
- psubw xmm4, xmm3
+ movdqa xmm4, xmm0
+ paddw xmm0, xmm3
+ psubw xmm4, xmm3
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm4
- punpckhwd xmm4, xmm2
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm4, xmm2
- SSE2_XSawp dq, xmm0, xmm4, xmm3
- SSE2_XSawp qdq, xmm0, xmm3, xmm5
+ SSE2_XSawp dq, xmm0, xmm4, xmm3
+ SSE2_XSawp qdq, xmm0, xmm3, xmm5
- movdqa xmm7, xmm0
- paddw xmm0, xmm5
- psubw xmm7, xmm5
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm5
+ psubw xmm7, xmm5
- SSE2_XSawp qdq, xmm0, xmm7, xmm1
+ SSE2_XSawp qdq, xmm0, xmm7, xmm1
- ; Hadamard transform results are saved in xmm0 and xmm2
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
+ ; Hadamard transform results are saved in xmm0 and xmm2
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
- ;load top boundary samples: [a b c d]
- mov r0, arg1
- mov r1, arg2
- SIGN_EXTENSION r1, r1d
- sub r0, r1
+ ;load top boundary samples: [a b c d]
+ mov r0, arg1
+ mov r1, arg2
+ SIGN_EXTENSION r1, r1d
+ sub r0, r1
%ifdef UNIX64
- push r4
- push r5
+ push r4
+ push r5
%endif
- movzx r2d, byte [r0]
- movzx r3d, byte [r0+1]
- movzx r4d, byte [r0+2]
- movzx r5d, byte [r0+3]
+ movzx r2d, byte [r0]
+ movzx r3d, byte [r0+1]
+ movzx r4d, byte [r0+2]
+ movzx r5d, byte [r0+3]
- ; get the transform results of top boundary samples: [a b c d]
- add r3d, r2d ; r3d = a + b
- add r5d, r4d ; r5d = c + d
- add r2d, r2d ; r2d = a + a
- add r4d, r4d ; r4d = c + c
- sub r2d, r3d ; r2d = a + a - a - b = a - b
- sub r4d, r5d ; r4d = c + c - c - d = c - d
- add r5d, r3d ; r5d = (a + b) + (c + d)
- add r3d, r3d
- sub r3d, r5d ; r3d = (a + b) - (c + d)
- add r4d, r2d ; r4d = (a - b) + (c - d)
- add r2d, r2d
- sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
+ ; get the transform results of top boundary samples: [a b c d]
+ add r3d, r2d ; r3d = a + b
+ add r5d, r4d ; r5d = c + d
+ add r2d, r2d ; r2d = a + a
+ add r4d, r4d ; r4d = c + c
+ sub r2d, r3d ; r2d = a + a - a - b = a - b
+ sub r4d, r5d ; r4d = c + c - c - d = c - d
+ add r5d, r3d ; r5d = (a + b) + (c + d)
+ add r3d, r3d
+ sub r3d, r5d ; r3d = (a + b) - (c + d)
+ add r4d, r2d ; r4d = (a - b) + (c - d)
+ add r2d, r2d
+ sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
- movdqa xmm6, xmm0
- movdqa xmm7, xmm2
- movd xmm5, r5d ; store the edi for DC mode
- pxor xmm3, xmm3
- pxor xmm4, xmm4
- pinsrw xmm3, r5d, 0
- pinsrw xmm3, r4d, 4
- psllw xmm3, 2
- pinsrw xmm4, r3d, 0
- pinsrw xmm4, r2d, 4
- psllw xmm4, 2
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm2
+ movd xmm5, r5d ; save the top boundary sum (r5d, old edi) for the DC mode
+ pxor xmm3, xmm3
+ pxor xmm4, xmm4
+ pinsrw xmm3, r5d, 0
+ pinsrw xmm3, r4d, 4
+ psllw xmm3, 2
+ pinsrw xmm4, r3d, 0
+ pinsrw xmm4, r2d, 4
+ psllw xmm4, 2
- ; get the satd of H
- psubw xmm0, xmm3
- psubw xmm2, xmm4
+ ; get the satd of V (vertical prediction, from the top boundary)
+ psubw xmm0, xmm3
+ psubw xmm2, xmm4
- WELS_AbsW xmm0, xmm1
- WELS_AbsW xmm2, xmm1
- paddusw xmm0, xmm2
- SSE2_SumWHorizon1 xmm0, xmm1 ; satd of V is stored in xmm0
+ WELS_AbsW xmm0, xmm1
+ WELS_AbsW xmm2, xmm1
+ paddusw xmm0, xmm2
+ SSE2_SumWHorizon1 xmm0, xmm1 ; satd of V is stored in xmm0
- ;load left boundary samples: [a b c d]'
- add r0, r1
+ ;load left boundary samples: [a b c d]'
+ add r0, r1
- movzx r2d, byte [r0-1]
- movzx r3d, byte [r0+r1-1]
- lea r0 , [r0+2*r1]
- movzx r4d, byte [r0-1]
- movzx r5d, byte [r0+r1-1]
+ movzx r2d, byte [r0-1]
+ movzx r3d, byte [r0+r1-1]
+ lea r0 , [r0+2*r1]
+ movzx r4d, byte [r0-1]
+ movzx r5d, byte [r0+r1-1]
- ; get the transform results of left boundary samples: [a b c d]'
- add r3d, r2d ; r3d = a + b
- add r5d, r4d ; r5d = c + d
- add r2d, r2d ; r2d = a + a
- add r4d, r4d ; r4d = c + c
- sub r2d, r3d ; r2d = a + a - a - b = a - b
- sub r4d, r5d ; r4d = c + c - c - d = c - d
- add r5d, r3d ; r5d = (a + b) + (c + d)
- add r3d, r3d
- sub r3d, r5d ; r3d = (a + b) - (c + d)
- add r4d, r2d ; r4d = (a - b) + (c - d)
- add r2d, r2d
- sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
+ ; get the transform results of left boundary samples: [a b c d]'
+ add r3d, r2d ; r3d = a + b
+ add r5d, r4d ; r5d = c + d
+ add r2d, r2d ; r2d = a + a
+ add r4d, r4d ; r4d = c + c
+ sub r2d, r3d ; r2d = a + a - a - b = a - b
+ sub r4d, r5d ; r4d = c + c - c - d = c - d
+ add r5d, r3d ; r5d = (a + b) + (c + d)
+ add r3d, r3d
+ sub r3d, r5d ; r3d = (a + b) - (c + d)
+ add r4d, r2d ; r4d = (a - b) + (c - d)
+ add r2d, r2d
+ sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
- ; store the transform results in xmm3
- movd xmm3, r5d
- pinsrw xmm3, r3d, 1
- pinsrw xmm3, r2d, 2
- pinsrw xmm3, r4d, 3
- psllw xmm3, 2
+ ; store the transform results in xmm3
+ movd xmm3, r5d
+ pinsrw xmm3, r3d, 1
+ pinsrw xmm3, r2d, 2
+ pinsrw xmm3, r4d, 3
+ psllw xmm3, 2
- ; get the satd of V
- movdqa xmm2, xmm6
- movdqa xmm4, xmm7
- psubw xmm2, xmm3
- WELS_AbsW xmm2, xmm1
- WELS_AbsW xmm4, xmm1
- paddusw xmm2, xmm4
- SSE2_SumWHorizon1 xmm2, xmm1 ; satd of H is stored in xmm2
+ ; get the satd of H (horizontal prediction, from the left boundary)
+ movdqa xmm2, xmm6
+ movdqa xmm4, xmm7
+ psubw xmm2, xmm3
+ WELS_AbsW xmm2, xmm1
+ WELS_AbsW xmm4, xmm1
+ paddusw xmm2, xmm4
+ SSE2_SumWHorizon1 xmm2, xmm1 ; satd of H is stored in xmm2
- ; DC result is stored in xmm1
- add r5d, 4
- movd xmm1, r5d
- paddw xmm1, xmm5
- psrlw xmm1, 3
- movdqa xmm5, xmm1
- psllw xmm1, 4
+ ; DC result is stored in xmm1
+ add r5d, 4
+ movd xmm1, r5d
+ paddw xmm1, xmm5
+ psrlw xmm1, 3
+ movdqa xmm5, xmm1
+ psllw xmm1, 4
- ; get the satd of DC
- psubw xmm6, xmm1
- WELS_AbsW xmm6, xmm1
- WELS_AbsW xmm7, xmm1
- paddusw xmm6, xmm7
- SSE2_SumWHorizon1 xmm6, xmm1 ; satd of DC is stored in xmm6
+ ; get the satd of DC
+ psubw xmm6, xmm1
+ WELS_AbsW xmm6, xmm1
+ WELS_AbsW xmm7, xmm1
+ paddusw xmm6, xmm7
+ SSE2_SumWHorizon1 xmm6, xmm1 ; satd of DC is stored in xmm6
%ifdef UNIX64
- pop r5
- pop r4
+ pop r5
+ pop r4
%endif
- ; comparing order: DC H V
+ ; comparing order: DC H V
- mov r4, arg5
- movd r2d, xmm6
- movd r3d, xmm2
- movd r6d, xmm0
+ mov r4, arg5
+ movd r2d, xmm6
+ movd r3d, xmm2
+ movd r6d, xmm0
- and r2d, 0xffff
- shr r2d, 1
- and r3d, 0xffff
- shr r3d, 1
- and r6d, 0xffff
- shr r6d, 1
- add r2d, dword arg7
- add r3d, dword arg8
- add r6d, dword arg9
- cmp r2w, r3w
- jg near not_dc
- cmp r2w, r6w
- jg near not_dc_h
+ and r2d, 0xffff
+ shr r2d, 1
+ and r3d, 0xffff
+ shr r3d, 1
+ and r6d, 0xffff
+ shr r6d, 1
+ add r2d, dword arg7
+ add r3d, dword arg8
+ add r6d, dword arg9
+ cmp r2w, r3w
+ jg near not_dc
+ cmp r2w, r6w
+ jg near not_dc_h
- ; for DC mode
- movd r3d, xmm5
- imul r3d, 0x01010101
- movd xmm5, r3d
- pshufd xmm5, xmm5, 0
- movdqa [r4], xmm5
- mov r5, arg6
- mov dword [r5], 0x02
- mov retrd, r2d
- POP_XMM
+ ; for DC mode
+ movd r3d, xmm5
+ imul r3d, 0x01010101
+ movd xmm5, r3d
+ pshufd xmm5, xmm5, 0
+ movdqa [r4], xmm5
+ mov r5, arg6
+ mov dword [r5], 0x02
+ mov retrd, r2d
+ POP_XMM
%ifdef X86_32
- pop r6
- pop r5
- pop r4
- pop r3
+ pop r6
+ pop r5
+ pop r4
+ pop r3
%endif
- ret
+ ret
not_dc:
- cmp r3w, r6w
- jg near not_dc_h
+ cmp r3w, r6w
+ jg near not_dc_h
- ; for H mode
- SSE_DB_1_2REG xmm6, xmm7
- sub r0, r1
- sub r0, r1
- movzx r6d, byte [r0-1]
- movd xmm0, r6d
- pmuludq xmm0, xmm6
+ ; for H mode
+ SSE_DB_1_2REG xmm6, xmm7
+ sub r0, r1
+ sub r0, r1
+ movzx r6d, byte [r0-1]
+ movd xmm0, r6d
+ pmuludq xmm0, xmm6
- movzx r6d, byte [r0+r1-1]
- movd xmm1, r6d
- pmuludq xmm1, xmm6
- punpckldq xmm0, xmm1
+ movzx r6d, byte [r0+r1-1]
+ movd xmm1, r6d
+ pmuludq xmm1, xmm6
+ punpckldq xmm0, xmm1
- lea r0, [r0+r1*2]
- movzx r6d, byte [r0-1]
- movd xmm2, r6d
- pmuludq xmm2, xmm6
+ lea r0, [r0+r1*2]
+ movzx r6d, byte [r0-1]
+ movd xmm2, r6d
+ pmuludq xmm2, xmm6
- movzx r6d, byte [r0+r1-1]
- movd xmm3, r6d
- pmuludq xmm3, xmm6
- punpckldq xmm2, xmm3
- punpcklqdq xmm0, xmm2
+ movzx r6d, byte [r0+r1-1]
+ movd xmm3, r6d
+ pmuludq xmm3, xmm6
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
- movdqa [r4],xmm0
+ movdqa [r4],xmm0
- mov retrd, r3d
- mov r5, arg6
- mov dword [r5], 0x01
- POP_XMM
+ mov retrd, r3d
+ mov r5, arg6
+ mov dword [r5], 0x01
+ POP_XMM
%ifdef X86_32
- pop r6
- pop r5
- pop r4
- pop r3
+ pop r6
+ pop r5
+ pop r4
+ pop r3
%endif
- ret
+ ret
not_dc_h:
- sub r0, r1
- sub r0, r1
- sub r0, r1
- movd xmm0, [r0]
- pshufd xmm0, xmm0, 0
- movdqa [r4],xmm0
- mov retrd, r6d
- mov r5, arg6
- mov dword [r5], 0x00
- POP_XMM
+ sub r0, r1
+ sub r0, r1
+ sub r0, r1
+ movd xmm0, [r0]
+ pshufd xmm0, xmm0, 0
+ movdqa [r4],xmm0
+ mov retrd, r6d
+ mov r5, arg6
+ mov dword [r5], 0x00
+ POP_XMM
%ifdef X86_32
- pop r6
- pop r5
- pop r4
- pop r3
+ pop r6
+ pop r5
+ pop r4
+ pop r3
%endif
- ret
+ ret
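
The commented scalar arithmetic above (r3d = a + b, ..., r2d = (a - b) - (c - d)) is the 4-point Hadamard transform of the four boundary samples; scaled by 4 (the psllw by 2) it forms the only nonzero row/column of the 2-D transform of the V/H prediction, and 16x the DC value forms its only nonzero coefficient, which is why WelsSampleSatdThree4x4 can evaluate all three modes in the transform domain without building any prediction block. A scalar rendering of that boundary transform (C; the name BoundaryHadamard4 is illustrative only):

    #include <stdint.h>

    static void BoundaryHadamard4 (const uint8_t n[4], int32_t out[4]) {
        int32_t s0 = n[0] + n[1], s1 = n[2] + n[3];   /* a+b,  c+d */
        int32_t d0 = n[0] - n[1], d1 = n[2] - n[3];   /* a-b,  c-d */
        out[0] = s0 + s1;   /* (a+b)+(c+d) -> r5d in the asm above */
        out[1] = s0 - s1;   /* (a+b)-(c+d) -> r3d */
        out[2] = d0 + d1;   /* (a-b)+(c-d) -> r4d */
        out[3] = d0 - d1;   /* (a-b)-(c-d) -> r2d */
    }
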
%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
- pmaddubsw %1, xmm5
- movdqa %2, %1
- pmaddwd %1, xmm7
- pmaddwd %2, xmm6
- movdqa %3, %1
- punpckldq %1, %2
- punpckhdq %2, %3
- movdqa %3, %1
- punpcklqdq %1, %2
- punpckhqdq %3, %2
- paddd xmm4, %1 ;for dc
- paddd xmm4, %3 ;for dc
- packssdw %1, %3
- psllw %1, 2
+ pmaddubsw %1, xmm5
+ movdqa %2, %1
+ pmaddwd %1, xmm7
+ pmaddwd %2, xmm6
+ movdqa %3, %1
+ punpckldq %1, %2
+ punpckhdq %2, %3
+ movdqa %3, %1
+ punpcklqdq %1, %2
+ punpckhqdq %3, %2
+ paddd xmm4, %1 ;for dc
+ paddd xmm4, %3 ;for dc
+ packssdw %1, %3
+ psllw %1, 2
%endmacro
%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
- pmaddubsw %1, xmm5
- movdqa %2, %1
- pmaddwd %1, xmm7
- pmaddwd %2, xmm6
- movdqa %3, %1
- punpckldq %1, %2
- punpckhdq %2, %3
- movdqa %3, %1
- punpcklqdq %1, %2
- punpckhqdq %3, %2
+ pmaddubsw %1, xmm5
+ movdqa %2, %1
+ pmaddwd %1, xmm7
+ pmaddwd %2, xmm6
+ movdqa %3, %1
+ punpckldq %1, %2
+ punpckhdq %2, %3
+ movdqa %3, %1
+ punpcklqdq %1, %2
+ punpckhqdq %3, %2
; paddd xmm4, %1 ;for dc
-; paddd xmm4, %3 ;for dc
- movdqa %4, %1
- punpcklqdq %4, %3
- packssdw %1, %3
- psllw %1, 2
+; paddd xmm4, %3 ;for dc
+ movdqa %4, %1
+ punpcklqdq %4, %3
+ packssdw %1, %3
+ psllw %1, 2
%endmacro
%macro SSE41_GetX38x4SatdDec 0
- pxor xmm7, xmm7
- movq xmm0, [r2]
- movq xmm1, [r2+r3]
- lea r2, [r2+2*r3]
- movq xmm2, [r2]
- movq xmm3, [r2+r3]
- lea r2, [r2+2*r3]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
- ;doesn't need another transpose
+ pxor xmm7, xmm7
+ movq xmm0, [r2]
+ movq xmm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq xmm2, [r2]
+ movq xmm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
+ punpcklbw xmm2, xmm7
+ punpcklbw xmm3, xmm7
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
+ ;doesn't need another transpose
%endmacro
%macro SSE41_GetX38x4SatdV 2
- pxor xmm0, xmm0
- pinsrw xmm0, word[r6+%2], 0
- pinsrw xmm0, word[r6+%2+8], 4
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[r6+%2+2], 0
- pinsrw xmm0, word[r6+%2+10], 4
- psubsw xmm0, xmm1
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[r6+%2+4], 0
- pinsrw xmm0, word[r6+%2+12], 4
- psubsw xmm0, xmm3
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[r6+%2+6], 0
- pinsrw xmm0, word[r6+%2+14], 4
- psubsw xmm0, xmm2
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[r6+%2], 0
+ pinsrw xmm0, word[r6+%2+8], 4
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[r6+%2+2], 0
+ pinsrw xmm0, word[r6+%2+10], 4
+ psubsw xmm0, xmm1
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[r6+%2+4], 0
+ pinsrw xmm0, word[r6+%2+12], 4
+ psubsw xmm0, xmm3
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[r6+%2+6], 0
+ pinsrw xmm0, word[r6+%2+14], 4
+ psubsw xmm0, xmm2
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
%endmacro
%macro SSE41_GetX38x4SatdH 3
- movq xmm0, [r6+%3+8*%1]
- punpcklqdq xmm0, xmm0
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm5, xmm0
- pabsw xmm1, xmm1
- pabsw xmm2, xmm2
- pabsw xmm3, xmm3
- paddw xmm2, xmm1;for DC
- paddw xmm2, xmm3;for DC
- paddw xmm5, xmm2
+ movq xmm0, [r6+%3+8*%1]
+ punpcklqdq xmm0, xmm0
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm5, xmm0
+ pabsw xmm1, xmm1
+ pabsw xmm2, xmm2
+ pabsw xmm3, xmm3
+ paddw xmm2, xmm1;for DC
+ paddw xmm2, xmm3;for DC
+ paddw xmm5, xmm2
%endmacro
%macro SSE41_I16X16GetX38x4SatdDC 0
- pxor xmm0, xmm0
- movq2dq xmm0, mm4
- punpcklqdq xmm0, xmm0
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm6, xmm0
- paddw xmm6, xmm2
+ pxor xmm0, xmm0
+ movq2dq xmm0, mm4
+ punpcklqdq xmm0, xmm0
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm6, xmm0
+ paddw xmm6, xmm2
%endmacro
%macro SSE41_ChromaGetX38x4SatdDC 1
- shl %1, 4
- movdqa xmm0, [r6+32+%1]
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm6, xmm0
- paddw xmm6, xmm2
+ shl %1, 4
+ movdqa xmm0, [r6+32+%1]
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm6, xmm0
+ paddw xmm6, xmm2
%endmacro
%macro SSE41_I16x16GetX38x4Satd 2
- SSE41_GetX38x4SatdDec
- SSE41_GetX38x4SatdV %1, %2
- SSE41_GetX38x4SatdH %1, %2, 32
- SSE41_I16X16GetX38x4SatdDC
+ SSE41_GetX38x4SatdDec
+ SSE41_GetX38x4SatdV %1, %2
+ SSE41_GetX38x4SatdH %1, %2, 32
+ SSE41_I16X16GetX38x4SatdDC
%endmacro
%macro SSE41_ChromaGetX38x4Satd 2
- SSE41_GetX38x4SatdDec
- SSE41_GetX38x4SatdV %1, %2
- SSE41_GetX38x4SatdH %1, %2, 16
- SSE41_ChromaGetX38x4SatdDC %1
+ SSE41_GetX38x4SatdDec
+ SSE41_GetX38x4SatdV %1, %2
+ SSE41_GetX38x4SatdH %1, %2, 16
+ SSE41_ChromaGetX38x4SatdDC %1
%endmacro
%macro SSE41_HSum8W 3
- pmaddwd %1, %2
- movhlps %3, %1
- paddd %1, %3
- pshuflw %3, %1,0Eh
- paddd %1, %3
+ pmaddwd %1, %2
+ movhlps %3, %1
+ paddd %1, %3
+ pshuflw %3, %1,0Eh
+ paddd %1, %3
%endmacro
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
- %assign push_num 0
- LOAD_7_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
+ %assign push_num 0
+ LOAD_7_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
%ifndef X86_32
- push r12
- mov r12, r2
+ push r12
+ mov r12, r2
%endif
- pxor xmm4, xmm4
- movdqa xmm5, [HSumSubDB1]
- movdqa xmm6, [HSumSubDW1]
- movdqa xmm7, [PDW1]
- sub r0, r1
- movdqu xmm0, [r0]
- movhlps xmm1, xmm0
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
- SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
- movdqa [r6], xmm0 ;V
- movdqa [r6+16], xmm1
- add r0, r1
- pinsrb xmm0, byte[r0-1], 0
- pinsrb xmm0, byte[r0+r1-1], 1
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 2
- pinsrb xmm0, byte[r0+r1-1], 3
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 4
- pinsrb xmm0, byte[r0+r1-1], 5
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 6
- pinsrb xmm0, byte[r0+r1-1], 7
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 8
- pinsrb xmm0, byte[r0+r1-1], 9
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 10
- pinsrb xmm0, byte[r0+r1-1], 11
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 12
- pinsrb xmm0, byte[r0+r1-1], 13
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 14
- pinsrb xmm0, byte[r0+r1-1], 15
- movhlps xmm1, xmm0
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
- SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
- movdqa [r6+32], xmm0 ;H
- movdqa [r6+48], xmm1
- movd r0d, xmm4 ;dc
- add r0d, 16 ;(sum+16)
- shr r0d, 5 ;((sum+16)>>5)
- shl r0d, 4 ;
- movd mm4, r0d ; mm4 copy DC
- pxor xmm4, xmm4 ;V
- pxor xmm5, xmm5 ;H
- pxor xmm6, xmm6 ;DC
+ pxor xmm4, xmm4
+ movdqa xmm5, [HSumSubDB1]
+ movdqa xmm6, [HSumSubDW1]
+ movdqa xmm7, [PDW1]
+ sub r0, r1
+ movdqu xmm0, [r0]
+ movhlps xmm1, xmm0
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+ SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+ movdqa [r6], xmm0 ;V
+ movdqa [r6+16], xmm1
+ add r0, r1
+ pinsrb xmm0, byte[r0-1], 0
+ pinsrb xmm0, byte[r0+r1-1], 1
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 2
+ pinsrb xmm0, byte[r0+r1-1], 3
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 4
+ pinsrb xmm0, byte[r0+r1-1], 5
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 6
+ pinsrb xmm0, byte[r0+r1-1], 7
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 8
+ pinsrb xmm0, byte[r0+r1-1], 9
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 10
+ pinsrb xmm0, byte[r0+r1-1], 11
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 12
+ pinsrb xmm0, byte[r0+r1-1], 13
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 14
+ pinsrb xmm0, byte[r0+r1-1], 15
+ movhlps xmm1, xmm0
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+ SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+ movdqa [r6+32], xmm0 ;H
+ movdqa [r6+48], xmm1
+ movd r0d, xmm4 ;dc
+ add r0d, 16 ;(sum+16)
+ shr r0d, 5 ;((sum+16)>>5)
+ shl r0d, 4 ;
+ movd mm4, r0d ; mm4 copy DC
+ pxor xmm4, xmm4 ;V
+ pxor xmm5, xmm5 ;H
+ pxor xmm6, xmm6 ;DC
%ifdef UNIX64
- push r4
+ push r4
%endif
- mov r0, 0
- mov r4, 0
+ mov r0, 0
+ mov r4, 0
.loop16x16_get_satd:
.loopStart1:
- SSE41_I16x16GetX38x4Satd r0, r4
- inc r0
- cmp r0, 4
- jl .loopStart1
- cmp r4, 16
- je .loop16x16_get_satd_end
+ SSE41_I16x16GetX38x4Satd r0, r4
+ inc r0
+ cmp r0, 4
+ jl .loopStart1
+ cmp r4, 16
+ je .loop16x16_get_satd_end
%ifdef X86_32
- mov r2, arg3
+ mov r2, arg3
%else
- mov r2, r12
+ mov r2, r12
%endif
- add r2, 8
- mov r0, 0
- add r4, 16
- jmp .loop16x16_get_satd
+ add r2, 8
+ mov r0, 0
+ add r4, 16
+ jmp .loop16x16_get_satd
.loop16x16_get_satd_end:
- MMX_DW_1_2REG xmm0, xmm1
- psrlw xmm4, 1 ;/2
- psrlw xmm5, 1 ;/2
- psrlw xmm6, 1 ;/2
- SSE41_HSum8W xmm4, xmm0, xmm1
- SSE41_HSum8W xmm5, xmm0, xmm1
- SSE41_HSum8W xmm6, xmm0, xmm1
+ MMX_DW_1_2REG xmm0, xmm1
+ psrlw xmm4, 1 ;/2
+ psrlw xmm5, 1 ;/2
+ psrlw xmm6, 1 ;/2
+ SSE41_HSum8W xmm4, xmm0, xmm1
+ SSE41_HSum8W xmm5, xmm0, xmm1
+ SSE41_HSum8W xmm6, xmm0, xmm1
%ifdef UNIX64
- pop r4
+ pop r4
%endif
- ; comparing order: DC H V
- movd r3d, xmm6 ;DC
- movd r1d, xmm5 ;H
- movd r0d, xmm4 ;V
+ ; comparing order: DC H V
+ movd r3d, xmm6 ;DC
+ movd r1d, xmm5 ;H
+ movd r0d, xmm4 ;V
%ifndef X86_32
- pop r12
+ pop r12
%endif
- shl r5d, 1
- add r1d, r5d
- add r3d, r5d
- mov r4, arg5
- cmp r3d, r1d
- jge near not_dc_16x16
- cmp r3d, r0d
- jge near not_dc_h_16x16
+ shl r5d, 1
+ add r1d, r5d
+ add r3d, r5d
+ mov r4, arg5
+ cmp r3d, r1d
+ jge near not_dc_16x16
+ cmp r3d, r0d
+ jge near not_dc_h_16x16
- ; for DC mode
- mov dword[r4], 2;I16_PRED_DC
- mov retrd, r3d
- jmp near return_satd_intra_16x16_x3
+ ; for DC mode
+ mov dword[r4], 2;I16_PRED_DC
+ mov retrd, r3d
+ jmp near return_satd_intra_16x16_x3
not_dc_16x16:
- ; for H mode
- cmp r1d, r0d
- jge near not_dc_h_16x16
- mov dword[r4], 1;I16_PRED_H
- mov retrd, r1d
- jmp near return_satd_intra_16x16_x3
+ ; for H mode
+ cmp r1d, r0d
+ jge near not_dc_h_16x16
+ mov dword[r4], 1;I16_PRED_H
+ mov retrd, r1d
+ jmp near return_satd_intra_16x16_x3
not_dc_h_16x16:
- ; for V mode
- mov dword[r4], 0;I16_PRED_V
- mov retrd, r0d
+ ; for V mode
+ mov dword[r4], 0;I16_PRED_V
+ mov retrd, r0d
return_satd_intra_16x16_x3:
- WELSEMMS
- POP_XMM
- LOAD_7_PARA_POP
+ WELSEMMS
+ POP_XMM
+ LOAD_7_PARA_POP
ret
%macro SSE41_ChromaGetX38x8Satd 0
- movdqa xmm5, [HSumSubDB1]
- movdqa xmm6, [HSumSubDW1]
- movdqa xmm7, [PDW1]
- sub r0, r1
- movq xmm0, [r0]
- punpcklqdq xmm0, xmm0
- SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
- movdqa [r6], xmm0 ;V
- add r0, r1
- pinsrb xmm0, byte[r0-1], 0
- pinsrb xmm0, byte[r0+r1-1], 1
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 2
- pinsrb xmm0, byte[r0+r1-1], 3
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 4
- pinsrb xmm0, byte[r0+r1-1], 5
- lea r0, [r0+2*r1]
- pinsrb xmm0, byte[r0-1], 6
- pinsrb xmm0, byte[r0+r1-1], 7
- punpcklqdq xmm0, xmm0
- SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
- movdqa [r6+16], xmm0 ;H
+ movdqa xmm5, [HSumSubDB1]
+ movdqa xmm6, [HSumSubDW1]
+ movdqa xmm7, [PDW1]
+ sub r0, r1
+ movq xmm0, [r0]
+ punpcklqdq xmm0, xmm0
+ SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+ movdqa [r6], xmm0 ;V
+ add r0, r1
+ pinsrb xmm0, byte[r0-1], 0
+ pinsrb xmm0, byte[r0+r1-1], 1
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 2
+ pinsrb xmm0, byte[r0+r1-1], 3
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 4
+ pinsrb xmm0, byte[r0+r1-1], 5
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 6
+ pinsrb xmm0, byte[r0+r1-1], 7
+ punpcklqdq xmm0, xmm0
+ SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+ movdqa [r6+16], xmm0 ;H
;(sum+2)>>2
- movdqa xmm6, [PDQ2]
- movdqa xmm5, xmm4
- punpckhqdq xmm5, xmm1
- paddd xmm5, xmm6
- psrld xmm5, 2
+ movdqa xmm6, [PDQ2]
+ movdqa xmm5, xmm4
+ punpckhqdq xmm5, xmm1
+ paddd xmm5, xmm6
+ psrld xmm5, 2
;(sum1+sum2+4)>>3
- paddd xmm6, xmm6
- paddd xmm4, xmm1
- paddd xmm4, xmm6
- psrld xmm4, 3
+ paddd xmm6, xmm6
+ paddd xmm4, xmm1
+ paddd xmm4, xmm6
+ psrld xmm4, 3
;satd *16
- pslld xmm5, 4
- pslld xmm4, 4
+ pslld xmm5, 4
+ pslld xmm4, 4
;temp satd
- movdqa xmm6, xmm4
- punpcklqdq xmm4, xmm5
- psllq xmm4, 32
- psrlq xmm4, 32
- movdqa [r6+32], xmm4
- punpckhqdq xmm5, xmm6
- psllq xmm5, 32
- psrlq xmm5, 32
- movdqa [r6+48], xmm5
+ movdqa xmm6, xmm4
+ punpcklqdq xmm4, xmm5
+ psllq xmm4, 32
+ psrlq xmm4, 32
+ movdqa [r6+32], xmm4
+ punpckhqdq xmm5, xmm6
+ psllq xmm5, 32
+ psrlq xmm5, 32
+ movdqa [r6+48], xmm5
- pxor xmm4, xmm4 ;V
- pxor xmm5, xmm5 ;H
- pxor xmm6, xmm6 ;DC
- mov r0, 0
- SSE41_ChromaGetX38x4Satd r0, 0
- inc r0
- SSE41_ChromaGetX38x4Satd r0, 0
+ pxor xmm4, xmm4 ;V
+ pxor xmm5, xmm5 ;H
+ pxor xmm6, xmm6 ;DC
+ mov r0, 0
+ SSE41_ChromaGetX38x4Satd r0, 0
+ inc r0
+ SSE41_ChromaGetX38x4Satd r0, 0
%endmacro
%macro SSEReg2MMX 3
- movdq2q %2, %1
- movhlps %1, %1
- movdq2q %3, %1
+ movdq2q %2, %1
+ movhlps %1, %1
+ movdq2q %3, %1
%endmacro
%macro MMXReg2SSE 4
- movq2dq %1, %3
- movq2dq %2, %4
- punpcklqdq %1, %2
+ movq2dq %1, %3
+ movq2dq %2, %4
+ punpcklqdq %1, %2
%endmacro
;to reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
- %assign push_num 0
- LOAD_7_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
+ %assign push_num 0
+ LOAD_7_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
loop_chroma_satdx3:
- SSE41_ChromaGetX38x8Satd
- SSEReg2MMX xmm4, mm0,mm1
- SSEReg2MMX xmm5, mm2,mm3
- SSEReg2MMX xmm6, mm5,mm6
- mov r0, arg8
- mov r2, arg9
+ SSE41_ChromaGetX38x8Satd
+ SSEReg2MMX xmm4, mm0,mm1
+ SSEReg2MMX xmm5, mm2,mm3
+ SSEReg2MMX xmm6, mm5,mm6
+ mov r0, arg8
+ mov r2, arg9
- SSE41_ChromaGetX38x8Satd
+ SSE41_ChromaGetX38x8Satd
- MMXReg2SSE xmm0, xmm3, mm0, mm1
- MMXReg2SSE xmm1, xmm3, mm2, mm3
- MMXReg2SSE xmm2, xmm3, mm5, mm6
+ MMXReg2SSE xmm0, xmm3, mm0, mm1
+ MMXReg2SSE xmm1, xmm3, mm2, mm3
+ MMXReg2SSE xmm2, xmm3, mm5, mm6
- paddw xmm4, xmm0
- paddw xmm5, xmm1
- paddw xmm6, xmm2
+ paddw xmm4, xmm0
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
- MMX_DW_1_2REG xmm0, xmm1
- psrlw xmm4, 1 ;/2
- psrlw xmm5, 1 ;/2
- psrlw xmm6, 1 ;/2
- SSE41_HSum8W xmm4, xmm0, xmm1
- SSE41_HSum8W xmm5, xmm0, xmm1
- SSE41_HSum8W xmm6, xmm0, xmm1
- ; comparing order: DC H V
- movd r3d, xmm6 ;DC
- movd r1d, xmm5 ;H
- movd r0d, xmm4 ;V
+ MMX_DW_1_2REG xmm0, xmm1
+ psrlw xmm4, 1 ;/2
+ psrlw xmm5, 1 ;/2
+ psrlw xmm6, 1 ;/2
+ SSE41_HSum8W xmm4, xmm0, xmm1
+ SSE41_HSum8W xmm5, xmm0, xmm1
+ SSE41_HSum8W xmm6, xmm0, xmm1
+ ; comparing order: DC H V
+ movd r3d, xmm6 ;DC
+ movd r1d, xmm5 ;H
+ movd r0d, xmm4 ;V
- shl r5d, 1
- add r1d, r5d
- add r0d, r5d
- cmp r3d, r1d
- jge near not_dc_8x8
- cmp r3d, r0d
- jge near not_dc_h_8x8
+ shl r5d, 1
+ add r1d, r5d
+ add r0d, r5d
+ cmp r3d, r1d
+ jge near not_dc_8x8
+ cmp r3d, r0d
+ jge near not_dc_h_8x8
- ; for DC mode
- mov dword[r4], 0;I8_PRED_DC
- mov retrd, r3d
- jmp near return_satd_intra_8x8_x3
+ ; for DC mode
+ mov dword[r4], 0;I8_PRED_DC
+ mov retrd, r3d
+ jmp near return_satd_intra_8x8_x3
not_dc_8x8:
- ; for H mode
- cmp r1d, r0d
- jge near not_dc_h_8x8
- mov dword[r4], 1;I8_PRED_H
- mov retrd, r1d
- jmp near return_satd_intra_8x8_x3
+ ; for H mode
+ cmp r1d, r0d
+ jge near not_dc_h_8x8
+ mov dword[r4], 1;I8_PRED_H
+ mov retrd, r1d
+ jmp near return_satd_intra_8x8_x3
not_dc_h_8x8:
- ; for V mode
- mov dword[r4], 2;I8_PRED_V
- mov retrd, r0d
+ ; for V mode
+ mov dword[r4], 2;I8_PRED_V
+ mov retrd, r0d
return_satd_intra_8x8_x3:
- WELSEMMS
- POP_XMM
- LOAD_7_PARA_POP
+ WELSEMMS
+ POP_XMM
+ LOAD_7_PARA_POP
ret
@@ -1040,22 +1040,22 @@
;
;***********************************************************************
%macro SSSE3_Get16BSadHVDC 2
- movd xmm6,%1
- pshufb xmm6,xmm1
- movdqa %1, xmm6
- movdqa xmm0,%2
- psadbw xmm0,xmm7
- paddw xmm4,xmm0
- movdqa xmm0,%2
- psadbw xmm0,xmm5
- paddw xmm2,xmm0
- psadbw xmm6,%2
- paddw xmm3,xmm6
+ movd xmm6,%1
+ pshufb xmm6,xmm1
+ movdqa %1, xmm6
+ movdqa xmm0,%2
+ psadbw xmm0,xmm7
+ paddw xmm4,xmm0
+ movdqa xmm0,%2
+ psadbw xmm0,xmm5
+ paddw xmm2,xmm0
+ psadbw xmm6,%2
+ paddw xmm3,xmm6
%endmacro
%macro WelsAddDCValue 4
- movzx %2, byte %1
- mov %3, %2
- add %4, %2
+ movzx %2, byte %1
+ mov %3, %2
+ add %4, %2
%endmacro
;***********************************************************************
@@ -1064,138 +1064,138 @@
;
;***********************************************************************
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
- %assign push_num 0
- LOAD_7_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
+ %assign push_num 0
+ LOAD_7_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
- push r5
- push r4
- push r3
+ push r5
+ push r4
+ push r3
- sub r0, r1
- movdqa xmm5,[r0]
- pxor xmm0,xmm0
- psadbw xmm0,xmm5
- movhlps xmm1,xmm0
- paddw xmm0,xmm1
- movd r5d, xmm0
+ sub r0, r1
+ movdqa xmm5,[r0]
+ pxor xmm0,xmm0
+ psadbw xmm0,xmm5
+ movhlps xmm1,xmm0
+ paddw xmm0,xmm1
+ movd r5d, xmm0
- add r0,r1
- lea r3,[r1+2*r1] ;ebx r3
- WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d ; esi r4d, eax r5d
- WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
- WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
- WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
- lea r0, [r0+4*r1]
- add r6, 64
- WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
- WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
- WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
- WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
- lea r0, [r0+4*r1]
- add r6, 64
- WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
- WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
- WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
- WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
- lea r0, [r0+4*r1]
- add r6, 64
- WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
- WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
- WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
- WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
- sub r6, 192
- add r5d,10h
- shr r5d,5
- movd xmm7,r5d
- pxor xmm1,xmm1
- pshufb xmm7,xmm1
- pxor xmm4,xmm4
- pxor xmm3,xmm3
- pxor xmm2,xmm2
- ;sad begin
- pop r3
- lea r4, [r3+2*r3] ;esi r4
- SSSE3_Get16BSadHVDC [r6], [r2]
- SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
- SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
- SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
- add r6, 64
- lea r2, [r2+4*r3]
- SSSE3_Get16BSadHVDC [r6], [r2]
- SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
- SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
- SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
- add r6, 64
- lea r2, [r2+4*r3]
- SSSE3_Get16BSadHVDC [r6], [r2]
- SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
- SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
- SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
- add r6, 64
- lea r2, [r2+4*r3]
- SSSE3_Get16BSadHVDC [r6], [r2]
- SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
- SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
- SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+ add r0,r1
+ lea r3,[r1+2*r1] ;ebx r3
+ WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d ; esi r4d, eax r5d
+ WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
+ WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+ WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
+ lea r0, [r0+4*r1]
+ add r6, 64
+ WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
+ WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
+ WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+ WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
+ lea r0, [r0+4*r1]
+ add r6, 64
+ WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
+ WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
+ WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+ WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
+ lea r0, [r0+4*r1]
+ add r6, 64
+ WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
+ WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
+ WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+ WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
+ sub r6, 192
+ add r5d,10h
+ shr r5d,5
+ movd xmm7,r5d
+ pxor xmm1,xmm1
+ pshufb xmm7,xmm1
+ pxor xmm4,xmm4
+ pxor xmm3,xmm3
+ pxor xmm2,xmm2
+ ;sad begin
+ pop r3
+ lea r4, [r3+2*r3] ;esi r4
+ SSSE3_Get16BSadHVDC [r6], [r2]
+ SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+ SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+ SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+ add r6, 64
+ lea r2, [r2+4*r3]
+ SSSE3_Get16BSadHVDC [r6], [r2]
+ SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+ SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+ SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+ add r6, 64
+ lea r2, [r2+4*r3]
+ SSSE3_Get16BSadHVDC [r6], [r2]
+ SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+ SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+ SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+ add r6, 64
+ lea r2, [r2+4*r3]
+ SSSE3_Get16BSadHVDC [r6], [r2]
+ SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+ SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+ SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
- pop r4
- pop r5
- pslldq xmm3,4
- por xmm3,xmm2
- movhlps xmm1,xmm3
- paddw xmm3,xmm1
- movhlps xmm0,xmm4
- paddw xmm4,xmm0
- ; comparing order: DC H V
- movd r1d, xmm4 ;DC ;ebx r1d
- movd r0d, xmm3 ;V ;ecx r0d
- psrldq xmm3, 4
- movd r2d, xmm3 ;H ;esi r2d
+ pop r4
+ pop r5
+ pslldq xmm3,4
+ por xmm3,xmm2
+ movhlps xmm1,xmm3
+ paddw xmm3,xmm1
+ movhlps xmm0,xmm4
+ paddw xmm4,xmm0
+ ; comparing order: DC H V
+ movd r1d, xmm4 ;DC ;ebx r1d
+ movd r0d, xmm3 ;V ;ecx r0d
+ psrldq xmm3, 4
+ movd r2d, xmm3 ;H ;esi r2d
- ;mov eax, [esp+36] ;lamda ;eax r5
- shl r5d, 1
- add r2d, r5d
- add r1d, r5d
- ;mov edx, [esp+32] ;edx r4
- cmp r1d, r2d
- jge near not_dc_16x16_sad
- cmp r1d, r0d
- jge near not_dc_h_16x16_sad
- ; for DC mode
- mov dword[r4], 2;I16_PRED_DC
- mov retrd, r1d
- sub r6, 192
+ ;mov eax, [esp+36] ;lambda ;eax r5
+ shl r5d, 1
+ add r2d, r5d
+ add r1d, r5d
+ ;mov edx, [esp+32] ;edx r4
+ cmp r1d, r2d
+ jge near not_dc_16x16_sad
+ cmp r1d, r0d
+ jge near not_dc_h_16x16_sad
+ ; for DC mode
+ mov dword[r4], 2;I16_PRED_DC
+ mov retrd, r1d
+ sub r6, 192
%assign x 0
%rep 16
- movdqa [r6+16*x], xmm7
+ movdqa [r6+16*x], xmm7
%assign x x+1
%endrep
- jmp near return_sad_intra_16x16_x3
+ jmp near return_sad_intra_16x16_x3
not_dc_16x16_sad:
- ; for H mode
- cmp r2d, r0d
- jge near not_dc_h_16x16_sad
- mov dword[r4], 1;I16_PRED_H
- mov retrd, r2d
- jmp near return_sad_intra_16x16_x3
+ ; for H mode
+ cmp r2d, r0d
+ jge near not_dc_h_16x16_sad
+ mov dword[r4], 1;I16_PRED_H
+ mov retrd, r2d
+ jmp near return_sad_intra_16x16_x3
not_dc_h_16x16_sad:
- ; for V mode
- mov dword[r4], 0;I16_PRED_V
- mov retrd, r0d
- sub r6, 192
+ ; for V mode
+ mov dword[r4], 0;I16_PRED_V
+ mov retrd, r0d
+ sub r6, 192
%assign x 0
%rep 16
- movdqa [r6+16*x], xmm5
+ movdqa [r6+16*x], xmm5
%assign x x+1
%endrep
return_sad_intra_16x16_x3:
- POP_XMM
- LOAD_7_PARA_POP
- ret
+ POP_XMM
+ LOAD_7_PARA_POP
+ ret
;***********************************************************************
;
@@ -1210,63 +1210,63 @@
;SSE4.1
%macro SSE41_GetSatd8x4 0
- movq xmm0, [r0]
- punpcklqdq xmm0, xmm0
- pmaddubsw xmm0, xmm7
- movq xmm1, [r0+r1]
- punpcklqdq xmm1, xmm1
- pmaddubsw xmm1, xmm7
- movq xmm2, [r2]
- punpcklqdq xmm2, xmm2
- pmaddubsw xmm2, xmm7
- movq xmm3, [r2+r3]
- punpcklqdq xmm3, xmm3
- pmaddubsw xmm3, xmm7
- psubsw xmm0, xmm2
- psubsw xmm1, xmm3
- movq xmm2, [r0+2*r1]
- punpcklqdq xmm2, xmm2
- pmaddubsw xmm2, xmm7
- movq xmm3, [r0+r4]
- punpcklqdq xmm3, xmm3
- pmaddubsw xmm3, xmm7
- movq xmm4, [r2+2*r3]
- punpcklqdq xmm4, xmm4
- pmaddubsw xmm4, xmm7
- movq xmm5, [r2+r5]
- punpcklqdq xmm5, xmm5
- pmaddubsw xmm5, xmm7
- psubsw xmm2, xmm4
- psubsw xmm3, xmm5
- SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
- pabsw xmm0, xmm0
- pabsw xmm2, xmm2
- pabsw xmm1, xmm1
- pabsw xmm3, xmm3
- movdqa xmm4, xmm3
- pblendw xmm3, xmm1, 0xAA
- pslld xmm1, 16
- psrld xmm4, 16
- por xmm1, xmm4
- pmaxuw xmm1, xmm3
- paddw xmm6, xmm1
- movdqa xmm4, xmm0
- pblendw xmm0, xmm2, 0xAA
- pslld xmm2, 16
- psrld xmm4, 16
- por xmm2, xmm4
- pmaxuw xmm0, xmm2
- paddw xmm6, xmm0
+ movq xmm0, [r0]
+ punpcklqdq xmm0, xmm0
+ pmaddubsw xmm0, xmm7
+ movq xmm1, [r0+r1]
+ punpcklqdq xmm1, xmm1
+ pmaddubsw xmm1, xmm7
+ movq xmm2, [r2]
+ punpcklqdq xmm2, xmm2
+ pmaddubsw xmm2, xmm7
+ movq xmm3, [r2+r3]
+ punpcklqdq xmm3, xmm3
+ pmaddubsw xmm3, xmm7
+ psubsw xmm0, xmm2
+ psubsw xmm1, xmm3
+ movq xmm2, [r0+2*r1]
+ punpcklqdq xmm2, xmm2
+ pmaddubsw xmm2, xmm7
+ movq xmm3, [r0+r4]
+ punpcklqdq xmm3, xmm3
+ pmaddubsw xmm3, xmm7
+ movq xmm4, [r2+2*r3]
+ punpcklqdq xmm4, xmm4
+ pmaddubsw xmm4, xmm7
+ movq xmm5, [r2+r5]
+ punpcklqdq xmm5, xmm5
+ pmaddubsw xmm5, xmm7
+ psubsw xmm2, xmm4
+ psubsw xmm3, xmm5
+ SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
+ pabsw xmm0, xmm0
+ pabsw xmm2, xmm2
+ pabsw xmm1, xmm1
+ pabsw xmm3, xmm3
+ movdqa xmm4, xmm3
+ pblendw xmm3, xmm1, 0xAA
+ pslld xmm1, 16
+ psrld xmm4, 16
+ por xmm1, xmm4
+ pmaxuw xmm1, xmm3
+ paddw xmm6, xmm1
+ movdqa xmm4, xmm0
+ pblendw xmm0, xmm2, 0xAA
+ pslld xmm2, 16
+ psrld xmm4, 16
+ por xmm2, xmm4
+ pmaxuw xmm0, xmm2
+ paddw xmm6, xmm0
%endmacro
%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
- MMX_DW_1_2REG %3, %4
- pmaddwd %2, %3
- movhlps %4, %2
- paddd %2, %4
- pshuflw %4, %2,0Eh
- paddd %2, %4
- movd %1, %2
+ MMX_DW_1_2REG %3, %4
+ pmaddwd %2, %3
+ movhlps %4, %2
+ paddd %2, %4
+ pshuflw %4, %2,0Eh
+ paddd %2, %4
+ movd %1, %2
%endmacro
;***********************************************************************
;
@@ -1274,53 +1274,53 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse41
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movdqa xmm4,[HSwapSumSubDB1]
- movd xmm2,[r2]
- movd xmm5,[r2+r3]
- shufps xmm2,xmm5,0
- movd xmm3,[r2+r3*2]
- lea r2, [r3*2+r2]
- movd xmm5,[r2+r3]
- shufps xmm3,xmm5,0
- movd xmm0,[r0]
- movd xmm5,[r0+r1]
- shufps xmm0,xmm5,0
- movd xmm1,[r0+r1*2]
- lea r0, [r1*2+r0]
- movd xmm5,[r0+r1]
- shufps xmm1,xmm5,0
- pmaddubsw xmm0,xmm4
- pmaddubsw xmm1,xmm4
- pmaddubsw xmm2,xmm4
- pmaddubsw xmm3,xmm4
- psubw xmm0,xmm2
- psubw xmm1,xmm3
- movdqa xmm2,xmm0
- paddw xmm0,xmm1
- psubw xmm1,xmm2
- movdqa xmm2,xmm0
- punpcklqdq xmm0,xmm1
- punpckhqdq xmm2,xmm1
- movdqa xmm1,xmm0
- paddw xmm0,xmm2
- psubw xmm2,xmm1
- movdqa xmm1,xmm0
- pblendw xmm0,xmm2,0AAh
- pslld xmm2,16
- psrld xmm1,16
- por xmm2,xmm1
- pabsw xmm0,xmm0
- pabsw xmm2,xmm2
- pmaxsw xmm0,xmm2
- SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
- POP_XMM
- LOAD_4_PARA_POP
- ret
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movdqa xmm4,[HSwapSumSubDB1]
+ movd xmm2,[r2]
+ movd xmm5,[r2+r3]
+ shufps xmm2,xmm5,0
+ movd xmm3,[r2+r3*2]
+ lea r2, [r3*2+r2]
+ movd xmm5,[r2+r3]
+ shufps xmm3,xmm5,0
+ movd xmm0,[r0]
+ movd xmm5,[r0+r1]
+ shufps xmm0,xmm5,0
+ movd xmm1,[r0+r1*2]
+ lea r0, [r1*2+r0]
+ movd xmm5,[r0+r1]
+ shufps xmm1,xmm5,0
+ pmaddubsw xmm0,xmm4
+ pmaddubsw xmm1,xmm4
+ pmaddubsw xmm2,xmm4
+ pmaddubsw xmm3,xmm4
+ psubw xmm0,xmm2
+ psubw xmm1,xmm3
+ movdqa xmm2,xmm0
+ paddw xmm0,xmm1
+ psubw xmm1,xmm2
+ movdqa xmm2,xmm0
+ punpcklqdq xmm0,xmm1
+ punpckhqdq xmm2,xmm1
+ movdqa xmm1,xmm0
+ paddw xmm0,xmm2
+ psubw xmm2,xmm1
+ movdqa xmm1,xmm0
+ pblendw xmm0,xmm2,0AAh
+ pslld xmm2,16
+ psrld xmm1,16
+ por xmm2,xmm1
+ pabsw xmm0,xmm0
+ pabsw xmm2,xmm2
+ pmaxsw xmm0,xmm2
+ SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
;***********************************************************************
;
@@ -1329,30 +1329,30 @@
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_sse41
%ifdef X86_32
- push r4
- push r5
+ push r4
+ push r5
%endif
- %assign push_num 2
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2]
- lea r5, [r3+r3*2]
- pxor xmm6, xmm6
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE41_GetSatd8x4
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- POP_XMM
- LOAD_4_PARA_POP
+ %assign push_num 2
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2]
+ lea r5, [r3+r3*2]
+ pxor xmm6, xmm6
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
%ifdef X86_32
- pop r5
- pop r4
+ pop r5
+ pop r4
%endif
- ret
+ ret
;***********************************************************************
;
@@ -1361,36 +1361,36 @@
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_sse41
%ifdef X86_32
- push r4
- push r5
- push r6
+ push r4
+ push r5
+ push r6
%endif
- %assign push_num 3
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2]
- lea r5, [r3+r3*2]
- pxor xmm6, xmm6
- mov r6, 0
+ %assign push_num 3
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2]
+ lea r5, [r3+r3*2]
+ pxor xmm6, xmm6
+ mov r6, 0
loop_get_satd_8x16:
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- inc r6
- cmp r6, 4
- jl loop_get_satd_8x16
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- POP_XMM
- LOAD_4_PARA_POP
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ inc r6
+ cmp r6, 4
+ jl loop_get_satd_8x16
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
%ifdef X86_32
- pop r6
- pop r5
- pop r4
+ pop r6
+ pop r5
+ pop r4
%endif
- ret
+ ret
;***********************************************************************
;
@@ -1399,42 +1399,42 @@
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse41
%ifdef X86_32
- push r4
- push r5
+ push r4
+ push r5
%endif
- %assign push_num 2
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- push r0
- push r2
+ %assign push_num 2
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ push r0
+ push r2
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2]
- lea r5, [r3+r3*2]
- pxor xmm6, xmm6
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE41_GetSatd8x4
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2]
+ lea r5, [r3+r3*2]
+ pxor xmm6, xmm6
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
- pop r2
- pop r0
- add r0, 8
- add r2, 8
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE41_GetSatd8x4
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- POP_XMM
- LOAD_4_PARA_POP
+ pop r2
+ pop r0
+ add r0, 8
+ add r2, 8
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
%ifdef X86_32
- pop r5
- pop r4
+ pop r5
+ pop r4
%endif
- ret
+ ret
;***********************************************************************
;
@@ -1444,53 +1444,53 @@
WELS_EXTERN WelsSampleSatd16x16_sse41
%ifdef X86_32
- push r4
- push r5
- push r6
+ push r4
+ push r5
+ push r6
%endif
- %assign push_num 3
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
+ %assign push_num 3
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
- push r0
- push r2
+ push r0
+ push r2
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2]
- lea r5, [r3+r3*2]
- pxor xmm6, xmm6
- mov r6, 0
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2]
+ lea r5, [r3+r3*2]
+ pxor xmm6, xmm6
+ mov r6, 0
loop_get_satd_16x16_left:
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- inc r6
- cmp r6, 4
- jl loop_get_satd_16x16_left
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ inc r6
+ cmp r6, 4
+ jl loop_get_satd_16x16_left
- pop r2
- pop r0
- add r0, 8
- add r2, 8
- mov r6, 0
+ pop r2
+ pop r0
+ add r0, 8
+ add r2, 8
+ mov r6, 0
loop_get_satd_16x16_right:
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- inc r6
- cmp r6, 4
- jl loop_get_satd_16x16_right
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- POP_XMM
- LOAD_4_PARA_POP
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ inc r6
+ cmp r6, 4
+ jl loop_get_satd_16x16_right
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
%ifdef X86_32
- pop r6
- pop r5
- pop r4
+ pop r6
+ pop r5
+ pop r4
%endif
- ret
+ ret
;***********************************************************************
;
@@ -1505,55 +1505,55 @@
;***********************************************************************
%macro SSE2_GetSad2x16 0
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqu xmm1, [r2]
- MOVDQ xmm2, [r0];[eax] must aligned 16
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
- movdqu xmm1, [r2+r3]
- MOVDQ xmm2, [r0+r1]
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqu xmm1, [r2]
+ MOVDQ xmm2, [r0] ;[r0] must be 16-byte aligned
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
+ movdqu xmm1, [r2+r3]
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
%endmacro
%macro SSE2_GetSad4x16 0
- movdqu xmm0, [r2]
- MOVDQ xmm2, [r0]
- psadbw xmm0, xmm2
- paddw xmm7, xmm0
- movdqu xmm1, [r2+r3]
- MOVDQ xmm2, [r0+r1]
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
- movdqu xmm1, [r2+2*r3]
- MOVDQ xmm2, [r0+2*r1];[eax] must aligned 16
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
- movdqu xmm1, [r2+r5]
- MOVDQ xmm2, [r0+r4]
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
+ movdqu xmm0, [r2]
+ MOVDQ xmm2, [r0]
+ psadbw xmm0, xmm2
+ paddw xmm7, xmm0
+ movdqu xmm1, [r2+r3]
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
+ movdqu xmm1, [r2+2*r3]
+ MOVDQ xmm2, [r0+2*r1];[eax] must aligned 16
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
+ movdqu xmm1, [r2+r5]
+ MOVDQ xmm2, [r0+r4]
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
%endmacro
%macro SSE2_GetSad8x4 0
- movq xmm0, [r0]
- movq xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movhps xmm0, [r0]
- movhps xmm1, [r0+r1]
+ movq xmm0, [r0]
+ movq xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movhps xmm0, [r0]
+ movhps xmm1, [r0+r1]
- movq xmm2, [r2]
- movq xmm3, [r2+r3]
- lea r2, [r2+2*r3]
- movhps xmm2, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm2
- psadbw xmm1, xmm3
- paddw xmm6, xmm0
- paddw xmm6, xmm1
+ movq xmm2, [r2]
+ movq xmm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movhps xmm2, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm2
+ psadbw xmm1, xmm3
+ paddw xmm6, xmm0
+ paddw xmm6, xmm1
%endmacro
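
Reviewer note: all of the SSE2_GetSad* macros above reduce to psadbw, which sums absolute byte differences within each 8-byte half of an xmm register; the variants only differ in how many rows they cover per invocation and which accumulator they feed. A plain scalar equivalent for an arbitrary block size might look like the sketch below (name and signature are illustrative, not taken from this diff).

#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of what one WelsSampleSadNxM kernel computes. */
static int32_t SampleSad_c (const uint8_t* pSample, int32_t iSampleStride,
                            const uint8_t* pRef, int32_t iRefStride,
                            int32_t iWidth, int32_t iHeight) {
  int32_t iSad = 0;
  for (int32_t y = 0; y < iHeight; y++)
    for (int32_t x = 0; x < iWidth; x++)
      iSad += abs (pSample[y * iSampleStride + x] - pRef[y * iRefStride + x]);
  return iSad;
}

In the 16-wide macros the reference is read with movdqu (unaligned) while the source uses MOVDQ, matching the ";[eax] must aligned 16" comments.
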
;***********************************************************************
@@ -1565,39 +1565,39 @@
;***********************************************************************
WELS_EXTERN WelsSampleSad16x16_sse2
%ifdef X86_32
- push r4
- push r5
+ push r4
+ push r5
%endif
- %assign push_num 2
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- lea r4, [3*r1]
- lea r5, [3*r3]
+ %assign push_num 2
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ lea r4, [3*r1]
+ lea r5, [3*r3]
- pxor xmm7, xmm7
- SSE2_GetSad4x16
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE2_GetSad4x16
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE2_GetSad4x16
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE2_GetSad4x16
- movhlps xmm0, xmm7
- paddw xmm0, xmm7
- movd retrd, xmm0
- POP_XMM
- LOAD_4_PARA_POP
+ pxor xmm7, xmm7
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ movhlps xmm0, xmm7
+ paddw xmm0, xmm7
+ movd retrd, xmm0
+ POP_XMM
+ LOAD_4_PARA_POP
%ifdef X86_32
- pop r5
- pop r4
+ pop r5
+ pop r4
%endif
- ret
+ ret
;***********************************************************************
;
@@ -1607,55 +1607,55 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSad16x8_sse2
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movdqu xmm0, [r2]
- MOVDQ xmm2, [r0]
- psadbw xmm0, xmm2
- movdqu xmm1, [r2+r3]
- MOVDQ xmm2, [r0+r1]
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movdqu xmm0, [r2]
+ MOVDQ xmm2, [r0]
+ psadbw xmm0, xmm2
+ movdqu xmm1, [r2+r3]
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
- SSE2_GetSad2x16
- SSE2_GetSad2x16
- SSE2_GetSad2x16
+ SSE2_GetSad2x16
+ SSE2_GetSad2x16
+ SSE2_GetSad2x16
- movhlps xmm1, xmm0
- paddw xmm0, xmm1
- movd retrd, xmm0
- LOAD_4_PARA_POP
- ret
+ movhlps xmm1, xmm0
+ paddw xmm0, xmm1
+ movd retrd, xmm0
+ LOAD_4_PARA_POP
+ ret
WELS_EXTERN WelsSampleSad8x16_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 7
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 7
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
pxor xmm6, xmm6
- SSE2_GetSad8x4
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
SSE2_GetSad8x4
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSad8x4
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
SSE2_GetSad8x4
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
movhlps xmm0, xmm6
- paddw xmm0, xmm6
- movd retrd, xmm0
- POP_XMM
- LOAD_4_PARA_POP
- ret
+ paddw xmm0, xmm6
+ movd retrd, xmm0
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
@@ -1664,22 +1664,22 @@
%endmacro
WELS_EXTERN WelsSampleSad8x8_sse21
- %assign push_num 0
- mov r2, arg3
- push r2
- CACHE_SPLIT_CHECK r2, 8, 64
- jle near .pixel_sad_8x8_nsplit
- pop r2
+ %assign push_num 0
+ mov r2, arg3
+ push r2
+ CACHE_SPLIT_CHECK r2, 8, 64
+ jle near .pixel_sad_8x8_nsplit
+ pop r2
%ifdef X86_32
- push r3
- push r4
- push r5
+ push r3
+ push r4
+ push r5
%endif
- %assign push_num 3
- PUSH_XMM 8
- mov r0, arg1
- mov r1, arg2
- SIGN_EXTENSION r1, r1d
+ %assign push_num 3
+ PUSH_XMM 8
+ mov r0, arg1
+ mov r1, arg2
+ SIGN_EXTENSION r1, r1d
pxor xmm7, xmm7
;ecx r2, edx r4, edi r5
@@ -1694,109 +1694,109 @@
shl r4, 3
movd xmm5, r5d
movd xmm6, r4d
- mov r5, 8
- add r5, r2
+ mov r5, 8
+ add r5, r2
mov r3, arg4
- SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r3, r3d
movq xmm0, [r0]
- movhps xmm0, [r0+r1]
+ movhps xmm0, [r0+r1]
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
movq xmm0, [r0]
- movhps xmm0, [r0+r1]
+ movhps xmm0, [r0+r1]
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
movq xmm0, [r0]
- movhps xmm0, [r0+r1]
+ movhps xmm0, [r0+r1]
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
movq xmm0, [r0]
- movhps xmm0, [r0+r1]
+ movhps xmm0, [r0+r1]
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
movhlps xmm0, xmm7
- paddw xmm0, xmm7
- movd retrd, xmm0
- POP_XMM
+ paddw xmm0, xmm7
+ movd retrd, xmm0
+ POP_XMM
%ifdef X86_32
- pop r5
- pop r4
- pop r3
+ pop r5
+ pop r4
+ pop r3
%endif
- jmp .return
+ jmp .return
.pixel_sad_8x8_nsplit:
- pop r2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 7
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm6, xmm6
- SSE2_GetSad8x4
+ pop r2
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 7
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm6, xmm6
+ SSE2_GetSad8x4
lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
+ lea r2, [r2+2*r3]
SSE2_GetSad8x4
movhlps xmm0, xmm6
- paddw xmm0, xmm6
- movd retrd, xmm0
- POP_XMM
- LOAD_4_PARA_POP
+ paddw xmm0, xmm6
+ movd retrd, xmm0
+ POP_XMM
+ LOAD_4_PARA_POP
.return:
- ret
+ ret
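
Reviewer note: WelsSampleSad8x8_sse21 first runs CACHE_SPLIT_CHECK on the reference address; when the 8-byte reads would straddle a 64-byte cache line it takes the split path above, loading the two neighbouring 8-byte groups through r2 and r5 and stitching them with psrlq/psllq/por before psadbw. A little-endian scalar sketch of that stitch follows; the helper name and the alignment handling are assumptions, since part of the shift-count setup is outside this hunk.

#include <stdint.h>
#include <string.h>

/* Sketch of the cache-line-split load: read the two aligned 8-byte halves
 * around an unaligned address and recombine them with shifts (little-endian). */
static uint64_t LoadSplit8 (const uint8_t* p) {
  uint32_t uiOffset = (uint32_t) ((uintptr_t) p & 7);
  const uint8_t* pAligned = (const uint8_t*) ((uintptr_t) p & ~(uintptr_t) 7);
  uint64_t kiLow, kiHigh;
  memcpy (&kiLow, pAligned, 8);               /* low aligned half  */
  if (uiOffset == 0)
    return kiLow;                             /* no split needed   */
  memcpy (&kiHigh, pAligned + 8, 8);          /* high aligned half */
  return (kiLow >> (8 * uiOffset)) | (kiHigh << (8 * (8 - uiOffset)));
}
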
;***********************************************************************
@@ -1814,624 +1814,624 @@
%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
- psadbw %1, %4
- paddw xmm5, %1
- psadbw %4, %3
- paddw xmm4, %4
- movdqu %4, [%5-1]
- psadbw %4, %2
- paddw xmm6, %4
- movdqu %4, [%5+1]
- psadbw %4, %2
- paddw xmm7, %4
+ psadbw %1, %4
+ paddw xmm5, %1
+ psadbw %4, %3
+ paddw xmm4, %4
+ movdqu %4, [%5-1]
+ psadbw %4, %2
+ paddw xmm6, %4
+ movdqu %4, [%5+1]
+ psadbw %4, %2
+ paddw xmm7, %4
%endmacro
WELS_EXTERN WelsSampleSadFour16x16_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movdqa xmm0, [r0]
- sub r2, r3
- movdqu xmm3, [r2]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movdqa xmm0, [r0]
+ sub r2, r3
+ movdqu xmm3, [r2]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- psadbw xmm3, xmm1
- paddw xmm4, xmm3
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ psadbw xmm3, xmm1
+ paddw xmm4, xmm3
- movdqu xmm2, [r2+r3-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
+ movdqu xmm2, [r2+r3-1]
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
- movdqu xmm3, [r2+r3+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ movdqu xmm3, [r2+r3+1]
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
- movdqa xmm2, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm0, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
- movdqa xmm2, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm0, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r2, [r2+2*r3]
- movdqu xmm3, [r2]
- psadbw xmm2, xmm3
- paddw xmm5, xmm2
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r2, [r2+2*r3]
+ movdqu xmm3, [r2]
+ psadbw xmm2, xmm3
+ paddw xmm5, xmm2
- movdqu xmm2, [r2-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
+ movdqu xmm2, [r2-1]
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
- movdqu xmm3, [r2+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ movdqu xmm3, [r2+1]
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movdqu xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movdqu xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6
+ movdqa [r4],xmm4
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
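
Reviewer note: the WelsSampleSadFour* routines (this one and the 16x8/8x16/8x8/4x4 variants that follow) produce four SADs in one pass, comparing the current block against the reference shifted by -stride, +stride, -1 and +1, as the xmm4..xmm7 accumulator comments state, and store the four results contiguously through r4. A scalar sketch of that contract is below; the output order is read from the accumulator comments and the final punpck/store sequence, and the name/signature are illustrative.

#include <stdint.h>
#include <stdlib.h>

/* Sketch of the SampleSadFour contract: pSad[0..3] = SAD against the
 * reference moved up, down, left and right by one sample. */
static void SampleSadFour_c (const uint8_t* pSample, int32_t iSampleStride,
                             const uint8_t* pRef, int32_t iRefStride,
                             int32_t iWidth, int32_t iHeight, int32_t* pSad) {
  const uint8_t* pRefCand[4];
  pRefCand[0] = pRef - iRefStride;            /* up    */
  pRefCand[1] = pRef + iRefStride;            /* down  */
  pRefCand[2] = pRef - 1;                     /* left  */
  pRefCand[3] = pRef + 1;                     /* right */
  for (int32_t k = 0; k < 4; k++) {
    int32_t iSad = 0;
    for (int32_t y = 0; y < iHeight; y++)
      for (int32_t x = 0; x < iWidth; x++)
        iSad += abs (pSample[y * iSampleStride + x] - pRefCand[k][y * iRefStride + x]);
    pSad[k] = iSad;
  }
}
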
WELS_EXTERN WelsSampleSadFour16x8_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movdqa xmm0, [r0]
- sub r2, r3
- movdqu xmm3, [r2]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movdqa xmm0, [r0]
+ sub r2, r3
+ movdqu xmm3, [r2]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- psadbw xmm3, xmm1
- paddw xmm4, xmm3
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ psadbw xmm3, xmm1
+ paddw xmm4, xmm3
- movdqu xmm2, [r2+r3-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
+ movdqu xmm2, [r2+r3-1]
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
- movdqu xmm3, [r2+r3+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ movdqu xmm3, [r2+r3+1]
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
- movdqa xmm2, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm0, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
- lea r2, [r2+2*r3]
- movdqu xmm3, [r2]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r2, [r2+2*r3]
+ movdqu xmm3, [r2]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movdqu xmm0, [r2-1]
- psadbw xmm0, xmm1
- paddw xmm6, xmm0
+ movdqu xmm0, [r2-1]
+ psadbw xmm0, xmm1
+ paddw xmm6, xmm0
- movdqu xmm3, [r2+1]
- psadbw xmm3, xmm1
- paddw xmm7, xmm3
+ movdqu xmm3, [r2+1]
+ psadbw xmm3, xmm1
+ paddw xmm7, xmm3
- movdqu xmm3, [r2+r3]
- psadbw xmm1, xmm3
- paddw xmm5, xmm1
+ movdqu xmm3, [r2+r3]
+ psadbw xmm1, xmm3
+ paddw xmm5, xmm1
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6
+ movdqa [r4],xmm4
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
WELS_EXTERN WelsSampleSadFour8x16_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- sub r2, r3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ sub r2, r3
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6
+ movdqa [r4],xmm4
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
WELS_EXTERN WelsSampleSadFour8x8_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- sub r2, r3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ sub r2, r3
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6
+ movdqa [r4],xmm4
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
WELS_EXTERN WelsSampleSadFour4x4_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movd xmm0, [r0]
- movd xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movd xmm2, [r0]
- movd xmm3, [r0+r1]
- punpckldq xmm0, xmm1
- punpckldq xmm2, xmm3
- punpcklqdq xmm0, xmm2
- sub r2, r3
- movd xmm1, [r2]
- movd xmm2, [r2+r3]
- punpckldq xmm1, xmm2
- movd xmm2, [r2+r3-1]
- movd xmm3, [r2+r3+1]
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movd xmm0, [r0]
+ movd xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movd xmm2, [r0]
+ movd xmm3, [r0+r1]
+ punpckldq xmm0, xmm1
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
+ sub r2, r3
+ movd xmm1, [r2]
+ movd xmm2, [r2+r3]
+ punpckldq xmm1, xmm2
+ movd xmm2, [r2+r3-1]
+ movd xmm3, [r2+r3+1]
- lea r2, [r2+2*r3]
+ lea r2, [r2+2*r3]
- movd xmm4, [r2]
- movd xmm5, [r2-1]
- punpckldq xmm2, xmm5
- movd xmm5, [r2+1]
- punpckldq xmm3, xmm5
+ movd xmm4, [r2]
+ movd xmm5, [r2-1]
+ punpckldq xmm2, xmm5
+ movd xmm5, [r2+1]
+ punpckldq xmm3, xmm5
- movd xmm5, [r2+r3]
- punpckldq xmm4, xmm5
+ movd xmm5, [r2+r3]
+ punpckldq xmm4, xmm5
- punpcklqdq xmm1, xmm4 ;-L
+ punpcklqdq xmm1, xmm4 ;-L
- movd xmm5, [r2+r3-1]
- movd xmm6, [r2+r3+1]
+ movd xmm5, [r2+r3-1]
+ movd xmm6, [r2+r3+1]
- lea r2, [r2+2*r3]
- movd xmm7, [r2-1]
- punpckldq xmm5, xmm7
- punpcklqdq xmm2, xmm5 ;-1
- movd xmm7, [r2+1]
- punpckldq xmm6, xmm7
- punpcklqdq xmm3, xmm6 ;+1
- movd xmm6, [r2]
- movd xmm7, [r2+r3]
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6 ;+L
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
- psadbw xmm4, xmm0
+ lea r2, [r2+2*r3]
+ movd xmm7, [r2-1]
+ punpckldq xmm5, xmm7
+ punpcklqdq xmm2, xmm5 ;-1
+ movd xmm7, [r2+1]
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm3, xmm6 ;+1
+ movd xmm6, [r2]
+ movd xmm7, [r2+r3]
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6 ;+L
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
- movhlps xmm0, xmm1
- paddw xmm1, xmm0
- movhlps xmm0, xmm2
- paddw xmm2, xmm0
- movhlps xmm0, xmm3
- paddw xmm3, xmm0
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- punpckldq xmm1, xmm4
- punpckldq xmm2, xmm3
- punpcklqdq xmm1, xmm2
- movdqa [r4],xmm1
- POP_XMM
- LOAD_5_PARA_POP
- ret
+ movhlps xmm0, xmm1
+ paddw xmm1, xmm0
+ movhlps xmm0, xmm2
+ paddw xmm2, xmm0
+ movhlps xmm0, xmm3
+ paddw xmm3, xmm0
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ punpckldq xmm1, xmm4
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm1, xmm2
+ movdqa [r4],xmm1
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
;***********************************************************************
;
@@ -2444,33 +2444,33 @@
;***********************************************************************
WELS_EXTERN WelsSampleSad4x4_mmx
%assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movd mm0, [r0]
- movd mm1, [r0+r1]
- punpckldq mm0, mm1
+ LOAD_4_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movd mm0, [r0]
+ movd mm1, [r0+r1]
+ punpckldq mm0, mm1
- movd mm3, [r2]
- movd mm4, [r2+r3]
- punpckldq mm3, mm4
- psadbw mm0, mm3
+ movd mm3, [r2]
+ movd mm4, [r2+r3]
+ punpckldq mm3, mm4
+ psadbw mm0, mm3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
- movd mm1, [r0]
- movd mm2, [r0+r1]
- punpckldq mm1, mm2
+ movd mm1, [r0]
+ movd mm2, [r0+r1]
+ punpckldq mm1, mm2
- movd mm3, [r2]
- movd mm4, [r2+r3]
- punpckldq mm3, mm4
- psadbw mm1, mm3
- paddw mm0, mm1
+ movd mm3, [r2]
+ movd mm4, [r2+r3]
+ punpckldq mm3, mm4
+ psadbw mm1, mm3
+ paddw mm0, mm1
movd retrd, mm0
- WELSEMMS
+ WELSEMMS
LOAD_4_PARA_POP
ret
--- a/codec/common/x86/vaa.asm
+++ b/codec/common/x86/vaa.asm
@@ -29,16 +29,16 @@
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
-;* vaa.asm
+;* vaa.asm
;*
-;* Abstract
+;* Abstract
;* sse2 for pVaa routines
;*
;* History
-;* 04/14/2010 Created
-;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
-;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
-;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
+;* 04/14/2010 Created
+;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
+;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
+;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
;*
;*************************************************************************/
%include "asm_inc.asm"
@@ -49,87 +49,87 @@
;***********************************************************************
; by comparing it outperforms than phaddw(SSSE3) sets
-%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
- ; @sum_8x2 begin
- pshufd %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 0B1h ; 10110001 B
- paddw %1, %2
- ; end of @sum_8x2
-%endmacro ; END of SUM_WORD_8x2_SSE2
+%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
+ ; @sum_8x2 begin
+ pshufd %2, %1, 04Eh ; 01001110 B
+ paddw %1, %2
+ pshuflw %2, %1, 04Eh ; 01001110 B
+ paddw %1, %2
+ pshuflw %2, %1, 0B1h ; 10110001 B
+ paddw %1, %2
+ ; end of @sum_8x2
+%endmacro ; END of SUM_WORD_8x2_SSE2
%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
- movdqa %1, [r0 ] ; line 0
- movdqa %2, [r0+r1] ; line 1
- movdqa %3, %1
- punpcklbw %1, xmm7
- punpckhbw %3, xmm7
- movdqa %4, %2
- punpcklbw %4, xmm7
- punpckhbw %2, xmm7
- paddw %1, %4
- paddw %2, %3
- movdqa %3, [r0+r2] ; line 2
- movdqa %4, [r0+r3] ; line 3
- movdqa %5, %3
- punpcklbw %3, xmm7
- punpckhbw %5, xmm7
- movdqa %6, %4
- punpcklbw %6, xmm7
- punpckhbw %4, xmm7
- paddw %3, %6
- paddw %4, %5
- paddw %1, %3 ; block 0, 1
- paddw %2, %4 ; block 2, 3
- pshufd %3, %1, 0B1h
- pshufd %4, %2, 0B1h
- paddw %1, %3
- paddw %2, %4
- movdqa %3, %1
- movdqa %4, %2
- pshuflw %5, %1, 0B1h
- pshufhw %6, %3, 0B1h
- paddw %1, %5
- paddw %3, %6
- pshuflw %5, %2, 0B1h
- pshufhw %6, %4, 0B1h
- paddw %2, %5
- paddw %4, %6
- punpcklwd %1, %2
- punpckhwd %3, %4
- punpcklwd %1, %3
- psraw %1, $04
+ movdqa %1, [r0 ] ; line 0
+ movdqa %2, [r0+r1] ; line 1
+ movdqa %3, %1
+ punpcklbw %1, xmm7
+ punpckhbw %3, xmm7
+ movdqa %4, %2
+ punpcklbw %4, xmm7
+ punpckhbw %2, xmm7
+ paddw %1, %4
+ paddw %2, %3
+ movdqa %3, [r0+r2] ; line 2
+ movdqa %4, [r0+r3] ; line 3
+ movdqa %5, %3
+ punpcklbw %3, xmm7
+ punpckhbw %5, xmm7
+ movdqa %6, %4
+ punpcklbw %6, xmm7
+ punpckhbw %4, xmm7
+ paddw %3, %6
+ paddw %4, %5
+ paddw %1, %3 ; block 0, 1
+ paddw %2, %4 ; block 2, 3
+ pshufd %3, %1, 0B1h
+ pshufd %4, %2, 0B1h
+ paddw %1, %3
+ paddw %2, %4
+ movdqa %3, %1
+ movdqa %4, %2
+ pshuflw %5, %1, 0B1h
+ pshufhw %6, %3, 0B1h
+ paddw %1, %5
+ paddw %3, %6
+ pshuflw %5, %2, 0B1h
+ pshufhw %6, %4, 0B1h
+ paddw %2, %5
+ paddw %4, %6
+ punpcklwd %1, %2
+ punpckhwd %3, %4
+ punpcklwd %1, %3
+ psraw %1, $04
%endmacro
%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
- movdqa %1, [r0 ] ; line 0
- movdqa %2, [r0+r1] ; line 1
- movdqa %3, %1
- punpcklbw %1, xmm7
- punpckhbw %3, xmm7
- movdqa %4, %2
- punpcklbw %4, xmm7
- punpckhbw %2, xmm7
- paddw %1, %4
- paddw %2, %3
- movdqa %3, [r0+r2] ; line 2
- movdqa %4, [r0+r3] ; line 3
- movdqa %5, %3
- punpcklbw %3, xmm7
- punpckhbw %5, xmm7
- movdqa %6, %4
- punpcklbw %6, xmm7
- punpckhbw %4, xmm7
- paddw %3, %6
- paddw %4, %5
- paddw %1, %3 ; block 0, 1
- paddw %2, %4 ; block 2, 3
- phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
- phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
- psraw %1, $04
+ movdqa %1, [r0 ] ; line 0
+ movdqa %2, [r0+r1] ; line 1
+ movdqa %3, %1
+ punpcklbw %1, xmm7
+ punpckhbw %3, xmm7
+ movdqa %4, %2
+ punpcklbw %4, xmm7
+ punpckhbw %2, xmm7
+ paddw %1, %4
+ paddw %2, %3
+ movdqa %3, [r0+r2] ; line 2
+ movdqa %4, [r0+r3] ; line 3
+ movdqa %5, %3
+ punpcklbw %3, xmm7
+ punpckhbw %5, xmm7
+ movdqa %6, %4
+ punpcklbw %6, xmm7
+ punpckhbw %4, xmm7
+ paddw %3, %6
+ paddw %4, %5
+ paddw %1, %3 ; block 0, 1
+ paddw %2, %4 ; block 2, 3
+ phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
+ phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
+ psraw %1, $04
%endmacro
@@ -143,7 +143,7 @@
; , 6/7/2010
;***********************************************************************
-; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize );
+; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize );
;***********************************************************************
WELS_EXTERN AnalysisVaaInfoIntra_sse2
@@ -174,71 +174,71 @@
mov r4,r2
sal r4,$01 ;r4 = 4*iLineSize
- pxor xmm7, xmm7
+ pxor xmm7, xmm7
- ; loops
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [r7], xmm0
+ ; loops
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7], xmm0
- lea r0, [r0+r4]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [r7+8], xmm0
+ lea r0, [r0+r4]
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7+8], xmm0
- lea r0, [r0+r4]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [r7+16], xmm0
+ lea r0, [r0+r4]
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7+16], xmm0
- lea r0, [r0+r4]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [r7+24], xmm0
+ lea r0, [r0+r4]
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7+24], xmm0
- movdqa xmm0, [r7] ; block 0~7
- movdqa xmm1, [r7+16] ; block 8~15
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- SUM_WORD_8x2_SSE2 xmm0, xmm3
+ movdqa xmm0, [r7] ; block 0~7
+ movdqa xmm1, [r7+16] ; block 8~15
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ SUM_WORD_8x2_SSE2 xmm0, xmm3
- pmullw xmm1, xmm1
- pmullw xmm2, xmm2
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
- punpcklwd xmm1, xmm7
- punpckhwd xmm3, xmm7
- punpcklwd xmm2, xmm7
- punpckhwd xmm4, xmm7
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm1, xmm3
- pshufd xmm2, xmm1, 01Bh
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
+ pmullw xmm1, xmm1
+ pmullw xmm2, xmm2
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+ punpcklwd xmm1, xmm7
+ punpckhwd xmm3, xmm7
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm4, xmm7
+ paddd xmm1, xmm2
+ paddd xmm3, xmm4
+ paddd xmm1, xmm3
+ pshufd xmm2, xmm1, 01Bh
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0B1h
+ paddd xmm1, xmm2
- movd r2d, xmm0
- and r2, 0ffffh ; effective low work truncated
- mov r3, r2
- imul r2, r3
- sar r2, $04
- movd retrd, xmm1
- sub retrd, r2d
+ movd r2d, xmm0
+ and r2, 0ffffh ; effective low work truncated
+ mov r3, r2
+ imul r2, r3
+ sar r2, $04
+ movd retrd, xmm1
+ sub retrd, r2d
- add r7,32
- add r7,r5
+ add r7,32
+ add r7,r5
%ifdef X86_32
- pop r6
- pop r5
- pop r4
- pop r3
+ pop r6
+ pop r5
+ pop r4
+ pop r3
%endif
- POP_XMM
+ POP_XMM
- ret
+ ret
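
Reviewer note: AnalysisVaaInfoIntra_sse2 averages each 4x4 sub-block of the 16x16 macroblock (VAA_AVG_BLOCK_SSE2 yields four such means per call, already shifted right by 4), then returns the sum of squared means minus the squared sum divided by 16 — a scaled variance of the sixteen block means. A scalar sketch of that computation follows; the _c name is illustrative and the 16-bit truncation of the sum seen in the asm is ignored here.

#include <stdint.h>

/* Scalar sketch: variance measure over the sixteen 4x4-block means of a
 * 16x16 luma macroblock, returned as sum(m*m) - sum(m)*sum(m)/16. */
static int32_t AnalysisVaaInfoIntra_c (const uint8_t* pDataY, const int32_t iLineSize) {
  int32_t iSum = 0, iSumSqr = 0;
  for (int32_t by = 0; by < 4; by++) {
    for (int32_t bx = 0; bx < 4; bx++) {
      int32_t iBlockSum = 0;
      for (int32_t y = 0; y < 4; y++)
        for (int32_t x = 0; x < 4; x++)
          iBlockSum += pDataY[(by * 4 + y) * iLineSize + bx * 4 + x];
      int32_t iMean = iBlockSum >> 4;         /* matches the psraw $04 */
      iSum    += iMean;
      iSumSqr += iMean * iMean;
    }
  }
  return iSumSqr - ((iSum * iSum) >> 4);      /* matches the sar r2, $04 */
}
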
;***********************************************************************
-; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
+; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
;***********************************************************************
WELS_EXTERN AnalysisVaaInfoIntra_ssse3
@@ -269,47 +269,47 @@
mov r4,r2
sal r4,$01 ;r4 = 4*iLineSize
- pxor xmm7, xmm7
+ pxor xmm7, xmm7
- ; loops
- VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ ; loops
+ VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7],xmm0
- lea r0,[r0+r4]
- VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+ lea r0,[r0+r4]
+ VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [r7+8],xmm1
- lea r0,[r0+r4]
- VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ lea r0,[r0+r4]
+ VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7+16],xmm0
- lea r0,[r0+r4]
- VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+ lea r0,[r0+r4]
+ VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [r7+24],xmm1
- movdqa xmm0,[r7]
- movdqa xmm1,[r7+16]
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets
+ movdqa xmm0,[r7]
+ movdqa xmm1,[r7+16]
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets
- pmullw xmm1, xmm1
- pmullw xmm2, xmm2
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
- punpcklwd xmm1, xmm7
- punpckhwd xmm3, xmm7
- punpcklwd xmm2, xmm7
- punpckhwd xmm4, xmm7
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm1, xmm3
- pshufd xmm2, xmm1, 01Bh
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
+ pmullw xmm1, xmm1
+ pmullw xmm2, xmm2
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+ punpcklwd xmm1, xmm7
+ punpckhwd xmm3, xmm7
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm4, xmm7
+ paddd xmm1, xmm2
+ paddd xmm3, xmm4
+ paddd xmm1, xmm3
+ pshufd xmm2, xmm1, 01Bh
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0B1h
+ paddd xmm1, xmm2
movd r2d, xmm0
@@ -318,94 +318,94 @@
imul r2, r3
sar r2, $04
movd retrd, xmm1
- sub retrd, r2d
+ sub retrd, r2d
- add r7,32
- add r7,r5
+ add r7,32
+ add r7,r5
%ifdef X86_32
- pop r6
- pop r5
- pop r4
- pop r3
+ pop r6
+ pop r5
+ pop r4
+ pop r3
%endif
- POP_XMM
+ POP_XMM
- ret
+ ret
;***********************************************************************
-; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
+; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
;***********************************************************************
WELS_EXTERN MdInterAnalysisVaaInfo_sse41
- %assign push_num 0
- LOAD_1_PARA
- movdqa xmm0,[r0]
- pshufd xmm1, xmm0, 01Bh
- paddd xmm1, xmm0
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
- psrad xmm1, 02h ; iAverageSad
- movdqa xmm2, xmm1
- psrad xmm2, 06h
- movdqa xmm3, xmm0 ; iSadBlock
- psrad xmm3, 06h
- psubd xmm3, xmm2
- pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets
- pshufd xmm4, xmm3, 01Bh
- paddd xmm4, xmm3
- pshufd xmm3, xmm4, 0B1h
- paddd xmm3, xmm4
- movd r0d, xmm3
- cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
+ %assign push_num 0
+ LOAD_1_PARA
+ movdqa xmm0,[r0]
+ pshufd xmm1, xmm0, 01Bh
+ paddd xmm1, xmm0
+ pshufd xmm2, xmm1, 0B1h
+ paddd xmm1, xmm2
+ psrad xmm1, 02h ; iAverageSad
+ movdqa xmm2, xmm1
+ psrad xmm2, 06h
+ movdqa xmm3, xmm0 ; iSadBlock
+ psrad xmm3, 06h
+ psubd xmm3, xmm2
+ pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets
+ pshufd xmm4, xmm3, 01Bh
+ paddd xmm4, xmm3
+ pshufd xmm3, xmm4, 0B1h
+ paddd xmm3, xmm4
+ movd r0d, xmm3
+ cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
- jb near .threshold_exit
- pshufd xmm0, xmm0, 01Bh
- pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
- movmskps retrd, xmm0
- ret
+ jb near .threshold_exit
+ pshufd xmm0, xmm0, 01Bh
+ pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
+ movmskps retrd, xmm0
+ ret
.threshold_exit:
- mov retrd, 15
- ret
+ mov retrd, 15
+ ret
;***********************************************************************
-; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
+; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
;***********************************************************************
WELS_EXTERN MdInterAnalysisVaaInfo_sse2
- %assign push_num 0
- LOAD_1_PARA
- movdqa xmm0, [r0]
- pshufd xmm1, xmm0, 01Bh
- paddd xmm1, xmm0
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
- psrad xmm1, 02h ; iAverageSad
- movdqa xmm2, xmm1
- psrad xmm2, 06h
- movdqa xmm3, xmm0 ; iSadBlock
- psrad xmm3, 06h
- psubd xmm3, xmm2
+ %assign push_num 0
+ LOAD_1_PARA
+ movdqa xmm0, [r0]
+ pshufd xmm1, xmm0, 01Bh
+ paddd xmm1, xmm0
+ pshufd xmm2, xmm1, 0B1h
+ paddd xmm1, xmm2
+ psrad xmm1, 02h ; iAverageSad
+ movdqa xmm2, xmm1
+ psrad xmm2, 06h
+ movdqa xmm3, xmm0 ; iSadBlock
+ psrad xmm3, 06h
+ psubd xmm3, xmm2
- ; to replace pmulld functionality as below
- movdqa xmm2, xmm3
- pmuludq xmm2, xmm3
- pshufd xmm4, xmm3, 0B1h
- pmuludq xmm4, xmm4
- movdqa xmm5, xmm2
- punpckldq xmm5, xmm4
- punpckhdq xmm2, xmm4
- punpcklqdq xmm5, xmm2
+ ; to replace pmulld functionality as below
+ movdqa xmm2, xmm3
+ pmuludq xmm2, xmm3
+ pshufd xmm4, xmm3, 0B1h
+ pmuludq xmm4, xmm4
+ movdqa xmm5, xmm2
+ punpckldq xmm5, xmm4
+ punpckhdq xmm2, xmm4
+ punpcklqdq xmm5, xmm2
- pshufd xmm4, xmm5, 01Bh
- paddd xmm4, xmm5
- pshufd xmm5, xmm4, 0B1h
- paddd xmm5, xmm4
+ pshufd xmm4, xmm5, 01Bh
+ paddd xmm4, xmm5
+ pshufd xmm5, xmm4, 0B1h
+ paddd xmm5, xmm4
- movd r0d, xmm5
- cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
- jb near .threshold_exit
- pshufd xmm0, xmm0, 01Bh
- pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
- movmskps retrd, xmm0
- ret
+ movd r0d, xmm5
+ cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
+ jb near .threshold_exit
+ pshufd xmm0, xmm0, 01Bh
+ pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
+ movmskps retrd, xmm0
+ ret
.threshold_exit:
- mov retrd, 15
- ret
+ mov retrd, 15
+ ret
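
Reviewer note: both MdInterAnalysisVaaInfo variants implement the same decision: compute the average of the four 8x8 SADs, take the variance of the SADs scaled down by 6 bits, and if it stays below INTER_VARIANCE_SAD_THRESHOLD (20) flag every block (return 15); otherwise return a 4-bit mask of the blocks whose SAD exceeds the average. A scalar sketch is below; the reversed bit order is read from the pshufd 01Bh / movmskps pair, and the name is illustrative.

#include <stdint.h>

/* Scalar sketch of the inter VAA decision over four 8x8 SADs. */
static uint8_t MdInterAnalysisVaaInfo_c (const int32_t* pSad8x8) {
  int32_t iAverageSad = (pSad8x8[0] + pSad8x8[1] + pSad8x8[2] + pSad8x8[3]) >> 2;
  int32_t iVarianceSad = 0;
  for (int32_t i = 0; i < 4; i++) {
    int32_t iDiff = (pSad8x8[i] >> 6) - (iAverageSad >> 6);
    iVarianceSad += iDiff * iDiff;
  }
  if (iVarianceSad < 20)                      /* INTER_VARIANCE_SAD_THRESHOLD */
    return 15;
  uint8_t uiMask = 0;
  for (int32_t i = 0; i < 4; i++)
    if (pSad8x8[i] > iAverageSad)
      uiMask |= (uint8_t) (1 << (3 - i));     /* reversed order, as in the asm */
  return uiMask;
}
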
--- a/codec/decoder/core/arm/block_add_neon.S
+++ b/codec/decoder/core/arm/block_add_neon.S
@@ -35,68 +35,68 @@
#include "arm_arch_common_macro.S"
#ifdef __APPLE__
-.macro ROW_TRANSFORM_1_STEP
-// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
- vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
- vshr.s16 $8, $1, #1
- vshr.s16 $9, $3, #1
- vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
- vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
-// }
+.macro ROW_TRANSFORM_1_STEP
+// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
+ vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
+ vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
+ vshr.s16 $8, $1, #1
+ vshr.s16 $9, $3, #1
+ vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
+ vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
+// }
.endm
-.macro TRANSFORM_4BYTES // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
+.macro TRANSFORM_4BYTES // both row & col transform used
+// { // output: f_q[0]~[3], input: e_q[0]~[3];
+ vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
+ vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
+ vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
+ vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
+// }
.endm
-.macro COL_TRANSFORM_1_STEP
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
- vshr.s32 $6, $1, #1
- vshr.s32 $7, $3, #1
- vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
+.macro COL_TRANSFORM_1_STEP
+// { // input: src_q[0]~[3], output: e_q[0]~[3];
+ vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
+ vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
+ vshr.s32 $6, $1, #1
+ vshr.s32 $7, $3, #1
+ vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+// }
.endm
#else
-.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
- vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
- vshr.s16 \arg8, \arg1, #1
- vshr.s16 \arg9, \arg3, #1
- vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
- vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
-// }
+.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
+ vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
+ vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
+ vshr.s16 \arg8, \arg1, #1
+ vshr.s16 \arg9, \arg3, #1
+ vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
+ vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
+// }
.endm
-.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
+.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
+// { // output: f_q[0]~[3], input: e_q[0]~[3];
+ vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
+ vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
+ vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
+ vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
+// }
.endm
-.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
- vshr.s32 \arg6, \arg1, #1
- vshr.s32 \arg7, \arg3, #1
- vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
+.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+// { // input: src_q[0]~[3], output: e_q[0]~[3];
+ vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
+ vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
+ vshr.s32 \arg6, \arg1, #1
+ vshr.s32 \arg7, \arg3, #1
+ vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+// }
.endm
#endif
// r0 int16_t* block,
@@ -103,61 +103,61 @@
// r1 int8_t* non_zero_count,
WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
- vld1.64 {d0-d2}, [r1]
+ vld1.64 {d0-d2}, [r1]
- vceq.s8 q0, q0, #0
- vceq.s8 d2, d2, #0
- vmvn q0, q0
- vmvn d2, d2
- vabs.s8 q0, q0
- vabs.s8 d2, d2
+ vceq.s8 q0, q0, #0
+ vceq.s8 d2, d2, #0
+ vmvn q0, q0
+ vmvn d2, d2
+ vabs.s8 q0, q0
+ vabs.s8 d2, d2
- vst1.64 {d0-d2}, [r1]
+ vst1.64 {d0-d2}, [r1]
WELS_ASM_FUNC_END
-// uint8_t *pred, const int32_t stride, int16_t *rs
+// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
- vld4.s16 {d0, d1, d2, d3}, [r2] // cost 3 cycles!
+ vld4.s16 {d0, d1, d2, d3}, [r2] // cost 3 cycles!
- ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q8, q9, q10, q11, d4, d5
+ ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q8, q9, q10, q11, d4, d5
- TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11
+ TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11
- // transform element 32bits
- vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
- vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
- vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
- vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+ // transform element 32bits
+ vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+ vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+ vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+ vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
- COL_TRANSFORM_1_STEP q0, q1, q2, q3, q8, q9, q10, q11
+ COL_TRANSFORM_1_STEP q0, q1, q2, q3, q8, q9, q10, q11
- TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11
+ TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11
- //after clip_table[MAX_NEG_CROP] into [0, 255]
- mov r2, r0
- vld1.32 {d20[0]},[r0],r1
- vld1.32 {d20[1]},[r0],r1
- vld1.32 {d22[0]},[r0],r1
- vld1.32 {d22[1]},[r0]
+ //after clip_table[MAX_NEG_CROP] into [0, 255]
+ mov r2, r0
+ vld1.32 {d20[0]},[r0],r1
+ vld1.32 {d20[1]},[r0],r1
+ vld1.32 {d22[0]},[r0],r1
+ vld1.32 {d22[1]},[r0]
- vrshrn.s32 d16, q0, #6
- vrshrn.s32 d17, q1, #6
- vrshrn.s32 d18, q2, #6
- vrshrn.s32 d19, q3, #6
+ vrshrn.s32 d16, q0, #6
+ vrshrn.s32 d17, q1, #6
+ vrshrn.s32 d18, q2, #6
+ vrshrn.s32 d19, q3, #6
- vmovl.u8 q0,d20
- vmovl.u8 q1,d22
- vadd.s16 q0,q8
- vadd.s16 q1,q9
+ vmovl.u8 q0,d20
+ vmovl.u8 q1,d22
+ vadd.s16 q0,q8
+ vadd.s16 q1,q9
- vqmovun.s16 d20,q0
- vqmovun.s16 d22,q1
+ vqmovun.s16 d20,q0
+ vqmovun.s16 d22,q1
- vst1.32 {d20[0]},[r2],r1
- vst1.32 {d20[1]},[r2],r1
- vst1.32 {d22[0]},[r2],r1
- vst1.32 {d22[1]},[r2]
+ vst1.32 {d20[0]},[r2],r1
+ vst1.32 {d20[1]},[r2],r1
+ vst1.32 {d22[0]},[r2],r1
+ vst1.32 {d22[1]},[r2]
WELS_ASM_FUNC_END
#endif
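
Reviewer note: IdctResAddPred_neon performs the 4x4 H.264 inverse integer transform of the residual and adds it to the prediction with clipping, exactly as the e[]/f[] comments in the ROW/COL transform macros spell out. A scalar sketch that mirrors those comments is below; the (x+32)>>6 rounding matches vrshrn #6, and the clip matches vqmovun.

#include <stdint.h>

static uint8_t Clip255 (int32_t v) {
  return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Scalar sketch of IdctResAddPred: inverse 4x4 transform + add-to-prediction. */
static void IdctResAddPred_c (uint8_t* pPred, const int32_t kiStride, const int16_t* pRs) {
  int32_t f[16];
  for (int32_t i = 0; i < 4; i++) {           /* row transform */
    const int16_t* s = pRs + 4 * i;
    int32_t e0 = s[0] + s[2],        e1 = s[0] - s[2];
    int32_t e2 = (s[1] >> 1) - s[3], e3 = s[1] + (s[3] >> 1);
    f[4 * i + 0] = e0 + e3;  f[4 * i + 1] = e1 + e2;
    f[4 * i + 2] = e1 - e2;  f[4 * i + 3] = e0 - e3;
  }
  for (int32_t j = 0; j < 4; j++) {           /* column transform + add */
    int32_t e0 = f[j] + f[8 + j],             e1 = f[j] - f[8 + j];
    int32_t e2 = (f[4 + j] >> 1) - f[12 + j], e3 = f[4 + j] + (f[12 + j] >> 1);
    pPred[0 * kiStride + j] = Clip255 (pPred[0 * kiStride + j] + ((e0 + e3 + 32) >> 6));
    pPred[1 * kiStride + j] = Clip255 (pPred[1 * kiStride + j] + ((e1 + e2 + 32) >> 6));
    pPred[2 * kiStride + j] = Clip255 (pPred[2 * kiStride + j] + ((e1 - e2 + 32) >> 6));
    pPred[3 * kiStride + j] = Clip255 (pPred[3 * kiStride + j] + ((e0 - e3 + 32) >> 6));
  }
}
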
--- a/codec/decoder/core/arm/intra_pred_neon.S
+++ b/codec/decoder/core/arm/intra_pred_neon.S
@@ -38,45 +38,45 @@
#ifdef __APPLE__
//Global macro
.macro GET_8BYTE_DATA
- vld1.8 {$0[0]}, [$1], $2
- vld1.8 {$0[1]}, [$1], $2
- vld1.8 {$0[2]}, [$1], $2
- vld1.8 {$0[3]}, [$1], $2
- vld1.8 {$0[4]}, [$1], $2
- vld1.8 {$0[5]}, [$1], $2
- vld1.8 {$0[6]}, [$1], $2
- vld1.8 {$0[7]}, [$1], $2
+ vld1.8 {$0[0]}, [$1], $2
+ vld1.8 {$0[1]}, [$1], $2
+ vld1.8 {$0[2]}, [$1], $2
+ vld1.8 {$0[3]}, [$1], $2
+ vld1.8 {$0[4]}, [$1], $2
+ vld1.8 {$0[5]}, [$1], $2
+ vld1.8 {$0[6]}, [$1], $2
+ vld1.8 {$0[7]}, [$1], $2
.endmacro
#else
//Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2
- vld1.8 {\arg0[0]}, [\arg1], \arg2
- vld1.8 {\arg0[1]}, [\arg1], \arg2
- vld1.8 {\arg0[2]}, [\arg1], \arg2
- vld1.8 {\arg0[3]}, [\arg1], \arg2
- vld1.8 {\arg0[4]}, [\arg1], \arg2
- vld1.8 {\arg0[5]}, [\arg1], \arg2
- vld1.8 {\arg0[6]}, [\arg1], \arg2
- vld1.8 {\arg0[7]}, [\arg1], \arg2
+ vld1.8 {\arg0[0]}, [\arg1], \arg2
+ vld1.8 {\arg0[1]}, [\arg1], \arg2
+ vld1.8 {\arg0[2]}, [\arg1], \arg2
+ vld1.8 {\arg0[3]}, [\arg1], \arg2
+ vld1.8 {\arg0[4]}, [\arg1], \arg2
+ vld1.8 {\arg0[5]}, [\arg1], \arg2
+ vld1.8 {\arg0[6]}, [\arg1], \arg2
+ vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
#endif
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
- //Get the top line data to 'q0'
- sub r2, r0, r1
- vldm r2, {d0, d1}
+ //Get the top line data to 'q0'
+ sub r2, r0, r1
+ vldm r2, {d0, d1}
- mov r2, r0
- mov r3, #4
- //Set the top line to the each line of MB(16*16)
+ mov r2, r0
+ mov r3, #4
+ //Set the top line to the each line of MB(16*16)
loop_0_get_i16x16_luma_pred_v:
- vst1.8 {d0,d1}, [r2], r1
- vst1.8 {d0,d1}, [r2], r1
- vst1.8 {d0,d1}, [r2], r1
- vst1.8 {d0,d1}, [r2], r1
- subs r3, #1
- bne loop_0_get_i16x16_luma_pred_v
+ vst1.8 {d0,d1}, [r2], r1
+ vst1.8 {d0,d1}, [r2], r1
+ vst1.8 {d0,d1}, [r2], r1
+ vst1.8 {d0,d1}, [r2], r1
+ subs r3, #1
+ bne loop_0_get_i16x16_luma_pred_v
WELS_ASM_FUNC_END
@@ -83,59 +83,59 @@
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon
- sub r2, r0, #1
- mov r3, #4
+ sub r2, r0, #1
+ mov r3, #4
loop_0_get_i16x16_luma_pred_h:
- //Get one byte data from left side
- vld1.8 {d0[],d1[]}, [r2], r1
- vld1.8 {d2[],d3[]}, [r2], r1
- vld1.8 {d4[],d5[]}, [r2], r1
- vld1.8 {d6[],d7[]}, [r2], r1
+ //Get one byte data from left side
+ vld1.8 {d0[],d1[]}, [r2], r1
+ vld1.8 {d2[],d3[]}, [r2], r1
+ vld1.8 {d4[],d5[]}, [r2], r1
+ vld1.8 {d6[],d7[]}, [r2], r1
- //Set the line of MB using the left side byte data
- vst1.8 {d0,d1}, [r0], r1
- vst1.8 {d2,d3}, [r0], r1
- vst1.8 {d4,d5}, [r0], r1
- vst1.8 {d6,d7}, [r0], r1
+ //Set the line of MB using the left side byte data
+ vst1.8 {d0,d1}, [r0], r1
+ vst1.8 {d2,d3}, [r0], r1
+ vst1.8 {d4,d5}, [r0], r1
+ vst1.8 {d6,d7}, [r0], r1
- subs r3, #1
- bne loop_0_get_i16x16_luma_pred_h
+ subs r3, #1
+ bne loop_0_get_i16x16_luma_pred_h
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon
- //stmdb sp!, { r2-r5, lr}
- //Get the left vertical line data
- sub r2, r0, #1
- GET_8BYTE_DATA d0, r2, r1
- GET_8BYTE_DATA d1, r2, r1
+ //stmdb sp!, { r2-r5, lr}
+ //Get the left vertical line data
+ sub r2, r0, #1
+ GET_8BYTE_DATA d0, r2, r1
+ GET_8BYTE_DATA d1, r2, r1
- //Get the top horizontal line data
- sub r2, r0, r1
- vldm r2, {d2, d3}
+ //Get the top horizontal line data
+ sub r2, r0, r1
+ vldm r2, {d2, d3}
- //Calculate the sum of top horizontal line data and vertical line data
- vpaddl.u8 q0, q0
- vpaddl.u8 q1, q1
- vadd.u16 q0, q0, q1
- vadd.u16 d0, d0, d1
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
+ //Calculate the sum of top horizontal line data and vertical line data
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
- //Calculate the mean value
- vrshr.u16 d0, d0, #5
- vdup.8 q0, d0[0]
+ //Calculate the mean value
+ vrshr.u16 d0, d0, #5
+ vdup.8 q0, d0[0]
- //Set the mean value to the all of member of MB
- mov r2, #4
+ //Set the mean value to the all of member of MB
+ mov r2, #4
loop_0_get_i16x16_luma_pred_dc_both:
- vst1.8 {d0,d1}, [r0], r1
- vst1.8 {d0,d1}, [r0], r1
- vst1.8 {d0,d1}, [r0], r1
- vst1.8 {d0,d1}, [r0], r1
- subs r2, #1
- bne loop_0_get_i16x16_luma_pred_dc_both
+ vst1.8 {d0,d1}, [r0], r1
+ vst1.8 {d0,d1}, [r0], r1
+ vst1.8 {d0,d1}, [r0], r1
+ vst1.8 {d0,d1}, [r0], r1
+ subs r2, #1
+ bne loop_0_get_i16x16_luma_pred_dc_both
WELS_ASM_FUNC_END
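
The DC routine above handles the case where both the top and the left neighbours are available: the vpaddl/vadd chain sums the 16 samples above and the 16 samples to the left, vrshr #5 produces the rounded mean, and vdup broadcasts it to the whole macroblock. A scalar sketch under the same assumptions (hypothetical names):

    #include <stdint.h>

    /* 16x16 DC prediction, both neighbours available: mean of the 16 top and
     * 16 left samples, rounded, written to every sample of the MB. */
    static void I16x16PredDc_ref(uint8_t *pPred, int32_t kiStride) {
        int iSum = 0;
        for (int i = 0; i < 16; i++) {
            iSum += pPred[i - kiStride];      /* top row     */
            iSum += pPred[i * kiStride - 1];  /* left column */
        }
        uint8_t uiDc = (uint8_t)((iSum + 16) >> 5);
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++)
                pPred[y * kiStride + x] = uiDc;
    }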
@@ -149,106 +149,106 @@
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon
- //stmdb sp!, { r2-r5, lr}
+ //stmdb sp!, { r2-r5, lr}
- //Load the table {(8,7,6,5,4,3,2,1) * 5}
- adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
- vldr d0, [r2]
+ //Load the table {(8,7,6,5,4,3,2,1) * 5}
+ adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
+ vldr d0, [r2]
- //Pack the top[-1] ~ top[6] to d1
- sub r2, r0, r1
- sub r3, r2, #1
- vld1.8 d1, [r3]
+ //Pack the top[-1] ~ top[6] to d1
+ sub r2, r0, r1
+ sub r3, r2, #1
+ vld1.8 d1, [r3]
- //Pack the top[8] ~ top[15] to d2
- add r3, #9
- vld1.8 d2, [r3]
+ //Pack the top[8] ~ top[15] to d2
+ add r3, #9
+ vld1.8 d2, [r3]
- //Save the top[15] to d6 for next step
- vdup.u8 d6, d2[7]
+ //Save the top[15] to d6 for next step
+ vdup.u8 d6, d2[7]
- //Get and pack left[-1] ~ left[6] to d4
- sub r3, r2, #1
- GET_8BYTE_DATA d4, r3, r1
+ //Get and pack left[-1] ~ left[6] to d4
+ sub r3, r2, #1
+ GET_8BYTE_DATA d4, r3, r1
- //Get and pack left[8] ~ left[15] to d3
- add r3, r1
- GET_8BYTE_DATA d3, r3, r1
+ //Get and pack left[8] ~ left[15] to d3
+ add r3, r1
+ GET_8BYTE_DATA d3, r3, r1
- //Save the left[15] to d7 for next step
- vdup.u8 d7, d3[7]
+ //Save the left[15] to d7 for next step
+ vdup.u8 d7, d3[7]
- //revert the sequence of d2,d3
- vrev64.8 q1, q1
+ //revert the sequence of d2,d3
+ vrev64.8 q1, q1
- vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
- vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
+ vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
+ vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
- vmovl.u8 q0, d0
- vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
- vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
+ vmovl.u8 q0, d0
+ vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
+ vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
- //Calculate the sum of items of q1, q2
- vpadd.s16 d0, d2, d3
- vpadd.s16 d1, d4, d5
- vpaddl.s16 q0, q0
- vpaddl.s32 q0, q0
+ //Calculate the sum of items of q1, q2
+ vpadd.s16 d0, d2, d3
+ vpadd.s16 d1, d4, d5
+ vpaddl.s16 q0, q0
+ vpaddl.s32 q0, q0
- //Get the value of 'b', 'c' and extend to q1, q2.
- vrshr.s64 q0, #6
- vdup.s16 q1, d0[0]
- vdup.s16 q2, d1[0]
+ //Get the value of 'b', 'c' and extend to q1, q2.
+ vrshr.s64 q0, #6
+ vdup.s16 q1, d0[0]
+ vdup.s16 q2, d1[0]
- //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
- adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
- vld1.32 {d0}, [r2]
+ //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
+ adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
+ vld1.32 {d0}, [r2]
- //Get the value of 'a' and save to q3
- vaddl.u8 q3, d6, d7
- vshl.u16 q3, #4
+ //Get the value of 'a' and save to q3
+ vaddl.u8 q3, d6, d7
+ vshl.u16 q3, #4
- //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
- vmovl.s8 q0, d0
- vmla.s16 q3, q0, q1
- vmla.s16 q3, q2, d0[0]
+ //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
+ vmovl.s8 q0, d0
+ vmla.s16 q3, q0, q1
+ vmla.s16 q3, q2, d0[0]
- //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
- vshl.s16 q8, q1, #3
- vadd.s16 q8, q3
+ //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
+ vshl.s16 q8, q1, #3
+ vadd.s16 q8, q3
- //right shift 5 bits and rounding
- vqrshrun.s16 d0, q3, #5
- vqrshrun.s16 d1, q8, #5
+ //right shift 5 bits and rounding
+ vqrshrun.s16 d0, q3, #5
+ vqrshrun.s16 d1, q8, #5
- //Set the line of MB
- vst1.u32 {d0,d1}, [r0], r1
+ //Set the line of MB
+ vst1.u32 {d0,d1}, [r0], r1
- //Do the same processing for setting other lines
- mov r2, #15
+ //Do the same processing for setting other lines
+ mov r2, #15
loop_0_get_i16x16_luma_pred_plane:
- vadd.s16 q3, q2
- vadd.s16 q8, q2
- vqrshrun.s16 d0, q3, #5
- vqrshrun.s16 d1, q8, #5
- vst1.u32 {d0,d1}, [r0], r1
- subs r2, #1
- bne loop_0_get_i16x16_luma_pred_plane
+ vadd.s16 q3, q2
+ vadd.s16 q8, q2
+ vqrshrun.s16 d0, q3, #5
+ vqrshrun.s16 d1, q8, #5
+ vst1.u32 {d0,d1}, [r0], r1
+ subs r2, #1
+ bne loop_0_get_i16x16_luma_pred_plane
WELS_ASM_FUNC_END
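
The plane predictor above is the most involved of the 16x16 modes: the gradients H and V come from weighted differences of the top and left borders, and every sample is a rounded, clipped linear ramp (vqrshrun does the >>5 rounding and the clip to 8 bits in one step). A scalar sketch using the constants spelled out in the comments of the SSE2 version later in this diff, b = (5*H+32)>>6, c = (5*V+32)>>6, a = (top[15]+left[15])<<4; names are hypothetical:

    #include <stdint.h>

    static void I16x16PredPlane_ref(uint8_t *pPred, int32_t kiStride) {
        const uint8_t *pTop  = pPred - kiStride;   /* top[-1..15]          */
        const uint8_t *pLeft = pPred - 1;          /* left[-1..15] by rows */
        int H = 0, V = 0;
        for (int i = 1; i <= 8; i++) {
            H += i * (pTop[7 + i] - pTop[7 - i]);
            V += i * (pLeft[(7 + i) * kiStride] - pLeft[(7 - i) * kiStride]);
        }
        int a = (pTop[15] + pLeft[15 * kiStride]) << 4;
        int b = (5 * H + 32) >> 6;
        int c = (5 * V + 32) >> 6;
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++) {
                int v = (a + b * (x - 7) + c * (y - 7) + 16) >> 5;
                pPred[y * kiStride + x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
            }
    }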
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row (4 bytes)
- sub r2, r0, r1
- ldr r2, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row (4 bytes)
+ sub r2, r0, r1
+ ldr r2, [r2]
- //Set the luma MB using top line
- str r2, [r0], r1
- str r2, [r0], r1
- str r2, [r0], r1
- str r2, [r0]
+ //Set the luma MB using top line
+ str r2, [r0], r1
+ str r2, [r0], r1
+ str r2, [r0], r1
+ str r2, [r0]
WELS_ASM_FUNC_END
@@ -255,97 +255,97 @@
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the left column (4 bytes)
- sub r2, r0, #1
- vld1.8 {d0[]}, [r2], r1
- vld1.8 {d1[]}, [r2], r1
- vld1.8 {d2[]}, [r2], r1
- vld1.8 {d3[]}, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the left column (4 bytes)
+ sub r2, r0, #1
+ vld1.8 {d0[]}, [r2], r1
+ vld1.8 {d1[]}, [r2], r1
+ vld1.8 {d2[]}, [r2], r1
+ vld1.8 {d3[]}, [r2]
- //Set the luma MB using the left side byte
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d1[0]}, [r0], r1
- vst1.32 {d2[0]}, [r0], r1
- vst1.32 {d3[0]}, [r0]
+ //Set the luma MB using the left side byte
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d1[0]}, [r0], r1
+ vst1.32 {d2[0]}, [r0], r1
+ vst1.32 {d3[0]}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row data(8 bytes)
- sub r2, r0, r1
- vld1.32 {d0}, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row data(8 bytes)
+ sub r2, r0, r1
+ vld1.32 {d0}, [r2]
- //For "t7 + (t7<<1)"
- vdup.8 d1, d0[7]
+ //For "t7 + (t7<<1)"
+ vdup.8 d1, d0[7]
- //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
- vext.8 d1, d0, d1, #1
- vaddl.u8 q1, d1, d0
+ //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q1, d1, d0
- //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
- vext.8 q2, q1, q1, #14
- vadd.u16 q0, q1, q2
+ //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
+ vext.8 q2, q1, q1, #14
+ vadd.u16 q0, q1, q2
- //right shift 2 bits and rounding
- vqrshrn.u16 d0, q0, #2
+ //right shift 2 bits and rounding
+ vqrshrn.u16 d0, q0, #2
- //Save "ddl0, ddl1, ddl2, ddl3"
- vext.8 d1, d0, d0, #1
- vst1.32 d1[0], [r0], r1
+ //Save "ddl0, ddl1, ddl2, ddl3"
+ vext.8 d1, d0, d0, #1
+ vst1.32 d1[0], [r0], r1
- //Save "ddl1, ddl2, ddl3, ddl4"
- vext.8 d1, d0, d0, #2
- vst1.32 d1[0], [r0], r1
+ //Save "ddl1, ddl2, ddl3, ddl4"
+ vext.8 d1, d0, d0, #2
+ vst1.32 d1[0], [r0], r1
- //Save "ddl2, ddl3, ddl4, ddl5"
- vext.8 d1, d0, d0, #3
- vst1.32 d1[0], [r0], r1
+ //Save "ddl2, ddl3, ddl4, ddl5"
+ vext.8 d1, d0, d0, #3
+ vst1.32 d1[0], [r0], r1
- //Save "ddl3, ddl4, ddl5, ddl6"
- vst1.32 d0[1], [r0]
+ //Save "ddl3, ddl4, ddl5, ddl6"
+ vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row (4 bytes)
- sub r2, r0, r1
- vld1.32 {d0[1]}, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row (4 bytes)
+ sub r2, r0, r1
+ vld1.32 {d0[1]}, [r2]
- //Load the left column (5 bytes)
- sub r2, #1
- vld1.8 {d0[3]}, [r2], r1
- vld1.8 {d0[2]}, [r2], r1
- vld1.8 {d0[1]}, [r2], r1
- vld1.8 {d0[0]}, [r2], r1
- vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing
+ //Load the left column (5 bytes)
+ sub r2, #1
+ vld1.8 {d0[3]}, [r2], r1
+ vld1.8 {d0[2]}, [r2], r1
+ vld1.8 {d0[1]}, [r2], r1
+ vld1.8 {d0[0]}, [r2], r1
+ vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing
- vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
- //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
+ vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
+ //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
- //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
- vaddl.u8 q2, d2, d0
+ //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
+ vaddl.u8 q2, d2, d0
- //q1:{TL0+LT0,LT0+T01,...L12+L23}
- vext.8 q3, q3, q2, #14
- vadd.u16 q1, q2, q3
+ //q1:{TL0+LT0,LT0+T01,...L12+L23}
+ vext.8 q3, q3, q2, #14
+ vadd.u16 q1, q2, q3
- //right shift 2 bits and rounding
- vqrshrn.u16 d0, q1, #2
+ //right shift 2 bits and rounding
+ vqrshrn.u16 d0, q1, #2
- //Adjust the data sequence for setting luma MB of 'pred'
- vst1.32 d0[1], [r0], r1
- vext.8 d0, d0, d0, #7
- vst1.32 d0[1], [r0], r1
- vext.8 d0, d0, d0, #7
- vst1.32 d0[1], [r0], r1
- vext.8 d0, d0, d0, #7
- vst1.32 d0[1], [r0]
+ //Adjust the data sequence for setting luma MB of 'pred'
+ vst1.32 d0[1], [r0], r1
+ vext.8 d0, d0, d0, #7
+ vst1.32 d0[1], [r0], r1
+ vext.8 d0, d0, d0, #7
+ vst1.32 d0[1], [r0], r1
+ vext.8 d0, d0, d0, #7
+ vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
@@ -352,31 +352,31 @@
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row (8 bytes)
- sub r2, r0, r1
- vld1.32 {d0}, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row (8 bytes)
+ sub r2, r0, r1
+ vld1.32 {d0}, [r2]
- vext.8 d1, d0, d0, #1
- vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
+ vext.8 d1, d0, d0, #1
+ vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
- vext.8 q2, q1, q1, #2
- vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
+ vext.8 q2, q1, q1, #2
+ vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
- //calculate the "vl0,vl1,vl2,vl3,vl4"
- vqrshrn.u16 d0, q1, #1
+ //calculate the "vl0,vl1,vl2,vl3,vl4"
+ vqrshrn.u16 d0, q1, #1
- //calculate the "vl5,vl6,vl7,vl8,vl9"
- vqrshrn.u16 d1, q2, #2
+ //calculate the "vl5,vl6,vl7,vl8,vl9"
+ vqrshrn.u16 d1, q2, #2
- //Adjust the data sequence for setting the luma MB
- vst1.32 d0[0], [r0], r1
- vst1.32 d1[0], [r0], r1
- vext.8 d0, d0, d0, #1
- vext.8 d1, d1, d1, #1
- vst1.32 d0[0], [r0], r1
- vst1.32 d1[0], [r0]
+ //Adjust the data sequence for setting the luma MB
+ vst1.32 d0[0], [r0], r1
+ vst1.32 d1[0], [r0], r1
+ vext.8 d0, d0, d0, #1
+ vext.8 d1, d1, d1, #1
+ vst1.32 d0[0], [r0], r1
+ vst1.32 d1[0], [r0]
WELS_ASM_FUNC_END
@@ -383,152 +383,152 @@
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row (4 bytes)
- sub r2, r0, r1
- vld1.32 {d0[1]}, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row (4 bytes)
+ sub r2, r0, r1
+ vld1.32 {d0[1]}, [r2]
- //Load the left column (4 bytes)
- sub r2, #1
- vld1.8 {d0[3]}, [r2], r1
- vld1.8 {d0[2]}, [r2], r1
- vld1.8 {d0[1]}, [r2], r1
- vld1.8 {d0[0]}, [r2]
+ //Load the left column (4 bytes)
+ sub r2, #1
+ vld1.8 {d0[3]}, [r2], r1
+ vld1.8 {d0[2]}, [r2], r1
+ vld1.8 {d0[1]}, [r2], r1
+ vld1.8 {d0[0]}, [r2]
- vext.8 d1, d0, d0, #7
- vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
+ vext.8 d1, d0, d0, #7
+ vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
- vext.u8 q2, q1, q1, #14
- vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
+ vext.u8 q2, q1, q1, #14
+ vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
- //Calculate the vr0 ~ vr9
- vqrshrn.u16 d1, q2, #2
- vqrshrn.u16 d0, q1, #1
+ //Calculate the vr0 ~ vr9
+ vqrshrn.u16 d1, q2, #2
+ vqrshrn.u16 d0, q1, #1
- //Adjust the data sequence for setting the luma MB
- vst1.32 d0[1], [r0], r1
- vst1.32 d1[1], [r0], r1
- add r2, r0, r1
- vst1.8 d1[3], [r0]!
- vst1.16 d0[2], [r0]!
- vst1.8 d0[6], [r0]!
- vst1.8 d1[2], [r2]!
- vst1.16 d1[2], [r2]!
- vst1.8 d1[6], [r2]
+ //Adjust the data sequence for setting the luma MB
+ vst1.32 d0[1], [r0], r1
+ vst1.32 d1[1], [r0], r1
+ add r2, r0, r1
+ vst1.8 d1[3], [r0]!
+ vst1.16 d0[2], [r0]!
+ vst1.8 d0[6], [r0]!
+ vst1.8 d1[2], [r2]!
+ vst1.16 d1[2], [r2]!
+ vst1.8 d1[6], [r2]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the left column data
- sub r2, r0, #1
- mov r3, #3
- mul r3, r1
- add r3, r2
- vld1.8 {d0[]}, [r3]
- vld1.8 {d0[4]}, [r2], r1
- vld1.8 {d0[5]}, [r2], r1
- vld1.8 {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
+ //stmdb sp!, { r2-r5, lr}
+ //Load the left column data
+ sub r2, r0, #1
+ mov r3, #3
+ mul r3, r1
+ add r3, r2
+ vld1.8 {d0[]}, [r3]
+ vld1.8 {d0[4]}, [r2], r1
+ vld1.8 {d0[5]}, [r2], r1
+ vld1.8 {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
- vext.8 d1, d0, d0, #1
- vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
+ vext.8 d1, d0, d0, #1
+ vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
- vext.u8 d2, d5, d4, #2
- vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
+ vext.u8 d2, d5, d4, #2
+ vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
- //Calculate the hu0 ~ hu5
- vqrshrn.u16 d2, q2, #1
- vqrshrn.u16 d1, q1, #2
+ //Calculate the hu0 ~ hu5
+ vqrshrn.u16 d2, q2, #1
+ vqrshrn.u16 d1, q1, #2
- //Adjust the data sequence for setting the luma MB
- vzip.8 d2, d1
- vst1.32 d1[0], [r0], r1
- vext.8 d2, d1, d1, #2
- vst1.32 d2[0], [r0], r1
- vst1.32 d1[1], [r0], r1
- vst1.32 d0[0], [r0]
+ //Adjust the data sequence for setting the luma MB
+ vzip.8 d2, d1
+ vst1.32 d1[0], [r0], r1
+ vext.8 d2, d1, d1, #2
+ vst1.32 d2[0], [r0], r1
+ vst1.32 d1[1], [r0], r1
+ vst1.32 d0[0], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the data
- sub r2, r0, r1
- sub r2, #1
- vld1.32 {d0[1]}, [r2], r1
- vld1.8 {d0[3]}, [r2], r1
- vld1.8 {d0[2]}, [r2], r1
- vld1.8 {d0[1]}, [r2], r1
- vld1.8 {d0[0]}, [r2] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
+ //stmdb sp!, { r2-r5, lr}
+ //Load the data
+ sub r2, r0, r1
+ sub r2, #1
+ vld1.32 {d0[1]}, [r2], r1
+ vld1.8 {d0[3]}, [r2], r1
+ vld1.8 {d0[2]}, [r2], r1
+ vld1.8 {d0[1]}, [r2], r1
+ vld1.8 {d0[0]}, [r2] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
- vext.8 d1, d0, d0, #7
- vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
+ vext.8 d1, d0, d0, #7
+ vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
- vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
- vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
+ vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
+ vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
- //Calculate the hd0~hd9
- vqrshrn.u16 d1, q3, #2
- vqrshrn.u16 d0, q2, #1
+ //Calculate the hd0~hd9
+ vqrshrn.u16 d1, q3, #2
+ vqrshrn.u16 d0, q2, #1
- //Adjust the data sequence for setting the luma MB
- vmov d3, d1
- vtrn.8 d0, d1
- vext.u8 d2, d1, d1, #6
- vst2.16 {d2[3], d3[3]}, [r0], r1
- vst2.16 {d0[2], d1[2]}, [r0], r1
- vmov d3, d0
- vst2.16 {d2[2], d3[2]}, [r0], r1
- vst2.16 {d0[1], d1[1]}, [r0]
+ //Adjust the data sequence for setting the luma MB
+ vmov d3, d1
+ vtrn.8 d0, d1
+ vext.u8 d2, d1, d1, #6
+ vst2.16 {d2[3], d3[3]}, [r0], r1
+ vst2.16 {d0[2], d1[2]}, [r0], r1
+ vmov d3, d0
+ vst2.16 {d2[2], d3[2]}, [r0], r1
+ vst2.16 {d0[1], d1[1]}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon
- //stmdb sp!, { r2-r5, lr}
- //Get the top row (8 byte)
- sub r2, r0, r1
- vldr d0, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ //Get the top row (8 byte)
+ sub r2, r0, r1
+ vldr d0, [r2]
- //Set the chroma MB using top row data
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0], r1
- vst1.8 {d0}, [r0]
+ //Set the chroma MB using top row data
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
- //stmdb sp!, { r2-r5, lr}
- ////Get the left column (8 byte)
- sub r2, r0, #1
- vld1.8 {d0[]}, [r2], r1
- vld1.8 {d1[]}, [r2], r1
- vld1.8 {d2[]}, [r2], r1
- vld1.8 {d3[]}, [r2], r1
- vld1.8 {d4[]}, [r2], r1
- vld1.8 {d5[]}, [r2], r1
- vld1.8 {d6[]}, [r2], r1
- vld1.8 {d7[]}, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ ////Get the left column (8 byte)
+ sub r2, r0, #1
+ vld1.8 {d0[]}, [r2], r1
+ vld1.8 {d1[]}, [r2], r1
+ vld1.8 {d2[]}, [r2], r1
+ vld1.8 {d3[]}, [r2], r1
+ vld1.8 {d4[]}, [r2], r1
+ vld1.8 {d5[]}, [r2], r1
+ vld1.8 {d6[]}, [r2], r1
+ vld1.8 {d7[]}, [r2]
- //Set the chroma MB using left column data
- vst1.8 {d0}, [r0], r1
- vst1.8 {d1}, [r0], r1
- vst1.8 {d2}, [r0], r1
- vst1.8 {d3}, [r0], r1
- vst1.8 {d4}, [r0], r1
- vst1.8 {d5}, [r0], r1
- vst1.8 {d6}, [r0], r1
- vst1.8 {d7}, [r0]
+ //Set the chroma MB using left column data
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d1}, [r0], r1
+ vst1.8 {d2}, [r0], r1
+ vst1.8 {d3}, [r0], r1
+ vst1.8 {d4}, [r0], r1
+ vst1.8 {d5}, [r0], r1
+ vst1.8 {d6}, [r0], r1
+ vst1.8 {d7}, [r0]
WELS_ASM_FUNC_END
@@ -576,73 +576,73 @@
CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row data
- sub r2, r0, #1
- sub r2, r1
- vld1.32 {d1[0]}, [r2]
- add r2, #5
- vld1.32 {d0[0]}, [r2]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row data
+ sub r2, r0, #1
+ sub r2, r1
+ vld1.32 {d1[0]}, [r2]
+ add r2, #5
+ vld1.32 {d0[0]}, [r2]
- //Load the left column data
- sub r2, #5
- vld1.8 {d1[4]}, [r2], r1
- vld1.8 {d1[5]}, [r2], r1
- vld1.8 {d1[6]}, [r2], r1
- vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
- add r2, r1
- vld1.8 {d0[4]}, [r2], r1
- vld1.8 {d0[5]}, [r2], r1
- vld1.8 {d0[6]}, [r2], r1
- vld1.8 {d0[7]}, [r2] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
+ //Load the left column data
+ sub r2, #5
+ vld1.8 {d1[4]}, [r2], r1
+ vld1.8 {d1[5]}, [r2], r1
+ vld1.8 {d1[6]}, [r2], r1
+ vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
+ add r2, r1
+ vld1.8 {d0[4]}, [r2], r1
+ vld1.8 {d0[5]}, [r2], r1
+ vld1.8 {d0[6]}, [r2], r1
+ vld1.8 {d0[7]}, [r2] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
- //Save T7 to d3 for next step
- vdup.u8 d3, d0[3]
- //Save L7 to d4 for next step
- vdup.u8 d4, d0[7]
+ //Save T7 to d3 for next step
+ vdup.u8 d3, d0[3]
+ //Save L7 to d4 for next step
+ vdup.u8 d4, d0[7]
- //Calculate the value of 'a' and save to q2
- vaddl.u8 q2, d3, d4
- vshl.u16 q2, #4
+ //Calculate the value of 'a' and save to q2
+ vaddl.u8 q2, d3, d4
+ vshl.u16 q2, #4
- //Load the table {{1,2,3,4,1,2,3,4}*17}
- adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
- vld1.32 {d2}, [r2]
+ //Load the table {{1,2,3,4,1,2,3,4}*17}
+ adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
+ vld1.32 {d2}, [r2]
- //Calculate the 'b','c', and save to q0
- vrev32.8 d1, d1
- vsubl.u8 q0, d0, d1
- vmovl.u8 q1, d2
- vmul.s16 q0, q1
- vpaddl.s16 q0, q0
- vpaddl.s32 q0, q0
- vrshr.s64 q0, #5
+ //Calculate the 'b','c', and save to q0
+ vrev32.8 d1, d1
+ vsubl.u8 q0, d0, d1
+ vmovl.u8 q1, d2
+ vmul.s16 q0, q1
+ vpaddl.s16 q0, q0
+ vpaddl.s32 q0, q0
+ vrshr.s64 q0, #5
- //Load the table {-3,-2,-1,0,1,2,3,4} to q3
- adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
- vld1.32 {d6, d7}, [r2]
+ //Load the table {-3,-2,-1,0,1,2,3,4} to q3
+ adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
+ vld1.32 {d6, d7}, [r2]
- //Duplicate the 'b','c' to q0, q1 for SIMD instruction
- vdup.s16 q1, d1[0]
- vdup.s16 q0, d0[0]
+ //Duplicate the 'b','c' to q0, q1 for SIMD instruction
+ vdup.s16 q1, d1[0]
+ vdup.s16 q0, d0[0]
- //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
- vmla.s16 q2, q0, q3
- vmla.s16 q2, q1, d6[0]
- vqrshrun.s16 d0, q2, #5
+ //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
+ vmla.s16 q2, q0, q3
+ vmla.s16 q2, q1, d6[0]
+ vqrshrun.s16 d0, q2, #5
- //Set a line of chroma MB
- vst1.u32 {d0}, [r0], r1
+ //Set a line of chroma MB
+ vst1.u32 {d0}, [r0], r1
- //Do the same processing for each line.
- mov r2, #7
+ //Do the same processing for each line.
+ mov r2, #7
loop_0_get_i_chroma_pred_plane:
- vadd.s16 q2, q1
- vqrshrun.s16 d0, q2, #5
- vst1.u32 {d0}, [r0], r1
- subs r2, #1
- bne loop_0_get_i_chroma_pred_plane
+ vadd.s16 q2, q1
+ vqrshrun.s16 d0, q2, #5
+ vst1.u32 {d0}, [r0], r1
+ subs r2, #1
+ bne loop_0_get_i_chroma_pred_plane
WELS_ASM_FUNC_END
--- a/codec/decoder/core/x86/dct.asm
+++ b/codec/decoder/core/x86/dct.asm
@@ -54,7 +54,7 @@
%endmacro
%macro MMX_SumSub 3
- movq %3, %2
+ movq %3, %2
psubw %2, %1
paddw %1, %3
%endmacro
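
MMX_SumSub is the usual butterfly used throughout this IDCT: after the macro, the first operand holds the element-wise sum and the second the difference (old second minus first), with the third register as scratch. One 16-bit lane in scalar C, for reference (hypothetical helper name):

    #include <stdint.h>

    /* What MMX_SumSub does to each 16-bit lane: a' = a + b, b' = b - a. */
    static void SumSub_ref(int16_t *a, int16_t *b) {
        int16_t t = *b;          /* movq  scratch, b */
        *b = (int16_t)(t - *a);  /* psubw b, a       */
        *a = (int16_t)(*a + t);  /* paddw a, scratch */
    }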
@@ -62,8 +62,8 @@
%macro MMX_IDCT 6
MMX_SumSub %4, %5, %6
MMX_SumSubDiv2 %3, %2, %1
- MMX_SumSub %1, %4, %6
- MMX_SumSub %3, %5, %6
+ MMX_SumSub %1, %4, %6
+ MMX_SumSub %3, %5, %6
%endmacro
@@ -96,13 +96,13 @@
movq mm2, [r2+16]
movq mm3, [r2+24]
- MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
- MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
+ MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
+ MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
- MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
+ MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
- WELS_Zero mm7
- WELS_DW32 mm6
+ WELS_Zero mm7
+ WELS_DW32 mm6
MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0]
MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1]
@@ -111,5 +111,5 @@
MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1]
- emms
+ emms
ret
--- a/codec/decoder/core/x86/intra_pred.asm
+++ b/codec/decoder/core/x86/intra_pred.asm
@@ -36,10 +36,10 @@
;*
;* History
;* 18/09/2009 Created
-;* 19/11/2010 Added
-;* WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2,
-;* WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2
-;* and WelsDecoderIChromaPredDcNA_mmx
+;* 19/11/2010 Added
+;* WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2,
+;* WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2
+;* and WelsDecoderIChromaPredDcNA_mmx
;*
;*
;*************************************************************************/
@@ -50,11 +50,6 @@
;*******************************************************************************
SECTION .rodata align=16
-%if 1
- %define WELSEMMS emms
-%else
- %define WELSEMMS
-%endif
align 16
sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
@@ -70,7 +65,7 @@
sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4
align 16
-mmx_01bytes: times 16 db 1
+mmx_01bytes: times 16 db 1
align 16
mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
@@ -86,86 +81,86 @@
;xmm0, xmm1, xmm2, eax, ecx
;lower 64 bits of xmm0 save the result
%macro SSE2_PRED_H_4X4_TWO_LINE 5
- movd %1, [%4-1]
- movdqa %3, %1
- punpcklbw %1, %3
- movdqa %3, %1
- punpcklbw %1, %3
+ movd %1, [%4-1]
+ movdqa %3, %1
+ punpcklbw %1, %3
+ movdqa %3, %1
+ punpcklbw %1, %3
- ;add %4, %5
- movd %2, [%4+%5-1]
- movdqa %3, %2
- punpcklbw %2, %3
- movdqa %3, %2
- punpcklbw %2, %3
- punpckldq %1, %2
+ ;add %4, %5
+ movd %2, [%4+%5-1]
+ movdqa %3, %2
+ punpcklbw %2, %3
+ movdqa %3, %2
+ punpcklbw %2, %3
+ punpckldq %1, %2
%endmacro
-%macro LOAD_COLUMN 6
- movd %1, [%5]
- movd %2, [%5+%6]
- punpcklbw %1, %2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- punpcklbw %3, %2
- punpcklwd %1, %3
- lea %5, [%5+2*%6]
- movd %4, [%5]
- movd %2, [%5+%6]
- punpcklbw %4, %2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- lea %5, [%5+2*%6]
- punpcklbw %3, %2
- punpcklwd %4, %3
- punpckhdq %1, %4
+%macro LOAD_COLUMN 6
+ movd %1, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %1, %2
+ lea %5, [%5+2*%6]
+ movd %3, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %3, %2
+ punpcklwd %1, %3
+ lea %5, [%5+2*%6]
+ movd %4, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %4, %2
+ lea %5, [%5+2*%6]
+ movd %3, [%5]
+ movd %2, [%5+%6]
+ lea %5, [%5+2*%6]
+ punpcklbw %3, %2
+ punpcklwd %4, %3
+ punpckhdq %1, %4
%endmacro
-%macro SUMW_HORIZON 3
- movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
- paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
- paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
- pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
- paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
+%macro SUMW_HORIZON 3
+ movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
+ paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
+ pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
+ paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
%endmacro
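
SUMW_HORIZON folds the eight 16-bit lanes of an XMM register into one total; the callers then extract it with movd/movsx r2, r2w, so only the sign-extended low 16 bits of the sum are kept. In scalar terms (hypothetical name, a sketch of the observable result rather than the shuffle sequence itself):

    #include <stdint.h>

    /* Horizontal reduction of eight 16-bit lanes; callers keep the low word. */
    static int16_t SumW8_ref(const int16_t w[8]) {
        int32_t s = 0;
        for (int i = 0; i < 8; i++)
            s += w[i];
        return (int16_t)s;       /* matches the movsx r2, r2w that follows */
    }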
-%macro COPY_16_TIMES 2
- movdqa %2, [%1-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
+%macro COPY_16_TIMES 2
+ movdqa %2, [%1-16]
+ psrldq %2, 15
+ pmuludq %2, [mmx_01bytes]
+ pshufd %2, %2, 0
%endmacro
-%macro COPY_16_TIMESS 3
- movdqa %2, [%1+%3-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
+%macro COPY_16_TIMESS 3
+ movdqa %2, [%1+%3-16]
+ psrldq %2, 15
+ pmuludq %2, [mmx_01bytes]
+ pshufd %2, %2, 0
%endmacro
-%macro LOAD_COLUMN_C 6
- movd %1, [%5]
- movd %2, [%5+%6]
- punpcklbw %1,%2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- punpcklbw %3, %2
- punpckhwd %1, %3
- lea %5, [%5+2*%6]
+%macro LOAD_COLUMN_C 6
+ movd %1, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %1,%2
+ lea %5, [%5+2*%6]
+ movd %3, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %3, %2
+ punpckhwd %1, %3
+ lea %5, [%5+2*%6]
%endmacro
%macro LOAD_2_LEFT_AND_ADD 0
- lea r0, [r0+2*r1]
- movzx r3, byte [r0-0x01]
- add r2, r3
- movzx r3, byte [r0+r1-0x01]
- add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01]
+ add r2, r3
+ movzx r3, byte [r0+r1-0x01]
+ add r2, r3
%endmacro
;*******************************************************************************
@@ -178,131 +173,131 @@
;*******************************************************************************
; void WelsDecoderI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
;
-; pPred must align to 16
+; pPred must align to 16
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
- movzx r2, byte [r0-1]
- movd xmm0, r2d
- pmuludq xmm0, [mmx_01bytes]
+ movzx r2, byte [r0-1]
+ movd xmm0, r2d
+ pmuludq xmm0, [mmx_01bytes]
- movzx r2, byte [r0+r1-1]
- movd xmm1, r2d
- pmuludq xmm1, [mmx_01bytes]
+ movzx r2, byte [r0+r1-1]
+ movd xmm1, r2d
+ pmuludq xmm1, [mmx_01bytes]
- lea r0, [r0+r1]
- movzx r2, byte [r0+r1-1]
- movd xmm2, r2d
- pmuludq xmm2, [mmx_01bytes]
+ lea r0, [r0+r1]
+ movzx r2, byte [r0+r1-1]
+ movd xmm2, r2d
+ pmuludq xmm2, [mmx_01bytes]
- movzx r2, byte [r0+2*r1-1]
- movd xmm3, r2d
- pmuludq xmm3, [mmx_01bytes]
+ movzx r2, byte [r0+2*r1-1]
+ movd xmm3, r2d
+ pmuludq xmm3, [mmx_01bytes]
- sub r0, r1
- movd [r0], xmm0
- movd [r0+r1], xmm1
- lea r0, [r0+2*r1]
- movd [r0], xmm2
- movd [r0+r1], xmm3
+ sub r0, r1
+ movd [r0], xmm0
+ movd [r0+r1], xmm1
+ lea r0, [r0+2*r1]
+ movd [r0], xmm2
+ movd [r0+r1], xmm3
- ret
+ ret
;*******************************************************************************
; void WelsDecoderI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_2_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- mov r4, r0 ; save r0 in r4
- sub r0, 1
- sub r0, r1
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ mov r4, r0 ; save r0 in r4
+ sub r0, 1
+ sub r0, r1
- ;for H
- pxor xmm7, xmm7
- movq xmm0, [r0]
- movdqa xmm5, [sse2_plane_dec]
- punpcklbw xmm0, xmm7
- pmullw xmm0, xmm5
- movq xmm1, [r0 + 9]
- movdqa xmm6, [sse2_plane_inc]
- punpcklbw xmm1, xmm7
- pmullw xmm1, xmm6
- psubw xmm1, xmm0
+ ;for H
+ pxor xmm7, xmm7
+ movq xmm0, [r0]
+ movdqa xmm5, [sse2_plane_dec]
+ punpcklbw xmm0, xmm7
+ pmullw xmm0, xmm5
+ movq xmm1, [r0 + 9]
+ movdqa xmm6, [sse2_plane_inc]
+ punpcklbw xmm1, xmm7
+ pmullw xmm1, xmm6
+ psubw xmm1, xmm0
- SUMW_HORIZON xmm1,xmm0,xmm2
- movd r2d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
- movsx r2, r2w
- imul r2, 5
- add r2, 32
- sar r2, 6 ; b = (5 * H + 32) >> 6;
- SSE2_Copy8Times xmm1, r2d ; xmm1 = b,b,b,b,b,b,b,b
+ SUMW_HORIZON xmm1,xmm0,xmm2
+ movd r2d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
+ movsx r2, r2w
+ imul r2, 5
+ add r2, 32
+ sar r2, 6 ; b = (5 * H + 32) >> 6;
+ SSE2_Copy8Times xmm1, r2d ; xmm1 = b,b,b,b,b,b,b,b
- movzx r3, BYTE [r0+16]
- sub r0, 3
- LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r0, r1
+ movzx r3, BYTE [r0+16]
+ sub r0, 3
+ LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r0, r1
- add r0, 3
- movzx r2, BYTE [r0+8*r1]
- add r3, r2
- shl r3, 4 ; a = (left[15*kiStride] + top[15]) << 4;
+ add r0, 3
+ movzx r2, BYTE [r0+8*r1]
+ add r3, r2
+ shl r3, 4 ; a = (left[15*kiStride] + top[15]) << 4;
- sub r0, 3
- add r0, r1
- LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r0, r1
- pxor xmm4, xmm4
- punpckhbw xmm0, xmm4
- pmullw xmm0, xmm5
- punpckhbw xmm7, xmm4
- pmullw xmm7, xmm6
- psubw xmm7, xmm0
+ sub r0, 3
+ add r0, r1
+ LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r0, r1
+ pxor xmm4, xmm4
+ punpckhbw xmm0, xmm4
+ pmullw xmm0, xmm5
+ punpckhbw xmm7, xmm4
+ pmullw xmm7, xmm6
+ psubw xmm7, xmm0
- SUMW_HORIZON xmm7,xmm0,xmm2
- movd r2d, xmm7 ; V
- movsx r2, r2w
+ SUMW_HORIZON xmm7,xmm0,xmm2
+ movd r2d, xmm7 ; V
+ movsx r2, r2w
- imul r2, 5
- add r2, 32
- sar r2, 6 ; c = (5 * V + 32) >> 6;
- SSE2_Copy8Times xmm4, r2d ; xmm4 = c,c,c,c,c,c,c,c
+ imul r2, 5
+ add r2, 32
+ sar r2, 6 ; c = (5 * V + 32) >> 6;
+ SSE2_Copy8Times xmm4, r2d ; xmm4 = c,c,c,c,c,c,c,c
- mov r0, r4
- add r3, 16
- imul r2, -7
- add r3, r2 ; s = a + 16 + (-7)*c
- SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
+ mov r0, r4
+ add r3, 16
+ imul r2, -7
+ add r3, r2 ; s = a + 16 + (-7)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
- xor r2, r2
- movdqa xmm5, [sse2_plane_inc_minus]
+ xor r2, r2
+ movdqa xmm5, [sse2_plane_inc_minus]
get_i16x16_luma_pred_plane_sse2_1:
- movdqa xmm2, xmm1
- pmullw xmm2, xmm5
- paddw xmm2, xmm0
- psraw xmm2, 5
- movdqa xmm3, xmm1
- pmullw xmm3, xmm6
- paddw xmm3, xmm0
- psraw xmm3, 5
- packuswb xmm2, xmm3
- movdqa [r0], xmm2
- paddw xmm0, xmm4
- add r0, r1
- inc r2
- cmp r2, 16
- jnz get_i16x16_luma_pred_plane_sse2_1
+ movdqa xmm2, xmm1
+ pmullw xmm2, xmm5
+ paddw xmm2, xmm0
+ psraw xmm2, 5
+ movdqa xmm3, xmm1
+ pmullw xmm3, xmm6
+ paddw xmm3, xmm0
+ psraw xmm3, 5
+ packuswb xmm2, xmm3
+ movdqa [r0], xmm2
+ paddw xmm0, xmm4
+ add r0, r1
+ inc r2
+ cmp r2, 16
+ jnz get_i16x16_luma_pred_plane_sse2_1
- POP_XMM
- pop r4
- pop r3
- ret
+ POP_XMM
+ pop r4
+ pop r3
+ ret
@@ -311,31 +306,31 @@
;*******************************************************************************
%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 2
- lea %1, [%1+%2*2]
+ lea %1, [%1+%2*2]
- COPY_16_TIMES %1, xmm0
- movdqa [%1], xmm0
- COPY_16_TIMESS %1, xmm0, %2
- movdqa [%1+%2], xmm0
+ COPY_16_TIMES %1, xmm0
+ movdqa [%1], xmm0
+ COPY_16_TIMESS %1, xmm0, %2
+ movdqa [%1+%2], xmm0
%endmacro
WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
- COPY_16_TIMES r0, xmm0
- movdqa [r0], xmm0
- COPY_16_TIMESS r0, xmm0, r1
- movdqa [r0+r1], xmm0
+ COPY_16_TIMES r0, xmm0
+ movdqa [r0], xmm0
+ COPY_16_TIMESS r0, xmm0, r1
+ movdqa [r0+r1], xmm0
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
ret
@@ -343,9 +338,9 @@
; void WelsDecoderI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredV_sse2
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
sub r0, r1
movdqa xmm0, [r0]
@@ -381,252 +376,252 @@
; void WelsDecoderIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredPlane_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_2_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- mov r4, r0
- sub r0, 1
- sub r0, r1
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ mov r4, r0
+ sub r0, 1
+ sub r0, r1
- pxor mm7, mm7
- movq mm0, [r0]
- movq mm5, [sse2_plane_dec_c]
- punpcklbw mm0, mm7
- pmullw mm0, mm5
- movq mm1, [r0 + 5]
- movq mm6, [sse2_plane_inc_c]
- punpcklbw mm1, mm7
- pmullw mm1, mm6
- psubw mm1, mm0
+ pxor mm7, mm7
+ movq mm0, [r0]
+ movq mm5, [sse2_plane_dec_c]
+ punpcklbw mm0, mm7
+ pmullw mm0, mm5
+ movq mm1, [r0 + 5]
+ movq mm6, [sse2_plane_inc_c]
+ punpcklbw mm1, mm7
+ pmullw mm1, mm6
+ psubw mm1, mm0
- movq2dq xmm1, mm1
- pxor xmm2, xmm2
- SUMW_HORIZON xmm1,xmm0,xmm2
- movd r2d, xmm1
- movsx r2, r2w
- imul r2, 17
- add r2, 16
- sar r2, 5 ; b = (17 * H + 16) >> 5;
- SSE2_Copy8Times xmm1, r2d ; mm1 = b,b,b,b,b,b,b,b
+ movq2dq xmm1, mm1
+ pxor xmm2, xmm2
+ SUMW_HORIZON xmm1,xmm0,xmm2
+ movd r2d, xmm1
+ movsx r2, r2w
+ imul r2, 17
+ add r2, 16
+ sar r2, 5 ; b = (17 * H + 16) >> 5;
+ SSE2_Copy8Times xmm1, r2d ; mm1 = b,b,b,b,b,b,b,b
- movzx r3, BYTE [r0+8]
- sub r0, 3
- LOAD_COLUMN_C mm0, mm2, mm3, mm4, r0, r1
+ movzx r3, BYTE [r0+8]
+ sub r0, 3
+ LOAD_COLUMN_C mm0, mm2, mm3, mm4, r0, r1
- add r0, 3
- movzx r2, BYTE [r0+4*r1]
- add r3, r2
- shl r3, 4 ; a = (left[7*kiStride] + top[7]) << 4;
+ add r0, 3
+ movzx r2, BYTE [r0+4*r1]
+ add r3, r2
+ shl r3, 4 ; a = (left[7*kiStride] + top[7]) << 4;
- sub r0, 3
- add r0, r1
- LOAD_COLUMN_C mm7, mm2, mm3, mm4, r0, r1
- pxor mm4, mm4
- punpckhbw mm0, mm4
- pmullw mm0, mm5
- punpckhbw mm7, mm4
- pmullw mm7, mm6
- psubw mm7, mm0
+ sub r0, 3
+ add r0, r1
+ LOAD_COLUMN_C mm7, mm2, mm3, mm4, r0, r1
+ pxor mm4, mm4
+ punpckhbw mm0, mm4
+ pmullw mm0, mm5
+ punpckhbw mm7, mm4
+ pmullw mm7, mm6
+ psubw mm7, mm0
- movq2dq xmm7, mm7
- pxor xmm2, xmm2
- SUMW_HORIZON xmm7,xmm0,xmm2
- movd r2d, xmm7 ; V
- movsx r2, r2w
+ movq2dq xmm7, mm7
+ pxor xmm2, xmm2
+ SUMW_HORIZON xmm7,xmm0,xmm2
+ movd r2d, xmm7 ; V
+ movsx r2, r2w
- imul r2, 17
- add r2, 16
- sar r2, 5 ; c = (17 * V + 16) >> 5;
- SSE2_Copy8Times xmm4, r2d ; mm4 = c,c,c,c,c,c,c,c
+ imul r2, 17
+ add r2, 16
+ sar r2, 5 ; c = (17 * V + 16) >> 5;
+ SSE2_Copy8Times xmm4, r2d ; mm4 = c,c,c,c,c,c,c,c
- mov r0, r4
- add r3, 16
- imul r2, -3
- add r3, r2 ; s = a + 16 + (-3)*c
- SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
+ mov r0, r4
+ add r3, 16
+ imul r2, -3
+ add r3, r2 ; s = a + 16 + (-3)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
- xor r2, r2
- movdqa xmm5, [sse2_plane_mul_b_c]
+ xor r2, r2
+ movdqa xmm5, [sse2_plane_mul_b_c]
get_i_chroma_pred_plane_sse2_1:
- movdqa xmm2, xmm1
- pmullw xmm2, xmm5
- paddw xmm2, xmm0
- psraw xmm2, 5
- packuswb xmm2, xmm2
- movq [r0], xmm2
- paddw xmm0, xmm4
- add r0, r1
- inc r2
- cmp r2, 8
- jnz get_i_chroma_pred_plane_sse2_1
+ movdqa xmm2, xmm1
+ pmullw xmm2, xmm5
+ paddw xmm2, xmm0
+ psraw xmm2, 5
+ packuswb xmm2, xmm2
+ movq [r0], xmm2
+ paddw xmm0, xmm4
+ add r0, r1
+ inc r2
+ cmp r2, 8
+ jnz get_i_chroma_pred_plane_sse2_1
- POP_XMM
- pop r4
- pop r3
- WELSEMMS
- ret
+ POP_XMM
+ pop r4
+ pop r3
+ WELSEMMS
+ ret
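
The chroma plane predictor mirrors the 16x16 luma one on an 8x8 block with different constants, and the comments above give them directly: b = (17*H + 16) >> 5, c = (17*V + 16) >> 5, a = (left[7*kiStride] + top[7]) << 4, and each sample is (a + b*(j - 3) + c*(i - 3) + 16) >> 5, clipped to 8 bits. A scalar sketch with hypothetical names, not the project's reference code:

    #include <stdint.h>

    static void IChromaPredPlane_ref(uint8_t *pPred, int32_t kiStride) {
        const uint8_t *pTop  = pPred - kiStride;   /* top[-1..7]          */
        const uint8_t *pLeft = pPred - 1;          /* left[-1..7] by rows */
        int H = 0, V = 0;
        for (int i = 1; i <= 4; i++) {
            H += i * (pTop[3 + i] - pTop[3 - i]);
            V += i * (pLeft[(3 + i) * kiStride] - pLeft[(3 - i) * kiStride]);
        }
        int a = (pTop[7] + pLeft[7 * kiStride]) << 4;
        int b = (17 * H + 16) >> 5;
        int c = (17 * V + 16) >> 5;
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++) {
                int v = (a + b * (x - 3) + c * (y - 3) + 16) >> 5;
                pPred[y * kiStride + x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
            }
    }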
;*******************************************************************************
-; 0 |1 |2 |3 |4 |
-; 6 |7 |8 |9 |10|
-; 11|12|13|14|15|
-; 16|17|18|19|20|
-; 21|22|23|24|25|
-; 7 is the start pixel of current 4x4 block
-; pPred[7] = ([6]+[0]*2+[1]+2)/4
+; 0 |1 |2 |3 |4 |
+; 6 |7 |8 |9 |10|
+; 11|12|13|14|15|
+; 16|17|18|19|20|
+; 21|22|23|24|25|
+; 7 is the start pixel of current 4x4 block
+; pPred[7] = ([6]+[0]*2+[1]+2)/4
;
; void WelsDecoderI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
;
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
- movq mm1,[r2+r1-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
- movq mm2,[r2-8] ;get value of 6 mm2[8] = 6
- sub r2, r1 ;mov eax to above line of current block(postion of 1)
- punpckhbw mm2,[r2-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
- movd mm3,[r2] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
- punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
- psllq mm3,18h ;mm3[5]=[1]
- psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
- movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- lea r2,[r2+r1*2-8h] ;set eax point to 12
- movq mm4,[r2+r1] ;get value of 16, mm4[8]=[16]
- psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
- psrlq mm4,38h ;mm4[1]=[16]
- por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
- movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
- movq mm4,[r2+r1*2] ;mm4[8]=[21]
- psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
- psrlq mm4,38h ;mm4[1]=[21]
- por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
- movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
- pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
- pxor mm1,mm4 ;find odd value in the lowest bit of each byte
- pand mm1,[mmx_01bytes] ;set the odd bit
- psubusb mm3,mm1 ;decrease 1 from odd bytes
- pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
+ movq mm1,[r2+r1-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
+ movq mm2,[r2-8] ;get value of 6 mm2[8] = 6
+ sub r2, r1 ;mov eax to above line of current block(postion of 1)
+ punpckhbw mm2,[r2-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+ movd mm3,[r2] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+ punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
+ psllq mm3,18h ;mm3[5]=[1]
+ psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+ por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
+ movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+ lea r2,[r2+r1*2-8h] ;set eax point to 12
+ movq mm4,[r2+r1] ;get value of 16, mm4[8]=[16]
+ psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
+ psrlq mm4,38h ;mm4[1]=[16]
+ por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
+ movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
+ movq mm4,[r2+r1*2] ;mm4[8]=[21]
+ psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
+ psrlq mm4,38h ;mm4[1]=[21]
+ por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
+ movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
+ pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
+ pxor mm1,mm4 ;find odd value in the lowest bit of each byte
+ pand mm1,[mmx_01bytes] ;set the odd bit
+ psubusb mm3,mm1 ;decrease 1 from odd bytes
+ pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
- lea r0,[r0+r1]
- movd [r0+2*r1],mm2
- sub r0,r1
- psrlq mm2,8
- movd [r0+2*r1],mm2
- psrlq mm2,8
- movd [r0+r1],mm2
- psrlq mm2,8
- movd [r0],mm2
- WELSEMMS
- ret
+ lea r0,[r0+r1]
+ movd [r0+2*r1],mm2
+ sub r0,r1
+ psrlq mm2,8
+ movd [r0+2*r1],mm2
+ psrlq mm2,8
+ movd [r0+r1],mm2
+ psrlq mm2,8
+ movd [r0],mm2
+ WELSEMMS
+ ret
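
The diagonal-down-right routine above gathers the left column, the corner and the top row into one register and applies a rounded 3-tap filter along the down-right diagonal; the 5x5 numbering diagram before the function gives one instance, pPred[7] = ([6] + [0]*2 + [1] + 2)/4. A scalar sketch, illustrative only, with hypothetical names:

    #include <stdint.h>

    /* 4x4 diagonal-down-right prediction.  s[] packs the neighbours as
     * { l3, l2, l1, l0, lt, t0, t1, t2, t3 }; each output is the rounded
     * 3-tap filter (s[k-1] + 2*s[k] + s[k+1] + 2) >> 2 along the diagonal,
     * so pred[y][x] = f[4 + x - y].  For x == y this is
     * (l0 + 2*lt + t0 + 2) >> 2, the example quoted in the diagram above. */
    static void I4x4PredDDR_ref(uint8_t *pPred, int32_t kiStride) {
        uint8_t s[9], f[8];
        for (int i = 0; i < 4; i++)
            s[i] = pPred[(3 - i) * kiStride - 1];   /* l3, l2, l1, l0 */
        for (int i = 0; i < 5; i++)
            s[4 + i] = pPred[-kiStride + i - 1];    /* lt, t0..t3     */
        for (int k = 1; k <= 7; k++)
            f[k] = (uint8_t)((s[k - 1] + 2 * s[k] + s[k + 1] + 2) >> 2);
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pPred[y * kiStride + x] = f[4 + x - y];
    }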
;*******************************************************************************
-; void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
+; void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
; copy 8 pixel of 8 line from left
;*******************************************************************************
%macro MMX_PRED_H_8X8_ONE_LINE 4
- movq %1, [%3-8]
- psrlq %1, 38h
+ movq %1, [%3-8]
+ psrlq %1, 38h
- pmullw %1, [mmx_01bytes]
- pshufw %1, %1, 0
- movq [%4], %1
+ pmullw %1, [mmx_01bytes]
+ pshufw %1, %1, 0
+ movq [%4], %1
%endmacro
%macro MMX_PRED_H_8X8_ONE_LINEE 4
- movq %1, [%3+r1-8]
- psrlq %1, 38h
+ movq %1, [%3+r1-8]
+ psrlq %1, 38h
- pmullw %1, [mmx_01bytes]
- pshufw %1, %1, 0
- movq [%4], %1
+ pmullw %1, [mmx_01bytes]
+ pshufw %1, %1, 0
+ movq [%4], %1
%endmacro
WELS_EXTERN WelsDecoderIChromaPredH_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
- movq mm0, [r2-8]
- psrlq mm0, 38h
+ movq mm0, [r2-8]
+ psrlq mm0, 38h
- pmullw mm0, [mmx_01bytes]
- pshufw mm0, mm0, 0
- movq [r0], mm0
+ pmullw mm0, [mmx_01bytes]
+ pshufw mm0, mm0, 0
+ movq [r0], mm0
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
- lea r2, [r2+r1*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
+ lea r2, [r2+r1*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
- lea r0, [r0+2*r1]
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+ lea r0, [r0+2*r1]
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
- lea r2, [r2+r1*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
+ lea r2, [r2+r1*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
- lea r0, [r0+2*r1]
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+ lea r0, [r0+2*r1]
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
- lea r2, [r2+r1*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
+ lea r2, [r2+r1*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
- lea r0, [r0+2*r1]
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+ lea r0, [r0+2*r1]
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
- WELSEMMS
- ret
+ WELSEMMS
+ ret
;*******************************************************************************
-; void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
+; void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
; copy 8 pixels from top 8 pixels
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredV_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
- sub r0, r1
- movq mm0, [r0]
+ sub r0, r1
+ movq mm0, [r0]
- movq [r0+r1], mm0
- movq [r0+2*r1], mm0
- lea r0, [r0+2*r1]
- movq [r0+r1], mm0
- movq [r0+2*r1], mm0
- lea r0, [r0+2*r1]
- movq [r0+r1], mm0
- movq [r0+2*r1], mm0
- lea r0, [r0+2*r1]
- movq [r0+r1], mm0
- movq [r0+2*r1], mm0
+ movq [r0+r1], mm0
+ movq [r0+2*r1], mm0
+ lea r0, [r0+2*r1]
+ movq [r0+r1], mm0
+ movq [r0+2*r1], mm0
+ lea r0, [r0+2*r1]
+ movq [r0+r1], mm0
+ movq [r0+2*r1], mm0
+ lea r0, [r0+2*r1]
+ movq [r0+r1], mm0
+ movq [r0+2*r1], mm0
- WELSEMMS
- ret
+ WELSEMMS
+ ret
;*******************************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; t3 will never been used
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; t3 will never been used
; destination:
-; |a |b |c |d |
-; |e |f |a |b |
-; |g |h |e |f |
-; |i |j |g |h |
+; |a |b |c |d |
+; |e |f |a |b |
+; |g |h |e |f |
+; |i |j |g |h |
; a = (1 + lt + l0)>>1
; e = (1 + l0 + l1)>>1
@@ -645,73 +640,73 @@
; void WelsDecoderI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
- sub r2, r1
- movd mm0, [r2-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
- psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
+ sub r2, r1
+ movd mm0, [r2-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+ psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
- movd mm1, [r2+2*r1-4]
- punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1
- lea r2, [r2+2*r1]
- movd mm2, [r2+2*r1-4]
- punpcklbw mm2, [r2+r1-4] ; mm2[7] = l2, mm2[6] = l3
- punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
- psrlq mm2, 20h
- pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
+ movd mm1, [r2+2*r1-4]
+ punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r2, [r2+2*r1]
+ movd mm2, [r2+2*r1-4]
+ punpcklbw mm2, [r2+r1-4] ; mm2[7] = l2, mm2[6] = l3
+ punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
+ psrlq mm2, 20h
+ pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
- movq mm1, mm0
- psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
- movq mm2, mm0
- psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
- movq mm3, mm2
- movq mm4, mm1
- pavgb mm1, mm0
+ movq mm1, mm0
+ psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
+ movq mm2, mm0
+ psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
+ movq mm3, mm2
+ movq mm4, mm1
+ pavgb mm1, mm0
- pxor mm4, mm0 ; find odd value in the lowest bit of each byte
- pand mm4, [mmx_01bytes] ; set the odd bit
- psubusb mm1, mm4 ; decrease 1 from odd bytes
+ pxor mm4, mm0 ; find odd value in the lowest bit of each byte
+ pand mm4, [mmx_01bytes] ; set the odd bit
+ psubusb mm1, mm4 ; decrease 1 from odd bytes
- pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
+ pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
- movq mm4, mm0
- pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
- punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
+ movq mm4, mm0
+ pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
+ punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
- psrlq mm2, 20h
- psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
- movq mm4, mm3
- psrlq mm4, 10h ; mm4 = [0 0 b a f e h j]
- pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
- psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
+ psrlq mm2, 20h
+ psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
+ movq mm4, mm3
+ psrlq mm4, 10h ; mm4 = [0 0 b a f e h j]
+ pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
+ psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
- movd [r0], mm2
- lea r0, [r0+r1]
- movd [r0+2*r1], mm3
- sub r0, r1
- psrlq mm3, 10h
- movd [r0+2*r1], mm3
- psrlq mm3, 10h
- movd [r0+r1], mm3
- WELSEMMS
- ret
+ movd [r0], mm2
+ lea r0, [r0+r1]
+ movd [r0+2*r1], mm3
+ sub r0, r1
+ psrlq mm3, 10h
+ movd [r0+2*r1], mm3
+ psrlq mm3, 10h
+ movd [r0+r1], mm3
+ WELSEMMS
+ ret
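
Horizontal-down: the destination map and the first two values are given in the comment block before the function (a = (1 + lt + l0)>>1, e = (1 + l0 + l1)>>1); the remaining values follow the same 2-tap/3-tap pattern from the H.264 intra prediction rules. A scalar sketch, hedged as illustrative, with hypothetical names:

    #include <stdint.h>

    static void I4x4PredHD_ref(uint8_t *pPred, int32_t kiStride) {
        const uint8_t *t = pPred - kiStride;            /* t0..t2 used */
        uint8_t lt = t[-1], l[4];
        for (int i = 0; i < 4; i++)
            l[i] = pPred[i * kiStride - 1];             /* l0..l3      */
        uint8_t a = (lt + l[0] + 1) >> 1;
        uint8_t b = (l[0] + 2 * lt + t[0] + 2) >> 2;
        uint8_t c = (lt + 2 * t[0] + t[1] + 2) >> 2;
        uint8_t d = (t[0] + 2 * t[1] + t[2] + 2) >> 2;
        uint8_t e = (l[0] + l[1] + 1) >> 1;
        uint8_t f = (lt + 2 * l[0] + l[1] + 2) >> 2;
        uint8_t g = (l[1] + l[2] + 1) >> 1;
        uint8_t h = (l[0] + 2 * l[1] + l[2] + 2) >> 2;
        uint8_t i2 = (l[2] + l[3] + 1) >> 1;
        uint8_t j = (l[1] + 2 * l[2] + l[3] + 2) >> 2;
        const uint8_t m[4][4] = { { a, b, c, d }, { e, f, a, b },
                                  { g, h, e, f }, { i2, j, g, h } };
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pPred[y * kiStride + x] = m[y][x];
    }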
;*******************************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; t3 will never been used
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; t3 will never been used
; destination:
-; |a |b |c |d |
-; |c |d |e |f |
-; |e |f |g |g |
-; |g |g |g |g |
+; |a |b |c |d |
+; |c |d |e |f |
+; |e |f |g |g |
+; |g |g |g |g |
; a = (1 + l0 + l1)>>1
; c = (1 + l1 + l2)>>1
@@ -727,74 +722,74 @@
; void WelsDecoderI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
- movd mm0, [r2-4] ; mm0[3] = l0
- punpcklbw mm0, [r2+r1-4] ; mm0[7] = l1, mm0[6] = l0
- lea r2, [r2+2*r1]
- movd mm2, [r2-4] ; mm2[3] = l2
- movd mm4, [r2+r1-4] ; mm4[3] = l3
- punpcklbw mm2, mm4
- punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
+ movd mm0, [r2-4] ; mm0[3] = l0
+ punpcklbw mm0, [r2+r1-4] ; mm0[7] = l1, mm0[6] = l0
+ lea r2, [r2+2*r1]
+ movd mm2, [r2-4] ; mm2[3] = l2
+ movd mm4, [r2+r1-4] ; mm4[3] = l3
+ punpcklbw mm2, mm4
+ punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
- psrlq mm4, 18h
- psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
- psrlq mm0, 8h
- pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
+ psrlq mm4, 18h
+ psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
+ psrlq mm0, 8h
+ pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
- movq mm1, mm0
- psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
- movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
- pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
+ movq mm1, mm0
+ psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
+ movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
+ pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
- movq mm2, mm0
- psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
- movq mm5, mm2
- pavgb mm2, mm0
+ movq mm2, mm0
+ psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
+ movq mm5, mm2
+ pavgb mm2, mm0
- pxor mm5, mm0 ; find odd value in the lowest bit of each byte
- pand mm5, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm5 ; decrease 1 from odd bytes
+ pxor mm5, mm0 ; find odd value in the lowest bit of each byte
+ pand mm5, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm5 ; decrease 1 from odd bytes
- pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
+ pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
- psrlq mm2, 8h
- pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
+ psrlq mm2, 8h
+ pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
- punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
- punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
- punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
+ punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
+ punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
+ punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
- psrlq mm4, 20h
- lea r0, [r0+r1]
- movd [r0+2*r1], mm4
+ psrlq mm4, 20h
+ lea r0, [r0+r1]
+ movd [r0+2*r1], mm4
- sub r0, r1
- movd [r0], mm1
- psrlq mm1, 10h
- movd [r0+r1], mm1
- psrlq mm1, 10h
- movd [r0+2*r1], mm1
- WELSEMMS
- ret
+ sub r0, r1
+ movd [r0], mm1
+ psrlq mm1, 10h
+ movd [r0+r1], mm1
+ psrlq mm1, 10h
+ movd [r0+2*r1], mm1
+ WELSEMMS
+ ret
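
Horizontal-up: only the left column feeds the prediction, and everything from g onward is just l3, as the destination map before the function shows (with a = (1 + l0 + l1)>>1 and c = (1 + l1 + l2)>>1). A scalar sketch, illustrative only, hypothetical names:

    #include <stdint.h>

    static void I4x4PredHU_ref(uint8_t *pPred, int32_t kiStride) {
        uint8_t l[4];
        for (int i = 0; i < 4; i++)
            l[i] = pPred[i * kiStride - 1];             /* l0..l3 */
        uint8_t a = (l[0] + l[1] + 1) >> 1;
        uint8_t b = (l[0] + 2 * l[1] + l[2] + 2) >> 2;
        uint8_t c = (l[1] + l[2] + 1) >> 1;
        uint8_t d = (l[1] + 2 * l[2] + l[3] + 2) >> 2;
        uint8_t e = (l[2] + l[3] + 1) >> 1;
        uint8_t f = (l[2] + 3 * l[3] + 2) >> 2;
        uint8_t g = l[3];
        const uint8_t m[4][4] = { { a, b, c, d }, { c, d, e, f },
                                  { e, f, g, g }, { g, g, g, g } };
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pPred[y * kiStride + x] = m[y][x];
    }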
;*******************************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; l3 will never been used
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; l3 will never been used
; destination:
-; |a |b |c |d |
-; |e |f |g |h |
-; |i |a |b |c |
-; |j |e |f |g |
+; |a |b |c |d |
+; |e |f |g |h |
+; |i |a |b |c |
+; |j |e |f |g |
; a = (1 + lt + t0)>>1
; b = (1 + t0 + t1)>>1
@@ -812,77 +807,77 @@
; void WelsDecoderI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
- sub r2, r1
- movq mm0, [r2-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
- psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
+ sub r2, r1
+ movq mm0, [r2-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+ psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
- movd mm1, [r2+2*r1-4]
- punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1
- lea r2, [r2+2*r1]
- movq mm2, [r2+r1-8] ; mm2[7] = l2
- punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
- psrlq mm2, 28h
- pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
+ movd mm1, [r2+2*r1-4]
+ punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r2, [r2+2*r1]
+ movq mm2, [r2+r1-8] ; mm2[7] = l2
+ punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
+ psrlq mm2, 28h
+ pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
- movq mm1, mm0
- psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
- pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
+ movq mm1, mm0
+ psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
+ pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
- movq mm2, mm0
- psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
- movq mm3, mm2
- pavgb mm2, mm0
+ movq mm2, mm0
+ psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
+ movq mm3, mm2
+ pavgb mm2, mm0
- pxor mm3, mm0 ; find odd value in the lowest bit of each byte
- pand mm3, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm3 ; decrease 1 from odd bytes
+ pxor mm3, mm0 ; find odd value in the lowest bit of each byte
+ pand mm3, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm3 ; decrease 1 from odd bytes
- movq mm3, mm0
- psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
- pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
- movq mm2, mm3
+ movq mm3, mm0
+ psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
+ pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
+ movq mm2, mm3
- psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
- movd [r0], mm1
+ psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
+ movd [r0], mm1
- psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
- movd [r0+r1], mm2
+ psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
+ movd [r0+r1], mm2
- movq mm4, mm3
- psllq mm4, 20h
- psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
+ movq mm4, mm3
+ psllq mm4, 20h
+ psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
- movq mm5, mm3
- psllq mm5, 28h
- psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
+ movq mm5, mm3
+ psllq mm5, 28h
+ psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
- psllq mm1, 8h
- pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
- movd [r0+2*r1], mm4
+ psllq mm1, 8h
+ pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
+ movd [r0+2*r1], mm4
- psllq mm2, 8h
- pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
- lea r0, [r0+2*r1]
- movd [r0+r1], mm5
- WELSEMMS
- ret
+ psllq mm2, 8h
+ pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
+ lea r0, [r0+2*r1]
+ movd [r0+r1], mm5
+ WELSEMMS
+ ret
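
Vertical-right: rows 0 and 1 are the 2-tap and 3-tap filters of the corner and top row (a = (1 + lt + t0)>>1, b = (1 + t0 + t1)>>1 in the comment block), and rows 2 and 3 repeat them shifted right with two extra values i and j built from the left column. A scalar sketch, illustrative only, hypothetical names:

    #include <stdint.h>

    static void I4x4PredVR_ref(uint8_t *pPred, int32_t kiStride) {
        const uint8_t *t = pPred - kiStride;            /* t0..t3 */
        uint8_t lt = t[-1], l[3];
        for (int i = 0; i < 3; i++)
            l[i] = pPred[i * kiStride - 1];             /* l0..l2 */
        uint8_t a = (lt + t[0] + 1) >> 1;
        uint8_t b = (t[0] + t[1] + 1) >> 1;
        uint8_t c = (t[1] + t[2] + 1) >> 1;
        uint8_t d = (t[2] + t[3] + 1) >> 1;
        uint8_t e = (l[0] + 2 * lt + t[0] + 2) >> 2;
        uint8_t f = (lt + 2 * t[0] + t[1] + 2) >> 2;
        uint8_t g = (t[0] + 2 * t[1] + t[2] + 2) >> 2;
        uint8_t h = (t[1] + 2 * t[2] + t[3] + 2) >> 2;
        uint8_t i2 = (lt + 2 * l[0] + l[1] + 2) >> 2;
        uint8_t j = (l[0] + 2 * l[1] + l[2] + 2) >> 2;
        const uint8_t m[4][4] = { { a, b, c, d }, { e, f, g, h },
                                  { i2, a, b, c }, { j, e, f, g } };
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pPred[y * kiStride + x] = m[y][x];
    }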
;*******************************************************************************
-; lt|t0|t1|t2|t3|t4|t5|t6|t7
-; l0|
-; l1|
-; l2|
-; l3|
-; lt,t0,t1,t2,t3 will never been used
+; lt|t0|t1|t2|t3|t4|t5|t6|t7
+; l0|
+; l1|
+; l2|
+; l3|
+; lt,t0,t1,t2,t3 will never been used
; destination:
-; |a |b |c |d |
-; |b |c |d |e |
-; |c |d |e |f |
-; |d |e |f |g |
+; |a |b |c |d |
+; |b |c |d |e |
+; |c |d |e |f |
+; |d |e |f |g |
; a = (2 + t0 + t2 + (t1<<1))>>2
; b = (2 + t1 + t3 + (t2<<1))>>2
@@ -898,56 +893,56 @@
; void WelsDecoderI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
- sub r2, r1
- movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
- movq mm1, mm0
- movq mm2, mm0
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
+ sub r2, r1
+ movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+ movq mm1, mm0
+ movq mm2, mm0
- movq mm3, mm0
- psrlq mm3, 38h
- psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
+ movq mm3, mm0
+ psrlq mm3, 38h
+ psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
- psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
- psrlq mm2, 8h
- pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
+ psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
+ psrlq mm2, 8h
+ pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
- movq mm3, mm1
- pavgb mm1, mm2
- pxor mm3, mm2 ; find odd value in the lowest bit of each byte
- pand mm3, [mmx_01bytes] ; set the odd bit
- psubusb mm1, mm3 ; decrease 1 from odd bytes
+ movq mm3, mm1
+ pavgb mm1, mm2
+ pxor mm3, mm2 ; find odd value in the lowest bit of each byte
+ pand mm3, [mmx_01bytes] ; set the odd bit
+ psubusb mm1, mm3 ; decrease 1 from odd bytes
- pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
+ pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
- psrlq mm0, 8h
- movd [r0], mm0
- psrlq mm0, 8h
- movd [r0+r1], mm0
- psrlq mm0, 8h
- movd [r0+2*r1], mm0
- psrlq mm0, 8h
- lea r0, [r0+2*r1]
- movd [r0+r1], mm0
- WELSEMMS
- ret
+ psrlq mm0, 8h
+ movd [r0], mm0
+ psrlq mm0, 8h
+ movd [r0+r1], mm0
+ psrlq mm0, 8h
+ movd [r0+2*r1], mm0
+ psrlq mm0, 8h
+ lea r0, [r0+2*r1]
+ movd [r0+r1], mm0
+ WELSEMMS
+ ret
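
Diagonal-down-left uses only the eight top samples: every output is the rounded 3-tap filter (2 + t[k] + (t[k+1]<<1) + t[k+2]) >> 2 along the down-left diagonal, with t7 reused past the end (the comment block gives a and b explicitly, and the NEON version earlier in this diff builds the t7+t7 tail the same way). A scalar sketch, illustrative only, hypothetical names:

    #include <stdint.h>

    static void I4x4PredDDL_ref(uint8_t *pPred, int32_t kiStride) {
        const uint8_t *t = pPred - kiStride;            /* t0..t7 */
        uint8_t d[7];                                   /* a..g   */
        for (int k = 0; k < 7; k++) {
            int t2 = (k + 2 < 8) ? t[k + 2] : t[7];     /* t7 repeated at the end */
            d[k] = (uint8_t)((t[k] + 2 * t[k + 1] + t2 + 2) >> 2);
        }
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pPred[y * kiStride + x] = d[x + y];
    }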
;*******************************************************************************
-; lt|t0|t1|t2|t3|t4|t5|t6|t7
-; l0|
-; l1|
-; l2|
-; l3|
-; lt,t0,t1,t2,t3 will never been used
+; lt|t0|t1|t2|t3|t4|t5|t6|t7
+; l0|
+; l1|
+; l2|
+; l3|
+; lt,t0,t1,t2,t3 will never been used
; destination:
-; |a |b |c |d |
-; |e |f |g |h |
-; |b |c |d |i |
-; |f |g |h |j |
+; |a |b |c |d |
+; |e |f |g |h |
+; |b |c |d |i |
+; |f |g |h |j |
; a = (1 + t0 + t1)>>1
; b = (1 + t1 + t2)>>1
@@ -966,40 +961,40 @@
; void WelsDecoderI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
- sub r2, r1
- movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
- movq mm1, mm0
- movq mm2, mm0
+ sub r2, r1
+ movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+ movq mm1, mm0
+ movq mm2, mm0
- psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
- psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
+ psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
+ psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
- movq mm3, mm1
- pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
+ movq mm3, mm1
+ pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
- movq mm4, mm2
- pavgb mm2, mm0
- pxor mm4, mm0 ; find odd value in the lowest bit of each byte
- pand mm4, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm4 ; decrease 1 from odd bytes
+ movq mm4, mm2
+ pavgb mm2, mm0
+ pxor mm4, mm0 ; find odd value in the lowest bit of each byte
+ pand mm4, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm4 ; decrease 1 from odd bytes
- pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
+ pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
- movd [r0], mm3
- psrlq mm3, 8h
- movd [r0+2*r1], mm3
+ movd [r0], mm3
+ psrlq mm3, 8h
+ movd [r0+2*r1], mm3
- movd [r0+r1], mm2
- psrlq mm2, 8h
- lea r0, [r0+2*r1]
- movd [r0+r1], mm2
- WELSEMMS
- ret
+ movd [r0+r1], mm2
+ psrlq mm2, 8h
+ lea r0, [r0+2*r1]
+ movd [r0+r1], mm2
+ WELSEMMS
+ ret
;*******************************************************************************
;
@@ -1006,93 +1001,93 @@
; void WelsDecoderIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDc_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r4, r0
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r4, r0
- sub r0, r1
- movq mm0, [r0]
+ sub r0, r1
+ movq mm0, [r0]
- movzx r2, byte [r0+r1-0x01] ; l1
- lea r0, [r0+2*r1]
- movzx r3, byte [r0-0x01] ; l2
- add r2, r3
- movzx r3, byte [r0+r1-0x01] ; l3
- add r2, r3
- lea r0, [r0+2*r1]
- movzx r3, byte [r0-0x01] ; l4
- add r2, r3
- movd mm1, r2d ; mm1 = l1+l2+l3+l4
+ movzx r2, byte [r0+r1-0x01] ; l1
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01] ; l2
+ add r2, r3
+ movzx r3, byte [r0+r1-0x01] ; l3
+ add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01] ; l4
+ add r2, r3
+ movd mm1, r2d ; mm1 = l1+l2+l3+l4
- movzx r2, byte [r0+r1-0x01] ; l5
- lea r0, [r0+2*r1]
- movzx r3, byte [r0-0x01] ; l6
- add r2, r3
- movzx r3, byte [r0+r1-0x01] ; l7
- add r2, r3
- lea r0, [r0+2*r1]
- movzx r3, byte [r0-0x01] ; l8
- add r2, r3
- movd mm2, r2d ; mm2 = l5+l6+l7+l8
+ movzx r2, byte [r0+r1-0x01] ; l5
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01] ; l6
+ add r2, r3
+ movzx r3, byte [r0+r1-0x01] ; l7
+ add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01] ; l8
+ add r2, r3
+ movd mm2, r2d ; mm2 = l5+l6+l7+l8
- movq mm3, mm0
- psrlq mm0, 0x20
- psllq mm3, 0x20
- psrlq mm3, 0x20
- pxor mm4, mm4
- psadbw mm0, mm4
- psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+ movq mm3, mm0
+ psrlq mm0, 0x20
+ psllq mm3, 0x20
+ psrlq mm3, 0x20
+ pxor mm4, mm4
+ psadbw mm0, mm4
+ psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
- paddq mm3, mm1
- movq mm1, mm2
- paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+ paddq mm3, mm1
+ movq mm1, mm2
+ paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
- movq mm4, [mmx_0x02]
+ movq mm4, [mmx_0x02]
- paddq mm0, mm4
- psrlq mm0, 0x02
+ paddq mm0, mm4
+ psrlq mm0, 0x02
- paddq mm2, mm4
- psrlq mm2, 0x02
+ paddq mm2, mm4
+ psrlq mm2, 0x02
- paddq mm3, mm4
- paddq mm3, mm4
- psrlq mm3, 0x03
+ paddq mm3, mm4
+ paddq mm3, mm4
+ psrlq mm3, 0x03
- paddq mm1, mm4
- paddq mm1, mm4
- psrlq mm1, 0x03
+ paddq mm1, mm4
+ paddq mm1, mm4
+ psrlq mm1, 0x03
- pmuludq mm0, [mmx_01bytes]
- pmuludq mm3, [mmx_01bytes]
- psllq mm0, 0x20
- pxor mm0, mm3 ; mm0 = m_up
+ pmuludq mm0, [mmx_01bytes]
+ pmuludq mm3, [mmx_01bytes]
+ psllq mm0, 0x20
+ pxor mm0, mm3 ; mm0 = m_up
- pmuludq mm2, [mmx_01bytes]
- pmuludq mm1, [mmx_01bytes]
- psllq mm1, 0x20
- pxor mm1, mm2 ; mm2 = m_down
+ pmuludq mm2, [mmx_01bytes]
+ pmuludq mm1, [mmx_01bytes]
+ psllq mm1, 0x20
+ pxor mm1, mm2 ; mm2 = m_down
- movq [r4], mm0
- movq [r4+r1], mm0
- movq [r4+2*r1], mm0
- lea r4, [r4+2*r1]
- movq [r4+r1], mm0
+ movq [r4], mm0
+ movq [r4+r1], mm0
+ movq [r4+2*r1], mm0
+ lea r4, [r4+2*r1]
+ movq [r4+r1], mm0
- movq [r4+2*r1], mm1
- lea r4, [r4+2*r1]
- movq [r4+r1], mm1
- movq [r4+2*r1], mm1
- lea r4, [r4+2*r1]
- movq [r4+r1], mm1
+ movq [r4+2*r1], mm1
+ lea r4, [r4+2*r1]
+ movq [r4+r1], mm1
+ movq [r4+2*r1], mm1
+ lea r4, [r4+2*r1]
+ movq [r4+r1], mm1
- pop r4
- pop r3
- WELSEMMS
- ret
+ pop r4
+ pop r3
+ WELSEMMS
+ ret
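In scalar form, the sums gathered above (l1..l4 and l5..l8 from the left column plus the two psadbw halves of the top row) implement the usual four-quadrant chroma DC rule. A hedged sketch, assuming the same pPred/kiStride layout as the other decoder routines:

    #include <stdint.h>
    /* Scalar sketch of the 8x8 chroma DC prediction: the block is split into
     * four 4x4 quadrants and each is filled with a rounded mean of its
     * available neighbours, exactly the sums the MMX code builds above. */
    static void IChromaPredDc_ref (uint8_t* pPred, const int32_t kiStride) {
      const uint8_t* pTop = pPred - kiStride;
      int32_t iTopL = 0, iTopR = 0, iLeftT = 0, iLeftB = 0;
      for (int i = 0; i < 4; i++) {
        iTopL  += pTop[i];                             /* t0..t3 */
        iTopR  += pTop[i + 4];                         /* t4..t7 */
        iLeftT += pPred[i * kiStride - 1];             /* l1..l4 */
        iLeftB += pPred[(i + 4) * kiStride - 1];       /* l5..l8 */
      }
      const uint8_t kuiDc[2][2] = {                    /* [row half][col half] */
        { (uint8_t) ((iTopL + iLeftT + 4) >> 3), (uint8_t) ((iTopR + 2) >> 2) },
        { (uint8_t) ((iLeftB + 2) >> 2), (uint8_t) ((iTopR + iLeftB + 4) >> 3) },
      };
      for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
          pPred[y * kiStride + x] = kuiDc[y >> 2][x >> 2];
    }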
@@ -1101,75 +1096,75 @@
; void WelsDecoderI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r4, r0
- sub r0, r1
- movdqa xmm0, [r0] ; read one row
- pxor xmm1, xmm1
- psadbw xmm0, xmm1
- movdqa xmm1, xmm0
- psrldq xmm1, 0x08
- pslldq xmm0, 0x08
- psrldq xmm0, 0x08
- paddw xmm0, xmm1
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r4, r0
+ sub r0, r1
+ movdqa xmm0, [r0] ; read one row
+ pxor xmm1, xmm1
+ psadbw xmm0, xmm1
+ movdqa xmm1, xmm0
+ psrldq xmm1, 0x08
+ pslldq xmm0, 0x08
+ psrldq xmm0, 0x08
+ paddw xmm0, xmm1
- movzx r2, byte [r0+r1-0x01]
- movzx r3, byte [r0+2*r1-0x01]
- add r2, r3
- lea r0, [r0+r1]
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- add r2, 0x10
- movd xmm1, r2d
- paddw xmm0, xmm1
- psrld xmm0, 0x05
- pmuludq xmm0, [mmx_01bytes]
- pshufd xmm0, xmm0, 0
+ movzx r2, byte [r0+r1-0x01]
+ movzx r3, byte [r0+2*r1-0x01]
+ add r2, r3
+ lea r0, [r0+r1]
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ add r2, 0x10
+ movd xmm1, r2d
+ paddw xmm0, xmm1
+ psrld xmm0, 0x05
+ pmuludq xmm0, [mmx_01bytes]
+ pshufd xmm0, xmm0, 0
- movdqa [r4], xmm0
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
+ movdqa [r4], xmm0
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [r4+r1], xmm0
+ movdqa [r4+r1], xmm0
- pop r4
- pop r3
+ pop r4
+ pop r3
- ret
+ ret
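Scalar equivalent of the routine above: the psadbw on the top row, the LOAD_2_LEFT_AND_ADD chain, and the add r2, 0x10 / psrld xmm0, 0x05 pair boil down to one rounded mean over 32 neighbours (sketch only, same layout assumptions):

    #include <stdint.h>
    /* Scalar sketch of the 16x16 luma DC prediction: one rounded mean of the
     * 16 top and 16 left neighbours, replicated over the whole macroblock. */
    static void I16x16LumaPredDc_ref (uint8_t* pPred, const int32_t kiStride) {
      const uint8_t* pTop = pPred - kiStride;
      int32_t iSum = 16;                               /* rounding term, as in "add r2, 0x10" */
      for (int i = 0; i < 16; i++)
        iSum += pTop[i] + pPred[i * kiStride - 1];
      const uint8_t kuiDc = (uint8_t) (iSum >> 5);
      for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
          pPred[y * kiStride + x] = kuiDc;
    }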
;*******************************************************************************
; for intra prediction as follows, 11/19/2010
@@ -1176,239 +1171,239 @@
;*******************************************************************************
;*******************************************************************************
-; void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
+; void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
- %assign push_num 0
- LOAD_2_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- mov r2, r0
- sub r2, r1
- movdqa xmm0, [r2] ; pPred-kiStride, top line
- pxor xmm7, xmm7
- psadbw xmm0, xmm7
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- paddw xmm0, xmm1
- xor r2, r2
- movd r2d, xmm0
- ;movdqa xmm1, xmm0
- ;punpcklbw xmm0, xmm7
- ;punpckhbw xmm1, xmm7
+ %assign push_num 0
+ LOAD_2_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
+ sub r2, r1
+ movdqa xmm0, [r2] ; pPred-kiStride, top line
+ pxor xmm7, xmm7
+ psadbw xmm0, xmm7
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddw xmm0, xmm1
+ xor r2, r2
+ movd r2d, xmm0
+ ;movdqa xmm1, xmm0
+ ;punpcklbw xmm0, xmm7
+ ;punpckhbw xmm1, xmm7
-    ;paddw xmm0, xmm1                                  ; (ub.max(ff) << 4) will not exceed uw, so it can be done in units of unsigned words
- ;pshufd xmm1, xmm0, 04eh ; 01001110, w3w2w1w0,w7w6w5w4
- ;paddw xmm0, xmm1 ; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
- ;pshufd xmm1, xmm0, 0b1h ; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
- ;paddw xmm0, xmm1 ; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
- ;pshuflw xmm1, xmm0, 0b1h ; 10110001
- ;paddw xmm0, xmm1 ; sum in word unit (x8)
- ;xor r3, r3
- ;movd r3d, xmm0
- ;and edx, 0ffffh
+    ;paddw xmm0, xmm1                                  ; (ub.max(ff) << 4) will not exceed uw, so it can be done in units of unsigned words
+ ;pshufd xmm1, xmm0, 04eh ; 01001110, w3w2w1w0,w7w6w5w4
+ ;paddw xmm0, xmm1 ; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
+ ;pshufd xmm1, xmm0, 0b1h ; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
+ ;paddw xmm0, xmm1 ; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
+ ;pshuflw xmm1, xmm0, 0b1h ; 10110001
+ ;paddw xmm0, xmm1 ; sum in word unit (x8)
+ ;xor r3, r3
+ ;movd r3d, xmm0
+ ;and edx, 0ffffh
- add r2, 8
- sar r2, 4
- SSE2_Copy16Times xmm1, r2d
- ;mov dh, dl
- ;mov r2, edx
- ;shl r2, 010h
- ;or edx, r2
- ;movd xmm1, edx
- ;pshufd xmm0, xmm1, 00h
- ;movdqa xmm1, xmm0
- movdqa xmm0, xmm1
- lea r2, [2*r1+r1] ; 3*kiStride
+ add r2, 8
+ sar r2, 4
+ SSE2_Copy16Times xmm1, r2d
+ ;mov dh, dl
+ ;mov r2, edx
+ ;shl r2, 010h
+ ;or edx, r2
+ ;movd xmm1, edx
+ ;pshufd xmm0, xmm1, 00h
+ ;movdqa xmm1, xmm0
+ movdqa xmm0, xmm1
+ lea r2, [2*r1+r1] ; 3*kiStride
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
- POP_XMM
- ret
+ POP_XMM
+ ret
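The top-only variant above is the same mean restricted to the 16 top neighbours, with the rounding visible in add r2, 8 / sar r2, 4; sketched:

    #include <stdint.h>
    /* Scalar sketch of the top-only DC case: mean of the 16 top neighbours,
     * rounded as in "add r2, 8 / sar r2, 4", replicated over the macroblock. */
    static void I16x16LumaPredDcTop_ref (uint8_t* pPred, const int32_t kiStride) {
      const uint8_t* pTop = pPred - kiStride;
      int32_t iSum = 8;                                /* rounding term */
      for (int i = 0; i < 16; i++)
        iSum += pTop[i];
      const uint8_t kuiDc = (uint8_t) (iSum >> 4);
      for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
          pPred[y * kiStride + x] = kuiDc;
    }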
;*******************************************************************************
-; void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
+; void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- lea r2, [2*r1+r1] ; 3*kiStride
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ lea r2, [2*r1+r1] ; 3*kiStride
- movdqa xmm0, [sse2_dc_0x80]
- movdqa xmm1, xmm0
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
+ movdqa xmm0, [sse2_dc_0x80]
+ movdqa xmm1, xmm0
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
- ret
+ ret
;*******************************************************************************
-; void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
+; void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx
- push r3
- push r4
- %assign push_num 2
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r4, r0
- ; for left
- dec r0
- xor r2, r2
- xor r3, r3
- movzx r2, byte [r0]
- movzx r3, byte [r0+r1]
- add r2, r3
- lea r0, [r0+2*r1]
- movzx r3, byte [r0]
- add r2, r3
- movzx r3, byte [r0+r1]
- add r2, r3
- add r2, 02h
- sar r2, 02h
- ;SSE2_Copy16Times mm0, r2d
- mov r3, r2
- sal r3, 8
- or r2, r3
- movd mm1, r2d
- pshufw mm0, mm1, 00h
- ;mov bh, bl
- ;movd mm1, ebx
- ;pshufw mm0, mm1, 00h ; up64
- movq mm1, mm0
- xor r2, r2
- lea r0, [r0+2*r1]
- movzx r2, byte [r0]
- movzx r3, byte [r0+r1]
- add r2, r3
- lea r0, [r0+2*r1]
- movzx r3, byte [r0]
- add r2, r3
- movzx r3, byte [r0+r1]
- add r2, r3
- add r2, 02h
- sar r2, 02h
- mov r3, r2
- sal r3, 8
- or r2, r3
- movd mm3, r2d
- pshufw mm2, mm3, 00h
- ;mov bh, bl
- ;movd mm3, ebx
- ;pshufw mm2, mm3, 00h ; down64
- ;SSE2_Copy16Times mm2, r2d
- movq mm3, mm2
- lea r2, [2*r1+r1]
- movq [r4], mm0
- movq [r4+r1], mm1
- movq [r4+2*r1], mm0
- movq [r4+r2], mm1
- lea r4, [r4+4*r1]
- movq [r4], mm2
- movq [r4+r1], mm3
- movq [r4+2*r1], mm2
- movq [r4+r2], mm3
- pop r4
- pop r3
- emms
- ret
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ mov r4, r0
+ ; for left
+ dec r0
+ xor r2, r2
+ xor r3, r3
+ movzx r2, byte [r0]
+ movzx r3, byte [r0+r1]
+ add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0]
+ add r2, r3
+ movzx r3, byte [r0+r1]
+ add r2, r3
+ add r2, 02h
+ sar r2, 02h
+ ;SSE2_Copy16Times mm0, r2d
+ mov r3, r2
+ sal r3, 8
+ or r2, r3
+ movd mm1, r2d
+ pshufw mm0, mm1, 00h
+ ;mov bh, bl
+ ;movd mm1, ebx
+ ;pshufw mm0, mm1, 00h ; up64
+ movq mm1, mm0
+ xor r2, r2
+ lea r0, [r0+2*r1]
+ movzx r2, byte [r0]
+ movzx r3, byte [r0+r1]
+ add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0]
+ add r2, r3
+ movzx r3, byte [r0+r1]
+ add r2, r3
+ add r2, 02h
+ sar r2, 02h
+ mov r3, r2
+ sal r3, 8
+ or r2, r3
+ movd mm3, r2d
+ pshufw mm2, mm3, 00h
+ ;mov bh, bl
+ ;movd mm3, ebx
+ ;pshufw mm2, mm3, 00h ; down64
+ ;SSE2_Copy16Times mm2, r2d
+ movq mm3, mm2
+ lea r2, [2*r1+r1]
+ movq [r4], mm0
+ movq [r4+r1], mm1
+ movq [r4+2*r1], mm0
+ movq [r4+r2], mm1
+ lea r4, [r4+4*r1]
+ movq [r4], mm2
+ movq [r4+r1], mm3
+ movq [r4+2*r1], mm2
+ movq [r4+r2], mm3
+ pop r4
+ pop r3
+ emms
+ ret
;*******************************************************************************
-; void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
+; void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
- %assign push_num 0
- LOAD_2_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- mov r2, r0
- sub r2, r1
- movq xmm0, [r2] ; top: 8x1 pixels
- pxor xmm7, xmm7
- punpcklbw xmm0, xmm7 ; ext 8x2 words
- pshufd xmm1, xmm0, 0B1h ; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
- paddw xmm0, xmm1 ; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2
- movdqa xmm1, xmm0
- pshuflw xmm2, xmm0, 0B1h ; 10110001 B, .. w0+2 w1+3 w0+2 w1+3
- pshufhw xmm3, xmm1, 0B1h ; 10110001 B, w4+6 w5+7 w4+6 w5+7 ..
- paddw xmm0, xmm2 ; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3
- paddw xmm1, xmm3 ; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
- punpckhqdq xmm1, xmm7
- punpcklqdq xmm0, xmm1 ; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
- movdqa xmm6, [sse2_wd_0x02]
- paddw xmm0, xmm6
- psraw xmm0, 02h
- packuswb xmm0, xmm7
- lea r2, [2*r1+r1]
- movq [r0], xmm0
- movq [r0+r1], xmm0
- movq [r0+2*r1], xmm0
- movq [r0+r2], xmm0
- lea r0, [r0+4*r1]
- movq [r0], xmm0
- movq [r0+r1], xmm0
- movq [r0+2*r1], xmm0
- movq [r0+r2], xmm0
- POP_XMM
- ret
+ %assign push_num 0
+ LOAD_2_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ mov r2, r0
+ sub r2, r1
+ movq xmm0, [r2] ; top: 8x1 pixels
+ pxor xmm7, xmm7
+ punpcklbw xmm0, xmm7 ; ext 8x2 words
+ pshufd xmm1, xmm0, 0B1h ; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
+ paddw xmm0, xmm1 ; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2
+ movdqa xmm1, xmm0
+ pshuflw xmm2, xmm0, 0B1h ; 10110001 B, .. w0+2 w1+3 w0+2 w1+3
+ pshufhw xmm3, xmm1, 0B1h ; 10110001 B, w4+6 w5+7 w4+6 w5+7 ..
+ paddw xmm0, xmm2 ; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3
+ paddw xmm1, xmm3 ; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
+ punpckhqdq xmm1, xmm7
+ punpcklqdq xmm0, xmm1 ; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
+ movdqa xmm6, [sse2_wd_0x02]
+ paddw xmm0, xmm6
+ psraw xmm0, 02h
+ packuswb xmm0, xmm7
+ lea r2, [2*r1+r1]
+ movq [r0], xmm0
+ movq [r0+r1], xmm0
+ movq [r0+2*r1], xmm0
+ movq [r0+r2], xmm0
+ lea r0, [r0+4*r1]
+ movq [r0], xmm0
+ movq [r0+r1], xmm0
+ movq [r0+2*r1], xmm0
+ movq [r0+r2], xmm0
+ POP_XMM
+ ret
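Scalar sketch of the chroma top-only DC above: the left and right halves of the 8-pixel top row are averaged independently (the sse2_wd_0x02 bias plus psraw 2), and each mean fills its own 4-column half:

    #include <stdint.h>
    /* Scalar sketch of the chroma top-only DC case: two separate 4-pixel means
     * of the top row, each computed as (sum + 2) >> 2 and filling a 4x8 half. */
    static void IChromaPredDcTop_ref (uint8_t* pPred, const int32_t kiStride) {
      const uint8_t* pTop = pPred - kiStride;
      int32_t iSumL = 2, iSumR = 2;
      for (int i = 0; i < 4; i++) {
        iSumL += pTop[i];
        iSumR += pTop[i + 4];
      }
      const uint8_t kuiDcL = (uint8_t) (iSumL >> 2);
      const uint8_t kuiDcR = (uint8_t) (iSumR >> 2);
      for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
          pPred[y * kiStride + x] = (x < 4) ? kuiDcL : kuiDcR;
    }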
;*******************************************************************************
-; void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
+; void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- lea r2, [2*r1+r1]
- movq mm0, [sse2_dc_0x80]
- movq mm1, mm0
- movq [r0], mm0
- movq [r0+r1], mm1
- movq [r0+2*r1], mm0
- movq [r0+r2], mm1
- lea r0, [r0+4*r1]
- movq [r0], mm0
- movq [r0+r1], mm1
- movq [r0+2*r1], mm0
- movq [r0+r2], mm1
- emms
- ret
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ lea r2, [2*r1+r1]
+ movq mm0, [sse2_dc_0x80]
+ movq mm1, mm0
+ movq [r0], mm0
+ movq [r0+r1], mm1
+ movq [r0+2*r1], mm0
+ movq [r0+r2], mm1
+ lea r0, [r0+4*r1]
+ movq [r0], mm0
+ movq [r0+r1], mm1
+ movq [r0+2*r1], mm0
+ movq [r0+r2], mm1
+ emms
+ ret
--- a/codec/encoder/core/arm/intra_pred_neon.S
+++ b/codec/encoder/core/arm/intra_pred_neon.S
@@ -38,107 +38,107 @@
#ifdef __APPLE__
//Global macro
.macro GET_8BYTE_DATA
- vld1.8 {$0[0]}, [$1], $2
- vld1.8 {$0[1]}, [$1], $2
- vld1.8 {$0[2]}, [$1], $2
- vld1.8 {$0[3]}, [$1], $2
- vld1.8 {$0[4]}, [$1], $2
- vld1.8 {$0[5]}, [$1], $2
- vld1.8 {$0[6]}, [$1], $2
- vld1.8 {$0[7]}, [$1], $2
+ vld1.8 {$0[0]}, [$1], $2
+ vld1.8 {$0[1]}, [$1], $2
+ vld1.8 {$0[2]}, [$1], $2
+ vld1.8 {$0[3]}, [$1], $2
+ vld1.8 {$0[4]}, [$1], $2
+ vld1.8 {$0[5]}, [$1], $2
+ vld1.8 {$0[6]}, [$1], $2
+ vld1.8 {$0[7]}, [$1], $2
.endm
#else
//Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2
- vld1.8 {\arg0[0]}, [\arg1], \arg2
- vld1.8 {\arg0[1]}, [\arg1], \arg2
- vld1.8 {\arg0[2]}, [\arg1], \arg2
- vld1.8 {\arg0[3]}, [\arg1], \arg2
- vld1.8 {\arg0[4]}, [\arg1], \arg2
- vld1.8 {\arg0[5]}, [\arg1], \arg2
- vld1.8 {\arg0[6]}, [\arg1], \arg2
- vld1.8 {\arg0[7]}, [\arg1], \arg2
+ vld1.8 {\arg0[0]}, [\arg1], \arg2
+ vld1.8 {\arg0[1]}, [\arg1], \arg2
+ vld1.8 {\arg0[2]}, [\arg1], \arg2
+ vld1.8 {\arg0[3]}, [\arg1], \arg2
+ vld1.8 {\arg0[4]}, [\arg1], \arg2
+ vld1.8 {\arg0[5]}, [\arg1], \arg2
+ vld1.8 {\arg0[6]}, [\arg1], \arg2
+ vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
#endif
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
- //Get the top line data to 'q0'
- sub r3, r1, r2
- vldm r3, {d0, d1}
+ //Get the top line data to 'q0'
+ sub r3, r1, r2
+ vldm r3, {d0, d1}
- //mov r2, #16
- mov r3, #4
- //Set the top line to the each line of MB(16*16)
+ //mov r2, #16
+ mov r3, #4
+ //Set the top line to the each line of MB(16*16)
loop_0_get_i16x16_luma_pred_v:
- vst1.8 {d0,d1}, [r0]!
- vst1.8 {d0,d1}, [r0]!
- vst1.8 {d0,d1}, [r0]!
- vst1.8 {d0,d1}, [r0]!
- subs r3, #1
- bne loop_0_get_i16x16_luma_pred_v
+ vst1.8 {d0,d1}, [r0]!
+ vst1.8 {d0,d1}, [r0]!
+ vst1.8 {d0,d1}, [r0]!
+ vst1.8 {d0,d1}, [r0]!
+ subs r3, #1
+ bne loop_0_get_i16x16_luma_pred_v
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
//stmdb sp!, {r4, lr}
- sub r1, r1, #1
- mov r3, #4
+ sub r1, r1, #1
+ mov r3, #4
loop_0_get_i16x16_luma_pred_h:
- //Get one byte data from left side
- vld1.8 {d0[],d1[]}, [r1], r2
- vld1.8 {d2[],d3[]}, [r1], r2
- vld1.8 {d4[],d5[]}, [r1], r2
- vld1.8 {d6[],d7[]}, [r1], r2
+ //Get one byte data from left side
+ vld1.8 {d0[],d1[]}, [r1], r2
+ vld1.8 {d2[],d3[]}, [r1], r2
+ vld1.8 {d4[],d5[]}, [r1], r2
+ vld1.8 {d6[],d7[]}, [r1], r2
- //Set the line of MB using the left side byte data
- vst1.8 {d0,d1}, [r0]!
- //add r0, #16
- vst1.8 {d2,d3}, [r0]!
- //add r0, #16
- vst1.8 {d4,d5}, [r0]!
- //add r0, #16
- vst1.8 {d6,d7}, [r0]!
- //add r0, #16
+ //Set the line of MB using the left side byte data
+ vst1.8 {d0,d1}, [r0]!
+ //add r0, #16
+ vst1.8 {d2,d3}, [r0]!
+ //add r0, #16
+ vst1.8 {d4,d5}, [r0]!
+ //add r0, #16
+ vst1.8 {d6,d7}, [r0]!
+ //add r0, #16
- subs r3, #1
- bne loop_0_get_i16x16_luma_pred_h
+ subs r3, #1
+ bne loop_0_get_i16x16_luma_pred_h
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
- //stmdb sp!, { r2-r5, lr}
- //Get the left vertical line data
- sub r3, r1, #1
- GET_8BYTE_DATA d0, r3, r2
- GET_8BYTE_DATA d1, r3, r2
+ //stmdb sp!, { r2-r5, lr}
+ //Get the left vertical line data
+ sub r3, r1, #1
+ GET_8BYTE_DATA d0, r3, r2
+ GET_8BYTE_DATA d1, r3, r2
- //Get the top horizontal line data
- sub r3, r1, r2
- vldm r3, {d2, d3}
+ //Get the top horizontal line data
+ sub r3, r1, r2
+ vldm r3, {d2, d3}
- //Calculate the sum of top horizontal line data and vertical line data
- vpaddl.u8 q0, q0
- vpaddl.u8 q1, q1
- vadd.u16 q0, q0, q1
- vadd.u16 d0, d0, d1
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
+ //Calculate the sum of top horizontal line data and vertical line data
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
- //Calculate the mean value
- vrshr.u16 d0, d0, #5
- vdup.8 q0, d0[0]
+ //Calculate the mean value
+ vrshr.u16 d0, d0, #5
+ vdup.8 q0, d0[0]
- //Set the mean value to the all of member of MB
- mov r3, #4
+ //Set the mean value to the all of member of MB
+ mov r3, #4
loop_0_get_i16x16_luma_pred_dc_both:
- vst1.8 {d0,d1}, [r0]!
- vst1.8 {d0,d1}, [r0]!
- vst1.8 {d0,d1}, [r0]!
- vst1.8 {d0,d1}, [r0]!
- subs r3, #1
- bne loop_0_get_i16x16_luma_pred_dc_both
+ vst1.8 {d0,d1}, [r0]!
+ vst1.8 {d0,d1}, [r0]!
+ vst1.8 {d0,d1}, [r0]!
+ vst1.8 {d0,d1}, [r0]!
+ subs r3, #1
+ bne loop_0_get_i16x16_luma_pred_dc_both
WELS_ASM_FUNC_END
@@ -151,383 +151,383 @@
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
- //stmdb sp!, { r4, lr}
+ //stmdb sp!, { r4, lr}
- //Load the table {(8,7,6,5,4,3,2,1) * 5}
- adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
- vldr d0, [r3]
+ //Load the table {(8,7,6,5,4,3,2,1) * 5}
+ adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
+ vldr d0, [r3]
- //Pack the top[-1] ~ top[6] to d1
- sub r3, r1, r2
- sub r1, r3, #1
- vld1.8 d1, [r1]
+ //Pack the top[-1] ~ top[6] to d1
+ sub r3, r1, r2
+ sub r1, r3, #1
+ vld1.8 d1, [r1]
- //Pack the top[8] ~ top[15] to d2
- add r1, #9
- vld1.8 d2, [r1]
+ //Pack the top[8] ~ top[15] to d2
+ add r1, #9
+ vld1.8 d2, [r1]
- //Save the top[15] to d6 for next step
- vdup.u8 d6, d2[7]
+ //Save the top[15] to d6 for next step
+ vdup.u8 d6, d2[7]
- //Get and pack left[-1] ~ left[6] to d4
- sub r1, r3, #1
- GET_8BYTE_DATA d4, r1, r2
+ //Get and pack left[-1] ~ left[6] to d4
+ sub r1, r3, #1
+ GET_8BYTE_DATA d4, r1, r2
- //Get and pack left[8] ~ left[15] to d3
- add r1, r2
- GET_8BYTE_DATA d3, r1, r2
+ //Get and pack left[8] ~ left[15] to d3
+ add r1, r2
+ GET_8BYTE_DATA d3, r1, r2
- //Save the left[15] to d7 for next step
- vdup.u8 d7, d3[7]
+ //Save the left[15] to d7 for next step
+ vdup.u8 d7, d3[7]
-    //reverse the sequence of d2, d3
- vrev64.8 q1, q1
+    //reverse the sequence of d2, d3
+ vrev64.8 q1, q1
- vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
- vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
+ vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
+ vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
- vmovl.u8 q0, d0
- vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
- vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
+ vmovl.u8 q0, d0
+ vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
+ vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
- //Calculate the sum of items of q1, q2
- vpadd.s16 d0, d2, d3
- vpadd.s16 d1, d4, d5
- vpaddl.s16 q0, q0
- vpaddl.s32 q0, q0
+ //Calculate the sum of items of q1, q2
+ vpadd.s16 d0, d2, d3
+ vpadd.s16 d1, d4, d5
+ vpaddl.s16 q0, q0
+ vpaddl.s32 q0, q0
- //Get the value of 'b', 'c' and extend to q1, q2.
- vrshr.s64 q0, #6
- vdup.s16 q1, d0[0]
- vdup.s16 q2, d1[0]
+ //Get the value of 'b', 'c' and extend to q1, q2.
+ vrshr.s64 q0, #6
+ vdup.s16 q1, d0[0]
+ vdup.s16 q2, d1[0]
- //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
- adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
- vld1.32 {d0}, [r3]
+ //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
+ adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
+ vld1.32 {d0}, [r3]
- //Get the value of 'a' and save to q3
- vaddl.u8 q3, d6, d7
- vshl.u16 q3, #4
+ //Get the value of 'a' and save to q3
+ vaddl.u8 q3, d6, d7
+ vshl.u16 q3, #4
- //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
- vmovl.s8 q0, d0
- vmla.s16 q3, q0, q1
- vmla.s16 q3, q2, d0[0]
+ //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
+ vmovl.s8 q0, d0
+ vmla.s16 q3, q0, q1
+ vmla.s16 q3, q2, d0[0]
- //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
- vshl.s16 q8, q1, #3
- vadd.s16 q8, q3
+ //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
+ vshl.s16 q8, q1, #3
+ vadd.s16 q8, q3
- //right shift 5 bits and rounding
- vqrshrun.s16 d0, q3, #5
- vqrshrun.s16 d1, q8, #5
+ //right shift 5 bits and rounding
+ vqrshrun.s16 d0, q3, #5
+ vqrshrun.s16 d1, q8, #5
- //Set the line of MB
- vst1.u32 {d0,d1}, [r0]!
+ //Set the line of MB
+ vst1.u32 {d0,d1}, [r0]!
- //Do the same processing for setting other lines
- mov r3, #15
+ //Do the same processing for setting other lines
+ mov r3, #15
loop_0_get_i16x16_luma_pred_plane:
- vadd.s16 q3, q2
- vadd.s16 q8, q2
- vqrshrun.s16 d0, q3, #5
- vqrshrun.s16 d1, q8, #5
- vst1.u32 {d0,d1}, [r0]!
- subs r3, #1
- bne loop_0_get_i16x16_luma_pred_plane
+ vadd.s16 q3, q2
+ vadd.s16 q8, q2
+ vqrshrun.s16 d0, q3, #5
+ vqrshrun.s16 d1, q8, #5
+ vst1.u32 {d0,d1}, [r0]!
+ subs r3, #1
+ bne loop_0_get_i16x16_luma_pred_plane
WELS_ASM_FUNC_END
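The NEON routine above is the standard 16x16 plane predictor. In the sketch below, the assumption that pPred is a contiguous 16x16 output while the neighbours come from pRef with stride kiStride is read off the post-incremented stores and the pRef-relative loads, not from a prototype in this patch; Clip255 and the _ref name are illustrative:

    #include <stdint.h>
    static uint8_t Clip255 (int32_t iVal) {
      return (uint8_t) (iVal < 0 ? 0 : (iVal > 255 ? 255 : iVal));
    }
    /* Scalar sketch of the 16x16 plane prediction: gradients b and c come from
     * weighted differences across the top row and left column, a is
     * 16 * (top[15] + left[15]), and every pel is clipped after the +16 >> 5. */
    static void I16x16LumaPredPlane_ref (uint8_t* pPred, const uint8_t* pRef, const int32_t kiStride) {
      const uint8_t* pTop  = pRef - kiStride;          /* pTop[-1] is the corner */
      const uint8_t* pLeft = pRef - 1;
      int32_t iH = 0, iV = 0;
      for (int i = 1; i <= 8; i++) {
        iH += i * (pTop[7 + i] - pTop[7 - i]);
        iV += i * (pLeft[(7 + i) * kiStride] - pLeft[(7 - i) * kiStride]);
      }
      const int32_t iA = (pTop[15] + pLeft[15 * kiStride]) << 4;
      const int32_t iB = (5 * iH + 32) >> 6;           /* the {8..1}*5 table plus vrshr #6 */
      const int32_t iC = (5 * iV + 32) >> 6;
      for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
          pPred[y * 16 + x] = Clip255 ((iA + iB * (x - 7) + iC * (y - 7) + 16) >> 5);
    }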
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredV_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row (4 bytes)
- sub r3, r1, r2
- ldr r3, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row (4 bytes)
+ sub r3, r1, r2
+ ldr r3, [r3]
- //Set the luma MB using top line
- str r3, [r0], #4
- str r3, [r0], #4
- str r3, [r0], #4
- str r3, [r0]
+ //Set the luma MB using top line
+ str r3, [r0], #4
+ str r3, [r0], #4
+ str r3, [r0], #4
+ str r3, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredH_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the left column (4 bytes)
- sub r3, r1, #1
- vld1.8 {d0[]}, [r3], r2
- vld1.8 {d1[]}, [r3], r2
- vld1.8 {d2[]}, [r3], r2
- vld1.8 {d3[]}, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the left column (4 bytes)
+ sub r3, r1, #1
+ vld1.8 {d0[]}, [r3], r2
+ vld1.8 {d1[]}, [r3], r2
+ vld1.8 {d2[]}, [r3], r2
+ vld1.8 {d3[]}, [r3]
- //Set the luma MB using the left side byte
- vst1.32 {d0[0]}, [r0]!
- vst1.32 {d1[0]}, [r0]!
- vst1.32 {d2[0]}, [r0]!
- vst1.32 {d3[0]}, [r0]
+ //Set the luma MB using the left side byte
+ vst1.32 {d0[0]}, [r0]!
+ vst1.32 {d1[0]}, [r0]!
+ vst1.32 {d2[0]}, [r0]!
+ vst1.32 {d3[0]}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDL_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row data(8 bytes)
- sub r3, r1, r2
- vld1.32 {d0}, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row data(8 bytes)
+ sub r3, r1, r2
+ vld1.32 {d0}, [r3]
- //For "t7 + (t7<<1)"
- vdup.8 d1, d0[7]
+ //For "t7 + (t7<<1)"
+ vdup.8 d1, d0[7]
- //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
- vext.8 d1, d0, d1, #1
- vaddl.u8 q1, d1, d0
+ //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
+ vext.8 d1, d0, d1, #1
+ vaddl.u8 q1, d1, d0
- //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
- vext.8 q2, q1, q1, #14
- vadd.u16 q0, q1, q2
+ //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
+ vext.8 q2, q1, q1, #14
+ vadd.u16 q0, q1, q2
- //right shift 2 bits and rounding
- vqrshrn.u16 d0, q0, #2
+ //right shift 2 bits and rounding
+ vqrshrn.u16 d0, q0, #2
- //Save "ddl0, ddl1, ddl2, ddl3"
- vext.8 d1, d0, d0, #1
- vst1.32 d1[0], [r0]!
+ //Save "ddl0, ddl1, ddl2, ddl3"
+ vext.8 d1, d0, d0, #1
+ vst1.32 d1[0], [r0]!
- //Save "ddl1, ddl2, ddl3, ddl4"
- vext.8 d1, d0, d0, #2
- vst1.32 d1[0], [r0]!
+ //Save "ddl1, ddl2, ddl3, ddl4"
+ vext.8 d1, d0, d0, #2
+ vst1.32 d1[0], [r0]!
- //Save "ddl2, ddl3, ddl4, ddl5"
- vext.8 d1, d0, d0, #3
- vst1.32 d1[0], [r0]!
+ //Save "ddl2, ddl3, ddl4, ddl5"
+ vext.8 d1, d0, d0, #3
+ vst1.32 d1[0], [r0]!
- //Save "ddl3, ddl4, ddl5, ddl6"
- vst1.32 d0[1], [r0]
+ //Save "ddl3, ddl4, ddl5, ddl6"
+ vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDR_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row (4 bytes)
- sub r3, r1, r2
- vld1.32 {d0[1]}, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row (4 bytes)
+ sub r3, r1, r2
+ vld1.32 {d0[1]}, [r3]
- //Load the left column (5 bytes)
- sub r3, #1
- vld1.8 {d0[3]}, [r3], r2
- vld1.8 {d0[2]}, [r3], r2
- vld1.8 {d0[1]}, [r3], r2
- vld1.8 {d0[0]}, [r3], r2
- vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
+ //Load the left column (5 bytes)
+ sub r3, #1
+ vld1.8 {d0[3]}, [r3], r2
+ vld1.8 {d0[2]}, [r3], r2
+ vld1.8 {d0[1]}, [r3], r2
+ vld1.8 {d0[0]}, [r3], r2
+ vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
- vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
- //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
+ vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
+ //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
- //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
- vaddl.u8 q2, d2, d0
+ //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
+ vaddl.u8 q2, d2, d0
- //q1:{TL0+LT0,LT0+T01,...L12+L23}
- vext.8 q3, q3, q2, #14
- vadd.u16 q1, q2, q3
+ //q1:{TL0+LT0,LT0+T01,...L12+L23}
+ vext.8 q3, q3, q2, #14
+ vadd.u16 q1, q2, q3
- //right shift 2 bits and rounding
- vqrshrn.u16 d0, q1, #2
+ //right shift 2 bits and rounding
+ vqrshrn.u16 d0, q1, #2
- //Adjust the data sequence for setting luma MB of 'pred'
- vst1.32 d0[1], [r0]!
- vext.8 d0, d0, d0, #7
- vst1.32 d0[1], [r0]!
- vext.8 d0, d0, d0, #7
- vst1.32 d0[1], [r0]!
- vext.8 d0, d0, d0, #7
- vst1.32 d0[1], [r0]
+ //Adjust the data sequence for setting luma MB of 'pred'
+ vst1.32 d0[1], [r0]!
+ vext.8 d0, d0, d0, #7
+ vst1.32 d0[1], [r0]!
+ vext.8 d0, d0, d0, #7
+ vst1.32 d0[1], [r0]!
+ vext.8 d0, d0, d0, #7
+ vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVL_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row (8 bytes)
- sub r3, r1, r2
- vld1.32 {d0}, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row (8 bytes)
+ sub r3, r1, r2
+ vld1.32 {d0}, [r3]
- vext.8 d1, d0, d0, #1
- vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
+ vext.8 d1, d0, d0, #1
+ vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
- vext.8 q2, q1, q1, #2
- vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
+ vext.8 q2, q1, q1, #2
+ vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
- //calculate the "vl0,vl1,vl2,vl3,vl4"
- vqrshrn.u16 d0, q1, #1
+ //calculate the "vl0,vl1,vl2,vl3,vl4"
+ vqrshrn.u16 d0, q1, #1
- //calculate the "vl5,vl6,vl7,vl8,vl9"
- vqrshrn.u16 d1, q2, #2
+ //calculate the "vl5,vl6,vl7,vl8,vl9"
+ vqrshrn.u16 d1, q2, #2
- //Adjust the data sequence for setting the luma MB
- vst1.32 d0[0], [r0]!
- vst1.32 d1[0], [r0]!
- vext.8 d0, d0, d0, #1
- vext.8 d1, d1, d1, #1
- vst1.32 d0[0], [r0]!
- vst1.32 d1[0], [r0]
+ //Adjust the data sequence for setting the luma MB
+ vst1.32 d0[0], [r0]!
+ vst1.32 d1[0], [r0]!
+ vext.8 d0, d0, d0, #1
+ vext.8 d1, d1, d1, #1
+ vst1.32 d0[0], [r0]!
+ vst1.32 d1[0], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVR_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row (4 bytes)
- sub r3, r1, r2
- vld1.32 {d0[1]}, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row (4 bytes)
+ sub r3, r1, r2
+ vld1.32 {d0[1]}, [r3]
- //Load the left column (4 bytes)
- sub r3, #1
- vld1.8 {d0[3]}, [r3], r2
- vld1.8 {d0[2]}, [r3], r2
- vld1.8 {d0[1]}, [r3], r2
- vld1.8 {d0[0]}, [r3]
+ //Load the left column (4 bytes)
+ sub r3, #1
+ vld1.8 {d0[3]}, [r3], r2
+ vld1.8 {d0[2]}, [r3], r2
+ vld1.8 {d0[1]}, [r3], r2
+ vld1.8 {d0[0]}, [r3]
- vext.8 d1, d0, d0, #7
- vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
+ vext.8 d1, d0, d0, #7
+ vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
- vext.u8 q2, q1, q1, #14
- vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
+ vext.u8 q2, q1, q1, #14
+ vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
- //Calculate the vr0 ~ vr9
- vqrshrn.u16 d1, q2, #2
- vqrshrn.u16 d0, q1, #1
+ //Calculate the vr0 ~ vr9
+ vqrshrn.u16 d1, q2, #2
+ vqrshrn.u16 d0, q1, #1
- //Adjust the data sequence for setting the luma MB
- vst1.32 d0[1], [r0]!
- vst1.32 d1[1], [r0]!
- //add r2, r0, r1
- vst1.8 d1[3], [r0]!
- vst1.16 d0[2], [r0]!
- vst1.8 d0[6], [r0]!
- vst1.8 d1[2], [r0]!
- vst1.16 d1[2], [r0]!
- vst1.8 d1[6], [r0]
+ //Adjust the data sequence for setting the luma MB
+ vst1.32 d0[1], [r0]!
+ vst1.32 d1[1], [r0]!
+ //add r2, r0, r1
+ vst1.8 d1[3], [r0]!
+ vst1.16 d0[2], [r0]!
+ vst1.8 d0[6], [r0]!
+ vst1.8 d1[2], [r0]!
+ vst1.16 d1[2], [r0]!
+ vst1.8 d1[6], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHU_neon
- //stmdb sp!, { r4, lr}
- //Load the left column data
- sub r3, r1, #1
- mov r1, #3
- mul r1, r2
- add r1, r3
- vld1.8 {d0[]}, [r1]
- vld1.8 {d0[4]}, [r3], r2
- vld1.8 {d0[5]}, [r3], r2
- vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
+ //stmdb sp!, { r4, lr}
+ //Load the left column data
+ sub r3, r1, #1
+ mov r1, #3
+ mul r1, r2
+ add r1, r3
+ vld1.8 {d0[]}, [r1]
+ vld1.8 {d0[4]}, [r3], r2
+ vld1.8 {d0[5]}, [r3], r2
+ vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
- vext.8 d1, d0, d0, #1
- vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
+ vext.8 d1, d0, d0, #1
+ vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
- vext.u8 d2, d5, d4, #2
- vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
+ vext.u8 d2, d5, d4, #2
+ vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
- //Calculate the hu0 ~ hu5
- vqrshrn.u16 d2, q2, #1
- vqrshrn.u16 d1, q1, #2
+ //Calculate the hu0 ~ hu5
+ vqrshrn.u16 d2, q2, #1
+ vqrshrn.u16 d1, q1, #2
- //Adjust the data sequence for setting the luma MB
- vzip.8 d2, d1
- vst1.32 d1[0], [r0]!
- vext.8 d2, d1, d1, #2
- vst1.32 d2[0], [r0]!
- vst1.32 d1[1], [r0]!
- vst1.32 d0[0], [r0]
+ //Adjust the data sequence for setting the luma MB
+ vzip.8 d2, d1
+ vst1.32 d1[0], [r0]!
+ vext.8 d2, d1, d1, #2
+ vst1.32 d2[0], [r0]!
+ vst1.32 d1[1], [r0]!
+ vst1.32 d0[0], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHD_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the data
- sub r3, r1, r2
- sub r3, #1
- vld1.32 {d0[1]}, [r3], r2
- vld1.8 {d0[3]}, [r3], r2
- vld1.8 {d0[2]}, [r3], r2
- vld1.8 {d0[1]}, [r3], r2
- vld1.8 {d0[0]}, [r3] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
+ //stmdb sp!, { r2-r5, lr}
+ //Load the data
+ sub r3, r1, r2
+ sub r3, #1
+ vld1.32 {d0[1]}, [r3], r2
+ vld1.8 {d0[3]}, [r3], r2
+ vld1.8 {d0[2]}, [r3], r2
+ vld1.8 {d0[1]}, [r3], r2
+ vld1.8 {d0[0]}, [r3] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
- vext.8 d1, d0, d0, #7
- vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
+ vext.8 d1, d0, d0, #7
+ vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
- vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
- vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
+ vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
+ vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
- //Calculate the hd0~hd9
- vqrshrn.u16 d1, q3, #2
- vqrshrn.u16 d0, q2, #1
+ //Calculate the hd0~hd9
+ vqrshrn.u16 d1, q3, #2
+ vqrshrn.u16 d0, q2, #1
- //Adjust the data sequence for setting the luma MB
- vmov d3, d1
- vtrn.8 d0, d1
- vext.u8 d2, d1, d1, #6
- vst2.16 {d2[3], d3[3]}, [r0]!
- vst2.16 {d0[2], d1[2]}, [r0]!
- vmov d3, d0
- vst2.16 {d2[2], d3[2]}, [r0]!
- vst2.16 {d0[1], d1[1]}, [r0]
+ //Adjust the data sequence for setting the luma MB
+ vmov d3, d1
+ vtrn.8 d0, d1
+ vext.u8 d2, d1, d1, #6
+ vst2.16 {d2[3], d3[3]}, [r0]!
+ vst2.16 {d0[2], d1[2]}, [r0]!
+ vmov d3, d0
+ vst2.16 {d2[2], d3[2]}, [r0]!
+ vst2.16 {d0[1], d1[1]}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIChromaPredV_neon
- //stmdb sp!, { r2-r5, lr}
- //Get the top row (8 byte)
- sub r3, r1, r2
- vldr d0, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ //Get the top row (8 byte)
+ sub r3, r1, r2
+ vldr d0, [r3]
- //Set the chroma MB using top row data
- vst1.8 {d0}, [r0]!
- vst1.8 {d0}, [r0]!
- vst1.8 {d0}, [r0]!
- vst1.8 {d0}, [r0]!
- vst1.8 {d0}, [r0]!
- vst1.8 {d0}, [r0]!
- vst1.8 {d0}, [r0]!
- vst1.8 {d0}, [r0]
+ //Set the chroma MB using top row data
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon
- //stmdb sp!, { r2-r5, lr}
- ////Get the left column (8 byte)
- sub r3, r1, #1
- vld1.8 {d0[]}, [r3], r2
- vld1.8 {d1[]}, [r3], r2
- vld1.8 {d2[]}, [r3], r2
- vld1.8 {d3[]}, [r3], r2
- vld1.8 {d4[]}, [r3], r2
- vld1.8 {d5[]}, [r3], r2
- vld1.8 {d6[]}, [r3], r2
- vld1.8 {d7[]}, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ ////Get the left column (8 byte)
+ sub r3, r1, #1
+ vld1.8 {d0[]}, [r3], r2
+ vld1.8 {d1[]}, [r3], r2
+ vld1.8 {d2[]}, [r3], r2
+ vld1.8 {d3[]}, [r3], r2
+ vld1.8 {d4[]}, [r3], r2
+ vld1.8 {d5[]}, [r3], r2
+ vld1.8 {d6[]}, [r3], r2
+ vld1.8 {d7[]}, [r3]
- //Set the chroma MB using left column data
- vst1.8 {d0}, [r0]!
- vst1.8 {d1}, [r0]!
- vst1.8 {d2}, [r0]!
- vst1.8 {d3}, [r0]!
- vst1.8 {d4}, [r0]!
- vst1.8 {d5}, [r0]!
- vst1.8 {d6}, [r0]!
- vst1.8 {d7}, [r0]
+ //Set the chroma MB using left column data
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d1}, [r0]!
+ vst1.8 {d2}, [r0]!
+ vst1.8 {d3}, [r0]!
+ vst1.8 {d4}, [r0]!
+ vst1.8 {d5}, [r0]!
+ vst1.8 {d6}, [r0]!
+ vst1.8 {d7}, [r0]
WELS_ASM_FUNC_END
@@ -575,73 +575,73 @@
CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
WELS_ASM_FUNC_BEGIN WelsIChromaPredPlane_neon
- //stmdb sp!, { r2-r5, lr}
- //Load the top row data
- sub r3, r1, #1
- sub r3, r2
- vld1.32 {d1[0]}, [r3]
- add r3, #5
- vld1.32 {d0[0]}, [r3]
+ //stmdb sp!, { r2-r5, lr}
+ //Load the top row data
+ sub r3, r1, #1
+ sub r3, r2
+ vld1.32 {d1[0]}, [r3]
+ add r3, #5
+ vld1.32 {d0[0]}, [r3]
- //Load the left column data
- sub r3, #5
- vld1.8 {d1[4]}, [r3], r2
- vld1.8 {d1[5]}, [r3], r2
- vld1.8 {d1[6]}, [r3], r2
- vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
- add r3, r2
- vld1.8 {d0[4]}, [r3], r2
- vld1.8 {d0[5]}, [r3], r2
- vld1.8 {d0[6]}, [r3], r2
- vld1.8 {d0[7]}, [r3] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
+ //Load the left column data
+ sub r3, #5
+ vld1.8 {d1[4]}, [r3], r2
+ vld1.8 {d1[5]}, [r3], r2
+ vld1.8 {d1[6]}, [r3], r2
+ vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
+ add r3, r2
+ vld1.8 {d0[4]}, [r3], r2
+ vld1.8 {d0[5]}, [r3], r2
+ vld1.8 {d0[6]}, [r3], r2
+ vld1.8 {d0[7]}, [r3] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
- //Save T7 to d3 for next step
- vdup.u8 d3, d0[3]
- //Save L7 to d4 for next step
- vdup.u8 d4, d0[7]
+ //Save T7 to d3 for next step
+ vdup.u8 d3, d0[3]
+ //Save L7 to d4 for next step
+ vdup.u8 d4, d0[7]
- //Calculate the value of 'a' and save to q2
- vaddl.u8 q2, d3, d4
- vshl.u16 q2, #4
+ //Calculate the value of 'a' and save to q2
+ vaddl.u8 q2, d3, d4
+ vshl.u16 q2, #4
- //Load the table {{1,2,3,4,1,2,3,4}*17}
- adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
- vld1.32 {d2}, [r3]
+ //Load the table {{1,2,3,4,1,2,3,4}*17}
+ adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
+ vld1.32 {d2}, [r3]
- //Calculate the 'b','c', and save to q0
- vrev32.8 d1, d1
- vsubl.u8 q0, d0, d1
- vmovl.u8 q1, d2
- vmul.s16 q0, q1
- vpaddl.s16 q0, q0
- vpaddl.s32 q0, q0
- vrshr.s64 q0, #5
+ //Calculate the 'b','c', and save to q0
+ vrev32.8 d1, d1
+ vsubl.u8 q0, d0, d1
+ vmovl.u8 q1, d2
+ vmul.s16 q0, q1
+ vpaddl.s16 q0, q0
+ vpaddl.s32 q0, q0
+ vrshr.s64 q0, #5
- //Load the table {-3,-2,-1,0,1,2,3,4} to q3
- adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
- vld1.32 {d6, d7}, [r3]
+ //Load the table {-3,-2,-1,0,1,2,3,4} to q3
+ adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
+ vld1.32 {d6, d7}, [r3]
- //Duplicate the 'b','c' to q0, q1 for SIMD instruction
- vdup.s16 q1, d1[0]
- vdup.s16 q0, d0[0]
+ //Duplicate the 'b','c' to q0, q1 for SIMD instruction
+ vdup.s16 q1, d1[0]
+ vdup.s16 q0, d0[0]
- //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
- vmla.s16 q2, q0, q3
- vmla.s16 q2, q1, d6[0]
- vqrshrun.s16 d0, q2, #5
+ //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
+ vmla.s16 q2, q0, q3
+ vmla.s16 q2, q1, d6[0]
+ vqrshrun.s16 d0, q2, #5
- //Set a line of chroma MB
- vst1.u32 {d0}, [r0]!
+ //Set a line of chroma MB
+ vst1.u32 {d0}, [r0]!
- //Do the same processing for each line.
- mov r3, #7
+ //Do the same processing for each line.
+ mov r3, #7
loop_0_get_i_chroma_pred_plane:
- vadd.s16 q2, q1
- vqrshrun.s16 d0, q2, #5
- vst1.u32 {d0}, [r0]!
- subs r3, #1
- bne loop_0_get_i_chroma_pred_plane
+ vadd.s16 q2, q1
+ vqrshrun.s16 d0, q2, #5
+ vst1.u32 {d0}, [r0]!
+ subs r3, #1
+ bne loop_0_get_i_chroma_pred_plane
WELS_ASM_FUNC_END
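The chroma counterpart above follows the same pattern with 4-tap gradients weighted by the {1,2,3,4}*17 table and the centre at offset 3, matching the quoted (a + b * (j - 3) + c * (- 3) + 16) >> 5 expression. A sketch under the same buffer assumptions as the luma routine:

    #include <stdint.h>
    static uint8_t Clip255 (int32_t iVal) {
      return (uint8_t) (iVal < 0 ? 0 : (iVal > 255 ? 255 : iVal));
    }
    /* Scalar sketch of the 8x8 chroma plane prediction; pPred is assumed to be
     * a contiguous 8x8 output, neighbours are read from pRef with kiStride. */
    static void IChromaPredPlane_ref (uint8_t* pPred, const uint8_t* pRef, const int32_t kiStride) {
      const uint8_t* pTop  = pRef - kiStride;
      const uint8_t* pLeft = pRef - 1;
      int32_t iH = 0, iV = 0;
      for (int i = 1; i <= 4; i++) {
        iH += i * (pTop[3 + i] - pTop[3 - i]);
        iV += i * (pLeft[(3 + i) * kiStride] - pLeft[(3 - i) * kiStride]);
      }
      const int32_t iA = (pTop[7] + pLeft[7 * kiStride]) << 4;
      const int32_t iB = (17 * iH + 16) >> 5;          /* the {1..4}*17 table plus vrshr #5 */
      const int32_t iC = (17 * iV + 16) >> 5;
      for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
          pPred[y * 8 + x] = Clip255 ((iA + iB * (x - 3) + iC * (y - 3) + 16) >> 5);
    }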
--- a/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
+++ b/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
@@ -38,59 +38,59 @@
#ifdef __APPLE__
//The data sequence will be used
.macro GET_8BYTE_DATA_L0
- vld1.8 {$0[0]}, [$1], $2
- vld1.8 {$0[1]}, [$1], $2
- vld1.8 {$0[2]}, [$1], $2
- vld1.8 {$0[3]}, [$1], $2
- vld1.8 {$0[4]}, [$1], $2
- vld1.8 {$0[5]}, [$1], $2
- vld1.8 {$0[6]}, [$1], $2
- vld1.8 {$0[7]}, [$1], $2
+ vld1.8 {$0[0]}, [$1], $2
+ vld1.8 {$0[1]}, [$1], $2
+ vld1.8 {$0[2]}, [$1], $2
+ vld1.8 {$0[3]}, [$1], $2
+ vld1.8 {$0[4]}, [$1], $2
+ vld1.8 {$0[5]}, [$1], $2
+ vld1.8 {$0[6]}, [$1], $2
+ vld1.8 {$0[7]}, [$1], $2
.endm
.macro HDM_TRANSFORM_4X4_L0
- //Do the vertical transform
- vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
- vsubl.u8 q1, $0, $1 //{2,6,10,14,3,7,11,15}
- vswp d1, d2
- vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
- vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
+ //Do the vertical transform
+ vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
+ vsubl.u8 q1, $0, $1 //{2,6,10,14,3,7,11,15}
+ vswp d1, d2
+ vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
+ vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
- //Do the horizontal transform
- vtrn.32 q2, q1
- vadd.s16 q0, q2, q1
- vsub.s16 q1, q2, q1
+ //Do the horizontal transform
+ vtrn.32 q2, q1
+ vadd.s16 q0, q2, q1
+ vsub.s16 q1, q2, q1
- vtrn.16 q0, q1
- vadd.s16 q2, q0, q1
- vsub.s16 q1, q0, q1
+ vtrn.16 q0, q1
+ vadd.s16 q2, q0, q1
+ vsub.s16 q1, q0, q1
- vmov.s16 d0, d4
- vmov.s16 d1, d2
+ vmov.s16 d0, d4
+ vmov.s16 d1, d2
- vabs.s16 d3, d3
+ vabs.s16 d3, d3
- //16x16_v
- vtrn.32 d0, d1 //{0,1,3,2}
- vaba.s16 $5, d0, $2 //16x16_v
- vaba.s16 $5, d1, $8
- vaba.s16 $5, d5, $8
- vadd.u16 $5, d3
+ //16x16_v
+ vtrn.32 d0, d1 //{0,1,3,2}
+ vaba.s16 $5, d0, $2 //16x16_v
+ vaba.s16 $5, d1, $8
+ vaba.s16 $5, d5, $8
+ vadd.u16 $5, d3
- //16x16_h
- vtrn.16 d4, d5 //{0,4,12,8}
- vaba.s16 $6, d4, $3 //16x16_h
- vabs.s16 d2, d2
- vabs.s16 d5, d5
- vadd.u16 d2, d3
- vadd.u16 d2, d5
- vadd.u16 $6, d2
+ //16x16_h
+ vtrn.16 d4, d5 //{0,4,12,8}
+ vaba.s16 $6, d4, $3 //16x16_h
+ vabs.s16 d2, d2
+ vabs.s16 d5, d5
+ vadd.u16 d2, d3
+ vadd.u16 d2, d5
+ vadd.u16 $6, d2
- //16x16_dc_both
- vaba.s16 $7, d4, $4 //16x16_dc_both
- vadd.u16 $7, d2
+ //16x16_dc_both
+ vaba.s16 $7, d4, $4 //16x16_dc_both
+ vadd.u16 $7, d2
.endm
@@ -97,58 +97,58 @@
#else
//The data sequence will be used
.macro GET_8BYTE_DATA_L0 arg0, arg1, arg2
- vld1.8 {\arg0[0]}, [\arg1], \arg2
- vld1.8 {\arg0[1]}, [\arg1], \arg2
- vld1.8 {\arg0[2]}, [\arg1], \arg2
- vld1.8 {\arg0[3]}, [\arg1], \arg2
- vld1.8 {\arg0[4]}, [\arg1], \arg2
- vld1.8 {\arg0[5]}, [\arg1], \arg2
- vld1.8 {\arg0[6]}, [\arg1], \arg2
- vld1.8 {\arg0[7]}, [\arg1], \arg2
+ vld1.8 {\arg0[0]}, [\arg1], \arg2
+ vld1.8 {\arg0[1]}, [\arg1], \arg2
+ vld1.8 {\arg0[2]}, [\arg1], \arg2
+ vld1.8 {\arg0[3]}, [\arg1], \arg2
+ vld1.8 {\arg0[4]}, [\arg1], \arg2
+ vld1.8 {\arg0[5]}, [\arg1], \arg2
+ vld1.8 {\arg0[6]}, [\arg1], \arg2
+ vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
.macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2,arg3, arg4, arg5, arg6, arg7, arg8
- //Do the vertical transform
- vaddl.u8 q0, \arg0, \arg1 //{0,4,8,12,1,5,9,13}
- vsubl.u8 q1, \arg0, \arg1 //{2,6,10,14,3,7,11,15}
- vswp d1, d2
- vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
- vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
+ //Do the vertical transform
+ vaddl.u8 q0, \arg0, \arg1 //{0,4,8,12,1,5,9,13}
+ vsubl.u8 q1, \arg0, \arg1 //{2,6,10,14,3,7,11,15}
+ vswp d1, d2
+ vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
+ vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
- //Do the horizontal transform
- vtrn.32 q2, q1
- vadd.s16 q0, q2, q1
- vsub.s16 q1, q2, q1
+ //Do the horizontal transform
+ vtrn.32 q2, q1
+ vadd.s16 q0, q2, q1
+ vsub.s16 q1, q2, q1
- vtrn.16 q0, q1
- vadd.s16 q2, q0, q1
- vsub.s16 q1, q0, q1
+ vtrn.16 q0, q1
+ vadd.s16 q2, q0, q1
+ vsub.s16 q1, q0, q1
- vmov.s16 d0, d4
- vmov.s16 d1, d2
+ vmov.s16 d0, d4
+ vmov.s16 d1, d2
- vabs.s16 d3, d3
+ vabs.s16 d3, d3
- //16x16_v
- vtrn.32 d0, d1 //{0,1,3,2}
- vaba.s16 \arg5, d0, \arg2 //16x16_v
- vaba.s16 \arg5, d1, \arg8
- vaba.s16 \arg5, d5, \arg8
- vadd.u16 \arg5, d3
+ //16x16_v
+ vtrn.32 d0, d1 //{0,1,3,2}
+ vaba.s16 \arg5, d0, \arg2 //16x16_v
+ vaba.s16 \arg5, d1, \arg8
+ vaba.s16 \arg5, d5, \arg8
+ vadd.u16 \arg5, d3
- //16x16_h
- vtrn.16 d4, d5 //{0,4,12,8}
- vaba.s16 \arg6, d4, \arg3 //16x16_h
- vabs.s16 d2, d2
- vabs.s16 d5, d5
- vadd.u16 d2, d3
- vadd.u16 d2, d5
- vadd.u16 \arg6, d2
+ //16x16_h
+ vtrn.16 d4, d5 //{0,4,12,8}
+ vaba.s16 \arg6, d4, \arg3 //16x16_h
+ vabs.s16 d2, d2
+ vabs.s16 d5, d5
+ vadd.u16 d2, d3
+ vadd.u16 d2, d5
+ vadd.u16 \arg6, d2
- //16x16_dc_both
- vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
- vadd.u16 \arg7, d2
+ //16x16_dc_both
+ vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
+ vadd.u16 \arg7, d2
.endm
#endif
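HDM_TRANSFORM_4X4_L0 works on pre-packed registers and leans on the fact that the V, H and DC candidate predictions have sparse Hadamard spectra, so only a few transformed coefficients need correcting. Conceptually the quantity it accumulates is the plain 4x4 SATD of source minus prediction; the sketch below shows that reference computation with illustrative names, not a bit-exact port (the NEON path also applies its own scaling shifts):

    #include <stdint.h>
    #include <stdlib.h>
    /* Reference 4x4 SATD: Hadamard-transform the source/prediction difference
     * and sum the absolute coefficients. */
    static int32_t Satd4x4_ref (const uint8_t* pSrc, int32_t iSrcStride,
                                const uint8_t* pPred, int32_t iPredStride) {
      int32_t iDiff[4][4], iTmp[4][4], iSatd = 0;
      for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
          iDiff[y][x] = pSrc[y * iSrcStride + x] - pPred[y * iPredStride + x];
      for (int y = 0; y < 4; y++) {                    /* horizontal butterfly */
        int32_t s0 = iDiff[y][0] + iDiff[y][3], s1 = iDiff[y][1] + iDiff[y][2];
        int32_t d1 = iDiff[y][1] - iDiff[y][2], d0 = iDiff[y][0] - iDiff[y][3];
        iTmp[y][0] = s0 + s1; iTmp[y][1] = d0 + d1;
        iTmp[y][2] = s0 - s1; iTmp[y][3] = d0 - d1;
      }
      for (int x = 0; x < 4; x++) {                    /* vertical butterfly */
        int32_t s0 = iTmp[0][x] + iTmp[3][x], s1 = iTmp[1][x] + iTmp[2][x];
        int32_t d1 = iTmp[1][x] - iTmp[2][x], d0 = iTmp[0][x] - iTmp[3][x];
        iSatd += abs (s0 + s1) + abs (d0 + d1) + abs (s0 - s1) + abs (d0 - d1);
      }
      return iSatd;
    }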
@@ -156,63 +156,63 @@
stmdb sp!, {r4-r7, lr}
vpush {q4-q7}
- //Get the top line data to 'q15'(16 bytes)
- sub r7, r0, r1
+ //Get the top line data to 'q15'(16 bytes)
+ sub r7, r0, r1
vld1.8 {q15}, [r7]
- //Get the left colume data to 'q14' (16 bytes)
- sub r7, r0, #1
- GET_8BYTE_DATA_L0 d28, r7, r1
- GET_8BYTE_DATA_L0 d29, r7, r1
+ //Get the left colume data to 'q14' (16 bytes)
+ sub r7, r0, #1
+ GET_8BYTE_DATA_L0 d28, r7, r1
+ GET_8BYTE_DATA_L0 d29, r7, r1
- //Calculate the mean value and save to 'q13->d27(reserve the d26)' (2 bytes)
- //Calculate the 16x16_dc_both mode SATD
- vaddl.u8 q0, d30, d31
- vaddl.u8 q1, d28, d29
- vadd.u16 q0, q1
- vadd.u16 d0, d1
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
+ //Calculate the mean value and save to 'q13->d27(reserve the d26)' (2 bytes)
+ //Calculate the 16x16_dc_both mode SATD
+ vaddl.u8 q0, d30, d31
+ vaddl.u8 q1, d28, d29
+ vadd.u16 q0, q1
+ vadd.u16 d0, d1
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
- //Calculate the mean value
- vrshr.u16 d0, #5
- vshl.u16 d27, d0, #4
+ //Calculate the mean value
+ vrshr.u16 d0, #5
+ vshl.u16 d27, d0, #4
- //Calculate the 16x16_v mode SATD and save to "q11, 12"
- vshll.u8 q0, d30, #2
- vshll.u8 q1, d31, #2
- vtrn.32 q0, q1
- vadd.s16 q2, q0, q1
- vsub.s16 q1, q0, q1
- vtrn.16 q2, q1
- vadd.s16 q12, q2, q1
- vsub.s16 q11, q2, q1
- vtrn.32 q12, q11 //{0,1,3,2, 4,5,7,6} q12
- //{8,9,11,10, 12,13,15,14} q11
+ //Calculate the 16x16_v mode SATD and save to "q11, 12"
+ vshll.u8 q0, d30, #2
+ vshll.u8 q1, d31, #2
+ vtrn.32 q0, q1
+ vadd.s16 q2, q0, q1
+ vsub.s16 q1, q0, q1
+ vtrn.16 q2, q1
+ vadd.s16 q12, q2, q1
+ vsub.s16 q11, q2, q1
+ vtrn.32 q12, q11 //{0,1,3,2, 4,5,7,6} q12
+ //{8,9,11,10, 12,13,15,14} q11
//Calculate the 16x16_h mode SATD and save to "q9, q10"
- vshll.u8 q0, d28, #2
- vshll.u8 q1, d29, #2
- vtrn.32 q0, q1
- vadd.s16 q2, q0, q1
- vsub.s16 q1, q0, q1
- vtrn.16 q2, q1
- vadd.s16 q10, q2, q1
- vsub.s16 q9, q2, q1
- vtrn.32 q10, q9 //{0,1,3,2, 4,5,7,6} q10
- //{8,9,11,10, 12,13,15,14} q9
+ vshll.u8 q0, d28, #2
+ vshll.u8 q1, d29, #2
+ vtrn.32 q0, q1
+ vadd.s16 q2, q0, q1
+ vsub.s16 q1, q0, q1
+ vtrn.16 q2, q1
+ vadd.s16 q10, q2, q1
+ vsub.s16 q9, q2, q1
+ vtrn.32 q10, q9 //{0,1,3,2, 4,5,7,6} q10
+ //{8,9,11,10, 12,13,15,14} q9
- vmov.i32 d17, #0//Save the SATD of DC_BOTH
- vmov.i32 d16, #0//Save the SATD of H
- vmov.i32 d15, #0//Save the SATD of V
- vmov.i32 d14, #0//For zero D register
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
- vld1.32 {q3}, [r2], r3
- vld1.32 {q4}, [r2], r3
- vld1.32 {q5}, [r2], r3
- vld1.32 {q6}, [r2], r3
- vtrn.32 q3, q4
- vtrn.32 q5, q6
+ vmov.i32 d17, #0//Save the SATD of DC_BOTH
+ vmov.i32 d16, #0//Save the SATD of H
+ vmov.i32 d15, #0//Save the SATD of V
+ vmov.i32 d14, #0//For zero D register
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ vld1.32 {q3}, [r2], r3
+ vld1.32 {q4}, [r2], r3
+ vld1.32 {q5}, [r2], r3
+ vld1.32 {q6}, [r2], r3
+ vtrn.32 q3, q4
+ vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d20, d27, d15, d16, d17, d14
@@ -219,13 +219,13 @@
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d20, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
- vld1.32 {q3}, [r2], r3
- vld1.32 {q4}, [r2], r3
- vld1.32 {q5}, [r2], r3
- vld1.32 {q6}, [r2], r3
- vtrn.32 q3, q4
- vtrn.32 q5, q6
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ vld1.32 {q3}, [r2], r3
+ vld1.32 {q4}, [r2], r3
+ vld1.32 {q5}, [r2], r3
+ vld1.32 {q6}, [r2], r3
+ vtrn.32 q3, q4
+ vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d21, d27, d15, d16, d17, d14
@@ -232,13 +232,13 @@
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d21, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
- vld1.32 {q3}, [r2], r3
- vld1.32 {q4}, [r2], r3
- vld1.32 {q5}, [r2], r3
- vld1.32 {q6}, [r2], r3
- vtrn.32 q3, q4
- vtrn.32 q5, q6
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ vld1.32 {q3}, [r2], r3
+ vld1.32 {q4}, [r2], r3
+ vld1.32 {q5}, [r2], r3
+ vld1.32 {q6}, [r2], r3
+ vtrn.32 q3, q4
+ vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d18, d27, d15, d16, d17, d14
@@ -245,13 +245,13 @@
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d18, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
- vld1.32 {q3}, [r2], r3
- vld1.32 {q4}, [r2], r3
- vld1.32 {q5}, [r2], r3
- vld1.32 {q6}, [r2], r3
- vtrn.32 q3, q4
- vtrn.32 q5, q6
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ vld1.32 {q3}, [r2], r3
+ vld1.32 {q4}, [r2], r3
+ vld1.32 {q5}, [r2], r3
+ vld1.32 {q6}, [r2], r3
+ vtrn.32 q3, q4
+ vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d19, d27, d15, d16, d17, d14
@@ -258,29 +258,29 @@
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d19, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14
- //Get the data from stack
- ldr r5, [sp, #84] //the addr of Best_mode
- ldr r6, [sp, #88] //the value of i_lambda
+ //Get the data from stack
+ ldr r5, [sp, #84] //the addr of Best_mode
+ ldr r6, [sp, #88] //the value of i_lambda
- //vadd.u16 d24, d25
- vrshr.u16 d15, #1
- vpaddl.u16 d15, d15
- vpaddl.u32 d15, d15
- vmov.u32 r0, d15[0]
+ //vadd.u16 d24, d25
+ vrshr.u16 d15, #1
+ vpaddl.u16 d15, d15
+ vpaddl.u32 d15, d15
+ vmov.u32 r0, d15[0]
- //vadd.u16 d22, d23
- vrshr.u16 d16, #1
- vpaddl.u16 d16, d16
- vpaddl.u32 d16, d16
- vmov.u32 r1, d16[0]
- add r1, r1, r6, lsl #1
+ //vadd.u16 d22, d23
+ vrshr.u16 d16, #1
+ vpaddl.u16 d16, d16
+ vpaddl.u32 d16, d16
+ vmov.u32 r1, d16[0]
+ add r1, r1, r6, lsl #1
- //vadd.u16 d20, d21
- vrshr.u16 d17, #1
- vpaddl.u16 d17, d17
- vpaddl.u32 d17, d17
- vmov.u32 r2, d17[0]
- add r2, r2, r6, lsl #1
+ //vadd.u16 d20, d21
+ vrshr.u16 d17, #1
+ vpaddl.u16 d17, d17
+ vpaddl.u32 d17, d17
+ vmov.u32 r2, d17[0]
+ add r2, r2, r6, lsl #1
mov r4, #0
cmp r1, r0
@@ -300,77 +300,77 @@
WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Sad_neon
stmdb sp!, {r4-r7, lr}
- //Get the top line data to 'q15'(16 bytes)
- sub r4, r0, r1
+ //Get the top line data to 'q15'(16 bytes)
+ sub r4, r0, r1
vld1.8 {q15}, [r4]
- //Get the left colume data to 'q14' (16 bytes)
- sub r4, r0, #1
- GET_8BYTE_DATA_L0 d28, r4, r1
- GET_8BYTE_DATA_L0 d29, r4, r1
+ //Get the left colume data to 'q14' (16 bytes)
+ sub r4, r0, #1
+ GET_8BYTE_DATA_L0 d28, r4, r1
+ GET_8BYTE_DATA_L0 d29, r4, r1
- //Calculate the mean value and save to 'q13' (8 bytes)
- //Calculate the 16x16_dc_both mode SATD
- vaddl.u8 q0, d30, d31
- vaddl.u8 q1, d28, d29
- vadd.u16 q0, q1
- vadd.u16 d0, d1
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
+ //Calculate the mean value and save to 'q13' (8 bytes)
+ //Calculate the 16x16_dc_both mode SATD
+ vaddl.u8 q0, d30, d31
+ vaddl.u8 q1, d28, d29
+ vadd.u16 q0, q1
+ vadd.u16 d0, d1
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
- //Calculate the mean value
- vrshr.u16 d0, d0, #5
- vdup.8 q13, d0[0]
+ //Calculate the mean value
+ vrshr.u16 d0, d0, #5
+ vdup.8 q13, d0[0]
- sub r4, r0, #1
+ sub r4, r0, #1
- vmov.i32 q12, #0//Save the SATD of DC_BOTH
- vmov.i32 q11, #0//Save the SATD of H
- vmov.i32 q10, #0//Save the SATD of V
+ vmov.i32 q12, #0//Save the SATD of DC_BOTH
+ vmov.i32 q11, #0//Save the SATD of H
+ vmov.i32 q10, #0//Save the SATD of V
- mov lr, #16
+ mov lr, #16
sad_intra_16x16_x3_opt_loop0:
//Get the left colume data to 'd0' (16 bytes)
- vld1.8 {d0[]}, [r4], r1
+ vld1.8 {d0[]}, [r4], r1
- //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
- vld1.8 {q1}, [r2], r3
+ //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+ vld1.8 {q1}, [r2], r3
- subs lr, #1
- //Do the SAD for top colume
- vabal.u8 q12, d30, d2
- vabal.u8 q12, d31, d3
+ subs lr, #1
+ //Do the SAD for top colume
+ vabal.u8 q12, d30, d2
+ vabal.u8 q12, d31, d3
- //Do the SAD for left colume
- vabal.u8 q11, d0, d2
- vabal.u8 q11, d0, d3
+ //Do the SAD for left colume
+ vabal.u8 q11, d0, d2
+ vabal.u8 q11, d0, d3
- //Do the SAD for mean value
- vabal.u8 q10, d26, d2
- vabal.u8 q10, d26, d3
+ //Do the SAD for mean value
+ vabal.u8 q10, d26, d2
+ vabal.u8 q10, d26, d3
- bne sad_intra_16x16_x3_opt_loop0
+ bne sad_intra_16x16_x3_opt_loop0
- //Get the data from stack
- ldr r5, [sp, #20] //the addr of Best_mode
- ldr r6, [sp, #24] //the value of i_lambda
+ //Get the data from stack
+ ldr r5, [sp, #20] //the addr of Best_mode
+ ldr r6, [sp, #24] //the value of i_lambda
- vadd.u16 d24, d25
- vpaddl.u16 d24, d24
- vpaddl.u32 d24, d24
- vmov.u32 r0, d24[0]
+ vadd.u16 d24, d25
+ vpaddl.u16 d24, d24
+ vpaddl.u32 d24, d24
+ vmov.u32 r0, d24[0]
- vadd.u16 d22, d23
- vpaddl.u16 d22, d22
- vpaddl.u32 d22, d22
- vmov.u32 r1, d22[0]
- add r1, r1, r6, lsl #1
+ vadd.u16 d22, d23
+ vpaddl.u16 d22, d22
+ vpaddl.u32 d22, d22
+ vmov.u32 r1, d22[0]
+ add r1, r1, r6, lsl #1
- vadd.u16 d20, d21
- vpaddl.u16 d20, d20
- vpaddl.u32 d20, d20
- vmov.u32 r2, d20[0]
- add r2, r2, r6, lsl #1
+ vadd.u16 d20, d21
+ vpaddl.u16 d20, d20
+ vpaddl.u32 d20, d20
+ vmov.u32 r2, d20[0]
+ add r2, r2, r6, lsl #1
mov r4, #0
cmp r1, r0
@@ -382,7 +382,7 @@
str r4, [r5]
- ldmia sp!, {r4-r7, lr}
+ ldmia sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
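After the accumulators are reduced, the x3 routines add a 2 * i_lambda penalty to the horizontal and DC costs (add rX, rX, r6, lsl #1) and then run the compare/branch chain, only part of which falls inside these hunks. A sketch of that selection, with illustrative names and an assumed 0/1/2 mode numbering rather than the enum from the C headers:

    #include <stdint.h>
    /* Sketch of the final mode selection: the vertical cost carries no bit-cost
     * penalty, horizontal and DC get 2*iLambda added, the index of the cheapest
     * candidate is written to *pBestMode and its cost is returned.
     * The 0=V / 1=H / 2=DC numbering here is an assumption for illustration. */
    static int32_t PickIntraMode_ref (int32_t iSadV, int32_t iSadH, int32_t iSadDc,
                                      int32_t iLambda, int32_t* pBestMode) {
      int32_t iCostV  = iSadV;
      int32_t iCostH  = iSadH  + (iLambda << 1);
      int32_t iCostDc = iSadDc + (iLambda << 1);
      int32_t iBest = 0, iMin = iCostV;
      if (iCostH  < iMin) { iMin = iCostH;  iBest = 1; }
      if (iCostDc < iMin) { iMin = iCostDc; iBest = 2; }
      *pBestMode = iBest;
      return iMin;
    }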
@@ -389,24 +389,24 @@
WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Sad_neon
stmdb sp!, {r4-r7, lr}
- //Get the data from stack
- ldr r4, [sp, #32] //p_dec_cr
- ldr r5, [sp, #36] //p_enc_cr
+ //Get the data from stack
+ ldr r4, [sp, #32] //p_dec_cr
+ ldr r5, [sp, #36] //p_enc_cr
- //Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
- sub r6, r0, #1
- GET_8BYTE_DATA_L0 d28, r6, r1
- sub r6, r4, #1
- GET_8BYTE_DATA_L0 d30, r6, r1
+ //Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
+ sub r6, r0, #1
+ GET_8BYTE_DATA_L0 d28, r6, r1
+ sub r6, r4, #1
+ GET_8BYTE_DATA_L0 d30, r6, r1
- //Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
- sub r6, r0, r1
+ //Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
+ sub r6, r0, r1
vld1.8 {d29}, [r6]
- sub r6, r4, r1
+ sub r6, r4, r1
vld1.8 {d31}, [r6]
- //Calculate the sum of left column and top row
- vmov.i32 q0, q14
+ //Calculate the sum of left column and top row
+ vmov.i32 q0, q14
vpaddl.u8 q0, q0
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //'m1' save to d2
@@ -416,13 +416,13 @@
//duplicate the 'mx' to a vector line
vdup.8 d27, d2[0]
vdup.8 d26, d1[4]
- vtrn.32 d27, d26
+ vtrn.32 d27, d26
vdup.8 d26, d0[4]
vdup.8 d25, d2[4]
vtrn.32 d26, d25 //Save to "d27, d26"
- vmov.i32 q0, q15
+ vmov.i32 q0, q15
vpaddl.u8 q0, q0
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //'m1' save to d2
@@ -432,94 +432,94 @@
//duplicate the 'mx' to a vector line
vdup.8 d25, d2[0]
vdup.8 d24, d1[4]
- vtrn.32 d25, d24
+ vtrn.32 d25, d24
vdup.8 d24, d0[4]
vdup.8 d23, d2[4]
- vtrn.32 d24, d23 //Save to "d25, d24"
+ vtrn.32 d24, d23 //Save to "d25, d24"
- vmov.i32 q11, #0//Save the SATD of DC_BOTH
- vmov.i32 q10, #0//Save the SATD of H
- vmov.i32 q9 , #0//Save the SATD of V
- sub r6, r0, #1
- sub r7, r4, #1
- mov lr, #4
+ vmov.i32 q11, #0//Save the SATD of DC_BOTH
+ vmov.i32 q10, #0//Save the SATD of H
+ vmov.i32 q9 , #0//Save the SATD of V
+ sub r6, r0, #1
+ sub r7, r4, #1
+ mov lr, #4
sad_intra_8x8_x3_opt_loop0:
- //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
- vld1.8 {d0}, [r2], r3
- vld1.8 {d1}, [r5], r3
+ //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+ vld1.8 {d0}, [r2], r3
+ vld1.8 {d1}, [r5], r3
//Get the left column data to 'd0' (16 bytes)
- vld1.8 {d2[]}, [r6], r1
- vld1.8 {d3[]}, [r7], r1
+ vld1.8 {d2[]}, [r6], r1
+ vld1.8 {d3[]}, [r7], r1
- subs lr, #1
+ subs lr, #1
-    //Do the SAD for top column
- vabal.u8 q11, d29, d0
- vabal.u8 q11, d31, d1
+    //Do the SAD for top column
+ vabal.u8 q11, d29, d0
+ vabal.u8 q11, d31, d1
-    //Do the SAD for left column
- vabal.u8 q10, d2, d0
- vabal.u8 q10, d3, d1
+    //Do the SAD for left column
+ vabal.u8 q10, d2, d0
+ vabal.u8 q10, d3, d1
- //Do the SAD for mean value
- vabal.u8 q9, d27, d0
- vabal.u8 q9, d25, d1
+ //Do the SAD for mean value
+ vabal.u8 q9, d27, d0
+ vabal.u8 q9, d25, d1
- bne sad_intra_8x8_x3_opt_loop0
+ bne sad_intra_8x8_x3_opt_loop0
- mov lr, #4
+ mov lr, #4
sad_intra_8x8_x3_opt_loop1:
- //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
- vld1.8 {d0}, [r2], r3
- vld1.8 {d1}, [r5], r3
+ //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+ vld1.8 {d0}, [r2], r3
+ vld1.8 {d1}, [r5], r3
//Get the left column data to 'd0' (16 bytes)
- vld1.8 {d2[]}, [r6], r1
- vld1.8 {d3[]}, [r7], r1
+ vld1.8 {d2[]}, [r6], r1
+ vld1.8 {d3[]}, [r7], r1
- subs lr, #1
+ subs lr, #1
-    //Do the SAD for top column
- vabal.u8 q11, d29, d0
- vabal.u8 q11, d31, d1
+    //Do the SAD for top column
+ vabal.u8 q11, d29, d0
+ vabal.u8 q11, d31, d1
-    //Do the SAD for left column
- vabal.u8 q10, d2, d0
- vabal.u8 q10, d3, d1
+    //Do the SAD for left column
+ vabal.u8 q10, d2, d0
+ vabal.u8 q10, d3, d1
- //Do the SAD for mean value
- vabal.u8 q9, d26, d0
- vabal.u8 q9, d24, d1
+ //Do the SAD for mean value
+ vabal.u8 q9, d26, d0
+ vabal.u8 q9, d24, d1
- bne sad_intra_8x8_x3_opt_loop1
- //Get the data from stack
- ldr r5, [sp, #20] //the addr of Best_mode
- ldr r6, [sp, #24] //the value of i_lambda
+ bne sad_intra_8x8_x3_opt_loop1
+ //Get the data from stack
+ ldr r5, [sp, #20] //the addr of Best_mode
+ ldr r6, [sp, #24] //the value of i_lambda
- vadd.u16 d22, d23
- vpaddl.u16 d22, d22
- vpaddl.u32 d22, d22
- vmov.u32 r0, d22[0]
- add r0, r0, r6, lsl #1
+ vadd.u16 d22, d23
+ vpaddl.u16 d22, d22
+ vpaddl.u32 d22, d22
+ vmov.u32 r0, d22[0]
+ add r0, r0, r6, lsl #1
- vadd.u16 d20, d21
- vpaddl.u16 d20, d20
- vpaddl.u32 d20, d20
- vmov.u32 r1, d20[0]
- add r1, r1, r6, lsl #1
+ vadd.u16 d20, d21
+ vpaddl.u16 d20, d20
+ vpaddl.u32 d20, d20
+ vmov.u32 r1, d20[0]
+ add r1, r1, r6, lsl #1
- vadd.u16 d18, d19
- vpaddl.u16 d18, d18
- vpaddl.u32 d18, d18
- vmov.u32 r2, d18[0]
+ vadd.u16 d18, d19
+ vpaddl.u16 d18, d18
+ vpaddl.u32 d18, d18
+ vmov.u32 r2, d18[0]
mov r4, #2
cmp r1, r0
@@ -531,7 +531,7 @@
str r4, [r5]
- ldmia sp!, {r4-r7, lr}
+ ldmia sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
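The vpaddl/vrshr block near the top of this routine builds the four per-quadrant DC predictors of an 8x8 chroma block from its left column and top row, following the usual H.264 chroma DC rule. A plain C sketch of that arithmetic (the helper name is illustrative and the register mapping is omitted):

#include <stdint.h>

/* dc[0] = top-left quadrant, dc[1] = top-right, dc[2] = bottom-left,
 * dc[3] = bottom-right; vrshr #3 / #2 supply the rounding offsets. */
static void Chroma8x8DcMeansSketch (const uint8_t left[8], const uint8_t top[8],
                                    uint8_t dc[4]) {
  uint32_t l0 = left[0] + left[1] + left[2] + left[3];
  uint32_t l1 = left[4] + left[5] + left[6] + left[7];
  uint32_t t0 = top[0]  + top[1]  + top[2]  + top[3];
  uint32_t t1 = top[4]  + top[5]  + top[6]  + top[7];
  dc[0] = (uint8_t)((l0 + t0 + 4) >> 3);  /* both neighbours available */
  dc[1] = (uint8_t)((t1 + 2) >> 2);       /* top row only              */
  dc[2] = (uint8_t)((l1 + 2) >> 2);       /* left column only          */
  dc[3] = (uint8_t)((l1 + t1 + 4) >> 3);  /* both neighbours available */
}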
@@ -539,47 +539,47 @@
stmdb sp!, {r4-r7, lr}
vpush {q4-q7}
- //Get the data from stack
- ldr r4, [sp, #96] //p_dec_cr
- ldr r5, [sp, #100] //p_enc_cr
+ //Get the data from stack
+ ldr r4, [sp, #96] //p_dec_cr
+ ldr r5, [sp, #100] //p_enc_cr
- //Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
- sub r6, r0, r1
+ //Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
+ sub r6, r0, r1
vld1.8 {d29}, [r6]
- sub r6, r4, r1
+ sub r6, r4, r1
vld1.8 {d31}, [r6]
-    //Get the left column data to 'd28(cb), d30(cr)' (16 bytes)
- sub r6, r0, #1
- GET_8BYTE_DATA_L0 d28, r6, r1
- sub r6, r4, #1
- GET_8BYTE_DATA_L0 d30, r6, r1
+    //Get the left column data to 'd28(cb), d30(cr)' (16 bytes)
+ sub r6, r0, #1
+ GET_8BYTE_DATA_L0 d28, r6, r1
+ sub r6, r4, #1
+ GET_8BYTE_DATA_L0 d30, r6, r1
- //Calculate the 16x16_v mode SATD and save to "q12, 13"
- vshll.u8 q0, d29, #2
- vshll.u8 q1, d31, #2
- vtrn.32 q0, q1
- vadd.s16 q2, q0, q1
- vsub.s16 q1, q0, q1
- vtrn.16 q2, q1
- vadd.s16 q13, q2, q1
- vsub.s16 q12, q2, q1
- vtrn.32 q13, q12 //{0,1,3,2, 4,5,7,6} q13
- //{8,9,11,10, 12,13,15,14} q12
+ //Calculate the 16x16_v mode SATD and save to "q12, 13"
+ vshll.u8 q0, d29, #2
+ vshll.u8 q1, d31, #2
+ vtrn.32 q0, q1
+ vadd.s16 q2, q0, q1
+ vsub.s16 q1, q0, q1
+ vtrn.16 q2, q1
+ vadd.s16 q13, q2, q1
+ vsub.s16 q12, q2, q1
+ vtrn.32 q13, q12 //{0,1,3,2, 4,5,7,6} q13
+ //{8,9,11,10, 12,13,15,14} q12
//Calculate the 16x16_h mode SATD and save to "q10, q11"
- vshll.u8 q0, d28, #2
- vshll.u8 q1, d30, #2
- vtrn.32 q0, q1
- vadd.s16 q2, q0, q1
- vsub.s16 q1, q0, q1
- vtrn.16 q2, q1
- vadd.s16 q11, q2, q1
- vsub.s16 q10, q2, q1
- vtrn.32 q11, q10 //{0,1,3,2, 4,5,7,6} q11
- //{8,9,11,10, 12,13,15,14} q10
+ vshll.u8 q0, d28, #2
+ vshll.u8 q1, d30, #2
+ vtrn.32 q0, q1
+ vadd.s16 q2, q0, q1
+ vsub.s16 q1, q0, q1
+ vtrn.16 q2, q1
+ vadd.s16 q11, q2, q1
+ vsub.s16 q10, q2, q1
+ vtrn.32 q11, q10 //{0,1,3,2, 4,5,7,6} q11
+ //{8,9,11,10, 12,13,15,14} q10
- //Calculate the sum of left column and top row
- //vmov.i32 q0, q14
+ //Calculate the sum of left column and top row
+ //vmov.i32 q0, q14
vpaddl.u8 q0, q14
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1
@@ -588,77 +588,77 @@
vpaddl.u16 q2, q2
vadd.u32 d3, d4, d5
- vtrn.32 q0, q2
- vrshr.u32 q1, #3
- vrshr.u32 q2, #2
- vshll.u32 q9, d4, #4 // {2cb, 2cr} q9
- vshll.u32 q8, d5, #4 // {1cb, 1cr} q8
- vshll.u32 q7, d2, #4 // {0cb, 3cb} q7
- vshll.u32 q6, d3, #4 // {0cr, 3cr} q6
+ vtrn.32 q0, q2
+ vrshr.u32 q1, #3
+ vrshr.u32 q2, #2
+ vshll.u32 q9, d4, #4 // {2cb, 2cr} q9
+ vshll.u32 q8, d5, #4 // {1cb, 1cr} q8
+ vshll.u32 q7, d2, #4 // {0cb, 3cb} q7
+ vshll.u32 q6, d3, #4 // {0cr, 3cr} q6
vmov.i32 d28, #0//Save the SATD of DC_BOTH
- vmov.i32 d10, #0//Save the SATD of H
- vmov.i32 d11, #0//Save the SATD of V
- vmov.i32 d30, #0//For zero D register
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
- vld1.32 {d6}, [r2], r3
- vld1.32 {d7}, [r2], r3
- vld1.32 {d8}, [r2], r3
- vld1.32 {d9}, [r2], r3
- vtrn.32 d6, d7
- vtrn.32 d8, d9
+ vmov.i32 d10, #0//Save the SATD of H
+ vmov.i32 d11, #0//Save the SATD of V
+ vmov.i32 d30, #0//For zero D register
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ vld1.32 {d6}, [r2], r3
+ vld1.32 {d7}, [r2], r3
+ vld1.32 {d8}, [r2], r3
+ vld1.32 {d9}, [r2], r3
+ vtrn.32 d6, d7
+ vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d26, d22, d14, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d27, d22, d16, d11, d10, d28, d30
- vld1.32 {d6}, [r5], r3
- vld1.32 {d7}, [r5], r3
- vld1.32 {d8}, [r5], r3
- vld1.32 {d9}, [r5], r3
- vtrn.32 d6, d7
- vtrn.32 d8, d9
+ vld1.32 {d6}, [r5], r3
+ vld1.32 {d7}, [r5], r3
+ vld1.32 {d8}, [r5], r3
+ vld1.32 {d9}, [r5], r3
+ vtrn.32 d6, d7
+ vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d24, d20, d12, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
- vld1.32 {d6}, [r2], r3
- vld1.32 {d7}, [r2], r3
- vld1.32 {d8}, [r2], r3
- vld1.32 {d9}, [r2], r3
- vtrn.32 d6, d7
- vtrn.32 d8, d9
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ vld1.32 {d6}, [r2], r3
+ vld1.32 {d7}, [r2], r3
+ vld1.32 {d8}, [r2], r3
+ vld1.32 {d9}, [r2], r3
+ vtrn.32 d6, d7
+ vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d27, d23, d15, d11, d10, d28, d30
- vld1.32 {d6}, [r5], r3
- vld1.32 {d7}, [r5], r3
- vld1.32 {d8}, [r5], r3
- vld1.32 {d9}, [r5], r3
- vtrn.32 d6, d7
- vtrn.32 d8, d9
+ vld1.32 {d6}, [r5], r3
+ vld1.32 {d7}, [r5], r3
+ vld1.32 {d8}, [r5], r3
+ vld1.32 {d9}, [r5], r3
+ vtrn.32 d6, d7
+ vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d24, d21, d19, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30
- //Get the data from stack
- ldr r5, [sp, #84] //the addr of Best_mode
- ldr r6, [sp, #88] //the value of i_lambda
+ //Get the data from stack
+ ldr r5, [sp, #84] //the addr of Best_mode
+ ldr r6, [sp, #88] //the value of i_lambda
- vrshr.u16 d11, #1
- vpaddl.u16 d11, d11
- vpaddl.u32 d11, d11
- vmov.u32 lr, d11[0]
- add lr, lr, r6, lsl #1
+ vrshr.u16 d11, #1
+ vpaddl.u16 d11, d11
+ vpaddl.u32 d11, d11
+ vmov.u32 lr, d11[0]
+ add lr, lr, r6, lsl #1
- vrshr.u16 d10, #1
- vpaddl.u16 d10, d10
- vpaddl.u32 d10, d10
- vmov.u32 r3, d10[0]
- add r3, r3, r6, lsl #1
+ vrshr.u16 d10, #1
+ vpaddl.u16 d10, d10
+ vpaddl.u32 d10, d10
+ vmov.u32 r3, d10[0]
+ add r3, r3, r6, lsl #1
- vrshr.u16 d28, #1
- vpaddl.u16 d28, d28
- vpaddl.u32 d28, d28
- vmov.u32 r2, d28[0]
+ vrshr.u16 d28, #1
+ vpaddl.u16 d28, d28
+ vpaddl.u32 d28, d28
+ vmov.u32 r2, d28[0]
mov r6, #2
cmp r3, lr
@@ -671,8 +671,8 @@
str r6, [r5]
mov r0, lr
- vpop {q4-q7}
- ldmia sp!, {r4-r7, lr}
+ vpop {q4-q7}
+ ldmia sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
@@ -680,118 +680,118 @@
stmdb sp!, {r4-r7, lr}
//Get the top line data to 'd31[0~3]'(4 bytes)
- sub r7, r0, r1
+ sub r7, r0, r1
vld1.32 {d31[0]}, [r7]
-    //Get the left column data to 'd31[4~7]' (4 bytes)
- sub r7, r0, #1
+    //Get the left column data to 'd31[4~7]' (4 bytes)
+ sub r7, r0, #1
vld1.8 {d31[4]}, [r7], r1
vld1.8 {d31[5]}, [r7], r1
vld1.8 {d31[6]}, [r7], r1
vld1.8 {d31[7]}, [r7], r1
- //Calculate the mean value and save to 'd30' (2 bytes)
- vpaddl.u8 d0, d31
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
- //Calculate the mean value
- vrshr.u16 d0, #3
- vshl.u16 d30, d0, #4
+ //Calculate the mean value and save to 'd30' (2 bytes)
+ vpaddl.u8 d0, d31
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ //Calculate the mean value
+ vrshr.u16 d0, #3
+ vshl.u16 d30, d0, #4
- //Calculate the 16x16_v mode SATD and save to "d29"
+ //Calculate the 16x16_v mode SATD and save to "d29"
//Calculate the 16x16_h mode SATD and save to "d28"
- vshll.u8 q0, d31, #2
- vtrn.32 d0, d1
- vadd.s16 d2, d0, d1
- vsub.s16 d1, d0, d1
- vtrn.16 d2, d1
- vadd.s16 d29, d2, d1
- vsub.s16 d28, d2, d1
- vtrn.32 d29, d28 //{0,1,3,2 top} d29
- //{0,1,3,2 left} d28
+ vshll.u8 q0, d31, #2
+ vtrn.32 d0, d1
+ vadd.s16 d2, d0, d1
+ vsub.s16 d1, d0, d1
+ vtrn.16 d2, d1
+ vadd.s16 d29, d2, d1
+ vsub.s16 d28, d2, d1
+ vtrn.32 d29, d28 //{0,1,3,2 top} d29
+ //{0,1,3,2 left} d28
vmov.i32 d27, #0//Save the SATD of DC_BOTH
- vmov.i32 d26, #0//Save the SATD of H
- vmov.i32 d25, #0//Save the SATD of V
- vmov.i32 d24, #0//For zero D register
+ vmov.i32 d26, #0//Save the SATD of H
+ vmov.i32 d25, #0//Save the SATD of V
+ vmov.i32 d24, #0//For zero D register
- //Load the p_enc data and save to "d22,d23"--- 4X4 bytes
- vld1.32 {d23[0]}, [r2], r3
- vld1.32 {d23[1]}, [r2], r3
- vld1.32 {d22[0]}, [r2], r3
- vld1.32 {d22[1]}, [r2], r3
+ //Load the p_enc data and save to "d22,d23"--- 4X4 bytes
+ vld1.32 {d23[0]}, [r2], r3
+ vld1.32 {d23[1]}, [r2], r3
+ vld1.32 {d22[0]}, [r2], r3
+ vld1.32 {d22[1]}, [r2], r3
HDM_TRANSFORM_4X4_L0 d23, d22, d29, d28, d30, d25, d26, d27, d24
- //Get the data from stack
- ldr r5, [sp, #28] //the value of lambda2
- ldr r6, [sp, #32] //the value of lambda1
- ldr r7, [sp, #36] //the value of lambda0
+ //Get the data from stack
+ ldr r5, [sp, #28] //the value of lambda2
+ ldr r6, [sp, #32] //the value of lambda1
+ ldr r7, [sp, #36] //the value of lambda0
- vrshr.u16 d25, #1
- vpaddl.u16 d25, d25
- vpaddl.u32 d25, d25
- vmov.u32 r0, d25[0]
- add r0, r7
+ vrshr.u16 d25, #1
+ vpaddl.u16 d25, d25
+ vpaddl.u32 d25, d25
+ vmov.u32 r0, d25[0]
+ add r0, r7
- vrshr.u16 d26, #1
- vpaddl.u16 d26, d26
- vpaddl.u32 d26, d26
- vmov.u32 r1, d26[0]
- add r1, r6
+ vrshr.u16 d26, #1
+ vpaddl.u16 d26, d26
+ vpaddl.u32 d26, d26
+ vmov.u32 r1, d26[0]
+ add r1, r6
- vrshr.u16 d27, #1
- vpaddl.u16 d27, d27
- vpaddl.u32 d27, d27
- vmov.u32 r2, d27[0]
- add r2, r5
+ vrshr.u16 d27, #1
+ vpaddl.u16 d27, d27
+ vpaddl.u32 d27, d27
+ vmov.u32 r2, d27[0]
+ add r2, r5
- ldr r5, [sp, #20] //p_dst
- ldr r6, [sp, #24] //the addr of Best_mode
+ ldr r5, [sp, #20] //p_dst
+ ldr r6, [sp, #24] //the addr of Best_mode
- mov r4, r0
- cmp r1, r4
- movcc r4, r1
- cmp r2, r4
- movcc r4, r2
+ mov r4, r0
+ cmp r1, r4
+ movcc r4, r1
+ cmp r2, r4
+ movcc r4, r2
-    //The comparison order affects the result
- cmp r4, r2
- bne satd_intra_4x4_x3_opt_jump0
- mov r0, #2
- str r0, [r6]
- vshr.u32 d0, d30, #4 // {2cb, 2cr} q9
- vdup.8 q1, d0[0]
- vst1.8 {q1}, [r5]
- //...
- bl satd_intra_4x4_x3_opt_end
+    //The comparison order affects the result
+ cmp r4, r2
+ bne satd_intra_4x4_x3_opt_jump0
+ mov r0, #2
+ str r0, [r6]
+ vshr.u32 d0, d30, #4 // {2cb, 2cr} q9
+ vdup.8 q1, d0[0]
+ vst1.8 {q1}, [r5]
+ //...
+ bl satd_intra_4x4_x3_opt_end
satd_intra_4x4_x3_opt_jump0:
- cmp r4, r1
- bne satd_intra_4x4_x3_opt_jump1
- mov r0, #1
- str r0, [r6]
- vdup.8 d0, d31[4]
- vdup.8 d1, d31[5]
- vdup.8 d2, d31[6]
- vdup.8 d3, d31[7]
- vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
+ cmp r4, r1
+ bne satd_intra_4x4_x3_opt_jump1
+ mov r0, #1
+ str r0, [r6]
+ vdup.8 d0, d31[4]
+ vdup.8 d1, d31[5]
+ vdup.8 d2, d31[6]
+ vdup.8 d3, d31[7]
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
- bl satd_intra_4x4_x3_opt_end
+ bl satd_intra_4x4_x3_opt_end
satd_intra_4x4_x3_opt_jump1:
- mov r0, #0
- str r0, [r6]
- vst1.32 {d31[0]}, [r5]!
- vst1.32 {d31[0]}, [r5]!
- vst1.32 {d31[0]}, [r5]!
- vst1.32 {d31[0]}, [r5]!
+ mov r0, #0
+ str r0, [r6]
+ vst1.32 {d31[0]}, [r5]!
+ vst1.32 {d31[0]}, [r5]!
+ vst1.32 {d31[0]}, [r5]!
+ vst1.32 {d31[0]}, [r5]!
satd_intra_4x4_x3_opt_end:
- mov r0, r4
+ mov r0, r4
- ldmia sp!, {r4-r7, lr}
+ ldmia sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
#endif
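At its tail, the 4x4 routine above not only returns the best cost but also materialises the winning predictor into p_dst: the DC branch fills the block with the rounded mean, the horizontal branch replicates each left sample across its row, and the vertical branch copies the top four bytes into every row. A scalar sketch of that write-out (layout and names are illustrative; the real code uses NEON stores and packed destinations):

#include <stdint.h>

/* mode 0 = vertical, 1 = horizontal, 2 = DC, matching the constants the
 * assembly stores through Best_mode above (sketch only). */
static void WriteIntra4x4PredSketch (uint8_t dst[16], const uint8_t top[4],
                                     const uint8_t left[4], uint8_t mean, int mode) {
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) {
      uint8_t v;
      if (mode == 0)      v = top[c];   /* copy the top row          */
      else if (mode == 1) v = left[r];  /* replicate the left sample */
      else                v = mean;     /* fill with the mean        */
      dst[4 * r + c] = v;
    }
  }
}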
--- a/codec/encoder/core/arm/pixel_neon.S
+++ b/codec/encoder/core/arm/pixel_neon.S
@@ -66,10 +66,10 @@
vsub.s16 q3, q12, q13
vadd.s16 q8, q10, q11
- vsub.s16 q9, q10, q11
+ vsub.s16 q9, q10, q11
vadd.s16 q10, q14, q15
- vsub.s16 q11, q14, q15
+ vsub.s16 q11, q14, q15
vadd.s16 q12, q0, q2
vsub.s16 q14, q0, q2
@@ -372,28 +372,28 @@
WELS_ASM_FUNC_BEGIN WelsSampleSad4x4_neon
stmdb sp!, {r4-r5, lr}
- //Loading a horizontal line data (4 bytes)
- //line 0
- ldr r4, [r0], r1
- ldr r5, [r2], r3
- usad8 lr, r4, r5
+ //Loading a horizontal line data (4 bytes)
+ //line 0
+ ldr r4, [r0], r1
+ ldr r5, [r2], r3
+ usad8 lr, r4, r5
//line 1
- ldr r4, [r0], r1
- ldr r5, [r2], r3
- usada8 lr, r4, r5, lr
+ ldr r4, [r0], r1
+ ldr r5, [r2], r3
+ usada8 lr, r4, r5, lr
//line 2
- ldr r4, [r0], r1
- ldr r5, [r2], r3
- usada8 lr, r4, r5, lr
+ ldr r4, [r0], r1
+ ldr r5, [r2], r3
+ usada8 lr, r4, r5, lr
- //line 3
- ldr r4, [r0]
- ldr r5, [r2]
- usada8 r0, r4, r5, lr
+ //line 3
+ ldr r4, [r0]
+ ldr r5, [r2]
+ usada8 r0, r4, r5, lr
- ldmia sp!, {r4-r5, lr}
+ ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
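WelsSampleSad4x4_neon above actually stays on the integer core: each ldr picks up four pixels packed in a word, and usad8/usada8 accumulate their absolute differences. The equivalent scalar reference, offered as a sketch rather than the project's own C fallback:

#include <stdint.h>

static int32_t Sad4x4Sketch (const uint8_t* pix1, int32_t stride1,
                             const uint8_t* pix2, int32_t stride2) {
  int32_t sad = 0;
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 4; ++x) {
      int32_t d = (int32_t) pix1[x] - (int32_t) pix2[x];
      sad += d < 0 ? -d : d;   /* what usad8/usada8 do for 4 packed bytes */
    }
    pix1 += stride1;
    pix2 += stride2;
  }
  return sad;
}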
@@ -401,76 +401,76 @@
stmdb sp!, {r4-r5, lr}
- //Generate the pix2 start addr
- sub r4, r2, #1
- add r5, r2, #1
- sub r2, r3
+ //Generate the pix2 start addr
+ sub r4, r2, #1
+ add r5, r2, #1
+ sub r2, r3
//Loading a horizontal line data (16 bytes)
- vld1.8 {q0}, [r0], r1 //save pix1
+ vld1.8 {q0}, [r0], r1 //save pix1
- vld1.8 {q1}, [r2], r3 //save pix2 - stride
- vld1.8 {q10}, [r2], r3 //save pix2
- vld1.8 {q2}, [r2], r3 //save pix2 + stride
+ vld1.8 {q1}, [r2], r3 //save pix2 - stride
+ vld1.8 {q10}, [r2], r3 //save pix2
+ vld1.8 {q2}, [r2], r3 //save pix2 + stride
- vld1.8 {q3}, [r4], r3 //save pix2 - 1
- vld1.8 {q8}, [r5], r3 //save pix2 + 1
+ vld1.8 {q3}, [r4], r3 //save pix2 - 1
+ vld1.8 {q8}, [r5], r3 //save pix2 + 1
- //Do the SAD for 16 bytes
- vabdl.u8 q15, d0, d2
- vabal.u8 q15, d1, d3
+ //Do the SAD for 16 bytes
+ vabdl.u8 q15, d0, d2
+ vabal.u8 q15, d1, d3
- vabdl.u8 q13, d0, d4
- vabal.u8 q13, d1, d5
+ vabdl.u8 q13, d0, d4
+ vabal.u8 q13, d1, d5
- vabdl.u8 q11, d0, d6
- vabal.u8 q11, d1, d7
+ vabdl.u8 q11, d0, d6
+ vabal.u8 q11, d1, d7
- vabdl.u8 q9, d0, d16
- vabal.u8 q9, d1, d17
+ vabdl.u8 q9, d0, d16
+ vabal.u8 q9, d1, d17
- mov lr, #15
+ mov lr, #15
pixel_sad_4_16x16_loop_0:
//Loading a horizontal line data (16 bytes)
- vld1.8 {q0}, [r0], r1 //save pix1
- vmov.8 q1, q10 //save pix2 - stride
- vmov.8 q10, q2
- vabal.u8 q15, d0, d2
- vld1.8 {q2}, [r2], r3 //save pix2 + stride
- vabal.u8 q15, d1, d3
- vld1.8 {q3}, [r4], r3 //save pix2 - 1
- vabal.u8 q13, d0, d4
- vld1.8 {q8}, [r5], r3 //save pix2 + 1
+ vld1.8 {q0}, [r0], r1 //save pix1
+ vmov.8 q1, q10 //save pix2 - stride
+ vmov.8 q10, q2
+ vabal.u8 q15, d0, d2
+ vld1.8 {q2}, [r2], r3 //save pix2 + stride
+ vabal.u8 q15, d1, d3
+ vld1.8 {q3}, [r4], r3 //save pix2 - 1
+ vabal.u8 q13, d0, d4
+ vld1.8 {q8}, [r5], r3 //save pix2 + 1
vabal.u8 q13, d1, d5
- subs lr, #1
+ subs lr, #1
- vabal.u8 q11, d0, d6
- vabal.u8 q11, d1, d7
+ vabal.u8 q11, d0, d6
+ vabal.u8 q11, d1, d7
- vabal.u8 q9, d0, d16
- vabal.u8 q9, d1, d17
+ vabal.u8 q9, d0, d16
+ vabal.u8 q9, d1, d17
- bne pixel_sad_4_16x16_loop_0
+ bne pixel_sad_4_16x16_loop_0
//Save SAD to 'r0'
- ldr r0, [sp, #12]
+ ldr r0, [sp, #12]
- vadd.u16 d0, d30, d31
- vadd.u16 d1, d26, d27
- vadd.u16 d2, d22, d23
- vadd.u16 d3, d18, d19
+ vadd.u16 d0, d30, d31
+ vadd.u16 d1, d26, d27
+ vadd.u16 d2, d22, d23
+ vadd.u16 d3, d18, d19
- vpaddl.u16 q0, q0
- vpaddl.u16 q1, q1
+ vpaddl.u16 q0, q0
+ vpaddl.u16 q1, q1
- vpaddl.u32 q0, q0
- vpaddl.u32 q1, q1
+ vpaddl.u32 q0, q0
+ vpaddl.u32 q1, q1
- vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
- ldmia sp!, {r4-r5, lr}
+ ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
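The WelsSampleSadFour* family evaluates, in a single pass over pix1, the reference block shifted by one pixel up, down, left and right, and writes the four sums through the result pointer taken from the stack. A hedged scalar sketch of the 16x16 variant (candidate order follows the q15/q13/q11/q9 accumulators above):

#include <stdint.h>
#include <stdlib.h>

static void SadFour16x16Sketch (const uint8_t* pix1, int32_t stride1,
                                const uint8_t* pix2, int32_t stride2,
                                int32_t sad[4]) {
  const uint8_t* cand[4] = {
    pix2 - stride2,   /* shifted up    */
    pix2 + stride2,   /* shifted down  */
    pix2 - 1,         /* shifted left  */
    pix2 + 1          /* shifted right */
  };
  for (int k = 0; k < 4; ++k) {
    const uint8_t* a = pix1;
    const uint8_t* b = cand[k];
    int32_t s = 0;
    for (int y = 0; y < 16; ++y) {
      for (int x = 0; x < 16; ++x)
        s += abs ((int) a[x] - (int) b[x]);
      a += stride1;
      b += stride2;
    }
    sad[k] = s;
  }
}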
@@ -477,75 +477,75 @@
WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x8_neon
stmdb sp!, {r4-r5, lr}
- //Generate the pix2 start addr
- sub r4, r2, #1
- add r5, r2, #1
- sub r2, r3
+ //Generate the pix2 start addr
+ sub r4, r2, #1
+ add r5, r2, #1
+ sub r2, r3
//Loading a horizontal line data (16 bytes)
- vld1.8 {q0}, [r0], r1 //save pix1
+ vld1.8 {q0}, [r0], r1 //save pix1
- vld1.8 {q1}, [r2], r3 //save pix2 - stride
- vld1.8 {q10}, [r2], r3 //save pix2
- vld1.8 {q2}, [r2], r3 //save pix2 + stride
+ vld1.8 {q1}, [r2], r3 //save pix2 - stride
+ vld1.8 {q10}, [r2], r3 //save pix2
+ vld1.8 {q2}, [r2], r3 //save pix2 + stride
- vld1.8 {q3}, [r4], r3 //save pix2 - 1
- vld1.8 {q8}, [r5], r3 //save pix2 + 1
+ vld1.8 {q3}, [r4], r3 //save pix2 - 1
+ vld1.8 {q8}, [r5], r3 //save pix2 + 1
- //Do the SAD for 16 bytes
- vabdl.u8 q15, d0, d2
- vabal.u8 q15, d1, d3
+ //Do the SAD for 16 bytes
+ vabdl.u8 q15, d0, d2
+ vabal.u8 q15, d1, d3
- vabdl.u8 q13, d0, d4
- vabal.u8 q13, d1, d5
+ vabdl.u8 q13, d0, d4
+ vabal.u8 q13, d1, d5
- vabdl.u8 q11, d0, d6
- vabal.u8 q11, d1, d7
+ vabdl.u8 q11, d0, d6
+ vabal.u8 q11, d1, d7
- vabdl.u8 q9, d0, d16
- vabal.u8 q9, d1, d17
+ vabdl.u8 q9, d0, d16
+ vabal.u8 q9, d1, d17
- mov lr, #7
+ mov lr, #7
pixel_sad_4_16x8_loop_0:
//Loading a horizontal line data (16 bytes)
- vld1.8 {q0}, [r0], r1 //save pix1
- vmov.8 q1, q10 //save pix2 - stride
- vmov.8 q10, q2
- vabal.u8 q15, d0, d2
- vld1.8 {q2}, [r2], r3 //save pix2 + stride
- vabal.u8 q15, d1, d3
- vld1.8 {q3}, [r4], r3 //save pix2 - 1
- vabal.u8 q13, d0, d4
- vld1.8 {q8}, [r5], r3 //save pix2 + 1
+ vld1.8 {q0}, [r0], r1 //save pix1
+ vmov.8 q1, q10 //save pix2 - stride
+ vmov.8 q10, q2
+ vabal.u8 q15, d0, d2
+ vld1.8 {q2}, [r2], r3 //save pix2 + stride
+ vabal.u8 q15, d1, d3
+ vld1.8 {q3}, [r4], r3 //save pix2 - 1
+ vabal.u8 q13, d0, d4
+ vld1.8 {q8}, [r5], r3 //save pix2 + 1
vabal.u8 q13, d1, d5
- subs lr, #1
+ subs lr, #1
- vabal.u8 q11, d0, d6
- vabal.u8 q11, d1, d7
+ vabal.u8 q11, d0, d6
+ vabal.u8 q11, d1, d7
- vabal.u8 q9, d0, d16
- vabal.u8 q9, d1, d17
+ vabal.u8 q9, d0, d16
+ vabal.u8 q9, d1, d17
- bne pixel_sad_4_16x8_loop_0
+ bne pixel_sad_4_16x8_loop_0
//Save SAD to 'r0'
- ldr r0, [sp, #12]
+ ldr r0, [sp, #12]
- vadd.u16 d0, d30, d31
- vadd.u16 d1, d26, d27
- vadd.u16 d2, d22, d23
- vadd.u16 d3, d18, d19
+ vadd.u16 d0, d30, d31
+ vadd.u16 d1, d26, d27
+ vadd.u16 d2, d22, d23
+ vadd.u16 d3, d18, d19
- vpaddl.u16 q0, q0
- vpaddl.u16 q1, q1
+ vpaddl.u16 q0, q0
+ vpaddl.u16 q1, q1
- vpaddl.u32 q0, q0
- vpaddl.u32 q1, q1
+ vpaddl.u32 q0, q0
+ vpaddl.u32 q1, q1
- vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
- ldmia sp!, {r4-r5, lr}
+ ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
@@ -552,189 +552,189 @@
WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x16_neon
stmdb sp!, {r4-r5, lr}
- //Generate the pix2 start addr
- sub r4, r2, #1
- add r5, r2, #1
- sub r2, r3
+ //Generate the pix2 start addr
+ sub r4, r2, #1
+ add r5, r2, #1
+ sub r2, r3
//Loading a horizontal line data (8 bytes)
- vld1.8 {d0}, [r0], r1 //save pix1
+ vld1.8 {d0}, [r0], r1 //save pix1
- vld1.8 {d1}, [r2], r3 //save pix2 - stride
- vld1.8 {d6}, [r2], r3 //save pix2
- vld1.8 {d2}, [r2], r3 //save pix2 + stride
+ vld1.8 {d1}, [r2], r3 //save pix2 - stride
+ vld1.8 {d6}, [r2], r3 //save pix2
+ vld1.8 {d2}, [r2], r3 //save pix2 + stride
- vld1.8 {d3}, [r4], r3 //save pix2 - 1
- vld1.8 {d4}, [r5], r3 //save pix2 + 1
+ vld1.8 {d3}, [r4], r3 //save pix2 - 1
+ vld1.8 {d4}, [r5], r3 //save pix2 + 1
- //Do the SAD for 8 bytes
- vabdl.u8 q15, d0, d1
- vabdl.u8 q14, d0, d2
- vabdl.u8 q13, d0, d3
- vabdl.u8 q12, d0, d4
+ //Do the SAD for 8 bytes
+ vabdl.u8 q15, d0, d1
+ vabdl.u8 q14, d0, d2
+ vabdl.u8 q13, d0, d3
+ vabdl.u8 q12, d0, d4
- mov lr, #15
+ mov lr, #15
pixel_sad_4_8x16_loop_0:
//Loading a horizontal line data (8 bytes)
- vld1.8 {d0}, [r0], r1 //save pix1
- vmov.8 d1, d6 //save pix2 - stride
- vmov.8 d6, d2
- vld1.8 {d2}, [r2], r3 //save pix2 + stride
- vld1.8 {d3}, [r4], r3 //save pix2 - 1
- vabal.u8 q15, d0, d1
+ vld1.8 {d0}, [r0], r1 //save pix1
+ vmov.8 d1, d6 //save pix2 - stride
+ vmov.8 d6, d2
+ vld1.8 {d2}, [r2], r3 //save pix2 + stride
+ vld1.8 {d3}, [r4], r3 //save pix2 - 1
+ vabal.u8 q15, d0, d1
- vld1.8 {d4}, [r5], r3 //save pix2 + 1
- //Do the SAD for 8 bytes
- vabal.u8 q14, d0, d2
- vabal.u8 q13, d0, d3
- vabal.u8 q12, d0, d4
+ vld1.8 {d4}, [r5], r3 //save pix2 + 1
+ //Do the SAD for 8 bytes
+ vabal.u8 q14, d0, d2
+ vabal.u8 q13, d0, d3
+ vabal.u8 q12, d0, d4
subs lr, #1
- bne pixel_sad_4_8x16_loop_0
+ bne pixel_sad_4_8x16_loop_0
//Save SAD to 'r0'
- ldr r0, [sp, #12]
+ ldr r0, [sp, #12]
- vadd.u16 d0, d30, d31
- vadd.u16 d1, d28, d29
- vadd.u16 d2, d26, d27
- vadd.u16 d3, d24, d25
+ vadd.u16 d0, d30, d31
+ vadd.u16 d1, d28, d29
+ vadd.u16 d2, d26, d27
+ vadd.u16 d3, d24, d25
- vpaddl.u16 q0, q0
- vpaddl.u16 q1, q1
+ vpaddl.u16 q0, q0
+ vpaddl.u16 q1, q1
- vpaddl.u32 q0, q0
- vpaddl.u32 q1, q1
+ vpaddl.u32 q0, q0
+ vpaddl.u32 q1, q1
- vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
- ldmia sp!, {r4-r5, lr}
+ ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x8_neon
- stmdb sp!, {r4-r5, lr}
+ stmdb sp!, {r4-r5, lr}
- //Generate the pix2 start addr
- sub r4, r2, #1
- add r5, r2, #1
- sub r2, r3
+ //Generate the pix2 start addr
+ sub r4, r2, #1
+ add r5, r2, #1
+ sub r2, r3
//Loading a horizontal line data (8 bytes)
- vld1.8 {d0}, [r0], r1 //save pix1
+ vld1.8 {d0}, [r0], r1 //save pix1
- vld1.8 {d1}, [r2], r3 //save pix2 - stride
- vld1.8 {d6}, [r2], r3 //save pix2
- vld1.8 {d2}, [r2], r3 //save pix2 + stride
+ vld1.8 {d1}, [r2], r3 //save pix2 - stride
+ vld1.8 {d6}, [r2], r3 //save pix2
+ vld1.8 {d2}, [r2], r3 //save pix2 + stride
- vld1.8 {d3}, [r4], r3 //save pix2 - 1
- vld1.8 {d4}, [r5], r3 //save pix2 + 1
+ vld1.8 {d3}, [r4], r3 //save pix2 - 1
+ vld1.8 {d4}, [r5], r3 //save pix2 + 1
- //Do the SAD for 8 bytes
- vabdl.u8 q15, d0, d1
- vabdl.u8 q14, d0, d2
- vabdl.u8 q13, d0, d3
- vabdl.u8 q12, d0, d4
+ //Do the SAD for 8 bytes
+ vabdl.u8 q15, d0, d1
+ vabdl.u8 q14, d0, d2
+ vabdl.u8 q13, d0, d3
+ vabdl.u8 q12, d0, d4
- mov lr, #7
+ mov lr, #7
pixel_sad_4_8x8_loop_0:
//Loading a horizontal line data (8 bytes)
- vld1.8 {d0}, [r0], r1 //save pix1
- vmov.8 d1, d6 //save pix2 - stride
- vmov.8 d6, d2
- vld1.8 {d2}, [r2], r3 //save pix2 + stride
- vld1.8 {d3}, [r4], r3 //save pix2 - 1
- vabal.u8 q15, d0, d1
+ vld1.8 {d0}, [r0], r1 //save pix1
+ vmov.8 d1, d6 //save pix2 - stride
+ vmov.8 d6, d2
+ vld1.8 {d2}, [r2], r3 //save pix2 + stride
+ vld1.8 {d3}, [r4], r3 //save pix2 - 1
+ vabal.u8 q15, d0, d1
- vld1.8 {d4}, [r5], r3 //save pix2 + 1
- //Do the SAD for 8 bytes
- vabal.u8 q14, d0, d2
- vabal.u8 q13, d0, d3
- vabal.u8 q12, d0, d4
+ vld1.8 {d4}, [r5], r3 //save pix2 + 1
+ //Do the SAD for 8 bytes
+ vabal.u8 q14, d0, d2
+ vabal.u8 q13, d0, d3
+ vabal.u8 q12, d0, d4
subs lr, #1
- bne pixel_sad_4_8x8_loop_0
+ bne pixel_sad_4_8x8_loop_0
//Save SAD to 'r0'
- ldr r0, [sp, #12]
+ ldr r0, [sp, #12]
- vadd.u16 d0, d30, d31
- vadd.u16 d1, d28, d29
- vadd.u16 d2, d26, d27
- vadd.u16 d3, d24, d25
+ vadd.u16 d0, d30, d31
+ vadd.u16 d1, d28, d29
+ vadd.u16 d2, d26, d27
+ vadd.u16 d3, d24, d25
- vpaddl.u16 q0, q0
- vpaddl.u16 q1, q1
+ vpaddl.u16 q0, q0
+ vpaddl.u16 q1, q1
- vpaddl.u32 q0, q0
- vpaddl.u32 q1, q1
+ vpaddl.u32 q0, q0
+ vpaddl.u32 q1, q1
- vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
- ldmia sp!, {r4-r5, lr}
+ ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSadFour4x4_neon
- vld1.32 {d0[0]}, [r0], r1
- vld1.32 {d0[1]}, [r0], r1
- vld1.32 {d1[0]}, [r0], r1
- vld1.32 {d1[1]}, [r0]
+ vld1.32 {d0[0]}, [r0], r1
+ vld1.32 {d0[1]}, [r0], r1
+ vld1.32 {d1[0]}, [r0], r1
+ vld1.32 {d1[1]}, [r0]
- sub r0, r2, r3
- vld1.32 {d2[0]}, [r0], r3
- vld1.32 {d2[1]}, [r0], r3
- vld1.32 {d3[0]}, [r0], r3
- vld1.32 {d3[1]}, [r0], r3
- vld1.32 {d4[0]}, [r0], r3
- vld1.32 {d4[1]}, [r0]
+ sub r0, r2, r3
+ vld1.32 {d2[0]}, [r0], r3
+ vld1.32 {d2[1]}, [r0], r3
+ vld1.32 {d3[0]}, [r0], r3
+ vld1.32 {d3[1]}, [r0], r3
+ vld1.32 {d4[0]}, [r0], r3
+ vld1.32 {d4[1]}, [r0]
- sub r0, r2, #1
- vld1.32 {d5[0]}, [r0], r3
- vld1.32 {d5[1]}, [r0], r3
- vld1.32 {d6[0]}, [r0], r3
- vld1.32 {d6[1]}, [r0]
+ sub r0, r2, #1
+ vld1.32 {d5[0]}, [r0], r3
+ vld1.32 {d5[1]}, [r0], r3
+ vld1.32 {d6[0]}, [r0], r3
+ vld1.32 {d6[1]}, [r0]
- add r0, r2, #1
- vld1.32 {d7[0]}, [r0], r3
- vld1.32 {d7[1]}, [r0], r3
- vld1.32 {d8[0]}, [r0], r3
- vld1.32 {d8[1]}, [r0]
+ add r0, r2, #1
+ vld1.32 {d7[0]}, [r0], r3
+ vld1.32 {d7[1]}, [r0], r3
+ vld1.32 {d8[0]}, [r0], r3
+ vld1.32 {d8[1]}, [r0]
- vabdl.u8 q15, d0, d2
- vabdl.u8 q14, d1, d3
+ vabdl.u8 q15, d0, d2
+ vabdl.u8 q14, d1, d3
- vabdl.u8 q13, d0, d3
- vabdl.u8 q12, d1, d4
+ vabdl.u8 q13, d0, d3
+ vabdl.u8 q12, d1, d4
- vabdl.u8 q11, d0, d5
- vabdl.u8 q10, d1, d6
+ vabdl.u8 q11, d0, d5
+ vabdl.u8 q10, d1, d6
- vabdl.u8 q9, d0, d7
- vabdl.u8 q8, d1, d8
+ vabdl.u8 q9, d0, d7
+ vabdl.u8 q8, d1, d8
- //Save SAD to 'r4'
- ldr r0, [sp]
- vadd.u16 q0, q14, q15
- vadd.u16 q1, q12, q13
- vadd.u16 q2, q10, q11
- vadd.u16 q3, q8 , q9
+ //Save SAD to 'r4'
+ ldr r0, [sp]
+ vadd.u16 q0, q14, q15
+ vadd.u16 q1, q12, q13
+ vadd.u16 q2, q10, q11
+ vadd.u16 q3, q8 , q9
- vadd.u16 d0, d1
- vadd.u16 d1, d2, d3
- vadd.u16 d2, d4, d5
- vadd.u16 d3, d6, d7
+ vadd.u16 d0, d1
+ vadd.u16 d1, d2, d3
+ vadd.u16 d2, d4, d5
+ vadd.u16 d3, d6, d7
- vpaddl.u16 q0, q0
- vpaddl.u16 q1, q1
+ vpaddl.u16 q0, q0
+ vpaddl.u16 q1, q1
- vpaddl.u32 q0, q0
- vpaddl.u32 q1, q1
+ vpaddl.u32 q0, q0
+ vpaddl.u32 q1, q1
- vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
WELS_ASM_FUNC_END
@@ -834,16 +834,16 @@
WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon
//Load the pix1 data --- 16 bytes
- vld1.32 {d0[0]}, [r0], r1
- vld1.32 {d0[1]}, [r0], r1
- vld1.32 {d1[0]}, [r0], r1
- vld1.32 {d1[1]}, [r0]
+ vld1.32 {d0[0]}, [r0], r1
+ vld1.32 {d0[1]}, [r0], r1
+ vld1.32 {d1[0]}, [r0], r1
+ vld1.32 {d1[1]}, [r0]
//Load the pix2 data --- 16 bytes
- vld1.32 {d2[0]}, [r2], r3
- vld1.32 {d2[1]}, [r2], r3
- vld1.32 {d3[0]}, [r2], r3
- vld1.32 {d3[1]}, [r2]
+ vld1.32 {d2[0]}, [r2], r3
+ vld1.32 {d2[1]}, [r2], r3
+ vld1.32 {d3[0]}, [r2], r3
+ vld1.32 {d3[1]}, [r2]
//Get the difference
vsubl.u8 q15, d0, d2 //{0,1,2,3,4,5,6,7}
@@ -874,7 +874,7 @@
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
- vmov.u32 r0, d0[0]
+ vmov.u32 r0, d0[0]
WELS_ASM_FUNC_END
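WelsSampleSatd4x4_neon computes the classic 4x4 SATD: form the residual, run a length-4 Hadamard butterfly over rows and columns, then sum the magnitudes and halve; the vpaddl/vmov tail above collapses the vector into r0. A C reference of the idea (the exact final rounding is a sketch, not guaranteed to match the assembly bit for bit):

#include <stdint.h>
#include <stdlib.h>

static int32_t Satd4x4Sketch (const uint8_t* p1, int32_t s1,
                              const uint8_t* p2, int32_t s2) {
  int32_t d[4][4], m[4][4], sum = 0;
  for (int y = 0; y < 4; ++y)
    for (int x = 0; x < 4; ++x)
      d[y][x] = (int32_t) p1[y * s1 + x] - (int32_t) p2[y * s2 + x];
  for (int y = 0; y < 4; ++y) {                       /* horizontal pass */
    int32_t a = d[y][0] + d[y][3], b = d[y][1] + d[y][2];
    int32_t c = d[y][1] - d[y][2], e = d[y][0] - d[y][3];
    m[y][0] = a + b;  m[y][1] = e + c;  m[y][2] = a - b;  m[y][3] = e - c;
  }
  for (int x = 0; x < 4; ++x) {                       /* vertical pass   */
    int32_t a = m[0][x] + m[3][x], b = m[1][x] + m[2][x];
    int32_t c = m[1][x] - m[2][x], e = m[0][x] - m[3][x];
    sum += abs (a + b) + abs (e + c) + abs (a - b) + abs (e - c);
  }
  return sum >> 1;
}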
--- a/codec/encoder/core/arm/reconstruct_neon.S
+++ b/codec/encoder/core/arm/reconstruct_neon.S
@@ -35,592 +35,592 @@
#include "arm_arch_common_macro.S"
#ifdef __APPLE__
-.macro LOAD_4x4_DATA_FOR_DCT
-// { // input: $0~$3, src1*, src1_stride, src2*, src2_stride
- vld2.16 {$0[0],$1[0]}, [$4], $5
- vld2.16 {$2[0],$3[0]}, [$6], $7
- vld2.16 {$0[1],$1[1]}, [$4], $5
- vld2.16 {$2[1],$3[1]}, [$6], $7
+.macro LOAD_4x4_DATA_FOR_DCT
+// { // input: $0~$3, src1*, src1_stride, src2*, src2_stride
+ vld2.16 {$0[0],$1[0]}, [$4], $5
+ vld2.16 {$2[0],$3[0]}, [$6], $7
+ vld2.16 {$0[1],$1[1]}, [$4], $5
+ vld2.16 {$2[1],$3[1]}, [$6], $7
- vld2.16 {$0[2],$1[2]}, [$4], $5
- vld2.16 {$2[2],$3[2]}, [$6], $7
- vld2.16 {$0[3],$1[3]}, [$4], $5
- vld2.16 {$2[3],$3[3]}, [$6], $7
-// }
+ vld2.16 {$0[2],$1[2]}, [$4], $5
+ vld2.16 {$2[2],$3[2]}, [$6], $7
+ vld2.16 {$0[3],$1[3]}, [$4], $5
+ vld2.16 {$2[3],$3[3]}, [$6], $7
+// }
.endm
-.macro LOAD_8x8_DATA_FOR_DCT
-// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
- vld1.64 {$0}, [$8], r2
- vld1.64 {$4}, [$9], r4
- vld1.64 {$1}, [$8], r2
- vld1.64 {$5}, [$9], r4
+.macro LOAD_8x8_DATA_FOR_DCT
+// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
+ vld1.64 {$0}, [$8], r2
+ vld1.64 {$4}, [$9], r4
+ vld1.64 {$1}, [$8], r2
+ vld1.64 {$5}, [$9], r4
- vld1.64 {$2}, [$8], r2
- vld1.64 {$6}, [$9], r4
- vld1.64 {$3}, [$8], r2
- vld1.64 {$7}, [$9], r4
-// }
+ vld1.64 {$2}, [$8], r2
+ vld1.64 {$6}, [$9], r4
+ vld1.64 {$3}, [$8], r2
+ vld1.64 {$7}, [$9], r4
+// }
.endm
-.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
-// { // input: src_d[0]~[3], working: [4]~[7]
- vadd.s16 $4, $0, $3 //int16 s[0] = data[i] + data[i3];
- vsub.s16 $7, $0, $3 //int16 s[3] = data[i] - data[i3];
- vadd.s16 $5, $1, $2 //int16 s[1] = data[i1] + data[i2];
- vsub.s16 $6, $1, $2 //int16 s[2] = data[i1] - data[i2];
+.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
+// { // input: src_d[0]~[3], working: [4]~[7]
+ vadd.s16 $4, $0, $3 //int16 s[0] = data[i] + data[i3];
+ vsub.s16 $7, $0, $3 //int16 s[3] = data[i] - data[i3];
+ vadd.s16 $5, $1, $2 //int16 s[1] = data[i1] + data[i2];
+ vsub.s16 $6, $1, $2 //int16 s[2] = data[i1] - data[i2];
- vadd.s16 $0, $4, $5 //int16 dct[i ] = s[0] + s[1];
- vsub.s16 $2, $4, $5 //int16 dct[i2] = s[0] - s[1];
- vshl.s16 $1, $7, #1
- vshl.s16 $3, $6, #1
- vadd.s16 $1, $1, $6 //int16 dct[i1] = (s[3] << 1) + s[2];
- vsub.s16 $3, $7, $3 //int16 dct[i3] = s[3] - (s[2] << 1);
-// }
+ vadd.s16 $0, $4, $5 //int16 dct[i ] = s[0] + s[1];
+ vsub.s16 $2, $4, $5 //int16 dct[i2] = s[0] - s[1];
+ vshl.s16 $1, $7, #1
+ vshl.s16 $3, $6, #1
+ vadd.s16 $1, $1, $6 //int16 dct[i1] = (s[3] << 1) + s[2];
+ vsub.s16 $3, $7, $3 //int16 dct[i3] = s[3] - (s[2] << 1);
+// }
.endm
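DCT_ROW_TRANSFORM_TOTAL_16BITS is one row pass of the H.264 4x4 forward core transform; its inline comments already give the scalar form. Collected into plain C for a single row (sketch only; the same butterfly is typically reused for the column pass after a transpose such as MATRIX_TRANSFORM_EACH_16BITS below):

#include <stdint.h>

static void DctRow4Sketch (int16_t data[4]) {
  int16_t s0 = data[0] + data[3];
  int16_t s3 = data[0] - data[3];
  int16_t s1 = data[1] + data[2];
  int16_t s2 = data[1] - data[2];
  data[0] = (int16_t)(s0 + s1);
  data[2] = (int16_t)(s0 - s1);
  data[1] = (int16_t)((s3 << 1) + s2);
  data[3] = (int16_t)(s3 - (s2 << 1));
}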
-.macro MATRIX_TRANSFORM_EACH_16BITS
-// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
- vtrn.s16 $0, $1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
- vtrn.s16 $2, $3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
- vtrn.32 $0, $2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
- vtrn.32 $1, $3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-// }
+.macro MATRIX_TRANSFORM_EACH_16BITS
+// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
+ vtrn.s16 $0, $1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+ vtrn.s16 $2, $3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+ vtrn.32 $0, $2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+ vtrn.32 $1, $3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+// }
.endm
-.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
- veor.s16 $6, $6 // init 0 , and keep 0;
- vaba.s16 $1, $0, $6 // f + abs(coef - 0)
- vmull.s16 $7, $2, $4
- vmull.s16 $8, $3, $5
- vshr.s32 $7, #16
- vshr.s32 $8, #16
- vmovn.s32 $2, $7
- vmovn.s32 $3, $8
+.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
+// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
+ veor.s16 $6, $6 // init 0 , and keep 0;
+ vaba.s16 $1, $0, $6 // f + abs(coef - 0)
+ vmull.s16 $7, $2, $4
+ vmull.s16 $8, $3, $5
+ vshr.s32 $7, #16
+ vshr.s32 $8, #16
+ vmovn.s32 $2, $7
+ vmovn.s32 $3, $8
- vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111
- vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 $6, #1
- vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
-// }
+ vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111
+ vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched
+ vshl.s16 $6, #1
+ vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
+// }
.endm
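NEWQUANT_COEF_EACH_16BITS quantises on the absolute value and restores the sign at the end: it adds the rounding offset f to |coef|, multiplies by mf, keeps the top 16 bits, and then negates the lanes whose original coefficient was not positive. Per coefficient, a scalar sketch is:

#include <stdint.h>

/* Sketch of one lane of the macro above; f and mf come from the encoder's
 * quantisation tables (not reproduced here). */
static int16_t QuantCoefSketch (int16_t coef, int16_t f, int16_t mf) {
  int32_t abs_c = coef < 0 ? -coef : coef;
  int32_t level = (((int32_t) f + abs_c) * (int32_t) mf) >> 16;
  return (int16_t)(coef > 0 ? level : -level);
}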
-.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
- veor.s16 $6, $6 // init 0 , and keep 0;
- vaba.s16 $1, $0, $6 // f + abs(coef - 0)
- vmull.s16 $7, $2, $4
- vmull.s16 $8, $3, $5
- vshr.s32 $7, #16
- vshr.s32 $8, #16
- vmovn.s32 $2, $7
- vmovn.s32 $3, $8
+.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef;
+// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
+ veor.s16 $6, $6 // init 0 , and keep 0;
+ vaba.s16 $1, $0, $6 // f + abs(coef - 0)
+ vmull.s16 $7, $2, $4
+ vmull.s16 $8, $3, $5
+ vshr.s32 $7, #16
+ vshr.s32 $8, #16
+ vmovn.s32 $2, $7
+ vmovn.s32 $3, $8
- vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111
- vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 $6, #1
- vmax.s16 $9, $2, $3
- vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
-// }
+ vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111
+ vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched
+ vshl.s16 $6, #1
+ vmax.s16 $9, $2, $3
+ vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
+// }
.endm
-.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), mf , working_d (all 0), working_q
- vaba.s16 $1, $0, $3 // f + abs(coef - 0)
- vmull.s16 $4, $1, $2 // *= mf
- vshr.s32 $4, #16
- vmovn.s32 $1, $4 // >> 16
+.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
+// { // input: coef, ff (dst), mf , working_d (all 0), working_q
+ vaba.s16 $1, $0, $3 // f + abs(coef - 0)
+ vmull.s16 $4, $1, $2 // *= mf
+ vshr.s32 $4, #16
+ vmovn.s32 $1, $4 // >> 16
- vcgt.s16 $2, $0, #0 // if true, location of coef == 11111111
- vbif.s16 $3, $1, $2 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 $3, #1
- vsub.s16 $1, $1, $3 // if x > 0, -= 0; else x-= 2x
-// }
+ vcgt.s16 $2, $0, #0 // if true, location of coef == 11111111
+ vbif.s16 $3, $1, $2 // if (x<0) reserved part; else keep 0 untouched
+ vshl.s16 $3, #1
+ vsub.s16 $1, $1, $3 // if x > 0, -= 0; else x-= 2x
+// }
.endm
-.macro DC_ZERO_COUNT_IN_DUALWORD
-// { // input: coef, dst_d, working_d (all 0x01)
- vceq.s16 $1, $0, #0
- vand.s16 $1, $2
- vpadd.s16 $1, $1, $1
- vpadd.s16 $1, $1, $1
-// }
+.macro DC_ZERO_COUNT_IN_DUALWORD
+// { // input: coef, dst_d, working_d (all 0x01)
+ vceq.s16 $1, $0, #0
+ vand.s16 $1, $2
+ vpadd.s16 $1, $1, $1
+ vpadd.s16 $1, $1, $1
+// }
.endm
-.macro SELECT_MAX_IN_ABS_COEF
-// { // input: coef_0, coef_1, max_q (identical to the following two)
- vmax.s16 $2, $0, $1 // max 1st in $3 & max 2nd in $4
- vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] & max 2nd in $3[2][3]
- vpmax.s16 $3, $3, $4 // max 1st in $3[0][1]
-// }
+.macro SELECT_MAX_IN_ABS_COEF
+// { // input: coef_0, coef_1, max_q (identical to the following two)
+ vmax.s16 $2, $0, $1 // max 1st in $3 & max 2nd in $4
+ vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] & max 2nd in $3[2][3]
+ vpmax.s16 $3, $3, $4 // max 1st in $3[0][1]
+// }
.endm
-.macro ZERO_COUNT_IN_2_QUARWORD
-// { // input: coef_0 (identical to $3 $4), coef_1(identical to $5 $6), mask_q
- vceq.s16 $0, #0
- vceq.s16 $1, #0
- vand.s16 $0, $2
- vand.s16 $1, $2
+.macro ZERO_COUNT_IN_2_QUARWORD
+// { // input: coef_0 (identical to $3 $4), coef_1(identical to $5 $6), mask_q
+ vceq.s16 $0, #0
+ vceq.s16 $1, #0
+ vand.s16 $0, $2
+ vand.s16 $1, $2
- vpadd.s16 $3, $3, $5
- vpadd.s16 $4, $4, $6
- vpadd.s16 $3, $3, $4 // 8-->4
- vpadd.s16 $3, $3, $3
- vpadd.s16 $3, $3, $3
-// }
+ vpadd.s16 $3, $3, $5
+ vpadd.s16 $4, $4, $6
+ vpadd.s16 $3, $3, $4 // 8-->4
+ vpadd.s16 $3, $3, $3
+ vpadd.s16 $3, $3, $3
+// }
.endm
-.macro HDM_QUANT_2x2_TOTAL_16BITS
-// { // input: src_d[0]~[3], working_d, dst_d
- vshr.s64 $1, $0, #32
- vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
- vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
- vtrn.s16 $2, $1
- vtrn.s32 $2, $1
-// }
+.macro HDM_QUANT_2x2_TOTAL_16BITS
+// { // input: src_d[0]~[3], working_d, dst_d
+ vshr.s64 $1, $0, #32
+ vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
+ vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
+ vtrn.s16 $2, $1
+ vtrn.s32 $2, $1
+// }
.endm
-.macro IHDM_4x4_TOTAL_16BITS
-// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
- vshr.s64 $1, $0, #32
- vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
- vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
- vtrn.s16 $2, $1
- vrev32.16 $1, $1
- vtrn.s32 $2, $1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
+.macro IHDM_4x4_TOTAL_16BITS
+// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
+ vshr.s64 $1, $0, #32
+ vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
+ vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
+ vtrn.s16 $2, $1
+ vrev32.16 $1, $1
+ vtrn.s32 $2, $1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
- vrev64.16 $1, $2
- vadd.s16 $0, $2, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
- vsub.s16 $1, $2, $1
- vrev32.16 $1, $1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
- vtrn.s32 $0, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
-// }
+ vrev64.16 $1, $2
+ vadd.s16 $0, $2, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
+ vsub.s16 $1, $2, $1
+ vrev32.16 $1, $1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
+ vtrn.s32 $0, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
+// }
.endm
-.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
-// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
- vmovl.u8 $4,$0
- vmovl.u8 $5,$1
- vadd.s16 $4,$2
- vadd.s16 $5,$3
- vqmovun.s16 $0,$4
- vqmovun.s16 $1,$5
-// }
+.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
+// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
+ vmovl.u8 $4,$0
+ vmovl.u8 $5,$1
+ vadd.s16 $4,$2
+ vadd.s16 $5,$3
+ vqmovun.s16 $0,$4
+ vqmovun.s16 $1,$5
+// }
.endm
-.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
-// { // input: src_d[0]~[3], output: e_d[0]~[3];
- vadd.s16 $4, $0, $2 //int16 e[i][0] = src[0] + src[2];
- vsub.s16 $5, $0, $2 //int16 e[i][1] = src[0] - src[2];
- vshr.s16 $6, $1, #1
- vshr.s16 $7, $3, #1
- vsub.s16 $6, $6, $3 //int16 e[i][2] = (src[1]>>1)-src[3];
- vadd.s16 $7, $1, $7 //int16 e[i][3] = src[1] + (src[3]>>1);
-// }
+.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
+// { // input: src_d[0]~[3], output: e_d[0]~[3];
+ vadd.s16 $4, $0, $2 //int16 e[i][0] = src[0] + src[2];
+ vsub.s16 $5, $0, $2 //int16 e[i][1] = src[0] - src[2];
+ vshr.s16 $6, $1, #1
+ vshr.s16 $7, $3, #1
+ vsub.s16 $6, $6, $3 //int16 e[i][2] = (src[1]>>1)-src[3];
+ vadd.s16 $7, $1, $7 //int16 e[i][3] = src[1] + (src[3]>>1);
+// }
.endm
-.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s16 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s16 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s16 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s16 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
+.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
+// { // output: f_q[0]~[3], input: e_q[0]~[3];
+ vadd.s16 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
+ vadd.s16 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
+ vsub.s16 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
+ vsub.s16 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
+// }
.endm
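ROW_TRANSFORM_1_STEP_TOTAL_16BITS plus TRANSFORM_TOTAL_16BITS form one row pass of the H.264 4x4 inverse core transform, again spelled out by the macros' own comments. The same step in plain C for one row (sketch only; the column pass applies the same butterfly):

#include <stdint.h>

static void IdctRow4Sketch (int16_t src[4]) {
  int16_t e0 = src[0] + src[2];
  int16_t e1 = src[0] - src[2];
  int16_t e2 = (int16_t)((src[1] >> 1) - src[3]);
  int16_t e3 = (int16_t)(src[1] + (src[3] >> 1));
  src[0] = (int16_t)(e0 + e3);
  src[1] = (int16_t)(e1 + e2);
  src[2] = (int16_t)(e1 - e2);
  src[3] = (int16_t)(e0 - e3);
}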
-.macro ROW_TRANSFORM_0_STEP
-// { // input: src_d[0]~[3], output: e_q[0]~[3];
- vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
- vsubl.s16 $6, $1, $3 //int32 e[i][2] = src[1] - src[3];
- vaddl.s16 $7, $1, $3 //int32 e[i][3] = src[1] + src[3];
-// }
+.macro ROW_TRANSFORM_0_STEP
+// { // input: src_d[0]~[3], output: e_q[0]~[3];
+ vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
+ vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
+ vsubl.s16 $6, $1, $3 //int32 e[i][2] = src[1] - src[3];
+ vaddl.s16 $7, $1, $3 //int32 e[i][3] = src[1] + src[3];
+// }
.endm
-.macro ROW_TRANSFORM_1_STEP
-// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
- vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
- vshr.s16 $8, $1, #1
- vshr.s16 $9, $3, #1
- vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
- vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
-// }
+.macro ROW_TRANSFORM_1_STEP
+// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
+ vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
+ vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
+ vshr.s16 $8, $1, #1
+ vshr.s16 $9, $3, #1
+ vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
+ vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
+// }
.endm
-.macro TRANSFORM_4BYTES // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
+.macro TRANSFORM_4BYTES // both row & col transform used
+// { // output: f_q[0]~[3], input: e_q[0]~[3];
+ vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
+ vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
+ vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
+ vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
+// }
.endm
-.macro COL_TRANSFORM_0_STEP
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
- vsub.s32 $6, $1, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 $7, $1, $3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
+.macro COL_TRANSFORM_0_STEP
+// { // input: src_q[0]~[3], output: e_q[0]~[3];
+ vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
+ vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
+ vsub.s32 $6, $1, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ vadd.s32 $7, $1, $3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+// }
.endm
-.macro COL_TRANSFORM_1_STEP
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
- vshr.s32 $6, $1, #1
- vshr.s32 $7, $3, #1
- vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
+.macro COL_TRANSFORM_1_STEP
+// { // input: src_q[0]~[3], output: e_q[0]~[3];
+ vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
+ vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
+ vshr.s32 $6, $1, #1
+ vshr.s32 $7, $3, #1
+ vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+// }
.endm
#else
-.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
- vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5
- vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7
- vld2.16 {\arg0[1],\arg1[1]}, [\arg4], \arg5
- vld2.16 {\arg2[1],\arg3[1]}, [\arg6], \arg7
+.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+// { // input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
+ vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5
+ vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7
+ vld2.16 {\arg0[1],\arg1[1]}, [\arg4], \arg5
+ vld2.16 {\arg2[1],\arg3[1]}, [\arg6], \arg7
- vld2.16 {\arg0[2],\arg1[2]}, [\arg4], \arg5
- vld2.16 {\arg2[2],\arg3[2]}, [\arg6], \arg7
- vld2.16 {\arg0[3],\arg1[3]}, [\arg4], \arg5
- vld2.16 {\arg2[3],\arg3[3]}, [\arg6], \arg7
-// }
+ vld2.16 {\arg0[2],\arg1[2]}, [\arg4], \arg5
+ vld2.16 {\arg2[2],\arg3[2]}, [\arg6], \arg7
+ vld2.16 {\arg0[3],\arg1[3]}, [\arg4], \arg5
+ vld2.16 {\arg2[3],\arg3[3]}, [\arg6], \arg7
+// }
.endm
-.macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-// { // input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
- vld1.64 {\arg0}, [\arg8], r2
- vld1.64 {\arg4}, [\arg9], r4
- vld1.64 {\arg1}, [\arg8], r2
- vld1.64 {\arg5}, [\arg9], r4
+.macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+// { // input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
+ vld1.64 {\arg0}, [\arg8], r2
+ vld1.64 {\arg4}, [\arg9], r4
+ vld1.64 {\arg1}, [\arg8], r2
+ vld1.64 {\arg5}, [\arg9], r4
- vld1.64 {\arg2}, [\arg8], r2
- vld1.64 {\arg6}, [\arg9], r4
- vld1.64 {\arg3}, [\arg8], r2
- vld1.64 {\arg7}, [\arg9], r4
-// }
+ vld1.64 {\arg2}, [\arg8], r2
+ vld1.64 {\arg6}, [\arg9], r4
+ vld1.64 {\arg3}, [\arg8], r2
+ vld1.64 {\arg7}, [\arg9], r4
+// }
.endm
-.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_d[0]~[3], working: [4]~[7]
- vadd.s16 \arg4, \arg0, \arg3 //int16 s[0] = data[i] + data[i3];
- vsub.s16 \arg7, \arg0, \arg3 //int16 s[3] = data[i] - data[i3];
- vadd.s16 \arg5, \arg1, \arg2 //int16 s[1] = data[i1] + data[i2];
- vsub.s16 \arg6, \arg1, \arg2 //int16 s[2] = data[i1] - data[i2];
+.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+// { // input: src_d[0]~[3], working: [4]~[7]
+ vadd.s16 \arg4, \arg0, \arg3 //int16 s[0] = data[i] + data[i3];
+ vsub.s16 \arg7, \arg0, \arg3 //int16 s[3] = data[i] - data[i3];
+ vadd.s16 \arg5, \arg1, \arg2 //int16 s[1] = data[i1] + data[i2];
+ vsub.s16 \arg6, \arg1, \arg2 //int16 s[2] = data[i1] - data[i2];
- vadd.s16 \arg0, \arg4, \arg5 //int16 dct[i ] = s[0] + s[1];
- vsub.s16 \arg2, \arg4, \arg5 //int16 dct[i2] = s[0] - s[1];
- vshl.s16 \arg1, \arg7, #1
- vshl.s16 \arg3, \arg6, #1
- vadd.s16 \arg1, \arg1, \arg6 //int16 dct[i1] = (s[3] << 1) + s[2];
- vsub.s16 \arg3, \arg7, \arg3 //int16 dct[i3] = s[3] - (s[2] << 1);
-// }
+ vadd.s16 \arg0, \arg4, \arg5 //int16 dct[i ] = s[0] + s[1];
+ vsub.s16 \arg2, \arg4, \arg5 //int16 dct[i2] = s[0] - s[1];
+ vshl.s16 \arg1, \arg7, #1
+ vshl.s16 \arg3, \arg6, #1
+ vadd.s16 \arg1, \arg1, \arg6 //int16 dct[i1] = (s[3] << 1) + s[2];
+ vsub.s16 \arg3, \arg7, \arg3 //int16 dct[i3] = s[3] - (s[2] << 1);
+// }
.endm
-.macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3
-// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
- vtrn.s16 \arg0, \arg1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
- vtrn.s16 \arg2, \arg3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
- vtrn.32 \arg0, \arg2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
- vtrn.32 \arg1, \arg3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-// }
+.macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3
+// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
+ vtrn.s16 \arg0, \arg1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+ vtrn.s16 \arg2, \arg3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+ vtrn.32 \arg0, \arg2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+ vtrn.32 \arg1, \arg3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+// }
.endm
-.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
- veor.s16 \arg6, \arg6 // init 0 , and keep 0;
- vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0)
- vmull.s16 \arg7, \arg2, \arg4
- vmull.s16 \arg8, \arg3, \arg5
- vshr.s32 \arg7, #16
- vshr.s32 \arg8, #16
- vmovn.s32 \arg2, \arg7
- vmovn.s32 \arg3, \arg8
+.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
+ veor.s16 \arg6, \arg6 // init 0 , and keep 0;
+ vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0)
+ vmull.s16 \arg7, \arg2, \arg4
+ vmull.s16 \arg8, \arg3, \arg5
+ vshr.s32 \arg7, #16
+ vshr.s32 \arg8, #16
+ vmovn.s32 \arg2, \arg7
+ vmovn.s32 \arg3, \arg8
- vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111
- vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 \arg6, #1
- vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x
-// }
+ vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111
+ vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched
+ vshl.s16 \arg6, #1
+ vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x
+// }
.endm
-.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
- veor.s16 \arg6, \arg6 // init 0 , and keep 0;
- vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0)
- vmull.s16 \arg7, \arg2, \arg4
- vmull.s16 \arg8, \arg3, \arg5
- vshr.s32 \arg7, #16
- vshr.s32 \arg8, #16
- vmovn.s32 \arg2, \arg7
- vmovn.s32 \arg3, \arg8
+.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
+ veor.s16 \arg6, \arg6 // init 0 , and keep 0;
+ vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0)
+ vmull.s16 \arg7, \arg2, \arg4
+ vmull.s16 \arg8, \arg3, \arg5
+ vshr.s32 \arg7, #16
+ vshr.s32 \arg8, #16
+ vmovn.s32 \arg2, \arg7
+ vmovn.s32 \arg3, \arg8
- vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111
- vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 \arg6, #1
- vmax.s16 \arg9, \arg2, \arg3
- vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x
-// }
+ vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111
+ vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched
+ vshl.s16 \arg6, #1
+ vmax.s16 \arg9, \arg2, \arg3
+ vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x
+// }
.endm
-.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
-// { // input: coef, ff (dst), mf , working_d (all 0), working_q
- vaba.s16 \arg1, \arg0, \arg3 // f + abs(coef - 0)
- vmull.s16 \arg4, \arg1, \arg2 // *= mf
- vshr.s32 \arg4, #16
- vmovn.s32 \arg1, \arg4 // >> 16
+.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
+// { // input: coef, ff (dst), mf , working_d (all 0), working_q
+ vaba.s16 \arg1, \arg0, \arg3 // f + abs(coef - 0)
+ vmull.s16 \arg4, \arg1, \arg2 // *= mf
+ vshr.s32 \arg4, #16
+ vmovn.s32 \arg1, \arg4 // >> 16
- vcgt.s16 \arg2, \arg0, #0 // if true, location of coef == 11111111
- vbif.s16 \arg3, \arg1, \arg2 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 \arg3, #1
- vsub.s16 \arg1, \arg1, \arg3 // if x > 0, -= 0; else x-= 2x
-// }
+ vcgt.s16 \arg2, \arg0, #0 // if true, location of coef == 11111111
+ vbif.s16 \arg3, \arg1, \arg2 // if (x<0) reserved part; else keep 0 untouched
+ vshl.s16 \arg3, #1
+ vsub.s16 \arg1, \arg1, \arg3 // if x > 0, -= 0; else x-= 2x
+// }
.endm
-.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
-// { // input: coef, dst_d, working_d (all 0x01)
- vceq.s16 \arg1, \arg0, #0
- vand.s16 \arg1, \arg2
- vpadd.s16 \arg1, \arg1, \arg1
- vpadd.s16 \arg1, \arg1, \arg1
-// }
+.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
+// { // input: coef, dst_d, working_d (all 0x01)
+ vceq.s16 \arg1, \arg0, #0
+ vand.s16 \arg1, \arg2
+ vpadd.s16 \arg1, \arg1, \arg1
+ vpadd.s16 \arg1, \arg1, \arg1
+// }
.endm
-.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4
-// { // input: coef_0, coef_1, max_q (identical to the following two), output: max_d0, max_d1
- vmax.s16 \arg2, \arg0, \arg1 // max 1st in \arg3 & max 2nd in \arg4
- vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]
- vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1]
-// }
+.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4
+// { // input: coef_0, coef_1, max_q (identical to the following two), output: max_d0, max_d1
+ vmax.s16 \arg2, \arg0, \arg1 // max 1st in \arg3 & max 2nd in \arg4
+ vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]
+ vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1]
+// }
.endm
-.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6
-// { // input: coef_0 (identical to \arg3 \arg4), coef_1(identical to \arg5 \arg6), mask_q
- vceq.s16 \arg0, #0
- vceq.s16 \arg1, #0
- vand.s16 \arg0, \arg2
- vand.s16 \arg1, \arg2
+.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6
+// { // input: coef_0 (identical to \arg3 \arg4), coef_1(identical to \arg5 \arg6), mask_q
+ vceq.s16 \arg0, #0
+ vceq.s16 \arg1, #0
+ vand.s16 \arg0, \arg2
+ vand.s16 \arg1, \arg2
- vpadd.s16 \arg3, \arg3, \arg5
- vpadd.s16 \arg4, \arg4, \arg6
- vpadd.s16 \arg3, \arg3, \arg4 // 8-->4
- vpadd.s16 \arg3, \arg3, \arg3
- vpadd.s16 \arg3, \arg3, \arg3
-// }
+ vpadd.s16 \arg3, \arg3, \arg5
+ vpadd.s16 \arg4, \arg4, \arg6
+ vpadd.s16 \arg3, \arg3, \arg4 // 8-->4
+ vpadd.s16 \arg3, \arg3, \arg3
+ vpadd.s16 \arg3, \arg3, \arg3
+// }
.endm
-.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
-// { // input: src_d[0]~[3], working_d, dst_d
- vshr.s64 \arg1, \arg0, #32
- vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
- vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
- vtrn.s16 \arg2, \arg1
- vtrn.s32 \arg2, \arg1
-// }
+.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
+// { // input: src_d[0]~[3], working_d, dst_d
+ vshr.s64 \arg1, \arg0, #32
+ vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
+ vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
+ vtrn.s16 \arg2, \arg1
+ vtrn.s32 \arg2, \arg1
+// }
.endm
-.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
-// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
- vshr.s64 \arg1, \arg0, #32
- vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
- vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
- vtrn.s16 \arg2, \arg1
- vrev32.16 \arg1, \arg1
- vtrn.s32 \arg2, \arg1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
+.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
+// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
+ vshr.s64 \arg1, \arg0, #32
+ vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
+ vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
+ vtrn.s16 \arg2, \arg1
+ vrev32.16 \arg1, \arg1
+ vtrn.s32 \arg2, \arg1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
- vrev64.16 \arg1, \arg2
- vadd.s16 \arg0, \arg2, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
- vsub.s16 \arg1, \arg2, \arg1
- vrev32.16 \arg1, \arg1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
- vtrn.s32 \arg0, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
-// }
+ vrev64.16 \arg1, \arg2
+ vadd.s16 \arg0, \arg2, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
+ vsub.s16 \arg1, \arg2, \arg1
+ vrev32.16 \arg1, \arg1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
+ vtrn.s32 \arg0, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
+// }
.endm
-.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5
-// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
- vmovl.u8 \arg4,\arg0
- vmovl.u8 \arg5,\arg1
- vadd.s16 \arg4,\arg2
- vadd.s16 \arg5,\arg3
- vqmovun.s16 \arg0,\arg4
- vqmovun.s16 \arg1,\arg5
-// }
+.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
+ vmovl.u8 \arg4,\arg0
+ vmovl.u8 \arg5,\arg1
+ vadd.s16 \arg4,\arg2
+ vadd.s16 \arg5,\arg3
+ vqmovun.s16 \arg0,\arg4
+ vqmovun.s16 \arg1,\arg5
+// }
.endm
-.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_d[0]~[3], output: e_d[0]~[3];
- vadd.s16 \arg4, \arg0, \arg2 //int16 e[i][0] = src[0] + src[2];
- vsub.s16 \arg5, \arg0, \arg2 //int16 e[i][1] = src[0] - src[2];
- vshr.s16 \arg6, \arg1, #1
- vshr.s16 \arg7, \arg3, #1
- vsub.s16 \arg6, \arg6, \arg3 //int16 e[i][2] = (src[1]>>1)-src[3];
- vadd.s16 \arg7, \arg1, \arg7 //int16 e[i][3] = src[1] + (src[3]>>1);
-// }
+.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+// { // input: src_d[0]~[3], output: e_d[0]~[3];
+ vadd.s16 \arg4, \arg0, \arg2 //int16 e[i][0] = src[0] + src[2];
+ vsub.s16 \arg5, \arg0, \arg2 //int16 e[i][1] = src[0] - src[2];
+ vshr.s16 \arg6, \arg1, #1
+ vshr.s16 \arg7, \arg3, #1
+ vsub.s16 \arg6, \arg6, \arg3 //int16 e[i][2] = (src[1]>>1)-src[3];
+ vadd.s16 \arg7, \arg1, \arg7 //int16 e[i][3] = src[1] + (src[3]>>1);
+// }
.endm
-.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s16 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s16 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s16 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s16 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
+.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
+// { // output: f_q[0]~[3], input: e_q[0]~[3];
+ vadd.s16 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
+ vadd.s16 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
+ vsub.s16 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
+ vsub.s16 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
+// }
.endm
-.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_d[0]~[3], output: e_q[0]~[3];
- vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
- vsubl.s16 \arg6, \arg1, \arg3 //int32 e[i][2] = src[1] - src[3];
- vaddl.s16 \arg7, \arg1, \arg3 //int32 e[i][3] = src[1] + src[3];
-// }
+.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+// { // input: src_d[0]~[3], output: e_q[0]~[3];
+ vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
+ vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
+ vsubl.s16 \arg6, \arg1, \arg3 //int32 e[i][2] = src[1] - src[3];
+ vaddl.s16 \arg7, \arg1, \arg3 //int32 e[i][3] = src[1] + src[3];
+// }
.endm
-.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9
- vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
- vshr.s16 \arg8, \arg1, #1
- vshr.s16 \arg9, \arg3, #1
- vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
- vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
-// }
+.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9
+ vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
+ vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
+ vshr.s16 \arg8, \arg1, #1
+ vshr.s16 \arg9, \arg3, #1
+ vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
+ vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
+// }
.endm
-.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
+.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
+// { // output: f_q[0]~[3], input: e_q[0]~[3];
+ vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
+ vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
+ vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
+ vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
+// }
.endm
-.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
- vsub.s32 \arg6, \arg1, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 \arg7, \arg1, \arg3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
+.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+// { // input: src_q[0]~[3], output: e_q[0]~[3];
+ vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
+ vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
+ vsub.s32 \arg6, \arg1, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ vadd.s32 \arg7, \arg1, \arg3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+// }
.endm
-.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
- vshr.s32 \arg6, \arg1, #1
- vshr.s32 \arg7, \arg3, #1
- vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
+.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+// { // input: src_q[0]~[3], output: e_q[0]~[3];
+ vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
+ vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
+ vshr.s32 \arg6, \arg1, #1
+ vshr.s32 \arg7, \arg3, #1
+ vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+// }
.endm
#endif
WELS_ASM_FUNC_BEGIN WelsDctT4_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- LOAD_4x4_DATA_FOR_DCT d4, d5, d6, d7, r1, r2, r3, r4
+ LOAD_4x4_DATA_FOR_DCT d4, d5, d6, d7, r1, r2, r3, r4
- vsubl.u8 q0, d4, d6
- vsubl.u8 q1, d5, d7
- vtrn.s32 q0, q1
- vswp d1, d2
+ vsubl.u8 q0, d4, d6
+ vsubl.u8 q1, d5, d7
+ vtrn.s32 q0, q1
+ vswp d1, d2
- // horizontal transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+ // horizontal transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
- // transform element
- MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
+ // transform element
+ MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
- // vertical transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+ // vertical transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
- // transform element
- MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
+ // transform element
+ MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
- vst1.s16 {q0, q1}, [r0]!
+ vst1.s16 {q0, q1}, [r0]!
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDctFourT4_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- LOAD_8x8_DATA_FOR_DCT d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
+ LOAD_8x8_DATA_FOR_DCT d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
- vsubl.u8 q0, d16, d20
- vsubl.u8 q1, d17, d21
- vsubl.u8 q2, d18, d22
- vsubl.u8 q3, d19, d23
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+ vsubl.u8 q0, d16, d20
+ vsubl.u8 q1, d17, d21
+ vsubl.u8 q2, d18, d22
+ vsubl.u8 q3, d19, d23
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
- // horizontal transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ // horizontal transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- // transform element
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+ // transform element
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
- // vertical transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ // vertical transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- vswp d1, d2
- vswp d5, d6
- vswp q1, q2
- vst1.s16 {q0, q1}, [r0]!
- vst1.s16 {q2, q3}, [r0]!
+ vswp d1, d2
+ vswp d5, d6
+ vswp q1, q2
+ vst1.s16 {q0, q1}, [r0]!
+ vst1.s16 {q2, q3}, [r0]!
- ////////////////
- LOAD_8x8_DATA_FOR_DCT d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
+ ////////////////
+ LOAD_8x8_DATA_FOR_DCT d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
- vsubl.u8 q0, d16, d20
- vsubl.u8 q1, d17, d21
- vsubl.u8 q2, d18, d22
- vsubl.u8 q3, d19, d23
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+ vsubl.u8 q0, d16, d20
+ vsubl.u8 q1, d17, d21
+ vsubl.u8 q2, d18, d22
+ vsubl.u8 q3, d19, d23
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
- // horizontal transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ // horizontal transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- // transform element
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+ // transform element
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
- // vertical transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ // vertical transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- vswp d1, d2
- vswp d5, d6
- vswp q1, q2
- vst1.s16 {q0, q1}, [r0]!
- vst1.s16 {q2, q3}, [r0]!
+ vswp d1, d2
+ vswp d5, d6
+ vswp q1, q2
+ vst1.s16 {q0, q1}, [r0]!
+ vst1.s16 {q2, q3}, [r0]!
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsQuant4x4_neon
- vld1.s16 {q2}, [r1]
- vld1.s16 {q0, q1}, [r0]
- vld1.s16 {q3}, [r2]
+ vld1.s16 {q2}, [r1]
+ vld1.s16 {q0, q1}, [r0]
+ vld1.s16 {q3}, [r2]
- vmov q8, q2
+ vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q9, q10, q11
- vst1.s16 {q2}, [r0]!
+ NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q9, q10, q11
+ vst1.s16 {q2}, [r0]!
- NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r0]!
+ NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r0]!
WELS_ASM_FUNC_END
@@ -627,266 +627,266 @@
WELS_ASM_FUNC_BEGIN WelsQuant4x4Dc_neon
- vld1.s16 {q0, q1}, [r0]
- vdup.s16 q2, r1 // even ff range [0, 768]
- vdup.s16 q3, r2
+ vld1.s16 {q0, q1}, [r0]
+ vdup.s16 q2, r1 // even ff range [0, 768]
+ vdup.s16 q3, r2
- vmov q8, q2
+ vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q9, q10, q11
- vst1.s16 {q2}, [r0]!
+ NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q9, q10, q11
+ vst1.s16 {q2}, [r0]!
- NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r0]!
+ NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r0]!
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsQuantFour4x4_neon
- vld1.s16 {q2}, [r1]
- vld1.s16 {q3}, [r2]
- mov r1, r0
+ vld1.s16 {q2}, [r1]
+ vld1.s16 {q3}, [r2]
+ mov r1, r0
- vld1.s16 {q0, q1}, [r0]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r1]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r1]!
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r1]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r1]!
- vld1.s16 {q0, q1}, [r0]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r1]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r1]!
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r1]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r1]!
- vld1.s16 {q0, q1}, [r0]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r1]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r1]!
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r1]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r1]!
- vld1.s16 {q0, q1}, [r0]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r1]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
- vst1.s16 {q8}, [r1]!
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r1]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
+ vst1.s16 {q8}, [r1]!
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsQuantFour4x4Max_neon
- vld1.s16 {q2}, [r1]
- vld1.s16 {q3}, [r2]
- mov r1, r0
+ vld1.s16 {q2}, [r1]
+ vld1.s16 {q3}, [r2]
+ mov r1, r0
- vld1.s16 {q0, q1}, [r0]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
- vst1.s16 {q8}, [r1]!
- vmov q12, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
- vst1.s16 {q12}, [r1]! // then 1st 16 elem in d26 & d28
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
+ vst1.s16 {q8}, [r1]!
+ vmov q12, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
+ vst1.s16 {q12}, [r1]! // then 1st 16 elem in d26 & d28
- vld1.s16 {q0, q1}, [r0]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
- vst1.s16 {q8}, [r1]!
- vmov q12, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
- vst1.s16 {q12}, [r1]! // then 2nd 16 elem in d27 & d29
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
+ vst1.s16 {q8}, [r1]!
+ vmov q12, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
+ vst1.s16 {q12}, [r1]! // then 2nd 16 elem in d27 & d29
- SELECT_MAX_IN_ABS_COEF q13, q14, q0, d0, d1
- vst1.s32 {d0[0]}, [r3]!
+ SELECT_MAX_IN_ABS_COEF q13, q14, q0, d0, d1
+ vst1.s32 {d0[0]}, [r3]!
- ///////////
- vld1.s16 {q0, q1}, [r0]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
- vst1.s16 {q8}, [r1]!
- vmov q12, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
- vst1.s16 {q12}, [r1]! // then 3rd 16 elem in d26 & d28
+ ///////////
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
+ vst1.s16 {q8}, [r1]!
+ vmov q12, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
+ vst1.s16 {q12}, [r1]! // then 3rd 16 elem in d26 & d28
- vld1.s16 {q0, q1}, [r0]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
- vst1.s16 {q8}, [r1]!
- vmov q12, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
- vst1.s16 {q12}, [r1]! // then 4th 16 elem in d27 & d29
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
+ vst1.s16 {q8}, [r1]!
+ vmov q12, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
+ vst1.s16 {q12}, [r1]! // then 4th 16 elem in d27 & d29
- SELECT_MAX_IN_ABS_COEF q13, q14, q0, d0, d1
- vst1.s32 {d0[0]}, [r3]!
+ SELECT_MAX_IN_ABS_COEF q13, q14, q0, d0, d1
+ vst1.s32 {d0[0]}, [r3]!
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsHadamardT4Dc_neon
- push {r2,r3}
- mov r2, #64 // 2*16*sizeof(int16_t)
- add r3, r1, #32
+ push {r2,r3}
+ mov r2, #64 // 2*16*sizeof(int16_t)
+ add r3, r1, #32
- vld1.s16 {d0}, [r1], r2
- vld1.s16 {d1}, [r3], r2
- vld1.s16 {d4}, [r1], r2
- vld1.s16 {d5}, [r3], r2
- vld1.s16 {d2}, [r1], r2
- vld1.s16 {d3}, [r3], r2
- vld1.s16 {d6}, [r1], r2
- vld1.s16 {d7}, [r3], r2
- vtrn.16 q0, q2 // d0[0 4], d1[1 5]
- vtrn.16 q1, q3 // d2[2 6], d3[3 7]
+ vld1.s16 {d0}, [r1], r2
+ vld1.s16 {d1}, [r3], r2
+ vld1.s16 {d4}, [r1], r2
+ vld1.s16 {d5}, [r3], r2
+ vld1.s16 {d2}, [r1], r2
+ vld1.s16 {d3}, [r3], r2
+ vld1.s16 {d6}, [r1], r2
+ vld1.s16 {d7}, [r3], r2
+ vtrn.16 q0, q2 // d0[0 4], d1[1 5]
+ vtrn.16 q1, q3 // d2[2 6], d3[3 7]
- vld1.s16 {d16}, [r1], r2
- vld1.s16 {d17}, [r3], r2
- vld1.s16 {d20}, [r1], r2
- vld1.s16 {d21}, [r3], r2
- vld1.s16 {d18}, [r1], r2
- vld1.s16 {d19}, [r3], r2
- vld1.s16 {d22}, [r1], r2
- vld1.s16 {d23}, [r3], r2
- vtrn.16 q8, q10 //d16[08 12],d17[09 13]
- vtrn.16 q9, q11 //d18[10 14],d19[11 15]
+ vld1.s16 {d16}, [r1], r2
+ vld1.s16 {d17}, [r3], r2
+ vld1.s16 {d20}, [r1], r2
+ vld1.s16 {d21}, [r3], r2
+ vld1.s16 {d18}, [r1], r2
+ vld1.s16 {d19}, [r3], r2
+ vld1.s16 {d22}, [r1], r2
+ vld1.s16 {d23}, [r3], r2
+ vtrn.16 q8, q10 //d16[08 12],d17[09 13]
+ vtrn.16 q9, q11 //d18[10 14],d19[11 15]
- vtrn.32 q0, q8 // d0 [0 4 08 12] = dct[idx], d1[1 5 09 13] = dct[idx+16]
- vtrn.32 q1, q9 // d2 [2 6 10 14] = dct[idx+64], d3[3 7 11 15] = dct[idx+80]
+ vtrn.32 q0, q8 // d0 [0 4 08 12] = dct[idx], d1[1 5 09 13] = dct[idx+16]
+ vtrn.32 q1, q9 // d2 [2 6 10 14] = dct[idx+64], d3[3 7 11 15] = dct[idx+80]
- ROW_TRANSFORM_0_STEP d0, d1, d3, d2, q8, q11, q10, q9
+ ROW_TRANSFORM_0_STEP d0, d1, d3, d2, q8, q11, q10, q9
- TRANSFORM_4BYTES q0, q1, q3, q2, q8, q11, q10, q9
+ TRANSFORM_4BYTES q0, q1, q3, q2, q8, q11, q10, q9
- // transform element 32bits
- vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
- vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
- vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
- vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+ // transform element 32bits
+ vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+ vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+ vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+ vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
- COL_TRANSFORM_0_STEP q0, q1, q3, q2, q8, q11, q10, q9
+ COL_TRANSFORM_0_STEP q0, q1, q3, q2, q8, q11, q10, q9
- TRANSFORM_4BYTES q0, q1, q3, q2, q8, q11, q10, q9
+ TRANSFORM_4BYTES q0, q1, q3, q2, q8, q11, q10, q9
- vrshrn.s32 d16, q0, #1
- vrshrn.s32 d17, q1, #1
- vrshrn.s32 d18, q2, #1
- vrshrn.s32 d19, q3, #1
- vst1.16 {q8, q9}, [r0] //store
+ vrshrn.s32 d16, q0, #1
+ vrshrn.s32 d17, q1, #1
+ vrshrn.s32 d18, q2, #1
+ vrshrn.s32 d19, q3, #1
+ vst1.16 {q8, q9}, [r0] //store
- pop {r2,r3}
+ pop {r2,r3}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2_neon
- vdup.s16 d1, r1 //ff
- vdup.s16 d2, r2 //mf
- veor d3, d3
+ vdup.s16 d1, r1 //ff
+ vdup.s16 d2, r2 //mf
+ veor d3, d3
- mov r1, #32
- mov r2, r0
+ mov r1, #32
+ mov r2, r0
- vld1.s16 {d0[0]}, [r0], r1 //rs[00]
- vst1.s16 {d3[0]}, [r2], r1 //rs[00]=0
- vld1.s16 {d0[1]}, [r0], r1 //rs[16]
- vst1.s16 {d3[0]}, [r2], r1 //rs[16]=0
- vld1.s16 {d0[2]}, [r0], r1 //rs[32]
- vst1.s16 {d3[0]}, [r2], r1 //rs[32]=0
- vld1.s16 {d0[3]}, [r0], r1 //rs[48]
- vst1.s16 {d3[0]}, [r2], r1 //rs[48]=0
+ vld1.s16 {d0[0]}, [r0], r1 //rs[00]
+ vst1.s16 {d3[0]}, [r2], r1 //rs[00]=0
+ vld1.s16 {d0[1]}, [r0], r1 //rs[16]
+ vst1.s16 {d3[0]}, [r2], r1 //rs[16]=0
+ vld1.s16 {d0[2]}, [r0], r1 //rs[32]
+ vst1.s16 {d3[0]}, [r2], r1 //rs[32]=0
+ vld1.s16 {d0[3]}, [r0], r1 //rs[48]
+ vst1.s16 {d3[0]}, [r2], r1 //rs[48]=0
- HDM_QUANT_2x2_TOTAL_16BITS d0, d4, d5 // output d5
+ HDM_QUANT_2x2_TOTAL_16BITS d0, d4, d5 // output d5
- HDM_QUANT_2x2_TOTAL_16BITS d5, d4, d0 // output d0
+ HDM_QUANT_2x2_TOTAL_16BITS d5, d4, d0 // output d0
- QUANT_DUALWORD_COEF_EACH_16BITS d0, d1, d2, d3, q2
+ QUANT_DUALWORD_COEF_EACH_16BITS d0, d1, d2, d3, q2
- vst1.s16 d1, [r3] // store to dct
- ldr r2, [sp, #0]
- vst1.s16 d1, [r2] // store to block
+ vst1.s16 d1, [r3] // store to dct
+ ldr r2, [sp, #0]
+ vst1.s16 d1, [r2] // store to block
- mov r1, #1
- vdup.s16 d3, r1
- DC_ZERO_COUNT_IN_DUALWORD d1, d0, d3
+ mov r1, #1
+ vdup.s16 d3, r1
+ DC_ZERO_COUNT_IN_DUALWORD d1, d0, d3
- vmov r0, r1, d0
- and r0, #0x07 // range [0~4]
- rsb r0, #4
+ vmov r0, r1, d0
+ and r0, #0x07 // range [0~4]
+ rsb r0, #4
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_neon
- vdup.s16 d3, r1
- mov r1, #32
- vld1.s16 {d0[0]}, [r0], r1 //rs[00]
- vld1.s16 {d0[1]}, [r0], r1 //rs[16]
- vld1.s16 {d0[2]}, [r0], r1 //rs[32]
- vld1.s16 {d0[3]}, [r0], r1 //rs[48]
+ vdup.s16 d3, r1
+ mov r1, #32
+ vld1.s16 {d0[0]}, [r0], r1 //rs[00]
+ vld1.s16 {d0[1]}, [r0], r1 //rs[16]
+ vld1.s16 {d0[2]}, [r0], r1 //rs[32]
+ vld1.s16 {d0[3]}, [r0], r1 //rs[48]
- HDM_QUANT_2x2_TOTAL_16BITS d0, d1, d2 // output d2
+ HDM_QUANT_2x2_TOTAL_16BITS d0, d1, d2 // output d2
- HDM_QUANT_2x2_TOTAL_16BITS d2, d1, d0 // output d0
+ HDM_QUANT_2x2_TOTAL_16BITS d2, d1, d0 // output d0
- vabs.s16 d1, d0
- vcgt.s16 d1, d1, d3 // abs(dct[i])>threshold;
- vmov r0, r1, d1
- orr r0, r1
+ vabs.s16 d1, d0
+ vcgt.s16 d1, d1, d3 // abs(dct[i])>threshold;
+ vmov r0, r1, d1
+ orr r0, r1
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsGetNoneZeroCount_neon
- push {r1}
- vld1.s16 {q0, q1}, [r0]
- vmov.s16 q8, #1
+ push {r1}
+ vld1.s16 {q0, q1}, [r0]
+ vmov.s16 q8, #1
- ZERO_COUNT_IN_2_QUARWORD q0, q1, q8, d0, d1, d2, d3
- vmov r0, r1, d0
- and r0, #0x1F // range [0~16]
- rsb r0, #16
- pop {r1}
+ ZERO_COUNT_IN_2_QUARWORD q0, q1, q8, d0, d1, d2, d3
+ vmov r0, r1, d0
+ and r0, #0x1F // range [0~16]
+ rsb r0, #16
+ pop {r1}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDequant4x4_neon
- vld1.s16 {q0, q1}, [r0]
- vld1.u16 {q2}, [r1]
+ vld1.s16 {q0, q1}, [r0]
+ vld1.u16 {q2}, [r1]
- vmul.s16 q8, q0, q2
- vmul.s16 q9, q1, q2
+ vmul.s16 q8, q0, q2
+ vmul.s16 q9, q1, q2
- vst1.s16 {q8, q9}, [r0]
+ vst1.s16 {q8, q9}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDequantFour4x4_neon
- vld1.u16 {q12}, [r1]
- mov r1, r0
- vld1.s16 {q0, q1}, [r0]!
- vld1.s16 {q2, q3}, [r0]!
- vmul.s16 q0, q0, q12
- vld1.s16 {q8, q9}, [r0]!
- vmul.s16 q1, q1, q12
- vld1.s16 {q10, q11}, [r0]!
+ vld1.u16 {q12}, [r1]
+ mov r1, r0
+ vld1.s16 {q0, q1}, [r0]!
+ vld1.s16 {q2, q3}, [r0]!
+ vmul.s16 q0, q0, q12
+ vld1.s16 {q8, q9}, [r0]!
+ vmul.s16 q1, q1, q12
+ vld1.s16 {q10, q11}, [r0]!
- vst1.s16 {q0, q1}, [r1]!
+ vst1.s16 {q0, q1}, [r1]!
- vmul.s16 q2, q2, q12
- vmul.s16 q3, q3, q12
- vmul.s16 q8, q8, q12
- vst1.s16 {q2, q3}, [r1]!
+ vmul.s16 q2, q2, q12
+ vmul.s16 q3, q3, q12
+ vmul.s16 q8, q8, q12
+ vst1.s16 {q2, q3}, [r1]!
- vmul.s16 q9, q9, q12
- vmul.s16 q10, q10, q12
- vmul.s16 q11, q11, q12
- vst1.s16 {q8, q9}, [r1]!
- vst1.s16 {q10, q11}, [r1]!
+ vmul.s16 q9, q9, q12
+ vmul.s16 q10, q10, q12
+ vmul.s16 q11, q11, q12
+ vst1.s16 {q8, q9}, [r1]!
+ vst1.s16 {q10, q11}, [r1]!
WELS_ASM_FUNC_END
@@ -893,258 +893,258 @@
WELS_ASM_FUNC_BEGIN WelsDequantIHadamard4x4_neon
- vld1.s16 {q0, q1}, [r0]
- vdup.s16 q8, r1
+ vld1.s16 {q0, q1}, [r0]
+ vdup.s16 q8, r1
- IHDM_4x4_TOTAL_16BITS q0, q2, q3
- IHDM_4x4_TOTAL_16BITS q1, q2, q3
+ IHDM_4x4_TOTAL_16BITS q0, q2, q3
+ IHDM_4x4_TOTAL_16BITS q1, q2, q3
- MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
+ MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
- IHDM_4x4_TOTAL_16BITS q0, q2, q3
- vmul.s16 q0, q8
+ IHDM_4x4_TOTAL_16BITS q0, q2, q3
+ vmul.s16 q0, q8
- IHDM_4x4_TOTAL_16BITS q1, q2, q3
- vmul.s16 q1, q8
+ IHDM_4x4_TOTAL_16BITS q1, q2, q3
+ vmul.s16 q1, q8
- MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
- vst1.s16 {q0, q1}, [r0]
+ MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
+ vst1.s16 {q0, q1}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIDctT4Rec_neon
- vld1.u32 {d16[0]}, [r2], r3
- push {r4}
- ldr r4, [sp, #4]
- vld1.u32 {d16[1]}, [r2], r3
+ vld1.u32 {d16[0]}, [r2], r3
+ push {r4}
+ ldr r4, [sp, #4]
+ vld1.u32 {d16[1]}, [r2], r3
- vld4.s16 {d0, d1, d2, d3}, [r4] // cost 3 cycles!
- vld1.u32 {d17[0]}, [r2], r3
- vld1.u32 {d17[1]}, [r2], r3 // q7 is pred
+ vld4.s16 {d0, d1, d2, d3}, [r4] // cost 3 cycles!
+ vld1.u32 {d17[0]}, [r2], r3
+ vld1.u32 {d17[1]}, [r2], r3 // q7 is pred
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
- TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+ TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
- MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
+ MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
- TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
- vrshr.s16 d0, d0, #6
- vrshr.s16 d1, d1, #6
- vrshr.s16 d2, d2, #6
- vrshr.s16 d3, d3, #6
+ TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+ vrshr.s16 d0, d0, #6
+ vrshr.s16 d1, d1, #6
+ vrshr.s16 d2, d2, #6
+ vrshr.s16 d3, d3, #6
- //after rounding 6, clip into [0, 255]
- vmovl.u8 q2,d16
- vadd.s16 q0,q2
- vqmovun.s16 d16,q0
- vst1.32 {d16[0]},[r0],r1
- vst1.32 {d16[1]},[r0],r1
+ //after rounding 6, clip into [0, 255]
+ vmovl.u8 q2,d16
+ vadd.s16 q0,q2
+ vqmovun.s16 d16,q0
+ vst1.32 {d16[0]},[r0],r1
+ vst1.32 {d16[1]},[r0],r1
- vmovl.u8 q2,d17
- vadd.s16 q1,q2
- vqmovun.s16 d17,q1
- vst1.32 {d17[0]},[r0],r1
- vst1.32 {d17[1]},[r0]
+ vmovl.u8 q2,d17
+ vadd.s16 q1,q2
+ vqmovun.s16 d17,q1
+ vst1.32 {d17[0]},[r0],r1
+ vst1.32 {d17[1]},[r0]
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIDctFourT4Rec_neon
- vld1.u64 {d24}, [r2], r3
- push {r4}
- ldr r4, [sp, #4]
- vld1.u64 {d25}, [r2], r3
+ vld1.u64 {d24}, [r2], r3
+ push {r4}
+ ldr r4, [sp, #4]
+ vld1.u64 {d25}, [r2], r3
- vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles!
- vld1.u64 {d26}, [r2], r3
- vld1.u64 {d27}, [r2], r3
- vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles!
- vswp d1, d4
- vswp d3, d6
- vswp q1, q2 // q0~q3
+ vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles!
+ vld1.u64 {d26}, [r2], r3
+ vld1.u64 {d27}, [r2], r3
+ vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles!
+ vswp d1, d4
+ vswp d3, d6
+ vswp q1, q2 // q0~q3
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- vrshr.s16 q0, q0, #6
- vrshr.s16 q1, q1, #6
- vrshr.s16 q2, q2, #6
- vrshr.s16 q3, q3, #6
+ TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ vrshr.s16 q0, q0, #6
+ vrshr.s16 q1, q1, #6
+ vrshr.s16 q2, q2, #6
+ vrshr.s16 q3, q3, #6
- //after rounding 6, clip into [0, 255]
- vmovl.u8 q8,d24
- vadd.s16 q0,q8
- vqmovun.s16 d24,q0
- vst1.u8 {d24},[r0],r1
+ //after rounding 6, clip into [0, 255]
+ vmovl.u8 q8,d24
+ vadd.s16 q0,q8
+ vqmovun.s16 d24,q0
+ vst1.u8 {d24},[r0],r1
- vmovl.u8 q8,d25
- vadd.s16 q1,q8
- vqmovun.s16 d25,q1
- vst1.u8 {d25},[r0],r1
+ vmovl.u8 q8,d25
+ vadd.s16 q1,q8
+ vqmovun.s16 d25,q1
+ vst1.u8 {d25},[r0],r1
- vmovl.u8 q8,d26
- vadd.s16 q2,q8
- vqmovun.s16 d26,q2
- vst1.u8 {d26},[r0],r1
+ vmovl.u8 q8,d26
+ vadd.s16 q2,q8
+ vqmovun.s16 d26,q2
+ vst1.u8 {d26},[r0],r1
- vmovl.u8 q8,d27
- vadd.s16 q3,q8
- vqmovun.s16 d27,q3
- vst1.u8 {d27},[r0],r1
+ vmovl.u8 q8,d27
+ vadd.s16 q3,q8
+ vqmovun.s16 d27,q3
+ vst1.u8 {d27},[r0],r1
- vld1.u64 {d24}, [r2], r3
- vld1.u64 {d25}, [r2], r3
+ vld1.u64 {d24}, [r2], r3
+ vld1.u64 {d25}, [r2], r3
- vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles!
- vld1.u64 {d26}, [r2], r3
- vld1.u64 {d27}, [r2], r3
- vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles!
- vswp d1, d4
- vswp d3, d6
- vswp q1, q2 // q0~q3
+ vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles!
+ vld1.u64 {d26}, [r2], r3
+ vld1.u64 {d27}, [r2], r3
+ vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles!
+ vswp d1, d4
+ vswp d3, d6
+ vswp q1, q2 // q0~q3
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
- vrshr.s16 q0, q0, #6
- vrshr.s16 q1, q1, #6
- vrshr.s16 q2, q2, #6
- vrshr.s16 q3, q3, #6
+ TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
+ vrshr.s16 q0, q0, #6
+ vrshr.s16 q1, q1, #6
+ vrshr.s16 q2, q2, #6
+ vrshr.s16 q3, q3, #6
- //after rounding 6, clip into [0, 255]
- vmovl.u8 q8,d24
- vadd.s16 q0,q8
- vqmovun.s16 d24,q0
- vst1.u8 {d24},[r0],r1
+ //after rounding 6, clip into [0, 255]
+ vmovl.u8 q8,d24
+ vadd.s16 q0,q8
+ vqmovun.s16 d24,q0
+ vst1.u8 {d24},[r0],r1
- vmovl.u8 q8,d25
- vadd.s16 q1,q8
- vqmovun.s16 d25,q1
- vst1.u8 {d25},[r0],r1
+ vmovl.u8 q8,d25
+ vadd.s16 q1,q8
+ vqmovun.s16 d25,q1
+ vst1.u8 {d25},[r0],r1
- vmovl.u8 q8,d26
- vadd.s16 q2,q8
- vqmovun.s16 d26,q2
- vst1.u8 {d26},[r0],r1
+ vmovl.u8 q8,d26
+ vadd.s16 q2,q8
+ vqmovun.s16 d26,q2
+ vst1.u8 {d26},[r0],r1
- vmovl.u8 q8,d27
- vadd.s16 q3,q8
- vqmovun.s16 d27,q3
- vst1.u8 {d27},[r0],r1
+ vmovl.u8 q8,d27
+ vadd.s16 q3,q8
+ vqmovun.s16 d27,q3
+ vst1.u8 {d27},[r0],r1
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIDctRecI16x16Dc_neon
- push {r4}
- ldr r4, [sp, #4]
+ push {r4}
+ ldr r4, [sp, #4]
- vld1.s16 {q8,q9}, [r4]
- vrshr.s16 q8, q8, #6
- vrshr.s16 q9, q9, #6
+ vld1.s16 {q8,q9}, [r4]
+ vrshr.s16 q8, q8, #6
+ vrshr.s16 q9, q9, #6
- vdup.s16 d20, d16[0]
- vdup.s16 d21, d16[1]
- vdup.s16 d22, d16[2]
- vdup.s16 d23, d16[3]
+ vdup.s16 d20, d16[0]
+ vdup.s16 d21, d16[1]
+ vdup.s16 d22, d16[2]
+ vdup.s16 d23, d16[3]
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vdup.s16 d20, d17[0]
- vdup.s16 d21, d17[1]
- vdup.s16 d22, d17[2]
- vdup.s16 d23, d17[3]
+ vdup.s16 d20, d17[0]
+ vdup.s16 d21, d17[1]
+ vdup.s16 d22, d17[2]
+ vdup.s16 d23, d17[3]
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vdup.s16 d20, d18[0]
- vdup.s16 d21, d18[1]
- vdup.s16 d22, d18[2]
- vdup.s16 d23, d18[3]
+ vdup.s16 d20, d18[0]
+ vdup.s16 d21, d18[1]
+ vdup.s16 d22, d18[2]
+ vdup.s16 d23, d18[3]
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vdup.s16 d20, d19[0]
- vdup.s16 d21, d19[1]
- vdup.s16 d22, d19[2]
- vdup.s16 d23, d19[3]
+ vdup.s16 d20, d19[0]
+ vdup.s16 d21, d19[1]
+ vdup.s16 d22, d19[2]
+ vdup.s16 d23, d19[3]
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
- pop {r4}
+ pop {r4}
WELS_ASM_FUNC_END
#endif
--- a/codec/encoder/core/x86/coeff.asm
+++ b/codec/encoder/core/x86/coeff.asm
@@ -55,262 +55,262 @@
align 16
byte_1pos_table:
- db 0,0,0,0,0,0,0,0, ;0
- db 0,0,0,0,0,0,0,1, ;1
- db 1,0,0,0,0,0,0,1, ;2
- db 1,0,0,0,0,0,0,2, ;3
- db 2,0,0,0,0,0,0,1, ;4
- db 2,0,0,0,0,0,0,2, ;5
- db 2,1,0,0,0,0,0,2, ;6
- db 2,1,0,0,0,0,0,3, ;7
- db 3,0,0,0,0,0,0,1, ;8
- db 3,0,0,0,0,0,0,2, ;9
- db 3,1,0,0,0,0,0,2, ;10
- db 3,1,0,0,0,0,0,3, ;11
- db 3,2,0,0,0,0,0,2, ;12
- db 3,2,0,0,0,0,0,3, ;13
- db 3,2,1,0,0,0,0,3, ;14
- db 3,2,1,0,0,0,0,4, ;15
- db 4,0,0,0,0,0,0,1, ;16
- db 4,0,0,0,0,0,0,2, ;17
- db 4,1,0,0,0,0,0,2, ;18
- db 4,1,0,0,0,0,0,3, ;19
- db 4,2,0,0,0,0,0,2, ;20
- db 4,2,0,0,0,0,0,3, ;21
- db 4,2,1,0,0,0,0,3, ;22
- db 4,2,1,0,0,0,0,4, ;23
- db 4,3,0,0,0,0,0,2, ;24
- db 4,3,0,0,0,0,0,3, ;25
- db 4,3,1,0,0,0,0,3, ;26
- db 4,3,1,0,0,0,0,4, ;27
- db 4,3,2,0,0,0,0,3, ;28
- db 4,3,2,0,0,0,0,4, ;29
- db 4,3,2,1,0,0,0,4, ;30
- db 4,3,2,1,0,0,0,5, ;31
- db 5,0,0,0,0,0,0,1, ;32
- db 5,0,0,0,0,0,0,2, ;33
- db 5,1,0,0,0,0,0,2, ;34
- db 5,1,0,0,0,0,0,3, ;35
- db 5,2,0,0,0,0,0,2, ;36
- db 5,2,0,0,0,0,0,3, ;37
- db 5,2,1,0,0,0,0,3, ;38
- db 5,2,1,0,0,0,0,4, ;39
- db 5,3,0,0,0,0,0,2, ;40
- db 5,3,0,0,0,0,0,3, ;41
- db 5,3,1,0,0,0,0,3, ;42
- db 5,3,1,0,0,0,0,4, ;43
- db 5,3,2,0,0,0,0,3, ;44
- db 5,3,2,0,0,0,0,4, ;45
- db 5,3,2,1,0,0,0,4, ;46
- db 5,3,2,1,0,0,0,5, ;47
- db 5,4,0,0,0,0,0,2, ;48
- db 5,4,0,0,0,0,0,3, ;49
- db 5,4,1,0,0,0,0,3, ;50
- db 5,4,1,0,0,0,0,4, ;51
- db 5,4,2,0,0,0,0,3, ;52
- db 5,4,2,0,0,0,0,4, ;53
- db 5,4,2,1,0,0,0,4, ;54
- db 5,4,2,1,0,0,0,5, ;55
- db 5,4,3,0,0,0,0,3, ;56
- db 5,4,3,0,0,0,0,4, ;57
- db 5,4,3,1,0,0,0,4, ;58
- db 5,4,3,1,0,0,0,5, ;59
- db 5,4,3,2,0,0,0,4, ;60
- db 5,4,3,2,0,0,0,5, ;61
- db 5,4,3,2,1,0,0,5, ;62
- db 5,4,3,2,1,0,0,6, ;63
- db 6,0,0,0,0,0,0,1, ;64
- db 6,0,0,0,0,0,0,2, ;65
- db 6,1,0,0,0,0,0,2, ;66
- db 6,1,0,0,0,0,0,3, ;67
- db 6,2,0,0,0,0,0,2, ;68
- db 6,2,0,0,0,0,0,3, ;69
- db 6,2,1,0,0,0,0,3, ;70
- db 6,2,1,0,0,0,0,4, ;71
- db 6,3,0,0,0,0,0,2, ;72
- db 6,3,0,0,0,0,0,3, ;73
- db 6,3,1,0,0,0,0,3, ;74
- db 6,3,1,0,0,0,0,4, ;75
- db 6,3,2,0,0,0,0,3, ;76
- db 6,3,2,0,0,0,0,4, ;77
- db 6,3,2,1,0,0,0,4, ;78
- db 6,3,2,1,0,0,0,5, ;79
- db 6,4,0,0,0,0,0,2, ;80
- db 6,4,0,0,0,0,0,3, ;81
- db 6,4,1,0,0,0,0,3, ;82
- db 6,4,1,0,0,0,0,4, ;83
- db 6,4,2,0,0,0,0,3, ;84
- db 6,4,2,0,0,0,0,4, ;85
- db 6,4,2,1,0,0,0,4, ;86
- db 6,4,2,1,0,0,0,5, ;87
- db 6,4,3,0,0,0,0,3, ;88
- db 6,4,3,0,0,0,0,4, ;89
- db 6,4,3,1,0,0,0,4, ;90
- db 6,4,3,1,0,0,0,5, ;91
- db 6,4,3,2,0,0,0,4, ;92
- db 6,4,3,2,0,0,0,5, ;93
- db 6,4,3,2,1,0,0,5, ;94
- db 6,4,3,2,1,0,0,6, ;95
- db 6,5,0,0,0,0,0,2, ;96
- db 6,5,0,0,0,0,0,3, ;97
- db 6,5,1,0,0,0,0,3, ;98
- db 6,5,1,0,0,0,0,4, ;99
- db 6,5,2,0,0,0,0,3, ;100
- db 6,5,2,0,0,0,0,4, ;101
- db 6,5,2,1,0,0,0,4, ;102
- db 6,5,2,1,0,0,0,5, ;103
- db 6,5,3,0,0,0,0,3, ;104
- db 6,5,3,0,0,0,0,4, ;105
- db 6,5,3,1,0,0,0,4, ;106
- db 6,5,3,1,0,0,0,5, ;107
- db 6,5,3,2,0,0,0,4, ;108
- db 6,5,3,2,0,0,0,5, ;109
- db 6,5,3,2,1,0,0,5, ;110
- db 6,5,3,2,1,0,0,6, ;111
- db 6,5,4,0,0,0,0,3, ;112
- db 6,5,4,0,0,0,0,4, ;113
- db 6,5,4,1,0,0,0,4, ;114
- db 6,5,4,1,0,0,0,5, ;115
- db 6,5,4,2,0,0,0,4, ;116
- db 6,5,4,2,0,0,0,5, ;117
- db 6,5,4,2,1,0,0,5, ;118
- db 6,5,4,2,1,0,0,6, ;119
- db 6,5,4,3,0,0,0,4, ;120
- db 6,5,4,3,0,0,0,5, ;121
- db 6,5,4,3,1,0,0,5, ;122
- db 6,5,4,3,1,0,0,6, ;123
- db 6,5,4,3,2,0,0,5, ;124
- db 6,5,4,3,2,0,0,6, ;125
- db 6,5,4,3,2,1,0,6, ;126
- db 6,5,4,3,2,1,0,7, ;127
- db 7,0,0,0,0,0,0,1, ;128
- db 7,0,0,0,0,0,0,2, ;129
- db 7,1,0,0,0,0,0,2, ;130
- db 7,1,0,0,0,0,0,3, ;131
- db 7,2,0,0,0,0,0,2, ;132
- db 7,2,0,0,0,0,0,3, ;133
- db 7,2,1,0,0,0,0,3, ;134
- db 7,2,1,0,0,0,0,4, ;135
- db 7,3,0,0,0,0,0,2, ;136
- db 7,3,0,0,0,0,0,3, ;137
- db 7,3,1,0,0,0,0,3, ;138
- db 7,3,1,0,0,0,0,4, ;139
- db 7,3,2,0,0,0,0,3, ;140
- db 7,3,2,0,0,0,0,4, ;141
- db 7,3,2,1,0,0,0,4, ;142
- db 7,3,2,1,0,0,0,5, ;143
- db 7,4,0,0,0,0,0,2, ;144
- db 7,4,0,0,0,0,0,3, ;145
- db 7,4,1,0,0,0,0,3, ;146
- db 7,4,1,0,0,0,0,4, ;147
- db 7,4,2,0,0,0,0,3, ;148
- db 7,4,2,0,0,0,0,4, ;149
- db 7,4,2,1,0,0,0,4, ;150
- db 7,4,2,1,0,0,0,5, ;151
- db 7,4,3,0,0,0,0,3, ;152
- db 7,4,3,0,0,0,0,4, ;153
- db 7,4,3,1,0,0,0,4, ;154
- db 7,4,3,1,0,0,0,5, ;155
- db 7,4,3,2,0,0,0,4, ;156
- db 7,4,3,2,0,0,0,5, ;157
- db 7,4,3,2,1,0,0,5, ;158
- db 7,4,3,2,1,0,0,6, ;159
- db 7,5,0,0,0,0,0,2, ;160
- db 7,5,0,0,0,0,0,3, ;161
- db 7,5,1,0,0,0,0,3, ;162
- db 7,5,1,0,0,0,0,4, ;163
- db 7,5,2,0,0,0,0,3, ;164
- db 7,5,2,0,0,0,0,4, ;165
- db 7,5,2,1,0,0,0,4, ;166
- db 7,5,2,1,0,0,0,5, ;167
- db 7,5,3,0,0,0,0,3, ;168
- db 7,5,3,0,0,0,0,4, ;169
- db 7,5,3,1,0,0,0,4, ;170
- db 7,5,3,1,0,0,0,5, ;171
- db 7,5,3,2,0,0,0,4, ;172
- db 7,5,3,2,0,0,0,5, ;173
- db 7,5,3,2,1,0,0,5, ;174
- db 7,5,3,2,1,0,0,6, ;175
- db 7,5,4,0,0,0,0,3, ;176
- db 7,5,4,0,0,0,0,4, ;177
- db 7,5,4,1,0,0,0,4, ;178
- db 7,5,4,1,0,0,0,5, ;179
- db 7,5,4,2,0,0,0,4, ;180
- db 7,5,4,2,0,0,0,5, ;181
- db 7,5,4,2,1,0,0,5, ;182
- db 7,5,4,2,1,0,0,6, ;183
- db 7,5,4,3,0,0,0,4, ;184
- db 7,5,4,3,0,0,0,5, ;185
- db 7,5,4,3,1,0,0,5, ;186
- db 7,5,4,3,1,0,0,6, ;187
- db 7,5,4,3,2,0,0,5, ;188
- db 7,5,4,3,2,0,0,6, ;189
- db 7,5,4,3,2,1,0,6, ;190
- db 7,5,4,3,2,1,0,7, ;191
- db 7,6,0,0,0,0,0,2, ;192
- db 7,6,0,0,0,0,0,3, ;193
- db 7,6,1,0,0,0,0,3, ;194
- db 7,6,1,0,0,0,0,4, ;195
- db 7,6,2,0,0,0,0,3, ;196
- db 7,6,2,0,0,0,0,4, ;197
- db 7,6,2,1,0,0,0,4, ;198
- db 7,6,2,1,0,0,0,5, ;199
- db 7,6,3,0,0,0,0,3, ;200
- db 7,6,3,0,0,0,0,4, ;201
- db 7,6,3,1,0,0,0,4, ;202
- db 7,6,3,1,0,0,0,5, ;203
- db 7,6,3,2,0,0,0,4, ;204
- db 7,6,3,2,0,0,0,5, ;205
- db 7,6,3,2,1,0,0,5, ;206
- db 7,6,3,2,1,0,0,6, ;207
- db 7,6,4,0,0,0,0,3, ;208
- db 7,6,4,0,0,0,0,4, ;209
- db 7,6,4,1,0,0,0,4, ;210
- db 7,6,4,1,0,0,0,5, ;211
- db 7,6,4,2,0,0,0,4, ;212
- db 7,6,4,2,0,0,0,5, ;213
- db 7,6,4,2,1,0,0,5, ;214
- db 7,6,4,2,1,0,0,6, ;215
- db 7,6,4,3,0,0,0,4, ;216
- db 7,6,4,3,0,0,0,5, ;217
- db 7,6,4,3,1,0,0,5, ;218
- db 7,6,4,3,1,0,0,6, ;219
- db 7,6,4,3,2,0,0,5, ;220
- db 7,6,4,3,2,0,0,6, ;221
- db 7,6,4,3,2,1,0,6, ;222
- db 7,6,4,3,2,1,0,7, ;223
- db 7,6,5,0,0,0,0,3, ;224
- db 7,6,5,0,0,0,0,4, ;225
- db 7,6,5,1,0,0,0,4, ;226
- db 7,6,5,1,0,0,0,5, ;227
- db 7,6,5,2,0,0,0,4, ;228
- db 7,6,5,2,0,0,0,5, ;229
- db 7,6,5,2,1,0,0,5, ;230
- db 7,6,5,2,1,0,0,6, ;231
- db 7,6,5,3,0,0,0,4, ;232
- db 7,6,5,3,0,0,0,5, ;233
- db 7,6,5,3,1,0,0,5, ;234
- db 7,6,5,3,1,0,0,6, ;235
- db 7,6,5,3,2,0,0,5, ;236
- db 7,6,5,3,2,0,0,6, ;237
- db 7,6,5,3,2,1,0,6, ;238
- db 7,6,5,3,2,1,0,7, ;239
- db 7,6,5,4,0,0,0,4, ;240
- db 7,6,5,4,0,0,0,5, ;241
- db 7,6,5,4,1,0,0,5, ;242
- db 7,6,5,4,1,0,0,6, ;243
- db 7,6,5,4,2,0,0,5, ;244
- db 7,6,5,4,2,0,0,6, ;245
- db 7,6,5,4,2,1,0,6, ;246
- db 7,6,5,4,2,1,0,7, ;247
- db 7,6,5,4,3,0,0,5, ;248
- db 7,6,5,4,3,0,0,6, ;249
- db 7,6,5,4,3,1,0,6, ;250
- db 7,6,5,4,3,1,0,7, ;251
- db 7,6,5,4,3,2,0,6, ;252
- db 7,6,5,4,3,2,0,7, ;253
- db 7,6,5,4,3,2,1,7, ;254
- db 7,6,5,4,3,2,1,8, ;255
+ db 0,0,0,0,0,0,0,0, ;0
+ db 0,0,0,0,0,0,0,1, ;1
+ db 1,0,0,0,0,0,0,1, ;2
+ db 1,0,0,0,0,0,0,2, ;3
+ db 2,0,0,0,0,0,0,1, ;4
+ db 2,0,0,0,0,0,0,2, ;5
+ db 2,1,0,0,0,0,0,2, ;6
+ db 2,1,0,0,0,0,0,3, ;7
+ db 3,0,0,0,0,0,0,1, ;8
+ db 3,0,0,0,0,0,0,2, ;9
+ db 3,1,0,0,0,0,0,2, ;10
+ db 3,1,0,0,0,0,0,3, ;11
+ db 3,2,0,0,0,0,0,2, ;12
+ db 3,2,0,0,0,0,0,3, ;13
+ db 3,2,1,0,0,0,0,3, ;14
+ db 3,2,1,0,0,0,0,4, ;15
+ db 4,0,0,0,0,0,0,1, ;16
+ db 4,0,0,0,0,0,0,2, ;17
+ db 4,1,0,0,0,0,0,2, ;18
+ db 4,1,0,0,0,0,0,3, ;19
+ db 4,2,0,0,0,0,0,2, ;20
+ db 4,2,0,0,0,0,0,3, ;21
+ db 4,2,1,0,0,0,0,3, ;22
+ db 4,2,1,0,0,0,0,4, ;23
+ db 4,3,0,0,0,0,0,2, ;24
+ db 4,3,0,0,0,0,0,3, ;25
+ db 4,3,1,0,0,0,0,3, ;26
+ db 4,3,1,0,0,0,0,4, ;27
+ db 4,3,2,0,0,0,0,3, ;28
+ db 4,3,2,0,0,0,0,4, ;29
+ db 4,3,2,1,0,0,0,4, ;30
+ db 4,3,2,1,0,0,0,5, ;31
+ db 5,0,0,0,0,0,0,1, ;32
+ db 5,0,0,0,0,0,0,2, ;33
+ db 5,1,0,0,0,0,0,2, ;34
+ db 5,1,0,0,0,0,0,3, ;35
+ db 5,2,0,0,0,0,0,2, ;36
+ db 5,2,0,0,0,0,0,3, ;37
+ db 5,2,1,0,0,0,0,3, ;38
+ db 5,2,1,0,0,0,0,4, ;39
+ db 5,3,0,0,0,0,0,2, ;40
+ db 5,3,0,0,0,0,0,3, ;41
+ db 5,3,1,0,0,0,0,3, ;42
+ db 5,3,1,0,0,0,0,4, ;43
+ db 5,3,2,0,0,0,0,3, ;44
+ db 5,3,2,0,0,0,0,4, ;45
+ db 5,3,2,1,0,0,0,4, ;46
+ db 5,3,2,1,0,0,0,5, ;47
+ db 5,4,0,0,0,0,0,2, ;48
+ db 5,4,0,0,0,0,0,3, ;49
+ db 5,4,1,0,0,0,0,3, ;50
+ db 5,4,1,0,0,0,0,4, ;51
+ db 5,4,2,0,0,0,0,3, ;52
+ db 5,4,2,0,0,0,0,4, ;53
+ db 5,4,2,1,0,0,0,4, ;54
+ db 5,4,2,1,0,0,0,5, ;55
+ db 5,4,3,0,0,0,0,3, ;56
+ db 5,4,3,0,0,0,0,4, ;57
+ db 5,4,3,1,0,0,0,4, ;58
+ db 5,4,3,1,0,0,0,5, ;59
+ db 5,4,3,2,0,0,0,4, ;60
+ db 5,4,3,2,0,0,0,5, ;61
+ db 5,4,3,2,1,0,0,5, ;62
+ db 5,4,3,2,1,0,0,6, ;63
+ db 6,0,0,0,0,0,0,1, ;64
+ db 6,0,0,0,0,0,0,2, ;65
+ db 6,1,0,0,0,0,0,2, ;66
+ db 6,1,0,0,0,0,0,3, ;67
+ db 6,2,0,0,0,0,0,2, ;68
+ db 6,2,0,0,0,0,0,3, ;69
+ db 6,2,1,0,0,0,0,3, ;70
+ db 6,2,1,0,0,0,0,4, ;71
+ db 6,3,0,0,0,0,0,2, ;72
+ db 6,3,0,0,0,0,0,3, ;73
+ db 6,3,1,0,0,0,0,3, ;74
+ db 6,3,1,0,0,0,0,4, ;75
+ db 6,3,2,0,0,0,0,3, ;76
+ db 6,3,2,0,0,0,0,4, ;77
+ db 6,3,2,1,0,0,0,4, ;78
+ db 6,3,2,1,0,0,0,5, ;79
+ db 6,4,0,0,0,0,0,2, ;80
+ db 6,4,0,0,0,0,0,3, ;81
+ db 6,4,1,0,0,0,0,3, ;82
+ db 6,4,1,0,0,0,0,4, ;83
+ db 6,4,2,0,0,0,0,3, ;84
+ db 6,4,2,0,0,0,0,4, ;85
+ db 6,4,2,1,0,0,0,4, ;86
+ db 6,4,2,1,0,0,0,5, ;87
+ db 6,4,3,0,0,0,0,3, ;88
+ db 6,4,3,0,0,0,0,4, ;89
+ db 6,4,3,1,0,0,0,4, ;90
+ db 6,4,3,1,0,0,0,5, ;91
+ db 6,4,3,2,0,0,0,4, ;92
+ db 6,4,3,2,0,0,0,5, ;93
+ db 6,4,3,2,1,0,0,5, ;94
+ db 6,4,3,2,1,0,0,6, ;95
+ db 6,5,0,0,0,0,0,2, ;96
+ db 6,5,0,0,0,0,0,3, ;97
+ db 6,5,1,0,0,0,0,3, ;98
+ db 6,5,1,0,0,0,0,4, ;99
+ db 6,5,2,0,0,0,0,3, ;100
+ db 6,5,2,0,0,0,0,4, ;101
+ db 6,5,2,1,0,0,0,4, ;102
+ db 6,5,2,1,0,0,0,5, ;103
+ db 6,5,3,0,0,0,0,3, ;104
+ db 6,5,3,0,0,0,0,4, ;105
+ db 6,5,3,1,0,0,0,4, ;106
+ db 6,5,3,1,0,0,0,5, ;107
+ db 6,5,3,2,0,0,0,4, ;108
+ db 6,5,3,2,0,0,0,5, ;109
+ db 6,5,3,2,1,0,0,5, ;110
+ db 6,5,3,2,1,0,0,6, ;111
+ db 6,5,4,0,0,0,0,3, ;112
+ db 6,5,4,0,0,0,0,4, ;113
+ db 6,5,4,1,0,0,0,4, ;114
+ db 6,5,4,1,0,0,0,5, ;115
+ db 6,5,4,2,0,0,0,4, ;116
+ db 6,5,4,2,0,0,0,5, ;117
+ db 6,5,4,2,1,0,0,5, ;118
+ db 6,5,4,2,1,0,0,6, ;119
+ db 6,5,4,3,0,0,0,4, ;120
+ db 6,5,4,3,0,0,0,5, ;121
+ db 6,5,4,3,1,0,0,5, ;122
+ db 6,5,4,3,1,0,0,6, ;123
+ db 6,5,4,3,2,0,0,5, ;124
+ db 6,5,4,3,2,0,0,6, ;125
+ db 6,5,4,3,2,1,0,6, ;126
+ db 6,5,4,3,2,1,0,7, ;127
+ db 7,0,0,0,0,0,0,1, ;128
+ db 7,0,0,0,0,0,0,2, ;129
+ db 7,1,0,0,0,0,0,2, ;130
+ db 7,1,0,0,0,0,0,3, ;131
+ db 7,2,0,0,0,0,0,2, ;132
+ db 7,2,0,0,0,0,0,3, ;133
+ db 7,2,1,0,0,0,0,3, ;134
+ db 7,2,1,0,0,0,0,4, ;135
+ db 7,3,0,0,0,0,0,2, ;136
+ db 7,3,0,0,0,0,0,3, ;137
+ db 7,3,1,0,0,0,0,3, ;138
+ db 7,3,1,0,0,0,0,4, ;139
+ db 7,3,2,0,0,0,0,3, ;140
+ db 7,3,2,0,0,0,0,4, ;141
+ db 7,3,2,1,0,0,0,4, ;142
+ db 7,3,2,1,0,0,0,5, ;143
+ db 7,4,0,0,0,0,0,2, ;144
+ db 7,4,0,0,0,0,0,3, ;145
+ db 7,4,1,0,0,0,0,3, ;146
+ db 7,4,1,0,0,0,0,4, ;147
+ db 7,4,2,0,0,0,0,3, ;148
+ db 7,4,2,0,0,0,0,4, ;149
+ db 7,4,2,1,0,0,0,4, ;150
+ db 7,4,2,1,0,0,0,5, ;151
+ db 7,4,3,0,0,0,0,3, ;152
+ db 7,4,3,0,0,0,0,4, ;153
+ db 7,4,3,1,0,0,0,4, ;154
+ db 7,4,3,1,0,0,0,5, ;155
+ db 7,4,3,2,0,0,0,4, ;156
+ db 7,4,3,2,0,0,0,5, ;157
+ db 7,4,3,2,1,0,0,5, ;158
+ db 7,4,3,2,1,0,0,6, ;159
+ db 7,5,0,0,0,0,0,2, ;160
+ db 7,5,0,0,0,0,0,3, ;161
+ db 7,5,1,0,0,0,0,3, ;162
+ db 7,5,1,0,0,0,0,4, ;163
+ db 7,5,2,0,0,0,0,3, ;164
+ db 7,5,2,0,0,0,0,4, ;165
+ db 7,5,2,1,0,0,0,4, ;166
+ db 7,5,2,1,0,0,0,5, ;167
+ db 7,5,3,0,0,0,0,3, ;168
+ db 7,5,3,0,0,0,0,4, ;169
+ db 7,5,3,1,0,0,0,4, ;170
+ db 7,5,3,1,0,0,0,5, ;171
+ db 7,5,3,2,0,0,0,4, ;172
+ db 7,5,3,2,0,0,0,5, ;173
+ db 7,5,3,2,1,0,0,5, ;174
+ db 7,5,3,2,1,0,0,6, ;175
+ db 7,5,4,0,0,0,0,3, ;176
+ db 7,5,4,0,0,0,0,4, ;177
+ db 7,5,4,1,0,0,0,4, ;178
+ db 7,5,4,1,0,0,0,5, ;179
+ db 7,5,4,2,0,0,0,4, ;180
+ db 7,5,4,2,0,0,0,5, ;181
+ db 7,5,4,2,1,0,0,5, ;182
+ db 7,5,4,2,1,0,0,6, ;183
+ db 7,5,4,3,0,0,0,4, ;184
+ db 7,5,4,3,0,0,0,5, ;185
+ db 7,5,4,3,1,0,0,5, ;186
+ db 7,5,4,3,1,0,0,6, ;187
+ db 7,5,4,3,2,0,0,5, ;188
+ db 7,5,4,3,2,0,0,6, ;189
+ db 7,5,4,3,2,1,0,6, ;190
+ db 7,5,4,3,2,1,0,7, ;191
+ db 7,6,0,0,0,0,0,2, ;192
+ db 7,6,0,0,0,0,0,3, ;193
+ db 7,6,1,0,0,0,0,3, ;194
+ db 7,6,1,0,0,0,0,4, ;195
+ db 7,6,2,0,0,0,0,3, ;196
+ db 7,6,2,0,0,0,0,4, ;197
+ db 7,6,2,1,0,0,0,4, ;198
+ db 7,6,2,1,0,0,0,5, ;199
+ db 7,6,3,0,0,0,0,3, ;200
+ db 7,6,3,0,0,0,0,4, ;201
+ db 7,6,3,1,0,0,0,4, ;202
+ db 7,6,3,1,0,0,0,5, ;203
+ db 7,6,3,2,0,0,0,4, ;204
+ db 7,6,3,2,0,0,0,5, ;205
+ db 7,6,3,2,1,0,0,5, ;206
+ db 7,6,3,2,1,0,0,6, ;207
+ db 7,6,4,0,0,0,0,3, ;208
+ db 7,6,4,0,0,0,0,4, ;209
+ db 7,6,4,1,0,0,0,4, ;210
+ db 7,6,4,1,0,0,0,5, ;211
+ db 7,6,4,2,0,0,0,4, ;212
+ db 7,6,4,2,0,0,0,5, ;213
+ db 7,6,4,2,1,0,0,5, ;214
+ db 7,6,4,2,1,0,0,6, ;215
+ db 7,6,4,3,0,0,0,4, ;216
+ db 7,6,4,3,0,0,0,5, ;217
+ db 7,6,4,3,1,0,0,5, ;218
+ db 7,6,4,3,1,0,0,6, ;219
+ db 7,6,4,3,2,0,0,5, ;220
+ db 7,6,4,3,2,0,0,6, ;221
+ db 7,6,4,3,2,1,0,6, ;222
+ db 7,6,4,3,2,1,0,7, ;223
+ db 7,6,5,0,0,0,0,3, ;224
+ db 7,6,5,0,0,0,0,4, ;225
+ db 7,6,5,1,0,0,0,4, ;226
+ db 7,6,5,1,0,0,0,5, ;227
+ db 7,6,5,2,0,0,0,4, ;228
+ db 7,6,5,2,0,0,0,5, ;229
+ db 7,6,5,2,1,0,0,5, ;230
+ db 7,6,5,2,1,0,0,6, ;231
+ db 7,6,5,3,0,0,0,4, ;232
+ db 7,6,5,3,0,0,0,5, ;233
+ db 7,6,5,3,1,0,0,5, ;234
+ db 7,6,5,3,1,0,0,6, ;235
+ db 7,6,5,3,2,0,0,5, ;236
+ db 7,6,5,3,2,0,0,6, ;237
+ db 7,6,5,3,2,1,0,6, ;238
+ db 7,6,5,3,2,1,0,7, ;239
+ db 7,6,5,4,0,0,0,4, ;240
+ db 7,6,5,4,0,0,0,5, ;241
+ db 7,6,5,4,1,0,0,5, ;242
+ db 7,6,5,4,1,0,0,6, ;243
+ db 7,6,5,4,2,0,0,5, ;244
+ db 7,6,5,4,2,0,0,6, ;245
+ db 7,6,5,4,2,1,0,6, ;246
+ db 7,6,5,4,2,1,0,7, ;247
+ db 7,6,5,4,3,0,0,5, ;248
+ db 7,6,5,4,3,0,0,6, ;249
+ db 7,6,5,4,3,1,0,6, ;250
+ db 7,6,5,4,3,1,0,7, ;251
+ db 7,6,5,4,3,2,0,6, ;252
+ db 7,6,5,4,3,2,0,7, ;253
+ db 7,6,5,4,3,2,1,7, ;254
+ db 7,6,5,4,3,2,1,8, ;255
;***********************************************************************
; Code
@@ -323,43 +323,43 @@
;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;***********************************************************************
WELS_EXTERN CavlcParamCal_sse2
- push ebx
- push edi
- push esi
+ push ebx
+ push edi
+ push esi
- mov eax, [esp+16] ;coffLevel
- mov edi, [esp+24] ;Level
- mov ebx, [esp+32] ;endIdx
- cmp ebx, 3
- jne .Level16
- pxor xmm1, xmm1
- movq xmm0, [eax] ; removed QWORD
- jmp .Cal_begin
+ mov eax, [esp+16] ;coffLevel
+ mov edi, [esp+24] ;Level
+ mov ebx, [esp+32] ;endIdx
+ cmp ebx, 3
+ jne .Level16
+ pxor xmm1, xmm1
+ movq xmm0, [eax] ; removed QWORD
+ jmp .Cal_begin
.Level16:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax+16]
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax+16]
.Cal_begin:
- movdqa xmm2, xmm0
- packsswb xmm0, xmm1
- movdqa xmm4, xmm0
- pxor xmm3, xmm3
- pcmpgtb xmm0, xmm3
- pcmpgtb xmm3, xmm4
- por xmm0, xmm3
- pmovmskb edx, xmm0
- cmp edx, 0
- je near .return
- movdqa xmm6, [sse2_b_1]
- pcmpeqw xmm7, xmm7 ;generate -1
- mov ebx, 0xff
- ;pinsrw xmm6, ebx, 3
+ movdqa xmm2, xmm0
+ packsswb xmm0, xmm1
+ movdqa xmm4, xmm0
+ pxor xmm3, xmm3
+ pcmpgtb xmm0, xmm3
+ pcmpgtb xmm3, xmm4
+ por xmm0, xmm3
+ pmovmskb edx, xmm0
+ cmp edx, 0
+ je near .return
+ movdqa xmm6, [sse2_b_1]
+ pcmpeqw xmm7, xmm7 ;generate -1
+ mov ebx, 0xff
+ ;pinsrw xmm6, ebx, 3
mov bl, dh
- lea ebx, [byte_1pos_table+8*ebx]
- movq xmm0, [ebx]
- pextrw ecx, xmm0, 3
- shr ecx, 8
+ lea ebx, [byte_1pos_table+8*ebx]
+ movq xmm0, [ebx]
+ pextrw ecx, xmm0, 3
+ shr ecx, 8
mov dh, cl
.loopHighFind0:
@@ -367,19 +367,19 @@
je .loopHighFind0End
;mov esi, [ebx]
;and esi, 0xff
- movzx esi, byte [ebx]
+ movzx esi, byte [ebx]
add esi, 8
mov esi, [eax+2*esi]
mov [edi], si
add edi, 2
;add ebx, 1
- inc ebx
+ inc ebx
dec ecx
- jmp .loopHighFind0
+ jmp .loopHighFind0
.loopHighFind0End:
mov cl, dh
cmp cl, 8
- pand xmm0, xmm6
+ pand xmm0, xmm6
jne .LowByteFind0
sub edi, 2
mov esi, [eax+16]
@@ -387,8 +387,8 @@
add edi, 2
.LowByteFind0:
and edx, 0xff
- lea ebx, [byte_1pos_table+8*edx]
- movq xmm1, [ebx]
+ lea ebx, [byte_1pos_table+8*edx]
+ movq xmm1, [ebx]
pextrw esi, xmm1, 3
or esi, 0xff
or ecx, 0xff00
@@ -398,16 +398,16 @@
.loopLowFind0:
cmp esi, 0
je .loopLowFind0End
- ;mov edx, [ebx]
- ;and edx, 0xff
- movzx edx, byte [ebx]
- mov edx, [eax+2*edx]
- mov [edi], dx
- add edi, 2
- ;add ebx, 1
- inc ebx
+ ;mov edx, [ebx]
+ ;and edx, 0xff
+ movzx edx, byte [ebx]
+ mov edx, [eax+2*edx]
+ mov [edi], dx
+ add edi, 2
+ ;add ebx, 1
+ inc ebx
dec esi
- jmp .loopLowFind0
+ jmp .loopLowFind0
.loopLowFind0End:
cmp ch, 8
jne .getLevelEnd
@@ -415,12 +415,12 @@
mov edx, [eax]
mov [edi], dx
.getLevelEnd:
- mov edx, [esp+28] ;total_coeffs
+ mov edx, [esp+28] ;total_coeffs
;mov ebx, ecx
;and ebx, 0xff
- movzx ebx, byte cl
+ movzx ebx, byte cl
add cl, ch
- mov [edx], cl
+ mov [edx], cl
;getRun
movq xmm5, [sse2_b8]
paddb xmm0, xmm5
@@ -430,7 +430,7 @@
sub eax, ebx
shl eax, 3
shl ebx, 3
- pinsrw xmm2, ebx, 0
+ pinsrw xmm2, ebx, 0
pinsrw xmm3, eax, 0
psllq xmm0, xmm3
psrlq xmm0, xmm3
@@ -441,19 +441,19 @@
por xmm0, xmm1
pextrw eax, xmm0, 0
- and eax, 0xff
+ and eax, 0xff
inc eax
sub al, cl
- movdqa xmm1, xmm0
- paddb xmm1, xmm7
- psrldq xmm0, 1
- psubb xmm1, xmm0
+ movdqa xmm1, xmm0
+ paddb xmm1, xmm7
+ psrldq xmm0, 1
+ psubb xmm1, xmm0
mov ecx, [esp+20] ;run
- movdqa [ecx], xmm1
+ movdqa [ecx], xmm1
;getRunEnd
.return:
- pop esi
- pop edi
- pop ebx
- ret
+ pop esi
+ pop edi
+ pop ebx
+ ret
%endif
--- a/codec/encoder/core/x86/dct.asm
+++ b/codec/encoder/core/x86/dct.asm
@@ -50,17 +50,17 @@
align 16
SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
- dw 10, 13, 10, 13, 13, 16, 13, 16,
+ dw 10, 13, 10, 13, 13, 16, 13, 16,
dw 11, 14, 11, 14, 14, 18, 14, 18,
- dw 11, 14, 11, 14, 14, 18, 14, 18,
- dw 13, 16, 13, 16, 16, 20, 16, 20,
- dw 13, 16, 13, 16, 16, 20, 16, 20,
+ dw 11, 14, 11, 14, 14, 18, 14, 18,
+ dw 13, 16, 13, 16, 16, 20, 16, 20,
+ dw 13, 16, 13, 16, 16, 20, 16, 20,
dw 14, 18, 14, 18, 18, 23, 18, 23,
- dw 14, 18, 14, 18, 18, 23, 18, 23,
- dw 16, 20, 16, 20, 20, 25, 20, 25,
- dw 16, 20, 16, 20, 20, 25, 20, 25,
+ dw 14, 18, 14, 18, 18, 23, 18, 23,
+ dw 16, 20, 16, 20, 20, 25, 20, 25,
+ dw 16, 20, 16, 20, 20, 25, 20, 25,
dw 18, 23, 18, 23, 23, 29, 23, 29,
- dw 18, 23, 18, 23, 23, 29, 23, 29
+ dw 18, 23, 18, 23, 23, 29, 23, 29
;***********************************************************************
@@ -68,27 +68,27 @@
;***********************************************************************
%macro MMX_LoadDiff4P 5
- movd %1, [%3]
- movd %2, [%4]
- punpcklbw %1, %5
- punpcklbw %2, %5
- psubw %1, %2
+ movd %1, [%3]
+ movd %2, [%4]
+ punpcklbw %1, %5
+ punpcklbw %2, %5
+ psubw %1, %2
%endmacro
%macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
- MMX_LoadDiff4P %1, %9, %5, %7, %10
- MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
- lea %5, [%5+2*%6]
- lea %7, [%7+2*%8]
- MMX_LoadDiff4P %3, %9, %5, %7, %10
- MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
+ MMX_LoadDiff4P %1, %9, %5, %7, %10
+ MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
+ lea %5, [%5+2*%6]
+ lea %7, [%7+2*%8]
+ MMX_LoadDiff4P %3, %9, %5, %7, %10
+ MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
%endmacro
%macro MMX_SumSubMul2 3
- movq %3, %1
- psllw %1, $01
- paddw %1, %2
- psllw %2, $01
+ movq %3, %1
+ psllw %1, $01
+ paddw %1, %2
+ psllw %2, $01
psubw %3, %2
%endmacro
@@ -101,15 +101,15 @@
%endmacro
%macro MMX_SumSub 3
- movq %3, %2
+ movq %3, %2
psubw %2, %1
paddw %1, %3
%endmacro
%macro MMX_DCT 6
- MMX_SumSub %4, %1, %6
- MMX_SumSub %3, %2, %6
- MMX_SumSub %3, %4, %6
+ MMX_SumSub %4, %1, %6
+ MMX_SumSub %3, %2, %6
+ MMX_SumSub %3, %4, %6
MMX_SumSubMul2 %1, %2, %5
%endmacro
@@ -116,8 +116,8 @@
%macro MMX_IDCT 6
MMX_SumSub %4, %5, %6
MMX_SumSubDiv2 %3, %2, %1
- MMX_SumSub %1, %4, %6
- MMX_SumSub %3, %5, %6
+ MMX_SumSub %1, %4, %6
+ MMX_SumSub %3, %5, %6
%endmacro
%macro MMX_StoreDiff4P 6
@@ -142,11 +142,11 @@
MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7
- MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
- MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2
+ MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
+ MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2
- MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
- MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5
+ MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
+ MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5
movq [r0+ 0], mm2
movq [r0+ 8], mm1
@@ -170,22 +170,22 @@
movq mm2, [r4+16]
movq mm3, [r4+24]
- MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
- MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
- MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
- MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
+ MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
+ MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
+ MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
+ MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
- WELS_Zero mm7
- WELS_DW32 mm6
+ WELS_Zero mm7
+ WELS_DW32 mm6
- MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2]
- MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
+ MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2]
+ MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
- MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2]
- MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
+ MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2]
+ MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
- WELSEMMS
+ WELSEMMS
LOAD_5_PARA_POP
ret
@@ -194,21 +194,21 @@
; SSE2 functions
;***********************************************************************
%macro SSE2_Store4x8p 6
- SSE2_XSawp qdq, %2, %3, %6
- SSE2_XSawp qdq, %4, %5, %3
- MOVDQ [%1+0x00], %2
- MOVDQ [%1+0x10], %4
- MOVDQ [%1+0x20], %6
- MOVDQ [%1+0x30], %3
+ SSE2_XSawp qdq, %2, %3, %6
+ SSE2_XSawp qdq, %4, %5, %3
+ MOVDQ [%1+0x00], %2
+ MOVDQ [%1+0x10], %4
+ MOVDQ [%1+0x20], %6
+ MOVDQ [%1+0x30], %3
%endmacro
%macro SSE2_Load4x8p 6
- MOVDQ %2, [%1+0x00]
- MOVDQ %4, [%1+0x10]
- MOVDQ %6, [%1+0x20]
- MOVDQ %3, [%1+0x30]
- SSE2_XSawp qdq, %4, %3, %5
- SSE2_XSawp qdq, %2, %6, %3
+ MOVDQ %2, [%1+0x00]
+ MOVDQ %4, [%1+0x10]
+ MOVDQ %6, [%1+0x20]
+ MOVDQ %3, [%1+0x30]
+ SSE2_XSawp qdq, %4, %3, %5
+ SSE2_XSawp qdq, %2, %6, %3
%endmacro
%macro SSE2_SumSubMul2 3
@@ -231,57 +231,57 @@
%macro SSE2_StoreDiff8p 6
paddw %1, %3
psraw %1, $06
- movq %2, %6
+ movq %2, %6
punpcklbw %2, %4
paddsw %2, %1
packuswb %2, %2
- movq %5, %2
+ movq %5, %2
%endmacro
%macro SSE2_StoreDiff8p 5
- movq %2, %5
+ movq %2, %5
punpcklbw %2, %3
paddsw %2, %1
packuswb %2, %2
- movq %4, %2
+ movq %4, %2
%endmacro
-%macro SSE2_Load8DC 6
- movdqa %1, %6 ; %1 = dc0 dc1
- paddw %1, %5
- psraw %1, $06 ; (dc + 32) >> 6
+%macro SSE2_Load8DC 6
+ movdqa %1, %6 ; %1 = dc0 dc1
+ paddw %1, %5
+ psraw %1, $06 ; (dc + 32) >> 6
- movdqa %2, %1
- psrldq %2, 4
- punpcklwd %2, %2
- punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklwd %2, %2
+ punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
- movdqa %3, %1
- psrldq %3, 8
- punpcklwd %3, %3
- punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
+ movdqa %3, %1
+ psrldq %3, 8
+ punpcklwd %3, %3
+ punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
- movdqa %4, %1
- psrldq %4, 12
- punpcklwd %4, %4
- punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
+ movdqa %4, %1
+ psrldq %4, 12
+ punpcklwd %4, %4
+ punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
- punpcklwd %1, %1
- punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
+ punpcklwd %1, %1
+ punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
%endmacro
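
The "(dc + 32) >> 6" noted in the comments is the rounding applied to each DC value in SSE2_Load8DC before it is replicated across its 4x4 block; per coefficient it is simply the following (helper name illustrative):

    /* Rounding applied per DC coefficient before replication. */
    static inline short dc_round(short dc) {
        return (short)((dc + 32) >> 6);
    }
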
%macro SSE2_DCT 6
- SSE2_SumSub %6, %3, %5
- SSE2_SumSub %1, %2, %5
- SSE2_SumSub %3, %2, %5
- SSE2_SumSubMul2 %6, %1, %4
+ SSE2_SumSub %6, %3, %5
+ SSE2_SumSub %1, %2, %5
+ SSE2_SumSub %3, %2, %5
+ SSE2_SumSubMul2 %6, %1, %4
%endmacro
%macro SSE2_IDCT 7
SSE2_SumSub %7, %2, %6
SSE2_SumSubDiv2 %1, %3, %5, %4
- SSE2_SumSub %2, %1, %5
- SSE2_SumSub %7, %4, %5
+ SSE2_SumSub %2, %1, %5
+ SSE2_SumSub %7, %4, %5
%endmacro
;***********************************************************************
@@ -294,42 +294,42 @@
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r4, r4d
pxor xmm7, xmm7
- ;Load 4x8
- SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3]
+ ;Load 4x8
+ SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
- lea r1, [r1 + 2 * r2]
- lea r3, [r3 + 2 * r4]
- SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
+ lea r1, [r1 + 2 * r2]
+ lea r3, [r3 + 2 * r4]
+ SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
- SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
- SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
- SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
- SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
+ SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
+ SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
+ SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
+ SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
- SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
+ SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
- lea r1, [r1 + 2 * r2]
- lea r3, [r3 + 2 * r4]
+ lea r1, [r1 + 2 * r2]
+ lea r3, [r3 + 2 * r4]
- ;Load 4x8
- SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ]
+ ;Load 4x8
+ SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ]
SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4]
- lea r1, [r1 + 2 * r2]
- lea r3, [r3 + 2 * r4]
+ lea r1, [r1 + 2 * r2]
+ lea r3, [r3 + 2 * r4]
SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
- SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
- SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
- SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
- SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
+ SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
+ SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
+ SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
+ SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
- lea r0, [r0+64]
- SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
+ lea r0, [r0+64]
+ SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
- POP_XMM
- LOAD_5_PARA_POP
+ POP_XMM
+ LOAD_5_PARA_POP
ret
@@ -337,59 +337,59 @@
; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
;***********************************************************************
WELS_EXTERN WelsIDctFourT4Rec_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- ;Load 4x8
- SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ ;Load 4x8
+ SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
- SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
- SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
- SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
- SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
+ SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
+ SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
+ SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
+ SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
- WELS_Zero xmm7
- WELS_DW32 xmm6
+ WELS_Zero xmm7
+ WELS_DW32 xmm6
- SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
- SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
- SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
+ SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
+ SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
+ SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
- add r4, 64
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
+ add r4, 64
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
- SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
- SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
+ SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
+ SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
- SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
+ SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
- WELS_Zero xmm7
- WELS_DW32 xmm6
+ WELS_Zero xmm7
+ WELS_DW32 xmm6
- SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
- SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
- SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
- POP_XMM
- LOAD_5_PARA_POP
- ; pop esi
- ; pop ebx
+ SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
+ SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
+ SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
+ POP_XMM
+ LOAD_5_PARA_POP
+ ; pop esi
+ ; pop ebx
ret
%macro SSE2_StoreDiff4x8p 8
- SSE2_StoreDiff8p %1, %3, %4, [%5], [%6]
- SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]
- SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8]
- SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
+ SSE2_StoreDiff8p %1, %3, %4, [%5], [%6]
+ SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]
+ SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8]
+ SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
%endmacro
;***********************************************************************
@@ -396,76 +396,76 @@
; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
;***********************************************************************
WELS_EXTERN WelsIDctRecI16x16Dc_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm7, xmm7
- WELS_DW32 xmm6
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm7, xmm7
+ WELS_DW32 xmm6
- SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+ SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
- SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+ SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
- POP_XMM
- LOAD_5_PARA_POP
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+ POP_XMM
+ LOAD_5_PARA_POP
ret
%macro SSE2_SumSubD 3
- movdqa %3, %2
+ movdqa %3, %2
paddd %2, %1
psubd %1, %3
%endmacro
%macro SSE2_SumSubDiv2D 4
- paddd %1, %2
- paddd %1, %3
- psrad %1, 1
- movdqa %4, %1
- psubd %4, %2
+ paddd %1, %2
+ paddd %1, %3
+ psrad %1, 1
+ movdqa %4, %1
+ psubd %4, %2
%endmacro
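
Per 32-bit lane, SSE2_SumSubD and SSE2_SumSubDiv2D above are plain sum/difference butterflies, the second one with a +1 rounding and halving. Scalar sketches (helper names illustrative):

    /* SSE2_SumSubD: (a, b) -> (a - b, a + b) per 32-bit lane. */
    static void sum_sub_d(int *a, int *b) {
        int s = *a + *b, d = *a - *b;
        *a = d;
        *b = s;
    }

    /* SSE2_SumSubDiv2D: arg1 <- (a + b + 1) >> 1, arg4 <- that minus b,
     * which equals (a - b + 1) >> 1 with arithmetic shift; arg3 carries the constant 1. */
    static void sum_sub_div2_d(int *a, int *b, int *d) {
        int s = (*a + *b + 1) >> 1;
        *d = s - *b;
        *a = s;
    }
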
-%macro SSE2_Load4Col 5
- movsx r2, WORD[%5]
- movd %1, r2d
- movsx r2, WORD[%5 + 0x20]
- movd %2, r2d
- punpckldq %1, %2
- movsx r2, WORD[%5 + 0x80]
- movd %3, r2d
- movsx r2, WORD[%5 + 0xa0]
- movd %4, r2d
- punpckldq %3, %4
- punpcklqdq %1, %3
+%macro SSE2_Load4Col 5
+ movsx r2, WORD[%5]
+ movd %1, r2d
+ movsx r2, WORD[%5 + 0x20]
+ movd %2, r2d
+ punpckldq %1, %2
+ movsx r2, WORD[%5 + 0x80]
+ movd %3, r2d
+ movsx r2, WORD[%5 + 0xa0]
+ movd %4, r2d
+ punpckldq %3, %4
+ punpcklqdq %1, %3
%endmacro
;***********************************************************************
@@ -472,33 +472,33 @@
;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
;***********************************************************************
WELS_EXTERN WelsHadamardT4Dc_sse2
- %assign push_num 0
- LOAD_2_PARA
- PUSH_XMM 8
- SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
- SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
- SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
- SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, r1 + 0x140
+ %assign push_num 0
+ LOAD_2_PARA
+ PUSH_XMM 8
+ SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
+ SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
+ SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
+ SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, r1 + 0x140
- SSE2_SumSubD xmm1, xmm2, xmm7
- SSE2_SumSubD xmm3, xmm4, xmm7
- SSE2_SumSubD xmm2, xmm4, xmm7
- SSE2_SumSubD xmm1, xmm3, xmm7
+ SSE2_SumSubD xmm1, xmm2, xmm7
+ SSE2_SumSubD xmm3, xmm4, xmm7
+ SSE2_SumSubD xmm2, xmm4, xmm7
+ SSE2_SumSubD xmm1, xmm3, xmm7
- SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1
+ SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1
- SSE2_SumSubD xmm4, xmm3, xmm7
- SSE2_SumSubD xmm5, xmm1, xmm7
+ SSE2_SumSubD xmm4, xmm3, xmm7
+ SSE2_SumSubD xmm5, xmm1, xmm7
- WELS_DD1 xmm6
- SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
- SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
- SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1
+ WELS_DD1 xmm6
+ SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
+ SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
+ SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1
- packssdw xmm3, xmm4
- packssdw xmm2, xmm1
- movdqa [r0+ 0], xmm3
- movdqa [r0+16], xmm2
+ packssdw xmm3, xmm4
+ packssdw xmm2, xmm1
+ movdqa [r0+ 0], xmm3
+ movdqa [r0+16], xmm2
- POP_XMM
- ret
+ POP_XMM
+ ret
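
The routine above gathers the sixteen per-block DC levels and, going by the comments, applies a 4x4 Hadamard transform with a (x + 1) >> 1 rounding on the second pass. A rough scalar rendition under that reading, with the strided gather/scatter simplified to flat 4x4 arrays (names and layout are illustrative, not the codec's):

    static void hadamard4x4_dc_ref(const short in[16], short out[16]) {
        int tmp[16];
        for (int i = 0; i < 4; i++) {           /* horizontal pass */
            int s01 = in[4*i+0] + in[4*i+1], d01 = in[4*i+0] - in[4*i+1];
            int s23 = in[4*i+2] + in[4*i+3], d23 = in[4*i+2] - in[4*i+3];
            tmp[4*i+0] = s01 + s23;
            tmp[4*i+1] = d01 + d23;
            tmp[4*i+2] = s01 - s23;
            tmp[4*i+3] = d01 - d23;
        }
        for (int j = 0; j < 4; j++) {           /* vertical pass with rounding */
            int s01 = tmp[j]     + tmp[j+4],  d01 = tmp[j]     - tmp[j+4];
            int s23 = tmp[j+8]   + tmp[j+12], d23 = tmp[j+8]   - tmp[j+12];
            out[j]    = (short)((s01 + s23 + 1) >> 1);
            out[j+4]  = (short)((d01 + d23 + 1) >> 1);
            out[j+8]  = (short)((s01 - s23 + 1) >> 1);
            out[j+12] = (short)((d01 - d23 + 1) >> 1);
        }
    }
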
--- a/codec/encoder/core/x86/intra_pred.asm
+++ b/codec/encoder/core/x86/intra_pred.asm
@@ -61,7 +61,7 @@
sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4
align 16
-mmx_01bytes: times 16 db 1
+mmx_01bytes: times 16 db 1
align 16
mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
@@ -73,106 +73,106 @@
;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
;%1 will keep the last result
%macro SSE_DB_1_2REG 2
- pxor %1, %1
- pcmpeqw %2, %2
- psubb %1, %2
+ pxor %1, %1
+ pcmpeqw %2, %2
+ psubb %1, %2
%endmacro
;xmm0, xmm1, xmm2, eax, ecx
;lower 64 bits of xmm0 save the result
%macro SSE2_PRED_H_4X4_TWO_LINE 5
- movd %1, [%4-1]
- movdqa %3, %1
- punpcklbw %1, %3
- movdqa %3, %1
- punpcklbw %1, %3
+ movd %1, [%4-1]
+ movdqa %3, %1
+ punpcklbw %1, %3
+ movdqa %3, %1
+ punpcklbw %1, %3
- ;add %4, %5
- movd %2, [%4+%5-1]
- movdqa %3, %2
- punpcklbw %2, %3
- movdqa %3, %2
- punpcklbw %2, %3
- punpckldq %1, %2
+ ;add %4, %5
+ movd %2, [%4+%5-1]
+ movdqa %3, %2
+ punpcklbw %2, %3
+ movdqa %3, %2
+ punpcklbw %2, %3
+ punpckldq %1, %2
%endmacro
-%macro SUMW_HORIZON1 2
- movdqa %2, %1
- psrldq %2, 8
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 4
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 2
- paddusw %1, %2
+%macro SUMW_HORIZON1 2
+ movdqa %2, %1
+ psrldq %2, 8
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 4
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 2
+ paddusw %1, %2
%endmacro
-%macro LOAD_COLUMN 6
- movd %1, [%5]
- movd %2, [%5+%6]
- punpcklbw %1, %2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- punpcklbw %3, %2
- punpcklwd %1, %3
- lea %5, [%5+2*%6]
- movd %4, [%5]
- movd %2, [%5+%6]
- punpcklbw %4, %2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- lea %5, [%5+2*%6]
- punpcklbw %3, %2
- punpcklwd %4, %3
- punpckhdq %1, %4
+%macro LOAD_COLUMN 6
+ movd %1, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %1, %2
+ lea %5, [%5+2*%6]
+ movd %3, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %3, %2
+ punpcklwd %1, %3
+ lea %5, [%5+2*%6]
+ movd %4, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %4, %2
+ lea %5, [%5+2*%6]
+ movd %3, [%5]
+ movd %2, [%5+%6]
+ lea %5, [%5+2*%6]
+ punpcklbw %3, %2
+ punpcklwd %4, %3
+ punpckhdq %1, %4
%endmacro
-%macro SUMW_HORIZON 3
- movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
- paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
- paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
- pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
- paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
+%macro SUMW_HORIZON 3
+ movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
+ paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
+ pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
+ paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
%endmacro
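
SUMW_HORIZON reduces the eight 16-bit lanes of a register to a single sum in the low element; its scalar meaning is just the following (helper name illustrative):

    /* Horizontal sum of eight 16-bit lanes, as produced by SUMW_HORIZON. */
    static int sumw_horizon_ref(const short v[8]) {
        int s = 0;
        for (int i = 0; i < 8; i++)
            s += v[i];
        return s;
    }
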
-%macro COPY_16_TIMES 2
- movdqa %2, [%1-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
+%macro COPY_16_TIMES 2
+ movdqa %2, [%1-16]
+ psrldq %2, 15
+ pmuludq %2, [mmx_01bytes]
+ pshufd %2, %2, 0
%endmacro
-%macro COPY_16_TIMESS 3
- movdqa %2, [%1+%3-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
+%macro COPY_16_TIMESS 3
+ movdqa %2, [%1+%3-16]
+ psrldq %2, 15
+ pmuludq %2, [mmx_01bytes]
+ pshufd %2, %2, 0
%endmacro
-%macro LOAD_COLUMN_C 6
- movd %1, [%5]
- movd %2, [%5+%6]
- punpcklbw %1,%2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- punpcklbw %3, %2
- punpckhwd %1, %3
- lea %5, [%5+2*%6]
+%macro LOAD_COLUMN_C 6
+ movd %1, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %1,%2
+ lea %5, [%5+2*%6]
+ movd %3, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %3, %2
+ punpckhwd %1, %3
+ lea %5, [%5+2*%6]
%endmacro
%macro LOAD_2_LEFT_AND_ADD 0
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01]
- add r3, r4
- movzx r4, byte [r1+r2-0x01]
- add r3, r4
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01]
+ add r3, r4
+ movzx r4, byte [r1+r2-0x01]
+ add r3, r4
%endmacro
;***********************************************************************
@@ -184,127 +184,127 @@
;***********************************************************************
; void WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
-; pred must align to 16
+; pred must align to 16
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredH_sse2
- push r3
- %assign push_num 1
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- movzx r3, byte [r1-1]
- movd xmm0, r3d
- pmuludq xmm0, [mmx_01bytes]
+ push r3
+ %assign push_num 1
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ movzx r3, byte [r1-1]
+ movd xmm0, r3d
+ pmuludq xmm0, [mmx_01bytes]
- movzx r3, byte [r1+r2-1]
- movd xmm1, r3d
- pmuludq xmm1, [mmx_01bytes]
+ movzx r3, byte [r1+r2-1]
+ movd xmm1, r3d
+ pmuludq xmm1, [mmx_01bytes]
- unpcklps xmm0, xmm1
+ unpcklps xmm0, xmm1
- lea r1, [r1+r2*2]
- movzx r3, byte [r1-1]
- movd xmm2, r3d
- pmuludq xmm2, [mmx_01bytes]
+ lea r1, [r1+r2*2]
+ movzx r3, byte [r1-1]
+ movd xmm2, r3d
+ pmuludq xmm2, [mmx_01bytes]
- movzx r3, byte [r1+r2-1]
- movd xmm3, r3d
- pmuludq xmm3, [mmx_01bytes]
+ movzx r3, byte [r1+r2-1]
+ movd xmm3, r3d
+ pmuludq xmm3, [mmx_01bytes]
- unpcklps xmm2, xmm3
- unpcklpd xmm0, xmm2
+ unpcklps xmm2, xmm3
+ unpcklpd xmm0, xmm2
- movdqa [r0], xmm0
- pop r3
- ret
+ movdqa [r0], xmm0
+ pop r3
+ ret
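
WelsI4x4LumaPredH_sse2 fills each row of the 4x4 prediction with that row's left neighbour, and writes the prediction as four contiguous 4-byte rows. A scalar sketch, assuming `ref` points at the block's top-left sample (helper name illustrative):

    #include <stdint.h>
    #include <string.h>

    /* 4x4 horizontal prediction: every pixel of row y is ref[y*stride - 1]. */
    static void pred4x4_h_ref(uint8_t *pred /* 4x4, stride 4 */,
                              const uint8_t *ref, int stride) {
        for (int y = 0; y < 4; y++)
            memset(pred + 4 * y, ref[y * stride - 1], 4);
    }
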
;***********************************************************************
; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredPlane_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r2, r2d
- sub r1, 1
- sub r1, r2
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r2, r2d
+ sub r1, 1
+ sub r1, r2
- ;for H
- pxor xmm7, xmm7
- movq xmm0, [r1]
- movdqa xmm5, [sse2_plane_dec]
- punpcklbw xmm0, xmm7
- pmullw xmm0, xmm5
- movq xmm1, [r1 + 9]
- movdqa xmm6, [sse2_plane_inc]
- punpcklbw xmm1, xmm7
- pmullw xmm1, xmm6
- psubw xmm1, xmm0
+ ;for H
+ pxor xmm7, xmm7
+ movq xmm0, [r1]
+ movdqa xmm5, [sse2_plane_dec]
+ punpcklbw xmm0, xmm7
+ pmullw xmm0, xmm5
+ movq xmm1, [r1 + 9]
+ movdqa xmm6, [sse2_plane_inc]
+ punpcklbw xmm1, xmm7
+ pmullw xmm1, xmm6
+ psubw xmm1, xmm0
- SUMW_HORIZON xmm1,xmm0,xmm2
- movd r3d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
- movsx r3, r3w
- imul r3, 5
- add r3, 32
- sar r3, 6 ; b = (5 * H + 32) >> 6;
- SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b
+ SUMW_HORIZON xmm1,xmm0,xmm2
+ movd r3d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
+ movsx r3, r3w
+ imul r3, 5
+ add r3, 32
+ sar r3, 6 ; b = (5 * H + 32) >> 6;
+ SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b
- movzx r4, BYTE [r1+16]
- sub r1, 3
- LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r1, r2
+ movzx r4, BYTE [r1+16]
+ sub r1, 3
+ LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r1, r2
- add r1, 3
- movzx r3, BYTE [r1+8*r2]
- add r4, r3
- shl r4, 4 ; a = (left[15*stride] + top[15]) << 4;
+ add r1, 3
+ movzx r3, BYTE [r1+8*r2]
+ add r4, r3
+ shl r4, 4 ; a = (left[15*stride] + top[15]) << 4;
- sub r1, 3
- add r1, r2
- LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r1, r2
- pxor xmm4, xmm4
- punpckhbw xmm0, xmm4
- pmullw xmm0, xmm5
- punpckhbw xmm7, xmm4
- pmullw xmm7, xmm6
- psubw xmm7, xmm0
+ sub r1, 3
+ add r1, r2
+ LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r1, r2
+ pxor xmm4, xmm4
+ punpckhbw xmm0, xmm4
+ pmullw xmm0, xmm5
+ punpckhbw xmm7, xmm4
+ pmullw xmm7, xmm6
+ psubw xmm7, xmm0
- SUMW_HORIZON xmm7,xmm0,xmm2
- movd r3d, xmm7 ; V
- movsx r3, r3w
- imul r3, 5
- add r3, 32
- sar r3, 6 ; c = (5 * V + 32) >> 6;
- SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c
+ SUMW_HORIZON xmm7,xmm0,xmm2
+ movd r3d, xmm7 ; V
+ movsx r3, r3w
+ imul r3, 5
+ add r3, 32
+ sar r3, 6 ; c = (5 * V + 32) >> 6;
+ SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c
- add r4, 16
- imul r3, -7
- add r3, r4 ; s = a + 16 + (-7)*c
- SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
+ add r4, 16
+ imul r3, -7
+ add r3, r4 ; s = a + 16 + (-7)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
- xor r3, r3
- movdqa xmm5, [sse2_plane_inc_minus]
+ xor r3, r3
+ movdqa xmm5, [sse2_plane_inc_minus]
get_i16x16_luma_pred_plane_sse2_1:
- movdqa xmm2, xmm1
- pmullw xmm2, xmm5
- paddw xmm2, xmm0
- psraw xmm2, 5
- movdqa xmm3, xmm1
- pmullw xmm3, xmm6
- paddw xmm3, xmm0
- psraw xmm3, 5
- packuswb xmm2, xmm3
- movdqa [r0], xmm2
- paddw xmm0, xmm4
- add r0, 16
- inc r3
- cmp r3, 16
- jnz get_i16x16_luma_pred_plane_sse2_1
- POP_XMM
- pop r4
- pop r3
- ret
+ movdqa xmm2, xmm1
+ pmullw xmm2, xmm5
+ paddw xmm2, xmm0
+ psraw xmm2, 5
+ movdqa xmm3, xmm1
+ pmullw xmm3, xmm6
+ paddw xmm3, xmm0
+ psraw xmm3, 5
+ packuswb xmm2, xmm3
+ movdqa [r0], xmm2
+ paddw xmm0, xmm4
+ add r0, 16
+ inc r3
+ cmp r3, 16
+ jnz get_i16x16_luma_pred_plane_sse2_1
+ POP_XMM
+ pop r4
+ pop r3
+ ret
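
The b, c and s terms annotated above are the parameters of 16x16 plane prediction; the whole routine can be written in scalar form roughly as below (helper names illustrative; `ref` assumed to point at the macroblock's top-left sample, output written with stride 16 as the asm does):

    #include <stdint.h>

    static uint8_t clip255(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

    /* 16x16 plane prediction: pred[y][x] = clip((a + b*(x-7) + c*(y-7) + 16) >> 5). */
    static void pred16x16_plane_ref(uint8_t *pred /* stride 16 */,
                                    const uint8_t *ref, int stride) {
        const uint8_t *top = ref - stride;
        int H = 0, V = 0;
        for (int i = 1; i <= 8; i++) {
            H += i * (top[7 + i] - top[7 - i]);
            V += i * (ref[(7 + i) * stride - 1] - ref[(7 - i) * stride - 1]);
        }
        int a = (ref[15 * stride - 1] + top[15]) << 4;
        int b = (5 * H + 32) >> 6;
        int c = (5 * V + 32) >> 6;
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++)
                pred[y * 16 + x] = clip255((a + b * (x - 7) + c * (y - 7) + 16) >> 5);
    }
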
;***********************************************************************
; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
@@ -311,38 +311,38 @@
;***********************************************************************
%macro SSE2_PRED_H_16X16_ONE_LINE 0
- add r0, 16
- add r1, r2
- movzx r3, byte [r1]
- SSE2_Copy16Times xmm0, r3d
- movdqa [r0], xmm0
+ add r0, 16
+ add r1, r2
+ movzx r3, byte [r1]
+ SSE2_Copy16Times xmm0, r3d
+ movdqa [r0], xmm0
%endmacro
WELS_EXTERN WelsI16x16LumaPredH_sse2
- push r3
- %assign push_num 1
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- dec r1
- movzx r3, byte [r1]
- SSE2_Copy16Times xmm0, r3d
- movdqa [r0], xmm0
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- pop r3
+ push r3
+ %assign push_num 1
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ dec r1
+ movzx r3, byte [r1]
+ SSE2_Copy16Times xmm0, r3d
+ movdqa [r0], xmm0
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ pop r3
ret
;***********************************************************************
@@ -378,289 +378,289 @@
; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
WELS_EXTERN WelsIChromaPredPlane_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r2, r2d
- sub r1, 1
- sub r1, r2
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r2, r2d
+ sub r1, 1
+ sub r1, r2
- pxor mm7, mm7
- movq mm0, [r1]
- movq mm5, [sse2_plane_dec_c]
- punpcklbw mm0, mm7
- pmullw mm0, mm5
- movq mm1, [r1 + 5]
- movq mm6, [sse2_plane_inc_c]
- punpcklbw mm1, mm7
- pmullw mm1, mm6
- psubw mm1, mm0
+ pxor mm7, mm7
+ movq mm0, [r1]
+ movq mm5, [sse2_plane_dec_c]
+ punpcklbw mm0, mm7
+ pmullw mm0, mm5
+ movq mm1, [r1 + 5]
+ movq mm6, [sse2_plane_inc_c]
+ punpcklbw mm1, mm7
+ pmullw mm1, mm6
+ psubw mm1, mm0
- movq2dq xmm1, mm1
- pxor xmm2, xmm2
- SUMW_HORIZON xmm1,xmm0,xmm2
- movd r3d, xmm1
- movsx r3, r3w
- imul r3, 17
- add r3, 16
- sar r3, 5 ; b = (17 * H + 16) >> 5;
- SSE2_Copy8Times xmm1, r3d ; mm1 = b,b,b,b,b,b,b,b
+ movq2dq xmm1, mm1
+ pxor xmm2, xmm2
+ SUMW_HORIZON xmm1,xmm0,xmm2
+ movd r3d, xmm1
+ movsx r3, r3w
+ imul r3, 17
+ add r3, 16
+ sar r3, 5 ; b = (17 * H + 16) >> 5;
+ SSE2_Copy8Times xmm1, r3d ; mm1 = b,b,b,b,b,b,b,b
- movzx r3, BYTE [r1+8]
- sub r1, 3
- LOAD_COLUMN_C mm0, mm2, mm3, mm4, r1, r2
+ movzx r3, BYTE [r1+8]
+ sub r1, 3
+ LOAD_COLUMN_C mm0, mm2, mm3, mm4, r1, r2
- add r1, 3
- movzx r4, BYTE [r1+4*r2]
- add r4, r3
- shl r4, 4 ; a = (left[7*stride] + top[7]) << 4;
+ add r1, 3
+ movzx r4, BYTE [r1+4*r2]
+ add r4, r3
+ shl r4, 4 ; a = (left[7*stride] + top[7]) << 4;
- sub r1, 3
- add r1, r2
- LOAD_COLUMN_C mm7, mm2, mm3, mm4, r1, r2
- pxor mm4, mm4
- punpckhbw mm0, mm4
- pmullw mm0, mm5
- punpckhbw mm7, mm4
- pmullw mm7, mm6
- psubw mm7, mm0
+ sub r1, 3
+ add r1, r2
+ LOAD_COLUMN_C mm7, mm2, mm3, mm4, r1, r2
+ pxor mm4, mm4
+ punpckhbw mm0, mm4
+ pmullw mm0, mm5
+ punpckhbw mm7, mm4
+ pmullw mm7, mm6
+ psubw mm7, mm0
- movq2dq xmm7, mm7
- pxor xmm2, xmm2
- SUMW_HORIZON xmm7,xmm0,xmm2
- movd r3d, xmm7 ; V
- movsx r3, r3w
- imul r3, 17
- add r3, 16
- sar r3, 5 ; c = (17 * V + 16) >> 5;
- SSE2_Copy8Times xmm4, r3d ; mm4 = c,c,c,c,c,c,c,c
+ movq2dq xmm7, mm7
+ pxor xmm2, xmm2
+ SUMW_HORIZON xmm7,xmm0,xmm2
+ movd r3d, xmm7 ; V
+ movsx r3, r3w
+ imul r3, 17
+ add r3, 16
+ sar r3, 5 ; c = (17 * V + 16) >> 5;
+ SSE2_Copy8Times xmm4, r3d ; mm4 = c,c,c,c,c,c,c,c
- add r4, 16
- imul r3, -3
- add r3, r4 ; s = a + 16 + (-3)*c
- SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
+ add r4, 16
+ imul r3, -3
+ add r3, r4 ; s = a + 16 + (-3)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
- xor r3, r3
- movdqa xmm5, [sse2_plane_mul_b_c]
+ xor r3, r3
+ movdqa xmm5, [sse2_plane_mul_b_c]
get_i_chroma_pred_plane_sse2_1:
- movdqa xmm2, xmm1
- pmullw xmm2, xmm5
- paddw xmm2, xmm0
- psraw xmm2, 5
- packuswb xmm2, xmm2
- movq [r0], xmm2
- paddw xmm0, xmm4
- add r0, 8
- inc r3
- cmp r3, 8
- jnz get_i_chroma_pred_plane_sse2_1
- POP_XMM
- pop r4
- pop r3
- WELSEMMS
- ret
+ movdqa xmm2, xmm1
+ pmullw xmm2, xmm5
+ paddw xmm2, xmm0
+ psraw xmm2, 5
+ packuswb xmm2, xmm2
+ movq [r0], xmm2
+ paddw xmm0, xmm4
+ add r0, 8
+ inc r3
+ cmp r3, 8
+ jnz get_i_chroma_pred_plane_sse2_1
+ POP_XMM
+ pop r4
+ pop r3
+ WELSEMMS
+ ret
;***********************************************************************
-; 0 |1 |2 |3 |4 |
-; 6 |7 |8 |9 |10|
-; 11|12|13|14|15|
-; 16|17|18|19|20|
-; 21|22|23|24|25|
-; 7 is the start pixel of current 4x4 block
-; pred[7] = ([6]+[0]*2+[1]+2)/4
+; 0 |1 |2 |3 |4 |
+; 6 |7 |8 |9 |10|
+; 11|12|13|14|15|
+; 16|17|18|19|20|
+; 21|22|23|24|25|
+; 7 is the start pixel of current 4x4 block
+; pred[7] = ([6]+[0]*2+[1]+2)/4
;
; void WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDR_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- movq mm1,[r1+r2-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
- movq mm2,[r1-8] ;get value of 6 mm2[8] = 6
- sub r1, r2 ;mov eax to above line of current block(postion of 1)
- punpckhbw mm2,[r1-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
- movd mm3,[r1] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
- punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
- psllq mm3,18h ;mm3[5]=[1]
- psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
- movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- lea r1,[r1+r2*2-8h] ;set eax point to 12
- movq mm4,[r1+r2] ;get value of 16, mm4[8]=[16]
- psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
- psrlq mm4,38h ;mm4[1]=[16]
- por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
- movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
- movq mm4,[r1+r2*2] ;mm4[8]=[21]
- psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
- psrlq mm4,38h ;mm4[1]=[21]
- por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
- movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
- pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
- pxor mm1,mm4 ;find odd value in the lowest bit of each byte
- pand mm1,[mmx_01bytes] ;set the odd bit
- psubusb mm3,mm1 ;decrease 1 from odd bytes
- pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ movq mm1,[r1+r2-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
+ movq mm2,[r1-8] ;get value of 6 mm2[8] = 6
+ sub r1, r2 ;mov eax to above line of current block(postion of 1)
+ punpckhbw mm2,[r1-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+ movd mm3,[r1] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+ punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
+ psllq mm3,18h ;mm3[5]=[1]
+ psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+ por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
+ movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+ lea r1,[r1+r2*2-8h] ;set eax point to 12
+ movq mm4,[r1+r2] ;get value of 16, mm4[8]=[16]
+ psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
+ psrlq mm4,38h ;mm4[1]=[16]
+ por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
+ movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
+ movq mm4,[r1+r2*2] ;mm4[8]=[21]
+ psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
+ psrlq mm4,38h ;mm4[1]=[21]
+ por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
+ movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
+ pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
+ pxor mm1,mm4 ;find odd value in the lowest bit of each byte
+ pand mm1,[mmx_01bytes] ;set the odd bit
+ psubusb mm3,mm1 ;decrease 1 from odd bytes
+ pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
- movd [r0+12],mm2
- psrlq mm2,8
- movd [r0+8],mm2
- psrlq mm2,8
- movd [r0+4],mm2
- psrlq mm2,8
- movd [r0],mm2
- WELSEMMS
- ret
+ movd [r0+12],mm2
+ psrlq mm2,8
+ movd [r0+8],mm2
+ psrlq mm2,8
+ movd [r0+4],mm2
+ psrlq mm2,8
+ movd [r0],mm2
+ WELSEMMS
+ ret
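
The pavgb/odd-bit sequence above evaluates the 3-tap (p0 + 2*p1 + p2 + 2) >> 2 filter along the down-right diagonal, as the pred[7] formula in the comment block indicates. A scalar sketch of the mode, with the edge array built from the left column, the top-left corner and the top row (helper name illustrative):

    #include <stdint.h>

    /* 4x4 diagonal-down-right prediction, output written with stride 4. */
    static void pred4x4_ddr_ref(uint8_t *pred, const uint8_t *ref, int stride) {
        uint8_t e[9]; /* l3 l2 l1 l0 lt t0 t1 t2 t3 */
        for (int i = 0; i < 4; i++) {
            e[i]     = ref[(3 - i) * stride - 1];
            e[5 + i] = ref[-stride + i];
        }
        e[4] = ref[-stride - 1];
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++) {
                int k = 4 + x - y; /* position on the edge for this diagonal */
                pred[y * 4 + x] = (uint8_t)((e[k - 1] + 2 * e[k] + e[k + 1] + 2) >> 2);
            }
    }
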
;***********************************************************************
-; 0 |1 |2 |3 |4 |
-; 5 |6 |7 |8 |9 |
-; 10|11|12|13|14|
-; 15|16|17|18|19|
-; 20|21|22|23|24|
-; 6 is the start pixel of current 4x4 block
-; pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
+; 0 |1 |2 |3 |4 |
+; 5 |6 |7 |8 |9 |
+; 10|11|12|13|14|
+; 15|16|17|18|19|
+; 20|21|22|23|24|
+; 6 is the start pixel of current 4x4 block
+; pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
;
; void WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDc_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- movzx r4, byte [r1-1h]
- sub r1, r2
- movd xmm0, [r1]
- pxor xmm1, xmm1
- psadbw xmm0, xmm1
- xor r3, r3
- movd r3d, xmm0
- add r3, r4
- movzx r4, byte [r1+r2*2-1h]
- add r3, r4
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ movzx r4, byte [r1-1h]
+ sub r1, r2
+ movd xmm0, [r1]
+ pxor xmm1, xmm1
+ psadbw xmm0, xmm1
+ xor r3, r3
+ movd r3d, xmm0
+ add r3, r4
+ movzx r4, byte [r1+r2*2-1h]
+ add r3, r4
- lea r1, [r1+r2*2-1]
- movzx r4, byte [r1+r2]
- add r3, r4
+ lea r1, [r1+r2*2-1]
+ movzx r4, byte [r1+r2]
+ add r3, r4
- movzx r4, byte [r1+r2*2]
- add r3, r4
- add r3, 4
- sar r3, 3
- imul r3, 0x01010101
+ movzx r4, byte [r1+r2*2]
+ add r3, r4
+ add r3, 4
+ sar r3, 3
+ imul r3, 0x01010101
- movd xmm0, r3d
- pshufd xmm0, xmm0, 0
- movdqa [r0], xmm0
- pop r4
- pop r3
- ret
+ movd xmm0, r3d
+ pshufd xmm0, xmm0, 0
+ movdqa [r0], xmm0
+ pop r4
+ pop r3
+ ret
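
As the pred[6] formula in the comment block says, 4x4 DC prediction is the rounded mean of the four top and four left neighbours, replicated over the block. Scalar sketch (helper name illustrative):

    #include <stdint.h>
    #include <string.h>

    /* 4x4 DC prediction: (top0..3 + left0..3 + 4) >> 3, filling the block. */
    static void pred4x4_dc_ref(uint8_t *pred /* stride 4 */,
                               const uint8_t *ref, int stride) {
        int sum = 4;
        for (int i = 0; i < 4; i++)
            sum += ref[-stride + i] + ref[i * stride - 1];
        memset(pred, sum >> 3, 16);
    }
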
;***********************************************************************
-; void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
+; void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
; copy 8 pixel of 8 line from left
;***********************************************************************
%macro MMX_PRED_H_8X8_ONE_LINE 4
- movq %1, [%3-8]
- psrlq %1, 38h
+ movq %1, [%3-8]
+ psrlq %1, 38h
- ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
- pmullw %1, [mmx_01bytes]
- pshufw %1, %1, 0
- movq [%4], %1
+ ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
+ pmullw %1, [mmx_01bytes]
+ pshufw %1, %1, 0
+ movq [%4], %1
%endmacro
%macro MMX_PRED_H_8X8_ONE_LINEE 4
- movq %1, [%3+r2-8]
- psrlq %1, 38h
+ movq %1, [%3+r2-8]
+ psrlq %1, 38h
- ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
- pmullw %1, [mmx_01bytes]
- pshufw %1, %1, 0
- movq [%4], %1
+ ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
+ pmullw %1, [mmx_01bytes]
+ pshufw %1, %1, 0
+ movq [%4], %1
%endmacro
WELS_EXTERN WelsIChromaPredH_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- movq mm0, [r1-8]
- psrlq mm0, 38h
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ movq mm0, [r1-8]
+ psrlq mm0, 38h
- ;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes
- pmullw mm0, [mmx_01bytes]
- pshufw mm0, mm0, 0
- movq [r0], mm0
+ ;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes
+ pmullw mm0, [mmx_01bytes]
+ pshufw mm0, mm0, 0
+ movq [r0], mm0
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+8
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+8
- lea r1,[r1+r2*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+16
+ lea r1,[r1+r2*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+16
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+24
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+24
- lea r1,[r1+r2*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+32
+ lea r1,[r1+r2*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+32
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+40
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+40
- lea r1,[r1+r2*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48
+ lea r1,[r1+r2*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+56
- WELSEMMS
- ret
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+56
+ WELSEMMS
+ ret
;***********************************************************************
-; void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+; void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
; copy pixels from top 4 pixels
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredV_sse2
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movd xmm0, [r1]
- pshufd xmm0, xmm0, 0
- movdqa [r0], xmm0
- ret
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movd xmm0, [r1]
+ pshufd xmm0, xmm0, 0
+ movdqa [r0], xmm0
+ ret
;***********************************************************************
-; void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+; void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
; copy 8 pixels from top 8 pixels
;***********************************************************************
WELS_EXTERN WelsIChromaPredV_sse2
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movq xmm0, [r1]
- movdqa xmm1, xmm0
- punpcklqdq xmm0, xmm1
- movdqa [r0], xmm0
- movdqa [r0+16], xmm0
- movdqa [r0+32], xmm0
- movdqa [r0+48], xmm0
- ret
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movq xmm0, [r1]
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm1
+ movdqa [r0], xmm0
+ movdqa [r0+16], xmm0
+ movdqa [r0+32], xmm0
+ movdqa [r0+48], xmm0
+ ret
;***********************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; t3 will never been used
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; t3 will never been used
; destination:
-; |a |b |c |d |
-; |e |f |a |b |
-; |g |h |e |f |
-; |i |j |g |h |
+; |a |b |c |d |
+; |e |f |a |b |
+; |g |h |e |f |
+; |i |j |g |h |
; a = (1 + lt + l0)>>1
; e = (1 + l0 + l1)>>1
@@ -679,68 +679,68 @@
; void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHD_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movd mm0, [r1-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
- psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movd mm0, [r1-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+ psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
- movd mm1, [r1+2*r2-4]
- punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
- lea r1, [r1+2*r2]
- movd mm2, [r1+2*r2-4]
- punpcklbw mm2, [r1+r2-4] ; mm2[7] = l2, mm2[6] = l3
- punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
- psrlq mm2, 20h
- pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
+ movd mm1, [r1+2*r2-4]
+ punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r1, [r1+2*r2]
+ movd mm2, [r1+2*r2-4]
+ punpcklbw mm2, [r1+r2-4] ; mm2[7] = l2, mm2[6] = l3
+ punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
+ psrlq mm2, 20h
+ pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
- movq mm1, mm0
- psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
- movq mm2, mm0
- psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
- movq mm3, mm2
- movq mm4, mm1
- pavgb mm1, mm0
+ movq mm1, mm0
+ psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
+ movq mm2, mm0
+ psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
+ movq mm3, mm2
+ movq mm4, mm1
+ pavgb mm1, mm0
- pxor mm4, mm0 ; find odd value in the lowest bit of each byte
- pand mm4, [mmx_01bytes] ; set the odd bit
- psubusb mm1, mm4 ; decrease 1 from odd bytes
+ pxor mm4, mm0 ; find odd value in the lowest bit of each byte
+ pand mm4, [mmx_01bytes] ; set the odd bit
+ psubusb mm1, mm4 ; decrease 1 from odd bytes
- pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
+ pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
- movq mm4, mm0
- pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
- punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
+ movq mm4, mm0
+ pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
+ punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
- psrlq mm2, 20h
- psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
- movq mm4, mm3
- psrlq mm4, 10h ; mm4 = [0 0 b a f e h j]
- pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
- psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
+ psrlq mm2, 20h
+ psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
+ movq mm4, mm3
+ psrlq mm4, 10h ; mm4 = [0 0 b a f e h j]
+ pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
+ psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
- movd [r0], mm2
- movd [r0+12], mm3
- psrlq mm3, 10h
- movd [r0+8], mm3
- psrlq mm3, 10h
- movd [r0+4], mm3
- WELSEMMS
- ret
+ movd [r0], mm2
+ movd [r0+12], mm3
+ psrlq mm3, 10h
+ movd [r0+8], mm3
+ psrlq mm3, 10h
+ movd [r0+4], mm3
+ WELSEMMS
+ ret
;***********************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; t3 will never been used
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; t3 will never been used
; destination:
-; |a |b |c |d |
-; |c |d |e |f |
-; |e |f |g |g |
-; |g |g |g |g |
+; |a |b |c |d |
+; |c |d |e |f |
+; |e |f |g |g |
+; |g |g |g |g |
; a = (1 + l0 + l1)>>1
; c = (1 + l1 + l2)>>1
@@ -756,70 +756,70 @@
; void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHU_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- movd mm0, [r1-4] ; mm0[3] = l0
- punpcklbw mm0, [r1+r2-4] ; mm0[7] = l1, mm0[6] = l0
- lea r1, [r1+2*r2]
- movd mm2, [r1-4] ; mm2[3] = l2
- movd mm4, [r1+r2-4] ; mm4[3] = l3
- punpcklbw mm2, mm4
- punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ movd mm0, [r1-4] ; mm0[3] = l0
+ punpcklbw mm0, [r1+r2-4] ; mm0[7] = l1, mm0[6] = l0
+ lea r1, [r1+2*r2]
+ movd mm2, [r1-4] ; mm2[3] = l2
+ movd mm4, [r1+r2-4] ; mm4[3] = l3
+ punpcklbw mm2, mm4
+ punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
- psrlq mm4, 18h
- psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
- psrlq mm0, 8h
- pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
+ psrlq mm4, 18h
+ psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
+ psrlq mm0, 8h
+ pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
- movq mm1, mm0
- psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
- movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
- pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
+ movq mm1, mm0
+ psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
+ movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
+ pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
- movq mm2, mm0
- psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
- movq mm5, mm2
- pavgb mm2, mm0
+ movq mm2, mm0
+ psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
+ movq mm5, mm2
+ pavgb mm2, mm0
- pxor mm5, mm0 ; find odd value in the lowest bit of each byte
- pand mm5, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm5 ; decrease 1 from odd bytes
+ pxor mm5, mm0 ; find odd value in the lowest bit of each byte
+ pand mm5, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm5 ; decrease 1 from odd bytes
- pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
+ pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
- psrlq mm2, 8h
- pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
+ psrlq mm2, 8h
+ pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
- punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
- punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
- punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
+ punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
+ punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
+ punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
- psrlq mm4, 20h
- movd [r0+12], mm4
+ psrlq mm4, 20h
+ movd [r0+12], mm4
- movd [r0], mm1
- psrlq mm1, 10h
- movd [r0+4], mm1
- psrlq mm1, 10h
- movd [r0+8], mm1
- WELSEMMS
- ret
+ movd [r0], mm1
+ psrlq mm1, 10h
+ movd [r0+4], mm1
+ psrlq mm1, 10h
+ movd [r0+8], mm1
+ WELSEMMS
+ ret
;***********************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; l3 will never been used
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; l3 will never been used
; destination:
-; |a |b |c |d |
-; |e |f |g |h |
-; |i |a |b |c |
-; |j |e |f |g |
+; |a |b |c |d |
+; |e |f |g |h |
+; |i |a |b |c |
+; |j |e |f |g |
; a = (1 + lt + t0)>>1
; b = (1 + t0 + t1)>>1
@@ -837,75 +837,75 @@
; void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVR_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movq mm0, [r1-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
- psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movq mm0, [r1-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+ psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
- movd mm1, [r1+2*r2-4]
- punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
- lea r1, [r1+2*r2]
- movq mm2, [r1+r2-8] ; mm2[7] = l2
- punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
- psrlq mm2, 28h
- pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
+ movd mm1, [r1+2*r2-4]
+ punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r1, [r1+2*r2]
+ movq mm2, [r1+r2-8] ; mm2[7] = l2
+ punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
+ psrlq mm2, 28h
+ pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
- movq mm1, mm0
- psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
- pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
+ movq mm1, mm0
+ psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
+ pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
- movq mm2, mm0
- psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
- movq mm3, mm2
- pavgb mm2, mm0
+ movq mm2, mm0
+ psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
+ movq mm3, mm2
+ pavgb mm2, mm0
- pxor mm3, mm0 ; find odd value in the lowest bit of each byte
- pand mm3, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm3 ; decrease 1 from odd bytes
+ pxor mm3, mm0 ; find odd value in the lowest bit of each byte
+ pand mm3, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm3 ; decrease 1 from odd bytes
- movq mm3, mm0
- psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
- pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
- movq mm2, mm3
+ movq mm3, mm0
+ psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
+ pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
+ movq mm2, mm3
- psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
- movd [r0], mm1
+ psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
+ movd [r0], mm1
- psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
- movd [r0+4], mm2
+ psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
+ movd [r0+4], mm2
- movq mm4, mm3
- psllq mm4, 20h
- psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
+ movq mm4, mm3
+ psllq mm4, 20h
+ psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
- movq mm5, mm3
- psllq mm5, 28h
- psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
+ movq mm5, mm3
+ psllq mm5, 28h
+ psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
- psllq mm1, 8h
- pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
- movd [r0+8], mm4
+ psllq mm1, 8h
+ pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
+ movd [r0+8], mm4
- psllq mm2, 8h
- pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
- movd [r0+12], mm5
- WELSEMMS
- ret
+ psllq mm2, 8h
+ pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
+ movd [r0+12], mm5
+ WELSEMMS
+ ret
;***********************************************************************
-; lt|t0|t1|t2|t3|t4|t5|t6|t7
-; l0|
-; l1|
-; l2|
-; l3|
-; lt,t0,t1,t2,t3 will never been used
+; lt|t0|t1|t2|t3|t4|t5|t6|t7
+; l0|
+; l1|
+; l2|
+; l3|
+; lt,t0,t1,t2,t3 will never been used
; destination:
-; |a |b |c |d |
-; |b |c |d |e |
-; |c |d |e |f |
-; |d |e |f |g |
+; |a |b |c |d |
+; |b |c |d |e |
+; |c |d |e |f |
+; |d |e |f |g |
; a = (2 + t0 + t2 + (t1<<1))>>2
; b = (2 + t1 + t3 + (t2<<1))>>2
@@ -921,54 +921,54 @@
; void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
- movq mm1, mm0
- movq mm2, mm0
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+ movq mm1, mm0
+ movq mm2, mm0
- movq mm3, mm0
- psrlq mm3, 38h
- psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
+ movq mm3, mm0
+ psrlq mm3, 38h
+ psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
- psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
- psrlq mm2, 8h
- pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
+ psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
+ psrlq mm2, 8h
+ pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
- movq mm3, mm1
- pavgb mm1, mm2
- pxor mm3, mm2 ; find odd value in the lowest bit of each byte
- pand mm3, [mmx_01bytes] ; set the odd bit
- psubusb mm1, mm3 ; decrease 1 from odd bytes
+ movq mm3, mm1
+ pavgb mm1, mm2
+ pxor mm3, mm2 ; find odd value in the lowest bit of each byte
+ pand mm3, [mmx_01bytes] ; set the odd bit
+ psubusb mm1, mm3 ; decrease 1 from odd bytes
- pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
+ pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
- psrlq mm0, 8h
- movd [r0], mm0
- psrlq mm0, 8h
- movd [r0+4], mm0
- psrlq mm0, 8h
- movd [r0+8], mm0
- psrlq mm0, 8h
- movd [r0+12], mm0
- WELSEMMS
- ret
+ psrlq mm0, 8h
+ movd [r0], mm0
+ psrlq mm0, 8h
+ movd [r0+4], mm0
+ psrlq mm0, 8h
+ movd [r0+8], mm0
+ psrlq mm0, 8h
+ movd [r0+12], mm0
+ WELSEMMS
+ ret
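
For the down-left diagonal, the register shuffling above replicates t7 past the end of the top row so the same 3-tap filter can be applied everywhere; each output row is the filtered top row shifted by one, matching the a..g pattern in the comments. Scalar sketch (helper name illustrative):

    #include <stdint.h>

    /* 4x4 diagonal-down-left prediction, output written with stride 4. */
    static void pred4x4_ddl_ref(uint8_t *pred, const uint8_t *ref, int stride) {
        const uint8_t *t = ref - stride;   /* t0..t7 */
        uint8_t f[7];                      /* the a..g values of the comments */
        for (int i = 0; i < 7; i++) {
            int t2 = (i + 2 < 8) ? t[i + 2] : t[7]; /* replicate t7 past the edge */
            f[i] = (uint8_t)((t[i] + 2 * t[i + 1] + t2 + 2) >> 2);
        }
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pred[y * 4 + x] = f[x + y];
    }
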
;***********************************************************************
-; lt|t0|t1|t2|t3|t4|t5|t6|t7
-; l0|
-; l1|
-; l2|
-; l3|
-; lt,t0,t1,t2,t3 will never been used
+; lt|t0|t1|t2|t3|t4|t5|t6|t7
+; l0|
+; l1|
+; l2|
+; l3|
+; lt,t0,t1,t2,t3 will never been used
; destination:
-; |a |b |c |d |
-; |e |f |g |h |
-; |b |c |d |i |
-; |f |g |h |j |
+; |a |b |c |d |
+; |e |f |g |h |
+; |b |c |d |i |
+; |f |g |h |j |
; a = (1 + t0 + t1)>>1
; b = (1 + t1 + t2)>>1
@@ -987,37 +987,37 @@
; void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVL_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
- movq mm1, mm0
- movq mm2, mm0
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+ movq mm1, mm0
+ movq mm2, mm0
- psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
- psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
+ psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
+ psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
- movq mm3, mm1
- pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
+ movq mm3, mm1
+ pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
- movq mm4, mm2
- pavgb mm2, mm0
- pxor mm4, mm0 ; find odd value in the lowest bit of each byte
- pand mm4, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm4 ; decrease 1 from odd bytes
+ movq mm4, mm2
+ pavgb mm2, mm0
+ pxor mm4, mm0 ; find odd value in the lowest bit of each byte
+ pand mm4, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm4 ; decrease 1 from odd bytes
- pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
+ pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
- movd [r0], mm3
- psrlq mm3, 8h
- movd [r0+8], mm3
+ movd [r0], mm3
+ psrlq mm3, 8h
+ movd [r0+8], mm3
- movd [r0+4], mm2
- psrlq mm2, 8h
- movd [r0+12], mm2
- WELSEMMS
- ret
+ movd [r0+4], mm2
+ psrlq mm2, 8h
+ movd [r0+12], mm2
+ WELSEMMS
+ ret
;***********************************************************************
;
@@ -1024,88 +1024,88 @@
; void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;***********************************************************************
WELS_EXTERN WelsIChromaPredDc_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movq mm0, [r1]
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movq mm0, [r1]
- movzx r3, byte [r1+r2-0x01] ; l1
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l2
- add r3, r4
- movzx r4, byte [r1+r2-0x01] ; l3
- add r3, r4
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l4
- add r3, r4
- movd mm1, r3d ; mm1 = l1+l2+l3+l4
+ movzx r3, byte [r1+r2-0x01] ; l1
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l2
+ add r3, r4
+ movzx r4, byte [r1+r2-0x01] ; l3
+ add r3, r4
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l4
+ add r3, r4
+ movd mm1, r3d ; mm1 = l1+l2+l3+l4
- movzx r3, byte [r1+r2-0x01] ; l5
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l6
- add r3, r4
- movzx r4, byte [r1+r2-0x01] ; l7
- add r3, r4
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l8
- add r3, r4
- movd mm2, r3d ; mm2 = l5+l6+l7+l8
+ movzx r3, byte [r1+r2-0x01] ; l5
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l6
+ add r3, r4
+ movzx r4, byte [r1+r2-0x01] ; l7
+ add r3, r4
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l8
+ add r3, r4
+ movd mm2, r3d ; mm2 = l5+l6+l7+l8
- movq mm3, mm0
- psrlq mm0, 0x20
- psllq mm3, 0x20
- psrlq mm3, 0x20
- pxor mm4, mm4
- psadbw mm0, mm4
- psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+ movq mm3, mm0
+ psrlq mm0, 0x20
+ psllq mm3, 0x20
+ psrlq mm3, 0x20
+ pxor mm4, mm4
+ psadbw mm0, mm4
+ psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
- paddq mm3, mm1
- movq mm1, mm2
- paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+ paddq mm3, mm1
+ movq mm1, mm2
+ paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
- movq mm4, [mmx_0x02]
+ movq mm4, [mmx_0x02]
- paddq mm0, mm4
- psrlq mm0, 0x02
+ paddq mm0, mm4
+ psrlq mm0, 0x02
- paddq mm2, mm4
- psrlq mm2, 0x02
+ paddq mm2, mm4
+ psrlq mm2, 0x02
- paddq mm3, mm4
- paddq mm3, mm4
- psrlq mm3, 0x03
+ paddq mm3, mm4
+ paddq mm3, mm4
+ psrlq mm3, 0x03
- paddq mm1, mm4
- paddq mm1, mm4
- psrlq mm1, 0x03
+ paddq mm1, mm4
+ paddq mm1, mm4
+ psrlq mm1, 0x03
- pmuludq mm0, [mmx_01bytes]
- pmuludq mm3, [mmx_01bytes]
- psllq mm0, 0x20
- pxor mm0, mm3 ; mm0 = m_up
+ pmuludq mm0, [mmx_01bytes]
+ pmuludq mm3, [mmx_01bytes]
+ psllq mm0, 0x20
+ pxor mm0, mm3 ; mm0 = m_up
- pmuludq mm2, [mmx_01bytes]
- pmuludq mm1, [mmx_01bytes]
- psllq mm1, 0x20
- pxor mm1, mm2 ; mm2 = m_down
+ pmuludq mm2, [mmx_01bytes]
+ pmuludq mm1, [mmx_01bytes]
+ psllq mm1, 0x20
+ pxor mm1, mm2 ; mm2 = m_down
- movq [r0], mm0
- movq [r0+0x08], mm0
- movq [r0+0x10], mm0
- movq [r0+0x18], mm0
+ movq [r0], mm0
+ movq [r0+0x08], mm0
+ movq [r0+0x10], mm0
+ movq [r0+0x18], mm0
- movq [r0+0x20], mm1
- movq [r0+0x28], mm1
- movq [r0+0x30], mm1
- movq [r0+0x38], mm1
+ movq [r0+0x20], mm1
+ movq [r0+0x28], mm1
+ movq [r0+0x30], mm1
+ movq [r0+0x38], mm1
- pop r4
- pop r3
- WELSEMMS
- ret
+ pop r4
+ pop r3
+ WELSEMMS
+ ret
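
WelsIChromaPredDc_sse2 computes a separate DC value per 4x4 quadrant: the top-left and bottom-right quadrants average both their top and left neighbours, while the other two use only the neighbours actually adjacent to them, which is what the commented sums and shifts above amount to. Scalar sketch (helper name illustrative; output written with stride 8 as the asm does):

    #include <stdint.h>

    static void pred_chroma_dc_ref(uint8_t *pred /* 8x8, stride 8 */,
                                   const uint8_t *ref, int stride) {
        int top[2] = {0, 0}, left[2] = {0, 0};
        for (int i = 0; i < 4; i++) {
            top[0]  += ref[-stride + i];
            top[1]  += ref[-stride + 4 + i];
            left[0] += ref[i * stride - 1];
            left[1] += ref[(4 + i) * stride - 1];
        }
        int dc[4] = {
            (top[0] + left[0] + 4) >> 3,  /* upper-left : top and left */
            (top[1] + 2) >> 2,            /* upper-right: top only     */
            (left[1] + 2) >> 2,           /* lower-left : left only    */
            (top[1] + left[1] + 4) >> 3   /* lower-right: top and left */
        };
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                pred[y * 8 + x] = (uint8_t)dc[(y >> 2) * 2 + (x >> 2)];
    }
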
@@ -1114,56 +1114,56 @@
; void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredDc_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movdqa xmm0, [r1] ; read one row
- pxor xmm1, xmm1
- psadbw xmm0, xmm1
- movdqa xmm1, xmm0
- psrldq xmm1, 0x08
- pslldq xmm0, 0x08
- psrldq xmm0, 0x08
- paddw xmm0, xmm1
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movdqa xmm0, [r1] ; read one row
+ pxor xmm1, xmm1
+ psadbw xmm0, xmm1
+ movdqa xmm1, xmm0
+ psrldq xmm1, 0x08
+ pslldq xmm0, 0x08
+ psrldq xmm0, 0x08
+ paddw xmm0, xmm1
- movzx r3, byte [r1+r2-0x01]
- movzx r4, byte [r1+2*r2-0x01]
- add r3, r4
- lea r1, [r1+r2]
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- add r3, 0x10
- movd xmm1, r3d
- paddw xmm0, xmm1
- psrld xmm0, 0x05
- pmuludq xmm0, [mmx_01bytes]
- pshufd xmm0, xmm0, 0
+ movzx r3, byte [r1+r2-0x01]
+ movzx r4, byte [r1+2*r2-0x01]
+ add r3, r4
+ lea r1, [r1+r2]
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ add r3, 0x10
+ movd xmm1, r3d
+ paddw xmm0, xmm1
+ psrld xmm0, 0x05
+ pmuludq xmm0, [mmx_01bytes]
+ pshufd xmm0, xmm0, 0
- movdqa [r0], xmm0
- movdqa [r0+0x10], xmm0
- movdqa [r0+0x20], xmm0
- movdqa [r0+0x30], xmm0
- movdqa [r0+0x40], xmm0
- movdqa [r0+0x50], xmm0
- movdqa [r0+0x60], xmm0
- movdqa [r0+0x70], xmm0
- movdqa [r0+0x80], xmm0
- movdqa [r0+0x90], xmm0
- movdqa [r0+0xa0], xmm0
- movdqa [r0+0xb0], xmm0
- movdqa [r0+0xc0], xmm0
- movdqa [r0+0xd0], xmm0
- movdqa [r0+0xe0], xmm0
- movdqa [r0+0xf0], xmm0
+ movdqa [r0], xmm0
+ movdqa [r0+0x10], xmm0
+ movdqa [r0+0x20], xmm0
+ movdqa [r0+0x30], xmm0
+ movdqa [r0+0x40], xmm0
+ movdqa [r0+0x50], xmm0
+ movdqa [r0+0x60], xmm0
+ movdqa [r0+0x70], xmm0
+ movdqa [r0+0x80], xmm0
+ movdqa [r0+0x90], xmm0
+ movdqa [r0+0xa0], xmm0
+ movdqa [r0+0xb0], xmm0
+ movdqa [r0+0xc0], xmm0
+ movdqa [r0+0xd0], xmm0
+ movdqa [r0+0xe0], xmm0
+ movdqa [r0+0xf0], xmm0
- pop r4
- pop r3
- ret
\ No newline at end of file
+ pop r4
+ pop r3
+ ret
\ No newline at end of file
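
WelsI16x16LumaPredDc_sse2 is the same idea at macroblock scale: one psadbw over the sixteen top pixels, sixteen left pixels accumulated through LOAD_2_LEFT_AND_ADD, then (sum + 16) >> 5 broadcast over the block. Scalar sketch (helper name illustrative):

    #include <stdint.h>
    #include <string.h>

    /* 16x16 DC prediction: (16 top + 16 left + 16) >> 5, filling the block. */
    static void pred16x16_dc_ref(uint8_t *pred /* stride 16 */,
                                 const uint8_t *ref, int stride) {
        int sum = 16;
        for (int i = 0; i < 16; i++)
            sum += ref[-stride + i] + ref[i * stride - 1];
        memset(pred, sum >> 5, 256);
    }
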
--- a/codec/encoder/core/x86/matrix_transpose.asm
+++ b/codec/encoder/core/x86/matrix_transpose.asm
@@ -34,153 +34,153 @@
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE_8x8B_MMX 10
- MMX_XSwap bw, %1, %2, %8
- MMX_XSwap bw, %3, %4, %2
- MMX_XSwap bw, %5, %6, %4
- movq %6, %9
- movq %10, %4
- MMX_XSwap bw, %7, %6, %4
+ MMX_XSwap bw, %1, %2, %8
+ MMX_XSwap bw, %3, %4, %2
+ MMX_XSwap bw, %5, %6, %4
+ movq %6, %9
+ movq %10, %4
+ MMX_XSwap bw, %7, %6, %4
- MMX_XSwap wd, %1, %3, %6
- MMX_XSwap wd, %8, %2, %3
- MMX_XSwap wd, %5, %7, %2
- movq %7, %10
- movq %10, %3
- MMX_XSwap wd, %7, %4, %3
+ MMX_XSwap wd, %1, %3, %6
+ MMX_XSwap wd, %8, %2, %3
+ MMX_XSwap wd, %5, %7, %2
+ movq %7, %10
+ movq %10, %3
+ MMX_XSwap wd, %7, %4, %3
- MMX_XSwap dq, %1, %5, %4
- MMX_XSwap dq, %6, %2, %5
- MMX_XSwap dq, %8, %7, %2
- movq %7, %10
- movq %10, %5
- MMX_XSwap dq, %7, %3, %5
+ MMX_XSwap dq, %1, %5, %4
+ MMX_XSwap dq, %6, %2, %5
+ MMX_XSwap dq, %8, %7, %2
+ movq %7, %10
+ movq %10, %5
+ MMX_XSwap dq, %7, %3, %5
- movq %3, %10
+ movq %3, %10
%endmacro
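
TRANSPOSE_8x8B_MMX builds an 8x8 byte transpose out of three rounds of punpck interleaves (byte, word, dword), spilling one register through a scratch slot since MMX only has eight registers. The end result is nothing more than the following (helper name illustrative):

    #include <stdint.h>

    /* 8x8 byte transpose: out[j][i] = in[i][j]. */
    static void transpose8x8b_ref(uint8_t out[8][8], const uint8_t in[8][8]) {
        for (int i = 0; i < 8; i++)
            for (int j = 0; j < 8; j++)
                out[j][i] = in[i][j];
    }
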
;in: m0, m3, m5, m2, m7, m1, m6, m4
-%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
- movq [%1], mm0 ; result of line 1, x8 bytes
- movq [%1+%2], mm3 ; result of line 2
- lea %1, [%1+2*%2]
- movq [%1], mm5 ; result of line 3
- movq [%1+%2], mm2 ; result of line 4
- lea %1, [%1+2*%2]
- movq [%1], mm7 ; result of line 5
- movq [%1+%2], mm1 ; result of line 6
- lea %1, [%1+2*%2]
- movq [%1], mm6 ; result of line 7
- movq [%1+%2], mm4 ; result of line 8
+%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
+ movq [%1], mm0 ; result of line 1, x8 bytes
+ movq [%1+%2], mm3 ; result of line 2
+ lea %1, [%1+2*%2]
+ movq [%1], mm5 ; result of line 3
+ movq [%1+%2], mm2 ; result of line 4
+ lea %1, [%1+2*%2]
+ movq [%1], mm7 ; result of line 5
+ movq [%1+%2], mm1 ; result of line 6
+ lea %1, [%1+2*%2]
+ movq [%1], mm6 ; result of line 7
+ movq [%1+%2], mm4 ; result of line 8
%endmacro
;in: m0, m3, m5, m2, m7, m1, m6, m4
-%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
- movq [%1], mm0 ; result of line 1, x8 bytes
- movq [%1+%2], mm3 ; result of line 2
- lea %3, [%1+2*%2]
- movq [%3], mm5 ; result of line 3
- movq [%3+%2], mm2 ; result of line 4
- lea %3, [%3+2*%2]
- movq [%3], mm7 ; result of line 5
- movq [%3+%2], mm1 ; result of line 6
- lea %3, [%3+2*%2]
- movq [%3], mm6 ; result of line 7
- movq [%3+%2], mm4 ; result of line 8
-%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX
+%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
+ movq [%1], mm0 ; result of line 1, x8 bytes
+ movq [%1+%2], mm3 ; result of line 2
+ lea %3, [%1+2*%2]
+ movq [%3], mm5 ; result of line 3
+ movq [%3+%2], mm2 ; result of line 4
+ lea %3, [%3+2*%2]
+ movq [%3], mm7 ; result of line 5
+ movq [%3+%2], mm1 ; result of line 6
+ lea %3, [%3+2*%2]
+ movq [%3], mm6 ; result of line 7
+ movq [%3+%2], mm4 ; result of line 8
+%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX
; for transpose 16x8
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
-%macro TRANSPOSE_8x16B_SSE2 10
- SSE2_XSawp bw, %1, %2, %8
- SSE2_XSawp bw, %3, %4, %2
- SSE2_XSawp bw, %5, %6, %4
- movdqa %6, %9
- movdqa %10, %4
- SSE2_XSawp bw, %7, %6, %4
+%macro TRANSPOSE_8x16B_SSE2 10
+ SSE2_XSawp bw, %1, %2, %8
+ SSE2_XSawp bw, %3, %4, %2
+ SSE2_XSawp bw, %5, %6, %4
+ movdqa %6, %9
+ movdqa %10, %4
+ SSE2_XSawp bw, %7, %6, %4
- SSE2_XSawp wd, %1, %3, %6
- SSE2_XSawp wd, %8, %2, %3
- SSE2_XSawp wd, %5, %7, %2
- movdqa %7, %10
- movdqa %10, %3
- SSE2_XSawp wd, %7, %4, %3
+ SSE2_XSawp wd, %1, %3, %6
+ SSE2_XSawp wd, %8, %2, %3
+ SSE2_XSawp wd, %5, %7, %2
+ movdqa %7, %10
+ movdqa %10, %3
+ SSE2_XSawp wd, %7, %4, %3
- SSE2_XSawp dq, %1, %5, %4
- SSE2_XSawp dq, %6, %2, %5
- SSE2_XSawp dq, %8, %7, %2
- movdqa %7, %10
- movdqa %10, %5
- SSE2_XSawp dq, %7, %3, %5
+ SSE2_XSawp dq, %1, %5, %4
+ SSE2_XSawp dq, %6, %2, %5
+ SSE2_XSawp dq, %8, %7, %2
+ movdqa %7, %10
+ movdqa %10, %5
+ SSE2_XSawp dq, %7, %3, %5
- SSE2_XSawp qdq, %1, %8, %3
- SSE2_XSawp qdq, %4, %2, %8
- SSE2_XSawp qdq, %6, %7, %2
- movdqa %7, %10
- movdqa %10, %1
- SSE2_XSawp qdq, %7, %5, %1
- movdqa %5, %10
-%endmacro ; end of TRANSPOSE_8x16B_SSE2
+ SSE2_XSawp qdq, %1, %8, %3
+ SSE2_XSawp qdq, %4, %2, %8
+ SSE2_XSawp qdq, %6, %7, %2
+ movdqa %7, %10
+ movdqa %10, %1
+ SSE2_XSawp qdq, %7, %5, %1
+ movdqa %5, %10
+%endmacro ; end of TRANSPOSE_8x16B_SSE2
-%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride
- movq [%1], xmm4 ; result of line 1, x8 bytes
- movq [%1+%2], xmm2 ; result of line 2
- lea %1, [%1+2*%2]
- movq [%1], xmm3 ; result of line 3
- movq [%1+%2], xmm7 ; result of line 4
+%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride
+ movq [%1], xmm4 ; result of line 1, x8 bytes
+ movq [%1+%2], xmm2 ; result of line 2
+ lea %1, [%1+2*%2]
+ movq [%1], xmm3 ; result of line 3
+ movq [%1+%2], xmm7 ; result of line 4
- lea %1, [%1+2*%2]
- movq [%1], xmm5 ; result of line 5
- movq [%1+%2], xmm1 ; result of line 6
- lea %1, [%1+2*%2]
- movq [%1], xmm6 ; result of line 7
- movq [%1+%2], xmm0 ; result of line 8
+ lea %1, [%1+2*%2]
+ movq [%1], xmm5 ; result of line 5
+ movq [%1+%2], xmm1 ; result of line 6
+ lea %1, [%1+2*%2]
+ movq [%1], xmm6 ; result of line 7
+ movq [%1+%2], xmm0 ; result of line 8
- lea %1, [%1+2*%2]
- movhpd [%1], xmm4 ; result of line 9
- movhpd [%1+%2], xmm2 ; result of line 10
- lea %1, [%1+2*%2]
- movhpd [%1], xmm3 ; result of line 11
- movhpd [%1+%2], xmm7 ; result of line 12
+ lea %1, [%1+2*%2]
+ movhpd [%1], xmm4 ; result of line 9
+ movhpd [%1+%2], xmm2 ; result of line 10
+ lea %1, [%1+2*%2]
+ movhpd [%1], xmm3 ; result of line 11
+ movhpd [%1+%2], xmm7 ; result of line 12
- lea %1, [%1+2*%2]
- movhpd [%1], xmm5 ; result of line 13
- movhpd [%1+%2], xmm1 ; result of line 14
- lea %1, [%1+2*%2]
- movhpd [%1], xmm6 ; result of line 15
- movhpd [%1+%2], xmm0 ; result of line 16
-%endmacro ; end of TRANSPOSE_WRITE_RESULT_SSE2
+ lea %1, [%1+2*%2]
+ movhpd [%1], xmm5 ; result of line 13
+ movhpd [%1+%2], xmm1 ; result of line 14
+ lea %1, [%1+2*%2]
+ movhpd [%1], xmm6 ; result of line 15
+ movhpd [%1+%2], xmm0 ; result of line 16
+%endmacro ; end of TRANSPOSE_WRITE_RESULT_SSE2
-%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32
- movq [%1], xmm4 ; result of line 1, x8 bytes
- movq [%1+%2], xmm2 ; result of line 2
- lea %3, [%1+2*%2]
- movq [%3], xmm3 ; result of line 3
- movq [%3+%2], xmm7 ; result of line 4
+%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32
+ movq [%1], xmm4 ; result of line 1, x8 bytes
+ movq [%1+%2], xmm2 ; result of line 2
+ lea %3, [%1+2*%2]
+ movq [%3], xmm3 ; result of line 3
+ movq [%3+%2], xmm7 ; result of line 4
- lea %3, [%3+2*%2]
- movq [%3], xmm5 ; result of line 5
- movq [%3+%2], xmm1 ; result of line 6
- lea %3, [%3+2*%2]
- movq [%3], xmm6 ; result of line 7
- movq [%3+%2], xmm0 ; result of line 8
+ lea %3, [%3+2*%2]
+ movq [%3], xmm5 ; result of line 5
+ movq [%3+%2], xmm1 ; result of line 6
+ lea %3, [%3+2*%2]
+ movq [%3], xmm6 ; result of line 7
+ movq [%3+%2], xmm0 ; result of line 8
- lea %3, [%3+2*%2]
- movhpd [%3], xmm4 ; result of line 9
- movhpd [%3+%2], xmm2 ; result of line 10
- lea %3, [%3+2*%2]
- movhpd [%3], xmm3 ; result of line 11
- movhpd [%3+%2], xmm7 ; result of line 12
+ lea %3, [%3+2*%2]
+ movhpd [%3], xmm4 ; result of line 9
+ movhpd [%3+%2], xmm2 ; result of line 10
+ lea %3, [%3+2*%2]
+ movhpd [%3], xmm3 ; result of line 11
+ movhpd [%3+%2], xmm7 ; result of line 12
- lea %3, [%3+2*%2]
- movhpd [%3], xmm5 ; result of line 13
- movhpd [%3+%2], xmm1 ; result of line 14
- lea %3, [%3+2*%2]
- movhpd [%3], xmm6 ; result of line 15
- movhpd [%3+%2], xmm0 ; result of line 16
-%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2
+ lea %3, [%3+2*%2]
+ movhpd [%3], xmm5 ; result of line 13
+ movhpd [%3+%2], xmm1 ; result of line 14
+ lea %3, [%3+2*%2]
+ movhpd [%3], xmm6 ; result of line 15
+ movhpd [%3+%2], xmm0 ; result of line 16
+%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2
SECTION .text
@@ -187,209 +187,209 @@
WELS_EXTERN TransposeMatrixBlock16x16_sse2
; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
- push r4
- push r5
- %assign push_num 2
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
+ push r4
+ push r5
+ %assign push_num 2
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
- mov r4, r7
- and r4, 0Fh
- sub r7, 10h
- sub r7, r4
- lea r5, [r3+r3*2]
- ; top 8x16 block
- movdqa xmm0, [r2]
- movdqa xmm1, [r2+r3]
- movdqa xmm2, [r2+r3*2]
- movdqa xmm3, [r2+r5]
- lea r2, [r2+r3*4]
- movdqa xmm4, [r2]
- movdqa xmm5, [r2+r3]
- movdqa xmm6, [r2+r3*2]
+ mov r4, r7
+ and r4, 0Fh
+ sub r7, 10h
+ sub r7, r4
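+    ; r7 is now aligned down to a 16-byte boundary with 16 bytes reserved;
+    ; [r7] serves as the aligned spill slot passed to TRANSPOSE_8x16B_SSE2 below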
+ lea r5, [r3+r3*2]
+ ; top 8x16 block
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+r3*2]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+r3*4]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+r3*2]
- ;in: m0, m1, m2, m3, m4, m5, m6, m7
- ;out: m4, m2, m3, m7, m5, m1, m6, m0
- TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m4, m2, m3, m7, m5, m1, m6, m0
+ TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
- TRANSPOSE8x16_WRITE_SSE2 r0, r1
+ TRANSPOSE8x16_WRITE_SSE2 r0, r1
- ; bottom 8x16 block
- lea r2, [r2+r3*4]
- movdqa xmm0, [r2]
- movdqa xmm1, [r2+r3]
- movdqa xmm2, [r2+r3*2]
- movdqa xmm3, [r2+r5]
- lea r2, [r2+r3*4]
- movdqa xmm4, [r2]
- movdqa xmm5, [r2+r3]
- movdqa xmm6, [r2+r3*2]
+ ; bottom 8x16 block
+ lea r2, [r2+r3*4]
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+r3*2]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+r3*4]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+r3*2]
- ;in: m0, m1, m2, m3, m4, m5, m6, m7
- ;out: m4, m2, m3, m7, m5, m1, m6, m0
- TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m4, m2, m3, m7, m5, m1, m6, m0
+ TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
- mov r5, r1
- sal r5, 4
- sub r0, r5
- lea r0, [r0+r1*2+8]
- TRANSPOSE8x16_WRITE_SSE2 r0, r1
+ mov r5, r1
+ sal r5, 4
+ sub r0, r5
+ lea r0, [r0+r1*2+8]
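+    ; r0 now points back at dst row 0, byte offset 8, so the bottom source
+    ; half lands in the right half of the transposed 16x16 block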
+ TRANSPOSE8x16_WRITE_SSE2 r0, r1
- add r7, r4
- add r7, 10h
- POP_XMM
- LOAD_4_PARA_POP
- pop r5
- pop r4
- ret
+ add r7, r4
+ add r7, 10h
+ POP_XMM
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
WELS_EXTERN TransposeMatrixBlocksx16_sse2
; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
- push r5
- push r6
- %assign push_num 2
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- mov r5, r7
- and r5, 0Fh
- sub r7, 10h
- sub r7, r5
+ push r5
+ push r6
+ %assign push_num 2
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ mov r5, r7
+ and r5, 0Fh
+ sub r7, 10h
+ sub r7, r5
TRANSPOSE_LOOP_SSE2:
- ; explictly loading next loop data
- lea r6, [r2+r3*8]
- push r4
+    ; explicitly load the next loop's data
+ lea r6, [r2+r3*8]
+ push r4
%rep 8
- mov r4, [r6]
- mov r4, [r6+r3]
- lea r6, [r6+r3*2]
+ mov r4, [r6]
+ mov r4, [r6+r3]
+ lea r6, [r6+r3*2]
%endrep
- pop r4
- ; top 8x16 block
- movdqa xmm0, [r2]
- movdqa xmm1, [r2+r3]
- lea r2, [r2+r3*2]
- movdqa xmm2, [r2]
- movdqa xmm3, [r2+r3]
- lea r2, [r2+r3*2]
- movdqa xmm4, [r2]
- movdqa xmm5, [r2+r3]
- lea r2, [r2+r3*2]
- movdqa xmm6, [r2]
+ pop r4
+ ; top 8x16 block
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm2, [r2]
+ movdqa xmm3, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm6, [r2]
- ;in: m0, m1, m2, m3, m4, m5, m6, m7
- ;out: m4, m2, m3, m7, m5, m1, m6, m0
- TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
- TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
- lea r2, [r2+r3*2]
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m4, m2, m3, m7, m5, m1, m6, m0
+ TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
+ TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
+ lea r2, [r2+r3*2]
- ; bottom 8x16 block
- movdqa xmm0, [r2]
- movdqa xmm1, [r2+r3]
- lea r2, [r2+r3*2]
- movdqa xmm2, [r2]
- movdqa xmm3, [r2+r3]
- lea r2, [r2+r3*2]
- movdqa xmm4, [r2]
- movdqa xmm5, [r2+r3]
- lea r2, [r2+r3*2]
- movdqa xmm6, [r2]
+ ; bottom 8x16 block
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm2, [r2]
+ movdqa xmm3, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm6, [r2]
- ;in: m0, m1, m2, m3, m4, m5, m6, m7
- ;out: m4, m2, m3, m7, m5, m1, m6, m0
- TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
- TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6
- lea r2, [r2+r3*2]
- lea r0, [r0+16]
- dec r4
- jg near TRANSPOSE_LOOP_SSE2
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m4, m2, m3, m7, m5, m1, m6, m0
+ TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
+ TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6
+ lea r2, [r2+r3*2]
+ lea r0, [r0+16]
+ dec r4
+ jg near TRANSPOSE_LOOP_SSE2
- add r7, r5
- add r7, 10h
- POP_XMM
- LOAD_5_PARA_POP
- pop r6
- pop r5
- ret
+ add r7, r5
+ add r7, 10h
+ POP_XMM
+ LOAD_5_PARA_POP
+ pop r6
+ pop r5
+ ret
WELS_EXTERN TransposeMatrixBlock8x8_mmx
; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- sub r7, 8
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ sub r7, 8
- movq mm0, [r2]
- movq mm1, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm2, [r2]
- movq mm3, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm4, [r2]
- movq mm5, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm6, [r2]
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
- ;in: m0, m1, m2, m3, m4, m5, m6, m7
- ;out: m0, m3, m5, m2, m7, m1, m6, m4
- TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m0, m3, m5, m2, m7, m1, m6, m4
+ TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
- TRANSPOSE8x8_WRITE_MMX r0, r1
+ TRANSPOSE8x8_WRITE_MMX r0, r1
- emms
- add r7, 8
- LOAD_4_PARA_POP
- ret
+ emms
+ add r7, 8
+ LOAD_4_PARA_POP
+ ret
WELS_EXTERN TransposeMatrixBlocksx8_mmx
; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
- push r5
- push r6
- %assign push_num 2
- LOAD_5_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- sub r7, 8
+ push r5
+ push r6
+ %assign push_num 2
+ LOAD_5_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ sub r7, 8
- lea r5, [r2+r3*8]
+ lea r5, [r2+r3*8]
TRANSPOSE_BLOCKS_X8_LOOP_MMX:
- ; explictly loading next loop data
+    ; explicitly load the next loop's data
%rep 4
- mov r6, [r5]
- mov r6, [r5+r3]
- lea r5, [r5+r3*2]
+ mov r6, [r5]
+ mov r6, [r5+r3]
+ lea r5, [r5+r3*2]
%endrep
- movq mm0, [r2]
- movq mm1, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm2, [r2]
- movq mm3, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm4, [r2]
- movq mm5, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm6, [r2]
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
- ;in: m0, m1, m2, m3, m4, m5, m6, m7
- ;out: m0, m3, m5, m2, m7, m1, m6, m4
- TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m0, m3, m5, m2, m7, m1, m6, m4
+ TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
- TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
- lea r0, [r0+8]
- lea r2, [r2+2*r3]
- dec r4
- jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
+ TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
+ lea r0, [r0+8]
+ lea r2, [r2+2*r3]
+ dec r4
+ jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
- emms
- add r7, 8
- LOAD_5_PARA_POP
- pop r6
- pop r5
- ret
+ emms
+ add r7, 8
+ LOAD_5_PARA_POP
+ pop r6
+ pop r5
+ ret
--- a/codec/encoder/core/x86/memzero.asm
+++ b/codec/encoder/core/x86/memzero.asm
@@ -51,10 +51,10 @@
;void WelsPrefetchZero_mmx(int8_t const*_A);
;***********************************************************************
WELS_EXTERN WelsPrefetchZero_mmx
- %assign push_num 0
- LOAD_1_PARA
- prefetchnta [r0]
- ret
+ %assign push_num 0
+ LOAD_1_PARA
+ prefetchnta [r0]
+ ret
;***********************************************************************
@@ -62,23 +62,23 @@
;***********************************************************************
WELS_EXTERN WelsSetMemZeroAligned64_sse2
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- neg r1
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ neg r1
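+    ; the size in r1 is negated so the loop below can add 0x40 per 64-byte
+    ; store pass and stop when the counter reaches zero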
- pxor xmm0, xmm0
+ pxor xmm0, xmm0
.memzeroa64_sse2_loops:
- movdqa [r0], xmm0
- movdqa [r0+16], xmm0
- movdqa [r0+32], xmm0
- movdqa [r0+48], xmm0
- add r0, 0x40
+ movdqa [r0], xmm0
+ movdqa [r0+16], xmm0
+ movdqa [r0+32], xmm0
+ movdqa [r0+48], xmm0
+ add r0, 0x40
- add r1, 0x40
- jnz near .memzeroa64_sse2_loops
+ add r1, 0x40
+ jnz near .memzeroa64_sse2_loops
- ret
+ ret
;***********************************************************************
; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -85,28 +85,28 @@
;***********************************************************************
WELS_EXTERN WelsSetMemZeroSize64_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- neg r1
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ neg r1
- pxor mm0, mm0
+ pxor mm0, mm0
.memzero64_mmx_loops:
- movq [r0], mm0
- movq [r0+8], mm0
- movq [r0+16], mm0
- movq [r0+24], mm0
- movq [r0+32], mm0
- movq [r0+40], mm0
- movq [r0+48], mm0
- movq [r0+56], mm0
- add r0, 0x40
+ movq [r0], mm0
+ movq [r0+8], mm0
+ movq [r0+16], mm0
+ movq [r0+24], mm0
+ movq [r0+32], mm0
+ movq [r0+40], mm0
+ movq [r0+48], mm0
+ movq [r0+56], mm0
+ add r0, 0x40
- add r1, 0x40
- jnz near .memzero64_mmx_loops
+ add r1, 0x40
+ jnz near .memzero64_mmx_loops
- WELSEMMS
- ret
+ WELSEMMS
+ ret
;***********************************************************************
; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
@@ -113,20 +113,20 @@
;***********************************************************************
WELS_EXTERN WelsSetMemZeroSize8_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- neg r1
- pxor mm0, mm0
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENSION r1, r1d
+ neg r1
+ pxor mm0, mm0
.memzero8_mmx_loops:
- movq [r0], mm0
- add r0, 0x08
+ movq [r0], mm0
+ add r0, 0x08
- add r1, 0x08
- jnz near .memzero8_mmx_loops
+ add r1, 0x08
+ jnz near .memzero8_mmx_loops
- WELSEMMS
- ret
+ WELSEMMS
+ ret
--- a/codec/encoder/core/x86/quant.asm
+++ b/codec/encoder/core/x86/quant.asm
@@ -49,140 +49,140 @@
;************************************************
%macro SSE2_Quant8 5
- MOVDQ %1, %5
- pxor %2, %2
- pcmpgtw %2, %1
- pxor %1, %2
- psubw %1, %2
- paddusw %1, %3
- pmulhuw %1, %4
- pxor %1, %2
- psubw %1, %2
- MOVDQ %5, %1
+ MOVDQ %1, %5
+ pxor %2, %2
+ pcmpgtw %2, %1
+ pxor %1, %2
+ psubw %1, %2
+ paddusw %1, %3
+ pmulhuw %1, %4
+ pxor %1, %2
+ psubw %1, %2
+ MOVDQ %5, %1
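+    ; quantizes 8 coefficients in place: take the absolute value (xor/sub with
+    ; the sign mask), add the rounding offset %3, scale by %4 with pmulhuw
+    ; (i.e. (|x| + ff) * mf >> 16), then restore the original sign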
%endmacro
%macro SSE2_QuantMax8 6
- MOVDQ %1, %5
- pxor %2, %2
- pcmpgtw %2, %1
- pxor %1, %2
- psubw %1, %2
- paddusw %1, %3
- pmulhuw %1, %4
- pmaxsw %6, %1
- pxor %1, %2
- psubw %1, %2
- MOVDQ %5, %1
+ MOVDQ %1, %5
+ pxor %2, %2
+ pcmpgtw %2, %1
+ pxor %1, %2
+ psubw %1, %2
+ paddusw %1, %3
+ pmulhuw %1, %4
+ pmaxsw %6, %1
+ pxor %1, %2
+ psubw %1, %2
+ MOVDQ %5, %1
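+    ; same quantization as SSE2_Quant8, but additionally tracks the running
+    ; per-lane maximum of the quantized magnitudes in %6 via pmaxsw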
%endmacro
-%define pDct esp + 4
-%define ff esp + 8
-%define mf esp + 12
-%define max esp + 16
+%define pDct esp + 4
+%define ff esp + 8
+%define mf esp + 12
+%define max esp + 16
;***********************************************************************
-; void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
+; void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;***********************************************************************
WELS_EXTERN WelsQuant4x4_sse2
- %assign push_num 0
- LOAD_3_PARA
- movdqa xmm2, [r1]
- movdqa xmm3, [r2]
+ %assign push_num 0
+ LOAD_3_PARA
+ movdqa xmm2, [r1]
+ movdqa xmm3, [r2]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
- ret
+ ret
;***********************************************************************
;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
;***********************************************************************
WELS_EXTERN WelsQuant4x4Dc_sse2
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSIONW r1, r1w
- SIGN_EXTENSIONW r2, r2w
- SSE2_Copy8Times xmm3, r2d
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSIONW r1, r1w
+ SIGN_EXTENSIONW r2, r2w
+ SSE2_Copy8Times xmm3, r2d
- SSE2_Copy8Times xmm2, r1d
+ SSE2_Copy8Times xmm2, r1d
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
- ret
+ ret
;***********************************************************************
-; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
+; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4_sse2
- %assign push_num 0
- LOAD_3_PARA
- MOVDQ xmm2, [r1]
- MOVDQ xmm3, [r2]
+ %assign push_num 0
+ LOAD_3_PARA
+ MOVDQ xmm2, [r1]
+ MOVDQ xmm3, [r2]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
- ret
+ ret
;***********************************************************************
-; void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f, int16_t *mf, int16_t *max);
+; void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f, int16_t *mf, int16_t *max);
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4Max_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- MOVDQ xmm2, [r1]
- MOVDQ xmm3, [r2]
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ MOVDQ xmm2, [r1]
+ MOVDQ xmm3, [r2]
- pxor xmm4, xmm4
- pxor xmm5, xmm5
- pxor xmm6, xmm6
- pxor xmm7, xmm7
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 ], xmm4
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
+ pxor xmm4, xmm4
+ pxor xmm5, xmm5
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 ], xmm4
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
- SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
- pmaxsw xmm0, xmm4
- pmaxsw xmm0, xmm5
- pmaxsw xmm0, xmm7
- movdqa xmm1, xmm0
- punpckhqdq xmm0, xmm1
- pmaxsw xmm0, xmm1
+ SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
+ pmaxsw xmm0, xmm4
+ pmaxsw xmm0, xmm5
+ pmaxsw xmm0, xmm7
+ movdqa xmm1, xmm0
+ punpckhqdq xmm0, xmm1
+ pmaxsw xmm0, xmm1
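+    ; the transpose plus pmaxsw chain reduces the per-lane maxima collected in
+    ; xmm4-xmm7 to per-4x4-block maxima in the low words of xmm0, stored below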
- movq [r3], xmm0
- POP_XMM
- LOAD_4_PARA_POP
- ret
+ movq [r3], xmm0
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
-%macro MMX_Copy4Times 2
- movd %1, %2
- punpcklwd %1, %1
- punpckldq %1, %1
+%macro MMX_Copy4Times 2
+ movd %1, %2
+ punpcklwd %1, %1
+ punpckldq %1, %1
%endmacro
SECTION .text
%macro MMX_Quant4 4
- pxor %2, %2
- pcmpgtw %2, %1
- pxor %1, %2
- psubw %1, %2
- paddusw %1, %3
- pmulhuw %1, %4
- pxor %1, %2
- psubw %1, %2
+ pxor %2, %2
+ pcmpgtw %2, %1
+ pxor %1, %2
+ psubw %1, %2
+ paddusw %1, %3
+ pmulhuw %1, %4
+ pxor %1, %2
+ psubw %1, %2
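+    ; MMX version of the same sign-magnitude quantization:
+    ; %1 = sign(%1) * ((|%1| + %3) * %4 >> 16) on four 16-bit values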
%endmacro
;***********************************************************************
@@ -189,101 +189,101 @@
;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2_mmx
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENSIONW r1, r1w
- SIGN_EXTENSIONW r2, r2w
- movd mm0, [r0]
- movd mm1, [r0 + 0x20]
- punpcklwd mm0, mm1
- movd mm3, [r0 + 0x40]
- movd mm1, [r0 + 0x60]
- punpcklwd mm3, mm1
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENSIONW r1, r1w
+ SIGN_EXTENSIONW r2, r2w
+ movd mm0, [r0]
+ movd mm1, [r0 + 0x20]
+ punpcklwd mm0, mm1
+ movd mm3, [r0 + 0x40]
+ movd mm1, [r0 + 0x60]
+ punpcklwd mm3, mm1
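+    ; the movd/punpcklwd loads above gather the DC coefficient of each of the
+    ; four 4x4 blocks (spaced 0x20 bytes apart in rs) into mm0 and mm3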
- ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
- movq mm5, mm3
- paddw mm3, mm0
- psubw mm0, mm5
- punpcklwd mm3, mm0
- movq mm1, mm3
- psrlq mm1, 32
- movq mm5, mm1
- paddw mm1, mm3
- psubw mm3, mm5
- punpcklwd mm1, mm3
+ ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
+ movq mm5, mm3
+ paddw mm3, mm0
+ psubw mm0, mm5
+ punpcklwd mm3, mm0
+ movq mm1, mm3
+ psrlq mm1, 32
+ movq mm5, mm1
+ paddw mm1, mm3
+ psubw mm3, mm5
+ punpcklwd mm1, mm3
- ;quant_2x2_dc
- MMX_Copy4Times mm3, r2d
- MMX_Copy4Times mm2, r1d
- MMX_Quant4 mm1, mm0, mm2, mm3
+ ;quant_2x2_dc
+ MMX_Copy4Times mm3, r2d
+ MMX_Copy4Times mm2, r1d
+ MMX_Quant4 mm1, mm0, mm2, mm3
- ; store dct_2x2
- movq [r3], mm1
- movq [r4], mm1
+ ; store dct_2x2
+ movq [r3], mm1
+ movq [r4], mm1
- ; pNonZeroCount of dct_2x2
- pcmpeqb mm2, mm2 ; mm2 = FF
- pxor mm3, mm3
- packsswb mm1, mm3
- pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
- psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
- psadbw mm1, mm3 ;
- mov r1w, 0
- mov [r0], r1w
- mov [r0 + 0x20], r1w
- mov [r0 + 0x40], r1w
- mov [r0 + 0x60], r1w
+ ; pNonZeroCount of dct_2x2
+ pcmpeqb mm2, mm2 ; mm2 = FF
+ pxor mm3, mm3
+ packsswb mm1, mm3
+ pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
+ psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
+ psadbw mm1, mm3 ;
+ mov r1w, 0
+ mov [r0], r1w
+ mov [r0 + 0x20], r1w
+ mov [r0 + 0x40], r1w
+ mov [r0 + 0x60], r1w
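+    ; zero the four DC positions in rs now that dct_2x2 has been written out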
- movd retrd, mm1
+ movd retrd, mm1
- WELSEMMS
- LOAD_5_PARA_POP
- ret
+ WELSEMMS
+ LOAD_5_PARA_POP
+ ret
;***********************************************************************
;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff, int16_t mf);
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSIONW r1, r1w
- SIGN_EXTENSIONW r2, r2w
- movd mm0, [r0]
- movd mm1, [r0 + 0x20]
- punpcklwd mm0, mm1
- movd mm3, [r0 + 0x40]
- movd mm1, [r0 + 0x60]
- punpcklwd mm3, mm1
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSIONW r1, r1w
+ SIGN_EXTENSIONW r2, r2w
+ movd mm0, [r0]
+ movd mm1, [r0 + 0x20]
+ punpcklwd mm0, mm1
+ movd mm3, [r0 + 0x40]
+ movd mm1, [r0 + 0x60]
+ punpcklwd mm3, mm1
- ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
- movq mm5, mm3
- paddw mm3, mm0
- psubw mm0, mm5
- punpcklwd mm3, mm0
- movq mm1, mm3
- psrlq mm1, 32
- movq mm5, mm1
- paddw mm1, mm3
- psubw mm3, mm5
- punpcklwd mm1, mm3
+ ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
+ movq mm5, mm3
+ paddw mm3, mm0
+ psubw mm0, mm5
+ punpcklwd mm3, mm0
+ movq mm1, mm3
+ psrlq mm1, 32
+ movq mm5, mm1
+ paddw mm1, mm3
+ psubw mm3, mm5
+ punpcklwd mm1, mm3
- ;quant_2x2_dc
- MMX_Copy4Times mm3, r2d
- MMX_Copy4Times mm2, r1d
- MMX_Quant4 mm1, mm0, mm2, mm3
+ ;quant_2x2_dc
+ MMX_Copy4Times mm3, r2d
+ MMX_Copy4Times mm2, r1d
+ MMX_Quant4 mm1, mm0, mm2, mm3
- ; pNonZeroCount of dct_2x2
- pcmpeqb mm2, mm2 ; mm2 = FF
- pxor mm3, mm3
- packsswb mm1, mm3
- pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
- psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
- psadbw mm1, mm3 ;
- movd retrd, mm1
+ ; pNonZeroCount of dct_2x2
+ pcmpeqb mm2, mm2 ; mm2 = FF
+ pxor mm3, mm3
+ packsswb mm1, mm3
+ pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
+ psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
+ psadbw mm1, mm3 ;
+ movd retrd, mm1
- WELSEMMS
- ret
+ WELSEMMS
+ ret
%macro SSE2_DeQuant8 3
@@ -297,12 +297,12 @@
; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
;***********************************************************************
WELS_EXTERN WelsDequant4x4_sse2
- %assign push_num 0
- LOAD_2_PARA
+ %assign push_num 0
+ LOAD_2_PARA
- movdqa xmm1, [r1]
- SSE2_DeQuant8 [r0 ], xmm0, xmm1
- SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1
+ movdqa xmm1, [r1]
+ SSE2_DeQuant8 [r0 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1
ret
@@ -311,18 +311,18 @@
;***********************************************************************====
WELS_EXTERN WelsDequantFour4x4_sse2
- %assign push_num 0
- LOAD_2_PARA
+ %assign push_num 0
+ LOAD_2_PARA
- movdqa xmm1, [r1]
- SSE2_DeQuant8 [r0 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x10 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x20 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x30 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x40 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x50 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x60 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x70 ], xmm0, xmm1
+ movdqa xmm1, [r1]
+ SSE2_DeQuant8 [r0 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x10 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x20 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x30 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x40 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x50 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x60 ], xmm0, xmm1
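+    ; four interleave passes (bw, wd, dq, qdq) transpose the 8x16 byte block;
+    ; %9 supplies the eighth input row and %10 is a 16-byte aligned spill slot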
+ SSE2_DeQuant8 [r0+0x70 ], xmm0, xmm1
ret
@@ -330,41 +330,41 @@
;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
;***********************************************************************
WELS_EXTERN WelsDequantIHadamard4x4_sse2
- %assign push_num 0
- LOAD_2_PARA
- %ifndef X86_32
- movzx r1, r1w
- %endif
+ %assign push_num 0
+ LOAD_2_PARA
+ %ifndef X86_32
+ movzx r1, r1w
+ %endif
- ; WelsDequantLumaDc4x4
- SSE2_Copy8Times xmm1, r1d
- ;psrlw xmm1, 2 ; for the (>>2) in ihdm
- MOVDQ xmm0, [r0]
- MOVDQ xmm2, [r0+0x10]
- pmullw xmm0, xmm1
- pmullw xmm2, xmm1
+ ; WelsDequantLumaDc4x4
+ SSE2_Copy8Times xmm1, r1d
+ ;psrlw xmm1, 2 ; for the (>>2) in ihdm
+ MOVDQ xmm0, [r0]
+ MOVDQ xmm2, [r0+0x10]
+ pmullw xmm0, xmm1
+ pmullw xmm2, xmm1
- ; ihdm_4x4
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- movdqa xmm3, xmm2
- psrldq xmm3, 8
+ ; ihdm_4x4
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ movdqa xmm3, xmm2
+ psrldq xmm3, 8
- SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
- SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
- SSE2_SumSub xmm3, xmm2, xmm5 ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
- SSE2_SumSub xmm0, xmm1, xmm5 ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
+ SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
+ SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
+ SSE2_SumSub xmm3, xmm2, xmm5 ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
+ SSE2_SumSub xmm0, xmm1, xmm5 ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
- SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4
- SSE2_SumSub xmm2, xmm4, xmm5
- SSE2_SumSub xmm1, xmm0, xmm5
- SSE2_SumSub xmm4, xmm0, xmm5
- SSE2_SumSub xmm2, xmm1, xmm5
- SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
+ SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4
+ SSE2_SumSub xmm2, xmm4, xmm5
+ SSE2_SumSub xmm1, xmm0, xmm5
+ SSE2_SumSub xmm4, xmm0, xmm5
+ SSE2_SumSub xmm2, xmm1, xmm5
+ SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
- punpcklqdq xmm0, xmm1
- MOVDQ [r0], xmm0
+ punpcklqdq xmm0, xmm1
+ MOVDQ [r0], xmm0
- punpcklqdq xmm2, xmm3
- MOVDQ [r0+16], xmm2
- ret
+ punpcklqdq xmm2, xmm3
+ MOVDQ [r0+16], xmm2
+ ret
--- a/codec/encoder/core/x86/sample_sc.asm
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -35,123 +35,123 @@
;**********************************************************************************************************************************
;
-; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16 base_cost[8], int32_t *index_min_cost )
+; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
-; \note:
-; src need align with 16 bytes, ref is optional
-; \return value:
-; return minimal SAD cost, according index carried by index_min_cost
+; \note:
+; src needs to be 16-byte aligned; alignment of ref is optional
+; \return value:
+; returns the minimal SAD cost; the corresponding index is written to index_min_cost
;**********************************************************************************************************************************
; try 8 mv via offset
; xmm7 store sad costs
-%macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
- movdqa xmm0, [%1]
- movdqu xmm1, [%2]
- movdqu xmm2, [%2+8h]
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
+%macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
+ movdqa xmm0, [%1]
+ movdqu xmm1, [%2]
+ movdqu xmm2, [%2+8h]
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
- mpsadbw xmm1, xmm0, 0 ; 000 B
- paddw xmm7, xmm1 ; accumulate cost
+ mpsadbw xmm1, xmm0, 0 ; 000 B
+ paddw xmm7, xmm1 ; accumulate cost
- mpsadbw xmm3, xmm0, 5 ; 101 B
- paddw xmm7, xmm3 ; accumulate cost
+ mpsadbw xmm3, xmm0, 5 ; 101 B
+ paddw xmm7, xmm3 ; accumulate cost
- mpsadbw xmm2, xmm0, 2 ; 010 B
- paddw xmm7, xmm2 ; accumulate cost
+ mpsadbw xmm2, xmm0, 2 ; 010 B
+ paddw xmm7, xmm2 ; accumulate cost
- mpsadbw xmm4, xmm0, 7 ; 111 B
- paddw xmm7, xmm4 ; accumulate cost
+ mpsadbw xmm4, xmm0, 7 ; 111 B
+ paddw xmm7, xmm4 ; accumulate cost
- add %1, %3
- add %2, %4
-%endmacro ; end of SAD_16x16_LINE_SSE41
-%macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
- movdqa xmm0, [%1]
- movdqu xmm1, [%2]
- movdqu xmm2, [%2+8h]
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
+ add %1, %3
+ add %2, %4
+%endmacro ; end of SAD_16x16_LINE_SSE41
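+; each invocation covers one 16-byte source row: the four mpsadbw/paddw pairs
+; compute its SAD against ref at 8 consecutive horizontal offsets and
+; accumulate the eight per-offset costs as words in xmm7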
+%macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
+ movdqa xmm0, [%1]
+ movdqu xmm1, [%2]
+ movdqu xmm2, [%2+8h]
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
- mpsadbw xmm1, xmm0, 0 ; 000 B
- paddw xmm7, xmm1 ; accumulate cost
+ mpsadbw xmm1, xmm0, 0 ; 000 B
+ paddw xmm7, xmm1 ; accumulate cost
- mpsadbw xmm3, xmm0, 5 ; 101 B
- paddw xmm7, xmm3 ; accumulate cost
+ mpsadbw xmm3, xmm0, 5 ; 101 B
+ paddw xmm7, xmm3 ; accumulate cost
- mpsadbw xmm2, xmm0, 2 ; 010 B
- paddw xmm7, xmm2 ; accumulate cost
+ mpsadbw xmm2, xmm0, 2 ; 010 B
+ paddw xmm7, xmm2 ; accumulate cost
- mpsadbw xmm4, xmm0, 7 ; 111 B
- paddw xmm7, xmm4 ; accumulate cost
-%endmacro ; end of SAD_16x16_LINE_SSE41E
+ mpsadbw xmm4, xmm0, 7 ; 111 B
+ paddw xmm7, xmm4 ; accumulate cost
+%endmacro ; end of SAD_16x16_LINE_SSE41E
WELS_EXTERN SampleSad16x16Hor8_sse41
;push ebx
;push esi
- ;mov eax, [esp+12] ; src
- ;mov ecx, [esp+16] ; stride_src
- ;mov ebx, [esp+20] ; ref
- ;mov edx, [esp+24] ; stride_ref
- ;mov esi, [esp+28] ; base_cost
+ ;mov eax, [esp+12] ; src
+ ;mov ecx, [esp+16] ; stride_src
+ ;mov ebx, [esp+20] ; ref
+ ;mov edx, [esp+24] ; stride_ref
+ ;mov esi, [esp+28] ; base_cost
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm7, xmm7
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm7, xmm7
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41 r0, r2, r1, r3
- SAD_16x16_LINE_SSE41E r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41E r0, r2, r1, r3
- pxor xmm0, xmm0
- movdqa xmm6, xmm7
- punpcklwd xmm6, xmm0
- punpckhwd xmm7, xmm0
+ pxor xmm0, xmm0
+ movdqa xmm6, xmm7
+ punpcklwd xmm6, xmm0
+ punpckhwd xmm7, xmm0
- movdqa xmm5, [r4]
- movdqa xmm4, xmm5
- punpcklwd xmm4, xmm0
- punpckhwd xmm5, xmm0
+ movdqa xmm5, [r4]
+ movdqa xmm4, xmm5
+ punpcklwd xmm4, xmm0
+ punpckhwd xmm5, xmm0
- paddd xmm4, xmm6
- paddd xmm5, xmm7
- movdqa xmm3, xmm4
- pminud xmm3, xmm5
- pshufd xmm2, xmm3, 01001110B
- pminud xmm2, xmm3
- pshufd xmm3, xmm2, 10110001B
- pminud xmm2, xmm3
- movd retrd, xmm2
- pcmpeqd xmm4, xmm2
- movmskps r2d, xmm4
- bsf r1d, r2d
- jnz near WRITE_INDEX
+ paddd xmm4, xmm6
+ paddd xmm5, xmm7
+ movdqa xmm3, xmm4
+ pminud xmm3, xmm5
+ pshufd xmm2, xmm3, 01001110B
+ pminud xmm2, xmm3
+ pshufd xmm3, xmm2, 10110001B
+ pminud xmm2, xmm3
+ movd retrd, xmm2
+ pcmpeqd xmm4, xmm2
+ movmskps r2d, xmm4
+ bsf r1d, r2d
+ jnz near WRITE_INDEX
- pcmpeqd xmm5, xmm2
- movmskps r2d, xmm5
- bsf r1d, r2d
- add r1d, 4
+ pcmpeqd xmm5, xmm2
+ movmskps r2d, xmm5
+ bsf r1d, r2d
+ add r1d, 4
WRITE_INDEX:
- mov [r5], r1d
+ mov [r5], r1d
POP_XMM
LOAD_6_PARA_POP
ret
@@ -158,66 +158,66 @@
;**********************************************************************************************************************************
;
-; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
+; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
-; \note:
-; src and ref is optional to align with 16 due inter 8x8
-; \return value:
-; return minimal SAD cost, according index carried by index_min_cost
+; \note:
+; alignment of src and ref to 16 bytes is optional, since this is inter 8x8
+; \return value:
+; returns the minimal SAD cost; the corresponding index is written to index_min_cost
;
;**********************************************************************************************************************************
; try 8 mv via offset
; xmm7 store sad costs
-%macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
- movdqu xmm0, [%1]
- movdqu xmm1, [%2]
- movdqa xmm2, xmm1
+%macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
+ movdqu xmm0, [%1]
+ movdqu xmm1, [%2]
+ movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0 ; 000 B
- paddw xmm7, xmm1 ; accumulate cost
+ mpsadbw xmm1, xmm0, 0 ; 000 B
+ paddw xmm7, xmm1 ; accumulate cost
- mpsadbw xmm2, xmm0, 5 ; 101 B
- paddw xmm7, xmm2 ; accumulate cost
+ mpsadbw xmm2, xmm0, 5 ; 101 B
+ paddw xmm7, xmm2 ; accumulate cost
- add %1, %3
- add %2, %4
-%endmacro ; end of SAD_8x8_LINE_SSE41
-%macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
- movdqu xmm0, [%1]
- movdqu xmm1, [%2]
- movdqa xmm2, xmm1
+ add %1, %3
+ add %2, %4
+%endmacro ; end of SAD_8x8_LINE_SSE41
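+; each invocation covers one 8-byte source row: the two mpsadbw/paddw pairs
+; compute its SAD against ref at 8 consecutive horizontal offsets and add the
+; results to xmm7, which is pre-loaded with the base cost list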
+%macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
+ movdqu xmm0, [%1]
+ movdqu xmm1, [%2]
+ movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0 ; 000 B
- paddw xmm7, xmm1 ; accumulate cost
+ mpsadbw xmm1, xmm0, 0 ; 000 B
+ paddw xmm7, xmm1 ; accumulate cost
- mpsadbw xmm2, xmm0, 5 ; 101 B
- paddw xmm7, xmm2 ; accumulate cost
-%endmacro ; end of SAD_8x8_LINE_SSE41E
+ mpsadbw xmm2, xmm0, 5 ; 101 B
+ paddw xmm7, xmm2 ; accumulate cost
+%endmacro ; end of SAD_8x8_LINE_SSE41E
WELS_EXTERN SampleSad8x8Hor8_sse41
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movdqa xmm7, [r4] ; load base cost list
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movdqa xmm7, [r4] ; load base cost list
- SAD_8x8_LINE_SSE41 r0, r2, r1, r3
- SAD_8x8_LINE_SSE41 r0, r2, r1, r3
- SAD_8x8_LINE_SSE41 r0, r2, r1, r3
- SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
- SAD_8x8_LINE_SSE41 r0, r2, r1, r3
- SAD_8x8_LINE_SSE41 r0, r2, r1, r3
- SAD_8x8_LINE_SSE41 r0, r2, r1, r3
- SAD_8x8_LINE_SSE41E r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41E r0, r2, r1, r3
- phminposuw xmm0, xmm7 ; horizon search the minimal sad cost and its index
- movd retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
- mov r1d, retrd
- and retrd, 0xFFFF
- sar r1d, 16
- mov [r5], r1d
+    phminposuw xmm0, xmm7   ; horizontal search for the minimal SAD cost and its index
+ movd retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
+ mov r1d, retrd
+ and retrd, 0xFFFF
+ sar r1d, 16
+ mov [r5], r1d
POP_XMM
LOAD_6_PARA_POP
--- a/codec/encoder/core/x86/score.asm
+++ b/codec/encoder/core/x86/score.asm
@@ -104,32 +104,32 @@
align 16
high_mask_table:
- db 0, 0, 0, 3, 0, 2, 3, 6, 0, 2
- db 2, 5, 3, 5, 6, 9, 0, 1, 2, 5
- db 2, 4, 5, 8, 3, 5, 5, 8, 6, 8
- db 9,12, 0, 1, 1, 4, 2, 4, 5, 8
- db 2, 4, 4, 7, 5, 7, 8,11, 3, 4
- db 5, 8, 5, 7, 8,11, 6, 8, 8,11
- db 9,11,12,15, 0, 1, 1, 4, 1, 3
- db 4, 7, 2, 4, 4, 7, 5, 7, 8,11
- db 2, 3, 4, 7, 4, 6, 7,10, 5, 7
- db 7,10, 8,10,11,14, 3, 4, 4, 7
- db 5, 7, 8,11, 5, 7, 7,10, 8,10
- db 11,14, 6, 7, 8,11, 8,10,11,14
- db 9,11,11,14,12,14,15,18, 0, 0
- db 1, 4, 1, 3, 4, 7, 1, 3, 3, 6
- db 4, 6, 7,10, 2, 3, 4, 7, 4, 6
- db 7,10, 5, 7, 7,10, 8,10,11,14
- db 2, 3, 3, 6, 4, 6, 7,10, 4, 6
- db 6, 9, 7, 9,10,13, 5, 6, 7,10
- db 7, 9,10,13, 8,10,10,13,11,13
- db 14,17, 3, 4, 4, 7, 4, 6, 7,10
- db 5, 7, 7,10, 8,10,11,14, 5, 6
- db 7,10, 7, 9,10,13, 8,10,10,13
- db 11,13,14,17, 6, 7, 7,10, 8,10
- db 11,14, 8,10,10,13,11,13,14,17
- db 9,10,11,14,11,13,14,17,12,14
- db 14,17,15,17,18,21
+ db 0, 0, 0, 3, 0, 2, 3, 6, 0, 2
+ db 2, 5, 3, 5, 6, 9, 0, 1, 2, 5
+ db 2, 4, 5, 8, 3, 5, 5, 8, 6, 8
+ db 9,12, 0, 1, 1, 4, 2, 4, 5, 8
+ db 2, 4, 4, 7, 5, 7, 8,11, 3, 4
+ db 5, 8, 5, 7, 8,11, 6, 8, 8,11
+ db 9,11,12,15, 0, 1, 1, 4, 1, 3
+ db 4, 7, 2, 4, 4, 7, 5, 7, 8,11
+ db 2, 3, 4, 7, 4, 6, 7,10, 5, 7
+ db 7,10, 8,10,11,14, 3, 4, 4, 7
+ db 5, 7, 8,11, 5, 7, 7,10, 8,10
+ db 11,14, 6, 7, 8,11, 8,10,11,14
+ db 9,11,11,14,12,14,15,18, 0, 0
+ db 1, 4, 1, 3, 4, 7, 1, 3, 3, 6
+ db 4, 6, 7,10, 2, 3, 4, 7, 4, 6
+ db 7,10, 5, 7, 7,10, 8,10,11,14
+ db 2, 3, 3, 6, 4, 6, 7,10, 4, 6
+ db 6, 9, 7, 9,10,13, 5, 6, 7,10
+ db 7, 9,10,13, 8,10,10,13,11,13
+ db 14,17, 3, 4, 4, 7, 4, 6, 7,10
+ db 5, 7, 7,10, 8,10,11,14, 5, 6
+ db 7,10, 7, 9,10,13, 8,10,10,13
+ db 11,13,14,17, 6, 7, 7,10, 8,10
+ db 11,14, 8,10,10,13,11,13,14,17
+ db 9,10,11,14,11,13,14,17,12,14
+ db 14,17,15,17,18,21
align 16
low_mask_table:
@@ -167,78 +167,78 @@
;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
;***********************************************************************
WELS_EXTERN WelsScan4x4DcAc_sse2
- %ifdef X86_32
- push r3
- %assign push_num 1
- %else
- %assign push_num 0
- %endif
- LOAD_2_PARA
- movdqa xmm0, [r1] ; 7 6 5 4 3 2 1 0
- movdqa xmm1, [r1+16] ; f e d c b a 9 8
- pextrw r2d, xmm0, 7 ; ecx = 7
- pextrw r3d, xmm1, 2 ; edx = a
- pextrw r1d, xmm0, 5 ; eax = 5
- pinsrw xmm1, r2d, 2 ; f e d c b 7 9 8
- pinsrw xmm0, r1d, 7 ; 5 6 5 4 3 2 1 0
- pextrw r2d, xmm1, 0 ; ecx = 8
- pinsrw xmm0, r2d, 5 ; 5 6 8 4 3 2 1 0
- pinsrw xmm1, r3d, 0 ; f e d c b 7 9 a
- pshufd xmm2, xmm0, 0xd8 ; 5 6 3 2 8 4 1 0
- pshufd xmm3, xmm1, 0xd8 ; f e b 7 d c 9 a
- pshufhw xmm0, xmm2, 0x93 ; 6 3 2 5 8 4 1 0
- pshuflw xmm1, xmm3, 0x39 ; f e b 7 a d c 9
- movdqa [r0],xmm0
- movdqa [r0+16], xmm1
- %ifdef X86_32
- pop r3
- %endif
- ret
+ %ifdef X86_32
+ push r3
+ %assign push_num 1
+ %else
+ %assign push_num 0
+ %endif
+ LOAD_2_PARA
+ movdqa xmm0, [r1] ; 7 6 5 4 3 2 1 0
+ movdqa xmm1, [r1+16] ; f e d c b a 9 8
+ pextrw r2d, xmm0, 7 ; ecx = 7
+ pextrw r3d, xmm1, 2 ; edx = a
+ pextrw r1d, xmm0, 5 ; eax = 5
+ pinsrw xmm1, r2d, 2 ; f e d c b 7 9 8
+ pinsrw xmm0, r1d, 7 ; 5 6 5 4 3 2 1 0
+ pextrw r2d, xmm1, 0 ; ecx = 8
+ pinsrw xmm0, r2d, 5 ; 5 6 8 4 3 2 1 0
+ pinsrw xmm1, r3d, 0 ; f e d c b 7 9 a
+ pshufd xmm2, xmm0, 0xd8 ; 5 6 3 2 8 4 1 0
+ pshufd xmm3, xmm1, 0xd8 ; f e b 7 d c 9 a
+ pshufhw xmm0, xmm2, 0x93 ; 6 3 2 5 8 4 1 0
+ pshuflw xmm1, xmm3, 0x39 ; f e b 7 a d c 9
+ movdqa [r0],xmm0
+ movdqa [r0+16], xmm1
+ %ifdef X86_32
+ pop r3
+ %endif
+ ret
;***********************************************************************
;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
;***********************************************************************
WELS_EXTERN WelsScan4x4DcAc_ssse3
- %assign push_num 0
- LOAD_2_PARA
- movdqa xmm0, [r1]
- movdqa xmm1, [r1+16]
- pextrw r2d, xmm0, 7 ; ecx = [7]
- pextrw r1d, xmm1, 0 ; eax = [8]
- pinsrw xmm0, r1d, 7 ; xmm0[7] = [8]
- pinsrw xmm1, r2d, 0 ; xmm1[0] = [7]
- pshufb xmm1, [pb_scanacdc_maskb]
- pshufb xmm0, [pb_scanacdc_maska]
+ %assign push_num 0
+ LOAD_2_PARA
+ movdqa xmm0, [r1]
+ movdqa xmm1, [r1+16]
+ pextrw r2d, xmm0, 7 ; ecx = [7]
+ pextrw r1d, xmm1, 0 ; eax = [8]
+ pinsrw xmm0, r1d, 7 ; xmm0[7] = [8]
+ pinsrw xmm1, r2d, 0 ; xmm1[0] = [7]
+ pshufb xmm1, [pb_scanacdc_maskb]
+ pshufb xmm0, [pb_scanacdc_maska]
- movdqa [r0],xmm0
- movdqa [r0+16], xmm1
- ret
+ movdqa [r0],xmm0
+ movdqa [r0+16], xmm1
+ ret
;***********************************************************************
;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
;***********************************************************************
WELS_EXTERN WelsScan4x4Ac_sse2
- %assign push_num 0
- LOAD_2_PARA
- movdqa xmm0, [r1]
- movdqa xmm1, [r1+16]
- movdqa xmm2, xmm0
- punpcklqdq xmm0, xmm1
- punpckhqdq xmm2, xmm1
+ %assign push_num 0
+ LOAD_2_PARA
+ movdqa xmm0, [r1]
+ movdqa xmm1, [r1+16]
+ movdqa xmm2, xmm0
+ punpcklqdq xmm0, xmm1
+ punpckhqdq xmm2, xmm1
- movdqa xmm3, xmm0
- punpckldq xmm0, xmm2
- punpckhdq xmm3, xmm2
- pextrw r1d , xmm0, 3
- pextrw r2d , xmm0, 7
- pinsrw xmm0, r1d, 7
- pextrw r1d, xmm3, 4
- pinsrw xmm3, r2d, 4
- pextrw r2d, xmm3, 0
- pinsrw xmm3, r1d, 0
- pinsrw xmm0, r2d, 3
+ movdqa xmm3, xmm0
+ punpckldq xmm0, xmm2
+ punpckhdq xmm3, xmm2
+ pextrw r1d , xmm0, 3
+ pextrw r2d , xmm0, 7
+ pinsrw xmm0, r1d, 7
+ pextrw r1d, xmm3, 4
+ pinsrw xmm3, r2d, 4
+ pextrw r2d, xmm3, 0
+ pinsrw xmm3, r1d, 0
+ pinsrw xmm0, r2d, 3
- pshufhw xmm1, xmm0, 0x93
- pshuflw xmm2, xmm3, 0x39
+ pshufhw xmm1, xmm0, 0x93
+ pshuflw xmm2, xmm3, 0x39
movdqa xmm3, xmm2
psrldq xmm1, 2
@@ -245,9 +245,9 @@
pslldq xmm3, 14
por xmm1, xmm3
psrldq xmm2, 2
- movdqa [r0],xmm1
- movdqa [r0+16], xmm2
- ret
+ movdqa [r0],xmm1
+ movdqa [r0+16], xmm2
+ ret
;***********************************************************************
@@ -254,19 +254,19 @@
;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;***********************************************************************
WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
- %ifdef X86_32
- push r3
- %assign push_num 1
- %else
- %assign push_num 0
- %endif
- LOAD_1_PARA
- movdqa xmm0, [r0]
- movdqa xmm1, [r0+16]
+ %ifdef X86_32
+ push r3
+ %assign push_num 1
+ %else
+ %assign push_num 0
+ %endif
+ LOAD_1_PARA
+ movdqa xmm0, [r0]
+ movdqa xmm1, [r0+16]
- packsswb xmm0, xmm1
- ; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
- xor r3, r3
+ packsswb xmm0, xmm1
+ ; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
+ xor r3, r3
pxor xmm3, xmm3
pcmpeqb xmm0, xmm3
pmovmskb r3d, xmm0
@@ -273,39 +273,39 @@
xor r3, 0xffff
- xor r0, r0
- mov r2, 7
- mov r1, 8
+ xor r0, r0
+ mov r2, 7
+ mov r1, 8
.loop_low8_find1:
- bt r3, r2
- jc .loop_high8_find1
- dec r2
- jnz .loop_low8_find1
+ bt r3, r2
+ jc .loop_high8_find1
+ dec r2
+ jnz .loop_low8_find1
.loop_high8_find1:
- bt r3, r1
- jc .find1end
- inc r1
- cmp r1,16
- jb .loop_high8_find1
+ bt r3, r1
+ jc .find1end
+ inc r1
+ cmp r1,16
+ jb .loop_high8_find1
.find1end:
- sub r1, r2
- sub r1, 1
- lea r2, [i_ds_table]
- add r0b, [r2+r1]
- mov r1, r3
- and r3, 0xff
- shr r1, 8
- and r1, 0xff
- lea r2 , [low_mask_table]
- add r0b, [r2 +r3]
- lea r2, [high_mask_table]
- add r0b, [r2+r1]
- %ifdef X86_32
- pop r3
- %else
- mov retrd, r0d
- %endif
- ret
+ sub r1, r2
+ sub r1, 1
+ lea r2, [i_ds_table]
+ add r0b, [r2+r1]
+ mov r1, r3
+ and r3, 0xff
+ shr r1, 8
+ and r1, 0xff
+ lea r2 , [low_mask_table]
+ add r0b, [r2 +r3]
+ lea r2, [high_mask_table]
+ add r0b, [r2+r1]
+ %ifdef X86_32
+ pop r3
+ %else
+ mov retrd, r0d
+ %endif
+ ret
;***********************************************************************
@@ -312,28 +312,28 @@
; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;***********************************************************************
WELS_EXTERN WelsGetNoneZeroCount_sse2
- %assign push_num 0
- LOAD_1_PARA
- movdqa xmm0, [r0]
- movdqa xmm1, [r0+16]
- pxor xmm2, xmm2
- pcmpeqw xmm0, xmm2
- pcmpeqw xmm1, xmm2
- packsswb xmm1, xmm0
- xor r1, r1
- pmovmskb r1d, xmm1
- xor r1d, 0xffff
- mov r2, r1
- and r1, 0xff
- shr r2, 8
-; and ecx, 0xff ; we do not need this due to high 16bits equal to 0 yet
-; xor retr, retr
- ;add al, [nozero_count_table+r2]
- lea r0 , [nozero_count_table]
- movzx r2, byte [r0+r2]
- movzx r1, byte [r0+r1]
- mov retrq, r2
- add retrq, r1
- ;add al, [nozero_count_table+r1]
- ret
+ %assign push_num 0
+ LOAD_1_PARA
+ movdqa xmm0, [r0]
+ movdqa xmm1, [r0+16]
+ pxor xmm2, xmm2
+ pcmpeqw xmm0, xmm2
+ pcmpeqw xmm1, xmm2
+ packsswb xmm1, xmm0
+ xor r1, r1
+ pmovmskb r1d, xmm1
+ xor r1d, 0xffff
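+    ; r1d now has one bit set per nonzero coefficient; split it into low and
+    ; high bytes and sum their bit counts from nozero_count_table below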
+ mov r2, r1
+ and r1, 0xff
+ shr r2, 8
+;  and ecx, 0xff           ; not needed since the high 16 bits are already 0
+; xor retr, retr
+ ;add al, [nozero_count_table+r2]
+ lea r0 , [nozero_count_table]
+ movzx r2, byte [r0+r2]
+ movzx r1, byte [r0+r1]
+ mov retrq, r2
+ add retrq, r1
+ ;add al, [nozero_count_table+r1]
+ ret
--- a/codec/processing/src/arm/adaptive_quantization.S
+++ b/codec/processing/src/arm/adaptive_quantization.S
@@ -36,17 +36,17 @@
#ifdef __APPLE__
.macro SQR_ADD_16BYTES
- vmull.u8 q3, $0, $0
- vmull.u8 q8, $1, $1
- vpadal.u16 $2, q3
- vpadal.u16 $2, q8
+ vmull.u8 q3, $0, $0
+ vmull.u8 q8, $1, $1
+ vpadal.u16 $2, q3
+ vpadal.u16 $2, q8
.endm
#else
.macro SQR_ADD_16BYTES arg0, arg1, arg2
- vmull.u8 q3, \arg0, \arg0
- vmull.u8 q8, \arg1, \arg1
- vpadal.u16 \arg2, q3
- vpadal.u16 \arg2, q8
+ vmull.u8 q3, \arg0, \arg0
+ vmull.u8 q8, \arg1, \arg1
+ vpadal.u16 \arg2, q3
+ vpadal.u16 \arg2, q8
.endm
#endif
@@ -54,66 +54,66 @@
WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
stmdb sp!, {r4}
- vld1.8 {q15}, [r0], r1 //save the ref data (16bytes)
- vld1.8 {q14}, [r2], r3 //save the src data (16bytes)
+ vld1.8 {q15}, [r0], r1 //save the ref data (16bytes)
+ vld1.8 {q14}, [r2], r3 //save the src data (16bytes)
- vabd.u8 q13, q14, q15
- vmull.u8 q12, d27, d27
- vmull.u8 q11, d26, d26
- vaddl.u16 q12, d24, d25
- vpadal.u16 q12, q11 //sqr
+ vabd.u8 q13, q14, q15
+ vmull.u8 q12, d27, d27
+ vmull.u8 q11, d26, d26
+ vaddl.u16 q12, d24, d25
+ vpadal.u16 q12, q11 //sqr
vaddl.u8 q13, d26, d27 //sum
- vaddl.u8 q10, d28, d29 //sum_cur
+ vaddl.u8 q10, d28, d29 //sum_cur
- vmull.u8 q9, d29, d29
- vmull.u8 q8, d28, d28
- vaddl.u16 q9, d18, d19 //sqr_cur
- vpadal.u16 q9, q8
+ vmull.u8 q9, d29, d29
+ vmull.u8 q8, d28, d28
+ vaddl.u16 q9, d18, d19 //sqr_cur
+ vpadal.u16 q9, q8
- mov r4, #15
+ mov r4, #15
pixel_var_16x16_loop0:
- vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
- vld1.8 {q1}, [r2], r3 //save the src data (16bytes)
+ vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
+ vld1.8 {q1}, [r2], r3 //save the src data (16bytes)
- vabd.u8 q2, q0, q1
+ vabd.u8 q2, q0, q1
- //q10 save sum_cur
- vpadal.u8 q10, q1
+ //q10 save sum_cur
+ vpadal.u8 q10, q1
- //q12 save sqr
- SQR_ADD_16BYTES d4, d5, q12
+ //q12 save sqr
+ SQR_ADD_16BYTES d4, d5, q12
//q13 save sum
- vpadal.u8 q13, q2
+ vpadal.u8 q13, q2
- subs r4, #1
+ subs r4, #1
- //q9 save sqr_cur
- SQR_ADD_16BYTES d2, d3, q9
+ //q9 save sqr_cur
+ SQR_ADD_16BYTES d2, d3, q9
- bne pixel_var_16x16_loop0
+ bne pixel_var_16x16_loop0
- vadd.u16 d0, d26, d27 //sum
- vadd.u16 d1, d20, d21 //sum_cur
- vpaddl.u16 q0, q0
- vadd.u32 d2, d24, d25 //sqr
- vadd.u32 d3, d18, d19 //sqr_cur
- vpadd.u32 d0, d0, d1
- vpadd.u32 d1, d2, d3
+ vadd.u16 d0, d26, d27 //sum
+ vadd.u16 d1, d20, d21 //sum_cur
+ vpaddl.u16 q0, q0
+ vadd.u32 d2, d24, d25 //sqr
+ vadd.u32 d3, d18, d19 //sqr_cur
+ vpadd.u32 d0, d0, d1
+ vpadd.u32 d1, d2, d3
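+    // d0 = {sum of |ref-src|, sum of src}, d1 = {sum of (ref-src)^2, sum of src^2};
+    // below, each pair is combined as sqr - (sum >> 8)^2 and the two 16-bit
+    // results are stored interleaved through r4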
- ldr r4, [sp, #4]
+ ldr r4, [sp, #4]
- vshr.u32 q0, q0, #8
- vmul.u32 d0, d0
- vsub.u32 d0, d1, d0
+ vshr.u32 q0, q0, #8
+ vmul.u32 d0, d0
+ vsub.u32 d0, d1, d0
vmovl.u32 q0, d0
- vst2.16 {d0[0], d1[0]}, [r4]
+ vst2.16 {d0[0], d1[0]}, [r4]
- ldmia sp!, {r4}
+ ldmia sp!, {r4}
WELS_ASM_FUNC_END
--- a/codec/processing/src/arm/down_sample_neon.S
+++ b/codec/processing/src/arm/down_sample_neon.S
@@ -30,196 +30,196 @@
*
*/
-#ifdef HAVE_NEON
+#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
-WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
- stmdb sp!, {r4-r8, lr}
+WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
+ stmdb sp!, {r4-r8, lr}
- //Get the width and height
- ldr r4, [sp, #24] //src_width
- ldr r5, [sp, #28] //src_height
+ //Get the width and height
+ ldr r4, [sp, #24] //src_width
+ ldr r5, [sp, #28] //src_height
- //Initialize the register
- mov r6, r2
- mov r8, r0
- mov lr, #0
- lsr r5, #1
+    //Initialize the registers
+ mov r6, r2
+ mov r8, r0
+ mov lr, #0
+ lsr r5, #1
- //Save the tailer for the unasigned size
- mla r7, r1, r5, r0
- vld1.32 {q15}, [r7]
+    //Save the trailing data for the unaligned size
+ mla r7, r1, r5, r0
+ vld1.32 {q15}, [r7]
- add r7, r2, r3
- //processing a colume data
+ add r7, r2, r3
+    //processing a column of data
comp_ds_bilinear_loop0:
- vld1.8 {q0,q1}, [r2]!
- vld1.8 {q2,q3}, [r7]!
- vpaddl.u8 q0, q0
- vpaddl.u8 q1, q1
- vpaddl.u8 q2, q2
- vpaddl.u8 q3, q3
- vrshr.u16 q0, #1
- vrshr.u16 q1, #1
- vrshr.u16 q2, #1
- vrshr.u16 q3, #1
- vrhadd.u16 q0, q2
- vrhadd.u16 q1, q3
- vmovn.u16 d0, q0
- vmovn.u16 d1, q1
- vst1.32 {q0}, [r0]!
- add lr, #32
+ vld1.8 {q0,q1}, [r2]!
+ vld1.8 {q2,q3}, [r7]!
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vrshr.u16 q0, #1
+ vrshr.u16 q1, #1
+ vrshr.u16 q2, #1
+ vrshr.u16 q3, #1
+ vrhadd.u16 q0, q2
+ vrhadd.u16 q1, q3
+ vmovn.u16 d0, q0
+ vmovn.u16 d1, q1
+ vst1.32 {q0}, [r0]!
+ add lr, #32
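+    // the block above averages 32 source pixels from two adjacent rows
+    // (vpaddl+vrshr halve horizontally, vrhadd averages vertically) and
+    // writes 16 downsampled pixels; lr counts the consumed source width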
- cmp lr, r4
- movcs lr, #0
- addcs r6, r6, r3, lsl #1
- movcs r2, r6
- addcs r7, r2, r3
- addcs r8, r1
- movcs r0, r8
- subscs r5, #1
- bne comp_ds_bilinear_loop0
+ cmp lr, r4
+ movcs lr, #0
+ addcs r6, r6, r3, lsl #1
+ movcs r2, r6
+ addcs r7, r2, r3
+ addcs r8, r1
+ movcs r0, r8
+ subscs r5, #1
+ bne comp_ds_bilinear_loop0
- //restore the tailer for the unasigned size
- vst1.32 {q15}, [r0]
+    //restore the trailing data for the unaligned size
+ vst1.32 {q15}, [r0]
- ldmia sp!, {r4-r8,lr}
+ ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
- stmdb sp!, {r4-r7, lr}
+WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
+ stmdb sp!, {r4-r7, lr}
- //Get the width and height
- ldr r4, [sp, #20] //src_width
- ldr r5, [sp, #24] //src_height
+ //Get the width and height
+ ldr r4, [sp, #20] //src_width
+ ldr r5, [sp, #24] //src_height
- //Get the difference
- sub lr, r3, r4
- sub r1, r1, r4, lsr #1
+ //Get the difference
+ sub lr, r3, r4
+ sub r1, r1, r4, lsr #1
- lsr r5, #1
+ lsr r5, #1
- //processing a colume data
+    //processing a column of data
comp_ds_bilinear_w_x8_loop0:
- lsr r6, r4, #3
- add r7, r2, r3
- //processing a line data
+ lsr r6, r4, #3
+ add r7, r2, r3
+    //processing a line of data
comp_ds_bilinear_w_x8_loop1:
- vld1.8 {d0}, [r2]!
- vld1.8 {d1}, [r7]!
- vpaddl.u8 q0, q0
- vrshr.u16 q0, #1
- vrhadd.u16 d0, d1
+ vld1.8 {d0}, [r2]!
+ vld1.8 {d1}, [r7]!
+ vpaddl.u8 q0, q0
+ vrshr.u16 q0, #1
+ vrhadd.u16 d0, d1
- vmovn.u16 d0, q0
- vst1.32 {d0[0]}, [r0]!
- subs r6, #1
- bne comp_ds_bilinear_w_x8_loop1
+ vmovn.u16 d0, q0
+ vst1.32 {d0[0]}, [r0]!
+ subs r6, #1
+ bne comp_ds_bilinear_w_x8_loop1
- add r2, r7, lr
- add r0, r1
- subs r5, #1
- bne comp_ds_bilinear_w_x8_loop0
+ add r2, r7, lr
+ add r0, r1
+ subs r5, #1
+ bne comp_ds_bilinear_w_x8_loop0
- ldmia sp!, {r4-r7,lr}
+ ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
- stmdb sp!, {r4-r7, lr}
+WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
+ stmdb sp!, {r4-r7, lr}
- //Get the width and height
- ldr r4, [sp, #20] //src_width
- ldr r5, [sp, #24] //src_height
+ //Get the width and height
+ ldr r4, [sp, #20] //src_width
+ ldr r5, [sp, #24] //src_height
- //Get the difference
- sub lr, r3, r4
- sub r1, r1, r4, lsr #1
+ //Get the difference
+ sub lr, r3, r4
+ sub r1, r1, r4, lsr #1
- lsr r5, #1
+ lsr r5, #1
- //processing a colume data
+    //processing a column of data
comp_ds_bilinear_w_x16_loop0:
- lsr r6, r4, #4
- add r7, r2, r3
- //processing a line data
+ lsr r6, r4, #4
+ add r7, r2, r3
+    //processing a line of data
comp_ds_bilinear_w_x16_loop1:
- vld1.8 {q0}, [r2]!
- vld1.8 {q1}, [r7]!
- vpaddl.u8 q0, q0
- vpaddl.u8 q1, q1
- vrshr.u16 q0, #1
- vrshr.u16 q1, #1
- vrhadd.u16 q0, q1
+ vld1.8 {q0}, [r2]!
+ vld1.8 {q1}, [r7]!
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vrshr.u16 q0, #1
+ vrshr.u16 q1, #1
+ vrhadd.u16 q0, q1
- vmovn.u16 d0, q0
- vst1.32 {d0}, [r0]!
- subs r6, #1
- bne comp_ds_bilinear_w_x16_loop1
+ vmovn.u16 d0, q0
+ vst1.32 {d0}, [r0]!
+ subs r6, #1
+ bne comp_ds_bilinear_w_x16_loop1
- add r2, r7, lr
- add r0, r1
- subs r5, #1
- bne comp_ds_bilinear_w_x16_loop0
+ add r2, r7, lr
+ add r0, r1
+ subs r5, #1
+ bne comp_ds_bilinear_w_x16_loop0
- ldmia sp!, {r4-r7,lr}
+ ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
- stmdb sp!, {r4-r7, lr}
+WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
+ stmdb sp!, {r4-r7, lr}
- //Get the width and height
- ldr r4, [sp, #20] //src_width
- ldr r5, [sp, #24] //src_height
+ //Get the width and height
+ ldr r4, [sp, #20] //src_width
+ ldr r5, [sp, #24] //src_height
- //Get the difference
- sub lr, r3, r4
- sub r1, r1, r4, lsr #1
+ //Get the difference
+ sub lr, r3, r4
+ sub r1, r1, r4, lsr #1
- lsr r5, #1
+ lsr r5, #1
- //processing a colume data
+ //processing a column of data
comp_ds_bilinear_w_x32_loop0:
- lsr r6, r4, #5
- add r7, r2, r3
- //processing a line data
+ lsr r6, r4, #5
+ add r7, r2, r3
+ //processing a line data
comp_ds_bilinear_w_x32_loop1:
- vld1.8 {q0,q1}, [r2]!
- vld1.8 {q2,q3}, [r7]!
- vpaddl.u8 q0, q0
- vpaddl.u8 q1, q1
- vpaddl.u8 q2, q2
- vpaddl.u8 q3, q3
- vrshr.u16 q0, #1
- vrshr.u16 q1, #1
- vrshr.u16 q2, #1
- vrshr.u16 q3, #1
- vrhadd.u16 q0, q2
- vrhadd.u16 q1, q3
+ vld1.8 {q0,q1}, [r2]!
+ vld1.8 {q2,q3}, [r7]!
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vrshr.u16 q0, #1
+ vrshr.u16 q1, #1
+ vrshr.u16 q2, #1
+ vrshr.u16 q3, #1
+ vrhadd.u16 q0, q2
+ vrhadd.u16 q1, q3
- vmovn.u16 d0, q0
- vmovn.u16 d1, q1
- vst1.32 {q0}, [r0]!
- subs r6, #1
- bne comp_ds_bilinear_w_x32_loop1
+ vmovn.u16 d0, q0
+ vmovn.u16 d1, q1
+ vst1.32 {q0}, [r0]!
+ subs r6, #1
+ bne comp_ds_bilinear_w_x32_loop1
- add r2, r7, lr
- add r0, r1
- subs r5, #1
- bne comp_ds_bilinear_w_x32_loop0
+ add r2, r7, lr
+ add r0, r1
+ subs r5, #1
+ bne comp_ds_bilinear_w_x32_loop0
- ldmia sp!, {r4-r7,lr}
+ ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
@@ -226,117 +226,117 @@
WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
stmdb sp!, {r4-r12, lr}
- //Get the data from stack
- ldr r4, [sp, #40] //the addr of src
- ldr r5, [sp, #44] //the value of src_stride
+ //Get the data from stack
+ ldr r4, [sp, #40] //the addr of src
+ ldr r5, [sp, #44] //the value of src_stride
ldr r6, [sp, #48] //the value of scaleX
ldr r7, [sp, #52] //the value of scaleY
mov r10, #32768
sub r10, #1
- and r8, r6, r10 // r8 uinc(scaleX mod 32767)
+ and r8, r6, r10 // r8 uinc(scaleX mod 32767)
mov r11, #-1
- mul r11, r8 // r11 -uinc
+ mul r11, r8 // r11 -uinc
vdup.s16 d2, r8
vdup.s16 d0, r11
vzip.s16 d0, d2 // uinc -uinc uinc -uinc
- and r9, r7, r10 // r9 vinc(scaleY mod 32767)
+ and r9, r7, r10 // r9 vinc(scaleY mod 32767)
mov r11, #-1
- mul r11, r9 // r11 -vinc
+ mul r11, r9 // r11 -vinc
- vdup.s16 d2, r9
- vdup.s16 d3, r11
- vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc
+ vdup.s16 d2, r9
+ vdup.s16 d3, r11
+ vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc
- mov r11, #0x40000000
+ mov r11, #0x40000000
mov r12, #0x4000
sub r12, #1
add r11, r12
- vdup.s32 d1, r11; //init u 16384 16383 16384 16383
+ vdup.s32 d1, r11; //init u 16384 16383 16384 16383
- mov r11, #16384
+ mov r11, #16384
vdup.s16 d16, r11
sub r11, #1
- vdup.s16 d17, r11
- vext.8 d7, d17, d16, #4 //init v 16384 16384 16383 16383
+ vdup.s16 d17, r11
+ vext.8 d7, d17, d16, #4 //init v 16384 16384 16383 16383
- veor q14, q14
- sub r1, r2 // stride - width
- mov r8, #16384 // yInverse
- sub r3, #1
+ veor q14, q14
+ sub r1, r2 // stride - width
+ mov r8, #16384 // yInverse
+ sub r3, #1
_HEIGHT:
ldr r4, [sp, #40] //the addr of src
- mov r11, r8
- lsr r11, #15
- mul r11, r5
- add r11, r4 // get current row address
- mov r12, r11
- add r12, r5
+ mov r11, r8
+ lsr r11, #15
+ mul r11, r5
+ add r11, r4 // get current row address
+ mov r12, r11
+ add r12, r5
- mov r9, #16384 // xInverse
- sub r10, r2, #1
+ mov r9, #16384 // xInverse
+ sub r10, r2, #1
vmov.s16 d6, d1
_WIDTH:
- mov lr, r9
- lsr lr, #15
+ mov lr, r9
+ lsr lr, #15
add r4, r11,lr
- vld2.8 {d28[0],d29[0]}, [r4] //q14: 0000000b0000000a;
+ vld2.8 {d28[0],d29[0]}, [r4] //q14: 0000000b0000000a;
add r4, r12,lr
- vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a;
- vzip.32 d28, d29 //q14: 000d000c000b000a;
+ vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a;
+ vzip.32 d28, d29 //q14: 000d000c000b000a;
- vmull.u16 q13, d6, d7 //q13: init u * init v
- vmull.u32 q12, d26,d28
- vmlal.u32 q12, d27,d29
- vqadd.u64 d24, d24,d25
- vrshr.u64 d24, #30
+ vmull.u16 q13, d6, d7 //q13: init u * init v
+ vmull.u32 q12, d26,d28
+ vmlal.u32 q12, d27,d29
+ vqadd.u64 d24, d24,d25
+ vrshr.u64 d24, #30
- vst1.8 {d24[0]}, [r0]!
- add r9, r6
- vadd.u16 d6, d0 // inc u
- vshl.u16 d6, #1
- vshr.u16 d6, #1
- subs r10, #1
- bne _WIDTH
+ vst1.8 {d24[0]}, [r0]!
+ add r9, r6
+ vadd.u16 d6, d0 // inc u
+ vshl.u16 d6, #1
+ vshr.u16 d6, #1
+ subs r10, #1
+ bne _WIDTH
WIDTH_END:
- lsr r9, #15
+ lsr r9, #15
add r4,r11,r9
- vld1.8 {d24[0]}, [r4]
- vst1.8 {d24[0]}, [r0]
- add r0, #1
- add r8, r7
- add r0, r1
- vadd.s16 d7, d5 // inc v
- vshl.u16 d7, #1
- vshr.u16 d7, #1
- subs r3, #1
- bne _HEIGHT
+ vld1.8 {d24[0]}, [r4]
+ vst1.8 {d24[0]}, [r0]
+ add r0, #1
+ add r8, r7
+ add r0, r1
+ vadd.s16 d7, d5 // inc v
+ vshl.u16 d7, #1
+ vshr.u16 d7, #1
+ subs r3, #1
+ bne _HEIGHT
LAST_ROW:
ldr r4, [sp, #40] //the addr of src
- lsr r8, #15
- mul r8, r5
- add r4, r8 // get current row address
- mov r9, #16384
+ lsr r8, #15
+ mul r8, r5
+ add r4, r8 // get current row address
+ mov r9, #16384
_LAST_ROW_WIDTH:
- mov r11, r9
- lsr r11, #15
+ mov r11, r9
+ lsr r11, #15
- add r3, r4,r11
- vld1.8 {d0[0]}, [r3]
- vst1.8 {d0[0]}, [r0]
- add r0, #1
- add r9, r6
- subs r2, #1
- bne _LAST_ROW_WIDTH
+ add r3, r4,r11
+ vld1.8 {d0[0]}, [r3]
+ vst1.8 {d0[0]}, [r0]
+ add r0, #1
+ add r9, r6
+ subs r2, #1
+ bne _LAST_ROW_WIDTH
- ldmia sp!, {r4-r12, lr}
+ ldmia sp!, {r4-r12, lr}
WELS_ASM_FUNC_END
#endif
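
For reference, the comp_ds_bilinear loops above implement a 2:1 dyadic bilinear downsample: each output pixel is the rounded average of a 2x2 block of source pixels (vpaddl.u8 plus vrshr.u16 #1 for the horizontal halving, vrhadd.u16 for the vertical one). Below is a minimal scalar sketch of that operation, ignoring the width-remainder and saved-tail handling the assembly performs; the function name and prototype are illustrative assumptions, not the project's exported API.

    #include <stdint.h>

    /* Illustrative scalar model of the 2:1 dyadic bilinear downsample above;
     * the name and prototype are assumptions, not the project's API. */
    static void dyadic_bilinear_downsample_ref(uint8_t* dst, int dst_stride,
                                               const uint8_t* src, int src_stride,
                                               int src_width, int src_height) {
        for (int y = 0; y < src_height / 2; y++) {
            const uint8_t* r0 = src + 2 * y * src_stride;       /* even source row */
            const uint8_t* r1 = r0 + src_stride;                /* odd source row  */
            uint8_t* d = dst + y * dst_stride;
            for (int x = 0; x < src_width / 2; x++) {
                int top = (r0[2 * x] + r0[2 * x + 1] + 1) >> 1; /* vpaddl.u8 + vrshr.u16 #1 */
                int bot = (r1[2 * x] + r1[2 * x + 1] + 1) >> 1;
                d[x] = (uint8_t)((top + bot + 1) >> 1);         /* vrhadd.u16 */
            }
        }
    }

The w_x8, w_x16 and w_x32 variants above differ only in how many output pixels each inner-loop iteration produces.
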
--- a/codec/processing/src/arm/pixel_sad_neon.S
+++ b/codec/processing/src/arm/pixel_sad_neon.S
@@ -37,32 +37,32 @@
WELS_ASM_FUNC_BEGIN WelsProcessingSampleSad8x8_neon
stmdb sp!, {lr}
- //Loading a horizontal line data (8 bytes)
- vld1.8 {d0}, [r0], r1
- vld1.8 {d1}, [r2], r3
+ //Loading a horizontal line of data (8 bytes)
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d1}, [r2], r3
- //Do the SAD for 8 bytes
- vabdl.u8 q1, d0, d1
+ //Do the SAD for 8 bytes
+ vabdl.u8 q1, d0, d1
- mov lr, #7
+ mov lr, #7
pixel_sad_8x8_loop0:
//Loading a horizontal line data (8 bytes)
- vld1.8 {d0}, [r0], r1
- vld1.8 {d1}, [r2], r3
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d1}, [r2], r3
- subs lr, #1
+ subs lr, #1
- //Do the SAD for 8 bytes
- vabal.u8 q1, d0, d1
- bne pixel_sad_8x8_loop0
+ //Do the SAD for 8 bytes
+ vabal.u8 q1, d0, d1
+ bne pixel_sad_8x8_loop0
- vadd.u16 d2, d3
- vpaddl.u16 d2, d2
- vpaddl.u32 d2, d2
- vmov.u32 r0, d2[0]//TBO...
+ vadd.u16 d2, d3
+ vpaddl.u16 d2, d2
+ vpaddl.u32 d2, d2
+ vmov.u32 r0, d2[0]//TBO...
- ldmia sp!, {lr}
+ ldmia sp!, {lr}
WELS_ASM_FUNC_END
#endif
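
WelsProcessingSampleSad8x8_neon above accumulates |a - b| over an 8x8 block with vabdl/vabal and then reduces with vadd/vpaddl. A scalar sketch of the same computation follows; reading the four registers as (block A, stride A, block B, stride B) follows the load/stride pattern above and is an assumption, not a quoted prototype.

    #include <stdint.h>

    /* Scalar model of the 8x8 SAD; argument interpretation is an assumption. */
    static int32_t sad_8x8_ref(const uint8_t* a, int a_stride,
                               const uint8_t* b, int b_stride) {
        int32_t sad = 0;
        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++) {
                int d = a[x] - b[x];
                sad += d < 0 ? -d : d;   /* vabdl.u8 / vabal.u8 accumulate |a - b| */
            }
            a += a_stride;
            b += b_stride;
        }
        return sad;                      /* returned in r0 via the final vmov.u32 */
    }
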
--- a/codec/processing/src/arm/vaa_calc_neon.S
+++ b/codec/processing/src/arm/vaa_calc_neon.S
@@ -37,61 +37,61 @@
#ifdef __APPLE__
.macro ABS_SUB_SUM_16BYTES
- vld1.32 {q15}, [$0], $2
- vld1.32 {q14}, [$1], $2
- vabal.u8 $3, d30, d28
- vabal.u8 $4, d31, d29
+ vld1.32 {q15}, [$0], $2
+ vld1.32 {q14}, [$1], $2
+ vabal.u8 $3, d30, d28
+ vabal.u8 $4, d31, d29
.endm
.macro ABS_SUB_SUM_8x16BYTES
- vld1.32 {q15}, [$0], $2
- vld1.32 {q14}, [$1], $2
- vabdl.u8 $3, d30, d28
- vabdl.u8 $4, d31, d29
+ vld1.32 {q15}, [$0], $2
+ vld1.32 {q14}, [$1], $2
+ vabdl.u8 $3, d30, d28
+ vabdl.u8 $4, d31, d29
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+ ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+ ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+ ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+ ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+ ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+ ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+ ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
.endm
.macro SAD_8X16BITS
- vadd.u16 d31, $0, $1
- vpaddl.u16 d31, d31
- vpaddl.u32 $2, d31
+ vadd.u16 d31, $0, $1
+ vpaddl.u16 d31, d31
+ vpaddl.u32 $2, d31
.endm
#else
.macro ABS_SUB_SUM_16BYTES arg0, arg1, arg2, arg3, arg4
- vld1.32 {q15}, [\arg0], \arg2
- vld1.32 {q14}, [\arg1], \arg2
- vabal.u8 \arg3, d30, d28
- vabal.u8 \arg4, d31, d29
+ vld1.32 {q15}, [\arg0], \arg2
+ vld1.32 {q14}, [\arg1], \arg2
+ vabal.u8 \arg3, d30, d28
+ vabal.u8 \arg4, d31, d29
.endm
.macro ABS_SUB_SUM_8x16BYTES arg0, arg1, arg2, arg3, arg4
- vld1.32 {q15}, [\arg0], \arg2
- vld1.32 {q14}, [\arg1], \arg2
- vabdl.u8 \arg3, d30, d28
- vabdl.u8 \arg4, d31, d29
+ vld1.32 {q15}, [\arg0], \arg2
+ vld1.32 {q14}, [\arg1], \arg2
+ vabdl.u8 \arg3, d30, d28
+ vabdl.u8 \arg4, d31, d29
- ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
- ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
- ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
- ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
- ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
- ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
- ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+ ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+ ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+ ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+ ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+ ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+ ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+ ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
.endm
.macro SAD_8X16BITS arg0, arg1, arg2
- vadd.u16 d31, \arg0, \arg1
- vpaddl.u16 d31, d31
- vpaddl.u32 \arg2, d31
+ vadd.u16 d31, \arg0, \arg1
+ vpaddl.u16 d31, d31
+ vpaddl.u32 \arg2, d31
.endm
#endif
@@ -100,16 +100,16 @@
stmdb sp!, {r4-r8}
- ldr r4, [sp, #20] //load pic_stride
- ldr r5, [sp, #28] //load psad8x8
+ ldr r4, [sp, #20] //load pic_stride
+ ldr r5, [sp, #28] //load psad8x8
- //Initial the Q8 register for save the "psadframe"
- vmov.s64 q8, #0
+ //Initialize the Q8 register for saving "psadframe"
+ vmov.s64 q8, #0
- //Get the jump distance to use on loop codes
- lsl r8, r4, #4
- sub r7, r8, #16 //R7 keep the 16*pic_stride-16
- sub r8, r2 //R8 keep the 16*pic_stride-pic_width
+ //Get the jump distances used by the loop code
+ lsl r8, r4, #4
+ sub r7, r8, #16 //R7 keeps 16*pic_stride-16
+ sub r8, r2 //R8 keeps 16*pic_stride-pic_width
vaa_calc_sad_loop0:
@@ -118,70 +118,70 @@
vaa_calc_sad_loop1:
- //Process the 16x16 bytes
- ABS_SUB_SUM_8x16BYTES r0, r1, r4, q0, q1
- ABS_SUB_SUM_8x16BYTES r0, r1, r4, q2, q3
+ //Process the 16x16 bytes
+ ABS_SUB_SUM_8x16BYTES r0, r1, r4, q0, q1
+ ABS_SUB_SUM_8x16BYTES r0, r1, r4, q2, q3
- //Do the SAD
- SAD_8X16BITS d0, d1, d0
- SAD_8X16BITS d2, d3, d1
- SAD_8X16BITS d4, d5, d2
- SAD_8X16BITS d6, d7, d3
+ //Do the SAD
+ SAD_8X16BITS d0, d1, d0
+ SAD_8X16BITS d2, d3, d1
+ SAD_8X16BITS d4, d5, d2
+ SAD_8X16BITS d6, d7, d3
- //Write to "psad8x8" buffer
- vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]!
+ //Write to "psad8x8" buffer
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]!
- //Adjust the input address
- sub r0, r7
- sub r1, r7
+ //Adjust the input address
+ sub r0, r7
+ sub r1, r7
- subs r6, #16
+ subs r6, #16
- //Save to calculate "psadframe"
- vadd.u32 q0, q1
- vadd.u32 q8, q0
+ //Save to calculate "psadframe"
+ vadd.u32 q0, q1
+ vadd.u32 q8, q0
- bne vaa_calc_sad_loop1
+ bne vaa_calc_sad_loop1
- //Adjust the input address
- add r0, r8
- add r1, r8
+ //Adjust the input address
+ add r0, r8
+ add r1, r8
subs r3, #16
- bne vaa_calc_sad_loop0
+ bne vaa_calc_sad_loop0
- ldr r6, [sp, #24] //load psadframe
- vadd.u32 d16, d17
- vst1.32 {d16[0]}, [r6]
+ ldr r6, [sp, #24] //load psadframe
+ vadd.u32 d16, d17
+ vst1.32 {d16[0]}, [r6]
- ldmia sp!, {r4-r8}
+ ldmia sp!, {r4-r8}
WELS_ASM_FUNC_END
#ifdef __APPLE__
-.macro SAD_SD_MAD_16BYTES
- vld1.32 {q0}, [$0], $2
- vld1.32 {q1}, [$1], $2
+.macro SAD_SD_MAD_16BYTES
+ vld1.32 {q0}, [$0], $2
+ vld1.32 {q1}, [$1], $2
- vpadal.u8 $3, q0
- vpadal.u8 $4, q1
+ vpadal.u8 $3, q0
+ vpadal.u8 $4, q1
- vabd.u8 q0, q0, q1
- vmax.u8 $5, q0
- vpadal.u8 $6, q0
+ vabd.u8 q0, q0, q1
+ vmax.u8 $5, q0
+ vpadal.u8 $6, q0
.endm
-.macro SAD_SD_MAD_8x16BYTES
- vld1.32 {q0}, [$0], $2
- vld1.32 {q1}, [$1], $2
+.macro SAD_SD_MAD_8x16BYTES
+ vld1.32 {q0}, [$0], $2
+ vld1.32 {q1}, [$1], $2
- vpaddl.u8 q2, q0
- vpaddl.u8 q3, q1
+ vpaddl.u8 q2, q0
+ vpaddl.u8 q3, q1
- vabd.u8 $3, q0, q1
- vpaddl.u8 $4, $3 //abs_diff
+ vabd.u8 $3, q0, q1
+ vpaddl.u8 $4, $3 //abs_diff
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
@@ -192,41 +192,41 @@
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
- vsub.u16 $5, q2, q3
+ vsub.u16 $5, q2, q3
.endm
-.macro SAD_SD_MAD_CALC
- vpmax.u8 d0, $0, $1 //8bytes
- vpmax.u8 d0, d0, d0 //4bytes
- vpmax.u8 $2, d0, d0 //2bytes
+.macro SAD_SD_MAD_CALC
+ vpmax.u8 d0, $0, $1 //8bytes
+ vpmax.u8 d0, d0, d0 //4bytes
+ vpmax.u8 $2, d0, d0 //2bytes
- vpaddl.u16 $3, $3
- vpaddl.u32 $3, $3
- vpaddl.s16 $4, $4
- vpaddl.s32 $4, $4
+ vpaddl.u16 $3, $3
+ vpaddl.u32 $3, $3
+ vpaddl.s16 $4, $4
+ vpaddl.s32 $4, $4
.endm
#else
-.macro SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6
- vld1.32 {q0}, [\arg0], \arg2
- vld1.32 {q1}, [\arg1], \arg2
+.macro SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6
+ vld1.32 {q0}, [\arg0], \arg2
+ vld1.32 {q1}, [\arg1], \arg2
- vpadal.u8 \arg3, q0
- vpadal.u8 \arg4, q1
+ vpadal.u8 \arg3, q0
+ vpadal.u8 \arg4, q1
- vabd.u8 q0, q0, q1
- vmax.u8 \arg5, q0
- vpadal.u8 \arg6, q0
+ vabd.u8 q0, q0, q1
+ vmax.u8 \arg5, q0
+ vpadal.u8 \arg6, q0
.endm
-.macro SAD_SD_MAD_8x16BYTES arg0, arg1, arg2, arg3, arg4, arg5
- vld1.32 {q0}, [\arg0], \arg2
- vld1.32 {q1}, [\arg1], \arg2
+.macro SAD_SD_MAD_8x16BYTES arg0, arg1, arg2, arg3, arg4, arg5
+ vld1.32 {q0}, [\arg0], \arg2
+ vld1.32 {q1}, [\arg1], \arg2
- vpaddl.u8 q2, q0
- vpaddl.u8 q3, q1
+ vpaddl.u8 q2, q0
+ vpaddl.u8 q3, q1
- vabd.u8 \arg3, q0, q1
- vpaddl.u8 \arg4, \arg3 //abs_diff
+ vabd.u8 \arg3, q0, q1
+ vpaddl.u8 \arg4, \arg3 //abs_diff
SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
@@ -237,18 +237,18 @@
SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
- vsub.u16 \arg5, q2, q3
+ vsub.u16 \arg5, q2, q3
.endm
-.macro SAD_SD_MAD_CALC arg0, arg1, arg2, arg3, arg4
- vpmax.u8 d0, \arg0, \arg1 //8bytes
- vpmax.u8 d0, d0, d0 //4bytes
- vpmax.u8 \arg2, d0, d0 //2bytes
+.macro SAD_SD_MAD_CALC arg0, arg1, arg2, arg3, arg4
+ vpmax.u8 d0, \arg0, \arg1 //8bytes
+ vpmax.u8 d0, d0, d0 //4bytes
+ vpmax.u8 \arg2, d0, d0 //2bytes
- vpaddl.u16 \arg3, \arg3
- vpaddl.u32 \arg3, \arg3
- vpaddl.s16 \arg4, \arg4
- vpaddl.s32 \arg4, \arg4
+ vpaddl.u16 \arg3, \arg3
+ vpaddl.u32 \arg3, \arg3
+ vpaddl.s16 \arg4, \arg4
+ vpaddl.s32 \arg4, \arg4
.endm
#endif
@@ -256,18 +256,18 @@
stmdb sp!, {r4-r10}
- ldr r4, [sp, #28] //load pic_stride
- ldr r5, [sp, #36] //load psad8x8
+ ldr r4, [sp, #28] //load pic_stride
+ ldr r5, [sp, #36] //load psad8x8
ldr r6, [sp, #40] //load psd8x8
ldr r7, [sp, #44] //load pmad8x8
- //Initial the Q4 register for save the "psadframe"
- vmov.s64 q15, #0
+ //Initialize the Q15 register for saving "psadframe"
+ vmov.s64 q15, #0
- //Get the jump distance to use on loop codes
- lsl r10, r4, #4
- sub r9, r10, #16 //R9 keep the 16*pic_stride-16
- sub r10, r2 //R10 keep the 16*pic_stride-pic_width
+ //Get the jump distances used by the loop code
+ lsl r10, r4, #4
+ sub r9, r10, #16 //R9 keeps 16*pic_stride-16
+ sub r10, r2 //R10 keeps 16*pic_stride-pic_width
vaa_calc_sad_bgd_loop0:
@@ -276,384 +276,384 @@
vaa_calc_sad_bgd_loop1:
- //Process the 16x16 bytes pmad psad psd
- SAD_SD_MAD_8x16BYTES r0, r1, r4, q13, q11, q9
- SAD_SD_MAD_8x16BYTES r0, r1, r4, q14, q12, q10
+ //Process the 16x16 bytes pmad psad psd
+ SAD_SD_MAD_8x16BYTES r0, r1, r4, q13, q11, q9
+ SAD_SD_MAD_8x16BYTES r0, r1, r4, q14, q12, q10
SAD_SD_MAD_CALC d26, d27, d16, q11, q9
SAD_SD_MAD_CALC d28, d29, d17, q12, q10
- //Write to "psad8x8" buffer
- vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]!
- //Adjust the input address
- sub r0, r9
- sub r1, r9
- //Write to "psd8x8" buffer
- vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]!
- subs r8, #16
- //Write to "pmad8x8" buffer
- vst2.16 {d16[0],d17[0]}, [r7]!
- //Save to calculate "psadframe"
- vadd.u32 q11, q12
- vadd.u32 q15, q11
+ //Write to "psad8x8" buffer
+ vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]!
+ //Adjust the input address
+ sub r0, r9
+ sub r1, r9
+ //Write to "psd8x8" buffer
+ vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]!
+ subs r8, #16
+ //Write to "pmad8x8" buffer
+ vst2.16 {d16[0],d17[0]}, [r7]!
+ //Save to calculate "psadframe"
+ vadd.u32 q11, q12
+ vadd.u32 q15, q11
- bne vaa_calc_sad_bgd_loop1
+ bne vaa_calc_sad_bgd_loop1
- //Adjust the input address
- add r0, r10
- add r1, r10
+ //Adjust the input address
+ add r0, r10
+ add r1, r10
subs r3, #16
- bne vaa_calc_sad_bgd_loop0
+ bne vaa_calc_sad_bgd_loop0
- ldr r8, [sp, #32] //load psadframe
- vadd.u32 d30, d31
- vst1.32 {d30[0]}, [r8]
- ldmia sp!, {r4-r10}
+ ldr r8, [sp, #32] //load psadframe
+ vadd.u32 d30, d31
+ vst1.32 {d30[0]}, [r8]
+ ldmia sp!, {r4-r10}
WELS_ASM_FUNC_END
#ifdef __APPLE__
-.macro SSD_MUL_SUM_16BYTES_RESET
- vmull.u8 $3, $0, $0
- vpaddl.u16 $2, $3
+.macro SSD_MUL_SUM_16BYTES_RESET
+ vmull.u8 $3, $0, $0
+ vpaddl.u16 $2, $3
- vmull.u8 $3, $1, $1
- vpadal.u16 $2, $3
+ vmull.u8 $3, $1, $1
+ vpadal.u16 $2, $3
.endm
-.macro SSD_MUL_SUM_16BYTES
- vmull.u8 $3, $0, $0
- vpadal.u16 $2, $3
+.macro SSD_MUL_SUM_16BYTES
+ vmull.u8 $3, $0, $0
+ vpadal.u16 $2, $3
- vmull.u8 $3, $1, $1
- vpadal.u16 $2, $3
+ vmull.u8 $3, $1, $1
+ vpadal.u16 $2, $3
.endm
.macro SAD_SSD_BGD_16
- vld1.8 {q0}, [$0], $2 //load cur_row
+ vld1.8 {q0}, [$0], $2 //load cur_row
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
+ vpadal.u8 q3, q0 //add cur_row together
+ vpadal.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
+ vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
- vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
- SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqdiff reset for every 16x16
- vld1.8 {q1}, [$1], $2 //load ref_row
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vld1.8 {q1}, [$1], $2 //load ref_row
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
//the last row of a 16x16 block
.macro SAD_SSD_BGD_16_end
- vld1.8 {q0}, [$0], $1 //load cur_row
+ vld1.8 {q0}, [$0], $1 //load cur_row
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
+ vpadal.u8 q3, q0 //add cur_row together
+ vpadal.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
+ vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
- vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16
+ vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16
- SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqdiff reset for every 16x16
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
//for the begin of a 8x16 block, use some instructions to reset the register
.macro SAD_SSD_BGD_16_RESET_8x8
- vld1.8 {q0}, [$0], $2 //load cur_row
+ vld1.8 {q0}, [$0], $2 //load cur_row
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
+ vpaddl.u8 q3, q0 //add cur_row together
+ vpaddl.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
+ vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
- vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
- SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqdiff reset for every 16x16
- vld1.8 {q1}, [$1], $2 //load ref_row
+ vld1.8 {q1}, [$1], $2 //load ref_row
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
//for the begin of a 16x16 block, use some instructions to reset the register
.macro SAD_SSD_BGD_16_RESET_16x16
- vld1.8 {q0}, [$0], $2 //load cur_row
- vld1.8 {q1}, [$1], $2 //load ref_row
+ vld1.8 {q0}, [$0], $2 //load cur_row
+ vld1.8 {q1}, [$1], $2 //load ref_row
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
+ vpaddl.u8 q3, q0 //add cur_row together
+ vpaddl.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
+ vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
- vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
- SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqdiff reset for every 16x16
- vld1.8 {q1}, [$1], $2 //load ref_row
+ vld1.8 {q1}, [$1], $2 //load ref_row
- vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11 //q10 for lsqsum reset for every 16x16
.endm
//for each 8x16 block
.macro SAD_SSD_BGD_CALC_8x16
- vpmax.u8 d10, d10, d11 //4 numbers
- vpmax.u8 d10, d10, d10 //2 numbers
- vpmax.u8 d10, d10, d10 //1 number1
+ vpmax.u8 d10, d10, d11 //4 numbers
+ vpmax.u8 d10, d10, d10 //2 numbers
+ vpmax.u8 d10, d10, d10 //1 number
- vmov $0, d10 //d26 d27 keeps the l_mad
+ vmov $0, d10 //d26 d27 keeps the l_mad
- //p_sd8x8 fix me
- vpaddl.u16 q3, q3
- vpaddl.u16 q4, q4
+ //p_sd8x8 fix me
+ vpaddl.u16 q3, q3
+ vpaddl.u16 q4, q4
- vsub.i32 $1, q3, q4
- vpaddl.u32 $1, $1
+ vsub.i32 $1, q3, q4
+ vpaddl.u32 $1, $1
- //psad8x8
- vpaddl.u16 $2, $2
- vpaddl.u32 $2, $2
+ //psad8x8
+ vpaddl.u16 $2, $2
+ vpaddl.u32 $2, $2
- //psadframe
- vadd.i32 q12, $2
+ //psadframe
+ vadd.i32 q12, $2
.endm
.macro SAD_SSD_BGD_16x16
- //for one 8x16
- SAD_SSD_BGD_16_RESET_16x16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
+ //for one 8x16
+ SAD_SSD_BGD_16_RESET_16x16 $0, $1, $2, q6
+ SAD_SSD_BGD_16 $0, $1, $2, q6
+ SAD_SSD_BGD_16 $0, $1, $2, q6
+ SAD_SSD_BGD_16 $0, $1, $2, q6
+ SAD_SSD_BGD_16 $0, $1, $2, q6
+ SAD_SSD_BGD_16 $0, $1, $2, q6
+ SAD_SSD_BGD_16 $0, $1, $2, q6
+ SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_CALC_8x16 d26, q14, q6
+ SAD_SSD_BGD_CALC_8x16 d26, q14, q6
- //for another 8x16
- SAD_SSD_BGD_16_RESET_8x8 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16_end $0, $2, q7
+ //for another 8x16
+ SAD_SSD_BGD_16_RESET_8x8 $0, $1, $2, q7
+ SAD_SSD_BGD_16 $0, $1, $2, q7
+ SAD_SSD_BGD_16 $0, $1, $2, q7
+ SAD_SSD_BGD_16 $0, $1, $2, q7
+ SAD_SSD_BGD_16 $0, $1, $2, q7
+ SAD_SSD_BGD_16 $0, $1, $2, q7
+ SAD_SSD_BGD_16 $0, $1, $2, q7
+ SAD_SSD_BGD_16_end $0, $2, q7
- SAD_SSD_BGD_CALC_8x16 d27, q15, q7
+ SAD_SSD_BGD_CALC_8x16 d27, q15, q7
.endm
-.macro SSD_SAD_SD_MAD_PADDL
- vpaddl.s16 $0, $0
- vpaddl.s32 $0, $0
- vadd.i32 $1, $1, $2
+.macro SSD_SAD_SD_MAD_PADDL
+ vpaddl.s16 $0, $0
+ vpaddl.s32 $0, $0
+ vadd.i32 $1, $1, $2
.endm
#else
-.macro SSD_MUL_SUM_16BYTES_RESET arg0, arg1, arg2, arg3
- vmull.u8 \arg3, \arg0, \arg0
- vpaddl.u16 \arg2, \arg3
+.macro SSD_MUL_SUM_16BYTES_RESET arg0, arg1, arg2, arg3
+ vmull.u8 \arg3, \arg0, \arg0
+ vpaddl.u16 \arg2, \arg3
- vmull.u8 \arg3, \arg1, \arg1
- vpadal.u16 \arg2, \arg3
+ vmull.u8 \arg3, \arg1, \arg1
+ vpadal.u16 \arg2, \arg3
.endm
-.macro SSD_MUL_SUM_16BYTES arg0, arg1, arg2, arg3
- vmull.u8 \arg3, \arg0, \arg0
- vpadal.u16 \arg2, \arg3
+.macro SSD_MUL_SUM_16BYTES arg0, arg1, arg2, arg3
+ vmull.u8 \arg3, \arg0, \arg0
+ vpadal.u16 \arg2, \arg3
- vmull.u8 \arg3, \arg1, \arg1
- vpadal.u16 \arg2, \arg3
+ vmull.u8 \arg3, \arg1, \arg1
+ vpadal.u16 \arg2, \arg3
.endm
.macro SAD_SSD_BGD_16 arg0, arg1, arg2, arg3
- vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+ vld1.8 {q0}, [\arg0], \arg2 //load cur_row
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
+ vpadal.u8 q3, q0 //add cur_row together
+ vpadal.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
+ vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
- vpadal.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpadal.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
- SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqdiff reset for every 16x16
- vld1.8 {q1}, [\arg1], \arg2 //load ref_row
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vld1.8 {q1}, [\arg1], \arg2 //load ref_row
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
//the last row of a 16x16 block
.macro SAD_SSD_BGD_16_end arg0, arg1, arg2
- vld1.8 {q0}, [\arg0], \arg1 //load cur_row
+ vld1.8 {q0}, [\arg0], \arg1 //load cur_row
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
+ vpadal.u8 q3, q0 //add cur_row together
+ vpadal.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
+ vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
- vpadal.u8 \arg2, q2 //l_sad for 16 bytes reset for every 8x16
+ vpadal.u8 \arg2, q2 //l_sad for 16 bytes reset for every 8x16
- SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqdiff reset for every 16x16
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
//for the begin of a 8x16 block, use some instructions to reset the register
.macro SAD_SSD_BGD_16_RESET_8x8 arg0, arg1, arg2, arg3
- vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+ vld1.8 {q0}, [\arg0], \arg2 //load cur_row
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
+ vpaddl.u8 q3, q0 //add cur_row together
+ vpaddl.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
+ vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
- vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
- SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqdiff reset for every 16x16
- vld1.8 {q1}, [\arg1], \arg2 //load ref_row
+ vld1.8 {q1}, [\arg1], \arg2 //load ref_row
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
//for the begin of a 16x16 block, use some instructions to reset the register
.macro SAD_SSD_BGD_16_RESET_16x16 arg0, arg1, arg2, arg3
- vld1.8 {q0}, [\arg0], \arg2 //load cur_row
- vld1.8 {q1}, [\arg1], \arg2 //load ref_row
+ vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+ vld1.8 {q1}, [\arg1], \arg2 //load ref_row
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
+ vpaddl.u8 q3, q0 //add cur_row together
+ vpaddl.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
+ vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
- vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
- SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqdiff reset for every 16x16
- vld1.8 {q1}, [\arg1], \arg2 //load ref_row
+ vld1.8 {q1}, [\arg1], \arg2 //load ref_row
- vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11 //q10 for lsqsum reset for every 16x16
.endm
//for each 8x16 block
.macro SAD_SSD_BGD_CALC_8x16 arg0, arg1, arg2
- vpmax.u8 d10, d10, d11 //4 numbers
- vpmax.u8 d10, d10, d10 //2 numbers
- vpmax.u8 d10, d10, d10 //1 number1
+ vpmax.u8 d10, d10, d11 //4 numbers
+ vpmax.u8 d10, d10, d10 //2 numbers
+ vpmax.u8 d10, d10, d10 //1 number
- vmov \arg0, d10 //d26 d27 keeps the l_mad
+ vmov \arg0, d10 //d26 d27 keeps the l_mad
- //p_sd8x8
- vpaddl.u16 q3, q3
- vpaddl.u16 q4, q4
+ //p_sd8x8
+ vpaddl.u16 q3, q3
+ vpaddl.u16 q4, q4
- vsub.i32 \arg1, q3, q4
- vpaddl.u32 \arg1, \arg1
+ vsub.i32 \arg1, q3, q4
+ vpaddl.u32 \arg1, \arg1
- //psad8x8
- vpaddl.u16 \arg2, \arg2
- vpaddl.u32 \arg2, \arg2
+ //psad8x8
+ vpaddl.u16 \arg2, \arg2
+ vpaddl.u32 \arg2, \arg2
- //psadframe
- vadd.i32 q12, \arg2
+ //psadframe
+ vadd.i32 q12, \arg2
.endm
.macro SAD_SSD_BGD_16x16 arg0, arg1, arg2
- //for one 8x16
- SAD_SSD_BGD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+ //for one 8x16
+ SAD_SSD_BGD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_BGD_CALC_8x16 d26, q14, q6
+ SAD_SSD_BGD_CALC_8x16 d26, q14, q6
- //for another 8x16
- SAD_SSD_BGD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_BGD_16_end \arg0, \arg2, q7
+ //for another 8x16
+ SAD_SSD_BGD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_BGD_16_end \arg0, \arg2, q7
- SAD_SSD_BGD_CALC_8x16 d27, q15, q7
+ SAD_SSD_BGD_CALC_8x16 d27, q15, q7
.endm
-.macro SSD_SAD_SD_MAD_PADDL arg0, arg1, arg2
- vpaddl.s16 \arg0, \arg0
- vpaddl.s32 \arg0, \arg0
- vadd.i32 \arg1, \arg1, \arg2
+.macro SSD_SAD_SD_MAD_PADDL arg0, arg1, arg2
+ vpaddl.s16 \arg0, \arg0
+ vpaddl.s32 \arg0, \arg0
+ vadd.i32 \arg1, \arg1, \arg2
.endm
#endif
WELS_ASM_FUNC_BEGIN VAACalcSadSsdBgd_neon
- stmdb sp!, {r0-r12, r14}
- vpush {q4-q7}
+ stmdb sp!, {r0-r12, r14}
+ vpush {q4-q7}
- ldr r4, [sp, #120] //r4 keeps the pic_stride
+ ldr r4, [sp, #120] //r4 keeps the pic_stride
- sub r5, r4, #1
- lsl r5, r5, #4 //r5 keeps the little step
+ sub r5, r4, #1
+ lsl r5, r5, #4 //r5 keeps the little step
- lsl r6, r4, #4
- sub r6, r2, r6 //r6 keeps the big step
+ lsl r6, r4, #4
+ sub r6, r2, r6 //r6 keeps the big step
- ldr r8, [sp, #128]//psad8x8
- ldr r9, [sp, #132]//psum16x16
- ldr r10, [sp, #136]//psqsum16x16
- ldr r11, [sp, #140]//psqdiff16x16
- ldr r12, [sp, #144]//p_sd8x8
- ldr r14, [sp, #148]//p_mad8x8
+ ldr r8, [sp, #128]//psad8x8
+ ldr r9, [sp, #132]//psum16x16
+ ldr r10, [sp, #136]//psqsum16x16
+ ldr r11, [sp, #140]//psqdiff16x16
+ ldr r12, [sp, #144]//p_sd8x8
+ ldr r14, [sp, #148]//p_mad8x8
- vmov.i8 q12, #0
+ vmov.i8 q12, #0
vaa_calc_sad_ssd_bgd_height_loop:
@@ -660,7 +660,7 @@
mov r7, r2
vaa_calc_sad_ssd_bgd_width_loop:
- //l_sd q14&q15, l_mad q13, l_sad q6 & q7, l_sqdiff q8, l_sum q9, l_sqsum q10
+ //l_sd q14&q15, l_mad q13, l_sad q6 & q7, l_sqdiff q8, l_sum q9, l_sqsum q10
SAD_SSD_BGD_16x16 r0,r1,r4
//psad8x8
@@ -694,20 +694,20 @@
bne vaa_calc_sad_ssd_bgd_width_loop
- sub r0, r0, r6 //jump to next 16 x width
- sub r1, r1, r6 //jump to next 16 x width
+ sub r0, r0, r6 //jump to next 16 x width
+ sub r1, r1, r6 //jump to next 16 x width
subs r3, #16
bne vaa_calc_sad_ssd_bgd_height_loop
- //psadframe
- ldr r7, [sp, #124]//psadframe
+ //psadframe
+ ldr r7, [sp, #124]//psadframe
- vadd.i32 d24, d24, d25
- vst1.32 {d24[0]}, [r7]
+ vadd.i32 d24, d24, d25
+ vst1.32 {d24[0]}, [r7]
- vpop {q4-q7}
- ldmia sp!, {r0-r12, r14}
+ vpop {q4-q7}
+ ldmia sp!, {r0-r12, r14}
WELS_ASM_FUNC_END
@@ -714,223 +714,223 @@
#ifdef __APPLE__
.macro SAD_VAR_16
- vld1.8 {q0}, [$0], $2 //load cur_row
+ vld1.8 {q0}, [$0], $2 //load cur_row
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
+ vpadal.u8 q3, q0 //add cur_row together
+ vpadal.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
- vld1.8 {q1}, [$1], $2
+ vld1.8 {q1}, [$1], $2
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
.macro SAD_VAR_16_END
- vld1.8 {q0}, [$0], $1 //load cur_row
+ vld1.8 {q0}, [$0], $1 //load cur_row
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
+ vpadal.u8 q3, q0 //add cur_row together
+ vpadal.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16
+ vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
.macro SAD_VAR_16_RESET_16x16
- vld1.8 {q0}, [$0], $2 //load cur_row
- vld1.8 {q1}, [$1], $2
+ vld1.8 {q0}, [$0], $2 //load cur_row
+ vld1.8 {q1}, [$1], $2
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
+ vpaddl.u8 q3, q0 //add cur_row together
+ vpaddl.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
- vld1.8 {q1}, [$1], $2
+ vld1.8 {q1}, [$1], $2
- vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
+ SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
.endm
.macro SAD_VAR_16_RESET_8x8
- vld1.8 {q0}, [$0], $2 //load cur_row
+ vld1.8 {q0}, [$0], $2 //load cur_row
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
+ vpaddl.u8 q3, q0 //add cur_row together
+ vpaddl.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
- vld1.8 {q1}, [$1], $2
+ vld1.8 {q1}, [$1], $2
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
.macro SAD_VAR_16x16
- //for one 8x16
- SAD_VAR_16_RESET_16x16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
+ //for one 8x16
+ SAD_VAR_16_RESET_16x16 $0, $1, $2, q6
+ SAD_VAR_16 $0, $1, $2, q6
+ SAD_VAR_16 $0, $1, $2, q6
+ SAD_VAR_16 $0, $1, $2, q6
+ SAD_VAR_16 $0, $1, $2, q6
+ SAD_VAR_16 $0, $1, $2, q6
+ SAD_VAR_16 $0, $1, $2, q6
+ SAD_VAR_16 $0, $1, $2, q6
- vpaddl.u16 q6, q6
- vpaddl.u32 q6, q6
- vadd.i32 q12, q6
+ vpaddl.u16 q6, q6
+ vpaddl.u32 q6, q6
+ vadd.i32 q12, q6
- //for another 8x16
- SAD_VAR_16_RESET_8x8 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16_END $0, $2, q7
+ //for another 8x16
+ SAD_VAR_16_RESET_8x8 $0, $1, $2, q7
+ SAD_VAR_16 $0, $1, $2, q7
+ SAD_VAR_16 $0, $1, $2, q7
+ SAD_VAR_16 $0, $1, $2, q7
+ SAD_VAR_16 $0, $1, $2, q7
+ SAD_VAR_16 $0, $1, $2, q7
+ SAD_VAR_16 $0, $1, $2, q7
+ SAD_VAR_16_END $0, $2, q7
- vpaddl.u16 q7, q7
- vpaddl.u32 q7, q7
+ vpaddl.u16 q7, q7
+ vpaddl.u32 q7, q7
- vadd.i32 q12, q7
+ vadd.i32 q12, q7
.endm
#else
.macro SAD_VAR_16 arg0, arg1, arg2, arg3
- vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+ vld1.8 {q0}, [\arg0], \arg2 //load cur_row
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
+ vpadal.u8 q3, q0 //add cur_row together
+ vpadal.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vpadal.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpadal.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
- vld1.8 {q1}, [\arg1], \arg2
+ vld1.8 {q1}, [\arg1], \arg2
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
.macro SAD_VAR_16_END arg0, arg1, arg2
- vld1.8 {q0}, [\arg0], \arg1 //load cur_row
+ vld1.8 {q0}, [\arg0], \arg1 //load cur_row
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
+ vpadal.u8 q3, q0 //add cur_row together
+ vpadal.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vpadal.u8 \arg2, q2 //l_sad for 16 bytes reset for every 8x16
+ vpadal.u8 \arg2, q2 //l_sad for 16 bytes reset for every 8x16
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
.macro SAD_VAR_16_RESET_16x16 arg0, arg1, arg2, arg3
- vld1.8 {q0}, [\arg0], \arg2 //load cur_row
- vld1.8 {q1}, [\arg1], \arg2
+ vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+ vld1.8 {q1}, [\arg1], \arg2
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
+ vpaddl.u8 q3, q0 //add cur_row together
+ vpaddl.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
- vld1.8 {q1}, [\arg1], \arg2
+ vld1.8 {q1}, [\arg1], \arg2
- vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
+ SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
.endm
.macro SAD_VAR_16_RESET_8x8 arg0, arg1, arg2, arg3
- vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+ vld1.8 {q0}, [\arg0], \arg2 //load cur_row
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
+ vpaddl.u8 q3, q0 //add cur_row together
+ vpaddl.u8 q4, q1 //add ref_row together
- vabd.u8 q2, q0, q1 //abs_diff
+ vabd.u8 q2, q0, q1 //abs_diff
- vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
+ vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
- vld1.8 {q1}, [\arg1], \arg2
+ vld1.8 {q1}, [\arg1], \arg2
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
+ vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
+ SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
.macro SAD_VAR_16x16 arg0, arg1, arg2
- //for one 8x16
- SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, q6
- SAD_VAR_16 \arg0, \arg1, \arg2, q6
- SAD_VAR_16 \arg0, \arg1, \arg2, q6
- SAD_VAR_16 \arg0, \arg1, \arg2, q6
- SAD_VAR_16 \arg0, \arg1, \arg2, q6
- SAD_VAR_16 \arg0, \arg1, \arg2, q6
- SAD_VAR_16 \arg0, \arg1, \arg2, q6
- SAD_VAR_16 \arg0, \arg1, \arg2, q6
+ //for one 8x16
+ SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, q6
+ SAD_VAR_16 \arg0, \arg1, \arg2, q6
+ SAD_VAR_16 \arg0, \arg1, \arg2, q6
+ SAD_VAR_16 \arg0, \arg1, \arg2, q6
+ SAD_VAR_16 \arg0, \arg1, \arg2, q6
+ SAD_VAR_16 \arg0, \arg1, \arg2, q6
+ SAD_VAR_16 \arg0, \arg1, \arg2, q6
+ SAD_VAR_16 \arg0, \arg1, \arg2, q6
- vpaddl.u16 q6, q6
- vpaddl.u32 q6, q6
- vadd.i32 q12, q6
+ vpaddl.u16 q6, q6
+ vpaddl.u32 q6, q6
+ vadd.i32 q12, q6
- //for another 8x16
- SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, q7
- SAD_VAR_16 \arg0, \arg1, \arg2, q7
- SAD_VAR_16 \arg0, \arg1, \arg2, q7
- SAD_VAR_16 \arg0, \arg1, \arg2, q7
- SAD_VAR_16 \arg0, \arg1, \arg2, q7
- SAD_VAR_16 \arg0, \arg1, \arg2, q7
- SAD_VAR_16 \arg0, \arg1, \arg2, q7
- SAD_VAR_16_END \arg0, \arg2, q7
+ //for another 8x16
+ SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, q7
+ SAD_VAR_16 \arg0, \arg1, \arg2, q7
+ SAD_VAR_16 \arg0, \arg1, \arg2, q7
+ SAD_VAR_16 \arg0, \arg1, \arg2, q7
+ SAD_VAR_16 \arg0, \arg1, \arg2, q7
+ SAD_VAR_16 \arg0, \arg1, \arg2, q7
+ SAD_VAR_16 \arg0, \arg1, \arg2, q7
+ SAD_VAR_16_END \arg0, \arg2, q7
- vpaddl.u16 q7, q7
- vpaddl.u32 q7, q7
+ vpaddl.u16 q7, q7
+ vpaddl.u32 q7, q7
- vadd.i32 q12, q7
+ vadd.i32 q12, q7
.endm
#endif
WELS_ASM_FUNC_BEGIN VAACalcSadVar_neon
- stmdb sp!, {r4-r11}
- vpush {q4}
- vpush {q6-q7}
+ stmdb sp!, {r4-r11}
+ vpush {q4}
+ vpush {q6-q7}
- ldr r4, [sp, #80] //r4 keeps the pic_stride
+ ldr r4, [sp, #80] //r4 keeps the pic_stride
- sub r5, r4, #1
- lsl r5, r5, #4 //r5 keeps the little step
+ sub r5, r4, #1
+ lsl r5, r5, #4 //r5 keeps the little step
- lsl r6, r4, #4
- sub r6, r2, r6 //r6 keeps the big step
+ lsl r6, r4, #4
+ sub r6, r2, r6 //r6 keeps the big step
- ldr r7, [sp, #84] //psadframe
- ldr r8, [sp, #88] //psad8x8
- ldr r9, [sp, #92] //psum16x16
- ldr r10, [sp, #96] //psqsum16x16
+ ldr r7, [sp, #84] //psadframe
+ ldr r8, [sp, #88] //psad8x8
+ ldr r9, [sp, #92] //psum16x16
+ ldr r10, [sp, #96] //psqsum16x16
- vmov.i8 q12, #0
+ vmov.i8 q12, #0
vaa_calc_sad_var_height_loop:
mov r11, r2
@@ -956,154 +956,154 @@
bne vaa_calc_sad_var_width_loop
- sub r0, r0, r6 //jump to next 16 x width
- sub r1, r1, r6 //jump to next 16 x width
+ sub r0, r0, r6 //jump to next 16 x width
+ sub r1, r1, r6 //jump to next 16 x width
subs r3, #16
bne vaa_calc_sad_var_height_loop
- vadd.i32 d24, d24, d25
- vst1.32 {d24[0]}, [r7]
+ vadd.i32 d24, d24, d25
+ vst1.32 {d24[0]}, [r7]
- vpop {q6-q7}
- vpop {q4}
- ldmia sp!, {r4-r11}
+ vpop {q6-q7}
+ vpop {q4}
+ ldmia sp!, {r4-r11}
WELS_ASM_FUNC_END
#ifdef __APPLE__
.macro SAD_SSD_16
- SAD_VAR_16 $0, $1, $2, $3
+ SAD_VAR_16 $0, $1, $2, $3
- SSD_MUL_SUM_16BYTES d4,d5,q8, q11
+ SSD_MUL_SUM_16BYTES d4,d5,q8, q11
.endm
.macro SAD_SSD_16_END
- SAD_VAR_16_END $0, $1, $2
+ SAD_VAR_16_END $0, $1, $2
- SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqdiff reset for every 16x16
.endm
.macro SAD_SSD_16_RESET_16x16
- SAD_VAR_16_RESET_16x16 $0, $1, $2, $3
+ SAD_VAR_16_RESET_16x16 $0, $1, $2, $3
- SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqdiff reset for every 16x16
.endm
.macro SAD_SSD_16_RESET_8x8
- SAD_VAR_16_RESET_8x8 $0, $1, $2, $3
+ SAD_VAR_16_RESET_8x8 $0, $1, $2, $3
- SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqdiff reset for every 16x16
.endm
.macro SAD_SSD_16x16
- //for one 8x16
- SAD_SSD_16_RESET_16x16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
+ //for one 8x16
+ SAD_SSD_16_RESET_16x16 $0, $1, $2, q6
+ SAD_SSD_16 $0, $1, $2, q6
+ SAD_SSD_16 $0, $1, $2, q6
+ SAD_SSD_16 $0, $1, $2, q6
+ SAD_SSD_16 $0, $1, $2, q6
+ SAD_SSD_16 $0, $1, $2, q6
+ SAD_SSD_16 $0, $1, $2, q6
+ SAD_SSD_16 $0, $1, $2, q6
- vpaddl.u16 q6, q6
- vpaddl.u32 q6, q6
- vadd.i32 q12, q6
+ vpaddl.u16 q6, q6
+ vpaddl.u32 q6, q6
+ vadd.i32 q12, q6
- //for another 8x16
- SAD_SSD_16_RESET_8x8 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16_END $0, $2, q7
+ //for another 8x16
+ SAD_SSD_16_RESET_8x8 $0, $1, $2, q7
+ SAD_SSD_16 $0, $1, $2, q7
+ SAD_SSD_16 $0, $1, $2, q7
+ SAD_SSD_16 $0, $1, $2, q7
+ SAD_SSD_16 $0, $1, $2, q7
+ SAD_SSD_16 $0, $1, $2, q7
+ SAD_SSD_16 $0, $1, $2, q7
+ SAD_SSD_16_END $0, $2, q7
- vpaddl.u16 q7, q7
- vpaddl.u32 q7, q7
+ vpaddl.u16 q7, q7
+ vpaddl.u32 q7, q7
- vadd.i32 q12, q7
+ vadd.i32 q12, q7
.endm
#else
.macro SAD_SSD_16 arg0, arg1, arg2, arg3
- SAD_VAR_16 \arg0, \arg1, \arg2, \arg3
+ SAD_VAR_16 \arg0, \arg1, \arg2, \arg3
- SSD_MUL_SUM_16BYTES d4,d5,q8, q11
+ SSD_MUL_SUM_16BYTES d4,d5,q8, q11
.endm
.macro SAD_SSD_16_END arg0, arg1, arg2
- SAD_VAR_16_END \arg0, \arg1, \arg2
+ SAD_VAR_16_END \arg0, \arg1, \arg2
- SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqdiff reset for every 16x16
.endm
.macro SAD_SSD_16_RESET_16x16 arg0, arg1, arg2, arg3
- SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, \arg3
+ SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, \arg3
- SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqdiff reset for every 16x16
.endm
.macro SAD_SSD_16_RESET_8x8 arg0, arg1, arg2, arg3
- SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, \arg3
+ SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, \arg3
- SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
+ SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqdiff reset for every 16x16
.endm
.macro SAD_SSD_16x16 arg0, arg1, arg2
- //for one 8x16
- SAD_SSD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
- SAD_SSD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_16 \arg0, \arg1, \arg2, q6
- SAD_SSD_16 \arg0, \arg1, \arg2, q6
+ //for one 8x16
+ SAD_SSD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_16 \arg0, \arg1, \arg2, q6
+ SAD_SSD_16 \arg0, \arg1, \arg2, q6
- vpaddl.u16 q6, q6
- vpaddl.u32 q6, q6
- vadd.i32 q12, q6
+ vpaddl.u16 q6, q6
+ vpaddl.u32 q6, q6
+ vadd.i32 q12, q6
- //for another 8x16
- SAD_SSD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
- SAD_SSD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_16 \arg0, \arg1, \arg2, q7
- SAD_SSD_16_END \arg0, \arg2, q7
+ //for another 8x16
+ SAD_SSD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
+ SAD_SSD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_16 \arg0, \arg1, \arg2, q7
+ SAD_SSD_16_END \arg0, \arg2, q7
- vpaddl.u16 q7, q7
- vpaddl.u32 q7, q7
+ vpaddl.u16 q7, q7
+ vpaddl.u32 q7, q7
- vadd.i32 q12, q7
+ vadd.i32 q12, q7
.endm
#endif
WELS_ASM_FUNC_BEGIN VAACalcSadSsd_neon
- stmdb sp!, {r4-r12}
- vpush {q4}
- vpush {q6-q7}
+ stmdb sp!, {r4-r12}
+ vpush {q4}
+ vpush {q6-q7}
- ldr r4, [sp, #84] //r4 keeps the pic_stride
+ ldr r4, [sp, #84] //r4 keeps the pic_stride
- sub r5, r4, #1
- lsl r5, r5, #4 //r5 keeps the little step
+ sub r5, r4, #1
+ lsl r5, r5, #4 //r5 keeps the little step
- lsl r6, r4, #4
- sub r6, r2, r6 //r6 keeps the big step
+ lsl r6, r4, #4
+ sub r6, r2, r6 //r6 keeps the big step
- ldr r7, [sp, #88] //psadframe
- ldr r8, [sp, #92] //psad8x8
- ldr r9, [sp, #96] //psum16x16
- ldr r10, [sp, #100] //psqsum16x16
- ldr r11, [sp, #104] //psqdiff16x16
+ ldr r7, [sp, #88] //psadframe
+ ldr r8, [sp, #92] //psad8x8
+ ldr r9, [sp, #96] //psum16x16
+ ldr r10, [sp, #100] //psqsum16x16
+ ldr r11, [sp, #104] //psqdiff16x16
- vmov.i8 q12, #0
+ vmov.i8 q12, #0
vaa_calc_sad_ssd_height_loop:
mov r12, r2
@@ -1136,18 +1136,18 @@
bne vaa_calc_sad_ssd_width_loop
- sub r0, r0, r6 //jump to next 16 x width
- sub r1, r1, r6 //jump to next 16 x width
+ sub r0, r0, r6 //jump to next 16 x width
+ sub r1, r1, r6 //jump to next 16 x width
subs r3, #16
- bne vaa_calc_sad_ssd_height_loop
+ bne vaa_calc_sad_ssd_height_loop
- vadd.i32 d24, d24, d25
- vst1.32 {d24[0]}, [r7]
+ vadd.i32 d24, d24, d25
+ vst1.32 {d24[0]}, [r7]
- vpop {q6-q7}
- vpop {q4}
- ldmia sp!, {r4-r12}
+ vpop {q6-q7}
+ vpop {q4}
+ ldmia sp!, {r4-r12}
WELS_ASM_FUNC_END
#endif
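
The vaa_calc_sad loops above (and the SadVar/SadSsd/SadSsdBgd variants) produce, for every 16x16 macroblock, four 8x8 SAD values written with vst4.32 plus a running whole-frame SAD. Below is a hedged scalar sketch of that output contract; the parameter names and the top-left/top-right/bottom-left/bottom-right ordering of the four sub-blocks are assumptions read from the loads and stores, not a documented interface.

    #include <stdint.h>

    /* Hedged scalar sketch of the per-8x8 SAD layout; names and sub-block
     * ordering are assumptions, not the project's declared prototype. */
    static void vaa_calc_sad_ref(const uint8_t* cur, const uint8_t* ref,
                                 int pic_width, int pic_height, int pic_stride,
                                 uint32_t* psadframe, uint32_t* psad8x8) {
        uint32_t frame_sad = 0;
        int idx = 0;
        for (int mby = 0; mby < pic_height; mby += 16) {
            for (int mbx = 0; mbx < pic_width; mbx += 16) {
                for (int sub = 0; sub < 4; sub++) {        /* TL, TR, BL, BR 8x8 blocks */
                    int ox = mbx + (sub & 1) * 8;
                    int oy = mby + (sub >> 1) * 8;
                    uint32_t sad = 0;
                    for (int y = 0; y < 8; y++) {
                        for (int x = 0; x < 8; x++) {
                            int d = cur[(oy + y) * pic_stride + ox + x]
                                  - ref[(oy + y) * pic_stride + ox + x];
                            sad += (uint32_t)(d < 0 ? -d : d);
                        }
                    }
                    psad8x8[idx++] = sad;                  /* matches the vst4.32 stores */
                    frame_sad += sad;
                }
            }
        }
        *psadframe = frame_sad;                            /* final vst1.32 {d..[0]} */
    }
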
--- a/codec/processing/src/x86/denoisefilter.asm
+++ b/codec/processing/src/x86/denoisefilter.asm
@@ -56,217 +56,217 @@
;***********************************************************************
SECTION .text
-%macro WEIGHT_LINE 9
- movq %2, %9
- punpcklbw %2, %7
- movdqa %8, %2
+%macro WEIGHT_LINE 9
+ movq %2, %9
+ punpcklbw %2, %7
+ movdqa %8, %2
- movdqa %1, %6
- psubusb %1, %8
- psubusb %8, %6
- por %8, %1 ; ABS(curPixel - centerPixel);
+ movdqa %1, %6
+ psubusb %1, %8
+ psubusb %8, %6
+ por %8, %1 ; ABS(curPixel - centerPixel);
- movdqa %1, %3
- psubusb %1, %8
+ movdqa %1, %3
+ psubusb %1, %8
- pmullw %1, %1
- psrlw %1, 5
- pmullw %2, %1
- paddusw %4, %1
- paddusw %5, %2
+ pmullw %1, %1
+ psrlw %1, 5
+ pmullw %2, %1
+ paddusw %4, %1
+ paddusw %5, %2
%endmacro
-%macro WEIGHT_LINE1_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- paddw %3, %2
+%macro WEIGHT_LINE1_UV 4
+ movdqa %2, %1
+ punpcklbw %2, %4
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 1
+ punpcklbw %2, %4
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 2
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 3
+ punpcklbw %2, %4
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklbw %2, %4
+ paddw %3, %2
%endmacro
-%macro WEIGHT_LINE2_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- paddw %3, %2
+%macro WEIGHT_LINE2_UV 4
+ movdqa %2, %1
+ punpcklbw %2, %4
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 1
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 2
+ punpcklbw %2, %4
+ psllw %2, 2
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 3
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklbw %2, %4
+ paddw %3, %2
%endmacro
-%macro WEIGHT_LINE3_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
+%macro WEIGHT_LINE3_UV 4
+ movdqa %2, %1
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 1
+ punpcklbw %2, %4
+ psllw %2, 2
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- pmullw %2, [sse2_20]
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 2
+ punpcklbw %2, %4
+ pmullw %2, [sse2_20]
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 3
+ punpcklbw %2, %4
+ psllw %2, 2
+ paddw %3, %2
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
%endmacro
;***********************************************************************
; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
-; 1 2 3
-; 4 0 5
-; 6 7 8
-; 0: the center point
+; 1 2 3
+; 4 0 5
+; 6 7 8
+; 0: the center point
WELS_EXTERN BilateralLumaFilter8_sse2
- push r3
- %assign push_num 1
- LOAD_2_PARA
- PUSH_XMM 8
+ push r3
+ %assign push_num 1
+ LOAD_2_PARA
+ PUSH_XMM 8
- pxor xmm7, xmm7
+ pxor xmm7, xmm7
- mov r3, r0
+ mov r3, r0
- movq xmm6, [r0]
- punpcklbw xmm6, xmm7
- movdqa xmm3, [sse2_32]
- pxor xmm4, xmm4 ; nTotWeight
- pxor xmm5, xmm5 ; nSum
+ movq xmm6, [r0]
+ punpcklbw xmm6, xmm7
+ movdqa xmm3, [sse2_32]
+ pxor xmm4, xmm4 ; nTotWeight
+ pxor xmm5, xmm5 ; nSum
- dec r0
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 4
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 5
+ dec r0
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 4
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 5
- sub r0, r1
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 1
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 2
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 3
+ sub r0, r1
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 1
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 2
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 3
- lea r0, [r0 + r1 * 2]
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 6
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 7
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 8
+ lea r0, [r0 + r1 * 2]
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 6
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 7
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 8
- pcmpeqw xmm0, xmm0
- psrlw xmm0, 15
- psllw xmm0, 8
- psubusw xmm0, xmm4
- pmullw xmm0, xmm6
- paddusw xmm5, xmm0
- psrlw xmm5, 8
- packuswb xmm5, xmm5
- movq [r3], xmm5
+ pcmpeqw xmm0, xmm0
+ psrlw xmm0, 15
+ psllw xmm0, 8
+ psubusw xmm0, xmm4
+ pmullw xmm0, xmm6
+ paddusw xmm5, xmm0
+ psrlw xmm5, 8
+ packuswb xmm5, xmm5
+ movq [r3], xmm5
- POP_XMM
- pop r3
- %assign push_num 0
+ POP_XMM
+ pop r3
+ %assign push_num 0
- ret
+ ret
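
Note on the routine above: the epilogue builds the constant 256 in every word lane (pcmpeqw/psrlw/psllw), subtracts the accumulated neighbour weight nTotWeight, multiplies the remainder by the centre pixel and adds it to nSum before the final >>8, so the centre pixel absorbs whatever weight the eight neighbours did not take. A scalar sketch of that structure, not part of the patch; the per-neighbour weight function is a stand-in for whatever the WEIGHT_LINE macro (defined earlier in this file, not shown in this hunk) derives from the neighbour/centre difference, and it is assumed the weights never exceed 256 (the asm uses a saturating subtract):

#include <stdint.h>

/* 3x3 bilateral accumulation for one pixel; weight() is hypothetical. */
static uint8_t BilateralPixel(const uint8_t* p, int stride,
                              int (*weight)(int diff)) {
    static const int off[8][2] = { {-1,-1},{0,-1},{1,-1},{-1,0},
                                   { 1, 0},{-1,1},{0, 1},{1,1} };
    int centre = p[0], sum = 0, totWeight = 0;
    for (int k = 0; k < 8; k++) {
        int q = p[off[k][1] * stride + off[k][0]];
        int w = weight(q - centre);     /* assumed range 0..255 per neighbour */
        sum       += w * q;             /* nSum        (xmm5)                 */
        totWeight += w;                 /* nTotWeight  (xmm4)                 */
    }
    sum += (256 - totWeight) * centre;  /* centre takes the residual weight   */
    return (uint8_t)(sum >> 8);         /* matches the final psrlw xmm5, 8    */
}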
;***********************************************************************
-; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
+; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
;5x5 filter:
-;1 1 2 1 1
-;1 2 4 2 1
-;2 4 20 4 2
-;1 2 4 2 1
-;1 1 2 1 1
+;1 1 2 1 1
+;1 2 4 2 1
+;2 4 20 4 2
+;1 2 4 2 1
+;1 1 2 1 1
WELS_EXTERN WaverageChromaFilter8_sse2
- push r3
+ push r3
- %assign push_num 1
+ %assign push_num 1
- LOAD_2_PARA
+ LOAD_2_PARA
- mov r3, r1
- add r3, r3
- sub r0, r3 ; pixels - 2 * stride
- sub r0, 2
+ mov r3, r1
+ add r3, r3
+ sub r0, r3 ; pixels - 2 * stride
+ sub r0, 2
- pxor xmm0, xmm0
- pxor xmm3, xmm3
+ pxor xmm0, xmm0
+ pxor xmm3, xmm3
- movdqu xmm1, [r0]
- WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
+ movdqu xmm1, [r0]
+ WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
- movdqu xmm1, [r0 + r1]
- WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
+ movdqu xmm1, [r0 + r1]
+ WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
- add r0, r3
- movdqu xmm1, [r0]
- WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0
+ add r0, r3
+ movdqu xmm1, [r0]
+ WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0
- movdqu xmm1, [r0 + r1]
- WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
+ movdqu xmm1, [r0 + r1]
+ WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
- movdqu xmm1, [r0 + r1 * 2]
- WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
+ movdqu xmm1, [r0 + r1 * 2]
+ WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
- psrlw xmm3, 6
- packuswb xmm3, xmm3
- movq [r0 + 2], xmm3
+ psrlw xmm3, 6
+ packuswb xmm3, xmm3
+ movq [r0 + 2], xmm3
- pop r3
+ pop r3
- %assign push_num 0
- ret
+ %assign push_num 0
+ ret
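
Note on WaverageChromaFilter8_sse2: the 5x5 kernel in the comment sums to 64, which is why the result is normalised with psrlw xmm3, 6. WEIGHT_LINE1_UV, WEIGHT_LINE2_UV and WEIGHT_LINE3_UV each accumulate one kernel row (1 1 2 1 1, 1 2 4 2 1 and 2 4 20 4 2 respectively) using shifts, with only the centre tap of the middle row needing a real multiply by 20 (pmullw with sse2_20). A scalar sketch of the whole filter for one pixel, for reference only:

#include <stdint.h>

/* 5x5 weighted average used for chroma; kernel weights sum to 64 (>> 6). */
static uint8_t WaverageChromaPixel(const uint8_t* p, int stride) {
    static const int kKernel[5][5] = {
        { 1, 1,  2, 1, 1 },
        { 1, 2,  4, 2, 1 },
        { 2, 4, 20, 4, 2 },
        { 1, 2,  4, 2, 1 },
        { 1, 1,  2, 1, 1 },
    };
    int sum = 0;
    for (int y = -2; y <= 2; y++)
        for (int x = -2; x <= 2; x++)
            sum += kKernel[y + 2][x + 2] * p[y * stride + x];
    return (uint8_t)(sum >> 6);   /* 64 = total kernel weight, no rounding add
                                     to stay faithful to the psrlw in the asm */
}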
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -29,13 +29,13 @@
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
-;* upsampling.asm
+;* upsampling.asm
;*
;* Abstract
-;* SIMD for pixel domain down sampling
+;* SIMD for pixel domain down sampling
;*
;* History
-;* 10/22/2009 Created
+;* 10/22/2009 Created
;*
;*************************************************************************/
%include "asm_inc.asm"
@@ -61,9 +61,9 @@
ALIGN 16
shufb_mask_low:
- db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
+ db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
shufb_mask_high:
- db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
+ db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
;***********************************************************************
@@ -73,737 +73,737 @@
SECTION .text
;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
+; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
- push ebx
- push edx
- push esi
- push edi
- push ebp
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
- sar ebp, $01 ; iSrcHeight >> 1
+ sar ebp, $01 ; iSrcHeight >> 1
.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 32 bytes
.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+8] ; 1st pSrc line + 8
- movq mm2, [esi+ecx] ; 2nd pSrc line
- movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
+ ; 1st part horizonal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+8] ; 1st pSrc line + 8
+ movq mm2, [esi+ecx] ; 2nd pSrc line
+ movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm4, mm5 ; d c D C b a B A
- pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
+ ; to handle mm0, mm1, mm2, mm3
+ pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm4, mm5 ; d c D C b a B A
+ pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
- pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm5, mm6 ; h g H G f e F E
- pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
+ pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm5, mm6 ; h g H G f e F E
+ pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
- pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm6, mm7 ; l k L K j i J I
- pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
+ pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm6, mm7 ; l k L K j i J I
+ pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
- pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm7, mm0 ; p o P O n m N M
- pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
+ pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm7, mm0 ; p o P O n m N M
+ pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
- ; to handle mm4, mm5, mm6, mm7
- movq mm0, mm4 ;
- punpckldq mm0, mm5 ; H G F E D C B A
- punpckhdq mm4, mm5 ; h g f e d c b a
+ ; to handle mm4, mm5, mm6, mm7
+ movq mm0, mm4 ;
+ punpckldq mm0, mm5 ; H G F E D C B A
+ punpckhdq mm4, mm5 ; h g f e d c b a
- movq mm1, mm6
- punpckldq mm1, mm7 ; P O N M L K J I
- punpckhdq mm6, mm7 ; p o n m l k j i
+ movq mm1, mm6
+ punpckldq mm1, mm7 ; P O N M L K J I
+ punpckhdq mm6, mm7 ; p o n m l k j i
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
- pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
+ ; avg within MB horizon width (16 x 2 lines)
+ pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+ pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
- ; 2nd part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm1, [esi+16] ; 1st pSrc line + 16
- movq mm2, [esi+24] ; 1st pSrc line + 24
- movq mm3, [esi+ecx+16] ; 2nd pSrc line + 16
- movq mm4, [esi+ecx+24] ; 2nd pSrc line + 24
+ ; 2nd part horizonal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm1, [esi+16] ; 1st pSrc line + 16
+ movq mm2, [esi+24] ; 1st pSrc line + 24
+ movq mm3, [esi+ecx+16] ; 2nd pSrc line + 16
+ movq mm4, [esi+ecx+24] ; 2nd pSrc line + 24
- ; to handle mm1, mm2, mm3, mm4
- pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm5, mm6 ; d c D C b a B A
- pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5
+ ; to handle mm1, mm2, mm3, mm4
+ pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm5, mm6 ; d c D C b a B A
+ pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5
- pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm6, mm7 ; h g H G f e F E
- pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6
+ pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm6, mm7 ; h g H G f e F E
+ pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6
- pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm7, mm1 ; l k L K j i J I
- pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7
+ pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm7, mm1 ; l k L K j i J I
+ pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7
- pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm1, mm2 ; p o P O n m N M
- pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1
+ pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm1, mm2 ; p o P O n m N M
+ pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1
- ; to handle mm5, mm6, mm7, mm1
- movq mm2, mm5
- punpckldq mm2, mm6 ; H G F E D C B A
- punpckhdq mm5, mm6 ; h g f e d c b a
+ ; to handle mm5, mm6, mm7, mm1
+ movq mm2, mm5
+ punpckldq mm2, mm6 ; H G F E D C B A
+ punpckhdq mm5, mm6 ; h g f e d c b a
- movq mm3, mm7
- punpckldq mm3, mm1 ; P O N M L K J I
- punpckhdq mm7, mm1 ; p o n m l k j i
+ movq mm3, mm7
+ punpckldq mm3, mm1 ; P O N M L K J I
+ punpckhdq mm7, mm1 ; p o n m l k j i
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
- pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part
+ ; avg within MB horizon width (16 x 2 lines)
+ pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+ pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part
- movq [edi ], mm0
- movq [edi+8], mm2
+ movq [edi ], mm0
+ movq [edi+8], mm2
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
- dec eax
- jg near .xloops
+ dec eax
+ jg near .xloops
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
- dec ebp
- jg near .yloops
+ dec ebp
+ jg near .yloops
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
+ WELSEMMS
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
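
Note on the dyadic downsamplers: each output pixel comes from a 2x2 block of source pixels via two cascaded pavgb rounds, first the two horizontal pairs within each source line, then the two intermediate results across the lines. A scalar sketch of one output pixel (the MMX routine above produces 16 of these per inner-loop iteration), not part of the patch:

#include <stdint.h>

/* Dyadic (2:1) bilinear downsample of one 2x2 block, using the same
 * cascaded rounded averages as the pavgb sequence in the asm. */
static uint8_t DyadicAvg(const uint8_t* pSrc, int iSrcStride, int x) {
    int r0 = (pSrc[2 * x]              + pSrc[2 * x + 1]              + 1) >> 1;
    int r1 = (pSrc[iSrcStride + 2 * x] + pSrc[iSrcStride + 2 * x + 1] + 1) >> 1;
    return (uint8_t)((r0 + r1 + 1) >> 1);   /* approximates (a+b+c+d+2) >> 2 */
}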
;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
+; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
- push ebx
- push edx
- push esi
- push edi
- push ebp
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
- sar ebp, $01 ; iSrcHeight >> 1
+ sar ebp, $01 ; iSrcHeight >> 1
.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 16 bytes
.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+8] ; 1st pSrc line + 8
- movq mm2, [esi+ecx] ; 2nd pSrc line
- movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
+ ; 1st part horizonal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+8] ; 1st pSrc line + 8
+ movq mm2, [esi+ecx] ; 2nd pSrc line
+ movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm4, mm5 ; d c D C b a B A
- pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
+ ; to handle mm0, mm1, mm2, mm3
+ pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm4, mm5 ; d c D C b a B A
+ pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
- pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm5, mm6 ; h g H G f e F E
- pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
+ pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm5, mm6 ; h g H G f e F E
+ pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
- pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm6, mm7 ; l k L K j i J I
- pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
+ pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm6, mm7 ; l k L K j i J I
+ pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
- pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm7, mm0 ; p o P O n m N M
- pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
+ pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm7, mm0 ; p o P O n m N M
+ pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
- ; to handle mm4, mm5, mm6, mm7
- movq mm0, mm4 ;
- punpckldq mm0, mm5 ; H G F E D C B A
- punpckhdq mm4, mm5 ; h g f e d c b a
+ ; to handle mm4, mm5, mm6, mm7
+ movq mm0, mm4 ;
+ punpckldq mm0, mm5 ; H G F E D C B A
+ punpckhdq mm4, mm5 ; h g f e d c b a
- movq mm1, mm6
- punpckldq mm1, mm7 ; P O N M L K J I
- punpckhdq mm6, mm7 ; p o n m l k j i
+ movq mm1, mm6
+ punpckldq mm1, mm7 ; P O N M L K J I
+ punpckhdq mm6, mm7 ; p o n m l k j i
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
- pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
+ ; avg within MB horizon width (16 x 2 lines)
+ pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+ pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
- movq [edi ], mm0
+ movq [edi ], mm0
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
- dec eax
- jg near .xloops
+ dec eax
+ jg near .xloops
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
- dec ebp
- jg near .yloops
+ dec ebp
+ jg near .yloops
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
+ WELSEMMS
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
+; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
- push ebx
- push edx
- push esi
- push edi
- push ebp
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
- sar ebp, $01 ; iSrcHeight >> 1
+ sar ebp, $01 ; iSrcHeight >> 1
.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $02 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 8 bytes
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $02 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 8 bytes
.xloops:
- ; 1st part horizonal loop: x8 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A
- ;2nd Line Src: mm1: h H g G f F e E
- ;=> target:
- ;: H G F E D C B A
- ;: h g f e d c b a
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+ecx] ; 2nd pSrc line
+ ; 1st part horizonal loop: x8 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A
+ ;2nd Line Src: mm1: h H g G f F e E
+ ;=> target:
+ ;: H G F E D C B A
+ ;: h g f e d c b a
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+ecx] ; 2nd pSrc line
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm2, mm3 ; d c D C b a B A
- pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm4
+ ; to handle mm0, mm1, mm2, mm3
+ pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm2, mm3 ; d c D C b a B A
+ pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm4
- pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm4, mm5 ; h g H G f e F E
- pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5
+ pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm4, mm5 ; h g H G f e F E
+ pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5
- ; to handle mm2, mm4
- movq mm0, mm2 ;
- punpckldq mm0, mm4 ; H G F E D C B A
- punpckhdq mm2, mm4 ; h g f e d c b a
+ ; to handle mm2, mm4
+ movq mm0, mm2 ;
+ punpckldq mm0, mm4 ; H G F E D C B A
+ punpckhdq mm2, mm4 ; h g f e d c b a
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
- pshufw mm1, mm0, 04eh ; 01001110 B
- pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
+ ; avg within MB horizon width (16 x 2 lines)
+ pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
+ pshufw mm1, mm0, 04eh ; 01001110 B
+ pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
- movd [edi], mm0
+ movd [edi], mm0
- ; next unit
- lea esi, [esi+8]
- lea edi, [edi+4]
+ ; next unit
+ lea esi, [esi+8]
+ lea edi, [edi+4]
- dec eax
- jg near .xloops
+ dec eax
+ jg near .xloops
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
- dec ebp
- jg near .yloops
+ dec ebp
+ jg near .yloops
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
+ WELSEMMS
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
+; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
- push ebx
- push edx
- push esi
- push edi
- push ebp
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
- sar ebp, $01 ; iSrcHeight >> 1
+ sar ebp, $01 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
+ movdqa xmm7, [shufb_mask_low] ; mask low
+ movdqa xmm6, [shufb_mask_high] ; mask high
.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 32 bytes
.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
- ; xmm1: p P o O n N m M l L k K j J i I
- ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
- ; xmm3: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: P O N M L K J I H G F E D C B A
- ;: p o n m l k j i h g f e d c b a
- ;: P .. A
- ;: p .. a
+ ; 1st part horizonal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
+ ; xmm1: p P o O n N m M l L k K j J i I
+ ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
+ ; xmm3: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: P O N M L K J I H G F E D C B A
+ ;: p o n m l k j i h g f e d c b a
+ ;: P .. A
+ ;: p .. a
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movdqa xmm0, [esi] ; 1st_src_line
- movdqa xmm1, [esi+16] ; 1st_src_line + 16
- movdqa xmm2, [esi+ecx] ; 2nd_src_line
- movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa xmm0, [esi] ; 1st_src_line
+ movdqa xmm1, [esi+16] ; 1st_src_line + 16
+ movdqa xmm2, [esi+ecx] ; 2nd_src_line
+ movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
- ; packing & avg
- movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- ; another implementation for xmm4 high bits
-; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm4
+ ; packing & avg
+ movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ ; another implementation for xmm4 high bits
+; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm4
- movdqa xmm5, xmm1
- pshufb xmm1, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm1
-; psrlw xmm5, 8
- pavgb xmm1, xmm5
+ movdqa xmm5, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm1
+; psrlw xmm5, 8
+ pavgb xmm1, xmm5
- movdqa xmm4, xmm2
- pshufb xmm2, xmm7
- pshufb xmm4, xmm6
-; psubb xmm4, xmm2
-; psrlw xmm4, 8
- pavgb xmm2, xmm4
+ movdqa xmm4, xmm2
+ pshufb xmm2, xmm7
+ pshufb xmm4, xmm6
+; psubb xmm4, xmm2
+; psrlw xmm4, 8
+ pavgb xmm2, xmm4
- movdqa xmm5, xmm3
- pshufb xmm3, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm3
-; psrlw xmm5, 8
- pavgb xmm3, xmm5
+ movdqa xmm5, xmm3
+ pshufb xmm3, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm3
+; psrlw xmm5, 8
+ pavgb xmm3, xmm5
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- pavgb xmm0, xmm2
+ packuswb xmm0, xmm1
+ packuswb xmm2, xmm3
+ pavgb xmm0, xmm2
- ; write pDst
- movdqa [edi], xmm0
+ ; write pDst
+ movdqa [edi], xmm0
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
- dec eax
- jg near .xloops
+ dec eax
+ jg near .xloops
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
- dec ebp
- jg near .yloops
+ dec ebp
+ jg near .yloops
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
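
Note on the SSSE3 variant: it replaces the pshufw shuffling of the MMX version with two pshufb masks; shufb_mask_low places the even source bytes into the low byte of each word lane and shufb_mask_high places the odd bytes, so a single pavgb then averages every horizontal pair in place (the high bytes stay zero). A small scalar sketch of what that pair of shuffles plus pavgb computes for one 16-byte load, assuming the helper name is illustrative only:

#include <stdint.h>

/* Horizontal pair averaging as done by pshufb(shufb_mask_low/high) + pavgb. */
static void HorizAvgSsse3Style(const uint8_t src[16], uint8_t avg[8]) {
    for (int i = 0; i < 8; i++) {
        uint8_t lo = src[2 * i];      /* shufb_mask_low entries 00h,02h,...,0eh */
        uint8_t hi = src[2 * i + 1];  /* shufb_mask_high entries 01h,03h,...,0fh */
        avg[i] = (uint8_t)((lo + hi + 1) >> 1);   /* the following pavgb */
    }
}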
;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
+; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
- push ebx
- push edx
- push esi
- push edi
- push ebp
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
- sar ebp, $01 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
+ sar ebp, $01 ; iSrcHeight >> 1
+ movdqa xmm7, [shufb_mask_low] ; mask low
+ movdqa xmm6, [shufb_mask_high] ; mask high
.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 16 bytes
.xloops:
- ; horizonal loop: x16 bytes by source
- ; mem hi<- ->lo
- ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
- ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
+ ; horizonal loop: x16 bytes by source
+ ; mem hi<- ->lo
+ ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
+ ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movdqa xmm0, [esi] ; 1st_src_line
- movdqa xmm1, [esi+ecx] ; 2nd_src_line
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa xmm0, [esi] ; 1st_src_line
+ movdqa xmm1, [esi+ecx] ; 2nd_src_line
- ; packing & avg
- movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- ; another implementation for xmm2 high bits
-; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm2
+ ; packing & avg
+ movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ ; another implementation for xmm2 high bits
+; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm2
- movdqa xmm3, xmm1
- pshufb xmm1, xmm7
- pshufb xmm3, xmm6
-; psubb xmm3, xmm1
-; psrlw xmm3, 8
- pavgb xmm1, xmm3
+ movdqa xmm3, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm3, xmm6
+; psubb xmm3, xmm1
+; psrlw xmm3, 8
+ pavgb xmm1, xmm3
- pavgb xmm0, xmm1
- packuswb xmm0, xmm1
+ pavgb xmm0, xmm1
+ packuswb xmm0, xmm1
- ; write pDst
- movq [edi], xmm0
+ ; write pDst
+ movq [edi], xmm0
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
- dec eax
- jg near .xloops
+ dec eax
+ jg near .xloops
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
- dec ebp
- jg near .yloops
+ dec ebp
+ jg near .yloops
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
+; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
- push ebx
- push edx
- push esi
- push edi
- push ebp
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
- sar ebp, $01 ; iSrcHeight >> 1
+ sar ebp, $01 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
+ movdqa xmm7, [shufb_mask_low] ; mask low
+ movdqa xmm6, [shufb_mask_high] ; mask high
.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 32 bytes
.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
- ; xmm1: p P o O n N m M l L k K j J i I
- ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
- ; xmm3: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: P O N M L K J I H G F E D C B A
- ;: p o n m l k j i h g f e d c b a
- ;: P .. A
- ;: p .. a
+ ; 1st part horizonal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
+ ; xmm1: p P o O n N m M l L k K j J i I
+ ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
+ ; xmm3: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: P O N M L K J I H G F E D C B A
+ ;: p o n m l k j i h g f e d c b a
+ ;: P .. A
+ ;: p .. a
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movntdqa xmm0, [esi] ; 1st_src_line
- movntdqa xmm1, [esi+16] ; 1st_src_line + 16
- movntdqa xmm2, [esi+ecx] ; 2nd_src_line
- movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movntdqa xmm0, [esi] ; 1st_src_line
+ movntdqa xmm1, [esi+16] ; 1st_src_line + 16
+ movntdqa xmm2, [esi+ecx] ; 2nd_src_line
+ movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
- ; packing & avg
- movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm4
+ ; packing & avg
+ movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm4
- movdqa xmm5, xmm1
- pshufb xmm1, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm1
-; psrlw xmm5, 8
- pavgb xmm1, xmm5
+ movdqa xmm5, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm1
+; psrlw xmm5, 8
+ pavgb xmm1, xmm5
- movdqa xmm4, xmm2
- pshufb xmm2, xmm7
- pshufb xmm4, xmm6
-; psubb xmm4, xmm2
-; psrlw xmm4, 8
- pavgb xmm2, xmm4
+ movdqa xmm4, xmm2
+ pshufb xmm2, xmm7
+ pshufb xmm4, xmm6
+; psubb xmm4, xmm2
+; psrlw xmm4, 8
+ pavgb xmm2, xmm4
- movdqa xmm5, xmm3
- pshufb xmm3, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm3
-; psrlw xmm5, 8
- pavgb xmm3, xmm5
+ movdqa xmm5, xmm3
+ pshufb xmm3, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm3
+; psrlw xmm5, 8
+ pavgb xmm3, xmm5
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- pavgb xmm0, xmm2
+ packuswb xmm0, xmm1
+ packuswb xmm2, xmm3
+ pavgb xmm0, xmm2
- ; write pDst
- movdqa [edi], xmm0
+ ; write pDst
+ movdqa [edi], xmm0
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
- dec eax
- jg near .xloops
+ dec eax
+ jg near .xloops
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
- dec ebp
- jg near .yloops
+ dec ebp
+ jg near .yloops
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
+; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
- push ebx
- push edx
- push esi
- push edi
- push ebp
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
- sar ebp, $01 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
+ sar ebp, $01 ; iSrcHeight >> 1
+ movdqa xmm7, [shufb_mask_low] ; mask low
+ movdqa xmm6, [shufb_mask_high] ; mask high
.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 16 bytes
.xloops:
- ; horizonal loop: x16 bytes by source
- ; mem hi<- ->lo
- ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
- ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
+ ; horizonal loop: x16 bytes by source
+ ; mem hi<- ->lo
+ ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
+ ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movntdqa xmm0, [esi] ; 1st_src_line
- movntdqa xmm1, [esi+ecx] ; 2nd_src_line
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movntdqa xmm0, [esi] ; 1st_src_line
+ movntdqa xmm1, [esi+ecx] ; 2nd_src_line
- ; packing & avg
- movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm2
+ ; packing & avg
+ movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm2
- movdqa xmm3, xmm1
- pshufb xmm1, xmm7
- pshufb xmm3, xmm6
-; psubb xmm3, xmm1
-; psrlw xmm3, 8
- pavgb xmm1, xmm3
+ movdqa xmm3, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm3, xmm6
+; psubb xmm3, xmm1
+; psrlw xmm3, 8
+ pavgb xmm1, xmm3
- pavgb xmm0, xmm1
- packuswb xmm0, xmm1
+ pavgb xmm0, xmm1
+ packuswb xmm0, xmm1
- ; write pDst
- movq [edi], xmm0
+ ; write pDst
+ movq [edi], xmm0
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
- dec eax
- jg near .xloops
+ dec eax
+ jg near .xloops
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
- dec ebp
- jg near .yloops
+ dec ebp
+ jg near .yloops
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
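
Note on the _sse4 variants: the only visible difference from the _ssse3 versions is that the source lines are fetched with movntdqa, the SSE4.1 streaming load; it hints that the source data should not displace cached data, still requires 16-byte alignment, and on ordinary write-back memory behaves like a normal aligned load. An intrinsic-level sketch of that load, for illustration only:

#include <smmintrin.h>   /* SSE4.1 */

/* Intrinsic equivalent of the movntdqa loads used above; the 16-byte
 * alignment requirement of the asm still applies. */
static __m128i LoadSrcLine(const unsigned char* p) {
    return _mm_stream_load_si128((__m128i*)p);
}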
@@ -811,202 +811,202 @@
;**************************************************************************************************************
;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
; unsigned int uiScaleX, unsigned int uiScaleY );
;{
;**************************************************************************************************************
-WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
- push ebp
- push esi
- push edi
- push ebx
-%define pushsize 16
-%define localsize 28
-%define pDstData esp + pushsize + localsize + 4
-%define dwDstStride esp + pushsize + localsize + 8
-%define dwDstWidth esp + pushsize + localsize + 12
-%define dwDstHeight esp + pushsize + localsize + 16
-%define pSrcData esp + pushsize + localsize + 20
-%define dwSrcStride esp + pushsize + localsize + 24
-%define dwSrcWidth esp + pushsize + localsize + 28
-%define dwSrcHeight esp + pushsize + localsize + 32
-%define scale esp + 0
-%define uiScaleX esp + pushsize + localsize + 36
-%define uiScaleY esp + pushsize + localsize + 40
-%define tmpHeight esp + 12
-%define yInverse esp + 16
-%define xInverse esp + 20
-%define dstStep esp + 24
- sub esp, localsize
+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
+ push ebp
+ push esi
+ push edi
+ push ebx
+%define pushsize 16
+%define localsize 28
+%define pDstData esp + pushsize + localsize + 4
+%define dwDstStride esp + pushsize + localsize + 8
+%define dwDstWidth esp + pushsize + localsize + 12
+%define dwDstHeight esp + pushsize + localsize + 16
+%define pSrcData esp + pushsize + localsize + 20
+%define dwSrcStride esp + pushsize + localsize + 24
+%define dwSrcWidth esp + pushsize + localsize + 28
+%define dwSrcHeight esp + pushsize + localsize + 32
+%define scale esp + 0
+%define uiScaleX esp + pushsize + localsize + 36
+%define uiScaleY esp + pushsize + localsize + 40
+%define tmpHeight esp + 12
+%define yInverse esp + 16
+%define xInverse esp + 20
+%define dstStep esp + 24
+ sub esp, localsize
- pxor xmm0, xmm0
- mov edx, 32767
- mov eax, [uiScaleX]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm1, eax ; uinc(uiScaleX mod 32767)
- movd xmm2, ebx ; -uinc
- psllq xmm1, 32
- por xmm1, xmm2 ; 0 0 uinc -uinc (dword)
- pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc
+ pxor xmm0, xmm0
+ mov edx, 32767
+ mov eax, [uiScaleX]
+ and eax, 32767
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767
+ movd xmm1, eax ; uinc(uiScaleX mod 32767)
+ movd xmm2, ebx ; -uinc
+ psllq xmm1, 32
+ por xmm1, xmm2 ; 0 0 uinc -uinc (dword)
+ pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc
- mov eax, [uiScaleY]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm6, eax ; vinc(uiScaleY mod 32767)
- movd xmm2, ebx ; -vinc
- psllq xmm6, 32
- por xmm6, xmm2 ; 0 0 vinc -vinc (dword)
- pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc
+ mov eax, [uiScaleY]
+ and eax, 32767
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767
+ movd xmm6, eax ; vinc(uiScaleY mod 32767)
+ movd xmm2, ebx ; -vinc
+ psllq xmm6, 32
+ por xmm6, xmm2 ; 0 0 vinc -vinc (dword)
+ pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc
- mov edx, 40003fffh
- movd xmm5, edx
- punpcklwd xmm5, xmm0 ; 16384 16383
- pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383
+ mov edx, 40003fffh
+ movd xmm5, edx
+ punpcklwd xmm5, xmm0 ; 16384 16383
+ pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383
DOWNSAMPLE:
- mov eax, [dwDstHeight]
- mov edi, [pDstData]
- mov edx, [dwDstStride]
- mov ecx, [dwDstWidth]
- sub edx, ecx
- mov [dstStep], edx ; stride - width
- dec eax
- mov [tmpHeight], eax
- mov eax, 16384
- mov [yInverse], eax
+ mov eax, [dwDstHeight]
+ mov edi, [pDstData]
+ mov edx, [dwDstStride]
+ mov ecx, [dwDstWidth]
+ sub edx, ecx
+ mov [dstStep], edx ; stride - width
+ dec eax
+ mov [tmpHeight], eax
+ mov eax, 16384
+ mov [yInverse], eax
- pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383
+ pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383
HEIGHT:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
- mov ebp, esi
- add ebp, [dwSrcStride]
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride]
+ add esi, eax ; get current row address
+ mov ebp, esi
+ add ebp, [dwSrcStride]
- mov eax, 16384
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
- dec ecx
+ mov eax, 16384
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth]
+ dec ecx
- movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383
+ movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383
WIDTH:
- mov eax, [xInverse]
- shr eax, 15
+ mov eax, [xInverse]
+ shr eax, 15
- movd xmm1, [esi+eax] ; xxxxxxba
- movd xmm2, [ebp+eax] ; xxxxxxdc
- pxor xmm0, xmm0
- punpcklwd xmm1, xmm2 ; xxxxdcba
- punpcklbw xmm1, xmm0 ; 0d0c0b0a
- punpcklwd xmm1, xmm0 ; 000d000c000b000a
+ movd xmm1, [esi+eax] ; xxxxxxba
+ movd xmm2, [ebp+eax] ; xxxxxxdc
+ pxor xmm0, xmm0
+ punpcklwd xmm1, xmm2 ; xxxxdcba
+ punpcklbw xmm1, xmm0 ; 0d0c0b0a
+ punpcklwd xmm1, xmm0 ; 000d000c000b000a
- movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
- pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
- movdqa xmm0, xmm2
- pmuludq xmm2, xmm1
- psrlq xmm0, 32
- psrlq xmm1, 32
- pmuludq xmm0, xmm1
- paddq xmm2, xmm0
- pshufd xmm1, xmm2, 00001110b
- paddq xmm2, xmm1
- psrlq xmm2, 29
+ movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
+ pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
+ movdqa xmm0, xmm2
+ pmuludq xmm2, xmm1
+ psrlq xmm0, 32
+ psrlq xmm1, 32
+ pmuludq xmm0, xmm1
+ paddq xmm2, xmm0
+ pshufd xmm1, xmm2, 00001110b
+ paddq xmm2, xmm1
+ psrlq xmm2, 29
- movd eax, xmm2
- inc eax
- shr eax, 1
- mov [edi], al
- inc edi
+ movd eax, xmm2
+ inc eax
+ shr eax, 1
+ mov [edi], al
+ inc edi
- mov eax, [uiScaleX]
- add [xInverse], eax
+ mov eax, [uiScaleX]
+ add [xInverse], eax
- paddw xmm3, xmm7 ; inc u
- psllw xmm3, 1
- psrlw xmm3, 1
+ paddw xmm3, xmm7 ; inc u
+ psllw xmm3, 1
+ psrlw xmm3, 1
- loop WIDTH
+ loop WIDTH
WIDTH_END:
- mov eax, [xInverse]
- shr eax, 15
- mov cl, [esi+eax]
- mov [edi], cl
- inc edi
+ mov eax, [xInverse]
+ shr eax, 15
+ mov cl, [esi+eax]
+ mov [edi], cl
+ inc edi
- mov eax, [uiScaleY]
- add [yInverse], eax
- add edi, [dstStep]
+ mov eax, [uiScaleY]
+ add [yInverse], eax
+ add edi, [dstStep]
- paddw xmm4, xmm6 ; inc v
- psllw xmm4, 1
- psrlw xmm4, 1
+ paddw xmm4, xmm6 ; inc v
+ psllw xmm4, 1
+ psrlw xmm4, 1
- dec dword [tmpHeight]
- jg HEIGHT
+ dec dword [tmpHeight]
+ jg HEIGHT
LAST_ROW:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride]
+ add esi, eax ; get current row address
- mov eax, 16384
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
+ mov eax, 16384
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth]
LAST_ROW_WIDTH:
- mov eax, [xInverse]
- shr eax, 15
+ mov eax, [xInverse]
+ shr eax, 15
- mov al, [esi+eax]
- mov [edi], al
- inc edi
+ mov al, [esi+eax]
+ mov [edi], al
+ inc edi
- mov eax, [uiScaleX]
- add [xInverse], eax
+ mov eax, [uiScaleX]
+ add [xInverse], eax
- loop LAST_ROW_WIDTH
+ loop LAST_ROW_WIDTH
LAST_ROW_END:
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef pushsize
-%undef localsize
-%undef pSrcData
-%undef dwSrcWidth
-%undef dwSrcHeight
-%undef dwSrcStride
-%undef pDstData
-%undef dwDstWidth
-%undef dwDstHeight
-%undef dwDstStride
-%undef scale
-%undef uiScaleX
-%undef uiScaleY
-%undef tmpHeight
-%undef yInverse
-%undef xInverse
-%undef dstStep
- ret
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef pushsize
+%undef localsize
+%undef pSrcData
+%undef dwSrcWidth
+%undef dwSrcHeight
+%undef dwSrcStride
+%undef pDstData
+%undef dwDstWidth
+%undef dwDstHeight
+%undef dwDstStride
+%undef scale
+%undef uiScaleX
+%undef uiScaleY
+%undef tmpHeight
+%undef yInverse
+%undef xInverse
+%undef dstStep
+ ret
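
Note on GeneralBilinearAccurateDownsampler_sse2: for arbitrary ratios the sampler walks the source with fixed-point coordinates; xInverse/yInverse start at 16384 (0.5 in Q15) and advance by uiScaleX/uiScaleY, the integer part (>>15) selects the top-left source pixel, and the fractional parts u and v weight its 2x2 neighbourhood. A scalar sketch of one interior output pixel under idealised assumptions; the asm actually keeps u/1-u as a 15-bit pair that sums to 32767 (initial 16384/16383, increments mod 32767) and rounds with a >>29 followed by +1, >>1 rather than the single rounded >>30 used here:

#include <stdint.h>

/* One bilinear sample at Q15 position (xAcc, yAcc); weights in Q15. */
static uint8_t BilinearSampleQ15(const uint8_t* pSrc, int iSrcStride,
                                 uint32_t xAcc, uint32_t yAcc) {
    uint32_t x = xAcc >> 15, y = yAcc >> 15;
    uint32_t u = xAcc & 0x7fff, v = yAcc & 0x7fff;    /* fractional parts */
    const uint8_t* p = pSrc + y * iSrcStride + x;
    uint32_t a = p[0], b = p[1], c = p[iSrcStride], d = p[iSrcStride + 1];
    uint64_t sum = (uint64_t)(32768 - u) * (32768 - v) * a
                 + (uint64_t)u           * (32768 - v) * b
                 + (uint64_t)(32768 - u) * v           * c
                 + (uint64_t)u           * v           * d;
    return (uint8_t)((sum + (1u << 29)) >> 30);       /* Q15 * Q15 -> >> 30 */
}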
@@ -1013,193 +1013,193 @@
;**************************************************************************************************************
;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
; unsigned int uiScaleX, unsigned int uiScaleY );
;{
;**************************************************************************************************************
-WELS_EXTERN GeneralBilinearFastDownsampler_sse2
- push ebp
- push esi
- push edi
- push ebx
-%define pushsize 16
-%define localsize 28
-%define pDstData esp + pushsize + localsize + 4
-%define dwDstStride esp + pushsize + localsize + 8
-%define dwDstWidth esp + pushsize + localsize + 12
-%define dwDstHeight esp + pushsize + localsize + 16
-%define pSrcData esp + pushsize + localsize + 20
-%define dwSrcStride esp + pushsize + localsize + 24
-%define dwSrcWidth esp + pushsize + localsize + 28
-%define dwSrcHeight esp + pushsize + localsize + 32
-%define scale esp + 0
-%define uiScaleX esp + pushsize + localsize + 36
-%define uiScaleY esp + pushsize + localsize + 40
-%define tmpHeight esp + 12
-%define yInverse esp + 16
-%define xInverse esp + 20
-%define dstStep esp + 24
- sub esp, localsize
+WELS_EXTERN GeneralBilinearFastDownsampler_sse2
+ push ebp
+ push esi
+ push edi
+ push ebx
+%define pushsize 16
+%define localsize 28
+%define pDstData esp + pushsize + localsize + 4
+%define dwDstStride esp + pushsize + localsize + 8
+%define dwDstWidth esp + pushsize + localsize + 12
+%define dwDstHeight esp + pushsize + localsize + 16
+%define pSrcData esp + pushsize + localsize + 20
+%define dwSrcStride esp + pushsize + localsize + 24
+%define dwSrcWidth esp + pushsize + localsize + 28
+%define dwSrcHeight esp + pushsize + localsize + 32
+%define scale esp + 0
+%define uiScaleX esp + pushsize + localsize + 36
+%define uiScaleY esp + pushsize + localsize + 40
+%define tmpHeight esp + 12
+%define yInverse esp + 16
+%define xInverse esp + 20
+%define dstStep esp + 24
+ sub esp, localsize
- pxor xmm0, xmm0
- mov edx, 65535
- mov eax, [uiScaleX]
- and eax, edx
- mov ebx, eax
- neg ebx
- and ebx, 65535
- movd xmm1, eax ; uinc(uiScaleX mod 65536)
- movd xmm2, ebx ; -uinc
- psllq xmm1, 32
- por xmm1, xmm2 ; 0 uinc 0 -uinc
- pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc
+ pxor xmm0, xmm0
+ mov edx, 65535
+ mov eax, [uiScaleX]
+ and eax, edx
+ mov ebx, eax
+ neg ebx
+ and ebx, 65535
+ movd xmm1, eax ; uinc(uiScaleX mod 65536)
+ movd xmm2, ebx ; -uinc
+ psllq xmm1, 32
+ por xmm1, xmm2 ; 0 uinc 0 -uinc
+ pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc
- mov eax, [uiScaleY]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm6, eax ; vinc(uiScaleY mod 32767)
- movd xmm2, ebx ; -vinc
- psllq xmm6, 32
- por xmm6, xmm2 ; 0 vinc 0 -vinc
- pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc
+ mov eax, [uiScaleY]
+ and eax, 32767
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767
+ movd xmm6, eax ; vinc(uiScaleY mod 32767)
+ movd xmm2, ebx ; -vinc
+ psllq xmm6, 32
+ por xmm6, xmm2 ; 0 vinc 0 -vinc
+ pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc
- mov edx, 80007fffh ; 32768 32767
- movd xmm5, edx
- pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767
- mov ebx, 16384
+ mov edx, 80007fffh ; 32768 32767
+ movd xmm5, edx
+ pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767
+ mov ebx, 16384
FAST_DOWNSAMPLE:
- mov eax, [dwDstHeight]
- mov edi, [pDstData]
- mov edx, [dwDstStride]
- mov ecx, [dwDstWidth]
- sub edx, ecx
- mov [dstStep], edx ; stride - width
- dec eax
- mov [tmpHeight], eax
- mov eax, 16384
- mov [yInverse], eax
+ mov eax, [dwDstHeight]
+ mov edi, [pDstData]
+ mov edx, [dwDstStride]
+ mov ecx, [dwDstWidth]
+ sub edx, ecx
+ mov [dstStep], edx ; stride - width
+ dec eax
+ mov [tmpHeight], eax
+ mov eax, 16384
+ mov [yInverse], eax
- pshuflw xmm4, xmm5, 01010000b
- psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383
+ pshuflw xmm4, xmm5, 01010000b
+ psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383
FAST_HEIGHT:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
- mov ebp, esi
- add ebp, [dwSrcStride]
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride]
+ add esi, eax ; get current row address
+ mov ebp, esi
+ add ebp, [dwSrcStride]
- mov eax, 32768
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
- dec ecx
+ mov eax, 32768
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth]
+ dec ecx
- movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767
+ movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767
FAST_WIDTH:
- mov eax, [xInverse]
- shr eax, 16
+ mov eax, [xInverse]
+ shr eax, 16
- movd xmm1, [esi+eax] ; xxxxxxba
- movd xmm2, [ebp+eax] ; xxxxxxdc
- punpcklwd xmm1, xmm2 ; xxxxdcba
- punpcklbw xmm1, xmm0 ; 0d0c0b0a
+ movd xmm1, [esi+eax] ; xxxxxxba
+ movd xmm2, [ebp+eax] ; xxxxxxdc
+ punpcklwd xmm1, xmm2 ; xxxxdcba
+ punpcklbw xmm1, xmm0 ; 0d0c0b0a
- movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
- pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
- pmaddwd xmm2, xmm1
- pshufd xmm1, xmm2, 00000001b
- paddd xmm2, xmm1
- movd xmm1, ebx
- paddd xmm2, xmm1
- psrld xmm2, 15
+ movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
+ pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
+ pmaddwd xmm2, xmm1
+ pshufd xmm1, xmm2, 00000001b
+ paddd xmm2, xmm1
+ movd xmm1, ebx
+ paddd xmm2, xmm1
+ psrld xmm2, 15
- packuswb xmm2, xmm0
- movd eax, xmm2
- mov [edi], al
- inc edi
+ packuswb xmm2, xmm0
+ movd eax, xmm2
+ mov [edi], al
+ inc edi
- mov eax, [uiScaleX]
- add [xInverse], eax
+ mov eax, [uiScaleX]
+ add [xInverse], eax
- paddw xmm3, xmm7 ; inc u
+ paddw xmm3, xmm7 ; inc u
- loop FAST_WIDTH
+ loop FAST_WIDTH
FAST_WIDTH_END:
- mov eax, [xInverse]
- shr eax, 16
- mov cl, [esi+eax]
- mov [edi], cl
- inc edi
+ mov eax, [xInverse]
+ shr eax, 16
+ mov cl, [esi+eax]
+ mov [edi], cl
+ inc edi
- mov eax, [uiScaleY]
- add [yInverse], eax
- add edi, [dstStep]
+ mov eax, [uiScaleY]
+ add [yInverse], eax
+ add edi, [dstStep]
- paddw xmm4, xmm6 ; inc v
- psllw xmm4, 1
- psrlw xmm4, 1
+ paddw xmm4, xmm6 ; inc v
+ psllw xmm4, 1
+ psrlw xmm4, 1
- dec dword [tmpHeight]
- jg FAST_HEIGHT
+ dec dword [tmpHeight]
+ jg FAST_HEIGHT
FAST_LAST_ROW:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride]
+ add esi, eax ; get current row address
- mov eax, 32768
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
+ mov eax, 32768
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth]
FAST_LAST_ROW_WIDTH:
- mov eax, [xInverse]
- shr eax, 16
+ mov eax, [xInverse]
+ shr eax, 16
- mov al, [esi+eax]
- mov [edi], al
- inc edi
+ mov al, [esi+eax]
+ mov [edi], al
+ inc edi
- mov eax, [uiScaleX]
- add [xInverse], eax
+ mov eax, [uiScaleX]
+ add [xInverse], eax
- loop FAST_LAST_ROW_WIDTH
+ loop FAST_LAST_ROW_WIDTH
FAST_LAST_ROW_END:
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef pushsize
-%undef localsize
-%undef pSrcData
-%undef dwSrcWidth
-%undef dwSrcHeight
-%undef dwSrcStride
-%undef pDstData
-%undef dwDstWidth
-%undef dwDstHeight
-%undef dwDstStride
-%undef scale
-%undef uiScaleX
-%undef uiScaleY
-%undef tmpHeight
-%undef yInverse
-%undef xInverse
-%undef dstStep
- ret
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef pushsize
+%undef localsize
+%undef pSrcData
+%undef dwSrcWidth
+%undef dwSrcHeight
+%undef dwSrcStride
+%undef pDstData
+%undef dwDstWidth
+%undef dwDstHeight
+%undef dwDstStride
+%undef scale
+%undef uiScaleX
+%undef uiScaleY
+%undef tmpHeight
+%undef yInverse
+%undef xInverse
+%undef dstStep
+ ret
%endif
--- a/codec/processing/src/x86/vaa.asm
+++ b/codec/processing/src/x86/vaa.asm
@@ -48,192 +48,192 @@
; Macros and other preprocessor constants
;***********************************************************************
%macro SUM_SQR_SSE2 3 ; dst, pSrc, zero
- movdqa %1, %2
- punpcklbw %1, %3
- punpckhbw %2, %3
- pmaddwd %1, %1
- pmaddwd %2, %2
- paddd %1, %2
- pshufd %2, %1, 04Eh ; 01001110 B
- paddd %1, %2
- pshufd %2, %1, 0B1h ; 10110001 B
- paddd %1, %2
+ movdqa %1, %2
+ punpcklbw %1, %3
+ punpckhbw %2, %3
+ pmaddwd %1, %1
+ pmaddwd %2, %2
+ paddd %1, %2
+ pshufd %2, %1, 04Eh ; 01001110 B
+ paddd %1, %2
+ pshufd %2, %1, 0B1h ; 10110001 B
+ paddd %1, %2
%endmacro ; END OF SUM_SQR_SSE2
%macro WELS_SAD_16x2_SSE2 3 ;esi :%1 edi:%2 ebx:%3
- movdqa xmm1, [%1]
- movdqa xmm2, [%2]
- movdqa xmm3, [%1+%3]
- movdqa xmm4, [%2+%3]
- psadbw xmm1, xmm2
- psadbw xmm3, xmm4
- paddd xmm6, xmm1
- paddd xmm6, xmm3
- lea %1, [%1+%3*2]
- lea %2, [%2+%3*2]
+ movdqa xmm1, [%1]
+ movdqa xmm2, [%2]
+ movdqa xmm3, [%1+%3]
+ movdqa xmm4, [%2+%3]
+ psadbw xmm1, xmm2
+ psadbw xmm3, xmm4
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+ lea %1, [%1+%3*2]
+ lea %2, [%2+%3*2]
%endmacro
; by comparing it outperforms than phaddw(SSSE3) sets
%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
- ; @sum_8x2 begin
- pshufd %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 0B1h ; 10110001 B
- paddw %1, %2
- ; end of @sum_8x2
+ ; @sum_8x2 begin
+ pshufd %2, %1, 04Eh ; 01001110 B
+ paddw %1, %2
+ pshuflw %2, %1, 04Eh ; 01001110 B
+ paddw %1, %2
+ pshuflw %2, %1, 0B1h ; 10110001 B
+ paddw %1, %2
+ ; end of @sum_8x2
%endmacro ; END of SUM_WORD_8x2_SSE2
-%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 3 ;esi:%1,edi:%2,ebx:%3
- movdqa xmm1, [%1]
- movdqa xmm2, [%2]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm2
- paddd xmm6, xmm3
+%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 3 ;esi:%1,edi:%2,ebx:%3
+ movdqa xmm1, [%1]
+ movdqa xmm2, [%2]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm2
+ paddd xmm6, xmm3
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd xmm5, xmm3
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd xmm5, xmm3
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm4, xmm1
- paddd xmm4, xmm2
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm4, xmm1
+ paddd xmm4, xmm2
- add %1, %3
- add %2, %3
+ add %1, %3
+ add %2, %3
%endmacro
-%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 3 ;esi:%1 edi:%2 ebx:%3
- movdqa xmm1, [%1]
- movdqa xmm2, [%2]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm2
- paddd xmm7, xmm3 ; sad
+%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 3 ;esi:%1 edi:%2 ebx:%3
+ movdqa xmm1, [%1]
+ movdqa xmm2, [%2]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm2
+ paddd xmm7, xmm3 ; sad
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; diff
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; diff
- movdqa xmm2, xmm1
- psadbw xmm2, xmm0
- paddd xmm6, xmm2 ; sum
+ movdqa xmm2, xmm1
+ psadbw xmm2, xmm0
+ paddd xmm6, xmm2 ; sum
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm5, xmm1
- paddd xmm5, xmm2 ; sqsum
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm5, xmm1
+ paddd xmm5, xmm2 ; sqsum
- movdqa xmm1, xmm3
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd xmm4, xmm1
- paddd xmm4, xmm3 ; sqdiff
+ movdqa xmm1, xmm3
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd xmm4, xmm1
+ paddd xmm4, xmm3 ; sqdiff
- add %1, %3
- add %2, %3
+ add %1, %3
+ add %2, %3
%endmacro
-%macro WELS_SAD_SD_MAD_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7
+%macro WELS_SAD_SD_MAD_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7
%define sad_reg %1
%define sum_cur_reg %2
%define sum_ref_reg %3
%define mad_reg %4
- movdqa xmm1, [%5]
- movdqa xmm2, [%6]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd sum_cur_reg, xmm3 ; sum_cur
- movdqa xmm3, xmm2
- psadbw xmm3, xmm0
- paddd sum_ref_reg, xmm3 ; sum_ref
+ movdqa xmm1, [%5]
+ movdqa xmm2, [%6]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd sum_cur_reg, xmm3 ; sum_cur
+ movdqa xmm3, xmm2
+ psadbw xmm3, xmm0
+ paddd sum_ref_reg, xmm3 ; sum_ref
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; abs diff
- pmaxub mad_reg, xmm3 ; max abs diff
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; abs diff
+ pmaxub mad_reg, xmm3 ; max abs diff
- psadbw xmm3, xmm0
- paddd sad_reg, xmm3 ; sad
+ psadbw xmm3, xmm0
+ paddd sad_reg, xmm3 ; sad
- add %5, %7
- add %6, %7
+ add %5, %7
+ add %6, %7
%endmacro
-%macro WELS_MAX_REG_SSE2 1 ; xmm1, xmm2, xmm3 can be used
+%macro WELS_MAX_REG_SSE2 1 ; xmm1, xmm2, xmm3 can be used
%define max_reg %1
- movdqa xmm1, max_reg
- psrldq xmm1, 4
- pmaxub max_reg, xmm1
- movdqa xmm1, max_reg
- psrldq xmm1, 2
- pmaxub max_reg, xmm1
- movdqa xmm1, max_reg
- psrldq xmm1, 1
- pmaxub max_reg, xmm1
+ movdqa xmm1, max_reg
+ psrldq xmm1, 4
+ pmaxub max_reg, xmm1
+ movdqa xmm1, max_reg
+ psrldq xmm1, 2
+ pmaxub max_reg, xmm1
+ movdqa xmm1, max_reg
+ psrldq xmm1, 1
+ pmaxub max_reg, xmm1
%endmacro
-%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7
+%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7
%define sad_reg %1
%define sum_reg %2
%define mad_reg %3
%define sqdiff_reg %4
- movdqa xmm1, [%5]
- movdqa xmm2, xmm1
- movdqa xmm3, xmm1
- punpcklbw xmm2, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
- paddd xmm2, xmm3
- movdqa xmm3, xmm2
- psllq xmm2, 32
- psrlq xmm3, 32
- psllq xmm3, 32
- paddd xmm2, xmm3
- paddd sad_reg, xmm2 ; sqsum
+ movdqa xmm1, [%5]
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm1
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+ paddd xmm2, xmm3
+ movdqa xmm3, xmm2
+ psllq xmm2, 32
+ psrlq xmm3, 32
+ psllq xmm3, 32
+ paddd xmm2, xmm3
+ paddd sad_reg, xmm2 ; sqsum
- movdqa xmm2, [%6]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd sum_reg, xmm3 ; sum_cur
- movdqa xmm3, xmm2
- psadbw xmm3, xmm0
- pslldq xmm3, 4
- paddd sum_reg, xmm3 ; sum_ref
+ movdqa xmm2, [%6]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd sum_reg, xmm3 ; sum_cur
+ movdqa xmm3, xmm2
+ psadbw xmm3, xmm0
+ pslldq xmm3, 4
+ paddd sum_reg, xmm3 ; sum_ref
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; abs diff
- pmaxub mad_reg, xmm3 ; max abs diff
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; abs diff
+ pmaxub mad_reg, xmm3 ; max abs diff
- movdqa xmm1, xmm3
- psadbw xmm3, xmm0
- paddd sad_reg, xmm3 ; sad
+ movdqa xmm1, xmm3
+ psadbw xmm3, xmm0
+ paddd sad_reg, xmm3 ; sad
- movdqa xmm3, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd sqdiff_reg, xmm1
- paddd sqdiff_reg, xmm3 ; sqdiff
+ movdqa xmm3, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd sqdiff_reg, xmm1
+ paddd sqdiff_reg, xmm3 ; sqdiff
- add %5, %7
- add %6, %7
+ add %5, %7
+ add %6, %7
%endmacro
@@ -249,99 +249,99 @@
; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
;***********************************************************************
WELS_EXTERN SampleVariance16x16_sse2
- push esi
- push edi
- push ebx
+ push esi
+ push edi
+ push ebx
- sub esp, 16
- %define SUM [esp]
- %define SUM_CUR [esp+4]
- %define SQR [esp+8]
- %define SQR_CUR [esp+12]
- %define PUSH_SIZE 28 ; 12 + 16
+ sub esp, 16
+ %define SUM [esp]
+ %define SUM_CUR [esp+4]
+ %define SQR [esp+8]
+ %define SQR_CUR [esp+12]
+ %define PUSH_SIZE 28 ; 12 + 16
- mov edi, [esp+PUSH_SIZE+4] ; y_ref
- mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride
- mov esi, [esp+PUSH_SIZE+12] ; y_src
- mov eax, [esp+PUSH_SIZE+16] ; y_src_stride
- mov ecx, 010h ; height = 16
+ mov edi, [esp+PUSH_SIZE+4] ; y_ref
+ mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride
+ mov esi, [esp+PUSH_SIZE+12] ; y_src
+ mov eax, [esp+PUSH_SIZE+16] ; y_src_stride
+ mov ecx, 010h ; height = 16
- pxor xmm7, xmm7
- movdqu SUM, xmm7
+ pxor xmm7, xmm7
+ movdqu SUM, xmm7
.hloops:
- movdqa xmm0, [edi] ; y_ref
- movdqa xmm1, [esi] ; y_src
- movdqa xmm2, xmm0 ; store first for future process
- movdqa xmm3, xmm1
- ; sum += diff;
- movdqa xmm4, xmm0
- psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
- ; to be continued for sum
- pshufd xmm5, xmm4, 0C6h ; 11000110 B
- paddw xmm4, xmm5
- movd ebx, xmm4
- add SUM, ebx
+ movdqa xmm0, [edi] ; y_ref
+ movdqa xmm1, [esi] ; y_src
+ movdqa xmm2, xmm0 ; store first for future process
+ movdqa xmm3, xmm1
+ ; sum += diff;
+ movdqa xmm4, xmm0
+ psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
+ ; to be continued for sum
+ pshufd xmm5, xmm4, 0C6h ; 11000110 B
+ paddw xmm4, xmm5
+ movd ebx, xmm4
+ add SUM, ebx
- ; sqr += diff * diff;
- pmaxub xmm0, xmm1
- pminub xmm1, xmm2
- psubb xmm0, xmm1 ; diff
- SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
- movd ebx, xmm1
- add SQR, ebx
+ ; sqr += diff * diff;
+ pmaxub xmm0, xmm1
+ pminub xmm1, xmm2
+ psubb xmm0, xmm1 ; diff
+ SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+ movd ebx, xmm1
+ add SQR, ebx
- ; sum_cur += y_src[x];
- movdqa xmm0, xmm3 ; cur_orig
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7
- punpckhbw xmm1, xmm7
- paddw xmm0, xmm1 ; 8x2
- SUM_WORD_8x2_SSE2 xmm0, xmm1
- movd ebx, xmm0
- and ebx, 0ffffh
- add SUM_CUR, ebx
+ ; sum_cur += y_src[x];
+ movdqa xmm0, xmm3 ; cur_orig
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7
+ punpckhbw xmm1, xmm7
+ paddw xmm0, xmm1 ; 8x2
+ SUM_WORD_8x2_SSE2 xmm0, xmm1
+ movd ebx, xmm0
+ and ebx, 0ffffh
+ add SUM_CUR, ebx
- ; sqr_cur += y_src[x] * y_src[x];
- SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
- movd ebx, xmm0
- add SQR_CUR, ebx
+ ; sqr_cur += y_src[x] * y_src[x];
+ SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+ movd ebx, xmm0
+ add SQR_CUR, ebx
- lea edi, [edi+edx]
- lea esi, [esi+eax]
- dec ecx
- jnz near .hloops
+ lea edi, [edi+edx]
+ lea esi, [esi+eax]
+ dec ecx
+ jnz near .hloops
- mov ebx, 0
- mov bx, word SUM
- sar ebx, 8
- imul ebx, ebx
- mov ecx, SQR
- sar ecx, 8
- sub ecx, ebx
- mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture
- mov [edi], cx ; to store uiMotionIndex
- mov ebx, 0
- mov bx, word SUM_CUR
- sar ebx, 8
- imul ebx, ebx
- mov ecx, SQR_CUR
- sar ecx, 8
- sub ecx, ebx
- mov [edi+2], cx ; to store uiTextureIndex
+ mov ebx, 0
+ mov bx, word SUM
+ sar ebx, 8
+ imul ebx, ebx
+ mov ecx, SQR
+ sar ecx, 8
+ sub ecx, ebx
+ mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture
+ mov [edi], cx ; to store uiMotionIndex
+ mov ebx, 0
+ mov bx, word SUM_CUR
+ sar ebx, 8
+ imul ebx, ebx
+ mov ecx, SQR_CUR
+ sar ecx, 8
+ sub ecx, ebx
+ mov [edi+2], cx ; to store uiTextureIndex
- %undef SUM
- %undef SUM_CUR
- %undef SQR
- %undef SQR_CUR
- %undef PUSH_SIZE
+ %undef SUM
+ %undef SUM_CUR
+ %undef SQR
+ %undef SQR_CUR
+ %undef PUSH_SIZE
- add esp, 16
- pop ebx
- pop edi
- pop esi
+ add esp, 16
+ pop ebx
+ pop edi
+ pop esi
- ret
+ ret
@@ -360,67 +360,67 @@
%define psadframe esp + pushsize + 24
%define psad8x8 esp + pushsize + 28
%define pushsize 12
- push esi
- push edi
- push ebx
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
+ push esi
+ push edi
+ push ebx
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
height_loop:
- mov ecx, dword [iPicWidth]
- push esi
- push edi
+ mov ecx, dword [iPicWidth]
+ push esi
+ push edi
width_loop:
- pxor xmm6, xmm6 ;
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- paddd xmm7, xmm6
- movd [edx], xmm6
- psrldq xmm6, 8
- movd [edx+4], xmm6
+ pxor xmm6, xmm6 ;
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ paddd xmm7, xmm6
+ movd [edx], xmm6
+ psrldq xmm6, 8
+ movd [edx+4], xmm6
- pxor xmm6, xmm6
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- paddd xmm7, xmm6
- movd [edx+8], xmm6
- psrldq xmm6, 8
- movd [edx+12], xmm6
+ pxor xmm6, xmm6
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ paddd xmm7, xmm6
+ movd [edx+8], xmm6
+ psrldq xmm6, 8
+ movd [edx+12], xmm6
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
- dec ecx
- jnz width_loop
+ dec ecx
+ jnz width_loop
- pop edi
- pop esi
- add esi, eax
- add edi, eax
+ pop edi
+ pop esi
+ add esi, eax
+ add edi, eax
- dec dword [iPicHeight]
- jnz height_loop
+ dec dword [iPicHeight]
+ jnz height_loop
- mov edx, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [edx], xmm7
+ mov edx, [psadframe]
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5
+ movd [edx], xmm7
%undef cur_data
%undef ref_data
@@ -430,10 +430,10 @@
%undef psadframe
%undef psad8x8
%undef pushsize
- pop ebx
- pop edi
- pop esi
- ret
+ pop ebx
+ pop edi
+ pop esi
+ ret
%else ;64-bit
@@ -441,98 +441,98 @@
; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
;***********************************************************************
WELS_EXTERN SampleVariance16x16_sse2
- %define SUM r10;[esp]
- %define SUM_CUR r11;[esp+4]
- %define SQR r13;[esp+8]
- %define SQR_CUR r15;[esp+12]
+ %define SUM r10;[esp]
+ %define SUM_CUR r11;[esp+4]
+ %define SQR r13;[esp+8]
+ %define SQR_CUR r15;[esp+12]
- push r12
- push r13
- push r14
- push r15
- %assign push_num 4
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1,r1d
- SIGN_EXTENSION r3,r3d
+ push r12
+ push r13
+ push r14
+ push r15
+ %assign push_num 4
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1,r1d
+ SIGN_EXTENSION r3,r3d
- mov r12,010h
- pxor xmm7, xmm7
- movq SUM, xmm7
- movq SUM_CUR,xmm7
- movq SQR,xmm7
- movq SQR_CUR,xmm7
+ mov r12,010h
+ pxor xmm7, xmm7
+ movq SUM, xmm7
+ movq SUM_CUR,xmm7
+ movq SQR,xmm7
+ movq SQR_CUR,xmm7
.hloops:
- mov r14,0
- movdqa xmm0, [r0] ; y_ref
- movdqa xmm1, [r2] ; y_src
- movdqa xmm2, xmm0 ; store first for future process
- movdqa xmm3, xmm1
- ; sum += diff;
- movdqa xmm4, xmm0
- psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
- ; to be continued for sum
- pshufd xmm5, xmm4, 0C6h ; 11000110 B
- paddw xmm4, xmm5
- movd r14d, xmm4
- add SUM, r14
+ mov r14,0
+ movdqa xmm0, [r0] ; y_ref
+ movdqa xmm1, [r2] ; y_src
+ movdqa xmm2, xmm0 ; store first for future process
+ movdqa xmm3, xmm1
+ ; sum += diff;
+ movdqa xmm4, xmm0
+ psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
+ ; to be continued for sum
+ pshufd xmm5, xmm4, 0C6h ; 11000110 B
+ paddw xmm4, xmm5
+ movd r14d, xmm4
+ add SUM, r14
- ; sqr += diff * diff;
- pmaxub xmm0, xmm1
- pminub xmm1, xmm2
- psubb xmm0, xmm1 ; diff
- SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
- movd r14d, xmm1
- add SQR, r14
+ ; sqr += diff * diff;
+ pmaxub xmm0, xmm1
+ pminub xmm1, xmm2
+ psubb xmm0, xmm1 ; diff
+ SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+ movd r14d, xmm1
+ add SQR, r14
- ; sum_cur += y_src[x];
- movdqa xmm0, xmm3 ; cur_orig
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7
- punpckhbw xmm1, xmm7
- paddw xmm0, xmm1 ; 8x2
- SUM_WORD_8x2_SSE2 xmm0, xmm1
- movd r14d, xmm0
- and r14, 0ffffh
- add SUM_CUR, r14
+ ; sum_cur += y_src[x];
+ movdqa xmm0, xmm3 ; cur_orig
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7
+ punpckhbw xmm1, xmm7
+ paddw xmm0, xmm1 ; 8x2
+ SUM_WORD_8x2_SSE2 xmm0, xmm1
+ movd r14d, xmm0
+ and r14, 0ffffh
+ add SUM_CUR, r14
- ; sqr_cur += y_src[x] * y_src[x];
- SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
- movd r14d, xmm0
- add SQR_CUR, r14
+ ; sqr_cur += y_src[x] * y_src[x];
+ SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+ movd r14d, xmm0
+ add SQR_CUR, r14
- lea r0, [r0+r1]
- lea r2, [r2+r3]
- dec r12
- jnz near .hloops
+ lea r0, [r0+r1]
+ lea r2, [r2+r3]
+ dec r12
+ jnz near .hloops
- mov r0, SUM
- sar r0, 8
- imul r0, r0
- mov r1, SQR
- sar r1, 8
- sub r1, r0
- mov [r4], r1w ; to store uiMotionIndex
- mov r0, SUM_CUR
- sar r0, 8
- imul r0, r0
- mov r1, SQR_CUR
- sar r1, 8
- sub r1, r0
- mov [r4+2], r1w ; to store uiTextureIndex
+ mov r0, SUM
+ sar r0, 8
+ imul r0, r0
+ mov r1, SQR
+ sar r1, 8
+ sub r1, r0
+ mov [r4], r1w ; to store uiMotionIndex
+ mov r0, SUM_CUR
+ sar r0, 8
+ imul r0, r0
+ mov r1, SQR_CUR
+ sar r1, 8
+ sub r1, r0
+ mov [r4+2], r1w ; to store uiTextureIndex
- POP_XMM
- LOAD_5_PARA_POP
- pop r15
- pop r14
- pop r13
- pop r12
+ POP_XMM
+ LOAD_5_PARA_POP
+ pop r15
+ pop r14
+ pop r13
+ pop r12
- %assign push_num 0
+ %assign push_num 0
- ret
+ ret
;*************************************************************************************************************
@@ -550,69 +550,69 @@
%define psadframe r5
%define psad8x8 r6
- push r12
- push r13
- %assign push_num 2
- LOAD_7_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
- SIGN_EXTENSION r4,r4d
+ push r12
+ push r13
+ %assign push_num 2
+ LOAD_7_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r2,r2d
+ SIGN_EXTENSION r3,r3d
+ SIGN_EXTENSION r4,r4d
- mov r12,r4
- shr r2, 4 ; iPicWidth/16
- shr r3, 4 ; iPicHeight/16
+ mov r12,r4
+ shr r2, 4 ; iPicWidth/16
+ shr r3, 4 ; iPicHeight/16
- shl r12, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
+ shl r12, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
height_loop:
- mov r13, r2
- push r0
- push r1
+ mov r13, r2
+ push r0
+ push r1
width_loop:
- pxor xmm6, xmm6
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- paddd xmm7, xmm6
- movd [r6], xmm6
- psrldq xmm6, 8
- movd [r6+4], xmm6
+ pxor xmm6, xmm6
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ paddd xmm7, xmm6
+ movd [r6], xmm6
+ psrldq xmm6, 8
+ movd [r6+4], xmm6
- pxor xmm6, xmm6
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- paddd xmm7, xmm6
- movd [r6+8], xmm6
- psrldq xmm6, 8
- movd [r6+12], xmm6
+ pxor xmm6, xmm6
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ paddd xmm7, xmm6
+ movd [r6+8], xmm6
+ psrldq xmm6, 8
+ movd [r6+12], xmm6
- add r6, 16
- sub r0, r12
- sub r1, r12
- add r0, 16
- add r1, 16
+ add r6, 16
+ sub r0, r12
+ sub r1, r12
+ add r0, 16
+ add r1, 16
- dec r13
- jnz width_loop
+ dec r13
+ jnz width_loop
- pop r1
- pop r0
- add r0, r12
- add r1, r12
+ pop r1
+ pop r0
+ add r0, r12
+ add r1, r12
- dec r3
- jnz height_loop
+ dec r3
+ jnz height_loop
- ;mov r13, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [psadframe], xmm7
+ ;mov r13, [psadframe]
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5
+ movd [psadframe], xmm7
%undef cur_data
%undef ref_data
@@ -622,12 +622,12 @@
%undef psadframe
%undef psad8x8
%undef pushsize
- POP_XMM
- LOAD_7_PARA_POP
- pop r13
- pop r12
- %assign push_num 0
- ret
+ POP_XMM
+ LOAD_7_PARA_POP
+ pop r13
+ pop r12
+ %assign push_num 0
+ ret
%endif
@@ -653,103 +653,103 @@
%define tmp_esi esp + 0
%define tmp_edi esp + 4
%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
var_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi
+ mov [tmp_edi], edi
var_width_loop:
- pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
- pxor xmm5, xmm5 ; pSum16x16
- pxor xmm4, xmm4 ; sqsum_16x16
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- paddd xmm7, xmm6
- movd [edx], xmm6
- psrldq xmm6, 8
- movd [edx+4], xmm6
+ pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
+ pxor xmm5, xmm5 ; pSum16x16
+ pxor xmm4, xmm4 ; sqsum_16x16
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ paddd xmm7, xmm6
+ movd [edx], xmm6
+ psrldq xmm6, 8
+ movd [edx+4], xmm6
- pxor xmm6, xmm6
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- paddd xmm7, xmm6
- movd [edx+8], xmm6
- psrldq xmm6, 8
- movd [edx+12], xmm6
+ pxor xmm6, xmm6
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ paddd xmm7, xmm6
+ movd [edx+8], xmm6
+ psrldq xmm6, 8
+ movd [edx+12], xmm6
- mov ebp, [psum16x16]
- movdqa xmm1, xmm5
- psrldq xmm1, 8
- paddd xmm5, xmm1
- movd [ebp], xmm5
- add dword [psum16x16], 4
+ mov ebp, [psum16x16]
+ movdqa xmm1, xmm5
+ psrldq xmm1, 8
+ paddd xmm5, xmm1
+ movd [ebp], xmm5
+ add dword [psum16x16], 4
- movdqa xmm5, xmm4
- psrldq xmm5, 8
- paddd xmm4, xmm5
- movdqa xmm3, xmm4
- psrldq xmm3, 4
- paddd xmm4, xmm3
+ movdqa xmm5, xmm4
+ psrldq xmm5, 8
+ paddd xmm4, xmm5
+ movdqa xmm3, xmm4
+ psrldq xmm3, 4
+ paddd xmm4, xmm3
- mov ebp, [psqsum16x16]
- movd [ebp], xmm4
- add dword [psqsum16x16], 4
+ mov ebp, [psqsum16x16]
+ movd [ebp], xmm4
+ add dword [psqsum16x16], 4
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
- dec ecx
- jnz var_width_loop
+ dec ecx
+ jnz var_width_loop
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax
+ add edi, eax
- dec dword [iPicHeight]
- jnz var_height_loop
+ dec dword [iPicHeight]
+ jnz var_height_loop
- mov edx, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [edx], xmm7
+ mov edx, [psadframe]
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5
+ movd [edx], xmm7
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
%undef cur_data
%undef ref_data
%undef iPicWidth
@@ -763,7 +763,7 @@
%undef tmp_edi
%undef pushsize
%undef localsize
- ret
+ ret
%else ;64-bit
@@ -784,112 +784,112 @@
%define psum16x16 arg8
%define psqsum16x16 arg9
- push r12
- push r13
- push r14
- push r15
- %assign push_num 4
- PUSH_XMM 8
+ push r12
+ push r13
+ push r14
+ push r15
+ %assign push_num 4
+ PUSH_XMM 8
%ifdef WIN64
- mov r4, arg5 ;iPicStride
- mov r5, arg6 ;psad8x8
+ mov r4, arg5 ;iPicStride
+ mov r5, arg6 ;psad8x8
%endif
- mov r14,arg7
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
- SIGN_EXTENSION r4,r4d
+ mov r14,arg7
+ SIGN_EXTENSION r2,r2d
+ SIGN_EXTENSION r3,r3d
+ SIGN_EXTENSION r4,r4d
- mov r13,r4
- shr r2,4
- shr r3,4
+ mov r13,r4
+ shr r2,4
+ shr r3,4
- shl r13,4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
+ shl r13,4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
var_height_loop:
- push r2
- %assign push_num push_num+1
- mov r11, r0
- mov r12, r1
+ push r2
+ %assign push_num push_num+1
+ mov r11, r0
+ mov r12, r1
var_width_loop:
- pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
- pxor xmm5, xmm5 ; pSum16x16
- pxor xmm4, xmm4 ; sqsum_16x16
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- paddd xmm7, xmm6
- movd [r14], xmm6
- psrldq xmm6, 8
- movd [r14+4], xmm6
+ pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
+ pxor xmm5, xmm5 ; pSum16x16
+ pxor xmm4, xmm4 ; sqsum_16x16
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ paddd xmm7, xmm6
+ movd [r14], xmm6
+ psrldq xmm6, 8
+ movd [r14+4], xmm6
- pxor xmm6, xmm6
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- paddd xmm7, xmm6
- movd [r14+8], xmm6
- psrldq xmm6, 8
- movd [r14+12], xmm6
+ pxor xmm6, xmm6
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ paddd xmm7, xmm6
+ movd [r14+8], xmm6
+ psrldq xmm6, 8
+ movd [r14+12], xmm6
- mov r15, psum16x16
- movdqa xmm1, xmm5
- psrldq xmm1, 8
- paddd xmm5, xmm1
- movd [r15], xmm5
- add dword psum16x16, 4
+ mov r15, psum16x16
+ movdqa xmm1, xmm5
+ psrldq xmm1, 8
+ paddd xmm5, xmm1
+ movd [r15], xmm5
+ add dword psum16x16, 4
- movdqa xmm5, xmm4
- psrldq xmm5, 8
- paddd xmm4, xmm5
- movdqa xmm3, xmm4
- psrldq xmm3, 4
- paddd xmm4, xmm3
+ movdqa xmm5, xmm4
+ psrldq xmm5, 8
+ paddd xmm4, xmm5
+ movdqa xmm3, xmm4
+ psrldq xmm3, 4
+ paddd xmm4, xmm3
- mov r15, psqsum16x16
- movd [r15], xmm4
- add dword psqsum16x16, 4
+ mov r15, psqsum16x16
+ movd [r15], xmm4
+ add dword psqsum16x16, 4
- add r14,16
- sub r0, r13
- sub r1, r13
- add r0, 16
- add r1, 16
+ add r14,16
+ sub r0, r13
+ sub r1, r13
+ add r0, 16
+ add r1, 16
- dec r2
- jnz var_width_loop
+ dec r2
+ jnz var_width_loop
- pop r2
- %assign push_num push_num-1
- mov r0, r11
- mov r1, r12
- add r0, r13
- add r1, r13
- dec r3
- jnz var_height_loop
+ pop r2
+ %assign push_num push_num-1
+ mov r0, r11
+ mov r1, r12
+ add r0, r13
+ add r1, r13
+ dec r3
+ jnz var_height_loop
- mov r15, psadframe
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [r15], xmm7
+ mov r15, psadframe
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5
+ movd [r15], xmm7
- POP_XMM
- pop r15
- pop r14
- pop r13
- pop r12
+ POP_XMM
+ pop r15
+ pop r14
+ pop r13
+ pop r12
%assign push_num 0
%undef cur_data
%undef ref_data
@@ -904,7 +904,7 @@
%undef tmp_edi
%undef pushsize
%undef localsize
- ret
+ ret
%endif
@@ -932,118 +932,118 @@
%define tmp_edi esp + 4
%define tmp_sadframe esp + 8
%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
- mov ecx, [iPicWidth]
- mov ecx, [iPicHeight]
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
+ mov ecx, [iPicWidth]
+ mov ecx, [iPicHeight]
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- mov ecx, [iPicWidth]
- mov ecx, [iPicHeight]
- pxor xmm0, xmm0
- movd [tmp_sadframe], xmm0
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ mov ecx, [iPicWidth]
+ mov ecx, [iPicHeight]
+ pxor xmm0, xmm0
+ movd [tmp_sadframe], xmm0
sqdiff_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi
+ mov [tmp_edi], edi
sqdiff_width_loop:
- pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
- pxor xmm6, xmm6 ; pSum16x16
- pxor xmm5, xmm5 ; sqsum_16x16 four dword
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- movdqa xmm1, xmm7
- movd [edx], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [edx+4], xmm7
- movd ebp, xmm1
- add [tmp_sadframe], ebp
+ pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
+ pxor xmm6, xmm6 ; pSum16x16
+ pxor xmm5, xmm5 ; sqsum_16x16 four dword
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ movdqa xmm1, xmm7
+ movd [edx], xmm7
+ psrldq xmm7, 8
+ paddd xmm1, xmm7
+ movd [edx+4], xmm7
+ movd ebp, xmm1
+ add [tmp_sadframe], ebp
- pxor xmm7, xmm7
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- movdqa xmm1, xmm7
- movd [edx+8], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [edx+12], xmm7
- movd ebp, xmm1
- add [tmp_sadframe], ebp
+ pxor xmm7, xmm7
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ movdqa xmm1, xmm7
+ movd [edx+8], xmm7
+ psrldq xmm7, 8
+ paddd xmm1, xmm7
+ movd [edx+12], xmm7
+ movd ebp, xmm1
+ add [tmp_sadframe], ebp
- mov ebp, [psum16x16]
- movdqa xmm1, xmm6
- psrldq xmm1, 8
- paddd xmm6, xmm1
- movd [ebp], xmm6
- add dword [psum16x16], 4
+ mov ebp, [psum16x16]
+ movdqa xmm1, xmm6
+ psrldq xmm1, 8
+ paddd xmm6, xmm1
+ movd [ebp], xmm6
+ add dword [psum16x16], 4
- mov ebp, [psqsum16x16]
- pshufd xmm6, xmm5, 14 ;00001110
- paddd xmm6, xmm5
- pshufd xmm5, xmm6, 1 ;00000001
- paddd xmm5, xmm6
- movd [ebp], xmm5
- add dword [psqsum16x16], 4
+ mov ebp, [psqsum16x16]
+ pshufd xmm6, xmm5, 14 ;00001110
+ paddd xmm6, xmm5
+ pshufd xmm5, xmm6, 1 ;00000001
+ paddd xmm5, xmm6
+ movd [ebp], xmm5
+ add dword [psqsum16x16], 4
- mov ebp, [psqdiff16x16]
- pshufd xmm5, xmm4, 14 ; 00001110
- paddd xmm5, xmm4
- pshufd xmm4, xmm5, 1 ; 00000001
- paddd xmm4, xmm5
- movd [ebp], xmm4
- add dword [psqdiff16x16], 4
+ mov ebp, [psqdiff16x16]
+ pshufd xmm5, xmm4, 14 ; 00001110
+ paddd xmm5, xmm4
+ pshufd xmm4, xmm5, 1 ; 00000001
+ paddd xmm4, xmm5
+ movd [ebp], xmm4
+ add dword [psqdiff16x16], 4
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
- dec ecx
- jnz sqdiff_width_loop
+ dec ecx
+ jnz sqdiff_width_loop
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax
+ add edi, eax
- dec dword [iPicHeight]
- jnz sqdiff_height_loop
+ dec dword [iPicHeight]
+ jnz sqdiff_height_loop
- mov ebx, [tmp_sadframe]
- mov eax, [psadframe]
- mov [eax], ebx
+ mov ebx, [tmp_sadframe]
+ mov eax, [psadframe]
+ mov [eax], ebx
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
%undef cur_data
%undef ref_data
%undef iPicWidth
@@ -1059,7 +1059,7 @@
%undef tmp_sadframe
%undef pushsize
%undef localsize
- ret
+ ret
%else
@@ -1083,128 +1083,128 @@
%define psqsum16x16 arg9;
%define psqdiff16x16 arg10
- push r12
- push r13
- push r14
- push r15
- %assign push_num 4
- PUSH_XMM 10
+ push r12
+ push r13
+ push r14
+ push r15
+ %assign push_num 4
+ PUSH_XMM 10
%ifdef WIN64
- mov r4,arg5
+ mov r4,arg5
%endif
- mov r14,arg7
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
- SIGN_EXTENSION r4,r4d
+ mov r14,arg7
+ SIGN_EXTENSION r2,r2d
+ SIGN_EXTENSION r3,r3d
+ SIGN_EXTENSION r4,r4d
- mov r13,r4
- shr r2,4 ; iPicWidth/16
- shr r3,4 ; iPicHeight/16
- shl r13,4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm8, xmm8 ;framesad
- pxor xmm9, xmm9
+ mov r13,r4
+ shr r2,4 ; iPicWidth/16
+ shr r3,4 ; iPicHeight/16
+ shl r13,4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm8, xmm8 ;framesad
+ pxor xmm9, xmm9
sqdiff_height_loop:
- ;mov ecx, dword [iPicWidth]
- ;mov r14,r2
- push r2
- %assign push_num push_num +1
- mov r10, r0
- mov r11, r1
+ ;mov ecx, dword [iPicWidth]
+ ;mov r14,r2
+ push r2
+ %assign push_num push_num +1
+ mov r10, r0
+ mov r11, r1
sqdiff_width_loop:
- pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
- pxor xmm6, xmm6 ; pSum16x16
- pxor xmm5, xmm5 ; sqsum_16x16 four dword
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- movdqa xmm1, xmm7
- movd [r14], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [r14+4], xmm7
- movd r15d, xmm1
- movd xmm9, r15d
- paddd xmm8,xmm9
+ pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
+ pxor xmm6, xmm6 ; pSum16x16
+ pxor xmm5, xmm5 ; sqsum_16x16 four dword
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ movdqa xmm1, xmm7
+ movd [r14], xmm7
+ psrldq xmm7, 8
+ paddd xmm1, xmm7
+ movd [r14+4], xmm7
+ movd r15d, xmm1
+ movd xmm9, r15d
+ paddd xmm8,xmm9
- pxor xmm7, xmm7
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- movdqa xmm1, xmm7
- movd [r14+8], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [r14+12], xmm7
- movd r15d, xmm1
- movd xmm9, r15d
- paddd xmm8,xmm9
+ pxor xmm7, xmm7
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ movdqa xmm1, xmm7
+ movd [r14+8], xmm7
+ psrldq xmm7, 8
+ paddd xmm1, xmm7
+ movd [r14+12], xmm7
+ movd r15d, xmm1
+ movd xmm9, r15d
+ paddd xmm8,xmm9
- mov r15, psum16x16
- movdqa xmm1, xmm6
- psrldq xmm1, 8
- paddd xmm6, xmm1
- movd [r15], xmm6
- add dword psum16x16, 4
+ mov r15, psum16x16
+ movdqa xmm1, xmm6
+ psrldq xmm1, 8
+ paddd xmm6, xmm1
+ movd [r15], xmm6
+ add dword psum16x16, 4
- mov r15, psqsum16x16
- pshufd xmm6, xmm5, 14 ;00001110
- paddd xmm6, xmm5
- pshufd xmm5, xmm6, 1 ;00000001
- paddd xmm5, xmm6
- movd [r15], xmm5
- add dword psqsum16x16, 4
+ mov r15, psqsum16x16
+ pshufd xmm6, xmm5, 14 ;00001110
+ paddd xmm6, xmm5
+ pshufd xmm5, xmm6, 1 ;00000001
+ paddd xmm5, xmm6
+ movd [r15], xmm5
+ add dword psqsum16x16, 4
- mov r15, psqdiff16x16
- pshufd xmm5, xmm4, 14 ; 00001110
- paddd xmm5, xmm4
- pshufd xmm4, xmm5, 1 ; 00000001
- paddd xmm4, xmm5
- movd [r15], xmm4
- add dword psqdiff16x16, 4
+ mov r15, psqdiff16x16
+ pshufd xmm5, xmm4, 14 ; 00001110
+ paddd xmm5, xmm4
+ pshufd xmm4, xmm5, 1 ; 00000001
+ paddd xmm4, xmm5
+ movd [r15], xmm4
+ add dword psqdiff16x16, 4
- add r14,16
- sub r0, r13
- sub r1, r13
- add r0, 16
- add r1, 16
+ add r14,16
+ sub r0, r13
+ sub r1, r13
+ add r0, 16
+ add r1, 16
- dec r2
- jnz sqdiff_width_loop
+ dec r2
+ jnz sqdiff_width_loop
- pop r2
- %assign push_num push_num -1
+ pop r2
+ %assign push_num push_num -1
- mov r0, r10
- mov r1, r11
- add r0, r13
- add r1, r13
+ mov r0, r10
+ mov r1, r11
+ add r0, r13
+ add r1, r13
- dec r3
- jnz sqdiff_height_loop
+ dec r3
+ jnz sqdiff_height_loop
- mov r13, psadframe
- movd [r13], xmm8
+ mov r13, psadframe
+ movd [r13], xmm8
- POP_XMM
- pop r15
- pop r14
- pop r13
- pop r12
- %assign push_num 0
+ POP_XMM
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %assign push_num 0
%undef cur_data
%undef ref_data
@@ -1221,7 +1221,7 @@
%undef tmp_sadframe
%undef pushsize
%undef localsize
- ret
+ ret
@@ -1249,145 +1249,145 @@
%define tmp_edi esp + 4
%define tmp_ecx esp + 8
%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov eax, ebx
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov eax, ebx
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- xor ebp, ebp
- pxor xmm0, xmm0
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ xor ebp, ebp
+ pxor xmm0, xmm0
bgd_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi
+ mov [tmp_edi], edi
bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8
- pxor xmm6, xmm6 ; sum_cur_8x8
- pxor xmm5, xmm5 ; sum_ref_8x8
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ pxor xmm7, xmm7 ; pSad8x8
+ pxor xmm6, xmm6 ; sum_cur_8x8
+ pxor xmm5, xmm5 ; sum_ref_8x8
+ pxor xmm4, xmm4 ; pMad8x8
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm4
- ;movdqa xmm1, xmm4
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm4, xmm0
- ;punpcklwd xmm4, xmm0
- ;movd [edx+4], xmm4
- ;add edx, 8
- ;mov [p_mad8x8], edx
- mov [tmp_ecx], ecx
- movhlps xmm1, xmm4
- movd ecx, xmm4
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
+ ;movdqa xmm1, xmm4
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm4, xmm0
+ ;punpcklwd xmm4, xmm0
+ ;movd [edx+4], xmm4
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ mov [tmp_ecx], ecx
+ movhlps xmm1, xmm4
+ movd ecx, xmm4
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx
- pslldq xmm7, 4
- pslldq xmm6, 4
- pslldq xmm5, 4
+ pslldq xmm7, 4
+ pslldq xmm6, 4
+ pslldq xmm5, 4
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ pxor xmm4, xmm4 ; pMad8x8
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm4
- ;movdqa xmm1, xmm4
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm4, xmm0
- ;punpcklwd xmm4, xmm0
- ;movd [edx+4], xmm4
- ;add edx, 8
- ;mov [p_mad8x8], edx
- movhlps xmm1, xmm4
- movd ecx, xmm4
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
+ ;movdqa xmm1, xmm4
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm4, xmm0
+ ;punpcklwd xmm4, xmm0
+ ;movd [edx+4], xmm4
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ movhlps xmm1, xmm4
+ movd ecx, xmm4
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx
- ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
+ ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
- mov edx, [psad8x8]
- pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
- movdqa [edx], xmm1
- add edx, 16
- mov [psad8x8], edx ; sad8x8
+ mov edx, [psad8x8]
+ pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
+ movdqa [edx], xmm1
+ add edx, 16
+ mov [psad8x8], edx ; sad8x8
- paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
- pshufd xmm2, xmm1, 00000011b
- paddd xmm1, xmm2
- movd edx, xmm1
- add ebp, edx ; sad frame
+ paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
+ pshufd xmm2, xmm1, 00000011b
+ paddd xmm1, xmm2
+ movd edx, xmm1
+ add ebp, edx ; sad frame
- mov edx, [p_sd8x8]
- psubd xmm6, xmm5
- pshufd xmm1, xmm6, 10001101b
- movdqa [edx], xmm1
- add edx, 16
- mov [p_sd8x8], edx
+ mov edx, [p_sd8x8]
+ psubd xmm6, xmm5
+ pshufd xmm1, xmm6, 10001101b
+ movdqa [edx], xmm1
+ add edx, 16
+ mov [p_sd8x8], edx
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
- mov ecx, [tmp_ecx]
- dec ecx
- jnz bgd_width_loop
+ mov ecx, [tmp_ecx]
+ dec ecx
+ jnz bgd_width_loop
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax
+ add edi, eax
- dec dword [iPicHeight]
- jnz bgd_height_loop
+ dec dword [iPicHeight]
+ jnz bgd_height_loop
- mov edx, [psadframe]
- mov [edx], ebp
+ mov edx, [psadframe]
+ mov [edx], ebp
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
%undef cur_data
%undef ref_data
%undef iPicWidth
@@ -1401,7 +1401,7 @@
%undef tmp_edi
%undef pushsize
%undef localsize
- ret
+ ret
@@ -1431,190 +1431,190 @@
%define tmp_sadframe esp + 8
%define tmp_ecx esp + 12
%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov eax, ebx
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov eax, ebx
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- movd [tmp_sadframe], xmm0
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ movd [tmp_sadframe], xmm0
sqdiff_bgd_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi
+ mov [tmp_edi], edi
sqdiff_bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- mov edx, [psad8x8]
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [edx], xmm2
- movd [edx+4], xmm1
- add edx, 8
- mov [psad8x8], edx ; sad8x8
+ mov edx, [psad8x8]
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b
+ movd [edx], xmm2
+ movd [edx+4], xmm1
+ add edx, 8
+ mov [psad8x8], edx ; sad8x8
- paddd xmm1, xmm2
- movd edx, xmm1
- add [tmp_sadframe], edx ; iFrameSad
+ paddd xmm1, xmm2
+ movd edx, xmm1
+ add [tmp_sadframe], edx ; iFrameSad
- mov edx, [psum16x16]
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd [edx], xmm1 ; sum
+ mov edx, [psum16x16]
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd [edx], xmm1 ; sum
- mov edx, [p_sd8x8]
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [edx], xmm1
- add edx, 8
- mov [p_sd8x8], edx
+ mov edx, [p_sd8x8]
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [edx], xmm1
+ add edx, 8
+ mov [p_sd8x8], edx
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm5
- ;movdqa xmm1, xmm5
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm5, xmm0
- ;punpcklwd xmm5, xmm0
- ;movd [edx+4], xmm5
- ;add edx, 8
- ;mov [p_mad8x8], edx
- mov [tmp_ecx], ecx
- movhlps xmm1, xmm5
- movd ecx, xmm5
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm5
+ ;movdqa xmm1, xmm5
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm5, xmm0
+ ;punpcklwd xmm5, xmm0
+ ;movd [edx+4], xmm5
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ mov [tmp_ecx], ecx
+ movhlps xmm1, xmm5
+ movd ecx, xmm5
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx
- psrlq xmm7, 32
- psllq xmm7, 32 ; clear sad
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ psrlq xmm7, 32
+ psllq xmm7, 32 ; clear sad
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- mov edx, [psad8x8]
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [edx], xmm2
- movd [edx+4], xmm1
- add edx, 8
- mov [psad8x8], edx ; sad8x8
+ mov edx, [psad8x8]
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b
+ movd [edx], xmm2
+ movd [edx+4], xmm1
+ add edx, 8
+ mov [psad8x8], edx ; sad8x8
- paddd xmm1, xmm2
- movd edx, xmm1
- add [tmp_sadframe], edx ; iFrameSad
+ paddd xmm1, xmm2
+ movd edx, xmm1
+ add [tmp_sadframe], edx ; iFrameSad
- mov edx, [psum16x16]
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd ebp, xmm1 ; sum
- add [edx], ebp
- add edx, 4
- mov [psum16x16], edx
+ mov edx, [psum16x16]
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd ebp, xmm1 ; sum
+ add [edx], ebp
+ add edx, 4
+ mov [psum16x16], edx
- mov edx, [psqsum16x16]
- psrlq xmm7, 32
- pshufd xmm2, xmm7, 00001110b
- paddd xmm2, xmm7
- movd [edx], xmm2 ; sqsum
- add edx, 4
- mov [psqsum16x16], edx
+ mov edx, [psqsum16x16]
+ psrlq xmm7, 32
+ pshufd xmm2, xmm7, 00001110b
+ paddd xmm2, xmm7
+ movd [edx], xmm2 ; sqsum
+ add edx, 4
+ mov [psqsum16x16], edx
- mov edx, [p_sd8x8]
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [edx], xmm1
- add edx, 8
- mov [p_sd8x8], edx
+ mov edx, [p_sd8x8]
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [edx], xmm1
+ add edx, 8
+ mov [p_sd8x8], edx
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm5
- ;movdqa xmm1, xmm5
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm5, xmm0
- ;punpcklwd xmm5, xmm0
- ;movd [edx+4], xmm5
- ;add edx, 8
- ;mov [p_mad8x8], edx
- movhlps xmm1, xmm5
- movd ecx, xmm5
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm5
+ ;movdqa xmm1, xmm5
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm5, xmm0
+ ;punpcklwd xmm5, xmm0
+ ;movd [edx+4], xmm5
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ movhlps xmm1, xmm5
+ movd ecx, xmm5
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx
- mov edx, [psqdiff16x16]
- pshufd xmm1, xmm4, 00001110b
- paddd xmm4, xmm1
- pshufd xmm1, xmm4, 00000001b
- paddd xmm4, xmm1
- movd [edx], xmm4
- add edx, 4
- mov [psqdiff16x16], edx
+ mov edx, [psqdiff16x16]
+ pshufd xmm1, xmm4, 00001110b
+ paddd xmm4, xmm1
+ pshufd xmm1, xmm4, 00000001b
+ paddd xmm4, xmm1
+ movd [edx], xmm4
+ add edx, 4
+ mov [psqdiff16x16], edx
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
- mov ecx, [tmp_ecx]
- dec ecx
- jnz sqdiff_bgd_width_loop
+ mov ecx, [tmp_ecx]
+ dec ecx
+ jnz sqdiff_bgd_width_loop
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax
+ add edi, eax
- dec dword [iPicHeight]
- jnz sqdiff_bgd_height_loop
+ dec dword [iPicHeight]
+ jnz sqdiff_bgd_height_loop
- mov edx, [psadframe]
- mov ebp, [tmp_sadframe]
- mov [edx], ebp
+ mov edx, [psadframe]
+ mov ebp, [tmp_sadframe]
+ mov [edx], ebp
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
%undef cur_data
%undef ref_data
%undef iPicWidth
@@ -1631,7 +1631,7 @@
%undef tmp_edi
%undef pushsize
%undef localsize
- ret
+ ret
%else
;*************************************************************************************************************
@@ -1651,142 +1651,142 @@
%define p_sd8x8 arg8;
%define p_mad8x8 arg9;
- push r12
- push r13
- push r14
- push r15
+ push r12
+ push r13
+ push r14
+ push r15
%assign push_num 4
- PUSH_XMM 10
+ PUSH_XMM 10
%ifdef WIN64
- mov r4,arg5
- ; mov r5,arg6
+ mov r4,arg5
+ ; mov r5,arg6
%endif
- mov r14,arg7
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
- SIGN_EXTENSION r4,r4d
+ mov r14,arg7
+ SIGN_EXTENSION r2,r2d
+ SIGN_EXTENSION r3,r3d
+ SIGN_EXTENSION r4,r4d
- mov r13,r4
- mov r15,r0
- shr r2,4
- shr r3,4
- shl r13,4
- pxor xmm0, xmm0
- pxor xmm8, xmm8
- pxor xmm9, xmm9
+ mov r13,r4
+ mov r15,r0
+ shr r2,4
+ shr r3,4
+ shl r13,4
+ pxor xmm0, xmm0
+ pxor xmm8, xmm8
+ pxor xmm9, xmm9
bgd_height_loop:
- ;mov ecx, dword [iPicWidth]
- push r2
- %assign push_num push_num+1
- mov r10, r15
- mov r11, r1
+ ;mov ecx, dword [iPicWidth]
+ push r2
+ %assign push_num push_num+1
+ mov r10, r15
+ mov r11, r1
bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8
- pxor xmm6, xmm6 ; sum_cur_8x8
- pxor xmm5, xmm5 ; sum_ref_8x8
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ pxor xmm7, xmm7 ; pSad8x8
+ pxor xmm6, xmm6 ; sum_cur_8x8
+ pxor xmm5, xmm5 ; sum_ref_8x8
+ pxor xmm4, xmm4 ; pMad8x8
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- mov r14, p_mad8x8
- WELS_MAX_REG_SSE2 xmm4
+ mov r14, p_mad8x8
+ WELS_MAX_REG_SSE2 xmm4
- ;mov [tmp_ecx], ecx
- movhlps xmm1, xmm4
- movd r0d, xmm4
+ ;mov [tmp_ecx], ecx
+ movhlps xmm1, xmm4
+ movd r0d, xmm4
- mov [r14], r0b
- movd r0d, xmm1
- mov [r14+1],r0b
- add r14, 2
- ;mov p_mad8x8, r14
+ mov [r14], r0b
+ movd r0d, xmm1
+ mov [r14+1],r0b
+ add r14, 2
+ ;mov p_mad8x8, r14
- pslldq xmm7, 4
- pslldq xmm6, 4
- pslldq xmm5, 4
+ pslldq xmm7, 4
+ pslldq xmm6, 4
+ pslldq xmm5, 4
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ pxor xmm4, xmm4 ; pMad8x8
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- ;mov r14, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4
+ ;mov r14, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm4
- movhlps xmm1, xmm4
- movd r0d, xmm4
- mov [r14], r0b
- movd r0d, xmm1
- mov [r14+1],r0b
- add r14, 2
- mov p_mad8x8, r14
+ movhlps xmm1, xmm4
+ movd r0d, xmm4
+ mov [r14], r0b
+ movd r0d, xmm1
+ mov [r14+1],r0b
+ add r14, 2
+ mov p_mad8x8, r14
- ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
+ ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
- mov r14, psad8x8
- pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
- movdqa [r14], xmm1
- add r14, 16
- mov psad8x8, r14 ; sad8x8
+ mov r14, psad8x8
+ pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
+ movdqa [r14], xmm1
+ add r14, 16
+ mov psad8x8, r14 ; sad8x8
- paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
- pshufd xmm2, xmm1, 00000011b
- paddd xmm1, xmm2
- movd r14d, xmm1
- movd xmm9, r14d
- paddd xmm8, xmm9 ; sad frame
+ paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
+ pshufd xmm2, xmm1, 00000011b
+ paddd xmm1, xmm2
+ movd r14d, xmm1
+ movd xmm9, r14d
+ paddd xmm8, xmm9 ; sad frame
- mov r14, p_sd8x8
- psubd xmm6, xmm5
- pshufd xmm1, xmm6, 10001101b
- movdqa [r14], xmm1
- add r14, 16
- mov p_sd8x8, r14
+ mov r14, p_sd8x8
+ psubd xmm6, xmm5
+ pshufd xmm1, xmm6, 10001101b
+ movdqa [r14], xmm1
+ add r14, 16
+ mov p_sd8x8, r14
- ;add edx, 16
- sub r15, r13
- sub r1, r13
- add r15, 16
- add r1, 16
+ ;add edx, 16
+ sub r15, r13
+ sub r1, r13
+ add r15, 16
+ add r1, 16
- dec r2
- jnz bgd_width_loop
- pop r2
+ dec r2
+ jnz bgd_width_loop
+ pop r2
%assign push_num push_num-1
- mov r15, r10
- mov r1, r11
- add r15, r13
- add r1, r13
+ mov r15, r10
+ mov r1, r11
+ add r15, r13
+ add r1, r13
- dec r3
- jnz bgd_height_loop
+ dec r3
+ jnz bgd_height_loop
- mov r13, psadframe
- movd [r13], xmm8
+ mov r13, psadframe
+ movd [r13], xmm8
- POP_XMM
- pop r15
- pop r14
- pop r13
- pop r12
+ POP_XMM
+ pop r15
+ pop r14
+ pop r13
+ pop r12
%assign push_num 0
%undef cur_data
%undef ref_data
@@ -1801,7 +1801,7 @@
%undef tmp_edi
%undef pushsize
%undef localsize
- ret
+ ret
@@ -1826,189 +1826,189 @@
%define p_sd8x8 arg11
%define p_mad8x8 arg12
- push r12
- push r13
- push r14
- push r15
+ push r12
+ push r13
+ push r14
+ push r15
%assign push_num 4
- PUSH_XMM 10
+ PUSH_XMM 10
%ifdef WIN64
- mov r4,arg5
- ;mov r5,arg6
+ mov r4,arg5
+ ;mov r5,arg6
%endif
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
- SIGN_EXTENSION r4,r4d
+ SIGN_EXTENSION r2,r2d
+ SIGN_EXTENSION r3,r3d
+ SIGN_EXTENSION r4,r4d
- mov r13,r4
- shr r2, 4 ; iPicWidth/16
- shr r3, 4 ; iPicHeight/16
- shl r13, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm8, xmm8
- pxor xmm9, xmm9
+ mov r13,r4
+ shr r2, 4 ; iPicWidth/16
+ shr r3, 4 ; iPicHeight/16
+ shl r13, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm8, xmm8
+ pxor xmm9, xmm9
sqdiff_bgd_height_loop:
- mov r10, r0
- mov r11, r1
- push r2
+ mov r10, r0
+ mov r11, r1
+ push r2
%assign push_num push_num+1
sqdiff_bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- mov r14, psad8x8
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [r14], xmm2
- movd [r14+4], xmm1
- add r14, 8
- mov psad8x8, r14 ; sad8x8
+ mov r14, psad8x8
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b
+ movd [r14], xmm2
+ movd [r14+4], xmm1
+ add r14, 8
+ mov psad8x8, r14 ; sad8x8
- paddd xmm1, xmm2
- movd r14d, xmm1
- movd xmm9,r14d
- paddd xmm8, xmm9 ; iFrameSad
+ paddd xmm1, xmm2
+ movd r14d, xmm1
+ movd xmm9,r14d
+ paddd xmm8, xmm9 ; iFrameSad
- mov r14, psum16x16
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd [r14], xmm1 ; sum
+ mov r14, psum16x16
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd [r14], xmm1 ; sum
- mov r14, p_sd8x8
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [r14], xmm1
- add r14, 8
- mov p_sd8x8, r14
+ mov r14, p_sd8x8
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [r14], xmm1
+ add r14, 8
+ mov p_sd8x8, r14
- mov r14, p_mad8x8
- WELS_MAX_REG_SSE2 xmm5
+ mov r14, p_mad8x8
+ WELS_MAX_REG_SSE2 xmm5
- movhlps xmm1, xmm5
- push r0
- movd r0d, xmm5
- mov [r14], r0b
- movd r0d, xmm1
- mov [r14+1],r0b
- pop r0
- add r14, 2
- mov p_mad8x8, r14
+ movhlps xmm1, xmm5
+ push r0
+ movd r0d, xmm5
+ mov [r14], r0b
+ movd r0d, xmm1
+ mov [r14+1],r0b
+ pop r0
+ add r14, 2
+ mov p_mad8x8, r14
- psrlq xmm7, 32
- psllq xmm7, 32 ; clear sad
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ psrlq xmm7, 32
+ psllq xmm7, 32 ; clear sad
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- mov r14, psad8x8
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [r14], xmm2
- movd [r14+4], xmm1
- add r14, 8
- mov psad8x8, r14 ; sad8x8
+ mov r14, psad8x8
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b
+ movd [r14], xmm2
+ movd [r14+4], xmm1
+ add r14, 8
+ mov psad8x8, r14 ; sad8x8
- paddd xmm1, xmm2
- movd r14d, xmm1
- movd xmm9, r14d
- paddd xmm8, xmm9 ; iFrameSad
+ paddd xmm1, xmm2
+ movd r14d, xmm1
+ movd xmm9, r14d
+ paddd xmm8, xmm9 ; iFrameSad
- mov r14, psum16x16
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd r15d, xmm1 ; sum
- add [r14], r15d
- add r14, 4
- mov psum16x16, r14
+ mov r14, psum16x16
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd r15d, xmm1 ; sum
+ add [r14], r15d
+ add r14, 4
+ mov psum16x16, r14
- mov r14, psqsum16x16
- psrlq xmm7, 32
- pshufd xmm2, xmm7, 00001110b
- paddd xmm2, xmm7
- movd [r14], xmm2 ; sqsum
- add r14, 4
- mov psqsum16x16, r14
+ mov r14, psqsum16x16
+ psrlq xmm7, 32
+ pshufd xmm2, xmm7, 00001110b
+ paddd xmm2, xmm7
+ movd [r14], xmm2 ; sqsum
+ add r14, 4
+ mov psqsum16x16, r14
- mov r14, p_sd8x8
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [r14], xmm1
- add r14, 8
- mov p_sd8x8, r14
+ mov r14, p_sd8x8
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [r14], xmm1
+ add r14, 8
+ mov p_sd8x8, r14
- mov r14, p_mad8x8
- WELS_MAX_REG_SSE2 xmm5
+ mov r14, p_mad8x8
+ WELS_MAX_REG_SSE2 xmm5
- movhlps xmm1, xmm5
- push r0
- movd r0d, xmm5
- mov [r14], r0b
- movd r0d, xmm1
- mov [r14+1],r0b
- pop r0
- add r14, 2
- mov p_mad8x8, r14
+ movhlps xmm1, xmm5
+ push r0
+ movd r0d, xmm5
+ mov [r14], r0b
+ movd r0d, xmm1
+ mov [r14+1],r0b
+ pop r0
+ add r14, 2
+ mov p_mad8x8, r14
- mov r14, psqdiff16x16
- pshufd xmm1, xmm4, 00001110b
- paddd xmm4, xmm1
- pshufd xmm1, xmm4, 00000001b
- paddd xmm4, xmm1
- movd [r14], xmm4
- add r14, 4
- mov psqdiff16x16, r14
+ mov r14, psqdiff16x16
+ pshufd xmm1, xmm4, 00001110b
+ paddd xmm4, xmm1
+ pshufd xmm1, xmm4, 00000001b
+ paddd xmm4, xmm1
+ movd [r14], xmm4
+ add r14, 4
+ mov psqdiff16x16, r14
- add r14, 16
- sub r0, r13
- sub r1, r13
- add r0, 16
- add r1, 16
+ add r14, 16
+ sub r0, r13
+ sub r1, r13
+ add r0, 16
+ add r1, 16
- dec r2
- jnz sqdiff_bgd_width_loop
- pop r2
- %assign push_num push_num-1
- mov r0, r10
- mov r1, r11
- add r0, r13
- add r1, r13
+ dec r2
+ jnz sqdiff_bgd_width_loop
+ pop r2
+ %assign push_num push_num-1
+ mov r0, r10
+ mov r1, r11
+ add r0, r13
+ add r1, r13
- dec r3
- jnz sqdiff_bgd_height_loop
+ dec r3
+ jnz sqdiff_bgd_height_loop
- mov r14, psadframe
- movd [r14], xmm8
+ mov r14, psadframe
+ movd [r14], xmm8
- POP_XMM
- pop r15
- pop r14
- pop r13
- pop r12
+ POP_XMM
+ pop r15
+ pop r14
+ pop r13
+ pop r12
%assign push_num 0
%undef cur_data
%undef ref_data
@@ -2026,5 +2026,5 @@
%undef tmp_edi
%undef pushsize
%undef localsize
- ret
+ ret
%endif