ref: b7a25df13f2641504d8a77e3ab0110ee25d75920
parent: 0fd9db2878668839b0d3841fc5c223f5c1e5aeb7
author: Licai Guo <[email protected]>
date: Fri Feb 28 12:08:24 EST 2014
Move deblocking ARM asm code to the common folder, add CPU feature detection for ARM, and clean up some code.
--- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
@@ -19,6 +19,8 @@
4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4473818BC61650017DF25 /* deblocking_common.cpp */; };
4CE4475218BC61650017DF25 /* logging.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4473C18BC61650017DF25 /* logging.cpp */; };
4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4474918BC61650017DF25 /* WelsThreadLib.cpp */; };
+ 4CE447BD18C085320017DF25 /* deblocking_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447BC18C085320017DF25 /* deblocking_neon.S */; };
+ 4CE447BF18C085900017DF25 /* arm_arch_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */; };
/* End PBXBuildFile section */
/* Begin PBXContainerItemProxy section */
@@ -69,6 +71,8 @@
4CE4474718BC61650017DF25 /* typedefs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = typedefs.h; sourceTree = "<group>"; };
4CE4474918BC61650017DF25 /* WelsThreadLib.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WelsThreadLib.cpp; sourceTree = "<group>"; };
4CE4474A18BC61650017DF25 /* WelsThreadLib.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WelsThreadLib.h; sourceTree = "<group>"; };
+ 4CE447BC18C085320017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = "<group>"; };
+ 4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = arm_arch_common_macro.S; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@@ -144,6 +148,8 @@
4CE4472F18BC61650017DF25 /* common */ = {
isa = PBXGroup;
children = (
+ 4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */,
+ 4CE447BC18C085320017DF25 /* deblocking_neon.S */,
4CE4473118BC61650017DF25 /* cpu.cpp */,
4CE4473218BC61650017DF25 /* cpu.h */,
4CE4473318BC61650017DF25 /* cpu_core.h */,
@@ -247,9 +253,11 @@
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
+ 4CE447BF18C085900017DF25 /* arm_arch_common_macro.S in Sources */,
4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */,
4CE4474C18BC61650017DF25 /* cpu.cpp in Sources */,
4CE4475218BC61650017DF25 /* logging.cpp in Sources */,
+ 4CE447BD18C085320017DF25 /* deblocking_neon.S in Sources */,
4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */,
4CE4474E18BC61650017DF25 /* crt_util_safe_x.cpp in Sources */,
);
--- a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
@@ -36,9 +36,7 @@
4CE4469D18BC5EAB0017DF25 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4467A18BC5EAA0017DF25 /* utils.cpp */; };
4CE4469E18BC5EAB0017DF25 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468418BC5EAB0017DF25 /* welsCodecTrace.cpp */; };
4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */; };
- 4CE447AB18BC6BE90017DF25 /* arm_arch_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */; };
4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A718BC6BE90017DF25 /* block_add_neon.S */; };
- 4CE447AD18BC6BE90017DF25 /* deblocking_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A818BC6BE90017DF25 /* deblocking_neon.S */; };
4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */; };
4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447AA18BC6BE90017DF25 /* mc_neon.S */; };
/* End PBXBuildFile section */
@@ -132,9 +130,7 @@
4CE4468318BC5EAB0017DF25 /* wels_dec_export.def */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = wels_dec_export.def; sourceTree = "<group>"; };
4CE4468418BC5EAB0017DF25 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsCodecTrace.cpp; sourceTree = "<group>"; };
4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsDecoderExt.cpp; sourceTree = "<group>"; };
- 4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = arm_arch_common_macro.S; sourceTree = "<group>"; };
4CE447A718BC6BE90017DF25 /* block_add_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = block_add_neon.S; sourceTree = "<group>"; };
- 4CE447A818BC6BE90017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = "<group>"; };
4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
4CE447AA18BC6BE90017DF25 /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = "<group>"; };
/* End PBXFileReference section */
@@ -327,9 +323,7 @@
4CE447A518BC6BE90017DF25 /* arm */ = {
isa = PBXGroup;
children = (
- 4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */,
4CE447A718BC6BE90017DF25 /* block_add_neon.S */,
- 4CE447A818BC6BE90017DF25 /* deblocking_neon.S */,
4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */,
4CE447AA18BC6BE90017DF25 /* mc_neon.S */,
);
@@ -424,7 +418,6 @@
4CE4469D18BC5EAB0017DF25 /* utils.cpp in Sources */,
4CE4469118BC5EAB0017DF25 /* decoder_data_tables.cpp in Sources */,
4CE4469718BC5EAB0017DF25 /* mem_align.cpp in Sources */,
- 4CE447AB18BC6BE90017DF25 /* arm_arch_common_macro.S in Sources */,
4CE4469518BC5EAB0017DF25 /* manage_dec_ref.cpp in Sources */,
4CE4468A18BC5EAB0017DF25 /* au_parser.cpp in Sources */,
4CE4469218BC5EAB0017DF25 /* expand_pic.cpp in Sources */,
@@ -435,7 +428,6 @@
4CE4469E18BC5EAB0017DF25 /* welsCodecTrace.cpp in Sources */,
4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */,
4CE4469618BC5EAB0017DF25 /* mc.cpp in Sources */,
- 4CE447AD18BC6BE90017DF25 /* deblocking_neon.S in Sources */,
4CE4469C18BC5EAB0017DF25 /* rec_mb.cpp in Sources */,
4CE4468B18BC5EAB0017DF25 /* bit_stream.cpp in Sources */,
4CE4468D18BC5EAB0017DF25 /* decode_mb_aux.cpp in Sources */,
--- /dev/null
+++ b/codec/common/arm_arch_common_macro.S
@@ -1,0 +1,55 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef APPLE_IOS
+
+.macro WELS_ASM_FUNC_BEGIN
+.align 2
+.arm
+.globl _$0
+_$0:
+.endm
+
+#else
+
+.macro WELS_ASM_FUNC_BEGIN funcName
+.align 2
+.arm
+.global \funcName
+\funcName:
+.endm
+
+#endif
+
+.macro WELS_ASM_FUNC_END
+mov pc, lr
+.endm
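
On iOS the WELS_ASM_FUNC_BEGIN macro emits the symbol with a leading underscore (_$0) because Mach-O prepends an underscore to C-level symbol names, while the non-Apple branch emits the plain name for ELF; either way, C/C++ callers declare the routine by its unprefixed name. A minimal call-site sketch, not part of this patch (the void return type is an assumption; the parameter list follows the signature comment above DeblockLumaLt4V_neon in deblocking_neon.S):

    // Hedged sketch: declaring an assembly routine defined via WELS_ASM_FUNC_BEGIN.
    #include <stdint.h>

    extern "C" {
    // Defined in codec/common/deblocking_neon.S as:
    //   WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon
    void DeblockLumaLt4V_neon (uint8_t* pPix, int32_t iStride, int32_t iAlpha,
                               int32_t iBeta, uint8_t* pTc);
    }
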
--- a/codec/common/cpu.cpp
+++ b/codec/common/cpu.cpp
@@ -38,7 +38,12 @@
*************************************************************************************
*/
#include <string.h>
-
+#ifdef ANDROID_NDK
+#include <cpu-features.h>
+#endif
+#ifdef APPLE_IOS
+#include <sys/utsname.h>
+#endif
#include "cpu.h"
#include "cpu_core.h"
@@ -207,6 +212,55 @@
void WelsXmmRegEmptyOp(void * pSrc) {
}
+#endif
+
+#if defined(HAVE_NEON) // Supports both the Android and iOS platforms
+#if defined(ANDROID_NDK)
+uint32_t WelsCPUFeatureDetectAndroid()
+{
+ uint32_t uiCPU = 0;
+ AndroidCpuFamily cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN;
+ uint64_t uiFeatures = 0;
+
+ cpuFamily = android_getCpuFamily();
+ if (cpuFamily == ANDROID_CPU_FAMILY_ARM)
+ {
+ uiFeatures = android_getCpuFeatures();
+ if (uiFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7){
+ uiCPU |= WELS_CPU_ARMv7;
+ }
+ if (uiFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3){
+ uiCPU |= WELS_CPU_VFPv3;
+ }
+ if (uiFeatures & ANDROID_CPU_ARM_FEATURE_NEON){
+ uiCPU |= WELS_CPU_NEON;
+ }
+ }
+ return uiCPU;
+}
+
+#endif
+
+#if defined(APPLE_IOS)
+uint32_t WelsCPUFeatureDetectIOS() // May need updating for newer Apple devices
+{
+ uint32_t uiCPU = 0;
+ struct utsname sSystemInfo;
+
+ uname (&sSystemInfo);
+
+ if ((0 != strcmp(sSystemInfo.machine, "iPhone1,1")) && //iPhone 2G
+ (0 != strcmp(sSystemInfo.machine, "iPhone1,2")) && //iPhone 3G
+ (0 != strcmp(sSystemInfo.machine, "iPod1,1")) && //iPod 1G
+ (0 != strcmp(sSystemInfo.machine, "iPod2,1"))) //iPod 2G
+ {
+ uiCPU |= WELS_CPU_ARMv7;
+ uiCPU |= WELS_CPU_VFPv3;
+ uiCPU |= WELS_CPU_NEON;
+ }
+ return uiCPU;
+}
+#endif
#endif
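
The two detectors are compiled per platform under HAVE_NEON; a caller can fold them behind a single helper. A minimal sketch, assuming a wrapper name (WelsCPUFeatureDetectARM) that does not exist in this patch:

    #include <stdint.h>
    #include "cpu.h"        // WelsCPUFeatureDetectAndroid / WelsCPUFeatureDetectIOS
    #include "cpu_core.h"   // WELS_CPU_ARMv7 / WELS_CPU_VFPv3 / WELS_CPU_NEON

    // Hypothetical wrapper: returns the WELS_CPU_* bitmask for the current ARM device.
    static uint32_t WelsCPUFeatureDetectARM() {
      uint32_t uiCPU = 0;
    #if defined(HAVE_NEON)
    #if defined(ANDROID_NDK)
      uiCPU = WelsCPUFeatureDetectAndroid();  // cpu-features based query
    #elif defined(APPLE_IOS)
      uiCPU = WelsCPUFeatureDetectIOS();      // uname()-based device check
    #endif
    #endif
      return uiCPU;
    }
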
--- a/codec/common/cpu.h
+++ b/codec/common/cpu.h
@@ -78,6 +78,16 @@
void WelsXmmRegEmptyOp(void * pSrc);
+#if defined(HAVE_NEON)
+#if defined(ANDROID_NDK)
+ uint32_t WelsCPUFeatureDetectAndroid();
+#endif
+
+#if defined(APPLE_IOS)
+ uint32_t WelsCPUFeatureDetectIOS();
+#endif
+#endif
+
#if defined(__cplusplus)
}
#endif//__cplusplus
--- a/codec/common/cpu_core.h
+++ b/codec/common/cpu_core.h
@@ -73,6 +73,11 @@
#define WELS_CPU_CACHELINE_64 0x40000000 /* CacheLine Size 64 */
#define WELS_CPU_CACHELINE_128 0x80000000 /* CacheLine Size 128 */
+/* For ARM-based platforms (Android/iOS) */
+#define WELS_CPU_ARMv7 0x000001 /* ARMv7 */
+#define WELS_CPU_VFPv3 0x000002 /* VFPv3 */
+#define WELS_CPU_NEON 0x000004 /* NEON */
+
/*
* Interfaces for CPU core feature detection as below
*/
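
The new bits are intended to be tested by callers when choosing between the scalar and NEON kernels. A hedged gating sketch (the function-pointer typedef and the _c fallback name are assumptions, not code introduced by this patch):

    #include <stdint.h>
    #include "cpu_core.h"   // WELS_CPU_NEON

    typedef void (*PLumaDeblockLt4VFunc) (uint8_t*, int32_t, int32_t, int32_t, uint8_t*);

    extern "C" void DeblockLumaLt4V_neon (uint8_t*, int32_t, int32_t, int32_t, uint8_t*);
    void DeblockLumaLt4V_c (uint8_t*, int32_t, int32_t, int32_t, uint8_t*);  // assumed scalar fallback

    // Pick the NEON path only when the detected feature bits include WELS_CPU_NEON.
    static PLumaDeblockLt4VFunc SelectLumaLt4V (uint32_t uiCpuFlags) {
      return (uiCpuFlags & WELS_CPU_NEON) ? DeblockLumaLt4V_neon : DeblockLumaLt4V_c;
    }
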
--- /dev/null
+++ b/codec/common/deblocking_neon.S
@@ -1,0 +1,1001 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON
+.text
+
+#include "arm_arch_common_macro.S"
+
+#ifdef APPLE_IOS
+.macro JMP_IF_128BITS_IS_ZERO
+// {
+ vorr.s16 $2, $0, $1
+ vmov r3, r2, $2
+ orr r3, r3, r2
+ cmp r3, #0
+// }
+.endm
+
+// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+.macro MASK_MATRIX
+// { input: p1, p0, q0, q1, alpha (will be modified), beta; output: mask
+ vabd.u8 $6, $1, $2 // abs( p0 - q0 )
+ vcgt.u8 $6, $4, $6 // mask = abs( p0 - q0 ) < alpha
+
+ vabd.u8 $4, $0, $1 // abs( p1 - p0 )
+ vclt.u8 $4, $4, $5 // abs( p1 - p0 ) < beta
+ vand.u8 $6, $6, $4 // 2nd mask &
+
+ vabd.u8 $4, $3, $2 // abs( q1 - q0 )
+ vclt.u8 $4, $4, $5 // abs( q1 - q0 ) < beta
+ vand.u8 $6, $6, $4 // 3rd mask &
+// }
+.endm
+
+//if( abs( p2 - p0 ) < beta )
+//{
+// pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1)>> 1)) >> 1) - p1, -tc0[i], tc0[i] );
+// tc++;
+//}
+.macro DIFF_LUMA_LT4_P1_Q1
+// { input: p2, p1, p0, q0, beta, -tc0[i], tc0[i], mask_matrix; output: _clip3(p1'), tc++;
+ vabd.u8 $9, $0, $2 // abs( p2 - p0 )
+ vclt.u8 $9, $9, $4 // abs( p2 - p0 ) < beta
+ vrhadd.u8 $8, $2, $3 // ((p0 + q0 + 1)>> 1)
+ vhadd.u8 $8, $0, $8 // (( p2 + ((p0 + q0 + 1)>> 1)) >> 1)
+ vsub.s8 $8, $8, $1 // (( p2 + ((p0 + q0 + 1)>> 1)) >> 1) - p1
+ vmax.s8 $8, $8, $5 // >= -tc0[i]
+ vmin.s8 $8, $8, $6 // <= tc0[i]
+ vand.s8 $8, $8, $9 // mask: apply _clip3 only where abs( p2 - p0 ) < beta
+ vand.s8 $8, $8, $7
+ vadd.u8 $8, $1, $8
+ vabs.s8 $9, $9 // if( abs( p2 - p0 ) < beta ) tc++;
+// }
+.endm
+
+//delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3,-tc, tc );
+.macro DIFF_LUMA_LT4_P0_Q0
+// { input: p1, p0, q0, q1; output: _clip3(p0'); working q12,q13
+ vsubl.u8 $5, $0, $3 // (p1 - q1)
+ vsubl.u8 $6, $2, $1 // (q0 - p0)
+ vshl.s16 $6, $6, #2
+ vadd.s16 $5, $5, $6 // (p1 - q1) += ( q0 - p0 ) <<2
+ vrshrn.s16 $4, $5, #3
+// }
+.endm
+
+//if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
+//{
+// const int p3 = pix[-4*xstride];
+// pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+// pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+// pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+//}
+//else /* p0' */
+// pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+.macro DIFF_LUMA_EQ4_P2P1P0
+// { input: p3(output p1'), p2, p1, p0, q0, q1, select_matrix(output p0'), output p2';
+// working q4~q5; after filtering, p3/p2 are no longer needed
+ vaddl.u8 q4, $1, $2 // (p2 + p1)
+ vaddl.u8 q5, $3, $4 // (p0 + q0)
+ vadd.u16 q5, q4, q5 // p1'=(p2 + p1)+(p0 + q0)
+
+ vaddl.u8 q4, $0, $1 // (p3 + p2)
+ vshl.u16 q4, q4, #1
+ vadd.u16 q4, q5, q4 // p2'=2*(p3 + p2)+(p2 + p1)+(p0 + q0)
+
+ vrshrn.u16 $0, q5, #2 // p1', prev p3 useless now
+ vrshrn.u16 $7, q4, #3 // p2'
+
+ vshl.u16 q5, q5, #1 // ((p2 + p1)+(p0 + q0))*2
+ vsubl.u8 q4, $5, $1 // (q1 - p2)
+ vadd.u16 q5, q4,q5 // 5tags p0'=(q1 - p2)+((p2 + p1)+(p0 + q0))*2
+
+ vaddl.u8 q4, $2, $5 // (p1 + q1)
+ vaddw.u8 q4, q4, $2
+ vaddw.u8 q4, q4, $3 // 3tags p0'=2*p1+(p0 + q1)
+
+ vrshrn.u16 d10,q5, #3 // 5tags
+ vrshrn.u16 d8, q4, #2 // 3tags
+ vbsl.u8 $6, d10, d8 // p0'
+// }
+.endm
+
+.macro DIFF_LUMA_EQ4_MASK
+// { input: px', px, mask_matrix; working q4
+ vmov $3, $2
+ vbsl.u8 $3, $0, $1
+// }
+.endm
+
+// ( (p1 << 1) + p0 + q1 + 2 ) >> 2
+.macro DIFF_CHROMA_EQ4_P0Q0
+// { input: p1, p0, q0, q1; working q4/q5/q6; output: p0'_d, q0'_d
+ vaddl.u8 $4, $0, $3 // (p1 + q1)
+ vaddw.u8 $5, $4, $1
+ vaddw.u8 $6, $4, $2
+ vaddw.u8 $5, $5, $0 // p0'=(p1 + q1)+(p0+p1)
+// vaddw.u8 $6, $4, $2
+ vaddw.u8 $6, $6, $3 // q0'=(p1 + q1)+(q0+q1)
+ vrshrn.u16 $7, $5, #2
+ vrshrn.u16 $8, $6, #2
+// }
+.endm
+
+.macro LORD_CHROMA_DATA_4
+// { input: 4xCb_addr, 4xCr_addr, working r0~r2
+ vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2 // Cb
+ vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2 // Cr
+// }
+.endm
+
+.macro STORE_CHROMA_DATA_4
+// { input: 4xCb_addr, 4xCr_addr, working r0~r2
+ vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2 // Cb
+ vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2 // Cr
+// }
+.endm
+
+.macro LORD_LUMA_DATA_3
+// { input: 3xluma_addr, working r0~r2
+ vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1 // 0::pix[-3];1::pix[-2];2::pix[-1];
+ vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1 // 3::pix[0]; 4::pix[1]; 5::pix[2];
+// }
+.endm
+
+.macro STORE_LUMA_DATA_4
+// { input: 4xluma, working r0~r2
+ vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1 // 0::pix[-2];1::pix[-1];2::pix[0]; 3::pix[1]
+ vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
+// }
+.endm
+
+.macro LORD_LUMA_DATA_4
+// { input: 4xluma_addr, working r0r1r3
+ vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r3], r1 // 0::pix[-4];1::pix[-3];2::pix[-2];3::pix[-1]
+ vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r0], r1 // 4::pix[0]; 5::pix[1]; 6::pix[2]; 7::pix[3];
+// }
+.endm
+
+.macro STORE_LUMA_DATA_3
+// { input: 3xluma_addr, working r0~r2
+ vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1 // 0::pix[-3];1::pix[-2];2::pix[-1];
+ vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1 // 3::pix[0]; 4::pix[1]; 5::pix[2];
+// }
+.endm
+
+.macro EXTRACT_DELTA_INTO_TWO_PART
+// { input: delta (output abs minus part), working (output plus part)
+ vcge.s8 $1, $0, #0
+ vand $1, $0, $1 // select original (+part)
+ vsub.s8 $0, $1, $0 // select original -(-part)
+// }
+.endm
+#else
+.macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
+// {
+ vorr.s16 \arg2, \arg0, \arg1
+ vmov r3, r2, \arg2
+ orr r3, r3, r2
+ cmp r3, #0
+// }
+.endm
+
+// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
+// { input: p1, p0, q0, q1, alpha (will be modified), beta; output: mask
+ vabd.u8 \arg6, \arg1, \arg2 // abs( p0 - q0 )
+ vcgt.u8 \arg6, \arg4, \arg6 // mask = abs( p0 - q0 ) < alpha
+
+ vabd.u8 \arg4, \arg0, \arg1 // abs( p1 - p0 )
+ vclt.u8 \arg4, \arg4, \arg5 // abs( p1 - p0 ) < beta
+ vand.u8 \arg6, \arg6, \arg4 // 2nd mask &
+
+ vabd.u8 \arg4, \arg3, \arg2 // abs( q1 - q0 )
+ vclt.u8 \arg4, \arg4, \arg5 // abs( q1 - q0 ) < beta
+ vand.u8 \arg6, \arg6, \arg4 // 3rd mask &
+// }
+.endm
+
+//if( abs( p2 - p0 ) < beta )
+//{
+// pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1)>> 1)) >> 1) - p1, -tc0[i], tc0[i] );
+// tc++;
+//}
+.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+// { input: p2, p1, p0, q0, beta, -tc0[i], tc0[i], mask_matrix; output: _clip3(p1'), tc++;
+ vabd.u8 \arg9, \arg0, \arg2 // abs( p2 - p0 )
+ vclt.u8 \arg9, \arg9, \arg4 // abs( p2 - p0 ) < beta
+ vrhadd.u8 \arg8, \arg2, \arg3 // ((p0 + q0 + 1)>> 1)
+ vhadd.u8 \arg8, \arg0, \arg8 // (( p2 + ((p0 + q0 + 1)>> 1)) >> 1)
+ vsub.s8 \arg8, \arg8, \arg1 // (( p2 + ((p0 + q0 + 1)>> 1)) >> 1) - p1
+ vmax.s8 \arg8, \arg8, \arg5 // >= -tc0[i]
+ vmin.s8 \arg8, \arg8, \arg6 // <= tc0[i]
+ vand.s8 \arg8, \arg8, \arg9 // mask: apply _clip3 only where abs( p2 - p0 ) < beta
+ vand.s8 \arg8, \arg8, \arg7
+ vadd.u8 \arg8, \arg1, \arg8
+ vabs.s8 \arg9, \arg9 // if( abs( p2 - p0 ) < beta ) tc++;
+// }
+.endm
+
+//delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3,-tc, tc );
+.macro DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+// { input: p1, p0, q0, q1; output: _clip3(p0'); working q12,q13
+ vsubl.u8 \arg5, \arg0, \arg3 // (p1 - q1)
+ vsubl.u8 \arg6, \arg2, \arg1 // (q0 - p0)
+ vshl.s16 \arg6, \arg6, #2
+ vadd.s16 \arg5, \arg5, \arg6 // (p1 - q1) += ( q0 - p0 ) <<2
+ vrshrn.s16 \arg4, \arg5, #3
+// }
+.endm
+
+//if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
+//{
+// const int p3 = pix[-4*xstride];
+// pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+// pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+// pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+//}
+//else /* p0' */
+// pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+.macro DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+// { input: p3(output p1'), p2, p1, p0, q0, q1, select_matrix(output p0'), output p2';
+// working q4~q5; after filtering, p3/p2 are no longer needed
+ vaddl.u8 q4, \arg1, \arg2 // (p2 + p1)
+ vaddl.u8 q5, \arg3, \arg4 // (p0 + q0)
+ vadd.u16 q5, q4, q5 // p1'=(p2 + p1)+(p0 + q0)
+
+ vaddl.u8 q4, \arg0, \arg1 // (p3 + p2)
+ vshl.u16 q4, q4, #1
+ vadd.u16 q4, q5, q4 // p2'=2*(p3 + p2)+(p2 + p1)+(p0 + q0)
+
+ vrshrn.u16 \arg0, q5, #2 // p1', prev p3 useless now
+ vrshrn.u16 \arg7, q4, #3 // p2'
+
+ vshl.u16 q5, q5, #1 // ((p2 + p1)+(p0 + q0))*2
+ vsubl.u8 q4, \arg5, \arg1 // (q1 - p2)
+ vadd.u16 q5, q4,q5 // 5tags p0'=(q1 - p2)+((p2 + p1)+(p0 + q0))*2
+
+ vaddl.u8 q4, \arg2, \arg5 // (p1 + q1)
+ vaddw.u8 q4, q4, \arg2
+ vaddw.u8 q4, q4, \arg3 // 3tags p0'=2*p1+(p0 + q1)
+
+ vrshrn.u16 d10,q5, #3 // 5tags
+ vrshrn.u16 d8, q4, #2 // 3tags
+ vbsl.u8 \arg6, d10, d8 // p0'
+// }
+.endm
+
+.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
+// { input: px', px, mask_matrix; working q4
+ vmov \arg3, \arg2
+ vbsl.u8 \arg3, \arg0, \arg1
+// }
+.endm
+
+// ( (p1 << 1) + p0 + q1 + 2 ) >> 2
+.macro DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { input: p1, p0, q0, q1; working q4/q5/q6; output: p0'_d, q0'_d
+ vaddl.u8 \arg4, \arg0, \arg3 // (p1 + q1)
+ vaddw.u8 \arg5, \arg4, \arg1
+ vaddw.u8 \arg6, \arg4, \arg2
+ vaddw.u8 \arg5, \arg5, \arg0 // p0'=(p1 + q1)+(p0+p1)
+// vaddw.u8 \arg6, \arg4, \arg2
+ vaddw.u8 \arg6, \arg6, \arg3 // q0'=(p1 + q1)+(q0+q1)
+ vrshrn.u16 \arg7, \arg5, #2
+ vrshrn.u16 \arg8, \arg6, #2
+// }
+.endm
+
+.macro LORD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { input: 4xCb_addr, 4xCr_addr, working r0~r2
+ vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2 // Cb
+ vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2 // Cr
+// }
+.endm
+
+.macro STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { input: 4xCb_addr, 4xCr_addr, working r0~r2
+ vst4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2 // Cb
+ vst4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2 // Cr
+// }
+.endm
+
+.macro LORD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+// { input: 3xluma_addr, working r0~r2
+ vld3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1 // 0::pix[-3];1::pix[-2];2::pix[-1];
+ vld3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1 // 3::pix[0]; 4::pix[1]; 5::pix[2];
+// }
+.endm
+
+.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
+// { input: 4xluma, working r0~r2
+ vst4.u8 {\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1 // 0::pix[-2];1::pix[-1];2::pix[0]; 3::pix[1]
+ vst4.u8 {\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
+// }
+.endm
+
+.macro LORD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { input: 4xluma_addr, working r0r1r3
+ vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r3], r1 // 0::pix[-4];1::pix[-3];2::pix[-2];3::pix[-1]
+ vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r0], r1 // 4::pix[0]; 5::pix[1]; 6::pix[2]; 7::pix[3];
+// }
+.endm
+
+.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+// { input: 3xluma_addr, working r0~r2
+ vst3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1 // 0::pix[-3];1::pix[-2];2::pix[-1];
+ vst3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1 // 3::pix[0]; 4::pix[1]; 5::pix[2];
+// }
+.endm
+
+.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
+// { input: delta (output abs minus part), working (output plus part)
+ vcge.s8 \arg1, \arg0, #0
+ vand \arg1, \arg0, \arg1 // select original (+part)
+ vsub.s8 \arg0, \arg1, \arg0 // select original -(-part)
+// }
+.endm
+
+#endif
+//uint8_t *pix, int32_t stride, int32_t alpha, int32_t beta, uint8_t *tc
+ WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon
+
+ vdup.u8 q11, r2 // alpha [0~255]
+ vdup.u8 q9, r3 // q9:: beta [0~18]
+
+ add r2, r1, r1, lsl #1
+ sub r2, r0, r2 // pix -= 3*src_stride
+ vld1.u8 {q0}, [r2], r1 // q0::p2 = pix[-3*xstride];
+ vld1.u8 {q3}, [r0], r1 // q3::q0 = pix[ 0*xstride];
+ vld1.u8 {q1}, [r2], r1 // q1::p1 = pix[-2*xstride];
+ vld1.u8 {q4}, [r0], r1 // q4::q1 = pix[ 1*xstride];
+ vld1.u8 {q2}, [r2] // q2::p0 = pix[-1*xstride];
+ vld1.u8 {q5}, [r0] // q5::q2 = pix[ 2*xstride];
+ sub r2, r2, r1 // r2 = pix-2*xstride
+
+// if( tc0[i] < 0 ) continue; else filter
+ ldr r3, [sp, #0]
+ vld1.s8 {d31}, [r3] // load 4 tc0[i]
+ vdup.s8 d28, d31[0]
+ vdup.s8 d30, d31[1]
+ vdup.s8 d29, d31[2]
+ vdup.s8 d31, d31[3]
+ vtrn.32 d28, d30
+ vtrn.32 d29, d31 // q14::each 32 bits is 4x tc0[i]
+ vcge.s8 q10, q14, #0 // q10::tc0[i] >= 0
+
+// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ MASK_MATRIX q1, q2, q3, q4, q11, q9, q15 // q15::mask matrix
+ vand.u8 q10, q10, q15 // two mask
+// JMP_IF_128BITS_IS_ZERO d20, d21, d31
+// beq lt4_end
+
+ veor q15, q15
+ vsub.i8 q15,q15,q14 // q15::4x -tc0[i], min
+
+// input: p2, p1, p0, q0, beta, -tc0[i], tc0[i]; mask_matrix, output: _clip3(p1'), tc++;
+ DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12 // q6 = _clip3(p1')
+ vst1.u8 {q6}, [r2], r1
+
+ DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13 // q7 = _clip3(q1')
+
+ vabs.s8 q12, q12
+ vabs.s8 q13, q13 // if( abs( p2 - p0 ) < beta ) tc++;
+ vadd.u8 q14,q14,q12
+ vadd.u8 q14,q14,q13 // updated tc
+ veor q15, q15
+ vsub.i8 q15,q15,q14 // updated -tc
+
+// input: p1, p0, q0, q1; output: _clip3(p0'); working q12,q13
+ DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
+ DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13 //q8::delta
+ vmax.s8 q8, q8, q15 // >= -tc0[i]
+ vmin.s8 q8, q8, q14 // <= tc0[i]
+ vand.s8 q8, q8, q10
+ EXTRACT_DELTA_INTO_TWO_PART q8, q9
+ vqadd.u8 q2, q2, q9 // clip_uint8( p0 + [+delta] ); p0'
+ vqsub.u8 q2, q2, q8 // clip_uint8( p0 - [-delta] ); p0'
+ vst1.u8 {q2}, [r2], r1
+ vqsub.u8 q3, q3, q9 // clip_uint8( q0 - [+delta] ); q0'
+ vqadd.u8 q3, q3, q8 // clip_uint8( q0 + [-delta] ); q0'
+ vst1.u8 {q3}, [r2] , r1
+ vst1.u8 {q7}, [r2]
+
+//lt4_end:
+ WELS_ASM_FUNC_END
+
+//uint8_t *pix, int32_t stride, int32_t alpha, int32_t beta
+ WELS_ASM_FUNC_BEGIN DeblockLumaEq4V_neon
+
+ vdup.u8 q5, r2 // alpha [0~255]
+ vdup.u8 q4, r3 // beta [0~18]
+
+ sub r3, r0, r1, lsl #2 // pix -= 4*src_stride
+ vld1.u8 {q8}, [r3], r1 // q8::p3 = pix[-4*xstride];
+ vld1.u8 {q12}, [r0], r1 // q12::q0 = pix[ 0*xstride];
+ vld1.u8 {q9}, [r3], r1 // q9::p2 = pix[-3*xstride];
+ vld1.u8 {q13}, [r0], r1 // q13::q1 = pix[ 1*xstride];
+ vld1.u8 {q10}, [r3], r1 // q10::p1 = pix[-2*xstride];
+ vld1.u8 {q14}, [r0], r1 // q14::q2 = pix[ 2*xstride];
+ vld1.u8 {q11}, [r3] // q11::p0 = pix[-1*xstride];
+ vld1.u8 {q15}, [r0] // q15::q3 = pix[ 3*xstride];
+ sub r3, r3, r1 , lsl #1 // r3 = pix-3*xstride
+
+// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ MASK_MATRIX q10, q11, q12, q13, q5, q4, q6 // q6::mask matrix
+// JMP_IF_128BITS_IS_ZERO d12, d13, d0
+// beq eq4_end
+
+// if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
+ mov r2, r2, lsr #2
+ add r2, r2, #2
+ vdup.u8 q5, r2
+ vabd.u8 q0, q11, q12
+ vclt.u8 q7, q0, q5 // q7::indicate
+// if( abs( p2 - p0 ) < beta )
+ vabd.u8 q1, q9, q11
+ vclt.u8 q1, q1, q4
+ vand.s8 q1, q1, q7 // q1::indicate [p0', p1', p2'] or [p0']
+// if( abs( q2 - q0 ) < beta )
+ vabd.u8 q2, q14,q12
+ vclt.u8 q2, q2, q4
+ vand.s8 q2, q2, q7 // q2::indicate [q0', q1', q2'] or [q0']
+ vand.u8 q7, q7, q6
+
+ vmov q3, q1
+// input: p3(output p1'), p2, p1, p0, q0, q1, select_matrix(output p0'), output p2';
+// working q4~q5; after filtering, p3/p2 are no longer needed
+ DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
+ DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
+
+// q1(p0') q2(q0') only need ::if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+// q0(p2') q8(p1') q15(q1') q3(q2'); need more &&if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )&&if( abs( p2 - p0 ) < beta )
+ vand.u8 q3, q7, q3
+ DIFF_LUMA_EQ4_MASK q0, q9, q3, q4
+ vst1.u8 {q4}, [r3], r1
+ DIFF_LUMA_EQ4_MASK q8,q10, q3, q4
+ vst1.u8 {q4}, [r3], r1
+ DIFF_LUMA_EQ4_MASK q1,q11, q6, q4
+ vst1.u8 {q4}, [r3], r1
+
+ vmov q0, q2
+ DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d6
+ DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d7
+
+ vand.u8 q0, q7, q0
+ DIFF_LUMA_EQ4_MASK q2, q12, q6, q4
+ vst1.u8 {q4}, [r3], r1
+ DIFF_LUMA_EQ4_MASK q15, q13, q0, q4
+ vst1.u8 {q4}, [r3], r1
+ DIFF_LUMA_EQ4_MASK q3, q14, q0, q4
+ vst1.u8 {q4}, [r3], r1
+
+//eq4_end:
+ WELS_ASM_FUNC_END
+
+
+
+//uint8_t *pix, int32_t stride, int32_t alpha, int32_t beta, uint8_t *tc
+ WELS_ASM_FUNC_BEGIN DeblockLumaLt4H_neon
+
+ vdup.u8 q11, r2 // alpha [0~255]
+ vdup.u8 q9, r3 // q9:: beta [0~18]
+
+ sub r2, r0, #3 // pix -= 3
+ LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 0
+ LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 1
+ LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 2
+ LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 3
+ LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 4
+ LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 5
+ LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 6
+ LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 7
+
+ LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 0
+ LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 1
+ LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 2
+ LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 3
+ LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 4
+ LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 5
+ LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 6
+ LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 7
+// d0d1d2d6d7d8+d3d4d5d9d10d11
+ vswp d1, d2
+ vswp d3, d4
+ vswp d1, d4
+ vswp d7, d8
+ vswp d9, d10
+ vswp d7, d10
+// q0::p2 = pix[-3*xstride];
+// q1::p1 = pix[-2*xstride];
+// q2::p0 = pix[-1*xstride];
+// q3::q0 = pix[ 0*xstride];
+// q4::q1 = pix[ 1*xstride];
+// q5::q2 = pix[ 2*xstride];
+ sub r0, r0, r1, lsl #4 // pix -= 16*src_stride
+
+// if( tc0[i] < 0 ) continue; else filter
+ ldr r3, [sp, #0]
+ vld1.s8 {d31}, [r3] // load 4 tc0[i]
+ vdup.s8 d28, d31[0]
+ vdup.s8 d30, d31[1]
+ vdup.s8 d29, d31[2]
+ vdup.s8 d31, d31[3]
+ vtrn.32 d28, d30
+ vtrn.32 d29, d31 // q14::each 32 bits is 4x tc0[i]
+ vcge.s8 q10, q14, #0 // q10::tc0[i] >= 0
+
+// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ MASK_MATRIX q1, q2, q3, q4, q11, q9, q15 // q15::mask matrix
+ vand.u8 q10, q10, q15 // two mask
+// JMP_IF_128BITS_IS_ZERO d20, d21, d31
+// beq lt4_end
+
+ veor q15, q15
+ vsub.i8 q15,q15,q14 // q15::4x -tc0[i], min
+
+// input: p2, p1, p0, q0, beta, -tc0[i], tc0[i]; mask_matrix, output: _clip3(p1'), tc++;
+ DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12 // q6 = _clip3(p1')
+
+ DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13 // q7 = _clip3(q1')
+
+ vabs.s8 q12, q12
+ vabs.s8 q13, q13 // if( abs( p2 - p0 ) < beta ) tc++;
+ vadd.u8 q14,q14,q12
+ vadd.u8 q14,q14,q13 // updated tc
+ veor q15, q15
+ vsub.i8 q15,q15,q14 // updated -tc
+
+// input: p1, p0, q0, q1; output: _clip3(p0'); working q12,q13
+ DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
+ DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13 //q8::delta
+ vmax.s8 q8, q8, q15 // >= -tc0[i]
+ vmin.s8 q8, q8, q14 // <= tc0[i]
+ vand.s8 q8, q8, q10
+ EXTRACT_DELTA_INTO_TWO_PART q8, q9
+ vqadd.u8 q2, q2, q9 // clip_uint8( p0 + [+delta] ); p0'
+ vqsub.u8 q2, q2, q8 // clip_uint8( p0 - [-delta] ); p0'
+
+ vqsub.u8 q3, q3, q9 // clip_uint8( q0 - [+delta] ); q0'
+ vqadd.u8 q3, q3, q8 // clip_uint8( q0 + [-delta] ); q0'
+
+ sub r0, #2
+ add r2, r0, r1
+ lsl r1, #1
+
+ vmov q1, q6
+ vmov q4, q7
+// q1,q2,q3,q4
+ vswp q2, q3
+ vswp d3, d6
+ vswp d5, d8
+// d2~d5, d6~d7
+ STORE_LUMA_DATA_4 d2, d3, d4, d5, 0, 1
+ STORE_LUMA_DATA_4 d2, d3, d4, d5, 2, 3
+ STORE_LUMA_DATA_4 d2, d3, d4, d5, 4, 5
+ STORE_LUMA_DATA_4 d2, d3, d4, d5, 6, 7
+
+ STORE_LUMA_DATA_4 d6, d7, d8, d9, 0, 1
+ STORE_LUMA_DATA_4 d6, d7, d8, d9, 2, 3
+ STORE_LUMA_DATA_4 d6, d7, d8, d9, 4, 5
+ STORE_LUMA_DATA_4 d6, d7, d8, d9, 6, 7
+//lt4_end:
+ WELS_ASM_FUNC_END
+
+
+//uint8_t *pix, int32_t stride, int32_t alpha, int32_t beta
+ WELS_ASM_FUNC_BEGIN DeblockLumaEq4H_neon
+
+ vdup.u8 q5, r2 // alpha [0~255]
+ vdup.u8 q4, r3 // beta [0~18]
+
+ sub r3, r0, #4 // pix -= 4
+ LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,0
+ LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,1
+ LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,2
+ LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,3
+ LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,4
+ LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,5
+ LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,6
+ LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,7
+
+ LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,0
+ LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,1
+ LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,2
+ LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,3
+ LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,4
+ LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,5
+ LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,6
+ LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,7
+
+ vswp q9, q10
+ vswp d17,d18
+ vswp d21,d22
+ vswp q13,q14
+ vswp d25,d26
+ vswp d29,d30
+ sub r0, r0, r1 , lsl #4 // r0 -= 16*xstride
+// q8::p3 = pix[-4*xstride];
+// q9::p2 = pix[-3*xstride];
+// q10::p1 = pix[-2*xstride];
+// q11::p0 = pix[-1*xstride];
+// q12::q0 = pix[ 0*xstride];
+// q13::q1 = pix[ 1*xstride];
+// q14::q2 = pix[ 2*xstride];
+// q15::q3 = pix[ 3*xstride];
+
+// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ MASK_MATRIX q10, q11, q12, q13, q5, q4, q6 // q6::mask matrix
+// JMP_IF_128BITS_IS_ZERO d12, d13, d0
+// beq eq4_end
+
+// if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
+ mov r2, r2, lsr #2
+ add r2, r2, #2
+ vdup.u8 q5, r2
+ vabd.u8 q0, q11, q12
+ vclt.u8 q7, q0, q5 // q7::indicate
+// if( abs( p2 - p0 ) < beta )
+ vabd.u8 q1, q9, q11
+ vclt.u8 q1, q1, q4
+ vand.s8 q1, q1, q7 // q1::indicate [p0', p1', p2'] or [p0']
+// if( abs( q2 - q0 ) < beta )
+ vabd.u8 q2, q14,q12
+ vclt.u8 q2, q2, q4
+ vand.s8 q2, q2, q7 // q2::indicate [q0', q1', q2'] or [q0']
+ vand.u8 q7, q7, q6
+
+ vmov q3, q1
+// input: p3(output p1'), p2, p1, p0, q0, q1, select_matrix(output p0'), output p2';
+// working q4~q5; after filtering, p3/p2 are no longer needed
+ DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
+ DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
+
+// q1(p0') q2(q0') only need ::if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+// q0(p2') q8(p1') q15(q1') q3(q2'); need more &&if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )&&if( abs( p2 - p0 ) < beta )
+ vand.u8 q3, q7, q3
+ DIFF_LUMA_EQ4_MASK q0, q9, q3, q4 // p2'
+ vmov q9, q4
+
+// DIFF_LUMA_EQ4_MASK q8,q10, q3, q4 // p1'
+ vbsl.u8 q3, q8, q10
+
+ DIFF_LUMA_EQ4_MASK q1,q11, q6, q8 // p0'
+
+ vand.u8 q7, q7, q2
+// input: q3(output q1'), q2, q1, q0, p0, p1, select_matrix(output q0'), output q2';
+// working q4~q5; after filtering, q3/q2 are no longer needed
+ DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d0
+ DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d1
+
+// DIFF_LUMA_EQ4_MASK q2, q12, q6, q4
+ vbsl.u8 q6, q2, q12
+
+ DIFF_LUMA_EQ4_MASK q15, q13, q7, q4
+
+// DIFF_LUMA_EQ4_MASK q0, q14, q7, q4
+ vbsl.u8 q7, q0, q14
+
+// q9,q3,q8,q6,q4,q7
+ vmov q5, q6
+ vmov q2, q9
+ vmov q6, q4
+ vmov q4, q8
+// q2,q3,q4,q5,q6,q7
+
+ vswp d8, d6
+ vswp d5, d7
+ vswp d5, d8
+ vswp d14, d12
+ vswp d11, d13
+ vswp d11, d14
+
+ sub r3, r0, #3
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,0
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,1
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,2
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,3
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,4
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,5
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,6
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,7
+
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,0
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,1
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,2
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,3
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,4
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,5
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,6
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,7
+
+//eq4_end:
+ WELS_ASM_FUNC_END
+
+//uint8_t *pix_cb, uint8_t *pix_cr, int32_t stride, int32_t alpha, int32_t beta, uint8_t *tc
+ WELS_ASM_FUNC_BEGIN DeblockChromaLt4V_neon
+
+ vdup.u8 q11, r3 // alpha [0~255]
+ ldr r3, [sp, #0]
+
+ sub r0, r0, r2 , lsl #1 // pix -= 2*src_stride
+ sub r1, r1, r2, lsl #1
+ vdup.u8 q9, r3 // q9:: beta [0~18]
+ ldr r3, [sp, #4]
+
+ vld1.u8 {d0}, [r0], r2 // q0::p1
+ vld1.u8 {d1}, [r1], r2
+ vld1.u8 {d2}, [r0], r2 // q1::p0
+ vld1.u8 {d3}, [r1], r2
+ vld1.u8 {d4}, [r0], r2 // q2::q0
+ vld1.u8 {d5}, [r1], r2
+ vld1.u8 {d6}, [r0] // q3::q1
+ vld1.u8 {d7}, [r1]
+
+ sub r0, r0, r2, lsl #1 // pix = [-1*src_stride]
+ sub r1, r1, r2, lsl #1
+// if( tc0[i] < 0 ) continue; else filter
+ vld1.s8 {d15}, [r3] // load 4 tc0[i], each tc0[i] 2 bytes; d[x] Cb && d[x+1] Cr
+ vmovl.u8 q6, d15
+ vshl.u64 d13,d12,#8
+ vorr d12,d13
+ vmov d13, d12 // q6::each 64 bits is 2x tc0[i]
+ veor q7, q7
+ vsub.i8 q7,q7,q6 // q7::4x -tc0[i], min
+
+// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ MASK_MATRIX q0, q1, q2, q3, q11, q9, q5 // q5::mask matrix
+// JMP_IF_128BITS_IS_ZERO d20, d21, d31
+// beq lt4_end
+
+
+// input: p1, p0, q0, q1; output: _clip3(p0'); working q12,q13
+ DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d8, q12, q13
+ DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d9, q12, q13 //q4::delta
+ vmax.s8 q4, q4, q7 // >= -tc0[i]
+ vmin.s8 q4, q4, q6 // <= tc0[i]
+
+ vand.s8 q4, q4, q5
+ vcge.s8 q6, q6, #0 // q6::tc0[i] >= 0
+ vand.s8 q4, q4, q6
+ EXTRACT_DELTA_INTO_TWO_PART q4, q5
+ vqadd.u8 q1, q1, q5 // clip_uint8( p0 + [+delta] ); p0'
+ vqsub.u8 q1, q1, q4 // clip_uint8( p0 - [-delta] ); p0'
+ vst1.u8 {d2}, [r0], r2
+ vst1.u8 {d3}, [r1], r2
+ vqsub.u8 q2, q2, q5 // clip_uint8( q0 - [+delta] ); q0'
+ vqadd.u8 q2, q2, q4 // clip_uint8( q0 + [-delta] ); q0'
+ vst1.u8 {d4}, [r0]
+ vst1.u8 {d5}, [r1]
+
+//lt4_end:
+ WELS_ASM_FUNC_END
+
+// uint8_t *pix_cb, uint8_t *pix_cr, int32_t stride, int32_t alpha, int32_t beta
+ WELS_ASM_FUNC_BEGIN DeblockChromaEq4V_neon
+
+ vdup.u8 q11, r3 // alpha [0~255]
+ ldr r3, [sp, #0]
+
+ sub r0, r0, r2 , lsl #1 // pix -= 2*src_stride
+ sub r1, r1, r2, lsl #1
+ vdup.u8 q9, r3 // q9:: beta [0~18]
+
+ vld1.u8 {d0}, [r0], r2 // q0::p1
+ vld1.u8 {d1}, [r1], r2
+ vld1.u8 {d2}, [r0], r2 // q1::p0
+ vld1.u8 {d3}, [r1], r2
+ vld1.u8 {d4}, [r0], r2 // q2::q0
+ vld1.u8 {d5}, [r1], r2
+ vld1.u8 {d6}, [r0] // q3::q1
+ vld1.u8 {d7}, [r1]
+
+ sub r0, r0, r2, lsl #1 // pix = [-1*src_stride]
+ sub r1, r1, r2, lsl #1
+
+// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ MASK_MATRIX q0, q1, q2, q3, q11, q9, q10 // q10::mask matrix, d20:Cb d21:Cr
+// JMP_IF_128BITS_IS_ZERO d20, d21, d31
+// beq eq4_end
+ vmov q11, q10
+
+// ( (p1 << 1) + p0 + q1 + 2 ) >> 2
+// ( (q1 << 1) + q0 + p1 + 2 ) >> 2
+ DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q4, q5, q6, d14, d0 // Cb::p0' q0'
+ DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q12, q13, q14, d15, d1 // Cr::p0' q0'
+
+ vbsl.u8 q10, q7, q1 // p0'
+ vst1.u8 {d20}, [r0], r2
+ vst1.u8 {d21}, [r1], r2
+
+ vbsl.u8 q11, q0, q2 // q0'
+ vst1.u8 {d22}, [r0]
+ vst1.u8 {d23}, [r1]
+
+//eq4_end:
+ WELS_ASM_FUNC_END
+
+//uint8_t *pix_cb, uint8_t *pix_cr, int32_t stride, int32_t alpha, int32_t beta, uint8_t *tc
+ WELS_ASM_FUNC_BEGIN DeblockChromaLt4H_neon
+
+ vdup.u8 q11, r3 // alpha [0~255]
+ ldr r3, [sp, #0]
+
+ sub r0, r0, #2 // pix [-2]
+ vdup.u8 q9, r3 // q9:: beta [0~18]
+ ldr r3, [sp, #4]
+ sub r1, r1, #2
+ vld1.s8 {d15}, [r3] // load 4 tc0[i], each tc0[i] 2 bytes; d[x] Cb && d[x+1] Cr
+
+ LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+ LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+ LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+ LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+ LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+ LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+ LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+ LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
+// Cb:d0d1d2d3, Cr:d4d5d6d7
+ vswp q1, q2
+ vswp d1, d2
+ vswp d6, d5
+// Cb:d0d2d4d6, Cr:d1d3d5d7
+
+
+// if( tc0[i] < 0 ) continue; else filter
+
+ vmovl.u8 q6, d15
+ vshl.u64 d13,d12,#8
+ vorr d12,d13
+ vmov d13, d12 // q6::each 64 bits is 2x tc0[i]
+ veor q7, q7
+ vsub.i8 q7,q7,q6 // q7::4x -tc0[i], min
+
+// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ MASK_MATRIX q0, q1, q2, q3, q11, q9, q5 // q5::mask matrix
+// JMP_IF_128BITS_IS_ZERO d20, d21, d31
+// beq lt4_end
+
+// input: p1, p0, q0, q1; output: _clip3(p0'); working q12,q13
+ DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d8, q12, q13
+ DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d9, q12, q13 //q4::delta
+ vmax.s8 q4, q4, q7 // >= -tc0[i]
+ vmin.s8 q4, q4, q6 // <= tc0[i]
+
+ vand.s8 q4, q4, q5
+ vcge.s8 q6, q6, #0 // q6::tc0[i] >= 0
+ vand.s8 q4, q4, q6
+ EXTRACT_DELTA_INTO_TWO_PART q4, q5
+ vqadd.u8 q1, q1, q5 // clip_uint8( p0 + [+delta] ); p0'
+ vqsub.u8 q1, q1, q4 // clip_uint8( p0 - [-delta] ); p0'
+ vqsub.u8 q2, q2, q5 // clip_uint8( q0 - [+delta] ); q0'
+ vqadd.u8 q2, q2, q4 // clip_uint8( q0 + [-delta] ); q0'
+
+ sub r0, r0, r2, lsl #3 // pix: 0th row [-2]
+ sub r1, r1, r2, lsl #3
+ vswp d1, d2
+ vswp d6, d5
+ vswp q1, q2
+// Cb:d0d1d2d3, Cr:d4d5d6d7
+
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
+
+//lt4_end:
+ WELS_ASM_FUNC_END
+
+// uint8_t *pix_cb, uint8_t *pix_cr, int32_t stride, int32_t alpha, int32_t beta
+ WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon
+
+ vdup.u8 q11, r3 // alpha [0~255]
+ ldr r3, [sp, #0]
+
+ sub r0, r0, #2 // pix [-2]
+ sub r1, r1, #2
+
+ LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+ LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+ LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+ LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+ LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+ LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+ LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+ LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
+// Cb:d0d1d2d3, Cr:d4d5d6d7
+ vswp q1, q2
+ vswp d1, d2
+ vswp d6, d5
+// Cb:d0d2d4d6, Cr:d1d3d5d7
+
+
+// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
+ vdup.u8 q9, r3 // q9:: beta [0~18]
+ MASK_MATRIX q0, q1, q2, q3, q11, q9, q10 // q10::mask matrix, d20:Cb d21:Cr
+// JMP_IF_128BITS_IS_ZERO d20, d21, d31
+// beq eq4_end
+ vmov q11, q10
+
+// ( (p1 << 1) + p0 + q1 + 2 ) >> 2
+// ( (q1 << 1) + q0 + p1 + 2 ) >> 2
+ DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q8, q9, q12, d8, d10 // Cb::p0' q0'
+ DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q13, q14, q15, d9, d11 // Cr::p0' q0'
+
+ vbsl.u8 q10, q4, q1 // p0'
+ vbsl.u8 q11, q5, q2 // q0'
+// q0 q10 q11 q3
+
+ sub r0, r0, r2, lsl #3 // pix: 0th row [-2]
+ sub r1, r1, r2, lsl #3
+
+ vmov q1, q10
+ vmov q2, q11
+ vswp d1, d2
+ vswp d6, d5
+ vswp q1, q2
+// Cb:d0d1d2d3, Cr:d4d5d6d7
+
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
+
+//eq4_end:
+ WELS_ASM_FUNC_END
+
+
+// r0 int8_t* non_zero_count,
+ WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon
+
+ vld1.64 {d0-d2}, [r0]
+
+ vceq.s8 q0, q0, #0
+ vceq.s8 d2, d2, #0
+ vmvn q0, q0
+ vmvn d2, d2
+ vabs.s8 q0, q0
+ vabs.s8 d2, d2
+
+ vst1.64 {d0-d2}, [r0]
+ WELS_ASM_FUNC_END
+
+#endif
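
For readers following the macro arithmetic above, here is a scalar sketch of the bS<4 vertical luma filter that DeblockLumaLt4V_neon vectorizes 16 columns at a time, reconstructed from the clip3 formulas quoted in the comments. The name DeblockLumaLt4V_ref, the signed reading of pTc (matching the vld1.s8 load), and the 0..255 clipping helper are assumptions, not code from this patch:

    #include <stdint.h>
    #include <stdlib.h>

    static inline int Clip3 (int iV, int iLow, int iHigh) {
      return iV < iLow ? iLow : (iV > iHigh ? iHigh : iV);
    }
    static inline uint8_t ClipUint8 (int iV) {
      return (uint8_t) Clip3 (iV, 0, 255);
    }

    // Scalar reference of the bS<4 vertical-edge luma filter (16 columns).
    void DeblockLumaLt4V_ref (uint8_t* pPix, int32_t iStride, int32_t iAlpha,
                              int32_t iBeta, const int8_t* pTc) {
      for (int i = 0; i < 16; ++i, ++pPix) {
        const int iTc0 = pTc[i >> 2];          // one tc0 value per 4-column group
        if (iTc0 < 0) continue;                // tc0 < 0: leave this column untouched
        const int p2 = pPix[-3 * iStride], p1 = pPix[-2 * iStride], p0 = pPix[-iStride];
        const int q0 = pPix[0], q1 = pPix[iStride], q2 = pPix[2 * iStride];
        if (abs (p0 - q0) >= iAlpha || abs (p1 - p0) >= iBeta || abs (q1 - q0) >= iBeta)
          continue;                            // MASK_MATRIX condition
        int iTc = iTc0;
        if (abs (p2 - p0) < iBeta) {           // DIFF_LUMA_LT4_P1_Q1: p1' and tc++
          pPix[-2 * iStride] = (uint8_t) (p1 + Clip3 (((p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -iTc0, iTc0));
          ++iTc;
        }
        if (abs (q2 - q0) < iBeta) {           // symmetric update of q1, tc++
          pPix[iStride] = (uint8_t) (q1 + Clip3 (((q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -iTc0, iTc0));
          ++iTc;
        }
        // DIFF_LUMA_LT4_P0_Q0 / EXTRACT_DELTA_INTO_TWO_PART: clip delta, saturate p0/q0.
        const int iDelta = Clip3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc, iTc);
        pPix[-iStride] = ClipUint8 (p0 + iDelta);  // p0'
        pPix[0]        = ClipUint8 (q0 - iDelta);  // q0'
      }
    }
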
--- a/codec/decoder/core/arm/arm_arch_common_macro.S
+++ /dev/null
@@ -1,55 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef APPLE_IOS
-
-.macro WELS_ASM_FUNC_BEGIN
-.align 2
-.arm
-.globl _$0
-_$0:
-.endm
-
-#else
-
-.macro WELS_ASM_FUNC_BEGIN funcName
-.align 2
-.arm
-.global \funcName
-\funcName:
-.endm
-
-#endif
-
-.macro WELS_ASM_FUNC_END
-mov pc, lr
-.endm
--- a/codec/decoder/core/arm/block_add_neon.S
+++ b/codec/decoder/core/arm/block_add_neon.S
@@ -34,30 +34,7 @@
.text
#include "arm_arch_common_macro.S"
#ifdef APPLE_IOS
-.macro ORR_32BYTES_TO_8BYTES
-// { // input: smb#a q0 & q1, smb#b q2&q3, q0[0], q0[1], q2[0], q2[1], output: d_0, d_1
- vorr.s16 $0, $1
- vorr.s16 $2, $3
- vorr.s16 $8, $4, $5
- vorr.s16 $9, $6, $7
-// }
-.endm
-.macro ADD_PRED_1BYTE_TO_RESID_2BYTES
-// { // input: q0~q3, d0~d3, output: d0~d3;
-
- vaddw.u8 $0, $4
- vaddw.u8 $1, $5
- vaddw.u8 $2, $6
- vaddw.u8 $3, $7
-
- vqmovun.s16 $4, $0 //saturation
- vqmovun.s16 $6, $2
- vqmovun.s16 $5, $1
- vqmovun.s16 $7, $3
-// }
-.endm
-
.macro ROW_TRANSFORM_1_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
@@ -89,40 +66,8 @@
// }
.endm
-.macro ADD_AND_CLIP_RS
-// { // input: src_q[0]~[1], matrix_max~min, pred_q, working_d[0]:[1], output: q;
- vrshrn.s32 $5, $0, #6
- vrshrn.s32 $6, $1, #6
- vqadd.s16 $7, $4
- vmin.s16 $7, $7, $2
- vmax.s16 $7, $7, $3
-// }
-.endm
#else
-.macro ORR_32BYTES_TO_8BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-// { // input: smb#a q0 & q1, smb#b q2&q3, q0[0], q0[1], q2[0], q2[1], output: d_0, d_1
- vorr.s16 \arg0, \arg1
- vorr.s16 \arg2, \arg3
- vorr.s16 \arg8, \arg4, \arg5
- vorr.s16 \arg9, \arg6, \arg7
-// }
-.endm
-.macro ADD_PRED_1BYTE_TO_RESID_2BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: q0~q3, d0~d3, output: d0~d3;
-
- vaddw.u8 \arg0, \arg4
- vaddw.u8 \arg1, \arg5
- vaddw.u8 \arg2, \arg6
- vaddw.u8 \arg3, \arg7
-
- vqmovun.s16 \arg4, \arg0 //saturation
- vqmovun.s16 \arg6, \arg2
- vqmovun.s16 \arg5, \arg1
- vqmovun.s16 \arg7, \arg3
-// }
-.endm
-
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
@@ -153,16 +98,6 @@
vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
-
-.macro ADD_AND_CLIP_RS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_q[0]~[1], matrix_max~min, pred_q, working_d[0]:[1], output: q;
- vrshrn.s32 \arg5, \arg0, #6
- vrshrn.s32 \arg6, \arg1, #6
- vqadd.s16 \arg7, \arg4
- vmin.s16 \arg7, \arg7, \arg2
- vmax.s16 \arg7, \arg7, \arg3
-// }
-.endm
#endif
// r0 int16_t* block,
// r1 int8_t* non_zero_count,
@@ -180,158 +115,7 @@
vst1.64 {d0-d2}, [r1]
WELS_ASM_FUNC_END
-// r0 int16_t* block,
-// r1 int8_t* non_zero_count,
- WELS_ASM_FUNC_BEGIN svc_non_zero_count_neon
- push {r2-r4}
- mov r4, #3
- mov r3, #64
- add r2, r0, #32
- pld [r0, #512]
-non_zero_count_two_8x8_loop:
- vld1.64 {q0, q1}, [r0,:128], r3
- vld1.64 {q2, q3}, [r2,:128], r3
- vld1.64 {q4, q5}, [r0,:128], r3
- vld1.64 {q6, q7}, [r2,:128], r3
- vld1.64 {q8, q9}, [r0,:128], r3
- vld1.64 {q10, q11}, [r2,:128], r3//load #0 8x8 block resi data,
- vld1.64 {q12, q13}, [r0,:128], r3
- vld1.64 {q14, q15}, [r2,:128], r3//load #1 8x8 block resi data,
- pld [r0, #512]
-
- ORR_32BYTES_TO_8BYTES q0, q1, q2, q3, d0, d1, d4, d5, d2, d3 // output q1
-// vceq.i16 q1, q1, #0
-
- ORR_32BYTES_TO_8BYTES q8, q9,q10,q11,d16,d17,d20,d21,d4,d5 // output q2
-// vceq.i16 q2, q2, #0
-
- ORR_32BYTES_TO_8BYTES q4, q5, q6, q7, d8, d9, d12, d13, d10, d11 // output q5
-// vceq.i16 q5, q5, #0
-
- ORR_32BYTES_TO_8BYTES q12,q13,q14,q15,d24,d25, d28, d29, d12, d13 // output q6
-// vceq.i16 q6, q6, #0
-
- vqmovn.u64 d0, q1 // 8bytes-->4bytes
- vqmovn.u64 d8, q5
- vqmovn.u64 d1, q2
- vqmovn.u64 d9, q6
-
- vqmovn.u32 d2, q0 // 4bytes-->2bytes
- vqmovn.u32 d3, q4
-
- vceq.i16 q0, q1, #0
- vmvn q0, q0
- vabs.s16 q2, q0
- vmovn.u16 d6, q2 // 2bytes-->1bytes
- vst1.u8 {d6}, [r1]!
-
-// pld [r0]
- subs r4, r4, #1
- bne non_zero_count_two_8x8_loop
-
- pop {r2-r4}
- WELS_ASM_FUNC_END
-
-// r0 int16_t* block,
-// r1 int8_t* non_zero_count,
- WELS_ASM_FUNC_BEGIN svc_rs_non_zero_count_neon
-
- vld1.i16 {q0, q1}, [r0]! // block is unaligned!!!
- vld1.i16 {q2, q3}, [r0]!
- vld1.i16 {q4, q5}, [r0]!
- vld1.i16 {q6, q7}, [r0]!
-
- vld1.i16 {q8, q9}, [r0]!
- vld1.i16 {q10, q11}, [r0]!
- vld1.i16 {q12, q13}, [r0]!
- vld1.i16 {q14, q15}, [r0]!
-
- ORR_32BYTES_TO_8BYTES q0, q2, q1, q3, q4, q6, q5, q7, q4, q5
- vorr.s16 q0, q4
- vorr.s16 q1, q5 // output d0~d3
- ORR_32BYTES_TO_8BYTES q8, q10, q9, q11, q12, q14, q13, q15, q12, q13
- vorr.s16 q6, q8, q12
- vorr.s16 q7, q9, q13 // output d12~d15
-
- vqmovn.u64 d4, q0 // 8bytes-->4bytes
- vqmovn.u64 d6, q6
- vqmovn.u64 d5, q1
- vqmovn.u64 d7, q7
-
- vqmovn.u32 d8, q2 // 4bytes-->2bytes
- vqmovn.u32 d9, q3
-
- vceq.i16 q5, q4, #0
- vmvn q5, q5
- vabs.s16 q5, q5
- vmovn.u16 d10, q5 // 2bytes-->1bytes
- vst1.u8 {d10}, [r1]!
-
- vld1.i16 {q0, q1}, [r0]!
- vld1.i16 {q2, q3}, [r0]!
- vld1.i16 {q4, q5}, [r0]!
- vld1.i16 {q6, q7}, [r0]!
-
- vld1.i16 {q8, q9}, [r0]!
- vld1.i16 {q10, q11}, [r0]!
- vld1.i16 {q12, q13}, [r0]!
- vld1.i16 {q14, q15}, [r0]!
-
- ORR_32BYTES_TO_8BYTES q0, q2, q1, q3, q4, q6, q5, q7, q4, q5
- vorr.s16 q0, q4
- vorr.s16 q1, q5 // output d0~d3
- ORR_32BYTES_TO_8BYTES q8, q10, q9, q11, q12, q14, q13, q15, q12, q13
- vorr.s16 q6, q8, q12
- vorr.s16 q7, q9, q13 // output d12~d15
-
- vqmovn.u64 d4, q0 // 8bytes-->4bytes
- vqmovn.u64 d6, q6
- vqmovn.u64 d5, q1
- vqmovn.u64 d7, q7
-
- vqmovn.u32 d8, q2 // 4bytes-->2bytes
- vqmovn.u32 d9, q3
-
- vceq.i16 q5, q4, #0
- vmvn q5, q5
- vabs.s16 q5, q5
- vmovn.u16 d10, q5 // 2bytes-->1bytes
- vst1.u8 {d10}, [r1]!
-
-// Chroma
- vld1.i16 {q0, q1}, [r0]!
- vld1.i16 {q2, q3}, [r0]!
- vld1.i16 {q4, q5}, [r0]!
- vld1.i16 {q6, q7}, [r0]! //load Cb block,
-
- vld1.i16 {q8, q9}, [r0]!
- vld1.i16 {q10, q11}, [r0]!
- vld1.i16 {q12, q13}, [r0]!
- vld1.i16 {q14, q15}, [r0]! //load Cr block,
-
- ORR_32BYTES_TO_8BYTES q0, q1, q2, q3, q4, q5, q6, q7, q4, q6
- vorr.s16 q0, q2
- vorr.s16 q1, q4, q6 // output d0~d3
- ORR_32BYTES_TO_8BYTES q8, q9, q10, q11, q12, q13, q14, q15, q12, q14
- vorr.s16 q2, q8, q10
- vorr.s16 q3, q12, q14 // output d4~d7
-
- vqmovn.u64 d8, q0 // 8bytes-->4bytes
- vqmovn.u64 d10, q2
- vqmovn.u64 d9, q1
- vqmovn.u64 d11, q3
-
- vqmovn.u32 d12, q4 // 4bytes-->2bytes
- vqmovn.u32 d13, q5
-
- vceq.i16 q7, q6, #0
- vmvn q7, q7
- vabs.s16 q7, q7
- vmovn.u16 d10, q7 // 2bytes-->1bytes
- vst1.u8 {d10}, [r1]!
- WELS_ASM_FUNC_END
-
// r0 int16_t * block,
// r1 int32_t stride
WELS_ASM_FUNC_BEGIN WelsResBlockZero16x16_neon// can use for 256*sizeof(int16_t)
@@ -371,207 +155,6 @@
pop {r2}
WELS_ASM_FUNC_END
-// r0 int8_t* dst_addr,
-// r1 memset_value
-// r2 int32_t bytes_nmb,
-
- WELS_ASM_FUNC_BEGIN svc_block_memset_neon// dst should continue
- vdup.u8 q0, r1
- vdup.u8 q1, r1
-
-block_memset_loop:
- vst1.64 {q0, q1}, [r0,:64]!
- subs r2, r2, #64
- vst1.64 {q0, q1}, [r0,:64]!
- bne block_memset_loop
- WELS_ASM_FUNC_END
-
-// int16_t* dst,
-// int16_t* src,
-// int32_t stride
- WELS_ASM_FUNC_BEGIN svc_block_copy_16x16_neon
- push {r3}
- mov r3, #16
-// each element is sizeof(int16_t)
- lsl r2, r2, #1 // r2 = 2*r2
-
-block_copy_16x16_luma_loop:
- vld1.i16 {q0, q1}, [r1], r2
- subs r3, r3, #1
- vst1.i16 {q0, q1}, [r0]!
- bne block_copy_16x16_luma_loop
-
- pop {r3}
- WELS_ASM_FUNC_END
-
- WELS_ASM_FUNC_BEGIN svc_block_copy_8x8_neon
- push {r3}
- mov r3, #8
-// each element is sizeof(int16_t)
- lsl r2, r2, #1 // r2 = 2*r2
-
-block_copy_8x8_chma_loop:
- vld1.i16 {q0}, [r1], r2
- subs r3, r3, #1
- vst1.i16 {q0}, [r0]!
- bne block_copy_8x8_chma_loop
-
- pop {r3}
- WELS_ASM_FUNC_END
-
-// r0 uint8_t * dest,
-// r1 uint8_t * pred,
-// r2 int16_t * res,
-// r3 int32_t stride,
- WELS_ASM_FUNC_BEGIN svc_block_add_16x16_neon
- push {r4}
- mov r4, #16
- pld [r1]
-block_recon_16x16_luma_loop:
-
- vld1.64 {d16,d17}, [r1,:64], r3 //load 16 pred data, update addr
- vld1.s16 {q0, q1}, [r2]! //load 8+8 resi data, update addr
- vld1.64 {d18,d19}, [r1,:64], r3
- vld1.s16 {q2, q3}, [r2]!
- ADD_PRED_1BYTE_TO_RESID_2BYTES q0, q1, q2, q3, d16, d17, d18, d19
- pld [r1]
- vst1.64 {q8}, [r0], r3 //store result
- vst1.64 {q9}, [r0], r3
-//#ifdef DEBUG_NEON
-// vst1.u8 {q8}, [r0]!
-// vst1.u8 {q9}, [r0]!
-//#endif
-
- vld1.64 {d20,d21}, [r1,:64], r3 //load 16 pred data, update addr
- vld1.s16 {q4, q5}, [r2]! //load 8+8 resi data, update addr
- vld1.64 {d22,d23}, [r1,:64], r3
- vld1.s16 {q6, q7}, [r2]!
- ADD_PRED_1BYTE_TO_RESID_2BYTES q4, q5, q6, q7, d20, d21, d22, d23
- pld [r1]
- vst1.64 {q10}, [r0], r3
- vst1.64 {q11}, [r0], r3
-//#ifdef DEBUG_NEON
-// vst1.u8 {q10}, [r0]!
-// vst1.u8 {q11}, [r0]!
-//#endif
-
- subs r4, r4, #4
- bne block_recon_16x16_luma_loop
-
- pop {r4}
- WELS_ASM_FUNC_END
-
-
- WELS_ASM_FUNC_BEGIN svc_block_add_8x8_neon
-
- vld1.u8 {d24}, [r1], r3 //load 8 pred data
- vld1.i16 {q8, q9}, [r2]! //load 8+8 resi data, update addr
- vld1.u8 {d25}, [r1], r3 //load 8 pred data, q12
- vld1.i16 {q10, q11}, [r2]! //load 8+8 resi data, update addr
- vld1.u8 {d26}, [r1], r3 //load 8 pred data
- vld1.u8 {d27}, [r1], r3 //load 8 pred data, q13
-
- ADD_PRED_1BYTE_TO_RESID_2BYTES q8, q9, q10, q11, d24, d25, d26, d27
- pld [r1]
- vst1.u8 {d24}, [r0], r3 //store result
- vst1.u8 {d25}, [r0], r3 //store result
- vst1.u8 {d26}, [r0], r3 //store result
- vst1.u8 {d27}, [r0], r3 //store result
-//#ifdef DEBUG_NEON
-// vst1.u8 {d24}, [r0]!
-//#endif
-
- vld1.u8 {d24}, [r1], r3 //load 8 pred data
- vld1.i16 {q8, q9}, [r2]! //load 8+8 resi data, update addr
- vld1.u8 {d25}, [r1], r3 //load 8 pred data, q12
- vld1.i16 {q10, q11}, [r2]! //load 8+8 resi data, update addr
- vld1.u8 {d26}, [r1], r3 //load 8 pred data
- vld1.u8 {d27}, [r1], r3 //load 8 pred data, q13
-
- ADD_PRED_1BYTE_TO_RESID_2BYTES q8, q9, q10, q11, d24, d25, d26, d27
- vst1.u8 {d24}, [r0], r3 //store result
- vst1.u8 {d25}, [r0], r3 //store result
- vst1.u8 {d26}, [r0], r3 //store result
- vst1.u8 {d27}, [r0], r3 //store result
-//#ifdef DEBUG_NEON
-// vst1.u8 {d24}, [r0]!
-//#endif
- WELS_ASM_FUNC_END
-
-
-// int16_t* dst,
-// int16_t* src,
-// int stride
- WELS_ASM_FUNC_BEGIN svc_simple_idct4x4_neon
-
- vld4.s16 {d0, d1, d2, d3}, [r1] // cost 3 cycles!
- lsl r2, r2, #1
-
- ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q4, q5, q6, q7, d4, d5
-
- TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
-
- // transform element 32bits
- vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
- vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
- vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
- vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-
- COL_TRANSFORM_1_STEP q0, q1, q2, q3, q4, q5, q6, q7
-
- TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
-
- vrshrn.s32 d0, q0, #6
- vst1.s16 {d0}, [r0], r2 //store
- vrshrn.s32 d1, q1, #6
- vst1.s16 {d1}, [r0], r2 //store
- vrshrn.s32 d2, q2, #6
- vst1.s16 {d2}, [r0], r2 //store
- vrshrn.s32 d3, q3, #6
- vst1.s16 {d3}, [r0], r2 //store
-
- WELS_ASM_FUNC_END
-// int16_t* dst,
-// int16_t* src,
-// int stride
- WELS_ASM_FUNC_BEGIN svc_idct4x4_add_neon
-
- vld4.s16 {d0, d1, d2, d3}, [r1] // cost 3 cycles!
- lsl r2, r2, #1
-
- ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q4, q5, q6, q7, d4, d5
-
- TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
-
- // transform element 32bits
- vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
- vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
- vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
- vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-
- COL_TRANSFORM_1_STEP q0, q1, q2, q3, q4, q5, q6, q7
-
- TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
-
- //see draft G.8.5.3 , after clip_rs() into [-255, 255]
- vmov.i16 q10,#0xFF
- veor q11, q11
- vsub.i16 q11, q11,q10
-// vmvn.i16 q11,#0xFF
-
- mov r1, r0
- vld1.s16 {d16}, [r0], r2
- vld1.s16 {d17}, [r0], r2
- ADD_AND_CLIP_RS q0, q1, q10, q11, q8, d8, d9, q4
- vst1.s16 {d8}, [r1], r2 //store
- vst1.s16 {d9}, [r1], r2 //store
-
- vld1.s16 {d18}, [r0], r2
- vld1.s16 {d19}, [r0], r2
- ADD_AND_CLIP_RS q2, q3, q10, q11, q9, d10, d11, q5
- vst1.s16 {d10}, [r1], r2 //store
- vst1.s16 {d11}, [r1], r2 //store
- WELS_ASM_FUNC_END
// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
--- a/codec/decoder/core/arm/deblocking_neon.S
+++ /dev/null
@@ -1,1341 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef HAVE_NEON
-.text
-
-#include "arm_arch_common_macro.S"
-
-#ifdef APPLE_IOS
-.macro JMP_IF_128BITS_IS_ZERO
-// {
- vorr.s16 $2, $0, $1
- vmov r3, r2, $2
- orr r3, r3, r2
- cmp r3, #0
-// }
-.endm
-
-// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
-.macro MASK_MATRIX
-// { input: p1, p0, q0, q1, alpha (will be modified), beta; output: mask
- vabd.u8 $6, $1, $2 // abs( p0 - q0 )
- vcgt.u8 $6, $4, $6 // mask = abs( p0 - q0 ) < alpha
-
- vabd.u8 $4, $0, $1 // abs( p1 - p0 )
- vclt.u8 $4, $4, $5 // abs( p1 - p0 ) < beta
- vand.u8 $6, $6, $4 // 2nd mask &
-
- vabd.u8 $4, $3, $2 // abs( q1 - q0 )
- vclt.u8 $4, $4, $5 // abs( q1 - q0 ) < beta
- vand.u8 $6, $6, $4 // 3rd mask &
-// }
-.endm
-
-//if( abs( p2 - p0 ) < beta )
-//{
-// pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1)>> 1)) >> 1) - p1, -tc0[i], tc0[i] );
-// tc++;
-//}
-.macro DIFF_LUMA_LT4_P1_Q1
-// { input: p2, p1, p0, q0, beta, -tc0[i], tc0[i], mask_matrix; output: _clip3(p1'), tc++;
- vabd.u8 $9, $0, $2 // abs( p2 - p0 )
- vclt.u8 $9, $9, $4 // abs( p2 - p0 ) < beta
- vrhadd.u8 $8, $2, $3 // ((p0 + q0 + 1)>> 1)
- vhadd.u8 $8, $0, $8 // (( p2 + ((p0 + q0 + 1)>> 1)) >> 1)
- vsub.s8 $8, $8, $1 // (( p2 + ((p0 + q0 + 1)>> 1)) >> 1) - p1
- vmax.s8 $8, $8, $5 // >= -tc0[i]
- vmin.s8 $8, $8, $6 // <= tc0[i]
- vand.s8 $8, $8, $9 // mask, only [abs( p2 - p0 ) < beta] avail _clip3
- vand.s8 $8, $8, $7
- vadd.u8 $8, $1, $8
- vabs.s8 $9, $9 // if( abs( p2 - p0 ) < beta ) tc++;
-// }
-.endm
-
-//delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3,-tc, tc );
-.macro DIFF_LUMA_LT4_P0_Q0
-// { input: p1, p0, q0, q1; output: _clip3(p0'); working q12,q13
- vsubl.u8 $5, $0, $3 // (p1 - q1)
- vsubl.u8 $6, $2, $1 // (q0 - p0)
- vshl.s16 $6, $6, #2
- vadd.s16 $5, $5, $6 // (p1 - q1) += ( q0 - p0 ) <<2
- vrshrn.s16 $4, $5, #3
-// }
-.endm
-
-//if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
-//{
-// const int p3 = pix[-4*xstride];
-// pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
-// pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
-// pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
-//}
-//else /* p0' */
-// pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
-.macro DIFF_LUMA_EQ4_P2P1P0
-// { input: p3(output p1'), p2, p1, p0, q0, q1, select_matrix(output p0'), output p2';
-// working q4~q5; after filtering, p3/p2 are no longer needed
- vaddl.u8 q4, $1, $2 // (p2 + p1)
- vaddl.u8 q5, $3, $4 // (p0 + q0)
- vadd.u16 q5, q4, q5 // p1'=(p2 + p1)+(p0 + q0)
-
- vaddl.u8 q4, $0, $1 // (p3 + p2)
- vshl.u16 q4, q4, #1
- vadd.u16 q4, q5, q4 // p2'=2*(p3 + p2)+(p2 + p1)+(p0 + q0)
-
- vrshrn.u16 $0, q5, #2 // p1', prev p3 useless now
- vrshrn.u16 $7, q4, #3 // p2'
-
- vshl.u16 q5, q5, #1 // ((p2 + p1)+(p0 + q0))*2
- vsubl.u8 q4, $5, $1 // (q1 - p2)
- vadd.u16 q5, q4,q5 // 5tags p0'=(q1 - p2)+((p2 + p1)+(p0 + q0))*2
-
- vaddl.u8 q4, $2, $5 // (p1 + q1)
- vaddw.u8 q4, q4, $2
- vaddw.u8 q4, q4, $3 // 3tags p0'=2*p1+(p0 + q1)
-
- vrshrn.u16 d10,q5, #3 // 5tags
- vrshrn.u16 d8, q4, #2 // 3tags
- vbsl.u8 $6, d10, d8 // p0'
-// }
-.endm
-
-.macro DIFF_LUMA_EQ4_MASK
-// { input: px', px, mask_matrix; working q4
- vmov $3, $2
- vbsl.u8 $3, $0, $1
-// }
-.endm
-
-// ( (p1 << 1) + p0 + q1 + 2 ) >> 2
-.macro DIFF_CHROMA_EQ4_P0Q0
-// { input: p1, p0, q0, q1; working q4/q5/q6; output: p0'_d, q0'_d
- vaddl.u8 $4, $0, $3 // (p1 + q1)
- vaddw.u8 $5, $4, $1
- vaddw.u8 $6, $4, $2
- vaddw.u8 $5, $5, $0 // p0'=(p1 + q1)+(p0+p1)
-// vaddw.u8 $6, $4, $2
- vaddw.u8 $6, $6, $3 // q0'=(p1 + q1)+(q0+q1)
- vrshrn.u16 $7, $5, #2
- vrshrn.u16 $8, $6, #2
-// }
-.endm
-
-.macro LORD_CHROMA_DATA_4
-// { input: 4xCb_addr, 4xCr_addr, working r0~r2
- vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2 // Cb
- vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2 // Cr
-// }
-.endm
-
-.macro STORE_CHROMA_DATA_4
-// { input: 4xCb_addr, 4xCr_addr, working r0~r2
- vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2 // Cb
- vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2 // Cr
-// }
-.endm
-
-.macro LORD_LUMA_DATA_3
-// { input: 3xluma_addr, working r0~r2
- vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1 // 0::pix[-3];1::pix[-2];2::pix[-1];
- vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1 // 3::pix[0]; 4::pix[1]; 5::pix[2];
-// }
-.endm
-
-.macro STORE_LUMA_DATA_4
-// { input: 4xluma, working r0~r2
- vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1 // 0::pix[-2];1::pix[-1];2::pix[0]; 3::pix[1]
- vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
-// }
-.endm
-
-.macro LORD_LUMA_DATA_4
-// { input: 4xluma_addr, working r0r1r3
- vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r3], r1 // 0::pix[-4];1::pix[-3];2::pix[-2];3::pix[-1]
- vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r0], r1 // 4::pix[0]; 5::pix[1]; 6::pix[2]; 7::pix[3];
-// }
-.endm
-
-.macro STORE_LUMA_DATA_3
-// { input: 3xluma_addr, working r0~r2
- vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1 // 0::pix[-3];1::pix[-2];2::pix[-1];
- vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1 // 3::pix[0]; 4::pix[1]; 5::pix[2];
-// }
-.endm
-
-.macro EXTRACT_DELTA_INTO_TWO_PART
-// { input: delta (output abs minus part), working (output plus part)
- vcge.s8 $1, $0, #0
- vand $1, $0, $1 // select original (+part)
- vsub.s8 $0, $1, $0 // select original -(-part)
-// }
-.endm
-#else
-.macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
-// {
- vorr.s16 \arg2, \arg0, \arg1
- vmov r3, r2, \arg2
- orr r3, r3, r2
- cmp r3, #0
-// }
-.endm
-
-// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
-.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
-// { input: p1, p0, q0, q1, alpha (will be modified), beta; output: mask
- vabd.u8 \arg6, \arg1, \arg2 // abs( p0 - q0 )
- vcgt.u8 \arg6, \arg4, \arg6 // mask = abs( p0 - q0 ) < alpha
-
- vabd.u8 \arg4, \arg0, \arg1 // abs( p1 - p0 )
- vclt.u8 \arg4, \arg4, \arg5 // abs( p1 - p0 ) < beta
- vand.u8 \arg6, \arg6, \arg4 // 2nd mask &
-
- vabd.u8 \arg4, \arg3, \arg2 // abs( q1 - q0 )
- vclt.u8 \arg4, \arg4, \arg5 // abs( q1 - q0 ) < beta
- vand.u8 \arg6, \arg6, \arg4 // 3rd mask &
-// }
-.endm
-
-//if( abs( p2 - p0 ) < beta )
-//{
-// pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1)>> 1)) >> 1) - p1, -tc0[i], tc0[i] );
-// tc++;
-//}
-.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-// { input: p2, p1, p0, q0, beta, -tc0[i], tc0[i], mask_matrix; output: _clip3(p1'), tc++;
- vabd.u8 \arg9, \arg0, \arg2 // abs( p2 - p0 )
- vclt.u8 \arg9, \arg9, \arg4 // abs( p2 - p0 ) < beta
- vrhadd.u8 \arg8, \arg2, \arg3 // ((p0 + q0 + 1)>> 1)
- vhadd.u8 \arg8, \arg0, \arg8 // (( p2 + ((p0 + q0 + 1)>> 1)) >> 1)
- vsub.s8 \arg8, \arg8, \arg1 // (( p2 + ((p0 + q0 + 1)>> 1)) >> 1) - p1
- vmax.s8 \arg8, \arg8, \arg5 // >= -tc0[i]
- vmin.s8 \arg8, \arg8, \arg6 // <= tc0[i]
- vand.s8 \arg8, \arg8, \arg9 // mask, only [abs( p2 - p0 ) < beta] avail _clip3
- vand.s8 \arg8, \arg8, \arg7
- vadd.u8 \arg8, \arg1, \arg8
- vabs.s8 \arg9, \arg9 // if( abs( p2 - p0 ) < beta ) tc++;
-// }
-.endm
-
-//delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3,-tc, tc );
-.macro DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6
-// { input: p1, p0, q0, q1; output: _clip3(p0'); working q12,q13
- vsubl.u8 \arg5, \arg0, \arg3 // (p1 - q1)
- vsubl.u8 \arg6, \arg2, \arg1 // (q0 - p0)
- vshl.s16 \arg6, \arg6, #2
- vadd.s16 \arg5, \arg5, \arg6 // (p1 - q1) += ( q0 - p0 ) <<2
- vrshrn.s16 \arg4, \arg5, #3
-// }
-.endm
-
-//if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
-//{
-// const int p3 = pix[-4*xstride];
-// pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
-// pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
-// pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
-//}
-//else /* p0' */
-// pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
-.macro DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { input: p3(output p1'), p2, p1, p0, q0, q1, select_matrix(output p0'), output p2';
-// working q4~q5; after filtering, p3/p2 are no longer needed
- vaddl.u8 q4, \arg1, \arg2 // (p2 + p1)
- vaddl.u8 q5, \arg3, \arg4 // (p0 + q0)
- vadd.u16 q5, q4, q5 // p1'=(p2 + p1)+(p0 + q0)
-
- vaddl.u8 q4, \arg0, \arg1 // (p3 + p2)
- vshl.u16 q4, q4, #1
- vadd.u16 q4, q5, q4 // p2'=2*(p3 + p2)+(p2 + p1)+(p0 + q0)
-
- vrshrn.u16 \arg0, q5, #2 // p1', prev p3 useless now
- vrshrn.u16 \arg7, q4, #3 // p2'
-
- vshl.u16 q5, q5, #1 // ((p2 + p1)+(p0 + q0))*2
- vsubl.u8 q4, \arg5, \arg1 // (q1 - p2)
- vadd.u16 q5, q4,q5 // 5tags p0'=(q1 - p2)+((p2 + p1)+(p0 + q0))*2
-
- vaddl.u8 q4, \arg2, \arg5 // (p1 + q1)
- vaddw.u8 q4, q4, \arg2
- vaddw.u8 q4, q4, \arg3 // 3tags p0'=2*p1+(p0 + q1)
-
- vrshrn.u16 d10,q5, #3 // 5tags
- vrshrn.u16 d8, q4, #2 // 3tags
- vbsl.u8 \arg6, d10, d8 // p0'
-// }
-.endm
-
-.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
-// { input: px', px, mask_matrix; working q4
- vmov \arg3, \arg2
- vbsl.u8 \arg3, \arg0, \arg1
-// }
-.endm
-
-// ( (p1 << 1) + p0 + q1 + 2 ) >> 2
-.macro DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { input: p1, p0, q0, q1; working q4/q5/q6; output: p0'_d, q0'_d
- vaddl.u8 \arg4, \arg0, \arg3 // (p1 + q1)
- vaddw.u8 \arg5, \arg4, \arg1
- vaddw.u8 \arg6, \arg4, \arg2
- vaddw.u8 \arg5, \arg5, \arg0 // p0'=(p1 + q1)+(p0+p1)
-// vaddw.u8 \arg6, \arg4, \arg2
- vaddw.u8 \arg6, \arg6, \arg3 // q0'=(p1 + q1)+(q0+q1)
- vrshrn.u16 \arg7, \arg5, #2
- vrshrn.u16 \arg8, \arg6, #2
-// }
-.endm
-
-.macro LORD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { input: 4xCb_addr, 4xCr_addr, working r0~r2
- vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2 // Cb
- vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2 // Cr
-// }
-.endm
-
-.macro STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { input: 4xCb_addr, 4xCr_addr, working r0~r2
- vst4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2 // Cb
- vst4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2 // Cr
-// }
-.endm
-
-.macro LORD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
-// { input: 3xluma_addr, working r0~r2
- vld3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1 // 0::pix[-3];1::pix[-2];2::pix[-1];
- vld3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1 // 3::pix[0]; 4::pix[1]; 5::pix[2];
-// }
-.endm
-
-.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
-// { input: 4xluma, working r0~r2
- vst4.u8 {\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1 // 0::pix[-2];1::pix[-1];2::pix[0]; 3::pix[1]
- vst4.u8 {\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
-// }
-.endm
-
-.macro LORD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { input: 4xluma_addr, working r0r1r3
- vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r3], r1 // 0::pix[-4];1::pix[-3];2::pix[-2];3::pix[-1]
- vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r0], r1 // 4::pix[0]; 5::pix[1]; 6::pix[2]; 7::pix[3];
-// }
-.endm
-
-.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
-// { input: 3xluma_addr, working r0~r2
- vst3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1 // 0::pix[-3];1::pix[-2];2::pix[-1];
- vst3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1 // 3::pix[0]; 4::pix[1]; 5::pix[2];
-// }
-.endm
-
-.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
-// { input: delta (output abs minus part), working (output plus part)
- vcge.s8 \arg1, \arg0, #0
- vand \arg1, \arg0, \arg1 // select original (+part)
- vsub.s8 \arg0, \arg1, \arg0 // select original -(-part)
-// }
-.endm
-
-#endif
-//uint8_t *pix, int32_t stride, int32_t alpha, int32_t beta, uint8_t *tc
- WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon
-
- vdup.u8 q11, r2 // alpha [0~255]
- vdup.u8 q9, r3 // q9:: beta [0~18]
-
- add r2, r1, r1, lsl #1
-	sub			r2, r0, r2		// pix -= 3*src_stride
- vld1.u8 {q0}, [r2], r1 // q0::p2 = pix[-3*xstride];
- vld1.u8 {q3}, [r0], r1 // q3::q0 = pix[ 0*xstride];
- vld1.u8 {q1}, [r2], r1 // q1::p1 = pix[-2*xstride];
- vld1.u8 {q4}, [r0], r1 // q4::q1 = pix[ 1*xstride];
- vld1.u8 {q2}, [r2] // q2::p0 = pix[-1*xstride];
- vld1.u8 {q5}, [r0] // q5::q2 = pix[ 2*xstride];
- sub r2, r2, r1 // r2 = pix-2*xstride
-
-// if( tc0[i] < 0 ) continue; else filter
- ldr r3, [sp, #0]
- vld1.s8 {d31}, [r3] // load 4 tc0[i]
- vdup.s8 d28, d31[0]
- vdup.s8 d30, d31[1]
- vdup.s8 d29, d31[2]
- vdup.s8 d31, d31[3]
- vtrn.32 d28, d30
- vtrn.32 d29, d31 // q14::each 32 bits is 4x tc0[i]
- vcge.s8 q10, q14, #0 // q10::tc0[i] >= 0
-
-// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
- MASK_MATRIX q1, q2, q3, q4, q11, q9, q15 // q15::mask matrix
- vand.u8 q10, q10, q15 // two mask
-// JMP_IF_128BITS_IS_ZERO d20, d21, d31
-// beq lt4_end
-
- veor q15, q15
- vsub.i8 q15,q15,q14 // q15::4x -tc0[i], min
-
-// input: p2, p1, p0, q0, beta, -tc0[i], tc0[i], mask_matrix; output: _clip3(p1'), tc++;
- DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12 // q6 = _clip3(p1')
- vst1.u8 {q6}, [r2], r1
-
- DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13 // q7 = _clip3(q1')
-
- vabs.s8 q12, q12
- vabs.s8 q13, q13 // if( abs( p2 - p0 ) < beta ) tc++;
- vadd.u8 q14,q14,q12
- vadd.u8 q14,q14,q13 // updated tc
- veor q15, q15
- vsub.i8 q15,q15,q14 // updated -tc
-
-// input: p1, p0, q0, q1; output: _clip3(p0'); working q12,q13
- DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
- DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13 //q8::delta
- vmax.s8 q8, q8, q15 // >= -tc0[i]
- vmin.s8 q8, q8, q14 // <= tc0[i]
- vand.s8 q8, q8, q10
- EXTRACT_DELTA_INTO_TWO_PART q8, q9
- vqadd.u8 q2, q2, q9 // clip_uint8( p0 + [+delta] ); p0'
- vqsub.u8 q2, q2, q8 // clip_uint8( p0 - [-delta] ); p0'
- vst1.u8 {q2}, [r2], r1
- vqsub.u8 q3, q3, q9 // clip_uint8( q0 - [+delta] ); q0'
- vqadd.u8 q3, q3, q8 // clip_uint8( q0 + [-delta] ); q0'
- vst1.u8 {q3}, [r2] , r1
- vst1.u8 {q7}, [r2]
-
-//lt4_end:
- WELS_ASM_FUNC_END
-
-//uint8_t *pix, int32_t stride, int32_t alpha, int32_t beta
- WELS_ASM_FUNC_BEGIN DeblockLumaEq4V_neon
-
- vdup.u8 q5, r2 // alpha [0~255]
- vdup.u8 q4, r3 // beta [0~18]
-
- sub r3, r0, r1, lsl #2 // pix -= 4*src_stride
- vld1.u8 {q8}, [r3], r1 // q8::p3 = pix[-4*xstride];
- vld1.u8 {q12}, [r0], r1 // q12::q0 = pix[ 0*xstride];
- vld1.u8 {q9}, [r3], r1 // q9::p2 = pix[-3*xstride];
- vld1.u8 {q13}, [r0], r1 // q13::q1 = pix[ 1*xstride];
- vld1.u8 {q10}, [r3], r1 // q10::p1 = pix[-2*xstride];
- vld1.u8 {q14}, [r0], r1 // q14::q2 = pix[ 2*xstride];
- vld1.u8 {q11}, [r3] // q11::p0 = pix[-1*xstride];
- vld1.u8 {q15}, [r0] // q15::q3 = pix[ 3*xstride];
- sub r3, r3, r1 , lsl #1 // r3 = pix-3*xstride
-
-// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
- MASK_MATRIX q10, q11, q12, q13, q5, q4, q6 // q6::mask matrix
-// JMP_IF_128BITS_IS_ZERO d12, d13, d0
-// beq eq4_end
-
-// if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
- mov r2, r2, lsr #2
- add r2, r2, #2
- vdup.u8 q5, r2
- vabd.u8 q0, q11, q12
- vclt.u8 q7, q0, q5 // q7::indicate
-// if( abs( p2 - p0 ) < beta )
- vabd.u8 q1, q9, q11
- vclt.u8 q1, q1, q4
- vand.s8 q1, q1, q7 // q1::indicate [p0', p1', p2'] or [p0']
-// if( abs( q2 - q0 ) < beta )
- vabd.u8 q2, q14,q12
- vclt.u8 q2, q2, q4
- vand.s8 q2, q2, q7 // q2::indicate [q0', q1', q2'] or [q0']
- vand.u8 q7, q7, q6
-
- vmov q3, q1
-// input: p3(output p1'), p2, p1, p0, q0, q1, select_matrix(output p0'), output p2';
-// working q4~q5; after filtering, p3/p2 are no longer needed
- DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
- DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
-
-// q1(p0') q2(q0') only need ::if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
-// q0(p2') q8(p1') q15(q1') q3(q2'); need more &&if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )&&if( abs( p2 - p0 ) < beta )
- vand.u8 q3, q7, q3
- DIFF_LUMA_EQ4_MASK q0, q9, q3, q4
- vst1.u8 {q4}, [r3], r1
- DIFF_LUMA_EQ4_MASK q8,q10, q3, q4
- vst1.u8 {q4}, [r3], r1
- DIFF_LUMA_EQ4_MASK q1,q11, q6, q4
- vst1.u8 {q4}, [r3], r1
-
- vmov q0, q2
- DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d6
- DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d7
-
- vand.u8 q0, q7, q0
- DIFF_LUMA_EQ4_MASK q2, q12, q6, q4
- vst1.u8 {q4}, [r3], r1
- DIFF_LUMA_EQ4_MASK q15, q13, q0, q4
- vst1.u8 {q4}, [r3], r1
- DIFF_LUMA_EQ4_MASK q3, q14, q0, q4
- vst1.u8 {q4}, [r3], r1
-
-//eq4_end:
- WELS_ASM_FUNC_END
-
-
-
-//uint8_t *pix, int32_t stride, int32_t alpha, int32_t beta, uint8_t *tc
- WELS_ASM_FUNC_BEGIN DeblockLumaLt4H_neon
-
- vdup.u8 q11, r2 // alpha [0~255]
- vdup.u8 q9, r3 // q9:: beta [0~18]
-
- sub r2, r0, #3 // pix -= 3
- LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 0
- LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 1
- LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 2
- LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 3
- LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 4
- LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 5
- LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 6
- LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 7
-
- LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 0
- LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 1
- LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 2
- LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 3
- LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 4
- LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 5
- LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 6
- LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 7
-// d0d1d2d6d7d8+d3d4d5d9d10d11
- vswp d1, d2
- vswp d3, d4
- vswp d1, d4
- vswp d7, d8
- vswp d9, d10
- vswp d7, d10
-// q0::p2 = pix[-3*xstride];
-// q1::p1 = pix[-2*xstride];
-// q2::p0 = pix[-1*xstride];
-// q3::q0 = pix[ 0*xstride];
-// q4::q1 = pix[ 1*xstride];
-// q5::q2 = pix[ 2*xstride];
- sub r0, r0, r1, lsl #4 // pix -= 16*src_stride
-
-// if( tc0[i] < 0 ) continue; else filter
- ldr r3, [sp, #0]
- vld1.s8 {d31}, [r3] // load 4 tc0[i]
- vdup.s8 d28, d31[0]
- vdup.s8 d30, d31[1]
- vdup.s8 d29, d31[2]
- vdup.s8 d31, d31[3]
- vtrn.32 d28, d30
- vtrn.32 d29, d31 // q14::each 32 bits is 4x tc0[i]
- vcge.s8 q10, q14, #0 // q10::tc0[i] >= 0
-
-// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
- MASK_MATRIX q1, q2, q3, q4, q11, q9, q15 // q15::mask matrix
- vand.u8 q10, q10, q15 // two mask
-// JMP_IF_128BITS_IS_ZERO d20, d21, d31
-// beq lt4_end
-
- veor q15, q15
- vsub.i8 q15,q15,q14 // q15::4x -tc0[i], min
-
-// input: p2, p1, p0, q0, beta, -tc0[i], tc0[i], mask_matrix; output: _clip3(p1'), tc++;
- DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12 // q6 = _clip3(p1')
-
- DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13 // q7 = _clip3(q1')
-
- vabs.s8 q12, q12
- vabs.s8 q13, q13 // if( abs( p2 - p0 ) < beta ) tc++;
- vadd.u8 q14,q14,q12
- vadd.u8 q14,q14,q13 // updated tc
- veor q15, q15
- vsub.i8 q15,q15,q14 // updated -tc
-
-// input: p1, p0, q0, q1; output: _clip3(p0'); working q12,q13
- DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
- DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13 //q8::delta
- vmax.s8 q8, q8, q15 // >= -tc0[i]
- vmin.s8 q8, q8, q14 // <= tc0[i]
- vand.s8 q8, q8, q10
- EXTRACT_DELTA_INTO_TWO_PART q8, q9
- vqadd.u8 q2, q2, q9 // clip_uint8( p0 + [+delta] ); p0'
- vqsub.u8 q2, q2, q8 // clip_uint8( p0 - [-delta] ); p0'
-
- vqsub.u8 q3, q3, q9 // clip_uint8( q0 - [+delta] ); q0'
- vqadd.u8 q3, q3, q8 // clip_uint8( q0 + [-delta] ); q0'
-
- sub r0, #2
- add r2, r0, r1
- lsl r1, #1
-
- vmov q1, q6
- vmov q4, q7
-// q1,q2,q3,q4
- vswp q2, q3
- vswp d3, d6
- vswp d5, d8
-// d2~d5, d6~d7
- STORE_LUMA_DATA_4 d2, d3, d4, d5, 0, 1
- STORE_LUMA_DATA_4 d2, d3, d4, d5, 2, 3
- STORE_LUMA_DATA_4 d2, d3, d4, d5, 4, 5
- STORE_LUMA_DATA_4 d2, d3, d4, d5, 6, 7
-
- STORE_LUMA_DATA_4 d6, d7, d8, d9, 0, 1
- STORE_LUMA_DATA_4 d6, d7, d8, d9, 2, 3
- STORE_LUMA_DATA_4 d6, d7, d8, d9, 4, 5
- STORE_LUMA_DATA_4 d6, d7, d8, d9, 6, 7
-//lt4_end:
- WELS_ASM_FUNC_END
-
-
-//uint8_t *pix, int32_t stride, int32_t alpha, int32_t beta
- WELS_ASM_FUNC_BEGIN DeblockLumaEq4H_neon
-
- vdup.u8 q5, r2 // alpha [0~255]
- vdup.u8 q4, r3 // beta [0~18]
-
- sub r3, r0, #4 // pix -= 4
- LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,0
- LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,1
- LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,2
- LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,3
- LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,4
- LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,5
- LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,6
- LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,7
-
- LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,0
- LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,1
- LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,2
- LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,3
- LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,4
- LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,5
- LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,6
- LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,7
-
- vswp q9, q10
- vswp d17,d18
- vswp d21,d22
- vswp q13,q14
- vswp d25,d26
- vswp d29,d30
- sub r0, r0, r1 , lsl #4 // r0 -= 16*xstride
-// q8::p3 = pix[-4*xstride];
-// q9::p2 = pix[-3*xstride];
-// q10::p1 = pix[-2*xstride];
-// q11::p0 = pix[-1*xstride];
-// q12::q0 = pix[ 0*xstride];
-// q13::q1 = pix[ 1*xstride];
-// q14::q2 = pix[ 2*xstride];
-// q15::q3 = pix[ 3*xstride];
-
-// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
- MASK_MATRIX q10, q11, q12, q13, q5, q4, q6 // q6::mask matrix
-// JMP_IF_128BITS_IS_ZERO d12, d13, d0
-// beq eq4_end
-
-// if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )
- mov r2, r2, lsr #2
- add r2, r2, #2
- vdup.u8 q5, r2
- vabd.u8 q0, q11, q12
- vclt.u8 q7, q0, q5 // q7::indicate
-// if( abs( p2 - p0 ) < beta )
- vabd.u8 q1, q9, q11
- vclt.u8 q1, q1, q4
- vand.s8 q1, q1, q7 // q1::indicate [p0', p1', p2'] or [p0']
-// if( abs( q2 - q0 ) < beta )
- vabd.u8 q2, q14,q12
- vclt.u8 q2, q2, q4
- vand.s8 q2, q2, q7 // q2::indicate [q0', q1', q2'] or [q0']
- vand.u8 q7, q7, q6
-
- vmov q3, q1
-// input: p3(output p1'), p2, p1, p0, q0, q1, select_matrix(output p0'), output p2';
-// working q4~q5; after filtering, p3/p2 are no longer needed
- DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
- DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
-
-// q1(p0') q2(q0') only need ::if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
-// q0(p2') q8(p1') q15(q1') q3(q2'); need more &&if(abs( p0 - q0 ) < ((alpha >> 2) + 2) )&&if( abs( p2 - p0 ) < beta )
- vand.u8 q3, q7, q3
- DIFF_LUMA_EQ4_MASK q0, q9, q3, q4 // p2'
- vmov q9, q4
-
-// DIFF_LUMA_EQ4_MASK q8,q10, q3, q4 // p1'
- vbsl.u8 q3, q8, q10
-
- DIFF_LUMA_EQ4_MASK q1,q11, q6, q8 // p0'
-
- vand.u8 q7, q7, q2
-// input: q3(output q1'), q2, q1, q0, p0, p1, select_matrix(output q0'), output q2';
-// working q4~q5; after filtering, q3/q2 are no longer needed
- DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d0
- DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d1
-
-// DIFF_LUMA_EQ4_MASK q2, q12, q6, q4
- vbsl.u8 q6, q2, q12
-
- DIFF_LUMA_EQ4_MASK q15, q13, q7, q4
-
-// DIFF_LUMA_EQ4_MASK q0, q14, q7, q4
- vbsl.u8 q7, q0, q14
-
-// q9,q3,q8,q6,q4,q7
- vmov q5, q6
- vmov q2, q9
- vmov q6, q4
- vmov q4, q8
-// q2,q3,q4,q5,q6,q7
-
- vswp d8, d6
- vswp d5, d7
- vswp d5, d8
- vswp d14, d12
- vswp d11, d13
- vswp d11, d14
-
- sub r3, r0, #3
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,0
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,1
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,2
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,3
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,4
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,5
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,6
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,7
-
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,0
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,1
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,2
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,3
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,4
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,5
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,6
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,7
-
-//eq4_end:
- WELS_ASM_FUNC_END
-
-//uint8_t *pix_cb, uint8_t *pix_cr, int32_t stride, int32_t alpha, int32_t beta, uint8_t *tc
- WELS_ASM_FUNC_BEGIN DeblockChromaLt4V_neon
-
- vdup.u8 q11, r3 // alpha [0~255]
- ldr r3, [sp, #0]
-
- sub r0, r0, r2 , lsl #1 // pix -= 2*src_stride
- sub r1, r1, r2, lsl #1
- vdup.u8 q9, r3 // q9:: beta [0~18]
- ldr r3, [sp, #4]
-
- vld1.u8 {d0}, [r0], r2 // q0::p1
- vld1.u8 {d1}, [r1], r2
- vld1.u8 {d2}, [r0], r2 // q1::p0
- vld1.u8 {d3}, [r1], r2
- vld1.u8 {d4}, [r0], r2 // q2::q0
- vld1.u8 {d5}, [r1], r2
- vld1.u8 {d6}, [r0] // q3::q1
- vld1.u8 {d7}, [r1]
-
- sub r0, r0, r2, lsl #1 // pix = [-1*src_stride]
- sub r1, r1, r2, lsl #1
-// if( tc0[i] < 0 ) continue; else filter
- vld1.s8 {d15}, [r3] // load 4 tc0[i], each tc0[i] 2 bytes; d[x] Cb && d[x+1] Cr
- vmovl.u8 q6, d15
- vshl.u64 d13,d12,#8
- vorr d12,d13
- vmov d13, d12 // q6::each 64 bits is 2x tc0[i]
- veor q7, q7
- vsub.i8 q7,q7,q6 // q7::4x -tc0[i], min
-
-// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
- MASK_MATRIX q0, q1, q2, q3, q11, q9, q5 // q5::mask matrix
-// JMP_IF_128BITS_IS_ZERO d20, d21, d31
-// beq lt4_end
-
-
-// input: p1, p0, q0, q1; output: _clip3(p0'); working q12,q13
- DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d8, q12, q13
- DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d9, q12, q13 //q4::delta
- vmax.s8 q4, q4, q7 // >= -tc0[i]
- vmin.s8 q4, q4, q6 // <= tc0[i]
-
- vand.s8 q4, q4, q5
- vcge.s8 q6, q6, #0 // q6::tc0[i] >= 0
- vand.s8 q4, q4, q6
- EXTRACT_DELTA_INTO_TWO_PART q4, q5
- vqadd.u8 q1, q1, q5 // clip_uint8( p0 + [+delta] ); p0'
- vqsub.u8 q1, q1, q4 // clip_uint8( p0 - [-delta] ); p0'
- vst1.u8 {d2}, [r0], r2
- vst1.u8 {d3}, [r1], r2
- vqsub.u8 q2, q2, q5 // clip_uint8( q0 - [+delta] ); q0'
- vqadd.u8 q2, q2, q4 // clip_uint8( q0 + [-delta] ); q0'
- vst1.u8 {d4}, [r0]
- vst1.u8 {d5}, [r1]
-
-//lt4_end:
- WELS_ASM_FUNC_END
-
-// uint8_t *pix_cb, uint8_t *pix_cr, int32_t stride, int32_t alpha, int32_t beta
- WELS_ASM_FUNC_BEGIN DeblockChromaEq4V_neon
-
- vdup.u8 q11, r3 // alpha [0~255]
- ldr r3, [sp, #0]
-
- sub r0, r0, r2 , lsl #1 // pix -= 2*src_stride
- sub r1, r1, r2, lsl #1
- vdup.u8 q9, r3 // q9:: beta [0~18]
-
- vld1.u8 {d0}, [r0], r2 // q0::p1
- vld1.u8 {d1}, [r1], r2
- vld1.u8 {d2}, [r0], r2 // q1::p0
- vld1.u8 {d3}, [r1], r2
- vld1.u8 {d4}, [r0], r2 // q2::q0
- vld1.u8 {d5}, [r1], r2
- vld1.u8 {d6}, [r0] // q3::q1
- vld1.u8 {d7}, [r1]
-
- sub r0, r0, r2, lsl #1 // pix = [-1*src_stride]
- sub r1, r1, r2, lsl #1
-
-// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
- MASK_MATRIX q0, q1, q2, q3, q11, q9, q10 // q10::mask matrix, d20:Cb d21:Cr
-// JMP_IF_128BITS_IS_ZERO d20, d21, d31
-// beq eq4_end
- vmov q11, q10
-
-// ( (p1 << 1) + p0 + q1 + 2 ) >> 2
-// ( (q1 << 1) + q0 + p1 + 2 ) >> 2
- DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q4, q5, q6, d14, d0 // Cb::p0' q0'
- DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q12, q13, q14, d15, d1 // Cr::p0' q0'
-
- vbsl.u8 q10, q7, q1 // p0'
- vst1.u8 {d20}, [r0], r2
- vst1.u8 {d21}, [r1], r2
-
- vbsl.u8 q11, q0, q2 // q0'
- vst1.u8 {d22}, [r0]
- vst1.u8 {d23}, [r1]
-
-//eq4_end:
- WELS_ASM_FUNC_END
-
-//uint8_t *pix_cb, uint8_t *pix_cr, int32_t stride, int32_t alpha, int32_t beta, uint8_t *tc
- WELS_ASM_FUNC_BEGIN DeblockChromaLt4H_neon
-
- vdup.u8 q11, r3 // alpha [0~255]
- ldr r3, [sp, #0]
-
- sub r0, r0, #2 // pix [-2]
- vdup.u8 q9, r3 // q9:: beta [0~18]
- ldr r3, [sp, #4]
- sub r1, r1, #2
- vld1.s8 {d15}, [r3] // load 4 tc0[i], each tc0[i] 2 bytes; d[x] Cb && d[x+1] Cr
-
- LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
- LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
- LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
- LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
- LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
- LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
- LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
- LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
-// Cb:d0d1d2d3, Cr:d4d5d6d7
- vswp q1, q2
- vswp d1, d2
- vswp d6, d5
-// Cb:d0d2d4d6, Cr:d1d3d5d7
-
-
-// if( tc0[i] < 0 ) continue; else filter
-
- vmovl.u8 q6, d15
- vshl.u64 d13,d12,#8
- vorr d12,d13
- vmov d13, d12 // q6::each 64 bits is 2x tc0[i]
- veor q7, q7
- vsub.i8 q7,q7,q6 // q7::4x -tc0[i], min
-
-// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
- MASK_MATRIX q0, q1, q2, q3, q11, q9, q5 // q5::mask matrix
-// JMP_IF_128BITS_IS_ZERO d20, d21, d31
-// beq lt4_end
-
-// input: p1, p0, q0, q1; output: _clip3(p0'); working q12,q13
- DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d8, q12, q13
- DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d9, q12, q13 //q4::delta
- vmax.s8 q4, q4, q7 // >= -tc0[i]
- vmin.s8 q4, q4, q6 // <= tc0[i]
-
- vand.s8 q4, q4, q5
- vcge.s8 q6, q6, #0 // q6::tc0[i] >= 0
- vand.s8 q4, q4, q6
- EXTRACT_DELTA_INTO_TWO_PART q4, q5
- vqadd.u8 q1, q1, q5 // clip_uint8( p0 + [+delta] ); p0'
- vqsub.u8 q1, q1, q4 // clip_uint8( p0 - [-delta] ); p0'
- vqsub.u8 q2, q2, q5 // clip_uint8( q0 - [+delta] ); q0'
- vqadd.u8 q2, q2, q4 // clip_uint8( q0 + [-delta] ); q0'
-
- sub r0, r0, r2, lsl #3 // pix: 0th row [-2]
- sub r1, r1, r2, lsl #3
- vswp d1, d2
- vswp d6, d5
- vswp q1, q2
-// Cb:d0d1d2d3, Cr:d4d5d6d7
-
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
-
-//lt4_end:
- WELS_ASM_FUNC_END
-
-// uint8_t *pix_cb, uint8_t *pix_cr, int32_t stride, int32_t alpha, int32_t beta
- WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon
-
- vdup.u8 q11, r3 // alpha [0~255]
- ldr r3, [sp, #0]
-
- sub r0, r0, #2 // pix [-2]
- sub r1, r1, #2
-
- LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
- LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
- LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
- LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
- LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
- LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
- LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
- LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
-// Cb:d0d1d2d3, Cr:d4d5d6d7
- vswp q1, q2
- vswp d1, d2
- vswp d6, d5
-// Cb:d0d2d4d6, Cr:d1d3d5d7
-
-
-// if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
- vdup.u8 q9, r3 // q9:: beta [0~18]
- MASK_MATRIX q0, q1, q2, q3, q11, q9, q10 // q10::mask matrix, d20:Cb d21:Cr
-// JMP_IF_128BITS_IS_ZERO d20, d21, d31
-// beq eq4_end
- vmov q11, q10
-
-// ( (p1 << 1) + p0 + q1 + 2 ) >> 2
-// ( (q1 << 1) + q0 + p1 + 2 ) >> 2
- DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q8, q9, q12, d8, d10 // Cb::p0' q0'
- DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q13, q14, q15, d9, d11 // Cr::p0' q0'
-
- vbsl.u8 q10, q4, q1 // p0'
- vbsl.u8 q11, q5, q2 // q0'
-// q0 q10 q11 q3
-
- sub r0, r0, r2, lsl #3 // pix: 0th row [-2]
- sub r1, r1, r2, lsl #3
-
- vmov q1, q10
- vmov q2, q11
- vswp d1, d2
- vswp d6, d5
- vswp q1, q2
-// Cb:d0d1d2d3, Cr:d4d5d6d7
-
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
-
-//eq4_end:
- WELS_ASM_FUNC_END
-
-#ifdef APPLE_IOS
-//in: $0(const) $1 $2; out:$3 $4
-//used register: r6, r7, q0, q1
-.macro BS_NZC_CHECK
- //vld1.8 {d0,d1}, [$0]
- vld1.8 {d0,d1}, [$0, :64]
-	/* Arrange the input data --- TOP */
- ands r6, $1, #2
- beq bs_nzc_check_jump0
-
- sub r6, $0, $2, lsl #4
- sub r6, $2, lsl #3
- add r6, #12
- vld1.32 d3[1], [r6]
-
-bs_nzc_check_jump0:
- vext.8 q1, q1, q0, #12
- vadd.u8 $3, q0, q1
-
-
-	/* Arrange the input data --- LEFT */
- ands r6, $1, #1
- beq bs_nzc_check_jump1
-
- sub r6, $0, #21
- add r7, r6, #4
- vld1.8 d3[4], [r6]
- add r6, r7, #4
- vld1.8 d3[5], [r7]
- add r7, r6, #4
- vld1.8 d3[6], [r6]
- vld1.8 d3[7], [r7]
-
-bs_nzc_check_jump1:
- vzip.8 d0, d1
- vzip.8 d0, d1
- vext.8 q1, q1, q0, #12
- vadd.u8 $4, q0, q1
-
-.endm
-
-
-//in: $0(const) $1 $2; out:$3 $4
-//used register: r6, r7, q0, q1
-.macro BS_REF_INDEX_CHECK
- //vld1.8 {d0,d1}, [$0]
- vld1.8 {d0,d1}, [$0, :128]
-	/* Arrange the input data --- TOP */
- ands r6, $1, #2
- beq bs_ref_index_check_jump0
-
- sub r6, $0, $2, lsl #4
- add r6, #12
- vld1.32 d3[1], [r6]
-
-bs_ref_index_check_jump0:
- vext.8 q1, q1, q0, #12
- vabd.u8 $3, q0, q1
-
-
-	/* Arrange the input data --- LEFT */
- ands r6, $1, #1
- beq bs_ref_index_check_jump1
-
- sub r6, $0, #13
- add r7, r6, #4
- vld1.8 d3[4], [r6]
- add r6, r7, #4
- vld1.8 d3[5], [r7]
- add r7, r6, #4
- vld1.8 d3[6], [r6]
- vld1.8 d3[7], [r7]
-
-bs_ref_index_check_jump1:
- vzip.8 d0, d1
- vzip.8 d0, d1
- vext.8 q1, q1, q0, #12
- vabd.u8 $4, q0, q1
-.endmacro
-
-.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
- mov r6, #4
- vabd.s16 q5, $0, $1
- vabd.s16 q6, $1, $2
- vdup.s16 $0, r6
- vabd.s16 q7, $2, $3
- vabd.s16 q8, $3, $4
-
- vcge.s16 q5, $0
- vcge.s16 q6, $0
- vcge.s16 q7, $0
- vcge.s16 q8, $0
-
- vpadd.i16 d10, d10, d11
- vpadd.i16 d11, d12, d13
- vpadd.i16 d12, d14, d15
- vpadd.i16 d13, d16, d17
-
- vaddhn.i16 $5, q5, q5
- vaddhn.i16 $6, q6, q6
-.endmacro
-
-//in: $0(const) $1 $2; out:$3 $4 $5 $6
-//used register: r6, r7, q0, q1, q2, q3, q4
-.macro BS_MV_CHECK
- //vldm $0, {q0,q1,q2,q3}
- vld1.32 {q0,q1}, [$0, :128]
- add r6, $0, #32
- vld1.32 {q2,q3}, [r6, :128]
-
-	/* Arrange the input data --- TOP */
- ands r6, $1, #2
- beq bs_mv_check_jump0
-
- sub r6, $0, $2, lsl #6
- add r6, #48
- vld1.8 {d8, d9}, [r6]
-
-bs_mv_check_jump0:
- BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4
-
-	/* Arrange the input data --- LEFT */
- ands r6, $1, #1
- beq bs_mv_check_jump1
-
- sub r6, $0, #52
- //mov r7, #16
- add r7, r6, #16
- vld1.32 d8[0], [r6]
- add r6, r7, #16
- vld1.32 d8[1], [r7]
- add r7, r6, #16
- vld1.32 d9[0], [r6]
- vld1.32 d9[1], [r7]
-
-bs_mv_check_jump1:
- vzip.32 q0, q2
- vzip.32 q1, q3
- vzip.32 q0, q1
- vzip.32 q2, q3
- BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6
-.endmacro
-#else
-//in: $0(const) $1 $2; out:$3 $4
-//used register: r6, r7, q0, q1
-.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
- //vld1.8 {d0,d1}, [\arg0]
- vld1.8 {d0,d1}, [\arg0, :64]
-	/* Arrange the input data --- TOP */
- ands r6, \arg1, #2
- beq bs_nzc_check_jump0
-
- sub r6, \arg0, \arg2, lsl #4
- sub r6, \arg2, lsl #3
- add r6, #12
- vld1.32 d3[1], [r6]
-
-bs_nzc_check_jump0:
- vext.8 q1, q1, q0, #12
- vadd.u8 \arg3, q0, q1
-
-
-	/* Arrange the input data --- LEFT */
- ands r6, \arg1, #1
- beq bs_nzc_check_jump1
-
- sub r6, \arg0, #21
- add r7, r6, #4
- vld1.8 d3[4], [r6]
- add r6, r7, #4
- vld1.8 d3[5], [r7]
- add r7, r6, #4
- vld1.8 d3[6], [r6]
- vld1.8 d3[7], [r7]
-
-bs_nzc_check_jump1:
- vzip.8 d0, d1
- vzip.8 d0, d1
- vext.8 q1, q1, q0, #12
- vadd.u8 \arg4, q0, q1
-
-.endm
-
-
-//in: \arg0(const) \arg1 \arg2; out:\arg3 \arg4
-//used register: r6, r7, q0, q1
-.macro BS_REF_INDEX_CHECK arg0, arg1, arg2, arg3, arg4
- //vld1.8 {d0,d1}, [\arg0]
- vld1.8 {d0,d1}, [\arg0, :128]
-	/* Arrange the input data --- TOP */
- ands r6, \arg1, #2
- beq bs_ref_index_check_jump0
-
- sub r6, \arg0, \arg2, lsl #4
- add r6, #12
- vld1.32 d3[1], [r6]
-
-bs_ref_index_check_jump0:
- vext.8 q1, q1, q0, #12
- vabd.u8 \arg3, q0, q1
-
-
-	/* Arrange the input data --- LEFT */
- ands r6, \arg1, #1
- beq bs_ref_index_check_jump1
-
- sub r6, \arg0, #13
- add r7, r6, #4
- vld1.8 d3[4], [r6]
- add r6, r7, #4
- vld1.8 d3[5], [r7]
- add r7, r6, #4
- vld1.8 d3[6], [r6]
- vld1.8 d3[7], [r7]
-
-bs_ref_index_check_jump1:
- vzip.8 d0, d1
- vzip.8 d0, d1
- vext.8 q1, q1, q0, #12
- vabd.u8 \arg4, q0, q1
-.endm
-
-//in: \arg0,\arg1(const),\arg2(const),\arg3(const),\arg4(const); out:\arg5, \arg6
-.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5, arg6
-
- mov r6, #4
- vabd.s16 q5, \arg0, \arg1
- vabd.s16 q6, \arg1, \arg2
- vdup.s16 \arg0, r6
- vabd.s16 q7, \arg2, \arg3
- vabd.s16 q8, \arg3, \arg4
-
- vcge.s16 q5, \arg0
- vcge.s16 q6, \arg0
- vcge.s16 q7, \arg0
- vcge.s16 q8, \arg0
-
- vpadd.i16 d10, d10, d11
- vpadd.i16 d11, d12, d13
- vpadd.i16 d12, d14, d15
- vpadd.i16 d13, d16, d17
-
- vaddhn.i16 \arg5, q5, q5
- vaddhn.i16 \arg6, q6, q6
-.endm
-
-//in: \arg0(const) \arg1 \arg2; out:\arg3 \arg4 \arg5 \arg6
-//used register: r6, r7, q0, q1, q2, q3, q4
-.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6
- //vldm \arg0, {q0,q1,q2,q3}
- vld1.32 {q0,q1}, [\arg0, :128]
- add r6, \arg0, #32
- vld1.32 {q2,q3}, [r6, :128]
-
-	/* Arrange the input data --- TOP */
- ands r6, \arg1, #2
- beq bs_mv_check_jump0
-
- sub r6, \arg0, \arg2, lsl #6
- add r6, #48
- vld1.8 {d8, d9}, [r6]
-
-bs_mv_check_jump0:
- BS_COMPARE_MV q4, q0, q1, q2, q3, \arg3, \arg4
-
-	/* Arrange the input data --- LEFT */
- ands r6, \arg1, #1
- beq bs_mv_check_jump1
-
- sub r6, \arg0, #52
- //mov r7, #16
- add r7, r6, #16
- vld1.32 d8[0], [r6]
- add r6, r7, #16
- vld1.32 d8[1], [r7]
- add r7, r6, #16
- vld1.32 d9[0], [r6]
- vld1.32 d9[1], [r7]
-
-bs_mv_check_jump1:
- vzip.32 q0, q2
- vzip.32 q1, q3
- vzip.32 q0, q1
- vzip.32 q2, q3
- BS_COMPARE_MV q4, q0, q1, q2, q3, \arg5, \arg6
-.endm
-#endif
-/*
- * void deblocking_BS_calc_neon(int8_t *pNzc,
- * int8_t *pRef_index,
- * int16_t *pMv[],
- * int32_t boundry_flag,
- * int32_t mb_width,
- * uint8_t *bS);
- *
- * r0 = cur_layer->nzc[cur_mb_xy]
- * r1 = cur_layer->ref_index[0][cur_mb_xy]
- * r2 = cur_layer->mv[0][cur_mb_xy]
- * r3 = boundry_flag (LEFT_FLAG/TOP_FLAG)
- * r4 = cur_layer->mb_width
- * r5 = BS[8][4] save all of the BS value for whole MB(16*16)
- */
-
- WELS_ASM_FUNC_BEGIN deblocking_BS_calc_neon
-
- stmdb sp!, {r4-r7}
-
- ldr r4, [sp, #16] //Save mb_width to r4
- ldr r5, [sp, #20] //Save BS to r5
-
- /* Checking the nzc status */
- BS_NZC_CHECK r0, r3, r4, q14, q15 //q14,q15 save the nzc status
-
- /* Checking the nzc_rs status */
-	//BS_NZC_CHECK r1, r4, q12, q13 //q12,q13 save the nzc_rs status
-
- /* For checking bS[I] = 2 */
- mov r6, #2
- //vqadd.u8 q14, q12
- //vqadd.u8 q15, q13
- vcgt.s8 q14, q14, #0
- vdup.u8 q0, r6
- vcgt.s8 q15, q15, #0
-
- vand.u8 q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
- vand.u8 q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
-
-
- /* Checking the ref_index status*/
- BS_REF_INDEX_CHECK r1, r3, r4, q12, q13 //q12,q13 save the ref_index status
-
- vcgt.s8 q12, q12, #0
- vcgt.s8 q13, q13, #0
-
- /* Checking the mv status*/
- BS_MV_CHECK r2, r3, r4, d20, d21, d22, d23//q10, q11 save the mv status
-
- /* For checking bS[I] = 1 */
- mov r6, #1
- vqadd.u8 q12, q10
- vdup.u8 q0, r6
- vqadd.u8 q13, q11
-
-	vand.u8	q12, q12, q0 //q12 saves the bS==1 check result --- for dir is top
-	vand.u8	q13, q13, q0 //q13 saves the bS==1 check result --- for dir is left
-
-
- /* Check bS[I] is '1' or '2' */
- vmax.u8 q1, q12, q14
- vmax.u8 q0, q13, q15
-
- //vstm r5, {q0, q1}
- vst1.32 {q0, q1}, [r5]
- ldmia sp!, {r4-r7}
- WELS_ASM_FUNC_END
-/*====== deblocking_BS_calc_neon End ======*/
-#endif
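
The comments in the file removed above spell out the H.264 bS<4 luma filter that the NEON macros vectorize: an edge is touched only when abs(p0-q0) < alpha, abs(p1-p0) < beta and abs(q1-q0) < beta, and the core correction is delta = clip3((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc). For reference, here is a scalar sketch of that per-column math, reconstructed from those comments only; it is not the project's C fallback, and the helper names (Clip3, ClipUint8, Abs32, DeblockLumaLt4OneCol) are illustrative.

  #include <stdint.h>

  static inline int32_t Clip3 (int32_t iVal, int32_t iMin, int32_t iMax) {
    return iVal < iMin ? iMin : (iVal > iMax ? iMax : iVal);
  }
  static inline uint8_t ClipUint8 (int32_t iVal) {
    return (uint8_t) Clip3 (iVal, 0, 255);
  }
  static inline int32_t Abs32 (int32_t iVal) {
    return iVal < 0 ? -iVal : iVal;
  }

  // One column across a horizontal luma edge, bS < 4 path; iTc0 >= 0 is assumed
  // (the asm skips the column when tc0[i] < 0).
  static void DeblockLumaLt4OneCol (uint8_t* pPix, int32_t iStride,
                                    int32_t iAlpha, int32_t iBeta, int32_t iTc0) {
    const int32_t p2 = pPix[-3 * iStride], p1 = pPix[-2 * iStride], p0 = pPix[-1 * iStride];
    const int32_t q0 = pPix[0], q1 = pPix[iStride], q2 = pPix[2 * iStride];
    int32_t iTc = iTc0;
    if (! (Abs32 (p0 - q0) < iAlpha && Abs32 (p1 - p0) < iBeta && Abs32 (q1 - q0) < iBeta))
      return;                                          // MASK_MATRIX condition
    if (Abs32 (p2 - p0) < iBeta) {                     // DIFF_LUMA_LT4_P1_Q1, p side
      pPix[-2 * iStride] = ClipUint8 (p1 + Clip3 (((p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -iTc0, iTc0));
      ++iTc;
    }
    if (Abs32 (q2 - q0) < iBeta) {                     // mirrored q side (second DIFF_LUMA_LT4_P1_Q1 call)
      pPix[iStride] = ClipUint8 (q1 + Clip3 (((q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -iTc0, iTc0));
      ++iTc;
    }
    const int32_t iDelta = Clip3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc, iTc); // DIFF_LUMA_LT4_P0_Q0
    pPix[-1 * iStride] = ClipUint8 (p0 + iDelta);      // vqadd/vqsub pair in the asm
    pPix[0]            = ClipUint8 (q0 - iDelta);
  }
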
--- a/codec/decoder/core/src/deblocking.cpp
+++ b/codec/decoder/core/src/deblocking.cpp
@@ -720,6 +720,7 @@
#endif
#if defined(HAVE_NEON)
+ if ( iCpu & WELS_CPU_NEON )
{
pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_neon;
pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_neon;
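
Every C++ hunk in this patch follows the same shape: the portable defaults stay in place, and the *_neon routines are installed only when the runtime CPU flag actually reports NEON, rather than unconditionally whenever HAVE_NEON was compiled in. A condensed sketch of that pattern, using the deblocking pointers above as the example; the *_neon names and the WELS_CPU_NEON check come from the hunk, while the wrapper function, the SDeblockingFunc type and the *_c fallback names are assumptions made for illustration.

  // Illustrative only: NEON routines overwrite the portable defaults at run time.
  static void InitDeblockingFuncs (SDeblockingFunc* pFunc, uint32_t uiCpuFlag) {
    pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_c;   // assumed C fallback names
    pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_c;
  #if defined(HAVE_NEON)
    if (uiCpuFlag & WELS_CPU_NEON) {                     // the check this patch adds
      pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_neon;
      pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_neon;
    }
  #endif
  }
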
--- a/codec/decoder/core/src/decode_slice.cpp
+++ b/codec/decoder/core/src/decode_slice.cpp
@@ -1150,9 +1150,11 @@
#endif
#ifdef HAVE_NEON
- pFunc->pWelsBlockZero16x16Func = WelsResBlockZero16x16_neon;
- pFunc->pWelsBlockZero8x8Func = WelsResBlockZero8x8_neon;
- pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_neon;
+ if ( iCpu & WELS_CPU_NEON ) {
+ pFunc->pWelsBlockZero16x16Func = WelsResBlockZero16x16_neon;
+ pFunc->pWelsBlockZero8x8Func = WelsResBlockZero8x8_neon;
+ pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_neon;
+ }
#endif
}
void WelsBlockZero16x16_c (int16_t* pBlock, int32_t iStride) {
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -146,7 +146,14 @@
#if defined(X86_ASM)
pCtx->uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores);
-#endif//X86_ASM
+#elif defined(HAVE_NEON)
+#if defined(ANDROID_NDK)
+ pCtx->uiCpuFlag = WelsCPUFeatureDetectAndroid();
+#endif
+#if defined(APPLE_IOS)
+ pCtx->uiCpuFlag = WelsCPUFeatureDetectIOS();
+#endif
+#endif
pCtx->iImgWidthInPixel = 0;
pCtx->iImgHeightInPixel = 0; // alloc picture data when picture size is available
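
The hunk above selects the CPU probe per platform: x86 builds keep WelsCPUFeatureDetect, while NEON builds call WelsCPUFeatureDetectAndroid or WelsCPUFeatureDetectIOS depending on the target. Folded into a single helper for clarity (the probe names and arguments are taken from the hunk; the wrapper itself and its zero fallback are assumptions):

  static uint32_t DetectCpuFeatures (int32_t* pNumberOfCores) {
  #if defined(X86_ASM)
    return WelsCPUFeatureDetect (pNumberOfCores);        // also reports the core count
  #elif defined(HAVE_NEON) && defined(ANDROID_NDK)
    return WelsCPUFeatureDetectAndroid();
  #elif defined(HAVE_NEON) && defined(APPLE_IOS)
    return WelsCPUFeatureDetectIOS();
  #else
    (void) pNumberOfCores;
    return 0;                                            // no SIMD features detected
  #endif
  }
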
@@ -657,26 +664,28 @@
pCtx->pIdctResAddPredFunc = IdctResAddPred_c;
#if defined(HAVE_NEON)
- pCtx->pIdctResAddPredFunc = IdctResAddPred_neon;
+ if ( pCtx->uiCpuFlag & WELS_CPU_NEON ) {
+ pCtx->pIdctResAddPredFunc = IdctResAddPred_neon;
- pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon;
- pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_neon;
- pCtx->pGetI16x16LumaPredFunc[I16_PRED_H] = WelsDecoderI16x16LumaPredH_neon;
- pCtx->pGetI16x16LumaPredFunc[I16_PRED_V] = WelsDecoderI16x16LumaPredV_neon;
+ pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon;
+ pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_neon;
+ pCtx->pGetI16x16LumaPredFunc[I16_PRED_H] = WelsDecoderI16x16LumaPredH_neon;
+ pCtx->pGetI16x16LumaPredFunc[I16_PRED_V] = WelsDecoderI16x16LumaPredV_neon;
- pCtx->pGetI4x4LumaPredFunc[I4_PRED_V ] = WelsDecoderI4x4LumaPredV_neon;
- pCtx->pGetI4x4LumaPredFunc[I4_PRED_H ] = WelsDecoderI4x4LumaPredH_neon;
- pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL ] = WelsDecoderI4x4LumaPredDDL_neon;
- pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR ] = WelsDecoderI4x4LumaPredDDR_neon;
- pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL ] = WelsDecoderI4x4LumaPredVL_neon;
- pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR ] = WelsDecoderI4x4LumaPredVR_neon;
- pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU ] = WelsDecoderI4x4LumaPredHU_neon;
- pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD ] = WelsDecoderI4x4LumaPredHD_neon;
+ pCtx->pGetI4x4LumaPredFunc[I4_PRED_V ] = WelsDecoderI4x4LumaPredV_neon;
+ pCtx->pGetI4x4LumaPredFunc[I4_PRED_H ] = WelsDecoderI4x4LumaPredH_neon;
+ pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL ] = WelsDecoderI4x4LumaPredDDL_neon;
+ pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR ] = WelsDecoderI4x4LumaPredDDR_neon;
+ pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL ] = WelsDecoderI4x4LumaPredVL_neon;
+ pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR ] = WelsDecoderI4x4LumaPredVR_neon;
+ pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU ] = WelsDecoderI4x4LumaPredHU_neon;
+ pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD ] = WelsDecoderI4x4LumaPredHD_neon;
- pCtx->pGetIChromaPredFunc[C_PRED_H] = WelsDecoderIChromaPredH_neon;
- pCtx->pGetIChromaPredFunc[C_PRED_V] = WelsDecoderIChromaPredV_neon;
- pCtx->pGetIChromaPredFunc[C_PRED_P ] = WelsDecoderIChromaPredPlane_neon;
- pCtx->pGetIChromaPredFunc[C_PRED_DC] = WelsDecoderIChromaPredDC_neon;
+ pCtx->pGetIChromaPredFunc[C_PRED_H] = WelsDecoderIChromaPredH_neon;
+ pCtx->pGetIChromaPredFunc[C_PRED_V] = WelsDecoderIChromaPredV_neon;
+ pCtx->pGetIChromaPredFunc[C_PRED_P ] = WelsDecoderIChromaPredPlane_neon;
+ pCtx->pGetIChromaPredFunc[C_PRED_DC] = WelsDecoderIChromaPredDC_neon;
+ }
#endif//HAVE_NEON
--- a/codec/decoder/core/src/mc.cpp
+++ b/codec/decoder/core/src/mc.cpp
@@ -971,8 +971,10 @@
pMcFunc->pMcChromaFunc = McChroma_c;
#ifdef HAVE_NEON
- pMcFunc->pMcLumaFunc = McLuma_neon;
- pMcFunc->pMcChromaFunc = McChroma_neon;
+ if ( iCpu & WELS_CPU_NEON ) {
+ pMcFunc->pMcLumaFunc = McLuma_neon;
+ pMcFunc->pMcChromaFunc = McChroma_neon;
+ }
#endif
#if defined (X86_ASM)